diff options
Diffstat (limited to 'fs/xfs')
104 files changed, 7474 insertions, 6852 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 56641fe52a23..c8fb13f83b3f 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -16,7 +16,7 @@ # Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA # -EXTRA_CFLAGS += -I$(src) -I$(src)/linux-2.6 -funsigned-char +EXTRA_CFLAGS += -I$(src) -I$(src)/linux-2.6 XFS_LINUX := linux-2.6 @@ -77,6 +77,7 @@ xfs-y += xfs_alloc.o \ xfs_itable.o \ xfs_dfrag.o \ xfs_log.o \ + xfs_log_cil.o \ xfs_log_recover.o \ xfs_mount.o \ xfs_mru_cache.o \ @@ -105,7 +106,6 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \ xfs_globals.o \ xfs_ioctl.o \ xfs_iops.o \ - xfs_lrw.o \ xfs_super.o \ xfs_sync.o \ xfs_xattr.o) diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c index 2d3f90afe5f1..666c9db48eb6 100644 --- a/fs/xfs/linux-2.6/kmem.c +++ b/fs/xfs/linux-2.6/kmem.c @@ -16,16 +16,33 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include <linux/mm.h> -#include <linux/vmalloc.h> #include <linux/highmem.h> +#include <linux/slab.h> #include <linux/swap.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include "time.h" #include "kmem.h" -#define MAX_VMALLOCS 6 -#define MAX_SLAB_SIZE 0x20000 +/* + * Greedy allocation. May fail and may return vmalloced memory. + * + * Must be freed using kmem_free_large. + */ +void * +kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize) +{ + void *ptr; + size_t kmsize = maxsize; + + while (!(ptr = kmem_zalloc_large(kmsize))) { + if ((kmsize >>= 1) <= minsize) + kmsize = minsize; + } + if (ptr) + *size = kmsize; + return ptr; +} void * kmem_alloc(size_t size, unsigned int __nocast flags) @@ -34,19 +51,8 @@ kmem_alloc(size_t size, unsigned int __nocast flags) gfp_t lflags = kmem_flags_convert(flags); void *ptr; -#ifdef DEBUG - if (unlikely(!(flags & KM_LARGE) && (size > PAGE_SIZE))) { - printk(KERN_WARNING "Large %s attempt, size=%ld\n", - __func__, (long)size); - dump_stack(); - } -#endif - do { - if (size < MAX_SLAB_SIZE || retries > MAX_VMALLOCS) - ptr = kmalloc(size, lflags); - else - ptr = __vmalloc(size, lflags, PAGE_KERNEL); + ptr = kmalloc(size, lflags); if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) return ptr; if (!(++retries % 100)) @@ -68,27 +74,6 @@ kmem_zalloc(size_t size, unsigned int __nocast flags) return ptr; } -void * -kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize, - unsigned int __nocast flags) -{ - void *ptr; - size_t kmsize = maxsize; - unsigned int kmflags = (flags & ~KM_SLEEP) | KM_NOSLEEP; - - while (!(ptr = kmem_zalloc(kmsize, kmflags))) { - if ((kmsize <= minsize) && (flags & KM_NOSLEEP)) - break; - if ((kmsize >>= 1) <= minsize) { - kmsize = minsize; - kmflags = flags; - } - } - if (ptr) - *size = kmsize; - return ptr; -} - void kmem_free(const void *ptr) { diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h index 179cbd630f69..f7c8f7a9ea6d 100644 --- a/fs/xfs/linux-2.6/kmem.h +++ b/fs/xfs/linux-2.6/kmem.h @@ -21,6 +21,7 @@ #include <linux/slab.h> #include <linux/sched.h> #include <linux/mm.h> +#include <linux/vmalloc.h> /* * General memory allocation interfaces @@ -30,7 +31,6 @@ #define KM_NOSLEEP 0x0002u #define KM_NOFS 0x0004u #define KM_MAYFAIL 0x0008u -#define KM_LARGE 0x0010u /* * We use a special process flag to avoid recursive callbacks into @@ -42,7 +42,7 @@ kmem_flags_convert(unsigned int __nocast flags) { gfp_t lflags; - BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_LARGE)); + BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL)); if (flags & KM_NOSLEEP) { lflags = GFP_ATOMIC | __GFP_NOWARN; @@ -56,10 +56,25 @@ kmem_flags_convert(unsigned int __nocast flags) extern void *kmem_alloc(size_t, unsigned int __nocast); extern void *kmem_zalloc(size_t, unsigned int __nocast); -extern void *kmem_zalloc_greedy(size_t *, size_t, size_t, unsigned int __nocast); extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast); extern void kmem_free(const void *); +static inline void *kmem_zalloc_large(size_t size) +{ + void *ptr; + + ptr = vmalloc(size); + if (ptr) + memset(ptr, 0, size); + return ptr; +} +static inline void kmem_free_large(void *ptr) +{ + vfree(ptr); +} + +extern void *kmem_zalloc_greedy(size_t *, size_t, size_t); + /* * Zone interfaces */ diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c index 883ca5ab8af5..9f769b5b38fc 100644 --- a/fs/xfs/linux-2.6/xfs_acl.c +++ b/fs/xfs/linux-2.6/xfs_acl.c @@ -22,6 +22,7 @@ #include "xfs_inode.h" #include "xfs_vnodeops.h" #include "xfs_trace.h" +#include <linux/slab.h> #include <linux/xattr.h> #include <linux/posix_acl_xattr.h> @@ -106,7 +107,7 @@ xfs_get_acl(struct inode *inode, int type) struct posix_acl *acl; struct xfs_acl *xfs_acl; int len = sizeof(struct xfs_acl); - char *ea_name; + unsigned char *ea_name; int error; acl = get_cached_acl(inode, type); @@ -133,7 +134,8 @@ xfs_get_acl(struct inode *inode, int type) if (!xfs_acl) return ERR_PTR(-ENOMEM); - error = -xfs_attr_get(ip, ea_name, (char *)xfs_acl, &len, ATTR_ROOT); + error = -xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl, + &len, ATTR_ROOT); if (error) { /* * If the attribute doesn't exist make sure we have a negative @@ -162,7 +164,7 @@ STATIC int xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) { struct xfs_inode *ip = XFS_I(inode); - char *ea_name; + unsigned char *ea_name; int error; if (S_ISLNK(inode->i_mode)) @@ -194,7 +196,7 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) (sizeof(struct xfs_acl_entry) * (XFS_ACL_MAX_ENTRIES - acl->a_count)); - error = -xfs_attr_set(ip, ea_name, (char *)xfs_acl, + error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl, len, ATTR_ROOT); kfree(xfs_acl); @@ -262,7 +264,7 @@ xfs_set_mode(struct inode *inode, mode_t mode) } static int -xfs_acl_exists(struct inode *inode, char *name) +xfs_acl_exists(struct inode *inode, unsigned char *name) { int len = sizeof(struct xfs_acl); @@ -438,14 +440,14 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name, return error; } -struct xattr_handler xfs_xattr_acl_access_handler = { +const struct xattr_handler xfs_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, .flags = ACL_TYPE_ACCESS, .get = xfs_xattr_acl_get, .set = xfs_xattr_acl_set, }; -struct xattr_handler xfs_xattr_acl_default_handler = { +const struct xattr_handler xfs_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, .flags = ACL_TYPE_DEFAULT, .get = xfs_xattr_acl_get, diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 66abe36c1213..34640d6dbdcb 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -39,10 +39,21 @@ #include "xfs_iomap.h" #include "xfs_vnodeops.h" #include "xfs_trace.h" +#include "xfs_bmap.h" +#include <linux/gfp.h> #include <linux/mpage.h> #include <linux/pagevec.h> #include <linux/writeback.h> +/* + * Types of I/O for bmap clustering and I/O completion tracking. + */ +enum { + IO_READ, /* mapping for a read */ + IO_DELAY, /* mapping covers delalloc region */ + IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */ + IO_NEW /* just allocated */ +}; /* * Prime number of hash buckets since address is used as the key. @@ -101,8 +112,9 @@ xfs_count_page_state( STATIC struct block_device * xfs_find_bdev_for_inode( - struct xfs_inode *ip) + struct inode *inode) { + struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; if (XFS_IS_REALTIME_INODE(ip)) @@ -163,14 +175,17 @@ xfs_ioend_new_eof( } /* - * Update on-disk file size now that data has been written to disk. - * The current in-memory file size is i_size. If a write is beyond - * eof i_new_size will be the intended file size until i_size is - * updated. If this write does not extend all the way to the valid - * file size then restrict this update to the end of the write. + * Update on-disk file size now that data has been written to disk. The + * current in-memory file size is i_size. If a write is beyond eof i_new_size + * will be the intended file size until i_size is updated. If this write does + * not extend all the way to the valid file size then restrict this update to + * the end of the write. + * + * This function does not block as blocking on the inode lock in IO completion + * can lead to IO completion order dependency deadlocks.. If it can't get the + * inode ilock it will return EAGAIN. Callers must handle this. */ - -STATIC void +STATIC int xfs_setfilesize( xfs_ioend_t *ioend) { @@ -178,19 +193,43 @@ xfs_setfilesize( xfs_fsize_t isize; ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); - ASSERT(ioend->io_type != IOMAP_READ); + ASSERT(ioend->io_type != IO_READ); if (unlikely(ioend->io_error)) - return; + return 0; + + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) + return EAGAIN; - xfs_ilock(ip, XFS_ILOCK_EXCL); isize = xfs_ioend_new_eof(ioend); if (isize) { ip->i_d.di_size = isize; - xfs_mark_inode_dirty_sync(ip); + xfs_mark_inode_dirty(ip); } xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; +} + +/* + * Schedule IO completion handling on a xfsdatad if this was + * the final hold on this ioend. If we are asked to wait, + * flush the workqueue. + */ +STATIC void +xfs_finish_ioend( + xfs_ioend_t *ioend, + int wait) +{ + if (atomic_dec_and_test(&ioend->io_remaining)) { + struct workqueue_struct *wq; + + wq = (ioend->io_type == IO_UNWRITTEN) ? + xfsconvertd_workqueue : xfsdatad_workqueue; + queue_work(wq, &ioend->io_work); + if (wait) + flush_workqueue(wq); + } } /* @@ -198,19 +237,18 @@ xfs_setfilesize( */ STATIC void xfs_end_io( - struct work_struct *work) + struct work_struct *work) { - xfs_ioend_t *ioend = - container_of(work, xfs_ioend_t, io_work); - struct xfs_inode *ip = XFS_I(ioend->io_inode); + xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work); + struct xfs_inode *ip = XFS_I(ioend->io_inode); + int error = 0; /* * For unwritten extents we need to issue transactions to convert a * range to normal written extens after the data I/O has finished. */ - if (ioend->io_type == IOMAP_UNWRITTEN && + if (ioend->io_type == IO_UNWRITTEN && likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { - int error; error = xfs_iomap_write_unwritten(ip, ioend->io_offset, ioend->io_size); @@ -222,30 +260,23 @@ xfs_end_io( * We might have to update the on-disk file size after extending * writes. */ - if (ioend->io_type != IOMAP_READ) - xfs_setfilesize(ioend); - xfs_destroy_ioend(ioend); -} - -/* - * Schedule IO completion handling on a xfsdatad if this was - * the final hold on this ioend. If we are asked to wait, - * flush the workqueue. - */ -STATIC void -xfs_finish_ioend( - xfs_ioend_t *ioend, - int wait) -{ - if (atomic_dec_and_test(&ioend->io_remaining)) { - struct workqueue_struct *wq; - - wq = (ioend->io_type == IOMAP_UNWRITTEN) ? - xfsconvertd_workqueue : xfsdatad_workqueue; - queue_work(wq, &ioend->io_work); - if (wait) - flush_workqueue(wq); + if (ioend->io_type != IO_READ) { + error = xfs_setfilesize(ioend); + ASSERT(!error || error == EAGAIN); } + + /* + * If we didn't complete processing of the ioend, requeue it to the + * tail of the workqueue for another attempt later. Otherwise destroy + * it. + */ + if (error == EAGAIN) { + atomic_inc(&ioend->io_remaining); + xfs_finish_ioend(ioend, 0); + /* ensure we don't spin on blocked ioends */ + delay(1); + } else + xfs_destroy_ioend(ioend); } /* @@ -288,21 +319,25 @@ xfs_map_blocks( struct inode *inode, loff_t offset, ssize_t count, - xfs_iomap_t *mapp, + struct xfs_bmbt_irec *imap, int flags) { int nmaps = 1; + int new = 0; - return -xfs_iomap(XFS_I(inode), offset, count, flags, mapp, &nmaps); + return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new); } STATIC int -xfs_iomap_valid( - xfs_iomap_t *iomapp, - loff_t offset) +xfs_imap_valid( + struct inode *inode, + struct xfs_bmbt_irec *imap, + xfs_off_t offset) { - return offset >= iomapp->iomap_offset && - offset < iomapp->iomap_offset + iomapp->iomap_bsize; + offset >>= inode->i_blkbits; + + return offset >= imap->br_startoff && + offset < imap->br_startoff + imap->br_blockcount; } /* @@ -341,7 +376,7 @@ xfs_submit_ioend_bio( * but don't update the inode size until I/O completion. */ if (xfs_ioend_new_eof(ioend)) - xfs_mark_inode_dirty_sync(XFS_I(ioend->io_inode)); + xfs_mark_inode_dirty(XFS_I(ioend->io_inode)); submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC_PLUG : WRITE, bio); @@ -533,19 +568,23 @@ xfs_add_to_ioend( STATIC void xfs_map_buffer( + struct inode *inode, struct buffer_head *bh, - xfs_iomap_t *mp, - xfs_off_t offset, - uint block_bits) + struct xfs_bmbt_irec *imap, + xfs_off_t offset) { sector_t bn; + struct xfs_mount *m = XFS_I(inode)->i_mount; + xfs_off_t iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff); + xfs_daddr_t iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock); - ASSERT(mp->iomap_bn != IOMAP_DADDR_NULL); + ASSERT(imap->br_startblock != HOLESTARTBLOCK); + ASSERT(imap->br_startblock != DELAYSTARTBLOCK); - bn = (mp->iomap_bn >> (block_bits - BBSHIFT)) + - ((offset - mp->iomap_offset) >> block_bits); + bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) + + ((offset - iomap_offset) >> inode->i_blkbits); - ASSERT(bn || (mp->iomap_flags & IOMAP_REALTIME)); + ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode))); bh->b_blocknr = bn; set_buffer_mapped(bh); @@ -553,17 +592,17 @@ xfs_map_buffer( STATIC void xfs_map_at_offset( + struct inode *inode, struct buffer_head *bh, - loff_t offset, - int block_bits, - xfs_iomap_t *iomapp) + struct xfs_bmbt_irec *imap, + xfs_off_t offset) { - ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE)); - ASSERT(!(iomapp->iomap_flags & IOMAP_DELAY)); + ASSERT(imap->br_startblock != HOLESTARTBLOCK); + ASSERT(imap->br_startblock != DELAYSTARTBLOCK); lock_buffer(bh); - xfs_map_buffer(bh, iomapp, offset, block_bits); - bh->b_bdev = iomapp->iomap_target->bt_bdev; + xfs_map_buffer(inode, bh, imap, offset); + bh->b_bdev = xfs_find_bdev_for_inode(inode); set_buffer_mapped(bh); clear_buffer_delay(bh); clear_buffer_unwritten(bh); @@ -692,11 +731,11 @@ xfs_is_delayed_page( bh = head = page_buffers(page); do { if (buffer_unwritten(bh)) - acceptable = (type == IOMAP_UNWRITTEN); + acceptable = (type == IO_UNWRITTEN); else if (buffer_delay(bh)) - acceptable = (type == IOMAP_DELAY); + acceptable = (type == IO_DELAY); else if (buffer_dirty(bh) && buffer_mapped(bh)) - acceptable = (type == IOMAP_NEW); + acceptable = (type == IO_NEW); else break; } while ((bh = bh->b_this_page) != head); @@ -719,7 +758,7 @@ xfs_convert_page( struct inode *inode, struct page *page, loff_t tindex, - xfs_iomap_t *mp, + struct xfs_bmbt_irec *imap, xfs_ioend_t **ioendp, struct writeback_control *wbc, int startio, @@ -729,7 +768,6 @@ xfs_convert_page( xfs_off_t end_offset; unsigned long p_offset; unsigned int type; - int bbits = inode->i_blkbits; int len, page_dirty; int count = 0, done = 0, uptodate = 1; xfs_off_t offset = page_offset(page); @@ -781,19 +819,19 @@ xfs_convert_page( if (buffer_unwritten(bh) || buffer_delay(bh)) { if (buffer_unwritten(bh)) - type = IOMAP_UNWRITTEN; + type = IO_UNWRITTEN; else - type = IOMAP_DELAY; + type = IO_DELAY; - if (!xfs_iomap_valid(mp, offset)) { + if (!xfs_imap_valid(inode, imap, offset)) { done = 1; continue; } - ASSERT(!(mp->iomap_flags & IOMAP_HOLE)); - ASSERT(!(mp->iomap_flags & IOMAP_DELAY)); + ASSERT(imap->br_startblock != HOLESTARTBLOCK); + ASSERT(imap->br_startblock != DELAYSTARTBLOCK); - xfs_map_at_offset(bh, offset, bbits, mp); + xfs_map_at_offset(inode, bh, imap, offset); if (startio) { xfs_add_to_ioend(inode, bh, offset, type, ioendp, done); @@ -805,7 +843,7 @@ xfs_convert_page( page_dirty--; count++; } else { - type = IOMAP_NEW; + type = IO_NEW; if (buffer_mapped(bh) && all_bh && startio) { lock_buffer(bh); xfs_add_to_ioend(inode, bh, offset, @@ -845,7 +883,7 @@ STATIC void xfs_cluster_write( struct inode *inode, pgoff_t tindex, - xfs_iomap_t *iomapp, + struct xfs_bmbt_irec *imap, xfs_ioend_t **ioendp, struct writeback_control *wbc, int startio, @@ -864,7 +902,7 @@ xfs_cluster_write( for (i = 0; i < pagevec_count(&pvec); i++) { done = xfs_convert_page(inode, pvec.pages[i], tindex++, - iomapp, ioendp, wbc, startio, all_bh); + imap, ioendp, wbc, startio, all_bh); if (done) break; } @@ -874,6 +912,125 @@ xfs_cluster_write( } } +STATIC void +xfs_vm_invalidatepage( + struct page *page, + unsigned long offset) +{ + trace_xfs_invalidatepage(page->mapping->host, page, offset); + block_invalidatepage(page, offset); +} + +/* + * If the page has delalloc buffers on it, we need to punch them out before we + * invalidate the page. If we don't, we leave a stale delalloc mapping on the + * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read + * is done on that same region - the delalloc extent is returned when none is + * supposed to be there. + * + * We prevent this by truncating away the delalloc regions on the page before + * invalidating it. Because they are delalloc, we can do this without needing a + * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this + * truncation without a transaction as there is no space left for block + * reservation (typically why we see a ENOSPC in writeback). + * + * This is not a performance critical path, so for now just do the punching a + * buffer head at a time. + */ +STATIC void +xfs_aops_discard_page( + struct page *page) +{ + struct inode *inode = page->mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct buffer_head *bh, *head; + loff_t offset = page_offset(page); + ssize_t len = 1 << inode->i_blkbits; + + if (!xfs_is_delayed_page(page, IO_DELAY)) + goto out_invalidate; + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + goto out_invalidate; + + xfs_fs_cmn_err(CE_ALERT, ip->i_mount, + "page discard on page %p, inode 0x%llx, offset %llu.", + page, ip->i_ino, offset); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + bh = head = page_buffers(page); + do { + int done; + xfs_fileoff_t offset_fsb; + xfs_bmbt_irec_t imap; + int nimaps = 1; + int error; + xfs_fsblock_t firstblock; + xfs_bmap_free_t flist; + + if (!buffer_delay(bh)) + goto next_buffer; + + offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); + + /* + * Map the range first and check that it is a delalloc extent + * before trying to unmap the range. Otherwise we will be + * trying to remove a real extent (which requires a + * transaction) or a hole, which is probably a bad idea... + */ + error = xfs_bmapi(NULL, ip, offset_fsb, 1, + XFS_BMAPI_ENTIRE, NULL, 0, &imap, + &nimaps, NULL, NULL); + + if (error) { + /* something screwed, just bail */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_fs_cmn_err(CE_ALERT, ip->i_mount, + "page discard failed delalloc mapping lookup."); + } + break; + } + if (!nimaps) { + /* nothing there */ + goto next_buffer; + } + if (imap.br_startblock != DELAYSTARTBLOCK) { + /* been converted, ignore */ + goto next_buffer; + } + WARN_ON(imap.br_blockcount == 0); + + /* + * Note: while we initialise the firstblock/flist pair, they + * should never be used because blocks should never be + * allocated or freed for a delalloc extent and hence we need + * don't cancel or finish them after the xfs_bunmapi() call. + */ + xfs_bmap_init(&flist, &firstblock); + error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock, + &flist, NULL, &done); + + ASSERT(!flist.xbf_count && !flist.xbf_first); + if (error) { + /* something screwed, just bail */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_fs_cmn_err(CE_ALERT, ip->i_mount, + "page discard unable to remove delalloc mapping."); + } + break; + } +next_buffer: + offset += len; + + } while ((bh = bh->b_this_page) != head); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); +out_invalidate: + xfs_vm_invalidatepage(page, 0); + return; +} + /* * Calling this without startio set means we are being asked to make a dirty * page ready for freeing it's buffers. When called with startio set then @@ -902,15 +1059,15 @@ xfs_page_state_convert( int unmapped) /* also implies page uptodate */ { struct buffer_head *bh, *head; - xfs_iomap_t iomap; + struct xfs_bmbt_irec imap; xfs_ioend_t *ioend = NULL, *iohead = NULL; loff_t offset; unsigned long p_offset = 0; unsigned int type; __uint64_t end_offset; - pgoff_t end_index, last_index, tlast; + pgoff_t end_index, last_index; ssize_t size, len; - int flags, err, iomap_valid = 0, uptodate = 1; + int flags, err, imap_valid = 0, uptodate = 1; int page_dirty, count = 0; int trylock = 0; int all_bh = unmapped; @@ -957,7 +1114,7 @@ xfs_page_state_convert( bh = head = page_buffers(page); offset = page_offset(page); flags = BMAPI_READ; - type = IOMAP_NEW; + type = IO_NEW; /* TODO: cleanup count and page_dirty */ @@ -971,12 +1128,12 @@ xfs_page_state_convert( * the iomap is actually still valid, but the ioend * isn't. shouldn't happen too often. */ - iomap_valid = 0; + imap_valid = 0; continue; } - if (iomap_valid) - iomap_valid = xfs_iomap_valid(&iomap, offset); + if (imap_valid) + imap_valid = xfs_imap_valid(inode, &imap, offset); /* * First case, map an unwritten extent and prepare for @@ -997,20 +1154,20 @@ xfs_page_state_convert( * Make sure we don't use a read-only iomap */ if (flags == BMAPI_READ) - iomap_valid = 0; + imap_valid = 0; if (buffer_unwritten(bh)) { - type = IOMAP_UNWRITTEN; + type = IO_UNWRITTEN; flags = BMAPI_WRITE | BMAPI_IGNSTATE; } else if (buffer_delay(bh)) { - type = IOMAP_DELAY; + type = IO_DELAY; flags = BMAPI_ALLOCATE | trylock; } else { - type = IOMAP_NEW; + type = IO_NEW; flags = BMAPI_WRITE | BMAPI_MMAP; } - if (!iomap_valid) { + if (!imap_valid) { /* * if we didn't have a valid mapping then we * need to ensure that we put the new mapping @@ -1020,7 +1177,7 @@ xfs_page_state_convert( * for unwritten extent conversion. */ new_ioend = 1; - if (type == IOMAP_NEW) { + if (type == IO_NEW) { size = xfs_probe_cluster(inode, page, bh, head, 0); } else { @@ -1028,14 +1185,14 @@ xfs_page_state_convert( } err = xfs_map_blocks(inode, offset, size, - &iomap, flags); + &imap, flags); if (err) goto error; - iomap_valid = xfs_iomap_valid(&iomap, offset); + imap_valid = xfs_imap_valid(inode, &imap, + offset); } - if (iomap_valid) { - xfs_map_at_offset(bh, offset, - inode->i_blkbits, &iomap); + if (imap_valid) { + xfs_map_at_offset(inode, bh, &imap, offset); if (startio) { xfs_add_to_ioend(inode, bh, offset, type, &ioend, @@ -1054,40 +1211,41 @@ xfs_page_state_convert( * That means it must already have extents allocated * underneath it. Map the extent by reading it. */ - if (!iomap_valid || flags != BMAPI_READ) { + if (!imap_valid || flags != BMAPI_READ) { flags = BMAPI_READ; size = xfs_probe_cluster(inode, page, bh, head, 1); err = xfs_map_blocks(inode, offset, size, - &iomap, flags); + &imap, flags); if (err) goto error; - iomap_valid = xfs_iomap_valid(&iomap, offset); + imap_valid = xfs_imap_valid(inode, &imap, + offset); } /* - * We set the type to IOMAP_NEW in case we are doing a + * We set the type to IO_NEW in case we are doing a * small write at EOF that is extending the file but * without needing an allocation. We need to update the * file size on I/O completion in this case so it is * the same case as having just allocated a new extent * that we are writing into for the first time. */ - type = IOMAP_NEW; + type = IO_NEW; if (trylock_buffer(bh)) { ASSERT(buffer_mapped(bh)); - if (iomap_valid) + if (imap_valid) all_bh = 1; xfs_add_to_ioend(inode, bh, offset, type, - &ioend, !iomap_valid); + &ioend, !imap_valid); page_dirty--; count++; } else { - iomap_valid = 0; + imap_valid = 0; } } else if ((buffer_uptodate(bh) || PageUptodate(page)) && (unmapped || startio)) { - iomap_valid = 0; + imap_valid = 0; } if (!iohead) @@ -1101,12 +1259,23 @@ xfs_page_state_convert( if (startio) xfs_start_page_writeback(page, 1, count); - if (ioend && iomap_valid) { - offset = (iomap.iomap_offset + iomap.iomap_bsize - 1) >> - PAGE_CACHE_SHIFT; - tlast = min_t(pgoff_t, offset, last_index); - xfs_cluster_write(inode, page->index + 1, &iomap, &ioend, - wbc, startio, all_bh, tlast); + if (ioend && imap_valid) { + xfs_off_t end_index; + + end_index = imap.br_startoff + imap.br_blockcount; + + /* to bytes */ + end_index <<= inode->i_blkbits; + + /* to pages */ + end_index = (end_index - 1) >> PAGE_CACHE_SHIFT; + + /* check against file size */ + if (end_index > last_index) + end_index = last_index; + + xfs_cluster_write(inode, page->index + 1, &imap, &ioend, + wbc, startio, all_bh, end_index); } if (iohead) @@ -1125,7 +1294,7 @@ error: */ if (err != -EAGAIN) { if (!unmapped) - block_invalidatepage(page, 0); + xfs_aops_discard_page(page); ClearPageUptodate(page); } return err; @@ -1164,6 +1333,21 @@ xfs_vm_writepage( trace_xfs_writepage(inode, page, 0); /* + * Refuse to write the page out if we are called from reclaim context. + * + * This is primarily to avoid stack overflows when called from deep + * used stacks in random callers for direct reclaim, but disabling + * reclaim for kswap is a nice side-effect as kswapd causes rather + * suboptimal I/O patters, too. + * + * This should really be done by the core VM, but until that happens + * filesystems like XFS, btrfs and ext4 have to take care of this + * by themselves. + */ + if (current->flags & PF_MEMALLOC) + goto out_fail; + + /* * We need a transaction if: * 1. There are delalloc buffers on the page * 2. The page is uptodate and we have unmapped buffers @@ -1197,14 +1381,6 @@ xfs_vm_writepage( if (!page_has_buffers(page)) create_empty_buffers(page, 1 << inode->i_blkbits, 0); - - /* - * VM calculation for nr_to_write seems off. Bump it way - * up, this gets simple streaming writes zippy again. - * To be reviewed again after Jens' writeback changes. - */ - wbc->nr_to_write *= 4; - /* * Convert delayed allocate, unwritten or unmapped space * to real space and flush out to disk. @@ -1308,10 +1484,11 @@ __xfs_get_blocks( int direct, bmapi_flags_t flags) { - xfs_iomap_t iomap; + struct xfs_bmbt_irec imap; xfs_off_t offset; ssize_t size; - int niomap = 1; + int nimap = 1; + int new = 0; int error; offset = (xfs_off_t)iblock << inode->i_blkbits; @@ -1322,22 +1499,21 @@ __xfs_get_blocks( return 0; error = xfs_iomap(XFS_I(inode), offset, size, - create ? flags : BMAPI_READ, &iomap, &niomap); + create ? flags : BMAPI_READ, &imap, &nimap, &new); if (error) return -error; - if (niomap == 0) + if (nimap == 0) return 0; - if (iomap.iomap_bn != IOMAP_DADDR_NULL) { + if (imap.br_startblock != HOLESTARTBLOCK && + imap.br_startblock != DELAYSTARTBLOCK) { /* * For unwritten extents do not report a disk address on * the read case (treat as if we're reading into a hole). */ - if (create || !(iomap.iomap_flags & IOMAP_UNWRITTEN)) { - xfs_map_buffer(bh_result, &iomap, offset, - inode->i_blkbits); - } - if (create && (iomap.iomap_flags & IOMAP_UNWRITTEN)) { + if (create || !ISUNWRITTEN(&imap)) + xfs_map_buffer(inode, bh_result, &imap, offset); + if (create && ISUNWRITTEN(&imap)) { if (direct) bh_result->b_private = inode; set_buffer_unwritten(bh_result); @@ -1348,7 +1524,7 @@ __xfs_get_blocks( * If this is a realtime file, data may be on a different device. * to that pointed to from the buffer_head b_bdev currently. */ - bh_result->b_bdev = iomap.iomap_target->bt_bdev; + bh_result->b_bdev = xfs_find_bdev_for_inode(inode); /* * If we previously allocated a block out beyond eof and we are now @@ -1362,10 +1538,10 @@ __xfs_get_blocks( if (create && ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || (offset >= i_size_read(inode)) || - (iomap.iomap_flags & (IOMAP_NEW|IOMAP_UNWRITTEN)))) + (new || ISUNWRITTEN(&imap)))) set_buffer_new(bh_result); - if (iomap.iomap_flags & IOMAP_DELAY) { + if (imap.br_startblock == DELAYSTARTBLOCK) { BUG_ON(direct); if (create) { set_buffer_uptodate(bh_result); @@ -1374,11 +1550,23 @@ __xfs_get_blocks( } } + /* + * If this is O_DIRECT or the mpage code calling tell them how large + * the mapping is, so that we can avoid repeated get_blocks calls. + */ if (direct || size > (1 << inode->i_blkbits)) { - ASSERT(iomap.iomap_bsize - iomap.iomap_delta > 0); - offset = min_t(xfs_off_t, - iomap.iomap_bsize - iomap.iomap_delta, size); - bh_result->b_size = (ssize_t)min_t(xfs_off_t, LONG_MAX, offset); + xfs_off_t mapping_size; + + mapping_size = imap.br_startoff + imap.br_blockcount - iblock; + mapping_size <<= inode->i_blkbits; + + ASSERT(mapping_size > 0); + if (mapping_size > size) + mapping_size = size; + if (mapping_size > LONG_MAX) + mapping_size = LONG_MAX; + + bh_result->b_size = mapping_size; } return 0; @@ -1436,7 +1624,7 @@ xfs_end_io_direct( */ ioend->io_offset = offset; ioend->io_size = size; - if (ioend->io_type == IOMAP_READ) { + if (ioend->io_type == IO_READ) { xfs_finish_ioend(ioend, 0); } else if (private && size > 0) { xfs_finish_ioend(ioend, is_sync_kiocb(iocb)); @@ -1447,7 +1635,7 @@ xfs_end_io_direct( * didn't map an unwritten extent so switch it's completion * handler. */ - ioend->io_type = IOMAP_NEW; + ioend->io_type = IO_NEW; xfs_finish_ioend(ioend, 0); } @@ -1472,10 +1660,10 @@ xfs_vm_direct_IO( struct block_device *bdev; ssize_t ret; - bdev = xfs_find_bdev_for_inode(XFS_I(inode)); + bdev = xfs_find_bdev_for_inode(inode); iocb->private = xfs_alloc_ioend(inode, rw == WRITE ? - IOMAP_UNWRITTEN : IOMAP_READ); + IO_UNWRITTEN : IO_READ); ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov, offset, nr_segs, @@ -1535,15 +1723,6 @@ xfs_vm_readpages( return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); } -STATIC void -xfs_vm_invalidatepage( - struct page *page, - unsigned long offset) -{ - trace_xfs_invalidatepage(page->mapping->host, page, offset); - block_invalidatepage(page, offset); -} - const struct address_space_operations xfs_address_space_operations = { .readpage = xfs_vm_readpage, .readpages = xfs_vm_readpages, diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 77b8be81c769..2ee3f7a60163 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -18,7 +18,7 @@ #include "xfs.h" #include <linux/stddef.h> #include <linux/errno.h> -#include <linux/slab.h> +#include <linux/gfp.h> #include <linux/pagemap.h> #include <linux/init.h> #include <linux/vmalloc.h> @@ -33,9 +33,11 @@ #include <linux/migrate.h> #include <linux/backing-dev.h> #include <linux/freezer.h> +#include <linux/list_sort.h> #include "xfs_sb.h" #include "xfs_inum.h" +#include "xfs_log.h" #include "xfs_ag.h" #include "xfs_dmapi.h" #include "xfs_mount.h" @@ -43,7 +45,7 @@ static kmem_zone_t *xfs_buf_zone; STATIC int xfsbufd(void *); -STATIC int xfsbufd_wakeup(int, gfp_t); +STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t); STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); static struct shrinker xfs_buf_shake = { .shrink = xfsbufd_wakeup, @@ -76,6 +78,27 @@ struct workqueue_struct *xfsconvertd_workqueue; #define xfs_buf_deallocate(bp) \ kmem_zone_free(xfs_buf_zone, (bp)); +static inline int +xfs_buf_is_vmapped( + struct xfs_buf *bp) +{ + /* + * Return true if the buffer is vmapped. + * + * The XBF_MAPPED flag is set if the buffer should be mapped, but the + * code is clever enough to know it doesn't have to map a single page, + * so the check has to be both for XBF_MAPPED and bp->b_page_count > 1. + */ + return (bp->b_flags & XBF_MAPPED) && bp->b_page_count > 1; +} + +static inline int +xfs_buf_vmap_len( + struct xfs_buf *bp) +{ + return (bp->b_page_count * PAGE_SIZE) - bp->b_offset; +} + /* * Page Region interfaces. * @@ -146,75 +169,6 @@ test_page_region( } /* - * Mapping of multi-page buffers into contiguous virtual space - */ - -typedef struct a_list { - void *vm_addr; - struct a_list *next; -} a_list_t; - -static a_list_t *as_free_head; -static int as_list_len; -static DEFINE_SPINLOCK(as_lock); - -/* - * Try to batch vunmaps because they are costly. - */ -STATIC void -free_address( - void *addr) -{ - a_list_t *aentry; - -#ifdef CONFIG_XEN - /* - * Xen needs to be able to make sure it can get an exclusive - * RO mapping of pages it wants to turn into a pagetable. If - * a newly allocated page is also still being vmap()ed by xfs, - * it will cause pagetable construction to fail. This is a - * quick workaround to always eagerly unmap pages so that Xen - * is happy. - */ - vunmap(addr); - return; -#endif - - aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT); - if (likely(aentry)) { - spin_lock(&as_lock); - aentry->next = as_free_head; - aentry->vm_addr = addr; - as_free_head = aentry; - as_list_len++; - spin_unlock(&as_lock); - } else { - vunmap(addr); - } -} - -STATIC void -purge_addresses(void) -{ - a_list_t *aentry, *old; - - if (as_free_head == NULL) - return; - - spin_lock(&as_lock); - aentry = as_free_head; - as_free_head = NULL; - as_list_len = 0; - spin_unlock(&as_lock); - - while ((old = aentry) != NULL) { - vunmap(aentry->vm_addr); - aentry = aentry->next; - kfree(old); - } -} - -/* * Internal xfs_buf_t object manipulation */ @@ -314,8 +268,9 @@ xfs_buf_free( if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { uint i; - if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1)) - free_address(bp->b_addr - bp->b_offset); + if (xfs_buf_is_vmapped(bp)) + vm_unmap_ram(bp->b_addr - bp->b_offset, + bp->b_page_count); for (i = 0; i < bp->b_page_count; i++) { struct page *page = bp->b_pages[i]; @@ -385,7 +340,7 @@ _xfs_buf_lookup_pages( __func__, gfp_mask); XFS_STATS_INC(xb_page_retries); - xfsbufd_wakeup(0, gfp_mask); + xfsbufd_wakeup(NULL, 0, gfp_mask); congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } @@ -435,10 +390,8 @@ _xfs_buf_map_pages( bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; bp->b_flags |= XBF_MAPPED; } else if (flags & XBF_MAPPED) { - if (as_list_len > 64) - purge_addresses(); - bp->b_addr = vmap(bp->b_pages, bp->b_page_count, - VM_MAP, PAGE_KERNEL); + bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, + -1, PAGE_KERNEL); if (unlikely(bp->b_addr == NULL)) return -ENOMEM; bp->b_addr += bp->b_offset; @@ -898,6 +851,12 @@ xfs_buf_lock_value( * Note that this in no way locks the underlying pages, so it is only * useful for synchronizing concurrent use of buffer objects, not for * synchronizing independent access to the underlying pages. + * + * If we come across a stale, pinned, locked buffer, we know that we + * are being asked to lock a buffer that has been reallocated. Because + * it is pinned, we know that the log has not been pushed to disk and + * hence it will still be locked. Rather than sleeping until someone + * else pushes the log, push it ourselves before trying to get the lock. */ void xfs_buf_lock( @@ -905,6 +864,8 @@ xfs_buf_lock( { trace_xfs_buf_lock(bp, _RET_IP_); + if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) + xfs_log_force(bp->b_mount, 0); if (atomic_read(&bp->b_io_remaining)) blk_run_address_space(bp->b_target->bt_mapping); down(&bp->b_sema); @@ -1051,22 +1012,25 @@ xfs_buf_ioerror( } int -xfs_bawrite( - void *mp, +xfs_bwrite( + struct xfs_mount *mp, struct xfs_buf *bp) { - trace_xfs_buf_bawrite(bp, _RET_IP_); + int error; - ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL); + bp->b_strat = xfs_bdstrat_cb; + bp->b_mount = mp; + bp->b_flags |= XBF_WRITE; + bp->b_flags &= ~(XBF_ASYNC | XBF_READ); xfs_buf_delwri_dequeue(bp); + xfs_buf_iostrategy(bp); - bp->b_flags &= ~(XBF_READ | XBF_DELWRI | XBF_READ_AHEAD); - bp->b_flags |= (XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES); - - bp->b_mount = mp; - bp->b_strat = xfs_bdstrat_cb; - return xfs_bdstrat_cb(bp); + error = xfs_buf_iowait(bp); + if (error) + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + xfs_buf_relse(bp); + return error; } void @@ -1085,6 +1049,126 @@ xfs_bdwrite( xfs_buf_delwri_queue(bp, 1); } +/* + * Called when we want to stop a buffer from getting written or read. + * We attach the EIO error, muck with its flags, and call biodone + * so that the proper iodone callbacks get called. + */ +STATIC int +xfs_bioerror( + xfs_buf_t *bp) +{ +#ifdef XFSERRORDEBUG + ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone); +#endif + + /* + * No need to wait until the buffer is unpinned, we aren't flushing it. + */ + XFS_BUF_ERROR(bp, EIO); + + /* + * We're calling biodone, so delete XBF_DONE flag. + */ + XFS_BUF_UNREAD(bp); + XFS_BUF_UNDELAYWRITE(bp); + XFS_BUF_UNDONE(bp); + XFS_BUF_STALE(bp); + + XFS_BUF_CLR_BDSTRAT_FUNC(bp); + xfs_biodone(bp); + + return EIO; +} + +/* + * Same as xfs_bioerror, except that we are releasing the buffer + * here ourselves, and avoiding the biodone call. + * This is meant for userdata errors; metadata bufs come with + * iodone functions attached, so that we can track down errors. + */ +STATIC int +xfs_bioerror_relse( + struct xfs_buf *bp) +{ + int64_t fl = XFS_BUF_BFLAGS(bp); + /* + * No need to wait until the buffer is unpinned. + * We aren't flushing it. + * + * chunkhold expects B_DONE to be set, whether + * we actually finish the I/O or not. We don't want to + * change that interface. + */ + XFS_BUF_UNREAD(bp); + XFS_BUF_UNDELAYWRITE(bp); + XFS_BUF_DONE(bp); + XFS_BUF_STALE(bp); + XFS_BUF_CLR_IODONE_FUNC(bp); + XFS_BUF_CLR_BDSTRAT_FUNC(bp); + if (!(fl & XBF_ASYNC)) { + /* + * Mark b_error and B_ERROR _both_. + * Lot's of chunkcache code assumes that. + * There's no reason to mark error for + * ASYNC buffers. + */ + XFS_BUF_ERROR(bp, EIO); + XFS_BUF_FINISH_IOWAIT(bp); + } else { + xfs_buf_relse(bp); + } + + return EIO; +} + + +/* + * All xfs metadata buffers except log state machine buffers + * get this attached as their b_bdstrat callback function. + * This is so that we can catch a buffer + * after prematurely unpinning it to forcibly shutdown the filesystem. + */ +int +xfs_bdstrat_cb( + struct xfs_buf *bp) +{ + if (XFS_FORCED_SHUTDOWN(bp->b_mount)) { + trace_xfs_bdstrat_shut(bp, _RET_IP_); + /* + * Metadata write that didn't get logged but + * written delayed anyway. These aren't associated + * with a transaction, and can be ignored. + */ + if (!bp->b_iodone && !XFS_BUF_ISREAD(bp)) + return xfs_bioerror_relse(bp); + else + return xfs_bioerror(bp); + } + + xfs_buf_iorequest(bp); + return 0; +} + +/* + * Wrapper around bdstrat so that we can stop data from going to disk in case + * we are shutting down the filesystem. Typically user data goes thru this + * path; one of the exceptions is the superblock. + */ +void +xfsbdstrat( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + if (XFS_FORCED_SHUTDOWN(mp)) { + trace_xfs_bdstrat_shut(bp, _RET_IP_); + xfs_bioerror_relse(bp); + return; + } + + xfs_buf_iorequest(bp); +} + STATIC void _xfs_buf_ioend( xfs_buf_t *bp, @@ -1107,6 +1191,9 @@ xfs_buf_bio_end_io( xfs_buf_ioerror(bp, -error); + if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) + invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); + do { struct page *page = bvec->bv_page; @@ -1216,6 +1303,10 @@ next_chunk: submit_io: if (likely(bio->bi_size)) { + if (xfs_buf_is_vmapped(bp)) { + flush_kernel_vmap_range(bp->b_addr, + xfs_buf_vmap_len(bp)); + } submit_bio(rw, bio); if (size) goto next_chunk; @@ -1296,7 +1387,7 @@ xfs_buf_iomove( xfs_buf_t *bp, /* buffer to process */ size_t boff, /* starting buffer offset */ size_t bsize, /* length to copy */ - caddr_t data, /* data address */ + void *data, /* data address */ xfs_buf_rw_t mode) /* read/write/zero flag */ { size_t bend, cpoff, csize; @@ -1378,8 +1469,8 @@ xfs_alloc_bufhash( btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */ btp->bt_hashmask = (1 << btp->bt_hashshift) - 1; - btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) * - sizeof(xfs_bufhash_t), KM_SLEEP | KM_LARGE); + btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) * + sizeof(xfs_bufhash_t)); for (i = 0; i < (1 << btp->bt_hashshift); i++) { spin_lock_init(&btp->bt_hash[i].bh_lock); INIT_LIST_HEAD(&btp->bt_hash[i].bh_list); @@ -1390,7 +1481,7 @@ STATIC void xfs_free_bufhash( xfs_buftarg_t *btp) { - kmem_free(btp->bt_hash); + kmem_free_large(btp->bt_hash); btp->bt_hash = NULL; } @@ -1527,7 +1618,8 @@ xfs_mapping_buftarg( STATIC int xfs_alloc_delwrite_queue( - xfs_buftarg_t *btp) + xfs_buftarg_t *btp, + const char *fsname) { int error = 0; @@ -1535,7 +1627,7 @@ xfs_alloc_delwrite_queue( INIT_LIST_HEAD(&btp->bt_delwrite_queue); spin_lock_init(&btp->bt_delwrite_lock); btp->bt_flags = 0; - btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd"); + btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); if (IS_ERR(btp->bt_task)) { error = PTR_ERR(btp->bt_task); goto out_error; @@ -1548,7 +1640,8 @@ out_error: xfs_buftarg_t * xfs_alloc_buftarg( struct block_device *bdev, - int external) + int external, + const char *fsname) { xfs_buftarg_t *btp; @@ -1560,7 +1653,7 @@ xfs_alloc_buftarg( goto error; if (xfs_mapping_buftarg(btp, bdev)) goto error; - if (xfs_alloc_delwrite_queue(btp)) + if (xfs_alloc_delwrite_queue(btp, fsname)) goto error; xfs_alloc_bufhash(btp, external); return btp; @@ -1595,6 +1688,11 @@ xfs_buf_delwri_queue( list_del(&bp->b_list); } + if (list_empty(dwq)) { + /* start xfsbufd as it is about to have something to do */ + wake_up_process(bp->b_target->bt_task); + } + bp->b_flags |= _XBF_DELWRI_Q; list_add_tail(&bp->b_list, dwq); bp->b_queuetime = jiffies; @@ -1626,6 +1724,35 @@ xfs_buf_delwri_dequeue( trace_xfs_buf_delwri_dequeue(bp, _RET_IP_); } +/* + * If a delwri buffer needs to be pushed before it has aged out, then promote + * it to the head of the delwri queue so that it will be flushed on the next + * xfsbufd run. We do this by resetting the queuetime of the buffer to be older + * than the age currently needed to flush the buffer. Hence the next time the + * xfsbufd sees it is guaranteed to be considered old enough to flush. + */ +void +xfs_buf_delwri_promote( + struct xfs_buf *bp) +{ + struct xfs_buftarg *btp = bp->b_target; + long age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1; + + ASSERT(bp->b_flags & XBF_DELWRI); + ASSERT(bp->b_flags & _XBF_DELWRI_Q); + + /* + * Check the buffer age before locking the delayed write queue as we + * don't need to promote buffers that are already past the flush age. + */ + if (bp->b_queuetime < jiffies - age) + return; + bp->b_queuetime = jiffies - age; + spin_lock(&btp->bt_delwrite_lock); + list_move(&bp->b_list, &btp->bt_delwrite_queue); + spin_unlock(&btp->bt_delwrite_lock); +} + STATIC void xfs_buf_runall_queues( struct workqueue_struct *queue) @@ -1635,6 +1762,7 @@ xfs_buf_runall_queues( STATIC int xfsbufd_wakeup( + struct shrinker *shrink, int priority, gfp_t mask) { @@ -1644,6 +1772,8 @@ xfsbufd_wakeup( list_for_each_entry(btp, &xfs_buftarg_list, bt_list) { if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags)) continue; + if (list_empty(&btp->bt_delwrite_queue)) + continue; set_bit(XBT_FORCE_FLUSH, &btp->bt_flags); wake_up_process(btp->bt_task); } @@ -1694,20 +1824,53 @@ xfs_buf_delwri_split( } +/* + * Compare function is more complex than it needs to be because + * the return value is only 32 bits and we are doing comparisons + * on 64 bit values + */ +static int +xfs_buf_cmp( + void *priv, + struct list_head *a, + struct list_head *b) +{ + struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list); + struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list); + xfs_daddr_t diff; + + diff = ap->b_bn - bp->b_bn; + if (diff < 0) + return -1; + if (diff > 0) + return 1; + return 0; +} + +void +xfs_buf_delwri_sort( + xfs_buftarg_t *target, + struct list_head *list) +{ + list_sort(NULL, list, xfs_buf_cmp); +} + STATIC int xfsbufd( void *data) { - struct list_head tmp; - xfs_buftarg_t *target = (xfs_buftarg_t *)data; - int count; - xfs_buf_t *bp; + xfs_buftarg_t *target = (xfs_buftarg_t *)data; current->flags |= PF_MEMALLOC; set_freezable(); do { + long age = xfs_buf_age_centisecs * msecs_to_jiffies(10); + long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10); + int count = 0; + struct list_head tmp; + if (unlikely(freezing(current))) { set_bit(XBT_FORCE_SLEEP, &target->bt_flags); refrigerator(); @@ -1715,24 +1878,20 @@ xfsbufd( clear_bit(XBT_FORCE_SLEEP, &target->bt_flags); } - schedule_timeout_interruptible( - xfs_buf_timer_centisecs * msecs_to_jiffies(10)); + /* sleep for a long time if there is nothing to do. */ + if (list_empty(&target->bt_delwrite_queue)) + tout = MAX_SCHEDULE_TIMEOUT; + schedule_timeout_interruptible(tout); - xfs_buf_delwri_split(target, &tmp, - xfs_buf_age_centisecs * msecs_to_jiffies(10)); - - count = 0; + xfs_buf_delwri_split(target, &tmp, age); + list_sort(NULL, &tmp, xfs_buf_cmp); while (!list_empty(&tmp)) { - bp = list_entry(tmp.next, xfs_buf_t, b_list); - ASSERT(target == bp->b_target); - + struct xfs_buf *bp; + bp = list_first_entry(&tmp, struct xfs_buf, b_list); list_del_init(&bp->b_list); xfs_buf_iostrategy(bp); count++; } - - if (as_list_len > 0) - purge_addresses(); if (count) blk_run_address_space(target->bt_mapping); @@ -1751,42 +1910,45 @@ xfs_flush_buftarg( xfs_buftarg_t *target, int wait) { - struct list_head tmp; - xfs_buf_t *bp, *n; + xfs_buf_t *bp; int pincount = 0; + LIST_HEAD(tmp_list); + LIST_HEAD(wait_list); xfs_buf_runall_queues(xfsconvertd_workqueue); xfs_buf_runall_queues(xfsdatad_workqueue); xfs_buf_runall_queues(xfslogd_workqueue); set_bit(XBT_FORCE_FLUSH, &target->bt_flags); - pincount = xfs_buf_delwri_split(target, &tmp, 0); + pincount = xfs_buf_delwri_split(target, &tmp_list, 0); /* - * Dropped the delayed write list lock, now walk the temporary list + * Dropped the delayed write list lock, now walk the temporary list. + * All I/O is issued async and then if we need to wait for completion + * we do that after issuing all the IO. */ - list_for_each_entry_safe(bp, n, &tmp, b_list) { + list_sort(NULL, &tmp_list, xfs_buf_cmp); + while (!list_empty(&tmp_list)) { + bp = list_first_entry(&tmp_list, struct xfs_buf, b_list); ASSERT(target == bp->b_target); - if (wait) + list_del_init(&bp->b_list); + if (wait) { bp->b_flags &= ~XBF_ASYNC; - else - list_del_init(&bp->b_list); - + list_add(&bp->b_list, &wait_list); + } xfs_buf_iostrategy(bp); } - if (wait) + if (wait) { + /* Expedite and wait for IO to complete. */ blk_run_address_space(target->bt_mapping); + while (!list_empty(&wait_list)) { + bp = list_first_entry(&wait_list, struct xfs_buf, b_list); - /* - * Remaining list items must be flushed before returning - */ - while (!list_empty(&tmp)) { - bp = list_entry(tmp.next, xfs_buf_t, b_list); - - list_del_init(&bp->b_list); - xfs_iowait(bp); - xfs_buf_relse(bp); + list_del_init(&bp->b_list); + xfs_iowait(bp); + xfs_buf_relse(bp); + } } return pincount; diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index a34c7b54822d..5fbecefa5dfd 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -232,13 +232,17 @@ extern void xfs_buf_lock(xfs_buf_t *); extern void xfs_buf_unlock(xfs_buf_t *); /* Buffer Read and Write Routines */ -extern int xfs_bawrite(void *mp, xfs_buf_t *bp); +extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp); extern void xfs_bdwrite(void *mp, xfs_buf_t *bp); + +extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *); +extern int xfs_bdstrat_cb(struct xfs_buf *); + extern void xfs_buf_ioend(xfs_buf_t *, int); extern void xfs_buf_ioerror(xfs_buf_t *, int); extern int xfs_buf_iorequest(xfs_buf_t *); extern int xfs_buf_iowait(xfs_buf_t *); -extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, xfs_caddr_t, +extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, xfs_buf_rw_t); static inline int xfs_buf_iostrategy(xfs_buf_t *bp) @@ -261,6 +265,7 @@ extern int xfs_buf_ispin(xfs_buf_t *); /* Delayed Write Buffer Routines */ extern void xfs_buf_delwri_dequeue(xfs_buf_t *); +extern void xfs_buf_delwri_promote(xfs_buf_t *); /* Buffer Daemon Setup Routines */ extern int xfs_buf_init(void); @@ -270,33 +275,19 @@ extern void xfs_buf_terminate(void); ({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; }) -#define XFS_B_ASYNC XBF_ASYNC -#define XFS_B_DELWRI XBF_DELWRI -#define XFS_B_READ XBF_READ -#define XFS_B_WRITE XBF_WRITE -#define XFS_B_STALE XBF_STALE - -#define XFS_BUF_TRYLOCK XBF_TRYLOCK -#define XFS_INCORE_TRYLOCK XBF_TRYLOCK -#define XFS_BUF_LOCK XBF_LOCK -#define XFS_BUF_MAPPED XBF_MAPPED - -#define BUF_BUSY XBF_DONT_BLOCK - #define XFS_BUF_BFLAGS(bp) ((bp)->b_flags) #define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) -#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XFS_B_STALE) -#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XFS_B_STALE) -#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XFS_B_STALE) +#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XBF_STALE) +#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) +#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) #define XFS_BUF_SUPER_STALE(bp) do { \ XFS_BUF_STALE(bp); \ xfs_buf_delwri_dequeue(bp); \ XFS_BUF_DONE(bp); \ } while (0) -#define XFS_BUF_MANAGE XBF_FS_MANAGED #define XFS_BUF_UNMANAGE(bp) ((bp)->b_flags &= ~XBF_FS_MANAGED) #define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI) @@ -385,31 +376,11 @@ static inline void xfs_buf_relse(xfs_buf_t *bp) #define xfs_biomove(bp, off, len, data, rw) \ xfs_buf_iomove((bp), (off), (len), (data), \ - ((rw) == XFS_B_WRITE) ? XBRW_WRITE : XBRW_READ) + ((rw) == XBF_WRITE) ? XBRW_WRITE : XBRW_READ) #define xfs_biozero(bp, off, len) \ xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) - -static inline int XFS_bwrite(xfs_buf_t *bp) -{ - int iowait = (bp->b_flags & XBF_ASYNC) == 0; - int error = 0; - - if (!iowait) - bp->b_flags |= _XBF_RUN_QUEUES; - - xfs_buf_delwri_dequeue(bp); - xfs_buf_iostrategy(bp); - if (iowait) { - error = xfs_buf_iowait(bp); - xfs_buf_relse(bp); - } - return error; -} - -#define XFS_bdstrat(bp) xfs_buf_iorequest(bp) - #define xfs_iowait(bp) xfs_buf_iowait(bp) #define xfs_baread(target, rablkno, ralen) \ @@ -419,11 +390,12 @@ static inline int XFS_bwrite(xfs_buf_t *bp) /* * Handling of buftargs. */ -extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int); +extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int, const char *); extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); extern void xfs_wait_buftarg(xfs_buftarg_t *); extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); extern int xfs_flush_buftarg(xfs_buftarg_t *, int); + #ifdef CONFIG_KDB_MODULES extern struct list_head *xfs_get_buftarg_list(void); #endif diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c index 87b8cbd23d4b..e7839ee49e43 100644 --- a/fs/xfs/linux-2.6/xfs_export.c +++ b/fs/xfs/linux-2.6/xfs_export.c @@ -29,6 +29,7 @@ #include "xfs_vnodeops.h" #include "xfs_bmap_btree.h" #include "xfs_inode.h" +#include "xfs_inode_item.h" /* * Note that we only accept fileids which are long enough rather than allow @@ -127,13 +128,12 @@ xfs_nfs_get_inode( return ERR_PTR(-ESTALE); /* - * The XFS_IGET_BULKSTAT means that an invalid inode number is just - * fine and not an indication of a corrupted filesystem. Because - * clients can send any kind of invalid file handle, e.g. after - * a restore on the server we have to deal with this case gracefully. + * The XFS_IGET_UNTRUSTED means that an invalid inode number is just + * fine and not an indication of a corrupted filesystem as clients can + * send invalid file handles and we have to handle it gracefully.. */ - error = xfs_iget(mp, NULL, ino, XFS_IGET_BULKSTAT, - XFS_ILOCK_SHARED, &ip, 0); + error = xfs_iget(mp, NULL, ino, XFS_IGET_UNTRUSTED, + XFS_ILOCK_SHARED, &ip); if (error) { /* * EINVAL means the inode cluster doesn't exist anymore. @@ -215,9 +215,28 @@ xfs_fs_get_parent( return d_obtain_alias(VFS_I(cip)); } +STATIC int +xfs_fs_nfs_commit_metadata( + struct inode *inode) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + int error = 0; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (xfs_ipincount(ip)) { + error = _xfs_log_force_lsn(mp, ip->i_itemp->ili_last_lsn, + XFS_LOG_SYNC, NULL); + } + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + return error; +} + const struct export_operations xfs_export_operations = { .encode_fh = xfs_fs_encode_fh, .fh_to_dentry = xfs_fs_fh_to_dentry, .fh_to_parent = xfs_fs_fh_to_parent, .get_parent = xfs_fs_get_parent, + .commit_metadata = xfs_fs_nfs_commit_metadata, }; diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index e4caeb28ce2e..257a56b127cf 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -16,6 +16,7 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" +#include "xfs_fs.h" #include "xfs_bit.h" #include "xfs_log.h" #include "xfs_inum.h" @@ -34,52 +35,281 @@ #include "xfs_dir2_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" #include "xfs_error.h" #include "xfs_rw.h" #include "xfs_vnodeops.h" #include "xfs_da_btree.h" #include "xfs_ioctl.h" +#include "xfs_trace.h" #include <linux/dcache.h> static const struct vm_operations_struct xfs_file_vm_ops; -STATIC ssize_t -xfs_file_aio_read( - struct kiocb *iocb, - const struct iovec *iov, - unsigned long nr_segs, - loff_t pos) +/* + * xfs_iozero + * + * xfs_iozero clears the specified range of buffer supplied, + * and marks all the affected blocks as valid and modified. If + * an affected block is not allocated, it will be allocated. If + * an affected block is not completely overwritten, and is not + * valid before the operation, it will be read from disk before + * being partially zeroed. + */ +STATIC int +xfs_iozero( + struct xfs_inode *ip, /* inode */ + loff_t pos, /* offset in file */ + size_t count) /* size of data to zero */ { - struct file *file = iocb->ki_filp; - int ioflags = 0; + struct page *page; + struct address_space *mapping; + int status; - BUG_ON(iocb->ki_pos != pos); - if (unlikely(file->f_flags & O_DIRECT)) - ioflags |= IO_ISDIRECT; - if (file->f_mode & FMODE_NOCMTIME) - ioflags |= IO_INVIS; - return xfs_read(XFS_I(file->f_path.dentry->d_inode), iocb, iov, - nr_segs, &iocb->ki_pos, ioflags); + mapping = VFS_I(ip)->i_mapping; + do { + unsigned offset, bytes; + void *fsdata; + + offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ + bytes = PAGE_CACHE_SIZE - offset; + if (bytes > count) + bytes = count; + + status = pagecache_write_begin(NULL, mapping, pos, bytes, + AOP_FLAG_UNINTERRUPTIBLE, + &page, &fsdata); + if (status) + break; + + zero_user(page, offset, bytes); + + status = pagecache_write_end(NULL, mapping, pos, bytes, bytes, + page, fsdata); + WARN_ON(status <= 0); /* can't return less than zero! */ + pos += bytes; + count -= bytes; + status = 0; + } while (count); + + return (-status); +} + +STATIC int +xfs_file_fsync( + struct file *file, + int datasync) +{ + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_trans *tp; + int error = 0; + int log_flushed = 0; + + xfs_itrace_entry(ip); + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -XFS_ERROR(EIO); + + xfs_iflags_clear(ip, XFS_ITRUNCATED); + + xfs_ioend_wait(ip); + + /* + * We always need to make sure that the required inode state is safe on + * disk. The inode might be clean but we still might need to force the + * log because of committed transactions that haven't hit the disk yet. + * Likewise, there could be unflushed non-transactional changes to the + * inode core that have to go to disk and this requires us to issue + * a synchronous transaction to capture these changes correctly. + * + * This code relies on the assumption that if the i_update_core field + * of the inode is clear and the inode is unpinned then it is clean + * and no action is required. + */ + xfs_ilock(ip, XFS_ILOCK_SHARED); + + /* + * First check if the VFS inode is marked dirty. All the dirtying + * of non-transactional updates no goes through mark_inode_dirty*, + * which allows us to distinguish beteeen pure timestamp updates + * and i_size updates which need to be caught for fdatasync. + * After that also theck for the dirty state in the XFS inode, which + * might gets cleared when the inode gets written out via the AIL + * or xfs_iflush_cluster. + */ + if (((inode->i_state & I_DIRTY_DATASYNC) || + ((inode->i_state & I_DIRTY_SYNC) && !datasync)) && + ip->i_update_core) { + /* + * Kick off a transaction to log the inode core to get the + * updates. The sync transaction will also force the log. + */ + xfs_iunlock(ip, XFS_ILOCK_SHARED); + tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS); + error = xfs_trans_reserve(tp, 0, + XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return -error; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + + /* + * Note - it's possible that we might have pushed ourselves out + * of the way during trans_reserve which would flush the inode. + * But there's no guarantee that the inode buffer has actually + * gone out yet (it's delwri). Plus the buffer could be pinned + * anyway if it's part of an inode in another recent + * transaction. So we play it safe and fire off the + * transaction anyway. + */ + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ihold(tp, ip); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + xfs_trans_set_sync(tp); + error = _xfs_trans_commit(tp, 0, &log_flushed); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } else { + /* + * Timestamps/size haven't changed since last inode flush or + * inode transaction commit. That means either nothing got + * written or a transaction committed which caught the updates. + * If the latter happened and the transaction hasn't hit the + * disk yet, the inode will be still be pinned. If it is, + * force the log. + */ + if (xfs_ipincount(ip)) { + error = _xfs_log_force_lsn(ip->i_mount, + ip->i_itemp->ili_last_lsn, + XFS_LOG_SYNC, &log_flushed); + } + xfs_iunlock(ip, XFS_ILOCK_SHARED); + } + + if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) { + /* + * If the log write didn't issue an ordered tag we need + * to flush the disk cache for the data device now. + */ + if (!log_flushed) + xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp); + + /* + * If this inode is on the RT dev we need to flush that + * cache as well. + */ + if (XFS_IS_REALTIME_INODE(ip)) + xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp); + } + + return -error; } STATIC ssize_t -xfs_file_aio_write( +xfs_file_aio_read( struct kiocb *iocb, - const struct iovec *iov, + const struct iovec *iovp, unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + size_t size = 0; + ssize_t ret = 0; int ioflags = 0; + xfs_fsize_t n; + unsigned long seg; + + XFS_STATS_INC(xs_read_calls); BUG_ON(iocb->ki_pos != pos); + if (unlikely(file->f_flags & O_DIRECT)) ioflags |= IO_ISDIRECT; if (file->f_mode & FMODE_NOCMTIME) ioflags |= IO_INVIS; - return xfs_write(XFS_I(file->f_mapping->host), iocb, iov, nr_segs, - &iocb->ki_pos, ioflags); + + /* START copy & waste from filemap.c */ + for (seg = 0; seg < nr_segs; seg++) { + const struct iovec *iv = &iovp[seg]; + + /* + * If any segment has a negative length, or the cumulative + * length ever wraps negative then return -EINVAL. + */ + size += iv->iov_len; + if (unlikely((ssize_t)(size|iv->iov_len) < 0)) + return XFS_ERROR(-EINVAL); + } + /* END copy & waste from filemap.c */ + + if (unlikely(ioflags & IO_ISDIRECT)) { + xfs_buftarg_t *target = + XFS_IS_REALTIME_INODE(ip) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + if ((iocb->ki_pos & target->bt_smask) || + (size & target->bt_smask)) { + if (iocb->ki_pos == ip->i_size) + return 0; + return -XFS_ERROR(EINVAL); + } + } + + n = XFS_MAXIOFFSET(mp) - iocb->ki_pos; + if (n <= 0 || size == 0) + return 0; + + if (n < size) + size = n; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + if (unlikely(ioflags & IO_ISDIRECT)) + mutex_lock(&inode->i_mutex); + xfs_ilock(ip, XFS_IOLOCK_SHARED); + + if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) { + int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags); + int iolock = XFS_IOLOCK_SHARED; + + ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, iocb->ki_pos, size, + dmflags, &iolock); + if (ret) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + if (unlikely(ioflags & IO_ISDIRECT)) + mutex_unlock(&inode->i_mutex); + return ret; + } + } + + if (unlikely(ioflags & IO_ISDIRECT)) { + if (inode->i_mapping->nrpages) { + ret = -xfs_flushinval_pages(ip, + (iocb->ki_pos & PAGE_CACHE_MASK), + -1, FI_REMAPF_LOCKED); + } + mutex_unlock(&inode->i_mutex); + if (ret) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return ret; + } + } + + trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); + + ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos); + if (ret > 0) + XFS_STATS_ADD(xs_read_bytes, ret); + + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return ret; } STATIC ssize_t @@ -87,16 +317,44 @@ xfs_file_splice_read( struct file *infilp, loff_t *ppos, struct pipe_inode_info *pipe, - size_t len, + size_t count, unsigned int flags) { + struct xfs_inode *ip = XFS_I(infilp->f_mapping->host); + struct xfs_mount *mp = ip->i_mount; int ioflags = 0; + ssize_t ret; + + XFS_STATS_INC(xs_read_calls); if (infilp->f_mode & FMODE_NOCMTIME) ioflags |= IO_INVIS; - return xfs_splice_read(XFS_I(infilp->f_path.dentry->d_inode), - infilp, ppos, pipe, len, flags, ioflags); + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + xfs_ilock(ip, XFS_IOLOCK_SHARED); + + if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) { + int iolock = XFS_IOLOCK_SHARED; + int error; + + error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count, + FILP_DELAY_FLAG(infilp), &iolock); + if (error) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return -error; + } + } + + trace_xfs_file_splice_read(ip, count, *ppos, ioflags); + + ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); + if (ret > 0) + XFS_STATS_ADD(xs_read_bytes, ret); + + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return ret; } STATIC ssize_t @@ -104,16 +362,538 @@ xfs_file_splice_write( struct pipe_inode_info *pipe, struct file *outfilp, loff_t *ppos, - size_t len, + size_t count, unsigned int flags) { + struct inode *inode = outfilp->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fsize_t isize, new_size; int ioflags = 0; + ssize_t ret; + + XFS_STATS_INC(xs_write_calls); if (outfilp->f_mode & FMODE_NOCMTIME) ioflags |= IO_INVIS; - return xfs_splice_write(XFS_I(outfilp->f_path.dentry->d_inode), - pipe, outfilp, ppos, len, flags, ioflags); + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) { + int iolock = XFS_IOLOCK_EXCL; + int error; + + error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count, + FILP_DELAY_FLAG(outfilp), &iolock); + if (error) { + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return -error; + } + } + + new_size = *ppos + count; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (new_size > ip->i_size) + ip->i_new_size = new_size; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + trace_xfs_file_splice_write(ip, count, *ppos, ioflags); + + ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); + if (ret > 0) + XFS_STATS_ADD(xs_write_bytes, ret); + + isize = i_size_read(inode); + if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize)) + *ppos = isize; + + if (*ppos > ip->i_size) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (*ppos > ip->i_size) + ip->i_size = *ppos; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + if (ip->i_new_size) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + ip->i_new_size = 0; + if (ip->i_d.di_size > ip->i_size) + ip->i_d.di_size = ip->i_size; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return ret; +} + +/* + * This routine is called to handle zeroing any space in the last + * block of the file that is beyond the EOF. We do this since the + * size is being increased without writing anything to that block + * and we don't want anyone to read the garbage on the disk. + */ +STATIC int /* error (positive) */ +xfs_zero_last_block( + xfs_inode_t *ip, + xfs_fsize_t offset, + xfs_fsize_t isize) +{ + xfs_fileoff_t last_fsb; + xfs_mount_t *mp = ip->i_mount; + int nimaps; + int zero_offset; + int zero_len; + int error = 0; + xfs_bmbt_irec_t imap; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + zero_offset = XFS_B_FSB_OFFSET(mp, isize); + if (zero_offset == 0) { + /* + * There are no extra bytes in the last block on disk to + * zero, so return. + */ + return 0; + } + + last_fsb = XFS_B_TO_FSBT(mp, isize); + nimaps = 1; + error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap, + &nimaps, NULL, NULL); + if (error) { + return error; + } + ASSERT(nimaps > 0); + /* + * If the block underlying isize is just a hole, then there + * is nothing to zero. + */ + if (imap.br_startblock == HOLESTARTBLOCK) { + return 0; + } + /* + * Zero the part of the last block beyond the EOF, and write it + * out sync. We need to drop the ilock while we do this so we + * don't deadlock when the buffer cache calls back to us. + */ + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + zero_len = mp->m_sb.sb_blocksize - zero_offset; + if (isize + zero_len > offset) + zero_len = offset - isize; + error = xfs_iozero(ip, isize, zero_len); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + ASSERT(error >= 0); + return error; +} + +/* + * Zero any on disk space between the current EOF and the new, + * larger EOF. This handles the normal case of zeroing the remainder + * of the last block in the file and the unusual case of zeroing blocks + * out beyond the size of the file. This second case only happens + * with fixed size extents and when the system crashes before the inode + * size was updated but after blocks were allocated. If fill is set, + * then any holes in the range are filled and zeroed. If not, the holes + * are left alone as holes. + */ + +int /* error (positive) */ +xfs_zero_eof( + xfs_inode_t *ip, + xfs_off_t offset, /* starting I/O offset */ + xfs_fsize_t isize) /* current inode size */ +{ + xfs_mount_t *mp = ip->i_mount; + xfs_fileoff_t start_zero_fsb; + xfs_fileoff_t end_zero_fsb; + xfs_fileoff_t zero_count_fsb; + xfs_fileoff_t last_fsb; + xfs_fileoff_t zero_off; + xfs_fsize_t zero_len; + int nimaps; + int error = 0; + xfs_bmbt_irec_t imap; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); + ASSERT(offset > isize); + + /* + * First handle zeroing the block on which isize resides. + * We only zero a part of that block so it is handled specially. + */ + error = xfs_zero_last_block(ip, offset, isize); + if (error) { + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); + return error; + } + + /* + * Calculate the range between the new size and the old + * where blocks needing to be zeroed may exist. To get the + * block where the last byte in the file currently resides, + * we need to subtract one from the size and truncate back + * to a block boundary. We subtract 1 in case the size is + * exactly on a block boundary. + */ + last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1; + start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); + end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1); + ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb); + if (last_fsb == end_zero_fsb) { + /* + * The size was only incremented on its last block. + * We took care of that above, so just return. + */ + return 0; + } + + ASSERT(start_zero_fsb <= end_zero_fsb); + while (start_zero_fsb <= end_zero_fsb) { + nimaps = 1; + zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; + error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb, + 0, NULL, 0, &imap, &nimaps, NULL, NULL); + if (error) { + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); + return error; + } + ASSERT(nimaps > 0); + + if (imap.br_state == XFS_EXT_UNWRITTEN || + imap.br_startblock == HOLESTARTBLOCK) { + /* + * This loop handles initializing pages that were + * partially initialized by the code below this + * loop. It basically zeroes the part of the page + * that sits on a hole and sets the page as P_HOLE + * and calls remapf if it is a mapped file. + */ + start_zero_fsb = imap.br_startoff + imap.br_blockcount; + ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); + continue; + } + + /* + * There are blocks we need to zero. + * Drop the inode lock while we're doing the I/O. + * We'll still have the iolock to protect us. + */ + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + zero_off = XFS_FSB_TO_B(mp, start_zero_fsb); + zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount); + + if ((zero_off + zero_len) > offset) + zero_len = offset - zero_off; + + error = xfs_iozero(ip, zero_off, zero_len); + if (error) { + goto out_lock; + } + + start_zero_fsb = imap.br_startoff + imap.br_blockcount; + ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + } + + return 0; + +out_lock: + xfs_ilock(ip, XFS_ILOCK_EXCL); + ASSERT(error >= 0); + return error; +} + +STATIC ssize_t +xfs_file_aio_write( + struct kiocb *iocb, + const struct iovec *iovp, + unsigned long nr_segs, + loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + ssize_t ret = 0, error = 0; + int ioflags = 0; + xfs_fsize_t isize, new_size; + int iolock; + int eventsent = 0; + size_t ocount = 0, count; + int need_i_mutex; + + XFS_STATS_INC(xs_write_calls); + + BUG_ON(iocb->ki_pos != pos); + + if (unlikely(file->f_flags & O_DIRECT)) + ioflags |= IO_ISDIRECT; + if (file->f_mode & FMODE_NOCMTIME) + ioflags |= IO_INVIS; + + error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); + if (error) + return error; + + count = ocount; + if (count == 0) + return 0; + + xfs_wait_for_freeze(mp, SB_FREEZE_WRITE); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + +relock: + if (ioflags & IO_ISDIRECT) { + iolock = XFS_IOLOCK_SHARED; + need_i_mutex = 0; + } else { + iolock = XFS_IOLOCK_EXCL; + need_i_mutex = 1; + mutex_lock(&inode->i_mutex); + } + + xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); + +start: + error = -generic_write_checks(file, &pos, &count, + S_ISBLK(inode->i_mode)); + if (error) { + xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); + goto out_unlock_mutex; + } + + if ((DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && + !(ioflags & IO_INVIS) && !eventsent)) { + int dmflags = FILP_DELAY_FLAG(file); + + if (need_i_mutex) + dmflags |= DM_FLAGS_IMUX; + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + error = XFS_SEND_DATA(ip->i_mount, DM_EVENT_WRITE, ip, + pos, count, dmflags, &iolock); + if (error) { + goto out_unlock_internal; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + eventsent = 1; + + /* + * The iolock was dropped and reacquired in XFS_SEND_DATA + * so we have to recheck the size when appending. + * We will only "goto start;" once, since having sent the + * event prevents another call to XFS_SEND_DATA, which is + * what allows the size to change in the first place. + */ + if ((file->f_flags & O_APPEND) && pos != ip->i_size) + goto start; + } + + if (ioflags & IO_ISDIRECT) { + xfs_buftarg_t *target = + XFS_IS_REALTIME_INODE(ip) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + + if ((pos & target->bt_smask) || (count & target->bt_smask)) { + xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); + return XFS_ERROR(-EINVAL); + } + + if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) { + xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); + iolock = XFS_IOLOCK_EXCL; + need_i_mutex = 1; + mutex_lock(&inode->i_mutex); + xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); + goto start; + } + } + + new_size = pos + count; + if (new_size > ip->i_size) + ip->i_new_size = new_size; + + if (likely(!(ioflags & IO_INVIS))) + file_update_time(file); + + /* + * If the offset is beyond the size of the file, we have a couple + * of things to do. First, if there is already space allocated + * we need to either create holes or zero the disk or ... + * + * If there is a page where the previous size lands, we need + * to zero it out up to the new size. + */ + + if (pos > ip->i_size) { + error = xfs_zero_eof(ip, pos, ip->i_size); + if (error) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + goto out_unlock_internal; + } + } + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + /* + * If we're writing the file then make sure to clear the + * setuid and setgid bits if the process is not being run + * by root. This keeps people from modifying setuid and + * setgid binaries. + */ + error = -file_remove_suid(file); + if (unlikely(error)) + goto out_unlock_internal; + + /* We can write back this queue in page reclaim */ + current->backing_dev_info = mapping->backing_dev_info; + + if ((ioflags & IO_ISDIRECT)) { + if (mapping->nrpages) { + WARN_ON(need_i_mutex == 0); + error = xfs_flushinval_pages(ip, + (pos & PAGE_CACHE_MASK), + -1, FI_REMAPF_LOCKED); + if (error) + goto out_unlock_internal; + } + + if (need_i_mutex) { + /* demote the lock now the cached pages are gone */ + xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); + mutex_unlock(&inode->i_mutex); + + iolock = XFS_IOLOCK_SHARED; + need_i_mutex = 0; + } + + trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags); + ret = generic_file_direct_write(iocb, iovp, + &nr_segs, pos, &iocb->ki_pos, count, ocount); + + /* + * direct-io write to a hole: fall through to buffered I/O + * for completing the rest of the request. + */ + if (ret >= 0 && ret != count) { + XFS_STATS_ADD(xs_write_bytes, ret); + + pos += ret; + count -= ret; + + ioflags &= ~IO_ISDIRECT; + xfs_iunlock(ip, iolock); + goto relock; + } + } else { + int enospc = 0; + ssize_t ret2 = 0; + +write_retry: + trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags); + ret2 = generic_file_buffered_write(iocb, iovp, nr_segs, + pos, &iocb->ki_pos, count, ret); + /* + * if we just got an ENOSPC, flush the inode now we + * aren't holding any page locks and retry *once* + */ + if (ret2 == -ENOSPC && !enospc) { + error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE); + if (error) + goto out_unlock_internal; + enospc = 1; + goto write_retry; + } + ret = ret2; + } + + current->backing_dev_info = NULL; + + isize = i_size_read(inode); + if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize)) + iocb->ki_pos = isize; + + if (iocb->ki_pos > ip->i_size) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (iocb->ki_pos > ip->i_size) + ip->i_size = iocb->ki_pos; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + if (ret == -ENOSPC && + DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) { + xfs_iunlock(ip, iolock); + if (need_i_mutex) + mutex_unlock(&inode->i_mutex); + error = XFS_SEND_NAMESP(ip->i_mount, DM_EVENT_NOSPACE, ip, + DM_RIGHT_NULL, ip, DM_RIGHT_NULL, NULL, NULL, + 0, 0, 0); /* Delay flag intentionally unused */ + if (need_i_mutex) + mutex_lock(&inode->i_mutex); + xfs_ilock(ip, iolock); + if (error) + goto out_unlock_internal; + goto start; + } + + error = -ret; + if (ret <= 0) + goto out_unlock_internal; + + XFS_STATS_ADD(xs_write_bytes, ret); + + /* Handle various SYNC-type writes */ + if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { + loff_t end = pos + ret - 1; + int error2; + + xfs_iunlock(ip, iolock); + if (need_i_mutex) + mutex_unlock(&inode->i_mutex); + + error2 = filemap_write_and_wait_range(mapping, pos, end); + if (!error) + error = error2; + if (need_i_mutex) + mutex_lock(&inode->i_mutex); + xfs_ilock(ip, iolock); + + error2 = -xfs_file_fsync(file, + (file->f_flags & __O_SYNC) ? 0 : 1); + if (!error) + error = error2; + } + + out_unlock_internal: + if (ip->i_new_size) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + ip->i_new_size = 0; + /* + * If this was a direct or synchronous I/O that failed (such + * as ENOSPC) then part of the I/O may have been written to + * disk before the error occured. In this case the on-disk + * file size may have been adjusted beyond the in-memory file + * size and now needs to be truncated back. + */ + if (ip->i_d.di_size > ip->i_size) + ip->i_d.di_size = ip->i_size; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + xfs_iunlock(ip, iolock); + out_unlock_mutex: + if (need_i_mutex) + mutex_unlock(&inode->i_mutex); + return -error; } STATIC int @@ -160,28 +940,6 @@ xfs_file_release( return -xfs_release(XFS_I(inode)); } -/* - * We ignore the datasync flag here because a datasync is effectively - * identical to an fsync. That is, datasync implies that we need to write - * only the metadata needed to be able to access the data that is written - * if we crash after the call completes. Hence if we are writing beyond - * EOF we have to log the inode size change as well, which makes it a - * full fsync. If we don't write beyond EOF, the inode core will be - * clean in memory and so we don't need to log the inode, just like - * fsync. - */ -STATIC int -xfs_file_fsync( - struct file *file, - struct dentry *dentry, - int datasync) -{ - struct xfs_inode *ip = XFS_I(dentry->d_inode); - - xfs_iflags_clear(ip, XFS_ITRUNCATED); - return -xfs_fsync(ip); -} - STATIC int xfs_file_readdir( struct file *filp, @@ -203,9 +961,9 @@ xfs_file_readdir( * * Try to give it an estimate that's good enough, maybe at some * point we can change the ->readdir prototype to include the - * buffer size. + * buffer size. For now we use the current glibc buffer size. */ - bufsize = (size_t)min_t(loff_t, PAGE_SIZE, ip->i_d.di_size); + bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size); error = xfs_readdir(ip, dirent, bufsize, (xfs_off_t *)&filp->f_pos, filldir); diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c index 7501b85fd860..b6918d76bc7b 100644 --- a/fs/xfs/linux-2.6/xfs_fs_subr.c +++ b/fs/xfs/linux-2.6/xfs_fs_subr.c @@ -79,7 +79,7 @@ xfs_flush_pages( xfs_iflags_clear(ip, XFS_ITRUNCATED); ret = -filemap_fdatawrite(mapping); } - if (flags & XFS_B_ASYNC) + if (flags & XBF_ASYNC) return ret; ret2 = xfs_wait_on_pages(ip, first, last); if (!ret) diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index a034cf624437..e59a81062830 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -58,6 +58,7 @@ #include <linux/mount.h> #include <linux/namei.h> #include <linux/pagemap.h> +#include <linux/slab.h> #include <linux/exportfs.h> /* @@ -447,12 +448,12 @@ xfs_attrlist_by_handle( int xfs_attrmulti_attr_get( struct inode *inode, - char *name, - char __user *ubuf, + unsigned char *name, + unsigned char __user *ubuf, __uint32_t *len, __uint32_t flags) { - char *kbuf; + unsigned char *kbuf; int error = EFAULT; if (*len > XATTR_SIZE_MAX) @@ -476,12 +477,12 @@ xfs_attrmulti_attr_get( int xfs_attrmulti_attr_set( struct inode *inode, - char *name, - const char __user *ubuf, + unsigned char *name, + const unsigned char __user *ubuf, __uint32_t len, __uint32_t flags) { - char *kbuf; + unsigned char *kbuf; int error = EFAULT; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) @@ -501,7 +502,7 @@ xfs_attrmulti_attr_set( int xfs_attrmulti_attr_remove( struct inode *inode, - char *name, + unsigned char *name, __uint32_t flags) { if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) @@ -519,13 +520,17 @@ xfs_attrmulti_by_handle( xfs_fsop_attrmulti_handlereq_t am_hreq; struct dentry *dentry; unsigned int i, size; - char *attr_name; + unsigned char *attr_name; if (!capable(CAP_SYS_ADMIN)) return -XFS_ERROR(EPERM); if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t))) return -XFS_ERROR(EFAULT); + /* overflow check */ + if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t)) + return -E2BIG; + dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -547,7 +552,7 @@ xfs_attrmulti_by_handle( error = 0; for (i = 0; i < am_hreq.opcount; i++) { - ops[i].am_error = strncpy_from_user(attr_name, + ops[i].am_error = strncpy_from_user((char *)attr_name, ops[i].am_attrname, MAXNAMELEN); if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) error = -ERANGE; @@ -674,10 +679,9 @@ xfs_ioc_bulkstat( error = xfs_bulkstat_single(mp, &inlast, bulkreq.ubuffer, &done); else /* XFS_IOC_FSBULKSTAT */ - error = xfs_bulkstat(mp, &inlast, &count, - (bulkstat_one_pf)xfs_bulkstat_one, NULL, - sizeof(xfs_bstat_t), bulkreq.ubuffer, - BULKSTAT_FG_QUICK, &done); + error = xfs_bulkstat(mp, &inlast, &count, xfs_bulkstat_one, + sizeof(xfs_bstat_t), bulkreq.ubuffer, + &done); if (error) return -error; @@ -1431,6 +1435,9 @@ xfs_file_ioctl( if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (mp->m_flags & XFS_MOUNT_RDONLY) + return -XFS_ERROR(EROFS); + if (copy_from_user(&inout, arg, sizeof(inout))) return -XFS_ERROR(EFAULT); diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/linux-2.6/xfs_ioctl.h index 7bd7c6afc1eb..d56173b34a2a 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.h +++ b/fs/xfs/linux-2.6/xfs_ioctl.h @@ -45,23 +45,23 @@ xfs_readlink_by_handle( extern int xfs_attrmulti_attr_get( struct inode *inode, - char *name, - char __user *ubuf, + unsigned char *name, + unsigned char __user *ubuf, __uint32_t *len, __uint32_t flags); extern int - xfs_attrmulti_attr_set( +xfs_attrmulti_attr_set( struct inode *inode, - char *name, - const char __user *ubuf, + unsigned char *name, + const unsigned char __user *ubuf, __uint32_t len, __uint32_t flags); extern int xfs_attrmulti_attr_remove( struct inode *inode, - char *name, + unsigned char *name, __uint32_t flags); extern struct dentry * diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index be1527b1670c..52ed49e6465c 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -18,6 +18,7 @@ #include <linux/compat.h> #include <linux/ioctl.h> #include <linux/mount.h> +#include <linux/slab.h> #include <asm/uaccess.h> #include "xfs.h" #include "xfs_fs.h" @@ -236,15 +237,12 @@ xfs_bulkstat_one_compat( xfs_ino_t ino, /* inode number to get data for */ void __user *buffer, /* buffer to place output in */ int ubsize, /* size of buffer */ - void *private_data, /* my private data */ - xfs_daddr_t bno, /* starting bno of inode cluster */ int *ubused, /* bytes used by me */ - void *dibuff, /* on-disk inode buffer */ int *stat) /* BULKSTAT_RV_... */ { return xfs_bulkstat_one_int(mp, ino, buffer, ubsize, - xfs_bulkstat_one_fmt_compat, bno, - ubused, dibuff, stat); + xfs_bulkstat_one_fmt_compat, + ubused, stat); } /* copied from xfs_ioctl.c */ @@ -297,13 +295,11 @@ xfs_compat_ioc_bulkstat( int res; error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer, - sizeof(compat_xfs_bstat_t), - NULL, 0, NULL, NULL, &res); + sizeof(compat_xfs_bstat_t), 0, &res); } else if (cmd == XFS_IOC_FSBULKSTAT_32) { error = xfs_bulkstat(mp, &inlast, &count, - xfs_bulkstat_one_compat, NULL, - sizeof(compat_xfs_bstat_t), bulkreq.ubuffer, - BULKSTAT_FG_QUICK, &done); + xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t), + bulkreq.ubuffer, &done); } else error = XFS_ERROR(EINVAL); if (error) @@ -411,7 +407,7 @@ xfs_compat_attrmulti_by_handle( compat_xfs_fsop_attrmulti_handlereq_t am_hreq; struct dentry *dentry; unsigned int i, size; - char *attr_name; + unsigned char *attr_name; if (!capable(CAP_SYS_ADMIN)) return -XFS_ERROR(EPERM); @@ -419,6 +415,10 @@ xfs_compat_attrmulti_by_handle( sizeof(compat_xfs_fsop_attrmulti_handlereq_t))) return -XFS_ERROR(EFAULT); + /* overflow check */ + if (am_hreq.opcount >= INT_MAX / sizeof(compat_xfs_attr_multiop_t)) + return -E2BIG; + dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -440,7 +440,7 @@ xfs_compat_attrmulti_by_handle( error = 0; for (i = 0; i < am_hreq.opcount; i++) { - ops[i].am_error = strncpy_from_user(attr_name, + ops[i].am_error = strncpy_from_user((char *)attr_name, compat_ptr(ops[i].am_attrname), MAXNAMELEN); if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 225946012d0b..44f0b2de153e 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -56,6 +56,7 @@ #include <linux/security.h> #include <linux/falloc.h> #include <linux/fiemap.h> +#include <linux/slab.h> /* * Bring the timestamps in the XFS inode uptodate. @@ -91,6 +92,16 @@ xfs_mark_inode_dirty_sync( mark_inode_dirty_sync(inode); } +void +xfs_mark_inode_dirty( + xfs_inode_t *ip) +{ + struct inode *inode = VFS_I(ip); + + if (!(inode->i_state & (I_WILL_FREE|I_FREEING|I_CLEAR))) + mark_inode_dirty(inode); +} + /* * Change the requested timestamp in the given inode. * We don't lock across timestamp updates, and we don't log them but @@ -140,10 +151,10 @@ xfs_init_security( struct xfs_inode *ip = XFS_I(inode); size_t length; void *value; - char *name; + unsigned char *name; int error; - error = security_inode_init_security(inode, dir, &name, + error = security_inode_init_security(inode, dir, (char **)&name, &value, &length); if (error) { if (error == -EOPNOTSUPP) @@ -574,11 +585,20 @@ xfs_vn_fallocate( bf.l_len = len; xfs_ilock(ip, XFS_IOLOCK_EXCL); + + /* check the new inode size is valid before allocating */ + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; + error = inode_newsize_ok(inode, new_size); + if (error) + goto out_unlock; + } + error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf, 0, XFS_ATTR_NOLOCK); - if (!error && !(mode & FALLOC_FL_KEEP_SIZE) && - offset + len > i_size_read(inode)) - new_size = offset + len; + if (error) + goto out_unlock; /* Change file size if needed */ if (new_size) { @@ -589,6 +609,7 @@ xfs_vn_fallocate( error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); } +out_unlock: xfs_iunlock(ip, XFS_IOLOCK_EXCL); out_error: return error; @@ -662,7 +683,10 @@ xfs_vn_fiemap( bm.bmv_length = BTOBB(length); /* We add one because in getbmap world count includes the header */ - bm.bmv_count = fieinfo->fi_extents_max + 1; + bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM : + fieinfo->fi_extents_max + 1; + bm.bmv_count = min_t(__s32, bm.bmv_count, + (PAGE_SIZE * 16 / sizeof(struct getbmapx))); bm.bmv_iflags = BMV_IF_PREALLOC; if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) bm.bmv_iflags |= BMV_IF_ATTRFORK; diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 5af0c81ca1ae..facfb323a706 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -88,7 +88,6 @@ #include <xfs_super.h> #include <xfs_globals.h> #include <xfs_fs_subr.h> -#include <xfs_lrw.h> #include <xfs_buf.h> /* diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c deleted file mode 100644 index 0d32457abef1..000000000000 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ /dev/null @@ -1,852 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_alloc.h" -#include "xfs_dmapi.h" -#include "xfs_quota.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_bmap.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" -#include "xfs_rtalloc.h" -#include "xfs_error.h" -#include "xfs_itable.h" -#include "xfs_rw.h" -#include "xfs_attr.h" -#include "xfs_inode_item.h" -#include "xfs_buf_item.h" -#include "xfs_utils.h" -#include "xfs_iomap.h" -#include "xfs_vnodeops.h" -#include "xfs_trace.h" - -#include <linux/capability.h> -#include <linux/writeback.h> - - -/* - * xfs_iozero - * - * xfs_iozero clears the specified range of buffer supplied, - * and marks all the affected blocks as valid and modified. If - * an affected block is not allocated, it will be allocated. If - * an affected block is not completely overwritten, and is not - * valid before the operation, it will be read from disk before - * being partially zeroed. - */ -STATIC int -xfs_iozero( - struct xfs_inode *ip, /* inode */ - loff_t pos, /* offset in file */ - size_t count) /* size of data to zero */ -{ - struct page *page; - struct address_space *mapping; - int status; - - mapping = VFS_I(ip)->i_mapping; - do { - unsigned offset, bytes; - void *fsdata; - - offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ - bytes = PAGE_CACHE_SIZE - offset; - if (bytes > count) - bytes = count; - - status = pagecache_write_begin(NULL, mapping, pos, bytes, - AOP_FLAG_UNINTERRUPTIBLE, - &page, &fsdata); - if (status) - break; - - zero_user(page, offset, bytes); - - status = pagecache_write_end(NULL, mapping, pos, bytes, bytes, - page, fsdata); - WARN_ON(status <= 0); /* can't return less than zero! */ - pos += bytes; - count -= bytes; - status = 0; - } while (count); - - return (-status); -} - -ssize_t /* bytes read, or (-) error */ -xfs_read( - xfs_inode_t *ip, - struct kiocb *iocb, - const struct iovec *iovp, - unsigned int segs, - loff_t *offset, - int ioflags) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - xfs_mount_t *mp = ip->i_mount; - size_t size = 0; - ssize_t ret = 0; - xfs_fsize_t n; - unsigned long seg; - - - XFS_STATS_INC(xs_read_calls); - - /* START copy & waste from filemap.c */ - for (seg = 0; seg < segs; seg++) { - const struct iovec *iv = &iovp[seg]; - - /* - * If any segment has a negative length, or the cumulative - * length ever wraps negative then return -EINVAL. - */ - size += iv->iov_len; - if (unlikely((ssize_t)(size|iv->iov_len) < 0)) - return XFS_ERROR(-EINVAL); - } - /* END copy & waste from filemap.c */ - - if (unlikely(ioflags & IO_ISDIRECT)) { - xfs_buftarg_t *target = - XFS_IS_REALTIME_INODE(ip) ? - mp->m_rtdev_targp : mp->m_ddev_targp; - if ((*offset & target->bt_smask) || - (size & target->bt_smask)) { - if (*offset == ip->i_size) { - return (0); - } - return -XFS_ERROR(EINVAL); - } - } - - n = XFS_MAXIOFFSET(mp) - *offset; - if ((n <= 0) || (size == 0)) - return 0; - - if (n < size) - size = n; - - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - - if (unlikely(ioflags & IO_ISDIRECT)) - mutex_lock(&inode->i_mutex); - xfs_ilock(ip, XFS_IOLOCK_SHARED); - - if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) { - int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags); - int iolock = XFS_IOLOCK_SHARED; - - ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *offset, size, - dmflags, &iolock); - if (ret) { - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - if (unlikely(ioflags & IO_ISDIRECT)) - mutex_unlock(&inode->i_mutex); - return ret; - } - } - - if (unlikely(ioflags & IO_ISDIRECT)) { - if (inode->i_mapping->nrpages) - ret = -xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK), - -1, FI_REMAPF_LOCKED); - mutex_unlock(&inode->i_mutex); - if (ret) { - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return ret; - } - } - - trace_xfs_file_read(ip, size, *offset, ioflags); - - iocb->ki_pos = *offset; - ret = generic_file_aio_read(iocb, iovp, segs, *offset); - if (ret > 0) - XFS_STATS_ADD(xs_read_bytes, ret); - - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return ret; -} - -ssize_t -xfs_splice_read( - xfs_inode_t *ip, - struct file *infilp, - loff_t *ppos, - struct pipe_inode_info *pipe, - size_t count, - int flags, - int ioflags) -{ - xfs_mount_t *mp = ip->i_mount; - ssize_t ret; - - XFS_STATS_INC(xs_read_calls); - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return -EIO; - - xfs_ilock(ip, XFS_IOLOCK_SHARED); - - if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) { - int iolock = XFS_IOLOCK_SHARED; - int error; - - error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count, - FILP_DELAY_FLAG(infilp), &iolock); - if (error) { - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return -error; - } - } - - trace_xfs_file_splice_read(ip, count, *ppos, ioflags); - - ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); - if (ret > 0) - XFS_STATS_ADD(xs_read_bytes, ret); - - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return ret; -} - -ssize_t -xfs_splice_write( - xfs_inode_t *ip, - struct pipe_inode_info *pipe, - struct file *outfilp, - loff_t *ppos, - size_t count, - int flags, - int ioflags) -{ - xfs_mount_t *mp = ip->i_mount; - ssize_t ret; - struct inode *inode = outfilp->f_mapping->host; - xfs_fsize_t isize, new_size; - - XFS_STATS_INC(xs_write_calls); - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return -EIO; - - xfs_ilock(ip, XFS_IOLOCK_EXCL); - - if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) { - int iolock = XFS_IOLOCK_EXCL; - int error; - - error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count, - FILP_DELAY_FLAG(outfilp), &iolock); - if (error) { - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return -error; - } - } - - new_size = *ppos + count; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (new_size > ip->i_size) - ip->i_new_size = new_size; - xfs_iunlock(ip, XFS_ILOCK_EXCL); - - trace_xfs_file_splice_write(ip, count, *ppos, ioflags); - - ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); - if (ret > 0) - XFS_STATS_ADD(xs_write_bytes, ret); - - isize = i_size_read(inode); - if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize)) - *ppos = isize; - - if (*ppos > ip->i_size) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (*ppos > ip->i_size) - ip->i_size = *ppos; - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - - if (ip->i_new_size) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - ip->i_new_size = 0; - if (ip->i_d.di_size > ip->i_size) - ip->i_d.di_size = ip->i_size; - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return ret; -} - -/* - * This routine is called to handle zeroing any space in the last - * block of the file that is beyond the EOF. We do this since the - * size is being increased without writing anything to that block - * and we don't want anyone to read the garbage on the disk. - */ -STATIC int /* error (positive) */ -xfs_zero_last_block( - xfs_inode_t *ip, - xfs_fsize_t offset, - xfs_fsize_t isize) -{ - xfs_fileoff_t last_fsb; - xfs_mount_t *mp = ip->i_mount; - int nimaps; - int zero_offset; - int zero_len; - int error = 0; - xfs_bmbt_irec_t imap; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - - zero_offset = XFS_B_FSB_OFFSET(mp, isize); - if (zero_offset == 0) { - /* - * There are no extra bytes in the last block on disk to - * zero, so return. - */ - return 0; - } - - last_fsb = XFS_B_TO_FSBT(mp, isize); - nimaps = 1; - error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap, - &nimaps, NULL, NULL); - if (error) { - return error; - } - ASSERT(nimaps > 0); - /* - * If the block underlying isize is just a hole, then there - * is nothing to zero. - */ - if (imap.br_startblock == HOLESTARTBLOCK) { - return 0; - } - /* - * Zero the part of the last block beyond the EOF, and write it - * out sync. We need to drop the ilock while we do this so we - * don't deadlock when the buffer cache calls back to us. - */ - xfs_iunlock(ip, XFS_ILOCK_EXCL); - - zero_len = mp->m_sb.sb_blocksize - zero_offset; - if (isize + zero_len > offset) - zero_len = offset - isize; - error = xfs_iozero(ip, isize, zero_len); - - xfs_ilock(ip, XFS_ILOCK_EXCL); - ASSERT(error >= 0); - return error; -} - -/* - * Zero any on disk space between the current EOF and the new, - * larger EOF. This handles the normal case of zeroing the remainder - * of the last block in the file and the unusual case of zeroing blocks - * out beyond the size of the file. This second case only happens - * with fixed size extents and when the system crashes before the inode - * size was updated but after blocks were allocated. If fill is set, - * then any holes in the range are filled and zeroed. If not, the holes - * are left alone as holes. - */ - -int /* error (positive) */ -xfs_zero_eof( - xfs_inode_t *ip, - xfs_off_t offset, /* starting I/O offset */ - xfs_fsize_t isize) /* current inode size */ -{ - xfs_mount_t *mp = ip->i_mount; - xfs_fileoff_t start_zero_fsb; - xfs_fileoff_t end_zero_fsb; - xfs_fileoff_t zero_count_fsb; - xfs_fileoff_t last_fsb; - xfs_fileoff_t zero_off; - xfs_fsize_t zero_len; - int nimaps; - int error = 0; - xfs_bmbt_irec_t imap; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); - ASSERT(offset > isize); - - /* - * First handle zeroing the block on which isize resides. - * We only zero a part of that block so it is handled specially. - */ - error = xfs_zero_last_block(ip, offset, isize); - if (error) { - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); - return error; - } - - /* - * Calculate the range between the new size and the old - * where blocks needing to be zeroed may exist. To get the - * block where the last byte in the file currently resides, - * we need to subtract one from the size and truncate back - * to a block boundary. We subtract 1 in case the size is - * exactly on a block boundary. - */ - last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1; - start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); - end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1); - ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb); - if (last_fsb == end_zero_fsb) { - /* - * The size was only incremented on its last block. - * We took care of that above, so just return. - */ - return 0; - } - - ASSERT(start_zero_fsb <= end_zero_fsb); - while (start_zero_fsb <= end_zero_fsb) { - nimaps = 1; - zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; - error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb, - 0, NULL, 0, &imap, &nimaps, NULL, NULL); - if (error) { - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); - return error; - } - ASSERT(nimaps > 0); - - if (imap.br_state == XFS_EXT_UNWRITTEN || - imap.br_startblock == HOLESTARTBLOCK) { - /* - * This loop handles initializing pages that were - * partially initialized by the code below this - * loop. It basically zeroes the part of the page - * that sits on a hole and sets the page as P_HOLE - * and calls remapf if it is a mapped file. - */ - start_zero_fsb = imap.br_startoff + imap.br_blockcount; - ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); - continue; - } - - /* - * There are blocks we need to zero. - * Drop the inode lock while we're doing the I/O. - * We'll still have the iolock to protect us. - */ - xfs_iunlock(ip, XFS_ILOCK_EXCL); - - zero_off = XFS_FSB_TO_B(mp, start_zero_fsb); - zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount); - - if ((zero_off + zero_len) > offset) - zero_len = offset - zero_off; - - error = xfs_iozero(ip, zero_off, zero_len); - if (error) { - goto out_lock; - } - - start_zero_fsb = imap.br_startoff + imap.br_blockcount; - ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); - - xfs_ilock(ip, XFS_ILOCK_EXCL); - } - - return 0; - -out_lock: - xfs_ilock(ip, XFS_ILOCK_EXCL); - ASSERT(error >= 0); - return error; -} - -ssize_t /* bytes written, or (-) error */ -xfs_write( - struct xfs_inode *xip, - struct kiocb *iocb, - const struct iovec *iovp, - unsigned int nsegs, - loff_t *offset, - int ioflags) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - unsigned long segs = nsegs; - xfs_mount_t *mp; - ssize_t ret = 0, error = 0; - xfs_fsize_t isize, new_size; - int iolock; - int eventsent = 0; - size_t ocount = 0, count; - loff_t pos; - int need_i_mutex; - - XFS_STATS_INC(xs_write_calls); - - error = generic_segment_checks(iovp, &segs, &ocount, VERIFY_READ); - if (error) - return error; - - count = ocount; - pos = *offset; - - if (count == 0) - return 0; - - mp = xip->i_mount; - - xfs_wait_for_freeze(mp, SB_FREEZE_WRITE); - - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - -relock: - if (ioflags & IO_ISDIRECT) { - iolock = XFS_IOLOCK_SHARED; - need_i_mutex = 0; - } else { - iolock = XFS_IOLOCK_EXCL; - need_i_mutex = 1; - mutex_lock(&inode->i_mutex); - } - - xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); - -start: - error = -generic_write_checks(file, &pos, &count, - S_ISBLK(inode->i_mode)); - if (error) { - xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); - goto out_unlock_mutex; - } - - if ((DM_EVENT_ENABLED(xip, DM_EVENT_WRITE) && - !(ioflags & IO_INVIS) && !eventsent)) { - int dmflags = FILP_DELAY_FLAG(file); - - if (need_i_mutex) - dmflags |= DM_FLAGS_IMUX; - - xfs_iunlock(xip, XFS_ILOCK_EXCL); - error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, xip, - pos, count, dmflags, &iolock); - if (error) { - goto out_unlock_internal; - } - xfs_ilock(xip, XFS_ILOCK_EXCL); - eventsent = 1; - - /* - * The iolock was dropped and reacquired in XFS_SEND_DATA - * so we have to recheck the size when appending. - * We will only "goto start;" once, since having sent the - * event prevents another call to XFS_SEND_DATA, which is - * what allows the size to change in the first place. - */ - if ((file->f_flags & O_APPEND) && pos != xip->i_size) - goto start; - } - - if (ioflags & IO_ISDIRECT) { - xfs_buftarg_t *target = - XFS_IS_REALTIME_INODE(xip) ? - mp->m_rtdev_targp : mp->m_ddev_targp; - - if ((pos & target->bt_smask) || (count & target->bt_smask)) { - xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); - return XFS_ERROR(-EINVAL); - } - - if (!need_i_mutex && (mapping->nrpages || pos > xip->i_size)) { - xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); - iolock = XFS_IOLOCK_EXCL; - need_i_mutex = 1; - mutex_lock(&inode->i_mutex); - xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); - goto start; - } - } - - new_size = pos + count; - if (new_size > xip->i_size) - xip->i_new_size = new_size; - - if (likely(!(ioflags & IO_INVIS))) - file_update_time(file); - - /* - * If the offset is beyond the size of the file, we have a couple - * of things to do. First, if there is already space allocated - * we need to either create holes or zero the disk or ... - * - * If there is a page where the previous size lands, we need - * to zero it out up to the new size. - */ - - if (pos > xip->i_size) { - error = xfs_zero_eof(xip, pos, xip->i_size); - if (error) { - xfs_iunlock(xip, XFS_ILOCK_EXCL); - goto out_unlock_internal; - } - } - xfs_iunlock(xip, XFS_ILOCK_EXCL); - - /* - * If we're writing the file then make sure to clear the - * setuid and setgid bits if the process is not being run - * by root. This keeps people from modifying setuid and - * setgid binaries. - */ - - if (((xip->i_d.di_mode & S_ISUID) || - ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) == - (S_ISGID | S_IXGRP))) && - !capable(CAP_FSETID)) { - error = xfs_write_clear_setuid(xip); - if (likely(!error)) - error = -file_remove_suid(file); - if (unlikely(error)) { - goto out_unlock_internal; - } - } - - /* We can write back this queue in page reclaim */ - current->backing_dev_info = mapping->backing_dev_info; - - if ((ioflags & IO_ISDIRECT)) { - if (mapping->nrpages) { - WARN_ON(need_i_mutex == 0); - error = xfs_flushinval_pages(xip, - (pos & PAGE_CACHE_MASK), - -1, FI_REMAPF_LOCKED); - if (error) - goto out_unlock_internal; - } - - if (need_i_mutex) { - /* demote the lock now the cached pages are gone */ - xfs_ilock_demote(xip, XFS_IOLOCK_EXCL); - mutex_unlock(&inode->i_mutex); - - iolock = XFS_IOLOCK_SHARED; - need_i_mutex = 0; - } - - trace_xfs_file_direct_write(xip, count, *offset, ioflags); - ret = generic_file_direct_write(iocb, iovp, - &segs, pos, offset, count, ocount); - - /* - * direct-io write to a hole: fall through to buffered I/O - * for completing the rest of the request. - */ - if (ret >= 0 && ret != count) { - XFS_STATS_ADD(xs_write_bytes, ret); - - pos += ret; - count -= ret; - - ioflags &= ~IO_ISDIRECT; - xfs_iunlock(xip, iolock); - goto relock; - } - } else { - int enospc = 0; - ssize_t ret2 = 0; - -write_retry: - trace_xfs_file_buffered_write(xip, count, *offset, ioflags); - ret2 = generic_file_buffered_write(iocb, iovp, segs, - pos, offset, count, ret); - /* - * if we just got an ENOSPC, flush the inode now we - * aren't holding any page locks and retry *once* - */ - if (ret2 == -ENOSPC && !enospc) { - error = xfs_flush_pages(xip, 0, -1, 0, FI_NONE); - if (error) - goto out_unlock_internal; - enospc = 1; - goto write_retry; - } - ret = ret2; - } - - current->backing_dev_info = NULL; - - isize = i_size_read(inode); - if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize)) - *offset = isize; - - if (*offset > xip->i_size) { - xfs_ilock(xip, XFS_ILOCK_EXCL); - if (*offset > xip->i_size) - xip->i_size = *offset; - xfs_iunlock(xip, XFS_ILOCK_EXCL); - } - - if (ret == -ENOSPC && - DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) { - xfs_iunlock(xip, iolock); - if (need_i_mutex) - mutex_unlock(&inode->i_mutex); - error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, xip, - DM_RIGHT_NULL, xip, DM_RIGHT_NULL, NULL, NULL, - 0, 0, 0); /* Delay flag intentionally unused */ - if (need_i_mutex) - mutex_lock(&inode->i_mutex); - xfs_ilock(xip, iolock); - if (error) - goto out_unlock_internal; - goto start; - } - - error = -ret; - if (ret <= 0) - goto out_unlock_internal; - - XFS_STATS_ADD(xs_write_bytes, ret); - - /* Handle various SYNC-type writes */ - if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { - loff_t end = pos + ret - 1; - int error2; - - xfs_iunlock(xip, iolock); - if (need_i_mutex) - mutex_unlock(&inode->i_mutex); - - error2 = filemap_write_and_wait_range(mapping, pos, end); - if (!error) - error = error2; - if (need_i_mutex) - mutex_lock(&inode->i_mutex); - xfs_ilock(xip, iolock); - - error2 = xfs_fsync(xip); - if (!error) - error = error2; - } - - out_unlock_internal: - if (xip->i_new_size) { - xfs_ilock(xip, XFS_ILOCK_EXCL); - xip->i_new_size = 0; - /* - * If this was a direct or synchronous I/O that failed (such - * as ENOSPC) then part of the I/O may have been written to - * disk before the error occured. In this case the on-disk - * file size may have been adjusted beyond the in-memory file - * size and now needs to be truncated back. - */ - if (xip->i_d.di_size > xip->i_size) - xip->i_d.di_size = xip->i_size; - xfs_iunlock(xip, XFS_ILOCK_EXCL); - } - xfs_iunlock(xip, iolock); - out_unlock_mutex: - if (need_i_mutex) - mutex_unlock(&inode->i_mutex); - return -error; -} - -/* - * All xfs metadata buffers except log state machine buffers - * get this attached as their b_bdstrat callback function. - * This is so that we can catch a buffer - * after prematurely unpinning it to forcibly shutdown the filesystem. - */ -int -xfs_bdstrat_cb(struct xfs_buf *bp) -{ - if (XFS_FORCED_SHUTDOWN(bp->b_mount)) { - trace_xfs_bdstrat_shut(bp, _RET_IP_); - /* - * Metadata write that didn't get logged but - * written delayed anyway. These aren't associated - * with a transaction, and can be ignored. - */ - if (XFS_BUF_IODONE_FUNC(bp) == NULL && - (XFS_BUF_ISREAD(bp)) == 0) - return (xfs_bioerror_relse(bp)); - else - return (xfs_bioerror(bp)); - } - - xfs_buf_iorequest(bp); - return 0; -} - -/* - * Wrapper around bdstrat so that we can stop data from going to disk in case - * we are shutting down the filesystem. Typically user data goes thru this - * path; one of the exceptions is the superblock. - */ -void -xfsbdstrat( - struct xfs_mount *mp, - struct xfs_buf *bp) -{ - ASSERT(mp); - if (!XFS_FORCED_SHUTDOWN(mp)) { - xfs_buf_iorequest(bp); - return; - } - - trace_xfs_bdstrat_shut(bp, _RET_IP_); - xfs_bioerror_relse(bp); -} - -/* - * If the underlying (data/log/rt) device is readonly, there are some - * operations that cannot proceed. - */ -int -xfs_dev_is_read_only( - xfs_mount_t *mp, - char *message) -{ - if (xfs_readonly_buftarg(mp->m_ddev_targp) || - xfs_readonly_buftarg(mp->m_logdev_targp) || - (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) { - cmn_err(CE_NOTE, - "XFS: %s required on read-only device.", message); - cmn_err(CE_NOTE, - "XFS: write access unavailable, cannot proceed."); - return EROFS; - } - return 0; -} diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h deleted file mode 100644 index d1f7789c7ffb..000000000000 --- a/fs/xfs/linux-2.6/xfs_lrw.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_LRW_H__ -#define __XFS_LRW_H__ - -struct xfs_mount; -struct xfs_inode; -struct xfs_buf; - -/* errors from xfsbdstrat() must be extracted from the buffer */ -extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *); -extern int xfs_bdstrat_cb(struct xfs_buf *); -extern int xfs_dev_is_read_only(struct xfs_mount *, char *); - -extern int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); - -#endif /* __XFS_LRW_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c index 3d4a0c84d634..067cafbfc635 100644 --- a/fs/xfs/linux-2.6/xfs_quotaops.c +++ b/fs/xfs/linux-2.6/xfs_quotaops.c @@ -19,10 +19,10 @@ #include "xfs_dmapi.h" #include "xfs_sb.h" #include "xfs_inum.h" +#include "xfs_log.h" #include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_quota.h" -#include "xfs_log.h" #include "xfs_trans.h" #include "xfs_bmap_btree.h" #include "xfs_inode.h" @@ -44,20 +44,6 @@ xfs_quota_type(int type) } STATIC int -xfs_fs_quota_sync( - struct super_block *sb, - int type) -{ - struct xfs_mount *mp = XFS_M(sb); - - if (sb->s_flags & MS_RDONLY) - return -EROFS; - if (!XFS_IS_QUOTA_RUNNING(mp)) - return -ENOSYS; - return -xfs_sync_data(mp, 0); -} - -STATIC int xfs_fs_get_xstate( struct super_block *sb, struct fs_quota_stat *fqs) @@ -82,8 +68,6 @@ xfs_fs_set_xstate( return -EROFS; if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp)) return -ENOSYS; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; if (uflags & XFS_QUOTA_UDQ_ACCT) flags |= XFS_UQUOTA_ACCT; @@ -113,7 +97,7 @@ xfs_fs_set_xstate( } STATIC int -xfs_fs_get_xquota( +xfs_fs_get_dqblk( struct super_block *sb, int type, qid_t id, @@ -130,7 +114,7 @@ xfs_fs_get_xquota( } STATIC int -xfs_fs_set_xquota( +xfs_fs_set_dqblk( struct super_block *sb, int type, qid_t id, @@ -144,16 +128,13 @@ xfs_fs_set_xquota( return -ENOSYS; if (!XFS_IS_QUOTA_ON(mp)) return -ESRCH; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq); } const struct quotactl_ops xfs_quotactl_operations = { - .quota_sync = xfs_fs_quota_sync, .get_xstate = xfs_fs_get_xstate, .set_xstate = xfs_fs_set_xstate, - .get_xquota = xfs_fs_get_xquota, - .set_xquota = xfs_fs_set_xquota, + .get_dqblk = xfs_fs_get_dqblk, + .set_dqblk = xfs_fs_set_dqblk, }; diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 77414db10dc2..80938c736c27 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -61,6 +61,7 @@ #include <linux/namei.h> #include <linux/init.h> +#include <linux/slab.h> #include <linux/mount.h> #include <linux/mempool.h> #include <linux/writeback.h> @@ -118,6 +119,8 @@ mempool_t *xfs_ioend_pool; #define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */ #define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */ #define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */ +#define MNTOPT_DELAYLOG "delaylog" /* Delayed loging enabled */ +#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed loging disabled */ /* * Table driven mount option parser. @@ -373,6 +376,13 @@ xfs_parseargs( mp->m_flags |= XFS_MOUNT_DMAPI; } else if (!strcmp(this_char, MNTOPT_DMI)) { mp->m_flags |= XFS_MOUNT_DMAPI; + } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { + mp->m_flags |= XFS_MOUNT_DELAYLOG; + cmn_err(CE_WARN, + "Enabling EXPERIMENTAL delayed logging feature " + "- use at your own risk.\n"); + } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { + mp->m_flags &= ~XFS_MOUNT_DELAYLOG; } else if (!strcmp(this_char, "ihashsize")) { cmn_err(CE_WARN, "XFS: ihashsize no longer used, option is deprecated."); @@ -534,6 +544,7 @@ xfs_showargs( { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, { XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI }, { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, + { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG }, { 0, NULL } }; static struct proc_xfs_info xfs_info_unset[] = { @@ -724,7 +735,8 @@ void xfs_blkdev_issue_flush( xfs_buftarg_t *buftarg) { - blkdev_issue_flush(buftarg->bt_bdev, NULL); + blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL, + BLKDEV_IFL_WAIT); } STATIC void @@ -788,18 +800,18 @@ xfs_open_devices( * Setup xfs_mount buffer target pointers */ error = ENOMEM; - mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0); + mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0, mp->m_fsname); if (!mp->m_ddev_targp) goto out_close_rtdev; if (rtdev) { - mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1); + mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1, mp->m_fsname); if (!mp->m_rtdev_targp) goto out_free_ddev_targ; } if (logdev && logdev != ddev) { - mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1); + mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1, mp->m_fsname); if (!mp->m_logdev_targp) goto out_free_rtdev_targ; } else { @@ -877,12 +889,11 @@ xfsaild( { struct xfs_ail *ailp = data; xfs_lsn_t last_pushed_lsn = 0; - long tout = 0; + long tout = 0; /* milliseconds */ while (!kthread_should_stop()) { - if (tout) - schedule_timeout_interruptible(msecs_to_jiffies(tout)); - tout = 1000; + schedule_timeout_interruptible(tout ? + msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT); /* swsusp */ try_to_freeze(); @@ -902,7 +913,8 @@ xfsaild_start( struct xfs_ail *ailp) { ailp->xa_target = 0; - ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild"); + ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s", + ailp->xa_mount->m_fsname); if (IS_ERR(ailp->xa_task)) return -PTR_ERR(ailp->xa_task); return 0; @@ -1022,59 +1034,109 @@ xfs_fs_dirty_inode( XFS_I(inode)->i_update_core = 1; } -/* - * Attempt to flush the inode, this will actually fail - * if the inode is pinned, but we dirty the inode again - * at the point when it is unpinned after a log write, - * since this is when the inode itself becomes flushable. - */ +STATIC int +xfs_log_inode( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + xfs_iunlock(ip, XFS_ILOCK_SHARED); + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); + error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); + + if (error) { + xfs_trans_cancel(tp, 0); + /* we need to return with the lock hold shared */ + xfs_ilock(ip, XFS_ILOCK_SHARED); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + /* + * Note - it's possible that we might have pushed ourselves out of the + * way during trans_reserve which would flush the inode. But there's + * no guarantee that the inode buffer has actually gone out yet (it's + * delwri). Plus the buffer could be pinned anyway if it's part of + * an inode in another recent transaction. So we play it safe and + * fire off the transaction anyway. + */ + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ihold(tp, ip); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0); + xfs_ilock_demote(ip, XFS_ILOCK_EXCL); + + return error; +} + STATIC int xfs_fs_write_inode( struct inode *inode, - int sync) + struct writeback_control *wbc) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - int error = 0; + int error = EAGAIN; xfs_itrace_entry(ip); if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - if (sync) { - error = xfs_wait_on_pages(ip, 0, -1); - if (error) + if (wbc->sync_mode == WB_SYNC_ALL) { + /* + * Make sure the inode has hit stable storage. By using the + * log and the fsync transactions we reduce the IOs we have + * to do here from two (log and inode) to just the log. + * + * Note: We still need to do a delwri write of the inode after + * this to flush it to the backing buffer so that bulkstat + * works properly if this is the first time the inode has been + * written. Because we hold the ilock atomically over the + * transaction commit and the inode flush we are guaranteed + * that the inode is not pinned when it returns. If the flush + * lock is already held, then the inode has already been + * flushed once and we don't need to flush it again. Hence + * the code will only flush the inode if it isn't already + * being flushed. + */ + xfs_ioend_wait(ip); + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (ip->i_update_core) { + error = xfs_log_inode(ip); + if (error) + goto out_unlock; + } + } else { + /* + * We make this non-blocking if the inode is contended, return + * EAGAIN to indicate to the caller that they did not succeed. + * This prevents the flush path from blocking on inodes inside + * another operation right now, they get caught later by xfs_sync. + */ + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) goto out; } - /* - * Bypass inodes which have already been cleaned by - * the inode flush clustering code inside xfs_iflush - */ - if (xfs_inode_clean(ip)) - goto out; + if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) + goto out_unlock; /* - * We make this non-blocking if the inode is contended, return - * EAGAIN to indicate to the caller that they did not succeed. - * This prevents the flush path from blocking on inodes inside - * another operation right now, they get caught later by xfs_sync. + * Now we have the flush lock and the inode is not pinned, we can check + * if the inode is really clean as we know that there are no pending + * transaction completions, it is not waiting on the delayed write + * queue and there is no IO in progress. */ - if (sync) { - xfs_ilock(ip, XFS_ILOCK_SHARED); - xfs_iflock(ip); - - error = xfs_iflush(ip, XFS_IFLUSH_SYNC); - } else { - error = EAGAIN; - if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) - goto out; - if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) - goto out_unlock; - - error = xfs_iflush(ip, XFS_IFLUSH_ASYNC_NOBLOCK); + if (xfs_inode_clean(ip)) { + xfs_ifunlock(ip); + error = 0; + goto out_unlock; } + error = xfs_iflush(ip, 0); out_unlock: xfs_iunlock(ip, XFS_ILOCK_SHARED); @@ -1160,6 +1222,7 @@ xfs_fs_put_super( xfs_unmountfs(mp); xfs_freesb(mp); + xfs_inode_shrinker_unregister(mp); xfs_icsb_destroy_counters(mp); xfs_close_devices(mp); xfs_dmops_put(mp); @@ -1257,6 +1320,29 @@ xfs_fs_statfs( return 0; } +STATIC void +xfs_save_resvblks(struct xfs_mount *mp) +{ + __uint64_t resblks = 0; + + mp->m_resblks_save = mp->m_resblks; + xfs_reserve_blocks(mp, &resblks, NULL); +} + +STATIC void +xfs_restore_resvblks(struct xfs_mount *mp) +{ + __uint64_t resblks; + + if (mp->m_resblks_save) { + resblks = mp->m_resblks_save; + mp->m_resblks_save = 0; + } else + resblks = xfs_default_resblks(mp); + + xfs_reserve_blocks(mp, &resblks, NULL); +} + STATIC int xfs_fs_remount( struct super_block *sb, @@ -1336,11 +1422,27 @@ xfs_fs_remount( } mp->m_update_flags = 0; } + + /* + * Fill out the reserve pool if it is empty. Use the stashed + * value if it is non-zero, otherwise go with the default. + */ + xfs_restore_resvblks(mp); } /* rw -> ro */ if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { + /* + * After we have synced the data but before we sync the + * metadata, we need to free up the reserve block pool so that + * the used block count in the superblock on disk is correct at + * the end of the remount. Stash the current reserve pool size + * so that if we get remounted rw, we can return it to the same + * size. + */ + xfs_quiesce_data(mp); + xfs_save_resvblks(mp); xfs_quiesce_attr(mp); mp->m_flags |= XFS_MOUNT_RDONLY; } @@ -1359,11 +1461,22 @@ xfs_fs_freeze( { struct xfs_mount *mp = XFS_M(sb); + xfs_save_resvblks(mp); xfs_quiesce_attr(mp); return -xfs_fs_log_dummy(mp); } STATIC int +xfs_fs_unfreeze( + struct super_block *sb) +{ + struct xfs_mount *mp = XFS_M(sb); + + xfs_restore_resvblks(mp); + return 0; +} + +STATIC int xfs_fs_show_options( struct seq_file *m, struct vfsmount *mnt) @@ -1523,6 +1636,8 @@ xfs_fs_fill_super( if (error) goto fail_vnrele; + xfs_inode_shrinker_register(mp); + kfree(mtpt); return 0; @@ -1585,6 +1700,7 @@ static const struct super_operations xfs_super_operations = { .put_super = xfs_fs_put_super, .sync_fs = xfs_fs_sync_fs, .freeze_fs = xfs_fs_freeze, + .unfreeze_fs = xfs_fs_unfreeze, .statfs = xfs_fs_statfs, .remount_fs = xfs_fs_remount, .show_options = xfs_fs_show_options, @@ -1649,7 +1765,7 @@ xfs_init_zones(void) * but it is much faster. */ xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) + - (((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) / + (((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD) * sizeof(int))), "xfs_buf_item"); if (!xfs_buf_item_zone) goto out_destroy_trans_zone; diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h index 233d4b9881b1..519618e9279e 100644 --- a/fs/xfs/linux-2.6/xfs_super.h +++ b/fs/xfs/linux-2.6/xfs_super.h @@ -85,7 +85,7 @@ extern __uint64_t xfs_max_file_offset(unsigned int); extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); extern const struct export_operations xfs_export_operations; -extern struct xattr_handler *xfs_xattr_handlers[]; +extern const struct xattr_handler *xfs_xattr_handlers[]; extern const struct quotactl_ops xfs_quotactl_operations; #define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 1f5e4bb5e970..a51a07c3a70c 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -90,14 +90,14 @@ xfs_inode_ag_lookup( STATIC int xfs_inode_ag_walk( struct xfs_mount *mp, - xfs_agnumber_t ag, + struct xfs_perag *pag, int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), int flags, int tag, - int exclusive) + int exclusive, + int *nr_to_scan) { - struct xfs_perag *pag = &mp->m_perag[ag]; uint32_t first_index; int last_error = 0; int skipped; @@ -135,17 +135,50 @@ restart: if (error == EFSCORRUPTED) break; - } while (1); + } while ((*nr_to_scan)--); if (skipped) { delay(1); goto restart; } - - xfs_put_perag(mp, pag); return last_error; } +/* + * Select the next per-ag structure to iterate during the walk. The reclaim + * walk is optimised only to walk AGs with reclaimable inodes in them. + */ +static struct xfs_perag * +xfs_inode_ag_iter_next_pag( + struct xfs_mount *mp, + xfs_agnumber_t *first, + int tag) +{ + struct xfs_perag *pag = NULL; + + if (tag == XFS_ICI_RECLAIM_TAG) { + int found; + int ref; + + spin_lock(&mp->m_perag_lock); + found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, + (void **)&pag, *first, 1, tag); + if (found <= 0) { + spin_unlock(&mp->m_perag_lock); + return NULL; + } + *first = pag->pag_agno + 1; + /* open coded pag reference increment */ + ref = atomic_inc_return(&pag->pag_ref); + spin_unlock(&mp->m_perag_lock); + trace_xfs_perag_get_reclaim(mp, pag->pag_agno, ref, _RET_IP_); + } else { + pag = xfs_perag_get(mp, *first); + (*first)++; + } + return pag; +} + int xfs_inode_ag_iterator( struct xfs_mount *mp, @@ -153,23 +186,31 @@ xfs_inode_ag_iterator( struct xfs_perag *pag, int flags), int flags, int tag, - int exclusive) + int exclusive, + int *nr_to_scan) { + struct xfs_perag *pag; int error = 0; int last_error = 0; xfs_agnumber_t ag; - - for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { - if (!mp->m_perag[ag].pag_ici_init) - continue; - error = xfs_inode_ag_walk(mp, ag, execute, flags, tag, - exclusive); + int nr; + + nr = nr_to_scan ? *nr_to_scan : INT_MAX; + ag = 0; + while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, tag))) { + error = xfs_inode_ag_walk(mp, pag, execute, flags, tag, + exclusive, &nr); + xfs_perag_put(pag); if (error) { last_error = error; if (error == EFSCORRUPTED) break; } + if (nr <= 0) + break; } + if (nr_to_scan) + *nr_to_scan = nr; return XFS_ERROR(last_error); } @@ -231,7 +272,7 @@ xfs_sync_inode_data( } error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ? - 0 : XFS_B_ASYNC, FI_NONE); + 0 : XBF_ASYNC, FI_NONE); xfs_iunlock(ip, XFS_IOLOCK_SHARED); out_wait: @@ -267,8 +308,7 @@ xfs_sync_inode_attr( goto out_unlock; } - error = xfs_iflush(ip, (flags & SYNC_WAIT) ? - XFS_IFLUSH_SYNC : XFS_IFLUSH_DELWRI); + error = xfs_iflush(ip, flags); out_unlock: xfs_iunlock(ip, XFS_ILOCK_SHARED); @@ -289,14 +329,11 @@ xfs_sync_data( ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, - XFS_ICI_NO_TAG, 0); + XFS_ICI_NO_TAG, 0, NULL); if (error) return XFS_ERROR(error); - xfs_log_force(mp, 0, - (flags & SYNC_WAIT) ? - XFS_LOG_FORCE | XFS_LOG_SYNC : - XFS_LOG_FORCE); + xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0); return 0; } @@ -311,7 +348,7 @@ xfs_sync_attr( ASSERT((flags & ~SYNC_WAIT) == 0); return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, - XFS_ICI_NO_TAG, 0); + XFS_ICI_NO_TAG, 0, NULL); } STATIC int @@ -322,10 +359,6 @@ xfs_commit_dummy_trans( struct xfs_inode *ip = mp->m_rootip; struct xfs_trans *tp; int error; - int log_flags = XFS_LOG_FORCE; - - if (flags & SYNC_WAIT) - log_flags |= XFS_LOG_SYNC; /* * Put a dummy transaction in the log to tell recovery @@ -347,74 +380,29 @@ xfs_commit_dummy_trans( xfs_iunlock(ip, XFS_ILOCK_EXCL); /* the log force ensures this transaction is pushed to disk */ - xfs_log_force(mp, 0, log_flags); + xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0); return error; } -int +STATIC int xfs_sync_fsdata( - struct xfs_mount *mp, - int flags) + struct xfs_mount *mp) { struct xfs_buf *bp; - struct xfs_buf_log_item *bip; - int error = 0; /* - * If this is xfssyncd() then only sync the superblock if we can - * lock it without sleeping and it is not pinned. + * If the buffer is pinned then push on the log so we won't get stuck + * waiting in the write for someone, maybe ourselves, to flush the log. + * + * Even though we just pushed the log above, we did not have the + * superblock buffer locked at that point so it can become pinned in + * between there and here. */ - if (flags & SYNC_TRYLOCK) { - ASSERT(!(flags & SYNC_WAIT)); - - bp = xfs_getsb(mp, XFS_BUF_TRYLOCK); - if (!bp) - goto out; - - bip = XFS_BUF_FSPRIVATE(bp, struct xfs_buf_log_item *); - if (!bip || !xfs_buf_item_dirty(bip) || XFS_BUF_ISPINNED(bp)) - goto out_brelse; - } else { - bp = xfs_getsb(mp, 0); + bp = xfs_getsb(mp, 0); + if (XFS_BUF_ISPINNED(bp)) + xfs_log_force(mp, 0); - /* - * If the buffer is pinned then push on the log so we won't - * get stuck waiting in the write for someone, maybe - * ourselves, to flush the log. - * - * Even though we just pushed the log above, we did not have - * the superblock buffer locked at that point so it can - * become pinned in between there and here. - */ - if (XFS_BUF_ISPINNED(bp)) - xfs_log_force(mp, 0, XFS_LOG_FORCE); - } - - - if (flags & SYNC_WAIT) - XFS_BUF_UNASYNC(bp); - else - XFS_BUF_ASYNC(bp); - - error = xfs_bwrite(mp, bp); - if (error) - return error; - - /* - * If this is a data integrity sync make sure all pending buffers - * are flushed out for the log coverage check below. - */ - if (flags & SYNC_WAIT) - xfs_flush_buftarg(mp->m_ddev_targp, 1); - - if (xfs_log_need_covered(mp)) - error = xfs_commit_dummy_trans(mp, flags); - return error; - - out_brelse: - xfs_buf_relse(bp); - out: - return error; + return xfs_bwrite(mp, bp); } /* @@ -438,7 +426,7 @@ int xfs_quiesce_data( struct xfs_mount *mp) { - int error; + int error, error2 = 0; /* push non-blocking */ xfs_sync_data(mp, 0); @@ -448,17 +436,21 @@ xfs_quiesce_data( xfs_sync_data(mp, SYNC_WAIT); xfs_qm_sync(mp, SYNC_WAIT); - /* drop inode references pinned by filestreams */ - xfs_filestream_flush(mp); - /* write superblock and hoover up shutdown errors */ - error = xfs_sync_fsdata(mp, SYNC_WAIT); + error = xfs_sync_fsdata(mp); + + /* make sure all delwri buffers are written out */ + xfs_flush_buftarg(mp->m_ddev_targp, 1); + + /* mark the log as covered if needed */ + if (xfs_log_need_covered(mp)) + error2 = xfs_commit_dummy_trans(mp, SYNC_WAIT); /* flush data-only devices */ if (mp->m_rtdev_targp) XFS_bflush(mp->m_rtdev_targp); - return error; + return error ? error : error2; } STATIC void @@ -467,16 +459,18 @@ xfs_quiesce_fs( { int count = 0, pincount; + xfs_reclaim_inodes(mp, 0); xfs_flush_buftarg(mp->m_ddev_targp, 0); - xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC); /* * This loop must run at least twice. The first instance of the loop * will flush most meta data but that will generate more meta data * (typically directory updates). Which then must be flushed and - * logged before we can write the unmount record. + * logged before we can write the unmount record. We also so sync + * reclaim of inodes to catch any that the above delwri flush skipped. */ do { + xfs_reclaim_inodes(mp, SYNC_WAIT); xfs_sync_attr(mp, SYNC_WAIT); pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); if (!pincount) { @@ -575,13 +569,13 @@ xfs_flush_inodes( igrab(inode); xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion); wait_for_completion(&completion); - xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC); + xfs_log_force(ip->i_mount, XFS_LOG_SYNC); } /* - * Every sync period we need to unpin all items, reclaim inodes, sync - * quota and write out the superblock. We might need to cover the log - * to indicate it is idle. + * Every sync period we need to unpin all items, reclaim inodes and sync + * disk quotas. We might need to cover the log to indicate that the + * filesystem is idle. */ STATIC void xfs_sync_worker( @@ -591,11 +585,12 @@ xfs_sync_worker( int error; if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { - xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); - xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC); + xfs_log_force(mp, 0); + xfs_reclaim_inodes(mp, 0); /* dgc: errors ignored here */ error = xfs_qm_sync(mp, SYNC_TRYLOCK); - error = xfs_sync_fsdata(mp, SYNC_TRYLOCK); + if (xfs_log_need_covered(mp)) + error = xfs_commit_dummy_trans(mp, 0); } mp->m_sync_seq++; wake_up(&mp->m_wait_single_sync_task); @@ -613,7 +608,8 @@ xfssyncd( set_freezable(); timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10); for (;;) { - timeleft = schedule_timeout_interruptible(timeleft); + if (list_empty(&mp->m_sync_list)) + timeleft = schedule_timeout_interruptible(timeleft); /* swsusp */ try_to_freeze(); if (kthread_should_stop() && list_empty(&mp->m_sync_list)) @@ -633,8 +629,7 @@ xfssyncd( list_add_tail(&mp->m_sync_work.w_list, &mp->m_sync_list); } - list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list) - list_move(&work->w_list, &tmp); + list_splice_init(&mp->m_sync_list, &tmp); spin_unlock(&mp->m_sync_lock); list_for_each_entry_safe(work, n, &tmp, w_list) { @@ -658,7 +653,7 @@ xfs_syncd_init( mp->m_sync_work.w_syncer = xfs_sync_worker; mp->m_sync_work.w_mount = mp; mp->m_sync_work.w_completion = NULL; - mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd"); + mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd/%s", mp->m_fsname); if (IS_ERR(mp->m_sync_task)) return -PTR_ERR(mp->m_sync_task); return 0; @@ -679,6 +674,18 @@ __xfs_inode_set_reclaim_tag( radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), XFS_ICI_RECLAIM_TAG); + + if (!pag->pag_ici_reclaimable) { + /* propagate the reclaim tag up into the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_set(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } + pag->pag_ici_reclaimable++; } /* @@ -690,16 +697,17 @@ void xfs_inode_set_reclaim_tag( xfs_inode_t *ip) { - xfs_mount_t *mp = ip->i_mount; - xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; - read_lock(&pag->pag_ici_lock); + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + write_lock(&pag->pag_ici_lock); spin_lock(&ip->i_flags_lock); __xfs_inode_set_reclaim_tag(pag, ip); __xfs_iflags_set(ip, XFS_IRECLAIMABLE); spin_unlock(&ip->i_flags_lock); - read_unlock(&pag->pag_ici_lock); - xfs_put_perag(mp, pag); + write_unlock(&pag->pag_ici_lock); + xfs_perag_put(pag); } void @@ -710,14 +718,77 @@ __xfs_inode_clear_reclaim_tag( { radix_tree_tag_clear(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); + pag->pag_ici_reclaimable--; + if (!pag->pag_ici_reclaimable) { + /* clear the reclaim tag from the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_clear(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } } +/* + * Inodes in different states need to be treated differently, and the return + * value of xfs_iflush is not sufficient to get this right. The following table + * lists the inode states and the reclaim actions necessary for non-blocking + * reclaim: + * + * + * inode state iflush ret required action + * --------------- ---------- --------------- + * bad - reclaim + * shutdown EIO unpin and reclaim + * clean, unpinned 0 reclaim + * stale, unpinned 0 reclaim + * clean, pinned(*) 0 requeue + * stale, pinned EAGAIN requeue + * dirty, delwri ok 0 requeue + * dirty, delwri blocked EAGAIN requeue + * dirty, sync flush 0 reclaim + * + * (*) dgc: I don't think the clean, pinned state is possible but it gets + * handled anyway given the order of checks implemented. + * + * As can be seen from the table, the return value of xfs_iflush() is not + * sufficient to correctly decide the reclaim action here. The checks in + * xfs_iflush() might look like duplicates, but they are not. + * + * Also, because we get the flush lock first, we know that any inode that has + * been flushed delwri has had the flush completed by the time we check that + * the inode is clean. The clean inode check needs to be done before flushing + * the inode delwri otherwise we would loop forever requeuing clean inodes as + * we cannot tell apart a successful delwri flush and a clean inode from the + * return value of xfs_iflush(). + * + * Note that because the inode is flushed delayed write by background + * writeback, the flush lock may already be held here and waiting on it can + * result in very long latencies. Hence for sync reclaims, where we wait on the + * flush lock, the caller should push out delayed write inodes first before + * trying to reclaim them to minimise the amount of time spent waiting. For + * background relaim, we just requeue the inode for the next pass. + * + * Hence the order of actions after gaining the locks should be: + * bad => reclaim + * shutdown => unpin and reclaim + * pinned, delwri => requeue + * pinned, sync => unpin + * stale => reclaim + * clean => reclaim + * dirty, delwri => flush and requeue + * dirty, sync => flush, wait and reclaim + */ STATIC int xfs_reclaim_inode( struct xfs_inode *ip, struct xfs_perag *pag, int sync_mode) { + int error = 0; + /* * The radix tree lock here protects a thread in xfs_iget from racing * with us starting reclaim on the inode. Once we have the @@ -735,33 +806,70 @@ xfs_reclaim_inode( spin_unlock(&ip->i_flags_lock); write_unlock(&pag->pag_ici_lock); - /* - * If the inode is still dirty, then flush it out. If the inode - * is not in the AIL, then it will be OK to flush it delwri as - * long as xfs_iflush() does not keep any references to the inode. - * We leave that decision up to xfs_iflush() since it has the - * knowledge of whether it's OK to simply do a delwri flush of - * the inode or whether we need to wait until the inode is - * pulled from the AIL. - * We get the flush lock regardless, though, just to make sure - * we don't free it while it is being flushed. - */ xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_iflock(ip); + if (!xfs_iflock_nowait(ip)) { + if (!(sync_mode & SYNC_WAIT)) + goto out; + xfs_iflock(ip); + } + + if (is_bad_inode(VFS_I(ip))) + goto reclaim; + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_iunpin_wait(ip); + goto reclaim; + } + if (xfs_ipincount(ip)) { + if (!(sync_mode & SYNC_WAIT)) { + xfs_ifunlock(ip); + goto out; + } + xfs_iunpin_wait(ip); + } + if (xfs_iflags_test(ip, XFS_ISTALE)) + goto reclaim; + if (xfs_inode_clean(ip)) + goto reclaim; + + /* Now we have an inode that needs flushing */ + error = xfs_iflush(ip, sync_mode); + if (sync_mode & SYNC_WAIT) { + xfs_iflock(ip); + goto reclaim; + } /* - * In the case of a forced shutdown we rely on xfs_iflush() to - * wait for the inode to be unpinned before returning an error. + * When we have to flush an inode but don't have SYNC_WAIT set, we + * flush the inode out using a delwri buffer and wait for the next + * call into reclaim to find it in a clean state instead of waiting for + * it now. We also don't return errors here - if the error is transient + * then the next reclaim pass will flush the inode, and if the error + * is permanent then the next sync reclaim will reclaim the inode and + * pass on the error. */ - if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) { - /* synchronize with xfs_iflush_done */ - xfs_iflock(ip); - xfs_ifunlock(ip); + if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_fs_cmn_err(CE_WARN, ip->i_mount, + "inode 0x%llx background reclaim flush failed with %d", + (long long)ip->i_ino, error); } +out: + xfs_iflags_clear(ip, XFS_IRECLAIM); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* + * We could return EAGAIN here to make reclaim rescan the inode tree in + * a short while. However, this just burns CPU time scanning the tree + * waiting for IO to complete and xfssyncd never goes back to the idle + * state. Instead, return 0 to let the next scheduled background reclaim + * attempt to reclaim the inode again. + */ + return 0; +reclaim: + xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_ireclaim(ip); - return 0; + return error; + } int @@ -770,5 +878,57 @@ xfs_reclaim_inodes( int mode) { return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode, - XFS_ICI_RECLAIM_TAG, 1); + XFS_ICI_RECLAIM_TAG, 1, NULL); +} + +/* + * Shrinker infrastructure. + */ +static int +xfs_reclaim_inode_shrink( + struct shrinker *shrink, + int nr_to_scan, + gfp_t gfp_mask) +{ + struct xfs_mount *mp; + struct xfs_perag *pag; + xfs_agnumber_t ag; + int reclaimable; + + mp = container_of(shrink, struct xfs_mount, m_inode_shrink); + if (nr_to_scan) { + if (!(gfp_mask & __GFP_FS)) + return -1; + + xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0, + XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan); + /* if we don't exhaust the scan, don't bother coming back */ + if (nr_to_scan > 0) + return -1; + } + + reclaimable = 0; + ag = 0; + while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, + XFS_ICI_RECLAIM_TAG))) { + reclaimable += pag->pag_ici_reclaimable; + xfs_perag_put(pag); + } + return reclaimable; +} + +void +xfs_inode_shrinker_register( + struct xfs_mount *mp) +{ + mp->m_inode_shrink.shrink = xfs_reclaim_inode_shrink; + mp->m_inode_shrink.seeks = DEFAULT_SEEKS; + register_shrinker(&mp->m_inode_shrink); +} + +void +xfs_inode_shrinker_unregister( + struct xfs_mount *mp) +{ + unregister_shrinker(&mp->m_inode_shrink); } diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index ea932b43335d..e28139aaa4aa 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -37,7 +37,6 @@ void xfs_syncd_stop(struct xfs_mount *mp); int xfs_sync_attr(struct xfs_mount *mp, int flags); int xfs_sync_data(struct xfs_mount *mp, int flags); -int xfs_sync_fsdata(struct xfs_mount *mp, int flags); int xfs_quiesce_data(struct xfs_mount *mp); void xfs_quiesce_attr(struct xfs_mount *mp); @@ -54,6 +53,9 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag); int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), - int flags, int tag, int write_lock); + int flags, int tag, int write_lock, int *nr_to_scan); + +void xfs_inode_shrinker_register(struct xfs_mount *mp); +void xfs_inode_shrinker_unregister(struct xfs_mount *mp); #endif diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c index 856eb3c8d605..d12be8470cba 100644 --- a/fs/xfs/linux-2.6/xfs_trace.c +++ b/fs/xfs/linux-2.6/xfs_trace.c @@ -41,7 +41,6 @@ #include "xfs_alloc.h" #include "xfs_bmap.h" #include "xfs_attr.h" -#include "xfs_attr_sf.h" #include "xfs_attr_leaf.h" #include "xfs_log_priv.h" #include "xfs_buf_item.h" @@ -50,22 +49,8 @@ #include "xfs_aops.h" #include "quota/xfs_dquot_item.h" #include "quota/xfs_dquot.h" - -/* - * Format fsblock number into a static buffer & return it. - */ -STATIC char *xfs_fmtfsblock(xfs_fsblock_t bno) -{ - static char rval[50]; - - if (bno == NULLFSBLOCK) - sprintf(rval, "NULLFSBLOCK"); - else if (isnullstartblock(bno)) - sprintf(rval, "NULLSTARTBLOCK(%lld)", startblockval(bno)); - else - sprintf(rval, "%lld", (xfs_dfsbno_t)bno); - return rval; -} +#include "xfs_log_recover.h" +#include "xfs_inode_item.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index c22a608321a3..302820690904 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -32,6 +32,10 @@ struct xfs_da_node_entry; struct xfs_dquot; struct xlog_ticket; struct log; +struct xlog_recover; +struct xlog_recover_item; +struct xfs_buf_log_format; +struct xfs_inode_log_format; DECLARE_EVENT_CLASS(xfs_attr_list_class, TP_PROTO(struct xfs_attr_list_context *ctx), @@ -91,6 +95,40 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add); DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk); DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound); +DECLARE_EVENT_CLASS(xfs_perag_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, + unsigned long caller_ip), + TP_ARGS(mp, agno, refcount, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(int, refcount) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->refcount = refcount; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d agno %u refcount %d caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->refcount, + (char *)__entry->caller_ip) +); + +#define DEFINE_PERAG_REF_EVENT(name) \ +DEFINE_EVENT(xfs_perag_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \ + unsigned long caller_ip), \ + TP_ARGS(mp, agno, refcount, caller_ip)) +DEFINE_PERAG_REF_EVENT(xfs_perag_get); +DEFINE_PERAG_REF_EVENT(xfs_perag_get_reclaim); +DEFINE_PERAG_REF_EVENT(xfs_perag_put); +DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); +DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); + TRACE_EVENT(xfs_attr_list_node_descend, TP_PROTO(struct xfs_attr_list_context *ctx, struct xfs_da_node_entry *btree), @@ -170,13 +208,13 @@ TRACE_EVENT(xfs_iext_insert, __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " - "offset %lld block %s count %lld flag %d caller %pf", + "offset %lld block %lld count %lld flag %d caller %pf", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), (long)__entry->idx, __entry->startoff, - xfs_fmtfsblock(__entry->startblock), + (__int64_t)__entry->startblock, __entry->blockcount, __entry->state, (char *)__entry->caller_ip) @@ -214,13 +252,13 @@ DECLARE_EVENT_CLASS(xfs_bmap_class, __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " - "offset %lld block %s count %lld flag %d caller %pf", + "offset %lld block %lld count %lld flag %d caller %pf", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), (long)__entry->idx, __entry->startoff, - xfs_fmtfsblock(__entry->startblock), + (__int64_t)__entry->startblock, __entry->blockcount, __entry->state, (char *)__entry->caller_ip) @@ -456,6 +494,7 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pushbuf); DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf); DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur); DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb); @@ -534,18 +573,21 @@ DECLARE_EVENT_CLASS(xfs_inode_class, __field(dev_t, dev) __field(xfs_ino_t, ino) __field(int, count) + __field(int, pincount) __field(unsigned long, caller_ip) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->count = atomic_read(&VFS_I(ip)->i_count); + __entry->pincount = atomic_read(&ip->i_pincount); __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d ino 0x%llx count %d caller %pf", + TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pf", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->count, + __entry->pincount, (char *)__entry->caller_ip) ) @@ -555,6 +597,10 @@ DEFINE_EVENT(xfs_inode_class, name, \ TP_ARGS(ip, caller_ip)) DEFINE_INODE_EVENT(xfs_ihold); DEFINE_INODE_EVENT(xfs_irele); +DEFINE_INODE_EVENT(xfs_inode_pin); +DEFINE_INODE_EVENT(xfs_inode_unpin); +DEFINE_INODE_EVENT(xfs_inode_unpin_nowait); + /* the old xfs_itrace_entry tracer - to be replaced by s.th. in the VFS */ DEFINE_INODE_EVENT(xfs_inode); #define xfs_itrace_entry(ip) \ @@ -565,7 +611,7 @@ DECLARE_EVENT_CLASS(xfs_dquot_class, TP_ARGS(dqp), TP_STRUCT__entry( __field(dev_t, dev) - __field(__be32, id) + __field(u32, id) __field(unsigned, flags) __field(unsigned, nrefs) __field(unsigned long long, res_bcount) @@ -578,7 +624,7 @@ DECLARE_EVENT_CLASS(xfs_dquot_class, ), \ TP_fast_assign( __entry->dev = dqp->q_mount->m_super->s_dev; - __entry->id = dqp->q_core.d_id; + __entry->id = be32_to_cpu(dqp->q_core.d_id); __entry->flags = dqp->dq_flags; __entry->nrefs = dqp->q_nrefs; __entry->res_bcount = dqp->q_res_bcount; @@ -594,10 +640,10 @@ DECLARE_EVENT_CLASS(xfs_dquot_class, be64_to_cpu(dqp->q_core.d_ino_softlimit); ), TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx " - "bcnt 0x%llx [hard 0x%llx | soft 0x%llx] " - "icnt 0x%llx [hard 0x%llx | soft 0x%llx]", + "bcnt 0x%llx bhardlimit 0x%llx bsoftlimit 0x%llx " + "icnt 0x%llx ihardlimit 0x%llx isoftlimit 0x%llx]", MAJOR(__entry->dev), MINOR(__entry->dev), - be32_to_cpu(__entry->id), + __entry->id, __print_flags(__entry->flags, "|", XFS_DQ_FLAGS), __entry->nrefs, __entry->res_bcount, @@ -614,8 +660,6 @@ DEFINE_EVENT(xfs_dquot_class, name, \ TP_PROTO(struct xfs_dquot *dqp), \ TP_ARGS(dqp)) DEFINE_DQUOT_EVENT(xfs_dqadjust); -DEFINE_DQUOT_EVENT(xfs_dqshake_dirty); -DEFINE_DQUOT_EVENT(xfs_dqshake_unlink); DEFINE_DQUOT_EVENT(xfs_dqreclaim_want); DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty); DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink); @@ -630,7 +674,6 @@ DEFINE_DQUOT_EVENT(xfs_dqread_fail); DEFINE_DQUOT_EVENT(xfs_dqlookup_found); DEFINE_DQUOT_EVENT(xfs_dqlookup_want); DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist); -DEFINE_DQUOT_EVENT(xfs_dqlookup_move); DEFINE_DQUOT_EVENT(xfs_dqlookup_done); DEFINE_DQUOT_EVENT(xfs_dqget_hit); DEFINE_DQUOT_EVENT(xfs_dqget_miss); @@ -739,165 +782,181 @@ DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter); DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit); DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub); -#define DEFINE_RW_EVENT(name) \ -TRACE_EVENT(name, \ - TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \ - TP_ARGS(ip, count, offset, flags), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(xfs_ino_t, ino) \ - __field(xfs_fsize_t, size) \ - __field(xfs_fsize_t, new_size) \ - __field(loff_t, offset) \ - __field(size_t, count) \ - __field(int, flags) \ - ), \ - TP_fast_assign( \ - __entry->dev = VFS_I(ip)->i_sb->s_dev; \ - __entry->ino = ip->i_ino; \ - __entry->size = ip->i_d.di_size; \ - __entry->new_size = ip->i_new_size; \ - __entry->offset = offset; \ - __entry->count = count; \ - __entry->flags = flags; \ - ), \ - TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \ - "offset 0x%llx count 0x%zx ioflags %s", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - __entry->ino, \ - __entry->size, \ - __entry->new_size, \ - __entry->offset, \ - __entry->count, \ - __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) \ +DECLARE_EVENT_CLASS(xfs_file_class, + TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), + TP_ARGS(ip, count, offset, flags), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(xfs_fsize_t, new_size) + __field(loff_t, offset) + __field(size_t, count) + __field(int, flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->new_size = ip->i_new_size; + __entry->offset = offset; + __entry->count = count; + __entry->flags = flags; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " + "offset 0x%llx count 0x%zx ioflags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->new_size, + __entry->offset, + __entry->count, + __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) ) + +#define DEFINE_RW_EVENT(name) \ +DEFINE_EVENT(xfs_file_class, name, \ + TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \ + TP_ARGS(ip, count, offset, flags)) DEFINE_RW_EVENT(xfs_file_read); DEFINE_RW_EVENT(xfs_file_buffered_write); DEFINE_RW_EVENT(xfs_file_direct_write); DEFINE_RW_EVENT(xfs_file_splice_read); DEFINE_RW_EVENT(xfs_file_splice_write); +DECLARE_EVENT_CLASS(xfs_page_class, + TP_PROTO(struct inode *inode, struct page *page, unsigned long off), + TP_ARGS(inode, page, off), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(pgoff_t, pgoff) + __field(loff_t, size) + __field(unsigned long, offset) + __field(int, delalloc) + __field(int, unmapped) + __field(int, unwritten) + ), + TP_fast_assign( + int delalloc = -1, unmapped = -1, unwritten = -1; -#define DEFINE_PAGE_EVENT(name) \ -TRACE_EVENT(name, \ - TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \ - TP_ARGS(inode, page, off), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(xfs_ino_t, ino) \ - __field(pgoff_t, pgoff) \ - __field(loff_t, size) \ - __field(unsigned long, offset) \ - __field(int, delalloc) \ - __field(int, unmapped) \ - __field(int, unwritten) \ - ), \ - TP_fast_assign( \ - int delalloc = -1, unmapped = -1, unwritten = -1; \ - \ - if (page_has_buffers(page)) \ - xfs_count_page_state(page, &delalloc, \ - &unmapped, &unwritten); \ - __entry->dev = inode->i_sb->s_dev; \ - __entry->ino = XFS_I(inode)->i_ino; \ - __entry->pgoff = page_offset(page); \ - __entry->size = i_size_read(inode); \ - __entry->offset = off; \ - __entry->delalloc = delalloc; \ - __entry->unmapped = unmapped; \ - __entry->unwritten = unwritten; \ - ), \ - TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " \ - "delalloc %d unmapped %d unwritten %d", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - __entry->ino, \ - __entry->pgoff, \ - __entry->size, \ - __entry->offset, \ - __entry->delalloc, \ - __entry->unmapped, \ - __entry->unwritten) \ + if (page_has_buffers(page)) + xfs_count_page_state(page, &delalloc, + &unmapped, &unwritten); + __entry->dev = inode->i_sb->s_dev; + __entry->ino = XFS_I(inode)->i_ino; + __entry->pgoff = page_offset(page); + __entry->size = i_size_read(inode); + __entry->offset = off; + __entry->delalloc = delalloc; + __entry->unmapped = unmapped; + __entry->unwritten = unwritten; + ), + TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " + "delalloc %d unmapped %d unwritten %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->pgoff, + __entry->size, + __entry->offset, + __entry->delalloc, + __entry->unmapped, + __entry->unwritten) ) + +#define DEFINE_PAGE_EVENT(name) \ +DEFINE_EVENT(xfs_page_class, name, \ + TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \ + TP_ARGS(inode, page, off)) DEFINE_PAGE_EVENT(xfs_writepage); DEFINE_PAGE_EVENT(xfs_releasepage); DEFINE_PAGE_EVENT(xfs_invalidatepage); -#define DEFINE_IOMAP_EVENT(name) \ -TRACE_EVENT(name, \ - TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ - int flags, struct xfs_bmbt_irec *irec), \ - TP_ARGS(ip, offset, count, flags, irec), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(xfs_ino_t, ino) \ - __field(loff_t, size) \ - __field(loff_t, new_size) \ - __field(loff_t, offset) \ - __field(size_t, count) \ - __field(int, flags) \ - __field(xfs_fileoff_t, startoff) \ - __field(xfs_fsblock_t, startblock) \ - __field(xfs_filblks_t, blockcount) \ - ), \ - TP_fast_assign( \ - __entry->dev = VFS_I(ip)->i_sb->s_dev; \ - __entry->ino = ip->i_ino; \ - __entry->size = ip->i_d.di_size; \ - __entry->new_size = ip->i_new_size; \ - __entry->offset = offset; \ - __entry->count = count; \ - __entry->flags = flags; \ - __entry->startoff = irec ? irec->br_startoff : 0; \ - __entry->startblock = irec ? irec->br_startblock : 0; \ - __entry->blockcount = irec ? irec->br_blockcount : 0; \ - ), \ - TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \ - "offset 0x%llx count %zd flags %s " \ - "startoff 0x%llx startblock %s blockcount 0x%llx", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - __entry->ino, \ - __entry->size, \ - __entry->new_size, \ - __entry->offset, \ - __entry->count, \ - __print_flags(__entry->flags, "|", BMAPI_FLAGS), \ - __entry->startoff, \ - xfs_fmtfsblock(__entry->startblock), \ - __entry->blockcount) \ +DECLARE_EVENT_CLASS(xfs_iomap_class, + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, + int flags, struct xfs_bmbt_irec *irec), + TP_ARGS(ip, offset, count, flags, irec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(loff_t, size) + __field(loff_t, new_size) + __field(loff_t, offset) + __field(size_t, count) + __field(int, flags) + __field(xfs_fileoff_t, startoff) + __field(xfs_fsblock_t, startblock) + __field(xfs_filblks_t, blockcount) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->new_size = ip->i_new_size; + __entry->offset = offset; + __entry->count = count; + __entry->flags = flags; + __entry->startoff = irec ? irec->br_startoff : 0; + __entry->startblock = irec ? irec->br_startblock : 0; + __entry->blockcount = irec ? irec->br_blockcount : 0; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " + "offset 0x%llx count %zd flags %s " + "startoff 0x%llx startblock %lld blockcount 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->new_size, + __entry->offset, + __entry->count, + __print_flags(__entry->flags, "|", BMAPI_FLAGS), + __entry->startoff, + (__int64_t)__entry->startblock, + __entry->blockcount) ) + +#define DEFINE_IOMAP_EVENT(name) \ +DEFINE_EVENT(xfs_iomap_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ + int flags, struct xfs_bmbt_irec *irec), \ + TP_ARGS(ip, offset, count, flags, irec)) DEFINE_IOMAP_EVENT(xfs_iomap_enter); DEFINE_IOMAP_EVENT(xfs_iomap_found); DEFINE_IOMAP_EVENT(xfs_iomap_alloc); -#define DEFINE_SIMPLE_IO_EVENT(name) \ -TRACE_EVENT(name, \ - TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \ - TP_ARGS(ip, offset, count), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(xfs_ino_t, ino) \ - __field(loff_t, size) \ - __field(loff_t, new_size) \ - __field(loff_t, offset) \ - __field(size_t, count) \ - ), \ - TP_fast_assign( \ - __entry->dev = VFS_I(ip)->i_sb->s_dev; \ - __entry->ino = ip->i_ino; \ - __entry->size = ip->i_d.di_size; \ - __entry->new_size = ip->i_new_size; \ - __entry->offset = offset; \ - __entry->count = count; \ - ), \ - TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \ - "offset 0x%llx count %zd", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - __entry->ino, \ - __entry->size, \ - __entry->new_size, \ - __entry->offset, \ - __entry->count) \ +DECLARE_EVENT_CLASS(xfs_simple_io_class, + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), + TP_ARGS(ip, offset, count), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(loff_t, size) + __field(loff_t, new_size) + __field(loff_t, offset) + __field(size_t, count) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->new_size = ip->i_new_size; + __entry->offset = offset; + __entry->count = count; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " + "offset 0x%llx count %zd", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->new_size, + __entry->offset, + __entry->count) ); + +#define DEFINE_SIMPLE_IO_EVENT(name) \ +DEFINE_EVENT(xfs_simple_io_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \ + TP_ARGS(ip, offset, count)) DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); @@ -1023,83 +1082,112 @@ TRACE_EVENT(xfs_bunmap, ); +#define XFS_BUSY_SYNC \ + { 0, "async" }, \ + { 1, "sync" } + TRACE_EVENT(xfs_alloc_busy, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, - xfs_extlen_t len, int slot), - TP_ARGS(mp, agno, agbno, len, slot), + TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len, int sync), + TP_ARGS(trans, agno, agbno, len, sync), TP_STRUCT__entry( __field(dev_t, dev) + __field(struct xfs_trans *, tp) + __field(int, tid) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) - __field(int, slot) + __field(int, sync) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; + __entry->dev = trans->t_mountp->m_super->s_dev; + __entry->tp = trans; + __entry->tid = trans->t_ticket->t_tid; __entry->agno = agno; __entry->agbno = agbno; __entry->len = len; - __entry->slot = slot; + __entry->sync = sync; ), - TP_printk("dev %d:%d agno %u agbno %u len %u slot %d", + TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s", MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tp, + __entry->tid, __entry->agno, __entry->agbno, __entry->len, - __entry->slot) + __print_symbolic(__entry->sync, XFS_BUSY_SYNC)) ); -#define XFS_BUSY_STATES \ - { 0, "found" }, \ - { 1, "missing" } - TRACE_EVENT(xfs_alloc_unbusy, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - int slot, int found), - TP_ARGS(mp, agno, slot, found), + xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(mp, agno, agbno, len), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) - __field(int, slot) - __field(int, found) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; - __entry->slot = slot; - __entry->found = found; + __entry->agbno = agbno; + __entry->len = len; ), - TP_printk("dev %d:%d agno %u slot %d %s", + TP_printk("dev %d:%d agno %u agbno %u len %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, - __entry->slot, - __print_symbolic(__entry->found, XFS_BUSY_STATES)) + __entry->agbno, + __entry->len) ); +#define XFS_BUSY_STATES \ + { 0, "missing" }, \ + { 1, "found" } + TRACE_EVENT(xfs_alloc_busysearch, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, - xfs_extlen_t len, xfs_lsn_t lsn), - TP_ARGS(mp, agno, agbno, len, lsn), + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len, int found), + TP_ARGS(mp, agno, agbno, len, found), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) - __field(xfs_lsn_t, lsn) + __field(int, found) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; __entry->agbno = agbno; __entry->len = len; - __entry->lsn = lsn; + __entry->found = found; ), - TP_printk("dev %d:%d agno %u agbno %u len %u force lsn 0x%llx", + TP_printk("dev %d:%d agno %u agbno %u len %u %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, __entry->len, + __print_symbolic(__entry->found, XFS_BUSY_STATES)) +); + +TRACE_EVENT(xfs_trans_commit_lsn, + TP_PROTO(struct xfs_trans *trans), + TP_ARGS(trans), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(struct xfs_trans *, tp) + __field(xfs_lsn_t, lsn) + ), + TP_fast_assign( + __entry->dev = trans->t_mountp->m_super->s_dev; + __entry->tp = trans; + __entry->lsn = trans->t_commit_lsn; + ), + TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tp, __entry->lsn) ); @@ -1414,6 +1502,193 @@ TRACE_EVENT(xfs_dir2_leafn_moveents, __entry->count) ); +#define XFS_SWAPEXT_INODES \ + { 0, "target" }, \ + { 1, "temp" } + +#define XFS_INODE_FORMAT_STR \ + { 0, "invalid" }, \ + { 1, "local" }, \ + { 2, "extent" }, \ + { 3, "btree" } + +DECLARE_EVENT_CLASS(xfs_swap_extent_class, + TP_PROTO(struct xfs_inode *ip, int which), + TP_ARGS(ip, which), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, which) + __field(xfs_ino_t, ino) + __field(int, format) + __field(int, nex) + __field(int, max_nex) + __field(int, broot_size) + __field(int, fork_off) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->which = which; + __entry->ino = ip->i_ino; + __entry->format = ip->i_d.di_format; + __entry->nex = ip->i_d.di_nextents; + __entry->max_nex = ip->i_df.if_ext_max; + __entry->broot_size = ip->i_df.if_broot_bytes; + __entry->fork_off = XFS_IFORK_BOFF(ip); + ), + TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, " + "Max in-fork extents %d, broot size %d, fork offset %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->which, XFS_SWAPEXT_INODES), + __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR), + __entry->nex, + __entry->max_nex, + __entry->broot_size, + __entry->fork_off) +) + +#define DEFINE_SWAPEXT_EVENT(name) \ +DEFINE_EVENT(xfs_swap_extent_class, name, \ + TP_PROTO(struct xfs_inode *ip, int which), \ + TP_ARGS(ip, which)) + +DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before); +DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); + +DECLARE_EVENT_CLASS(xfs_log_recover_item_class, + TP_PROTO(struct log *log, struct xlog_recover *trans, + struct xlog_recover_item *item, int pass), + TP_ARGS(log, trans, item, pass), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long, item) + __field(xlog_tid_t, tid) + __field(int, type) + __field(int, pass) + __field(int, count) + __field(int, total) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->item = (unsigned long)item; + __entry->tid = trans->r_log_tid; + __entry->type = ITEM_TYPE(item); + __entry->pass = pass; + __entry->count = item->ri_cnt; + __entry->total = item->ri_total; + ), + TP_printk("dev %d:%d trans 0x%x, pass %d, item 0x%p, item type %s " + "item region count/total %d/%d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tid, + __entry->pass, + (void *)__entry->item, + __print_symbolic(__entry->type, XFS_LI_TYPE_DESC), + __entry->count, + __entry->total) +) + +#define DEFINE_LOG_RECOVER_ITEM(name) \ +DEFINE_EVENT(xfs_log_recover_item_class, name, \ + TP_PROTO(struct log *log, struct xlog_recover *trans, \ + struct xlog_recover_item *item, int pass), \ + TP_ARGS(log, trans, item, pass)) + +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add_cont); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_head); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_tail); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_recover); + +DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class, + TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f), + TP_ARGS(log, buf_f), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(__int64_t, blkno) + __field(unsigned short, len) + __field(unsigned short, flags) + __field(unsigned short, size) + __field(unsigned int, map_size) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->blkno = buf_f->blf_blkno; + __entry->len = buf_f->blf_len; + __entry->flags = buf_f->blf_flags; + __entry->size = buf_f->blf_size; + __entry->map_size = buf_f->blf_map_size; + ), + TP_printk("dev %d:%d blkno 0x%llx, len %u, flags 0x%x, size %d, " + "map_size %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->blkno, + __entry->len, + __entry->flags, + __entry->size, + __entry->map_size) +) + +#define DEFINE_LOG_RECOVER_BUF_ITEM(name) \ +DEFINE_EVENT(xfs_log_recover_buf_item_class, name, \ + TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f), \ + TP_ARGS(log, buf_f)) + +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_not_cancel); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf); + +DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class, + TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f), + TP_ARGS(log, in_f), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned short, size) + __field(int, fields) + __field(unsigned short, asize) + __field(unsigned short, dsize) + __field(__int64_t, blkno) + __field(int, len) + __field(int, boffset) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->ino = in_f->ilf_ino; + __entry->size = in_f->ilf_size; + __entry->fields = in_f->ilf_fields; + __entry->asize = in_f->ilf_asize; + __entry->dsize = in_f->ilf_dsize; + __entry->blkno = in_f->ilf_blkno; + __entry->len = in_f->ilf_len; + __entry->boffset = in_f->ilf_boffset; + ), + TP_printk("dev %d:%d ino 0x%llx, size %u, fields 0x%x, asize %d, " + "dsize %d, blkno 0x%llx, len %d, boffset %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->fields, + __entry->asize, + __entry->dsize, + __entry->blkno, + __entry->len, + __entry->boffset) +) +#define DEFINE_LOG_RECOVER_INO_ITEM(name) \ +DEFINE_EVENT(xfs_log_recover_ino_item_class, name, \ + TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f), \ + TP_ARGS(log, in_f)) + +DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover); +DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel); +DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c index 0b1878857fc3..87d3e03878c8 100644 --- a/fs/xfs/linux-2.6/xfs_xattr.c +++ b/fs/xfs/linux-2.6/xfs_xattr.c @@ -45,7 +45,7 @@ xfs_xattr_get(struct dentry *dentry, const char *name, value = NULL; } - error = -xfs_attr_get(ip, name, value, &asize, xflags); + error = -xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags); if (error) return error; return asize; @@ -67,32 +67,33 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value, xflags |= ATTR_REPLACE; if (!value) - return -xfs_attr_remove(ip, name, xflags); - return -xfs_attr_set(ip, name, (void *)value, size, xflags); + return -xfs_attr_remove(ip, (unsigned char *)name, xflags); + return -xfs_attr_set(ip, (unsigned char *)name, + (void *)value, size, xflags); } -static struct xattr_handler xfs_xattr_user_handler = { +static const struct xattr_handler xfs_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .flags = 0, /* no flags implies user namespace */ .get = xfs_xattr_get, .set = xfs_xattr_set, }; -static struct xattr_handler xfs_xattr_trusted_handler = { +static const struct xattr_handler xfs_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .flags = ATTR_ROOT, .get = xfs_xattr_get, .set = xfs_xattr_set, }; -static struct xattr_handler xfs_xattr_security_handler = { +static const struct xattr_handler xfs_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .flags = ATTR_SECURE, .get = xfs_xattr_get, .set = xfs_xattr_set, }; -struct xattr_handler *xfs_xattr_handlers[] = { +const struct xattr_handler *xfs_xattr_handlers[] = { &xfs_xattr_user_handler, &xfs_xattr_trusted_handler, &xfs_xattr_security_handler, @@ -124,8 +125,13 @@ static const char *xfs_xattr_prefix(int flags) } static int -xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags, - char *name, int namelen, int valuelen, char *value) +xfs_xattr_put_listent( + struct xfs_attr_list_context *context, + int flags, + unsigned char *name, + int namelen, + int valuelen, + unsigned char *value) { unsigned int prefix_len = xfs_xattr_prefix_len(flags); char *offset; @@ -148,7 +154,7 @@ xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags, offset = (char *)context->alist + context->count; strncpy(offset, xfs_xattr_prefix(flags), prefix_len); offset += prefix_len; - strncpy(offset, name, namelen); /* real name */ + strncpy(offset, (char *)name, namelen); /* real name */ offset += namelen; *offset = '\0'; context->count += prefix_len + namelen + 1; @@ -156,8 +162,13 @@ xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags, } static int -xfs_xattr_put_listent_sizes(struct xfs_attr_list_context *context, int flags, - char *name, int namelen, int valuelen, char *value) +xfs_xattr_put_listent_sizes( + struct xfs_attr_list_context *context, + int flags, + unsigned char *name, + int namelen, + int valuelen, + unsigned char *value) { context->count += xfs_xattr_prefix_len(flags) + namelen + 1; return 0; diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index d7c7eea09fc2..585e7633dfc7 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -101,7 +101,7 @@ xfs_qm_dqinit( * No need to re-initialize these if this is a reclaimed dquot. */ if (brandnewdquot) { - dqp->dq_flnext = dqp->dq_flprev = dqp; + INIT_LIST_HEAD(&dqp->q_freelist); mutex_init(&dqp->q_qlock); init_waitqueue_head(&dqp->q_pinwait); @@ -119,20 +119,20 @@ xfs_qm_dqinit( * Only the q_core portion was zeroed in dqreclaim_one(). * So, we need to reset others. */ - dqp->q_nrefs = 0; - dqp->q_blkno = 0; - dqp->MPL_NEXT = dqp->HL_NEXT = NULL; - dqp->HL_PREVP = dqp->MPL_PREVP = NULL; - dqp->q_bufoffset = 0; - dqp->q_fileoffset = 0; - dqp->q_transp = NULL; - dqp->q_gdquot = NULL; - dqp->q_res_bcount = 0; - dqp->q_res_icount = 0; - dqp->q_res_rtbcount = 0; - atomic_set(&dqp->q_pincount, 0); - dqp->q_hash = NULL; - ASSERT(dqp->dq_flnext == dqp->dq_flprev); + dqp->q_nrefs = 0; + dqp->q_blkno = 0; + INIT_LIST_HEAD(&dqp->q_mplist); + INIT_LIST_HEAD(&dqp->q_hashlist); + dqp->q_bufoffset = 0; + dqp->q_fileoffset = 0; + dqp->q_transp = NULL; + dqp->q_gdquot = NULL; + dqp->q_res_bcount = 0; + dqp->q_res_icount = 0; + dqp->q_res_rtbcount = 0; + atomic_set(&dqp->q_pincount, 0); + dqp->q_hash = NULL; + ASSERT(list_empty(&dqp->q_freelist)); trace_xfs_dqreuse(dqp); } @@ -158,7 +158,7 @@ void xfs_qm_dqdestroy( xfs_dquot_t *dqp) { - ASSERT(! XFS_DQ_IS_ON_FREELIST(dqp)); + ASSERT(list_empty(&dqp->q_freelist)); mutex_destroy(&dqp->q_qlock); sv_destroy(&dqp->q_pinwait); @@ -252,7 +252,7 @@ xfs_qm_adjust_dqtimers( (be64_to_cpu(d->d_bcount) >= be64_to_cpu(d->d_blk_hardlimit)))) { d->d_btimer = cpu_to_be32(get_seconds() + - XFS_QI_BTIMELIMIT(mp)); + mp->m_quotainfo->qi_btimelimit); } else { d->d_bwarns = 0; } @@ -275,7 +275,7 @@ xfs_qm_adjust_dqtimers( (be64_to_cpu(d->d_icount) >= be64_to_cpu(d->d_ino_hardlimit)))) { d->d_itimer = cpu_to_be32(get_seconds() + - XFS_QI_ITIMELIMIT(mp)); + mp->m_quotainfo->qi_itimelimit); } else { d->d_iwarns = 0; } @@ -298,7 +298,7 @@ xfs_qm_adjust_dqtimers( (be64_to_cpu(d->d_rtbcount) >= be64_to_cpu(d->d_rtb_hardlimit)))) { d->d_rtbtimer = cpu_to_be32(get_seconds() + - XFS_QI_RTBTIMELIMIT(mp)); + mp->m_quotainfo->qi_rtbtimelimit); } else { d->d_rtbwarns = 0; } @@ -325,6 +325,7 @@ xfs_qm_init_dquot_blk( uint type, xfs_buf_t *bp) { + struct xfs_quotainfo *q = mp->m_quotainfo; xfs_dqblk_t *d; int curid, i; @@ -337,16 +338,16 @@ xfs_qm_init_dquot_blk( /* * ID of the first dquot in the block - id's are zero based. */ - curid = id - (id % XFS_QM_DQPERBLK(mp)); + curid = id - (id % q->qi_dqperchunk); ASSERT(curid >= 0); - memset(d, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp))); - for (i = 0; i < XFS_QM_DQPERBLK(mp); i++, d++, curid++) + memset(d, 0, BBTOB(q->qi_dqchunklen)); + for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) xfs_qm_dqinit_core(curid, type, d); xfs_trans_dquot_buf(tp, bp, - (type & XFS_DQ_USER ? XFS_BLI_UDQUOT_BUF : - ((type & XFS_DQ_PROJ) ? XFS_BLI_PDQUOT_BUF : - XFS_BLI_GDQUOT_BUF))); - xfs_trans_log_buf(tp, bp, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1); + (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF : + ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF : + XFS_BLF_GDQUOT_BUF))); + xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); } @@ -419,7 +420,7 @@ xfs_qm_dqalloc( /* now we can just get the buffer (there's nothing to read yet) */ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, dqp->q_blkno, - XFS_QI_DQCHUNKLEN(mp), + mp->m_quotainfo->qi_dqchunklen, 0); if (!bp || (error = XFS_BUF_GETERROR(bp))) goto error1; @@ -500,7 +501,8 @@ xfs_qm_dqtobp( */ if (dqp->q_blkno == (xfs_daddr_t) 0) { /* We use the id as an index */ - dqp->q_fileoffset = (xfs_fileoff_t)id / XFS_QM_DQPERBLK(mp); + dqp->q_fileoffset = (xfs_fileoff_t)id / + mp->m_quotainfo->qi_dqperchunk; nmaps = 1; quotip = XFS_DQ_TO_QIP(dqp); xfs_ilock(quotip, XFS_ILOCK_SHARED); @@ -529,7 +531,7 @@ xfs_qm_dqtobp( /* * offset of dquot in the (fixed sized) dquot chunk. */ - dqp->q_bufoffset = (id % XFS_QM_DQPERBLK(mp)) * + dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) * sizeof(xfs_dqblk_t); if (map.br_startblock == HOLESTARTBLOCK) { /* @@ -559,15 +561,13 @@ xfs_qm_dqtobp( * Read in the buffer, unless we've just done the allocation * (in which case we already have the buf). */ - if (! newdquot) { + if (!newdquot) { trace_xfs_dqtobp_read(dqp); - if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - dqp->q_blkno, - XFS_QI_DQCHUNKLEN(mp), - 0, &bp))) { - return (error); - } + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, + 0, &bp); if (error || !bp) return XFS_ERROR(error); } @@ -689,14 +689,14 @@ xfs_qm_idtodq( tp = NULL; if (flags & XFS_QMOPT_DQALLOC) { tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); - if ((error = xfs_trans_reserve(tp, - XFS_QM_DQALLOC_SPACE_RES(mp), - XFS_WRITE_LOG_RES(mp) + - BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1 + - 128, - 0, - XFS_TRANS_PERM_LOG_RES, - XFS_WRITE_LOG_COUNT))) { + error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp), + XFS_WRITE_LOG_RES(mp) + + BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 + + 128, + 0, + XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT); + if (error) { cancelflags = 0; goto error0; } @@ -751,7 +751,6 @@ xfs_qm_dqlookup( { xfs_dquot_t *dqp; uint flist_locked; - xfs_dquot_t *d; ASSERT(mutex_is_locked(&qh->qh_lock)); @@ -760,7 +759,7 @@ xfs_qm_dqlookup( /* * Traverse the hashchain looking for a match */ - for (dqp = qh->qh_next; dqp != NULL; dqp = dqp->HL_NEXT) { + list_for_each_entry(dqp, &qh->qh_list, q_hashlist) { /* * We already have the hashlock. We don't need the * dqlock to look at the id field of the dquot, since the @@ -772,12 +771,12 @@ xfs_qm_dqlookup( /* * All in core dquots must be on the dqlist of mp */ - ASSERT(dqp->MPL_PREVP != NULL); + ASSERT(!list_empty(&dqp->q_mplist)); xfs_dqlock(dqp); if (dqp->q_nrefs == 0) { - ASSERT (XFS_DQ_IS_ON_FREELIST(dqp)); - if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) { + ASSERT(!list_empty(&dqp->q_freelist)); + if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) { trace_xfs_dqlookup_want(dqp); /* @@ -787,7 +786,7 @@ xfs_qm_dqlookup( */ dqp->dq_flags |= XFS_DQ_WANT; xfs_dqunlock(dqp); - xfs_qm_freelist_lock(xfs_Gqm); + mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); xfs_dqlock(dqp); dqp->dq_flags &= ~(XFS_DQ_WANT); } @@ -802,46 +801,28 @@ xfs_qm_dqlookup( if (flist_locked) { if (dqp->q_nrefs != 0) { - xfs_qm_freelist_unlock(xfs_Gqm); + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); flist_locked = B_FALSE; } else { - /* - * take it off the freelist - */ + /* take it off the freelist */ trace_xfs_dqlookup_freelist(dqp); - XQM_FREELIST_REMOVE(dqp); - /* xfs_qm_freelist_print(&(xfs_Gqm-> - qm_dqfreelist), - "after removal"); */ + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; } } - /* - * grab a reference - */ XFS_DQHOLD(dqp); if (flist_locked) - xfs_qm_freelist_unlock(xfs_Gqm); + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); /* * move the dquot to the front of the hashchain */ ASSERT(mutex_is_locked(&qh->qh_lock)); - if (dqp->HL_PREVP != &qh->qh_next) { - trace_xfs_dqlookup_move(dqp); - if ((d = dqp->HL_NEXT)) - d->HL_PREVP = dqp->HL_PREVP; - *(dqp->HL_PREVP) = d; - d = qh->qh_next; - d->HL_PREVP = &dqp->HL_NEXT; - dqp->HL_NEXT = d; - dqp->HL_PREVP = &qh->qh_next; - qh->qh_next = dqp; - } + list_move(&dqp->q_hashlist, &qh->qh_list); trace_xfs_dqlookup_done(dqp); *O_dqpp = dqp; - ASSERT(mutex_is_locked(&qh->qh_lock)); - return (0); + return 0; } } @@ -975,16 +956,17 @@ xfs_qm_dqget( */ if (ip) { xfs_ilock(ip, XFS_ILOCK_EXCL); - if (! XFS_IS_DQTYPE_ON(mp, type)) { - /* inode stays locked on return */ - xfs_qm_dqdestroy(dqp); - return XFS_ERROR(ESRCH); - } + /* * A dquot could be attached to this inode by now, since * we had dropped the ilock. */ if (type == XFS_DQ_USER) { + if (!XFS_IS_UQUOTA_ON(mp)) { + /* inode stays locked on return */ + xfs_qm_dqdestroy(dqp); + return XFS_ERROR(ESRCH); + } if (ip->i_udquot) { xfs_qm_dqdestroy(dqp); dqp = ip->i_udquot; @@ -992,6 +974,11 @@ xfs_qm_dqget( goto dqret; } } else { + if (!XFS_IS_OQUOTA_ON(mp)) { + /* inode stays locked on return */ + xfs_qm_dqdestroy(dqp); + return XFS_ERROR(ESRCH); + } if (ip->i_gdquot) { xfs_qm_dqdestroy(dqp); dqp = ip->i_gdquot; @@ -1033,13 +1020,14 @@ xfs_qm_dqget( */ ASSERT(mutex_is_locked(&h->qh_lock)); dqp->q_hash = h; - XQM_HASHLIST_INSERT(h, dqp); + list_add(&dqp->q_hashlist, &h->qh_list); + h->qh_version++; /* * Attach this dquot to this filesystem's list of all dquots, * kept inside the mount structure in m_quotainfo field */ - xfs_qm_mplist_lock(mp); + mutex_lock(&mp->m_quotainfo->qi_dqlist_lock); /* * We return a locked dquot to the caller, with a reference taken @@ -1047,9 +1035,9 @@ xfs_qm_dqget( xfs_dqlock(dqp); dqp->q_nrefs = 1; - XQM_MPLIST_INSERT(&(XFS_QI_MPL_LIST(mp)), dqp); - - xfs_qm_mplist_unlock(mp); + list_add(&dqp->q_mplist, &mp->m_quotainfo->qi_dqlist); + mp->m_quotainfo->qi_dquots++; + mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); mutex_unlock(&h->qh_lock); dqret: ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -1086,10 +1074,10 @@ xfs_qm_dqput( * drop the dqlock and acquire the freelist and dqlock * in the right order; but try to get it out-of-order first */ - if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) { + if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) { trace_xfs_dqput_wait(dqp); xfs_dqunlock(dqp); - xfs_qm_freelist_lock(xfs_Gqm); + mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); xfs_dqlock(dqp); } @@ -1100,10 +1088,8 @@ xfs_qm_dqput( if (--dqp->q_nrefs == 0) { trace_xfs_dqput_free(dqp); - /* - * insert at end of the freelist. - */ - XQM_FREELIST_INSERT(&(xfs_Gqm->qm_dqfreelist), dqp); + list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist); + xfs_Gqm->qm_dqfrlist_cnt++; /* * If we just added a udquot to the freelist, then @@ -1118,10 +1104,6 @@ xfs_qm_dqput( xfs_dqlock(gdqp); dqp->q_gdquot = NULL; } - - /* xfs_qm_freelist_print(&(xfs_Gqm->qm_dqfreelist), - "@@@@@++ Free list (after append) @@@@@+"); - */ } xfs_dqunlock(dqp); @@ -1133,7 +1115,7 @@ xfs_qm_dqput( break; dqp = gdqp; } - xfs_qm_freelist_unlock(xfs_Gqm); + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); } /* @@ -1187,7 +1169,7 @@ xfs_qm_dqflush( * block, nada. */ if (!XFS_DQ_IS_DIRTY(dqp) || - (!(flags & XFS_QMOPT_SYNC) && atomic_read(&dqp->q_pincount) > 0)) { + (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) { xfs_dqfunlock(dqp); return 0; } @@ -1248,23 +1230,20 @@ xfs_qm_dqflush( */ if (XFS_BUF_ISPINNED(bp)) { trace_xfs_dqflush_force(dqp); - xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); + xfs_log_force(mp, 0); } - if (flags & XFS_QMOPT_DELWRI) { - xfs_bdwrite(mp, bp); - } else if (flags & XFS_QMOPT_ASYNC) { - error = xfs_bawrite(mp, bp); - } else { + if (flags & SYNC_WAIT) error = xfs_bwrite(mp, bp); - } + else + xfs_bdwrite(mp, bp); trace_xfs_dqflush_done(dqp); /* * dqp is still locked, but caller is free to unlock it now. */ - return (error); + return error; } @@ -1389,10 +1368,10 @@ int xfs_qm_dqpurge( xfs_dquot_t *dqp) { - xfs_dqhash_t *thishash; + xfs_dqhash_t *qh = dqp->q_hash; xfs_mount_t *mp = dqp->q_mount; - ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); + ASSERT(mutex_is_locked(&mp->m_quotainfo->qi_dqlist_lock)); ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock)); xfs_dqlock(dqp); @@ -1410,7 +1389,7 @@ xfs_qm_dqpurge( return (1); } - ASSERT(XFS_DQ_IS_ON_FREELIST(dqp)); + ASSERT(!list_empty(&dqp->q_freelist)); /* * If we're turning off quotas, we have to make sure that, for @@ -1445,7 +1424,7 @@ xfs_qm_dqpurge( * We don't care about getting disk errors here. We need * to purge this dquot anyway, so we go ahead regardless. */ - error = xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC); + error = xfs_qm_dqflush(dqp, SYNC_WAIT); if (error) xfs_fs_cmn_err(CE_WARN, mp, "xfs_qm_dqpurge: dquot %p flush failed", dqp); @@ -1455,14 +1434,16 @@ xfs_qm_dqpurge( ASSERT(XFS_FORCED_SHUTDOWN(mp) || !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL)); - thishash = dqp->q_hash; - XQM_HASHLIST_REMOVE(thishash, dqp); - XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(mp)), dqp); + list_del_init(&dqp->q_hashlist); + qh->qh_version++; + list_del_init(&dqp->q_mplist); + mp->m_quotainfo->qi_dqreclaims++; + mp->m_quotainfo->qi_dquots--; /* * XXX Move this to the front of the freelist, if we can get the * freelist lock. */ - ASSERT(XFS_DQ_IS_ON_FREELIST(dqp)); + ASSERT(!list_empty(&dqp->q_freelist)); dqp->q_mount = NULL; dqp->q_hash = NULL; @@ -1470,7 +1451,7 @@ xfs_qm_dqpurge( memset(&dqp->q_core, 0, sizeof(dqp->q_core)); xfs_dqfunlock(dqp); xfs_dqunlock(dqp); - mutex_unlock(&thishash->qh_lock); + mutex_unlock(&qh->qh_lock); return (0); } @@ -1520,6 +1501,7 @@ void xfs_qm_dqflock_pushbuf_wait( xfs_dquot_t *dqp) { + xfs_mount_t *mp = dqp->q_mount; xfs_buf_t *bp; /* @@ -1528,26 +1510,18 @@ xfs_qm_dqflock_pushbuf_wait( * out immediately. We'll be able to acquire * the flush lock when the I/O completes. */ - bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno, - XFS_QI_DQCHUNKLEN(dqp->q_mount), - XFS_INCORE_TRYLOCK); - if (bp != NULL) { - if (XFS_BUF_ISDELAYWRITE(bp)) { - int error; - if (XFS_BUF_ISPINNED(bp)) { - xfs_log_force(dqp->q_mount, - (xfs_lsn_t)0, - XFS_LOG_FORCE); - } - error = xfs_bawrite(dqp->q_mount, bp); - if (error) - xfs_fs_cmn_err(CE_WARN, dqp->q_mount, - "xfs_qm_dqflock_pushbuf_wait: " - "pushbuf error %d on dqp %p, bp %p", - error, dqp, bp); - } else { - xfs_buf_relse(bp); - } + bp = xfs_incore(mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK); + if (!bp) + goto out_lock; + + if (XFS_BUF_ISDELAYWRITE(bp)) { + if (XFS_BUF_ISPINNED(bp)) + xfs_log_force(mp, 0); + xfs_buf_delwri_promote(bp); + wake_up_process(bp->b_target->bt_task); } + xfs_buf_relse(bp); +out_lock: xfs_dqflock(dqp); } diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h index a0f7da586d1b..5da3a23b820d 100644 --- a/fs/xfs/quota/xfs_dquot.h +++ b/fs/xfs/quota/xfs_dquot.h @@ -33,40 +33,23 @@ * The hash chain headers (hash buckets) */ typedef struct xfs_dqhash { - struct xfs_dquot *qh_next; + struct list_head qh_list; struct mutex qh_lock; uint qh_version; /* ever increasing version */ uint qh_nelems; /* number of dquots on the list */ } xfs_dqhash_t; -typedef struct xfs_dqlink { - struct xfs_dquot *ql_next; /* forward link */ - struct xfs_dquot **ql_prevp; /* pointer to prev ql_next */ -} xfs_dqlink_t; - struct xfs_mount; struct xfs_trans; /* - * This is the marker which is designed to occupy the first few - * bytes of the xfs_dquot_t structure. Even inside this, the freelist pointers - * must come first. - * This serves as the marker ("sentinel") when we have to restart list - * iterations because of locking considerations. - */ -typedef struct xfs_dqmarker { - struct xfs_dquot*dqm_flnext; /* link to freelist: must be first */ - struct xfs_dquot*dqm_flprev; - xfs_dqlink_t dqm_mplist; /* link to mount's list of dquots */ - xfs_dqlink_t dqm_hashlist; /* link to the hash chain */ - uint dqm_flags; /* various flags (XFS_DQ_*) */ -} xfs_dqmarker_t; - -/* * The incore dquot structure */ typedef struct xfs_dquot { - xfs_dqmarker_t q_lists; /* list ptrs, q_flags (marker) */ + uint dq_flags; /* various flags (XFS_DQ_*) */ + struct list_head q_freelist; /* global free list of dquots */ + struct list_head q_mplist; /* mount's list of dquots */ + struct list_head q_hashlist; /* gloabl hash list of dquots */ xfs_dqhash_t *q_hash; /* the hashchain header */ struct xfs_mount*q_mount; /* filesystem this relates to */ struct xfs_trans*q_transp; /* trans this belongs to currently */ @@ -87,13 +70,6 @@ typedef struct xfs_dquot { wait_queue_head_t q_pinwait; /* dquot pinning wait queue */ } xfs_dquot_t; - -#define dq_flnext q_lists.dqm_flnext -#define dq_flprev q_lists.dqm_flprev -#define dq_mplist q_lists.dqm_mplist -#define dq_hashlist q_lists.dqm_hashlist -#define dq_flags q_lists.dqm_flags - /* * Lock hierarchy for q_qlock: * XFS_QLOCK_NORMAL is the implicit default, @@ -127,7 +103,6 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp) } #define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock))) -#define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp)) #define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) #define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) #define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ) diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c index d0d4a9a0bbd7..8d89a24ae324 100644 --- a/fs/xfs/quota/xfs_dquot_item.c +++ b/fs/xfs/quota/xfs_dquot_item.c @@ -74,11 +74,11 @@ xfs_qm_dquot_logitem_format( logvec->i_addr = (xfs_caddr_t)&logitem->qli_format; logvec->i_len = sizeof(xfs_dq_logformat_t); - XLOG_VEC_SET_TYPE(logvec, XLOG_REG_TYPE_QFORMAT); + logvec->i_type = XLOG_REG_TYPE_QFORMAT; logvec++; logvec->i_addr = (xfs_caddr_t)&logitem->qli_dquot->q_core; logvec->i_len = sizeof(xfs_disk_dquot_t); - XLOG_VEC_SET_TYPE(logvec, XLOG_REG_TYPE_DQUOT); + logvec->i_type = XLOG_REG_TYPE_DQUOT; ASSERT(2 == logitem->qli_item.li_desc->lid_size); logitem->qli_format.qlf_size = 2; @@ -107,8 +107,7 @@ xfs_qm_dquot_logitem_pin( /* ARGSUSED */ STATIC void xfs_qm_dquot_logitem_unpin( - xfs_dq_logitem_t *logitem, - int stale) + xfs_dq_logitem_t *logitem) { xfs_dquot_t *dqp = logitem->qli_dquot; @@ -123,7 +122,7 @@ xfs_qm_dquot_logitem_unpin_remove( xfs_dq_logitem_t *logitem, xfs_trans_t *tp) { - xfs_qm_dquot_logitem_unpin(logitem, 0); + xfs_qm_dquot_logitem_unpin(logitem); } /* @@ -153,7 +152,7 @@ xfs_qm_dquot_logitem_push( * lock without sleeping, then there must not have been * anyone in the process of flushing the dquot. */ - error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); + error = xfs_qm_dqflush(dqp, 0); if (error) xfs_fs_cmn_err(CE_WARN, dqp->q_mount, "xfs_qm_dquot_logitem_push: push error %d on dqp %p", @@ -190,7 +189,7 @@ xfs_qm_dqunpin_wait( /* * Give the log a push so we don't wait here too long. */ - xfs_log_force(dqp->q_mount, (xfs_lsn_t)0, XFS_LOG_FORCE); + xfs_log_force(dqp->q_mount, 0); wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0)); } @@ -212,68 +211,31 @@ xfs_qm_dquot_logitem_pushbuf( xfs_dquot_t *dqp; xfs_mount_t *mp; xfs_buf_t *bp; - uint dopush; dqp = qip->qli_dquot; ASSERT(XFS_DQ_IS_LOCKED(dqp)); /* - * The qli_pushbuf_flag keeps others from - * trying to duplicate our effort. - */ - ASSERT(qip->qli_pushbuf_flag != 0); - ASSERT(qip->qli_push_owner == current_pid()); - - /* * If flushlock isn't locked anymore, chances are that the * inode flush completed and the inode was taken off the AIL. * So, just get out. */ if (completion_done(&dqp->q_flush) || ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) { - qip->qli_pushbuf_flag = 0; xfs_dqunlock(dqp); return; } mp = dqp->q_mount; bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno, - XFS_QI_DQCHUNKLEN(mp), - XFS_INCORE_TRYLOCK); - if (bp != NULL) { - if (XFS_BUF_ISDELAYWRITE(bp)) { - dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) && - !completion_done(&dqp->q_flush)); - qip->qli_pushbuf_flag = 0; - xfs_dqunlock(dqp); - - if (XFS_BUF_ISPINNED(bp)) { - xfs_log_force(mp, (xfs_lsn_t)0, - XFS_LOG_FORCE); - } - if (dopush) { - int error; -#ifdef XFSRACEDEBUG - delay_for_intr(); - delay(300); -#endif - error = xfs_bawrite(mp, bp); - if (error) - xfs_fs_cmn_err(CE_WARN, mp, - "xfs_qm_dquot_logitem_pushbuf: pushbuf error %d on qip %p, bp %p", - error, qip, bp); - } else { - xfs_buf_relse(bp); - } - } else { - qip->qli_pushbuf_flag = 0; - xfs_dqunlock(dqp); - xfs_buf_relse(bp); - } + mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK); + xfs_dqunlock(dqp); + if (!bp) return; - } + if (XFS_BUF_ISDELAYWRITE(bp)) + xfs_buf_delwri_promote(bp); + xfs_buf_relse(bp); + return; - qip->qli_pushbuf_flag = 0; - xfs_dqunlock(dqp); } /* @@ -291,50 +253,24 @@ xfs_qm_dquot_logitem_trylock( xfs_dq_logitem_t *qip) { xfs_dquot_t *dqp; - uint retval; dqp = qip->qli_dquot; if (atomic_read(&dqp->q_pincount) > 0) - return (XFS_ITEM_PINNED); + return XFS_ITEM_PINNED; if (! xfs_qm_dqlock_nowait(dqp)) - return (XFS_ITEM_LOCKED); + return XFS_ITEM_LOCKED; - retval = XFS_ITEM_SUCCESS; if (!xfs_dqflock_nowait(dqp)) { /* - * The dquot is already being flushed. It may have been - * flushed delayed write, however, and we don't want to - * get stuck waiting for that to complete. So, we want to check - * to see if we can lock the dquot's buffer without sleeping. - * If we can and it is marked for delayed write, then we - * hold it and send it out from the push routine. We don't - * want to do that now since we might sleep in the device - * strategy routine. We also don't want to grab the buffer lock - * here because we'd like not to call into the buffer cache - * while holding the AIL lock. - * Make sure to only return PUSHBUF if we set pushbuf_flag - * ourselves. If someone else is doing it then we don't - * want to go to the push routine and duplicate their efforts. + * dquot has already been flushed to the backing buffer, + * leave it locked, pushbuf routine will unlock it. */ - if (qip->qli_pushbuf_flag == 0) { - qip->qli_pushbuf_flag = 1; - ASSERT(qip->qli_format.qlf_blkno == dqp->q_blkno); -#ifdef DEBUG - qip->qli_push_owner = current_pid(); -#endif - /* - * The dquot is left locked. - */ - retval = XFS_ITEM_PUSHBUF; - } else { - retval = XFS_ITEM_FLUSHING; - xfs_dqunlock_nonotify(dqp); - } + return XFS_ITEM_PUSHBUF; } ASSERT(qip->qli_item.li_flags & XFS_LI_IN_AIL); - return (retval); + return XFS_ITEM_SUCCESS; } @@ -392,8 +328,7 @@ static struct xfs_item_ops xfs_dquot_item_ops = { .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_qm_dquot_logitem_format, .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_pin, - .iop_unpin = (void(*)(xfs_log_item_t*, int)) - xfs_qm_dquot_logitem_unpin, + .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unpin, .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) xfs_qm_dquot_logitem_unpin_remove, .iop_trylock = (uint(*)(xfs_log_item_t*)) @@ -420,9 +355,8 @@ xfs_qm_dquot_logitem_init( xfs_dq_logitem_t *lp; lp = &dqp->q_logitem; - lp->qli_item.li_type = XFS_LI_DQUOT; - lp->qli_item.li_ops = &xfs_dquot_item_ops; - lp->qli_item.li_mountp = dqp->q_mount; + xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT, + &xfs_dquot_item_ops); lp->qli_dquot = dqp; lp->qli_format.qlf_type = XFS_LI_DQUOT; lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id); @@ -467,7 +401,7 @@ xfs_qm_qoff_logitem_format(xfs_qoff_logitem_t *qf, log_vector->i_addr = (xfs_caddr_t)&(qf->qql_format); log_vector->i_len = sizeof(xfs_qoff_logitem_t); - XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_QUOTAOFF); + log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF; qf->qql_format.qf_size = 1; } @@ -489,7 +423,7 @@ xfs_qm_qoff_logitem_pin(xfs_qoff_logitem_t *qf) */ /*ARGSUSED*/ STATIC void -xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf, int stale) +xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf) { return; } @@ -600,8 +534,7 @@ static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_qm_qoff_logitem_format, .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin, - .iop_unpin = (void(*)(xfs_log_item_t* ,int)) - xfs_qm_qoff_logitem_unpin, + .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin, .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*)) xfs_qm_qoff_logitem_unpin_remove, .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock, @@ -622,8 +555,7 @@ static struct xfs_item_ops xfs_qm_qoff_logitem_ops = { .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_qm_qoff_logitem_format, .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin, - .iop_unpin = (void(*)(xfs_log_item_t*, int)) - xfs_qm_qoff_logitem_unpin, + .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin, .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*)) xfs_qm_qoff_logitem_unpin_remove, .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock, @@ -649,11 +581,8 @@ xfs_qm_qoff_logitem_init( qf = (xfs_qoff_logitem_t*) kmem_zalloc(sizeof(xfs_qoff_logitem_t), KM_SLEEP); - qf->qql_item.li_type = XFS_LI_QUOTAOFF; - if (start) - qf->qql_item.li_ops = &xfs_qm_qoffend_logitem_ops; - else - qf->qql_item.li_ops = &xfs_qm_qoff_logitem_ops; + xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ? + &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops); qf->qql_item.li_mountp = mp; qf->qql_format.qf_type = XFS_LI_QUOTAOFF; qf->qql_format.qf_flags = flags; diff --git a/fs/xfs/quota/xfs_dquot_item.h b/fs/xfs/quota/xfs_dquot_item.h index 5a632531f843..5acae2ada70b 100644 --- a/fs/xfs/quota/xfs_dquot_item.h +++ b/fs/xfs/quota/xfs_dquot_item.h @@ -27,10 +27,6 @@ typedef struct xfs_dq_logitem { xfs_log_item_t qli_item; /* common portion */ struct xfs_dquot *qli_dquot; /* dquot ptr */ xfs_lsn_t qli_flush_lsn; /* lsn at last flush */ - unsigned short qli_pushbuf_flag; /* 1 bit used in push_ail */ -#ifdef DEBUG - uint64_t qli_push_owner; -#endif xfs_dq_logformat_t qli_format; /* logged structure */ } xfs_dq_logitem_t; diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 9e627a8b5b0e..67c018392d62 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -67,12 +67,9 @@ static cred_t xfs_zerocr; STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int); STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); -STATIC void xfs_qm_freelist_init(xfs_frlist_t *); -STATIC void xfs_qm_freelist_destroy(xfs_frlist_t *); - STATIC int xfs_qm_init_quotainos(xfs_mount_t *); STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); -STATIC int xfs_qm_shake(int, gfp_t); +STATIC int xfs_qm_shake(struct shrinker *, int, gfp_t); static struct shrinker xfs_qm_shaker = { .shrink = xfs_qm_shake, @@ -84,21 +81,25 @@ extern struct mutex qcheck_lock; #endif #ifdef QUOTADEBUG -#define XQM_LIST_PRINT(l, NXT, title) \ -{ \ - xfs_dquot_t *dqp; int i = 0; \ - cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \ - for (dqp = (l)->qh_next; dqp != NULL; dqp = dqp->NXT) { \ - cmn_err(CE_DEBUG, " %d. \"%d (%s)\" " \ - "bcnt = %d, icnt = %d, refs = %d", \ - ++i, (int) be32_to_cpu(dqp->q_core.d_id), \ - DQFLAGTO_TYPESTR(dqp), \ - (int) be64_to_cpu(dqp->q_core.d_bcount), \ - (int) be64_to_cpu(dqp->q_core.d_icount), \ - (int) dqp->q_nrefs); } \ +static void +xfs_qm_dquot_list_print( + struct xfs_mount *mp) +{ + xfs_dquot_t *dqp; + int i = 0; + + list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist_lock, qi_mplist) { + cmn_err(CE_DEBUG, " %d. \"%d (%s)\" " + "bcnt = %lld, icnt = %lld, refs = %d", + i++, be32_to_cpu(dqp->q_core.d_id), + DQFLAGTO_TYPESTR(dqp), + (long long)be64_to_cpu(dqp->q_core.d_bcount), + (long long)be64_to_cpu(dqp->q_core.d_icount), + dqp->q_nrefs); + } } #else -#define XQM_LIST_PRINT(l, NXT, title) do { } while (0) +static void xfs_qm_dquot_list_print(struct xfs_mount *mp) { } #endif /* @@ -118,9 +119,14 @@ xfs_Gqm_init(void) */ udqhash = kmem_zalloc_greedy(&hsize, XFS_QM_HASHSIZE_LOW * sizeof(xfs_dqhash_t), - XFS_QM_HASHSIZE_HIGH * sizeof(xfs_dqhash_t), - KM_SLEEP | KM_MAYFAIL | KM_LARGE); - gdqhash = kmem_zalloc(hsize, KM_SLEEP | KM_LARGE); + XFS_QM_HASHSIZE_HIGH * sizeof(xfs_dqhash_t)); + if (!udqhash) + goto out; + + gdqhash = kmem_zalloc_large(hsize); + if (!gdqhash) + goto out_free_udqhash; + hsize /= sizeof(xfs_dqhash_t); ndquot = hsize << 8; @@ -139,7 +145,9 @@ xfs_Gqm_init(void) /* * Freelist of all dquots of all file systems */ - xfs_qm_freelist_init(&(xqm->qm_dqfreelist)); + INIT_LIST_HEAD(&xqm->qm_dqfrlist); + xqm->qm_dqfrlist_cnt = 0; + mutex_init(&xqm->qm_dqfrlist_lock); /* * dquot zone. we register our own low-memory callback. @@ -170,6 +178,11 @@ xfs_Gqm_init(void) mutex_init(&qcheck_lock); #endif return xqm; + + out_free_udqhash: + kmem_free_large(udqhash); + out: + return NULL; } /* @@ -179,6 +192,7 @@ STATIC void xfs_qm_destroy( struct xfs_qm *xqm) { + struct xfs_dquot *dqp, *n; int hsize, i; ASSERT(xqm != NULL); @@ -189,12 +203,26 @@ xfs_qm_destroy( xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i])); xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i])); } - kmem_free(xqm->qm_usr_dqhtable); - kmem_free(xqm->qm_grp_dqhtable); + kmem_free_large(xqm->qm_usr_dqhtable); + kmem_free_large(xqm->qm_grp_dqhtable); xqm->qm_usr_dqhtable = NULL; xqm->qm_grp_dqhtable = NULL; xqm->qm_dqhashmask = 0; - xfs_qm_freelist_destroy(&(xqm->qm_dqfreelist)); + + /* frlist cleanup */ + mutex_lock(&xqm->qm_dqfrlist_lock); + list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) { + xfs_dqlock(dqp); +#ifdef QUOTADEBUG + cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp); +#endif + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; + xfs_dqunlock(dqp); + xfs_qm_dqdestroy(dqp); + } + mutex_unlock(&xqm->qm_dqfrlist_lock); + mutex_destroy(&xqm->qm_dqfrlist_lock); #ifdef DEBUG mutex_destroy(&qcheck_lock); #endif @@ -219,8 +247,14 @@ xfs_qm_hold_quotafs_ref( */ mutex_lock(&xfs_Gqm_lock); - if (xfs_Gqm == NULL) + if (!xfs_Gqm) { xfs_Gqm = xfs_Gqm_init(); + if (!xfs_Gqm) { + mutex_unlock(&xfs_Gqm_lock); + return ENOMEM; + } + } + /* * We can keep a list of all filesystems with quotas mounted for * debugging and statistical purposes, but ... @@ -242,7 +276,7 @@ STATIC void xfs_qm_rele_quotafs_ref( struct xfs_mount *mp) { - xfs_dquot_t *dqp, *nextdqp; + xfs_dquot_t *dqp, *n; ASSERT(xfs_Gqm); ASSERT(xfs_Gqm->qm_nrefs > 0); @@ -250,26 +284,24 @@ xfs_qm_rele_quotafs_ref( /* * Go thru the freelist and destroy all inactive dquots. */ - xfs_qm_freelist_lock(xfs_Gqm); + mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); - for (dqp = xfs_Gqm->qm_dqfreelist.qh_next; - dqp != (xfs_dquot_t *)&(xfs_Gqm->qm_dqfreelist); ) { + list_for_each_entry_safe(dqp, n, &xfs_Gqm->qm_dqfrlist, q_freelist) { xfs_dqlock(dqp); - nextdqp = dqp->dq_flnext; if (dqp->dq_flags & XFS_DQ_INACTIVE) { ASSERT(dqp->q_mount == NULL); ASSERT(! XFS_DQ_IS_DIRTY(dqp)); - ASSERT(dqp->HL_PREVP == NULL); - ASSERT(dqp->MPL_PREVP == NULL); - XQM_FREELIST_REMOVE(dqp); + ASSERT(list_empty(&dqp->q_hashlist)); + ASSERT(list_empty(&dqp->q_mplist)); + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; xfs_dqunlock(dqp); xfs_qm_dqdestroy(dqp); } else { xfs_dqunlock(dqp); } - dqp = nextdqp; } - xfs_qm_freelist_unlock(xfs_Gqm); + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); /* * Destroy the entire XQM. If somebody mounts with quotaon, this'll @@ -291,7 +323,7 @@ xfs_qm_unmount( struct xfs_mount *mp) { if (mp->m_quotainfo) { - xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING); + xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL); xfs_qm_destroy_quotainfo(mp); } } @@ -435,20 +467,21 @@ xfs_qm_unmount_quotas( */ STATIC int xfs_qm_dqflush_all( - xfs_mount_t *mp, - int flags) + struct xfs_mount *mp, + int sync_mode) { - int recl; - xfs_dquot_t *dqp; - int niters; - int error; + struct xfs_quotainfo *q = mp->m_quotainfo; + int recl; + struct xfs_dquot *dqp; + int niters; + int error; - if (mp->m_quotainfo == NULL) + if (!q) return 0; niters = 0; again: - xfs_qm_mplist_lock(mp); - FOREACH_DQUOT_IN_MP(dqp, mp) { + mutex_lock(&q->qi_dqlist_lock); + list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { xfs_dqlock(dqp); if (! XFS_DQ_IS_DIRTY(dqp)) { xfs_dqunlock(dqp); @@ -456,7 +489,7 @@ again: } /* XXX a sentinel would be better */ - recl = XFS_QI_MPLRECLAIMS(mp); + recl = q->qi_dqreclaims; if (!xfs_dqflock_nowait(dqp)) { /* * If we can't grab the flush lock then check @@ -471,21 +504,21 @@ again: * Let go of the mplist lock. We don't want to hold it * across a disk write. */ - xfs_qm_mplist_unlock(mp); - error = xfs_qm_dqflush(dqp, flags); + mutex_unlock(&q->qi_dqlist_lock); + error = xfs_qm_dqflush(dqp, sync_mode); xfs_dqunlock(dqp); if (error) return error; - xfs_qm_mplist_lock(mp); - if (recl != XFS_QI_MPLRECLAIMS(mp)) { - xfs_qm_mplist_unlock(mp); + mutex_lock(&q->qi_dqlist_lock); + if (recl != q->qi_dqreclaims) { + mutex_unlock(&q->qi_dqlist_lock); /* XXX restart limit */ goto again; } } - xfs_qm_mplist_unlock(mp); + mutex_unlock(&q->qi_dqlist_lock); /* return ! busy */ return 0; } @@ -495,15 +528,15 @@ again: */ STATIC void xfs_qm_detach_gdquots( - xfs_mount_t *mp) + struct xfs_mount *mp) { - xfs_dquot_t *dqp, *gdqp; - int nrecl; + struct xfs_quotainfo *q = mp->m_quotainfo; + struct xfs_dquot *dqp, *gdqp; + int nrecl; again: - ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); - dqp = XFS_QI_MPLNEXT(mp); - while (dqp) { + ASSERT(mutex_is_locked(&q->qi_dqlist_lock)); + list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { xfs_dqlock(dqp); if ((gdqp = dqp->q_gdquot)) { xfs_dqlock(gdqp); @@ -516,15 +549,14 @@ xfs_qm_detach_gdquots( * Can't hold the mplist lock across a dqput. * XXXmust convert to marker based iterations here. */ - nrecl = XFS_QI_MPLRECLAIMS(mp); - xfs_qm_mplist_unlock(mp); + nrecl = q->qi_dqreclaims; + mutex_unlock(&q->qi_dqlist_lock); xfs_qm_dqput(gdqp); - xfs_qm_mplist_lock(mp); - if (nrecl != XFS_QI_MPLRECLAIMS(mp)) + mutex_lock(&q->qi_dqlist_lock); + if (nrecl != q->qi_dqreclaims) goto again; } - dqp = dqp->MPL_NEXT; } } @@ -536,23 +568,23 @@ xfs_qm_detach_gdquots( */ STATIC int xfs_qm_dqpurge_int( - xfs_mount_t *mp, - uint flags) /* QUOTAOFF/UMOUNTING/UQUOTA/PQUOTA/GQUOTA */ + struct xfs_mount *mp, + uint flags) { - xfs_dquot_t *dqp; - uint dqtype; - int nrecl; - xfs_dquot_t *nextdqp; - int nmisses; + struct xfs_quotainfo *q = mp->m_quotainfo; + struct xfs_dquot *dqp, *n; + uint dqtype; + int nrecl; + int nmisses; - if (mp->m_quotainfo == NULL) + if (!q) return 0; dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0; dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0; dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0; - xfs_qm_mplist_lock(mp); + mutex_lock(&q->qi_dqlist_lock); /* * In the first pass through all incore dquots of this filesystem, @@ -564,28 +596,25 @@ xfs_qm_dqpurge_int( again: nmisses = 0; - ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); + ASSERT(mutex_is_locked(&q->qi_dqlist_lock)); /* * Try to get rid of all of the unwanted dquots. The idea is to * get them off mplist and hashlist, but leave them on freelist. */ - dqp = XFS_QI_MPLNEXT(mp); - while (dqp) { + list_for_each_entry_safe(dqp, n, &q->qi_dqlist, q_mplist) { /* * It's OK to look at the type without taking dqlock here. * We're holding the mplist lock here, and that's needed for * a dqreclaim. */ - if ((dqp->dq_flags & dqtype) == 0) { - dqp = dqp->MPL_NEXT; + if ((dqp->dq_flags & dqtype) == 0) continue; - } if (!mutex_trylock(&dqp->q_hash->qh_lock)) { - nrecl = XFS_QI_MPLRECLAIMS(mp); - xfs_qm_mplist_unlock(mp); + nrecl = q->qi_dqreclaims; + mutex_unlock(&q->qi_dqlist_lock); mutex_lock(&dqp->q_hash->qh_lock); - xfs_qm_mplist_lock(mp); + mutex_lock(&q->qi_dqlist_lock); /* * XXXTheoretically, we can get into a very long @@ -593,7 +622,7 @@ xfs_qm_dqpurge_int( * No one can be adding dquots to the mplist at * this point, but somebody might be taking things off. */ - if (nrecl != XFS_QI_MPLRECLAIMS(mp)) { + if (nrecl != q->qi_dqreclaims) { mutex_unlock(&dqp->q_hash->qh_lock); goto again; } @@ -603,11 +632,9 @@ xfs_qm_dqpurge_int( * Take the dquot off the mplist and hashlist. It may remain on * freelist in INACTIVE state. */ - nextdqp = dqp->MPL_NEXT; nmisses += xfs_qm_dqpurge(dqp); - dqp = nextdqp; } - xfs_qm_mplist_unlock(mp); + mutex_unlock(&q->qi_dqlist_lock); return nmisses; } @@ -907,33 +934,33 @@ xfs_qm_dqdetach( int xfs_qm_sync( - xfs_mount_t *mp, - int flags) + struct xfs_mount *mp, + int flags) { - int recl, restarts; - xfs_dquot_t *dqp; - uint flush_flags; - int error; + struct xfs_quotainfo *q = mp->m_quotainfo; + int recl, restarts; + struct xfs_dquot *dqp; + int error; if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) return 0; - flush_flags = (flags & SYNC_WAIT) ? XFS_QMOPT_SYNC : XFS_QMOPT_DELWRI; restarts = 0; again: - xfs_qm_mplist_lock(mp); + mutex_lock(&q->qi_dqlist_lock); /* * dqpurge_all() also takes the mplist lock and iterate thru all dquots * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared * when we have the mplist lock, we know that dquots will be consistent * as long as we have it locked. */ - if (! XFS_IS_QUOTA_ON(mp)) { - xfs_qm_mplist_unlock(mp); + if (!XFS_IS_QUOTA_ON(mp)) { + mutex_unlock(&q->qi_dqlist_lock); return 0; } - FOREACH_DQUOT_IN_MP(dqp, mp) { + ASSERT(mutex_is_locked(&q->qi_dqlist_lock)); + list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { /* * If this is vfs_sync calling, then skip the dquots that * don't 'seem' to be dirty. ie. don't acquire dqlock. @@ -957,7 +984,7 @@ xfs_qm_sync( } /* XXX a sentinel would be better */ - recl = XFS_QI_MPLRECLAIMS(mp); + recl = q->qi_dqreclaims; if (!xfs_dqflock_nowait(dqp)) { if (flags & SYNC_TRYLOCK) { xfs_dqunlock(dqp); @@ -977,25 +1004,25 @@ xfs_qm_sync( * Let go of the mplist lock. We don't want to hold it * across a disk write */ - xfs_qm_mplist_unlock(mp); - error = xfs_qm_dqflush(dqp, flush_flags); + mutex_unlock(&q->qi_dqlist_lock); + error = xfs_qm_dqflush(dqp, flags); xfs_dqunlock(dqp); if (error && XFS_FORCED_SHUTDOWN(mp)) return 0; /* Need to prevent umount failure */ else if (error) return error; - xfs_qm_mplist_lock(mp); - if (recl != XFS_QI_MPLRECLAIMS(mp)) { + mutex_lock(&q->qi_dqlist_lock); + if (recl != q->qi_dqreclaims) { if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS) break; - xfs_qm_mplist_unlock(mp); + mutex_unlock(&q->qi_dqlist_lock); goto again; } } - xfs_qm_mplist_unlock(mp); + mutex_unlock(&q->qi_dqlist_lock); return 0; } @@ -1040,8 +1067,9 @@ xfs_qm_init_quotainfo( return error; } - xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0); - lockdep_set_class(&qinf->qi_dqlist.qh_lock, &xfs_quota_mplist_class); + INIT_LIST_HEAD(&qinf->qi_dqlist); + mutex_init(&qinf->qi_dqlist_lock); + lockdep_set_class(&qinf->qi_dqlist_lock, &xfs_quota_mplist_class); qinf->qi_dqreclaims = 0; @@ -1138,7 +1166,8 @@ xfs_qm_destroy_quotainfo( */ xfs_qm_rele_quotafs_ref(mp); - xfs_qm_list_destroy(&qi->qi_dqlist); + ASSERT(list_empty(&qi->qi_dqlist)); + mutex_destroy(&qi->qi_dqlist_lock); if (qi->qi_uquotaip) { IRELE(qi->qi_uquotaip); @@ -1165,7 +1194,7 @@ xfs_qm_list_init( int n) { mutex_init(&list->qh_lock); - list->qh_next = NULL; + INIT_LIST_HEAD(&list->qh_list); list->qh_version = 0; list->qh_nelems = 0; } @@ -1304,9 +1333,6 @@ xfs_qm_qino_alloc( */ spin_lock(&mp->m_sb_lock); if (flags & XFS_QMOPT_SBVERSION) { -#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) - unsigned oldv = mp->m_sb.sb_versionnum; -#endif ASSERT(!xfs_sb_version_hasquota(&mp->m_sb)); ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) == @@ -1319,11 +1345,6 @@ xfs_qm_qino_alloc( /* qflags will get updated _after_ quotacheck */ mp->m_sb.sb_qflags = 0; -#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) - cmn_err(CE_NOTE, - "Old superblock version %x, converting to %x.", - oldv, mp->m_sb.sb_versionnum); -#endif } if (flags & XFS_QMOPT_UQUOTA) mp->m_sb.sb_uquotino = (*ip)->i_ino; @@ -1359,10 +1380,10 @@ xfs_qm_reset_dqcounts( #ifdef DEBUG j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); do_div(j, sizeof(xfs_dqblk_t)); - ASSERT(XFS_QM_DQPERBLK(mp) == j); + ASSERT(mp->m_quotainfo->qi_dqperchunk == j); #endif ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp); - for (j = 0; j < XFS_QM_DQPERBLK(mp); j++) { + for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) { /* * Do a sanity check, and if needed, repair the dqblk. Don't * output any warnings because it's perfectly possible to @@ -1417,7 +1438,7 @@ xfs_qm_dqiter_bufs( while (blkcnt--) { error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, bno), - (int)XFS_QI_DQCHUNKLEN(mp), 0, &bp); + mp->m_quotainfo->qi_dqchunklen, 0, &bp); if (error) break; @@ -1427,7 +1448,7 @@ xfs_qm_dqiter_bufs( * goto the next block. */ bno++; - firstid += XFS_QM_DQPERBLK(mp); + firstid += mp->m_quotainfo->qi_dqperchunk; } return error; } @@ -1493,7 +1514,7 @@ xfs_qm_dqiterate( continue; firstid = (xfs_dqid_t) map[i].br_startoff * - XFS_QM_DQPERBLK(mp); + mp->m_quotainfo->qi_dqperchunk; /* * Do a read-ahead on the next extent. */ @@ -1504,7 +1525,7 @@ xfs_qm_dqiterate( while (rablkcnt--) { xfs_baread(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, rablkno), - (int)XFS_QI_DQCHUNKLEN(mp)); + mp->m_quotainfo->qi_dqchunklen); rablkno++; } } @@ -1564,8 +1585,10 @@ xfs_qm_quotacheck_dqadjust( /* * Set default limits, adjust timers (since we changed usages) + * + * There are no timers for the default values set in the root dquot. */ - if (! XFS_IS_SUSER_DQUOT(dqp)) { + if (dqp->q_core.d_id) { xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core); xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core); } @@ -1609,10 +1632,7 @@ xfs_qm_dqusage_adjust( xfs_ino_t ino, /* inode number to get data for */ void __user *buffer, /* not used */ int ubsize, /* not used */ - void *private_data, /* not used */ - xfs_daddr_t bno, /* starting block of inode cluster */ int *ubused, /* not used */ - void *dip, /* on-disk inode pointer (not used) */ int *res) /* result code value */ { xfs_inode_t *ip; @@ -1637,7 +1657,7 @@ xfs_qm_dqusage_adjust( * the case in all other instances. It's OK that we do this because * quotacheck is done only at mount time. */ - if ((error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip, bno))) { + if ((error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip))) { *res = BULKSTAT_RV_NOTHING; return error; } @@ -1735,14 +1755,14 @@ xfs_qm_quotacheck( lastino = 0; flags = 0; - ASSERT(XFS_QI_UQIP(mp) || XFS_QI_GQIP(mp)); + ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip); ASSERT(XFS_IS_QUOTA_RUNNING(mp)); /* * There should be no cached dquots. The (simplistic) quotacheck * algorithm doesn't like that. */ - ASSERT(XFS_QI_MPLNDQUOTS(mp) == 0); + ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist)); cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname); @@ -1751,15 +1771,19 @@ xfs_qm_quotacheck( * their counters to zero. We need a clean slate. * We don't log our changes till later. */ - if ((uip = XFS_QI_UQIP(mp))) { - if ((error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA))) + uip = mp->m_quotainfo->qi_uquotaip; + if (uip) { + error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA); + if (error) goto error_return; flags |= XFS_UQUOTA_CHKD; } - if ((gip = XFS_QI_GQIP(mp))) { - if ((error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ? - XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA))) + gip = mp->m_quotainfo->qi_gquotaip; + if (gip) { + error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ? + XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA); + if (error) goto error_return; flags |= XFS_OQUOTA_CHKD; } @@ -1769,12 +1793,13 @@ xfs_qm_quotacheck( * Iterate thru all the inodes in the file system, * adjusting the corresponding dquot counters in core. */ - if ((error = xfs_bulkstat(mp, &lastino, &count, - xfs_qm_dqusage_adjust, NULL, - structsz, NULL, BULKSTAT_FG_IGET, &done))) + error = xfs_bulkstat(mp, &lastino, &count, + xfs_qm_dqusage_adjust, + structsz, NULL, &done); + if (error) break; - } while (! done); + } while (!done); /* * We've made all the changes that we need to make incore. @@ -1782,7 +1807,7 @@ xfs_qm_quotacheck( * successfully. */ if (!error) - error = xfs_qm_dqflush_all(mp, XFS_QMOPT_DELWRI); + error = xfs_qm_dqflush_all(mp, 0); /* * We can get this error if we couldn't do a dquot allocation inside @@ -1792,7 +1817,7 @@ xfs_qm_quotacheck( * at this point (because we intentionally didn't in dqget_noattach). */ if (error) { - xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_QUOTAOFF); + xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL); goto error_return; } @@ -1813,7 +1838,7 @@ xfs_qm_quotacheck( mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD); mp->m_qflags |= flags; - XQM_LIST_PRINT(&(XFS_QI_MPL_LIST(mp)), MPL_NEXT, "++++ Mp list +++"); + xfs_qm_dquot_list_print(mp); error_return: if (error) { @@ -1862,14 +1887,14 @@ xfs_qm_init_quotainos( mp->m_sb.sb_uquotino != NULLFSINO) { ASSERT(mp->m_sb.sb_uquotino > 0); if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, - 0, 0, &uip, 0))) + 0, 0, &uip))) return XFS_ERROR(error); } if (XFS_IS_OQUOTA_ON(mp) && mp->m_sb.sb_gquotino != NULLFSINO) { ASSERT(mp->m_sb.sb_gquotino > 0); if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, - 0, 0, &gip, 0))) { + 0, 0, &gip))) { if (uip) IRELE(uip); return XFS_ERROR(error); @@ -1908,59 +1933,53 @@ xfs_qm_init_quotainos( } } - XFS_QI_UQIP(mp) = uip; - XFS_QI_GQIP(mp) = gip; + mp->m_quotainfo->qi_uquotaip = uip; + mp->m_quotainfo->qi_gquotaip = gip; return 0; } + /* - * Traverse the freelist of dquots and attempt to reclaim a maximum of - * 'howmany' dquots. This operation races with dqlookup(), and attempts to - * favor the lookup function ... - * XXXsup merge this with qm_reclaim_one(). + * Just pop the least recently used dquot off the freelist and + * recycle it. The returned dquot is locked. */ -STATIC int -xfs_qm_shake_freelist( - int howmany) +STATIC xfs_dquot_t * +xfs_qm_dqreclaim_one(void) { - int nreclaimed; - xfs_dqhash_t *hash; - xfs_dquot_t *dqp, *nextdqp; + xfs_dquot_t *dqpout; + xfs_dquot_t *dqp; int restarts; - int nflushes; - if (howmany <= 0) - return 0; - - nreclaimed = 0; restarts = 0; - nflushes = 0; + dqpout = NULL; -#ifdef QUOTADEBUG - cmn_err(CE_DEBUG, "Shake free 0x%x", howmany); -#endif - /* lock order is : hashchainlock, freelistlock, mplistlock */ - tryagain: - xfs_qm_freelist_lock(xfs_Gqm); + /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */ +startagain: + mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); - for (dqp = xfs_Gqm->qm_dqfreelist.qh_next; - ((dqp != (xfs_dquot_t *) &xfs_Gqm->qm_dqfreelist) && - nreclaimed < howmany); ) { + list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) { + struct xfs_mount *mp = dqp->q_mount; xfs_dqlock(dqp); /* * We are racing with dqlookup here. Naturally we don't - * want to reclaim a dquot that lookup wants. + * want to reclaim a dquot that lookup wants. We release the + * freelist lock and start over, so that lookup will grab + * both the dquot and the freelistlock. */ if (dqp->dq_flags & XFS_DQ_WANT) { + ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE)); + + trace_xfs_dqreclaim_want(dqp); + xfs_dqunlock(dqp); - xfs_qm_freelist_unlock(xfs_Gqm); + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) - return nreclaimed; + return NULL; XQM_STATS_INC(xqmstats.xs_qm_dqwants); - goto tryagain; + goto startagain; } /* @@ -1969,23 +1988,27 @@ xfs_qm_shake_freelist( * life easier. */ if (dqp->dq_flags & XFS_DQ_INACTIVE) { - ASSERT(dqp->q_mount == NULL); + ASSERT(mp == NULL); ASSERT(! XFS_DQ_IS_DIRTY(dqp)); - ASSERT(dqp->HL_PREVP == NULL); - ASSERT(dqp->MPL_PREVP == NULL); + ASSERT(list_empty(&dqp->q_hashlist)); + ASSERT(list_empty(&dqp->q_mplist)); + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; + xfs_dqunlock(dqp); + dqpout = dqp; XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); - nextdqp = dqp->dq_flnext; - goto off_freelist; + break; } - ASSERT(dqp->MPL_PREVP); + ASSERT(dqp->q_hash); + ASSERT(!list_empty(&dqp->q_mplist)); + /* * Try to grab the flush lock. If this dquot is in the process of * getting flushed to disk, we don't want to reclaim it. */ if (!xfs_dqflock_nowait(dqp)) { xfs_dqunlock(dqp); - dqp = dqp->dq_flnext; continue; } @@ -1998,21 +2021,21 @@ xfs_qm_shake_freelist( if (XFS_DQ_IS_DIRTY(dqp)) { int error; - trace_xfs_dqshake_dirty(dqp); + trace_xfs_dqreclaim_dirty(dqp); /* * We flush it delayed write, so don't bother - * releasing the mplock. + * releasing the freelist lock. */ - error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); + error = xfs_qm_dqflush(dqp, 0); if (error) { - xfs_fs_cmn_err(CE_WARN, dqp->q_mount, - "xfs_qm_dqflush_all: dquot %p flush failed", dqp); + xfs_fs_cmn_err(CE_WARN, mp, + "xfs_qm_dqreclaim: dquot %p flush failed", dqp); } xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ - dqp = dqp->dq_flnext; continue; } + /* * We're trying to get the hashlock out of order. This races * with dqlookup; so, we giveup and goto the next dquot if @@ -2021,62 +2044,83 @@ xfs_qm_shake_freelist( * waiting for the freelist lock. */ if (!mutex_trylock(&dqp->q_hash->qh_lock)) { - xfs_dqfunlock(dqp); - xfs_dqunlock(dqp); - dqp = dqp->dq_flnext; - continue; + restarts++; + goto dqfunlock; } + /* * This races with dquot allocation code as well as dqflush_all * and reclaim code. So, if we failed to grab the mplist lock, * giveup everything and start over. */ - hash = dqp->q_hash; - ASSERT(hash); - if (! xfs_qm_mplist_nowait(dqp->q_mount)) { - /* XXX put a sentinel so that we can come back here */ + if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) { + restarts++; + mutex_unlock(&dqp->q_hash->qh_lock); xfs_dqfunlock(dqp); xfs_dqunlock(dqp); - mutex_unlock(&hash->qh_lock); - xfs_qm_freelist_unlock(xfs_Gqm); - if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) - return nreclaimed; - goto tryagain; + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); + if (restarts++ >= XFS_QM_RECLAIM_MAX_RESTARTS) + return NULL; + goto startagain; } - trace_xfs_dqshake_unlink(dqp); - -#ifdef QUOTADEBUG - cmn_err(CE_DEBUG, "Shake 0x%p, ID 0x%x\n", - dqp, be32_to_cpu(dqp->q_core.d_id)); -#endif ASSERT(dqp->q_nrefs == 0); - nextdqp = dqp->dq_flnext; - XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp); - XQM_HASHLIST_REMOVE(hash, dqp); + list_del_init(&dqp->q_mplist); + mp->m_quotainfo->qi_dquots--; + mp->m_quotainfo->qi_dqreclaims++; + list_del_init(&dqp->q_hashlist); + dqp->q_hash->qh_version++; + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; + dqpout = dqp; + mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); + mutex_unlock(&dqp->q_hash->qh_lock); +dqfunlock: xfs_dqfunlock(dqp); - xfs_qm_mplist_unlock(dqp->q_mount); - mutex_unlock(&hash->qh_lock); - - off_freelist: - XQM_FREELIST_REMOVE(dqp); xfs_dqunlock(dqp); - nreclaimed++; - XQM_STATS_INC(xqmstats.xs_qm_dqshake_reclaims); + if (dqpout) + break; + if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) + return NULL; + } + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); + return dqpout; +} + +/* + * Traverse the freelist of dquots and attempt to reclaim a maximum of + * 'howmany' dquots. This operation races with dqlookup(), and attempts to + * favor the lookup function ... + */ +STATIC int +xfs_qm_shake_freelist( + int howmany) +{ + int nreclaimed = 0; + xfs_dquot_t *dqp; + + if (howmany <= 0) + return 0; + + while (nreclaimed < howmany) { + dqp = xfs_qm_dqreclaim_one(); + if (!dqp) + return nreclaimed; xfs_qm_dqdestroy(dqp); - dqp = nextdqp; + nreclaimed++; } - xfs_qm_freelist_unlock(xfs_Gqm); return nreclaimed; } - /* * The kmem_shake interface is invoked when memory is running low. */ /* ARGSUSED */ STATIC int -xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask) +xfs_qm_shake( + struct shrinker *shrink, + int nr_to_scan, + gfp_t gfp_mask) { int ndqused, nfree, n; @@ -2085,7 +2129,7 @@ xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask) if (!xfs_Gqm) return 0; - nfree = xfs_Gqm->qm_dqfreelist.qh_nelems; /* free dquots */ + nfree = xfs_Gqm->qm_dqfrlist_cnt; /* free dquots */ /* incore dquots in all f/s's */ ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree; @@ -2101,131 +2145,6 @@ xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask) } -/* - * Just pop the least recently used dquot off the freelist and - * recycle it. The returned dquot is locked. - */ -STATIC xfs_dquot_t * -xfs_qm_dqreclaim_one(void) -{ - xfs_dquot_t *dqpout; - xfs_dquot_t *dqp; - int restarts; - int nflushes; - - restarts = 0; - dqpout = NULL; - nflushes = 0; - - /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */ - startagain: - xfs_qm_freelist_lock(xfs_Gqm); - - FOREACH_DQUOT_IN_FREELIST(dqp, &(xfs_Gqm->qm_dqfreelist)) { - xfs_dqlock(dqp); - - /* - * We are racing with dqlookup here. Naturally we don't - * want to reclaim a dquot that lookup wants. We release the - * freelist lock and start over, so that lookup will grab - * both the dquot and the freelistlock. - */ - if (dqp->dq_flags & XFS_DQ_WANT) { - ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE)); - - trace_xfs_dqreclaim_want(dqp); - - xfs_dqunlock(dqp); - xfs_qm_freelist_unlock(xfs_Gqm); - if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) - return NULL; - XQM_STATS_INC(xqmstats.xs_qm_dqwants); - goto startagain; - } - - /* - * If the dquot is inactive, we are assured that it is - * not on the mplist or the hashlist, and that makes our - * life easier. - */ - if (dqp->dq_flags & XFS_DQ_INACTIVE) { - ASSERT(dqp->q_mount == NULL); - ASSERT(! XFS_DQ_IS_DIRTY(dqp)); - ASSERT(dqp->HL_PREVP == NULL); - ASSERT(dqp->MPL_PREVP == NULL); - XQM_FREELIST_REMOVE(dqp); - xfs_dqunlock(dqp); - dqpout = dqp; - XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); - break; - } - - ASSERT(dqp->q_hash); - ASSERT(dqp->MPL_PREVP); - - /* - * Try to grab the flush lock. If this dquot is in the process of - * getting flushed to disk, we don't want to reclaim it. - */ - if (!xfs_dqflock_nowait(dqp)) { - xfs_dqunlock(dqp); - continue; - } - - /* - * We have the flush lock so we know that this is not in the - * process of being flushed. So, if this is dirty, flush it - * DELWRI so that we don't get a freelist infested with - * dirty dquots. - */ - if (XFS_DQ_IS_DIRTY(dqp)) { - int error; - - trace_xfs_dqreclaim_dirty(dqp); - - /* - * We flush it delayed write, so don't bother - * releasing the freelist lock. - */ - error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); - if (error) { - xfs_fs_cmn_err(CE_WARN, dqp->q_mount, - "xfs_qm_dqreclaim: dquot %p flush failed", dqp); - } - xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ - continue; - } - - if (! xfs_qm_mplist_nowait(dqp->q_mount)) { - xfs_dqfunlock(dqp); - xfs_dqunlock(dqp); - continue; - } - - if (!mutex_trylock(&dqp->q_hash->qh_lock)) - goto mplistunlock; - - trace_xfs_dqreclaim_unlink(dqp); - - ASSERT(dqp->q_nrefs == 0); - XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp); - XQM_HASHLIST_REMOVE(dqp->q_hash, dqp); - XQM_FREELIST_REMOVE(dqp); - dqpout = dqp; - mutex_unlock(&dqp->q_hash->qh_lock); - mplistunlock: - xfs_qm_mplist_unlock(dqp->q_mount); - xfs_dqfunlock(dqp); - xfs_dqunlock(dqp); - if (dqpout) - break; - } - - xfs_qm_freelist_unlock(xfs_Gqm); - return dqpout; -} - - /*------------------------------------------------------------------*/ /* @@ -2650,66 +2569,3 @@ xfs_qm_vop_create_dqattach( } } -/* ------------- list stuff -----------------*/ -STATIC void -xfs_qm_freelist_init(xfs_frlist_t *ql) -{ - ql->qh_next = ql->qh_prev = (xfs_dquot_t *) ql; - mutex_init(&ql->qh_lock); - ql->qh_version = 0; - ql->qh_nelems = 0; -} - -STATIC void -xfs_qm_freelist_destroy(xfs_frlist_t *ql) -{ - xfs_dquot_t *dqp, *nextdqp; - - mutex_lock(&ql->qh_lock); - for (dqp = ql->qh_next; - dqp != (xfs_dquot_t *)ql; ) { - xfs_dqlock(dqp); - nextdqp = dqp->dq_flnext; -#ifdef QUOTADEBUG - cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp); -#endif - XQM_FREELIST_REMOVE(dqp); - xfs_dqunlock(dqp); - xfs_qm_dqdestroy(dqp); - dqp = nextdqp; - } - mutex_unlock(&ql->qh_lock); - mutex_destroy(&ql->qh_lock); - - ASSERT(ql->qh_nelems == 0); -} - -STATIC void -xfs_qm_freelist_insert(xfs_frlist_t *ql, xfs_dquot_t *dq) -{ - dq->dq_flnext = ql->qh_next; - dq->dq_flprev = (xfs_dquot_t *)ql; - ql->qh_next = dq; - dq->dq_flnext->dq_flprev = dq; - xfs_Gqm->qm_dqfreelist.qh_nelems++; - xfs_Gqm->qm_dqfreelist.qh_version++; -} - -void -xfs_qm_freelist_unlink(xfs_dquot_t *dq) -{ - xfs_dquot_t *next = dq->dq_flnext; - xfs_dquot_t *prev = dq->dq_flprev; - - next->dq_flprev = prev; - prev->dq_flnext = next; - dq->dq_flnext = dq->dq_flprev = dq; - xfs_Gqm->qm_dqfreelist.qh_nelems--; - xfs_Gqm->qm_dqfreelist.qh_version++; -} - -void -xfs_qm_freelist_append(xfs_frlist_t *ql, xfs_dquot_t *dq) -{ - xfs_qm_freelist_insert((xfs_frlist_t *)ql->qh_prev, dq); -} diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h index 495564b8af38..c9446f1c726d 100644 --- a/fs/xfs/quota/xfs_qm.h +++ b/fs/xfs/quota/xfs_qm.h @@ -72,17 +72,6 @@ extern kmem_zone_t *qm_dqtrxzone; #define XFS_QM_MAX_DQCLUSTER_LOGSZ 3 typedef xfs_dqhash_t xfs_dqlist_t; -/* - * The freelist head. The first two fields match the first two in the - * xfs_dquot_t structure (in xfs_dqmarker_t) - */ -typedef struct xfs_frlist { - struct xfs_dquot *qh_next; - struct xfs_dquot *qh_prev; - struct mutex qh_lock; - uint qh_version; - uint qh_nelems; -} xfs_frlist_t; /* * Quota Manager (global) structure. Lives only in core. @@ -91,7 +80,9 @@ typedef struct xfs_qm { xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */ xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */ uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */ - xfs_frlist_t qm_dqfreelist; /* freelist of dquots */ + struct list_head qm_dqfrlist; /* freelist of dquots */ + struct mutex qm_dqfrlist_lock; + int qm_dqfrlist_cnt; atomic_t qm_totaldquots; /* total incore dquots */ uint qm_nrefs; /* file systems with quota on */ int qm_dqfree_ratio;/* ratio of free to inuse dquots */ @@ -106,7 +97,9 @@ typedef struct xfs_qm { typedef struct xfs_quotainfo { xfs_inode_t *qi_uquotaip; /* user quota inode */ xfs_inode_t *qi_gquotaip; /* group quota inode */ - xfs_dqlist_t qi_dqlist; /* all dquots in filesys */ + struct list_head qi_dqlist; /* all dquots in filesys */ + struct mutex qi_dqlist_lock; + int qi_dquots; int qi_dqreclaims; /* a change here indicates a removal in the dqlist */ time_t qi_btimelimit; /* limit for blks timer */ @@ -175,10 +168,6 @@ extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *); extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint); extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint); -/* list stuff */ -extern void xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *); -extern void xfs_qm_freelist_unlink(xfs_dquot_t *); - #ifdef DEBUG extern int xfs_qm_internalqcheck(xfs_mount_t *); #else diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c index a5346630dfae..97b410c12794 100644 --- a/fs/xfs/quota/xfs_qm_bhv.c +++ b/fs/xfs/quota/xfs_qm_bhv.c @@ -59,7 +59,7 @@ xfs_fill_statvfs_from_dquot( be64_to_cpu(dp->d_blk_hardlimit); if (limit && statp->f_blocks > limit) { statp->f_blocks = limit; - statp->f_bfree = + statp->f_bfree = statp->f_bavail = (statp->f_blocks > be64_to_cpu(dp->d_bcount)) ? (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0; } diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c index 83e7ea3e25fa..3d1fc79532e2 100644 --- a/fs/xfs/quota/xfs_qm_stats.c +++ b/fs/xfs/quota/xfs_qm_stats.c @@ -55,7 +55,7 @@ static int xqm_proc_show(struct seq_file *m, void *v) ndquot, xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0, xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0, - xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0); + xfs_Gqm? xfs_Gqm->qm_dqfrlist_cnt : 0); return 0; } diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index 873e07e29074..b4487764e923 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -79,6 +79,7 @@ xfs_qm_scall_quotaoff( xfs_mount_t *mp, uint flags) { + struct xfs_quotainfo *q = mp->m_quotainfo; uint dqtype; int error; uint inactivate_flags; @@ -102,11 +103,8 @@ xfs_qm_scall_quotaoff( * critical thing. * If quotaoff, then we must be dealing with the root filesystem. */ - ASSERT(mp->m_quotainfo); - if (mp->m_quotainfo) - mutex_lock(&(XFS_QI_QOFFLOCK(mp))); - - ASSERT(mp->m_quotainfo); + ASSERT(q); + mutex_lock(&q->qi_quotaofflock); /* * If we're just turning off quota enforcement, change mp and go. @@ -117,7 +115,7 @@ xfs_qm_scall_quotaoff( spin_lock(&mp->m_sb_lock); mp->m_sb.sb_qflags = mp->m_qflags; spin_unlock(&mp->m_sb_lock); - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); + mutex_unlock(&q->qi_quotaofflock); /* XXX what to do if error ? Revert back to old vals incore ? */ error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS); @@ -150,10 +148,8 @@ xfs_qm_scall_quotaoff( * Nothing to do? Don't complain. This happens when we're just * turning off quota enforcement. */ - if ((mp->m_qflags & flags) == 0) { - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); - return (0); - } + if ((mp->m_qflags & flags) == 0) + goto out_unlock; /* * Write the LI_QUOTAOFF log record, and do SB changes atomically, @@ -162,7 +158,7 @@ xfs_qm_scall_quotaoff( */ error = xfs_qm_log_quotaoff(mp, &qoffstart, flags); if (error) - goto out_error; + goto out_unlock; /* * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct @@ -204,7 +200,7 @@ xfs_qm_scall_quotaoff( * So, if we couldn't purge all the dquots from the filesystem, * we can't get rid of the incore data structures. */ - while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype|XFS_QMOPT_QUOTAOFF))) + while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype))) delay(10 * nculprits); /* @@ -222,7 +218,7 @@ xfs_qm_scall_quotaoff( if (error) { /* We're screwed now. Shutdown is the only option. */ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - goto out_error; + goto out_unlock; } /* @@ -230,27 +226,26 @@ xfs_qm_scall_quotaoff( */ if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) || ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) { - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); + mutex_unlock(&q->qi_quotaofflock); xfs_qm_destroy_quotainfo(mp); return (0); } /* - * Release our quotainode references, and vn_purge them, - * if we don't need them anymore. + * Release our quotainode references if we don't need them anymore. */ - if ((dqtype & XFS_QMOPT_UQUOTA) && XFS_QI_UQIP(mp)) { - IRELE(XFS_QI_UQIP(mp)); - XFS_QI_UQIP(mp) = NULL; + if ((dqtype & XFS_QMOPT_UQUOTA) && q->qi_uquotaip) { + IRELE(q->qi_uquotaip); + q->qi_uquotaip = NULL; } - if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && XFS_QI_GQIP(mp)) { - IRELE(XFS_QI_GQIP(mp)); - XFS_QI_GQIP(mp) = NULL; + if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && q->qi_gquotaip) { + IRELE(q->qi_gquotaip); + q->qi_gquotaip = NULL; } -out_error: - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); - return (error); +out_unlock: + mutex_unlock(&q->qi_quotaofflock); + return error; } int @@ -267,7 +262,7 @@ xfs_qm_scall_trunc_qfiles( } if ((flags & XFS_DQ_USER) && mp->m_sb.sb_uquotino != NULLFSINO) { - error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &qip, 0); + error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &qip); if (!error) { error = xfs_truncate_file(mp, qip); IRELE(qip); @@ -276,7 +271,7 @@ xfs_qm_scall_trunc_qfiles( if ((flags & (XFS_DQ_GROUP|XFS_DQ_PROJ)) && mp->m_sb.sb_gquotino != NULLFSINO) { - error2 = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip, 0); + error2 = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip); if (!error2) { error2 = xfs_truncate_file(mp, qip); IRELE(qip); @@ -379,9 +374,9 @@ xfs_qm_scall_quotaon( /* * Switch on quota enforcement in core. */ - mutex_lock(&(XFS_QI_QOFFLOCK(mp))); + mutex_lock(&mp->m_quotainfo->qi_quotaofflock); mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD); - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); + mutex_unlock(&mp->m_quotainfo->qi_quotaofflock); return (0); } @@ -392,11 +387,12 @@ xfs_qm_scall_quotaon( */ int xfs_qm_scall_getqstat( - xfs_mount_t *mp, - fs_quota_stat_t *out) + struct xfs_mount *mp, + struct fs_quota_stat *out) { - xfs_inode_t *uip, *gip; - boolean_t tempuqip, tempgqip; + struct xfs_quotainfo *q = mp->m_quotainfo; + struct xfs_inode *uip, *gip; + boolean_t tempuqip, tempgqip; uip = gip = NULL; tempuqip = tempgqip = B_FALSE; @@ -415,18 +411,18 @@ xfs_qm_scall_getqstat( out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino; out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino; - if (mp->m_quotainfo) { - uip = mp->m_quotainfo->qi_uquotaip; - gip = mp->m_quotainfo->qi_gquotaip; + if (q) { + uip = q->qi_uquotaip; + gip = q->qi_gquotaip; } if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, - 0, 0, &uip, 0) == 0) + 0, 0, &uip) == 0) tempuqip = B_TRUE; } if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) { if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, - 0, 0, &gip, 0) == 0) + 0, 0, &gip) == 0) tempgqip = B_TRUE; } if (uip) { @@ -441,17 +437,20 @@ xfs_qm_scall_getqstat( if (tempgqip) IRELE(gip); } - if (mp->m_quotainfo) { - out->qs_incoredqs = XFS_QI_MPLNDQUOTS(mp); - out->qs_btimelimit = XFS_QI_BTIMELIMIT(mp); - out->qs_itimelimit = XFS_QI_ITIMELIMIT(mp); - out->qs_rtbtimelimit = XFS_QI_RTBTIMELIMIT(mp); - out->qs_bwarnlimit = XFS_QI_BWARNLIMIT(mp); - out->qs_iwarnlimit = XFS_QI_IWARNLIMIT(mp); + if (q) { + out->qs_incoredqs = q->qi_dquots; + out->qs_btimelimit = q->qi_btimelimit; + out->qs_itimelimit = q->qi_itimelimit; + out->qs_rtbtimelimit = q->qi_rtbtimelimit; + out->qs_bwarnlimit = q->qi_bwarnlimit; + out->qs_iwarnlimit = q->qi_iwarnlimit; } - return (0); + return 0; } +#define XFS_DQ_MASK \ + (FS_DQ_LIMIT_MASK | FS_DQ_TIMER_MASK | FS_DQ_WARNS_MASK) + /* * Adjust quota limits, and start/stop timers accordingly. */ @@ -462,15 +461,17 @@ xfs_qm_scall_setqlim( uint type, fs_disk_quota_t *newlim) { + struct xfs_quotainfo *q = mp->m_quotainfo; xfs_disk_dquot_t *ddq; xfs_dquot_t *dqp; xfs_trans_t *tp; int error; xfs_qcnt_t hard, soft; - if ((newlim->d_fieldmask & - (FS_DQ_LIMIT_MASK|FS_DQ_TIMER_MASK|FS_DQ_WARNS_MASK)) == 0) - return (0); + if (newlim->d_fieldmask & ~XFS_DQ_MASK) + return EINVAL; + if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0) + return 0; tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128, @@ -485,7 +486,7 @@ xfs_qm_scall_setqlim( * a quotaoff from happening). (XXXThis doesn't currently happen * because we take the vfslock before calling xfs_qm_sysent). */ - mutex_lock(&(XFS_QI_QOFFLOCK(mp))); + mutex_lock(&q->qi_quotaofflock); /* * Get the dquot (locked), and join it to the transaction. @@ -493,9 +494,8 @@ xfs_qm_scall_setqlim( */ if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) { xfs_trans_cancel(tp, XFS_TRANS_ABORT); - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); ASSERT(error != ENOENT); - return (error); + goto out_unlock; } xfs_trans_dqjoin(tp, dqp); ddq = &dqp->q_core; @@ -513,8 +513,8 @@ xfs_qm_scall_setqlim( ddq->d_blk_hardlimit = cpu_to_be64(hard); ddq->d_blk_softlimit = cpu_to_be64(soft); if (id == 0) { - mp->m_quotainfo->qi_bhardlimit = hard; - mp->m_quotainfo->qi_bsoftlimit = soft; + q->qi_bhardlimit = hard; + q->qi_bsoftlimit = soft; } } else { qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft); @@ -529,8 +529,8 @@ xfs_qm_scall_setqlim( ddq->d_rtb_hardlimit = cpu_to_be64(hard); ddq->d_rtb_softlimit = cpu_to_be64(soft); if (id == 0) { - mp->m_quotainfo->qi_rtbhardlimit = hard; - mp->m_quotainfo->qi_rtbsoftlimit = soft; + q->qi_rtbhardlimit = hard; + q->qi_rtbsoftlimit = soft; } } else { qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft); @@ -546,8 +546,8 @@ xfs_qm_scall_setqlim( ddq->d_ino_hardlimit = cpu_to_be64(hard); ddq->d_ino_softlimit = cpu_to_be64(soft); if (id == 0) { - mp->m_quotainfo->qi_ihardlimit = hard; - mp->m_quotainfo->qi_isoftlimit = soft; + q->qi_ihardlimit = hard; + q->qi_isoftlimit = soft; } } else { qdprintk("ihard %Ld < isoft %Ld\n", hard, soft); @@ -572,23 +572,23 @@ xfs_qm_scall_setqlim( * for warnings. */ if (newlim->d_fieldmask & FS_DQ_BTIMER) { - mp->m_quotainfo->qi_btimelimit = newlim->d_btimer; + q->qi_btimelimit = newlim->d_btimer; ddq->d_btimer = cpu_to_be32(newlim->d_btimer); } if (newlim->d_fieldmask & FS_DQ_ITIMER) { - mp->m_quotainfo->qi_itimelimit = newlim->d_itimer; + q->qi_itimelimit = newlim->d_itimer; ddq->d_itimer = cpu_to_be32(newlim->d_itimer); } if (newlim->d_fieldmask & FS_DQ_RTBTIMER) { - mp->m_quotainfo->qi_rtbtimelimit = newlim->d_rtbtimer; + q->qi_rtbtimelimit = newlim->d_rtbtimer; ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer); } if (newlim->d_fieldmask & FS_DQ_BWARNS) - mp->m_quotainfo->qi_bwarnlimit = newlim->d_bwarns; + q->qi_bwarnlimit = newlim->d_bwarns; if (newlim->d_fieldmask & FS_DQ_IWARNS) - mp->m_quotainfo->qi_iwarnlimit = newlim->d_iwarns; + q->qi_iwarnlimit = newlim->d_iwarns; if (newlim->d_fieldmask & FS_DQ_RTBWARNS) - mp->m_quotainfo->qi_rtbwarnlimit = newlim->d_rtbwarns; + q->qi_rtbwarnlimit = newlim->d_rtbwarns; } else { /* * If the user is now over quota, start the timelimit. @@ -605,8 +605,9 @@ xfs_qm_scall_setqlim( error = xfs_trans_commit(tp, 0); xfs_qm_dqprint(dqp); xfs_qm_dqrele(dqp); - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); + out_unlock: + mutex_unlock(&q->qi_quotaofflock); return error; } @@ -853,7 +854,8 @@ xfs_dqrele_inode( int error; /* skip quota inodes */ - if (ip == XFS_QI_UQIP(ip->i_mount) || ip == XFS_QI_GQIP(ip->i_mount)) { + if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || + ip == ip->i_mount->m_quotainfo->qi_gquotaip) { ASSERT(ip->i_udquot == NULL); ASSERT(ip->i_gdquot == NULL); read_unlock(&pag->pag_ici_lock); @@ -891,7 +893,8 @@ xfs_qm_dqrele_all_inodes( uint flags) { ASSERT(mp->m_quotainfo); - xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0); + xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, + XFS_ICI_NO_TAG, 0, NULL); } /*------------------------------------------------------------------------*/ @@ -930,7 +933,8 @@ struct mutex qcheck_lock; } typedef struct dqtest { - xfs_dqmarker_t q_lists; + uint dq_flags; /* various flags (XFS_DQ_*) */ + struct list_head q_hashlist; xfs_dqhash_t *q_hash; /* the hashchain header */ xfs_mount_t *q_mount; /* filesystem this relates to */ xfs_dqid_t d_id; /* user id or group id */ @@ -941,14 +945,9 @@ typedef struct dqtest { STATIC void xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp) { - xfs_dquot_t *d; - if (((d) = (h)->qh_next)) - (d)->HL_PREVP = &((dqp)->HL_NEXT); - (dqp)->HL_NEXT = d; - (dqp)->HL_PREVP = &((h)->qh_next); - (h)->qh_next = (xfs_dquot_t *)dqp; - (h)->qh_version++; - (h)->qh_nelems++; + list_add(&dqp->q_hashlist, &h->qh_list); + h->qh_version++; + h->qh_nelems++; } STATIC void xfs_qm_dqtest_print( @@ -1060,9 +1059,7 @@ xfs_qm_internalqcheck_dqget( xfs_dqhash_t *h; h = DQTEST_HASH(mp, id, type); - for (d = (xfs_dqtest_t *) h->qh_next; d != NULL; - d = (xfs_dqtest_t *) d->HL_NEXT) { - /* DQTEST_LIST_PRINT(h, HL_NEXT, "@@@@@ dqtestlist @@@@@"); */ + list_for_each_entry(d, &h->qh_list, q_hashlist) { if (d->d_id == id && mp == d->q_mount) { *O_dq = d; return (0); @@ -1073,6 +1070,7 @@ xfs_qm_internalqcheck_dqget( d->d_id = id; d->q_mount = mp; d->q_hash = h; + INIT_LIST_HEAD(&d->q_hashlist); xfs_qm_hashinsert(h, d); *O_dq = d; return (0); @@ -1111,10 +1109,7 @@ xfs_qm_internalqcheck_adjust( xfs_ino_t ino, /* inode number to get data for */ void __user *buffer, /* not used */ int ubsize, /* not used */ - void *private_data, /* not used */ - xfs_daddr_t bno, /* starting block of inode cluster */ int *ubused, /* not used */ - void *dip, /* not used */ int *res) /* bulkstat result code */ { xfs_inode_t *ip; @@ -1136,7 +1131,7 @@ xfs_qm_internalqcheck_adjust( ipreleased = B_FALSE; again: lock_flags = XFS_ILOCK_SHARED; - if ((error = xfs_iget(mp, NULL, ino, 0, lock_flags, &ip, bno))) { + if ((error = xfs_iget(mp, NULL, ino, 0, lock_flags, &ip))) { *res = BULKSTAT_RV_NOTHING; return (error); } @@ -1179,8 +1174,6 @@ xfs_qm_internalqcheck( xfs_ino_t lastino; int done, count; int i; - xfs_dqtest_t *d, *e; - xfs_dqhash_t *h1; int error; lastino = 0; @@ -1192,9 +1185,9 @@ xfs_qm_internalqcheck( if (! XFS_IS_QUOTA_ON(mp)) return XFS_ERROR(ESRCH); - xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); + xfs_log_force(mp, XFS_LOG_SYNC); XFS_bflush(mp->m_ddev_targp); - xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); + xfs_log_force(mp, XFS_LOG_SYNC); XFS_bflush(mp->m_ddev_targp); mutex_lock(&qcheck_lock); @@ -1209,30 +1202,29 @@ xfs_qm_internalqcheck( * Iterate thru all the inodes in the file system, * adjusting the corresponding dquot counters */ - if ((error = xfs_bulkstat(mp, &lastino, &count, - xfs_qm_internalqcheck_adjust, NULL, - 0, NULL, BULKSTAT_FG_IGET, &done))) { + error = xfs_bulkstat(mp, &lastino, &count, + xfs_qm_internalqcheck_adjust, + 0, NULL, &done); + if (error) { + cmn_err(CE_DEBUG, "Bulkstat returned error 0x%x", error); break; } - } while (! done); - if (error) { - cmn_err(CE_DEBUG, "Bulkstat returned error 0x%x", error); - } + } while (!done); + cmn_err(CE_DEBUG, "Checking results against system dquots"); for (i = 0; i < qmtest_hashmask; i++) { - h1 = &qmtest_udqtab[i]; - for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { + xfs_dqtest_t *d, *n; + xfs_dqhash_t *h; + + h = &qmtest_udqtab[i]; + list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) { xfs_dqtest_cmp(d); - e = (xfs_dqtest_t *) d->HL_NEXT; kmem_free(d); - d = e; } - h1 = &qmtest_gdqtab[i]; - for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { + h = &qmtest_gdqtab[i]; + list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) { xfs_dqtest_cmp(d); - e = (xfs_dqtest_t *) d->HL_NEXT; kmem_free(d); - d = e; } } diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h index 8286b2842b6b..94a3d927d716 100644 --- a/fs/xfs/quota/xfs_quota_priv.h +++ b/fs/xfs/quota/xfs_quota_priv.h @@ -24,43 +24,6 @@ */ #define XFS_DQITER_MAP_SIZE 10 -/* Number of dquots that fit in to a dquot block */ -#define XFS_QM_DQPERBLK(mp) ((mp)->m_quotainfo->qi_dqperchunk) - -#define XFS_DQ_IS_ADDEDTO_TRX(t, d) ((d)->q_transp == (t)) - -#define XFS_QI_MPLRECLAIMS(mp) ((mp)->m_quotainfo->qi_dqreclaims) -#define XFS_QI_UQIP(mp) ((mp)->m_quotainfo->qi_uquotaip) -#define XFS_QI_GQIP(mp) ((mp)->m_quotainfo->qi_gquotaip) -#define XFS_QI_DQCHUNKLEN(mp) ((mp)->m_quotainfo->qi_dqchunklen) -#define XFS_QI_BTIMELIMIT(mp) ((mp)->m_quotainfo->qi_btimelimit) -#define XFS_QI_RTBTIMELIMIT(mp) ((mp)->m_quotainfo->qi_rtbtimelimit) -#define XFS_QI_ITIMELIMIT(mp) ((mp)->m_quotainfo->qi_itimelimit) -#define XFS_QI_BWARNLIMIT(mp) ((mp)->m_quotainfo->qi_bwarnlimit) -#define XFS_QI_RTBWARNLIMIT(mp) ((mp)->m_quotainfo->qi_rtbwarnlimit) -#define XFS_QI_IWARNLIMIT(mp) ((mp)->m_quotainfo->qi_iwarnlimit) -#define XFS_QI_QOFFLOCK(mp) ((mp)->m_quotainfo->qi_quotaofflock) - -#define XFS_QI_MPL_LIST(mp) ((mp)->m_quotainfo->qi_dqlist) -#define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next) -#define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems) - -#define xfs_qm_mplist_lock(mp) \ - mutex_lock(&(XFS_QI_MPL_LIST(mp).qh_lock)) -#define xfs_qm_mplist_nowait(mp) \ - mutex_trylock(&(XFS_QI_MPL_LIST(mp).qh_lock)) -#define xfs_qm_mplist_unlock(mp) \ - mutex_unlock(&(XFS_QI_MPL_LIST(mp).qh_lock)) -#define XFS_QM_IS_MPLIST_LOCKED(mp) \ - mutex_is_locked(&(XFS_QI_MPL_LIST(mp).qh_lock)) - -#define xfs_qm_freelist_lock(qm) \ - mutex_lock(&((qm)->qm_dqfreelist.qh_lock)) -#define xfs_qm_freelist_lock_nowait(qm) \ - mutex_trylock(&((qm)->qm_dqfreelist.qh_lock)) -#define xfs_qm_freelist_unlock(qm) \ - mutex_unlock(&((qm)->qm_dqfreelist.qh_lock)) - /* * Hash into a bucket in the dquot hash table, based on <mp, id>. */ @@ -72,9 +35,6 @@ XFS_DQ_HASHVAL(mp, id)) : \ (xfs_Gqm->qm_grp_dqhtable + \ XFS_DQ_HASHVAL(mp, id))) -#define XFS_IS_DQTYPE_ON(mp, type) (type == XFS_DQ_USER ? \ - XFS_IS_UQUOTA_ON(mp) : \ - XFS_IS_OQUOTA_ON(mp)) #define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ !dqp->q_core.d_blk_hardlimit && \ !dqp->q_core.d_blk_softlimit && \ @@ -86,68 +46,6 @@ !dqp->q_core.d_rtbcount && \ !dqp->q_core.d_icount) -#define HL_PREVP dq_hashlist.ql_prevp -#define HL_NEXT dq_hashlist.ql_next -#define MPL_PREVP dq_mplist.ql_prevp -#define MPL_NEXT dq_mplist.ql_next - - -#define _LIST_REMOVE(h, dqp, PVP, NXT) \ - { \ - xfs_dquot_t *d; \ - if (((d) = (dqp)->NXT)) \ - (d)->PVP = (dqp)->PVP; \ - *((dqp)->PVP) = d; \ - (dqp)->NXT = NULL; \ - (dqp)->PVP = NULL; \ - (h)->qh_version++; \ - (h)->qh_nelems--; \ - } - -#define _LIST_INSERT(h, dqp, PVP, NXT) \ - { \ - xfs_dquot_t *d; \ - if (((d) = (h)->qh_next)) \ - (d)->PVP = &((dqp)->NXT); \ - (dqp)->NXT = d; \ - (dqp)->PVP = &((h)->qh_next); \ - (h)->qh_next = dqp; \ - (h)->qh_version++; \ - (h)->qh_nelems++; \ - } - -#define FOREACH_DQUOT_IN_MP(dqp, mp) \ - for ((dqp) = XFS_QI_MPLNEXT(mp); (dqp) != NULL; (dqp) = (dqp)->MPL_NEXT) - -#define FOREACH_DQUOT_IN_FREELIST(dqp, qlist) \ -for ((dqp) = (qlist)->qh_next; (dqp) != (xfs_dquot_t *)(qlist); \ - (dqp) = (dqp)->dq_flnext) - -#define XQM_HASHLIST_INSERT(h, dqp) \ - _LIST_INSERT(h, dqp, HL_PREVP, HL_NEXT) - -#define XQM_FREELIST_INSERT(h, dqp) \ - xfs_qm_freelist_append(h, dqp) - -#define XQM_MPLIST_INSERT(h, dqp) \ - _LIST_INSERT(h, dqp, MPL_PREVP, MPL_NEXT) - -#define XQM_HASHLIST_REMOVE(h, dqp) \ - _LIST_REMOVE(h, dqp, HL_PREVP, HL_NEXT) -#define XQM_FREELIST_REMOVE(dqp) \ - xfs_qm_freelist_unlink(dqp) -#define XQM_MPLIST_REMOVE(h, dqp) \ - { _LIST_REMOVE(h, dqp, MPL_PREVP, MPL_NEXT); \ - XFS_QI_MPLRECLAIMS((dqp)->q_mount)++; } - -#define XFS_DQ_IS_LOGITEM_INITD(dqp) ((dqp)->q_logitem.qli_dquot == (dqp)) - -#define XFS_QM_DQP_TO_DQACCT(tp, dqp) (XFS_QM_ISUDQ(dqp) ? \ - (tp)->t_dqinfo->dqa_usrdquots : \ - (tp)->t_dqinfo->dqa_grpdquots) -#define XFS_IS_SUSER_DQUOT(dqp) \ - (!((dqp)->q_core.d_id)) - #define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \ (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \ (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???"))) diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c index 97ac9640be98..061d827da33c 100644 --- a/fs/xfs/quota/xfs_trans_dquot.c +++ b/fs/xfs/quota/xfs_trans_dquot.c @@ -59,12 +59,11 @@ xfs_trans_dqjoin( xfs_trans_t *tp, xfs_dquot_t *dqp) { - xfs_dq_logitem_t *lp; + xfs_dq_logitem_t *lp = &dqp->q_logitem; - ASSERT(! XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); + ASSERT(dqp->q_transp != tp); ASSERT(XFS_DQ_IS_LOCKED(dqp)); - ASSERT(XFS_DQ_IS_LOGITEM_INITD(dqp)); - lp = &dqp->q_logitem; + ASSERT(lp->qli_dquot == dqp); /* * Get a log_item_desc to point at the new item. @@ -96,7 +95,7 @@ xfs_trans_log_dquot( { xfs_log_item_desc_t *lidp; - ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); + ASSERT(dqp->q_transp == tp); ASSERT(XFS_DQ_IS_LOCKED(dqp)); lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(&dqp->q_logitem)); @@ -198,16 +197,16 @@ xfs_trans_get_dqtrx( int i; xfs_dqtrx_t *qa; - for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { - qa = XFS_QM_DQP_TO_DQACCT(tp, dqp); + qa = XFS_QM_ISUDQ(dqp) ? + tp->t_dqinfo->dqa_usrdquots : tp->t_dqinfo->dqa_grpdquots; + for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { if (qa[i].qt_dquot == NULL || - qa[i].qt_dquot == dqp) { - return (&qa[i]); - } + qa[i].qt_dquot == dqp) + return &qa[i]; } - return (NULL); + return NULL; } /* @@ -381,7 +380,7 @@ xfs_trans_apply_dquot_deltas( break; ASSERT(XFS_DQ_IS_LOCKED(dqp)); - ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); + ASSERT(dqp->q_transp == tp); /* * adjust the actual number of blocks used @@ -589,12 +588,18 @@ xfs_trans_unreserve_and_mod_dquots( } } -STATIC int -xfs_quota_error(uint flags) +STATIC void +xfs_quota_warn( + struct xfs_mount *mp, + struct xfs_dquot *dqp, + int type) { - if (flags & XFS_QMOPT_ENOSPC) - return ENOSPC; - return EDQUOT; + /* no warnings for project quotas - we just return ENOSPC later */ + if (dqp->dq_flags & XFS_DQ_PROJ) + return; + quota_send_warning((dqp->dq_flags & XFS_DQ_USER) ? USRQUOTA : GRPQUOTA, + be32_to_cpu(dqp->q_core.d_id), mp->m_super->s_dev, + type); } /* @@ -612,7 +617,6 @@ xfs_trans_dqresv( long ninos, uint flags) { - int error; xfs_qcnt_t hardlimit; xfs_qcnt_t softlimit; time_t timer; @@ -634,7 +638,7 @@ xfs_trans_dqresv( softlimit = q->qi_bsoftlimit; timer = be32_to_cpu(dqp->q_core.d_btimer); warns = be16_to_cpu(dqp->q_core.d_bwarns); - warnlimit = XFS_QI_BWARNLIMIT(dqp->q_mount); + warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit; resbcountp = &dqp->q_res_bcount; } else { ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS); @@ -646,10 +650,9 @@ xfs_trans_dqresv( softlimit = q->qi_rtbsoftlimit; timer = be32_to_cpu(dqp->q_core.d_rtbtimer); warns = be16_to_cpu(dqp->q_core.d_rtbwarns); - warnlimit = XFS_QI_RTBWARNLIMIT(dqp->q_mount); + warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit; resbcountp = &dqp->q_res_rtbcount; } - error = 0; if ((flags & XFS_QMOPT_FORCE_RES) == 0 && dqp->q_core.d_id && @@ -667,40 +670,46 @@ xfs_trans_dqresv( * nblks. */ if (hardlimit > 0ULL && - (hardlimit <= nblks + *resbcountp)) { - error = xfs_quota_error(flags); + hardlimit <= nblks + *resbcountp) { + xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN); goto error_return; } - if (softlimit > 0ULL && - (softlimit <= nblks + *resbcountp)) { + softlimit <= nblks + *resbcountp) { if ((timer != 0 && get_seconds() > timer) || (warns != 0 && warns >= warnlimit)) { - error = xfs_quota_error(flags); + xfs_quota_warn(mp, dqp, + QUOTA_NL_BSOFTLONGWARN); goto error_return; } + + xfs_quota_warn(mp, dqp, QUOTA_NL_BSOFTWARN); } } if (ninos > 0) { count = be64_to_cpu(dqp->q_core.d_icount); timer = be32_to_cpu(dqp->q_core.d_itimer); warns = be16_to_cpu(dqp->q_core.d_iwarns); - warnlimit = XFS_QI_IWARNLIMIT(dqp->q_mount); + warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit; hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); if (!hardlimit) hardlimit = q->qi_ihardlimit; softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit); if (!softlimit) softlimit = q->qi_isoftlimit; + if (hardlimit > 0ULL && count >= hardlimit) { - error = xfs_quota_error(flags); + xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN); goto error_return; - } else if (softlimit > 0ULL && count >= softlimit) { - if ((timer != 0 && get_seconds() > timer) || + } + if (softlimit > 0ULL && count >= softlimit) { + if ((timer != 0 && get_seconds() > timer) || (warns != 0 && warns >= warnlimit)) { - error = xfs_quota_error(flags); + xfs_quota_warn(mp, dqp, + QUOTA_NL_ISOFTLONGWARN); goto error_return; } + xfs_quota_warn(mp, dqp, QUOTA_NL_ISOFTWARN); } } } @@ -736,9 +745,14 @@ xfs_trans_dqresv( ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount)); ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount)); + xfs_dqunlock(dqp); + return 0; + error_return: xfs_dqunlock(dqp); - return error; + if (flags & XFS_QMOPT_ENOSPC) + return ENOSPC; + return EDQUOT; } diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 00fd357c3e46..0135e2a669d7 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -36,8 +36,8 @@ struct xfs_acl { }; /* On-disk XFS extended attribute names */ -#define SGI_ACL_FILE "SGI_ACL_FILE" -#define SGI_ACL_DEFAULT "SGI_ACL_DEFAULT" +#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE" +#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT" #define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1) #define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) @@ -49,8 +49,8 @@ extern int xfs_acl_chmod(struct inode *inode); extern int posix_acl_access_exists(struct inode *inode); extern int posix_acl_default_exists(struct inode *inode); -extern struct xattr_handler xfs_xattr_acl_access_handler; -extern struct xattr_handler xfs_xattr_acl_default_handler; +extern const struct xattr_handler xfs_xattr_acl_access_handler; +extern const struct xattr_handler xfs_xattr_acl_default_handler; #else # define xfs_check_acl NULL # define xfs_get_acl(inode, type) NULL diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 6702bd865811..4917d4eed4ed 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -175,29 +175,31 @@ typedef struct xfs_agfl { } xfs_agfl_t; /* - * Busy block/extent entry. Used in perag to mark blocks that have been freed - * but whose transactions aren't committed to disk yet. + * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that + * have been freed but whose transactions aren't committed to disk yet. + * + * Note that we use the transaction ID to record the transaction, not the + * transaction structure itself. See xfs_alloc_busy_insert() for details. */ -typedef struct xfs_perag_busy { - xfs_agblock_t busy_start; - xfs_extlen_t busy_length; - struct xfs_trans *busy_tp; /* transaction that did the free */ -} xfs_perag_busy_t; +struct xfs_busy_extent { + struct rb_node rb_node; /* ag by-bno indexed search tree */ + struct list_head list; /* transaction busy extent list */ + xfs_agnumber_t agno; + xfs_agblock_t bno; + xfs_extlen_t length; + xlog_tid_t tid; /* transaction that created this */ +}; /* * Per-ag incore structure, copies of information in agf and agi, * to improve the performance of allocation group selection. - * - * pick sizes which fit in allocation buckets well */ -#if (BITS_PER_LONG == 32) -#define XFS_PAGB_NUM_SLOTS 84 -#elif (BITS_PER_LONG == 64) #define XFS_PAGB_NUM_SLOTS 128 -#endif -typedef struct xfs_perag -{ +typedef struct xfs_perag { + struct xfs_mount *pag_mount; /* owner filesystem */ + xfs_agnumber_t pag_agno; /* AG this structure belongs to */ + atomic_t pag_ref; /* perag reference count */ char pagf_init; /* this agf's entry is initialized */ char pagi_init; /* this agi's entry is initialized */ char pagf_metadata; /* the agf is preferred to be metadata */ @@ -210,8 +212,6 @@ typedef struct xfs_perag __uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */ xfs_agino_t pagi_freecount; /* number of free inodes */ xfs_agino_t pagi_count; /* number of allocated inodes */ - int pagb_count; /* pagb slots in use */ - xfs_perag_busy_t *pagb_list; /* unstable blocks */ /* * Inode allocation search lookup optimisation. @@ -222,14 +222,16 @@ typedef struct xfs_perag xfs_agino_t pagl_leftrec; xfs_agino_t pagl_rightrec; #ifdef __KERNEL__ - spinlock_t pagb_lock; /* lock for pagb_list */ + spinlock_t pagb_lock; /* lock for pagb_tree */ + struct rb_root pagb_tree; /* ordered tree of busy extents */ atomic_t pagf_fstrms; /* # of filestreams active in this AG */ - int pag_ici_init; /* incore inode cache initialised */ rwlock_t pag_ici_lock; /* incore inode lock */ struct radix_tree_root pag_ici_root; /* incore inode cache root */ + int pag_ici_reclaimable; /* reclaimable inodes */ #endif + int pagb_count; /* pagb slots in use */ } xfs_perag_t; /* diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 275b1f4f9430..a7fbe8a99b12 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -46,11 +46,9 @@ #define XFSA_FIXUP_BNO_OK 1 #define XFSA_FIXUP_CNT_OK 2 -STATIC void -xfs_alloc_search_busy(xfs_trans_t *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len); +static int +xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len); /* * Prototypes for per-ag allocation routines @@ -540,9 +538,16 @@ xfs_alloc_ag_vextent( be32_to_cpu(agf->agf_length)); xfs_alloc_log_agf(args->tp, args->agbp, XFS_AGF_FREEBLKS); - /* search the busylist for these blocks */ - xfs_alloc_search_busy(args->tp, args->agno, - args->agbno, args->len); + /* + * Search the busylist for these blocks and mark the + * transaction as synchronous if blocks are found. This + * avoids the need to block due to a synchronous log + * force to ensure correct ordering as the synchronous + * transaction will guarantee that for us. + */ + if (xfs_alloc_busy_search(args->mp, args->agno, + args->agbno, args->len)) + xfs_trans_set_sync(args->tp); } if (!args->isfl) xfs_trans_mod_sb(args->tp, @@ -1662,11 +1667,13 @@ xfs_free_ag_extent( xfs_agf_t *agf; xfs_perag_t *pag; /* per allocation group data */ + pag = xfs_perag_get(mp, agno); + pag->pagf_freeblks += len; + xfs_perag_put(pag); + agf = XFS_BUF_TO_AGF(agbp); - pag = &mp->m_perag[agno]; be32_add_cpu(&agf->agf_freeblks, len); xfs_trans_agblocks_delta(tp, len); - pag->pagf_freeblks += len; XFS_WANT_CORRUPTED_GOTO( be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length), @@ -1691,7 +1698,7 @@ xfs_free_ag_extent( * when the iclog commits to disk. If a busy block is allocated, * the iclog is pushed up to the LSN that freed the block. */ - xfs_alloc_mark_busy(tp, agno, bno, len); + xfs_alloc_busy_insert(tp, agno, bno, len); return 0; error0: @@ -1969,10 +1976,12 @@ xfs_alloc_get_freelist( xfs_trans_brelse(tp, agflbp); if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp)) agf->agf_flfirst = 0; - pag = &mp->m_perag[be32_to_cpu(agf->agf_seqno)]; + + pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno)); be32_add_cpu(&agf->agf_flcount, -1); xfs_trans_agflist_delta(tp, -1); pag->pagf_flcount--; + xfs_perag_put(pag); logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT; if (btreeblk) { @@ -1985,14 +1994,20 @@ xfs_alloc_get_freelist( *bnop = bno; /* - * As blocks are freed, they are added to the per-ag busy list - * and remain there until the freeing transaction is committed to - * disk. Now that we have allocated blocks, this list must be - * searched to see if a block is being reused. If one is, then - * the freeing transaction must be pushed to disk NOW by forcing - * to disk all iclogs up that transaction's LSN. + * As blocks are freed, they are added to the per-ag busy list and + * remain there until the freeing transaction is committed to disk. + * Now that we have allocated blocks, this list must be searched to see + * if a block is being reused. If one is, then the freeing transaction + * must be pushed to disk before this transaction. + * + * We do this by setting the current transaction to a sync transaction + * which guarantees that the freeing transaction is on disk before this + * transaction. This is done instead of a synchronous log force here so + * that we don't sit and wait with the AGF locked in the transaction + * during the log force. */ - xfs_alloc_search_busy(tp, be32_to_cpu(agf->agf_seqno), bno, 1); + if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1)) + xfs_trans_set_sync(tp); return 0; } @@ -2078,7 +2093,8 @@ xfs_alloc_put_freelist( be32_add_cpu(&agf->agf_fllast, 1); if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp)) agf->agf_fllast = 0; - pag = &mp->m_perag[be32_to_cpu(agf->agf_seqno)]; + + pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno)); be32_add_cpu(&agf->agf_flcount, 1); xfs_trans_agflist_delta(tp, 1); pag->pagf_flcount++; @@ -2089,6 +2105,7 @@ xfs_alloc_put_freelist( pag->pagf_btreeblks--; logflags |= XFS_AGF_BTREEBLKS; } + xfs_perag_put(pag); xfs_alloc_log_agf(tp, agbp, logflags); @@ -2152,7 +2169,6 @@ xfs_read_agf( xfs_trans_brelse(tp, *bpp); return XFS_ERROR(EFSCORRUPTED); } - XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGF, XFS_AGF_REF); return 0; } @@ -2175,7 +2191,7 @@ xfs_alloc_read_agf( ASSERT(agno != NULLAGNUMBER); error = xfs_read_agf(mp, tp, agno, - (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XFS_BUF_TRYLOCK : 0, + (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0, bpp); if (error) return error; @@ -2184,7 +2200,7 @@ xfs_alloc_read_agf( ASSERT(!XFS_BUF_GETERROR(*bpp)); agf = XFS_BUF_TO_AGF(*bpp); - pag = &mp->m_perag[agno]; + pag = xfs_perag_get(mp, agno); if (!pag->pagf_init) { pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); @@ -2195,8 +2211,8 @@ xfs_alloc_read_agf( pag->pagf_levels[XFS_BTNUM_CNTi] = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); spin_lock_init(&pag->pagb_lock); - pag->pagb_list = kmem_zalloc(XFS_PAGB_NUM_SLOTS * - sizeof(xfs_perag_busy_t), KM_SLEEP); + pag->pagb_count = 0; + pag->pagb_tree = RB_ROOT; pag->pagf_init = 1; } #ifdef DEBUG @@ -2211,6 +2227,7 @@ xfs_alloc_read_agf( be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi])); } #endif + xfs_perag_put(pag); return 0; } @@ -2270,8 +2287,7 @@ xfs_alloc_vextent( * These three force us into a single a.g. */ args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); - down_read(&mp->m_peraglock); - args->pag = &mp->m_perag[args->agno]; + args->pag = xfs_perag_get(mp, args->agno); args->minleft = 0; error = xfs_alloc_fix_freelist(args, 0); args->minleft = minleft; @@ -2280,14 +2296,12 @@ xfs_alloc_vextent( goto error0; } if (!args->agbp) { - up_read(&mp->m_peraglock); trace_xfs_alloc_vextent_noagbp(args); break; } args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); if ((error = xfs_alloc_ag_vextent(args))) goto error0; - up_read(&mp->m_peraglock); break; case XFS_ALLOCTYPE_START_BNO: /* @@ -2339,9 +2353,8 @@ xfs_alloc_vextent( * Loop over allocation groups twice; first time with * trylock set, second time without. */ - down_read(&mp->m_peraglock); for (;;) { - args->pag = &mp->m_perag[args->agno]; + args->pag = xfs_perag_get(mp, args->agno); if (no_min) args->minleft = 0; error = xfs_alloc_fix_freelist(args, flags); args->minleft = minleft; @@ -2400,8 +2413,8 @@ xfs_alloc_vextent( } } } + xfs_perag_put(args->pag); } - up_read(&mp->m_peraglock); if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) { if (args->agno == sagno) mp->m_agfrotor = (mp->m_agfrotor + 1) % @@ -2427,9 +2440,10 @@ xfs_alloc_vextent( args->len); #endif } + xfs_perag_put(args->pag); return 0; error0: - up_read(&mp->m_peraglock); + xfs_perag_put(args->pag); return error; } @@ -2454,8 +2468,7 @@ xfs_free_extent( args.agno = XFS_FSB_TO_AGNO(args.mp, bno); ASSERT(args.agno < args.mp->m_sb.sb_agcount); args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); - down_read(&args.mp->m_peraglock); - args.pag = &args.mp->m_perag[args.agno]; + args.pag = xfs_perag_get(args.mp, args.agno); if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING))) goto error0; #ifdef DEBUG @@ -2465,7 +2478,7 @@ xfs_free_extent( #endif error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); error0: - up_read(&args.mp->m_peraglock); + xfs_perag_put(args.pag); return error; } @@ -2477,127 +2490,263 @@ error0: * list is reused, the transaction that freed it must be forced to disk * before continuing to use the block. * - * xfs_alloc_mark_busy - add to the per-ag busy list - * xfs_alloc_clear_busy - remove an item from the per-ag busy list + * xfs_alloc_busy_insert - add to the per-ag busy list + * xfs_alloc_busy_clear - remove an item from the per-ag busy list + * xfs_alloc_busy_search - search for a busy extent + */ + +/* + * Insert a new extent into the busy tree. + * + * The busy extent tree is indexed by the start block of the busy extent. + * there can be multiple overlapping ranges in the busy extent tree but only + * ever one entry at a given start block. The reason for this is that + * multi-block extents can be freed, then smaller chunks of that extent + * allocated and freed again before the first transaction commit is on disk. + * If the exact same start block is freed a second time, we have to wait for + * that busy extent to pass out of the tree before the new extent is inserted. + * There are two main cases we have to handle here. + * + * The first case is a transaction that triggers a "free - allocate - free" + * cycle. This can occur during btree manipulations as a btree block is freed + * to the freelist, then allocated from the free list, then freed again. In + * this case, the second extxpnet free is what triggers the duplicate and as + * such the transaction IDs should match. Because the extent was allocated in + * this transaction, the transaction must be marked as synchronous. This is + * true for all cases where the free/alloc/free occurs in the one transaction, + * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case. + * This serves to catch violations of the second case quite effectively. + * + * The second case is where the free/alloc/free occur in different + * transactions. In this case, the thread freeing the extent the second time + * can't mark the extent busy immediately because it is already tracked in a + * transaction that may be committing. When the log commit for the existing + * busy extent completes, the busy extent will be removed from the tree. If we + * allow the second busy insert to continue using that busy extent structure, + * it can be freed before this transaction is safely in the log. Hence our + * only option in this case is to force the log to remove the existing busy + * extent from the list before we insert the new one with the current + * transaction ID. + * + * The problem we are trying to avoid in the free-alloc-free in separate + * transactions is most easily described with a timeline: + * + * Thread 1 Thread 2 Thread 3 xfslogd + * xact alloc + * free X + * mark busy + * commit xact + * free xact + * xact alloc + * alloc X + * busy search + * mark xact sync + * commit xact + * free xact + * force log + * checkpoint starts + * .... + * xact alloc + * free X + * mark busy + * finds match + * *** KABOOM! *** + * .... + * log IO completes + * unbusy X + * checkpoint completes + * + * By issuing a log force in thread 3 @ "KABOOM", the thread will block until + * the checkpoint completes, and the busy extent it matched will have been + * removed from the tree when it is woken. Hence it can then continue safely. + * + * However, to ensure this matching process is robust, we need to use the + * transaction ID for identifying transaction, as delayed logging results in + * the busy extent and transaction lifecycles being different. i.e. the busy + * extent is active for a lot longer than the transaction. Hence the + * transaction structure can be freed and reallocated, then mark the same + * extent busy again in the new transaction. In this case the new transaction + * will have a different tid but can have the same address, and hence we need + * to check against the tid. + * + * Future: for delayed logging, we could avoid the log force if the extent was + * first freed in the current checkpoint sequence. This, however, requires the + * ability to pin the current checkpoint in memory until this transaction + * commits to ensure that both the original free and the current one combine + * logically into the one checkpoint. If the checkpoint sequences are + * different, however, we still need to wait on a log force. */ void -xfs_alloc_mark_busy(xfs_trans_t *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len) +xfs_alloc_busy_insert( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len) { - xfs_mount_t *mp; - xfs_perag_busy_t *bsy; - int n; + struct xfs_busy_extent *new; + struct xfs_busy_extent *busyp; + struct xfs_perag *pag; + struct rb_node **rbp; + struct rb_node *parent; + int match; - mp = tp->t_mountp; - spin_lock(&mp->m_perag[agno].pagb_lock); - - /* search pagb_list for an open slot */ - for (bsy = mp->m_perag[agno].pagb_list, n = 0; - n < XFS_PAGB_NUM_SLOTS; - bsy++, n++) { - if (bsy->busy_tp == NULL) { - break; - } - } - trace_xfs_alloc_busy(mp, agno, bno, len, n); - - if (n < XFS_PAGB_NUM_SLOTS) { - bsy = &mp->m_perag[agno].pagb_list[n]; - mp->m_perag[agno].pagb_count++; - bsy->busy_start = bno; - bsy->busy_length = len; - bsy->busy_tp = tp; - xfs_trans_add_busy(tp, agno, n); - } else { + new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL); + if (!new) { /* - * The busy list is full! Since it is now not possible to - * track the free block, make this a synchronous transaction - * to insure that the block is not reused before this - * transaction commits. + * No Memory! Since it is now not possible to track the free + * block, make this a synchronous transaction to insure that + * the block is not reused before this transaction commits. */ + trace_xfs_alloc_busy(tp, agno, bno, len, 1); xfs_trans_set_sync(tp); + return; } - spin_unlock(&mp->m_perag[agno].pagb_lock); -} - -void -xfs_alloc_clear_busy(xfs_trans_t *tp, - xfs_agnumber_t agno, - int idx) -{ - xfs_mount_t *mp; - xfs_perag_busy_t *list; - - mp = tp->t_mountp; - - spin_lock(&mp->m_perag[agno].pagb_lock); - list = mp->m_perag[agno].pagb_list; - - ASSERT(idx < XFS_PAGB_NUM_SLOTS); - - trace_xfs_alloc_unbusy(mp, agno, idx, list[idx].busy_tp == tp); - - if (list[idx].busy_tp == tp) { - list[idx].busy_tp = NULL; - mp->m_perag[agno].pagb_count--; + new->agno = agno; + new->bno = bno; + new->length = len; + new->tid = xfs_log_get_trans_ident(tp); + + INIT_LIST_HEAD(&new->list); + + /* trace before insert to be able to see failed inserts */ + trace_xfs_alloc_busy(tp, agno, bno, len, 0); + + pag = xfs_perag_get(tp->t_mountp, new->agno); +restart: + spin_lock(&pag->pagb_lock); + rbp = &pag->pagb_tree.rb_node; + parent = NULL; + busyp = NULL; + match = 0; + while (*rbp && match >= 0) { + parent = *rbp; + busyp = rb_entry(parent, struct xfs_busy_extent, rb_node); + + if (new->bno < busyp->bno) { + /* may overlap, but exact start block is lower */ + rbp = &(*rbp)->rb_left; + if (new->bno + new->length > busyp->bno) + match = busyp->tid == new->tid ? 1 : -1; + } else if (new->bno > busyp->bno) { + /* may overlap, but exact start block is higher */ + rbp = &(*rbp)->rb_right; + if (bno < busyp->bno + busyp->length) + match = busyp->tid == new->tid ? 1 : -1; + } else { + match = busyp->tid == new->tid ? 1 : -1; + break; + } + } + if (match < 0) { + /* overlap marked busy in different transaction */ + spin_unlock(&pag->pagb_lock); + xfs_log_force(tp->t_mountp, XFS_LOG_SYNC); + goto restart; } + if (match > 0) { + /* + * overlap marked busy in same transaction. Update if exact + * start block match, otherwise combine the busy extents into + * a single range. + */ + if (busyp->bno == new->bno) { + busyp->length = max(busyp->length, new->length); + spin_unlock(&pag->pagb_lock); + ASSERT(tp->t_flags & XFS_TRANS_SYNC); + xfs_perag_put(pag); + kmem_free(new); + return; + } + rb_erase(&busyp->rb_node, &pag->pagb_tree); + new->length = max(busyp->bno + busyp->length, + new->bno + new->length) - + min(busyp->bno, new->bno); + new->bno = min(busyp->bno, new->bno); + } else + busyp = NULL; - spin_unlock(&mp->m_perag[agno].pagb_lock); -} + rb_link_node(&new->rb_node, parent, rbp); + rb_insert_color(&new->rb_node, &pag->pagb_tree); + list_add(&new->list, &tp->t_busy); + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + kmem_free(busyp); +} /* - * If we find the extent in the busy list, force the log out to get the - * extent out of the busy list so the caller can use it straight away. + * Search for a busy extent within the range of the extent we are about to + * allocate. You need to be holding the busy extent tree lock when calling + * xfs_alloc_busy_search(). This function returns 0 for no overlapping busy + * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact + * match. This is done so that a non-zero return indicates an overlap that + * will require a synchronous transaction, but it can still be + * used to distinguish between a partial or exact match. */ -STATIC void -xfs_alloc_search_busy(xfs_trans_t *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len) +static int +xfs_alloc_busy_search( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len) { - xfs_mount_t *mp; - xfs_perag_busy_t *bsy; - xfs_agblock_t uend, bend; - xfs_lsn_t lsn = 0; - int cnt; - - mp = tp->t_mountp; + struct xfs_perag *pag; + struct rb_node *rbp; + struct xfs_busy_extent *busyp; + int match = 0; + + pag = xfs_perag_get(mp, agno); + spin_lock(&pag->pagb_lock); + + rbp = pag->pagb_tree.rb_node; + + /* find closest start bno overlap */ + while (rbp) { + busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node); + if (bno < busyp->bno) { + /* may overlap, but exact start block is lower */ + if (bno + len > busyp->bno) + match = -1; + rbp = rbp->rb_left; + } else if (bno > busyp->bno) { + /* may overlap, but exact start block is higher */ + if (bno < busyp->bno + busyp->length) + match = -1; + rbp = rbp->rb_right; + } else { + /* bno matches busyp, length determines exact match */ + match = (busyp->length == len) ? 1 : -1; + break; + } + } + spin_unlock(&pag->pagb_lock); + trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match); + xfs_perag_put(pag); + return match; +} - spin_lock(&mp->m_perag[agno].pagb_lock); +void +xfs_alloc_busy_clear( + struct xfs_mount *mp, + struct xfs_busy_extent *busyp) +{ + struct xfs_perag *pag; - uend = bno + len - 1; + trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno, + busyp->length); - /* - * search pagb_list for this slot, skipping open slots. We have to - * search the entire array as there may be multiple overlaps and - * we have to get the most recent LSN for the log force to push out - * all the transactions that span the range. - */ - for (cnt = 0; cnt < mp->m_perag[agno].pagb_count; cnt++) { - bsy = &mp->m_perag[agno].pagb_list[cnt]; - if (!bsy->busy_tp) - continue; + ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno, + busyp->length) == 1); - bend = bsy->busy_start + bsy->busy_length - 1; - if (bno > bend || uend < bsy->busy_start) - continue; + list_del_init(&busyp->list); - /* (start1,length1) within (start2, length2) */ - if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0) - lsn = bsy->busy_tp->t_commit_lsn; - } - spin_unlock(&mp->m_perag[agno].pagb_lock); - trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn); + pag = xfs_perag_get(mp, busyp->agno); + spin_lock(&pag->pagb_lock); + rb_erase(&busyp->rb_node, &pag->pagb_tree); + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); - /* - * If a block was found, force the log through the LSN of the - * transaction that freed the block - */ - if (lsn) - xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC); + kmem_free(busyp); } diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 599bffa39784..6d05199b667c 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -22,6 +22,7 @@ struct xfs_buf; struct xfs_mount; struct xfs_perag; struct xfs_trans; +struct xfs_busy_extent; /* * Freespace allocation types. Argument to xfs_alloc_[v]extent. @@ -119,15 +120,13 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp, #ifdef __KERNEL__ void -xfs_alloc_mark_busy(xfs_trans_t *tp, +xfs_alloc_busy_insert(xfs_trans_t *tp, xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len); void -xfs_alloc_clear_busy(xfs_trans_t *tp, - xfs_agnumber_t ag, - int idx); +xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp); #endif /* __KERNEL__ */ diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index adbd9141aea1..83f494218759 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -61,12 +61,14 @@ xfs_allocbt_set_root( struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); int btnum = cur->bc_btnum; + struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); ASSERT(ptr->s != 0); agf->agf_roots[btnum] = ptr->s; be32_add_cpu(&agf->agf_levels[btnum], inc); - cur->bc_mp->m_perag[seqno].pagf_levels[btnum] += inc; + pag->pagf_levels[btnum] += inc; + xfs_perag_put(pag); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); } @@ -132,7 +134,7 @@ xfs_allocbt_free_block( * disk. If a busy block is allocated, the iclog is pushed up to the * LSN that freed the block. */ - xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1); + xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1); xfs_trans_agbtree_delta(cur->bc_tp, -1); return 0; } @@ -150,6 +152,7 @@ xfs_allocbt_update_lastrec( { struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); + struct xfs_perag *pag; __be32 len; int numrecs; @@ -193,7 +196,9 @@ xfs_allocbt_update_lastrec( } agf->agf_longest = len; - cur->bc_mp->m_perag[seqno].pagf_longest = be32_to_cpu(len); + pag = xfs_perag_get(cur->bc_mp, seqno); + pag->pagf_longest = be32_to_cpu(len); + xfs_perag_put(pag); xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST); } diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index e953b6cfb2a8..b9c196a53c42 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -93,12 +93,12 @@ STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args); STATIC int xfs_attr_name_to_xname( struct xfs_name *xname, - const char *aname) + const unsigned char *aname) { if (!aname) return EINVAL; xname->name = aname; - xname->len = strlen(aname); + xname->len = strlen((char *)aname); if (xname->len >= MAXNAMELEN) return EFAULT; /* match IRIX behaviour */ @@ -124,7 +124,7 @@ STATIC int xfs_attr_get_int( struct xfs_inode *ip, struct xfs_name *name, - char *value, + unsigned char *value, int *valuelenp, int flags) { @@ -171,8 +171,8 @@ xfs_attr_get_int( int xfs_attr_get( xfs_inode_t *ip, - const char *name, - char *value, + const unsigned char *name, + unsigned char *value, int *valuelenp, int flags) { @@ -197,7 +197,7 @@ xfs_attr_get( /* * Calculate how many blocks we need for the new attribute, */ -int +STATIC int xfs_attr_calc_size( struct xfs_inode *ip, int namelen, @@ -235,8 +235,12 @@ xfs_attr_calc_size( } STATIC int -xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name, - char *value, int valuelen, int flags) +xfs_attr_set_int( + struct xfs_inode *dp, + struct xfs_name *name, + unsigned char *value, + int valuelen, + int flags) { xfs_da_args_t args; xfs_fsblock_t firstblock; @@ -452,8 +456,8 @@ out: int xfs_attr_set( xfs_inode_t *dp, - const char *name, - char *value, + const unsigned char *name, + unsigned char *value, int valuelen, int flags) { @@ -600,7 +604,7 @@ out: int xfs_attr_remove( xfs_inode_t *dp, - const char *name, + const unsigned char *name, int flags) { int error; @@ -669,9 +673,13 @@ xfs_attr_list_int(xfs_attr_list_context_t *context) */ /*ARGSUSED*/ STATIC int -xfs_attr_put_listent(xfs_attr_list_context_t *context, int flags, - char *name, int namelen, - int valuelen, char *value) +xfs_attr_put_listent( + xfs_attr_list_context_t *context, + int flags, + unsigned char *name, + int namelen, + int valuelen, + unsigned char *value) { struct attrlist *alist = (struct attrlist *)context->alist; attrlist_ent_t *aep; @@ -1980,7 +1988,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) xfs_bmbt_irec_t map[ATTR_RMTVALUE_MAPSIZE]; xfs_mount_t *mp; xfs_daddr_t dblkno; - xfs_caddr_t dst; + void *dst; xfs_buf_t *bp; int nmap, error, tmp, valuelen, blkcnt, i; xfs_dablk_t lblkno; @@ -2007,15 +2015,14 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno, - blkcnt, - XFS_BUF_LOCK | XBF_DONT_BLOCK, + blkcnt, XBF_LOCK | XBF_DONT_BLOCK, &bp); if (error) return(error); tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : XFS_BUF_SIZE(bp); - xfs_biomove(bp, 0, tmp, dst, XFS_B_READ); + xfs_biomove(bp, 0, tmp, dst, XBF_READ); xfs_buf_relse(bp); dst += tmp; valuelen -= tmp; @@ -2039,7 +2046,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) xfs_inode_t *dp; xfs_bmbt_irec_t map; xfs_daddr_t dblkno; - xfs_caddr_t src; + void *src; xfs_buf_t *bp; xfs_dablk_t lblkno; int blkcnt, valuelen, nmap, error, tmp, committed; @@ -2141,13 +2148,13 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, - XFS_BUF_LOCK | XBF_DONT_BLOCK); + XBF_LOCK | XBF_DONT_BLOCK); ASSERT(bp); ASSERT(!XFS_BUF_GETERROR(bp)); tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : XFS_BUF_SIZE(bp); - xfs_biomove(bp, 0, tmp, src, XFS_B_WRITE); + xfs_biomove(bp, 0, tmp, src, XBF_WRITE); if (tmp < XFS_BUF_SIZE(bp)) xfs_biozero(bp, tmp, XFS_BUF_SIZE(bp) - tmp); if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */ @@ -2208,8 +2215,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) /* * If the "remote" value is in the cache, remove it. */ - bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, - XFS_INCORE_TRYLOCK); + bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK); if (bp) { XFS_BUF_STALE(bp); XFS_BUF_UNDELAYWRITE(bp); diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h index 59b410ce69a1..e920d68ef509 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/xfs_attr.h @@ -113,7 +113,7 @@ typedef struct attrlist_cursor_kern { typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int, - char *, int, int, char *); + unsigned char *, int, int, unsigned char *); typedef struct xfs_attr_list_context { struct xfs_inode *dp; /* inode */ @@ -139,7 +139,6 @@ typedef struct xfs_attr_list_context { /* * Overall external interface routines. */ -int xfs_attr_calc_size(struct xfs_inode *, int, int, int *); int xfs_attr_inactive(struct xfs_inode *dp); int xfs_attr_rmtval_get(struct xfs_da_args *args); int xfs_attr_list_int(struct xfs_attr_list_context *); diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index baf41b5af756..a90ce74fc256 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -521,11 +521,11 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; i++) { - nargs.name = (char *)sfe->nameval; + nargs.name = sfe->nameval; nargs.namelen = sfe->namelen; - nargs.value = (char *)&sfe->nameval[nargs.namelen]; + nargs.value = &sfe->nameval[nargs.namelen]; nargs.valuelen = sfe->valuelen; - nargs.hashval = xfs_da_hashname((char *)sfe->nameval, + nargs.hashval = xfs_da_hashname(sfe->nameval, sfe->namelen); nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags); error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */ @@ -612,10 +612,10 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { error = context->put_listent(context, sfe->flags, - (char *)sfe->nameval, + sfe->nameval, (int)sfe->namelen, (int)sfe->valuelen, - (char*)&sfe->nameval[sfe->namelen]); + &sfe->nameval[sfe->namelen]); /* * Either search callback finished early or @@ -659,8 +659,8 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) } sbp->entno = i; - sbp->hash = xfs_da_hashname((char *)sfe->nameval, sfe->namelen); - sbp->name = (char *)sfe->nameval; + sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen); + sbp->name = sfe->nameval; sbp->namelen = sfe->namelen; /* These are bytes, and both on-disk, don't endian-flip */ sbp->valuelen = sfe->valuelen; @@ -818,9 +818,9 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff) continue; ASSERT(entry->flags & XFS_ATTR_LOCAL); name_loc = xfs_attr_leaf_name_local(leaf, i); - nargs.name = (char *)name_loc->nameval; + nargs.name = name_loc->nameval; nargs.namelen = name_loc->namelen; - nargs.value = (char *)&name_loc->nameval[nargs.namelen]; + nargs.value = &name_loc->nameval[nargs.namelen]; nargs.valuelen = be16_to_cpu(name_loc->valuelen); nargs.hashval = be32_to_cpu(entry->hashval); nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags); @@ -2370,10 +2370,10 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) retval = context->put_listent(context, entry->flags, - (char *)name_loc->nameval, + name_loc->nameval, (int)name_loc->namelen, be16_to_cpu(name_loc->valuelen), - (char *)&name_loc->nameval[name_loc->namelen]); + &name_loc->nameval[name_loc->namelen]); if (retval) return retval; } else { @@ -2397,15 +2397,15 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) return retval; retval = context->put_listent(context, entry->flags, - (char *)name_rmt->name, + name_rmt->name, (int)name_rmt->namelen, valuelen, - (char*)args.value); + args.value); kmem_free(args.value); } else { retval = context->put_listent(context, entry->flags, - (char *)name_rmt->name, + name_rmt->name, (int)name_rmt->namelen, valuelen, NULL); @@ -2950,7 +2950,7 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, map.br_blockcount); bp = xfs_trans_get_buf(*trans, dp->i_mount->m_ddev_targp, - dblkno, dblkcnt, XFS_BUF_LOCK); + dblkno, dblkcnt, XBF_LOCK); xfs_trans_binval(*trans, bp); /* * Roll to next transaction. diff --git a/fs/xfs/xfs_attr_sf.h b/fs/xfs/xfs_attr_sf.h index 76ab7b0cbb3a..919756e3ba53 100644 --- a/fs/xfs/xfs_attr_sf.h +++ b/fs/xfs/xfs_attr_sf.h @@ -52,7 +52,7 @@ typedef struct xfs_attr_sf_sort { __uint8_t valuelen; /* length of value */ __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ xfs_dahash_t hash; /* this entry's hash value */ - char *name; /* name value, pointer into buffer */ + unsigned char *name; /* name value, pointer into buffer */ } xfs_attr_sf_sort_t; #define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen) /* space name/value uses */ \ diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 98251cdc52aa..99587ded043f 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2550,22 +2550,134 @@ xfs_bmap_rtalloc( } STATIC int +xfs_bmap_btalloc_nullfb( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, + xfs_extlen_t *blen) +{ + struct xfs_mount *mp = ap->ip->i_mount; + struct xfs_perag *pag; + xfs_agnumber_t ag, startag; + int notinit = 0; + int error; + + if (ap->userdata && xfs_inode_is_filestream(ap->ip)) + args->type = XFS_ALLOCTYPE_NEAR_BNO; + else + args->type = XFS_ALLOCTYPE_START_BNO; + args->total = ap->total; + + /* + * Search for an allocation group with a single extent large enough + * for the request. If one isn't found, then adjust the minimum + * allocation size to the largest space found. + */ + startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno); + if (startag == NULLAGNUMBER) + startag = ag = 0; + + pag = xfs_perag_get(mp, ag); + while (*blen < ap->alen) { + if (!pag->pagf_init) { + error = xfs_alloc_pagf_init(mp, args->tp, ag, + XFS_ALLOC_FLAG_TRYLOCK); + if (error) { + xfs_perag_put(pag); + return error; + } + } + + /* + * See xfs_alloc_fix_freelist... + */ + if (pag->pagf_init) { + xfs_extlen_t longest; + longest = xfs_alloc_longest_free_extent(mp, pag); + if (*blen < longest) + *blen = longest; + } else + notinit = 1; + + if (xfs_inode_is_filestream(ap->ip)) { + if (*blen >= ap->alen) + break; + + if (ap->userdata) { + /* + * If startag is an invalid AG, we've + * come here once before and + * xfs_filestream_new_ag picked the + * best currently available. + * + * Don't continue looping, since we + * could loop forever. + */ + if (startag == NULLAGNUMBER) + break; + + error = xfs_filestream_new_ag(ap, &ag); + xfs_perag_put(pag); + if (error) + return error; + + /* loop again to set 'blen'*/ + startag = NULLAGNUMBER; + pag = xfs_perag_get(mp, ag); + continue; + } + } + if (++ag == mp->m_sb.sb_agcount) + ag = 0; + if (ag == startag) + break; + xfs_perag_put(pag); + pag = xfs_perag_get(mp, ag); + } + xfs_perag_put(pag); + + /* + * Since the above loop did a BUF_TRYLOCK, it is + * possible that there is space for this request. + */ + if (notinit || *blen < ap->minlen) + args->minlen = ap->minlen; + /* + * If the best seen length is less than the request + * length, use the best as the minimum. + */ + else if (*blen < ap->alen) + args->minlen = *blen; + /* + * Otherwise we've seen an extent as big as alen, + * use that as the minimum. + */ + else + args->minlen = ap->alen; + + /* + * set the failure fallback case to look in the selected + * AG as the stream may have moved. + */ + if (xfs_inode_is_filestream(ap->ip)) + ap->rval = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0); + + return 0; +} + +STATIC int xfs_bmap_btalloc( xfs_bmalloca_t *ap) /* bmap alloc argument struct */ { xfs_mount_t *mp; /* mount point structure */ xfs_alloctype_t atype = 0; /* type for allocation routines */ xfs_extlen_t align; /* minimum allocation alignment */ - xfs_agnumber_t ag; xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ - xfs_agnumber_t startag; + xfs_agnumber_t ag; xfs_alloc_arg_t args; xfs_extlen_t blen; xfs_extlen_t nextminlen = 0; - xfs_perag_t *pag; int nullfb; /* true if ap->firstblock isn't set */ int isaligned; - int notinit; int tryagain; int error; @@ -2612,102 +2724,9 @@ xfs_bmap_btalloc( args.firstblock = ap->firstblock; blen = 0; if (nullfb) { - if (ap->userdata && xfs_inode_is_filestream(ap->ip)) - args.type = XFS_ALLOCTYPE_NEAR_BNO; - else - args.type = XFS_ALLOCTYPE_START_BNO; - args.total = ap->total; - - /* - * Search for an allocation group with a single extent - * large enough for the request. - * - * If one isn't found, then adjust the minimum allocation - * size to the largest space found. - */ - startag = ag = XFS_FSB_TO_AGNO(mp, args.fsbno); - if (startag == NULLAGNUMBER) - startag = ag = 0; - notinit = 0; - down_read(&mp->m_peraglock); - while (blen < ap->alen) { - pag = &mp->m_perag[ag]; - if (!pag->pagf_init && - (error = xfs_alloc_pagf_init(mp, args.tp, - ag, XFS_ALLOC_FLAG_TRYLOCK))) { - up_read(&mp->m_peraglock); - return error; - } - /* - * See xfs_alloc_fix_freelist... - */ - if (pag->pagf_init) { - xfs_extlen_t longest; - longest = xfs_alloc_longest_free_extent(mp, pag); - if (blen < longest) - blen = longest; - } else - notinit = 1; - - if (xfs_inode_is_filestream(ap->ip)) { - if (blen >= ap->alen) - break; - - if (ap->userdata) { - /* - * If startag is an invalid AG, we've - * come here once before and - * xfs_filestream_new_ag picked the - * best currently available. - * - * Don't continue looping, since we - * could loop forever. - */ - if (startag == NULLAGNUMBER) - break; - - error = xfs_filestream_new_ag(ap, &ag); - if (error) { - up_read(&mp->m_peraglock); - return error; - } - - /* loop again to set 'blen'*/ - startag = NULLAGNUMBER; - continue; - } - } - if (++ag == mp->m_sb.sb_agcount) - ag = 0; - if (ag == startag) - break; - } - up_read(&mp->m_peraglock); - /* - * Since the above loop did a BUF_TRYLOCK, it is - * possible that there is space for this request. - */ - if (notinit || blen < ap->minlen) - args.minlen = ap->minlen; - /* - * If the best seen length is less than the request - * length, use the best as the minimum. - */ - else if (blen < ap->alen) - args.minlen = blen; - /* - * Otherwise we've seen an extent as big as alen, - * use that as the minimum. - */ - else - args.minlen = ap->alen; - - /* - * set the failure fallback case to look in the selected - * AG as the stream may have moved. - */ - if (xfs_inode_is_filestream(ap->ip)) - ap->rval = args.fsbno = XFS_AGB_TO_FSB(mp, ag, 0); + error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); + if (error) + return error; } else if (ap->low) { if (xfs_inode_is_filestream(ap->ip)) args.type = XFS_ALLOCTYPE_FIRST_AG; @@ -3810,7 +3829,7 @@ xfs_bmap_add_attrfork( } if ((error = xfs_bmap_finish(&tp, &flist, &committed))) goto error2; - error = xfs_trans_commit(tp, XFS_TRANS_PERM_LOG_RES); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); ASSERT(ip->i_df.if_ext_max == XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); return error; @@ -4470,7 +4489,7 @@ xfs_bmapi( xfs_fsblock_t abno; /* allocated block number */ xfs_extlen_t alen; /* allocated extent length */ xfs_fileoff_t aoff; /* allocated file offset */ - xfs_bmalloca_t bma; /* args for xfs_bmap_alloc */ + xfs_bmalloca_t bma = { 0 }; /* args for xfs_bmap_alloc */ xfs_btree_cur_t *cur; /* bmap btree cursor */ xfs_fileoff_t end; /* end of mapped file region */ int eof; /* we've hit the end of extents */ diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 38751d5fac6f..416e47e54b83 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -334,7 +334,7 @@ xfs_bmbt_disk_set_allf( /* * Set all the fields in a bmap extent record from the uncompressed form. */ -void +STATIC void xfs_bmbt_disk_set_all( xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s) diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index cf07ca7c22e7..0e66c4ea0f85 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -223,7 +223,6 @@ extern void xfs_bmbt_set_startblock(xfs_bmbt_rec_host_t *r, xfs_fsblock_t v); extern void xfs_bmbt_set_startoff(xfs_bmbt_rec_host_t *r, xfs_fileoff_t v); extern void xfs_bmbt_set_state(xfs_bmbt_rec_host_t *r, xfs_exntst_t v); -extern void xfs_bmbt_disk_set_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s); extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o, xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v); diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 36a0992dd669..96be4b0f2496 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -977,7 +977,7 @@ xfs_btree_get_buf_block( xfs_daddr_t d; /* need to sort out how callers deal with failures first */ - ASSERT(!(flags & XFS_BUF_TRYLOCK)); + ASSERT(!(flags & XBF_TRYLOCK)); d = xfs_btree_ptr_to_daddr(cur, ptr); *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, @@ -1008,7 +1008,7 @@ xfs_btree_read_buf_block( int error; /* need to sort out how callers deal with failures first */ - ASSERT(!(flags & XFS_BUF_TRYLOCK)); + ASSERT(!(flags & XBF_TRYLOCK)); d = xfs_btree_ptr_to_daddr(cur, ptr); error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index a30f7e9eb2b9..02a80984aa05 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -64,7 +64,7 @@ xfs_buf_item_log_debug( nbytes = last - first + 1; bfset(bip->bli_logged, first, nbytes); for (x = 0; x < nbytes; x++) { - chunk_num = byte >> XFS_BLI_SHIFT; + chunk_num = byte >> XFS_BLF_SHIFT; word_num = chunk_num >> BIT_TO_WORD_SHIFT; bit_num = chunk_num & (NBWORD - 1); wordp = &(bip->bli_format.blf_data_map[word_num]); @@ -166,7 +166,7 @@ xfs_buf_item_size( * cancel flag in it. */ trace_xfs_buf_item_size_stale(bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); return 1; } @@ -197,9 +197,9 @@ xfs_buf_item_size( } else if (next_bit != last_bit + 1) { last_bit = next_bit; nvecs++; - } else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) != - (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) + - XFS_BLI_CHUNK)) { + } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) != + (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) + + XFS_BLF_CHUNK)) { last_bit = next_bit; nvecs++; } else { @@ -250,10 +250,24 @@ xfs_buf_item_format( ((bip->bli_format.blf_map_size - 1) * sizeof(uint))); vecp->i_addr = (xfs_caddr_t)&bip->bli_format; vecp->i_len = base_size; - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BFORMAT); + vecp->i_type = XLOG_REG_TYPE_BFORMAT; vecp++; nvecs = 1; + /* + * If it is an inode buffer, transfer the in-memory state to the + * format flags and clear the in-memory state. We do not transfer + * this state if the inode buffer allocation has not yet been committed + * to the log as setting the XFS_BLI_INODE_BUF flag will prevent + * correct replay of the inode allocation. + */ + if (bip->bli_flags & XFS_BLI_INODE_BUF) { + if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && + xfs_log_item_in_current_chkpt(&bip->bli_item))) + bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF; + bip->bli_flags &= ~XFS_BLI_INODE_BUF; + } + if (bip->bli_flags & XFS_BLI_STALE) { /* * The buffer is stale, so all we need to log @@ -261,7 +275,7 @@ xfs_buf_item_format( * cancel flag in it. */ trace_xfs_buf_item_format_stale(bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); bip->bli_format.blf_size = nvecs; return; } @@ -294,29 +308,29 @@ xfs_buf_item_format( * keep counting and scanning. */ if (next_bit == -1) { - buffer_offset = first_bit * XFS_BLI_CHUNK; + buffer_offset = first_bit * XFS_BLF_CHUNK; vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLI_CHUNK; - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK); + vecp->i_len = nbits * XFS_BLF_CHUNK; + vecp->i_type = XLOG_REG_TYPE_BCHUNK; nvecs++; break; } else if (next_bit != last_bit + 1) { - buffer_offset = first_bit * XFS_BLI_CHUNK; + buffer_offset = first_bit * XFS_BLF_CHUNK; vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLI_CHUNK; - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK); + vecp->i_len = nbits * XFS_BLF_CHUNK; + vecp->i_type = XLOG_REG_TYPE_BCHUNK; nvecs++; vecp++; first_bit = next_bit; last_bit = next_bit; nbits = 1; - } else if (xfs_buf_offset(bp, next_bit << XFS_BLI_SHIFT) != - (xfs_buf_offset(bp, last_bit << XFS_BLI_SHIFT) + - XFS_BLI_CHUNK)) { - buffer_offset = first_bit * XFS_BLI_CHUNK; + } else if (xfs_buf_offset(bp, next_bit << XFS_BLF_SHIFT) != + (xfs_buf_offset(bp, last_bit << XFS_BLF_SHIFT) + + XFS_BLF_CHUNK)) { + buffer_offset = first_bit * XFS_BLF_CHUNK; vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLI_CHUNK; - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK); + vecp->i_len = nbits * XFS_BLF_CHUNK; + vecp->i_type = XLOG_REG_TYPE_BCHUNK; /* You would think we need to bump the nvecs here too, but we do not * this number is used by recovery, and it gets confused by the boundary * split here @@ -341,10 +355,15 @@ xfs_buf_item_format( } /* - * This is called to pin the buffer associated with the buf log - * item in memory so it cannot be written out. Simply call bpin() - * on the buffer to do this. + * This is called to pin the buffer associated with the buf log item in memory + * so it cannot be written out. Simply call bpin() on the buffer to do this. + * + * We also always take a reference to the buffer log item here so that the bli + * is held while the item is pinned in memory. This means that we can + * unconditionally drop the reference count a transaction holds when the + * transaction is completed. */ + STATIC void xfs_buf_item_pin( xfs_buf_log_item_t *bip) @@ -356,6 +375,7 @@ xfs_buf_item_pin( ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || (bip->bli_flags & XFS_BLI_STALE)); + atomic_inc(&bip->bli_refcount); trace_xfs_buf_item_pin(bip); xfs_bpin(bp); } @@ -372,12 +392,12 @@ xfs_buf_item_pin( */ STATIC void xfs_buf_item_unpin( - xfs_buf_log_item_t *bip, - int stale) + xfs_buf_log_item_t *bip) { struct xfs_ail *ailp; xfs_buf_t *bp; int freed; + int stale = bip->bli_flags & XFS_BLI_STALE; bp = bip->bli_buf; ASSERT(bp != NULL); @@ -393,7 +413,7 @@ xfs_buf_item_unpin( ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); ASSERT(XFS_BUF_ISSTALE(bp)); - ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); trace_xfs_buf_item_unpin_stale(bip); /* @@ -428,47 +448,43 @@ xfs_buf_item_unpin_remove( xfs_buf_log_item_t *bip, xfs_trans_t *tp) { - xfs_buf_t *bp; - xfs_log_item_desc_t *lidp; - int stale = 0; - - bp = bip->bli_buf; - /* - * will xfs_buf_item_unpin() call xfs_buf_item_relse()? - */ + /* will xfs_buf_item_unpin() call xfs_buf_item_relse()? */ if ((atomic_read(&bip->bli_refcount) == 1) && (bip->bli_flags & XFS_BLI_STALE)) { + /* + * yes -- We can safely do some work here and then call + * buf_item_unpin to do the rest because we are + * are holding the buffer locked so no one else will be + * able to bump up the refcount. We have to remove the + * log item from the transaction as we are about to release + * our reference to the buffer. If we don't, the unlock that + * occurs later in the xfs_trans_uncommit() will try to + * reference the buffer which we no longer have a hold on. + */ + struct xfs_log_item_desc *lidp; + ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0); trace_xfs_buf_item_unpin_stale(bip); - /* - * yes -- clear the xaction descriptor in-use flag - * and free the chunk if required. We can safely - * do some work here and then call buf_item_unpin - * to do the rest because if the if is true, then - * we are holding the buffer locked so no one else - * will be able to bump up the refcount. - */ - lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) bip); - stale = lidp->lid_flags & XFS_LID_BUF_STALE; + lidp = xfs_trans_find_item(tp, (xfs_log_item_t *)bip); xfs_trans_free_item(tp, lidp); + /* - * Since the transaction no longer refers to the buffer, - * the buffer should no longer refer to the transaction. + * Since the transaction no longer refers to the buffer, the + * buffer should no longer refer to the transaction. */ - XFS_BUF_SET_FSPRIVATE2(bp, NULL); + XFS_BUF_SET_FSPRIVATE2(bip->bli_buf, NULL); } - - xfs_buf_item_unpin(bip, stale); - - return; + xfs_buf_item_unpin(bip); } /* * This is called to attempt to lock the buffer associated with this * buf log item. Don't sleep on the buffer lock. If we can't get - * the lock right away, return 0. If we can get the lock, pull the - * buffer from the free list, mark it busy, and return 1. + * the lock right away, return 0. If we can get the lock, take a + * reference to the buffer. If this is a delayed write buffer that + * needs AIL help to be written back, invoke the pushbuf routine + * rather than the normal success path. */ STATIC uint xfs_buf_item_trylock( @@ -477,42 +493,39 @@ xfs_buf_item_trylock( xfs_buf_t *bp; bp = bip->bli_buf; - - if (XFS_BUF_ISPINNED(bp)) { + if (XFS_BUF_ISPINNED(bp)) return XFS_ITEM_PINNED; - } - - if (!XFS_BUF_CPSEMA(bp)) { + if (!XFS_BUF_CPSEMA(bp)) return XFS_ITEM_LOCKED; - } - /* - * Remove the buffer from the free list. Only do this - * if it's on the free list. Private buffers like the - * superblock buffer are not. - */ + /* take a reference to the buffer. */ XFS_BUF_HOLD(bp); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); trace_xfs_buf_item_trylock(bip); + if (XFS_BUF_ISDELAYWRITE(bp)) + return XFS_ITEM_PUSHBUF; return XFS_ITEM_SUCCESS; } /* - * Release the buffer associated with the buf log item. - * If there is no dirty logged data associated with the - * buffer recorded in the buf log item, then free the - * buf log item and remove the reference to it in the - * buffer. + * Release the buffer associated with the buf log item. If there is no dirty + * logged data associated with the buffer recorded in the buf log item, then + * free the buf log item and remove the reference to it in the buffer. + * + * This call ignores the recursion count. It is only called when the buffer + * should REALLY be unlocked, regardless of the recursion count. * - * This call ignores the recursion count. It is only called - * when the buffer should REALLY be unlocked, regardless - * of the recursion count. + * We unconditionally drop the transaction's reference to the log item. If the + * item was logged, then another reference was taken when it was pinned, so we + * can safely drop the transaction reference now. This also allows us to avoid + * potential races with the unpin code freeing the bli by not referencing the + * bli after we've dropped the reference count. * - * If the XFS_BLI_HOLD flag is set in the buf log item, then - * free the log item if necessary but do not unlock the buffer. - * This is for support of xfs_trans_bhold(). Make sure the - * XFS_BLI_HOLD field is cleared if we don't free the item. + * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item + * if necessary but do not unlock the buffer. This is for support of + * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't + * free the item. */ STATIC void xfs_buf_item_unlock( @@ -524,73 +537,54 @@ xfs_buf_item_unlock( bp = bip->bli_buf; - /* - * Clear the buffer's association with this transaction. - */ + /* Clear the buffer's association with this transaction. */ XFS_BUF_SET_FSPRIVATE2(bp, NULL); /* - * If this is a transaction abort, don't return early. - * Instead, allow the brelse to happen. - * Normally it would be done for stale (cancelled) buffers - * at unpin time, but we'll never go through the pin/unpin - * cycle if we abort inside commit. + * If this is a transaction abort, don't return early. Instead, allow + * the brelse to happen. Normally it would be done for stale + * (cancelled) buffers at unpin time, but we'll never go through the + * pin/unpin cycle if we abort inside commit. */ aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0; /* - * If the buf item is marked stale, then don't do anything. - * We'll unlock the buffer and free the buf item when the - * buffer is unpinned for the last time. + * Before possibly freeing the buf item, determine if we should + * release the buffer at the end of this routine. */ - if (bip->bli_flags & XFS_BLI_STALE) { - bip->bli_flags &= ~XFS_BLI_LOGGED; - trace_xfs_buf_item_unlock_stale(bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); - if (!aborted) - return; - } + hold = bip->bli_flags & XFS_BLI_HOLD; + + /* Clear the per transaction state. */ + bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD); /* - * Drop the transaction's reference to the log item if - * it was not logged as part of the transaction. Otherwise - * we'll drop the reference in xfs_buf_item_unpin() when - * the transaction is really through with the buffer. + * If the buf item is marked stale, then don't do anything. We'll + * unlock the buffer and free the buf item when the buffer is unpinned + * for the last time. */ - if (!(bip->bli_flags & XFS_BLI_LOGGED)) { - atomic_dec(&bip->bli_refcount); - } else { - /* - * Clear the logged flag since this is per - * transaction state. - */ - bip->bli_flags &= ~XFS_BLI_LOGGED; + if (bip->bli_flags & XFS_BLI_STALE) { + trace_xfs_buf_item_unlock_stale(bip); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + if (!aborted) { + atomic_dec(&bip->bli_refcount); + return; + } } - /* - * Before possibly freeing the buf item, determine if we should - * release the buffer at the end of this routine. - */ - hold = bip->bli_flags & XFS_BLI_HOLD; trace_xfs_buf_item_unlock(bip); /* - * If the buf item isn't tracking any data, free it. - * Otherwise, if XFS_BLI_HOLD is set clear it. + * If the buf item isn't tracking any data, free it, otherwise drop the + * reference we hold to it. */ if (xfs_bitmap_empty(bip->bli_format.blf_data_map, - bip->bli_format.blf_map_size)) { + bip->bli_format.blf_map_size)) xfs_buf_item_relse(bp); - } else if (hold) { - bip->bli_flags &= ~XFS_BLI_HOLD; - } + else + atomic_dec(&bip->bli_refcount); - /* - * Release the buffer if XFS_BLI_HOLD was not set. - */ - if (!hold) { + if (!hold) xfs_buf_relse(bp); - } } /* @@ -626,11 +620,9 @@ xfs_buf_item_committed( } /* - * This is called to asynchronously write the buffer associated with this - * buf log item out to disk. The buffer will already have been locked by - * a successful call to xfs_buf_item_trylock(). If the buffer still has - * B_DELWRI set, then get it going out to disk with a call to bawrite(). - * If not, then just release the buffer. + * The buffer is locked, but is not a delayed write buffer. This happens + * if we race with IO completion and hence we don't want to try to write it + * again. Just release the buffer. */ STATIC void xfs_buf_item_push( @@ -642,17 +634,29 @@ xfs_buf_item_push( trace_xfs_buf_item_push(bip); bp = bip->bli_buf; + ASSERT(!XFS_BUF_ISDELAYWRITE(bp)); + xfs_buf_relse(bp); +} - if (XFS_BUF_ISDELAYWRITE(bp)) { - int error; - error = xfs_bawrite(bip->bli_item.li_mountp, bp); - if (error) - xfs_fs_cmn_err(CE_WARN, bip->bli_item.li_mountp, - "xfs_buf_item_push: pushbuf error %d on bip %p, bp %p", - error, bip, bp); - } else { - xfs_buf_relse(bp); - } +/* + * The buffer is locked and is a delayed write buffer. Promote the buffer + * in the delayed write queue as the caller knows that they must invoke + * the xfsbufd to get this buffer written. We have to unlock the buffer + * to allow the xfsbufd to write it, too. + */ +STATIC void +xfs_buf_item_pushbuf( + xfs_buf_log_item_t *bip) +{ + xfs_buf_t *bp; + + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + trace_xfs_buf_item_pushbuf(bip); + + bp = bip->bli_buf; + ASSERT(XFS_BUF_ISDELAYWRITE(bp)); + xfs_buf_delwri_promote(bp); + xfs_buf_relse(bp); } /* ARGSUSED */ @@ -669,7 +673,7 @@ static struct xfs_item_ops xfs_buf_item_ops = { .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_buf_item_format, .iop_pin = (void(*)(xfs_log_item_t*))xfs_buf_item_pin, - .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_buf_item_unpin, + .iop_unpin = (void(*)(xfs_log_item_t*))xfs_buf_item_unpin, .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *)) xfs_buf_item_unpin_remove, .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock, @@ -677,7 +681,7 @@ static struct xfs_item_ops xfs_buf_item_ops = { .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) xfs_buf_item_committed, .iop_push = (void(*)(xfs_log_item_t*))xfs_buf_item_push, - .iop_pushbuf = NULL, + .iop_pushbuf = (void(*)(xfs_log_item_t*))xfs_buf_item_pushbuf, .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) xfs_buf_item_committing }; @@ -717,20 +721,17 @@ xfs_buf_item_init( } /* - * chunks is the number of XFS_BLI_CHUNK size pieces + * chunks is the number of XFS_BLF_CHUNK size pieces * the buffer can be divided into. Make sure not to * truncate any pieces. map_size is the size of the * bitmap needed to describe the chunks of the buffer. */ - chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLI_CHUNK - 1)) >> XFS_BLI_SHIFT); + chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT); map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT); bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP); - bip->bli_item.li_type = XFS_LI_BUF; - bip->bli_item.li_ops = &xfs_buf_item_ops; - bip->bli_item.li_mountp = mp; - bip->bli_item.li_ailp = mp->m_ail; + xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops); bip->bli_buf = bp; xfs_buf_hold(bp); bip->bli_format.blf_type = XFS_LI_BUF; @@ -793,8 +794,8 @@ xfs_buf_item_log( /* * Convert byte offsets to bit numbers. */ - first_bit = first >> XFS_BLI_SHIFT; - last_bit = last >> XFS_BLI_SHIFT; + first_bit = first >> XFS_BLF_SHIFT; + last_bit = last >> XFS_BLF_SHIFT; /* * Calculate the total number of bits to be set. diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 217f34af00cb..f20bb472d582 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -26,7 +26,7 @@ extern kmem_zone_t *xfs_buf_item_zone; * have been logged. * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything. */ -typedef struct xfs_buf_log_format_t { +typedef struct xfs_buf_log_format { unsigned short blf_type; /* buf log item type indicator */ unsigned short blf_size; /* size of this item */ ushort blf_flags; /* misc state */ @@ -41,22 +41,22 @@ typedef struct xfs_buf_log_format_t { * This flag indicates that the buffer contains on disk inodes * and requires special recovery handling. */ -#define XFS_BLI_INODE_BUF 0x1 +#define XFS_BLF_INODE_BUF 0x1 /* * This flag indicates that the buffer should not be replayed * during recovery because its blocks are being freed. */ -#define XFS_BLI_CANCEL 0x2 +#define XFS_BLF_CANCEL 0x2 /* * This flag indicates that the buffer contains on disk * user or group dquots and may require special recovery handling. */ -#define XFS_BLI_UDQUOT_BUF 0x4 -#define XFS_BLI_PDQUOT_BUF 0x8 -#define XFS_BLI_GDQUOT_BUF 0x10 +#define XFS_BLF_UDQUOT_BUF 0x4 +#define XFS_BLF_PDQUOT_BUF 0x8 +#define XFS_BLF_GDQUOT_BUF 0x10 -#define XFS_BLI_CHUNK 128 -#define XFS_BLI_SHIFT 7 +#define XFS_BLF_CHUNK 128 +#define XFS_BLF_SHIFT 7 #define BIT_TO_WORD_SHIFT 5 #define NBWORD (NBBY * sizeof(unsigned int)) @@ -69,6 +69,7 @@ typedef struct xfs_buf_log_format_t { #define XFS_BLI_LOGGED 0x08 #define XFS_BLI_INODE_ALLOC_BUF 0x10 #define XFS_BLI_STALE_INODE 0x20 +#define XFS_BLI_INODE_BUF 0x40 #define XFS_BLI_FLAGS \ { XFS_BLI_HOLD, "HOLD" }, \ @@ -76,7 +77,8 @@ typedef struct xfs_buf_log_format_t { { XFS_BLI_STALE, "STALE" }, \ { XFS_BLI_LOGGED, "LOGGED" }, \ { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \ - { XFS_BLI_STALE_INODE, "STALE_INODE" } + { XFS_BLI_STALE_INODE, "STALE_INODE" }, \ + { XFS_BLI_INODE_BUF, "INODE_BUF" } #ifdef __KERNEL__ diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index c0c8869115b1..0ca556b4bf31 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -1534,8 +1534,8 @@ xfs_da_hashname(const __uint8_t *name, int namelen) enum xfs_dacmp xfs_da_compname( struct xfs_da_args *args, - const char *name, - int len) + const unsigned char *name, + int len) { return (args->namelen == len && memcmp(args->name, name, len) == 0) ? XFS_CMP_EXACT : XFS_CMP_DIFFERENT; diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index 30cd08f56a3a..fe9f5a8c1d2a 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -209,7 +209,8 @@ typedef struct xfs_da_state { */ struct xfs_nameops { xfs_dahash_t (*hashname)(struct xfs_name *); - enum xfs_dacmp (*compname)(struct xfs_da_args *, const char *, int); + enum xfs_dacmp (*compname)(struct xfs_da_args *, + const unsigned char *, int); }; @@ -260,7 +261,7 @@ int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, uint xfs_da_hashname(const __uint8_t *name_string, int name_length); enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, - const char *name, int len); + const unsigned char *name, int len); xfs_da_state_t *xfs_da_state_alloc(void); diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 84ca1cf16a1e..7f159d2a429a 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -45,15 +45,21 @@ #include "xfs_vnodeops.h" #include "xfs_trace.h" + +static int xfs_swap_extents( + xfs_inode_t *ip, /* target inode */ + xfs_inode_t *tip, /* tmp inode */ + xfs_swapext_t *sxp); + /* - * Syssgi interface for swapext + * ioctl interface for swapext */ int xfs_swapext( xfs_swapext_t *sxp) { xfs_inode_t *ip, *tip; - struct file *file, *target_file; + struct file *file, *tmp_file; int error = 0; /* Pull information for the target fd */ @@ -63,51 +69,54 @@ xfs_swapext( goto out; } - if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) { + if (!(file->f_mode & FMODE_WRITE) || + !(file->f_mode & FMODE_READ) || + (file->f_flags & O_APPEND)) { error = XFS_ERROR(EBADF); goto out_put_file; } - target_file = fget((int)sxp->sx_fdtmp); - if (!target_file) { + tmp_file = fget((int)sxp->sx_fdtmp); + if (!tmp_file) { error = XFS_ERROR(EINVAL); goto out_put_file; } - if (!(target_file->f_mode & FMODE_WRITE) || - (target_file->f_flags & O_APPEND)) { + if (!(tmp_file->f_mode & FMODE_WRITE) || + !(tmp_file->f_mode & FMODE_READ) || + (tmp_file->f_flags & O_APPEND)) { error = XFS_ERROR(EBADF); - goto out_put_target_file; + goto out_put_tmp_file; } if (IS_SWAPFILE(file->f_path.dentry->d_inode) || - IS_SWAPFILE(target_file->f_path.dentry->d_inode)) { + IS_SWAPFILE(tmp_file->f_path.dentry->d_inode)) { error = XFS_ERROR(EINVAL); - goto out_put_target_file; + goto out_put_tmp_file; } ip = XFS_I(file->f_path.dentry->d_inode); - tip = XFS_I(target_file->f_path.dentry->d_inode); + tip = XFS_I(tmp_file->f_path.dentry->d_inode); if (ip->i_mount != tip->i_mount) { error = XFS_ERROR(EINVAL); - goto out_put_target_file; + goto out_put_tmp_file; } if (ip->i_ino == tip->i_ino) { error = XFS_ERROR(EINVAL); - goto out_put_target_file; + goto out_put_tmp_file; } if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { error = XFS_ERROR(EIO); - goto out_put_target_file; + goto out_put_tmp_file; } error = xfs_swap_extents(ip, tip, sxp); - out_put_target_file: - fput(target_file); + out_put_tmp_file: + fput(tmp_file); out_put_file: fput(file); out: @@ -171,22 +180,32 @@ xfs_swap_extents_check_format( XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max) return EINVAL; - /* Check root block of temp in btree form to max in target */ + /* + * If we are in a btree format, check that the temp root block will fit + * in the target and that it has enough extents to be in btree format + * in the target. + * + * Note that we have to be careful to allow btree->extent conversions + * (a common defrag case) which will occur when the temp inode is in + * extent format... + */ if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE && - XFS_IFORK_BOFF(ip) && - tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) + ((XFS_IFORK_BOFF(ip) && + tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) || + XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= ip->i_df.if_ext_max)) return EINVAL; - /* Check root block of target in btree form to max in temp */ + /* Reciprocal target->temp btree format checks */ if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && - XFS_IFORK_BOFF(tip) && - ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) + ((XFS_IFORK_BOFF(tip) && + ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) || + XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= tip->i_df.if_ext_max)) return EINVAL; return 0; } -int +static int xfs_swap_extents( xfs_inode_t *ip, /* target inode */ xfs_inode_t *tip, /* tmp inode */ @@ -254,6 +273,9 @@ xfs_swap_extents( goto out_unlock; } + trace_xfs_swap_extent_before(ip, 0); + trace_xfs_swap_extent_before(tip, 1); + /* check inode formats now that data is flushed */ error = xfs_swap_extents_check_format(ip, tip); if (error) { @@ -421,6 +443,8 @@ xfs_swap_extents( error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT); + trace_xfs_swap_extent_after(ip, 0); + trace_xfs_swap_extent_after(tip, 1); out: kmem_free(tempifp); return error; diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h index 4f55a6306558..20bdd935c121 100644 --- a/fs/xfs/xfs_dfrag.h +++ b/fs/xfs/xfs_dfrag.h @@ -48,9 +48,6 @@ typedef struct xfs_swapext */ int xfs_swapext(struct xfs_swapext *sx); -int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip, - struct xfs_swapext *sxp); - #endif /* __KERNEL__ */ #endif /* __XFS_DFRAG_H__ */ diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index 93634a7e90e9..42520f041265 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -44,7 +44,7 @@ #include "xfs_vnodeops.h" #include "xfs_trace.h" -struct xfs_name xfs_name_dotdot = {"..", 2}; +struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2}; /* * ASCII case-insensitive (ie. A-Z) support for directories that was @@ -66,8 +66,8 @@ xfs_ascii_ci_hashname( STATIC enum xfs_dacmp xfs_ascii_ci_compname( struct xfs_da_args *args, - const char *name, - int len) + const unsigned char *name, + int len) { enum xfs_dacmp result; int i; @@ -247,7 +247,7 @@ xfs_dir_createname( int xfs_dir_cilookup_result( struct xfs_da_args *args, - const char *name, + const unsigned char *name, int len) { if (args->cmpresult == XFS_CMP_DIFFERENT) diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h index 1d9ef96f33aa..74a3b1057685 100644 --- a/fs/xfs/xfs_dir2.h +++ b/fs/xfs/xfs_dir2.h @@ -100,7 +100,7 @@ extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, struct xfs_dabuf *bp); -extern int xfs_dir_cilookup_result(struct xfs_da_args *args, const char *name, - int len); +extern int xfs_dir_cilookup_result(struct xfs_da_args *args, + const unsigned char *name, int len); #endif /* __XFS_DIR2_H__ */ diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index ddc4ecc7807f..779a267b0a84 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -57,8 +57,8 @@ static xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot; void xfs_dir_startup(void) { - xfs_dir_hash_dot = xfs_da_hashname(".", 1); - xfs_dir_hash_dotdot = xfs_da_hashname("..", 2); + xfs_dir_hash_dot = xfs_da_hashname((unsigned char *)".", 1); + xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2); } /* @@ -513,8 +513,9 @@ xfs_dir2_block_getdents( /* * If it didn't fit, set the final offset to here & return. */ - if (filldir(dirent, dep->name, dep->namelen, cook & 0x7fffffff, - be64_to_cpu(dep->inumber), DT_UNKNOWN)) { + if (filldir(dirent, (char *)dep->name, dep->namelen, + cook & 0x7fffffff, be64_to_cpu(dep->inumber), + DT_UNKNOWN)) { *offset = cook & 0x7fffffff; xfs_da_brelse(NULL, bp); return 0; diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 29f484c11b3a..e2d89854ec9e 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -1081,7 +1081,7 @@ xfs_dir2_leaf_getdents( dep = (xfs_dir2_data_entry_t *)ptr; length = xfs_dir2_data_entsize(dep->namelen); - if (filldir(dirent, dep->name, dep->namelen, + if (filldir(dirent, (char *)dep->name, dep->namelen, xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff, be64_to_cpu(dep->inumber), DT_UNKNOWN)) break; diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index ce6e355199b5..78fc4d9ae756 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -65,7 +65,7 @@ static int xfs_dir2_node_addname_int(xfs_da_args_t *args, /* * Log entries from a freespace block. */ -void +STATIC void xfs_dir2_free_log_bests( xfs_trans_t *tp, /* transaction pointer */ xfs_dabuf_t *bp, /* freespace buffer */ diff --git a/fs/xfs/xfs_dir2_node.h b/fs/xfs/xfs_dir2_node.h index dde72db3d695..82dfe7147195 100644 --- a/fs/xfs/xfs_dir2_node.h +++ b/fs/xfs/xfs_dir2_node.h @@ -75,8 +75,6 @@ xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db) return ((db) % XFS_DIR2_MAX_FREE_BESTS(mp)); } -extern void xfs_dir2_free_log_bests(struct xfs_trans *tp, struct xfs_dabuf *bp, - int first, int last); extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args, struct xfs_dabuf *lbp); extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_dabuf *bp, int *count); diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c index 9d4f17a69676..c1a5945d463a 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/xfs_dir2_sf.c @@ -782,7 +782,7 @@ xfs_dir2_sf_getdents( } ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep)); - if (filldir(dirent, sfep->name, sfep->namelen, + if (filldir(dirent, (char *)sfep->name, sfep->namelen, off & 0x7fffffff, ino, DT_UNKNOWN)) { *offset = off & 0x7fffffff; return 0; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 92d5cd5bf4f2..047b8a8e5c29 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -170,7 +170,7 @@ xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...) va_list ap; #ifdef DEBUG - xfs_panic_mask |= XFS_PTAG_SHUTDOWN_CORRUPT; + xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES); #endif if (xfs_panic_mask && (xfs_panic_mask & panic_tag) @@ -186,18 +186,18 @@ xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...) void xfs_error_report( - char *tag, - int level, - xfs_mount_t *mp, - char *fname, - int linenum, - inst_t *ra) + const char *tag, + int level, + struct xfs_mount *mp, + const char *filename, + int linenum, + inst_t *ra) { if (level <= xfs_error_level) { xfs_cmn_err(XFS_PTAG_ERROR_REPORT, CE_ALERT, mp, "XFS internal error %s at line %d of file %s. Caller 0x%p\n", - tag, linenum, fname, ra); + tag, linenum, filename, ra); xfs_stack_trace(); } @@ -205,15 +205,15 @@ xfs_error_report( void xfs_corruption_error( - char *tag, - int level, - xfs_mount_t *mp, - void *p, - char *fname, - int linenum, - inst_t *ra) + const char *tag, + int level, + struct xfs_mount *mp, + void *p, + const char *filename, + int linenum, + inst_t *ra) { if (level <= xfs_error_level) xfs_hex_dump(p, 16); - xfs_error_report(tag, level, mp, fname, linenum, ra); + xfs_error_report(tag, level, mp, filename, linenum, ra); } diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 0c93051c4651..c2c1a072bb82 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -29,10 +29,11 @@ extern int xfs_error_trap(int); struct xfs_mount; -extern void xfs_error_report(char *tag, int level, struct xfs_mount *mp, - char *fname, int linenum, inst_t *ra); -extern void xfs_corruption_error(char *tag, int level, struct xfs_mount *mp, - void *p, char *fname, int linenum, inst_t *ra); +extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, + const char *filename, int linenum, inst_t *ra); +extern void xfs_corruption_error(const char *tag, int level, + struct xfs_mount *mp, void *p, const char *filename, + int linenum, inst_t *ra); #define XFS_ERROR_REPORT(e, lvl, mp) \ xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address) diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 05a4bdd4be39..409fe81585fd 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -82,7 +82,7 @@ xfs_efi_item_format(xfs_efi_log_item_t *efip, log_vector->i_addr = (xfs_caddr_t)&(efip->efi_format); log_vector->i_len = size; - XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_EFI_FORMAT); + log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT; ASSERT(size >= sizeof(xfs_efi_log_format_t)); } @@ -106,7 +106,7 @@ xfs_efi_item_pin(xfs_efi_log_item_t *efip) */ /*ARGSUSED*/ STATIC void -xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale) +xfs_efi_item_unpin(xfs_efi_log_item_t *efip) { struct xfs_ail *ailp = efip->efi_item.li_ailp; @@ -224,7 +224,7 @@ static struct xfs_item_ops xfs_efi_item_ops = { .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_efi_item_format, .iop_pin = (void(*)(xfs_log_item_t*))xfs_efi_item_pin, - .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_efi_item_unpin, + .iop_unpin = (void(*)(xfs_log_item_t*))xfs_efi_item_unpin, .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *)) xfs_efi_item_unpin_remove, .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efi_item_trylock, @@ -259,10 +259,7 @@ xfs_efi_init(xfs_mount_t *mp, KM_SLEEP); } - efip->efi_item.li_type = XFS_LI_EFI; - efip->efi_item.li_ops = &xfs_efi_item_ops; - efip->efi_item.li_mountp = mp; - efip->efi_item.li_ailp = mp->m_ail; + xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); efip->efi_format.efi_nextents = nextents; efip->efi_format.efi_id = (__psint_t)(void*)efip; @@ -406,7 +403,7 @@ xfs_efd_item_format(xfs_efd_log_item_t *efdp, log_vector->i_addr = (xfs_caddr_t)&(efdp->efd_format); log_vector->i_len = size; - XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_EFD_FORMAT); + log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT; ASSERT(size >= sizeof(xfs_efd_log_format_t)); } @@ -428,7 +425,7 @@ xfs_efd_item_pin(xfs_efd_log_item_t *efdp) */ /*ARGSUSED*/ STATIC void -xfs_efd_item_unpin(xfs_efd_log_item_t *efdp, int stale) +xfs_efd_item_unpin(xfs_efd_log_item_t *efdp) { return; } @@ -518,7 +515,7 @@ static struct xfs_item_ops xfs_efd_item_ops = { .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_efd_item_format, .iop_pin = (void(*)(xfs_log_item_t*))xfs_efd_item_pin, - .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_efd_item_unpin, + .iop_unpin = (void(*)(xfs_log_item_t*))xfs_efd_item_unpin, .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) xfs_efd_item_unpin_remove, .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efd_item_trylock, @@ -554,10 +551,7 @@ xfs_efd_init(xfs_mount_t *mp, KM_SLEEP); } - efdp->efd_item.li_type = XFS_LI_EFD; - efdp->efd_item.li_ops = &xfs_efd_item_ops; - efdp->efd_item.li_mountp = mp; - efdp->efd_item.li_ailp = mp->m_ail; + xfs_log_item_init(mp, &efdp->efd_item, XFS_LI_EFD, &xfs_efd_item_ops); efdp->efd_efip = efip; efdp->efd_format.efd_nextents = nextents; efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index a631e1451abb..390850ee6603 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -140,6 +140,7 @@ _xfs_filestream_pick_ag( int flags, xfs_extlen_t minlen) { + int streams, max_streams; int err, trylock, nscan; xfs_extlen_t longest, free, minfree, maxfree = 0; xfs_agnumber_t ag, max_ag = NULLAGNUMBER; @@ -155,15 +156,15 @@ _xfs_filestream_pick_ag( trylock = XFS_ALLOC_FLAG_TRYLOCK; for (nscan = 0; 1; nscan++) { - - TRACE_AG_SCAN(mp, ag, xfs_filestream_peek_ag(mp, ag)); - - pag = mp->m_perag + ag; + pag = xfs_perag_get(mp, ag); + TRACE_AG_SCAN(mp, ag, atomic_read(&pag->pagf_fstrms)); if (!pag->pagf_init) { err = xfs_alloc_pagf_init(mp, NULL, ag, trylock); - if (err && !trylock) + if (err && !trylock) { + xfs_perag_put(pag); return err; + } } /* Might fail sometimes during the 1st pass with trylock set. */ @@ -173,6 +174,7 @@ _xfs_filestream_pick_ag( /* Keep track of the AG with the most free blocks. */ if (pag->pagf_freeblks > maxfree) { maxfree = pag->pagf_freeblks; + max_streams = atomic_read(&pag->pagf_fstrms); max_ag = ag; } @@ -195,6 +197,8 @@ _xfs_filestream_pick_ag( /* Break out, retaining the reference on the AG. */ free = pag->pagf_freeblks; + streams = atomic_read(&pag->pagf_fstrms); + xfs_perag_put(pag); *agp = ag; break; } @@ -202,6 +206,7 @@ _xfs_filestream_pick_ag( /* Drop the reference on this AG, it's not usable. */ xfs_filestream_put_ag(mp, ag); next_ag: + xfs_perag_put(pag); /* Move to the next AG, wrapping to AG 0 if necessary. */ if (++ag >= mp->m_sb.sb_agcount) ag = 0; @@ -229,6 +234,7 @@ next_ag: if (max_ag != NULLAGNUMBER) { xfs_filestream_get_ag(mp, max_ag); TRACE_AG_PICK1(mp, max_ag, maxfree); + streams = max_streams; free = maxfree; *agp = max_ag; break; @@ -240,16 +246,14 @@ next_ag: return 0; } - TRACE_AG_PICK2(mp, startag, *agp, xfs_filestream_peek_ag(mp, *agp), - free, nscan, flags); + TRACE_AG_PICK2(mp, startag, *agp, streams, free, nscan, flags); return 0; } /* * Set the allocation group number for a file or a directory, updating inode - * references and per-AG references as appropriate. Must be called with the - * m_peraglock held in read mode. + * references and per-AG references as appropriate. */ static int _xfs_filestream_update_ag( @@ -451,20 +455,6 @@ xfs_filestream_unmount( } /* - * If the mount point's m_perag array is going to be reallocated, all - * outstanding cache entries must be flushed to avoid accessing reference count - * addresses that have been freed. The call to xfs_filestream_flush() must be - * made inside the block that holds the m_peraglock in write mode to do the - * reallocation. - */ -void -xfs_filestream_flush( - xfs_mount_t *mp) -{ - xfs_mru_cache_flush(mp->m_filestream); -} - -/* * Return the AG of the filestream the file or directory belongs to, or * NULLAGNUMBER otherwise. */ @@ -526,7 +516,6 @@ xfs_filestream_associate( mp = pip->i_mount; cache = mp->m_filestream; - down_read(&mp->m_peraglock); /* * We have a problem, Houston. @@ -543,10 +532,8 @@ xfs_filestream_associate( * * So, if we can't get the iolock without sleeping then just give up */ - if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL)) { - up_read(&mp->m_peraglock); + if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL)) return 1; - } /* If the parent directory is already in the cache, use its AG. */ item = xfs_mru_cache_lookup(cache, pip->i_ino); @@ -601,7 +588,6 @@ exit_did_pick: exit: xfs_iunlock(pip, XFS_IOLOCK_EXCL); - up_read(&mp->m_peraglock); return -err; } diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h index 4aba67c5f64f..260f757bbc5d 100644 --- a/fs/xfs/xfs_filestream.h +++ b/fs/xfs/xfs_filestream.h @@ -79,12 +79,21 @@ extern ktrace_t *xfs_filestreams_trace_buf; * the cache that reference per-ag array elements that have since been * reallocated. */ +/* + * xfs_filestream_peek_ag is only used in tracing code + */ static inline int xfs_filestream_peek_ag( xfs_mount_t *mp, xfs_agnumber_t agno) { - return atomic_read(&mp->m_perag[agno].pagf_fstrms); + struct xfs_perag *pag; + int ret; + + pag = xfs_perag_get(mp, agno); + ret = atomic_read(&pag->pagf_fstrms); + xfs_perag_put(pag); + return ret; } static inline int @@ -92,7 +101,13 @@ xfs_filestream_get_ag( xfs_mount_t *mp, xfs_agnumber_t agno) { - return atomic_inc_return(&mp->m_perag[agno].pagf_fstrms); + struct xfs_perag *pag; + int ret; + + pag = xfs_perag_get(mp, agno); + ret = atomic_inc_return(&pag->pagf_fstrms); + xfs_perag_put(pag); + return ret; } static inline int @@ -100,7 +115,13 @@ xfs_filestream_put_ag( xfs_mount_t *mp, xfs_agnumber_t agno) { - return atomic_dec_return(&mp->m_perag[agno].pagf_fstrms); + struct xfs_perag *pag; + int ret; + + pag = xfs_perag_get(mp, agno); + ret = atomic_dec_return(&pag->pagf_fstrms); + xfs_perag_put(pag); + return ret; } /* allocation selection flags */ @@ -114,7 +135,6 @@ int xfs_filestream_init(void); void xfs_filestream_uninit(void); int xfs_filestream_mount(struct xfs_mount *mp); void xfs_filestream_unmount(struct xfs_mount *mp); -void xfs_filestream_flush(struct xfs_mount *mp); xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip); int xfs_filestream_associate(struct xfs_inode *dip, struct xfs_inode *ip); void xfs_filestream_deassociate(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index f52ac276277e..7cf7220e7d5f 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -292,7 +292,8 @@ typedef struct xfs_bstat { __s32 bs_extents; /* number of extents */ __u32 bs_gen; /* generation count */ __u16 bs_projid; /* project id */ - unsigned char bs_pad[14]; /* pad space, unused */ + __u16 bs_forkoff; /* inode fork offset in bytes */ + unsigned char bs_pad[12]; /* pad space, unused */ __u32 bs_dmevmask; /* DMIG event mask */ __u16 bs_dmstate; /* DMIG state info */ __u16 bs_aextents; /* attribute number of extents */ diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index a13919a6a364..37a6f62c57b6 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -167,27 +167,14 @@ xfs_growfs_data_private( } new = nb - mp->m_sb.sb_dblocks; oagcount = mp->m_sb.sb_agcount; - if (nagcount > oagcount) { - void *new_perag, *old_perag; - - xfs_filestream_flush(mp); - - new_perag = kmem_zalloc(sizeof(xfs_perag_t) * nagcount, - KM_MAYFAIL); - if (!new_perag) - return XFS_ERROR(ENOMEM); - - down_write(&mp->m_peraglock); - memcpy(new_perag, mp->m_perag, sizeof(xfs_perag_t) * oagcount); - old_perag = mp->m_perag; - mp->m_perag = new_perag; - - mp->m_flags |= XFS_MOUNT_32BITINODES; - nagimax = xfs_initialize_perag(mp, nagcount); - up_write(&mp->m_peraglock); - kmem_free(old_perag); + /* allocate the new per-ag structures */ + if (nagcount > oagcount) { + error = xfs_initialize_perag(mp, nagcount, &nagimax); + if (error) + return error; } + tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS); tp->t_flags |= XFS_TRANS_RESERVE; if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp), @@ -196,6 +183,11 @@ xfs_growfs_data_private( return error; } + /* + * Write new AG headers to disk. Non-transactional, but written + * synchronously so they are completed prior to the growfs transaction + * being logged. + */ nfree = 0; for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) { /* @@ -359,6 +351,12 @@ xfs_growfs_data_private( goto error0; } } + + /* + * Update changed superblock fields transactionally. These are not + * seen by the rest of the world until the transaction commit applies + * them atomically to the superblock. + */ if (nagcount > oagcount) xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount); if (nb > mp->m_sb.sb_dblocks) @@ -369,9 +367,9 @@ xfs_growfs_data_private( if (dpct) xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct); error = xfs_trans_commit(tp, 0); - if (error) { + if (error) return error; - } + /* New allocation groups fully initialized, so update mount struct */ if (nagimax) mp->m_maxagi = nagimax; @@ -381,6 +379,8 @@ xfs_growfs_data_private( mp->m_maxicount = icount << mp->m_sb.sb_inopblog; } else mp->m_maxicount = 0; + + /* update secondary superblocks. */ for (agno = 1; agno < nagcount; agno++) { error = xfs_read_buf(mp, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index cb907ba69c4c..c7142a064c48 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -205,7 +205,7 @@ xfs_ialloc_inode_init( d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster)); fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize * blks_per_cluster, - XFS_BUF_LOCK); + XBF_LOCK); ASSERT(fbuf); ASSERT(!XFS_BUF_GETERROR(fbuf)); @@ -253,6 +253,7 @@ xfs_ialloc_ag_alloc( xfs_agino_t thisino; /* current inode number, for loop */ int isaligned = 0; /* inode allocation at stripe unit */ /* boundary */ + struct xfs_perag *pag; args.tp = tp; args.mp = tp->t_mountp; @@ -382,9 +383,9 @@ xfs_ialloc_ag_alloc( newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0); be32_add_cpu(&agi->agi_count, newlen); be32_add_cpu(&agi->agi_freecount, newlen); - down_read(&args.mp->m_peraglock); - args.mp->m_perag[agno].pagi_freecount += newlen; - up_read(&args.mp->m_peraglock); + pag = xfs_perag_get(args.mp, agno); + pag->pagi_freecount += newlen; + xfs_perag_put(pag); agi->agi_newino = cpu_to_be32(newino); /* @@ -486,9 +487,8 @@ xfs_ialloc_ag_select( */ agno = pagno; flags = XFS_ALLOC_FLAG_TRYLOCK; - down_read(&mp->m_peraglock); for (;;) { - pag = &mp->m_perag[agno]; + pag = xfs_perag_get(mp, agno); if (!pag->pagi_init) { if (xfs_ialloc_read_agi(mp, tp, agno, &agbp)) { agbp = NULL; @@ -527,7 +527,7 @@ xfs_ialloc_ag_select( agbp = NULL; goto nextag; } - up_read(&mp->m_peraglock); + xfs_perag_put(pag); return agbp; } } @@ -535,22 +535,19 @@ unlock_nextag: if (agbp) xfs_trans_brelse(tp, agbp); nextag: + xfs_perag_put(pag); /* * No point in iterating over the rest, if we're shutting * down. */ - if (XFS_FORCED_SHUTDOWN(mp)) { - up_read(&mp->m_peraglock); + if (XFS_FORCED_SHUTDOWN(mp)) return NULL; - } agno++; if (agno >= agcount) agno = 0; if (agno == pagno) { - if (flags == 0) { - up_read(&mp->m_peraglock); + if (flags == 0) return NULL; - } flags = 0; } } @@ -672,6 +669,7 @@ xfs_dialloc( xfs_agnumber_t tagno; /* testing allocation group number */ xfs_btree_cur_t *tcur; /* temp cursor */ xfs_inobt_rec_incore_t trec; /* temp inode allocation record */ + struct xfs_perag *pag; if (*IO_agbp == NULL) { @@ -771,13 +769,13 @@ nextag: *inop = NULLFSINO; return noroom ? ENOSPC : 0; } - down_read(&mp->m_peraglock); - if (mp->m_perag[tagno].pagi_inodeok == 0) { - up_read(&mp->m_peraglock); + pag = xfs_perag_get(mp, tagno); + if (pag->pagi_inodeok == 0) { + xfs_perag_put(pag); goto nextag; } error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp); - up_read(&mp->m_peraglock); + xfs_perag_put(pag); if (error) goto nextag; agi = XFS_BUF_TO_AGI(agbp); @@ -790,6 +788,7 @@ nextag: */ agno = tagno; *IO_agbp = NULL; + pag = xfs_perag_get(mp, agno); restart_pagno: cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno)); @@ -808,7 +807,6 @@ nextag: * If in the same AG as the parent, try to get near the parent. */ if (pagno == agno) { - xfs_perag_t *pag = &mp->m_perag[agno]; int doneleft; /* done, to the left */ int doneright; /* done, to the right */ int searchdistance = 10; @@ -1006,9 +1004,7 @@ alloc_inode: goto error0; be32_add_cpu(&agi->agi_freecount, -1); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); - down_read(&mp->m_peraglock); - mp->m_perag[tagno].pagi_freecount--; - up_read(&mp->m_peraglock); + pag->pagi_freecount--; error = xfs_check_agi_freecount(cur, agi); if (error) @@ -1016,12 +1012,14 @@ alloc_inode: xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); + xfs_perag_put(pag); *inop = ino; return 0; error1: xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); error0: xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + xfs_perag_put(pag); return error; } @@ -1052,6 +1050,7 @@ xfs_difree( xfs_mount_t *mp; /* mount structure for filesystem */ int off; /* offset of inode in inode chunk */ xfs_inobt_rec_incore_t rec; /* btree record */ + struct xfs_perag *pag; mp = tp->t_mountp; @@ -1088,9 +1087,7 @@ xfs_difree( /* * Get the allocation group header. */ - down_read(&mp->m_peraglock); error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); - up_read(&mp->m_peraglock); if (error) { cmn_err(CE_WARN, "xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s. Returning error.", @@ -1157,9 +1154,9 @@ xfs_difree( be32_add_cpu(&agi->agi_count, -ilen); be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); - down_read(&mp->m_peraglock); - mp->m_perag[agno].pagi_freecount -= ilen - 1; - up_read(&mp->m_peraglock); + pag = xfs_perag_get(mp, agno); + pag->pagi_freecount -= ilen - 1; + xfs_perag_put(pag); xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); @@ -1188,9 +1185,9 @@ xfs_difree( */ be32_add_cpu(&agi->agi_freecount, 1); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); - down_read(&mp->m_peraglock); - mp->m_perag[agno].pagi_freecount++; - up_read(&mp->m_peraglock); + pag = xfs_perag_get(mp, agno); + pag->pagi_freecount++; + xfs_perag_put(pag); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); } @@ -1206,6 +1203,63 @@ error0: return error; } +STATIC int +xfs_imap_lookup( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agino_t agino, + xfs_agblock_t agbno, + xfs_agblock_t *chunk_agbno, + xfs_agblock_t *offset_agbno, + int flags) +{ + struct xfs_inobt_rec_incore rec; + struct xfs_btree_cur *cur; + struct xfs_buf *agbp; + xfs_agino_t startino; + int error; + int i; + + error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); + if (error) { + xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " + "xfs_ialloc_read_agi() returned " + "error %d, agno %d", + error, agno); + return error; + } + + /* + * derive and lookup the exact inode record for the given agino. If the + * record cannot be found, then it's an invalid inode number and we + * should abort. + */ + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); + startino = agino & ~(XFS_IALLOC_INODES(mp) - 1); + error = xfs_inobt_lookup(cur, startino, XFS_LOOKUP_EQ, &i); + if (!error) { + if (i) + error = xfs_inobt_get_rec(cur, &rec, &i); + if (!error && i == 0) + error = EINVAL; + } + + xfs_trans_brelse(tp, agbp); + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + if (error) + return error; + + /* for untrusted inodes check it is allocated first */ + if ((flags & XFS_IGET_UNTRUSTED) && + (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino))) + return EINVAL; + + *chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino); + *offset_agbno = agbno - *chunk_agbno; + return 0; +} + /* * Return the location of the inode in imap, for mapping it into a buffer. */ @@ -1238,8 +1292,11 @@ xfs_imap( if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks || ino != XFS_AGINO_TO_INO(mp, agno, agino)) { #ifdef DEBUG - /* no diagnostics for bulkstat, ino comes from userspace */ - if (flags & XFS_IGET_BULKSTAT) + /* + * Don't output diagnostic information for untrusted inodes + * as they can be invalid without implying corruption. + */ + if (flags & XFS_IGET_UNTRUSTED) return XFS_ERROR(EINVAL); if (agno >= mp->m_sb.sb_agcount) { xfs_fs_cmn_err(CE_ALERT, mp, @@ -1266,6 +1323,23 @@ xfs_imap( return XFS_ERROR(EINVAL); } + blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog; + + /* + * For bulkstat and handle lookups, we have an untrusted inode number + * that we have to verify is valid. We cannot do this just by reading + * the inode buffer as it may have been unlinked and removed leaving + * inodes in stale state on disk. Hence we have to do a btree lookup + * in all cases where an untrusted inode number is passed. + */ + if (flags & XFS_IGET_UNTRUSTED) { + error = xfs_imap_lookup(mp, tp, agno, agino, agbno, + &chunk_agbno, &offset_agbno, flags); + if (error) + return error; + goto out_map; + } + /* * If the inode cluster size is the same as the blocksize or * smaller we get to the buffer by simple arithmetics. @@ -1280,24 +1354,6 @@ xfs_imap( return 0; } - blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog; - - /* - * If we get a block number passed from bulkstat we can use it to - * find the buffer easily. - */ - if (imap->im_blkno) { - offset = XFS_INO_TO_OFFSET(mp, ino); - ASSERT(offset < mp->m_sb.sb_inopblock); - - cluster_agbno = xfs_daddr_to_agbno(mp, imap->im_blkno); - offset += (agbno - cluster_agbno) * mp->m_sb.sb_inopblock; - - imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster); - imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog); - return 0; - } - /* * If the inode chunks are aligned then use simple maths to * find the location. Otherwise we have to do a btree @@ -1307,52 +1363,13 @@ xfs_imap( offset_agbno = agbno & mp->m_inoalign_mask; chunk_agbno = agbno - offset_agbno; } else { - xfs_btree_cur_t *cur; /* inode btree cursor */ - xfs_inobt_rec_incore_t chunk_rec; - xfs_buf_t *agbp; /* agi buffer */ - int i; /* temp state */ - - down_read(&mp->m_peraglock); - error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); - up_read(&mp->m_peraglock); - if (error) { - xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " - "xfs_ialloc_read_agi() returned " - "error %d, agno %d", - error, agno); - return error; - } - - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); - error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i); - if (error) { - xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " - "xfs_inobt_lookup() failed"); - goto error0; - } - - error = xfs_inobt_get_rec(cur, &chunk_rec, &i); - if (error) { - xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " - "xfs_inobt_get_rec() failed"); - goto error0; - } - if (i == 0) { -#ifdef DEBUG - xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " - "xfs_inobt_get_rec() failed"); -#endif /* DEBUG */ - error = XFS_ERROR(EINVAL); - } - error0: - xfs_trans_brelse(tp, agbp); - xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + error = xfs_imap_lookup(mp, tp, agno, agino, agbno, + &chunk_agbno, &offset_agbno, flags); if (error) return error; - chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_rec.ir_startino); - offset_agbno = agbno - chunk_agbno; } +out_map: ASSERT(agbno >= chunk_agbno); cluster_agbno = chunk_agbno + ((offset_agbno / blks_per_cluster) * blks_per_cluster); @@ -1379,7 +1396,6 @@ xfs_imap( XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); return XFS_ERROR(EINVAL); } - return 0; } @@ -1523,8 +1539,7 @@ xfs_ialloc_read_agi( return error; agi = XFS_BUF_TO_AGI(*bpp); - pag = &mp->m_perag[agno]; - + pag = xfs_perag_get(mp, agno); if (!pag->pagi_init) { pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); pag->pagi_count = be32_to_cpu(agi->agi_count); @@ -1537,6 +1552,7 @@ xfs_ialloc_read_agi( */ ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || XFS_FORCED_SHUTDOWN(mp)); + xfs_perag_put(pag); return 0; } diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 155e798f30a1..8f8b91be2c99 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -190,13 +190,12 @@ xfs_iget_cache_hit( trace_xfs_iget_reclaim(ip); /* - * We need to set XFS_INEW atomically with clearing the - * reclaimable tag so that we do have an indicator of the - * inode still being initialized. + * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode + * from stomping over us while we recycle the inode. We can't + * clear the radix tree reclaimable tag yet as it requires + * pag_ici_lock to be held exclusive. */ - ip->i_flags |= XFS_INEW; - ip->i_flags &= ~XFS_IRECLAIMABLE; - __xfs_inode_clear_reclaim_tag(mp, pag, ip); + ip->i_flags |= XFS_IRECLAIM; spin_unlock(&ip->i_flags_lock); read_unlock(&pag->pag_ici_lock); @@ -216,7 +215,15 @@ xfs_iget_cache_hit( trace_xfs_iget_reclaim(ip); goto out_error; } + + write_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM); + ip->i_flags |= XFS_INEW; + __xfs_inode_clear_reclaim_tag(mp, pag, ip); inode->i_state = I_NEW; + spin_unlock(&ip->i_flags_lock); + write_unlock(&pag->pag_ici_lock); } else { /* If the VFS inode is being torn down, pause and try again. */ if (!igrab(inode)) { @@ -252,7 +259,6 @@ xfs_iget_cache_miss( xfs_trans_t *tp, xfs_ino_t ino, struct xfs_inode **ipp, - xfs_daddr_t bno, int flags, int lock_flags) { @@ -265,7 +271,7 @@ xfs_iget_cache_miss( if (!ip) return ENOMEM; - error = xfs_iread(mp, tp, ip, bno, flags); + error = xfs_iread(mp, tp, ip, flags); if (error) goto out_destroy; @@ -351,8 +357,6 @@ out_destroy: * within the file system for the inode being requested. * lock_flags -- flags indicating how to lock the inode. See the comment * for xfs_ilock() for a list of valid values. - * bno -- the block number starting the buffer containing the inode, - * if known (as by bulkstat), else 0. */ int xfs_iget( @@ -361,8 +365,7 @@ xfs_iget( xfs_ino_t ino, uint flags, uint lock_flags, - xfs_inode_t **ipp, - xfs_daddr_t bno) + xfs_inode_t **ipp) { xfs_inode_t *ip; int error; @@ -374,10 +377,7 @@ xfs_iget( return EINVAL; /* get the perag structure and ensure that it's inode capable */ - pag = xfs_get_perag(mp, ino); - if (!pag->pagi_inodeok) - return EINVAL; - ASSERT(pag->pag_ici_init); + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); agino = XFS_INO_TO_AGINO(mp, ino); again: @@ -393,12 +393,12 @@ again: read_unlock(&pag->pag_ici_lock); XFS_STATS_INC(xs_ig_missed); - error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, bno, + error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, flags, lock_flags); if (error) goto out_error_or_again; } - xfs_put_perag(mp, pag); + xfs_perag_put(pag); *ipp = ip; @@ -417,7 +417,7 @@ out_error_or_again: delay(1); goto again; } - xfs_put_perag(mp, pag); + xfs_perag_put(pag); return error; } @@ -488,12 +488,12 @@ xfs_ireclaim( * added to the tree assert that it's been there before to catch * problems with the inode life time early on. */ - pag = xfs_get_perag(mp, ip->i_ino); + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); write_lock(&pag->pag_ici_lock); if (!radix_tree_delete(&pag->pag_ici_root, agino)) ASSERT(0); write_unlock(&pag->pag_ici_lock); - xfs_put_perag(mp, pag); + xfs_perag_put(pag); /* * Here we do an (almost) spurious inode lock in order to coordinate @@ -737,30 +737,24 @@ xfs_ilock_demote( } #ifdef DEBUG -/* - * Debug-only routine, without additional rw_semaphore APIs, we can - * now only answer requests regarding whether we hold the lock for write - * (reader state is outside our visibility, we only track writer state). - * - * Note: this means !xfs_isilocked would give false positives, so don't do that. - */ int xfs_isilocked( xfs_inode_t *ip, uint lock_flags) { - if ((lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) == - XFS_ILOCK_EXCL) { - if (!ip->i_lock.mr_writer) - return 0; + if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) { + if (!(lock_flags & XFS_ILOCK_SHARED)) + return !!ip->i_lock.mr_writer; + return rwsem_is_locked(&ip->i_lock.mr_lock); } - if ((lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) == - XFS_IOLOCK_EXCL) { - if (!ip->i_iolock.mr_writer) - return 0; + if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { + if (!(lock_flags & XFS_IOLOCK_SHARED)) + return !!ip->i_iolock.mr_writer; + return rwsem_is_locked(&ip->i_iolock.mr_lock); } - return 1; + ASSERT(0); + return 0; } #endif diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ef77fd88c8e3..b76a829d7e20 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -151,7 +151,7 @@ xfs_imap_to_bp( "an error %d on %s. Returning error.", error, mp->m_fsname); } else { - ASSERT(buf_flags & XFS_BUF_TRYLOCK); + ASSERT(buf_flags & XBF_TRYLOCK); } return error; } @@ -177,7 +177,7 @@ xfs_imap_to_bp( if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP, XFS_RANDOM_ITOBP_INOTOBP))) { - if (iget_flags & XFS_IGET_BULKSTAT) { + if (iget_flags & XFS_IGET_UNTRUSTED) { xfs_trans_brelse(tp, bp); return XFS_ERROR(EINVAL); } @@ -239,7 +239,7 @@ xfs_inotobp( if (error) return error; - error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, imap_flags); + error = xfs_imap_to_bp(mp, tp, &imap, &bp, XBF_LOCK, imap_flags); if (error) return error; @@ -285,7 +285,7 @@ xfs_itobp( return error; if (!bp) { - ASSERT(buf_flags & XFS_BUF_TRYLOCK); + ASSERT(buf_flags & XBF_TRYLOCK); ASSERT(tp == NULL); *bpp = NULL; return EAGAIN; @@ -787,7 +787,6 @@ xfs_iread( xfs_mount_t *mp, xfs_trans_t *tp, xfs_inode_t *ip, - xfs_daddr_t bno, uint iget_flags) { xfs_buf_t *bp; @@ -797,17 +796,15 @@ xfs_iread( /* * Fill in the location information in the in-core inode. */ - ip->i_imap.im_blkno = bno; error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags); if (error) return error; - ASSERT(bno == 0 || bno == ip->i_imap.im_blkno); /* * Get pointers to the on-disk inode and the buffer containing it. */ error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, - XFS_BUF_LOCK, iget_flags); + XBF_LOCK, iget_flags); if (error) return error; dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); @@ -1751,7 +1748,7 @@ xfs_iunlink( * Here we put the head pointer into our next pointer, * and then we fall through to point the head at us. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK); + error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); if (error) return error; @@ -1833,7 +1830,7 @@ xfs_iunlink_remove( * of dealing with the buffer when there is no need to * change it. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK); + error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); if (error) { cmn_err(CE_WARN, "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", @@ -1895,7 +1892,7 @@ xfs_iunlink_remove( * Now last_ibp points to the buffer previous to us on * the unlinked list. Pull us from the list. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK); + error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); if (error) { cmn_err(CE_WARN, "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", @@ -1940,14 +1937,15 @@ xfs_ifree_cluster( int blks_per_cluster; int nbufs; int ninodes; - int i, j, found, pre_flushed; + int i, j; xfs_daddr_t blkno; xfs_buf_t *bp; - xfs_inode_t *ip, **ip_found; + xfs_inode_t *ip; xfs_inode_log_item_t *iip; xfs_log_item_t *lip; - xfs_perag_t *pag = xfs_get_perag(mp, inum); + struct xfs_perag *pag; + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { blks_per_cluster = 1; ninodes = mp->m_sb.sb_inopblock; @@ -1959,114 +1957,97 @@ xfs_ifree_cluster( nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster; } - ip_found = kmem_alloc(ninodes * sizeof(xfs_inode_t *), KM_NOFS); - for (j = 0; j < nbufs; j++, inum += ninodes) { + int found = 0; + blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), XFS_INO_TO_AGBNO(mp, inum)); + /* + * We obtain and lock the backing buffer first in the process + * here, as we have to ensure that any dirty inode that we + * can't get the flush lock on is attached to the buffer. + * If we scan the in-memory inodes first, then buffer IO can + * complete before we get a lock on it, and hence we may fail + * to mark all the active inodes on the buffer stale. + */ + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, + mp->m_bsize * blks_per_cluster, + XBF_LOCK); /* - * Look for each inode in memory and attempt to lock it, - * we can be racing with flush and tail pushing here. - * any inode we get the locks on, add to an array of - * inode items to process later. + * Walk the inodes already attached to the buffer and mark them + * stale. These will all have the flush locks held, so an + * in-memory inode walk can't lock them. + */ + lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + while (lip) { + if (lip->li_type == XFS_LI_INODE) { + iip = (xfs_inode_log_item_t *)lip; + ASSERT(iip->ili_logged == 1); + lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done; + xfs_trans_ail_copy_lsn(mp->m_ail, + &iip->ili_flush_lsn, + &iip->ili_item.li_lsn); + xfs_iflags_set(iip->ili_inode, XFS_ISTALE); + found++; + } + lip = lip->li_bio_list; + } + + /* + * For each inode in memory attempt to add it to the inode + * buffer and set it up for being staled on buffer IO + * completion. This is safe as we've locked out tail pushing + * and flushing by locking the buffer. * - * The get the buffer lock, we could beat a flush - * or tail pushing thread to the lock here, in which - * case they will go looking for the inode buffer - * and fail, we need some other form of interlock - * here. + * We have already marked every inode that was part of a + * transaction stale above, which means there is no point in + * even trying to lock them. */ - found = 0; for (i = 0; i < ninodes; i++) { read_lock(&pag->pag_ici_lock); ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, (inum + i))); - /* Inode not in memory or we found it already, - * nothing to do - */ + /* Inode not in memory or stale, nothing to do */ if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) { read_unlock(&pag->pag_ici_lock); continue; } - if (xfs_inode_clean(ip)) { - read_unlock(&pag->pag_ici_lock); - continue; - } - - /* If we can get the locks then add it to the - * list, otherwise by the time we get the bp lock - * below it will already be attached to the - * inode buffer. - */ - - /* This inode will already be locked - by us, lets - * keep it that way. - */ - - if (ip == free_ip) { - if (xfs_iflock_nowait(ip)) { - xfs_iflags_set(ip, XFS_ISTALE); - if (xfs_inode_clean(ip)) { - xfs_ifunlock(ip); - } else { - ip_found[found++] = ip; - } - } + /* don't try to lock/unlock the current inode */ + if (ip != free_ip && + !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { read_unlock(&pag->pag_ici_lock); continue; } + read_unlock(&pag->pag_ici_lock); - if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { - if (xfs_iflock_nowait(ip)) { - xfs_iflags_set(ip, XFS_ISTALE); - - if (xfs_inode_clean(ip)) { - xfs_ifunlock(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } else { - ip_found[found++] = ip; - } - } else { + if (!xfs_iflock_nowait(ip)) { + if (ip != free_ip) xfs_iunlock(ip, XFS_ILOCK_EXCL); - } + continue; } - read_unlock(&pag->pag_ici_lock); - } - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, - mp->m_bsize * blks_per_cluster, - XFS_BUF_LOCK); - - pre_flushed = 0; - lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); - while (lip) { - if (lip->li_type == XFS_LI_INODE) { - iip = (xfs_inode_log_item_t *)lip; - ASSERT(iip->ili_logged == 1); - lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done; - xfs_trans_ail_copy_lsn(mp->m_ail, - &iip->ili_flush_lsn, - &iip->ili_item.li_lsn); - xfs_iflags_set(iip->ili_inode, XFS_ISTALE); - pre_flushed++; + xfs_iflags_set(ip, XFS_ISTALE); + if (xfs_inode_clean(ip)) { + ASSERT(ip != free_ip); + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + continue; } - lip = lip->li_bio_list; - } - for (i = 0; i < found; i++) { - ip = ip_found[i]; iip = ip->i_itemp; - if (!iip) { + /* inode with unlogged changes only */ + ASSERT(ip != free_ip); ip->i_update_core = 0; xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); continue; } + found++; iip->ili_last_fields = iip->ili_format.ilf_fields; iip->ili_format.ilf_fields = 0; @@ -2077,18 +2058,17 @@ xfs_ifree_cluster( xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done, (xfs_log_item_t *)iip); - if (ip != free_ip) { + + if (ip != free_ip) xfs_iunlock(ip, XFS_ILOCK_EXCL); - } } - if (found || pre_flushed) + if (found) xfs_trans_stale_inode_buf(tp, bp); xfs_trans_binval(tp, bp); } - kmem_free(ip_found); - xfs_put_perag(mp, pag); + xfs_perag_put(pag); } /* @@ -2150,7 +2130,7 @@ xfs_ifree( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XFS_BUF_LOCK); + error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XBF_LOCK); if (error) return error; @@ -2438,72 +2418,33 @@ xfs_idestroy_fork( } /* - * Increment the pin count of the given buffer. - * This value is protected by ipinlock spinlock in the mount structure. + * This is called to unpin an inode. The caller must have the inode locked + * in at least shared mode so that the buffer cannot be subsequently pinned + * once someone is waiting for it to be unpinned. */ -void -xfs_ipin( - xfs_inode_t *ip) -{ - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - - atomic_inc(&ip->i_pincount); -} - -/* - * Decrement the pin count of the given inode, and wake up - * anyone in xfs_iwait_unpin() if the count goes to 0. The - * inode must have been previously pinned with a call to xfs_ipin(). - */ -void -xfs_iunpin( - xfs_inode_t *ip) -{ - ASSERT(atomic_read(&ip->i_pincount) > 0); - - if (atomic_dec_and_test(&ip->i_pincount)) - wake_up(&ip->i_ipin_wait); -} - -/* - * This is called to unpin an inode. It can be directed to wait or to return - * immediately without waiting for the inode to be unpinned. The caller must - * have the inode locked in at least shared mode so that the buffer cannot be - * subsequently pinned once someone is waiting for it to be unpinned. - */ -STATIC void -__xfs_iunpin_wait( - xfs_inode_t *ip, - int wait) +static void +xfs_iunpin_nowait( + struct xfs_inode *ip) { - xfs_inode_log_item_t *iip = ip->i_itemp; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - if (atomic_read(&ip->i_pincount) == 0) - return; + + trace_xfs_inode_unpin_nowait(ip, _RET_IP_); /* Give the log a push to start the unpinning I/O */ - xfs_log_force(ip->i_mount, (iip && iip->ili_last_lsn) ? - iip->ili_last_lsn : 0, XFS_LOG_FORCE); - if (wait) - wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0)); -} + xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0); -static inline void -xfs_iunpin_wait( - xfs_inode_t *ip) -{ - __xfs_iunpin_wait(ip, 1); } -static inline void -xfs_iunpin_nowait( - xfs_inode_t *ip) +void +xfs_iunpin_wait( + struct xfs_inode *ip) { - __xfs_iunpin_wait(ip, 0); + if (xfs_ipincount(ip)) { + xfs_iunpin_nowait(ip); + wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0)); + } } - /* * xfs_iextents_copy() * @@ -2675,7 +2616,7 @@ xfs_iflush_cluster( xfs_buf_t *bp) { xfs_mount_t *mp = ip->i_mount; - xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); + struct xfs_perag *pag; unsigned long first_index, mask; unsigned long inodes_per_cluster; int ilist_size; @@ -2686,14 +2627,13 @@ xfs_iflush_cluster( int bufwasdelwri; int i; - ASSERT(pag->pagi_inodeok); - ASSERT(pag->pag_ici_init); + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog; ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); if (!ilist) - return 0; + goto out_put; mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; @@ -2762,6 +2702,8 @@ xfs_iflush_cluster( out_free: read_unlock(&pag->pag_ici_lock); kmem_free(ilist); +out_put: + xfs_perag_put(pag); return 0; @@ -2805,6 +2747,7 @@ cluster_corrupt_out: */ xfs_iflush_abort(iq); kmem_free(ilist); + xfs_perag_put(pag); return XFS_ERROR(EFSCORRUPTED); } @@ -2827,8 +2770,6 @@ xfs_iflush( xfs_dinode_t *dip; xfs_mount_t *mp; int error; - int noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK); - enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) }; XFS_STATS_INC(xs_iflush_count); @@ -2841,15 +2782,6 @@ xfs_iflush( mp = ip->i_mount; /* - * If the inode isn't dirty, then just release the inode flush lock and - * do nothing. - */ - if (xfs_inode_clean(ip)) { - xfs_ifunlock(ip); - return 0; - } - - /* * We can't flush the inode until it is unpinned, so wait for it if we * are allowed to block. We know noone new can pin it, because we are * holding the inode lock shared and you need to hold it exclusively to @@ -2860,7 +2792,7 @@ xfs_iflush( * in the same cluster are dirty, they will probably write the inode * out for us if they occur after the log force completes. */ - if (noblock && xfs_ipincount(ip)) { + if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) { xfs_iunpin_nowait(ip); xfs_ifunlock(ip); return EAGAIN; @@ -2894,60 +2826,10 @@ xfs_iflush( } /* - * Decide how buffer will be flushed out. This is done before - * the call to xfs_iflush_int because this field is zeroed by it. - */ - if (iip != NULL && iip->ili_format.ilf_fields != 0) { - /* - * Flush out the inode buffer according to the directions - * of the caller. In the cases where the caller has given - * us a choice choose the non-delwri case. This is because - * the inode is in the AIL and we need to get it out soon. - */ - switch (flags) { - case XFS_IFLUSH_SYNC: - case XFS_IFLUSH_DELWRI_ELSE_SYNC: - flags = 0; - break; - case XFS_IFLUSH_ASYNC_NOBLOCK: - case XFS_IFLUSH_ASYNC: - case XFS_IFLUSH_DELWRI_ELSE_ASYNC: - flags = INT_ASYNC; - break; - case XFS_IFLUSH_DELWRI: - flags = INT_DELWRI; - break; - default: - ASSERT(0); - flags = 0; - break; - } - } else { - switch (flags) { - case XFS_IFLUSH_DELWRI_ELSE_SYNC: - case XFS_IFLUSH_DELWRI_ELSE_ASYNC: - case XFS_IFLUSH_DELWRI: - flags = INT_DELWRI; - break; - case XFS_IFLUSH_ASYNC_NOBLOCK: - case XFS_IFLUSH_ASYNC: - flags = INT_ASYNC; - break; - case XFS_IFLUSH_SYNC: - flags = 0; - break; - default: - ASSERT(0); - flags = 0; - break; - } - } - - /* * Get the buffer containing the on-disk inode. */ error = xfs_itobp(mp, NULL, ip, &dip, &bp, - noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK); + (flags & SYNC_WAIT) ? XBF_LOCK : XBF_TRYLOCK); if (error || !bp) { xfs_ifunlock(ip); return error; @@ -2965,7 +2847,7 @@ xfs_iflush( * get stuck waiting in the write for too long. */ if (XFS_BUF_ISPINNED(bp)) - xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); + xfs_log_force(mp, 0); /* * inode clustering: @@ -2975,13 +2857,10 @@ xfs_iflush( if (error) goto cluster_corrupt_out; - if (flags & INT_DELWRI) { - xfs_bdwrite(mp, bp); - } else if (flags & INT_ASYNC) { - error = xfs_bawrite(mp, bp); - } else { + if (flags & SYNC_WAIT) error = xfs_bwrite(mp, bp); - } + else + xfs_bdwrite(mp, bp); return error; corrupt_out: @@ -3016,16 +2895,6 @@ xfs_iflush_int( iip = ip->i_itemp; mp = ip->i_mount; - - /* - * If the inode isn't dirty, then just release the inode - * flush lock and do nothing. - */ - if (xfs_inode_clean(ip)) { - xfs_ifunlock(ip); - return 0; - } - /* set *dip = inode's place in the buffer */ dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index ec1f28c4fc4f..78550df13cd6 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -420,16 +420,6 @@ static inline void xfs_ifunlock(xfs_inode_t *ip) #define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) /* - * Flags for xfs_iflush() - */ -#define XFS_IFLUSH_DELWRI_ELSE_SYNC 1 -#define XFS_IFLUSH_DELWRI_ELSE_ASYNC 2 -#define XFS_IFLUSH_SYNC 3 -#define XFS_IFLUSH_ASYNC 4 -#define XFS_IFLUSH_DELWRI 5 -#define XFS_IFLUSH_ASYNC_NOBLOCK 6 - -/* * Flags for xfs_itruncate_start(). */ #define XFS_ITRUNC_DEFINITE 0x1 @@ -452,7 +442,7 @@ static inline void xfs_ifunlock(xfs_inode_t *ip) * xfs_iget.c prototypes. */ int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, - uint, uint, xfs_inode_t **, xfs_daddr_t); + uint, uint, xfs_inode_t **); void xfs_iput(xfs_inode_t *, uint); void xfs_iput_new(xfs_inode_t *, uint); void xfs_ilock(xfs_inode_t *, uint); @@ -481,14 +471,14 @@ int xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *, int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); void xfs_iext_realloc(xfs_inode_t *, int, int); -void xfs_ipin(xfs_inode_t *); -void xfs_iunpin(xfs_inode_t *); +void xfs_iunpin_wait(xfs_inode_t *); int xfs_iflush(xfs_inode_t *, uint); void xfs_ichgtime(xfs_inode_t *, int); void xfs_lock_inodes(xfs_inode_t **, int, uint); void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); void xfs_synchronize_times(xfs_inode_t *); +void xfs_mark_inode_dirty(xfs_inode_t *); void xfs_mark_inode_dirty_sync(xfs_inode_t *); #define IHOLD(ip) \ @@ -510,7 +500,7 @@ do { \ * Flags for xfs_iget() */ #define XFS_IGET_CREATE 0x1 -#define XFS_IGET_BULKSTAT 0x2 +#define XFS_IGET_UNTRUSTED 0x2 int xfs_inotobp(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, struct xfs_dinode **, @@ -519,7 +509,7 @@ int xfs_itobp(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, struct xfs_dinode **, struct xfs_buf **, uint); int xfs_iread(struct xfs_mount *, struct xfs_trans *, - struct xfs_inode *, xfs_daddr_t, uint); + struct xfs_inode *, uint); void xfs_dinode_to_disk(struct xfs_dinode *, struct xfs_icdinode *); void xfs_idestroy_fork(struct xfs_inode *, int); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index f38855d21ea5..cf8249a60004 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -228,7 +228,7 @@ xfs_inode_item_format( vecp->i_addr = (xfs_caddr_t)&iip->ili_format; vecp->i_len = sizeof(xfs_inode_log_format_t); - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IFORMAT); + vecp->i_type = XLOG_REG_TYPE_IFORMAT; vecp++; nvecs = 1; @@ -279,7 +279,7 @@ xfs_inode_item_format( vecp->i_addr = (xfs_caddr_t)&ip->i_d; vecp->i_len = sizeof(struct xfs_icdinode); - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ICORE); + vecp->i_type = XLOG_REG_TYPE_ICORE; vecp++; nvecs++; iip->ili_format.ilf_fields |= XFS_ILOG_CORE; @@ -336,7 +336,7 @@ xfs_inode_item_format( vecp->i_addr = (char *)(ip->i_df.if_u1.if_extents); vecp->i_len = ip->i_df.if_bytes; - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IEXT); + vecp->i_type = XLOG_REG_TYPE_IEXT; } else #endif { @@ -355,7 +355,7 @@ xfs_inode_item_format( vecp->i_addr = (xfs_caddr_t)ext_buffer; vecp->i_len = xfs_iextents_copy(ip, ext_buffer, XFS_DATA_FORK); - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IEXT); + vecp->i_type = XLOG_REG_TYPE_IEXT; } ASSERT(vecp->i_len <= ip->i_df.if_bytes); iip->ili_format.ilf_dsize = vecp->i_len; @@ -373,7 +373,7 @@ xfs_inode_item_format( ASSERT(ip->i_df.if_broot != NULL); vecp->i_addr = (xfs_caddr_t)ip->i_df.if_broot; vecp->i_len = ip->i_df.if_broot_bytes; - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IBROOT); + vecp->i_type = XLOG_REG_TYPE_IBROOT; vecp++; nvecs++; iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes; @@ -399,7 +399,7 @@ xfs_inode_item_format( ASSERT((ip->i_df.if_real_bytes == 0) || (ip->i_df.if_real_bytes == data_bytes)); vecp->i_len = (int)data_bytes; - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ILOCAL); + vecp->i_type = XLOG_REG_TYPE_ILOCAL; vecp++; nvecs++; iip->ili_format.ilf_dsize = (unsigned)data_bytes; @@ -477,7 +477,7 @@ xfs_inode_item_format( vecp->i_len = xfs_iextents_copy(ip, ext_buffer, XFS_ATTR_FORK); #endif - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_EXT); + vecp->i_type = XLOG_REG_TYPE_IATTR_EXT; iip->ili_format.ilf_asize = vecp->i_len; vecp++; nvecs++; @@ -492,7 +492,7 @@ xfs_inode_item_format( ASSERT(ip->i_afp->if_broot != NULL); vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_broot; vecp->i_len = ip->i_afp->if_broot_bytes; - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_BROOT); + vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT; vecp++; nvecs++; iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes; @@ -516,7 +516,7 @@ xfs_inode_item_format( ASSERT((ip->i_afp->if_real_bytes == 0) || (ip->i_afp->if_real_bytes == data_bytes)); vecp->i_len = (int)data_bytes; - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_LOCAL); + vecp->i_type = XLOG_REG_TYPE_IATTR_LOCAL; vecp++; nvecs++; iip->ili_format.ilf_asize = (unsigned)data_bytes; @@ -535,31 +535,36 @@ xfs_inode_item_format( /* * This is called to pin the inode associated with the inode log - * item in memory so it cannot be written out. Do this by calling - * xfs_ipin() to bump the pin count in the inode while holding the - * inode pin lock. + * item in memory so it cannot be written out. */ STATIC void xfs_inode_item_pin( xfs_inode_log_item_t *iip) { ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); - xfs_ipin(iip->ili_inode); + + trace_xfs_inode_pin(iip->ili_inode, _RET_IP_); + atomic_inc(&iip->ili_inode->i_pincount); } /* * This is called to unpin the inode associated with the inode log * item which was previously pinned with a call to xfs_inode_item_pin(). - * Just call xfs_iunpin() on the inode to do this. + * + * Also wake up anyone in xfs_iunpin_wait() if the count goes to 0. */ /* ARGSUSED */ STATIC void xfs_inode_item_unpin( - xfs_inode_log_item_t *iip, - int stale) + xfs_inode_log_item_t *iip) { - xfs_iunpin(iip->ili_inode); + struct xfs_inode *ip = iip->ili_inode; + + trace_xfs_inode_unpin(ip, _RET_IP_); + ASSERT(atomic_read(&ip->i_pincount) > 0); + if (atomic_dec_and_test(&ip->i_pincount)) + wake_up(&ip->i_ipin_wait); } /* ARGSUSED */ @@ -568,7 +573,7 @@ xfs_inode_item_unpin_remove( xfs_inode_log_item_t *iip, xfs_trans_t *tp) { - xfs_iunpin(iip->ili_inode); + xfs_inode_item_unpin(iip); } /* @@ -602,33 +607,20 @@ xfs_inode_item_trylock( if (!xfs_iflock_nowait(ip)) { /* - * If someone else isn't already trying to push the inode - * buffer, we get to do it. + * inode has already been flushed to the backing buffer, + * leave it locked in shared mode, pushbuf routine will + * unlock it. */ - if (iip->ili_pushbuf_flag == 0) { - iip->ili_pushbuf_flag = 1; -#ifdef DEBUG - iip->ili_push_owner = current_pid(); -#endif - /* - * Inode is left locked in shared mode. - * Pushbuf routine gets to unlock it. - */ - return XFS_ITEM_PUSHBUF; - } else { - /* - * We hold the AIL lock, so we must specify the - * NONOTIFY flag so that we won't double trip. - */ - xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY); - return XFS_ITEM_FLUSHING; - } - /* NOTREACHED */ + return XFS_ITEM_PUSHBUF; } /* Stale items should force out the iclog */ if (ip->i_flags & XFS_ISTALE) { xfs_ifunlock(ip); + /* + * we hold the AIL lock - notify the unlock routine of this + * so it doesn't try to get the lock again. + */ xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY); return XFS_ITEM_PINNED; } @@ -746,11 +738,8 @@ xfs_inode_item_committed( * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK * failed to get the inode flush lock but did get the inode locked SHARED. * Here we're trying to see if the inode buffer is incore, and if so whether it's - * marked delayed write. If that's the case, we'll initiate a bawrite on that - * buffer to expedite the process. - * - * We aren't holding the AIL lock (or the flush lock) when this gets called, - * so it is inherently race-y. + * marked delayed write. If that's the case, we'll promote it and that will + * allow the caller to write the buffer by triggering the xfsbufd to run. */ STATIC void xfs_inode_item_pushbuf( @@ -759,82 +748,30 @@ xfs_inode_item_pushbuf( xfs_inode_t *ip; xfs_mount_t *mp; xfs_buf_t *bp; - uint dopush; ip = iip->ili_inode; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); /* - * The ili_pushbuf_flag keeps others from - * trying to duplicate our effort. - */ - ASSERT(iip->ili_pushbuf_flag != 0); - ASSERT(iip->ili_push_owner == current_pid()); - - /* * If a flush is not in progress anymore, chances are that the * inode was taken off the AIL. So, just get out. */ if (completion_done(&ip->i_flush) || ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) { - iip->ili_pushbuf_flag = 0; xfs_iunlock(ip, XFS_ILOCK_SHARED); return; } mp = ip->i_mount; bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno, - iip->ili_format.ilf_len, XFS_INCORE_TRYLOCK); - - if (bp != NULL) { - if (XFS_BUF_ISDELAYWRITE(bp)) { - /* - * We were racing with iflush because we don't hold - * the AIL lock or the flush lock. However, at this point, - * we have the buffer, and we know that it's dirty. - * So, it's possible that iflush raced with us, and - * this item is already taken off the AIL. - * If not, we can flush it async. - */ - dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) && - !completion_done(&ip->i_flush)); - iip->ili_pushbuf_flag = 0; - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - trace_xfs_inode_item_push(bp, _RET_IP_); + iip->ili_format.ilf_len, XBF_TRYLOCK); - if (XFS_BUF_ISPINNED(bp)) { - xfs_log_force(mp, (xfs_lsn_t)0, - XFS_LOG_FORCE); - } - if (dopush) { - int error; - error = xfs_bawrite(mp, bp); - if (error) - xfs_fs_cmn_err(CE_WARN, mp, - "xfs_inode_item_pushbuf: pushbuf error %d on iip %p, bp %p", - error, iip, bp); - } else { - xfs_buf_relse(bp); - } - } else { - iip->ili_pushbuf_flag = 0; - xfs_iunlock(ip, XFS_ILOCK_SHARED); - xfs_buf_relse(bp); - } - return; - } - /* - * We have to be careful about resetting pushbuf flag too early (above). - * Even though in theory we can do it as soon as we have the buflock, - * we don't want others to be doing work needlessly. They'll come to - * this function thinking that pushing the buffer is their - * responsibility only to find that the buffer is still locked by - * another doing the same thing - */ - iip->ili_pushbuf_flag = 0; xfs_iunlock(ip, XFS_ILOCK_SHARED); + if (!bp) + return; + if (XFS_BUF_ISDELAYWRITE(bp)) + xfs_buf_delwri_promote(bp); + xfs_buf_relse(bp); return; } @@ -867,10 +804,14 @@ xfs_inode_item_push( iip->ili_format.ilf_fields != 0); /* - * Write out the inode. The completion routine ('iflush_done') will - * pull it from the AIL, mark it clean, unlock the flush lock. + * Push the inode to it's backing buffer. This will not remove the + * inode from the AIL - a further push will be required to trigger a + * buffer push. However, this allows all the dirty inodes to be pushed + * to the buffer before it is pushed to disk. THe buffer IO completion + * will pull th einode from the AIL, mark it clean and unlock the flush + * lock. */ - (void) xfs_iflush(ip, XFS_IFLUSH_ASYNC); + (void) xfs_iflush(ip, 0); xfs_iunlock(ip, XFS_ILOCK_SHARED); return; @@ -898,7 +839,7 @@ static struct xfs_item_ops xfs_inode_item_ops = { .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_inode_item_format, .iop_pin = (void(*)(xfs_log_item_t*))xfs_inode_item_pin, - .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_inode_item_unpin, + .iop_unpin = (void(*)(xfs_log_item_t*))xfs_inode_item_unpin, .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) xfs_inode_item_unpin_remove, .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock, @@ -925,18 +866,9 @@ xfs_inode_item_init( ASSERT(ip->i_itemp == NULL); iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP); - iip->ili_item.li_type = XFS_LI_INODE; - iip->ili_item.li_ops = &xfs_inode_item_ops; - iip->ili_item.li_mountp = mp; - iip->ili_item.li_ailp = mp->m_ail; iip->ili_inode = ip; - - /* - We have zeroed memory. No need ... - iip->ili_extents_buf = NULL; - iip->ili_pushbuf_flag = 0; - */ - + xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE, + &xfs_inode_item_ops); iip->ili_format.ilf_type = XFS_LI_INODE; iip->ili_format.ilf_ino = ip->i_ino; iip->ili_format.ilf_blkno = ip->i_imap.im_blkno; diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index cc8df1ac7783..9a467958ecdd 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -144,12 +144,6 @@ typedef struct xfs_inode_log_item { data exts */ struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged attr exts */ - unsigned int ili_pushbuf_flag; /* one bit used in push_ail */ - -#ifdef DEBUG - uint64_t ili_push_owner; /* one who sets pushbuf_flag - above gets to push the buf */ -#endif #ifdef XFS_TRANS_DEBUG int ili_root_size; char *ili_orig_root; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 0b65039951a0..ef14943829da 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -55,71 +55,33 @@ #define XFS_STRAT_WRITE_IMAPS 2 #define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP -STATIC int -xfs_imap_to_bmap( - xfs_inode_t *ip, - xfs_off_t offset, - xfs_bmbt_irec_t *imap, - xfs_iomap_t *iomapp, - int imaps, /* Number of imap entries */ - int iomaps, /* Number of iomap entries */ - int flags) -{ - xfs_mount_t *mp = ip->i_mount; - int pbm; - xfs_fsblock_t start_block; - - - for (pbm = 0; imaps && pbm < iomaps; imaps--, iomapp++, imap++, pbm++) { - iomapp->iomap_offset = XFS_FSB_TO_B(mp, imap->br_startoff); - iomapp->iomap_delta = offset - iomapp->iomap_offset; - iomapp->iomap_bsize = XFS_FSB_TO_B(mp, imap->br_blockcount); - iomapp->iomap_flags = flags; - - if (XFS_IS_REALTIME_INODE(ip)) { - iomapp->iomap_flags |= IOMAP_REALTIME; - iomapp->iomap_target = mp->m_rtdev_targp; - } else { - iomapp->iomap_target = mp->m_ddev_targp; - } - start_block = imap->br_startblock; - if (start_block == HOLESTARTBLOCK) { - iomapp->iomap_bn = IOMAP_DADDR_NULL; - iomapp->iomap_flags |= IOMAP_HOLE; - } else if (start_block == DELAYSTARTBLOCK) { - iomapp->iomap_bn = IOMAP_DADDR_NULL; - iomapp->iomap_flags |= IOMAP_DELAY; - } else { - iomapp->iomap_bn = xfs_fsb_to_db(ip, start_block); - if (ISUNWRITTEN(imap)) - iomapp->iomap_flags |= IOMAP_UNWRITTEN; - } - - offset += iomapp->iomap_bsize - iomapp->iomap_delta; - } - return pbm; /* Return the number filled */ -} +STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, + int, struct xfs_bmbt_irec *, int *); +STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int, + struct xfs_bmbt_irec *, int *); +STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t, + struct xfs_bmbt_irec *, int *); int xfs_iomap( - xfs_inode_t *ip, - xfs_off_t offset, - ssize_t count, - int flags, - xfs_iomap_t *iomapp, - int *niomaps) + struct xfs_inode *ip, + xfs_off_t offset, + ssize_t count, + int flags, + struct xfs_bmbt_irec *imap, + int *nimaps, + int *new) { - xfs_mount_t *mp = ip->i_mount; - xfs_fileoff_t offset_fsb, end_fsb; - int error = 0; - int lockmode = 0; - xfs_bmbt_irec_t imap; - int nimaps = 1; - int bmapi_flags = 0; - int iomap_flags = 0; + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb, end_fsb; + int error = 0; + int lockmode = 0; + int bmapi_flags = 0; ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); + *new = 0; + if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); @@ -160,8 +122,8 @@ xfs_iomap( error = xfs_bmapi(NULL, ip, offset_fsb, (xfs_filblks_t)(end_fsb - offset_fsb), - bmapi_flags, NULL, 0, &imap, - &nimaps, NULL, NULL); + bmapi_flags, NULL, 0, imap, + nimaps, NULL, NULL); if (error) goto out; @@ -169,46 +131,41 @@ xfs_iomap( switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) { case BMAPI_WRITE: /* If we found an extent, return it */ - if (nimaps && - (imap.br_startblock != HOLESTARTBLOCK) && - (imap.br_startblock != DELAYSTARTBLOCK)) { - trace_xfs_iomap_found(ip, offset, count, flags, &imap); + if (*nimaps && + (imap->br_startblock != HOLESTARTBLOCK) && + (imap->br_startblock != DELAYSTARTBLOCK)) { + trace_xfs_iomap_found(ip, offset, count, flags, imap); break; } if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) { error = xfs_iomap_write_direct(ip, offset, count, flags, - &imap, &nimaps, nimaps); + imap, nimaps); } else { error = xfs_iomap_write_delay(ip, offset, count, flags, - &imap, &nimaps); + imap, nimaps); } if (!error) { - trace_xfs_iomap_alloc(ip, offset, count, flags, &imap); + trace_xfs_iomap_alloc(ip, offset, count, flags, imap); } - iomap_flags = IOMAP_NEW; + *new = 1; break; case BMAPI_ALLOCATE: /* If we found an extent, return it */ xfs_iunlock(ip, lockmode); lockmode = 0; - if (nimaps && !isnullstartblock(imap.br_startblock)) { - trace_xfs_iomap_found(ip, offset, count, flags, &imap); + if (*nimaps && !isnullstartblock(imap->br_startblock)) { + trace_xfs_iomap_found(ip, offset, count, flags, imap); break; } error = xfs_iomap_write_allocate(ip, offset, count, - &imap, &nimaps); + imap, nimaps); break; } - if (nimaps) { - *niomaps = xfs_imap_to_bmap(ip, offset, &imap, - iomapp, nimaps, *niomaps, iomap_flags); - } else if (niomaps) { - *niomaps = 0; - } + ASSERT(*nimaps <= 1); out: if (lockmode) @@ -216,7 +173,6 @@ out: return XFS_ERROR(error); } - STATIC int xfs_iomap_eof_align_last_fsb( xfs_mount_t *mp, @@ -285,15 +241,14 @@ xfs_cmn_err_fsblock_zero( return EFSCORRUPTED; } -int +STATIC int xfs_iomap_write_direct( xfs_inode_t *ip, xfs_off_t offset, size_t count, int flags, xfs_bmbt_irec_t *ret_imap, - int *nmaps, - int found) + int *nmaps) { xfs_mount_t *mp = ip->i_mount; xfs_fileoff_t offset_fsb; @@ -330,7 +285,7 @@ xfs_iomap_write_direct( if (error) goto error_out; } else { - if (found && (ret_imap->br_startblock == HOLESTARTBLOCK)) + if (*nmaps && (ret_imap->br_startblock == HOLESTARTBLOCK)) last_fsb = MIN(last_fsb, (xfs_fileoff_t) ret_imap->br_blockcount + ret_imap->br_startoff); @@ -485,7 +440,7 @@ xfs_iomap_eof_want_preallocate( return 0; } -int +STATIC int xfs_iomap_write_delay( xfs_inode_t *ip, xfs_off_t offset, @@ -588,7 +543,7 @@ retry: * We no longer bother to look at the incoming map - all we have to * guarantee is that whatever we allocate fills the required range. */ -int +STATIC int xfs_iomap_write_allocate( xfs_inode_t *ip, xfs_off_t offset, diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 174f29990991..81ac4afd45b3 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -18,19 +18,6 @@ #ifndef __XFS_IOMAP_H__ #define __XFS_IOMAP_H__ -#define IOMAP_DADDR_NULL ((xfs_daddr_t) (-1LL)) - - -typedef enum { /* iomap_flags values */ - IOMAP_READ = 0, /* mapping for a read */ - IOMAP_HOLE = 0x02, /* mapping covers a hole */ - IOMAP_DELAY = 0x04, /* mapping covers delalloc region */ - IOMAP_REALTIME = 0x10, /* mapping on the realtime device */ - IOMAP_UNWRITTEN = 0x20, /* mapping covers allocated */ - /* but uninitialized file data */ - IOMAP_NEW = 0x40 /* just allocate */ -} iomap_flags_t; - typedef enum { /* base extent manipulation calls */ BMAPI_READ = (1 << 0), /* read extents */ @@ -52,43 +39,11 @@ typedef enum { { BMAPI_MMAP, "MMAP" }, \ { BMAPI_TRYLOCK, "TRYLOCK" } -/* - * xfs_iomap_t: File system I/O map - * - * The iomap_bn field is expressed in 512-byte blocks, and is where the - * mapping starts on disk. - * - * The iomap_offset, iomap_bsize and iomap_delta fields are in bytes. - * iomap_offset is the offset of the mapping in the file itself. - * iomap_bsize is the size of the mapping, iomap_delta is the - * desired data's offset into the mapping, given the offset supplied - * to the file I/O map routine. - * - * When a request is made to read beyond the logical end of the object, - * iomap_size may be set to 0, but iomap_offset and iomap_length should be set - * to the actual amount of underlying storage that has been allocated, if any. - */ - -typedef struct xfs_iomap { - xfs_daddr_t iomap_bn; /* first 512B blk of mapping */ - xfs_buftarg_t *iomap_target; - xfs_off_t iomap_offset; /* offset of mapping, bytes */ - xfs_off_t iomap_bsize; /* size of mapping, bytes */ - xfs_off_t iomap_delta; /* offset into mapping, bytes */ - iomap_flags_t iomap_flags; -} xfs_iomap_t; - struct xfs_inode; struct xfs_bmbt_irec; extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int, - struct xfs_iomap *, int *); -extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, - int, struct xfs_bmbt_irec *, int *, int); -extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int, - struct xfs_bmbt_irec *, int *); -extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t, - struct xfs_bmbt_irec *, int *); + struct xfs_bmbt_irec *, int *, int *); extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); #endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 62efab2f3839..2b86f8610512 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -49,24 +49,40 @@ xfs_internal_inum( (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino))); } -STATIC int -xfs_bulkstat_one_iget( - xfs_mount_t *mp, /* mount point for filesystem */ - xfs_ino_t ino, /* inode number to get data for */ - xfs_daddr_t bno, /* starting bno of inode cluster */ - xfs_bstat_t *buf, /* return buffer */ - int *stat) /* BULKSTAT_RV_... */ +/* + * Return stat information for one inode. + * Return 0 if ok, else errno. + */ +int +xfs_bulkstat_one_int( + struct xfs_mount *mp, /* mount point for filesystem */ + xfs_ino_t ino, /* inode to get data for */ + void __user *buffer, /* buffer to place output in */ + int ubsize, /* size of buffer */ + bulkstat_one_fmt_pf formatter, /* formatter, copy to user */ + int *ubused, /* bytes used by me */ + int *stat) /* BULKSTAT_RV_... */ { - xfs_icdinode_t *dic; /* dinode core info pointer */ - xfs_inode_t *ip; /* incore inode pointer */ - struct inode *inode; - int error; + struct xfs_icdinode *dic; /* dinode core info pointer */ + struct xfs_inode *ip; /* incore inode pointer */ + struct inode *inode; + struct xfs_bstat *buf; /* return buffer */ + int error = 0; /* error value */ + + *stat = BULKSTAT_RV_NOTHING; + + if (!buffer || xfs_internal_inum(mp, ino)) + return XFS_ERROR(EINVAL); + + buf = kmem_alloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL); + if (!buf) + return XFS_ERROR(ENOMEM); error = xfs_iget(mp, NULL, ino, - XFS_IGET_BULKSTAT, XFS_ILOCK_SHARED, &ip, bno); + XFS_IGET_UNTRUSTED, XFS_ILOCK_SHARED, &ip); if (error) { *stat = BULKSTAT_RV_NOTHING; - return error; + goto out_free; } ASSERT(ip != NULL); @@ -106,6 +122,7 @@ xfs_bulkstat_one_iget( buf->bs_dmevmask = dic->di_dmevmask; buf->bs_dmstate = dic->di_dmstate; buf->bs_aextents = dic->di_anextents; + buf->bs_forkoff = XFS_IFORK_BOFF(ip); switch (dic->di_format) { case XFS_DINODE_FMT_DEV: @@ -126,76 +143,16 @@ xfs_bulkstat_one_iget( buf->bs_blocks = dic->di_nblocks + ip->i_delayed_blks; break; } - xfs_iput(ip, XFS_ILOCK_SHARED); - return error; -} -STATIC void -xfs_bulkstat_one_dinode( - xfs_mount_t *mp, /* mount point for filesystem */ - xfs_ino_t ino, /* inode number to get data for */ - xfs_dinode_t *dic, /* dinode inode pointer */ - xfs_bstat_t *buf) /* return buffer */ -{ - /* - * The inode format changed when we moved the link count and - * made it 32 bits long. If this is an old format inode, - * convert it in memory to look like a new one. If it gets - * flushed to disk we will convert back before flushing or - * logging it. We zero out the new projid field and the old link - * count field. We'll handle clearing the pad field (the remains - * of the old uuid field) when we actually convert the inode to - * the new format. We don't change the version number so that we - * can distinguish this from a real new format inode. - */ - if (dic->di_version == 1) { - buf->bs_nlink = be16_to_cpu(dic->di_onlink); - buf->bs_projid = 0; - } else { - buf->bs_nlink = be32_to_cpu(dic->di_nlink); - buf->bs_projid = be16_to_cpu(dic->di_projid); - } + error = formatter(buffer, ubsize, ubused, buf); - buf->bs_ino = ino; - buf->bs_mode = be16_to_cpu(dic->di_mode); - buf->bs_uid = be32_to_cpu(dic->di_uid); - buf->bs_gid = be32_to_cpu(dic->di_gid); - buf->bs_size = be64_to_cpu(dic->di_size); - buf->bs_atime.tv_sec = be32_to_cpu(dic->di_atime.t_sec); - buf->bs_atime.tv_nsec = be32_to_cpu(dic->di_atime.t_nsec); - buf->bs_mtime.tv_sec = be32_to_cpu(dic->di_mtime.t_sec); - buf->bs_mtime.tv_nsec = be32_to_cpu(dic->di_mtime.t_nsec); - buf->bs_ctime.tv_sec = be32_to_cpu(dic->di_ctime.t_sec); - buf->bs_ctime.tv_nsec = be32_to_cpu(dic->di_ctime.t_nsec); - buf->bs_xflags = xfs_dic2xflags(dic); - buf->bs_extsize = be32_to_cpu(dic->di_extsize) << mp->m_sb.sb_blocklog; - buf->bs_extents = be32_to_cpu(dic->di_nextents); - buf->bs_gen = be32_to_cpu(dic->di_gen); - memset(buf->bs_pad, 0, sizeof(buf->bs_pad)); - buf->bs_dmevmask = be32_to_cpu(dic->di_dmevmask); - buf->bs_dmstate = be16_to_cpu(dic->di_dmstate); - buf->bs_aextents = be16_to_cpu(dic->di_anextents); + if (!error) + *stat = BULKSTAT_RV_DIDONE; - switch (dic->di_format) { - case XFS_DINODE_FMT_DEV: - buf->bs_rdev = xfs_dinode_get_rdev(dic); - buf->bs_blksize = BLKDEV_IOSIZE; - buf->bs_blocks = 0; - break; - case XFS_DINODE_FMT_LOCAL: - case XFS_DINODE_FMT_UUID: - buf->bs_rdev = 0; - buf->bs_blksize = mp->m_sb.sb_blocksize; - buf->bs_blocks = 0; - break; - case XFS_DINODE_FMT_EXTENTS: - case XFS_DINODE_FMT_BTREE: - buf->bs_rdev = 0; - buf->bs_blksize = mp->m_sb.sb_blocksize; - buf->bs_blocks = be64_to_cpu(dic->di_nblocks); - break; - } + out_free: + kmem_free(buf); + return error; } /* Return 0 on success or positive error */ @@ -215,118 +172,17 @@ xfs_bulkstat_one_fmt( return 0; } -/* - * Return stat information for one inode. - * Return 0 if ok, else errno. - */ -int /* error status */ -xfs_bulkstat_one_int( - xfs_mount_t *mp, /* mount point for filesystem */ - xfs_ino_t ino, /* inode number to get data for */ - void __user *buffer, /* buffer to place output in */ - int ubsize, /* size of buffer */ - bulkstat_one_fmt_pf formatter, /* formatter, copy to user */ - xfs_daddr_t bno, /* starting bno of inode cluster */ - int *ubused, /* bytes used by me */ - void *dibuff, /* on-disk inode buffer */ - int *stat) /* BULKSTAT_RV_... */ -{ - xfs_bstat_t *buf; /* return buffer */ - int error = 0; /* error value */ - xfs_dinode_t *dip; /* dinode inode pointer */ - - dip = (xfs_dinode_t *)dibuff; - *stat = BULKSTAT_RV_NOTHING; - - if (!buffer || xfs_internal_inum(mp, ino)) - return XFS_ERROR(EINVAL); - - buf = kmem_alloc(sizeof(*buf), KM_SLEEP); - - if (dip == NULL) { - /* We're not being passed a pointer to a dinode. This happens - * if BULKSTAT_FG_IGET is selected. Do the iget. - */ - error = xfs_bulkstat_one_iget(mp, ino, bno, buf, stat); - if (error) - goto out_free; - } else { - xfs_bulkstat_one_dinode(mp, ino, dip, buf); - } - - error = formatter(buffer, ubsize, ubused, buf); - if (error) - goto out_free; - - *stat = BULKSTAT_RV_DIDONE; - - out_free: - kmem_free(buf); - return error; -} - int xfs_bulkstat_one( xfs_mount_t *mp, /* mount point for filesystem */ xfs_ino_t ino, /* inode number to get data for */ void __user *buffer, /* buffer to place output in */ int ubsize, /* size of buffer */ - void *private_data, /* my private data */ - xfs_daddr_t bno, /* starting bno of inode cluster */ int *ubused, /* bytes used by me */ - void *dibuff, /* on-disk inode buffer */ int *stat) /* BULKSTAT_RV_... */ { return xfs_bulkstat_one_int(mp, ino, buffer, ubsize, - xfs_bulkstat_one_fmt, bno, - ubused, dibuff, stat); -} - -/* - * Test to see whether we can use the ondisk inode directly, based - * on the given bulkstat flags, filling in dipp accordingly. - * Returns zero if the inode is dodgey. - */ -STATIC int -xfs_bulkstat_use_dinode( - xfs_mount_t *mp, - int flags, - xfs_buf_t *bp, - int clustidx, - xfs_dinode_t **dipp) -{ - xfs_dinode_t *dip; - unsigned int aformat; - - *dipp = NULL; - if (!bp || (flags & BULKSTAT_FG_IGET)) - return 1; - dip = (xfs_dinode_t *) - xfs_buf_offset(bp, clustidx << mp->m_sb.sb_inodelog); - /* - * Check the buffer containing the on-disk inode for di_mode == 0. - * This is to prevent xfs_bulkstat from picking up just reclaimed - * inodes that have their in-core state initialized but not flushed - * to disk yet. This is a temporary hack that would require a proper - * fix in the future. - */ - if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC || - !XFS_DINODE_GOOD_VERSION(dip->di_version) || - !dip->di_mode) - return 0; - if (flags & BULKSTAT_FG_QUICK) { - *dipp = dip; - return 1; - } - /* BULKSTAT_FG_INLINE: if attr fork is local, or not there, use it */ - aformat = dip->di_aformat; - if ((XFS_DFORK_Q(dip) == 0) || - (aformat == XFS_DINODE_FMT_LOCAL) || - (aformat == XFS_DINODE_FMT_EXTENTS && !dip->di_anextents)) { - *dipp = dip; - return 1; - } - return 1; + xfs_bulkstat_one_fmt, ubused, stat); } #define XFS_BULKSTAT_UBLEFT(ubleft) ((ubleft) >= statstruct_size) @@ -340,10 +196,8 @@ xfs_bulkstat( xfs_ino_t *lastinop, /* last inode returned */ int *ubcountp, /* size of buffer/count returned */ bulkstat_one_pf formatter, /* func that'd fill a single buf */ - void *private_data,/* private data for formatter */ size_t statstruct_size, /* sizeof struct filling */ char __user *ubuffer, /* buffer with inode stats */ - int flags, /* defined in xfs_itable.h */ int *done) /* 1 if there are more stats to get */ { xfs_agblock_t agbno=0;/* allocation group block number */ @@ -378,14 +232,12 @@ xfs_bulkstat( int ubelem; /* spaces used in user's buffer */ int ubused; /* bytes used by formatter */ xfs_buf_t *bp; /* ptr to on-disk inode cluster buf */ - xfs_dinode_t *dip; /* ptr into bp for specific inode */ /* * Get the last inode value, see if there's nothing to do. */ ino = (xfs_ino_t)*lastinop; lastino = ino; - dip = NULL; agno = XFS_INO_TO_AGNO(mp, ino); agino = XFS_INO_TO_AGINO(mp, ino); if (agno >= mp->m_sb.sb_agcount || @@ -408,8 +260,10 @@ xfs_bulkstat( (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog); nimask = ~(nicluster - 1); nbcluster = nicluster >> mp->m_sb.sb_inopblog; - irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4, - KM_SLEEP | KM_MAYFAIL | KM_LARGE); + irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4); + if (!irbuf) + return ENOMEM; + nirbuf = irbsize / sizeof(*irbuf); /* @@ -420,9 +274,7 @@ xfs_bulkstat( while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) { cond_resched(); bp = NULL; - down_read(&mp->m_peraglock); error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); - up_read(&mp->m_peraglock); if (error) { /* * Skip this allocation group and go to the next one. @@ -610,37 +462,6 @@ xfs_bulkstat( irbp->ir_startino) + ((chunkidx & nimask) >> mp->m_sb.sb_inopblog); - - if (flags & (BULKSTAT_FG_QUICK | - BULKSTAT_FG_INLINE)) { - int offset; - - ino = XFS_AGINO_TO_INO(mp, agno, - agino); - bno = XFS_AGB_TO_DADDR(mp, agno, - agbno); - - /* - * Get the inode cluster buffer - */ - if (bp) - xfs_buf_relse(bp); - - error = xfs_inotobp(mp, NULL, ino, &dip, - &bp, &offset, - XFS_IGET_BULKSTAT); - - if (!error) - clustidx = offset / mp->m_sb.sb_inodesize; - if (XFS_TEST_ERROR(error != 0, - mp, XFS_ERRTAG_BULKSTAT_READ_CHUNK, - XFS_RANDOM_BULKSTAT_READ_CHUNK)) { - bp = NULL; - ubleft = 0; - rval = error; - break; - } - } } ino = XFS_AGINO_TO_INO(mp, agno, agino); bno = XFS_AGB_TO_DADDR(mp, agno, agbno); @@ -656,35 +477,13 @@ xfs_bulkstat( * when the chunk is used up. */ irbp->ir_freecount++; - if (!xfs_bulkstat_use_dinode(mp, flags, bp, - clustidx, &dip)) { - lastino = ino; - continue; - } - /* - * If we need to do an iget, cannot hold bp. - * Drop it, until starting the next cluster. - */ - if ((flags & BULKSTAT_FG_INLINE) && !dip) { - if (bp) - xfs_buf_relse(bp); - bp = NULL; - } /* * Get the inode and fill in a single buffer. - * BULKSTAT_FG_QUICK uses dip to fill it in. - * BULKSTAT_FG_IGET uses igets. - * BULKSTAT_FG_INLINE uses dip if we have an - * inline attr fork, else igets. - * See: xfs_bulkstat_one & xfs_dm_bulkstat_one. - * This is also used to count inodes/blks, etc - * in xfs_qm_quotacheck. */ ubused = statstruct_size; - error = formatter(mp, ino, ubufp, - ubleft, private_data, - bno, &ubused, dip, &fmterror); + error = formatter(mp, ino, ubufp, ubleft, + &ubused, &fmterror); if (fmterror == BULKSTAT_RV_NOTHING) { if (error && error != ENOENT && error != EINVAL) { @@ -729,7 +528,7 @@ xfs_bulkstat( /* * Done, we're either out of filesystem or space to put the data. */ - kmem_free(irbuf); + kmem_free_large(irbuf); *ubcountp = ubelem; /* * Found some inodes, return them now and return the error next time. @@ -776,8 +575,7 @@ xfs_bulkstat_single( */ ino = (xfs_ino_t)*lastinop; - error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t), - NULL, 0, NULL, NULL, &res); + error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t), 0, &res); if (error) { /* * Special case way failed, do it the "long" way @@ -786,8 +584,7 @@ xfs_bulkstat_single( (*lastinop)--; count = 1; if (xfs_bulkstat(mp, lastinop, &count, xfs_bulkstat_one, - NULL, sizeof(xfs_bstat_t), buffer, - BULKSTAT_FG_IGET, done)) + sizeof(xfs_bstat_t), buffer, done)) return error; if (count == 0 || (xfs_ino_t)*lastinop != ino) return error == EFSCORRUPTED ? @@ -849,9 +646,7 @@ xfs_inumbers( agbp = NULL; while (left > 0 && agno < mp->m_sb.sb_agcount) { if (agbp == NULL) { - down_read(&mp->m_peraglock); error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); - up_read(&mp->m_peraglock); if (error) { /* * If we can't read the AGI of this ag, diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index 20792bf45946..97295d91d170 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -27,10 +27,7 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount *mp, xfs_ino_t ino, void __user *buffer, int ubsize, - void *private_data, - xfs_daddr_t bno, int *ubused, - void *dip, int *stat); /* @@ -41,13 +38,6 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount *mp, #define BULKSTAT_RV_GIVEUP 2 /* - * Values for bulkstat flag argument. - */ -#define BULKSTAT_FG_IGET 0x1 /* Go through the buffer cache */ -#define BULKSTAT_FG_QUICK 0x2 /* No iget, walk the dinode cluster */ -#define BULKSTAT_FG_INLINE 0x4 /* No iget if inline attrs */ - -/* * Return stat information in bulk (by-inode) for the filesystem. */ int /* error status */ @@ -56,10 +46,8 @@ xfs_bulkstat( xfs_ino_t *lastino, /* last inode returned */ int *count, /* size of buffer/count returned */ bulkstat_one_pf formatter, /* func that'd fill a single buf */ - void *private_data, /* private data for formatter */ size_t statstruct_size,/* sizeof struct that we're filling */ char __user *ubuffer,/* buffer with inode stats */ - int flags, /* flag to control access method */ int *done); /* 1 if there are more stats to get */ int @@ -82,9 +70,7 @@ xfs_bulkstat_one_int( void __user *buffer, int ubsize, bulkstat_one_fmt_pf formatter, - xfs_daddr_t bno, int *ubused, - void *dibuff, int *stat); int @@ -93,10 +79,7 @@ xfs_bulkstat_one( xfs_ino_t ino, void __user *buffer, int ubsize, - void *private_data, - xfs_daddr_t bno, int *ubused, - void *dibuff, int *stat); typedef int (*inumbers_fmt_pf)( diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 600b5b06aaeb..5215abc8023a 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -44,14 +44,8 @@ kmem_zone_t *xfs_log_ticket_zone; -#define xlog_write_adv_cnt(ptr, len, off, bytes) \ - { (ptr) += (bytes); \ - (len) -= (bytes); \ - (off) += (bytes);} - /* Local miscellaneous function prototypes */ -STATIC int xlog_bdstrat_cb(struct xfs_buf *); -STATIC int xlog_commit_record(xfs_mount_t *mp, xlog_ticket_t *ticket, +STATIC int xlog_commit_record(struct log *log, struct xlog_ticket *ticket, xlog_in_core_t **, xfs_lsn_t *); STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, xfs_buftarg_t *log_target, @@ -60,11 +54,6 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); STATIC void xlog_dealloc_log(xlog_t *log); -STATIC int xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[], - int nentries, xfs_log_ticket_t tic, - xfs_lsn_t *start_lsn, - xlog_in_core_t **commit_iclog, - uint flags); /* local state machine functions */ STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int); @@ -80,11 +69,6 @@ STATIC int xlog_state_release_iclog(xlog_t *log, STATIC void xlog_state_switch_iclogs(xlog_t *log, xlog_in_core_t *iclog, int eventual_size); -STATIC int xlog_state_sync(xlog_t *log, - xfs_lsn_t lsn, - uint flags, - int *log_flushed); -STATIC int xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed); STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog); /* local functions to manipulate grant head */ @@ -99,16 +83,8 @@ STATIC int xlog_regrant_write_log_space(xlog_t *log, STATIC void xlog_ungrant_log_space(xlog_t *log, xlog_ticket_t *ticket); - -/* local ticket functions */ -STATIC xlog_ticket_t *xlog_ticket_alloc(xlog_t *log, - int unit_bytes, - int count, - char clientid, - uint flags); - #if defined(DEBUG) -STATIC void xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr); +STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); STATIC void xlog_verify_grant_head(xlog_t *log, int equals); STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, int count, boolean_t syncing); @@ -249,14 +225,14 @@ xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type) * out when the next write occurs. */ xfs_lsn_t -xfs_log_done(xfs_mount_t *mp, - xfs_log_ticket_t xtic, - void **iclog, - uint flags) +xfs_log_done( + struct xfs_mount *mp, + struct xlog_ticket *ticket, + struct xlog_in_core **iclog, + uint flags) { - xlog_t *log = mp->m_log; - xlog_ticket_t *ticket = (xfs_log_ticket_t) xtic; - xfs_lsn_t lsn = 0; + struct log *log = mp->m_log; + xfs_lsn_t lsn = 0; if (XLOG_FORCED_SHUTDOWN(log) || /* @@ -264,8 +240,7 @@ xfs_log_done(xfs_mount_t *mp, * If we get an error, just continue and give back the log ticket. */ (((ticket->t_flags & XLOG_TIC_INITED) == 0) && - (xlog_commit_record(mp, ticket, - (xlog_in_core_t **)iclog, &lsn)))) { + (xlog_commit_record(log, ticket, iclog, &lsn)))) { lsn = (xfs_lsn_t) -1; if (ticket->t_flags & XLOG_TIC_PERM_RESERV) { flags |= XFS_LOG_REL_PERM_RESERV; @@ -295,67 +270,8 @@ xfs_log_done(xfs_mount_t *mp, } return lsn; -} /* xfs_log_done */ - - -/* - * Force the in-core log to disk. If flags == XFS_LOG_SYNC, - * the force is done synchronously. - * - * Asynchronous forces are implemented by setting the WANT_SYNC - * bit in the appropriate in-core log and then returning. - * - * Synchronous forces are implemented with a signal variable. All callers - * to force a given lsn to disk will wait on a the sv attached to the - * specific in-core log. When given in-core log finally completes its - * write to disk, that thread will wake up all threads waiting on the - * sv. - */ -int -_xfs_log_force( - xfs_mount_t *mp, - xfs_lsn_t lsn, - uint flags, - int *log_flushed) -{ - xlog_t *log = mp->m_log; - int dummy; - - if (!log_flushed) - log_flushed = &dummy; - - ASSERT(flags & XFS_LOG_FORCE); - - XFS_STATS_INC(xs_log_force); - - if (log->l_flags & XLOG_IO_ERROR) - return XFS_ERROR(EIO); - if (lsn == 0) - return xlog_state_sync_all(log, flags, log_flushed); - else - return xlog_state_sync(log, lsn, flags, log_flushed); -} /* _xfs_log_force */ - -/* - * Wrapper for _xfs_log_force(), to be used when caller doesn't care - * about errors or whether the log was flushed or not. This is the normal - * interface to use when trying to unpin items or move the log forward. - */ -void -xfs_log_force( - xfs_mount_t *mp, - xfs_lsn_t lsn, - uint flags) -{ - int error; - error = _xfs_log_force(mp, lsn, flags, NULL); - if (error) { - xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: " - "error %d returned.", error); - } } - /* * Attaches a new iclog I/O completion callback routine during * transaction commit. If the log is in error state, a non-zero @@ -363,11 +279,11 @@ xfs_log_force( * executing the callback at an appropriate time. */ int -xfs_log_notify(xfs_mount_t *mp, /* mount of partition */ - void *iclog_hndl, /* iclog to hang callback off */ - xfs_log_callback_t *cb) +xfs_log_notify( + struct xfs_mount *mp, + struct xlog_in_core *iclog, + xfs_log_callback_t *cb) { - xlog_in_core_t *iclog = (xlog_in_core_t *)iclog_hndl; int abortflg; spin_lock(&iclog->ic_callback_lock); @@ -381,16 +297,14 @@ xfs_log_notify(xfs_mount_t *mp, /* mount of partition */ } spin_unlock(&iclog->ic_callback_lock); return abortflg; -} /* xfs_log_notify */ +} int -xfs_log_release_iclog(xfs_mount_t *mp, - void *iclog_hndl) +xfs_log_release_iclog( + struct xfs_mount *mp, + struct xlog_in_core *iclog) { - xlog_t *log = mp->m_log; - xlog_in_core_t *iclog = (xlog_in_core_t *)iclog_hndl; - - if (xlog_state_release_iclog(log, iclog)) { + if (xlog_state_release_iclog(mp->m_log, iclog)) { xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); return EIO; } @@ -409,17 +323,18 @@ xfs_log_release_iclog(xfs_mount_t *mp, * reservation, we prevent over allocation problems. */ int -xfs_log_reserve(xfs_mount_t *mp, - int unit_bytes, - int cnt, - xfs_log_ticket_t *ticket, - __uint8_t client, - uint flags, - uint t_type) +xfs_log_reserve( + struct xfs_mount *mp, + int unit_bytes, + int cnt, + struct xlog_ticket **ticket, + __uint8_t client, + uint flags, + uint t_type) { - xlog_t *log = mp->m_log; - xlog_ticket_t *internal_ticket; - int retval = 0; + struct log *log = mp->m_log; + struct xlog_ticket *internal_ticket; + int retval = 0; ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); ASSERT((flags & XFS_LOG_NOSLEEP) == 0); @@ -432,7 +347,16 @@ xfs_log_reserve(xfs_mount_t *mp, if (*ticket != NULL) { ASSERT(flags & XFS_LOG_PERM_RESERV); - internal_ticket = (xlog_ticket_t *)*ticket; + internal_ticket = *ticket; + + /* + * this is a new transaction on the ticket, so we need to + * change the transaction ID so that the next transaction has a + * different TID in the log. Just add one to the existing tid + * so that we can see chains of rolling transactions in the log + * easily. + */ + internal_ticket->t_tid++; trace_xfs_log_reserve(log, internal_ticket); @@ -441,7 +365,8 @@ xfs_log_reserve(xfs_mount_t *mp, } else { /* may sleep if need to allocate more tickets */ internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt, - client, flags); + client, flags, + KM_SLEEP|KM_MAYFAIL); if (!internal_ticket) return XFS_ERROR(ENOMEM); internal_ticket->t_trans_type = t_type; @@ -526,6 +451,13 @@ xfs_log_mount( /* Normal transactions can now occur */ mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; + /* + * Now the log has been fully initialised and we know were our + * space grant counters are, we can initialise the permanent ticket + * needed for delayed logging to work. + */ + xlog_cil_init_post_recovery(mp->m_log); + return 0; out_destroy_ail: @@ -583,18 +515,10 @@ xfs_log_unmount_write(xfs_mount_t *mp) #ifdef DEBUG xlog_in_core_t *first_iclog; #endif - xfs_log_iovec_t reg[1]; - xfs_log_ticket_t tic = NULL; + xlog_ticket_t *tic = NULL; xfs_lsn_t lsn; int error; - /* the data section must be 32 bit size aligned */ - struct { - __uint16_t magic; - __uint16_t pad1; - __uint32_t pad2; /* may as well make it 64 bits */ - } magic = { XLOG_UNMOUNT_TYPE, 0, 0 }; - /* * Don't write out unmount record on read-only mounts. * Or, if we are doing a forced umount (typically because of IO errors). @@ -602,7 +526,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) if (mp->m_flags & XFS_MOUNT_RDONLY) return 0; - error = _xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC, NULL); + error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL); ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); #ifdef DEBUG @@ -616,16 +540,30 @@ xfs_log_unmount_write(xfs_mount_t *mp) } while (iclog != first_iclog); #endif if (! (XLOG_FORCED_SHUTDOWN(log))) { - reg[0].i_addr = (void*)&magic; - reg[0].i_len = sizeof(magic); - XLOG_VEC_SET_TYPE(®[0], XLOG_REG_TYPE_UNMOUNT); - error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE); if (!error) { + /* the data section must be 32 bit size aligned */ + struct { + __uint16_t magic; + __uint16_t pad1; + __uint32_t pad2; /* may as well make it 64 bits */ + } magic = { + .magic = XLOG_UNMOUNT_TYPE, + }; + struct xfs_log_iovec reg = { + .i_addr = (void *)&magic, + .i_len = sizeof(magic), + .i_type = XLOG_REG_TYPE_UNMOUNT, + }; + struct xfs_log_vec vec = { + .lv_niovecs = 1, + .lv_iovecp = ®, + }; + /* remove inited flag */ - ((xlog_ticket_t *)tic)->t_flags = 0; - error = xlog_write(mp, reg, 1, tic, &lsn, + tic->t_flags = 0; + error = xlog_write(log, &vec, tic, &lsn, NULL, XLOG_UNMOUNT_TRANS); /* * At this point, we're umounting anyway, @@ -715,30 +653,54 @@ xfs_log_unmount(xfs_mount_t *mp) xlog_dealloc_log(mp->m_log); } +void +xfs_log_item_init( + struct xfs_mount *mp, + struct xfs_log_item *item, + int type, + struct xfs_item_ops *ops) +{ + item->li_mountp = mp; + item->li_ailp = mp->m_ail; + item->li_type = type; + item->li_ops = ops; + item->li_lv = NULL; + + INIT_LIST_HEAD(&item->li_ail); + INIT_LIST_HEAD(&item->li_cil); +} + /* * Write region vectors to log. The write happens using the space reservation * of the ticket (tic). It is not a requirement that all writes for a given - * transaction occur with one call to xfs_log_write(). + * transaction occur with one call to xfs_log_write(). However, it is important + * to note that the transaction reservation code makes an assumption about the + * number of log headers a transaction requires that may be violated if you + * don't pass all the transaction vectors in one call.... */ int -xfs_log_write(xfs_mount_t * mp, - xfs_log_iovec_t reg[], - int nentries, - xfs_log_ticket_t tic, - xfs_lsn_t *start_lsn) +xfs_log_write( + struct xfs_mount *mp, + struct xfs_log_iovec reg[], + int nentries, + struct xlog_ticket *tic, + xfs_lsn_t *start_lsn) { - int error; - xlog_t *log = mp->m_log; + struct log *log = mp->m_log; + int error; + struct xfs_log_vec vec = { + .lv_niovecs = nentries, + .lv_iovecp = reg, + }; if (XLOG_FORCED_SHUTDOWN(log)) return XFS_ERROR(EIO); - if ((error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0))) { + error = xlog_write(log, &vec, tic, start_lsn, NULL, 0); + if (error) xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); - } return error; -} /* xfs_log_write */ - +} void xfs_log_move_tail(xfs_mount_t *mp, @@ -812,9 +774,16 @@ xfs_log_move_tail(xfs_mount_t *mp, /* * Determine if we have a transaction that has gone to disk - * that needs to be covered. Log activity needs to be idle (no AIL and - * nothing in the iclogs). And, we need to be in the right state indicating - * something has gone out. + * that needs to be covered. To begin the transition to the idle state + * firstly the log needs to be idle (no AIL and nothing in the iclogs). + * If we are then in a state where covering is needed, the caller is informed + * that dummy transactions are required to move the log into the idle state. + * + * Because this is called as part of the sync process, we should also indicate + * that dummy transactions should be issued in anything but the covered or + * idle states. This ensures that the log tail is accurately reflected in + * the log at the end of the sync, hence if a crash occurrs avoids replay + * of transactions where the metadata is already on disk. */ int xfs_log_need_covered(xfs_mount_t *mp) @@ -826,17 +795,24 @@ xfs_log_need_covered(xfs_mount_t *mp) return 0; spin_lock(&log->l_icloglock); - if (((log->l_covered_state == XLOG_STATE_COVER_NEED) || - (log->l_covered_state == XLOG_STATE_COVER_NEED2)) - && !xfs_trans_ail_tail(log->l_ailp) - && xlog_iclogs_empty(log)) { - if (log->l_covered_state == XLOG_STATE_COVER_NEED) - log->l_covered_state = XLOG_STATE_COVER_DONE; - else { - ASSERT(log->l_covered_state == XLOG_STATE_COVER_NEED2); - log->l_covered_state = XLOG_STATE_COVER_DONE2; + switch (log->l_covered_state) { + case XLOG_STATE_COVER_DONE: + case XLOG_STATE_COVER_DONE2: + case XLOG_STATE_COVER_IDLE: + break; + case XLOG_STATE_COVER_NEED: + case XLOG_STATE_COVER_NEED2: + if (!xfs_trans_ail_tail(log->l_ailp) && + xlog_iclogs_empty(log)) { + if (log->l_covered_state == XLOG_STATE_COVER_NEED) + log->l_covered_state = XLOG_STATE_COVER_DONE; + else + log->l_covered_state = XLOG_STATE_COVER_DONE2; } + /* FALLTHRU */ + default: needed = 1; + break; } spin_unlock(&log->l_icloglock); return needed; @@ -988,35 +964,6 @@ xlog_iodone(xfs_buf_t *bp) } /* xlog_iodone */ /* - * The bdstrat callback function for log bufs. This gives us a central - * place to trap bufs in case we get hit by a log I/O error and need to - * shutdown. Actually, in practice, even when we didn't get a log error, - * we transition the iclogs to IOERROR state *after* flushing all existing - * iclogs to disk. This is because we don't want anymore new transactions to be - * started or completed afterwards. - */ -STATIC int -xlog_bdstrat_cb(struct xfs_buf *bp) -{ - xlog_in_core_t *iclog; - - iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *); - - if ((iclog->ic_state & XLOG_STATE_IOERROR) == 0) { - /* note for irix bstrat will need struct bdevsw passed - * Fix the following macro if the code ever is merged - */ - XFS_bdstrat(bp); - return 0; - } - - XFS_BUF_ERROR(bp, EIO); - XFS_BUF_STALE(bp); - xfs_biodone(bp); - return XFS_ERROR(EIO); -} - -/* * Return size of each in-core log record buffer. * * All machines get 8 x 32kB buffers by default, unless tuned otherwise. @@ -1102,6 +1049,7 @@ xlog_alloc_log(xfs_mount_t *mp, int i; int iclogsize; int error = ENOMEM; + uint log2_size = 0; log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL); if (!log) { @@ -1127,29 +1075,30 @@ xlog_alloc_log(xfs_mount_t *mp, error = EFSCORRUPTED; if (xfs_sb_version_hassector(&mp->m_sb)) { - log->l_sectbb_log = mp->m_sb.sb_logsectlog - BBSHIFT; - if (log->l_sectbb_log < 0 || - log->l_sectbb_log > mp->m_sectbb_log) { - xlog_warn("XFS: Log sector size (0x%x) out of range.", - log->l_sectbb_log); + log2_size = mp->m_sb.sb_logsectlog; + if (log2_size < BBSHIFT) { + xlog_warn("XFS: Log sector size too small " + "(0x%x < 0x%x)", log2_size, BBSHIFT); goto out_free_log; } - /* for larger sector sizes, must have v2 or external log */ - if (log->l_sectbb_log != 0 && - (log->l_logBBstart != 0 && - !xfs_sb_version_haslogv2(&mp->m_sb))) { - xlog_warn("XFS: log sector size (0x%x) invalid " - "for configuration.", log->l_sectbb_log); + log2_size -= BBSHIFT; + if (log2_size > mp->m_sectbb_log) { + xlog_warn("XFS: Log sector size too large " + "(0x%x > 0x%x)", log2_size, mp->m_sectbb_log); goto out_free_log; } - if (mp->m_sb.sb_logsectlog < BBSHIFT) { - xlog_warn("XFS: Log sector log (0x%x) too small.", - mp->m_sb.sb_logsectlog); + + /* for larger sector sizes, must have v2 or external log */ + if (log2_size && log->l_logBBstart > 0 && + !xfs_sb_version_haslogv2(&mp->m_sb)) { + + xlog_warn("XFS: log sector size (0x%x) invalid " + "for configuration.", log2_size); goto out_free_log; } } - log->l_sectbb_mask = (1 << log->l_sectbb_log) - 1; + log->l_sectBBsize = 1 << log2_size; xlog_get_iclog_buffer_size(mp, log); @@ -1158,7 +1107,6 @@ xlog_alloc_log(xfs_mount_t *mp, if (!bp) goto out_free_log; XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone); - XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb); XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); @@ -1196,7 +1144,6 @@ xlog_alloc_log(xfs_mount_t *mp, if (!XFS_BUF_CPSEMA(bp)) ASSERT(0); XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone); - XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb); XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); iclog->ic_bp = bp; iclog->ic_data = bp->b_addr; @@ -1231,6 +1178,9 @@ xlog_alloc_log(xfs_mount_t *mp, *iclogp = log->l_iclog; /* complete ring */ log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ + error = xlog_cil_init(log); + if (error) + goto out_free_iclog; return log; out_free_iclog: @@ -1258,26 +1208,31 @@ out: * ticket. Return the lsn of the commit record. */ STATIC int -xlog_commit_record(xfs_mount_t *mp, - xlog_ticket_t *ticket, - xlog_in_core_t **iclog, - xfs_lsn_t *commitlsnp) +xlog_commit_record( + struct log *log, + struct xlog_ticket *ticket, + struct xlog_in_core **iclog, + xfs_lsn_t *commitlsnp) { - int error; - xfs_log_iovec_t reg[1]; - - reg[0].i_addr = NULL; - reg[0].i_len = 0; - XLOG_VEC_SET_TYPE(®[0], XLOG_REG_TYPE_COMMIT); + struct xfs_mount *mp = log->l_mp; + int error; + struct xfs_log_iovec reg = { + .i_addr = NULL, + .i_len = 0, + .i_type = XLOG_REG_TYPE_COMMIT, + }; + struct xfs_log_vec vec = { + .lv_niovecs = 1, + .lv_iovecp = ®, + }; ASSERT_ALWAYS(iclog); - if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp, - iclog, XLOG_COMMIT_TRANS))) { + error = xlog_write(log, &vec, ticket, commitlsnp, iclog, + XLOG_COMMIT_TRANS); + if (error) xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); - } return error; -} /* xlog_commit_record */ - +} /* * Push on the buffer cache code if we ever use more than 75% of the on-disk @@ -1343,6 +1298,37 @@ xlog_grant_push_ail(xfs_mount_t *mp, xfs_trans_ail_push(log->l_ailp, threshold_lsn); } /* xlog_grant_push_ail */ +/* + * The bdstrat callback function for log bufs. This gives us a central + * place to trap bufs in case we get hit by a log I/O error and need to + * shutdown. Actually, in practice, even when we didn't get a log error, + * we transition the iclogs to IOERROR state *after* flushing all existing + * iclogs to disk. This is because we don't want anymore new transactions to be + * started or completed afterwards. + */ +STATIC int +xlog_bdstrat( + struct xfs_buf *bp) +{ + struct xlog_in_core *iclog; + + iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *); + if (iclog->ic_state & XLOG_STATE_IOERROR) { + XFS_BUF_ERROR(bp, EIO); + XFS_BUF_STALE(bp); + xfs_biodone(bp); + /* + * It would seem logical to return EIO here, but we rely on + * the log state machine to propagate I/O errors instead of + * doing it here. + */ + return 0; + } + + bp->b_flags |= _XBF_RUN_QUEUES; + xfs_buf_iorequest(bp); + return 0; +} /* * Flush out the in-core log (iclog) to the on-disk log in an asynchronous @@ -1462,7 +1448,7 @@ xlog_sync(xlog_t *log, */ XFS_BUF_WRITE(bp); - if ((error = XFS_bwrite(bp))) { + if ((error = xlog_bdstrat(bp))) { xfs_ioerror_alert("xlog_sync", log->l_mp, bp, XFS_BUF_ADDR(bp)); return error; @@ -1502,7 +1488,7 @@ xlog_sync(xlog_t *log, /* account for internal log which doesn't start at block #0 */ XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); XFS_BUF_WRITE(bp); - if ((error = XFS_bwrite(bp))) { + if ((error = xlog_bdstrat(bp))) { xfs_ioerror_alert("xlog_sync (split)", log->l_mp, bp, XFS_BUF_ADDR(bp)); return error; @@ -1521,6 +1507,8 @@ xlog_dealloc_log(xlog_t *log) xlog_in_core_t *iclog, *next_iclog; int i; + xlog_cil_destroy(log); + iclog = log->l_iclog; for (i=0; i<log->l_iclog_bufs; i++) { sv_destroy(&iclog->ic_force_wait); @@ -1563,8 +1551,10 @@ xlog_state_finish_copy(xlog_t *log, * print out info relating to regions written which consume * the reservation */ -STATIC void -xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) +void +xlog_print_tic_res( + struct xfs_mount *mp, + struct xlog_ticket *ticket) { uint i; uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t); @@ -1664,6 +1654,196 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) "bad-rtype" : res_type_str[r_type-1]), ticket->t_res_arr[i].r_len); } + + xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp, + "xfs_log_write: reservation ran out. Need to up reservation"); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); +} + +/* + * Calculate the potential space needed by the log vector. Each region gets + * its own xlog_op_header_t and may need to be double word aligned. + */ +static int +xlog_write_calc_vec_length( + struct xlog_ticket *ticket, + struct xfs_log_vec *log_vector) +{ + struct xfs_log_vec *lv; + int headers = 0; + int len = 0; + int i; + + /* acct for start rec of xact */ + if (ticket->t_flags & XLOG_TIC_INITED) + headers++; + + for (lv = log_vector; lv; lv = lv->lv_next) { + headers += lv->lv_niovecs; + + for (i = 0; i < lv->lv_niovecs; i++) { + struct xfs_log_iovec *vecp = &lv->lv_iovecp[i]; + + len += vecp->i_len; + xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type); + } + } + + ticket->t_res_num_ophdrs += headers; + len += headers * sizeof(struct xlog_op_header); + + return len; +} + +/* + * If first write for transaction, insert start record We can't be trying to + * commit if we are inited. We can't have any "partial_copy" if we are inited. + */ +static int +xlog_write_start_rec( + struct xlog_op_header *ophdr, + struct xlog_ticket *ticket) +{ + if (!(ticket->t_flags & XLOG_TIC_INITED)) + return 0; + + ophdr->oh_tid = cpu_to_be32(ticket->t_tid); + ophdr->oh_clientid = ticket->t_clientid; + ophdr->oh_len = 0; + ophdr->oh_flags = XLOG_START_TRANS; + ophdr->oh_res2 = 0; + + ticket->t_flags &= ~XLOG_TIC_INITED; + + return sizeof(struct xlog_op_header); +} + +static xlog_op_header_t * +xlog_write_setup_ophdr( + struct log *log, + struct xlog_op_header *ophdr, + struct xlog_ticket *ticket, + uint flags) +{ + ophdr->oh_tid = cpu_to_be32(ticket->t_tid); + ophdr->oh_clientid = ticket->t_clientid; + ophdr->oh_res2 = 0; + + /* are we copying a commit or unmount record? */ + ophdr->oh_flags = flags; + + /* + * We've seen logs corrupted with bad transaction client ids. This + * makes sure that XFS doesn't generate them on. Turn this into an EIO + * and shut down the filesystem. + */ + switch (ophdr->oh_clientid) { + case XFS_TRANSACTION: + case XFS_VOLUME: + case XFS_LOG: + break; + default: + xfs_fs_cmn_err(CE_WARN, log->l_mp, + "Bad XFS transaction clientid 0x%x in ticket 0x%p", + ophdr->oh_clientid, ticket); + return NULL; + } + + return ophdr; +} + +/* + * Set up the parameters of the region copy into the log. This has + * to handle region write split across multiple log buffers - this + * state is kept external to this function so that this code can + * can be written in an obvious, self documenting manner. + */ +static int +xlog_write_setup_copy( + struct xlog_ticket *ticket, + struct xlog_op_header *ophdr, + int space_available, + int space_required, + int *copy_off, + int *copy_len, + int *last_was_partial_copy, + int *bytes_consumed) +{ + int still_to_copy; + + still_to_copy = space_required - *bytes_consumed; + *copy_off = *bytes_consumed; + + if (still_to_copy <= space_available) { + /* write of region completes here */ + *copy_len = still_to_copy; + ophdr->oh_len = cpu_to_be32(*copy_len); + if (*last_was_partial_copy) + ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS); + *last_was_partial_copy = 0; + *bytes_consumed = 0; + return 0; + } + + /* partial write of region, needs extra log op header reservation */ + *copy_len = space_available; + ophdr->oh_len = cpu_to_be32(*copy_len); + ophdr->oh_flags |= XLOG_CONTINUE_TRANS; + if (*last_was_partial_copy) + ophdr->oh_flags |= XLOG_WAS_CONT_TRANS; + *bytes_consumed += *copy_len; + (*last_was_partial_copy)++; + + /* account for new log op header */ + ticket->t_curr_res -= sizeof(struct xlog_op_header); + ticket->t_res_num_ophdrs++; + + return sizeof(struct xlog_op_header); +} + +static int +xlog_write_copy_finish( + struct log *log, + struct xlog_in_core *iclog, + uint flags, + int *record_cnt, + int *data_cnt, + int *partial_copy, + int *partial_copy_len, + int log_offset, + struct xlog_in_core **commit_iclog) +{ + if (*partial_copy) { + /* + * This iclog has already been marked WANT_SYNC by + * xlog_state_get_iclog_space. + */ + xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); + *record_cnt = 0; + *data_cnt = 0; + return xlog_state_release_iclog(log, iclog); + } + + *partial_copy = 0; + *partial_copy_len = 0; + + if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) { + /* no more space in this iclog - push it. */ + xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); + *record_cnt = 0; + *data_cnt = 0; + + spin_lock(&log->l_icloglock); + xlog_state_want_sync(log, iclog); + spin_unlock(&log->l_icloglock); + + if (!commit_iclog) + return xlog_state_release_iclog(log, iclog); + ASSERT(flags & XLOG_COMMIT_TRANS); + *commit_iclog = iclog; + } + + return 0; } /* @@ -1706,211 +1886,163 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) * we don't update ic_offset until the end when we know exactly how many * bytes have been written out. */ -STATIC int -xlog_write(xfs_mount_t * mp, - xfs_log_iovec_t reg[], - int nentries, - xfs_log_ticket_t tic, - xfs_lsn_t *start_lsn, - xlog_in_core_t **commit_iclog, - uint flags) -{ - xlog_t *log = mp->m_log; - xlog_ticket_t *ticket = (xlog_ticket_t *)tic; - xlog_in_core_t *iclog = NULL; /* ptr to current in-core log */ - xlog_op_header_t *logop_head; /* ptr to log operation header */ - __psint_t ptr; /* copy address into data region */ - int len; /* # xlog_write() bytes 2 still copy */ - int index; /* region index currently copying */ - int log_offset; /* offset (from 0) into data region */ - int start_rec_copy; /* # bytes to copy for start record */ - int partial_copy; /* did we split a region? */ - int partial_copy_len;/* # bytes copied if split region */ - int need_copy; /* # bytes need to memcpy this region */ - int copy_len; /* # bytes actually memcpy'ing */ - int copy_off; /* # bytes from entry start */ - int contwr; /* continued write of in-core log? */ - int error; - int record_cnt = 0, data_cnt = 0; - - partial_copy_len = partial_copy = 0; - - /* Calculate potential maximum space. Each region gets its own - * xlog_op_header_t and may need to be double word aligned. - */ - len = 0; - if (ticket->t_flags & XLOG_TIC_INITED) { /* acct for start rec of xact */ - len += sizeof(xlog_op_header_t); - ticket->t_res_num_ophdrs++; - } - - for (index = 0; index < nentries; index++) { - len += sizeof(xlog_op_header_t); /* each region gets >= 1 */ - ticket->t_res_num_ophdrs++; - len += reg[index].i_len; - xlog_tic_add_region(ticket, reg[index].i_len, reg[index].i_type); - } - contwr = *start_lsn = 0; +int +xlog_write( + struct log *log, + struct xfs_log_vec *log_vector, + struct xlog_ticket *ticket, + xfs_lsn_t *start_lsn, + struct xlog_in_core **commit_iclog, + uint flags) +{ + struct xlog_in_core *iclog = NULL; + struct xfs_log_iovec *vecp; + struct xfs_log_vec *lv; + int len; + int index; + int partial_copy = 0; + int partial_copy_len = 0; + int contwr = 0; + int record_cnt = 0; + int data_cnt = 0; + int error; + + *start_lsn = 0; + + len = xlog_write_calc_vec_length(ticket, log_vector); + if (log->l_cilp) { + /* + * Region headers and bytes are already accounted for. + * We only need to take into account start records and + * split regions in this function. + */ + if (ticket->t_flags & XLOG_TIC_INITED) + ticket->t_curr_res -= sizeof(xlog_op_header_t); - if (ticket->t_curr_res < len) { - xlog_print_tic_res(mp, ticket); -#ifdef DEBUG - xlog_panic( - "xfs_log_write: reservation ran out. Need to up reservation"); -#else - /* Customer configurable panic */ - xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp, - "xfs_log_write: reservation ran out. Need to up reservation"); - /* If we did not panic, shutdown the filesystem */ - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); -#endif - } else - ticket->t_curr_res -= len; + /* + * Commit record headers need to be accounted for. These + * come in as separate writes so are easy to detect. + */ + if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) + ticket->t_curr_res -= sizeof(xlog_op_header_t); + } else + ticket->t_curr_res -= len; + + if (ticket->t_curr_res < 0) + xlog_print_tic_res(log->l_mp, ticket); + + index = 0; + lv = log_vector; + vecp = lv->lv_iovecp; + while (lv && index < lv->lv_niovecs) { + void *ptr; + int log_offset; + + error = xlog_state_get_iclog_space(log, len, &iclog, ticket, + &contwr, &log_offset); + if (error) + return error; - for (index = 0; index < nentries; ) { - if ((error = xlog_state_get_iclog_space(log, len, &iclog, ticket, - &contwr, &log_offset))) - return error; + ASSERT(log_offset <= iclog->ic_size - 1); + ptr = iclog->ic_datap + log_offset; - ASSERT(log_offset <= iclog->ic_size - 1); - ptr = (__psint_t) ((char *)iclog->ic_datap+log_offset); + /* start_lsn is the first lsn written to. That's all we need. */ + if (!*start_lsn) + *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn); - /* start_lsn is the first lsn written to. That's all we need. */ - if (! *start_lsn) - *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn); + /* + * This loop writes out as many regions as can fit in the amount + * of space which was allocated by xlog_state_get_iclog_space(). + */ + while (lv && index < lv->lv_niovecs) { + struct xfs_log_iovec *reg = &vecp[index]; + struct xlog_op_header *ophdr; + int start_rec_copy; + int copy_len; + int copy_off; + + ASSERT(reg->i_len % sizeof(__int32_t) == 0); + ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0); + + start_rec_copy = xlog_write_start_rec(ptr, ticket); + if (start_rec_copy) { + record_cnt++; + xlog_write_adv_cnt(&ptr, &len, &log_offset, + start_rec_copy); + } - /* This loop writes out as many regions as can fit in the amount - * of space which was allocated by xlog_state_get_iclog_space(). - */ - while (index < nentries) { - ASSERT(reg[index].i_len % sizeof(__int32_t) == 0); - ASSERT((__psint_t)ptr % sizeof(__int32_t) == 0); - start_rec_copy = 0; - - /* If first write for transaction, insert start record. - * We can't be trying to commit if we are inited. We can't - * have any "partial_copy" if we are inited. - */ - if (ticket->t_flags & XLOG_TIC_INITED) { - logop_head = (xlog_op_header_t *)ptr; - logop_head->oh_tid = cpu_to_be32(ticket->t_tid); - logop_head->oh_clientid = ticket->t_clientid; - logop_head->oh_len = 0; - logop_head->oh_flags = XLOG_START_TRANS; - logop_head->oh_res2 = 0; - ticket->t_flags &= ~XLOG_TIC_INITED; /* clear bit */ - record_cnt++; - - start_rec_copy = sizeof(xlog_op_header_t); - xlog_write_adv_cnt(ptr, len, log_offset, start_rec_copy); - } + ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags); + if (!ophdr) + return XFS_ERROR(EIO); - /* Copy log operation header directly into data section */ - logop_head = (xlog_op_header_t *)ptr; - logop_head->oh_tid = cpu_to_be32(ticket->t_tid); - logop_head->oh_clientid = ticket->t_clientid; - logop_head->oh_res2 = 0; + xlog_write_adv_cnt(&ptr, &len, &log_offset, + sizeof(struct xlog_op_header)); + + len += xlog_write_setup_copy(ticket, ophdr, + iclog->ic_size-log_offset, + reg->i_len, + ©_off, ©_len, + &partial_copy, + &partial_copy_len); + xlog_verify_dest_ptr(log, ptr); + + /* copy region */ + ASSERT(copy_len >= 0); + memcpy(ptr, reg->i_addr + copy_off, copy_len); + xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len); + + copy_len += start_rec_copy + sizeof(xlog_op_header_t); + record_cnt++; + data_cnt += contwr ? copy_len : 0; + + error = xlog_write_copy_finish(log, iclog, flags, + &record_cnt, &data_cnt, + &partial_copy, + &partial_copy_len, + log_offset, + commit_iclog); + if (error) + return error; - /* header copied directly */ - xlog_write_adv_cnt(ptr, len, log_offset, sizeof(xlog_op_header_t)); + /* + * if we had a partial copy, we need to get more iclog + * space but we don't want to increment the region + * index because there is still more is this region to + * write. + * + * If we completed writing this region, and we flushed + * the iclog (indicated by resetting of the record + * count), then we also need to get more log space. If + * this was the last record, though, we are done and + * can just return. + */ + if (partial_copy) + break; - /* are we copying a commit or unmount record? */ - logop_head->oh_flags = flags; + if (++index == lv->lv_niovecs) { + lv = lv->lv_next; + index = 0; + if (lv) + vecp = lv->lv_iovecp; + } + if (record_cnt == 0) { + if (!lv) + return 0; + break; + } + } + } - /* - * We've seen logs corrupted with bad transaction client - * ids. This makes sure that XFS doesn't generate them on. - * Turn this into an EIO and shut down the filesystem. - */ - switch (logop_head->oh_clientid) { - case XFS_TRANSACTION: - case XFS_VOLUME: - case XFS_LOG: - break; - default: - xfs_fs_cmn_err(CE_WARN, mp, - "Bad XFS transaction clientid 0x%x in ticket 0x%p", - logop_head->oh_clientid, tic); - return XFS_ERROR(EIO); - } + ASSERT(len == 0); - /* Partial write last time? => (partial_copy != 0) - * need_copy is the amount we'd like to copy if everything could - * fit in the current memcpy. - */ - need_copy = reg[index].i_len - partial_copy_len; - - copy_off = partial_copy_len; - if (need_copy <= iclog->ic_size - log_offset) { /*complete write */ - copy_len = need_copy; - logop_head->oh_len = cpu_to_be32(copy_len); - if (partial_copy) - logop_head->oh_flags|= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS); - partial_copy_len = partial_copy = 0; - } else { /* partial write */ - copy_len = iclog->ic_size - log_offset; - logop_head->oh_len = cpu_to_be32(copy_len); - logop_head->oh_flags |= XLOG_CONTINUE_TRANS; - if (partial_copy) - logop_head->oh_flags |= XLOG_WAS_CONT_TRANS; - partial_copy_len += copy_len; - partial_copy++; - len += sizeof(xlog_op_header_t); /* from splitting of region */ - /* account for new log op header */ - ticket->t_curr_res -= sizeof(xlog_op_header_t); - ticket->t_res_num_ophdrs++; - } - xlog_verify_dest_ptr(log, ptr); - - /* copy region */ - ASSERT(copy_len >= 0); - memcpy((xfs_caddr_t)ptr, reg[index].i_addr + copy_off, copy_len); - xlog_write_adv_cnt(ptr, len, log_offset, copy_len); - - /* make copy_len total bytes copied, including headers */ - copy_len += start_rec_copy + sizeof(xlog_op_header_t); - record_cnt++; - data_cnt += contwr ? copy_len : 0; - if (partial_copy) { /* copied partial region */ - /* already marked WANT_SYNC by xlog_state_get_iclog_space */ - xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); - record_cnt = data_cnt = 0; - if ((error = xlog_state_release_iclog(log, iclog))) - return error; - break; /* don't increment index */ - } else { /* copied entire region */ - index++; - partial_copy_len = partial_copy = 0; - - if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) { - xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); - record_cnt = data_cnt = 0; - spin_lock(&log->l_icloglock); - xlog_state_want_sync(log, iclog); - spin_unlock(&log->l_icloglock); - if (commit_iclog) { - ASSERT(flags & XLOG_COMMIT_TRANS); - *commit_iclog = iclog; - } else if ((error = xlog_state_release_iclog(log, iclog))) - return error; - if (index == nentries) - return 0; /* we are done */ - else - break; - } - } /* if (partial_copy) */ - } /* while (index < nentries) */ - } /* for (index = 0; index < nentries; ) */ - ASSERT(len == 0); + xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); + if (!commit_iclog) + return xlog_state_release_iclog(log, iclog); - xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); - if (commit_iclog) { ASSERT(flags & XLOG_COMMIT_TRANS); *commit_iclog = iclog; return 0; - } - return xlog_state_release_iclog(log, iclog); -} /* xlog_write */ +} /***************************************************************************** @@ -2854,7 +2986,6 @@ xlog_state_switch_iclogs(xlog_t *log, log->l_iclog = iclog->ic_next; } /* xlog_state_switch_iclogs */ - /* * Write out all data in the in-core log as of this exact moment in time. * @@ -2882,11 +3013,19 @@ xlog_state_switch_iclogs(xlog_t *log, * b) when we return from flushing out this iclog, it is still * not in the active nor dirty state. */ -STATIC int -xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed) +int +_xfs_log_force( + struct xfs_mount *mp, + uint flags, + int *log_flushed) { - xlog_in_core_t *iclog; - xfs_lsn_t lsn; + struct log *log = mp->m_log; + struct xlog_in_core *iclog; + xfs_lsn_t lsn; + + XFS_STATS_INC(xs_log_force); + + xlog_cil_push(log, 1); spin_lock(&log->l_icloglock); @@ -2932,7 +3071,9 @@ xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed) if (xlog_state_release_iclog(log, iclog)) return XFS_ERROR(EIO); - *log_flushed = 1; + + if (log_flushed) + *log_flushed = 1; spin_lock(&log->l_icloglock); if (be64_to_cpu(iclog->ic_header.h_lsn) == lsn && iclog->ic_state != XLOG_STATE_DIRTY) @@ -2976,19 +3117,37 @@ maybe_sleep: */ if (iclog->ic_state & XLOG_STATE_IOERROR) return XFS_ERROR(EIO); - *log_flushed = 1; - + if (log_flushed) + *log_flushed = 1; } else { no_sleep: spin_unlock(&log->l_icloglock); } return 0; -} /* xlog_state_sync_all */ +} +/* + * Wrapper for _xfs_log_force(), to be used when caller doesn't care + * about errors or whether the log was flushed or not. This is the normal + * interface to use when trying to unpin items or move the log forward. + */ +void +xfs_log_force( + xfs_mount_t *mp, + uint flags) +{ + int error; + + error = _xfs_log_force(mp, flags, NULL); + if (error) { + xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: " + "error %d returned.", error); + } +} /* - * Used by code which implements synchronous log forces. + * Force the in-core log to disk for a specific LSN. * * Find in-core log with lsn. * If it is in the DIRTY state, just return. @@ -2996,109 +3155,148 @@ no_sleep: * state and go to sleep or return. * If it is in any other state, go to sleep or return. * - * If filesystem activity goes to zero, the iclog will get flushed only by - * bdflush(). + * Synchronous forces are implemented with a signal variable. All callers + * to force a given lsn to disk will wait on a the sv attached to the + * specific in-core log. When given in-core log finally completes its + * write to disk, that thread will wake up all threads waiting on the + * sv. */ -STATIC int -xlog_state_sync(xlog_t *log, - xfs_lsn_t lsn, - uint flags, - int *log_flushed) +int +_xfs_log_force_lsn( + struct xfs_mount *mp, + xfs_lsn_t lsn, + uint flags, + int *log_flushed) { - xlog_in_core_t *iclog; - int already_slept = 0; + struct log *log = mp->m_log; + struct xlog_in_core *iclog; + int already_slept = 0; -try_again: - spin_lock(&log->l_icloglock); - iclog = log->l_iclog; + ASSERT(lsn != 0); - if (iclog->ic_state & XLOG_STATE_IOERROR) { - spin_unlock(&log->l_icloglock); - return XFS_ERROR(EIO); - } + XFS_STATS_INC(xs_log_force); - do { - if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) { - iclog = iclog->ic_next; - continue; + if (log->l_cilp) { + lsn = xlog_cil_push_lsn(log, lsn); + if (lsn == NULLCOMMITLSN) + return 0; } - if (iclog->ic_state == XLOG_STATE_DIRTY) { +try_again: + spin_lock(&log->l_icloglock); + iclog = log->l_iclog; + if (iclog->ic_state & XLOG_STATE_IOERROR) { spin_unlock(&log->l_icloglock); - return 0; + return XFS_ERROR(EIO); } - if (iclog->ic_state == XLOG_STATE_ACTIVE) { - /* - * We sleep here if we haven't already slept (e.g. - * this is the first time we've looked at the correct - * iclog buf) and the buffer before us is going to - * be sync'ed. The reason for this is that if we - * are doing sync transactions here, by waiting for - * the previous I/O to complete, we can allow a few - * more transactions into this iclog before we close - * it down. - * - * Otherwise, we mark the buffer WANT_SYNC, and bump - * up the refcnt so we can release the log (which drops - * the ref count). The state switch keeps new transaction - * commits from using this buffer. When the current commits - * finish writing into the buffer, the refcount will drop to - * zero and the buffer will go out then. - */ - if (!already_slept && - (iclog->ic_prev->ic_state & (XLOG_STATE_WANT_SYNC | - XLOG_STATE_SYNCING))) { - ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); - XFS_STATS_INC(xs_log_force_sleep); - sv_wait(&iclog->ic_prev->ic_write_wait, PSWP, - &log->l_icloglock, s); - *log_flushed = 1; - already_slept = 1; - goto try_again; - } else { + do { + if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) { + iclog = iclog->ic_next; + continue; + } + + if (iclog->ic_state == XLOG_STATE_DIRTY) { + spin_unlock(&log->l_icloglock); + return 0; + } + + if (iclog->ic_state == XLOG_STATE_ACTIVE) { + /* + * We sleep here if we haven't already slept (e.g. + * this is the first time we've looked at the correct + * iclog buf) and the buffer before us is going to + * be sync'ed. The reason for this is that if we + * are doing sync transactions here, by waiting for + * the previous I/O to complete, we can allow a few + * more transactions into this iclog before we close + * it down. + * + * Otherwise, we mark the buffer WANT_SYNC, and bump + * up the refcnt so we can release the log (which + * drops the ref count). The state switch keeps new + * transaction commits from using this buffer. When + * the current commits finish writing into the buffer, + * the refcount will drop to zero and the buffer will + * go out then. + */ + if (!already_slept && + (iclog->ic_prev->ic_state & + (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) { + ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); + + XFS_STATS_INC(xs_log_force_sleep); + + sv_wait(&iclog->ic_prev->ic_write_wait, + PSWP, &log->l_icloglock, s); + if (log_flushed) + *log_flushed = 1; + already_slept = 1; + goto try_again; + } atomic_inc(&iclog->ic_refcnt); xlog_state_switch_iclogs(log, iclog, 0); spin_unlock(&log->l_icloglock); if (xlog_state_release_iclog(log, iclog)) return XFS_ERROR(EIO); - *log_flushed = 1; + if (log_flushed) + *log_flushed = 1; spin_lock(&log->l_icloglock); } - } - if ((flags & XFS_LOG_SYNC) && /* sleep */ - !(iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) { + if ((flags & XFS_LOG_SYNC) && /* sleep */ + !(iclog->ic_state & + (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) { + /* + * Don't wait on completion if we know that we've + * gotten a log write error. + */ + if (iclog->ic_state & XLOG_STATE_IOERROR) { + spin_unlock(&log->l_icloglock); + return XFS_ERROR(EIO); + } + XFS_STATS_INC(xs_log_force_sleep); + sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s); + /* + * No need to grab the log lock here since we're + * only deciding whether or not to return EIO + * and the memory read should be atomic. + */ + if (iclog->ic_state & XLOG_STATE_IOERROR) + return XFS_ERROR(EIO); - /* - * Don't wait on completion if we know that we've - * gotten a log write error. - */ - if (iclog->ic_state & XLOG_STATE_IOERROR) { + if (log_flushed) + *log_flushed = 1; + } else { /* just return */ spin_unlock(&log->l_icloglock); - return XFS_ERROR(EIO); } - XFS_STATS_INC(xs_log_force_sleep); - sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s); - /* - * No need to grab the log lock here since we're - * only deciding whether or not to return EIO - * and the memory read should be atomic. - */ - if (iclog->ic_state & XLOG_STATE_IOERROR) - return XFS_ERROR(EIO); - *log_flushed = 1; - } else { /* just return */ - spin_unlock(&log->l_icloglock); - } - return 0; - } while (iclog != log->l_iclog); + return 0; + } while (iclog != log->l_iclog); - spin_unlock(&log->l_icloglock); - return 0; -} /* xlog_state_sync */ + spin_unlock(&log->l_icloglock); + return 0; +} +/* + * Wrapper for _xfs_log_force_lsn(), to be used when caller doesn't care + * about errors or whether the log was flushed or not. This is the normal + * interface to use when trying to unpin items or move the log forward. + */ +void +xfs_log_force_lsn( + xfs_mount_t *mp, + xfs_lsn_t lsn, + uint flags) +{ + int error; + + error = _xfs_log_force_lsn(mp, lsn, flags, NULL); + if (error) { + xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: " + "error %d returned.", error); + } +} /* * Called when we want to mark the current iclog as being ready to sync to @@ -3148,20 +3346,30 @@ xfs_log_ticket_get( return ticket; } +xlog_tid_t +xfs_log_get_trans_ident( + struct xfs_trans *tp) +{ + return tp->t_ticket->t_tid; +} + /* * Allocate and initialise a new log ticket. */ -STATIC xlog_ticket_t * -xlog_ticket_alloc(xlog_t *log, - int unit_bytes, - int cnt, - char client, - uint xflags) +xlog_ticket_t * +xlog_ticket_alloc( + struct log *log, + int unit_bytes, + int cnt, + char client, + uint xflags, + int alloc_flags) { - xlog_ticket_t *tic; + struct xlog_ticket *tic; uint num_headers; + int iclog_space; - tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL); + tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags); if (!tic) return NULL; @@ -3203,16 +3411,40 @@ xlog_ticket_alloc(xlog_t *log, /* for start-rec */ unit_bytes += sizeof(xlog_op_header_t); - /* for LR headers */ - num_headers = ((unit_bytes + log->l_iclog_size-1) >> log->l_iclog_size_log); + /* + * for LR headers - the space for data in an iclog is the size minus + * the space used for the headers. If we use the iclog size, then we + * undercalculate the number of headers required. + * + * Furthermore - the addition of op headers for split-recs might + * increase the space required enough to require more log and op + * headers, so take that into account too. + * + * IMPORTANT: This reservation makes the assumption that if this + * transaction is the first in an iclog and hence has the LR headers + * accounted to it, then the remaining space in the iclog is + * exclusively for this transaction. i.e. if the transaction is larger + * than the iclog, it will be the only thing in that iclog. + * Fundamentally, this means we must pass the entire log vector to + * xlog_write to guarantee this. + */ + iclog_space = log->l_iclog_size - log->l_iclog_hsize; + num_headers = howmany(unit_bytes, iclog_space); + + /* for split-recs - ophdrs added when data split over LRs */ + unit_bytes += sizeof(xlog_op_header_t) * num_headers; + + /* add extra header reservations if we overrun */ + while (!num_headers || + howmany(unit_bytes, iclog_space) > num_headers) { + unit_bytes += sizeof(xlog_op_header_t); + num_headers++; + } unit_bytes += log->l_iclog_hsize * num_headers; /* for commit-rec LR header - note: padding will subsume the ophdr */ unit_bytes += log->l_iclog_hsize; - /* for split-recs - ophdrs added when data split over LRs */ - unit_bytes += sizeof(xlog_op_header_t) * num_headers; - /* for roundoff padding for transaction data and one for commit record */ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && log->l_mp->m_sb.sb_logsunit > 1) { @@ -3228,13 +3460,13 @@ xlog_ticket_alloc(xlog_t *log, tic->t_curr_res = unit_bytes; tic->t_cnt = cnt; tic->t_ocnt = cnt; - tic->t_tid = (xlog_tid_t)((__psint_t)tic & 0xffffffff); + tic->t_tid = random32(); tic->t_clientid = client; tic->t_flags = XLOG_TIC_INITED; tic->t_trans_type = 0; if (xflags & XFS_LOG_PERM_RESERV) tic->t_flags |= XLOG_TIC_PERM_RESERV; - sv_init(&(tic->t_wait), SV_DEFAULT, "logtick"); + sv_init(&tic->t_wait, SV_DEFAULT, "logtick"); xlog_tic_reset_res(tic); @@ -3255,20 +3487,22 @@ xlog_ticket_alloc(xlog_t *log, * part of the log in case we trash the log structure. */ void -xlog_verify_dest_ptr(xlog_t *log, - __psint_t ptr) +xlog_verify_dest_ptr( + struct log *log, + char *ptr) { int i; int good_ptr = 0; - for (i=0; i < log->l_iclog_bufs; i++) { - if (ptr >= (__psint_t)log->l_iclog_bak[i] && - ptr <= (__psint_t)log->l_iclog_bak[i]+log->l_iclog_size) + for (i = 0; i < log->l_iclog_bufs; i++) { + if (ptr >= log->l_iclog_bak[i] && + ptr <= log->l_iclog_bak[i] + log->l_iclog_size) good_ptr++; } - if (! good_ptr) + + if (!good_ptr) xlog_panic("xlog_verify_dest_ptr: invalid ptr"); -} /* xlog_verify_dest_ptr */ +} STATIC void xlog_verify_grant_head(xlog_t *log, int equals) @@ -3454,6 +3688,11 @@ xlog_state_ioerror( * c. nothing new gets queued up after (a) and (b) are done. * d. if !logerror, flush the iclogs to disk, then seal them off * for business. + * + * Note: for delayed logging the !logerror case needs to flush the regions + * held in memory out to the iclogs before flushing them to disk. This needs + * to be done before the log is marked as shutdown, otherwise the flush to the + * iclogs will fail. */ int xfs_log_force_umount( @@ -3463,7 +3702,6 @@ xfs_log_force_umount( xlog_ticket_t *tic; xlog_t *log; int retval; - int dummy; log = mp->m_log; @@ -3488,6 +3726,16 @@ xfs_log_force_umount( return 1; } retval = 0; + + /* + * Flush the in memory commit item list before marking the log as + * being shut down. We need to do it in this order to ensure all the + * completed transactions are flushed to disk with the xfs_log_force() + * call below. + */ + if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG)) + xlog_cil_push(log, 1); + /* * We must hold both the GRANT lock and the LOG lock, * before we mark the filesystem SHUTDOWN and wake @@ -3537,13 +3785,14 @@ xfs_log_force_umount( } spin_unlock(&log->l_grant_lock); - if (! (log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { + if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { ASSERT(!logerror); /* * Force the incore logs to disk before shutting the * log down completely. */ - xlog_state_sync_all(log, XFS_LOG_FORCE|XFS_LOG_SYNC, &dummy); + _xfs_log_force(mp, XFS_LOG_SYNC, NULL); + spin_lock(&log->l_icloglock); retval = xlog_state_ioerror(log); spin_unlock(&log->l_icloglock); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index d0c9baa50b1a..04c78e642cc8 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -19,7 +19,6 @@ #define __XFS_LOG_H__ /* get lsn fields */ - #define CYCLE_LSN(lsn) ((uint)((lsn)>>32)) #define BLOCK_LSN(lsn) ((uint)(lsn)) @@ -70,14 +69,8 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) * Flags to xfs_log_force() * * XFS_LOG_SYNC: Synchronous force in-core log to disk - * XFS_LOG_FORCE: Start in-core log write now. - * XFS_LOG_URGE: Start write within some window of time. - * - * Note: Either XFS_LOG_FORCE or XFS_LOG_URGE must be set. */ #define XFS_LOG_SYNC 0x1 -#define XFS_LOG_FORCE 0x2 -#define XFS_LOG_URGE 0x4 #endif /* __KERNEL__ */ @@ -110,15 +103,20 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) #define XLOG_REG_TYPE_TRANSHDR 19 #define XLOG_REG_TYPE_MAX 19 -#define XLOG_VEC_SET_TYPE(vecp, t) ((vecp)->i_type = (t)) - typedef struct xfs_log_iovec { - xfs_caddr_t i_addr; /* beginning address of region */ + xfs_caddr_t i_addr; /* beginning address of region */ int i_len; /* length in bytes of region */ uint i_type; /* type of region */ } xfs_log_iovec_t; -typedef void* xfs_log_ticket_t; +struct xfs_log_vec { + struct xfs_log_vec *lv_next; /* next lv in build list */ + int lv_niovecs; /* number of iovecs in lv */ + struct xfs_log_iovec *lv_iovecp; /* iovec array */ + struct xfs_log_item *lv_item; /* owner */ + char *lv_buf; /* formatted buffer */ + int lv_buf_len; /* size of formatted buffer */ +}; /* * Structure used to pass callback function and the function's argument @@ -134,18 +132,33 @@ typedef struct xfs_log_callback { #ifdef __KERNEL__ /* Log manager interfaces */ struct xfs_mount; +struct xlog_in_core; struct xlog_ticket; +struct xfs_log_item; +struct xfs_item_ops; +struct xfs_trans; + +void xfs_log_item_init(struct xfs_mount *mp, + struct xfs_log_item *item, + int type, + struct xfs_item_ops *ops); + xfs_lsn_t xfs_log_done(struct xfs_mount *mp, - xfs_log_ticket_t ticket, - void **iclog, + struct xlog_ticket *ticket, + struct xlog_in_core **iclog, uint flags); int _xfs_log_force(struct xfs_mount *mp, - xfs_lsn_t lsn, uint flags, int *log_forced); void xfs_log_force(struct xfs_mount *mp, - xfs_lsn_t lsn, uint flags); +int _xfs_log_force_lsn(struct xfs_mount *mp, + xfs_lsn_t lsn, + uint flags, + int *log_forced); +void xfs_log_force_lsn(struct xfs_mount *mp, + xfs_lsn_t lsn, + uint flags); int xfs_log_mount(struct xfs_mount *mp, struct xfs_buftarg *log_target, xfs_daddr_t start_block, @@ -154,21 +167,21 @@ int xfs_log_mount_finish(struct xfs_mount *mp); void xfs_log_move_tail(struct xfs_mount *mp, xfs_lsn_t tail_lsn); int xfs_log_notify(struct xfs_mount *mp, - void *iclog, + struct xlog_in_core *iclog, xfs_log_callback_t *callback_entry); int xfs_log_release_iclog(struct xfs_mount *mp, - void *iclog_hndl); + struct xlog_in_core *iclog); int xfs_log_reserve(struct xfs_mount *mp, int length, int count, - xfs_log_ticket_t *ticket, + struct xlog_ticket **ticket, __uint8_t clientid, uint flags, uint t_type); int xfs_log_write(struct xfs_mount *mp, xfs_log_iovec_t region[], int nentries, - xfs_log_ticket_t ticket, + struct xlog_ticket *ticket, xfs_lsn_t *start_lsn); int xfs_log_unmount_write(struct xfs_mount *mp); void xfs_log_unmount(struct xfs_mount *mp); @@ -177,9 +190,16 @@ int xfs_log_need_covered(struct xfs_mount *mp); void xlog_iodone(struct xfs_buf *); -struct xlog_ticket * xfs_log_ticket_get(struct xlog_ticket *ticket); +struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); void xfs_log_ticket_put(struct xlog_ticket *ticket); +xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp); + +int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, + struct xfs_log_vec *log_vector, + xfs_lsn_t *commit_lsn, int flags); +bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); + #endif diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c new file mode 100644 index 000000000000..bb17cc044bf3 --- /dev/null +++ b/fs/xfs/xfs_log_cil.c @@ -0,0 +1,725 @@ +/* + * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_log_priv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_error.h" +#include "xfs_alloc.h" + +/* + * Perform initial CIL structure initialisation. If the CIL is not + * enabled in this filesystem, ensure the log->l_cilp is null so + * we can check this conditional to determine if we are doing delayed + * logging or not. + */ +int +xlog_cil_init( + struct log *log) +{ + struct xfs_cil *cil; + struct xfs_cil_ctx *ctx; + + log->l_cilp = NULL; + if (!(log->l_mp->m_flags & XFS_MOUNT_DELAYLOG)) + return 0; + + cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); + if (!cil) + return ENOMEM; + + ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL); + if (!ctx) { + kmem_free(cil); + return ENOMEM; + } + + INIT_LIST_HEAD(&cil->xc_cil); + INIT_LIST_HEAD(&cil->xc_committing); + spin_lock_init(&cil->xc_cil_lock); + init_rwsem(&cil->xc_ctx_lock); + sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait"); + + INIT_LIST_HEAD(&ctx->committing); + INIT_LIST_HEAD(&ctx->busy_extents); + ctx->sequence = 1; + ctx->cil = cil; + cil->xc_ctx = ctx; + + cil->xc_log = log; + log->l_cilp = cil; + return 0; +} + +void +xlog_cil_destroy( + struct log *log) +{ + if (!log->l_cilp) + return; + + if (log->l_cilp->xc_ctx) { + if (log->l_cilp->xc_ctx->ticket) + xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket); + kmem_free(log->l_cilp->xc_ctx); + } + + ASSERT(list_empty(&log->l_cilp->xc_cil)); + kmem_free(log->l_cilp); +} + +/* + * Allocate a new ticket. Failing to get a new ticket makes it really hard to + * recover, so we don't allow failure here. Also, we allocate in a context that + * we don't want to be issuing transactions from, so we need to tell the + * allocation code this as well. + * + * We don't reserve any space for the ticket - we are going to steal whatever + * space we require from transactions as they commit. To ensure we reserve all + * the space required, we need to set the current reservation of the ticket to + * zero so that we know to steal the initial transaction overhead from the + * first transaction commit. + */ +static struct xlog_ticket * +xlog_cil_ticket_alloc( + struct log *log) +{ + struct xlog_ticket *tic; + + tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0, + KM_SLEEP|KM_NOFS); + tic->t_trans_type = XFS_TRANS_CHECKPOINT; + + /* + * set the current reservation to zero so we know to steal the basic + * transaction overhead reservation from the first transaction commit. + */ + tic->t_curr_res = 0; + return tic; +} + +/* + * After the first stage of log recovery is done, we know where the head and + * tail of the log are. We need this log initialisation done before we can + * initialise the first CIL checkpoint context. + * + * Here we allocate a log ticket to track space usage during a CIL push. This + * ticket is passed to xlog_write() directly so that we don't slowly leak log + * space by failing to account for space used by log headers and additional + * region headers for split regions. + */ +void +xlog_cil_init_post_recovery( + struct log *log) +{ + if (!log->l_cilp) + return; + + log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log); + log->l_cilp->xc_ctx->sequence = 1; + log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle, + log->l_curr_block); +} + +/* + * Insert the log item into the CIL and calculate the difference in space + * consumed by the item. Add the space to the checkpoint ticket and calculate + * if the change requires additional log metadata. If it does, take that space + * as well. Remove the amount of space we addded to the checkpoint ticket from + * the current transaction ticket so that the accounting works out correctly. + * + * If this is the first time the item is being placed into the CIL in this + * context, pin it so it can't be written to disk until the CIL is flushed to + * the iclog and the iclog written to disk. + */ +static void +xlog_cil_insert( + struct log *log, + struct xlog_ticket *ticket, + struct xfs_log_item *item, + struct xfs_log_vec *lv) +{ + struct xfs_cil *cil = log->l_cilp; + struct xfs_log_vec *old = lv->lv_item->li_lv; + struct xfs_cil_ctx *ctx = cil->xc_ctx; + int len; + int diff_iovecs; + int iclog_space; + + if (old) { + /* existing lv on log item, space used is a delta */ + ASSERT(!list_empty(&item->li_cil)); + ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs); + + len = lv->lv_buf_len - old->lv_buf_len; + diff_iovecs = lv->lv_niovecs - old->lv_niovecs; + kmem_free(old->lv_buf); + kmem_free(old); + } else { + /* new lv, must pin the log item */ + ASSERT(!lv->lv_item->li_lv); + ASSERT(list_empty(&item->li_cil)); + + len = lv->lv_buf_len; + diff_iovecs = lv->lv_niovecs; + IOP_PIN(lv->lv_item); + + } + len += diff_iovecs * sizeof(xlog_op_header_t); + + /* attach new log vector to log item */ + lv->lv_item->li_lv = lv; + + spin_lock(&cil->xc_cil_lock); + list_move_tail(&item->li_cil, &cil->xc_cil); + ctx->nvecs += diff_iovecs; + + /* + * If this is the first time the item is being committed to the CIL, + * store the sequence number on the log item so we can tell + * in future commits whether this is the first checkpoint the item is + * being committed into. + */ + if (!item->li_seq) + item->li_seq = ctx->sequence; + + /* + * Now transfer enough transaction reservation to the context ticket + * for the checkpoint. The context ticket is special - the unit + * reservation has to grow as well as the current reservation as we + * steal from tickets so we can correctly determine the space used + * during the transaction commit. + */ + if (ctx->ticket->t_curr_res == 0) { + /* first commit in checkpoint, steal the header reservation */ + ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len); + ctx->ticket->t_curr_res = ctx->ticket->t_unit_res; + ticket->t_curr_res -= ctx->ticket->t_unit_res; + } + + /* do we need space for more log record headers? */ + iclog_space = log->l_iclog_size - log->l_iclog_hsize; + if (len > 0 && (ctx->space_used / iclog_space != + (ctx->space_used + len) / iclog_space)) { + int hdrs; + + hdrs = (len + iclog_space - 1) / iclog_space; + /* need to take into account split region headers, too */ + hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header); + ctx->ticket->t_unit_res += hdrs; + ctx->ticket->t_curr_res += hdrs; + ticket->t_curr_res -= hdrs; + ASSERT(ticket->t_curr_res >= len); + } + ticket->t_curr_res -= len; + ctx->space_used += len; + + spin_unlock(&cil->xc_cil_lock); +} + +/* + * Format log item into a flat buffers + * + * For delayed logging, we need to hold a formatted buffer containing all the + * changes on the log item. This enables us to relog the item in memory and + * write it out asynchronously without needing to relock the object that was + * modified at the time it gets written into the iclog. + * + * This function builds a vector for the changes in each log item in the + * transaction. It then works out the length of the buffer needed for each log + * item, allocates them and formats the vector for the item into the buffer. + * The buffer is then attached to the log item are then inserted into the + * Committed Item List for tracking until the next checkpoint is written out. + * + * We don't set up region headers during this process; we simply copy the + * regions into the flat buffer. We can do this because we still have to do a + * formatting step to write the regions into the iclog buffer. Writing the + * ophdrs during the iclog write means that we can support splitting large + * regions across iclog boundares without needing a change in the format of the + * item/region encapsulation. + * + * Hence what we need to do now is change the rewrite the vector array to point + * to the copied region inside the buffer we just allocated. This allows us to + * format the regions into the iclog as though they are being formatted + * directly out of the objects themselves. + */ +static void +xlog_cil_format_items( + struct log *log, + struct xfs_log_vec *log_vector, + struct xlog_ticket *ticket, + xfs_lsn_t *start_lsn) +{ + struct xfs_log_vec *lv; + + if (start_lsn) + *start_lsn = log->l_cilp->xc_ctx->sequence; + + ASSERT(log_vector); + for (lv = log_vector; lv; lv = lv->lv_next) { + void *ptr; + int index; + int len = 0; + + /* build the vector array and calculate it's length */ + IOP_FORMAT(lv->lv_item, lv->lv_iovecp); + for (index = 0; index < lv->lv_niovecs; index++) + len += lv->lv_iovecp[index].i_len; + + lv->lv_buf_len = len; + lv->lv_buf = kmem_zalloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS); + ptr = lv->lv_buf; + + for (index = 0; index < lv->lv_niovecs; index++) { + struct xfs_log_iovec *vec = &lv->lv_iovecp[index]; + + memcpy(ptr, vec->i_addr, vec->i_len); + vec->i_addr = ptr; + ptr += vec->i_len; + } + ASSERT(ptr == lv->lv_buf + lv->lv_buf_len); + + xlog_cil_insert(log, ticket, lv->lv_item, lv); + } +} + +static void +xlog_cil_free_logvec( + struct xfs_log_vec *log_vector) +{ + struct xfs_log_vec *lv; + + for (lv = log_vector; lv; ) { + struct xfs_log_vec *next = lv->lv_next; + kmem_free(lv->lv_buf); + kmem_free(lv); + lv = next; + } +} + +/* + * Commit a transaction with the given vector to the Committed Item List. + * + * To do this, we need to format the item, pin it in memory if required and + * account for the space used by the transaction. Once we have done that we + * need to release the unused reservation for the transaction, attach the + * transaction to the checkpoint context so we carry the busy extents through + * to checkpoint completion, and then unlock all the items in the transaction. + * + * For more specific information about the order of operations in + * xfs_log_commit_cil() please refer to the comments in + * xfs_trans_commit_iclog(). + * + * Called with the context lock already held in read mode to lock out + * background commit, returns without it held once background commits are + * allowed again. + */ +int +xfs_log_commit_cil( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_log_vec *log_vector, + xfs_lsn_t *commit_lsn, + int flags) +{ + struct log *log = mp->m_log; + int log_flags = 0; + int push = 0; + + if (flags & XFS_TRANS_RELEASE_LOG_RES) + log_flags = XFS_LOG_REL_PERM_RESERV; + + if (XLOG_FORCED_SHUTDOWN(log)) { + xlog_cil_free_logvec(log_vector); + return XFS_ERROR(EIO); + } + + /* lock out background commit */ + down_read(&log->l_cilp->xc_ctx_lock); + xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn); + + /* check we didn't blow the reservation */ + if (tp->t_ticket->t_curr_res < 0) + xlog_print_tic_res(log->l_mp, tp->t_ticket); + + /* attach the transaction to the CIL if it has any busy extents */ + if (!list_empty(&tp->t_busy)) { + spin_lock(&log->l_cilp->xc_cil_lock); + list_splice_init(&tp->t_busy, + &log->l_cilp->xc_ctx->busy_extents); + spin_unlock(&log->l_cilp->xc_cil_lock); + } + + tp->t_commit_lsn = *commit_lsn; + xfs_log_done(mp, tp->t_ticket, NULL, log_flags); + xfs_trans_unreserve_and_mod_sb(tp); + + /* check for background commit before unlock */ + if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log)) + push = 1; + up_read(&log->l_cilp->xc_ctx_lock); + + /* + * We need to push CIL every so often so we don't cache more than we + * can fit in the log. The limit really is that a checkpoint can't be + * more than half the log (the current checkpoint is not allowed to + * overwrite the previous checkpoint), but commit latency and memory + * usage limit this to a smaller size in most cases. + */ + if (push) + xlog_cil_push(log, 0); + return 0; +} + +/* + * Mark all items committed and clear busy extents. We free the log vector + * chains in a separate pass so that we unpin the log items as quickly as + * possible. + */ +static void +xlog_cil_committed( + void *args, + int abort) +{ + struct xfs_cil_ctx *ctx = args; + struct xfs_log_vec *lv; + int abortflag = abort ? XFS_LI_ABORTED : 0; + struct xfs_busy_extent *busyp, *n; + + /* unpin all the log items */ + for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) { + xfs_trans_item_committed(lv->lv_item, ctx->start_lsn, + abortflag); + } + + list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list) + xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp); + + spin_lock(&ctx->cil->xc_cil_lock); + list_del(&ctx->committing); + spin_unlock(&ctx->cil->xc_cil_lock); + + xlog_cil_free_logvec(ctx->lv_chain); + kmem_free(ctx); +} + +/* + * Push the Committed Item List to the log. If the push_now flag is not set, + * then it is a background flush and so we can chose to ignore it. + */ +int +xlog_cil_push( + struct log *log, + int push_now) +{ + struct xfs_cil *cil = log->l_cilp; + struct xfs_log_vec *lv; + struct xfs_cil_ctx *ctx; + struct xfs_cil_ctx *new_ctx; + struct xlog_in_core *commit_iclog; + struct xlog_ticket *tic; + int num_lv; + int num_iovecs; + int len; + int error = 0; + struct xfs_trans_header thdr; + struct xfs_log_iovec lhdr; + struct xfs_log_vec lvhdr = { NULL }; + xfs_lsn_t commit_lsn; + + if (!cil) + return 0; + + new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); + new_ctx->ticket = xlog_cil_ticket_alloc(log); + + /* lock out transaction commit, but don't block on background push */ + if (!down_write_trylock(&cil->xc_ctx_lock)) { + if (!push_now) + goto out_free_ticket; + down_write(&cil->xc_ctx_lock); + } + ctx = cil->xc_ctx; + + /* check if we've anything to push */ + if (list_empty(&cil->xc_cil)) + goto out_skip; + + /* check for spurious background flush */ + if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) + goto out_skip; + + /* + * pull all the log vectors off the items in the CIL, and + * remove the items from the CIL. We don't need the CIL lock + * here because it's only needed on the transaction commit + * side which is currently locked out by the flush lock. + */ + lv = NULL; + num_lv = 0; + num_iovecs = 0; + len = 0; + while (!list_empty(&cil->xc_cil)) { + struct xfs_log_item *item; + int i; + + item = list_first_entry(&cil->xc_cil, + struct xfs_log_item, li_cil); + list_del_init(&item->li_cil); + if (!ctx->lv_chain) + ctx->lv_chain = item->li_lv; + else + lv->lv_next = item->li_lv; + lv = item->li_lv; + item->li_lv = NULL; + + num_lv++; + num_iovecs += lv->lv_niovecs; + for (i = 0; i < lv->lv_niovecs; i++) + len += lv->lv_iovecp[i].i_len; + } + + /* + * initialise the new context and attach it to the CIL. Then attach + * the current context to the CIL committing lsit so it can be found + * during log forces to extract the commit lsn of the sequence that + * needs to be forced. + */ + INIT_LIST_HEAD(&new_ctx->committing); + INIT_LIST_HEAD(&new_ctx->busy_extents); + new_ctx->sequence = ctx->sequence + 1; + new_ctx->cil = cil; + cil->xc_ctx = new_ctx; + + /* + * The switch is now done, so we can drop the context lock and move out + * of a shared context. We can't just go straight to the commit record, + * though - we need to synchronise with previous and future commits so + * that the commit records are correctly ordered in the log to ensure + * that we process items during log IO completion in the correct order. + * + * For example, if we get an EFI in one checkpoint and the EFD in the + * next (e.g. due to log forces), we do not want the checkpoint with + * the EFD to be committed before the checkpoint with the EFI. Hence + * we must strictly order the commit records of the checkpoints so + * that: a) the checkpoint callbacks are attached to the iclogs in the + * correct order; and b) the checkpoints are replayed in correct order + * in log recovery. + * + * Hence we need to add this context to the committing context list so + * that higher sequences will wait for us to write out a commit record + * before they do. + */ + spin_lock(&cil->xc_cil_lock); + list_add(&ctx->committing, &cil->xc_committing); + spin_unlock(&cil->xc_cil_lock); + up_write(&cil->xc_ctx_lock); + + /* + * Build a checkpoint transaction header and write it to the log to + * begin the transaction. We need to account for the space used by the + * transaction header here as it is not accounted for in xlog_write(). + * + * The LSN we need to pass to the log items on transaction commit is + * the LSN reported by the first log vector write. If we use the commit + * record lsn then we can move the tail beyond the grant write head. + */ + tic = ctx->ticket; + thdr.th_magic = XFS_TRANS_HEADER_MAGIC; + thdr.th_type = XFS_TRANS_CHECKPOINT; + thdr.th_tid = tic->t_tid; + thdr.th_num_items = num_iovecs; + lhdr.i_addr = (xfs_caddr_t)&thdr; + lhdr.i_len = sizeof(xfs_trans_header_t); + lhdr.i_type = XLOG_REG_TYPE_TRANSHDR; + tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t); + + lvhdr.lv_niovecs = 1; + lvhdr.lv_iovecp = &lhdr; + lvhdr.lv_next = ctx->lv_chain; + + error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0); + if (error) + goto out_abort; + + /* + * now that we've written the checkpoint into the log, strictly + * order the commit records so replay will get them in the right order. + */ +restart: + spin_lock(&cil->xc_cil_lock); + list_for_each_entry(new_ctx, &cil->xc_committing, committing) { + /* + * Higher sequences will wait for this one so skip them. + * Don't wait for own own sequence, either. + */ + if (new_ctx->sequence >= ctx->sequence) + continue; + if (!new_ctx->commit_lsn) { + /* + * It is still being pushed! Wait for the push to + * complete, then start again from the beginning. + */ + sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); + goto restart; + } + } + spin_unlock(&cil->xc_cil_lock); + + commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); + if (error || commit_lsn == -1) + goto out_abort; + + /* attach all the transactions w/ busy extents to iclog */ + ctx->log_cb.cb_func = xlog_cil_committed; + ctx->log_cb.cb_arg = ctx; + error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb); + if (error) + goto out_abort; + + /* + * now the checkpoint commit is complete and we've attached the + * callbacks to the iclog we can assign the commit LSN to the context + * and wake up anyone who is waiting for the commit to complete. + */ + spin_lock(&cil->xc_cil_lock); + ctx->commit_lsn = commit_lsn; + sv_broadcast(&cil->xc_commit_wait); + spin_unlock(&cil->xc_cil_lock); + + /* release the hounds! */ + return xfs_log_release_iclog(log->l_mp, commit_iclog); + +out_skip: + up_write(&cil->xc_ctx_lock); +out_free_ticket: + xfs_log_ticket_put(new_ctx->ticket); + kmem_free(new_ctx); + return 0; + +out_abort: + xlog_cil_committed(ctx, XFS_LI_ABORTED); + return XFS_ERROR(EIO); +} + +/* + * Conditionally push the CIL based on the sequence passed in. + * + * We only need to push if we haven't already pushed the sequence + * number given. Hence the only time we will trigger a push here is + * if the push sequence is the same as the current context. + * + * We return the current commit lsn to allow the callers to determine if a + * iclog flush is necessary following this call. + * + * XXX: Initially, just push the CIL unconditionally and return whatever + * commit lsn is there. It'll be empty, so this is broken for now. + */ +xfs_lsn_t +xlog_cil_push_lsn( + struct log *log, + xfs_lsn_t push_seq) +{ + struct xfs_cil *cil = log->l_cilp; + struct xfs_cil_ctx *ctx; + xfs_lsn_t commit_lsn = NULLCOMMITLSN; + +restart: + down_write(&cil->xc_ctx_lock); + ASSERT(push_seq <= cil->xc_ctx->sequence); + + /* check to see if we need to force out the current context */ + if (push_seq == cil->xc_ctx->sequence) { + up_write(&cil->xc_ctx_lock); + xlog_cil_push(log, 1); + goto restart; + } + + /* + * See if we can find a previous sequence still committing. + * We can drop the flush lock as soon as we have the cil lock + * because we are now only comparing contexts protected by + * the cil lock. + * + * We need to wait for all previous sequence commits to complete + * before allowing the force of push_seq to go ahead. Hence block + * on commits for those as well. + */ + spin_lock(&cil->xc_cil_lock); + up_write(&cil->xc_ctx_lock); + list_for_each_entry(ctx, &cil->xc_committing, committing) { + if (ctx->sequence > push_seq) + continue; + if (!ctx->commit_lsn) { + /* + * It is still being pushed! Wait for the push to + * complete, then start again from the beginning. + */ + sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); + goto restart; + } + if (ctx->sequence != push_seq) + continue; + /* found it! */ + commit_lsn = ctx->commit_lsn; + } + spin_unlock(&cil->xc_cil_lock); + return commit_lsn; +} + +/* + * Check if the current log item was first committed in this sequence. + * We can't rely on just the log item being in the CIL, we have to check + * the recorded commit sequence number. + * + * Note: for this to be used in a non-racy manner, it has to be called with + * CIL flushing locked out. As a result, it should only be used during the + * transaction commit process when deciding what to format into the item. + */ +bool +xfs_log_item_in_current_chkpt( + struct xfs_log_item *lip) +{ + struct xfs_cil_ctx *ctx; + + if (!(lip->li_mountp->m_flags & XFS_MOUNT_DELAYLOG)) + return false; + if (list_empty(&lip->li_cil)) + return false; + + ctx = lip->li_mountp->m_log->l_cilp->xc_ctx; + + /* + * li_seq is written on the first commit of a log item to record the + * first checkpoint it is written to. Hence if it is different to the + * current sequence, we're in a new checkpoint. + */ + if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0) + return false; + return true; +} diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index d55662db7077..8c072618965c 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -152,8 +152,6 @@ static inline uint xlog_get_client_id(__be32 i) #define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ #define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being shutdown */ -typedef __uint32_t xlog_tid_t; - #ifdef __KERNEL__ /* @@ -379,6 +377,99 @@ typedef struct xlog_in_core { } xlog_in_core_t; /* + * The CIL context is used to aggregate per-transaction details as well be + * passed to the iclog for checkpoint post-commit processing. After being + * passed to the iclog, another context needs to be allocated for tracking the + * next set of transactions to be aggregated into a checkpoint. + */ +struct xfs_cil; + +struct xfs_cil_ctx { + struct xfs_cil *cil; + xfs_lsn_t sequence; /* chkpt sequence # */ + xfs_lsn_t start_lsn; /* first LSN of chkpt commit */ + xfs_lsn_t commit_lsn; /* chkpt commit record lsn */ + struct xlog_ticket *ticket; /* chkpt ticket */ + int nvecs; /* number of regions */ + int space_used; /* aggregate size of regions */ + struct list_head busy_extents; /* busy extents in chkpt */ + struct xfs_log_vec *lv_chain; /* logvecs being pushed */ + xfs_log_callback_t log_cb; /* completion callback hook. */ + struct list_head committing; /* ctx committing list */ +}; + +/* + * Committed Item List structure + * + * This structure is used to track log items that have been committed but not + * yet written into the log. It is used only when the delayed logging mount + * option is enabled. + * + * This structure tracks the list of committing checkpoint contexts so + * we can avoid the problem of having to hold out new transactions during a + * flush until we have a the commit record LSN of the checkpoint. We can + * traverse the list of committing contexts in xlog_cil_push_lsn() to find a + * sequence match and extract the commit LSN directly from there. If the + * checkpoint is still in the process of committing, we can block waiting for + * the commit LSN to be determined as well. This should make synchronous + * operations almost as efficient as the old logging methods. + */ +struct xfs_cil { + struct log *xc_log; + struct list_head xc_cil; + spinlock_t xc_cil_lock; + struct xfs_cil_ctx *xc_ctx; + struct rw_semaphore xc_ctx_lock; + struct list_head xc_committing; + sv_t xc_commit_wait; +}; + +/* + * The amount of log space we should the CIL to aggregate is difficult to size. + * Whatever we chose we have to make we can get a reservation for the log space + * effectively, that it is large enough to capture sufficient relogging to + * reduce log buffer IO significantly, but it is not too large for the log or + * induces too much latency when writing out through the iclogs. We track both + * space consumed and the number of vectors in the checkpoint context, so we + * need to decide which to use for limiting. + * + * Every log buffer we write out during a push needs a header reserved, which + * is at least one sector and more for v2 logs. Hence we need a reservation of + * at least 512 bytes per 32k of log space just for the LR headers. That means + * 16KB of reservation per megabyte of delayed logging space we will consume, + * plus various headers. The number of headers will vary based on the num of + * io vectors, so limiting on a specific number of vectors is going to result + * in transactions of varying size. IOWs, it is more consistent to track and + * limit space consumed in the log rather than by the number of objects being + * logged in order to prevent checkpoint ticket overruns. + * + * Further, use of static reservations through the log grant mechanism is + * problematic. It introduces a lot of complexity (e.g. reserve grant vs write + * grant) and a significant deadlock potential because regranting write space + * can block on log pushes. Hence if we have to regrant log space during a log + * push, we can deadlock. + * + * However, we can avoid this by use of a dynamic "reservation stealing" + * technique during transaction commit whereby unused reservation space in the + * transaction ticket is transferred to the CIL ctx commit ticket to cover the + * space needed by the checkpoint transaction. This means that we never need to + * specifically reserve space for the CIL checkpoint transaction, nor do we + * need to regrant space once the checkpoint completes. This also means the + * checkpoint transaction ticket is specific to the checkpoint context, rather + * than the CIL itself. + * + * With dynamic reservations, we can basically make up arbitrary limits for the + * checkpoint size so long as they don't violate any other size rules. Hence + * the initial maximum size for the checkpoint transaction will be set to a + * quarter of the log or 8MB, which ever is smaller. 8MB is an arbitrary limit + * right now based on the latency of writing out a large amount of data through + * the circular iclog buffers. + */ + +#define XLOG_CIL_SPACE_LIMIT(log) \ + (min((log->l_logsize >> 2), (8 * 1024 * 1024))) + +/* * The reservation head lsn is not made up of a cycle number and block number. * Instead, it uses a cycle number and byte number. Logs don't expect to * overflow 31 bits worth of byte offset, so using a byte number will mean @@ -388,6 +479,7 @@ typedef struct log { /* The following fields don't need locking */ struct xfs_mount *l_mp; /* mount point */ struct xfs_ail *l_ailp; /* AIL log is working with */ + struct xfs_cil *l_cilp; /* CIL log is working with */ struct xfs_buf *l_xbuf; /* extra buffer for log * wrapping */ struct xfs_buftarg *l_targ; /* buftarg of log */ @@ -396,9 +488,7 @@ typedef struct log { struct xfs_buf_cancel **l_buf_cancel_table; int l_iclog_hsize; /* size of iclog header */ int l_iclog_heads; /* # of iclog header sectors */ - uint l_sectbb_log; /* log2 of sector size in BBs */ - uint l_sectbb_mask; /* sector size (in BBs) - * alignment mask */ + uint l_sectBBsize; /* sector size in BBs (2^n) */ int l_iclog_size; /* size of log in bytes */ int l_iclog_size_log; /* log power size of log */ int l_iclog_bufs; /* number of iclog buffers */ @@ -440,19 +530,40 @@ typedef struct log { #define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) - /* common routines */ extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); -extern int xlog_find_tail(xlog_t *log, - xfs_daddr_t *head_blk, - xfs_daddr_t *tail_blk); extern int xlog_recover(xlog_t *log); extern int xlog_recover_finish(xlog_t *log); extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); -extern struct xfs_buf *xlog_get_bp(xlog_t *, int); -extern void xlog_put_bp(struct xfs_buf *); -extern kmem_zone_t *xfs_log_ticket_zone; +extern kmem_zone_t *xfs_log_ticket_zone; +struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes, + int count, char client, uint xflags, + int alloc_flags); + + +static inline void +xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) +{ + *ptr += bytes; + *len -= bytes; + *off += bytes; +} + +void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); +int xlog_write(struct log *log, struct xfs_log_vec *log_vector, + struct xlog_ticket *tic, xfs_lsn_t *start_lsn, + xlog_in_core_t **commit_iclog, uint flags); + +/* + * Committed Item List interfaces + */ +int xlog_cil_init(struct log *log); +void xlog_cil_init_post_recovery(struct log *log); +void xlog_cil_destroy(struct log *log); + +int xlog_cil_push(struct log *log, int push_now); +xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence); /* * Unmount record type is used as a pseudo transaction type for the ticket. diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 69ac2e5ef20c..9ac5cfab27b9 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -50,51 +50,81 @@ STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *); STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t); -STATIC void xlog_recover_insert_item_backq(xlog_recover_item_t **q, - xlog_recover_item_t *item); #if defined(DEBUG) STATIC void xlog_recover_check_summary(xlog_t *); #else #define xlog_recover_check_summary(log) #endif - /* * Sector aligned buffer routines for buffer create/read/write/access */ -#define XLOG_SECTOR_ROUNDUP_BBCOUNT(log, bbs) \ - ( ((log)->l_sectbb_mask && (bbs & (log)->l_sectbb_mask)) ? \ - ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) ) -#define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask) +/* + * Verify the given count of basic blocks is valid number of blocks + * to specify for an operation involving the given XFS log buffer. + * Returns nonzero if the count is valid, 0 otherwise. + */ -xfs_buf_t * +static inline int +xlog_buf_bbcount_valid( + xlog_t *log, + int bbcount) +{ + return bbcount > 0 && bbcount <= log->l_logBBsize; +} + +/* + * Allocate a buffer to hold log data. The buffer needs to be able + * to map to a range of nbblks basic blocks at any valid (basic + * block) offset within the log. + */ +STATIC xfs_buf_t * xlog_get_bp( xlog_t *log, int nbblks) { - if (nbblks <= 0 || nbblks > log->l_logBBsize) { - xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); - XFS_ERROR_REPORT("xlog_get_bp(1)", - XFS_ERRLEVEL_HIGH, log->l_mp); + if (!xlog_buf_bbcount_valid(log, nbblks)) { + xlog_warn("XFS: Invalid block length (0x%x) given for buffer", + nbblks); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); return NULL; } - if (log->l_sectbb_log) { - if (nbblks > 1) - nbblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); - nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); - } + /* + * We do log I/O in units of log sectors (a power-of-2 + * multiple of the basic block size), so we round up the + * requested size to acommodate the basic blocks required + * for complete log sectors. + * + * In addition, the buffer may be used for a non-sector- + * aligned block offset, in which case an I/O of the + * requested size could extend beyond the end of the + * buffer. If the requested size is only 1 basic block it + * will never straddle a sector boundary, so this won't be + * an issue. Nor will this be a problem if the log I/O is + * done in basic blocks (sector size 1). But otherwise we + * extend the buffer by one extra log sector to ensure + * there's space to accomodate this possiblility. + */ + if (nbblks > 1 && log->l_sectBBsize > 1) + nbblks += log->l_sectBBsize; + nbblks = round_up(nbblks, log->l_sectBBsize); + return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp); } -void +STATIC void xlog_put_bp( xfs_buf_t *bp) { xfs_buf_free(bp); } +/* + * Return the address of the start of the given block number's data + * in a log buffer. The buffer covers a log sector-aligned region. + */ STATIC xfs_caddr_t xlog_align( xlog_t *log, @@ -102,15 +132,10 @@ xlog_align( int nbblks, xfs_buf_t *bp) { - xfs_caddr_t ptr; - - if (!log->l_sectbb_log) - return XFS_BUF_PTR(bp); + xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1); - ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask); - ASSERT(XFS_BUF_SIZE(bp) >= - BBTOB(nbblks + (blk_no & log->l_sectbb_mask))); - return ptr; + ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp)); + return XFS_BUF_PTR(bp) + BBTOB(offset); } @@ -126,21 +151,18 @@ xlog_bread_noalign( { int error; - if (nbblks <= 0 || nbblks > log->l_logBBsize) { - xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); - XFS_ERROR_REPORT("xlog_bread(1)", - XFS_ERRLEVEL_HIGH, log->l_mp); + if (!xlog_buf_bbcount_valid(log, nbblks)) { + xlog_warn("XFS: Invalid block length (0x%x) given for buffer", + nbblks); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); return EFSCORRUPTED; } - if (log->l_sectbb_log) { - blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); - nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); - } + blk_no = round_down(blk_no, log->l_sectBBsize); + nbblks = round_up(nbblks, log->l_sectBBsize); ASSERT(nbblks > 0); ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); - ASSERT(bp); XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); XFS_BUF_READ(bp); @@ -188,17 +210,15 @@ xlog_bwrite( { int error; - if (nbblks <= 0 || nbblks > log->l_logBBsize) { - xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); - XFS_ERROR_REPORT("xlog_bwrite(1)", - XFS_ERRLEVEL_HIGH, log->l_mp); + if (!xlog_buf_bbcount_valid(log, nbblks)) { + xlog_warn("XFS: Invalid block length (0x%x) given for buffer", + nbblks); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); return EFSCORRUPTED; } - if (log->l_sectbb_log) { - blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); - nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); - } + blk_no = round_down(blk_no, log->l_sectBBsize); + nbblks = round_up(nbblks, log->l_sectBBsize); ASSERT(nbblks > 0); ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); @@ -329,39 +349,38 @@ xlog_find_cycle_start( { xfs_caddr_t offset; xfs_daddr_t mid_blk; + xfs_daddr_t end_blk; uint mid_cycle; int error; - mid_blk = BLK_AVG(first_blk, *last_blk); - while (mid_blk != first_blk && mid_blk != *last_blk) { + end_blk = *last_blk; + mid_blk = BLK_AVG(first_blk, end_blk); + while (mid_blk != first_blk && mid_blk != end_blk) { error = xlog_bread(log, mid_blk, 1, bp, &offset); if (error) return error; mid_cycle = xlog_get_cycle(offset); - if (mid_cycle == cycle) { - *last_blk = mid_blk; - /* last_half_cycle == mid_cycle */ - } else { - first_blk = mid_blk; - /* first_half_cycle == mid_cycle */ - } - mid_blk = BLK_AVG(first_blk, *last_blk); + if (mid_cycle == cycle) + end_blk = mid_blk; /* last_half_cycle == mid_cycle */ + else + first_blk = mid_blk; /* first_half_cycle == mid_cycle */ + mid_blk = BLK_AVG(first_blk, end_blk); } - ASSERT((mid_blk == first_blk && mid_blk+1 == *last_blk) || - (mid_blk == *last_blk && mid_blk-1 == first_blk)); + ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) || + (mid_blk == end_blk && mid_blk-1 == first_blk)); + + *last_blk = end_blk; return 0; } /* - * Check that the range of blocks does not contain the cycle number - * given. The scan needs to occur from front to back and the ptr into the - * region must be updated since a later routine will need to perform another - * test. If the region is completely good, we end up returning the same - * last block number. - * - * Set blkno to -1 if we encounter no errors. This is an invalid block number - * since we don't ever expect logs to get this large. + * Check that a range of blocks does not contain stop_on_cycle_no. + * Fill in *new_blk with the block offset where such a block is + * found, or with -1 (an invalid block number) if there is no such + * block in the range. The scan needs to occur from front to back + * and the pointer into the region must be updated since a later + * routine will need to perform another test. */ STATIC int xlog_find_verify_cycle( @@ -378,12 +397,16 @@ xlog_find_verify_cycle( xfs_caddr_t buf = NULL; int error = 0; + /* + * Greedily allocate a buffer big enough to handle the full + * range of basic blocks we'll be examining. If that fails, + * try a smaller size. We need to be able to read at least + * a log sector, or we're out of luck. + */ bufblks = 1 << ffs(nbblks); - while (!(bp = xlog_get_bp(log, bufblks))) { - /* can't get enough memory to do everything in one big buffer */ bufblks >>= 1; - if (bufblks <= log->l_sectbb_log) + if (bufblks < log->l_sectBBsize) return ENOMEM; } @@ -631,7 +654,7 @@ xlog_find_head( * In this case we want to find the first block with cycle * number matching last_half_cycle. We expect the log to be * some variation on - * x + 1 ... | x ... + * x + 1 ... | x ... | x * The first block with cycle number x (last_half_cycle) will * be where the new head belongs. First we do a binary search * for the first occurrence of last_half_cycle. The binary @@ -641,11 +664,13 @@ xlog_find_head( * the log, then we look for occurrences of last_half_cycle - 1 * at the end of the log. The cases we're looking for look * like - * x + 1 ... | x | x + 1 | x ... - * ^ binary search stopped here + * v binary search stopped here + * x + 1 ... | x | x + 1 | x ... | x + * ^ but we want to locate this spot * or - * x + 1 ... | x ... | x - 1 | x * <---------> less than scan distance + * x + 1 ... | x ... | x - 1 | x + * ^ we want to locate this spot */ stop_on_cycle = last_half_cycle; if ((error = xlog_find_cycle_start(log, bp, first_blk, @@ -701,16 +726,16 @@ xlog_find_head( * certainly not the head of the log. By searching for * last_half_cycle-1 we accomplish that. */ - start_blk = log_bbnum - num_scan_bblks + head_blk; ASSERT(head_blk <= INT_MAX && - (xfs_daddr_t) num_scan_bblks - head_blk >= 0); + (xfs_daddr_t) num_scan_bblks >= head_blk); + start_blk = log_bbnum - (num_scan_bblks - head_blk); if ((error = xlog_find_verify_cycle(log, start_blk, num_scan_bblks - (int)head_blk, (stop_on_cycle - 1), &new_blk))) goto bp_err; if (new_blk != -1) { head_blk = new_blk; - goto bad_blk; + goto validate_head; } /* @@ -728,7 +753,7 @@ xlog_find_head( head_blk = new_blk; } - bad_blk: +validate_head: /* * Now we need to make sure head_blk is not pointing to a block in * the middle of a log record. @@ -750,7 +775,7 @@ xlog_find_head( if ((error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0)) == -1) { /* We hit the beginning of the log during our search */ - start_blk = log_bbnum - num_scan_bblks + head_blk; + start_blk = log_bbnum - (num_scan_bblks - head_blk); new_blk = log_bbnum; ASSERT(start_blk <= INT_MAX && (xfs_daddr_t) log_bbnum-start_blk >= 0); @@ -805,7 +830,7 @@ xlog_find_head( * We could speed up search by using current head_blk buffer, but it is not * available. */ -int +STATIC int xlog_find_tail( xlog_t *log, xfs_daddr_t *head_blk, @@ -835,12 +860,12 @@ xlog_find_tail( if (*head_blk == 0) { /* special case */ error = xlog_bread(log, 0, 1, bp, &offset); if (error) - goto bread_err; + goto done; if (xlog_get_cycle(offset) == 0) { *tail_blk = 0; /* leave all other log inited values alone */ - goto exit; + goto done; } } @@ -851,7 +876,7 @@ xlog_find_tail( for (i = (int)(*head_blk) - 1; i >= 0; i--) { error = xlog_bread(log, i, 1, bp, &offset); if (error) - goto bread_err; + goto done; if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) { found = 1; @@ -868,7 +893,7 @@ xlog_find_tail( for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) { error = xlog_bread(log, i, 1, bp, &offset); if (error) - goto bread_err; + goto done; if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) { @@ -943,7 +968,7 @@ xlog_find_tail( umount_data_blk = (i + hblks) % log->l_logBBsize; error = xlog_bread(log, umount_data_blk, 1, bp, &offset); if (error) - goto bread_err; + goto done; op_head = (xlog_op_header_t *)offset; if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { @@ -989,12 +1014,10 @@ xlog_find_tail( * But... if the -device- itself is readonly, just skip this. * We can't recover this device anyway, so it won't matter. */ - if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) { + if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) error = xlog_clear_stale_blocks(log, tail_lsn); - } -bread_err: -exit: +done: xlog_put_bp(bp); if (error) @@ -1154,16 +1177,22 @@ xlog_write_log_records( xfs_caddr_t offset; xfs_buf_t *bp; int balign, ealign; - int sectbb = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); + int sectbb = log->l_sectBBsize; int end_block = start_block + blocks; int bufblks; int error = 0; int i, j = 0; + /* + * Greedily allocate a buffer big enough to handle the full + * range of basic blocks to be written. If that fails, try + * a smaller size. We need to be able to write at least a + * log sector, or we're out of luck. + */ bufblks = 1 << ffs(blocks); while (!(bp = xlog_get_bp(log, bufblks))) { bufblks >>= 1; - if (bufblks <= log->l_sectbb_log) + if (bufblks < sectbb) return ENOMEM; } @@ -1171,7 +1200,7 @@ xlog_write_log_records( * the buffer in the starting sector not covered by the first * write below. */ - balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block); + balign = round_down(start_block, sectbb); if (balign != start_block) { error = xlog_bread_noalign(log, start_block, 1, bp); if (error) @@ -1190,7 +1219,7 @@ xlog_write_log_records( * the buffer in the final sector not covered by the write. * If this is the same sector as the above read, skip it. */ - ealign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, end_block); + ealign = round_down(end_block, sectbb); if (j == 0 && (start_block + endcount > ealign)) { offset = XFS_BUF_PTR(bp); balign = BBTOB(ealign - start_block); @@ -1367,40 +1396,50 @@ xlog_clear_stale_blocks( STATIC xlog_recover_t * xlog_recover_find_tid( - xlog_recover_t *q, + struct hlist_head *head, xlog_tid_t tid) { - xlog_recover_t *p = q; + xlog_recover_t *trans; + struct hlist_node *n; - while (p != NULL) { - if (p->r_log_tid == tid) - break; - p = p->r_next; + hlist_for_each_entry(trans, n, head, r_list) { + if (trans->r_log_tid == tid) + return trans; } - return p; + return NULL; } STATIC void -xlog_recover_put_hashq( - xlog_recover_t **q, - xlog_recover_t *trans) +xlog_recover_new_tid( + struct hlist_head *head, + xlog_tid_t tid, + xfs_lsn_t lsn) { - trans->r_next = *q; - *q = trans; + xlog_recover_t *trans; + + trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP); + trans->r_log_tid = tid; + trans->r_lsn = lsn; + INIT_LIST_HEAD(&trans->r_itemq); + + INIT_HLIST_NODE(&trans->r_list); + hlist_add_head(&trans->r_list, head); } STATIC void xlog_recover_add_item( - xlog_recover_item_t **itemq) + struct list_head *head) { xlog_recover_item_t *item; item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP); - xlog_recover_insert_item_backq(itemq, item); + INIT_LIST_HEAD(&item->ri_list); + list_add_tail(&item->ri_list, head); } STATIC int xlog_recover_add_to_cont_trans( + struct log *log, xlog_recover_t *trans, xfs_caddr_t dp, int len) @@ -1409,8 +1448,7 @@ xlog_recover_add_to_cont_trans( xfs_caddr_t ptr, old_ptr; int old_len; - item = trans->r_itemq; - if (item == NULL) { + if (list_empty(&trans->r_itemq)) { /* finish copying rest of trans header */ xlog_recover_add_item(&trans->r_itemq); ptr = (xfs_caddr_t) &trans->r_theader + @@ -1418,7 +1456,8 @@ xlog_recover_add_to_cont_trans( memcpy(ptr, dp, len); /* d, s, l */ return 0; } - item = item->ri_prev; + /* take the tail entry */ + item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; old_len = item->ri_buf[item->ri_cnt-1].i_len; @@ -1427,6 +1466,7 @@ xlog_recover_add_to_cont_trans( memcpy(&ptr[old_len], dp, len); /* d, s, l */ item->ri_buf[item->ri_cnt-1].i_len += len; item->ri_buf[item->ri_cnt-1].i_addr = ptr; + trace_xfs_log_recover_item_add_cont(log, trans, item, 0); return 0; } @@ -1445,6 +1485,7 @@ xlog_recover_add_to_cont_trans( */ STATIC int xlog_recover_add_to_trans( + struct log *log, xlog_recover_t *trans, xfs_caddr_t dp, int len) @@ -1455,8 +1496,7 @@ xlog_recover_add_to_trans( if (!len) return 0; - item = trans->r_itemq; - if (item == NULL) { + if (list_empty(&trans->r_itemq)) { /* we need to catch log corruptions here */ if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { xlog_warn("XFS: xlog_recover_add_to_trans: " @@ -1474,12 +1514,15 @@ xlog_recover_add_to_trans( memcpy(ptr, dp, len); in_f = (xfs_inode_log_format_t *)ptr; - if (item->ri_prev->ri_total != 0 && - item->ri_prev->ri_total == item->ri_prev->ri_cnt) { + /* take the tail entry */ + item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); + if (item->ri_total != 0 && + item->ri_total == item->ri_cnt) { + /* tail item is in use, get a new one */ xlog_recover_add_item(&trans->r_itemq); + item = list_entry(trans->r_itemq.prev, + xlog_recover_item_t, ri_list); } - item = trans->r_itemq; - item = item->ri_prev; if (item->ri_total == 0) { /* first region to be added */ if (in_f->ilf_size == 0 || @@ -1501,99 +1544,37 @@ xlog_recover_add_to_trans( item->ri_buf[item->ri_cnt].i_addr = ptr; item->ri_buf[item->ri_cnt].i_len = len; item->ri_cnt++; + trace_xfs_log_recover_item_add(log, trans, item, 0); return 0; } -STATIC void -xlog_recover_new_tid( - xlog_recover_t **q, - xlog_tid_t tid, - xfs_lsn_t lsn) -{ - xlog_recover_t *trans; - - trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP); - trans->r_log_tid = tid; - trans->r_lsn = lsn; - xlog_recover_put_hashq(q, trans); -} - -STATIC int -xlog_recover_unlink_tid( - xlog_recover_t **q, - xlog_recover_t *trans) -{ - xlog_recover_t *tp; - int found = 0; - - ASSERT(trans != NULL); - if (trans == *q) { - *q = (*q)->r_next; - } else { - tp = *q; - while (tp) { - if (tp->r_next == trans) { - found = 1; - break; - } - tp = tp->r_next; - } - if (!found) { - xlog_warn( - "XFS: xlog_recover_unlink_tid: trans not found"); - ASSERT(0); - return XFS_ERROR(EIO); - } - tp->r_next = tp->r_next->r_next; - } - return 0; -} - -STATIC void -xlog_recover_insert_item_backq( - xlog_recover_item_t **q, - xlog_recover_item_t *item) -{ - if (*q == NULL) { - item->ri_prev = item->ri_next = item; - *q = item; - } else { - item->ri_next = *q; - item->ri_prev = (*q)->ri_prev; - (*q)->ri_prev = item; - item->ri_prev->ri_next = item; - } -} - -STATIC void -xlog_recover_insert_item_frontq( - xlog_recover_item_t **q, - xlog_recover_item_t *item) -{ - xlog_recover_insert_item_backq(q, item); - *q = item; -} - +/* + * Sort the log items in the transaction. Cancelled buffers need + * to be put first so they are processed before any items that might + * modify the buffers. If they are cancelled, then the modifications + * don't need to be replayed. + */ STATIC int xlog_recover_reorder_trans( - xlog_recover_t *trans) + struct log *log, + xlog_recover_t *trans, + int pass) { - xlog_recover_item_t *first_item, *itemq, *itemq_next; - xfs_buf_log_format_t *buf_f; - ushort flags = 0; + xlog_recover_item_t *item, *n; + LIST_HEAD(sort_list); + + list_splice_init(&trans->r_itemq, &sort_list); + list_for_each_entry_safe(item, n, &sort_list, ri_list) { + xfs_buf_log_format_t *buf_f; - first_item = itemq = trans->r_itemq; - trans->r_itemq = NULL; - do { - itemq_next = itemq->ri_next; - buf_f = (xfs_buf_log_format_t *)itemq->ri_buf[0].i_addr; + buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr; - switch (ITEM_TYPE(itemq)) { + switch (ITEM_TYPE(item)) { case XFS_LI_BUF: - flags = buf_f->blf_flags; - if (!(flags & XFS_BLI_CANCEL)) { - xlog_recover_insert_item_frontq(&trans->r_itemq, - itemq); + if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { + trace_xfs_log_recover_item_reorder_head(log, + trans, item, pass); + list_move(&item->ri_list, &trans->r_itemq); break; } case XFS_LI_INODE: @@ -1601,7 +1582,9 @@ xlog_recover_reorder_trans( case XFS_LI_QUOTAOFF: case XFS_LI_EFD: case XFS_LI_EFI: - xlog_recover_insert_item_backq(&trans->r_itemq, itemq); + trace_xfs_log_recover_item_reorder_tail(log, + trans, item, pass); + list_move_tail(&item->ri_list, &trans->r_itemq); break; default: xlog_warn( @@ -1609,8 +1592,8 @@ xlog_recover_reorder_trans( ASSERT(0); return XFS_ERROR(EIO); } - itemq = itemq_next; - } while (first_item != itemq); + } + ASSERT(list_empty(&sort_list)); return 0; } @@ -1650,8 +1633,10 @@ xlog_recover_do_buffer_pass1( /* * If this isn't a cancel buffer item, then just return. */ - if (!(flags & XFS_BLI_CANCEL)) + if (!(flags & XFS_BLF_CANCEL)) { + trace_xfs_log_recover_buf_not_cancel(log, buf_f); return; + } /* * Insert an xfs_buf_cancel record into the hash table of @@ -1685,6 +1670,7 @@ xlog_recover_do_buffer_pass1( while (nextp != NULL) { if (nextp->bc_blkno == blkno && nextp->bc_len == len) { nextp->bc_refcount++; + trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f); return; } prevp = nextp; @@ -1698,13 +1684,14 @@ xlog_recover_do_buffer_pass1( bcp->bc_refcount = 1; bcp->bc_next = NULL; prevp->bc_next = bcp; + trace_xfs_log_recover_buf_cancel_add(log, buf_f); } /* * Check to see whether the buffer being recovered has a corresponding * entry in the buffer cancel record table. If it does then return 1 * so that it will be cancelled, otherwise return 0. If the buffer is - * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement + * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement * the refcount on the entry in the table and remove it from the table * if this is the last reference. * @@ -1729,7 +1716,7 @@ xlog_check_buffer_cancelled( * There is nothing in the table built in pass one, * so this buffer must not be cancelled. */ - ASSERT(!(flags & XFS_BLI_CANCEL)); + ASSERT(!(flags & XFS_BLF_CANCEL)); return 0; } @@ -1741,7 +1728,7 @@ xlog_check_buffer_cancelled( * There is no corresponding entry in the table built * in pass one, so this buffer has not been cancelled. */ - ASSERT(!(flags & XFS_BLI_CANCEL)); + ASSERT(!(flags & XFS_BLF_CANCEL)); return 0; } @@ -1760,7 +1747,7 @@ xlog_check_buffer_cancelled( * one in the table and remove it if this is the * last reference. */ - if (flags & XFS_BLI_CANCEL) { + if (flags & XFS_BLF_CANCEL) { bcp->bc_refcount--; if (bcp->bc_refcount == 0) { if (prevp == NULL) { @@ -1780,7 +1767,7 @@ xlog_check_buffer_cancelled( * We didn't find a corresponding entry in the table, so * return 0 so that the buffer is NOT cancelled. */ - ASSERT(!(flags & XFS_BLI_CANCEL)); + ASSERT(!(flags & XFS_BLF_CANCEL)); return 0; } @@ -1837,6 +1824,8 @@ xlog_recover_do_inode_buffer( unsigned int *data_map = NULL; unsigned int map_size = 0; + trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); + switch (buf_f->blf_type) { case XFS_LI_BUF: data_map = buf_f->blf_data_map; @@ -1880,8 +1869,8 @@ xlog_recover_do_inode_buffer( nbits = xfs_contig_bits(data_map, map_size, bit); ASSERT(nbits > 0); - reg_buf_offset = bit << XFS_BLI_SHIFT; - reg_buf_bytes = nbits << XFS_BLI_SHIFT; + reg_buf_offset = bit << XFS_BLF_SHIFT; + reg_buf_bytes = nbits << XFS_BLF_SHIFT; item_index++; } @@ -1895,7 +1884,7 @@ xlog_recover_do_inode_buffer( } ASSERT(item->ri_buf[item_index].i_addr != NULL); - ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0); + ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp)); /* @@ -1932,6 +1921,7 @@ xlog_recover_do_inode_buffer( /*ARGSUSED*/ STATIC void xlog_recover_do_reg_buffer( + struct xfs_mount *mp, xlog_recover_item_t *item, xfs_buf_t *bp, xfs_buf_log_format_t *buf_f) @@ -1943,6 +1933,8 @@ xlog_recover_do_reg_buffer( unsigned int map_size = 0; int error; + trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); + switch (buf_f->blf_type) { case XFS_LI_BUF: data_map = buf_f->blf_data_map; @@ -1958,9 +1950,9 @@ xlog_recover_do_reg_buffer( nbits = xfs_contig_bits(data_map, map_size, bit); ASSERT(nbits > 0); ASSERT(item->ri_buf[i].i_addr != NULL); - ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0); + ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); ASSERT(XFS_BUF_COUNT(bp) >= - ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT)); + ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT)); /* * Do a sanity check if this is a dquot buffer. Just checking @@ -1969,7 +1961,7 @@ xlog_recover_do_reg_buffer( */ error = 0; if (buf_f->blf_flags & - (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { + (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { if (item->ri_buf[i].i_addr == NULL) { cmn_err(CE_ALERT, "XFS: NULL dquot in %s.", __func__); @@ -1990,9 +1982,9 @@ xlog_recover_do_reg_buffer( } memcpy(xfs_buf_offset(bp, - (uint)bit << XFS_BLI_SHIFT), /* dest */ + (uint)bit << XFS_BLF_SHIFT), /* dest */ item->ri_buf[i].i_addr, /* source */ - nbits<<XFS_BLI_SHIFT); /* length */ + nbits<<XFS_BLF_SHIFT); /* length */ next: i++; bit += nbits; @@ -2141,6 +2133,8 @@ xlog_recover_do_dquot_buffer( { uint type; + trace_xfs_log_recover_buf_dquot_buf(log, buf_f); + /* * Filesystems are required to send in quota flags at mount time. */ @@ -2149,11 +2143,11 @@ xlog_recover_do_dquot_buffer( } type = 0; - if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF) + if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF) type |= XFS_DQ_USER; - if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF) + if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF) type |= XFS_DQ_PROJ; - if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF) + if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF) type |= XFS_DQ_GROUP; /* * This type of quotas was turned off, so ignore this buffer @@ -2161,7 +2155,7 @@ xlog_recover_do_dquot_buffer( if (log->l_quotaoffs_flag & type) return; - xlog_recover_do_reg_buffer(item, bp, buf_f); + xlog_recover_do_reg_buffer(mp, item, bp, buf_f); } /* @@ -2174,7 +2168,7 @@ xlog_recover_do_dquot_buffer( * here which overlaps that may be stale. * * When meta-data buffers are freed at run time we log a buffer item - * with the XFS_BLI_CANCEL bit set to indicate that previous copies + * with the XFS_BLF_CANCEL bit set to indicate that previous copies * of the buffer in the log should not be replayed at recovery time. * This is so that if the blocks covered by the buffer are reused for * file data before we crash we don't end up replaying old, freed @@ -2208,7 +2202,7 @@ xlog_recover_do_buffer_trans( if (pass == XLOG_RECOVER_PASS1) { /* * In this pass we're only looking for buf items - * with the XFS_BLI_CANCEL bit set. + * with the XFS_BLF_CANCEL bit set. */ xlog_recover_do_buffer_pass1(log, buf_f); return 0; @@ -2222,9 +2216,11 @@ xlog_recover_do_buffer_trans( */ cancel = xlog_recover_do_buffer_pass2(log, buf_f); if (cancel) { + trace_xfs_log_recover_buf_cancel(log, buf_f); return 0; } } + trace_xfs_log_recover_buf_recover(log, buf_f); switch (buf_f->blf_type) { case XFS_LI_BUF: blkno = buf_f->blf_blkno; @@ -2242,9 +2238,9 @@ xlog_recover_do_buffer_trans( } mp = log->l_mp; - buf_flags = XFS_BUF_LOCK; - if (!(flags & XFS_BLI_INODE_BUF)) - buf_flags |= XFS_BUF_MAPPED; + buf_flags = XBF_LOCK; + if (!(flags & XFS_BLF_INODE_BUF)) + buf_flags |= XBF_MAPPED; bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); if (XFS_BUF_ISERROR(bp)) { @@ -2256,13 +2252,13 @@ xlog_recover_do_buffer_trans( } error = 0; - if (flags & XFS_BLI_INODE_BUF) { + if (flags & XFS_BLF_INODE_BUF) { error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); } else if (flags & - (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { + (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); } else { - xlog_recover_do_reg_buffer(item, bp, buf_f); + xlog_recover_do_reg_buffer(mp, item, bp, buf_f); } if (error) return XFS_ERROR(error); @@ -2342,11 +2338,13 @@ xlog_recover_do_inode_trans( if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno, in_f->ilf_len, 0)) { error = 0; + trace_xfs_log_recover_inode_cancel(log, in_f); goto error; } + trace_xfs_log_recover_inode_recover(log, in_f); bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, - XFS_BUF_LOCK); + XBF_LOCK); if (XFS_BUF_ISERROR(bp)) { xfs_ioerror_alert("xlog_recover_do..(read#2)", mp, bp, in_f->ilf_blkno); @@ -2395,6 +2393,7 @@ xlog_recover_do_inode_trans( /* do nothing */ } else { xfs_buf_relse(bp); + trace_xfs_log_recover_inode_skip(log, in_f); error = 0; goto error; } @@ -2814,14 +2813,14 @@ xlog_recover_do_trans( int pass) { int error = 0; - xlog_recover_item_t *item, *first_item; + xlog_recover_item_t *item; - error = xlog_recover_reorder_trans(trans); + error = xlog_recover_reorder_trans(log, trans, pass); if (error) return error; - first_item = item = trans->r_itemq; - do { + list_for_each_entry(item, &trans->r_itemq, ri_list) { + trace_xfs_log_recover_item_recover(log, trans, item, pass); switch (ITEM_TYPE(item)) { case XFS_LI_BUF: error = xlog_recover_do_buffer_trans(log, item, pass); @@ -2854,8 +2853,7 @@ xlog_recover_do_trans( if (error) return error; - item = item->ri_next; - } while (first_item != item); + } return 0; } @@ -2869,21 +2867,18 @@ STATIC void xlog_recover_free_trans( xlog_recover_t *trans) { - xlog_recover_item_t *first_item, *item, *free_item; + xlog_recover_item_t *item, *n; int i; - item = first_item = trans->r_itemq; - do { - free_item = item; - item = item->ri_next; - /* Free the regions in the item. */ - for (i = 0; i < free_item->ri_cnt; i++) { - kmem_free(free_item->ri_buf[i].i_addr); - } + list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) { + /* Free the regions in the item. */ + list_del(&item->ri_list); + for (i = 0; i < item->ri_cnt; i++) + kmem_free(item->ri_buf[i].i_addr); /* Free the item itself */ - kmem_free(free_item->ri_buf); - kmem_free(free_item); - } while (first_item != item); + kmem_free(item->ri_buf); + kmem_free(item); + } /* Free the transaction recover structure */ kmem_free(trans); } @@ -2891,14 +2886,12 @@ xlog_recover_free_trans( STATIC int xlog_recover_commit_trans( xlog_t *log, - xlog_recover_t **q, xlog_recover_t *trans, int pass) { int error; - if ((error = xlog_recover_unlink_tid(q, trans))) - return error; + hlist_del(&trans->r_list); if ((error = xlog_recover_do_trans(log, trans, pass))) return error; xlog_recover_free_trans(trans); /* no error */ @@ -2926,7 +2919,7 @@ xlog_recover_unmount_trans( STATIC int xlog_recover_process_data( xlog_t *log, - xlog_recover_t *rhash[], + struct hlist_head rhash[], xlog_rec_header_t *rhead, xfs_caddr_t dp, int pass) @@ -2960,7 +2953,7 @@ xlog_recover_process_data( } tid = be32_to_cpu(ohead->oh_tid); hash = XLOG_RHASH(tid); - trans = xlog_recover_find_tid(rhash[hash], tid); + trans = xlog_recover_find_tid(&rhash[hash], tid); if (trans == NULL) { /* not found; add new tid */ if (ohead->oh_flags & XLOG_START_TRANS) xlog_recover_new_tid(&rhash[hash], tid, @@ -2978,14 +2971,15 @@ xlog_recover_process_data( switch (flags) { case XLOG_COMMIT_TRANS: error = xlog_recover_commit_trans(log, - &rhash[hash], trans, pass); + trans, pass); break; case XLOG_UNMOUNT_TRANS: error = xlog_recover_unmount_trans(trans); break; case XLOG_WAS_CONT_TRANS: - error = xlog_recover_add_to_cont_trans(trans, - dp, be32_to_cpu(ohead->oh_len)); + error = xlog_recover_add_to_cont_trans(log, + trans, dp, + be32_to_cpu(ohead->oh_len)); break; case XLOG_START_TRANS: xlog_warn( @@ -2995,7 +2989,7 @@ xlog_recover_process_data( break; case 0: case XLOG_CONTINUE_TRANS: - error = xlog_recover_add_to_trans(trans, + error = xlog_recover_add_to_trans(log, trans, dp, be32_to_cpu(ohead->oh_len)); break; default: @@ -3204,14 +3198,14 @@ xlog_recover_process_one_iunlink( int error; ino = XFS_AGINO_TO_INO(mp, agno, agino); - error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0); + error = xfs_iget(mp, NULL, ino, 0, 0, &ip); if (error) goto fail; /* * Get the on disk inode to find the next inode in the bucket. */ - error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XFS_BUF_LOCK); + error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XBF_LOCK); if (error) goto fail_iput; @@ -3396,42 +3390,6 @@ xlog_pack_data( } } -#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) -STATIC void -xlog_unpack_data_checksum( - xlog_rec_header_t *rhead, - xfs_caddr_t dp, - xlog_t *log) -{ - __be32 *up = (__be32 *)dp; - uint chksum = 0; - int i; - - /* divide length by 4 to get # words */ - for (i=0; i < be32_to_cpu(rhead->h_len) >> 2; i++) { - chksum ^= be32_to_cpu(*up); - up++; - } - if (chksum != be32_to_cpu(rhead->h_chksum)) { - if (rhead->h_chksum || - ((log->l_flags & XLOG_CHKSUM_MISMATCH) == 0)) { - cmn_err(CE_DEBUG, - "XFS: LogR chksum mismatch: was (0x%x) is (0x%x)\n", - be32_to_cpu(rhead->h_chksum), chksum); - cmn_err(CE_DEBUG, -"XFS: Disregard message if filesystem was created with non-DEBUG kernel"); - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { - cmn_err(CE_DEBUG, - "XFS: LogR this is a LogV2 filesystem\n"); - } - log->l_flags |= XLOG_CHKSUM_MISMATCH; - } - } -} -#else -#define xlog_unpack_data_checksum(rhead, dp, log) -#endif - STATIC void xlog_unpack_data( xlog_rec_header_t *rhead, @@ -3455,8 +3413,6 @@ xlog_unpack_data( dp += BBSIZE; } } - - xlog_unpack_data_checksum(rhead, dp, log); } STATIC int @@ -3517,7 +3473,7 @@ xlog_do_recovery_pass( int error = 0, h_size; int bblks, split_bblks; int hblks, split_hblks, wrapped_hblks; - xlog_recover_t *rhash[XLOG_RHASH_SIZE]; + struct hlist_head rhash[XLOG_RHASH_SIZE]; ASSERT(head_blk != tail_blk); @@ -3555,7 +3511,7 @@ xlog_do_recovery_pass( hblks = 1; } } else { - ASSERT(log->l_sectbb_log == 0); + ASSERT(log->l_sectBBsize == 1); hblks = 1; hbp = xlog_get_bp(log, 1); h_size = XLOG_BIG_RECORD_BSIZE; @@ -3978,8 +3934,7 @@ xlog_recover_finish( * case the unlink transactions would have problems * pushing the EFIs out of the way. */ - xfs_log_force(log->l_mp, (xfs_lsn_t)0, - (XFS_LOG_FORCE | XFS_LOG_SYNC)); + xfs_log_force(log->l_mp, XFS_LOG_SYNC); xlog_recover_process_iunlinks(log); @@ -4012,10 +3967,6 @@ xlog_recover_check_summary( xfs_agf_t *agfp; xfs_buf_t *agfbp; xfs_buf_t *agibp; - xfs_buf_t *sbbp; -#ifdef XFS_LOUD_RECOVERY - xfs_sb_t *sbp; -#endif xfs_agnumber_t agno; __uint64_t freeblks; __uint64_t itotal; @@ -4050,30 +4001,5 @@ xlog_recover_check_summary( xfs_buf_relse(agibp); } } - - sbbp = xfs_getsb(mp, 0); -#ifdef XFS_LOUD_RECOVERY - sbp = &mp->m_sb; - xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(sbbp)); - cmn_err(CE_NOTE, - "xlog_recover_check_summary: sb_icount %Lu itotal %Lu", - sbp->sb_icount, itotal); - cmn_err(CE_NOTE, - "xlog_recover_check_summary: sb_ifree %Lu itotal %Lu", - sbp->sb_ifree, ifree); - cmn_err(CE_NOTE, - "xlog_recover_check_summary: sb_fdblocks %Lu freeblks %Lu", - sbp->sb_fdblocks, freeblks); -#if 0 - /* - * This is turned off until I account for the allocation - * btree blocks which live in free space. - */ - ASSERT(sbp->sb_icount == itotal); - ASSERT(sbp->sb_ifree == ifree); - ASSERT(sbp->sb_fdblocks == freeblks); -#endif -#endif - xfs_buf_relse(sbbp); } #endif /* DEBUG */ diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h index b22545555301..1c55ccbb379d 100644 --- a/fs/xfs/xfs_log_recover.h +++ b/fs/xfs/xfs_log_recover.h @@ -28,29 +28,28 @@ #define XLOG_RHASH(tid) \ ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1)) -#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK / 2 + 1) +#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1) /* * item headers are in ri_buf[0]. Additional buffers follow. */ typedef struct xlog_recover_item { - struct xlog_recover_item *ri_next; - struct xlog_recover_item *ri_prev; - int ri_type; - int ri_cnt; /* count of regions found */ - int ri_total; /* total regions */ - xfs_log_iovec_t *ri_buf; /* ptr to regions buffer */ + struct list_head ri_list; + int ri_type; + int ri_cnt; /* count of regions found */ + int ri_total; /* total regions */ + xfs_log_iovec_t *ri_buf; /* ptr to regions buffer */ } xlog_recover_item_t; struct xlog_tid; typedef struct xlog_recover { - struct xlog_recover *r_next; - xlog_tid_t r_log_tid; /* log's transaction id */ - xfs_trans_header_t r_theader; /* trans header for partial */ - int r_state; /* not needed */ - xfs_lsn_t r_lsn; /* xact lsn */ - xlog_recover_item_t *r_itemq; /* q for items */ + struct hlist_node r_list; + xlog_tid_t r_log_tid; /* log's transaction id */ + xfs_trans_header_t r_theader; /* trans header for partial */ + int r_state; /* not needed */ + xfs_lsn_t r_lsn; /* xact lsn */ + struct list_head r_itemq; /* q for items */ } xlog_recover_t; #define ITEM_TYPE(i) (*(ushort *)(i)->ri_buf[0].i_addr) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index eb403b40e120..69f62d8b2816 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -201,6 +201,38 @@ xfs_uuid_unmount( /* + * Reference counting access wrappers to the perag structures. + */ +struct xfs_perag * +xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + int ref = 0; + + spin_lock(&mp->m_perag_lock); + pag = radix_tree_lookup(&mp->m_perag_tree, agno); + if (pag) { + ASSERT(atomic_read(&pag->pag_ref) >= 0); + /* catch leaks in the positive direction during testing */ + ASSERT(atomic_read(&pag->pag_ref) < 1000); + ref = atomic_inc_return(&pag->pag_ref); + } + spin_unlock(&mp->m_perag_lock); + trace_xfs_perag_get(mp, agno, ref, _RET_IP_); + return pag; +} + +void +xfs_perag_put(struct xfs_perag *pag) +{ + int ref; + + ASSERT(atomic_read(&pag->pag_ref) > 0); + ref = atomic_dec_return(&pag->pag_ref); + trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_); +} + +/* * Free up the resources associated with a mount structure. Assume that * the structure was initially zeroed, so we can tell which fields got * initialized. @@ -209,13 +241,16 @@ STATIC void xfs_free_perag( xfs_mount_t *mp) { - if (mp->m_perag) { - int agno; + xfs_agnumber_t agno; + struct xfs_perag *pag; - for (agno = 0; agno < mp->m_maxagi; agno++) - if (mp->m_perag[agno].pagb_list) - kmem_free(mp->m_perag[agno].pagb_list); - kmem_free(mp->m_perag); + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + spin_lock(&mp->m_perag_lock); + pag = radix_tree_delete(&mp->m_perag_tree, agno); + ASSERT(pag); + ASSERT(atomic_read(&pag->pag_ref) == 0); + spin_unlock(&mp->m_perag_lock); + kmem_free(pag); } } @@ -233,10 +268,10 @@ xfs_sb_validate_fsb_count( #if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */ if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX) - return E2BIG; + return EFBIG; #else /* Limited by UINT_MAX of sectors */ if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX) - return E2BIG; + return EFBIG; #endif return 0; } @@ -358,7 +393,7 @@ xfs_mount_validate_sb( xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { xfs_fs_mount_cmn_err(flags, "file system too large to be mounted on this system."); - return XFS_ERROR(E2BIG); + return XFS_ERROR(EFBIG); } if (unlikely(sbp->sb_inprogress)) { @@ -378,46 +413,73 @@ xfs_mount_validate_sb( return 0; } -STATIC void -xfs_initialize_perag_icache( - xfs_perag_t *pag) -{ - if (!pag->pag_ici_init) { - rwlock_init(&pag->pag_ici_lock); - INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); - pag->pag_ici_init = 1; - } -} - -xfs_agnumber_t +int xfs_initialize_perag( xfs_mount_t *mp, - xfs_agnumber_t agcount) + xfs_agnumber_t agcount, + xfs_agnumber_t *maxagi) { xfs_agnumber_t index, max_metadata; + xfs_agnumber_t first_initialised = 0; xfs_perag_t *pag; xfs_agino_t agino; xfs_ino_t ino; xfs_sb_t *sbp = &mp->m_sb; - xfs_ino_t max_inum = XFS_MAXINUMBER_32; + int error = -ENOMEM; - /* Check to see if the filesystem can overflow 32 bit inodes */ + /* + * Walk the current per-ag tree so we don't try to initialise AGs + * that already exist (growfs case). Allocate and insert all the + * AGs we don't find ready for initialisation. + */ + for (index = 0; index < agcount; index++) { + pag = xfs_perag_get(mp, index); + if (pag) { + xfs_perag_put(pag); + continue; + } + if (!first_initialised) + first_initialised = index; + + pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL); + if (!pag) + goto out_unwind; + pag->pag_agno = index; + pag->pag_mount = mp; + rwlock_init(&pag->pag_ici_lock); + INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); + + if (radix_tree_preload(GFP_NOFS)) + goto out_unwind; + + spin_lock(&mp->m_perag_lock); + if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { + BUG(); + spin_unlock(&mp->m_perag_lock); + radix_tree_preload_end(); + error = -EEXIST; + goto out_unwind; + } + spin_unlock(&mp->m_perag_lock); + radix_tree_preload_end(); + } + + /* + * If we mount with the inode64 option, or no inode overflows + * the legacy 32-bit address space clear the inode32 option. + */ agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0); ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino); - /* Clear the mount flag if no inode can overflow 32 bits - * on this filesystem, or if specifically requested.. - */ - if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > max_inum) { + if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32) mp->m_flags |= XFS_MOUNT_32BITINODES; - } else { + else mp->m_flags &= ~XFS_MOUNT_32BITINODES; - } - /* If we can overflow then setup the ag headers accordingly */ if (mp->m_flags & XFS_MOUNT_32BITINODES) { - /* Calculate how much should be reserved for inodes to - * meet the max inode percentage. + /* + * Calculate how much should be reserved for inodes to meet + * the max inode percentage. */ if (mp->m_maxicount) { __uint64_t icount; @@ -430,29 +492,39 @@ xfs_initialize_perag( } else { max_metadata = agcount; } + for (index = 0; index < agcount; index++) { ino = XFS_AGINO_TO_INO(mp, index, agino); - if (ino > max_inum) { + if (ino > XFS_MAXINUMBER_32) { index++; break; } - /* This ag is preferred for inodes */ - pag = &mp->m_perag[index]; + pag = xfs_perag_get(mp, index); pag->pagi_inodeok = 1; if (index < max_metadata) pag->pagf_metadata = 1; - xfs_initialize_perag_icache(pag); + xfs_perag_put(pag); } } else { - /* Setup default behavior for smaller filesystems */ for (index = 0; index < agcount; index++) { - pag = &mp->m_perag[index]; + pag = xfs_perag_get(mp, index); pag->pagi_inodeok = 1; - xfs_initialize_perag_icache(pag); + xfs_perag_put(pag); } } - return index; + + if (maxagi) + *maxagi = index; + return 0; + +out_unwind: + kmem_free(pag); + for (; index > first_initialised; index--) { + pag = radix_tree_delete(&mp->m_perag_tree, index); + kmem_free(pag); + } + return error; } void @@ -583,7 +655,7 @@ xfs_readsb(xfs_mount_t *mp, int flags) * access to the superblock. */ sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); - extra_flags = XFS_BUF_LOCK | XFS_BUF_MANAGE | XFS_BUF_MAPPED; + extra_flags = XBF_LOCK | XBF_FS_MANAGED | XBF_MAPPED; bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size), extra_flags); @@ -731,12 +803,13 @@ xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount) error = xfs_ialloc_pagi_init(mp, NULL, index); if (error) return error; - pag = &mp->m_perag[index]; + pag = xfs_perag_get(mp, index); ifree += pag->pagi_freecount; ialloc += pag->pagi_count; bfree += pag->pagf_freeblks; bfreelst += pag->pagf_flcount; btree += pag->pagf_btreeblks; + xfs_perag_put(pag); } /* * Overwrite incore superblock counters with just-read data @@ -926,7 +999,7 @@ xfs_check_sizes(xfs_mount_t *mp) d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { cmn_err(CE_WARN, "XFS: size check 1 failed"); - return XFS_ERROR(E2BIG); + return XFS_ERROR(EFBIG); } error = xfs_read_buf(mp, mp->m_ddev_targp, d - XFS_FSS_TO_BB(mp, 1), @@ -936,7 +1009,7 @@ xfs_check_sizes(xfs_mount_t *mp) } else { cmn_err(CE_WARN, "XFS: size check 2 failed"); if (error == ENOSPC) - error = XFS_ERROR(E2BIG); + error = XFS_ERROR(EFBIG); return error; } @@ -944,7 +1017,7 @@ xfs_check_sizes(xfs_mount_t *mp) d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { cmn_err(CE_WARN, "XFS: size check 3 failed"); - return XFS_ERROR(E2BIG); + return XFS_ERROR(EFBIG); } error = xfs_read_buf(mp, mp->m_logdev_targp, d - XFS_FSB_TO_BB(mp, 1), @@ -954,7 +1027,7 @@ xfs_check_sizes(xfs_mount_t *mp) } else { cmn_err(CE_WARN, "XFS: size check 3 failed"); if (error == ENOSPC) - error = XFS_ERROR(E2BIG); + error = XFS_ERROR(EFBIG); return error; } } @@ -1008,6 +1081,24 @@ xfs_mount_reset_sbqflags( return xfs_trans_commit(tp, 0); } +__uint64_t +xfs_default_resblks(xfs_mount_t *mp) +{ + __uint64_t resblks; + + /* + * We default to 5% or 8192 fsbs of space reserved, whichever is + * smaller. This is intended to cover concurrent allocation + * transactions when we initially hit enospc. These each require a 4 + * block reservation. Hence by default we cover roughly 2000 concurrent + * allocation reservations. + */ + resblks = mp->m_sb.sb_dblocks; + do_div(resblks, 20); + resblks = min_t(__uint64_t, resblks, 8192); + return resblks; +} + /* * This function does the following on an initial mount of a file system: * - reads the superblock from disk and init the mount struct @@ -1152,13 +1243,13 @@ xfs_mountfs( /* * Allocate and initialize the per-ag data. */ - init_rwsem(&mp->m_peraglock); - mp->m_perag = kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t), - KM_MAYFAIL); - if (!mp->m_perag) + spin_lock_init(&mp->m_perag_lock); + INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC); + error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); + if (error) { + cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error); goto out_remove_uuid; - - mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount); + } if (!sbp->sb_logblocks) { cmn_err(CE_WARN, "XFS: no log defined"); @@ -1209,7 +1300,7 @@ xfs_mountfs( * Get and sanity-check the root inode. * Save the pointer to it in the mount structure. */ - error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip, 0); + error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip); if (error) { cmn_err(CE_WARN, "XFS: failed to read root inode"); goto out_log_dealloc; @@ -1304,13 +1395,6 @@ xfs_mountfs( xfs_qm_mount_quotas(mp); } -#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) - if (XFS_IS_QUOTA_ON(mp)) - xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas turned on"); - else - xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas not turned on"); -#endif - /* * Now we are mounted, reserve a small amount of unused space for * privileged transactions. This is needed so that transaction @@ -1319,17 +1403,16 @@ xfs_mountfs( * attr, unwritten extent conversion at ENOSPC, etc. Data allocations * are not allowed to use this reserved space. * - * We default to 5% or 1024 fsbs of space reserved, whichever is smaller. * This may drive us straight to ENOSPC on mount, but that implies * we were already there on the last unmount. Warn if this occurs. */ - resblks = mp->m_sb.sb_dblocks; - do_div(resblks, 20); - resblks = min_t(__uint64_t, resblks, 1024); - error = xfs_reserve_blocks(mp, &resblks, NULL); - if (error) - cmn_err(CE_WARN, "XFS: Unable to allocate reserve blocks. " - "Continuing without a reserve pool."); + if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { + resblks = xfs_default_resblks(mp); + error = xfs_reserve_blocks(mp, &resblks, NULL); + if (error) + cmn_err(CE_WARN, "XFS: Unable to allocate reserve " + "blocks. Continuing without a reserve pool."); + } return 0; @@ -1372,8 +1455,19 @@ xfs_unmountfs( * push out the iclog we will never get that unlocked. hence we * need to force the log first. */ - xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); - xfs_reclaim_inodes(mp, XFS_IFLUSH_ASYNC); + xfs_log_force(mp, XFS_LOG_SYNC); + + /* + * Do a delwri reclaim pass first so that as many dirty inodes are + * queued up for IO as possible. Then flush the buffers before making + * a synchronous path to catch all the remaining inodes are reclaimed. + * This makes the reclaim process as quick as possible by avoiding + * synchronous writeout and blocking on inodes already in the delwri + * state as much as possible. + */ + xfs_reclaim_inodes(mp, 0); + XFS_bflush(mp->m_ddev_targp); + xfs_reclaim_inodes(mp, SYNC_WAIT); xfs_qm_unmount(mp); @@ -1382,7 +1476,7 @@ xfs_unmountfs( * that nothing is pinned. This is important because bflush() * will skip pinned buffers. */ - xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); + xfs_log_force(mp, XFS_LOG_SYNC); xfs_binval(mp->m_ddev_targp); if (mp->m_rtdev_targp) { @@ -1548,15 +1642,14 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields) xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields); /* find modified range */ + f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields); + ASSERT((1LL << f) & XFS_SB_MOD_BITS); + last = xfs_sb_info[f + 1].offset - 1; f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); ASSERT((1LL << f) & XFS_SB_MOD_BITS); first = xfs_sb_info[f].offset; - f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields); - ASSERT((1LL << f) & XFS_SB_MOD_BITS); - last = xfs_sb_info[f + 1].offset - 1; - xfs_trans_log_buf(tp, bp, first, last); } @@ -1620,26 +1713,30 @@ xfs_mod_incore_sb_unlocked( lcounter += rem; } } else { /* Taking blocks away */ - lcounter += delta; + if (lcounter >= 0) { + mp->m_sb.sb_fdblocks = lcounter + + XFS_ALLOC_SET_ASIDE(mp); + return 0; + } - /* - * If were out of blocks, use any available reserved blocks if - * were allowed to. - */ + /* + * We are out of blocks, use any available reserved + * blocks if were allowed to. + */ + if (!rsvd) + return XFS_ERROR(ENOSPC); - if (lcounter < 0) { - if (rsvd) { - lcounter = (long long)mp->m_resblks_avail + delta; - if (lcounter < 0) { - return XFS_ERROR(ENOSPC); - } - mp->m_resblks_avail = lcounter; - return 0; - } else { /* not reserved */ - return XFS_ERROR(ENOSPC); - } + lcounter = (long long)mp->m_resblks_avail + delta; + if (lcounter >= 0) { + mp->m_resblks_avail = lcounter; + return 0; } + printk_once(KERN_WARNING + "Filesystem \"%s\": reserve blocks depleted! " + "Consider increasing reserve pool size.", + mp->m_fsname); + return XFS_ERROR(ENOSPC); } mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp); @@ -1887,7 +1984,7 @@ xfs_getsb( ASSERT(mp->m_sb_bp != NULL); bp = mp->m_sb_bp; - if (flags & XFS_BUF_TRYLOCK) { + if (flags & XBF_TRYLOCK) { if (!XFS_BUF_CPSEMA(bp)) { return NULL; } @@ -1947,6 +2044,26 @@ xfs_mount_log_sb( return error; } +/* + * If the underlying (data/log/rt) device is readonly, there are some + * operations that cannot proceed. + */ +int +xfs_dev_is_read_only( + struct xfs_mount *mp, + char *message) +{ + if (xfs_readonly_buftarg(mp->m_ddev_targp) || + xfs_readonly_buftarg(mp->m_logdev_targp) || + (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) { + cmn_err(CE_NOTE, + "XFS: %s required on read-only device.", message); + cmn_err(CE_NOTE, + "XFS: write access unavailable, cannot proceed."); + return EROFS; + } + return 0; +} #ifdef HAVE_PERCPU_SB /* diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 1df7e4502967..5761087ee8ea 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -78,7 +78,8 @@ typedef int (*xfs_send_destroy_t)(struct xfs_inode *, dm_right_t); typedef int (*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *, struct xfs_inode *, dm_right_t, struct xfs_inode *, dm_right_t, - const char *, const char *, mode_t, int, int); + const unsigned char *, const unsigned char *, + mode_t, int, int); typedef int (*xfs_send_mount_t)(struct xfs_mount *, dm_right_t, char *, char *); typedef void (*xfs_send_unmount_t)(struct xfs_mount *, struct xfs_inode *, @@ -207,8 +208,8 @@ typedef struct xfs_mount { uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */ uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ uint m_in_maxlevels; /* max inobt btree levels. */ - struct xfs_perag *m_perag; /* per-ag accounting info */ - struct rw_semaphore m_peraglock; /* lock for m_perag (pointer) */ + struct radix_tree_root m_perag_tree; /* per-ag accounting info */ + spinlock_t m_perag_lock; /* lock for m_perag_tree */ struct mutex m_growlock; /* growfs mutex */ int m_fixedfsid[2]; /* unchanged for life of FS */ uint m_dmevmask; /* DMI events for this FS */ @@ -224,6 +225,7 @@ typedef struct xfs_mount { __uint64_t m_maxioffset; /* maximum inode offset */ __uint64_t m_resblks; /* total reserved blocks */ __uint64_t m_resblks_avail;/* available reserved blocks */ + __uint64_t m_resblks_save; /* reserved blks @ remount,ro */ int m_dalign; /* stripe unit */ int m_swidth; /* stripe width */ int m_sinoalign; /* stripe unit inode alignment */ @@ -243,7 +245,7 @@ typedef struct xfs_mount { struct xfs_qmops *m_qm_ops; /* vector of XQM ops */ atomic_t m_active_trans; /* number trans frozen */ #ifdef HAVE_PERCPU_SB - xfs_icsb_cnts_t *m_sb_cnts; /* per-cpu superblock counters */ + xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */ unsigned long m_icsb_counters; /* disabled per-cpu counters */ struct notifier_block m_icsb_notifier; /* hotplug cpu notifier */ struct mutex m_icsb_mutex; /* balancer sync lock */ @@ -257,6 +259,7 @@ typedef struct xfs_mount { wait_queue_head_t m_wait_single_sync_task; __int64_t m_update_flags; /* sb flags we need to update on the next remount,rw */ + struct shrinker m_inode_shrink; /* inode reclaim shrinker */ } xfs_mount_t; /* @@ -265,6 +268,7 @@ typedef struct xfs_mount { #define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops must be synchronous except for space allocations */ +#define XFS_MOUNT_DELAYLOG (1ULL << 1) /* delayed logging is enabled */ #define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */ #define XFS_MOUNT_WAS_CLEAN (1ULL << 3) #define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem @@ -384,19 +388,10 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) } /* - * perag get/put wrappers for eventual ref counting + * perag get/put wrappers for ref counting */ -static inline xfs_perag_t * -xfs_get_perag(struct xfs_mount *mp, xfs_ino_t ino) -{ - return &mp->m_perag[XFS_INO_TO_AGNO(mp, ino)]; -} - -static inline void -xfs_put_perag(struct xfs_mount *mp, xfs_perag_t *pag) -{ - /* nothing to see here, move along */ -} +struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno); +void xfs_perag_put(struct xfs_perag *pag); /* * Per-cpu superblock locking functions @@ -428,6 +423,7 @@ typedef struct xfs_mod_sb { } xfs_mod_sb_t; extern int xfs_log_sbcount(xfs_mount_t *, uint); +extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); extern int xfs_mountfs(xfs_mount_t *mp); extern void xfs_unmountfs(xfs_mount_t *); @@ -442,6 +438,8 @@ extern void xfs_freesb(xfs_mount_t *); extern int xfs_fs_writable(xfs_mount_t *); extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t); +extern int xfs_dev_is_read_only(struct xfs_mount *, char *); + extern int xfs_dmops_get(struct xfs_mount *); extern void xfs_dmops_put(struct xfs_mount *); @@ -450,7 +448,8 @@ extern struct xfs_dmops xfs_dmcore_xfs; #endif /* __KERNEL__ */ extern void xfs_mod_sb(struct xfs_trans *, __int64_t); -extern xfs_agnumber_t xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t); +extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t, + xfs_agnumber_t *); extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *); extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index 4b0613d99faa..45ce15dc5b2b 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -398,7 +398,7 @@ exit: * guaranteed that all the free functions for all the elements have finished * executing and the reaper is not running. */ -void +static void xfs_mru_cache_flush( xfs_mru_cache_t *mru) { diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h index 5d439f34b0c9..36dd3ec8b4eb 100644 --- a/fs/xfs/xfs_mru_cache.h +++ b/fs/xfs/xfs_mru_cache.h @@ -42,7 +42,6 @@ void xfs_mru_cache_uninit(void); int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms, unsigned int grp_count, xfs_mru_cache_free_func_t free_func); -void xfs_mru_cache_flush(xfs_mru_cache_t *mru); void xfs_mru_cache_destroy(struct xfs_mru_cache *mru); int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key, void *value); diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 91bfd60f4c74..e0e64b113bd6 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -201,9 +201,6 @@ typedef struct xfs_qoff_logformat { #define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ #define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */ #define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ -#define XFS_QMOPT_QUOTAOFF 0x0000080 /* quotas are being turned off */ -#define XFS_QMOPT_UMOUNTING 0x0000100 /* filesys is being unmounted */ -#define XFS_QMOPT_DOLOG 0x0000200 /* log buf changes (in quotacheck) */ #define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */ #define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ #define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ @@ -223,16 +220,9 @@ typedef struct xfs_qoff_logformat { #define XFS_QMOPT_RES_INOS 0x0800000 /* - * flags for dqflush and dqflush_all. - */ -#define XFS_QMOPT_SYNC 0x1000000 -#define XFS_QMOPT_ASYNC 0x2000000 -#define XFS_QMOPT_DELWRI 0x4000000 - -/* * flags for dqalloc. */ -#define XFS_QMOPT_INHERIT 0x8000000 +#define XFS_QMOPT_INHERIT 0x1000000 /* * flags to xfs_trans_mod_dquot. diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 6be05f756d59..a2d32ce335aa 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -2247,7 +2247,7 @@ xfs_rtmount_init( cmn_err(CE_WARN, "XFS: realtime mount -- %llu != %llu", (unsigned long long) XFS_BB_TO_FSB(mp, d), (unsigned long long) mp->m_sb.sb_rblocks); - return XFS_ERROR(E2BIG); + return XFS_ERROR(EFBIG); } error = xfs_read_buf(mp, mp->m_rtdev_targp, d - XFS_FSB_TO_BB(mp, 1), @@ -2256,7 +2256,7 @@ xfs_rtmount_init( cmn_err(CE_WARN, "XFS: realtime mount -- xfs_read_buf failed, returned %d", error); if (error == ENOSPC) - return XFS_ERROR(E2BIG); + return XFS_ERROR(EFBIG); return error; } xfs_buf_relse(bp); @@ -2277,12 +2277,12 @@ xfs_rtmount_inodes( sbp = &mp->m_sb; if (sbp->sb_rbmino == NULLFSINO) return 0; - error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip, 0); + error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip); if (error) return error; ASSERT(mp->m_rbmip != NULL); ASSERT(sbp->sb_rsumino != NULLFSINO); - error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip, 0); + error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip); if (error) { IRELE(mp->m_rbmip); return error; diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index b2d67adb6a08..ff614c29b441 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -147,7 +147,16 @@ xfs_growfs_rt( # define xfs_rtfree_extent(t,b,l) (ENOSYS) # define xfs_rtpick_extent(m,t,l,rb) (ENOSYS) # define xfs_growfs_rt(mp,in) (ENOSYS) -# define xfs_rtmount_init(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) +static inline int /* error */ +xfs_rtmount_init( + xfs_mount_t *mp) /* file system mount structure */ +{ + if (mp->m_sb.sb_rblocks == 0) + return 0; + + cmn_err(CE_WARN, "XFS: Not built with CONFIG_XFS_RT"); + return ENOSYS; +} # define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) # define xfs_rtunmount_inodes(m) #endif /* CONFIG_XFS_RT */ diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index 5aa07caea5f1..e336742a58a4 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c @@ -47,48 +47,6 @@ #include "xfs_trace.h" /* - * This is a subroutine for xfs_write() and other writers (xfs_ioctl) - * which clears the setuid and setgid bits when a file is written. - */ -int -xfs_write_clear_setuid( - xfs_inode_t *ip) -{ - xfs_mount_t *mp; - xfs_trans_t *tp; - int error; - - mp = ip->i_mount; - tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID); - if ((error = xfs_trans_reserve(tp, 0, - XFS_WRITEID_LOG_RES(mp), - 0, 0, 0))) { - xfs_trans_cancel(tp, 0); - return error; - } - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); - ip->i_d.di_mode &= ~S_ISUID; - - /* - * Note that we don't have to worry about mandatory - * file locking being disabled here because we only - * clear the S_ISGID bit if the Group execute bit is - * on, but if it was on then mandatory locking wouldn't - * have been enabled. - */ - if (ip->i_d.di_mode & S_IXGRP) { - ip->i_d.di_mode &= ~S_ISGID; - } - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - return 0; -} - -/* * Force a shutdown of the filesystem instantly while keeping * the filesystem consistent. We don't do an unmount here; just shutdown * the shop, make sure that absolutely nothing persistent happens to @@ -153,88 +111,6 @@ xfs_do_force_shutdown( } } - -/* - * Called when we want to stop a buffer from getting written or read. - * We attach the EIO error, muck with its flags, and call biodone - * so that the proper iodone callbacks get called. - */ -int -xfs_bioerror( - xfs_buf_t *bp) -{ - -#ifdef XFSERRORDEBUG - ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone); -#endif - - /* - * No need to wait until the buffer is unpinned. - * We aren't flushing it. - */ - XFS_BUF_ERROR(bp, EIO); - /* - * We're calling biodone, so delete B_DONE flag. Either way - * we have to call the iodone callback, and calling biodone - * probably is the best way since it takes care of - * GRIO as well. - */ - XFS_BUF_UNREAD(bp); - XFS_BUF_UNDELAYWRITE(bp); - XFS_BUF_UNDONE(bp); - XFS_BUF_STALE(bp); - - XFS_BUF_CLR_BDSTRAT_FUNC(bp); - xfs_biodone(bp); - - return (EIO); -} - -/* - * Same as xfs_bioerror, except that we are releasing the buffer - * here ourselves, and avoiding the biodone call. - * This is meant for userdata errors; metadata bufs come with - * iodone functions attached, so that we can track down errors. - */ -int -xfs_bioerror_relse( - xfs_buf_t *bp) -{ - int64_t fl; - - ASSERT(XFS_BUF_IODONE_FUNC(bp) != xfs_buf_iodone_callbacks); - ASSERT(XFS_BUF_IODONE_FUNC(bp) != xlog_iodone); - - fl = XFS_BUF_BFLAGS(bp); - /* - * No need to wait until the buffer is unpinned. - * We aren't flushing it. - * - * chunkhold expects B_DONE to be set, whether - * we actually finish the I/O or not. We don't want to - * change that interface. - */ - XFS_BUF_UNREAD(bp); - XFS_BUF_UNDELAYWRITE(bp); - XFS_BUF_DONE(bp); - XFS_BUF_STALE(bp); - XFS_BUF_CLR_IODONE_FUNC(bp); - XFS_BUF_CLR_BDSTRAT_FUNC(bp); - if (!(fl & XFS_B_ASYNC)) { - /* - * Mark b_error and B_ERROR _both_. - * Lot's of chunkcache code assumes that. - * There's no reason to mark error for - * ASYNC buffers. - */ - XFS_BUF_ERROR(bp, EIO); - XFS_BUF_FINISH_IOWAIT(bp); - } else { - xfs_buf_relse(bp); - } - return (EIO); -} - /* * Prints out an ALERT message about I/O error. */ @@ -306,37 +182,6 @@ xfs_read_buf( } /* - * Wrapper around bwrite() so that we can trap - * write errors, and act accordingly. - */ -int -xfs_bwrite( - struct xfs_mount *mp, - struct xfs_buf *bp) -{ - int error; - - /* - * XXXsup how does this work for quotas. - */ - XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb); - bp->b_mount = mp; - XFS_BUF_WRITE(bp); - - if ((error = XFS_bwrite(bp))) { - ASSERT(mp); - /* - * Cannot put a buftrace here since if the buffer is not - * B_HOLD then we will brelse() the buffer before returning - * from bwrite and we could be tracing a buffer that has - * been reused. - */ - xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); - } - return (error); -} - -/* * helper function to extract extent size hint from inode */ xfs_extlen_t diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h index 571f2174435c..11c41ec6ed75 100644 --- a/fs/xfs/xfs_rw.h +++ b/fs/xfs/xfs_rw.h @@ -39,10 +39,6 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) /* * Prototypes for functions in xfs_rw.c. */ -extern int xfs_write_clear_setuid(struct xfs_inode *ip); -extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp); -extern int xfs_bioerror(struct xfs_buf *bp); -extern int xfs_bioerror_relse(struct xfs_buf *bp); extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp, xfs_daddr_t blkno, int len, uint flags, struct xfs_buf **bpp); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 237badcbac3b..28547dfce037 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -44,148 +44,493 @@ #include "xfs_trans_priv.h" #include "xfs_trans_space.h" #include "xfs_inode_item.h" - - -STATIC void xfs_trans_apply_sb_deltas(xfs_trans_t *); -STATIC uint xfs_trans_count_vecs(xfs_trans_t *); -STATIC void xfs_trans_fill_vecs(xfs_trans_t *, xfs_log_iovec_t *); -STATIC void xfs_trans_uncommit(xfs_trans_t *, uint); -STATIC void xfs_trans_committed(xfs_trans_t *, int); -STATIC void xfs_trans_chunk_committed(xfs_log_item_chunk_t *, xfs_lsn_t, int); -STATIC void xfs_trans_free(xfs_trans_t *); +#include "xfs_trace.h" kmem_zone_t *xfs_trans_zone; /* - * Reservation functions here avoid a huge stack in xfs_trans_init - * due to register overflow from temporaries in the calculations. + * Various log reservation values. + * + * These are based on the size of the file system block because that is what + * most transactions manipulate. Each adds in an additional 128 bytes per + * item logged to try to account for the overhead of the transaction mechanism. + * + * Note: Most of the reservations underestimate the number of allocation + * groups into which they could free extents in the xfs_bmap_finish() call. + * This is because the number in the worst case is quite high and quite + * unusual. In order to fix this we need to change xfs_bmap_finish() to free + * extents in only a single AG at a time. This will require changes to the + * EFI code as well, however, so that the EFI for the extents not freed is + * logged again in each transaction. See SGI PV #261917. + * + * Reservation functions here avoid a huge stack in xfs_trans_init due to + * register overflow from temporaries in the calculations. */ + +/* + * In a write transaction we can allocate a maximum of 2 + * extents. This gives: + * the inode getting the new extents: inode size + * the inode's bmap btree: max depth * block size + * the agfs of the ags from which the extents are allocated: 2 * sector + * the superblock free block counter: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + * And the bmap_finish transaction can free bmap blocks in a join: + * the agfs of the ags containing the blocks: 2 * sector size + * the agfls of the ags containing the blocks: 2 * sector size + * the super block free block counter: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + */ STATIC uint -xfs_calc_write_reservation(xfs_mount_t *mp) +xfs_calc_write_reservation( + struct xfs_mount *mp) { - return XFS_CALC_WRITE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); + return XFS_DQUOT_LOGRES(mp) + + MAX((mp->m_sb.sb_inodesize + + XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + + 2 * mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_ALLOCFREE_LOG_RES(mp, 2) + + 128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + + XFS_ALLOCFREE_LOG_COUNT(mp, 2))), + (2 * mp->m_sb.sb_sectsize + + 2 * mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_ALLOCFREE_LOG_RES(mp, 2) + + 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))); } +/* + * In truncating a file we free up to two extents at once. We can modify: + * the inode being truncated: inode size + * the inode's bmap btree: (max depth + 1) * block size + * And the bmap_finish transaction can free the blocks and bmap blocks: + * the agf for each of the ags: 4 * sector size + * the agfl for each of the ags: 4 * sector size + * the super block to reflect the freed blocks: sector size + * worst case split in allocation btrees per extent assuming 4 extents: + * 4 exts * 2 trees * (2 * max depth - 1) * block size + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (max depth - 1) * block size + */ STATIC uint -xfs_calc_itruncate_reservation(xfs_mount_t *mp) +xfs_calc_itruncate_reservation( + struct xfs_mount *mp) { - return XFS_CALC_ITRUNCATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); + return XFS_DQUOT_LOGRES(mp) + + MAX((mp->m_sb.sb_inodesize + + XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) + + 128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))), + (4 * mp->m_sb.sb_sectsize + + 4 * mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_ALLOCFREE_LOG_RES(mp, 4) + + 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)) + + 128 * 5 + + XFS_ALLOCFREE_LOG_RES(mp, 1) + + 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); } +/* + * In renaming a files we can modify: + * the four inodes involved: 4 * inode size + * the two directory btrees: 2 * (max depth + v2) * dir block size + * the two directory bmap btrees: 2 * max depth * block size + * And the bmap_finish transaction can free dir and bmap blocks (two sets + * of bmap blocks) giving: + * the agf for the ags in which the blocks live: 3 * sector size + * the agfl for the ags in which the blocks live: 3 * sector size + * the superblock for the free block count: sector size + * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size + */ STATIC uint -xfs_calc_rename_reservation(xfs_mount_t *mp) +xfs_calc_rename_reservation( + struct xfs_mount *mp) { - return XFS_CALC_RENAME_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); + return XFS_DQUOT_LOGRES(mp) + + MAX((4 * mp->m_sb.sb_inodesize + + 2 * XFS_DIROP_LOG_RES(mp) + + 128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp))), + (3 * mp->m_sb.sb_sectsize + + 3 * mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_ALLOCFREE_LOG_RES(mp, 3) + + 128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3)))); } +/* + * For creating a link to an inode: + * the parent directory inode: inode size + * the linked inode: inode size + * the directory btree could split: (max depth + v2) * dir block size + * the directory bmap btree could join or split: (max depth + v2) * blocksize + * And the bmap_finish transaction can free some bmap blocks giving: + * the agf for the ag in which the blocks live: sector size + * the agfl for the ag in which the blocks live: sector size + * the superblock for the free block count: sector size + * the allocation btrees: 2 trees * (2 * max depth - 1) * block size + */ STATIC uint -xfs_calc_link_reservation(xfs_mount_t *mp) +xfs_calc_link_reservation( + struct xfs_mount *mp) { - return XFS_CALC_LINK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); + return XFS_DQUOT_LOGRES(mp) + + MAX((mp->m_sb.sb_inodesize + + mp->m_sb.sb_inodesize + + XFS_DIROP_LOG_RES(mp) + + 128 * (2 + XFS_DIROP_LOG_COUNT(mp))), + (mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_ALLOCFREE_LOG_RES(mp, 1) + + 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); } +/* + * For removing a directory entry we can modify: + * the parent directory inode: inode size + * the removed inode: inode size + * the directory btree could join: (max depth + v2) * dir block size + * the directory bmap btree could join or split: (max depth + v2) * blocksize + * And the bmap_finish transaction can free the dir and bmap blocks giving: + * the agf for the ag in which the blocks live: 2 * sector size + * the agfl for the ag in which the blocks live: 2 * sector size + * the superblock for the free block count: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + */ STATIC uint -xfs_calc_remove_reservation(xfs_mount_t *mp) +xfs_calc_remove_reservation( + struct xfs_mount *mp) { - return XFS_CALC_REMOVE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); + return XFS_DQUOT_LOGRES(mp) + + MAX((mp->m_sb.sb_inodesize + + mp->m_sb.sb_inodesize + + XFS_DIROP_LOG_RES(mp) + + 128 * (2 + XFS_DIROP_LOG_COUNT(mp))), + (2 * mp->m_sb.sb_sectsize + + 2 * mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_ALLOCFREE_LOG_RES(mp, 2) + + 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))); } +/* + * For symlink we can modify: + * the parent directory inode: inode size + * the new inode: inode size + * the inode btree entry: 1 block + * the directory btree: (max depth + v2) * dir block size + * the directory inode's bmap btree: (max depth + v2) * block size + * the blocks for the symlink: 1 kB + * Or in the first xact we allocate some inodes giving: + * the agi and agf of the ag getting the new inodes: 2 * sectorsize + * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (2 * max depth - 1) * block size + */ STATIC uint -xfs_calc_symlink_reservation(xfs_mount_t *mp) +xfs_calc_symlink_reservation( + struct xfs_mount *mp) { - return XFS_CALC_SYMLINK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); + return XFS_DQUOT_LOGRES(mp) + + MAX((mp->m_sb.sb_inodesize + + mp->m_sb.sb_inodesize + + XFS_FSB_TO_B(mp, 1) + + XFS_DIROP_LOG_RES(mp) + + 1024 + + 128 * (4 + XFS_DIROP_LOG_COUNT(mp))), + (2 * mp->m_sb.sb_sectsize + + XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) + + XFS_FSB_TO_B(mp, mp->m_in_maxlevels) + + XFS_ALLOCFREE_LOG_RES(mp, 1) + + 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); } +/* + * For create we can modify: + * the parent directory inode: inode size + * the new inode: inode size + * the inode btree entry: block size + * the superblock for the nlink flag: sector size + * the directory btree: (max depth + v2) * dir block size + * the directory inode's bmap btree: (max depth + v2) * block size + * Or in the first xact we allocate some inodes giving: + * the agi and agf of the ag getting the new inodes: 2 * sectorsize + * the superblock for the nlink flag: sector size + * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (max depth - 1) * block size + */ STATIC uint -xfs_calc_create_reservation(xfs_mount_t *mp) +xfs_calc_create_reservation( + struct xfs_mount *mp) { - return XFS_CALC_CREATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); + return XFS_DQUOT_LOGRES(mp) + + MAX((mp->m_sb.sb_inodesize + + mp->m_sb.sb_inodesize + + mp->m_sb.sb_sectsize + + XFS_FSB_TO_B(mp, 1) + + XFS_DIROP_LOG_RES(mp) + + 128 * (3 + XFS_DIROP_LOG_COUNT(mp))), + (3 * mp->m_sb.sb_sectsize + + XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) + + XFS_FSB_TO_B(mp, mp->m_in_maxlevels) + + XFS_ALLOCFREE_LOG_RES(mp, 1) + + 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); } +/* + * Making a new directory is the same as creating a new file. + */ STATIC uint -xfs_calc_mkdir_reservation(xfs_mount_t *mp) +xfs_calc_mkdir_reservation( + struct xfs_mount *mp) { - return XFS_CALC_MKDIR_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); + return xfs_calc_create_reservation(mp); } +/* + * In freeing an inode we can modify: + * the inode being freed: inode size + * the super block free inode counter: sector size + * the agi hash list and counters: sector size + * the inode btree entry: block size + * the on disk inode before ours in the agi hash list: inode cluster size + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (max depth - 1) * block size + */ STATIC uint -xfs_calc_ifree_reservation(xfs_mount_t *mp) +xfs_calc_ifree_reservation( + struct xfs_mount *mp) { - return XFS_CALC_IFREE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); + return XFS_DQUOT_LOGRES(mp) + + mp->m_sb.sb_inodesize + + mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_FSB_TO_B(mp, 1) + + MAX((__uint16_t)XFS_FSB_TO_B(mp, 1), + XFS_INODE_CLUSTER_SIZE(mp)) + + 128 * 5 + + XFS_ALLOCFREE_LOG_RES(mp, 1) + + 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + + XFS_ALLOCFREE_LOG_COUNT(mp, 1)); } +/* + * When only changing the inode we log the inode and possibly the superblock + * We also add a bit of slop for the transaction stuff. + */ STATIC uint -xfs_calc_ichange_reservation(xfs_mount_t *mp) +xfs_calc_ichange_reservation( + struct xfs_mount *mp) { - return XFS_CALC_ICHANGE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); + return XFS_DQUOT_LOGRES(mp) + + mp->m_sb.sb_inodesize + + mp->m_sb.sb_sectsize + + 512; + } +/* + * Growing the data section of the filesystem. + * superblock + * agi and agf + * allocation btrees + */ STATIC uint -xfs_calc_growdata_reservation(xfs_mount_t *mp) +xfs_calc_growdata_reservation( + struct xfs_mount *mp) { - return XFS_CALC_GROWDATA_LOG_RES(mp); + return mp->m_sb.sb_sectsize * 3 + + XFS_ALLOCFREE_LOG_RES(mp, 1) + + 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1)); } +/* + * Growing the rt section of the filesystem. + * In the first set of transactions (ALLOC) we allocate space to the + * bitmap or summary files. + * superblock: sector size + * agf of the ag from which the extent is allocated: sector size + * bmap btree for bitmap/summary inode: max depth * blocksize + * bitmap/summary inode: inode size + * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize + */ STATIC uint -xfs_calc_growrtalloc_reservation(xfs_mount_t *mp) +xfs_calc_growrtalloc_reservation( + struct xfs_mount *mp) { - return XFS_CALC_GROWRTALLOC_LOG_RES(mp); + return 2 * mp->m_sb.sb_sectsize + + XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + + mp->m_sb.sb_inodesize + + XFS_ALLOCFREE_LOG_RES(mp, 1) + + 128 * (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + + XFS_ALLOCFREE_LOG_COUNT(mp, 1)); } +/* + * Growing the rt section of the filesystem. + * In the second set of transactions (ZERO) we zero the new metadata blocks. + * one bitmap/summary block: blocksize + */ STATIC uint -xfs_calc_growrtzero_reservation(xfs_mount_t *mp) +xfs_calc_growrtzero_reservation( + struct xfs_mount *mp) { - return XFS_CALC_GROWRTZERO_LOG_RES(mp); + return mp->m_sb.sb_blocksize + 128; } +/* + * Growing the rt section of the filesystem. + * In the third set of transactions (FREE) we update metadata without + * allocating any new blocks. + * superblock: sector size + * bitmap inode: inode size + * summary inode: inode size + * one bitmap block: blocksize + * summary blocks: new summary size + */ STATIC uint -xfs_calc_growrtfree_reservation(xfs_mount_t *mp) +xfs_calc_growrtfree_reservation( + struct xfs_mount *mp) { - return XFS_CALC_GROWRTFREE_LOG_RES(mp); + return mp->m_sb.sb_sectsize + + 2 * mp->m_sb.sb_inodesize + + mp->m_sb.sb_blocksize + + mp->m_rsumsize + + 128 * 5; } +/* + * Logging the inode modification timestamp on a synchronous write. + * inode + */ STATIC uint -xfs_calc_swrite_reservation(xfs_mount_t *mp) +xfs_calc_swrite_reservation( + struct xfs_mount *mp) { - return XFS_CALC_SWRITE_LOG_RES(mp); + return mp->m_sb.sb_inodesize + 128; } +/* + * Logging the inode mode bits when writing a setuid/setgid file + * inode + */ STATIC uint xfs_calc_writeid_reservation(xfs_mount_t *mp) { - return XFS_CALC_WRITEID_LOG_RES(mp); + return mp->m_sb.sb_inodesize + 128; } +/* + * Converting the inode from non-attributed to attributed. + * the inode being converted: inode size + * agf block and superblock (for block allocation) + * the new block (directory sized) + * bmap blocks for the new directory block + * allocation btrees + */ STATIC uint -xfs_calc_addafork_reservation(xfs_mount_t *mp) +xfs_calc_addafork_reservation( + struct xfs_mount *mp) { - return XFS_CALC_ADDAFORK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); + return XFS_DQUOT_LOGRES(mp) + + mp->m_sb.sb_inodesize + + mp->m_sb.sb_sectsize * 2 + + mp->m_dirblksize + + XFS_FSB_TO_B(mp, XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + + XFS_ALLOCFREE_LOG_RES(mp, 1) + + 128 * (4 + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1 + + XFS_ALLOCFREE_LOG_COUNT(mp, 1)); } +/* + * Removing the attribute fork of a file + * the inode being truncated: inode size + * the inode's bmap btree: max depth * block size + * And the bmap_finish transaction can free the blocks and bmap blocks: + * the agf for each of the ags: 4 * sector size + * the agfl for each of the ags: 4 * sector size + * the super block to reflect the freed blocks: sector size + * worst case split in allocation btrees per extent assuming 4 extents: + * 4 exts * 2 trees * (2 * max depth - 1) * block size + */ STATIC uint -xfs_calc_attrinval_reservation(xfs_mount_t *mp) +xfs_calc_attrinval_reservation( + struct xfs_mount *mp) { - return XFS_CALC_ATTRINVAL_LOG_RES(mp); + return MAX((mp->m_sb.sb_inodesize + + XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + + 128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))), + (4 * mp->m_sb.sb_sectsize + + 4 * mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_ALLOCFREE_LOG_RES(mp, 4) + + 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)))); } +/* + * Setting an attribute. + * the inode getting the attribute + * the superblock for allocations + * the agfs extents are allocated from + * the attribute btree * max depth + * the inode allocation btree + * Since attribute transaction space is dependent on the size of the attribute, + * the calculation is done partially at mount time and partially at runtime. + */ STATIC uint -xfs_calc_attrset_reservation(xfs_mount_t *mp) +xfs_calc_attrset_reservation( + struct xfs_mount *mp) { - return XFS_CALC_ATTRSET_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); + return XFS_DQUOT_LOGRES(mp) + + mp->m_sb.sb_inodesize + + mp->m_sb.sb_sectsize + + XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) + + 128 * (2 + XFS_DA_NODE_MAXDEPTH); } +/* + * Removing an attribute. + * the inode: inode size + * the attribute btree could join: max depth * block size + * the inode bmap btree could join or split: max depth * block size + * And the bmap_finish transaction can free the attr blocks freed giving: + * the agf for the ag in which the blocks live: 2 * sector size + * the agfl for the ag in which the blocks live: 2 * sector size + * the superblock for the free block count: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + */ STATIC uint -xfs_calc_attrrm_reservation(xfs_mount_t *mp) +xfs_calc_attrrm_reservation( + struct xfs_mount *mp) { - return XFS_CALC_ATTRRM_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); + return XFS_DQUOT_LOGRES(mp) + + MAX((mp->m_sb.sb_inodesize + + XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) + + XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + + 128 * (1 + XFS_DA_NODE_MAXDEPTH + + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))), + (2 * mp->m_sb.sb_sectsize + + 2 * mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_ALLOCFREE_LOG_RES(mp, 2) + + 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))); } +/* + * Clearing a bad agino number in an agi hash bucket. + */ STATIC uint -xfs_calc_clear_agi_bucket_reservation(xfs_mount_t *mp) +xfs_calc_clear_agi_bucket_reservation( + struct xfs_mount *mp) { - return XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp); + return mp->m_sb.sb_sectsize + 128; } /* @@ -194,11 +539,10 @@ xfs_calc_clear_agi_bucket_reservation(xfs_mount_t *mp) */ void xfs_trans_init( - xfs_mount_t *mp) + struct xfs_mount *mp) { - xfs_trans_reservations_t *resp; + struct xfs_trans_reservations *resp = &mp->m_reservations; - resp = &(mp->m_reservations); resp->tr_write = xfs_calc_write_reservation(mp); resp->tr_itruncate = xfs_calc_itruncate_reservation(mp); resp->tr_rename = xfs_calc_rename_reservation(mp); @@ -254,13 +598,30 @@ _xfs_trans_alloc( tp->t_type = type; tp->t_mountp = mp; tp->t_items_free = XFS_LIC_NUM_SLOTS; - tp->t_busy_free = XFS_LBC_NUM_SLOTS; xfs_lic_init(&(tp->t_items)); - XFS_LBC_INIT(&(tp->t_busy)); + INIT_LIST_HEAD(&tp->t_busy); return tp; } /* + * Free the transaction structure. If there is more clean up + * to do when the structure is freed, add it here. + */ +STATIC void +xfs_trans_free( + struct xfs_trans *tp) +{ + struct xfs_busy_extent *busyp, *n; + + list_for_each_entry_safe(busyp, n, &tp->t_busy, list) + xfs_alloc_busy_clear(tp->t_mountp, busyp); + + atomic_dec(&tp->t_mountp->m_active_trans); + xfs_trans_free_dqinfo(tp); + kmem_zone_free(xfs_trans_zone, tp); +} + +/* * This is called to create a new transaction which will share the * permanent log reservation of the given transaction. The remaining * unused block and rt extent reservations are also inherited. This @@ -283,9 +644,8 @@ xfs_trans_dup( ntp->t_type = tp->t_type; ntp->t_mountp = tp->t_mountp; ntp->t_items_free = XFS_LIC_NUM_SLOTS; - ntp->t_busy_free = XFS_LBC_NUM_SLOTS; xfs_lic_init(&(ntp->t_items)); - XFS_LBC_INIT(&(ntp->t_busy)); + INIT_LIST_HEAD(&ntp->t_busy); ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(tp->t_ticket != NULL); @@ -421,7 +781,6 @@ undo_blocks: return error; } - /* * Record the indicated change to the given field for application * to the file system's superblock when the transaction commits. @@ -650,7 +1009,7 @@ xfs_trans_apply_sb_deltas( * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we * still need to update the incore superblock with the changes. */ -STATIC void +void xfs_trans_unreserve_and_mod_sb( xfs_trans_t *tp) { @@ -764,94 +1123,256 @@ xfs_trans_unreserve_and_mod_sb( } } +/* + * Total up the number of log iovecs needed to commit this + * transaction. The transaction itself needs one for the + * transaction header. Ask each dirty item in turn how many + * it needs to get the total. + */ +static uint +xfs_trans_count_vecs( + struct xfs_trans *tp) +{ + int nvecs; + xfs_log_item_desc_t *lidp; + + nvecs = 1; + lidp = xfs_trans_first_item(tp); + ASSERT(lidp != NULL); + + /* In the non-debug case we need to start bailing out if we + * didn't find a log_item here, return zero and let trans_commit + * deal with it. + */ + if (lidp == NULL) + return 0; + + while (lidp != NULL) { + /* + * Skip items which aren't dirty in this transaction. + */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) { + lidp = xfs_trans_next_item(tp, lidp); + continue; + } + lidp->lid_size = IOP_SIZE(lidp->lid_item); + nvecs += lidp->lid_size; + lidp = xfs_trans_next_item(tp, lidp); + } + + return nvecs; +} /* - * xfs_trans_commit + * Fill in the vector with pointers to data to be logged + * by this transaction. The transaction header takes + * the first vector, and then each dirty item takes the + * number of vectors it indicated it needed in xfs_trans_count_vecs(). * - * Commit the given transaction to the log a/synchronously. + * As each item fills in the entries it needs, also pin the item + * so that it cannot be flushed out until the log write completes. + */ +static void +xfs_trans_fill_vecs( + struct xfs_trans *tp, + struct xfs_log_iovec *log_vector) +{ + xfs_log_item_desc_t *lidp; + struct xfs_log_iovec *vecp; + uint nitems; + + /* + * Skip over the entry for the transaction header, we'll + * fill that in at the end. + */ + vecp = log_vector + 1; + + nitems = 0; + lidp = xfs_trans_first_item(tp); + ASSERT(lidp); + while (lidp) { + /* Skip items which aren't dirty in this transaction. */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) { + lidp = xfs_trans_next_item(tp, lidp); + continue; + } + + /* + * The item may be marked dirty but not log anything. This can + * be used to get called when a transaction is committed. + */ + if (lidp->lid_size) + nitems++; + IOP_FORMAT(lidp->lid_item, vecp); + vecp += lidp->lid_size; + IOP_PIN(lidp->lid_item); + lidp = xfs_trans_next_item(tp, lidp); + } + + /* + * Now that we've counted the number of items in this transaction, fill + * in the transaction header. Note that the transaction header does not + * have a log item. + */ + tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC; + tp->t_header.th_type = tp->t_type; + tp->t_header.th_num_items = nitems; + log_vector->i_addr = (xfs_caddr_t)&tp->t_header; + log_vector->i_len = sizeof(xfs_trans_header_t); + log_vector->i_type = XLOG_REG_TYPE_TRANSHDR; +} + +/* + * The committed item processing consists of calling the committed routine of + * each logged item, updating the item's position in the AIL if necessary, and + * unpinning each item. If the committed routine returns -1, then do nothing + * further with the item because it may have been freed. * - * XFS disk error handling mechanism is not based on a typical - * transaction abort mechanism. Logically after the filesystem - * gets marked 'SHUTDOWN', we can't let any new transactions - * be durable - ie. committed to disk - because some metadata might - * be inconsistent. In such cases, this returns an error, and the - * caller may assume that all locked objects joined to the transaction - * have already been unlocked as if the commit had succeeded. - * Do not reference the transaction structure after this call. + * Since items are unlocked when they are copied to the incore log, it is + * possible for two transactions to be completing and manipulating the same + * item simultaneously. The AIL lock will protect the lsn field of each item. + * The value of this field can never go backwards. + * + * We unpin the items after repositioning them in the AIL, because otherwise + * they could be immediately flushed and we'd have to race with the flusher + * trying to pull the item from the AIL as we add it. */ - /*ARGSUSED*/ -int -_xfs_trans_commit( - xfs_trans_t *tp, - uint flags, - int *log_flushed) +void +xfs_trans_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t commit_lsn, + int aborted) { - xfs_log_iovec_t *log_vector; - int nvec; - xfs_mount_t *mp; - xfs_lsn_t commit_lsn; - /* REFERENCED */ - int error; - int log_flags; - int sync; -#define XFS_TRANS_LOGVEC_COUNT 16 - xfs_log_iovec_t log_vector_fast[XFS_TRANS_LOGVEC_COUNT]; - void *commit_iclog; - int shutdown; + xfs_lsn_t item_lsn; + struct xfs_ail *ailp; + + if (aborted) + lip->li_flags |= XFS_LI_ABORTED; + item_lsn = IOP_COMMITTED(lip, commit_lsn); - commit_lsn = -1; + /* If the committed routine returns -1, item has been freed. */ + if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) + return; /* - * Determine whether this commit is releasing a permanent - * log reservation or not. + * If the returned lsn is greater than what it contained before, update + * the location of the item in the AIL. If it is not, then do nothing. + * Items can never move backwards in the AIL. + * + * While the new lsn should usually be greater, it is possible that a + * later transaction completing simultaneously with an earlier one + * using the same item could complete first with a higher lsn. This + * would cause the earlier transaction to fail the test below. */ - if (flags & XFS_TRANS_RELEASE_LOG_RES) { - ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); - log_flags = XFS_LOG_REL_PERM_RESERV; + ailp = lip->li_ailp; + spin_lock(&ailp->xa_lock); + if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) { + /* + * This will set the item's lsn to item_lsn and update the + * position of the item in the AIL. + * + * xfs_trans_ail_update() drops the AIL lock. + */ + xfs_trans_ail_update(ailp, lip, item_lsn); } else { - log_flags = 0; + spin_unlock(&ailp->xa_lock); } - mp = tp->t_mountp; /* - * If there is nothing to be logged by the transaction, - * then unlock all of the items associated with the - * transaction and free the transaction structure. - * Also make sure to return any reserved blocks to - * the free pool. + * Now that we've repositioned the item in the AIL, unpin it so it can + * be flushed. Pass information about buffer stale state down from the + * log item flags, if anyone else stales the buffer we do not want to + * pay any attention to it. */ -shut_us_down: - shutdown = XFS_FORCED_SHUTDOWN(mp) ? EIO : 0; - if (!(tp->t_flags & XFS_TRANS_DIRTY) || shutdown) { - xfs_trans_unreserve_and_mod_sb(tp); + IOP_UNPIN(lip); +} + +/* + * This is typically called by the LM when a transaction has been fully + * committed to disk. It needs to unpin the items which have + * been logged by the transaction and update their positions + * in the AIL if necessary. + * + * This also gets called when the transactions didn't get written out + * because of an I/O error. Abortflag & XFS_LI_ABORTED is set then. + */ +STATIC void +xfs_trans_committed( + struct xfs_trans *tp, + int abortflag) +{ + xfs_log_item_desc_t *lidp; + xfs_log_item_chunk_t *licp; + xfs_log_item_chunk_t *next_licp; + + /* Call the transaction's completion callback if there is one. */ + if (tp->t_callback != NULL) + tp->t_callback(tp, tp->t_callarg); + + for (lidp = xfs_trans_first_item(tp); + lidp != NULL; + lidp = xfs_trans_next_item(tp, lidp)) { + xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag); + } + + /* free the item chunks, ignoring the embedded chunk */ + for (licp = tp->t_items.lic_next; licp != NULL; licp = next_licp) { + next_licp = licp->lic_next; + kmem_free(licp); + } + + xfs_trans_free(tp); +} + +/* + * Called from the trans_commit code when we notice that + * the filesystem is in the middle of a forced shutdown. + */ +STATIC void +xfs_trans_uncommit( + struct xfs_trans *tp, + uint flags) +{ + xfs_log_item_desc_t *lidp; + + for (lidp = xfs_trans_first_item(tp); + lidp != NULL; + lidp = xfs_trans_next_item(tp, lidp)) { /* - * It is indeed possible for the transaction to be - * not dirty but the dqinfo portion to be. All that - * means is that we have some (non-persistent) quota - * reservations that need to be unreserved. + * Unpin all but those that aren't dirty. */ - xfs_trans_unreserve_and_mod_dquots(tp); - if (tp->t_ticket) { - commit_lsn = xfs_log_done(mp, tp->t_ticket, - NULL, log_flags); - if (commit_lsn == -1 && !shutdown) - shutdown = XFS_ERROR(EIO); - } - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - xfs_trans_free_items(tp, shutdown? XFS_TRANS_ABORT : 0); - xfs_trans_free_busy(tp); - xfs_trans_free(tp); - XFS_STATS_INC(xs_trans_empty); - return (shutdown); + if (lidp->lid_flags & XFS_LID_DIRTY) + IOP_UNPIN_REMOVE(lidp->lid_item, tp); } - ASSERT(tp->t_ticket != NULL); - /* - * If we need to update the superblock, then do it now. - */ - if (tp->t_flags & XFS_TRANS_SB_DIRTY) - xfs_trans_apply_sb_deltas(tp); - xfs_trans_apply_dquot_deltas(tp); + xfs_trans_unreserve_and_mod_sb(tp); + xfs_trans_unreserve_and_mod_dquots(tp); + + xfs_trans_free_items(tp, NULLCOMMITLSN, flags); + xfs_trans_free(tp); +} + +/* + * Format the transaction direct to the iclog. This isolates the physical + * transaction commit operation from the logical operation and hence allows + * other methods to be introduced without affecting the existing commit path. + */ +static int +xfs_trans_commit_iclog( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_lsn_t *commit_lsn, + int flags) +{ + int shutdown; + int error; + int log_flags = 0; + struct xlog_in_core *commit_iclog; +#define XFS_TRANS_LOGVEC_COUNT 16 + struct xfs_log_iovec log_vector_fast[XFS_TRANS_LOGVEC_COUNT]; + struct xfs_log_iovec *log_vector; + uint nvec; + /* * Ask each log item how many log_vector entries it will @@ -861,8 +1382,7 @@ shut_us_down: */ nvec = xfs_trans_count_vecs(tp); if (nvec == 0) { - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); - goto shut_us_down; + return ENOMEM; /* triggers a shutdown! */ } else if (nvec <= XFS_TRANS_LOGVEC_COUNT) { log_vector = log_vector_fast; } else { @@ -877,6 +1397,9 @@ shut_us_down: */ xfs_trans_fill_vecs(tp, log_vector); + if (flags & XFS_TRANS_RELEASE_LOG_RES) + log_flags = XFS_LOG_REL_PERM_RESERV; + error = xfs_log_write(mp, log_vector, nvec, tp->t_ticket, &(tp->t_lsn)); /* @@ -884,18 +1407,19 @@ shut_us_down: * at any time after this call. However, all the items associated * with the transaction are still locked and pinned in memory. */ - commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags); + *commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags); - tp->t_commit_lsn = commit_lsn; - if (nvec > XFS_TRANS_LOGVEC_COUNT) { + tp->t_commit_lsn = *commit_lsn; + trace_xfs_trans_commit_lsn(tp); + + if (nvec > XFS_TRANS_LOGVEC_COUNT) kmem_free(log_vector); - } /* * If we got a log write error. Unpin the logitems that we * had pinned, clean up, free trans structure, and return error. */ - if (error || commit_lsn == -1) { + if (error || *commit_lsn == -1) { current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT); return XFS_ERROR(EIO); @@ -909,8 +1433,6 @@ shut_us_down: */ xfs_trans_unreserve_and_mod_sb(tp); - sync = tp->t_flags & XFS_TRANS_SYNC; - /* * Tell the LM to call the transaction completion routine * when the log write with LSN commit_lsn completes (e.g. @@ -953,7 +1475,7 @@ shut_us_down: * the commit lsn of this transaction for dependency tracking * purposes. */ - xfs_trans_unlock_items(tp, commit_lsn); + xfs_trans_unlock_items(tp, *commit_lsn); /* * If we detected a log error earlier, finish committing @@ -973,157 +1495,204 @@ shut_us_down: * and the items are released we can finally allow the iclog to * go to disk. */ - error = xfs_log_release_iclog(mp, commit_iclog); - - /* - * If the transaction needs to be synchronous, then force the - * log out now and wait for it. - */ - if (sync) { - if (!error) { - error = _xfs_log_force(mp, commit_lsn, - XFS_LOG_FORCE | XFS_LOG_SYNC, - log_flushed); - } - XFS_STATS_INC(xs_trans_sync); - } else { - XFS_STATS_INC(xs_trans_async); - } - - return (error); + return xfs_log_release_iclog(mp, commit_iclog); } - /* - * Total up the number of log iovecs needed to commit this - * transaction. The transaction itself needs one for the - * transaction header. Ask each dirty item in turn how many - * it needs to get the total. + * Walk the log items and allocate log vector structures for + * each item large enough to fit all the vectors they require. + * Note that this format differs from the old log vector format in + * that there is no transaction header in these log vectors. */ -STATIC uint -xfs_trans_count_vecs( +STATIC struct xfs_log_vec * +xfs_trans_alloc_log_vecs( xfs_trans_t *tp) { - int nvecs; xfs_log_item_desc_t *lidp; + struct xfs_log_vec *lv = NULL; + struct xfs_log_vec *ret_lv = NULL; - nvecs = 1; lidp = xfs_trans_first_item(tp); - ASSERT(lidp != NULL); - /* In the non-debug case we need to start bailing out if we - * didn't find a log_item here, return zero and let trans_commit - * deal with it. - */ - if (lidp == NULL) - return 0; + /* Bail out if we didn't find a log item. */ + if (!lidp) { + ASSERT(0); + return NULL; + } while (lidp != NULL) { - /* - * Skip items which aren't dirty in this transaction. - */ + struct xfs_log_vec *new_lv; + + /* Skip items which aren't dirty in this transaction. */ if (!(lidp->lid_flags & XFS_LID_DIRTY)) { lidp = xfs_trans_next_item(tp, lidp); continue; } + + /* Skip items that do not have any vectors for writing */ lidp->lid_size = IOP_SIZE(lidp->lid_item); - nvecs += lidp->lid_size; + if (!lidp->lid_size) { + lidp = xfs_trans_next_item(tp, lidp); + continue; + } + + new_lv = kmem_zalloc(sizeof(*new_lv) + + lidp->lid_size * sizeof(struct xfs_log_iovec), + KM_SLEEP); + + /* The allocated iovec region lies beyond the log vector. */ + new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1]; + new_lv->lv_niovecs = lidp->lid_size; + new_lv->lv_item = lidp->lid_item; + if (!ret_lv) + ret_lv = new_lv; + else + lv->lv_next = new_lv; + lv = new_lv; lidp = xfs_trans_next_item(tp, lidp); } - return nvecs; + return ret_lv; } -/* - * Called from the trans_commit code when we notice that - * the filesystem is in the middle of a forced shutdown. - */ -STATIC void -xfs_trans_uncommit( - xfs_trans_t *tp, - uint flags) +static int +xfs_trans_commit_cil( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_lsn_t *commit_lsn, + int flags) { - xfs_log_item_desc_t *lidp; + struct xfs_log_vec *log_vector; + int error; - for (lidp = xfs_trans_first_item(tp); - lidp != NULL; - lidp = xfs_trans_next_item(tp, lidp)) { - /* - * Unpin all but those that aren't dirty. - */ - if (lidp->lid_flags & XFS_LID_DIRTY) - IOP_UNPIN_REMOVE(lidp->lid_item, tp); - } + /* + * Get each log item to allocate a vector structure for + * the log item to to pass to the log write code. The + * CIL commit code will format the vector and save it away. + */ + log_vector = xfs_trans_alloc_log_vecs(tp); + if (!log_vector) + return ENOMEM; - xfs_trans_unreserve_and_mod_sb(tp); - xfs_trans_unreserve_and_mod_dquots(tp); + error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags); + if (error) + return error; + + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - xfs_trans_free_items(tp, flags); - xfs_trans_free_busy(tp); + /* xfs_trans_free_items() unlocks them first */ + xfs_trans_free_items(tp, *commit_lsn, 0); xfs_trans_free(tp); + return 0; } /* - * Fill in the vector with pointers to data to be logged - * by this transaction. The transaction header takes - * the first vector, and then each dirty item takes the - * number of vectors it indicated it needed in xfs_trans_count_vecs(). + * xfs_trans_commit * - * As each item fills in the entries it needs, also pin the item - * so that it cannot be flushed out until the log write completes. + * Commit the given transaction to the log a/synchronously. + * + * XFS disk error handling mechanism is not based on a typical + * transaction abort mechanism. Logically after the filesystem + * gets marked 'SHUTDOWN', we can't let any new transactions + * be durable - ie. committed to disk - because some metadata might + * be inconsistent. In such cases, this returns an error, and the + * caller may assume that all locked objects joined to the transaction + * have already been unlocked as if the commit had succeeded. + * Do not reference the transaction structure after this call. */ -STATIC void -xfs_trans_fill_vecs( - xfs_trans_t *tp, - xfs_log_iovec_t *log_vector) +int +_xfs_trans_commit( + struct xfs_trans *tp, + uint flags, + int *log_flushed) { - xfs_log_item_desc_t *lidp; - xfs_log_iovec_t *vecp; - uint nitems; + struct xfs_mount *mp = tp->t_mountp; + xfs_lsn_t commit_lsn = -1; + int error = 0; + int log_flags = 0; + int sync = tp->t_flags & XFS_TRANS_SYNC; /* - * Skip over the entry for the transaction header, we'll - * fill that in at the end. + * Determine whether this commit is releasing a permanent + * log reservation or not. */ - vecp = log_vector + 1; /* pointer arithmetic */ + if (flags & XFS_TRANS_RELEASE_LOG_RES) { + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + log_flags = XFS_LOG_REL_PERM_RESERV; + } - nitems = 0; - lidp = xfs_trans_first_item(tp); - ASSERT(lidp != NULL); - while (lidp != NULL) { - /* - * Skip items which aren't dirty in this transaction. - */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) { - lidp = xfs_trans_next_item(tp, lidp); - continue; - } - /* - * The item may be marked dirty but not log anything. - * This can be used to get called when a transaction - * is committed. - */ - if (lidp->lid_size) { - nitems++; + /* + * If there is nothing to be logged by the transaction, + * then unlock all of the items associated with the + * transaction and free the transaction structure. + * Also make sure to return any reserved blocks to + * the free pool. + */ + if (!(tp->t_flags & XFS_TRANS_DIRTY)) + goto out_unreserve; + + if (XFS_FORCED_SHUTDOWN(mp)) { + error = XFS_ERROR(EIO); + goto out_unreserve; + } + + ASSERT(tp->t_ticket != NULL); + + /* + * If we need to update the superblock, then do it now. + */ + if (tp->t_flags & XFS_TRANS_SB_DIRTY) + xfs_trans_apply_sb_deltas(tp); + xfs_trans_apply_dquot_deltas(tp); + + if (mp->m_flags & XFS_MOUNT_DELAYLOG) + error = xfs_trans_commit_cil(mp, tp, &commit_lsn, flags); + else + error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags); + + if (error == ENOMEM) { + xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); + error = XFS_ERROR(EIO); + goto out_unreserve; + } + + /* + * If the transaction needs to be synchronous, then force the + * log out now and wait for it. + */ + if (sync) { + if (!error) { + error = _xfs_log_force_lsn(mp, commit_lsn, + XFS_LOG_SYNC, log_flushed); } - IOP_FORMAT(lidp->lid_item, vecp); - vecp += lidp->lid_size; /* pointer arithmetic */ - IOP_PIN(lidp->lid_item); - lidp = xfs_trans_next_item(tp, lidp); + XFS_STATS_INC(xs_trans_sync); + } else { + XFS_STATS_INC(xs_trans_async); } + return error; + +out_unreserve: + xfs_trans_unreserve_and_mod_sb(tp); + /* - * Now that we've counted the number of items in this - * transaction, fill in the transaction header. + * It is indeed possible for the transaction to be not dirty but + * the dqinfo portion to be. All that means is that we have some + * (non-persistent) quota reservations that need to be unreserved. */ - tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC; - tp->t_header.th_type = tp->t_type; - tp->t_header.th_num_items = nitems; - log_vector->i_addr = (xfs_caddr_t)&tp->t_header; - log_vector->i_len = sizeof(xfs_trans_header_t); - XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_TRANSHDR); -} + xfs_trans_unreserve_and_mod_dquots(tp); + if (tp->t_ticket) { + commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags); + if (commit_lsn == -1 && !error) + error = XFS_ERROR(EIO); + } + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0); + xfs_trans_free(tp); + XFS_STATS_INC(xs_trans_empty); + return error; +} /* * Unlock all of the transaction's items and free the transaction. @@ -1196,25 +1765,10 @@ xfs_trans_cancel( /* mark this thread as no longer being in a transaction */ current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - xfs_trans_free_items(tp, flags); - xfs_trans_free_busy(tp); + xfs_trans_free_items(tp, NULLCOMMITLSN, flags); xfs_trans_free(tp); } - -/* - * Free the transaction structure. If there is more clean up - * to do when the structure is freed, add it here. - */ -STATIC void -xfs_trans_free( - xfs_trans_t *tp) -{ - atomic_dec(&tp->t_mountp->m_active_trans); - xfs_trans_free_dqinfo(tp); - kmem_zone_free(xfs_trans_zone, tp); -} - /* * Roll from one trans in the sequence of PERMANENT transactions to * the next: permanent transactions are only flushed out when @@ -1284,174 +1838,3 @@ xfs_trans_roll( xfs_trans_ihold(trans, dp); return 0; } - -/* - * THIS SHOULD BE REWRITTEN TO USE xfs_trans_next_item(). - * - * This is typically called by the LM when a transaction has been fully - * committed to disk. It needs to unpin the items which have - * been logged by the transaction and update their positions - * in the AIL if necessary. - * This also gets called when the transactions didn't get written out - * because of an I/O error. Abortflag & XFS_LI_ABORTED is set then. - * - * Call xfs_trans_chunk_committed() to process the items in - * each chunk. - */ -STATIC void -xfs_trans_committed( - xfs_trans_t *tp, - int abortflag) -{ - xfs_log_item_chunk_t *licp; - xfs_log_item_chunk_t *next_licp; - xfs_log_busy_chunk_t *lbcp; - xfs_log_busy_slot_t *lbsp; - int i; - - /* - * Call the transaction's completion callback if there - * is one. - */ - if (tp->t_callback != NULL) { - tp->t_callback(tp, tp->t_callarg); - } - - /* - * Special case the chunk embedded in the transaction. - */ - licp = &(tp->t_items); - if (!(xfs_lic_are_all_free(licp))) { - xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag); - } - - /* - * Process the items in each chunk in turn. - */ - licp = licp->lic_next; - while (licp != NULL) { - ASSERT(!xfs_lic_are_all_free(licp)); - xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag); - next_licp = licp->lic_next; - kmem_free(licp); - licp = next_licp; - } - - /* - * Clear all the per-AG busy list items listed in this transaction - */ - lbcp = &tp->t_busy; - while (lbcp != NULL) { - for (i = 0, lbsp = lbcp->lbc_busy; i < lbcp->lbc_unused; i++, lbsp++) { - if (!XFS_LBC_ISFREE(lbcp, i)) { - xfs_alloc_clear_busy(tp, lbsp->lbc_ag, - lbsp->lbc_idx); - } - } - lbcp = lbcp->lbc_next; - } - xfs_trans_free_busy(tp); - - /* - * That's it for the transaction structure. Free it. - */ - xfs_trans_free(tp); -} - -/* - * This is called to perform the commit processing for each - * item described by the given chunk. - * - * The commit processing consists of unlocking items which were - * held locked with the SYNC_UNLOCK attribute, calling the committed - * routine of each logged item, updating the item's position in the AIL - * if necessary, and unpinning each item. If the committed routine - * returns -1, then do nothing further with the item because it - * may have been freed. - * - * Since items are unlocked when they are copied to the incore - * log, it is possible for two transactions to be completing - * and manipulating the same item simultaneously. The AIL lock - * will protect the lsn field of each item. The value of this - * field can never go backwards. - * - * We unpin the items after repositioning them in the AIL, because - * otherwise they could be immediately flushed and we'd have to race - * with the flusher trying to pull the item from the AIL as we add it. - */ -STATIC void -xfs_trans_chunk_committed( - xfs_log_item_chunk_t *licp, - xfs_lsn_t lsn, - int aborted) -{ - xfs_log_item_desc_t *lidp; - xfs_log_item_t *lip; - xfs_lsn_t item_lsn; - int i; - - lidp = licp->lic_descs; - for (i = 0; i < licp->lic_unused; i++, lidp++) { - struct xfs_ail *ailp; - - if (xfs_lic_isfree(licp, i)) { - continue; - } - - lip = lidp->lid_item; - if (aborted) - lip->li_flags |= XFS_LI_ABORTED; - - /* - * Send in the ABORTED flag to the COMMITTED routine - * so that it knows whether the transaction was aborted - * or not. - */ - item_lsn = IOP_COMMITTED(lip, lsn); - - /* - * If the committed routine returns -1, make - * no more references to the item. - */ - if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) { - continue; - } - - /* - * If the returned lsn is greater than what it - * contained before, update the location of the - * item in the AIL. If it is not, then do nothing. - * Items can never move backwards in the AIL. - * - * While the new lsn should usually be greater, it - * is possible that a later transaction completing - * simultaneously with an earlier one using the - * same item could complete first with a higher lsn. - * This would cause the earlier transaction to fail - * the test below. - */ - ailp = lip->li_ailp; - spin_lock(&ailp->xa_lock); - if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) { - /* - * This will set the item's lsn to item_lsn - * and update the position of the item in - * the AIL. - * - * xfs_trans_ail_update() drops the AIL lock. - */ - xfs_trans_ail_update(ailp, lip, item_lsn); - } else { - spin_unlock(&ailp->xa_lock); - } - - /* - * Now that we've repositioned the item in the AIL, - * unpin it so it can be flushed. Pass information - * about buffer stale state down from the log item - * flags, if anyone else stales the buffer we do not - * want to pay any attention to it. - */ - IOP_UNPIN(lip, lidp->lid_flags & XFS_LID_BUF_STALE); - } -} diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index ca64f33c63a3..e639e8e9a2a9 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -49,6 +49,15 @@ typedef struct xfs_trans_header { #define XFS_LI_DQUOT 0x123d #define XFS_LI_QUOTAOFF 0x123e +#define XFS_LI_TYPE_DESC \ + { XFS_LI_EFI, "XFS_LI_EFI" }, \ + { XFS_LI_EFD, "XFS_LI_EFD" }, \ + { XFS_LI_IUNLINK, "XFS_LI_IUNLINK" }, \ + { XFS_LI_INODE, "XFS_LI_INODE" }, \ + { XFS_LI_BUF, "XFS_LI_BUF" }, \ + { XFS_LI_DQUOT, "XFS_LI_DQUOT" }, \ + { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" } + /* * Transaction types. Used to distinguish types of buffers. */ @@ -97,7 +106,8 @@ typedef struct xfs_trans_header { #define XFS_TRANS_GROWFSRT_FREE 39 #define XFS_TRANS_SWAPEXT 40 #define XFS_TRANS_SB_COUNT 41 -#define XFS_TRANS_TYPE_MAX 41 +#define XFS_TRANS_CHECKPOINT 42 +#define XFS_TRANS_TYPE_MAX 42 /* new transaction types need to be reflected in xfs_logprint(8) */ #define XFS_TRANS_TYPES \ @@ -139,6 +149,7 @@ typedef struct xfs_trans_header { { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \ { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \ { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \ + { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \ { XFS_TRANS_DUMMY1, "DUMMY1" }, \ { XFS_TRANS_DUMMY2, "DUMMY2" }, \ { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" } @@ -159,7 +170,6 @@ typedef struct xfs_log_item_desc { #define XFS_LID_DIRTY 0x1 #define XFS_LID_PINNED 0x2 -#define XFS_LID_BUF_STALE 0x8 /* * This structure is used to maintain a chunk list of log_item_desc @@ -290,24 +300,6 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp) /* - * Various log reservation values. - * These are based on the size of the file system block - * because that is what most transactions manipulate. - * Each adds in an additional 128 bytes per item logged to - * try to account for the overhead of the transaction mechanism. - * - * Note: - * Most of the reservations underestimate the number of allocation - * groups into which they could free extents in the xfs_bmap_finish() - * call. This is because the number in the worst case is quite high - * and quite unusual. In order to fix this we need to change - * xfs_bmap_finish() to free extents in only a single AG at a time. - * This will require changes to the EFI code as well, however, so that - * the EFI for the extents not freed is logged again in each transaction. - * See bug 261917. - */ - -/* * Per-extent log reservation for the allocation btree changes * involved in freeing or allocating an extent. * 2 trees * (2 blocks/level * max depth - 1) * block size @@ -331,429 +323,36 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp) (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \ XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1) -/* - * In a write transaction we can allocate a maximum of 2 - * extents. This gives: - * the inode getting the new extents: inode size - * the inode's bmap btree: max depth * block size - * the agfs of the ags from which the extents are allocated: 2 * sector - * the superblock free block counter: sector size - * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size - * And the bmap_finish transaction can free bmap blocks in a join: - * the agfs of the ags containing the blocks: 2 * sector size - * the agfls of the ags containing the blocks: 2 * sector size - * the super block free block counter: sector size - * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size - */ -#define XFS_CALC_WRITE_LOG_RES(mp) \ - (MAX( \ - ((mp)->m_sb.sb_inodesize + \ - XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \ - (2 * (mp)->m_sb.sb_sectsize) + \ - (mp)->m_sb.sb_sectsize + \ - XFS_ALLOCFREE_LOG_RES(mp, 2) + \ - (128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))),\ - ((2 * (mp)->m_sb.sb_sectsize) + \ - (2 * (mp)->m_sb.sb_sectsize) + \ - (mp)->m_sb.sb_sectsize + \ - XFS_ALLOCFREE_LOG_RES(mp, 2) + \ - (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))))) #define XFS_WRITE_LOG_RES(mp) ((mp)->m_reservations.tr_write) - -/* - * In truncating a file we free up to two extents at once. We can modify: - * the inode being truncated: inode size - * the inode's bmap btree: (max depth + 1) * block size - * And the bmap_finish transaction can free the blocks and bmap blocks: - * the agf for each of the ags: 4 * sector size - * the agfl for each of the ags: 4 * sector size - * the super block to reflect the freed blocks: sector size - * worst case split in allocation btrees per extent assuming 4 extents: - * 4 exts * 2 trees * (2 * max depth - 1) * block size - * the inode btree: max depth * blocksize - * the allocation btrees: 2 trees * (max depth - 1) * block size - */ -#define XFS_CALC_ITRUNCATE_LOG_RES(mp) \ - (MAX( \ - ((mp)->m_sb.sb_inodesize + \ - XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) + \ - (128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \ - ((4 * (mp)->m_sb.sb_sectsize) + \ - (4 * (mp)->m_sb.sb_sectsize) + \ - (mp)->m_sb.sb_sectsize + \ - XFS_ALLOCFREE_LOG_RES(mp, 4) + \ - (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))) + \ - (128 * 5) + \ - XFS_ALLOCFREE_LOG_RES(mp, 1) + \ - (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \ - XFS_ALLOCFREE_LOG_COUNT(mp, 1)))))) - #define XFS_ITRUNCATE_LOG_RES(mp) ((mp)->m_reservations.tr_itruncate) - -/* - * In renaming a files we can modify: - * the four inodes involved: 4 * inode size - * the two directory btrees: 2 * (max depth + v2) * dir block size - * the two directory bmap btrees: 2 * max depth * block size - * And the bmap_finish transaction can free dir and bmap blocks (two sets - * of bmap blocks) giving: - * the agf for the ags in which the blocks live: 3 * sector size - * the agfl for the ags in which the blocks live: 3 * sector size - * the superblock for the free block count: sector size - * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size - */ -#define XFS_CALC_RENAME_LOG_RES(mp) \ - (MAX( \ - ((4 * (mp)->m_sb.sb_inodesize) + \ - (2 * XFS_DIROP_LOG_RES(mp)) + \ - (128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp)))), \ - ((3 * (mp)->m_sb.sb_sectsize) + \ - (3 * (mp)->m_sb.sb_sectsize) + \ - (mp)->m_sb.sb_sectsize + \ - XFS_ALLOCFREE_LOG_RES(mp, 3) + \ - (128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3)))))) - #define XFS_RENAME_LOG_RES(mp) ((mp)->m_reservations.tr_rename) - -/* - * For creating a link to an inode: - * the parent directory inode: inode size - * the linked inode: inode size - * the directory btree could split: (max depth + v2) * dir block size - * the directory bmap btree could join or split: (max depth + v2) * blocksize - * And the bmap_finish transaction can free some bmap blocks giving: - * the agf for the ag in which the blocks live: sector size - * the agfl for the ag in which the blocks live: sector size - * the superblock for the free block count: sector size - * the allocation btrees: 2 trees * (2 * max depth - 1) * block size - */ -#define XFS_CALC_LINK_LOG_RES(mp) \ - (MAX( \ - ((mp)->m_sb.sb_inodesize + \ - (mp)->m_sb.sb_inodesize + \ - XFS_DIROP_LOG_RES(mp) + \ - (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \ - ((mp)->m_sb.sb_sectsize + \ - (mp)->m_sb.sb_sectsize + \ - (mp)->m_sb.sb_sectsize + \ - XFS_ALLOCFREE_LOG_RES(mp, 1) + \ - (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))))) - #define XFS_LINK_LOG_RES(mp) ((mp)->m_reservations.tr_link) - -/* - * For removing a directory entry we can modify: - * the parent directory inode: inode size - * the removed inode: inode size - * the directory btree could join: (max depth + v2) * dir block size - * the directory bmap btree could join or split: (max depth + v2) * blocksize - * And the bmap_finish transaction can free the dir and bmap blocks giving: - * the agf for the ag in which the blocks live: 2 * sector size - * the agfl for the ag in which the blocks live: 2 * sector size - * the superblock for the free block count: sector size - * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size - */ -#define XFS_CALC_REMOVE_LOG_RES(mp) \ - (MAX( \ - ((mp)->m_sb.sb_inodesize + \ - (mp)->m_sb.sb_inodesize + \ - XFS_DIROP_LOG_RES(mp) + \ - (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \ - ((2 * (mp)->m_sb.sb_sectsize) + \ - (2 * (mp)->m_sb.sb_sectsize) + \ - (mp)->m_sb.sb_sectsize + \ - XFS_ALLOCFREE_LOG_RES(mp, 2) + \ - (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))))) - #define XFS_REMOVE_LOG_RES(mp) ((mp)->m_reservations.tr_remove) - -/* - * For symlink we can modify: - * the parent directory inode: inode size - * the new inode: inode size - * the inode btree entry: 1 block - * the directory btree: (max depth + v2) * dir block size - * the directory inode's bmap btree: (max depth + v2) * block size - * the blocks for the symlink: 1 kB - * Or in the first xact we allocate some inodes giving: - * the agi and agf of the ag getting the new inodes: 2 * sectorsize - * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize - * the inode btree: max depth * blocksize - * the allocation btrees: 2 trees * (2 * max depth - 1) * block size - */ -#define XFS_CALC_SYMLINK_LOG_RES(mp) \ - (MAX( \ - ((mp)->m_sb.sb_inodesize + \ - (mp)->m_sb.sb_inodesize + \ - XFS_FSB_TO_B(mp, 1) + \ - XFS_DIROP_LOG_RES(mp) + \ - 1024 + \ - (128 * (4 + XFS_DIROP_LOG_COUNT(mp)))), \ - (2 * (mp)->m_sb.sb_sectsize + \ - XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \ - XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \ - XFS_ALLOCFREE_LOG_RES(mp, 1) + \ - (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \ - XFS_ALLOCFREE_LOG_COUNT(mp, 1)))))) - #define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink) - -/* - * For create we can modify: - * the parent directory inode: inode size - * the new inode: inode size - * the inode btree entry: block size - * the superblock for the nlink flag: sector size - * the directory btree: (max depth + v2) * dir block size - * the directory inode's bmap btree: (max depth + v2) * block size - * Or in the first xact we allocate some inodes giving: - * the agi and agf of the ag getting the new inodes: 2 * sectorsize - * the superblock for the nlink flag: sector size - * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize - * the inode btree: max depth * blocksize - * the allocation btrees: 2 trees * (max depth - 1) * block size - */ -#define XFS_CALC_CREATE_LOG_RES(mp) \ - (MAX( \ - ((mp)->m_sb.sb_inodesize + \ - (mp)->m_sb.sb_inodesize + \ - (mp)->m_sb.sb_sectsize + \ - XFS_FSB_TO_B(mp, 1) + \ - XFS_DIROP_LOG_RES(mp) + \ - (128 * (3 + XFS_DIROP_LOG_COUNT(mp)))), \ - (3 * (mp)->m_sb.sb_sectsize + \ - XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \ - XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \ - XFS_ALLOCFREE_LOG_RES(mp, 1) + \ - (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \ - XFS_ALLOCFREE_LOG_COUNT(mp, 1)))))) - #define XFS_CREATE_LOG_RES(mp) ((mp)->m_reservations.tr_create) - -/* - * Making a new directory is the same as creating a new file. - */ -#define XFS_CALC_MKDIR_LOG_RES(mp) XFS_CALC_CREATE_LOG_RES(mp) - #define XFS_MKDIR_LOG_RES(mp) ((mp)->m_reservations.tr_mkdir) - -/* - * In freeing an inode we can modify: - * the inode being freed: inode size - * the super block free inode counter: sector size - * the agi hash list and counters: sector size - * the inode btree entry: block size - * the on disk inode before ours in the agi hash list: inode cluster size - * the inode btree: max depth * blocksize - * the allocation btrees: 2 trees * (max depth - 1) * block size - */ -#define XFS_CALC_IFREE_LOG_RES(mp) \ - ((mp)->m_sb.sb_inodesize + \ - (mp)->m_sb.sb_sectsize + \ - (mp)->m_sb.sb_sectsize + \ - XFS_FSB_TO_B((mp), 1) + \ - MAX((__uint16_t)XFS_FSB_TO_B((mp), 1), XFS_INODE_CLUSTER_SIZE(mp)) + \ - (128 * 5) + \ - XFS_ALLOCFREE_LOG_RES(mp, 1) + \ - (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \ - XFS_ALLOCFREE_LOG_COUNT(mp, 1)))) - - #define XFS_IFREE_LOG_RES(mp) ((mp)->m_reservations.tr_ifree) - -/* - * When only changing the inode we log the inode and possibly the superblock - * We also add a bit of slop for the transaction stuff. - */ -#define XFS_CALC_ICHANGE_LOG_RES(mp) ((mp)->m_sb.sb_inodesize + \ - (mp)->m_sb.sb_sectsize + 512) - #define XFS_ICHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_ichange) - -/* - * Growing the data section of the filesystem. - * superblock - * agi and agf - * allocation btrees - */ -#define XFS_CALC_GROWDATA_LOG_RES(mp) \ - ((mp)->m_sb.sb_sectsize * 3 + \ - XFS_ALLOCFREE_LOG_RES(mp, 1) + \ - (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))) - #define XFS_GROWDATA_LOG_RES(mp) ((mp)->m_reservations.tr_growdata) - -/* - * Growing the rt section of the filesystem. - * In the first set of transactions (ALLOC) we allocate space to the - * bitmap or summary files. - * superblock: sector size - * agf of the ag from which the extent is allocated: sector size - * bmap btree for bitmap/summary inode: max depth * blocksize - * bitmap/summary inode: inode size - * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize - */ -#define XFS_CALC_GROWRTALLOC_LOG_RES(mp) \ - (2 * (mp)->m_sb.sb_sectsize + \ - XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \ - (mp)->m_sb.sb_inodesize + \ - XFS_ALLOCFREE_LOG_RES(mp, 1) + \ - (128 * \ - (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + \ - XFS_ALLOCFREE_LOG_COUNT(mp, 1)))) - #define XFS_GROWRTALLOC_LOG_RES(mp) ((mp)->m_reservations.tr_growrtalloc) - -/* - * Growing the rt section of the filesystem. - * In the second set of transactions (ZERO) we zero the new metadata blocks. - * one bitmap/summary block: blocksize - */ -#define XFS_CALC_GROWRTZERO_LOG_RES(mp) \ - ((mp)->m_sb.sb_blocksize + 128) - #define XFS_GROWRTZERO_LOG_RES(mp) ((mp)->m_reservations.tr_growrtzero) - -/* - * Growing the rt section of the filesystem. - * In the third set of transactions (FREE) we update metadata without - * allocating any new blocks. - * superblock: sector size - * bitmap inode: inode size - * summary inode: inode size - * one bitmap block: blocksize - * summary blocks: new summary size - */ -#define XFS_CALC_GROWRTFREE_LOG_RES(mp) \ - ((mp)->m_sb.sb_sectsize + \ - 2 * (mp)->m_sb.sb_inodesize + \ - (mp)->m_sb.sb_blocksize + \ - (mp)->m_rsumsize + \ - (128 * 5)) - #define XFS_GROWRTFREE_LOG_RES(mp) ((mp)->m_reservations.tr_growrtfree) - -/* - * Logging the inode modification timestamp on a synchronous write. - * inode - */ -#define XFS_CALC_SWRITE_LOG_RES(mp) \ - ((mp)->m_sb.sb_inodesize + 128) - #define XFS_SWRITE_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) - /* * Logging the inode timestamps on an fsync -- same as SWRITE * as long as SWRITE logs the entire inode core */ #define XFS_FSYNC_TS_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) - -/* - * Logging the inode mode bits when writing a setuid/setgid file - * inode - */ -#define XFS_CALC_WRITEID_LOG_RES(mp) \ - ((mp)->m_sb.sb_inodesize + 128) - #define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) - -/* - * Converting the inode from non-attributed to attributed. - * the inode being converted: inode size - * agf block and superblock (for block allocation) - * the new block (directory sized) - * bmap blocks for the new directory block - * allocation btrees - */ -#define XFS_CALC_ADDAFORK_LOG_RES(mp) \ - ((mp)->m_sb.sb_inodesize + \ - (mp)->m_sb.sb_sectsize * 2 + \ - (mp)->m_dirblksize + \ - XFS_FSB_TO_B(mp, (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1)) + \ - XFS_ALLOCFREE_LOG_RES(mp, 1) + \ - (128 * (4 + (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + \ - XFS_ALLOCFREE_LOG_COUNT(mp, 1)))) - #define XFS_ADDAFORK_LOG_RES(mp) ((mp)->m_reservations.tr_addafork) - -/* - * Removing the attribute fork of a file - * the inode being truncated: inode size - * the inode's bmap btree: max depth * block size - * And the bmap_finish transaction can free the blocks and bmap blocks: - * the agf for each of the ags: 4 * sector size - * the agfl for each of the ags: 4 * sector size - * the super block to reflect the freed blocks: sector size - * worst case split in allocation btrees per extent assuming 4 extents: - * 4 exts * 2 trees * (2 * max depth - 1) * block size - */ -#define XFS_CALC_ATTRINVAL_LOG_RES(mp) \ - (MAX( \ - ((mp)->m_sb.sb_inodesize + \ - XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \ - (128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))), \ - ((4 * (mp)->m_sb.sb_sectsize) + \ - (4 * (mp)->m_sb.sb_sectsize) + \ - (mp)->m_sb.sb_sectsize + \ - XFS_ALLOCFREE_LOG_RES(mp, 4) + \ - (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)))))) - #define XFS_ATTRINVAL_LOG_RES(mp) ((mp)->m_reservations.tr_attrinval) - -/* - * Setting an attribute. - * the inode getting the attribute - * the superblock for allocations - * the agfs extents are allocated from - * the attribute btree * max depth - * the inode allocation btree - * Since attribute transaction space is dependent on the size of the attribute, - * the calculation is done partially at mount time and partially at runtime. - */ -#define XFS_CALC_ATTRSET_LOG_RES(mp) \ - ((mp)->m_sb.sb_inodesize + \ - (mp)->m_sb.sb_sectsize + \ - XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \ - (128 * (2 + XFS_DA_NODE_MAXDEPTH))) - #define XFS_ATTRSET_LOG_RES(mp, ext) \ ((mp)->m_reservations.tr_attrset + \ (ext * (mp)->m_sb.sb_sectsize) + \ (ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \ (128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))))) - -/* - * Removing an attribute. - * the inode: inode size - * the attribute btree could join: max depth * block size - * the inode bmap btree could join or split: max depth * block size - * And the bmap_finish transaction can free the attr blocks freed giving: - * the agf for the ag in which the blocks live: 2 * sector size - * the agfl for the ag in which the blocks live: 2 * sector size - * the superblock for the free block count: sector size - * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size - */ -#define XFS_CALC_ATTRRM_LOG_RES(mp) \ - (MAX( \ - ((mp)->m_sb.sb_inodesize + \ - XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \ - XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \ - (128 * (1 + XFS_DA_NODE_MAXDEPTH + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \ - ((2 * (mp)->m_sb.sb_sectsize) + \ - (2 * (mp)->m_sb.sb_sectsize) + \ - (mp)->m_sb.sb_sectsize + \ - XFS_ALLOCFREE_LOG_RES(mp, 2) + \ - (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))))) - #define XFS_ATTRRM_LOG_RES(mp) ((mp)->m_reservations.tr_attrrm) - -/* - * Clearing a bad agino number in an agi hash bucket. - */ -#define XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp) \ - ((mp)->m_sb.sb_sectsize + 128) - #define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp) ((mp)->m_reservations.tr_clearagi) @@ -805,6 +404,7 @@ struct xfs_log_item_desc; struct xfs_mount; struct xfs_trans; struct xfs_dquot_acct; +struct xfs_busy_extent; typedef struct xfs_log_item { struct list_head li_ail; /* AIL pointers */ @@ -820,6 +420,11 @@ typedef struct xfs_log_item { /* buffer item iodone */ /* callback func */ struct xfs_item_ops *li_ops; /* function list */ + + /* delayed logging */ + struct list_head li_cil; /* CIL pointers */ + struct xfs_log_vec *li_lv; /* active log vector */ + xfs_lsn_t li_seq; /* CIL commit seq */ } xfs_log_item_t; #define XFS_LI_IN_AIL 0x1 @@ -833,7 +438,7 @@ typedef struct xfs_item_ops { uint (*iop_size)(xfs_log_item_t *); void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); void (*iop_pin)(xfs_log_item_t *); - void (*iop_unpin)(xfs_log_item_t *, int); + void (*iop_unpin)(xfs_log_item_t *); void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *); uint (*iop_trylock)(xfs_log_item_t *); void (*iop_unlock)(xfs_log_item_t *); @@ -846,7 +451,7 @@ typedef struct xfs_item_ops { #define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip) #define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp) #define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip) -#define IOP_UNPIN(ip, flags) (*(ip)->li_ops->iop_unpin)(ip, flags) +#define IOP_UNPIN(ip) (*(ip)->li_ops->iop_unpin)(ip) #define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp) #define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip) #define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip) @@ -861,36 +466,7 @@ typedef struct xfs_item_ops { #define XFS_ITEM_SUCCESS 0 #define XFS_ITEM_PINNED 1 #define XFS_ITEM_LOCKED 2 -#define XFS_ITEM_FLUSHING 3 -#define XFS_ITEM_PUSHBUF 4 - -/* - * This structure is used to maintain a list of block ranges that have been - * freed in the transaction. The ranges are listed in the perag[] busy list - * between when they're freed and the transaction is committed to disk. - */ - -typedef struct xfs_log_busy_slot { - xfs_agnumber_t lbc_ag; - ushort lbc_idx; /* index in perag.busy[] */ -} xfs_log_busy_slot_t; - -#define XFS_LBC_NUM_SLOTS 31 -typedef struct xfs_log_busy_chunk { - struct xfs_log_busy_chunk *lbc_next; - uint lbc_free; /* free slots bitmask */ - ushort lbc_unused; /* first unused */ - xfs_log_busy_slot_t lbc_busy[XFS_LBC_NUM_SLOTS]; -} xfs_log_busy_chunk_t; - -#define XFS_LBC_MAX_SLOT (XFS_LBC_NUM_SLOTS - 1) -#define XFS_LBC_FREEMASK ((1U << XFS_LBC_NUM_SLOTS) - 1) - -#define XFS_LBC_INIT(cp) ((cp)->lbc_free = XFS_LBC_FREEMASK) -#define XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot))) -#define XFS_LBC_SLOT(cp, slot) (&((cp)->lbc_busy[(slot)])) -#define XFS_LBC_VACANCY(cp) (((cp)->lbc_free) & XFS_LBC_FREEMASK) -#define XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot))) +#define XFS_ITEM_PUSHBUF 3 /* * This is the type of function which can be given to xfs_trans_callback() @@ -911,7 +487,7 @@ typedef struct xfs_trans { unsigned int t_blk_res_used; /* # of resvd blocks used */ unsigned int t_rtx_res; /* # of rt extents resvd */ unsigned int t_rtx_res_used; /* # of resvd rt extents used */ - xfs_log_ticket_t t_ticket; /* log mgr ticket */ + struct xlog_ticket *t_ticket; /* log mgr ticket */ xfs_lsn_t t_lsn; /* log seq num of start of * transaction. */ xfs_lsn_t t_commit_lsn; /* log seq num of end of @@ -943,8 +519,7 @@ typedef struct xfs_trans { unsigned int t_items_free; /* log item descs free */ xfs_log_item_chunk_t t_items; /* first log item desc chunk */ xfs_trans_header_t t_header; /* header for in-log trans */ - unsigned int t_busy_free; /* busy descs free */ - xfs_log_busy_chunk_t t_busy; /* busy/async free blocks */ + struct list_head t_busy; /* list of busy extents */ unsigned long t_pflags; /* saved process flags state */ } xfs_trans_t; @@ -1018,9 +593,6 @@ int _xfs_trans_commit(xfs_trans_t *, void xfs_trans_cancel(xfs_trans_t *, int); int xfs_trans_ail_init(struct xfs_mount *); void xfs_trans_ail_destroy(struct xfs_mount *); -xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp, - xfs_agnumber_t ag, - xfs_extlen_t idx); extern kmem_zone_t *xfs_trans_zone; diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 2ffc570679be..e799824f7245 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -237,14 +237,15 @@ out: } /* - * Function that does the work of pushing on the AIL + * xfsaild_push does the work of pushing on the AIL. Returning a timeout of + * zero indicates that the caller should sleep until woken. */ long xfsaild_push( struct xfs_ail *ailp, xfs_lsn_t *last_lsn) { - long tout = 1000; /* milliseconds */ + long tout = 0; xfs_lsn_t last_pushed_lsn = *last_lsn; xfs_lsn_t target = ailp->xa_target; xfs_lsn_t lsn; @@ -252,6 +253,7 @@ xfsaild_push( int flush_log, count, stuck; xfs_mount_t *mp = ailp->xa_mount; struct xfs_ail_cursor *cur = &ailp->xa_cursors; + int push_xfsbufd = 0; spin_lock(&ailp->xa_lock); xfs_trans_ail_cursor_init(ailp, cur); @@ -262,7 +264,7 @@ xfsaild_push( */ xfs_trans_ail_cursor_done(ailp, cur); spin_unlock(&ailp->xa_lock); - last_pushed_lsn = 0; + *last_lsn = 0; return tout; } @@ -279,7 +281,6 @@ xfsaild_push( * prevents use from spinning when we can't do anything or there is * lots of contention on the AIL lists. */ - tout = 10; lsn = lip->li_lsn; flush_log = stuck = count = 0; while ((XFS_LSN_CMP(lip->li_lsn, target) < 0)) { @@ -308,6 +309,7 @@ xfsaild_push( XFS_STATS_INC(xs_push_ail_pushbuf); IOP_PUSHBUF(lip); last_pushed_lsn = lsn; + push_xfsbufd = 1; break; case XFS_ITEM_PINNED: @@ -322,12 +324,6 @@ xfsaild_push( stuck++; break; - case XFS_ITEM_FLUSHING: - XFS_STATS_INC(xs_push_ail_flushing); - last_pushed_lsn = lsn; - stuck++; - break; - default: ASSERT(0); break; @@ -371,19 +367,24 @@ xfsaild_push( * move forward in the AIL. */ XFS_STATS_INC(xs_push_ail_flush); - xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); + xfs_log_force(mp, 0); + } + + if (push_xfsbufd) { + /* we've got delayed write buffers to flush */ + wake_up_process(mp->m_ddev_targp->bt_task); } if (!count) { /* We're past our target or empty, so idle */ - tout = 1000; + last_pushed_lsn = 0; } else if (XFS_LSN_CMP(lsn, target) >= 0) { /* * We reached the target so wait a bit longer for I/O to * complete and remove pushed items from the AIL before we * start the next scan from the start of the AIL. */ - tout += 20; + tout = 50; last_pushed_lsn = 0; } else if ((stuck * 100) / count > 90) { /* @@ -395,11 +396,14 @@ xfsaild_push( * Backoff a bit more to allow some I/O to complete before * continuing from where we were. */ - tout += 10; + tout = 20; + } else { + /* more to do, but wait a short while before continuing */ + tout = 10; } *last_lsn = last_pushed_lsn; return tout; -} /* xfsaild_push */ +} /* diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 49130628d5ef..63d81a22f4fd 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -40,12 +40,111 @@ #include "xfs_rw.h" #include "xfs_trace.h" +/* + * Check to see if a buffer matching the given parameters is already + * a part of the given transaction. + */ +STATIC struct xfs_buf * +xfs_trans_buf_item_match( + struct xfs_trans *tp, + struct xfs_buftarg *target, + xfs_daddr_t blkno, + int len) +{ + xfs_log_item_chunk_t *licp; + xfs_log_item_desc_t *lidp; + xfs_buf_log_item_t *blip; + int i; + + len = BBTOB(len); + for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) { + if (xfs_lic_are_all_free(licp)) { + ASSERT(licp == &tp->t_items); + ASSERT(licp->lic_next == NULL); + return NULL; + } + + for (i = 0; i < licp->lic_unused; i++) { + /* + * Skip unoccupied slots. + */ + if (xfs_lic_isfree(licp, i)) + continue; + + lidp = xfs_lic_slot(licp, i); + blip = (xfs_buf_log_item_t *)lidp->lid_item; + if (blip->bli_item.li_type != XFS_LI_BUF) + continue; -STATIC xfs_buf_t *xfs_trans_buf_item_match(xfs_trans_t *, xfs_buftarg_t *, - xfs_daddr_t, int); -STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *, - xfs_daddr_t, int); + if (XFS_BUF_TARGET(blip->bli_buf) == target && + XFS_BUF_ADDR(blip->bli_buf) == blkno && + XFS_BUF_COUNT(blip->bli_buf) == len) + return blip->bli_buf; + } + } + return NULL; +} + +/* + * Add the locked buffer to the transaction. + * + * The buffer must be locked, and it cannot be associated with any + * transaction. + * + * If the buffer does not yet have a buf log item associated with it, + * then allocate one for it. Then add the buf item to the transaction. + */ +STATIC void +_xfs_trans_bjoin( + struct xfs_trans *tp, + struct xfs_buf *bp, + int reset_recur) +{ + struct xfs_buf_log_item *bip; + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL); + + /* + * The xfs_buf_log_item pointer is stored in b_fsprivate. If + * it doesn't have one yet, then allocate one and initialize it. + * The checks to see if one is there are in xfs_buf_item_init(). + */ + xfs_buf_item_init(bp, tp->t_mountp); + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); + ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); + if (reset_recur) + bip->bli_recur = 0; + + /* + * Take a reference for this transaction on the buf item. + */ + atomic_inc(&bip->bli_refcount); + + /* + * Get a log_item_desc to point at the new item. + */ + (void) xfs_trans_add_item(tp, (xfs_log_item_t *)bip); + + /* + * Initialize b_fsprivate2 so we can find it with incore_match() + * in xfs_trans_get_buf() and friends above. + */ + XFS_BUF_SET_FSPRIVATE2(bp, tp); + +} + +void +xfs_trans_bjoin( + struct xfs_trans *tp, + struct xfs_buf *bp) +{ + _xfs_trans_bjoin(tp, bp, 0); + trace_xfs_trans_bjoin(bp->b_fspriv); +} /* * Get and lock the buffer for the caller if it is not already @@ -53,14 +152,6 @@ STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *, * within the transaction, just increment its lock recursion count * and return a pointer to it. * - * Use the fast path function xfs_trans_buf_item_match() or the buffer - * cache routine incore_match() to find the buffer - * if it is already owned by this transaction. - * - * If we don't already own the buffer, use get_buf() to get it. - * If it doesn't yet have an associated xfs_buf_log_item structure, - * then allocate one and add the item to this transaction. - * * If the transaction pointer is NULL, make this just a normal * get_buf() call. */ @@ -75,13 +166,14 @@ xfs_trans_get_buf(xfs_trans_t *tp, xfs_buf_log_item_t *bip; if (flags == 0) - flags = XFS_BUF_LOCK | XFS_BUF_MAPPED; + flags = XBF_LOCK | XBF_MAPPED; /* * Default to a normal get_buf() call if the tp is NULL. */ if (tp == NULL) - return xfs_buf_get(target_dev, blkno, len, flags | BUF_BUSY); + return xfs_buf_get(target_dev, blkno, len, + flags | XBF_DONT_BLOCK); /* * If we find the buffer in the cache with this transaction @@ -89,11 +181,7 @@ xfs_trans_get_buf(xfs_trans_t *tp, * have it locked. In this case we just increment the lock * recursion count and return the buffer to the caller. */ - if (tp->t_items.lic_next == NULL) { - bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len); - } else { - bp = xfs_trans_buf_item_match_all(tp, target_dev, blkno, len); - } + bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len); if (bp != NULL) { ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) @@ -117,54 +205,22 @@ xfs_trans_get_buf(xfs_trans_t *tp, } /* - * We always specify the BUF_BUSY flag within a transaction so - * that get_buf does not try to push out a delayed write buffer + * We always specify the XBF_DONT_BLOCK flag within a transaction + * so that get_buf does not try to push out a delayed write buffer * which might cause another transaction to take place (if the * buffer was delayed alloc). Such recursive transactions can * easily deadlock with our current transaction as well as cause * us to run out of stack space. */ - bp = xfs_buf_get(target_dev, blkno, len, flags | BUF_BUSY); + bp = xfs_buf_get(target_dev, blkno, len, flags | XBF_DONT_BLOCK); if (bp == NULL) { return NULL; } ASSERT(!XFS_BUF_GETERROR(bp)); - /* - * The xfs_buf_log_item pointer is stored in b_fsprivate. If - * it doesn't have one yet, then allocate one and initialize it. - * The checks to see if one is there are in xfs_buf_item_init(). - */ - xfs_buf_item_init(bp, tp->t_mountp); - - /* - * Set the recursion count for the buffer within this transaction - * to 0. - */ - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); - ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); - bip->bli_recur = 0; - - /* - * Take a reference for this transaction on the buf item. - */ - atomic_inc(&bip->bli_refcount); - - /* - * Get a log_item_desc to point at the new item. - */ - (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip); - - /* - * Initialize b_fsprivate2 so we can find it with incore_match() - * above. - */ - XFS_BUF_SET_FSPRIVATE2(bp, tp); - - trace_xfs_trans_get_buf(bip); + _xfs_trans_bjoin(tp, bp, 1); + trace_xfs_trans_get_buf(bp->b_fspriv); return (bp); } @@ -209,44 +265,11 @@ xfs_trans_getsb(xfs_trans_t *tp, } bp = xfs_getsb(mp, flags); - if (bp == NULL) { + if (bp == NULL) return NULL; - } - - /* - * The xfs_buf_log_item pointer is stored in b_fsprivate. If - * it doesn't have one yet, then allocate one and initialize it. - * The checks to see if one is there are in xfs_buf_item_init(). - */ - xfs_buf_item_init(bp, mp); - - /* - * Set the recursion count for the buffer within this transaction - * to 0. - */ - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); - ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); - bip->bli_recur = 0; - - /* - * Take a reference for this transaction on the buf item. - */ - atomic_inc(&bip->bli_refcount); - /* - * Get a log_item_desc to point at the new item. - */ - (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip); - - /* - * Initialize b_fsprivate2 so we can find it with incore_match() - * above. - */ - XFS_BUF_SET_FSPRIVATE2(bp, tp); - - trace_xfs_trans_getsb(bip); + _xfs_trans_bjoin(tp, bp, 1); + trace_xfs_trans_getsb(bp->b_fspriv); return (bp); } @@ -264,14 +287,6 @@ int xfs_error_mod = 33; * within the transaction and already read in, just increment its * lock recursion count and return a pointer to it. * - * Use the fast path function xfs_trans_buf_item_match() or the buffer - * cache routine incore_match() to find the buffer - * if it is already owned by this transaction. - * - * If we don't already own the buffer, use read_buf() to get it. - * If it doesn't yet have an associated xfs_buf_log_item structure, - * then allocate one and add the item to this transaction. - * * If the transaction pointer is NULL, make this just a normal * read_buf() call. */ @@ -290,15 +305,15 @@ xfs_trans_read_buf( int error; if (flags == 0) - flags = XFS_BUF_LOCK | XFS_BUF_MAPPED; + flags = XBF_LOCK | XBF_MAPPED; /* * Default to a normal get_buf() call if the tp is NULL. */ if (tp == NULL) { - bp = xfs_buf_read(target, blkno, len, flags | BUF_BUSY); + bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK); if (!bp) - return (flags & XFS_BUF_TRYLOCK) ? + return (flags & XBF_TRYLOCK) ? EAGAIN : XFS_ERROR(ENOMEM); if (XFS_BUF_GETERROR(bp) != 0) { @@ -333,11 +348,7 @@ xfs_trans_read_buf( * If the buffer is not yet read in, then we read it in, increment * the lock recursion count, and return it to the caller. */ - if (tp->t_items.lic_next == NULL) { - bp = xfs_trans_buf_item_match(tp, target, blkno, len); - } else { - bp = xfs_trans_buf_item_match_all(tp, target, blkno, len); - } + bp = xfs_trans_buf_item_match(tp, target, blkno, len); if (bp != NULL) { ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); @@ -385,14 +396,14 @@ xfs_trans_read_buf( } /* - * We always specify the BUF_BUSY flag within a transaction so - * that get_buf does not try to push out a delayed write buffer + * We always specify the XBF_DONT_BLOCK flag within a transaction + * so that get_buf does not try to push out a delayed write buffer * which might cause another transaction to take place (if the * buffer was delayed alloc). Such recursive transactions can * easily deadlock with our current transaction as well as cause * us to run out of stack space. */ - bp = xfs_buf_read(target, blkno, len, flags | BUF_BUSY); + bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK); if (bp == NULL) { *bpp = NULL; return 0; @@ -424,40 +435,9 @@ xfs_trans_read_buf( if (XFS_FORCED_SHUTDOWN(mp)) goto shutdown_abort; - /* - * The xfs_buf_log_item pointer is stored in b_fsprivate. If - * it doesn't have one yet, then allocate one and initialize it. - * The checks to see if one is there are in xfs_buf_item_init(). - */ - xfs_buf_item_init(bp, tp->t_mountp); - - /* - * Set the recursion count for the buffer within this transaction - * to 0. - */ - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); - ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); - bip->bli_recur = 0; - - /* - * Take a reference for this transaction on the buf item. - */ - atomic_inc(&bip->bli_refcount); + _xfs_trans_bjoin(tp, bp, 1); + trace_xfs_trans_read_buf(bp->b_fspriv); - /* - * Get a log_item_desc to point at the new item. - */ - (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip); - - /* - * Initialize b_fsprivate2 so we can find it with incore_match() - * above. - */ - XFS_BUF_SET_FSPRIVATE2(bp, tp); - - trace_xfs_trans_read_buf(bip); *bpp = bp; return 0; @@ -472,8 +452,8 @@ shutdown_abort: if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp)) cmn_err(CE_NOTE, "about to pop assert, bp == 0x%p", bp); #endif - ASSERT((XFS_BUF_BFLAGS(bp) & (XFS_B_STALE|XFS_B_DELWRI)) != - (XFS_B_STALE|XFS_B_DELWRI)); + ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) != + (XBF_STALE|XBF_DELWRI)); trace_xfs_trans_read_buf_shut(bp, _RET_IP_); xfs_buf_relse(bp); @@ -531,7 +511,7 @@ xfs_trans_brelse(xfs_trans_t *tp, bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(bip->bli_item.li_type == XFS_LI_BUF); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); /* @@ -622,53 +602,6 @@ xfs_trans_brelse(xfs_trans_t *tp, } /* - * Add the locked buffer to the transaction. - * The buffer must be locked, and it cannot be associated with any - * transaction. - * - * If the buffer does not yet have a buf log item associated with it, - * then allocate one for it. Then add the buf item to the transaction. - */ -void -xfs_trans_bjoin(xfs_trans_t *tp, - xfs_buf_t *bp) -{ - xfs_buf_log_item_t *bip; - - ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL); - - /* - * The xfs_buf_log_item pointer is stored in b_fsprivate. If - * it doesn't have one yet, then allocate one and initialize it. - * The checks to see if one is there are in xfs_buf_item_init(). - */ - xfs_buf_item_init(bp, tp->t_mountp); - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); - ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); - - /* - * Take a reference for this transaction on the buf item. - */ - atomic_inc(&bip->bli_refcount); - - /* - * Get a log_item_desc to point at the new item. - */ - (void) xfs_trans_add_item(tp, (xfs_log_item_t *)bip); - - /* - * Initialize b_fsprivate2 so we can find it with incore_match() - * in xfs_trans_get_buf() and friends above. - */ - XFS_BUF_SET_FSPRIVATE2(bp, tp); - - trace_xfs_trans_bjoin(bip); -} - -/* * Mark the buffer as not needing to be unlocked when the buf item's * IOP_UNLOCK() routine is called. The buffer must already be locked * and associated with the given transaction. @@ -686,7 +619,7 @@ xfs_trans_bhold(xfs_trans_t *tp, bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_HOLD; trace_xfs_trans_bhold(bip); @@ -708,7 +641,7 @@ xfs_trans_bhold_release(xfs_trans_t *tp, bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT(bip->bli_flags & XFS_BLI_HOLD); bip->bli_flags &= ~XFS_BLI_HOLD; @@ -771,7 +704,7 @@ xfs_trans_log_buf(xfs_trans_t *tp, bip->bli_flags &= ~XFS_BLI_STALE; ASSERT(XFS_BUF_ISSTALE(bp)); XFS_BUF_UNSTALE(bp); - bip->bli_format.blf_flags &= ~XFS_BLI_CANCEL; + bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL; } lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip); @@ -779,7 +712,6 @@ xfs_trans_log_buf(xfs_trans_t *tp, tp->t_flags |= XFS_TRANS_DIRTY; lidp->lid_flags |= XFS_LID_DIRTY; - lidp->lid_flags &= ~XFS_LID_BUF_STALE; bip->bli_flags |= XFS_BLI_LOGGED; xfs_buf_item_log(bip, first, last); } @@ -830,8 +762,8 @@ xfs_trans_binval( ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); ASSERT(XFS_BUF_ISSTALE(bp)); ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_INODE_BUF)); - ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF)); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); ASSERT(lidp->lid_flags & XFS_LID_DIRTY); ASSERT(tp->t_flags & XFS_TRANS_DIRTY); return; @@ -842,7 +774,7 @@ xfs_trans_binval( * in the buf log item. The STALE flag will be used in * xfs_buf_item_unpin() to determine if it should clean up * when the last reference to the buf item is given up. - * We set the XFS_BLI_CANCEL flag in the buf log format structure + * We set the XFS_BLF_CANCEL flag in the buf log format structure * and log the buf item. This will be used at recovery time * to determine that copies of the buffer in the log before * this should not be replayed. @@ -860,26 +792,26 @@ xfs_trans_binval( XFS_BUF_UNDELAYWRITE(bp); XFS_BUF_STALE(bp); bip->bli_flags |= XFS_BLI_STALE; - bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_DIRTY); - bip->bli_format.blf_flags &= ~XFS_BLI_INODE_BUF; - bip->bli_format.blf_flags |= XFS_BLI_CANCEL; + bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); + bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; + bip->bli_format.blf_flags |= XFS_BLF_CANCEL; memset((char *)(bip->bli_format.blf_data_map), 0, (bip->bli_format.blf_map_size * sizeof(uint))); - lidp->lid_flags |= XFS_LID_DIRTY|XFS_LID_BUF_STALE; + lidp->lid_flags |= XFS_LID_DIRTY; tp->t_flags |= XFS_TRANS_DIRTY; } /* - * This call is used to indicate that the buffer contains on-disk - * inodes which must be handled specially during recovery. They - * require special handling because only the di_next_unlinked from - * the inodes in the buffer should be recovered. The rest of the - * data in the buffer is logged via the inodes themselves. + * This call is used to indicate that the buffer contains on-disk inodes which + * must be handled specially during recovery. They require special handling + * because only the di_next_unlinked from the inodes in the buffer should be + * recovered. The rest of the data in the buffer is logged via the inodes + * themselves. * - * All we do is set the XFS_BLI_INODE_BUF flag in the buffer's log - * format structure so that we'll know what to do at recovery time. + * All we do is set the XFS_BLI_INODE_BUF flag in the items flags so it can be + * transferred to the buffer's log format structure so that we'll know what to + * do at recovery time. */ -/* ARGSUSED */ void xfs_trans_inode_buf( xfs_trans_t *tp, @@ -894,7 +826,7 @@ xfs_trans_inode_buf( bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(atomic_read(&bip->bli_refcount) > 0); - bip->bli_format.blf_flags |= XFS_BLI_INODE_BUF; + bip->bli_flags |= XFS_BLI_INODE_BUF; } /* @@ -976,120 +908,12 @@ xfs_trans_dquot_buf( ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); - ASSERT(type == XFS_BLI_UDQUOT_BUF || - type == XFS_BLI_PDQUOT_BUF || - type == XFS_BLI_GDQUOT_BUF); + ASSERT(type == XFS_BLF_UDQUOT_BUF || + type == XFS_BLF_PDQUOT_BUF || + type == XFS_BLF_GDQUOT_BUF); bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_format.blf_flags |= type; } - -/* - * Check to see if a buffer matching the given parameters is already - * a part of the given transaction. Only check the first, embedded - * chunk, since we don't want to spend all day scanning large transactions. - */ -STATIC xfs_buf_t * -xfs_trans_buf_item_match( - xfs_trans_t *tp, - xfs_buftarg_t *target, - xfs_daddr_t blkno, - int len) -{ - xfs_log_item_chunk_t *licp; - xfs_log_item_desc_t *lidp; - xfs_buf_log_item_t *blip; - xfs_buf_t *bp; - int i; - - bp = NULL; - len = BBTOB(len); - licp = &tp->t_items; - if (!xfs_lic_are_all_free(licp)) { - for (i = 0; i < licp->lic_unused; i++) { - /* - * Skip unoccupied slots. - */ - if (xfs_lic_isfree(licp, i)) { - continue; - } - - lidp = xfs_lic_slot(licp, i); - blip = (xfs_buf_log_item_t *)lidp->lid_item; - if (blip->bli_item.li_type != XFS_LI_BUF) { - continue; - } - - bp = blip->bli_buf; - if ((XFS_BUF_TARGET(bp) == target) && - (XFS_BUF_ADDR(bp) == blkno) && - (XFS_BUF_COUNT(bp) == len)) { - /* - * We found it. Break out and - * return the pointer to the buffer. - */ - break; - } else { - bp = NULL; - } - } - } - return bp; -} - -/* - * Check to see if a buffer matching the given parameters is already - * a part of the given transaction. Check all the chunks, we - * want to be thorough. - */ -STATIC xfs_buf_t * -xfs_trans_buf_item_match_all( - xfs_trans_t *tp, - xfs_buftarg_t *target, - xfs_daddr_t blkno, - int len) -{ - xfs_log_item_chunk_t *licp; - xfs_log_item_desc_t *lidp; - xfs_buf_log_item_t *blip; - xfs_buf_t *bp; - int i; - - bp = NULL; - len = BBTOB(len); - for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) { - if (xfs_lic_are_all_free(licp)) { - ASSERT(licp == &tp->t_items); - ASSERT(licp->lic_next == NULL); - return NULL; - } - for (i = 0; i < licp->lic_unused; i++) { - /* - * Skip unoccupied slots. - */ - if (xfs_lic_isfree(licp, i)) { - continue; - } - - lidp = xfs_lic_slot(licp, i); - blip = (xfs_buf_log_item_t *)lidp->lid_item; - if (blip->bli_item.li_type != XFS_LI_BUF) { - continue; - } - - bp = blip->bli_buf; - if ((XFS_BUF_TARGET(bp) == target) && - (XFS_BUF_ADDR(bp) == blkno) && - (XFS_BUF_COUNT(bp) == len)) { - /* - * We found it. Break out and - * return the pointer to the buffer. - */ - return bp; - } - } - } - return NULL; -} diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index 785ff101da0a..2559dfec946b 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -62,7 +62,7 @@ xfs_trans_iget( { int error; - error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp, 0); + error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp); if (!error && tp) xfs_trans_ijoin(tp, *ipp, lock_flags); return error; diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c index eb3fc57f9eef..f11d37d06dcc 100644 --- a/fs/xfs/xfs_trans_item.c +++ b/fs/xfs/xfs_trans_item.c @@ -299,6 +299,7 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp) void xfs_trans_free_items( xfs_trans_t *tp, + xfs_lsn_t commit_lsn, int flags) { xfs_log_item_chunk_t *licp; @@ -311,7 +312,7 @@ xfs_trans_free_items( * Special case the embedded chunk so we don't free it below. */ if (!xfs_lic_are_all_free(licp)) { - (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); + (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn); xfs_lic_all_free(licp); licp->lic_unused = 0; } @@ -322,7 +323,7 @@ xfs_trans_free_items( */ while (licp != NULL) { ASSERT(!xfs_lic_are_all_free(licp)); - (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); + (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn); next_licp = licp->lic_next; kmem_free(licp); licp = next_licp; @@ -438,112 +439,3 @@ xfs_trans_unlock_chunk( return freed; } - - -/* - * This is called to add the given busy item to the transaction's - * list of busy items. It must find a free busy item descriptor - * or allocate a new one and add the item to that descriptor. - * The function returns a pointer to busy descriptor used to point - * to the new busy entry. The log busy entry will now point to its new - * descriptor with its ???? field. - */ -xfs_log_busy_slot_t * -xfs_trans_add_busy(xfs_trans_t *tp, xfs_agnumber_t ag, xfs_extlen_t idx) -{ - xfs_log_busy_chunk_t *lbcp; - xfs_log_busy_slot_t *lbsp; - int i=0; - - /* - * If there are no free descriptors, allocate a new chunk - * of them and put it at the front of the chunk list. - */ - if (tp->t_busy_free == 0) { - lbcp = (xfs_log_busy_chunk_t*) - kmem_alloc(sizeof(xfs_log_busy_chunk_t), KM_SLEEP); - ASSERT(lbcp != NULL); - /* - * Initialize the chunk, and then - * claim the first slot in the newly allocated chunk. - */ - XFS_LBC_INIT(lbcp); - XFS_LBC_CLAIM(lbcp, 0); - lbcp->lbc_unused = 1; - lbsp = XFS_LBC_SLOT(lbcp, 0); - - /* - * Link in the new chunk and update the free count. - */ - lbcp->lbc_next = tp->t_busy.lbc_next; - tp->t_busy.lbc_next = lbcp; - tp->t_busy_free = XFS_LIC_NUM_SLOTS - 1; - - /* - * Initialize the descriptor and the generic portion - * of the log item. - * - * Point the new slot at this item and return it. - * Also point the log item at its currently active - * descriptor and set the item's mount pointer. - */ - lbsp->lbc_ag = ag; - lbsp->lbc_idx = idx; - return lbsp; - } - - /* - * Find the free descriptor. It is somewhere in the chunklist - * of descriptors. - */ - lbcp = &tp->t_busy; - while (lbcp != NULL) { - if (XFS_LBC_VACANCY(lbcp)) { - if (lbcp->lbc_unused <= XFS_LBC_MAX_SLOT) { - i = lbcp->lbc_unused; - break; - } else { - /* out-of-order vacancy */ - cmn_err(CE_DEBUG, "OOO vacancy lbcp 0x%p\n", lbcp); - ASSERT(0); - } - } - lbcp = lbcp->lbc_next; - } - ASSERT(lbcp != NULL); - /* - * If we find a free descriptor, claim it, - * initialize it, and return it. - */ - XFS_LBC_CLAIM(lbcp, i); - if (lbcp->lbc_unused <= i) { - lbcp->lbc_unused = i + 1; - } - lbsp = XFS_LBC_SLOT(lbcp, i); - tp->t_busy_free--; - lbsp->lbc_ag = ag; - lbsp->lbc_idx = idx; - return lbsp; -} - - -/* - * xfs_trans_free_busy - * Free all of the busy lists from a transaction - */ -void -xfs_trans_free_busy(xfs_trans_t *tp) -{ - xfs_log_busy_chunk_t *lbcp; - xfs_log_busy_chunk_t *lbcq; - - lbcp = tp->t_busy.lbc_next; - while (lbcp != NULL) { - lbcq = lbcp->lbc_next; - kmem_free(lbcp); - lbcp = lbcq; - } - - XFS_LBC_INIT(&tp->t_busy); - tp->t_busy.lbc_unused = 0; -} diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 73e2ad397432..c6e4f2c8de6e 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -35,13 +35,14 @@ struct xfs_log_item_desc *xfs_trans_find_item(struct xfs_trans *, struct xfs_log_item_desc *xfs_trans_first_item(struct xfs_trans *); struct xfs_log_item_desc *xfs_trans_next_item(struct xfs_trans *, struct xfs_log_item_desc *); -void xfs_trans_free_items(struct xfs_trans *, int); -void xfs_trans_unlock_items(struct xfs_trans *, - xfs_lsn_t); -void xfs_trans_free_busy(xfs_trans_t *tp); -xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp, - xfs_agnumber_t ag, - xfs_extlen_t idx); + +void xfs_trans_unlock_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn); +void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, + int flags); + +void xfs_trans_item_committed(struct xfs_log_item *lip, + xfs_lsn_t commit_lsn, int aborted); +void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); /* * AIL traversal cursor. diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h index d725428c9df6..320775295e32 100644 --- a/fs/xfs/xfs_types.h +++ b/fs/xfs/xfs_types.h @@ -75,6 +75,8 @@ typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */ typedef __uint16_t xfs_prid_t; /* prid_t truncated to 16bits in XFS */ +typedef __uint32_t xlog_tid_t; /* transaction ID type */ + /* * These types are 64 bits on disk but are either 32 or 64 bits in memory. * Disk based types: @@ -151,8 +153,8 @@ typedef enum { } xfs_btnum_t; struct xfs_name { - const char *name; - int len; + const unsigned char *name; + int len; }; #endif /* __XFS_TYPES_H__ */ diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 6f268756bf36..c1646838898f 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -256,7 +256,7 @@ xfs_setattr( iattr->ia_size > ip->i_d.di_size) { code = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, - XFS_B_ASYNC, FI_NONE); + XBF_ASYNC, FI_NONE); } /* wait for all I/O to complete */ @@ -267,7 +267,7 @@ xfs_setattr( if (code) { ASSERT(tp == NULL); lock_flags &= ~XFS_ILOCK_EXCL; - ASSERT(lock_flags == XFS_IOLOCK_EXCL); + ASSERT(lock_flags == XFS_IOLOCK_EXCL || !need_iolock); goto error_return; } tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); @@ -584,116 +584,6 @@ xfs_readlink( } /* - * xfs_fsync - * - * This is called to sync the inode and its data out to disk. We need to hold - * the I/O lock while flushing the data, and the inode lock while flushing the - * inode. The inode lock CANNOT be held while flushing the data, so acquire - * after we're done with that. - */ -int -xfs_fsync( - xfs_inode_t *ip) -{ - xfs_trans_t *tp; - int error = 0; - int log_flushed = 0, changed = 1; - - xfs_itrace_entry(ip); - - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return XFS_ERROR(EIO); - - /* - * We always need to make sure that the required inode state is safe on - * disk. The inode might be clean but we still might need to force the - * log because of committed transactions that haven't hit the disk yet. - * Likewise, there could be unflushed non-transactional changes to the - * inode core that have to go to disk and this requires us to issue - * a synchronous transaction to capture these changes correctly. - * - * This code relies on the assumption that if the update_* fields - * of the inode are clear and the inode is unpinned then it is clean - * and no action is required. - */ - xfs_ilock(ip, XFS_ILOCK_SHARED); - - if (!ip->i_update_core) { - /* - * Timestamps/size haven't changed since last inode flush or - * inode transaction commit. That means either nothing got - * written or a transaction committed which caught the updates. - * If the latter happened and the transaction hasn't hit the - * disk yet, the inode will be still be pinned. If it is, - * force the log. - */ - - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - if (xfs_ipincount(ip)) { - error = _xfs_log_force(ip->i_mount, (xfs_lsn_t)0, - XFS_LOG_FORCE | XFS_LOG_SYNC, - &log_flushed); - } else { - /* - * If the inode is not pinned and nothing has changed - * we don't need to flush the cache. - */ - changed = 0; - } - } else { - /* - * Kick off a transaction to log the inode core to get the - * updates. The sync transaction will also force the log. - */ - xfs_iunlock(ip, XFS_ILOCK_SHARED); - tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, 0, - XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - return error; - } - xfs_ilock(ip, XFS_ILOCK_EXCL); - - /* - * Note - it's possible that we might have pushed ourselves out - * of the way during trans_reserve which would flush the inode. - * But there's no guarantee that the inode buffer has actually - * gone out yet (it's delwri). Plus the buffer could be pinned - * anyway if it's part of an inode in another recent - * transaction. So we play it safe and fire off the - * transaction anyway. - */ - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - xfs_trans_set_sync(tp); - error = _xfs_trans_commit(tp, 0, &log_flushed); - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - - if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) { - /* - * If the log write didn't issue an ordered tag we need - * to flush the disk cache for the data device now. - */ - if (!log_flushed) - xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp); - - /* - * If this inode is on the RT dev we need to flush that - * cache as well. - */ - if (XFS_IS_REALTIME_INODE(ip)) - xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp); - } - - return error; -} - -/* * Flags for xfs_free_eofblocks */ #define XFS_FREE_EOF_TRYLOCK (1<<0) @@ -1096,7 +986,7 @@ xfs_release( */ truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); if (truncated && VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) - xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE); + xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE); } if (ip->i_d.di_nlink != 0) { @@ -1379,7 +1269,7 @@ xfs_lookup( if (error) goto out; - error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp, 0); + error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp); if (error) goto out_free_name; @@ -2199,7 +2089,8 @@ xfs_symlink( if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) { error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp, DM_RIGHT_NULL, NULL, DM_RIGHT_NULL, - link_name->name, target_path, 0, 0, 0); + link_name->name, + (unsigned char *)target_path, 0, 0, 0); if (error) return error; } @@ -2395,7 +2286,8 @@ std_return: dp, DM_RIGHT_NULL, error ? NULL : ip, DM_RIGHT_NULL, link_name->name, - target_path, 0, error, 0); + (unsigned char *)target_path, + 0, error, 0); } if (!error) diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 167a467403a5..d8dfa8d0dadd 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -21,7 +21,6 @@ int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags); #define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */ int xfs_readlink(struct xfs_inode *ip, char *link); -int xfs_fsync(struct xfs_inode *ip); int xfs_release(struct xfs_inode *ip); int xfs_inactive(struct xfs_inode *ip); int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, @@ -43,25 +42,13 @@ int xfs_change_file_space(struct xfs_inode *ip, int cmd, int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, struct xfs_inode *src_ip, struct xfs_inode *target_dp, struct xfs_name *target_name, struct xfs_inode *target_ip); -int xfs_attr_get(struct xfs_inode *ip, const char *name, char *value, - int *valuelenp, int flags); -int xfs_attr_set(struct xfs_inode *dp, const char *name, char *value, - int valuelen, int flags); -int xfs_attr_remove(struct xfs_inode *dp, const char *name, int flags); +int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name, + unsigned char *value, int *valuelenp, int flags); +int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, + unsigned char *value, int valuelen, int flags); +int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, int flags, struct attrlist_cursor_kern *cursor); -ssize_t xfs_read(struct xfs_inode *ip, struct kiocb *iocb, - const struct iovec *iovp, unsigned int segs, - loff_t *offset, int ioflags); -ssize_t xfs_splice_read(struct xfs_inode *ip, struct file *infilp, - loff_t *ppos, struct pipe_inode_info *pipe, size_t count, - int flags, int ioflags); -ssize_t xfs_splice_write(struct xfs_inode *ip, - struct pipe_inode_info *pipe, struct file *outfilp, - loff_t *ppos, size_t count, int flags, int ioflags); -ssize_t xfs_write(struct xfs_inode *xip, struct kiocb *iocb, - const struct iovec *iovp, unsigned int nsegs, - loff_t *offset, int ioflags); int xfs_bmap(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, int flags, struct xfs_iomap *iomapp, int *niomaps); void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first, @@ -72,4 +59,6 @@ int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last, uint64_t flags, int fiopt); int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last); +int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); + #endif /* _XFS_VNODEOPS_H */ |