diff options
Diffstat (limited to 'fs/xfs/linux-2.6/xfs_lrw.c')
-rw-r--r-- | fs/xfs/linux-2.6/xfs_lrw.c | 1082 |
1 files changed, 1082 insertions, 0 deletions
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c new file mode 100644 index 000000000000..ff145fd0d1a4 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -0,0 +1,1082 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +/* + * fs/xfs/linux/xfs_lrw.c (Linux Read Write stuff) + * + */ + +#include "xfs.h" + +#include "xfs_fs.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_alloc.h" +#include "xfs_dmapi.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_bit.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_itable.h" +#include "xfs_rw.h" +#include "xfs_acl.h" +#include "xfs_cap.h" +#include "xfs_mac.h" +#include "xfs_attr.h" +#include "xfs_inode_item.h" +#include "xfs_buf_item.h" +#include "xfs_utils.h" +#include "xfs_iomap.h" + +#include <linux/capability.h> +#include <linux/writeback.h> + + +#if defined(XFS_RW_TRACE) +void +xfs_rw_enter_trace( + int tag, + xfs_iocore_t *io, + void *data, + size_t segs, + loff_t offset, + int ioflags) +{ + xfs_inode_t *ip = XFS_IO_INODE(io); + + if (ip->i_rwtrace == NULL) + return; + ktrace_enter(ip->i_rwtrace, + (void *)(unsigned long)tag, + (void *)ip, + (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)), + (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)), + (void *)data, + (void *)((unsigned long)segs), + (void *)((unsigned long)((offset >> 32) & 0xffffffff)), + (void *)((unsigned long)(offset & 0xffffffff)), + (void *)((unsigned long)ioflags), + (void *)((unsigned long)((io->io_new_size >> 32) & 0xffffffff)), + (void *)((unsigned long)(io->io_new_size & 0xffffffff)), + (void *)NULL, + (void *)NULL, + (void *)NULL, + (void *)NULL, + (void *)NULL); +} + +void +xfs_inval_cached_trace( + xfs_iocore_t *io, + xfs_off_t offset, + xfs_off_t len, + xfs_off_t first, + xfs_off_t last) +{ + xfs_inode_t *ip = XFS_IO_INODE(io); + + if (ip->i_rwtrace == NULL) + return; + ktrace_enter(ip->i_rwtrace, + (void *)(__psint_t)XFS_INVAL_CACHED, + (void *)ip, + (void *)((unsigned long)((offset >> 32) & 0xffffffff)), + (void *)((unsigned long)(offset & 0xffffffff)), + (void *)((unsigned long)((len >> 32) & 0xffffffff)), + (void *)((unsigned long)(len & 0xffffffff)), + (void *)((unsigned long)((first >> 32) & 0xffffffff)), + (void *)((unsigned long)(first & 0xffffffff)), + (void *)((unsigned long)((last >> 32) & 0xffffffff)), + (void *)((unsigned long)(last & 0xffffffff)), + (void *)NULL, + (void *)NULL, + (void *)NULL, + (void *)NULL, + (void *)NULL, + (void *)NULL); +} +#endif + +/* + * xfs_iozero + * + * xfs_iozero clears the specified range of buffer supplied, + * and marks all the affected blocks as valid and modified. If + * an affected block is not allocated, it will be allocated. If + * an affected block is not completely overwritten, and is not + * valid before the operation, it will be read from disk before + * being partially zeroed. + */ +STATIC int +xfs_iozero( + struct inode *ip, /* inode */ + loff_t pos, /* offset in file */ + size_t count, /* size of data to zero */ + loff_t end_size) /* max file size to set */ +{ + unsigned bytes; + struct page *page; + struct address_space *mapping; + char *kaddr; + int status; + + mapping = ip->i_mapping; + do { + unsigned long index, offset; + + offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ + index = pos >> PAGE_CACHE_SHIFT; + bytes = PAGE_CACHE_SIZE - offset; + if (bytes > count) + bytes = count; + + status = -ENOMEM; + page = grab_cache_page(mapping, index); + if (!page) + break; + + kaddr = kmap(page); + status = mapping->a_ops->prepare_write(NULL, page, offset, + offset + bytes); + if (status) { + goto unlock; + } + + memset((void *) (kaddr + offset), 0, bytes); + flush_dcache_page(page); + status = mapping->a_ops->commit_write(NULL, page, offset, + offset + bytes); + if (!status) { + pos += bytes; + count -= bytes; + if (pos > i_size_read(ip)) + i_size_write(ip, pos < end_size ? pos : end_size); + } + +unlock: + kunmap(page); + unlock_page(page); + page_cache_release(page); + if (status) + break; + } while (count); + + return (-status); +} + +/* + * xfs_inval_cached_pages + * + * This routine is responsible for keeping direct I/O and buffered I/O + * somewhat coherent. From here we make sure that we're at least + * temporarily holding the inode I/O lock exclusively and then call + * the page cache to flush and invalidate any cached pages. If there + * are no cached pages this routine will be very quick. + */ +void +xfs_inval_cached_pages( + vnode_t *vp, + xfs_iocore_t *io, + xfs_off_t offset, + int write, + int relock) +{ + if (VN_CACHED(vp)) { + xfs_inval_cached_trace(io, offset, -1, ctooff(offtoct(offset)), -1); + VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(offset)), -1, FI_REMAPF_LOCKED); + } + +} + +ssize_t /* bytes read, or (-) error */ +xfs_read( + bhv_desc_t *bdp, + struct kiocb *iocb, + const struct iovec *iovp, + unsigned int segs, + loff_t *offset, + int ioflags, + cred_t *credp) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + size_t size = 0; + ssize_t ret; + xfs_fsize_t n; + xfs_inode_t *ip; + xfs_mount_t *mp; + vnode_t *vp; + unsigned long seg; + + ip = XFS_BHVTOI(bdp); + vp = BHV_TO_VNODE(bdp); + mp = ip->i_mount; + + XFS_STATS_INC(xs_read_calls); + + /* START copy & waste from filemap.c */ + for (seg = 0; seg < segs; seg++) { + const struct iovec *iv = &iovp[seg]; + + /* + * If any segment has a negative length, or the cumulative + * length ever wraps negative then return -EINVAL. + */ + size += iv->iov_len; + if (unlikely((ssize_t)(size|iv->iov_len) < 0)) + return XFS_ERROR(-EINVAL); + } + /* END copy & waste from filemap.c */ + + if (unlikely(ioflags & IO_ISDIRECT)) { + xfs_buftarg_t *target = + (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + if ((*offset & target->pbr_smask) || + (size & target->pbr_smask)) { + if (*offset == ip->i_d.di_size) { + return (0); + } + return -XFS_ERROR(EINVAL); + } + } + + n = XFS_MAXIOFFSET(mp) - *offset; + if ((n <= 0) || (size == 0)) + return 0; + + if (n < size) + size = n; + + if (XFS_FORCED_SHUTDOWN(mp)) { + return -EIO; + } + + if (unlikely(ioflags & IO_ISDIRECT)) + down(&inode->i_sem); + xfs_ilock(ip, XFS_IOLOCK_SHARED); + + if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) && + !(ioflags & IO_INVIS)) { + vrwlock_t locktype = VRWLOCK_READ; + + ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, + BHV_TO_VNODE(bdp), *offset, size, + FILP_DELAY_FLAG(file), &locktype); + if (ret) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + goto unlock_isem; + } + } + + xfs_rw_enter_trace(XFS_READ_ENTER, &ip->i_iocore, + (void *)iovp, segs, *offset, ioflags); + ret = __generic_file_aio_read(iocb, iovp, segs, offset); + if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO)) + ret = wait_on_sync_kiocb(iocb); + if (ret > 0) + XFS_STATS_ADD(xs_read_bytes, ret); + + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + + if (likely(!(ioflags & IO_INVIS))) + xfs_ichgtime(ip, XFS_ICHGTIME_ACC); + +unlock_isem: + if (unlikely(ioflags & IO_ISDIRECT)) + up(&inode->i_sem); + return ret; +} + +ssize_t +xfs_sendfile( + bhv_desc_t *bdp, + struct file *filp, + loff_t *offset, + int ioflags, + size_t count, + read_actor_t actor, + void *target, + cred_t *credp) +{ + ssize_t ret; + xfs_fsize_t n; + xfs_inode_t *ip; + xfs_mount_t *mp; + vnode_t *vp; + + ip = XFS_BHVTOI(bdp); + vp = BHV_TO_VNODE(bdp); + mp = ip->i_mount; + + XFS_STATS_INC(xs_read_calls); + + n = XFS_MAXIOFFSET(mp) - *offset; + if ((n <= 0) || (count == 0)) + return 0; + + if (n < count) + count = n; + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + xfs_ilock(ip, XFS_IOLOCK_SHARED); + + if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) && + (!(ioflags & IO_INVIS))) { + vrwlock_t locktype = VRWLOCK_READ; + int error; + + error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), *offset, count, + FILP_DELAY_FLAG(filp), &locktype); + if (error) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return -error; + } + } + xfs_rw_enter_trace(XFS_SENDFILE_ENTER, &ip->i_iocore, + (void *)(unsigned long)target, count, *offset, ioflags); + ret = generic_file_sendfile(filp, offset, count, actor, target); + + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + + if (ret > 0) + XFS_STATS_ADD(xs_read_bytes, ret); + + if (likely(!(ioflags & IO_INVIS))) + xfs_ichgtime(ip, XFS_ICHGTIME_ACC); + + return ret; +} + +/* + * This routine is called to handle zeroing any space in the last + * block of the file that is beyond the EOF. We do this since the + * size is being increased without writing anything to that block + * and we don't want anyone to read the garbage on the disk. + */ +STATIC int /* error (positive) */ +xfs_zero_last_block( + struct inode *ip, + xfs_iocore_t *io, + xfs_off_t offset, + xfs_fsize_t isize, + xfs_fsize_t end_size) +{ + xfs_fileoff_t last_fsb; + xfs_mount_t *mp; + int nimaps; + int zero_offset; + int zero_len; + int isize_fsb_offset; + int error = 0; + xfs_bmbt_irec_t imap; + loff_t loff; + size_t lsize; + + ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0); + ASSERT(offset > isize); + + mp = io->io_mount; + + isize_fsb_offset = XFS_B_FSB_OFFSET(mp, isize); + if (isize_fsb_offset == 0) { + /* + * There are no extra bytes in the last block on disk to + * zero, so return. + */ + return 0; + } + + last_fsb = XFS_B_TO_FSBT(mp, isize); + nimaps = 1; + error = XFS_BMAPI(mp, NULL, io, last_fsb, 1, 0, NULL, 0, &imap, + &nimaps, NULL); + if (error) { + return error; + } + ASSERT(nimaps > 0); + /* + * If the block underlying isize is just a hole, then there + * is nothing to zero. + */ + if (imap.br_startblock == HOLESTARTBLOCK) { + return 0; + } + /* + * Zero the part of the last block beyond the EOF, and write it + * out sync. We need to drop the ilock while we do this so we + * don't deadlock when the buffer cache calls back to us. + */ + XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL| XFS_EXTSIZE_RD); + loff = XFS_FSB_TO_B(mp, last_fsb); + lsize = XFS_FSB_TO_B(mp, 1); + + zero_offset = isize_fsb_offset; + zero_len = mp->m_sb.sb_blocksize - isize_fsb_offset; + + error = xfs_iozero(ip, loff + zero_offset, zero_len, end_size); + + XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); + ASSERT(error >= 0); + return error; +} + +/* + * Zero any on disk space between the current EOF and the new, + * larger EOF. This handles the normal case of zeroing the remainder + * of the last block in the file and the unusual case of zeroing blocks + * out beyond the size of the file. This second case only happens + * with fixed size extents and when the system crashes before the inode + * size was updated but after blocks were allocated. If fill is set, + * then any holes in the range are filled and zeroed. If not, the holes + * are left alone as holes. + */ + +int /* error (positive) */ +xfs_zero_eof( + vnode_t *vp, + xfs_iocore_t *io, + xfs_off_t offset, /* starting I/O offset */ + xfs_fsize_t isize, /* current inode size */ + xfs_fsize_t end_size) /* terminal inode size */ +{ + struct inode *ip = LINVFS_GET_IP(vp); + xfs_fileoff_t start_zero_fsb; + xfs_fileoff_t end_zero_fsb; + xfs_fileoff_t prev_zero_fsb; + xfs_fileoff_t zero_count_fsb; + xfs_fileoff_t last_fsb; + xfs_extlen_t buf_len_fsb; + xfs_extlen_t prev_zero_count; + xfs_mount_t *mp; + int nimaps; + int error = 0; + xfs_bmbt_irec_t imap; + loff_t loff; + size_t lsize; + + ASSERT(ismrlocked(io->io_lock, MR_UPDATE)); + ASSERT(ismrlocked(io->io_iolock, MR_UPDATE)); + + mp = io->io_mount; + + /* + * First handle zeroing the block on which isize resides. + * We only zero a part of that block so it is handled specially. + */ + error = xfs_zero_last_block(ip, io, offset, isize, end_size); + if (error) { + ASSERT(ismrlocked(io->io_lock, MR_UPDATE)); + ASSERT(ismrlocked(io->io_iolock, MR_UPDATE)); + return error; + } + + /* + * Calculate the range between the new size and the old + * where blocks needing to be zeroed may exist. To get the + * block where the last byte in the file currently resides, + * we need to subtract one from the size and truncate back + * to a block boundary. We subtract 1 in case the size is + * exactly on a block boundary. + */ + last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1; + start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); + end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1); + ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb); + if (last_fsb == end_zero_fsb) { + /* + * The size was only incremented on its last block. + * We took care of that above, so just return. + */ + return 0; + } + + ASSERT(start_zero_fsb <= end_zero_fsb); + prev_zero_fsb = NULLFILEOFF; + prev_zero_count = 0; + while (start_zero_fsb <= end_zero_fsb) { + nimaps = 1; + zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; + error = XFS_BMAPI(mp, NULL, io, start_zero_fsb, zero_count_fsb, + 0, NULL, 0, &imap, &nimaps, NULL); + if (error) { + ASSERT(ismrlocked(io->io_lock, MR_UPDATE)); + ASSERT(ismrlocked(io->io_iolock, MR_UPDATE)); + return error; + } + ASSERT(nimaps > 0); + + if (imap.br_state == XFS_EXT_UNWRITTEN || + imap.br_startblock == HOLESTARTBLOCK) { + /* + * This loop handles initializing pages that were + * partially initialized by the code below this + * loop. It basically zeroes the part of the page + * that sits on a hole and sets the page as P_HOLE + * and calls remapf if it is a mapped file. + */ + prev_zero_fsb = NULLFILEOFF; + prev_zero_count = 0; + start_zero_fsb = imap.br_startoff + + imap.br_blockcount; + ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); + continue; + } + + /* + * There are blocks in the range requested. + * Zero them a single write at a time. We actually + * don't zero the entire range returned if it is + * too big and simply loop around to get the rest. + * That is not the most efficient thing to do, but it + * is simple and this path should not be exercised often. + */ + buf_len_fsb = XFS_FILBLKS_MIN(imap.br_blockcount, + mp->m_writeio_blocks << 8); + /* + * Drop the inode lock while we're doing the I/O. + * We'll still have the iolock to protect us. + */ + XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); + + loff = XFS_FSB_TO_B(mp, start_zero_fsb); + lsize = XFS_FSB_TO_B(mp, buf_len_fsb); + + error = xfs_iozero(ip, loff, lsize, end_size); + + if (error) { + goto out_lock; + } + + prev_zero_fsb = start_zero_fsb; + prev_zero_count = buf_len_fsb; + start_zero_fsb = imap.br_startoff + buf_len_fsb; + ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); + + XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); + } + + return 0; + +out_lock: + + XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); + ASSERT(error >= 0); + return error; +} + +ssize_t /* bytes written, or (-) error */ +xfs_write( + bhv_desc_t *bdp, + struct kiocb *iocb, + const struct iovec *iovp, + unsigned int nsegs, + loff_t *offset, + int ioflags, + cred_t *credp) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + unsigned long segs = nsegs; + xfs_inode_t *xip; + xfs_mount_t *mp; + ssize_t ret = 0, error = 0; + xfs_fsize_t isize, new_size; + xfs_iocore_t *io; + vnode_t *vp; + unsigned long seg; + int iolock; + int eventsent = 0; + vrwlock_t locktype; + size_t ocount = 0, count; + loff_t pos; + int need_isem = 1, need_flush = 0; + + XFS_STATS_INC(xs_write_calls); + + vp = BHV_TO_VNODE(bdp); + xip = XFS_BHVTOI(bdp); + + for (seg = 0; seg < segs; seg++) { + const struct iovec *iv = &iovp[seg]; + + /* + * If any segment has a negative length, or the cumulative + * length ever wraps negative then return -EINVAL. + */ + ocount += iv->iov_len; + if (unlikely((ssize_t)(ocount|iv->iov_len) < 0)) + return -EINVAL; + if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len)) + continue; + if (seg == 0) + return -EFAULT; + segs = seg; + ocount -= iv->iov_len; /* This segment is no good */ + break; + } + + count = ocount; + pos = *offset; + + if (count == 0) + return 0; + + io = &xip->i_iocore; + mp = io->io_mount; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + fs_check_frozen(vp->v_vfsp, SB_FREEZE_WRITE); + + if (ioflags & IO_ISDIRECT) { + xfs_buftarg_t *target = + (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + + if ((pos & target->pbr_smask) || (count & target->pbr_smask)) + return XFS_ERROR(-EINVAL); + + if (!VN_CACHED(vp) && pos < i_size_read(inode)) + need_isem = 0; + + if (VN_CACHED(vp)) + need_flush = 1; + } + +relock: + if (need_isem) { + iolock = XFS_IOLOCK_EXCL; + locktype = VRWLOCK_WRITE; + + down(&inode->i_sem); + } else { + iolock = XFS_IOLOCK_SHARED; + locktype = VRWLOCK_WRITE_DIRECT; + } + + xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); + + isize = i_size_read(inode); + + if (file->f_flags & O_APPEND) + *offset = isize; + +start: + error = -generic_write_checks(file, &pos, &count, + S_ISBLK(inode->i_mode)); + if (error) { + xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); + goto out_unlock_isem; + } + + new_size = pos + count; + if (new_size > isize) + io->io_new_size = new_size; + + if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) && + !(ioflags & IO_INVIS) && !eventsent)) { + loff_t savedsize = pos; + int dmflags = FILP_DELAY_FLAG(file); + + if (need_isem) + dmflags |= DM_FLAGS_ISEM; + + xfs_iunlock(xip, XFS_ILOCK_EXCL); + error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp, + pos, count, + dmflags, &locktype); + if (error) { + xfs_iunlock(xip, iolock); + goto out_unlock_isem; + } + xfs_ilock(xip, XFS_ILOCK_EXCL); + eventsent = 1; + + /* + * The iolock was dropped and reaquired in XFS_SEND_DATA + * so we have to recheck the size when appending. + * We will only "goto start;" once, since having sent the + * event prevents another call to XFS_SEND_DATA, which is + * what allows the size to change in the first place. + */ + if ((file->f_flags & O_APPEND) && savedsize != isize) { + pos = isize = xip->i_d.di_size; + goto start; + } + } + + /* + * On Linux, generic_file_write updates the times even if + * no data is copied in so long as the write had a size. + * + * We must update xfs' times since revalidate will overcopy xfs. + */ + if (!(ioflags & IO_INVIS)) { + xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + inode_update_time(inode, 1); + } + + /* + * If the offset is beyond the size of the file, we have a couple + * of things to do. First, if there is already space allocated + * we need to either create holes or zero the disk or ... + * + * If there is a page where the previous size lands, we need + * to zero it out up to the new size. + */ + + if (pos > isize) { + error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos, + isize, pos + count); + if (error) { + xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); + goto out_unlock_isem; + } + } + xfs_iunlock(xip, XFS_ILOCK_EXCL); + + /* + * If we're writing the file then make sure to clear the + * setuid and setgid bits if the process is not being run + * by root. This keeps people from modifying setuid and + * setgid binaries. + */ + + if (((xip->i_d.di_mode & S_ISUID) || + ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) == + (S_ISGID | S_IXGRP))) && + !capable(CAP_FSETID)) { + error = xfs_write_clear_setuid(xip); + if (likely(!error)) + error = -remove_suid(file->f_dentry); + if (unlikely(error)) { + xfs_iunlock(xip, iolock); + goto out_unlock_isem; + } + } + +retry: + /* We can write back this queue in page reclaim */ + current->backing_dev_info = mapping->backing_dev_info; + + if ((ioflags & IO_ISDIRECT)) { + if (need_flush) { + xfs_inval_cached_trace(io, pos, -1, + ctooff(offtoct(pos)), -1); + VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(pos)), + -1, FI_REMAPF_LOCKED); + } + + if (need_isem) { + /* demote the lock now the cached pages are gone */ + XFS_ILOCK_DEMOTE(mp, io, XFS_IOLOCK_EXCL); + up(&inode->i_sem); + + iolock = XFS_IOLOCK_SHARED; + locktype = VRWLOCK_WRITE_DIRECT; + need_isem = 0; + } + + xfs_rw_enter_trace(XFS_DIOWR_ENTER, io, (void *)iovp, segs, + *offset, ioflags); + ret = generic_file_direct_write(iocb, iovp, + &segs, pos, offset, count, ocount); + + /* + * direct-io write to a hole: fall through to buffered I/O + * for completing the rest of the request. + */ + if (ret >= 0 && ret != count) { + XFS_STATS_ADD(xs_write_bytes, ret); + + pos += ret; + count -= ret; + + need_isem = 1; + ioflags &= ~IO_ISDIRECT; + xfs_iunlock(xip, iolock); + goto relock; + } + } else { + xfs_rw_enter_trace(XFS_WRITE_ENTER, io, (void *)iovp, segs, + *offset, ioflags); + ret = generic_file_buffered_write(iocb, iovp, segs, + pos, offset, count, ret); + } + + current->backing_dev_info = NULL; + + if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO)) + ret = wait_on_sync_kiocb(iocb); + + if ((ret == -ENOSPC) && + DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) && + !(ioflags & IO_INVIS)) { + + xfs_rwunlock(bdp, locktype); + error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp, + DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL, + 0, 0, 0); /* Delay flag intentionally unused */ + if (error) + goto out_unlock_isem; + xfs_rwlock(bdp, locktype); + pos = xip->i_d.di_size; + ret = 0; + goto retry; + } + + if (*offset > xip->i_d.di_size) { + xfs_ilock(xip, XFS_ILOCK_EXCL); + if (*offset > xip->i_d.di_size) { + xip->i_d.di_size = *offset; + i_size_write(inode, *offset); + xip->i_update_core = 1; + xip->i_update_size = 1; + } + xfs_iunlock(xip, XFS_ILOCK_EXCL); + } + + error = -ret; + if (ret <= 0) + goto out_unlock_internal; + + XFS_STATS_ADD(xs_write_bytes, ret); + + /* Handle various SYNC-type writes */ + if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { + /* + * If we're treating this as O_DSYNC and we have not updated the + * size, force the log. + */ + if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) && + !(xip->i_update_size)) { + xfs_inode_log_item_t *iip = xip->i_itemp; + + /* + * If an allocation transaction occurred + * without extending the size, then we have to force + * the log up the proper point to ensure that the + * allocation is permanent. We can't count on + * the fact that buffered writes lock out direct I/O + * writes - the direct I/O write could have extended + * the size nontransactionally, then finished before + * we started. xfs_write_file will think that the file + * didn't grow but the update isn't safe unless the + * size change is logged. + * + * Force the log if we've committed a transaction + * against the inode or if someone else has and + * the commit record hasn't gone to disk (e.g. + * the inode is pinned). This guarantees that + * all changes affecting the inode are permanent + * when we return. + */ + if (iip && iip->ili_last_lsn) { + xfs_log_force(mp, iip->ili_last_lsn, + XFS_LOG_FORCE | XFS_LOG_SYNC); + } else if (xfs_ipincount(xip) > 0) { + xfs_log_force(mp, (xfs_lsn_t)0, + XFS_LOG_FORCE | XFS_LOG_SYNC); + } + + } else { + xfs_trans_t *tp; + + /* + * O_SYNC or O_DSYNC _with_ a size update are handled + * the same way. + * + * If the write was synchronous then we need to make + * sure that the inode modification time is permanent. + * We'll have updated the timestamp above, so here + * we use a synchronous transaction to log the inode. + * It's not fast, but it's necessary. + * + * If this a dsync write and the size got changed + * non-transactionally, then we need to ensure that + * the size change gets logged in a synchronous + * transaction. + */ + + tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC); + if ((error = xfs_trans_reserve(tp, 0, + XFS_SWRITE_LOG_RES(mp), + 0, 0, 0))) { + /* Transaction reserve failed */ + xfs_trans_cancel(tp, 0); + } else { + /* Transaction reserve successful */ + xfs_ilock(xip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL); + xfs_trans_ihold(tp, xip); + xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE); + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0, NULL); + xfs_iunlock(xip, XFS_ILOCK_EXCL); + } + if (error) + goto out_unlock_internal; + } + + xfs_rwunlock(bdp, locktype); + if (need_isem) + up(&inode->i_sem); + + error = sync_page_range(inode, mapping, pos, ret); + if (!error) + error = ret; + return error; + } + + out_unlock_internal: + xfs_rwunlock(bdp, locktype); + out_unlock_isem: + if (need_isem) + up(&inode->i_sem); + return -error; +} + +/* + * All xfs metadata buffers except log state machine buffers + * get this attached as their b_bdstrat callback function. + * This is so that we can catch a buffer + * after prematurely unpinning it to forcibly shutdown the filesystem. + */ +int +xfs_bdstrat_cb(struct xfs_buf *bp) +{ + xfs_mount_t *mp; + + mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *); + if (!XFS_FORCED_SHUTDOWN(mp)) { + pagebuf_iorequest(bp); + return 0; + } else { + xfs_buftrace("XFS__BDSTRAT IOERROR", bp); + /* + * Metadata write that didn't get logged but + * written delayed anyway. These aren't associated + * with a transaction, and can be ignored. + */ + if (XFS_BUF_IODONE_FUNC(bp) == NULL && + (XFS_BUF_ISREAD(bp)) == 0) + return (xfs_bioerror_relse(bp)); + else + return (xfs_bioerror(bp)); + } +} + + +int +xfs_bmap(bhv_desc_t *bdp, + xfs_off_t offset, + ssize_t count, + int flags, + xfs_iomap_t *iomapp, + int *niomaps) +{ + xfs_inode_t *ip = XFS_BHVTOI(bdp); + xfs_iocore_t *io = &ip->i_iocore; + + ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); + ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) == + ((ip->i_iocore.io_flags & XFS_IOCORE_RT) != 0)); + + return xfs_iomap(io, offset, count, flags, iomapp, niomaps); +} + +/* + * Wrapper around bdstrat so that we can stop data + * from going to disk in case we are shutting down the filesystem. + * Typically user data goes thru this path; one of the exceptions + * is the superblock. + */ +int +xfsbdstrat( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + ASSERT(mp); + if (!XFS_FORCED_SHUTDOWN(mp)) { + /* Grio redirection would go here + * if (XFS_BUF_IS_GRIO(bp)) { + */ + + pagebuf_iorequest(bp); + return 0; + } + + xfs_buftrace("XFSBDSTRAT IOERROR", bp); + return (xfs_bioerror_relse(bp)); +} + +/* + * If the underlying (data/log/rt) device is readonly, there are some + * operations that cannot proceed. + */ +int +xfs_dev_is_read_only( + xfs_mount_t *mp, + char *message) +{ + if (xfs_readonly_buftarg(mp->m_ddev_targp) || + xfs_readonly_buftarg(mp->m_logdev_targp) || + (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) { + cmn_err(CE_NOTE, + "XFS: %s required on read-only device.", message); + cmn_err(CE_NOTE, + "XFS: write access unavailable, cannot proceed."); + return EROFS; + } + return 0; +} |