Diffstat (limited to 'fs')
 fs/xfs/xfs_aops.c      | 51
 fs/xfs/xfs_bmap.c      | 17
 fs/xfs/xfs_bmap_util.c | 13
 fs/xfs/xfs_buf.c       | 16
 fs/xfs/xfs_file.c      |  2
 fs/xfs/xfs_inode.c     |  5
 fs/xfs/xfs_inode.h     |  2
 fs/xfs/xfs_iops.c      | 20
 fs/xfs/xfs_log.c       | 53
 fs/xfs/xfs_trace.h     |  1
 10 files changed, 147 insertions(+), 33 deletions(-)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 75df77d09f75..0479c32c5eb1 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1344,6 +1344,14 @@ __xfs_get_blocks(
 	/*
 	 * If this is O_DIRECT or the mpage code calling, tell them how large
 	 * the mapping is, so that we can avoid repeated get_blocks calls.
+	 *
+	 * If the mapping spans EOF, then we have to break the mapping up as the
+	 * mapping for blocks beyond EOF must be marked new so that sub block
+	 * regions can be correctly zeroed. We can't do this for mappings within
+	 * EOF unless the mapping was just allocated or is unwritten, otherwise
+	 * the callers would overwrite existing data with zeros. Hence we have
+	 * to split the mapping into a range up to and including EOF, and a
+	 * second mapping for beyond EOF.
 	 */
 	if (direct || size > (1 << inode->i_blkbits)) {
 		xfs_off_t		mapping_size;
@@ -1354,6 +1362,12 @@ __xfs_get_blocks(
 		ASSERT(mapping_size > 0);
 		if (mapping_size > size)
 			mapping_size = size;
+		if (offset < i_size_read(inode) &&
+		    offset + mapping_size >= i_size_read(inode)) {
+			/* limit mapping to block that spans EOF */
+			mapping_size = roundup_64(i_size_read(inode) - offset,
+						  1 << inode->i_blkbits);
+		}
 		if (mapping_size > LONG_MAX)
 			mapping_size = LONG_MAX;
@@ -1566,6 +1580,16 @@ xfs_vm_write_failed(
 		xfs_vm_kill_delalloc_range(inode, block_offset,
 					   block_offset + bh->b_size);
+
+		/*
+		 * This buffer does not contain data anymore. Make sure anyone
+		 * who finds it knows that for certain.
+		 */
+		clear_buffer_delay(bh);
+		clear_buffer_uptodate(bh);
+		clear_buffer_mapped(bh);
+		clear_buffer_new(bh);
+		clear_buffer_dirty(bh);
 	}
 }
@@ -1599,12 +1623,21 @@ xfs_vm_write_begin(
 	status = __block_write_begin(page, pos, len, xfs_get_blocks);
 	if (unlikely(status)) {
 		struct inode	*inode = mapping->host;
+		size_t		isize = i_size_read(inode);
 		xfs_vm_write_failed(inode, page, pos, len);
 		unlock_page(page);
-		if (pos + len > i_size_read(inode))
-			truncate_pagecache(inode, i_size_read(inode));
+		/*
+		 * If the write is beyond EOF, we only want to kill blocks
+		 * allocated in this write, not blocks that were previously
+		 * written successfully.
+		 */
+		if (pos + len > isize) {
+			ssize_t start = max_t(ssize_t, pos, isize);
+
+			truncate_pagecache_range(inode, start, pos + len);
+		}
 		page_cache_release(page);
 		page = NULL;
@@ -1615,9 +1648,12 @@ xfs_vm_write_begin(
 }
 /*
- * On failure, we only need to kill delalloc blocks beyond EOF because they
- * will never be written. For blocks within EOF, generic_write_end() zeros them
- * so they are safe to leave alone and be written with all the other valid data.
+ * On failure, we only need to kill delalloc blocks beyond EOF in the range of
+ * this specific write because they will never be written. Previous writes
+ * beyond EOF where block allocation succeeded do not need to be trashed, so
+ * only new blocks from this write should be trashed. For blocks within
+ * EOF, generic_write_end() zeros them so they are safe to leave alone and be
+ * written with all the other valid data.
  */
 STATIC int
 xfs_vm_write_end(
@@ -1640,8 +1676,11 @@ xfs_vm_write_end(
 		loff_t		to = pos + len;
 		if (to > isize) {
-			truncate_pagecache(inode, isize);
+			/* only kill blocks in this write beyond EOF */
+			if (pos > isize)
+				isize = pos;
 			xfs_vm_kill_delalloc_range(inode, isize, to);
+			truncate_pagecache_range(inode, isize, to);
 		}
 	}
 	return ret;
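
The EOF clamp added to __xfs_get_blocks() above is plain block-rounding arithmetic. A minimal standalone sketch of the same calculation, assuming a 4k block size and invented values (roundup64() here stands in for the kernel's roundup_64(); this is illustrative userspace code, not kernel code):

    #include <stdio.h>
    #include <stdint.h>

    /* round x up to the next multiple of unit (unit is a power of two) */
    static uint64_t roundup64(uint64_t x, uint64_t unit)
    {
        return (x + unit - 1) & ~(unit - 1);
    }

    int main(void)
    {
        uint64_t isize = 10000;          /* file size (EOF) */
        unsigned blkbits = 12;           /* 4096-byte blocks */
        uint64_t offset = 8192;          /* start of the requested mapping */
        uint64_t mapping_size = 65536;   /* size before the clamp */

        /*
         * The mapping starts inside EOF but extends past it: clamp it to
         * the block that spans EOF, so blocks beyond EOF get a separate
         * mapping that can be marked new and zeroed safely.
         */
        if (offset < isize && offset + mapping_size >= isize)
            mapping_size = roundup64(isize - offset, 1ULL << blkbits);

        /* prints 4096: one block, covering bytes 8192..12287, spans EOF */
        printf("clamped mapping: %llu bytes\n",
               (unsigned long long)mapping_size);
        return 0;
    }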
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 5b6092ef51ef..f0efc7e970ef 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5413,6 +5413,7 @@ xfs_bmap_shift_extents(
 	int				whichfork = XFS_DATA_FORK;
 	int				logflags;
 	xfs_filblks_t			blockcount = 0;
+	int				total_extents;
 	if (unlikely(XFS_TEST_ERROR(
 	    (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
@@ -5429,7 +5430,6 @@ xfs_bmap_shift_extents(
 	ASSERT(current_ext != NULL);
 	ifp = XFS_IFORK_PTR(ip, whichfork);
-
 	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
 		/* Read in all the extents */
 		error = xfs_iread_extents(tp, ip, whichfork);
@@ -5456,7 +5456,6 @@ xfs_bmap_shift_extents(
 	/* We are going to change core inode */
 	logflags = XFS_ILOG_CORE;
-
 	if (ifp->if_flags & XFS_IFBROOT) {
 		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
 		cur->bc_private.b.firstblock = *firstblock;
@@ -5467,8 +5466,14 @@ xfs_bmap_shift_extents(
 		logflags |= XFS_ILOG_DEXT;
 	}
-	while (nexts++ < num_exts &&
-	       *current_ext <  XFS_IFORK_NEXTENTS(ip, whichfork)) {
+	/*
+	 * There may be delalloc extents in the data fork before the range we
+	 * are collapsing out, so we cannot use the count of real extents here.
+	 * Instead we have to calculate it from the incore fork.
+	 */
+	total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+	while (nexts++ < num_exts && *current_ext < total_extents) {
 		gotp = xfs_iext_get_ext(ifp, *current_ext);
 		xfs_bmbt_get_all(gotp, &got);
@@ -5556,10 +5561,11 @@ xfs_bmap_shift_extents(
 		}
 		(*current_ext)++;
+		total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
 	}
 	/* Check if we are done */
-	if (*current_ext ==  XFS_IFORK_NEXTENTS(ip, whichfork))
+	if (*current_ext == total_extents)
 		*done = 1;
 del_cursor:
@@ -5568,6 +5574,5 @@ del_cursor:
 			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
 	xfs_trans_log_inode(tp, ip, logflags);
-
 	return error;
 }
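
The heart of the xfs_bmap_shift_extents() change is that the loop bound is recomputed from ifp->if_bytes after every shift, because merging the shifted extent into its left neighbour shrinks the incore extent count mid-loop. A toy model of the same pattern, with invented names (rec_t, struct fork) and no relation to the real on-disk format:

    #include <stdio.h>
    #include <stddef.h>

    typedef struct { long startoff; long len; } rec_t;

    struct fork {
        rec_t  recs[8];
        size_t if_bytes;        /* valid bytes in recs[] */
    };

    int main(void)
    {
        struct fork f = {
            .recs = { {0, 4}, {10, 4}, {14, 4}, {30, 2} },
            .if_bytes = 4 * sizeof(rec_t),
        };
        size_t i = 0;
        size_t total = f.if_bytes / sizeof(rec_t);

        while (i < total) {
            /* if this record is contiguous with the next, merge them */
            if (i + 1 < total &&
                f.recs[i].startoff + f.recs[i].len == f.recs[i + 1].startoff) {
                f.recs[i].len += f.recs[i + 1].len;
                for (size_t j = i + 1; j + 1 < total; j++)
                    f.recs[j] = f.recs[j + 1];
                f.if_bytes -= sizeof(rec_t);
            } else {
                i++;
            }
            /* rederive the bound, as the patch does after each shift */
            total = f.if_bytes / sizeof(rec_t);
        }
        printf("%zu records after merging\n", total);  /* prints 3 */
        return 0;
    }

Caching the count once before the loop, as the old code effectively did via XFS_IFORK_NEXTENTS(), would leave the bound stale after a merge.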
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 01f6a646caa1..296160b8e78c 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1418,6 +1418,8 @@ xfs_zero_file_space(
 	xfs_off_t		end_boundary;
 	int			error;
+	trace_xfs_zero_file_space(ip);
+
 	granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
 	/*
@@ -1432,9 +1434,18 @@ xfs_zero_file_space(
 	ASSERT(end_boundary <= offset + len);
 	if (start_boundary < end_boundary - 1) {
-		/* punch out the page cache over the conversion range */
+		/*
+		 * punch out delayed allocation blocks and the page cache over
+		 * the conversion range
+		 */
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		error = xfs_bmap_punch_delalloc_range(ip,
+				XFS_B_TO_FSBT(mp, start_boundary),
+				XFS_B_TO_FSB(mp, end_boundary - start_boundary));
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		truncate_pagecache_range(VFS_I(ip), start_boundary,
 					 end_boundary - 1);
+
 		/* convert the blocks */
 		error = xfs_alloc_file_space(ip, start_boundary,
 					end_boundary - start_boundary - 1,
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 107f2fdfe41f..cb10a0aaab3a 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1372,21 +1372,29 @@ xfs_buf_iorequest(
 		xfs_buf_wait_unpin(bp);
 	xfs_buf_hold(bp);
-	/* Set the count to 1 initially, this will stop an I/O
+	/*
+	 * Set the count to 1 initially, this will stop an I/O
 	 * completion callout which happens before we have started
 	 * all the I/O from calling xfs_buf_ioend too early.
 	 */
 	atomic_set(&bp->b_io_remaining, 1);
 	_xfs_buf_ioapply(bp);
-	_xfs_buf_ioend(bp, 1);
+	/*
+	 * If _xfs_buf_ioapply failed, we'll get back here with
+	 * only the reference we took above.  _xfs_buf_ioend will
+	 * drop it to zero, so we'd better not queue it for later,
+	 * or we'll free it before it's done.
+	 */
+	_xfs_buf_ioend(bp, bp->b_error ? 0 : 1);
 	xfs_buf_rele(bp);
 }
 /*
  * Waits for I/O to complete on the buffer supplied.  It returns immediately if
- * no I/O is pending or there is already a pending error on the buffer.  It
- * returns the I/O error code, if any, or 0 if there was no error.
+ * no I/O is pending or there is already a pending error on the buffer, in
+ * which case nothing will ever complete.  It returns the I/O error code, if
+ * any, or 0 if there was no error.
  */
 int
 xfs_buf_iowait(
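
The _xfs_buf_ioend(bp, bp->b_error ? 0 : 1) change is easier to see as a completion-count sketch: the submitter holds one artificial count so sub-IO completions cannot finalise the buffer early, and if submission failed that count is the last one, so the final completion must run inline rather than be queued for later. A rough model with invented names, assuming C11 atomics, not the actual xfs_buf implementation:

    #include <stdatomic.h>
    #include <stdio.h>

    struct buf {
        atomic_int io_remaining;
        int        error;
    };

    static void ioend(struct buf *bp, int schedule)
    {
        if (atomic_fetch_sub(&bp->io_remaining, 1) == 1) {
            /* last count: finalise now or hand off to deferred work */
            printf("ioend: error=%d, %s\n", bp->error,
                   schedule ? "queued for later" : "completed inline");
        }
    }

    static void submit(struct buf *bp, int fail)
    {
        atomic_store(&bp->io_remaining, 1);  /* submitter's count */
        if (fail) {
            bp->error = 5;                   /* EIO: nothing was issued */
        } else {
            atomic_fetch_add(&bp->io_remaining, 1); /* one sub-IO in flight */
            ioend(bp, 1);                    /* ...and it completes */
        }
        /*
         * Drop the submitter's count. If submission failed there is
         * nothing in flight, so don't defer (schedule=0) or the buffer
         * could be freed before the deferred work runs.
         */
        ioend(bp, bp->error ? 0 : 1);
    }

    int main(void)
    {
        struct buf ok = {0}, bad = {0};
        submit(&ok, 0);
        submit(&bad, 1);
        return 0;
    }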
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 79e96ce98733..82afdcb33183 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -679,7 +679,7 @@ xfs_file_dio_aio_write(
 		goto out;
 	if (mapping->nrpages) {
-		ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+		ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
 						    pos, -1);
 		if (ret)
 			goto out;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 5e7a38fa6ee6..768087bedbac 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1334,7 +1334,8 @@ int
 xfs_create_tmpfile(
 	struct xfs_inode	*dp,
 	struct dentry		*dentry,
-	umode_t			mode)
+	umode_t			mode,
+	struct xfs_inode	**ipp)
 {
 	struct xfs_mount	*mp = dp->i_mount;
 	struct xfs_inode	*ip = NULL;
@@ -1402,7 +1403,6 @@ xfs_create_tmpfile(
 	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
 	ip->i_d.di_nlink--;
-	d_tmpfile(dentry, VFS_I(ip));
 	error = xfs_iunlink(tp, ip);
 	if (error)
 		goto out_trans_abort;
@@ -1415,6 +1415,7 @@ xfs_create_tmpfile(
 	xfs_qm_dqrele(gdqp);
 	xfs_qm_dqrele(pdqp);
+	*ipp = ip;
 	return 0;
  out_trans_abort:
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 396cc1fafd0d..f2fcde52b66d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -334,7 +334,7 @@ int		xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
 int		xfs_create(struct xfs_inode *dp, struct xfs_name *name,
 			   umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp);
 int		xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry,
-			   umode_t mode);
+			   umode_t mode, struct xfs_inode **ipp);
 int		xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
 			   struct xfs_inode *ip);
 int		xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 89b07e43ca28..ef1ca010f417 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1053,11 +1053,25 @@ xfs_vn_tmpfile(
 	struct dentry	*dentry,
 	umode_t		mode)
 {
-	int		error;
+	int			error;
+	struct xfs_inode	*ip;
+	struct inode		*inode;
 
-	error = xfs_create_tmpfile(XFS_I(dir), dentry, mode);
+	error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip);
+	if (unlikely(error))
+		return -error;
 
-	return -error;
+	inode = VFS_I(ip);
+
+	error = xfs_init_security(inode, dir, &dentry->d_name);
+	if (unlikely(error)) {
+		iput(inode);
+		return -error;
+	}
+
+	d_tmpfile(dentry, inode);
+
+	return 0;
 }
 static const struct inode_operations xfs_inode_operations = {
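
Moving d_tmpfile() after xfs_init_security() follows the usual rule that an object must be fully initialised before it becomes visible. Userspace has the same idiom with O_TMPFILE: set the unnamed file up first, then publish it with linkat(). A Linux-specific sketch under that analogy (the /tmp paths are invented; O_TMPFILE needs a supporting filesystem):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/stat.h>
    #include <unistd.h>

    int main(void)
    {
        /* create a file with no name: invisible to lookups */
        int fd = open("/tmp", O_TMPFILE | O_WRONLY, 0600);
        if (fd < 0) {
            perror("open");
            return 1;
        }

        /* initialise the file while it is still invisible */
        if (write(fd, "data\n", 5) != 5)
            perror("write");
        fchmod(fd, 0644);

        /* only now give it a name, analogous to calling d_tmpfile() last */
        char path[64];
        snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);
        if (linkat(AT_FDCWD, path, AT_FDCWD, "/tmp/published",
                   AT_SYMLINK_FOLLOW) < 0)
            perror("linkat");

        close(fd);
        return 0;
    }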
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 8497a00e399d..08624dc67317 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1181,11 +1181,14 @@ xlog_iodone(xfs_buf_t *bp)
 	/* log I/O is always issued ASYNC */
 	ASSERT(XFS_BUF_ISASYNC(bp));
 	xlog_state_done_syncing(iclog, aborted);
+
 	/*
-	 * do not reference the buffer (bp) here as we could race
-	 * with it being freed after writing the unmount record to the
-	 * log.
+	 * drop the buffer lock now that we are done. Nothing references
+	 * the buffer after this, so an unmount waiting on this lock can now
+	 * tear it down safely. As such, it is unsafe to reference the buffer
+	 * (bp) after the unlock as we could race with it being freed.
 	 */
+	xfs_buf_unlock(bp);
 }
 /*
@@ -1368,8 +1371,16 @@ xlog_alloc_log(
 	bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0);
 	if (!bp)
 		goto out_free_log;
-	bp->b_iodone = xlog_iodone;
+
+	/*
+	 * The iclogbuf buffer locks are held over IO but we are not going to
+	 * do IO yet. Hence unlock the buffer so that the log IO path can grab
+	 * it when appropriate.
+	 */
 	ASSERT(xfs_buf_islocked(bp));
+	xfs_buf_unlock(bp);
+
+	bp->b_iodone = xlog_iodone;
 	log->l_xbuf = bp;
 	spin_lock_init(&log->l_icloglock);
@@ -1398,6 +1409,9 @@ xlog_alloc_log(
 		if (!bp)
 			goto out_free_iclog;
+		ASSERT(xfs_buf_islocked(bp));
+		xfs_buf_unlock(bp);
+
 		bp->b_iodone = xlog_iodone;
 		iclog->ic_bp = bp;
 		iclog->ic_data = bp->b_addr;
@@ -1422,7 +1436,6 @@ xlog_alloc_log(
 		iclog->ic_callback_tail = &(iclog->ic_callback);
 		iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
-		ASSERT(xfs_buf_islocked(iclog->ic_bp));
 		init_waitqueue_head(&iclog->ic_force_wait);
 		init_waitqueue_head(&iclog->ic_write_wait);
@@ -1631,6 +1644,12 @@ xlog_cksum(
  * we transition the iclogs to IOERROR state *after* flushing all existing
  * iclogs to disk. This is because we don't want anymore new transactions to be
  * started or completed afterwards.
+ *
+ * We lock the iclogbufs here so that we can serialise against IO completion
+ * during unmount. We might be processing a shutdown triggered during unmount,
+ * and that can occur asynchronously to the unmount thread, and hence we need
+ * to ensure that completes before tearing down the iclogbufs. Hence we need
+ * to hold the buffer lock across the log IO to achieve that.
 */
 STATIC int
 xlog_bdstrat(
 {
 	struct xlog_in_core	*iclog = bp->b_fspriv;
+	xfs_buf_lock(bp);
 	if (iclog->ic_state & XLOG_STATE_IOERROR) {
 		xfs_buf_ioerror(bp, EIO);
 		xfs_buf_stale(bp);
 		/*
 		 * It would seem logical to return EIO here, but we rely on
 		 * the log state machine to propagate I/O errors instead of
-		 * doing it here.
+		 * doing it here. Similarly, IO completion will unlock the
+		 * buffer, so we don't do it here.
 		 */
 		return 0;
 	}
@@ -1847,14 +1868,28 @@ xlog_dealloc_log(
 	xlog_cil_destroy(log);
 	/*
-	 * always need to ensure that the extra buffer does not point to memory
-	 * owned by another log buffer before we free it.
+	 * Cycle all the iclogbuf locks to make sure all log IO completion
+	 * is done before we tear down these buffers.
 	 */
+	iclog = log->l_iclog;
+	for (i = 0; i < log->l_iclog_bufs; i++) {
+		xfs_buf_lock(iclog->ic_bp);
+		xfs_buf_unlock(iclog->ic_bp);
+		iclog = iclog->ic_next;
+	}
+
+	/*
+	 * Always need to ensure that the extra buffer does not point to memory
+	 * owned by another log buffer before we free it. Also, cycle the lock
+	 * first to ensure we've completed IO on it.
+	 */
+	xfs_buf_lock(log->l_xbuf);
+	xfs_buf_unlock(log->l_xbuf);
 	xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size));
 	xfs_buf_free(log->l_xbuf);
 	iclog = log->l_iclog;
-	for (i=0; i<log->l_iclog_bufs; i++) {
+	for (i = 0; i < log->l_iclog_bufs; i++) {
 		xfs_buf_free(iclog->ic_bp);
 		next_iclog = iclog->ic_next;
 		kmem_free(iclog);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index a4ae41c179a8..65d8c793a25c 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -603,6 +603,7 @@ DEFINE_INODE_EVENT(xfs_readlink);
 DEFINE_INODE_EVENT(xfs_inactive_symlink);
 DEFINE_INODE_EVENT(xfs_alloc_file_space);
 DEFINE_INODE_EVENT(xfs_free_file_space);
+DEFINE_INODE_EVENT(xfs_zero_file_space);
 DEFINE_INODE_EVENT(xfs_collapse_file_space);
 DEFINE_INODE_EVENT(xfs_readdir);
 #ifdef CONFIG_XFS_POSIX_ACL
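
The lock cycling in xlog_dealloc_log() above (lock then immediately unlock each iclogbuf) is a general way to wait for an in-flight lock holder to finish. A rough pthreads analogy, not kernel code; the sleeps are contrived so the completion thread takes the lock first:

    #include <pthread.h>
    #include <stdio.h>
    #include <unistd.h>

    static pthread_mutex_t io_lock = PTHREAD_MUTEX_INITIALIZER;

    static void *io_completion(void *arg)
    {
        /* held across the whole completion, like the iclogbuf lock */
        pthread_mutex_lock(&io_lock);
        usleep(100 * 1000);             /* pretend to finish the IO */
        printf("completion done\n");
        pthread_mutex_unlock(&io_lock);
        return NULL;
    }

    int main(void)
    {
        pthread_t t;
        pthread_create(&t, NULL, io_completion, NULL);
        usleep(10 * 1000);              /* let the completion start */

        /* cycle the lock: returns only after the in-flight completion */
        pthread_mutex_lock(&io_lock);
        pthread_mutex_unlock(&io_lock);
        printf("safe to tear down\n");

        pthread_join(t, NULL);
        return 0;
    }

In the patch the ordering is guaranteed by xlog_bdstrat() taking the buffer lock at submission, so by teardown time any live completion already holds it.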
