diff options
Diffstat (limited to 'fs/ocfs2')
43 files changed, 2342 insertions, 1666 deletions
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index ce210d4951a1..e27e6527912b 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -41,7 +41,8 @@ ocfs2-objs := \ quota_local.o \ quota_global.o \ xattr.o \ - acl.o + acl.o \ + filecheck.o ocfs2_stackglue-objs := stackglue.o ocfs2_stack_o2cb-objs := stack_o2cb.o diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 86181d6526dc..e361d1a0ca09 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -164,7 +164,7 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *rec); static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et); static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et); -static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { +static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk, .eo_update_clusters = ocfs2_dinode_update_clusters, @@ -286,7 +286,7 @@ static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et, le32_add_cpu(&vb->vb_xv->xr_clusters, clusters); } -static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = { +static const struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = { .eo_set_last_eb_blk = ocfs2_xattr_value_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_xattr_value_get_last_eb_blk, .eo_update_clusters = ocfs2_xattr_value_update_clusters, @@ -332,7 +332,7 @@ static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et, le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters); } -static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = { +static const struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = { .eo_set_last_eb_blk = ocfs2_xattr_tree_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_xattr_tree_get_last_eb_blk, .eo_update_clusters = ocfs2_xattr_tree_update_clusters, @@ -379,7 +379,7 @@ static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et) et->et_root_el = &dx_root->dr_list; } -static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = { +static const struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = { .eo_set_last_eb_blk = ocfs2_dx_root_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_dx_root_get_last_eb_blk, .eo_update_clusters = ocfs2_dx_root_update_clusters, @@ -425,7 +425,7 @@ ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et, return CONTIG_NONE; } -static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = { +static const struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = { .eo_set_last_eb_blk = ocfs2_refcount_tree_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_refcount_tree_get_last_eb_blk, .eo_update_clusters = ocfs2_refcount_tree_update_clusters, @@ -438,7 +438,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, struct buffer_head *bh, ocfs2_journal_access_func access, void *obj, - struct ocfs2_extent_tree_operations *ops) + const struct ocfs2_extent_tree_operations *ops) { et->et_ops = ops; et->et_root_bh = bh; @@ -2516,21 +2516,6 @@ static int ocfs2_update_edge_lengths(handle_t *handle, struct ocfs2_extent_block *eb; u32 range; - /* - * In normal tree rotation process, we will never touch the - * tree branch above subtree_index and ocfs2_extend_rotate_transaction - * doesn't reserve the credits for them either. - * - * But we do have a special case here which will update the rightmost - * records for all the bh in the path. - * So we have to allocate extra credits and access them. - */ - ret = ocfs2_extend_trans(handle, subtree_index); - if (ret) { - mlog_errno(ret); - goto out; - } - ret = ocfs2_journal_access_path(et->et_ci, handle, path); if (ret) { mlog_errno(ret); @@ -2956,7 +2941,7 @@ static int __ocfs2_rotate_tree_left(handle_t *handle, right_path->p_node[subtree_root].bh->b_blocknr, right_path->p_tree_depth); - ret = ocfs2_extend_rotate_transaction(handle, subtree_root, + ret = ocfs2_extend_rotate_transaction(handle, 0, orig_credits, left_path); if (ret) { mlog_errno(ret); @@ -3029,21 +3014,9 @@ static int ocfs2_remove_rightmost_path(handle_t *handle, struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; - ret = ocfs2_et_sanity_check(et); if (ret) goto out; - /* - * There's two ways we handle this depending on - * whether path is the only existing one. - */ - ret = ocfs2_extend_rotate_transaction(handle, 0, - handle->h_buffer_credits, - path); - if (ret) { - mlog_errno(ret); - goto out; - } ret = ocfs2_journal_access_path(et->et_ci, handle, path); if (ret) { @@ -3641,6 +3614,14 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, */ if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 && le16_to_cpu(el->l_next_free_rec) == 1) { + /* extend credit for ocfs2_remove_rightmost_path */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + handle->h_buffer_credits, + right_path); + if (ret) { + mlog_errno(ret); + goto out; + } ret = ocfs2_remove_rightmost_path(handle, et, right_path, @@ -3679,6 +3660,14 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, BUG_ON(ctxt->c_contig_type == CONTIG_NONE); if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) { + /* extend credit for ocfs2_remove_rightmost_path */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + handle->h_buffer_credits, + path); + if (ret) { + mlog_errno(ret); + goto out; + } /* * The merge code will need to create an empty * extent to take the place of the newly @@ -3727,6 +3716,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, */ BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); + /* extend credit for ocfs2_remove_rightmost_path */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + handle->h_buffer_credits, + path); + if (ret) { + mlog_errno(ret); + goto out; + } + /* The merge left us with an empty extent, remove it. */ ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); if (ret) { @@ -3748,6 +3746,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, goto out; } + /* extend credit for ocfs2_remove_rightmost_path */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + handle->h_buffer_credits, + path); + if (ret) { + mlog_errno(ret); + goto out; + } + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); /* * Error from this last rotate is not critical, so @@ -3783,6 +3790,16 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, } if (ctxt->c_split_covers_rec) { + /* extend credit for ocfs2_remove_rightmost_path */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + handle->h_buffer_credits, + path); + if (ret) { + mlog_errno(ret); + ret = 0; + goto out; + } + /* * The merge may have left an empty extent in * our leaf. Try to rotate it away. @@ -5342,6 +5359,15 @@ static int ocfs2_truncate_rec(handle_t *handle, struct ocfs2_extent_block *eb; if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { + /* extend credit for ocfs2_remove_rightmost_path */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + handle->h_buffer_credits, + path); + if (ret) { + mlog_errno(ret); + goto out; + } + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); if (ret) { mlog_errno(ret); @@ -5719,7 +5745,7 @@ int ocfs2_remove_btree_range(struct inode *inode, goto bail; } - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); if (ocfs2_truncate_log_needs_flush(osb)) { ret = __ocfs2_flush_truncate_log(osb); @@ -5776,7 +5802,7 @@ int ocfs2_remove_btree_range(struct inode *inode, out_commit: ocfs2_commit_trans(osb, handle); out: - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); bail: if (meta_ac) ocfs2_free_alloc_context(meta_ac); @@ -5832,7 +5858,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb, struct ocfs2_dinode *di; struct ocfs2_truncate_log *tl; - BUG_ON(mutex_trylock(&tl_inode->i_mutex)); + BUG_ON(inode_trylock(tl_inode)); start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); @@ -5928,16 +5954,6 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, ocfs2_journal_dirty(handle, tl_bh); - /* TODO: Perhaps we can calculate the bulk of the - * credits up front rather than extending like - * this. */ - status = ocfs2_extend_trans(handle, - OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); - if (status < 0) { - mlog_errno(status); - goto bail; - } - rec = tl->tl_recs[i]; start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb, le32_to_cpu(rec.t_start)); @@ -5958,6 +5974,13 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, goto bail; } } + + status = ocfs2_extend_trans(handle, + OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); + if (status < 0) { + mlog_errno(status); + goto bail; + } i--; } @@ -5980,7 +6003,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) struct ocfs2_dinode *di; struct ocfs2_truncate_log *tl; - BUG_ON(mutex_trylock(&tl_inode->i_mutex)); + BUG_ON(inode_trylock(tl_inode)); di = (struct ocfs2_dinode *) tl_bh->b_data; @@ -6008,7 +6031,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) goto out; } - mutex_lock(&data_alloc_inode->i_mutex); + inode_lock(data_alloc_inode); status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1); if (status < 0) { @@ -6016,7 +6039,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) goto out_mutex; } - handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); + handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); @@ -6035,7 +6058,7 @@ out_unlock: ocfs2_inode_unlock(data_alloc_inode, 1); out_mutex: - mutex_unlock(&data_alloc_inode->i_mutex); + inode_unlock(data_alloc_inode); iput(data_alloc_inode); out: @@ -6047,9 +6070,9 @@ int ocfs2_flush_truncate_log(struct ocfs2_super *osb) int status; struct inode *tl_inode = osb->osb_tl_inode; - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); status = __ocfs2_flush_truncate_log(osb); - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); return status; } @@ -6079,7 +6102,7 @@ void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, if (cancel) cancel_delayed_work(&osb->osb_truncate_log_wq); - queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq, + queue_delayed_work(osb->ocfs2_wq, &osb->osb_truncate_log_wq, OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL); } } @@ -6174,8 +6197,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, } bail: - if (tl_inode) - iput(tl_inode); + iput(tl_inode); brelse(tl_bh); if (status < 0) { @@ -6209,7 +6231,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, (unsigned long long)le64_to_cpu(tl_copy->i_blkno), num_recs); - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); for(i = 0; i < num_recs; i++) { if (ocfs2_truncate_log_needs_flush(osb)) { status = __ocfs2_flush_truncate_log(osb); @@ -6240,7 +6262,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, } bail_up: - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); return status; } @@ -6254,7 +6276,7 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb) if (tl_inode) { cancel_delayed_work(&osb->osb_truncate_log_wq); - flush_workqueue(ocfs2_wq); + flush_workqueue(osb->ocfs2_wq); status = ocfs2_flush_truncate_log(osb); if (status < 0) @@ -6347,7 +6369,7 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb, goto out; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = ocfs2_inode_lock(inode, &di_bh, 1); if (ret) { @@ -6396,7 +6418,7 @@ out_unlock: ocfs2_inode_unlock(inode, 1); brelse(di_bh); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); iput(inode); out: while(head) { @@ -6440,7 +6462,7 @@ static int ocfs2_free_cached_clusters(struct ocfs2_super *osb, handle_t *handle; int ret = 0; - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); while (head) { if (ocfs2_truncate_log_needs_flush(osb)) { @@ -6472,7 +6494,7 @@ static int ocfs2_free_cached_clusters(struct ocfs2_super *osb, } } - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); while (head) { /* Premature exit may have left some dangling items. */ @@ -6649,7 +6671,7 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start, { int i; struct page *page; - unsigned int from, to = PAGE_CACHE_SIZE; + unsigned int from, to = PAGE_SIZE; struct super_block *sb = inode->i_sb; BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); @@ -6657,21 +6679,21 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start, if (numpages == 0) goto out; - to = PAGE_CACHE_SIZE; + to = PAGE_SIZE; for(i = 0; i < numpages; i++) { page = pages[i]; - from = start & (PAGE_CACHE_SIZE - 1); - if ((end >> PAGE_CACHE_SHIFT) == page->index) - to = end & (PAGE_CACHE_SIZE - 1); + from = start & (PAGE_SIZE - 1); + if ((end >> PAGE_SHIFT) == page->index) + to = end & (PAGE_SIZE - 1); - BUG_ON(from > PAGE_CACHE_SIZE); - BUG_ON(to > PAGE_CACHE_SIZE); + BUG_ON(from > PAGE_SIZE); + BUG_ON(to > PAGE_SIZE); ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1, &phys); - start = (page->index + 1) << PAGE_CACHE_SHIFT; + start = (page->index + 1) << PAGE_SHIFT; } out: if (pages) @@ -6690,7 +6712,7 @@ int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end, numpages = 0; last_page_bytes = PAGE_ALIGN(end); - index = start >> PAGE_CACHE_SHIFT; + index = start >> PAGE_SHIFT; do { pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS); if (!pages[numpages]) { @@ -6701,7 +6723,7 @@ int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end, numpages++; index++; - } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT)); + } while (index < (last_page_bytes >> PAGE_SHIFT)); out: if (ret != 0) { @@ -6928,8 +6950,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, * to do that now. */ if (!ocfs2_sparse_alloc(osb) && - PAGE_CACHE_SIZE < osb->s_clustersize) - end = PAGE_CACHE_SIZE; + PAGE_SIZE < osb->s_clustersize) + end = PAGE_SIZE; ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages); if (ret) { @@ -6949,8 +6971,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, goto out_unlock; } - page_end = PAGE_CACHE_SIZE; - if (PAGE_CACHE_SIZE > osb->s_clustersize) + page_end = PAGE_SIZE; + if (PAGE_SIZE > osb->s_clustersize) page_end = osb->s_clustersize; for (i = 0; i < num_pages; i++) @@ -7356,7 +7378,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0); if (ret < 0) { @@ -7423,7 +7445,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 0); brelse(main_bm_bh); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); iput(main_bm_inode); out: return ret; diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index fb09b97db162..f3dc1b0dfffc 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -54,7 +54,7 @@ */ struct ocfs2_extent_tree_operations; struct ocfs2_extent_tree { - struct ocfs2_extent_tree_operations *et_ops; + const struct ocfs2_extent_tree_operations *et_ops; struct buffer_head *et_root_bh; struct ocfs2_extent_list *et_root_el; struct ocfs2_caching_info *et_ci; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 7f604727f487..ad1577348a92 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -234,7 +234,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, size = i_size_read(inode); - if (size > PAGE_CACHE_SIZE || + if (size > PAGE_SIZE || size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) { ocfs2_error(inode->i_sb, "Inode %llu has with inline data has bad size: %Lu\n", @@ -247,7 +247,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, if (size) memcpy(kaddr, di->id2.i_data.id_data, size); /* Clear the remaining part of the page */ - memset(kaddr + size, 0, PAGE_CACHE_SIZE - size); + memset(kaddr + size, 0, PAGE_SIZE - size); flush_dcache_page(page); kunmap_atomic(kaddr); @@ -282,7 +282,7 @@ static int ocfs2_readpage(struct file *file, struct page *page) { struct inode *inode = page->mapping->host; struct ocfs2_inode_info *oi = OCFS2_I(inode); - loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT; + loff_t start = (loff_t)page->index << PAGE_SHIFT; int ret, unlock = 1; trace_ocfs2_readpage((unsigned long long)oi->ip_blkno, @@ -385,7 +385,7 @@ static int ocfs2_readpages(struct file *filp, struct address_space *mapping, * drop out in that case as it's not worth handling here. */ last = list_entry(pages->prev, struct page, lru); - start = (loff_t)last->index << PAGE_CACHE_SHIFT; + start = (loff_t)last->index << PAGE_SHIFT; if (start >= i_size_read(inode)) goto out_unlock; @@ -499,153 +499,6 @@ bail: return status; } -/* - * TODO: Make this into a generic get_blocks function. - * - * From do_direct_io in direct-io.c: - * "So what we do is to permit the ->get_blocks function to populate - * bh.b_size with the size of IO which is permitted at this offset and - * this i_blkbits." - * - * This function is called directly from get_more_blocks in direct-io.c. - * - * called like this: dio->get_blocks(dio->inode, fs_startblk, - * fs_count, map_bh, dio->rw == WRITE); - */ -static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - int ret; - u32 cpos = 0; - int alloc_locked = 0; - u64 p_blkno, inode_blocks, contig_blocks; - unsigned int ext_flags; - unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; - unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; - unsigned long len = bh_result->b_size; - unsigned int clusters_to_alloc = 0, contig_clusters = 0; - - cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock); - - /* This function won't even be called if the request isn't all - * nicely aligned and of the right size, so there's no need - * for us to check any of that. */ - - inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); - - down_read(&OCFS2_I(inode)->ip_alloc_sem); - - /* This figures out the size of the next contiguous block, and - * our logical offset */ - ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, - &contig_blocks, &ext_flags); - up_read(&OCFS2_I(inode)->ip_alloc_sem); - - if (ret) { - mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", - (unsigned long long)iblock); - ret = -EIO; - goto bail; - } - - /* We should already CoW the refcounted extent in case of create. */ - BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED)); - - /* allocate blocks if no p_blkno is found, and create == 1 */ - if (!p_blkno && create) { - ret = ocfs2_inode_lock(inode, NULL, 1); - if (ret < 0) { - mlog_errno(ret); - goto bail; - } - - alloc_locked = 1; - - down_write(&OCFS2_I(inode)->ip_alloc_sem); - - /* fill hole, allocate blocks can't be larger than the size - * of the hole */ - clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len); - contig_clusters = ocfs2_clusters_for_blocks(inode->i_sb, - contig_blocks); - if (clusters_to_alloc > contig_clusters) - clusters_to_alloc = contig_clusters; - - /* allocate extent and insert them into the extent tree */ - ret = ocfs2_extend_allocation(inode, cpos, - clusters_to_alloc, 0); - if (ret < 0) { - up_write(&OCFS2_I(inode)->ip_alloc_sem); - mlog_errno(ret); - goto bail; - } - - ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, - &contig_blocks, &ext_flags); - if (ret < 0) { - up_write(&OCFS2_I(inode)->ip_alloc_sem); - mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", - (unsigned long long)iblock); - ret = -EIO; - goto bail; - } - set_buffer_new(bh_result); - up_write(&OCFS2_I(inode)->ip_alloc_sem); - } - - /* - * get_more_blocks() expects us to describe a hole by clearing - * the mapped bit on bh_result(). - * - * Consider an unwritten extent as a hole. - */ - if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) - map_bh(bh_result, inode->i_sb, p_blkno); - else - clear_buffer_mapped(bh_result); - - /* make sure we don't map more than max_blocks blocks here as - that's all the kernel will handle at this point. */ - if (max_blocks < contig_blocks) - contig_blocks = max_blocks; - bh_result->b_size = contig_blocks << blocksize_bits; -bail: - if (alloc_locked) - ocfs2_inode_unlock(inode, 1); - return ret; -} - -/* - * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're - * particularly interested in the aio/dio case. We use the rw_lock DLM lock - * to protect io on one node from truncation on another. - */ -static void ocfs2_dio_end_io(struct kiocb *iocb, - loff_t offset, - ssize_t bytes, - void *private) -{ - struct inode *inode = file_inode(iocb->ki_filp); - int level; - - /* this io's submitter should not have unlocked this before we could */ - BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); - - if (ocfs2_iocb_is_unaligned_aio(iocb)) { - ocfs2_iocb_clear_unaligned_aio(iocb); - - mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); - } - - /* Let rw unlock to be done later to protect append direct io write */ - if (offset + bytes <= i_size_read(inode)) { - ocfs2_iocb_clear_rw_locked(iocb); - - level = ocfs2_iocb_rw_locked_level(iocb); - ocfs2_rw_unlock(inode, level); - } -} - static int ocfs2_releasepage(struct page *page, gfp_t wait) { if (!page_has_buffers(page)) @@ -653,373 +506,17 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait) return try_to_free_buffers(page); } -static int ocfs2_is_overwrite(struct ocfs2_super *osb, - struct inode *inode, loff_t offset) -{ - int ret = 0; - u32 v_cpos = 0; - u32 p_cpos = 0; - unsigned int num_clusters = 0; - unsigned int ext_flags = 0; - - v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset); - ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, - &num_clusters, &ext_flags); - if (ret < 0) { - mlog_errno(ret); - return ret; - } - - if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) - return 1; - - return 0; -} - -static int ocfs2_direct_IO_zero_extend(struct ocfs2_super *osb, - struct inode *inode, loff_t offset, - u64 zero_len, int cluster_align) -{ - u32 p_cpos = 0; - u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode)); - unsigned int num_clusters = 0; - unsigned int ext_flags = 0; - int ret = 0; - - if (offset <= i_size_read(inode) || cluster_align) - return 0; - - ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters, - &ext_flags); - if (ret < 0) { - mlog_errno(ret); - return ret; - } - - if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) { - u64 s = i_size_read(inode); - sector_t sector = ((u64)p_cpos << (osb->s_clustersize_bits - 9)) + - (do_div(s, osb->s_clustersize) >> 9); - - ret = blkdev_issue_zeroout(osb->sb->s_bdev, sector, - zero_len >> 9, GFP_NOFS, false); - if (ret < 0) - mlog_errno(ret); - } - - return ret; -} - -static int ocfs2_direct_IO_extend_no_holes(struct ocfs2_super *osb, - struct inode *inode, loff_t offset) -{ - u64 zero_start, zero_len, total_zero_len; - u32 p_cpos = 0, clusters_to_add; - u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode)); - unsigned int num_clusters = 0; - unsigned int ext_flags = 0; - u32 size_div, offset_div; - int ret = 0; - - { - u64 o = offset; - u64 s = i_size_read(inode); - - offset_div = do_div(o, osb->s_clustersize); - size_div = do_div(s, osb->s_clustersize); - } - - if (offset <= i_size_read(inode)) - return 0; - - clusters_to_add = ocfs2_bytes_to_clusters(inode->i_sb, offset) - - ocfs2_bytes_to_clusters(inode->i_sb, i_size_read(inode)); - total_zero_len = offset - i_size_read(inode); - if (clusters_to_add) - total_zero_len -= offset_div; - - /* Allocate clusters to fill out holes, and this is only needed - * when we add more than one clusters. Otherwise the cluster will - * be allocated during direct IO */ - if (clusters_to_add > 1) { - ret = ocfs2_extend_allocation(inode, - OCFS2_I(inode)->ip_clusters, - clusters_to_add - 1, 0); - if (ret) { - mlog_errno(ret); - goto out; - } - } - - while (total_zero_len) { - ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters, - &ext_flags); - if (ret < 0) { - mlog_errno(ret); - goto out; - } - - zero_start = ocfs2_clusters_to_bytes(osb->sb, p_cpos) + - size_div; - zero_len = ocfs2_clusters_to_bytes(osb->sb, num_clusters) - - size_div; - zero_len = min(total_zero_len, zero_len); - - if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) { - ret = blkdev_issue_zeroout(osb->sb->s_bdev, - zero_start >> 9, zero_len >> 9, - GFP_NOFS, false); - if (ret < 0) { - mlog_errno(ret); - goto out; - } - } - - total_zero_len -= zero_len; - v_cpos += ocfs2_bytes_to_clusters(osb->sb, zero_len + size_div); - - /* Only at first iteration can be cluster not aligned. - * So set size_div to 0 for the rest */ - size_div = 0; - } - -out: - return ret; -} - -static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, - struct iov_iter *iter, - loff_t offset) -{ - ssize_t ret = 0; - ssize_t written = 0; - bool orphaned = false; - int is_overwrite = 0; - struct file *file = iocb->ki_filp; - struct inode *inode = file_inode(file)->i_mapping->host; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct buffer_head *di_bh = NULL; - size_t count = iter->count; - journal_t *journal = osb->journal->j_journal; - u64 zero_len_head, zero_len_tail; - int cluster_align_head, cluster_align_tail; - loff_t final_size = offset + count; - int append_write = offset >= i_size_read(inode) ? 1 : 0; - unsigned int num_clusters = 0; - unsigned int ext_flags = 0; - - { - u64 o = offset; - u64 s = i_size_read(inode); - - zero_len_head = do_div(o, 1 << osb->s_clustersize_bits); - cluster_align_head = !zero_len_head; - - zero_len_tail = osb->s_clustersize - - do_div(s, osb->s_clustersize); - if ((offset - i_size_read(inode)) < zero_len_tail) - zero_len_tail = offset - i_size_read(inode); - cluster_align_tail = !zero_len_tail; - } - - /* - * when final_size > inode->i_size, inode->i_size will be - * updated after direct write, so add the inode to orphan - * dir first. - */ - if (final_size > i_size_read(inode)) { - ret = ocfs2_add_inode_to_orphan(osb, inode); - if (ret < 0) { - mlog_errno(ret); - goto out; - } - orphaned = true; - } - - if (append_write) { - ret = ocfs2_inode_lock(inode, NULL, 1); - if (ret < 0) { - mlog_errno(ret); - goto clean_orphan; - } - - /* zeroing out the previously allocated cluster tail - * that but not zeroed */ - if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { - down_read(&OCFS2_I(inode)->ip_alloc_sem); - ret = ocfs2_direct_IO_zero_extend(osb, inode, offset, - zero_len_tail, cluster_align_tail); - up_read(&OCFS2_I(inode)->ip_alloc_sem); - } else { - down_write(&OCFS2_I(inode)->ip_alloc_sem); - ret = ocfs2_direct_IO_extend_no_holes(osb, inode, - offset); - up_write(&OCFS2_I(inode)->ip_alloc_sem); - } - if (ret < 0) { - mlog_errno(ret); - ocfs2_inode_unlock(inode, 1); - goto clean_orphan; - } - - is_overwrite = ocfs2_is_overwrite(osb, inode, offset); - if (is_overwrite < 0) { - mlog_errno(is_overwrite); - ret = is_overwrite; - ocfs2_inode_unlock(inode, 1); - goto clean_orphan; - } - - ocfs2_inode_unlock(inode, 1); - } - - written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, - offset, ocfs2_direct_IO_get_blocks, - ocfs2_dio_end_io, NULL, 0); - /* overwrite aio may return -EIOCBQUEUED, and it is not an error */ - if ((written < 0) && (written != -EIOCBQUEUED)) { - loff_t i_size = i_size_read(inode); - - if (offset + count > i_size) { - ret = ocfs2_inode_lock(inode, &di_bh, 1); - if (ret < 0) { - mlog_errno(ret); - goto clean_orphan; - } - - if (i_size == i_size_read(inode)) { - ret = ocfs2_truncate_file(inode, di_bh, - i_size); - if (ret < 0) { - if (ret != -ENOSPC) - mlog_errno(ret); - - ocfs2_inode_unlock(inode, 1); - brelse(di_bh); - di_bh = NULL; - goto clean_orphan; - } - } - - ocfs2_inode_unlock(inode, 1); - brelse(di_bh); - di_bh = NULL; - - ret = jbd2_journal_force_commit(journal); - if (ret < 0) - mlog_errno(ret); - } - } else if (written > 0 && append_write && !is_overwrite && - !cluster_align_head) { - /* zeroing out the allocated cluster head */ - u32 p_cpos = 0; - u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset); - - ret = ocfs2_inode_lock(inode, NULL, 0); - if (ret < 0) { - mlog_errno(ret); - goto clean_orphan; - } - - ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, - &num_clusters, &ext_flags); - if (ret < 0) { - mlog_errno(ret); - ocfs2_inode_unlock(inode, 0); - goto clean_orphan; - } - - BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN)); - - ret = blkdev_issue_zeroout(osb->sb->s_bdev, - (u64)p_cpos << (osb->s_clustersize_bits - 9), - zero_len_head >> 9, GFP_NOFS, false); - if (ret < 0) - mlog_errno(ret); - - ocfs2_inode_unlock(inode, 0); - } - -clean_orphan: - if (orphaned) { - int tmp_ret; - int update_isize = written > 0 ? 1 : 0; - loff_t end = update_isize ? offset + written : 0; - - tmp_ret = ocfs2_inode_lock(inode, &di_bh, 1); - if (tmp_ret < 0) { - ret = tmp_ret; - mlog_errno(ret); - goto out; - } - - tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, - update_isize, end); - if (tmp_ret < 0) { - ret = tmp_ret; - mlog_errno(ret); - brelse(di_bh); - goto out; - } - - ocfs2_inode_unlock(inode, 1); - brelse(di_bh); - - tmp_ret = jbd2_journal_force_commit(journal); - if (tmp_ret < 0) { - ret = tmp_ret; - mlog_errno(tmp_ret); - } - } - -out: - if (ret >= 0) - ret = written; - return ret; -} - -static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file_inode(file)->i_mapping->host; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - int full_coherency = !(osb->s_mount_opt & - OCFS2_MOUNT_COHERENCY_BUFFERED); - - /* - * Fallback to buffered I/O if we see an inode without - * extents. - */ - if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) - return 0; - - /* Fallback to buffered I/O if we are appending and - * concurrent O_DIRECT writes are allowed. - */ - if (i_size_read(inode) <= offset && !full_coherency) - return 0; - - if (iov_iter_rw(iter) == READ) - return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, - iter, offset, - ocfs2_direct_IO_get_blocks, - ocfs2_dio_end_io, NULL, 0); - else - return ocfs2_direct_IO_write(iocb, iter, offset); -} - static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, u32 cpos, unsigned int *start, unsigned int *end) { - unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE; + unsigned int cluster_start = 0, cluster_end = PAGE_SIZE; - if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) { + if (unlikely(PAGE_SHIFT > osb->s_clustersize_bits)) { unsigned int cpp; - cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits); + cpp = 1 << (PAGE_SHIFT - osb->s_clustersize_bits); cluster_start = cpos % cpp; cluster_start = cluster_start << osb->s_clustersize_bits; @@ -1187,13 +684,20 @@ next_bh: return ret; } -#if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE) +#if (PAGE_SIZE >= OCFS2_MAX_CLUSTERSIZE) #define OCFS2_MAX_CTXT_PAGES 1 #else -#define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE) +#define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_SIZE) #endif -#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE) +#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_SIZE / OCFS2_MIN_CLUSTERSIZE) + +struct ocfs2_unwritten_extent { + struct list_head ue_node; + struct list_head ue_ip_node; + u32 ue_cpos; + u32 ue_phys; +}; /* * Describe the state of a single cluster to be written to. @@ -1206,7 +710,7 @@ struct ocfs2_write_cluster_desc { * filled. */ unsigned c_new; - unsigned c_unwritten; + unsigned c_clear_unwritten; unsigned c_needs_zero; }; @@ -1218,6 +722,9 @@ struct ocfs2_write_ctxt { /* First cluster allocated in a nonsparse extend */ u32 w_first_new_cpos; + /* Type of caller. Must be one of buffer, mmap, direct. */ + ocfs2_write_type_t w_type; + struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE]; /* @@ -1266,6 +773,8 @@ struct ocfs2_write_ctxt { struct buffer_head *w_di_bh; struct ocfs2_cached_dealloc_ctxt w_dealloc; + + struct list_head w_unwritten_list; }; void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) @@ -1276,7 +785,7 @@ void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) if (pages[i]) { unlock_page(pages[i]); mark_page_accessed(pages[i]); - page_cache_release(pages[i]); + put_page(pages[i]); } } } @@ -1299,13 +808,30 @@ static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc) } } mark_page_accessed(wc->w_target_page); - page_cache_release(wc->w_target_page); + put_page(wc->w_target_page); } ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); } -static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) +static void ocfs2_free_unwritten_list(struct inode *inode, + struct list_head *head) { + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_unwritten_extent *ue = NULL, *tmp = NULL; + + list_for_each_entry_safe(ue, tmp, head, ue_node) { + list_del(&ue->ue_node); + spin_lock(&oi->ip_lock); + list_del(&ue->ue_ip_node); + spin_unlock(&oi->ip_lock); + kfree(ue); + } +} + +static void ocfs2_free_write_ctxt(struct inode *inode, + struct ocfs2_write_ctxt *wc) +{ + ocfs2_free_unwritten_list(inode, &wc->w_unwritten_list); ocfs2_unlock_pages(wc); brelse(wc->w_di_bh); kfree(wc); @@ -1313,7 +839,8 @@ static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, struct ocfs2_super *osb, loff_t pos, - unsigned len, struct buffer_head *di_bh) + unsigned len, ocfs2_write_type_t type, + struct buffer_head *di_bh) { u32 cend; struct ocfs2_write_ctxt *wc; @@ -1328,13 +855,15 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, wc->w_clen = cend - wc->w_cpos + 1; get_bh(di_bh); wc->w_di_bh = di_bh; + wc->w_type = type; - if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) + if (unlikely(PAGE_SHIFT > osb->s_clustersize_bits)) wc->w_large_pages = 1; else wc->w_large_pages = 0; ocfs2_init_dealloc_ctxt(&wc->w_dealloc); + INIT_LIST_HEAD(&wc->w_unwritten_list); *wcp = wc; @@ -1391,16 +920,17 @@ static void ocfs2_write_failure(struct inode *inode, loff_t user_pos, unsigned user_len) { int i; - unsigned from = user_pos & (PAGE_CACHE_SIZE - 1), + unsigned from = user_pos & (PAGE_SIZE - 1), to = user_pos + user_len; struct page *tmppage; - ocfs2_zero_new_buffers(wc->w_target_page, from, to); + if (wc->w_target_page) + ocfs2_zero_new_buffers(wc->w_target_page, from, to); for(i = 0; i < wc->w_num_pages; i++) { tmppage = wc->w_pages[i]; - if (page_has_buffers(tmppage)) { + if (tmppage && page_has_buffers(tmppage)) { if (ocfs2_should_order_data(inode)) ocfs2_jbd2_file_inode(wc->w_handle, inode); @@ -1430,7 +960,7 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, (page_offset(page) <= user_pos)); if (page == wc->w_target_page) { - map_from = user_pos & (PAGE_CACHE_SIZE - 1); + map_from = user_pos & (PAGE_SIZE - 1); map_to = map_from + user_len; if (new) @@ -1504,7 +1034,7 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, struct inode *inode = mapping->host; loff_t last_byte; - target_index = user_pos >> PAGE_CACHE_SHIFT; + target_index = user_pos >> PAGE_SHIFT; /* * Figure out how many pages we'll be manipulating here. For @@ -1523,18 +1053,20 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, */ last_byte = max(user_pos + user_len, i_size_read(inode)); BUG_ON(last_byte < 1); - end_index = ((last_byte - 1) >> PAGE_CACHE_SHIFT) + 1; + end_index = ((last_byte - 1) >> PAGE_SHIFT) + 1; if ((start + wc->w_num_pages) > end_index) wc->w_num_pages = end_index - start; } else { wc->w_num_pages = 1; start = target_index; } + end_index = (user_pos + user_len - 1) >> PAGE_SHIFT; for(i = 0; i < wc->w_num_pages; i++) { index = start + i; - if (index == target_index && mmap_page) { + if (index >= target_index && index <= end_index && + wc->w_type == OCFS2_WRITE_MMAP) { /* * ocfs2_pagemkwrite() is a little different * and wants us to directly use the page @@ -1550,9 +1082,14 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, goto out; } - page_cache_get(mmap_page); + get_page(mmap_page); wc->w_pages[i] = mmap_page; wc->w_target_locked = true; + } else if (index >= target_index && index <= end_index && + wc->w_type == OCFS2_WRITE_DIRECT) { + /* Direct write has no mapping page. */ + wc->w_pages[i] = NULL; + continue; } else { wc->w_pages[i] = find_or_create_page(mapping, index, GFP_NOFS); @@ -1577,19 +1114,20 @@ out: * Prepare a single cluster for write one cluster into the file. */ static int ocfs2_write_cluster(struct address_space *mapping, - u32 phys, unsigned int unwritten, + u32 *phys, unsigned int new, + unsigned int clear_unwritten, unsigned int should_zero, struct ocfs2_alloc_context *data_ac, struct ocfs2_alloc_context *meta_ac, struct ocfs2_write_ctxt *wc, u32 cpos, loff_t user_pos, unsigned user_len) { - int ret, i, new; - u64 v_blkno, p_blkno; + int ret, i; + u64 p_blkno; struct inode *inode = mapping->host; struct ocfs2_extent_tree et; + int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); - new = phys == 0 ? 1 : 0; if (new) { u32 tmp_pos; @@ -1599,9 +1137,9 @@ static int ocfs2_write_cluster(struct address_space *mapping, */ tmp_pos = cpos; ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode, - &tmp_pos, 1, 0, wc->w_di_bh, - wc->w_handle, data_ac, - meta_ac, NULL); + &tmp_pos, 1, !clear_unwritten, + wc->w_di_bh, wc->w_handle, + data_ac, meta_ac, NULL); /* * This shouldn't happen because we must have already * calculated the correct meta data allocation required. The @@ -1618,11 +1156,11 @@ static int ocfs2_write_cluster(struct address_space *mapping, mlog_errno(ret); goto out; } - } else if (unwritten) { + } else if (clear_unwritten) { ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), wc->w_di_bh); ret = ocfs2_mark_extent_written(inode, &et, - wc->w_handle, cpos, 1, phys, + wc->w_handle, cpos, 1, *phys, meta_ac, &wc->w_dealloc); if (ret < 0) { mlog_errno(ret); @@ -1630,30 +1168,33 @@ static int ocfs2_write_cluster(struct address_space *mapping, } } - if (should_zero) - v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos); - else - v_blkno = user_pos >> inode->i_sb->s_blocksize_bits; - /* * The only reason this should fail is due to an inability to * find the extent added. */ - ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, - NULL); + ret = ocfs2_get_clusters(inode, cpos, phys, NULL, NULL); if (ret < 0) { mlog(ML_ERROR, "Get physical blkno failed for inode %llu, " - "at logical block %llu", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - (unsigned long long)v_blkno); + "at logical cluster %u", + (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos); goto out; } - BUG_ON(p_blkno == 0); + BUG_ON(*phys == 0); + + p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *phys); + if (!should_zero) + p_blkno += (user_pos >> inode->i_sb->s_blocksize_bits) & (u64)(bpc - 1); for(i = 0; i < wc->w_num_pages; i++) { int tmpret; + /* This is the direct io target page. */ + if (wc->w_pages[i] == NULL) { + p_blkno++; + continue; + } + tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc, wc->w_pages[i], cpos, user_pos, user_len, @@ -1700,8 +1241,9 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping, if ((cluster_off + local_len) > osb->s_clustersize) local_len = osb->s_clustersize - cluster_off; - ret = ocfs2_write_cluster(mapping, desc->c_phys, - desc->c_unwritten, + ret = ocfs2_write_cluster(mapping, &desc->c_phys, + desc->c_new, + desc->c_clear_unwritten, desc->c_needs_zero, data_ac, meta_ac, wc, desc->c_cpos, pos, local_len); @@ -1730,7 +1272,7 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, { struct ocfs2_write_cluster_desc *desc; - wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1); + wc->w_target_from = pos & (PAGE_SIZE - 1); wc->w_target_to = wc->w_target_from + len; if (alloc == 0) @@ -1767,8 +1309,68 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, &wc->w_target_to); } else { wc->w_target_from = 0; - wc->w_target_to = PAGE_CACHE_SIZE; + wc->w_target_to = PAGE_SIZE; + } +} + +/* + * Check if this extent is marked UNWRITTEN by direct io. If so, we need not to + * do the zero work. And should not to clear UNWRITTEN since it will be cleared + * by the direct io procedure. + * If this is a new extent that allocated by direct io, we should mark it in + * the ip_unwritten_list. + */ +static int ocfs2_unwritten_check(struct inode *inode, + struct ocfs2_write_ctxt *wc, + struct ocfs2_write_cluster_desc *desc) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_unwritten_extent *ue = NULL, *new = NULL; + int ret = 0; + + if (!desc->c_needs_zero) + return 0; + +retry: + spin_lock(&oi->ip_lock); + /* Needs not to zero no metter buffer or direct. The one who is zero + * the cluster is doing zero. And he will clear unwritten after all + * cluster io finished. */ + list_for_each_entry(ue, &oi->ip_unwritten_list, ue_ip_node) { + if (desc->c_cpos == ue->ue_cpos) { + BUG_ON(desc->c_new); + desc->c_needs_zero = 0; + desc->c_clear_unwritten = 0; + goto unlock; + } } + + if (wc->w_type != OCFS2_WRITE_DIRECT) + goto unlock; + + if (new == NULL) { + spin_unlock(&oi->ip_lock); + new = kmalloc(sizeof(struct ocfs2_unwritten_extent), + GFP_NOFS); + if (new == NULL) { + ret = -ENOMEM; + goto out; + } + goto retry; + } + /* This direct write will doing zero. */ + new->ue_cpos = desc->c_cpos; + new->ue_phys = desc->c_phys; + desc->c_clear_unwritten = 0; + list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list); + list_add_tail(&new->ue_node, &wc->w_unwritten_list); + new = NULL; +unlock: + spin_unlock(&oi->ip_lock); +out: + if (new) + kfree(new); + return ret; } /* @@ -1846,14 +1448,21 @@ static int ocfs2_populate_write_desc(struct inode *inode, if (phys == 0) { desc->c_new = 1; desc->c_needs_zero = 1; + desc->c_clear_unwritten = 1; *clusters_to_alloc = *clusters_to_alloc + 1; } if (ext_flags & OCFS2_EXT_UNWRITTEN) { - desc->c_unwritten = 1; + desc->c_clear_unwritten = 1; desc->c_needs_zero = 1; } + ret = ocfs2_unwritten_check(inode, wc, desc); + if (ret) { + mlog_errno(ret); + goto out; + } + num_clusters--; } @@ -2016,8 +1625,10 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, if (ret) mlog_errno(ret); - wc->w_first_new_cpos = - ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)); + /* There is no wc if this is call from direct. */ + if (wc) + wc->w_first_new_cpos = + ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)); return ret; } @@ -2046,9 +1657,9 @@ static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb, int ret = 0; unsigned int truncated_clusters; - mutex_lock(&osb->osb_tl_inode->i_mutex); + inode_lock(osb->osb_tl_inode); truncated_clusters = osb->truncated_clusters; - mutex_unlock(&osb->osb_tl_inode->i_mutex); + inode_unlock(osb->osb_tl_inode); /* * Check whether we can succeed in allocating if we free @@ -2071,9 +1682,8 @@ out: return ret; } -int ocfs2_write_begin_nolock(struct file *filp, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, +int ocfs2_write_begin_nolock(struct address_space *mapping, + loff_t pos, unsigned len, ocfs2_write_type_t type, struct page **pagep, void **fsdata, struct buffer_head *di_bh, struct page *mmap_page) { @@ -2090,7 +1700,7 @@ int ocfs2_write_begin_nolock(struct file *filp, int try_free = 1, ret1; try_again: - ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); + ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, type, di_bh); if (ret) { mlog_errno(ret); return ret; @@ -2109,14 +1719,17 @@ try_again: } } - if (ocfs2_sparse_alloc(osb)) - ret = ocfs2_zero_tail(inode, di_bh, pos); - else - ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len, - wc); - if (ret) { - mlog_errno(ret); - goto out; + /* Direct io change i_size late, should not zero tail here. */ + if (type != OCFS2_WRITE_DIRECT) { + if (ocfs2_sparse_alloc(osb)) + ret = ocfs2_zero_tail(inode, di_bh, pos); + else + ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, + len, wc); + if (ret) { + mlog_errno(ret); + goto out; + } } ret = ocfs2_check_range_for_refcount(inode, pos, len); @@ -2147,7 +1760,7 @@ try_again: (unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode), le32_to_cpu(di->i_clusters), - pos, len, flags, mmap_page, + pos, len, type, mmap_page, clusters_to_alloc, extents_to_split); /* @@ -2177,17 +1790,17 @@ try_again: credits = ocfs2_calc_extend_credits(inode->i_sb, &di->id2.i_list); - - } + } else if (type == OCFS2_WRITE_DIRECT) + /* direct write needs not to start trans if no extents alloc. */ + goto success; /* * We have to zero sparse allocated clusters, unwritten extent clusters, * and non-sparse clusters we just extended. For non-sparse writes, * we know zeros will only be needed in the first and/or last cluster. */ - if (clusters_to_alloc || extents_to_split || - (wc->w_clen && (wc->w_desc[0].c_needs_zero || - wc->w_desc[wc->w_clen - 1].c_needs_zero))) + if (wc->w_clen && (wc->w_desc[0].c_needs_zero || + wc->w_desc[wc->w_clen - 1].c_needs_zero)) cluster_of_pages = 1; else cluster_of_pages = 0; @@ -2254,7 +1867,8 @@ try_again: ocfs2_free_alloc_context(meta_ac); success: - *pagep = wc->w_target_page; + if (pagep) + *pagep = wc->w_target_page; *fsdata = wc; return 0; out_quota: @@ -2265,7 +1879,7 @@ out_commit: ocfs2_commit_trans(osb, handle); out: - ocfs2_free_write_ctxt(wc); + ocfs2_free_write_ctxt(inode, wc); if (data_ac) { ocfs2_free_alloc_context(data_ac); @@ -2317,8 +1931,8 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping, */ down_write(&OCFS2_I(inode)->ip_alloc_sem); - ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep, - fsdata, di_bh, NULL); + ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_BUFFER, + pagep, fsdata, di_bh, NULL); if (ret) { mlog_errno(ret); goto out_fail; @@ -2367,7 +1981,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping, struct page *page, void *fsdata) { int i, ret; - unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); + unsigned from, to, start = pos & (PAGE_SIZE - 1); struct inode *inode = mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_write_ctxt *wc = fsdata; @@ -2375,12 +1989,16 @@ int ocfs2_write_end_nolock(struct address_space *mapping, handle_t *handle = wc->w_handle; struct page *tmppage; - ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret) { - copied = ret; - mlog_errno(ret); - goto out; + BUG_ON(!list_empty(&wc->w_unwritten_list)); + + if (handle) { + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), + wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + copied = ret; + mlog_errno(ret); + goto out; + } } if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { @@ -2388,24 +2006,29 @@ int ocfs2_write_end_nolock(struct address_space *mapping, goto out_write_size; } - if (unlikely(copied < len)) { + if (unlikely(copied < len) && wc->w_target_page) { if (!PageUptodate(wc->w_target_page)) copied = 0; ocfs2_zero_new_buffers(wc->w_target_page, start+copied, start+len); } - flush_dcache_page(wc->w_target_page); + if (wc->w_target_page) + flush_dcache_page(wc->w_target_page); for(i = 0; i < wc->w_num_pages; i++) { tmppage = wc->w_pages[i]; + /* This is the direct io target page. */ + if (tmppage == NULL) + continue; + if (tmppage == wc->w_target_page) { from = wc->w_target_from; to = wc->w_target_to; - BUG_ON(from > PAGE_CACHE_SIZE || - to > PAGE_CACHE_SIZE || + BUG_ON(from > PAGE_SIZE || + to > PAGE_SIZE || to < from); } else { /* @@ -2414,29 +2037,33 @@ int ocfs2_write_end_nolock(struct address_space *mapping, * to flush their entire range. */ from = 0; - to = PAGE_CACHE_SIZE; + to = PAGE_SIZE; } if (page_has_buffers(tmppage)) { - if (ocfs2_should_order_data(inode)) - ocfs2_jbd2_file_inode(wc->w_handle, inode); + if (handle && ocfs2_should_order_data(inode)) + ocfs2_jbd2_file_inode(handle, inode); block_commit_write(tmppage, from, to); } } out_write_size: - pos += copied; - if (pos > i_size_read(inode)) { - i_size_write(inode, pos); - mark_inode_dirty(inode); - } - inode->i_blocks = ocfs2_inode_sector_count(inode); - di->i_size = cpu_to_le64((u64)i_size_read(inode)); - inode->i_mtime = inode->i_ctime = CURRENT_TIME; - di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); - di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); - ocfs2_update_inode_fsync_trans(handle, inode, 1); - ocfs2_journal_dirty(handle, wc->w_di_bh); + /* Direct io do not update i_size here. */ + if (wc->w_type != OCFS2_WRITE_DIRECT) { + pos += copied; + if (pos > i_size_read(inode)) { + i_size_write(inode, pos); + mark_inode_dirty(inode); + } + inode->i_blocks = ocfs2_inode_sector_count(inode); + di->i_size = cpu_to_le64((u64)i_size_read(inode)); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); + di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, inode, 1); + } + if (handle) + ocfs2_journal_dirty(handle, wc->w_di_bh); out: /* unlock pages before dealloc since it needs acquiring j_trans_barrier @@ -2446,7 +2073,8 @@ out: */ ocfs2_unlock_pages(wc); - ocfs2_commit_trans(osb, handle); + if (handle) + ocfs2_commit_trans(osb, handle); ocfs2_run_deallocs(osb, &wc->w_dealloc); @@ -2471,6 +2099,360 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping, return ret; } +struct ocfs2_dio_write_ctxt { + struct list_head dw_zero_list; + unsigned dw_zero_count; + int dw_orphaned; + pid_t dw_writer_pid; +}; + +static struct ocfs2_dio_write_ctxt * +ocfs2_dio_alloc_write_ctx(struct buffer_head *bh, int *alloc) +{ + struct ocfs2_dio_write_ctxt *dwc = NULL; + + if (bh->b_private) + return bh->b_private; + + dwc = kmalloc(sizeof(struct ocfs2_dio_write_ctxt), GFP_NOFS); + if (dwc == NULL) + return NULL; + INIT_LIST_HEAD(&dwc->dw_zero_list); + dwc->dw_zero_count = 0; + dwc->dw_orphaned = 0; + dwc->dw_writer_pid = task_pid_nr(current); + bh->b_private = dwc; + *alloc = 1; + + return dwc; +} + +static void ocfs2_dio_free_write_ctx(struct inode *inode, + struct ocfs2_dio_write_ctxt *dwc) +{ + ocfs2_free_unwritten_list(inode, &dwc->dw_zero_list); + kfree(dwc); +} + +/* + * TODO: Make this into a generic get_blocks function. + * + * From do_direct_io in direct-io.c: + * "So what we do is to permit the ->get_blocks function to populate + * bh.b_size with the size of IO which is permitted at this offset and + * this i_blkbits." + * + * This function is called directly from get_more_blocks in direct-io.c. + * + * called like this: dio->get_blocks(dio->inode, fs_startblk, + * fs_count, map_bh, dio->rw == WRITE); + */ +static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_write_ctxt *wc; + struct ocfs2_write_cluster_desc *desc = NULL; + struct ocfs2_dio_write_ctxt *dwc = NULL; + struct buffer_head *di_bh = NULL; + u64 p_blkno; + loff_t pos = iblock << inode->i_sb->s_blocksize_bits; + unsigned len, total_len = bh_result->b_size; + int ret = 0, first_get_block = 0; + + len = osb->s_clustersize - (pos & (osb->s_clustersize - 1)); + len = min(total_len, len); + + mlog(0, "get block of %lu at %llu:%u req %u\n", + inode->i_ino, pos, len, total_len); + + /* + * Because we need to change file size in ocfs2_dio_end_io_write(), or + * we may need to add it to orphan dir. So can not fall to fast path + * while file size will be changed. + */ + if (pos + total_len <= i_size_read(inode)) { + down_read(&oi->ip_alloc_sem); + /* This is the fast path for re-write. */ + ret = ocfs2_get_block(inode, iblock, bh_result, create); + + up_read(&oi->ip_alloc_sem); + + if (buffer_mapped(bh_result) && + !buffer_new(bh_result) && + ret == 0) + goto out; + + /* Clear state set by ocfs2_get_block. */ + bh_result->b_state = 0; + } + + dwc = ocfs2_dio_alloc_write_ctx(bh_result, &first_get_block); + if (unlikely(dwc == NULL)) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + if (ocfs2_clusters_for_bytes(inode->i_sb, pos + total_len) > + ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)) && + !dwc->dw_orphaned) { + /* + * when we are going to alloc extents beyond file size, add the + * inode to orphan dir, so we can recall those spaces when + * system crashed during write. + */ + ret = ocfs2_add_inode_to_orphan(osb, inode); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + dwc->dw_orphaned = 1; + } + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + down_write(&oi->ip_alloc_sem); + + if (first_get_block) { + if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + ret = ocfs2_zero_tail(inode, di_bh, pos); + else + ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, + total_len, NULL); + if (ret < 0) { + mlog_errno(ret); + goto unlock; + } + } + + ret = ocfs2_write_begin_nolock(inode->i_mapping, pos, len, + OCFS2_WRITE_DIRECT, NULL, + (void **)&wc, di_bh, NULL); + if (ret) { + mlog_errno(ret); + goto unlock; + } + + desc = &wc->w_desc[0]; + + p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, desc->c_phys); + BUG_ON(p_blkno == 0); + p_blkno += iblock & (u64)(ocfs2_clusters_to_blocks(inode->i_sb, 1) - 1); + + map_bh(bh_result, inode->i_sb, p_blkno); + bh_result->b_size = len; + if (desc->c_needs_zero) + set_buffer_new(bh_result); + + /* May sleep in end_io. It should not happen in a irq context. So defer + * it to dio work queue. */ + set_buffer_defer_completion(bh_result); + + if (!list_empty(&wc->w_unwritten_list)) { + struct ocfs2_unwritten_extent *ue = NULL; + + ue = list_first_entry(&wc->w_unwritten_list, + struct ocfs2_unwritten_extent, + ue_node); + BUG_ON(ue->ue_cpos != desc->c_cpos); + /* The physical address may be 0, fill it. */ + ue->ue_phys = desc->c_phys; + + list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list); + dwc->dw_zero_count++; + } + + ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, NULL, wc); + BUG_ON(ret != len); + ret = 0; +unlock: + up_write(&oi->ip_alloc_sem); + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); +out: + if (ret < 0) + ret = -EIO; + return ret; +} + +static void ocfs2_dio_end_io_write(struct inode *inode, + struct ocfs2_dio_write_ctxt *dwc, + loff_t offset, + ssize_t bytes) +{ + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_extent_tree et; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_unwritten_extent *ue = NULL; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + handle_t *handle = NULL; + loff_t end = offset + bytes; + int ret = 0, credits = 0, locked = 0; + + ocfs2_init_dealloc_ctxt(&dealloc); + + /* We do clear unwritten, delete orphan, change i_size here. If neither + * of these happen, we can skip all this. */ + if (list_empty(&dwc->dw_zero_list) && + end <= i_size_read(inode) && + !dwc->dw_orphaned) + goto out; + + /* ocfs2_file_write_iter will get i_mutex, so we need not lock if we + * are in that context. */ + if (dwc->dw_writer_pid != task_pid_nr(current)) { + mutex_lock(&inode->i_mutex); + locked = 1; + } + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + down_write(&oi->ip_alloc_sem); + + /* Delete orphan before acquire i_mutex. */ + if (dwc->dw_orphaned) { + BUG_ON(dwc->dw_writer_pid != task_pid_nr(current)); + + end = end > i_size_read(inode) ? end : 0; + + ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, + !!end, end); + if (ret < 0) + mlog_errno(ret); + } + + di = (struct ocfs2_dinode *)di_bh; + + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); + + ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2, + &data_ac, &meta_ac); + if (ret) { + mlog_errno(ret); + goto unlock; + } + + credits = ocfs2_calc_extend_credits(inode->i_sb, &di->id2.i_list); + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto unlock; + } + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto commit; + } + + list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) { + ret = ocfs2_mark_extent_written(inode, &et, handle, + ue->ue_cpos, 1, + ue->ue_phys, + meta_ac, &dealloc); + if (ret < 0) { + mlog_errno(ret); + break; + } + } + + if (end > i_size_read(inode)) { + ret = ocfs2_set_inode_size(handle, inode, di_bh, end); + if (ret < 0) + mlog_errno(ret); + } +commit: + ocfs2_commit_trans(osb, handle); +unlock: + up_write(&oi->ip_alloc_sem); + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); +out: + if (data_ac) + ocfs2_free_alloc_context(data_ac); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + ocfs2_run_deallocs(osb, &dealloc); + if (locked) + mutex_unlock(&inode->i_mutex); + ocfs2_dio_free_write_ctx(inode, dwc); +} + +/* + * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're + * particularly interested in the aio/dio case. We use the rw_lock DLM lock + * to protect io on one node from truncation on another. + */ +static int ocfs2_dio_end_io(struct kiocb *iocb, + loff_t offset, + ssize_t bytes, + void *private) +{ + struct inode *inode = file_inode(iocb->ki_filp); + int level; + + if (bytes <= 0) + return 0; + + /* this io's submitter should not have unlocked this before we could */ + BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); + + if (private) + ocfs2_dio_end_io_write(inode, private, offset, bytes); + + ocfs2_iocb_clear_rw_locked(iocb); + + level = ocfs2_iocb_rw_locked_level(iocb); + ocfs2_rw_unlock(inode, level); + return 0; +} + +static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file)->i_mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + loff_t end = offset + iter->count; + get_block_t *get_block; + + /* + * Fallback to buffered I/O if we see an inode without + * extents. + */ + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + return 0; + + /* Fallback to buffered I/O if we do not support append dio. */ + if (end > i_size_read(inode) && !ocfs2_supports_append_dio(osb)) + return 0; + + if (iov_iter_rw(iter) == READ) + get_block = ocfs2_get_block; + else + get_block = ocfs2_dio_get_block; + + return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, + iter, offset, get_block, + ocfs2_dio_end_io, NULL, 0); +} + const struct address_space_operations ocfs2_aops = { .readpage = ocfs2_readpage, .readpages = ocfs2_readpages, diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 24e496d6bdcd..b1c9f28a57b1 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -47,9 +47,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); -int ocfs2_write_begin_nolock(struct file *filp, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, +typedef enum { + OCFS2_WRITE_BUFFER = 0, + OCFS2_WRITE_DIRECT, + OCFS2_WRITE_MMAP, +} ocfs2_write_type_t; + +int ocfs2_write_begin_nolock(struct address_space *mapping, + loff_t pos, unsigned len, ocfs2_write_type_t type, struct page **pagep, void **fsdata, struct buffer_head *di_bh, struct page *mmap_page); @@ -79,7 +84,6 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level) enum ocfs2_iocb_lock_bits { OCFS2_IOCB_RW_LOCK = 0, OCFS2_IOCB_RW_LOCK_LEVEL, - OCFS2_IOCB_UNALIGNED_IO, OCFS2_IOCB_NUM_LOCKS }; @@ -88,11 +92,4 @@ enum ocfs2_iocb_lock_bits { #define ocfs2_iocb_rw_locked_level(iocb) \ test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private) -#define ocfs2_iocb_set_unaligned_aio(iocb) \ - set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) -#define ocfs2_iocb_clear_unaligned_aio(iocb) \ - clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) -#define ocfs2_iocb_is_unaligned_aio(iocb) \ - test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) - #endif /* OCFS2_FILE_H */ diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index ddddef0021a0..1934abb6b680 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -287,7 +287,6 @@ struct o2hb_bio_wait_ctxt { static void o2hb_write_timeout(struct work_struct *work) { int failed, quorum; - unsigned long flags; struct o2hb_region *reg = container_of(work, struct o2hb_region, hr_write_timeout_work.work); @@ -297,14 +296,14 @@ static void o2hb_write_timeout(struct work_struct *work) jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); if (o2hb_global_heartbeat_active()) { - spin_lock_irqsave(&o2hb_live_lock, flags); + spin_lock(&o2hb_live_lock); if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) set_bit(reg->hr_region_num, o2hb_failed_region_bitmap); failed = bitmap_weight(o2hb_failed_region_bitmap, O2NM_MAX_REGIONS); quorum = bitmap_weight(o2hb_quorum_region_bitmap, O2NM_MAX_REGIONS); - spin_unlock_irqrestore(&o2hb_live_lock, flags); + spin_unlock(&o2hb_live_lock); mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n", quorum, failed); @@ -418,13 +417,13 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, bio->bi_private = wc; bio->bi_end_io = o2hb_bio_end_io; - vec_start = (cs << bits) % PAGE_CACHE_SIZE; + vec_start = (cs << bits) % PAGE_SIZE; while(cs < max_slots) { current_page = cs / spp; page = reg->hr_slot_data[current_page]; - vec_len = min(PAGE_CACHE_SIZE - vec_start, - (max_slots-cs) * (PAGE_CACHE_SIZE/spp) ); + vec_len = min(PAGE_SIZE - vec_start, + (max_slots-cs) * (PAGE_SIZE/spp) ); mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n", current_page, vec_len, vec_start); @@ -432,7 +431,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, len = bio_add_page(bio, page, vec_len, vec_start); if (len != vec_len) break; - cs += vec_len / (PAGE_CACHE_SIZE/spp); + cs += vec_len / (PAGE_SIZE/spp); vec_start = 0; } @@ -1254,15 +1253,15 @@ static const struct file_operations o2hb_debug_fops = { void o2hb_exit(void) { - kfree(o2hb_db_livenodes); - kfree(o2hb_db_liveregions); - kfree(o2hb_db_quorumregions); - kfree(o2hb_db_failedregions); debugfs_remove(o2hb_debug_failedregions); debugfs_remove(o2hb_debug_quorumregions); debugfs_remove(o2hb_debug_liveregions); debugfs_remove(o2hb_debug_livenodes); debugfs_remove(o2hb_debug_dir); + kfree(o2hb_db_livenodes); + kfree(o2hb_db_liveregions); + kfree(o2hb_db_quorumregions); + kfree(o2hb_db_failedregions); } static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir, @@ -1438,13 +1437,15 @@ static void o2hb_region_release(struct config_item *item) kfree(reg->hr_slots); - kfree(reg->hr_db_regnum); - kfree(reg->hr_db_livenodes); debugfs_remove(reg->hr_debug_livenodes); debugfs_remove(reg->hr_debug_regnum); debugfs_remove(reg->hr_debug_elapsed_time); debugfs_remove(reg->hr_debug_pinned); debugfs_remove(reg->hr_debug_dir); + kfree(reg->hr_db_livenodes); + kfree(reg->hr_db_regnum); + kfree(reg->hr_db_elapsed_time); + kfree(reg->hr_db_pinned); spin_lock(&o2hb_live_lock); list_del(®->hr_all_item); @@ -1480,16 +1481,17 @@ static int o2hb_read_block_input(struct o2hb_region *reg, return 0; } -static ssize_t o2hb_region_block_bytes_read(struct o2hb_region *reg, +static ssize_t o2hb_region_block_bytes_show(struct config_item *item, char *page) { - return sprintf(page, "%u\n", reg->hr_block_bytes); + return sprintf(page, "%u\n", to_o2hb_region(item)->hr_block_bytes); } -static ssize_t o2hb_region_block_bytes_write(struct o2hb_region *reg, +static ssize_t o2hb_region_block_bytes_store(struct config_item *item, const char *page, size_t count) { + struct o2hb_region *reg = to_o2hb_region(item); int status; unsigned long block_bytes; unsigned int block_bits; @@ -1508,16 +1510,17 @@ static ssize_t o2hb_region_block_bytes_write(struct o2hb_region *reg, return count; } -static ssize_t o2hb_region_start_block_read(struct o2hb_region *reg, +static ssize_t o2hb_region_start_block_show(struct config_item *item, char *page) { - return sprintf(page, "%llu\n", reg->hr_start_block); + return sprintf(page, "%llu\n", to_o2hb_region(item)->hr_start_block); } -static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg, +static ssize_t o2hb_region_start_block_store(struct config_item *item, const char *page, size_t count) { + struct o2hb_region *reg = to_o2hb_region(item); unsigned long long tmp; char *p = (char *)page; @@ -1533,16 +1536,16 @@ static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg, return count; } -static ssize_t o2hb_region_blocks_read(struct o2hb_region *reg, - char *page) +static ssize_t o2hb_region_blocks_show(struct config_item *item, char *page) { - return sprintf(page, "%d\n", reg->hr_blocks); + return sprintf(page, "%d\n", to_o2hb_region(item)->hr_blocks); } -static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg, +static ssize_t o2hb_region_blocks_store(struct config_item *item, const char *page, size_t count) { + struct o2hb_region *reg = to_o2hb_region(item); unsigned long tmp; char *p = (char *)page; @@ -1561,20 +1564,19 @@ static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg, return count; } -static ssize_t o2hb_region_dev_read(struct o2hb_region *reg, - char *page) +static ssize_t o2hb_region_dev_show(struct config_item *item, char *page) { unsigned int ret = 0; - if (reg->hr_bdev) - ret = sprintf(page, "%s\n", reg->hr_dev_name); + if (to_o2hb_region(item)->hr_bdev) + ret = sprintf(page, "%s\n", to_o2hb_region(item)->hr_dev_name); return ret; } static void o2hb_init_region_params(struct o2hb_region *reg) { - reg->hr_slots_per_page = PAGE_CACHE_SIZE >> reg->hr_block_bits; + reg->hr_slots_per_page = PAGE_SIZE >> reg->hr_block_bits; reg->hr_timeout_ms = O2HB_REGION_TIMEOUT_MS; mlog(ML_HEARTBEAT, "hr_start_block = %llu, hr_blocks = %u\n", @@ -1677,10 +1679,11 @@ out: } /* this is acting as commit; we set up all of hr_bdev and hr_task or nothing */ -static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, +static ssize_t o2hb_region_dev_store(struct config_item *item, const char *page, size_t count) { + struct o2hb_region *reg = to_o2hb_region(item); struct task_struct *hb_task; long fd; int sectsize; @@ -1778,8 +1781,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, } ++live_threshold; atomic_set(®->hr_steady_iterations, live_threshold); - /* unsteady_iterations is double the steady_iterations */ - atomic_set(®->hr_unsteady_iterations, (live_threshold << 1)); + /* unsteady_iterations is triple the steady_iterations */ + atomic_set(®->hr_unsteady_iterations, (live_threshold * 3)); hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s", reg->hr_item.ci_name); @@ -1841,9 +1844,9 @@ out: return ret; } -static ssize_t o2hb_region_pid_read(struct o2hb_region *reg, - char *page) +static ssize_t o2hb_region_pid_show(struct config_item *item, char *page) { + struct o2hb_region *reg = to_o2hb_region(item); pid_t pid = 0; spin_lock(&o2hb_live_lock); @@ -1857,92 +1860,23 @@ static ssize_t o2hb_region_pid_read(struct o2hb_region *reg, return sprintf(page, "%u\n", pid); } -struct o2hb_region_attribute { - struct configfs_attribute attr; - ssize_t (*show)(struct o2hb_region *, char *); - ssize_t (*store)(struct o2hb_region *, const char *, size_t); -}; - -static struct o2hb_region_attribute o2hb_region_attr_block_bytes = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "block_bytes", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2hb_region_block_bytes_read, - .store = o2hb_region_block_bytes_write, -}; - -static struct o2hb_region_attribute o2hb_region_attr_start_block = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "start_block", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2hb_region_start_block_read, - .store = o2hb_region_start_block_write, -}; - -static struct o2hb_region_attribute o2hb_region_attr_blocks = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "blocks", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2hb_region_blocks_read, - .store = o2hb_region_blocks_write, -}; - -static struct o2hb_region_attribute o2hb_region_attr_dev = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "dev", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2hb_region_dev_read, - .store = o2hb_region_dev_write, -}; - -static struct o2hb_region_attribute o2hb_region_attr_pid = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "pid", - .ca_mode = S_IRUGO | S_IRUSR }, - .show = o2hb_region_pid_read, -}; +CONFIGFS_ATTR(o2hb_region_, block_bytes); +CONFIGFS_ATTR(o2hb_region_, start_block); +CONFIGFS_ATTR(o2hb_region_, blocks); +CONFIGFS_ATTR(o2hb_region_, dev); +CONFIGFS_ATTR_RO(o2hb_region_, pid); static struct configfs_attribute *o2hb_region_attrs[] = { - &o2hb_region_attr_block_bytes.attr, - &o2hb_region_attr_start_block.attr, - &o2hb_region_attr_blocks.attr, - &o2hb_region_attr_dev.attr, - &o2hb_region_attr_pid.attr, + &o2hb_region_attr_block_bytes, + &o2hb_region_attr_start_block, + &o2hb_region_attr_blocks, + &o2hb_region_attr_dev, + &o2hb_region_attr_pid, NULL, }; -static ssize_t o2hb_region_show(struct config_item *item, - struct configfs_attribute *attr, - char *page) -{ - struct o2hb_region *reg = to_o2hb_region(item); - struct o2hb_region_attribute *o2hb_region_attr = - container_of(attr, struct o2hb_region_attribute, attr); - ssize_t ret = 0; - - if (o2hb_region_attr->show) - ret = o2hb_region_attr->show(reg, page); - return ret; -} - -static ssize_t o2hb_region_store(struct config_item *item, - struct configfs_attribute *attr, - const char *page, size_t count) -{ - struct o2hb_region *reg = to_o2hb_region(item); - struct o2hb_region_attribute *o2hb_region_attr = - container_of(attr, struct o2hb_region_attribute, attr); - ssize_t ret = -EINVAL; - - if (o2hb_region_attr->store) - ret = o2hb_region_attr->store(reg, page, count); - return ret; -} - static struct configfs_item_operations o2hb_region_item_ops = { .release = o2hb_region_release, - .show_attribute = o2hb_region_show, - .store_attribute = o2hb_region_store, }; static struct config_item_type o2hb_region_type = { @@ -2137,49 +2071,14 @@ unlock: spin_unlock(&o2hb_live_lock); } -struct o2hb_heartbeat_group_attribute { - struct configfs_attribute attr; - ssize_t (*show)(struct o2hb_heartbeat_group *, char *); - ssize_t (*store)(struct o2hb_heartbeat_group *, const char *, size_t); -}; - -static ssize_t o2hb_heartbeat_group_show(struct config_item *item, - struct configfs_attribute *attr, - char *page) -{ - struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item)); - struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr = - container_of(attr, struct o2hb_heartbeat_group_attribute, attr); - ssize_t ret = 0; - - if (o2hb_heartbeat_group_attr->show) - ret = o2hb_heartbeat_group_attr->show(reg, page); - return ret; -} - -static ssize_t o2hb_heartbeat_group_store(struct config_item *item, - struct configfs_attribute *attr, - const char *page, size_t count) -{ - struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item)); - struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr = - container_of(attr, struct o2hb_heartbeat_group_attribute, attr); - ssize_t ret = -EINVAL; - - if (o2hb_heartbeat_group_attr->store) - ret = o2hb_heartbeat_group_attr->store(reg, page, count); - return ret; -} - -static ssize_t o2hb_heartbeat_group_threshold_show(struct o2hb_heartbeat_group *group, - char *page) +static ssize_t o2hb_heartbeat_group_threshold_show(struct config_item *item, + char *page) { return sprintf(page, "%u\n", o2hb_dead_threshold); } -static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group *group, - const char *page, - size_t count) +static ssize_t o2hb_heartbeat_group_threshold_store(struct config_item *item, + const char *page, size_t count) { unsigned long tmp; char *p = (char *)page; @@ -2194,17 +2093,15 @@ static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group return count; } -static -ssize_t o2hb_heartbeat_group_mode_show(struct o2hb_heartbeat_group *group, - char *page) +static ssize_t o2hb_heartbeat_group_mode_show(struct config_item *item, + char *page) { return sprintf(page, "%s\n", o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode]); } -static -ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group, - const char *page, size_t count) +static ssize_t o2hb_heartbeat_group_mode_store(struct config_item *item, + const char *page, size_t count) { unsigned int i; int ret; @@ -2229,33 +2126,15 @@ ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group, } -static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "dead_threshold", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2hb_heartbeat_group_threshold_show, - .store = o2hb_heartbeat_group_threshold_store, -}; - -static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_mode = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "mode", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2hb_heartbeat_group_mode_show, - .store = o2hb_heartbeat_group_mode_store, -}; +CONFIGFS_ATTR(o2hb_heartbeat_group_, threshold); +CONFIGFS_ATTR(o2hb_heartbeat_group_, mode); static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = { - &o2hb_heartbeat_group_attr_threshold.attr, - &o2hb_heartbeat_group_attr_mode.attr, + &o2hb_heartbeat_group_attr_threshold, + &o2hb_heartbeat_group_attr_mode, NULL, }; -static struct configfs_item_operations o2hb_heartbeat_group_item_ops = { - .show_attribute = o2hb_heartbeat_group_show, - .store_attribute = o2hb_heartbeat_group_store, -}; - static struct configfs_group_operations o2hb_heartbeat_group_group_ops = { .make_item = o2hb_heartbeat_group_make_item, .drop_item = o2hb_heartbeat_group_drop_item, @@ -2263,7 +2142,6 @@ static struct configfs_group_operations o2hb_heartbeat_group_group_ops = { static struct config_item_type o2hb_heartbeat_group_type = { .ct_group_ops = &o2hb_heartbeat_group_group_ops, - .ct_item_ops = &o2hb_heartbeat_group_item_ops, .ct_attrs = o2hb_heartbeat_group_attrs, .ct_owner = THIS_MODULE, }; @@ -2546,11 +2424,10 @@ EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating); int o2hb_check_node_heartbeating_no_sem(u8 node_num) { unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; - unsigned long flags; - spin_lock_irqsave(&o2hb_live_lock, flags); + spin_lock(&o2hb_live_lock); o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map)); - spin_unlock_irqrestore(&o2hb_live_lock, flags); + spin_unlock(&o2hb_live_lock); if (!test_bit(node_num, testing_map)) { mlog(ML_HEARTBEAT, "node (%u) does not have heartbeating enabled.\n", diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 441c84e169e6..b17d180bdc16 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -172,9 +172,9 @@ static void o2nm_node_release(struct config_item *item) kfree(node); } -static ssize_t o2nm_node_num_read(struct o2nm_node *node, char *page) +static ssize_t o2nm_node_num_show(struct config_item *item, char *page) { - return sprintf(page, "%d\n", node->nd_num); + return sprintf(page, "%d\n", to_o2nm_node(item)->nd_num); } static struct o2nm_cluster *to_o2nm_cluster_from_node(struct o2nm_node *node) @@ -188,15 +188,16 @@ enum { O2NM_NODE_ATTR_NUM = 0, O2NM_NODE_ATTR_PORT, O2NM_NODE_ATTR_ADDRESS, - O2NM_NODE_ATTR_LOCAL, }; -static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page, +static ssize_t o2nm_node_num_store(struct config_item *item, const char *page, size_t count) { + struct o2nm_node *node = to_o2nm_node(item); struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); unsigned long tmp; char *p = (char *)page; + int ret = 0; tmp = simple_strtoul(p, &p, 0); if (!p || (*p && (*p != '\n'))) @@ -215,26 +216,30 @@ static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page, write_lock(&cluster->cl_nodes_lock); if (cluster->cl_nodes[tmp]) - p = NULL; + ret = -EEXIST; + else if (test_and_set_bit(O2NM_NODE_ATTR_NUM, + &node->nd_set_attributes)) + ret = -EBUSY; else { cluster->cl_nodes[tmp] = node; node->nd_num = tmp; set_bit(tmp, cluster->cl_nodes_bitmap); } write_unlock(&cluster->cl_nodes_lock); - if (p == NULL) - return -EEXIST; + if (ret) + return ret; return count; } -static ssize_t o2nm_node_ipv4_port_read(struct o2nm_node *node, char *page) +static ssize_t o2nm_node_ipv4_port_show(struct config_item *item, char *page) { - return sprintf(page, "%u\n", ntohs(node->nd_ipv4_port)); + return sprintf(page, "%u\n", ntohs(to_o2nm_node(item)->nd_ipv4_port)); } -static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node, +static ssize_t o2nm_node_ipv4_port_store(struct config_item *item, const char *page, size_t count) { + struct o2nm_node *node = to_o2nm_node(item); unsigned long tmp; char *p = (char *)page; @@ -247,20 +252,23 @@ static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node, if (tmp >= (u16)-1) return -ERANGE; + if (test_and_set_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) + return -EBUSY; node->nd_ipv4_port = htons(tmp); return count; } -static ssize_t o2nm_node_ipv4_address_read(struct o2nm_node *node, char *page) +static ssize_t o2nm_node_ipv4_address_show(struct config_item *item, char *page) { - return sprintf(page, "%pI4\n", &node->nd_ipv4_address); + return sprintf(page, "%pI4\n", &to_o2nm_node(item)->nd_ipv4_address); } -static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node, +static ssize_t o2nm_node_ipv4_address_store(struct config_item *item, const char *page, size_t count) { + struct o2nm_node *node = to_o2nm_node(item); struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); int ret, i; struct rb_node **p, *parent; @@ -282,6 +290,9 @@ static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node, write_lock(&cluster->cl_nodes_lock); if (o2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent)) ret = -EEXIST; + else if (test_and_set_bit(O2NM_NODE_ATTR_ADDRESS, + &node->nd_set_attributes)) + ret = -EBUSY; else { rb_link_node(&node->nd_ip_node, parent, p); rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree); @@ -295,14 +306,15 @@ static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node, return count; } -static ssize_t o2nm_node_local_read(struct o2nm_node *node, char *page) +static ssize_t o2nm_node_local_show(struct config_item *item, char *page) { - return sprintf(page, "%d\n", node->nd_local); + return sprintf(page, "%d\n", to_o2nm_node(item)->nd_local); } -static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page, +static ssize_t o2nm_node_local_store(struct config_item *item, const char *page, size_t count) { + struct o2nm_node *node = to_o2nm_node(item); struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); unsigned long tmp; char *p = (char *)page; @@ -349,108 +361,21 @@ static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page, return count; } -struct o2nm_node_attribute { - struct configfs_attribute attr; - ssize_t (*show)(struct o2nm_node *, char *); - ssize_t (*store)(struct o2nm_node *, const char *, size_t); -}; - -static struct o2nm_node_attribute o2nm_node_attr_num = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "num", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2nm_node_num_read, - .store = o2nm_node_num_write, -}; - -static struct o2nm_node_attribute o2nm_node_attr_ipv4_port = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "ipv4_port", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2nm_node_ipv4_port_read, - .store = o2nm_node_ipv4_port_write, -}; - -static struct o2nm_node_attribute o2nm_node_attr_ipv4_address = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "ipv4_address", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2nm_node_ipv4_address_read, - .store = o2nm_node_ipv4_address_write, -}; - -static struct o2nm_node_attribute o2nm_node_attr_local = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "local", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2nm_node_local_read, - .store = o2nm_node_local_write, -}; +CONFIGFS_ATTR(o2nm_node_, num); +CONFIGFS_ATTR(o2nm_node_, ipv4_port); +CONFIGFS_ATTR(o2nm_node_, ipv4_address); +CONFIGFS_ATTR(o2nm_node_, local); static struct configfs_attribute *o2nm_node_attrs[] = { - [O2NM_NODE_ATTR_NUM] = &o2nm_node_attr_num.attr, - [O2NM_NODE_ATTR_PORT] = &o2nm_node_attr_ipv4_port.attr, - [O2NM_NODE_ATTR_ADDRESS] = &o2nm_node_attr_ipv4_address.attr, - [O2NM_NODE_ATTR_LOCAL] = &o2nm_node_attr_local.attr, + &o2nm_node_attr_num, + &o2nm_node_attr_ipv4_port, + &o2nm_node_attr_ipv4_address, + &o2nm_node_attr_local, NULL, }; -static int o2nm_attr_index(struct configfs_attribute *attr) -{ - int i; - for (i = 0; i < ARRAY_SIZE(o2nm_node_attrs); i++) { - if (attr == o2nm_node_attrs[i]) - return i; - } - BUG(); - return 0; -} - -static ssize_t o2nm_node_show(struct config_item *item, - struct configfs_attribute *attr, - char *page) -{ - struct o2nm_node *node = to_o2nm_node(item); - struct o2nm_node_attribute *o2nm_node_attr = - container_of(attr, struct o2nm_node_attribute, attr); - ssize_t ret = 0; - - if (o2nm_node_attr->show) - ret = o2nm_node_attr->show(node, page); - return ret; -} - -static ssize_t o2nm_node_store(struct config_item *item, - struct configfs_attribute *attr, - const char *page, size_t count) -{ - struct o2nm_node *node = to_o2nm_node(item); - struct o2nm_node_attribute *o2nm_node_attr = - container_of(attr, struct o2nm_node_attribute, attr); - ssize_t ret; - int attr_index = o2nm_attr_index(attr); - - if (o2nm_node_attr->store == NULL) { - ret = -EINVAL; - goto out; - } - - if (test_bit(attr_index, &node->nd_set_attributes)) - return -EBUSY; - - ret = o2nm_node_attr->store(node, page, count); - if (ret < count) - goto out; - - set_bit(attr_index, &node->nd_set_attributes); -out: - return ret; -} - static struct configfs_item_operations o2nm_node_item_ops = { .release = o2nm_node_release, - .show_attribute = o2nm_node_show, - .store_attribute = o2nm_node_store, }; static struct config_item_type o2nm_node_type = { @@ -475,12 +400,6 @@ static struct o2nm_node_group *to_o2nm_node_group(struct config_group *group) } #endif -struct o2nm_cluster_attribute { - struct configfs_attribute attr; - ssize_t (*show)(struct o2nm_cluster *, char *); - ssize_t (*store)(struct o2nm_cluster *, const char *, size_t); -}; - static ssize_t o2nm_cluster_attr_write(const char *page, ssize_t count, unsigned int *val) { @@ -501,15 +420,16 @@ static ssize_t o2nm_cluster_attr_write(const char *page, ssize_t count, return count; } -static ssize_t o2nm_cluster_attr_idle_timeout_ms_read( - struct o2nm_cluster *cluster, char *page) +static ssize_t o2nm_cluster_idle_timeout_ms_show(struct config_item *item, + char *page) { - return sprintf(page, "%u\n", cluster->cl_idle_timeout_ms); + return sprintf(page, "%u\n", to_o2nm_cluster(item)->cl_idle_timeout_ms); } -static ssize_t o2nm_cluster_attr_idle_timeout_ms_write( - struct o2nm_cluster *cluster, const char *page, size_t count) +static ssize_t o2nm_cluster_idle_timeout_ms_store(struct config_item *item, + const char *page, size_t count) { + struct o2nm_cluster *cluster = to_o2nm_cluster(item); ssize_t ret; unsigned int val; @@ -536,15 +456,17 @@ static ssize_t o2nm_cluster_attr_idle_timeout_ms_write( return ret; } -static ssize_t o2nm_cluster_attr_keepalive_delay_ms_read( - struct o2nm_cluster *cluster, char *page) +static ssize_t o2nm_cluster_keepalive_delay_ms_show( + struct config_item *item, char *page) { - return sprintf(page, "%u\n", cluster->cl_keepalive_delay_ms); + return sprintf(page, "%u\n", + to_o2nm_cluster(item)->cl_keepalive_delay_ms); } -static ssize_t o2nm_cluster_attr_keepalive_delay_ms_write( - struct o2nm_cluster *cluster, const char *page, size_t count) +static ssize_t o2nm_cluster_keepalive_delay_ms_store( + struct config_item *item, const char *page, size_t count) { + struct o2nm_cluster *cluster = to_o2nm_cluster(item); ssize_t ret; unsigned int val; @@ -571,22 +493,24 @@ static ssize_t o2nm_cluster_attr_keepalive_delay_ms_write( return ret; } -static ssize_t o2nm_cluster_attr_reconnect_delay_ms_read( - struct o2nm_cluster *cluster, char *page) +static ssize_t o2nm_cluster_reconnect_delay_ms_show( + struct config_item *item, char *page) { - return sprintf(page, "%u\n", cluster->cl_reconnect_delay_ms); + return sprintf(page, "%u\n", + to_o2nm_cluster(item)->cl_reconnect_delay_ms); } -static ssize_t o2nm_cluster_attr_reconnect_delay_ms_write( - struct o2nm_cluster *cluster, const char *page, size_t count) +static ssize_t o2nm_cluster_reconnect_delay_ms_store( + struct config_item *item, const char *page, size_t count) { return o2nm_cluster_attr_write(page, count, - &cluster->cl_reconnect_delay_ms); + &to_o2nm_cluster(item)->cl_reconnect_delay_ms); } -static ssize_t o2nm_cluster_attr_fence_method_read( - struct o2nm_cluster *cluster, char *page) +static ssize_t o2nm_cluster_fence_method_show( + struct config_item *item, char *page) { + struct o2nm_cluster *cluster = to_o2nm_cluster(item); ssize_t ret = 0; if (cluster) @@ -595,8 +519,8 @@ static ssize_t o2nm_cluster_attr_fence_method_read( return ret; } -static ssize_t o2nm_cluster_attr_fence_method_write( - struct o2nm_cluster *cluster, const char *page, size_t count) +static ssize_t o2nm_cluster_fence_method_store( + struct config_item *item, const char *page, size_t count) { unsigned int i; @@ -608,10 +532,10 @@ static ssize_t o2nm_cluster_attr_fence_method_write( continue; if (strncasecmp(page, o2nm_fence_method_desc[i], count - 1)) continue; - if (cluster->cl_fence_method != i) { + if (to_o2nm_cluster(item)->cl_fence_method != i) { printk(KERN_INFO "ocfs2: Changing fence method to %s\n", o2nm_fence_method_desc[i]); - cluster->cl_fence_method = i; + to_o2nm_cluster(item)->cl_fence_method = i; } return count; } @@ -620,79 +544,18 @@ bail: return -EINVAL; } -static struct o2nm_cluster_attribute o2nm_cluster_attr_idle_timeout_ms = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "idle_timeout_ms", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2nm_cluster_attr_idle_timeout_ms_read, - .store = o2nm_cluster_attr_idle_timeout_ms_write, -}; - -static struct o2nm_cluster_attribute o2nm_cluster_attr_keepalive_delay_ms = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "keepalive_delay_ms", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2nm_cluster_attr_keepalive_delay_ms_read, - .store = o2nm_cluster_attr_keepalive_delay_ms_write, -}; - -static struct o2nm_cluster_attribute o2nm_cluster_attr_reconnect_delay_ms = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "reconnect_delay_ms", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2nm_cluster_attr_reconnect_delay_ms_read, - .store = o2nm_cluster_attr_reconnect_delay_ms_write, -}; - -static struct o2nm_cluster_attribute o2nm_cluster_attr_fence_method = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "fence_method", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2nm_cluster_attr_fence_method_read, - .store = o2nm_cluster_attr_fence_method_write, -}; +CONFIGFS_ATTR(o2nm_cluster_, idle_timeout_ms); +CONFIGFS_ATTR(o2nm_cluster_, keepalive_delay_ms); +CONFIGFS_ATTR(o2nm_cluster_, reconnect_delay_ms); +CONFIGFS_ATTR(o2nm_cluster_, fence_method); static struct configfs_attribute *o2nm_cluster_attrs[] = { - &o2nm_cluster_attr_idle_timeout_ms.attr, - &o2nm_cluster_attr_keepalive_delay_ms.attr, - &o2nm_cluster_attr_reconnect_delay_ms.attr, - &o2nm_cluster_attr_fence_method.attr, + &o2nm_cluster_attr_idle_timeout_ms, + &o2nm_cluster_attr_keepalive_delay_ms, + &o2nm_cluster_attr_reconnect_delay_ms, + &o2nm_cluster_attr_fence_method, NULL, }; -static ssize_t o2nm_cluster_show(struct config_item *item, - struct configfs_attribute *attr, - char *page) -{ - struct o2nm_cluster *cluster = to_o2nm_cluster(item); - struct o2nm_cluster_attribute *o2nm_cluster_attr = - container_of(attr, struct o2nm_cluster_attribute, attr); - ssize_t ret = 0; - - if (o2nm_cluster_attr->show) - ret = o2nm_cluster_attr->show(cluster, page); - return ret; -} - -static ssize_t o2nm_cluster_store(struct config_item *item, - struct configfs_attribute *attr, - const char *page, size_t count) -{ - struct o2nm_cluster *cluster = to_o2nm_cluster(item); - struct o2nm_cluster_attribute *o2nm_cluster_attr = - container_of(attr, struct o2nm_cluster_attribute, attr); - ssize_t ret; - - if (o2nm_cluster_attr->store == NULL) { - ret = -EINVAL; - goto out; - } - - ret = o2nm_cluster_attr->store(cluster, page, count); - if (ret < count) - goto out; -out: - return ret; -} static struct config_item *o2nm_node_group_make_item(struct config_group *group, const char *name) @@ -767,14 +630,11 @@ static void o2nm_cluster_release(struct config_item *item) { struct o2nm_cluster *cluster = to_o2nm_cluster(item); - kfree(cluster->cl_group.default_groups); kfree(cluster); } static struct configfs_item_operations o2nm_cluster_item_ops = { .release = o2nm_cluster_release, - .show_attribute = o2nm_cluster_show, - .store_attribute = o2nm_cluster_store, }; static struct config_item_type o2nm_cluster_type = { @@ -805,7 +665,6 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g struct o2nm_cluster *cluster = NULL; struct o2nm_node_group *ns = NULL; struct config_group *o2hb_group = NULL, *ret = NULL; - void *defs = NULL; /* this runs under the parent dir's i_mutex; there can be only * one caller in here at a time */ @@ -814,20 +673,18 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g cluster = kzalloc(sizeof(struct o2nm_cluster), GFP_KERNEL); ns = kzalloc(sizeof(struct o2nm_node_group), GFP_KERNEL); - defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL); o2hb_group = o2hb_alloc_hb_set(); - if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL) + if (cluster == NULL || ns == NULL || o2hb_group == NULL) goto out; config_group_init_type_name(&cluster->cl_group, name, &o2nm_cluster_type); + configfs_add_default_group(&ns->ns_group, &cluster->cl_group); + config_group_init_type_name(&ns->ns_group, "node", &o2nm_node_group_type); + configfs_add_default_group(o2hb_group, &cluster->cl_group); - cluster->cl_group.default_groups = defs; - cluster->cl_group.default_groups[0] = &ns->ns_group; - cluster->cl_group.default_groups[1] = o2hb_group; - cluster->cl_group.default_groups[2] = NULL; rwlock_init(&cluster->cl_nodes_lock); cluster->cl_node_ip_tree = RB_ROOT; cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT; @@ -843,7 +700,6 @@ out: kfree(cluster); kfree(ns); o2hb_free_hb_set(o2hb_group); - kfree(defs); ret = ERR_PTR(-ENOMEM); } @@ -853,18 +709,11 @@ out: static void o2nm_cluster_group_drop_item(struct config_group *group, struct config_item *item) { struct o2nm_cluster *cluster = to_o2nm_cluster(item); - int i; - struct config_item *killme; BUG_ON(o2nm_single_cluster != cluster); o2nm_single_cluster = NULL; - for (i = 0; cluster->cl_group.default_groups[i]; i++) { - killme = &cluster->cl_group.default_groups[i]->cg_item; - cluster->cl_group.default_groups[i] = NULL; - config_item_put(killme); - } - + configfs_remove_default_groups(&cluster->cl_group); config_item_put(item); } @@ -896,7 +745,7 @@ int o2nm_depend_item(struct config_item *item) void o2nm_undepend_item(struct config_item *item) { - configfs_undepend_item(&o2nm_cluster_group.cs_subsys, item); + configfs_undepend_item(item); } int o2nm_depend_this_node(void) diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index ffecf89c8c1c..e1adf285fc31 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -4361,7 +4361,7 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir, mlog_errno(ret); goto out; } - mutex_lock(&dx_alloc_inode->i_mutex); + inode_lock(dx_alloc_inode); ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1); if (ret) { @@ -4410,7 +4410,7 @@ out_unlock: ocfs2_inode_unlock(dx_alloc_inode, 1); out_mutex: - mutex_unlock(&dx_alloc_inode->i_mutex); + inode_unlock(dx_alloc_inode); brelse(dx_alloc_bh); out: iput(dx_alloc_inode); diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index e88ccf8c83ff..004f2cbe8f71 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -282,6 +282,7 @@ static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm, #define DLM_LOCK_RES_DROPPING_REF 0x00000040 #define DLM_LOCK_RES_BLOCK_DIRTY 0x00001000 #define DLM_LOCK_RES_SETREF_INPROG 0x00002000 +#define DLM_LOCK_RES_RECOVERY_WAITING 0x00004000 /* max milliseconds to wait to sync up a network failure with a node death */ #define DLM_NODE_DEATH_WAIT_MAX (5 * 1000) @@ -376,17 +377,6 @@ struct dlm_lock lksb_kernel_allocated:1; }; - -#define DLM_LKSB_UNUSED1 0x01 -#define DLM_LKSB_PUT_LVB 0x02 -#define DLM_LKSB_GET_LVB 0x04 -#define DLM_LKSB_UNUSED2 0x08 -#define DLM_LKSB_UNUSED3 0x10 -#define DLM_LKSB_UNUSED4 0x20 -#define DLM_LKSB_UNUSED5 0x40 -#define DLM_LKSB_UNUSED6 0x80 - - enum dlm_lockres_list { DLM_GRANTED_LIST = 0, DLM_CONVERTING_LIST = 1, @@ -462,6 +452,7 @@ enum { DLM_QUERY_REGION = 519, DLM_QUERY_NODEINFO = 520, DLM_BEGIN_EXIT_DOMAIN_MSG = 521, + DLM_DEREF_LOCKRES_DONE = 522, }; struct dlm_reco_node_data @@ -556,7 +547,7 @@ struct dlm_master_requery * }; * * from ../cluster/tcp.h - * NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg)) + * O2NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg)) * (roughly 4080 bytes) * and sizeof(dlm_migratable_lockres) = 112 bytes * and sizeof(dlm_migratable_lock) = 16 bytes @@ -597,7 +588,7 @@ struct dlm_migratable_lockres /* from above, 128 bytes * for some undetermined future use */ -#define DLM_MIG_LOCKRES_RESERVED (NET_MAX_PAYLOAD_BYTES - \ +#define DLM_MIG_LOCKRES_RESERVED (O2NET_MAX_PAYLOAD_BYTES - \ DLM_MIG_LOCKRES_MAX_LEN) struct dlm_create_lock @@ -793,6 +784,20 @@ struct dlm_deref_lockres u8 name[O2NM_MAX_NAME_LEN]; }; +enum { + DLM_DEREF_RESPONSE_DONE = 0, + DLM_DEREF_RESPONSE_INPROG = 1, +}; + +struct dlm_deref_lockres_done { + u32 pad1; + u16 pad2; + u8 node_idx; + u8 namelen; + + u8 name[O2NM_MAX_NAME_LEN]; +}; + static inline enum dlm_status __dlm_lockres_state_to_status(struct dlm_lock_resource *res) { @@ -800,7 +805,8 @@ __dlm_lockres_state_to_status(struct dlm_lock_resource *res) assert_spin_locked(&res->spinlock); - if (res->state & DLM_LOCK_RES_RECOVERING) + if (res->state & (DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_RECOVERY_WAITING)) status = DLM_RECOVERING; else if (res->state & DLM_LOCK_RES_MIGRATING) status = DLM_MIGRATING; @@ -979,6 +985,8 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, void dlm_assert_master_post_handler(int status, void *data, void *ret_data); int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, void **ret_data); +int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, void **ret_data); int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, @@ -1020,6 +1028,7 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res) { __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS| DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_RECOVERY_WAITING| DLM_LOCK_RES_MIGRATING)); } diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index e36d63ff1783..cdeafb4e7ed6 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -212,6 +212,12 @@ grant: if (lock->lksb->flags & DLM_LKSB_PUT_LVB) memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN); + /* + * Move the lock to the tail because it may be the only lock which has + * an invalid lvb. + */ + list_move_tail(&lock->list, &res->granted); + status = DLM_NORMAL; *call_ast = 1; goto unlock_exit; @@ -262,6 +268,7 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, struct dlm_lock *lock, int flags, int type) { enum dlm_status status; + u8 old_owner = res->owner; mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type, lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS); @@ -287,6 +294,19 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, status = DLM_DENIED; goto bail; } + + if (lock->ml.type == type && lock->ml.convert_type == LKM_IVMODE) { + mlog(0, "last convert request returned DLM_RECOVERING, but " + "owner has already queued and sent ast to me. res %.*s, " + "(cookie=%u:%llu, type=%d, conv=%d)\n", + res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + lock->ml.type, lock->ml.convert_type); + status = DLM_NORMAL; + goto bail; + } + res->state |= DLM_LOCK_RES_IN_PROGRESS; /* move lock to local convert queue */ /* do not alter lock refcount. switching lists. */ @@ -316,11 +336,19 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); res->state &= ~DLM_LOCK_RES_IN_PROGRESS; lock->convert_pending = 0; - /* if it failed, move it back to granted queue */ + /* if it failed, move it back to granted queue. + * if master returns DLM_NORMAL and then down before sending ast, + * it may have already been moved to granted queue, reset to + * DLM_RECOVERING and retry convert */ if (status != DLM_NORMAL) { if (status != DLM_NOTQUEUED) dlm_error(status); dlm_revert_pending_convert(res, lock); + } else if ((res->state & DLM_LOCK_RES_RECOVERING) || + (old_owner != res->owner)) { + mlog(0, "res %.*s is in recovering or has been recovered.\n", + res->lockname.len, res->lockname.name); + status = DLM_RECOVERING; } bail: spin_unlock(&res->spinlock); diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 2ee7fe747cea..12e064b8be9a 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -132,10 +132,13 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events); * - Message DLM_QUERY_NODEINFO added to allow online node removes * New in version 1.2: * - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain + * New in version 1.3: + * - Message DLM_DEREF_LOCKRES_DONE added to inform non-master that the + * refmap is cleared */ static const struct dlm_protocol_version dlm_protocol = { .pv_major = 1, - .pv_minor = 2, + .pv_minor = 3, }; #define DLM_DOMAIN_BACKOFF_MS 200 @@ -1396,7 +1399,7 @@ static int dlm_send_join_cancels(struct dlm_ctxt *dlm, unsigned int map_size) { int status, tmpstat; - unsigned int node; + int node; if (map_size != (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long))) { @@ -1853,7 +1856,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) sizeof(struct dlm_exit_domain), dlm_begin_exit_domain_handler, dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + status = o2net_register_handler(DLM_DEREF_LOCKRES_DONE, dlm->key, + sizeof(struct dlm_deref_lockres_done), + dlm_deref_lockres_done_handler, + dlm, NULL, &dlm->dlm_domain_handlers); bail: if (status) dlm_unregister_domain_handlers(dlm); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index ce38b4ccc9ab..9aed6e202201 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2278,7 +2278,7 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) dlm_print_one_lock_resource(res); BUG(); } - return ret; + return ret ? ret : r; } int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, @@ -2345,7 +2345,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, res->lockname.len, res->lockname.name, node); dlm_print_one_lock_resource(res); } - ret = 0; + ret = DLM_DEREF_RESPONSE_DONE; goto done; } @@ -2365,7 +2365,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, spin_unlock(&dlm->work_lock); queue_work(dlm->dlm_worker, &dlm->dispatched_work); - return 0; + return DLM_DEREF_RESPONSE_INPROG; done: if (res) @@ -2375,6 +2375,122 @@ done: return ret; } +int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_deref_lockres_done *deref + = (struct dlm_deref_lockres_done *)msg->buf; + struct dlm_lock_resource *res = NULL; + char *name; + unsigned int namelen; + int ret = -EINVAL; + u8 node; + unsigned int hash; + + if (!dlm_grab(dlm)) + return 0; + + name = deref->name; + namelen = deref->namelen; + node = deref->node_idx; + + if (namelen > DLM_LOCKID_NAME_MAX) { + mlog(ML_ERROR, "Invalid name length!"); + goto done; + } + if (deref->node_idx >= O2NM_MAX_NODES) { + mlog(ML_ERROR, "Invalid node number: %u\n", node); + goto done; + } + + hash = dlm_lockid_hash(name, namelen); + + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres_full(dlm, name, namelen, hash); + if (!res) { + spin_unlock(&dlm->spinlock); + mlog(ML_ERROR, "%s:%.*s: bad lockres name\n", + dlm->name, namelen, name); + goto done; + } + + spin_lock(&res->spinlock); + BUG_ON(!(res->state & DLM_LOCK_RES_DROPPING_REF)); + if (!list_empty(&res->purge)) { + mlog(0, "%s: Removing res %.*s from purgelist\n", + dlm->name, res->lockname.len, res->lockname.name); + list_del_init(&res->purge); + dlm_lockres_put(res); + dlm->purge_count--; + } + + if (!__dlm_lockres_unused(res)) { + mlog(ML_ERROR, "%s: res %.*s in use after deref\n", + dlm->name, res->lockname.len, res->lockname.name); + __dlm_print_one_lock_resource(res); + BUG(); + } + + __dlm_unhash_lockres(dlm, res); + + spin_lock(&dlm->track_lock); + if (!list_empty(&res->tracking)) + list_del_init(&res->tracking); + else { + mlog(ML_ERROR, "%s: Resource %.*s not on the Tracking list\n", + dlm->name, res->lockname.len, res->lockname.name); + __dlm_print_one_lock_resource(res); + } + spin_unlock(&dlm->track_lock); + + /* lockres is not in the hash now. drop the flag and wake up + * any processes waiting in dlm_get_lock_resource. + */ + res->state &= ~DLM_LOCK_RES_DROPPING_REF; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + + dlm_lockres_put(res); + + spin_unlock(&dlm->spinlock); + +done: + dlm_put(dlm); + return ret; +} + +static void dlm_drop_lockres_ref_done(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, u8 node) +{ + struct dlm_deref_lockres_done deref; + int ret = 0, r; + const char *lockname; + unsigned int namelen; + + lockname = res->lockname.name; + namelen = res->lockname.len; + BUG_ON(namelen > O2NM_MAX_NAME_LEN); + + memset(&deref, 0, sizeof(deref)); + deref.node_idx = dlm->node_num; + deref.namelen = namelen; + memcpy(deref.name, lockname, namelen); + + ret = o2net_send_message(DLM_DEREF_LOCKRES_DONE, dlm->key, + &deref, sizeof(deref), node, &r); + if (ret < 0) { + mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF DONE " + " to node %u\n", dlm->name, namelen, + lockname, ret, node); + } else if (r < 0) { + /* ignore the error */ + mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n", + dlm->name, namelen, lockname, node, r); + dlm_print_one_lock_resource(res); + } +} + static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) { struct dlm_ctxt *dlm; @@ -2388,13 +2504,15 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) spin_lock(&res->spinlock); BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); + __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); if (test_bit(node, res->refmap)) { - __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); dlm_lockres_clear_refmap_bit(dlm, res, node); cleared = 1; } spin_unlock(&res->spinlock); + dlm_drop_lockres_ref_done(dlm, res, node); + if (cleared) { mlog(0, "%s:%.*s node %u ref dropped in dispatch\n", dlm->name, res->lockname.len, res->lockname.name, node); @@ -2432,7 +2550,8 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, return 0; /* delay migration when the lockres is in RECOCERING state */ - if (res->state & DLM_LOCK_RES_RECOVERING) + if (res->state & (DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_RECOVERY_WAITING)) return 0; if (res->owner != dlm->node_num) @@ -2519,6 +2638,11 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, spin_lock(&dlm->master_lock); ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, namelen, target, dlm->node_num); + /* get an extra reference on the mle. + * otherwise the assert_master from the new + * master will destroy this. + */ + dlm_get_mle_inuse(mle); spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); @@ -2544,7 +2668,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, } fail: - if (oldmle) { + if (ret != -EEXIST && oldmle) { /* master is known, detach if not already detached */ dlm_mle_detach_hb_events(dlm, oldmle); dlm_put_mle(oldmle); @@ -2554,6 +2678,7 @@ fail: if (mle_added) { dlm_mle_detach_hb_events(dlm, mle); dlm_put_mle(mle); + dlm_put_mle_inuse(mle); } else if (mle) { kmem_cache_free(dlm_mle_cache, mle); mle = NULL; @@ -2571,17 +2696,6 @@ fail: * ensure that all assert_master work is flushed. */ flush_workqueue(dlm->dlm_worker); - /* get an extra reference on the mle. - * otherwise the assert_master from the new - * master will destroy this. - * also, make sure that all callers of dlm_get_mle - * take both dlm->spinlock and dlm->master_lock */ - spin_lock(&dlm->spinlock); - spin_lock(&dlm->master_lock); - dlm_get_mle_inuse(mle); - spin_unlock(&dlm->master_lock); - spin_unlock(&dlm->spinlock); - /* notify new node and send all lock state */ /* call send_one_lockres with migration flag. * this serves as notice to the target node that a @@ -2843,6 +2957,8 @@ again: res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY; if (!ret) BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING)); + else + res->migration_pending = 0; spin_unlock(&res->spinlock); /* @@ -3048,7 +3164,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, int ret = 0; if (!dlm_grab(dlm)) - return -EINVAL; + return 0; name = migrate->name; namelen = migrate->namelen; @@ -3139,7 +3255,8 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm, mlog(0, "tried to migrate %.*s, but some " "process beat me to it\n", namelen, name); - ret = -EEXIST; + spin_unlock(&tmp->spinlock); + return -EEXIST; } else { /* bad. 2 NODES are trying to migrate! */ mlog(ML_ERROR, "migration error mle: " @@ -3310,6 +3427,15 @@ top: mle->new_master != dead_node) continue; + if (mle->new_master == dead_node && mle->inuse) { + mlog(ML_NOTICE, "%s: target %u died during " + "migration from %u, the MLE is " + "still keep used, ignore it!\n", + dlm->name, dead_node, + mle->master); + continue; + } + /* If we have reached this point, this mle needs to be * removed from the list and freed. */ dlm_clean_migration_mle(dlm, mle); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 9e4f862d20fe..f6b313898763 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1373,6 +1373,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, char *buf = NULL; struct dlm_work_item *item = NULL; struct dlm_lock_resource *res = NULL; + unsigned int hash; if (!dlm_grab(dlm)) return -EINVAL; @@ -1400,11 +1401,26 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, /* lookup the lock to see if we have a secondary queue for this * already... just add the locks in and this will have its owner * and RECOVERY flag changed when it completes. */ - res = dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len); + hash = dlm_lockid_hash(mres->lockname, mres->lockname_len); + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres_full(dlm, mres->lockname, mres->lockname_len, + hash); if (res) { /* this will get a ref on res */ /* mark it as recovering/migrating and hash it */ spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_DROPPING_REF) { + mlog(0, "%s: node is attempting to migrate " + "lockres %.*s, but marked as dropping " + " ref!\n", dlm->name, + mres->lockname_len, mres->lockname); + ret = -EINVAL; + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); + dlm_lockres_put(res); + goto leave; + } + if (mres->flags & DLM_MRES_RECOVERY) { res->state |= DLM_LOCK_RES_RECOVERING; } else { @@ -1421,13 +1437,16 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, mres->lockname_len, mres->lockname); ret = -EFAULT; spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); dlm_lockres_put(res); goto leave; } res->state |= DLM_LOCK_RES_MIGRATING; } spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); } else { + spin_unlock(&dlm->spinlock); /* need to allocate, just like if it was * mastered here normally */ res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len); @@ -2064,7 +2083,6 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, dlm_lock_get(lock); if (lock->convert_pending) { /* move converting lock back to granted */ - BUG_ON(i != DLM_CONVERTING_LIST); mlog(0, "node died with convert pending " "on %.*s. move back to granted list.\n", res->lockname.len, res->lockname.name); @@ -2156,6 +2174,13 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, for (i = 0; i < DLM_HASH_BUCKETS; i++) { bucket = dlm_lockres_hash(dlm, i); hlist_for_each_entry(res, bucket, hash_node) { + if (res->state & DLM_LOCK_RES_RECOVERY_WAITING) { + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_RECOVERY_WAITING; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + } + if (!(res->state & DLM_LOCK_RES_RECOVERING)) continue; @@ -2293,6 +2318,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, res->lockname.len, res->lockname.name, freed, dead_node); __dlm_print_one_lock_resource(res); } + res->state |= DLM_LOCK_RES_RECOVERY_WAITING; dlm_lockres_clear_refmap_bit(dlm, res, dead_node); } else if (test_bit(dead_node, res->refmap)) { mlog(0, "%s:%.*s: dead node %u had a ref, but had " @@ -2360,6 +2386,8 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) break; } } + dlm_lockres_clear_refmap_bit(dlm, res, + dead_node); spin_unlock(&res->spinlock); continue; } @@ -2368,14 +2396,16 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) dlm_revalidate_lvb(dlm, res, dead_node); if (res->owner == dead_node) { if (res->state & DLM_LOCK_RES_DROPPING_REF) { - mlog(ML_NOTICE, "%s: res %.*s, Skip " - "recovery as it is being freed\n", - dlm->name, res->lockname.len, - res->lockname.name); - } else - dlm_move_lockres_to_recovery_list(dlm, - res); - + mlog(0, "%s:%.*s: owned by " + "dead node %u, this node was " + "dropping its ref when it died. " + "continue, dropping the flag.\n", + dlm->name, res->lockname.len, + res->lockname.name, dead_node); + } + res->state &= ~DLM_LOCK_RES_DROPPING_REF; + dlm_move_lockres_to_recovery_list(dlm, + res); } else if (res->owner == dlm->node_num) { dlm_free_dead_locks(dlm, res, dead_node); __dlm_lockres_calc_usage(dlm, res); @@ -2450,11 +2480,7 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx) * perhaps later we can genericize this for other waiters. */ wake_up(&dlm->migration_wq); - if (test_bit(idx, dlm->recovery_map)) - mlog(0, "domain %s, node %u already added " - "to recovery map!\n", dlm->name, idx); - else - set_bit(idx, dlm->recovery_map); + set_bit(idx, dlm->recovery_map); } void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data) diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index c5f6c241ecd7..68d239ba0c63 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -106,7 +106,8 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res) if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY) return 0; - if (res->state & DLM_LOCK_RES_RECOVERING) + if (res->state & (DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_RECOVERY_WAITING)) return 0; /* Another node has this resource with this node as the master */ @@ -202,6 +203,13 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm, dlm->purge_count--; } + if (!master && ret != 0) { + mlog(0, "%s: deref %.*s in progress or master goes down\n", + dlm->name, res->lockname.len, res->lockname.name); + spin_unlock(&res->spinlock); + return; + } + if (!__dlm_lockres_unused(res)) { mlog(ML_ERROR, "%s: res %.*s in use after deref\n", dlm->name, res->lockname.len, res->lockname.name); @@ -700,7 +708,8 @@ static int dlm_thread(void *data) * dirty for a short while. */ BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); if (res->state & (DLM_LOCK_RES_IN_PROGRESS | - DLM_LOCK_RES_RECOVERING)) { + DLM_LOCK_RES_RECOVERING | + DLM_LOCK_RES_RECOVERY_WAITING)) { /* move it to the tail and keep going */ res->state &= ~DLM_LOCK_RES_DIRTY; spin_unlock(&res->spinlock); diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 2e3c9dbab68c..1082b2c3014b 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -421,7 +421,7 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, } if (!dlm_grab(dlm)) - return DLM_REJECTED; + return DLM_FORWARD; mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), "Domain %s not fully joined!\n", dlm->name); diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index b5cf27dcb18a..47b3b2d4e775 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -571,8 +571,8 @@ static int dlmfs_fill_super(struct super_block * sb, int silent) { sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = DLMFS_MAGIC; sb->s_op = &dlmfs_ops; sb->s_root = d_make_root(dlmfs_get_root_inode(sb)); @@ -638,7 +638,7 @@ static int __init init_dlmfs_fs(void) dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache", sizeof(struct dlmfs_inode_private), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), dlmfs_init_once); if (!dlmfs_inode_cache) { status = -ENOMEM; diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 20276e340339..474e57f834e6 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1390,6 +1390,7 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb, unsigned int gen; int noqueue_attempted = 0; int dlm_locked = 0; + int kick_dc = 0; if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) { mlog_errno(-EINVAL); @@ -1524,7 +1525,12 @@ update_holders: unlock: lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); + /* ocfs2_unblock_lock reques on seeing OCFS2_LOCK_UPCONVERT_FINISHING */ + kick_dc = (lockres->l_flags & OCFS2_LOCK_BLOCKED); + spin_unlock_irqrestore(&lockres->l_lock, flags); + if (kick_dc) + ocfs2_wake_downconvert_thread(osb); out: /* * This is helping work around a lock inversion between the page lock @@ -2432,12 +2438,6 @@ bail: * done this we have to return AOP_TRUNCATED_PAGE so the aop method * that called us can bubble that back up into the VFS who will then * immediately retry the aop call. - * - * We do a blocking lock and immediate unlock before returning, though, so that - * the lock has a great chance of being cached on this node by the time the VFS - * calls back to retry the aop. This has a potential to livelock as nodes - * ping locks back and forth, but that's a risk we're willing to take to avoid - * the lock inversion simply. */ int ocfs2_inode_lock_with_page(struct inode *inode, struct buffer_head **ret_bh, @@ -2449,8 +2449,6 @@ int ocfs2_inode_lock_with_page(struct inode *inode, ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK); if (ret == -EAGAIN) { unlock_page(page); - if (ocfs2_inode_lock(inode, ret_bh, ex) == 0) - ocfs2_inode_unlock(inode, ex); ret = AOP_TRUNCATED_PAGE; } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 0e5b4515f92e..5308841756be 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -770,14 +770,14 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, { struct address_space *mapping = inode->i_mapping; struct page *page; - unsigned long index = abs_from >> PAGE_CACHE_SHIFT; + unsigned long index = abs_from >> PAGE_SHIFT; handle_t *handle; int ret = 0; unsigned zero_from, zero_to, block_start, block_end; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; BUG_ON(abs_from >= abs_to); - BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); + BUG_ON(abs_to > (((u64)index + 1) << PAGE_SHIFT)); BUG_ON(abs_from & (inode->i_blkbits - 1)); handle = ocfs2_zero_start_ordered_transaction(inode, di_bh); @@ -794,10 +794,10 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, } /* Get the offsets within the page that we want to zero */ - zero_from = abs_from & (PAGE_CACHE_SIZE - 1); - zero_to = abs_to & (PAGE_CACHE_SIZE - 1); + zero_from = abs_from & (PAGE_SIZE - 1); + zero_to = abs_to & (PAGE_SIZE - 1); if (!zero_to) - zero_to = PAGE_CACHE_SIZE; + zero_to = PAGE_SIZE; trace_ocfs2_write_zero_page( (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -851,7 +851,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, out_unlock: unlock_page(page); - page_cache_release(page); + put_page(page); out_commit_trans: if (handle) ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); @@ -959,7 +959,7 @@ static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start, BUG_ON(range_start >= range_end); while (zero_pos < range_end) { - next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE; + next_pos = (zero_pos & PAGE_MASK) + PAGE_SIZE; if (next_pos > range_end) next_pos = range_end; rc = ocfs2_write_zero_page(inode, zero_pos, next_pos, di_bh); @@ -1302,6 +1302,14 @@ int ocfs2_getattr(struct vfsmount *mnt, } generic_fillattr(inode, stat); + /* + * If there is inline data in the inode, the inode will normally not + * have data blocks allocated (it may have an external xattr block). + * Report at least one sector for such files, so tools like tar, rsync, + * others don't incorrectly think the file is completely sparse. + */ + if (unlikely(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) + stat->blocks += (stat->size + 511)>>9; /* We set the blksize from the cluster size for performance */ stat->blksize = osb->s_clustersize; @@ -1373,44 +1381,6 @@ out: return ret; } -/* - * Will look for holes and unwritten extents in the range starting at - * pos for count bytes (inclusive). - */ -static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, - size_t count) -{ - int ret = 0; - unsigned int extent_flags; - u32 cpos, clusters, extent_len, phys_cpos; - struct super_block *sb = inode->i_sb; - - cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; - clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; - - while (clusters) { - ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, - &extent_flags); - if (ret < 0) { - mlog_errno(ret); - goto out; - } - - if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { - ret = 1; - break; - } - - if (extent_len > clusters) - extent_len = clusters; - - clusters -= extent_len; - cpos += extent_len; - } -out: - return ret; -} - static int ocfs2_write_remove_suid(struct inode *inode) { int ret; @@ -1864,7 +1834,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) return -EROFS; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * This prevents concurrent writes on other nodes @@ -1983,7 +1953,7 @@ out_rw_unlock: ocfs2_rw_unlock(inode, 1); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -2121,18 +2091,12 @@ out: static int ocfs2_prepare_inode_for_write(struct file *file, loff_t pos, - size_t count, - int appending, - int *direct_io, - int *has_refcount) + size_t count) { int ret = 0, meta_level = 0; struct dentry *dentry = file->f_path.dentry; struct inode *inode = d_inode(dentry); loff_t end; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - int full_coherency = !(osb->s_mount_opt & - OCFS2_MOUNT_COHERENCY_BUFFERED); /* * We start with a read level meta lock and only jump to an ex @@ -2181,10 +2145,6 @@ static int ocfs2_prepare_inode_for_write(struct file *file, pos, count, &meta_level); - if (has_refcount) - *has_refcount = 1; - if (direct_io) - *direct_io = 0; } if (ret < 0) { @@ -2192,67 +2152,12 @@ static int ocfs2_prepare_inode_for_write(struct file *file, goto out_unlock; } - /* - * Skip the O_DIRECT checks if we don't need - * them. - */ - if (!direct_io || !(*direct_io)) - break; - - /* - * There's no sane way to do direct writes to an inode - * with inline data. - */ - if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { - *direct_io = 0; - break; - } - - /* - * Allowing concurrent direct writes means - * i_size changes wouldn't be synchronized, so - * one node could wind up truncating another - * nodes writes. - */ - if (end > i_size_read(inode) && !full_coherency) { - *direct_io = 0; - break; - } - - /* - * Fallback to old way if the feature bit is not set. - */ - if (end > i_size_read(inode) && - !ocfs2_supports_append_dio(osb)) { - *direct_io = 0; - break; - } - - /* - * We don't fill holes during direct io, so - * check for them here. If any are found, the - * caller will have to retake some cluster - * locks and initiate the io as buffered. - */ - ret = ocfs2_check_range_for_holes(inode, pos, count); - if (ret == 1) { - /* - * Fallback to old way if the feature bit is not set. - * Otherwise try dio first and then complete the rest - * request through buffer io. - */ - if (!ocfs2_supports_append_dio(osb)) - *direct_io = 0; - ret = 0; - } else if (ret < 0) - mlog_errno(ret); break; } out_unlock: trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno, - pos, appending, count, - direct_io, has_refcount); + pos, count); if (meta_level >= 0) ocfs2_inode_unlock(inode, meta_level); @@ -2264,18 +2169,16 @@ out: static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { - int direct_io, appending, rw_level; - int can_do_direct, has_refcount = 0; + int direct_io, rw_level; ssize_t written = 0; ssize_t ret; - size_t count = iov_iter_count(from), orig_count; + size_t count = iov_iter_count(from); struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); int full_coherency = !(osb->s_mount_opt & OCFS2_MOUNT_COHERENCY_BUFFERED); - int unaligned_dio = 0; - int dropped_dio = 0; + void *saved_ki_complete = NULL; int append_write = ((iocb->ki_pos + count) >= i_size_read(inode) ? 1 : 0); @@ -2288,12 +2191,10 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, if (count == 0) return 0; - appending = iocb->ki_flags & IOCB_APPEND ? 1 : 0; direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); -relock: /* * Concurrent O_DIRECT writes are allowed with * mount_option "coherency=buffered". @@ -2326,7 +2227,6 @@ relock: ocfs2_inode_unlock(inode, 1); } - orig_count = iov_iter_count(from); ret = generic_write_checks(iocb, from); if (ret <= 0) { if (ret) @@ -2335,41 +2235,18 @@ relock: } count = ret; - can_do_direct = direct_io; - ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, appending, - &can_do_direct, &has_refcount); + ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count); if (ret < 0) { mlog_errno(ret); goto out; } - if (direct_io && !is_sync_kiocb(iocb)) - unaligned_dio = ocfs2_is_io_unaligned(inode, count, iocb->ki_pos); - - /* - * We can't complete the direct I/O as requested, fall back to - * buffered I/O. - */ - if (direct_io && !can_do_direct) { - ocfs2_rw_unlock(inode, rw_level); - - rw_level = -1; - - direct_io = 0; - iocb->ki_flags &= ~IOCB_DIRECT; - iov_iter_reexpand(from, orig_count); - dropped_dio = 1; - goto relock; - } - - if (unaligned_dio) { + if (direct_io && !is_sync_kiocb(iocb) && + ocfs2_is_io_unaligned(inode, count, iocb->ki_pos)) { /* - * Wait on previous unaligned aio to complete before - * proceeding. + * Make it a sync io if it's an unaligned aio. */ - mutex_lock(&OCFS2_I(inode)->ip_unaligned_aio); - /* Mark the iocb as needing an unlock in ocfs2_dio_end_io */ - ocfs2_iocb_set_unaligned_aio(iocb); + saved_ki_complete = xchg(&iocb->ki_complete, NULL); } /* communicate with ocfs2_dio_end_io */ @@ -2390,14 +2267,13 @@ relock: */ if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { rw_level = -1; - unaligned_dio = 0; } if (unlikely(written <= 0)) - goto no_sync; + goto out; if (((file->f_flags & O_DSYNC) && !direct_io) || - IS_SYNC(inode) || dropped_dio) { + IS_SYNC(inode)) { ret = filemap_fdatawrite_range(file->f_mapping, iocb->ki_pos - written, iocb->ki_pos - 1); @@ -2416,18 +2292,15 @@ relock: iocb->ki_pos - 1); } -no_sync: - if (unaligned_dio && ocfs2_iocb_is_unaligned_aio(iocb)) { - ocfs2_iocb_clear_unaligned_aio(iocb); - mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); - } - out: + if (saved_ki_complete) + xchg(&iocb->ki_complete, saved_ki_complete); + if (rw_level != -1) ocfs2_rw_unlock(inode, rw_level); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (written) ret = written; @@ -2539,7 +2412,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file->f_mapping->host; int ret = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); switch (whence) { case SEEK_SET: @@ -2577,7 +2450,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence) offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (ret) return ret; return offset; diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c new file mode 100644 index 000000000000..2cabbcf2f28e --- /dev/null +++ b/fs/ocfs2/filecheck.c @@ -0,0 +1,606 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * filecheck.c + * + * Code which implements online file check. + * + * Copyright (C) 2016 SuSE. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/kmod.h> +#include <linux/fs.h> +#include <linux/kobject.h> +#include <linux/sysfs.h> +#include <linux/sysctl.h> +#include <cluster/masklog.h> + +#include "ocfs2.h" +#include "ocfs2_fs.h" +#include "stackglue.h" +#include "inode.h" + +#include "filecheck.h" + + +/* File check error strings, + * must correspond with error number in header file. + */ +static const char * const ocfs2_filecheck_errs[] = { + "SUCCESS", + "FAILED", + "INPROGRESS", + "READONLY", + "INJBD", + "INVALIDINO", + "BLOCKECC", + "BLOCKNO", + "VALIDFLAG", + "GENERATION", + "UNSUPPORTED" +}; + +static DEFINE_SPINLOCK(ocfs2_filecheck_sysfs_lock); +static LIST_HEAD(ocfs2_filecheck_sysfs_list); + +struct ocfs2_filecheck { + struct list_head fc_head; /* File check entry list head */ + spinlock_t fc_lock; + unsigned int fc_max; /* Maximum number of entry in list */ + unsigned int fc_size; /* Current entry count in list */ + unsigned int fc_done; /* Finished entry count in list */ +}; + +struct ocfs2_filecheck_sysfs_entry { /* sysfs entry per mounting */ + struct list_head fs_list; + atomic_t fs_count; + struct super_block *fs_sb; + struct kset *fs_devicekset; + struct kset *fs_fcheckkset; + struct ocfs2_filecheck *fs_fcheck; +}; + +#define OCFS2_FILECHECK_MAXSIZE 100 +#define OCFS2_FILECHECK_MINSIZE 10 + +/* File check operation type */ +enum { + OCFS2_FILECHECK_TYPE_CHK = 0, /* Check a file(inode) */ + OCFS2_FILECHECK_TYPE_FIX, /* Fix a file(inode) */ + OCFS2_FILECHECK_TYPE_SET = 100 /* Set entry list maximum size */ +}; + +struct ocfs2_filecheck_entry { + struct list_head fe_list; + unsigned long fe_ino; + unsigned int fe_type; + unsigned int fe_done:1; + unsigned int fe_status:31; +}; + +struct ocfs2_filecheck_args { + unsigned int fa_type; + union { + unsigned long fa_ino; + unsigned int fa_len; + }; +}; + +static const char * +ocfs2_filecheck_error(int errno) +{ + if (!errno) + return ocfs2_filecheck_errs[errno]; + + BUG_ON(errno < OCFS2_FILECHECK_ERR_START || + errno > OCFS2_FILECHECK_ERR_END); + return ocfs2_filecheck_errs[errno - OCFS2_FILECHECK_ERR_START + 1]; +} + +static ssize_t ocfs2_filecheck_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf); +static ssize_t ocfs2_filecheck_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count); +static struct kobj_attribute ocfs2_attr_filecheck_chk = + __ATTR(check, S_IRUSR | S_IWUSR, + ocfs2_filecheck_show, + ocfs2_filecheck_store); +static struct kobj_attribute ocfs2_attr_filecheck_fix = + __ATTR(fix, S_IRUSR | S_IWUSR, + ocfs2_filecheck_show, + ocfs2_filecheck_store); +static struct kobj_attribute ocfs2_attr_filecheck_set = + __ATTR(set, S_IRUSR | S_IWUSR, + ocfs2_filecheck_show, + ocfs2_filecheck_store); + +static int ocfs2_filecheck_sysfs_wait(atomic_t *p) +{ + schedule(); + return 0; +} + +static void +ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry) +{ + struct ocfs2_filecheck_entry *p; + + if (!atomic_dec_and_test(&entry->fs_count)) + wait_on_atomic_t(&entry->fs_count, ocfs2_filecheck_sysfs_wait, + TASK_UNINTERRUPTIBLE); + + spin_lock(&entry->fs_fcheck->fc_lock); + while (!list_empty(&entry->fs_fcheck->fc_head)) { + p = list_first_entry(&entry->fs_fcheck->fc_head, + struct ocfs2_filecheck_entry, fe_list); + list_del(&p->fe_list); + BUG_ON(!p->fe_done); /* To free a undone file check entry */ + kfree(p); + } + spin_unlock(&entry->fs_fcheck->fc_lock); + + kset_unregister(entry->fs_fcheckkset); + kset_unregister(entry->fs_devicekset); + kfree(entry->fs_fcheck); + kfree(entry); +} + +static void +ocfs2_filecheck_sysfs_add(struct ocfs2_filecheck_sysfs_entry *entry) +{ + spin_lock(&ocfs2_filecheck_sysfs_lock); + list_add_tail(&entry->fs_list, &ocfs2_filecheck_sysfs_list); + spin_unlock(&ocfs2_filecheck_sysfs_lock); +} + +static int ocfs2_filecheck_sysfs_del(const char *devname) +{ + struct ocfs2_filecheck_sysfs_entry *p; + + spin_lock(&ocfs2_filecheck_sysfs_lock); + list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) { + if (!strcmp(p->fs_sb->s_id, devname)) { + list_del(&p->fs_list); + spin_unlock(&ocfs2_filecheck_sysfs_lock); + ocfs2_filecheck_sysfs_free(p); + return 0; + } + } + spin_unlock(&ocfs2_filecheck_sysfs_lock); + return 1; +} + +static void +ocfs2_filecheck_sysfs_put(struct ocfs2_filecheck_sysfs_entry *entry) +{ + if (atomic_dec_and_test(&entry->fs_count)) + wake_up_atomic_t(&entry->fs_count); +} + +static struct ocfs2_filecheck_sysfs_entry * +ocfs2_filecheck_sysfs_get(const char *devname) +{ + struct ocfs2_filecheck_sysfs_entry *p = NULL; + + spin_lock(&ocfs2_filecheck_sysfs_lock); + list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) { + if (!strcmp(p->fs_sb->s_id, devname)) { + atomic_inc(&p->fs_count); + spin_unlock(&ocfs2_filecheck_sysfs_lock); + return p; + } + } + spin_unlock(&ocfs2_filecheck_sysfs_lock); + return NULL; +} + +int ocfs2_filecheck_create_sysfs(struct super_block *sb) +{ + int ret = 0; + struct kset *device_kset = NULL; + struct kset *fcheck_kset = NULL; + struct ocfs2_filecheck *fcheck = NULL; + struct ocfs2_filecheck_sysfs_entry *entry = NULL; + struct attribute **attrs = NULL; + struct attribute_group attrgp; + + if (!ocfs2_kset) + return -ENOMEM; + + attrs = kmalloc(sizeof(struct attribute *) * 4, GFP_NOFS); + if (!attrs) { + ret = -ENOMEM; + goto error; + } else { + attrs[0] = &ocfs2_attr_filecheck_chk.attr; + attrs[1] = &ocfs2_attr_filecheck_fix.attr; + attrs[2] = &ocfs2_attr_filecheck_set.attr; + attrs[3] = NULL; + memset(&attrgp, 0, sizeof(attrgp)); + attrgp.attrs = attrs; + } + + fcheck = kmalloc(sizeof(struct ocfs2_filecheck), GFP_NOFS); + if (!fcheck) { + ret = -ENOMEM; + goto error; + } else { + INIT_LIST_HEAD(&fcheck->fc_head); + spin_lock_init(&fcheck->fc_lock); + fcheck->fc_max = OCFS2_FILECHECK_MINSIZE; + fcheck->fc_size = 0; + fcheck->fc_done = 0; + } + + if (strlen(sb->s_id) <= 0) { + mlog(ML_ERROR, + "Cannot get device basename when create filecheck sysfs\n"); + ret = -ENODEV; + goto error; + } + + device_kset = kset_create_and_add(sb->s_id, NULL, &ocfs2_kset->kobj); + if (!device_kset) { + ret = -ENOMEM; + goto error; + } + + fcheck_kset = kset_create_and_add("filecheck", NULL, + &device_kset->kobj); + if (!fcheck_kset) { + ret = -ENOMEM; + goto error; + } + + ret = sysfs_create_group(&fcheck_kset->kobj, &attrgp); + if (ret) + goto error; + + entry = kmalloc(sizeof(struct ocfs2_filecheck_sysfs_entry), GFP_NOFS); + if (!entry) { + ret = -ENOMEM; + goto error; + } else { + atomic_set(&entry->fs_count, 1); + entry->fs_sb = sb; + entry->fs_devicekset = device_kset; + entry->fs_fcheckkset = fcheck_kset; + entry->fs_fcheck = fcheck; + ocfs2_filecheck_sysfs_add(entry); + } + + kfree(attrs); + return 0; + +error: + kfree(attrs); + kfree(entry); + kfree(fcheck); + kset_unregister(fcheck_kset); + kset_unregister(device_kset); + return ret; +} + +int ocfs2_filecheck_remove_sysfs(struct super_block *sb) +{ + return ocfs2_filecheck_sysfs_del(sb->s_id); +} + +static int +ocfs2_filecheck_erase_entries(struct ocfs2_filecheck_sysfs_entry *ent, + unsigned int count); +static int +ocfs2_filecheck_adjust_max(struct ocfs2_filecheck_sysfs_entry *ent, + unsigned int len) +{ + int ret; + + if ((len < OCFS2_FILECHECK_MINSIZE) || (len > OCFS2_FILECHECK_MAXSIZE)) + return -EINVAL; + + spin_lock(&ent->fs_fcheck->fc_lock); + if (len < (ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done)) { + mlog(ML_ERROR, + "Cannot set online file check maximum entry number " + "to %u due to too many pending entries(%u)\n", + len, ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done); + ret = -EBUSY; + } else { + if (len < ent->fs_fcheck->fc_size) + BUG_ON(!ocfs2_filecheck_erase_entries(ent, + ent->fs_fcheck->fc_size - len)); + + ent->fs_fcheck->fc_max = len; + ret = 0; + } + spin_unlock(&ent->fs_fcheck->fc_lock); + + return ret; +} + +#define OCFS2_FILECHECK_ARGS_LEN 24 +static int +ocfs2_filecheck_args_get_long(const char *buf, size_t count, + unsigned long *val) +{ + char buffer[OCFS2_FILECHECK_ARGS_LEN]; + + memcpy(buffer, buf, count); + buffer[count] = '\0'; + + if (kstrtoul(buffer, 0, val)) + return 1; + + return 0; +} + +static int +ocfs2_filecheck_type_parse(const char *name, unsigned int *type) +{ + if (!strncmp(name, "fix", 4)) + *type = OCFS2_FILECHECK_TYPE_FIX; + else if (!strncmp(name, "check", 6)) + *type = OCFS2_FILECHECK_TYPE_CHK; + else if (!strncmp(name, "set", 4)) + *type = OCFS2_FILECHECK_TYPE_SET; + else + return 1; + + return 0; +} + +static int +ocfs2_filecheck_args_parse(const char *name, const char *buf, size_t count, + struct ocfs2_filecheck_args *args) +{ + unsigned long val = 0; + unsigned int type; + + /* too short/long args length */ + if ((count < 1) || (count >= OCFS2_FILECHECK_ARGS_LEN)) + return 1; + + if (ocfs2_filecheck_type_parse(name, &type)) + return 1; + if (ocfs2_filecheck_args_get_long(buf, count, &val)) + return 1; + + if (val <= 0) + return 1; + + args->fa_type = type; + if (type == OCFS2_FILECHECK_TYPE_SET) + args->fa_len = (unsigned int)val; + else + args->fa_ino = val; + + return 0; +} + +static ssize_t ocfs2_filecheck_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + + ssize_t ret = 0, total = 0, remain = PAGE_SIZE; + unsigned int type; + struct ocfs2_filecheck_entry *p; + struct ocfs2_filecheck_sysfs_entry *ent; + + if (ocfs2_filecheck_type_parse(attr->attr.name, &type)) + return -EINVAL; + + ent = ocfs2_filecheck_sysfs_get(kobj->parent->name); + if (!ent) { + mlog(ML_ERROR, + "Cannot get the corresponding entry via device basename %s\n", + kobj->name); + return -ENODEV; + } + + if (type == OCFS2_FILECHECK_TYPE_SET) { + spin_lock(&ent->fs_fcheck->fc_lock); + total = snprintf(buf, remain, "%u\n", ent->fs_fcheck->fc_max); + spin_unlock(&ent->fs_fcheck->fc_lock); + goto exit; + } + + ret = snprintf(buf, remain, "INO\t\tDONE\tERROR\n"); + total += ret; + remain -= ret; + spin_lock(&ent->fs_fcheck->fc_lock); + list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) { + if (p->fe_type != type) + continue; + + ret = snprintf(buf + total, remain, "%lu\t\t%u\t%s\n", + p->fe_ino, p->fe_done, + ocfs2_filecheck_error(p->fe_status)); + if (ret < 0) { + total = ret; + break; + } + if (ret == remain) { + /* snprintf() didn't fit */ + total = -E2BIG; + break; + } + total += ret; + remain -= ret; + } + spin_unlock(&ent->fs_fcheck->fc_lock); + +exit: + ocfs2_filecheck_sysfs_put(ent); + return total; +} + +static int +ocfs2_filecheck_erase_entry(struct ocfs2_filecheck_sysfs_entry *ent) +{ + struct ocfs2_filecheck_entry *p; + + list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) { + if (p->fe_done) { + list_del(&p->fe_list); + kfree(p); + ent->fs_fcheck->fc_size--; + ent->fs_fcheck->fc_done--; + return 1; + } + } + + return 0; +} + +static int +ocfs2_filecheck_erase_entries(struct ocfs2_filecheck_sysfs_entry *ent, + unsigned int count) +{ + unsigned int i = 0; + unsigned int ret = 0; + + while (i++ < count) { + if (ocfs2_filecheck_erase_entry(ent)) + ret++; + else + break; + } + + return (ret == count ? 1 : 0); +} + +static void +ocfs2_filecheck_done_entry(struct ocfs2_filecheck_sysfs_entry *ent, + struct ocfs2_filecheck_entry *entry) +{ + entry->fe_done = 1; + spin_lock(&ent->fs_fcheck->fc_lock); + ent->fs_fcheck->fc_done++; + spin_unlock(&ent->fs_fcheck->fc_lock); +} + +static unsigned int +ocfs2_filecheck_handle(struct super_block *sb, + unsigned long ino, unsigned int flags) +{ + unsigned int ret = OCFS2_FILECHECK_ERR_SUCCESS; + struct inode *inode = NULL; + int rc; + + inode = ocfs2_iget(OCFS2_SB(sb), ino, flags, 0); + if (IS_ERR(inode)) { + rc = (int)(-(long)inode); + if (rc >= OCFS2_FILECHECK_ERR_START && + rc < OCFS2_FILECHECK_ERR_END) + ret = rc; + else + ret = OCFS2_FILECHECK_ERR_FAILED; + } else + iput(inode); + + return ret; +} + +static void +ocfs2_filecheck_handle_entry(struct ocfs2_filecheck_sysfs_entry *ent, + struct ocfs2_filecheck_entry *entry) +{ + if (entry->fe_type == OCFS2_FILECHECK_TYPE_CHK) + entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb, + entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_CHK); + else if (entry->fe_type == OCFS2_FILECHECK_TYPE_FIX) + entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb, + entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_FIX); + else + entry->fe_status = OCFS2_FILECHECK_ERR_UNSUPPORTED; + + ocfs2_filecheck_done_entry(ent, entry); +} + +static ssize_t ocfs2_filecheck_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct ocfs2_filecheck_args args; + struct ocfs2_filecheck_entry *entry; + struct ocfs2_filecheck_sysfs_entry *ent; + ssize_t ret = 0; + + if (count == 0) + return count; + + if (ocfs2_filecheck_args_parse(attr->attr.name, buf, count, &args)) { + mlog(ML_ERROR, "Invalid arguments for online file check\n"); + return -EINVAL; + } + + ent = ocfs2_filecheck_sysfs_get(kobj->parent->name); + if (!ent) { + mlog(ML_ERROR, + "Cannot get the corresponding entry via device basename %s\n", + kobj->parent->name); + return -ENODEV; + } + + if (args.fa_type == OCFS2_FILECHECK_TYPE_SET) { + ret = ocfs2_filecheck_adjust_max(ent, args.fa_len); + goto exit; + } + + entry = kmalloc(sizeof(struct ocfs2_filecheck_entry), GFP_NOFS); + if (!entry) { + ret = -ENOMEM; + goto exit; + } + + spin_lock(&ent->fs_fcheck->fc_lock); + if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) && + (ent->fs_fcheck->fc_done == 0)) { + mlog(ML_ERROR, + "Cannot do more file check " + "since file check queue(%u) is full now\n", + ent->fs_fcheck->fc_max); + ret = -EBUSY; + kfree(entry); + } else { + if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) && + (ent->fs_fcheck->fc_done > 0)) { + /* Delete the oldest entry which was done, + * make sure the entry size in list does + * not exceed maximum value + */ + BUG_ON(!ocfs2_filecheck_erase_entry(ent)); + } + + entry->fe_ino = args.fa_ino; + entry->fe_type = args.fa_type; + entry->fe_done = 0; + entry->fe_status = OCFS2_FILECHECK_ERR_INPROGRESS; + list_add_tail(&entry->fe_list, &ent->fs_fcheck->fc_head); + ent->fs_fcheck->fc_size++; + } + spin_unlock(&ent->fs_fcheck->fc_lock); + + if (!ret) + ocfs2_filecheck_handle_entry(ent, entry); + +exit: + ocfs2_filecheck_sysfs_put(ent); + return (!ret ? count : ret); +} diff --git a/fs/ocfs2/filecheck.h b/fs/ocfs2/filecheck.h new file mode 100644 index 000000000000..e5cd002a2c09 --- /dev/null +++ b/fs/ocfs2/filecheck.h @@ -0,0 +1,49 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * filecheck.h + * + * Online file check. + * + * Copyright (C) 2016 SuSE. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + + +#ifndef FILECHECK_H +#define FILECHECK_H + +#include <linux/types.h> +#include <linux/list.h> + + +/* File check errno */ +enum { + OCFS2_FILECHECK_ERR_SUCCESS = 0, /* Success */ + OCFS2_FILECHECK_ERR_FAILED = 1000, /* Other failure */ + OCFS2_FILECHECK_ERR_INPROGRESS, /* In progress */ + OCFS2_FILECHECK_ERR_READONLY, /* Read only */ + OCFS2_FILECHECK_ERR_INJBD, /* Buffer in jbd */ + OCFS2_FILECHECK_ERR_INVALIDINO, /* Invalid ino */ + OCFS2_FILECHECK_ERR_BLOCKECC, /* Block ecc */ + OCFS2_FILECHECK_ERR_BLOCKNO, /* Block number */ + OCFS2_FILECHECK_ERR_VALIDFLAG, /* Inode valid flag */ + OCFS2_FILECHECK_ERR_GENERATION, /* Inode generation */ + OCFS2_FILECHECK_ERR_UNSUPPORTED /* Unsupported */ +}; + +#define OCFS2_FILECHECK_ERR_START OCFS2_FILECHECK_ERR_FAILED +#define OCFS2_FILECHECK_ERR_END OCFS2_FILECHECK_ERR_UNSUPPORTED + +int ocfs2_filecheck_create_sysfs(struct super_block *sb); +int ocfs2_filecheck_remove_sysfs(struct super_block *sb); + +#endif /* FILECHECK_H */ diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 8f87e05ee25d..12f4a9e9800f 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -53,6 +53,7 @@ #include "xattr.h" #include "refcounttree.h" #include "ocfs2_trace.h" +#include "filecheck.h" #include "buffer_head_io.h" @@ -74,6 +75,14 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, struct inode *inode, struct buffer_head *fe_bh); +static int ocfs2_filecheck_read_inode_block_full(struct inode *inode, + struct buffer_head **bh, + int flags, int type); +static int ocfs2_filecheck_validate_inode_block(struct super_block *sb, + struct buffer_head *bh); +static int ocfs2_filecheck_repair_inode_block(struct super_block *sb, + struct buffer_head *bh); + void ocfs2_set_inode_flags(struct inode *inode) { unsigned int flags = OCFS2_I(inode)->ip_attr; @@ -127,6 +136,7 @@ struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno) struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, int sysfile_type) { + int rc = 0; struct inode *inode = NULL; struct super_block *sb = osb->sb; struct ocfs2_find_inode_args args; @@ -161,12 +171,17 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, } trace_ocfs2_iget5_locked(inode->i_state); if (inode->i_state & I_NEW) { - ocfs2_read_locked_inode(inode, &args); + rc = ocfs2_read_locked_inode(inode, &args); unlock_new_inode(inode); } if (is_bad_inode(inode)) { iput(inode); - inode = ERR_PTR(-ESTALE); + if ((flags & OCFS2_FI_FLAG_FILECHECK_CHK) || + (flags & OCFS2_FI_FLAG_FILECHECK_FIX)) + /* Return OCFS2_FILECHECK_ERR_XXX related errno */ + inode = ERR_PTR(rc); + else + inode = ERR_PTR(-ESTALE); goto bail; } @@ -361,6 +376,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, break; case S_IFLNK: inode->i_op = &ocfs2_symlink_inode_operations; + inode_nohighmem(inode); i_size_write(inode, le64_to_cpu(fe->i_size)); break; default: @@ -409,7 +425,7 @@ static int ocfs2_read_locked_inode(struct inode *inode, struct ocfs2_super *osb; struct ocfs2_dinode *fe; struct buffer_head *bh = NULL; - int status, can_lock; + int status, can_lock, lock_level = 0; u32 generation = 0; status = -EINVAL; @@ -477,7 +493,7 @@ static int ocfs2_read_locked_inode(struct inode *inode, mlog_errno(status); return status; } - status = ocfs2_inode_lock(inode, NULL, 0); + status = ocfs2_inode_lock(inode, NULL, lock_level); if (status) { make_bad_inode(inode); mlog_errno(status); @@ -494,16 +510,32 @@ static int ocfs2_read_locked_inode(struct inode *inode, } if (can_lock) { - status = ocfs2_read_inode_block_full(inode, &bh, - OCFS2_BH_IGNORE_CACHE); + if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK) + status = ocfs2_filecheck_read_inode_block_full(inode, + &bh, OCFS2_BH_IGNORE_CACHE, 0); + else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX) + status = ocfs2_filecheck_read_inode_block_full(inode, + &bh, OCFS2_BH_IGNORE_CACHE, 1); + else + status = ocfs2_read_inode_block_full(inode, + &bh, OCFS2_BH_IGNORE_CACHE); } else { status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); /* * If buffer is in jbd, then its checksum may not have been * computed as yet. */ - if (!status && !buffer_jbd(bh)) - status = ocfs2_validate_inode_block(osb->sb, bh); + if (!status && !buffer_jbd(bh)) { + if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK) + status = ocfs2_filecheck_validate_inode_block( + osb->sb, bh); + else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX) + status = ocfs2_filecheck_repair_inode_block( + osb->sb, bh); + else + status = ocfs2_validate_inode_block( + osb->sb, bh); + } } if (status < 0) { mlog_errno(status); @@ -531,11 +563,24 @@ static int ocfs2_read_locked_inode(struct inode *inode, BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); + if (buffer_dirty(bh) && !buffer_jbd(bh)) { + if (can_lock) { + ocfs2_inode_unlock(inode, lock_level); + lock_level = 1; + ocfs2_inode_lock(inode, NULL, lock_level); + } + status = ocfs2_write_block(osb, bh, INODE_CACHE(inode)); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + status = 0; bail: if (can_lock) - ocfs2_inode_unlock(inode, 0); + ocfs2_inode_unlock(inode, lock_level); if (status < 0) make_bad_inode(inode); @@ -629,10 +674,10 @@ static int ocfs2_remove_inode(struct inode *inode, goto bail; } - mutex_lock(&inode_alloc_inode->i_mutex); + inode_lock(inode_alloc_inode); status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1); if (status < 0) { - mutex_unlock(&inode_alloc_inode->i_mutex); + inode_unlock(inode_alloc_inode); mlog_errno(status); goto bail; @@ -679,7 +724,7 @@ bail_commit: ocfs2_commit_trans(osb, handle); bail_unlock: ocfs2_inode_unlock(inode_alloc_inode, 1); - mutex_unlock(&inode_alloc_inode->i_mutex); + inode_unlock(inode_alloc_inode); brelse(inode_alloc_bh); bail: iput(inode_alloc_inode); @@ -750,10 +795,10 @@ static int ocfs2_wipe_inode(struct inode *inode, /* Lock the orphan dir. The lock will be held for the entire * delete_inode operation. We do this now to avoid races with * recovery completion on other nodes. */ - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (status < 0) { - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); mlog_errno(status); goto bail; @@ -802,7 +847,7 @@ bail_unlock_dir: return status; ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); brelse(orphan_dir_bh); bail: iput(orphan_dir_inode); @@ -1125,6 +1170,9 @@ static void ocfs2_clear_inode(struct inode *inode) mlog_bug_on_msg(!list_empty(&oi->ip_io_markers), "Clear inode of %llu, inode has io markers\n", (unsigned long long)oi->ip_blkno); + mlog_bug_on_msg(!list_empty(&oi->ip_unwritten_list), + "Clear inode of %llu, inode has unwritten extents\n", + (unsigned long long)oi->ip_blkno); ocfs2_extent_map_trunc(inode, 0); @@ -1396,6 +1444,169 @@ bail: return rc; } +static int ocfs2_filecheck_validate_inode_block(struct super_block *sb, + struct buffer_head *bh) +{ + int rc = 0; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; + + trace_ocfs2_filecheck_validate_inode_block( + (unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * Call ocfs2_validate_meta_ecc() first since it has ecc repair + * function, but we should not return error immediately when ecc + * validation fails, because the reason is quite likely the invalid + * inode number inputed. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check); + if (rc) { + mlog(ML_ERROR, + "Filecheck: checksum failed for dinode %llu\n", + (unsigned long long)bh->b_blocknr); + rc = -OCFS2_FILECHECK_ERR_BLOCKECC; + } + + if (!OCFS2_IS_VALID_DINODE(di)) { + mlog(ML_ERROR, + "Filecheck: invalid dinode #%llu: signature = %.*s\n", + (unsigned long long)bh->b_blocknr, 7, di->i_signature); + rc = -OCFS2_FILECHECK_ERR_INVALIDINO; + goto bail; + } else if (rc) + goto bail; + + if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { + mlog(ML_ERROR, + "Filecheck: invalid dinode #%llu: i_blkno is %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(di->i_blkno)); + rc = -OCFS2_FILECHECK_ERR_BLOCKNO; + goto bail; + } + + if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { + mlog(ML_ERROR, + "Filecheck: invalid dinode #%llu: OCFS2_VALID_FL " + "not set\n", + (unsigned long long)bh->b_blocknr); + rc = -OCFS2_FILECHECK_ERR_VALIDFLAG; + goto bail; + } + + if (le32_to_cpu(di->i_fs_generation) != + OCFS2_SB(sb)->fs_generation) { + mlog(ML_ERROR, + "Filecheck: invalid dinode #%llu: fs_generation is %u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(di->i_fs_generation)); + rc = -OCFS2_FILECHECK_ERR_GENERATION; + goto bail; + } + +bail: + return rc; +} + +static int ocfs2_filecheck_repair_inode_block(struct super_block *sb, + struct buffer_head *bh) +{ + int changed = 0; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; + + if (!ocfs2_filecheck_validate_inode_block(sb, bh)) + return 0; + + trace_ocfs2_filecheck_repair_inode_block( + (unsigned long long)bh->b_blocknr); + + if (ocfs2_is_hard_readonly(OCFS2_SB(sb)) || + ocfs2_is_soft_readonly(OCFS2_SB(sb))) { + mlog(ML_ERROR, + "Filecheck: cannot repair dinode #%llu " + "on readonly filesystem\n", + (unsigned long long)bh->b_blocknr); + return -OCFS2_FILECHECK_ERR_READONLY; + } + + if (buffer_jbd(bh)) { + mlog(ML_ERROR, + "Filecheck: cannot repair dinode #%llu, " + "its buffer is in jbd\n", + (unsigned long long)bh->b_blocknr); + return -OCFS2_FILECHECK_ERR_INJBD; + } + + if (!OCFS2_IS_VALID_DINODE(di)) { + /* Cannot fix invalid inode block */ + return -OCFS2_FILECHECK_ERR_INVALIDINO; + } + + if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { + /* Cannot just add VALID_FL flag back as a fix, + * need more things to check here. + */ + return -OCFS2_FILECHECK_ERR_VALIDFLAG; + } + + if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { + di->i_blkno = cpu_to_le64(bh->b_blocknr); + changed = 1; + mlog(ML_ERROR, + "Filecheck: reset dinode #%llu: i_blkno to %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(di->i_blkno)); + } + + if (le32_to_cpu(di->i_fs_generation) != + OCFS2_SB(sb)->fs_generation) { + di->i_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); + changed = 1; + mlog(ML_ERROR, + "Filecheck: reset dinode #%llu: fs_generation to %u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(di->i_fs_generation)); + } + + if (changed || ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check)) { + ocfs2_compute_meta_ecc(sb, bh->b_data, &di->i_check); + mark_buffer_dirty(bh); + mlog(ML_ERROR, + "Filecheck: reset dinode #%llu: compute meta ecc\n", + (unsigned long long)bh->b_blocknr); + } + + return 0; +} + +static int +ocfs2_filecheck_read_inode_block_full(struct inode *inode, + struct buffer_head **bh, + int flags, int type) +{ + int rc; + struct buffer_head *tmp = *bh; + + if (!type) /* Check inode block */ + rc = ocfs2_read_blocks(INODE_CACHE(inode), + OCFS2_I(inode)->ip_blkno, + 1, &tmp, flags, + ocfs2_filecheck_validate_inode_block); + else /* Repair inode block */ + rc = ocfs2_read_blocks(INODE_CACHE(inode), + OCFS2_I(inode)->ip_blkno, + 1, &tmp, flags, + ocfs2_filecheck_repair_inode_block); + + /* If ocfs2_read_blocks() got us a new bh, pass it up. */ + if (!rc && !*bh) + *bh = tmp; + + return rc; +} + int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh, int flags) { diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index aac8b86f312e..d8f3fc8d2551 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -43,9 +43,6 @@ struct ocfs2_inode_info /* protects extended attribute changes on this inode */ struct rw_semaphore ip_xattr_sem; - /* Number of outstanding AIO's which are not page aligned */ - struct mutex ip_unaligned_aio; - /* These fields are protected by ip_lock */ spinlock_t ip_lock; u32 ip_open_count; @@ -57,6 +54,9 @@ struct ocfs2_inode_info u32 ip_flags; /* see below */ u32 ip_attr; /* inode attributes */ + /* Record unwritten extents during direct io. */ + struct list_head ip_unwritten_list; + /* protected by recovery_lock. */ struct inode *ip_next_orphan; @@ -139,6 +139,9 @@ int ocfs2_drop_inode(struct inode *inode); /* Flags for ocfs2_iget() */ #define OCFS2_FI_FLAG_SYSFILE 0x1 #define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2 +#define OCFS2_FI_FLAG_FILECHECK_CHK 0x4 +#define OCFS2_FI_FLAG_FILECHECK_FIX 0x8 + struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff); struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags, int sysfile_type); diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 3cb097ccce60..4506ec5ec2ea 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -86,7 +86,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, unsigned oldflags; int status; - mutex_lock(&inode->i_mutex); + inode_lock(inode); status = ocfs2_inode_lock(inode, &bh, 1); if (status < 0) { @@ -135,7 +135,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, bail_unlock: ocfs2_inode_unlock(inode, 1); bail: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); brelse(bh); @@ -287,7 +287,7 @@ static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, struct ocfs2_dinode *dinode_alloc = NULL; if (inode_alloc) - mutex_lock(&inode_alloc->i_mutex); + inode_lock(inode_alloc); if (o2info_coherent(&fi->ifi_req)) { status = ocfs2_inode_lock(inode_alloc, &bh, 0); @@ -317,7 +317,7 @@ bail: ocfs2_inode_unlock(inode_alloc, 0); if (inode_alloc) - mutex_unlock(&inode_alloc->i_mutex); + inode_unlock(inode_alloc); brelse(bh); @@ -547,7 +547,7 @@ static int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb, struct ocfs2_dinode *gb_dinode = NULL; if (gb_inode) - mutex_lock(&gb_inode->i_mutex); + inode_lock(gb_inode); if (o2info_coherent(&ffg->iff_req)) { status = ocfs2_inode_lock(gb_inode, &bh, 0); @@ -604,11 +604,9 @@ bail: ocfs2_inode_unlock(gb_inode, 0); if (gb_inode) - mutex_unlock(&gb_inode->i_mutex); - - if (gb_inode) - iput(gb_inode); + inode_unlock(gb_inode); + iput(gb_inode); brelse(bh); return status; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 13534f4fe5b5..e607419cdfa4 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -231,7 +231,7 @@ void ocfs2_recovery_exit(struct ocfs2_super *osb) /* At this point, we know that no more recovery threads can be * launched, so wait for any recovery completion work to * complete. */ - flush_workqueue(ocfs2_wq); + flush_workqueue(osb->ocfs2_wq); /* * Now that recovery is shut down, and the osb is about to be @@ -1042,8 +1042,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) // up_write(&journal->j_trans_barrier); done: - if (inode) - iput(inode); + iput(inode); } static void ocfs2_clear_journal_error(struct super_block *sb, @@ -1327,7 +1326,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, spin_lock(&journal->j_lock); list_add_tail(&item->lri_list, &journal->j_la_cleanups); - queue_work(ocfs2_wq, &journal->j_recovery_work); + queue_work(journal->j_osb->ocfs2_wq, &journal->j_recovery_work); spin_unlock(&journal->j_lock); } @@ -1687,9 +1686,7 @@ done: if (got_lock) ocfs2_inode_unlock(inode, 1); - if (inode) - iput(inode); - + iput(inode); brelse(bh); return status; @@ -1796,8 +1793,7 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb, ocfs2_inode_unlock(inode, 1); bail: - if (inode) - iput(inode); + iput(inode); return status; } @@ -1972,7 +1968,7 @@ static void ocfs2_orphan_scan_work(struct work_struct *work) mutex_lock(&os->os_lock); ocfs2_queue_orphan_scan(osb); if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE) - queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work, + queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work, ocfs2_orphan_scan_timeout()); mutex_unlock(&os->os_lock); } @@ -2012,7 +2008,7 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb) atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); else { atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE); - queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work, + queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work, ocfs2_orphan_scan_timeout()); } } @@ -2092,7 +2088,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb, return status; } - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0); if (status < 0) { mlog_errno(status); @@ -2110,7 +2106,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb, out_cluster: ocfs2_inode_unlock(orphan_dir_inode, 0); out: - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); return status; } @@ -2200,7 +2196,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, oi->ip_next_orphan = NULL; if (oi->ip_flags & OCFS2_INODE_DIO_ORPHAN_ENTRY) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = ocfs2_rw_lock(inode, 1); if (ret < 0) { mlog_errno(ret); @@ -2239,7 +2235,7 @@ unlock_inode: unlock_rw: ocfs2_rw_unlock(inode, 1); unlock_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* clear dio flag in ocfs2_inode_info */ oi->ip_flags &= ~OCFS2_INODE_DIO_ORPHAN_ENTRY; diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 0a4457fb0711..fe0d1f9571bb 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -358,8 +358,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) bail: if (status < 0) brelse(alloc_bh); - if (inode) - iput(inode); + iput(inode); trace_ocfs2_load_local_alloc(osb->local_alloc_bits); @@ -387,7 +386,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) struct ocfs2_dinode *alloc = NULL; cancel_delayed_work(&osb->la_enable_wq); - flush_workqueue(ocfs2_wq); + flush_workqueue(osb->ocfs2_wq); if (osb->local_alloc_state == OCFS2_LA_UNUSED) goto out; @@ -415,7 +414,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (status < 0) { @@ -469,12 +468,11 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); iput(main_bm_inode); out: - if (local_alloc_inode) - iput(local_alloc_inode); + iput(local_alloc_inode); kfree(alloc_copy); } @@ -508,7 +506,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, goto bail; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); status = ocfs2_read_inode_block_full(inode, &alloc_bh, OCFS2_BH_IGNORE_CACHE); @@ -541,7 +539,7 @@ bail: brelse(alloc_bh); if (inode) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); iput(inode); } @@ -573,7 +571,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb, goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (status < 0) { @@ -603,7 +601,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); brelse(main_bm_bh); @@ -645,7 +643,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, goto bail; } - mutex_lock(&local_alloc_inode->i_mutex); + inode_lock(local_alloc_inode); /* * We must double check state and allocator bits because @@ -711,7 +709,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, status = 0; bail: if (status < 0 && local_alloc_inode) { - mutex_unlock(&local_alloc_inode->i_mutex); + inode_unlock(local_alloc_inode); iput(local_alloc_inode); } @@ -1087,7 +1085,7 @@ static int ocfs2_recalc_la_window(struct ocfs2_super *osb, } else { osb->local_alloc_state = OCFS2_LA_DISABLED; } - queue_delayed_work(ocfs2_wq, &osb->la_enable_wq, + queue_delayed_work(osb->ocfs2_wq, &osb->la_enable_wq, OCFS2_LA_ENABLE_INTERVAL); goto out_unlock; } @@ -1327,9 +1325,7 @@ bail: brelse(main_bm_bh); - if (main_bm_inode) - iput(main_bm_inode); - + iput(main_bm_inode); kfree(alloc_copy); if (ac) diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c index 652ece4a9d9e..d56f0079b858 100644 --- a/fs/ocfs2/locks.c +++ b/fs/ocfs2/locks.c @@ -67,7 +67,10 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode, */ locks_lock_file_wait(file, - &(struct file_lock){.fl_type = F_UNLCK}); + &(struct file_lock) { + .fl_type = F_UNLCK, + .fl_flags = FL_FLOCK + }); ocfs2_file_unlock(file); } diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 9581d190f6e1..71545ad4628c 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -65,13 +65,13 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; loff_t pos = page_offset(page); - unsigned int len = PAGE_CACHE_SIZE; + unsigned int len = PAGE_SIZE; pgoff_t last_index; struct page *locked_page = NULL; void *fsdata; loff_t size = i_size_read(inode); - last_index = (size - 1) >> PAGE_CACHE_SHIFT; + last_index = (size - 1) >> PAGE_SHIFT; /* * There are cases that lead to the page no longer bebongs to the @@ -102,10 +102,10 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, * because the "write" would invalidate their data. */ if (page->index == last_index) - len = ((size - 1) & ~PAGE_CACHE_MASK) + 1; + len = ((size - 1) & ~PAGE_MASK) + 1; - ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page, - &fsdata, di_bh, page); + ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP, + &locked_page, &fsdata, di_bh, page); if (ret) { if (ret != -ENOSPC) mlog_errno(ret); @@ -147,6 +147,10 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = ocfs2_inode_lock(inode, &di_bh, 1); if (ret < 0) { mlog_errno(ret); + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else + ret = VM_FAULT_SIGBUS; goto out; } diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 124471d26a73..e3d05d9901a3 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -276,7 +276,7 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, * context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; */ - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); if (ocfs2_truncate_log_needs_flush(osb)) { ret = __ocfs2_flush_truncate_log(osb); @@ -338,7 +338,7 @@ out_commit: ocfs2_commit_trans(osb, handle); out_unlock_mutex: - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); if (context->data_ac) { ocfs2_free_alloc_context(context->data_ac); @@ -632,7 +632,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, goto out; } - mutex_lock(&gb_inode->i_mutex); + inode_lock(gb_inode); ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1); if (ret) { @@ -640,7 +640,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, goto out_unlock_gb_mutex; } - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { @@ -708,11 +708,11 @@ out_commit: brelse(gd_bh); out_unlock_tl_inode: - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); ocfs2_inode_unlock(gb_inode, 1); out_unlock_gb_mutex: - mutex_unlock(&gb_inode->i_mutex); + inode_unlock(gb_inode); brelse(gb_bh); iput(gb_inode); @@ -905,7 +905,7 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) return -EROFS; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * This prevents concurrent writes from other nodes @@ -969,7 +969,7 @@ out_inode_unlock: out_rw_unlock: ocfs2_rw_unlock(inode, 1); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return status; } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 3b48ac25d8a7..6b3e87189a64 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -367,7 +367,7 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } - status = posix_acl_create(dir, &mode, &default_acl, &acl); + status = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); if (status) { mlog_errno(status); goto leave; @@ -1045,7 +1045,7 @@ leave: if (orphan_dir) { /* This was locked for us in ocfs2_prepare_orphan_dir() */ ocfs2_inode_unlock(orphan_dir, 1); - mutex_unlock(&orphan_dir->i_mutex); + inode_unlock(orphan_dir); iput(orphan_dir); } @@ -1664,7 +1664,7 @@ bail: if (orphan_dir) { /* This was locked for us in ocfs2_prepare_orphan_dir() */ ocfs2_inode_unlock(orphan_dir, 1); - mutex_unlock(&orphan_dir->i_mutex); + inode_unlock(orphan_dir); iput(orphan_dir); } @@ -1683,8 +1683,7 @@ bail: if (new_inode) sync_mapping_buffers(old_inode->i_mapping); - if (new_inode) - iput(new_inode); + iput(new_inode); ocfs2_free_dir_lookup_result(&target_lookup_res); ocfs2_free_dir_lookup_result(&old_entry_lookup); @@ -1958,6 +1957,7 @@ static int ocfs2_symlink(struct inode *dir, inode->i_rdev = 0; newsize = l - 1; inode->i_op = &ocfs2_symlink_inode_operations; + inode_nohighmem(inode); if (l > ocfs2_fast_symlink_chars(sb)) { u32 offset = 0; @@ -2121,11 +2121,11 @@ static int ocfs2_lookup_lock_orphan_dir(struct ocfs2_super *osb, return ret; } - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); ret = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (ret < 0) { - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); mlog_errno(ret); @@ -2226,7 +2226,7 @@ out: if (ret) { ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); } @@ -2372,6 +2372,15 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno, name, strlen(name)); + status = ocfs2_journal_access_di(handle, + INODE_CACHE(orphan_dir_inode), + orphan_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + /* find it's spot in the orphan directory */ status = ocfs2_find_entry(name, strlen(name), orphan_dir_inode, &lookup); @@ -2387,15 +2396,6 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, goto leave; } - status = ocfs2_journal_access_di(handle, - INODE_CACHE(orphan_dir_inode), - orphan_dir_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto leave; - } - /* do the i_nlink dance! :) */ orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; if (S_ISDIR(inode->i_mode)) @@ -2495,7 +2495,7 @@ out: ocfs2_free_alloc_context(inode_ac); /* Unroll orphan dir locking */ - mutex_unlock(&orphan_dir->i_mutex); + inode_unlock(orphan_dir); ocfs2_inode_unlock(orphan_dir, 1); iput(orphan_dir); } @@ -2602,7 +2602,7 @@ leave: if (orphan_dir) { /* This was locked for us in ocfs2_prepare_orphan_dir() */ ocfs2_inode_unlock(orphan_dir, 1); - mutex_unlock(&orphan_dir->i_mutex); + inode_unlock(orphan_dir); iput(orphan_dir); } @@ -2689,7 +2689,7 @@ int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, bail_unlock_orphan: ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); ocfs2_free_dir_lookup_result(&orphan_insert); @@ -2721,10 +2721,10 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, goto bail; } - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (status < 0) { - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); mlog_errno(status); goto bail; @@ -2770,7 +2770,7 @@ bail_commit: bail_unlock_orphan: ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); brelse(orphan_dir_bh); iput(orphan_dir_inode); @@ -2834,12 +2834,12 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, goto leave; } - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (status < 0) { mlog_errno(status); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); goto leave; } @@ -2901,7 +2901,7 @@ out_commit: ocfs2_commit_trans(osb, handle); orphan_unlock: ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); leave: diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 7a0126267847..e63af7ddfe68 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -464,6 +464,14 @@ struct ocfs2_super struct ocfs2_refcount_tree *osb_ref_tree_lru; struct mutex system_file_mutex; + + /* + * OCFS2 needs to schedule several different types of work which + * require cluster locking, disk I/O, recovery waits, etc. Since these + * types of work tend to be heavy we avoid using the kernel events + * workqueue and schedule on our own. + */ + struct workqueue_struct *ocfs2_wq; }; #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) @@ -814,10 +822,10 @@ static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb, u32 clusters = pg_index; unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; - if (unlikely(PAGE_CACHE_SHIFT > cbits)) - clusters = pg_index << (PAGE_CACHE_SHIFT - cbits); - else if (PAGE_CACHE_SHIFT < cbits) - clusters = pg_index >> (cbits - PAGE_CACHE_SHIFT); + if (unlikely(PAGE_SHIFT > cbits)) + clusters = pg_index << (PAGE_SHIFT - cbits); + else if (PAGE_SHIFT < cbits) + clusters = pg_index >> (cbits - PAGE_SHIFT); return clusters; } @@ -831,10 +839,10 @@ static inline pgoff_t ocfs2_align_clusters_to_page_index(struct super_block *sb, unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; pgoff_t index = clusters; - if (PAGE_CACHE_SHIFT > cbits) { - index = (pgoff_t)clusters >> (PAGE_CACHE_SHIFT - cbits); - } else if (PAGE_CACHE_SHIFT < cbits) { - index = (pgoff_t)clusters << (cbits - PAGE_CACHE_SHIFT); + if (PAGE_SHIFT > cbits) { + index = (pgoff_t)clusters >> (PAGE_SHIFT - cbits); + } else if (PAGE_SHIFT < cbits) { + index = (pgoff_t)clusters << (cbits - PAGE_SHIFT); } return index; @@ -845,8 +853,8 @@ static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb) unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; unsigned int pages_per_cluster = 1; - if (PAGE_CACHE_SHIFT < cbits) - pages_per_cluster = 1 << (cbits - PAGE_CACHE_SHIFT); + if (PAGE_SHIFT < cbits) + pages_per_cluster = 1 << (cbits - PAGE_SHIFT); return pages_per_cluster; } diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index 6cb019b7c6a8..f8f5fc5e6c05 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -1450,28 +1450,20 @@ DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range); TRACE_EVENT(ocfs2_prepare_inode_for_write, TP_PROTO(unsigned long long ino, unsigned long long saved_pos, - int appending, unsigned long count, - int *direct_io, int *has_refcount), - TP_ARGS(ino, saved_pos, appending, count, direct_io, has_refcount), + unsigned long count), + TP_ARGS(ino, saved_pos, count), TP_STRUCT__entry( __field(unsigned long long, ino) __field(unsigned long long, saved_pos) - __field(int, appending) __field(unsigned long, count) - __field(int, direct_io) - __field(int, has_refcount) ), TP_fast_assign( __entry->ino = ino; __entry->saved_pos = saved_pos; - __entry->appending = appending; __entry->count = count; - __entry->direct_io = direct_io ? *direct_io : -1; - __entry->has_refcount = has_refcount ? *has_refcount : -1; ), - TP_printk("%llu %llu %d %lu %d %d", __entry->ino, - __entry->saved_pos, __entry->appending, __entry->count, - __entry->direct_io, __entry->has_refcount) + TP_printk("%llu %llu %lu", __entry->ino, + __entry->saved_pos, __entry->count) ); DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret); @@ -1540,6 +1532,8 @@ DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_read_locked_inode); DEFINE_OCFS2_INT_INT_EVENT(ocfs2_check_orphan_recovery_state); DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_inode_block); +DEFINE_OCFS2_ULL_EVENT(ocfs2_filecheck_validate_inode_block); +DEFINE_OCFS2_ULL_EVENT(ocfs2_filecheck_repair_inode_block); TRACE_EVENT(ocfs2_inode_is_valid_to_delete, TP_PROTO(void *task, void *dc_task, unsigned long long ino, @@ -2035,6 +2029,8 @@ DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_release_dquot); DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_acquire_dquot); +DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_get_next_id); + DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_mark_dquot_dirty); /* End of trace events for fs/ocfs2/quota_global.c. */ diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h index b6d51333ad02..d153e6e31529 100644 --- a/fs/ocfs2/quota.h +++ b/fs/ocfs2/quota.h @@ -82,7 +82,7 @@ struct ocfs2_quota_chunk { extern struct kmem_cache *ocfs2_dquot_cachep; extern struct kmem_cache *ocfs2_qf_chunk_cachep; -extern struct qtree_fmt_operations ocfs2_global_ops; +extern const struct qtree_fmt_operations ocfs2_global_ops; struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery( struct ocfs2_super *osb, int slot_num); diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index c93d67220887..ab6a6cdcf91c 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -123,7 +123,7 @@ static int ocfs2_global_is_id(void *dp, struct dquot *dquot) dquot->dq_id); } -struct qtree_fmt_operations ocfs2_global_ops = { +const struct qtree_fmt_operations ocfs2_global_ops = { .mem2disk_dqblk = ocfs2_global_mem2diskdqb, .disk2mem_dqblk = ocfs2_global_disk2memdqb, .is_id = ocfs2_global_is_id, @@ -308,7 +308,7 @@ int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) WARN_ON(bh != oinfo->dqi_gqi_bh); spin_unlock(&dq_data_lock); if (ex) { - mutex_lock(&oinfo->dqi_gqinode->i_mutex); + inode_lock(oinfo->dqi_gqinode); down_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); } else { down_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); @@ -320,7 +320,7 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) { if (ex) { up_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); - mutex_unlock(&oinfo->dqi_gqinode->i_mutex); + inode_unlock(oinfo->dqi_gqinode); } else { up_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); } @@ -726,7 +726,7 @@ static int ocfs2_release_dquot(struct dquot *dquot) dqgrab(dquot); /* First entry on list -> queue work */ if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list)) - queue_work(ocfs2_wq, &osb->dquot_drop_work); + queue_work(osb->ocfs2_wq, &osb->dquot_drop_work); goto out; } status = ocfs2_lock_global_qf(oinfo, 1); @@ -860,6 +860,37 @@ out: return status; } +static int ocfs2_get_next_id(struct super_block *sb, struct kqid *qid) +{ + int type = qid->type; + struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; + int status = 0; + + trace_ocfs2_get_next_id(from_kqid(&init_user_ns, *qid), type); + if (!sb_has_quota_loaded(sb, type)) { + status = -ESRCH; + goto out; + } + status = ocfs2_lock_global_qf(info, 0); + if (status < 0) + goto out; + status = ocfs2_qinfo_lock(info, 0); + if (status < 0) + goto out_global; + status = qtree_get_next_id(&info->dqi_gi, qid); + ocfs2_qinfo_unlock(info, 0); +out_global: + ocfs2_unlock_global_qf(info, 0); +out: + /* + * Avoid logging ENOENT since it just means there isn't next ID and + * ESRCH which means quota isn't enabled for the filesystem. + */ + if (status && status != -ENOENT && status != -ESRCH) + mlog_errno(status); + return status; +} + static int ocfs2_mark_dquot_dirty(struct dquot *dquot) { unsigned long mask = (1 << (DQ_LASTSET_B + QIF_ILIMITS_B)) | @@ -968,4 +999,5 @@ const struct dquot_operations ocfs2_quota_operations = { .write_info = ocfs2_write_info, .alloc_dquot = ocfs2_alloc_dquot, .destroy_dquot = ocfs2_destroy_dquot, + .get_next_id = ocfs2_get_next_id, }; diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 252119860e6c..744d5d90c363 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -807,7 +807,7 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh) mlog_errno(ret); goto out; } - mutex_lock(&alloc_inode->i_mutex); + inode_lock(alloc_inode); ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1); if (ret) { @@ -867,7 +867,7 @@ out_unlock: } out_mutex: if (alloc_inode) { - mutex_unlock(&alloc_inode->i_mutex); + inode_unlock(alloc_inode); iput(alloc_inode); } out: @@ -2937,16 +2937,16 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, end = i_size_read(inode); while (offset < end) { - page_index = offset >> PAGE_CACHE_SHIFT; - map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT; + page_index = offset >> PAGE_SHIFT; + map_end = ((loff_t)page_index + 1) << PAGE_SHIFT; if (map_end > end) map_end = end; /* from, to is the offset within the page. */ - from = offset & (PAGE_CACHE_SIZE - 1); - to = PAGE_CACHE_SIZE; - if (map_end & (PAGE_CACHE_SIZE - 1)) - to = map_end & (PAGE_CACHE_SIZE - 1); + from = offset & (PAGE_SIZE - 1); + to = PAGE_SIZE; + if (map_end & (PAGE_SIZE - 1)) + to = map_end & (PAGE_SIZE - 1); page = find_or_create_page(mapping, page_index, GFP_NOFS); if (!page) { @@ -2956,10 +2956,10 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, } /* - * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page + * In case PAGE_SIZE <= CLUSTER_SIZE, This page * can't be dirtied before we CoW it out. */ - if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) + if (PAGE_SIZE <= OCFS2_SB(sb)->s_clustersize) BUG_ON(PageDirty(page)); if (!PageUptodate(page)) { @@ -2987,7 +2987,7 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, mark_page_accessed(page); unlock: unlock_page(page); - page_cache_release(page); + put_page(page); page = NULL; offset = map_end; if (ret) @@ -3165,8 +3165,8 @@ int ocfs2_cow_sync_writeback(struct super_block *sb, } while (offset < end) { - page_index = offset >> PAGE_CACHE_SHIFT; - map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT; + page_index = offset >> PAGE_SHIFT; + map_end = ((loff_t)page_index + 1) << PAGE_SHIFT; if (map_end > end) map_end = end; @@ -3182,7 +3182,7 @@ int ocfs2_cow_sync_writeback(struct super_block *sb, mark_page_accessed(page); unlock_page(page); - page_cache_release(page); + put_page(page); page = NULL; offset = map_end; if (ret) @@ -4197,7 +4197,7 @@ static int __ocfs2_reflink(struct dentry *old_dentry, goto out; } - mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(new_inode, I_MUTEX_CHILD); ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1, OI_LS_REFLINK_TARGET); if (ret) { @@ -4231,7 +4231,7 @@ inode_unlock: ocfs2_inode_unlock(new_inode, 1); brelse(new_bh); out_unlock: - mutex_unlock(&new_inode->i_mutex); + inode_unlock(new_inode); out: if (!ret) { ret = filemap_fdatawait(inode->i_mapping); @@ -4402,11 +4402,11 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, return error; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = dquot_initialize(dir); if (!error) error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!error) fsnotify_create(dir, new_dentry); return error; diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index d5da6f624142..18451e0fab81 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -54,11 +54,12 @@ static u16 ocfs2_calc_new_backup_super(struct inode *inode, struct ocfs2_group_desc *gd, u16 cl_cpg, + u16 old_bg_clusters, int set) { int i; u16 backups = 0; - u32 cluster; + u32 cluster, lgd_cluster; u64 blkno, gd_blkno, lgd_blkno = le64_to_cpu(gd->bg_blkno); for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) { @@ -71,6 +72,12 @@ static u16 ocfs2_calc_new_backup_super(struct inode *inode, else if (gd_blkno > lgd_blkno) break; + /* check if already done backup super */ + lgd_cluster = ocfs2_blocks_to_clusters(inode->i_sb, lgd_blkno); + lgd_cluster += old_bg_clusters; + if (lgd_cluster >= cluster) + continue; + if (set) ocfs2_set_bit(cluster % cl_cpg, (unsigned long *)gd->bg_bitmap); @@ -99,6 +106,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, u16 chain, num_bits, backups = 0; u16 cl_bpc = le16_to_cpu(cl->cl_bpc); u16 cl_cpg = le16_to_cpu(cl->cl_cpg); + u16 old_bg_clusters; trace_ocfs2_update_last_group_and_inode(new_clusters, first_new_cluster); @@ -112,6 +120,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, group = (struct ocfs2_group_desc *)group_bh->b_data; + old_bg_clusters = le16_to_cpu(group->bg_bits) / cl_bpc; /* update the group first. */ num_bits = new_clusters * cl_bpc; le16_add_cpu(&group->bg_bits, num_bits); @@ -125,7 +134,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, OCFS2_FEATURE_COMPAT_BACKUP_SB)) { backups = ocfs2_calc_new_backup_super(bm_inode, group, - cl_cpg, 1); + cl_cpg, old_bg_clusters, 1); le16_add_cpu(&group->bg_free_bits_count, -1 * backups); } @@ -163,7 +172,7 @@ out_rollback: if (ret < 0) { ocfs2_calc_new_backup_super(bm_inode, group, - cl_cpg, 0); + cl_cpg, old_bg_clusters, 0); le16_add_cpu(&group->bg_free_bits_count, backups); le16_add_cpu(&group->bg_bits, -1 * num_bits); le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits); @@ -187,7 +196,7 @@ static int update_backups(struct inode * inode, u32 clusters, char *data) for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) { blkno = ocfs2_backup_super_blkno(inode->i_sb, i); cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno); - if (cluster > clusters) + if (cluster >= clusters) break; ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup); @@ -292,7 +301,7 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters) goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (ret < 0) { @@ -366,7 +375,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); iput(main_bm_inode); out: @@ -477,7 +486,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (ret < 0) { @@ -581,7 +590,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); iput(main_bm_inode); out: diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index e78a203d44c8..1e09592148ad 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -322,8 +322,7 @@ static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si) if (si == NULL) return; - if (si->si_inode) - iput(si->si_inode); + iput(si->si_inode); if (si->si_bh) { for (i = 0; i < si->si_blocks; i++) { if (si->si_bh[i]) { @@ -503,8 +502,17 @@ int ocfs2_find_slot(struct ocfs2_super *osb) trace_ocfs2_find_slot(osb->slot_num); status = ocfs2_update_disk_slot(osb, si, osb->slot_num); - if (status < 0) + if (status < 0) { mlog_errno(status); + /* + * if write block failed, invalidate slot to avoid overwrite + * slot during dismount in case another node rightly has mounted + */ + spin_lock(&osb->osb_lock); + ocfs2_invalidate_slot(si, osb->slot_num); + osb->slot_num = OCFS2_INVALID_SLOT; + spin_unlock(&osb->osb_lock); + } bail: return status; diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 5d965e83bd43..13219ed73e1d 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -629,7 +629,8 @@ static struct attribute_group ocfs2_attr_group = { .attrs = ocfs2_attrs, }; -static struct kset *ocfs2_kset; +struct kset *ocfs2_kset; +EXPORT_SYMBOL_GPL(ocfs2_kset); static void ocfs2_sysfs_exit(void) { diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index 66334a30cea8..f2dce10fae54 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -298,4 +298,6 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin); void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin); +extern struct kset *ocfs2_kset; + #endif /* STACKGLUE_H */ diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index fc6d25f6d444..2f19aeec5482 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -141,7 +141,7 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) if (ac->ac_which != OCFS2_AC_USE_LOCAL) ocfs2_inode_unlock(inode, 1); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); iput(inode); ac->ac_inode = NULL; @@ -797,11 +797,11 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, return -EINVAL; } - mutex_lock(&alloc_inode->i_mutex); + inode_lock(alloc_inode); status = ocfs2_inode_lock(alloc_inode, &bh, 1); if (status < 0) { - mutex_unlock(&alloc_inode->i_mutex); + inode_unlock(alloc_inode); iput(alloc_inode); mlog_errno(status); @@ -2875,10 +2875,10 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) goto bail; } - mutex_lock(&inode_alloc_inode->i_mutex); + inode_lock(inode_alloc_inode); status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0); if (status < 0) { - mutex_unlock(&inode_alloc_inode->i_mutex); + inode_unlock(inode_alloc_inode); iput(inode_alloc_inode); mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n", (u32)suballoc_slot, status); @@ -2891,7 +2891,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) mlog(ML_ERROR, "test suballoc bit failed %d\n", status); ocfs2_inode_unlock(inode_alloc_inode, 0); - mutex_unlock(&inode_alloc_inode->i_mutex); + inode_unlock(inode_alloc_inode); iput(inode_alloc_inode); brelse(alloc_bh); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 2de4c8a9340c..d7cae3327de5 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -74,17 +74,12 @@ #include "suballoc.h" #include "buffer_head_io.h" +#include "filecheck.h" static struct kmem_cache *ocfs2_inode_cachep; struct kmem_cache *ocfs2_dquot_cachep; struct kmem_cache *ocfs2_qf_chunk_cachep; -/* OCFS2 needs to schedule several different types of work which - * require cluster locking, disk I/O, recovery waits, etc. Since these - * types of work tend to be heavy we avoid using the kernel events - * workqueue and schedule on our own. */ -struct workqueue_struct *ocfs2_wq = NULL; - static struct dentry *ocfs2_debugfs_root; MODULE_AUTHOR("Oracle"); @@ -236,6 +231,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) struct ocfs2_recovery_map *rm = osb->recovery_map; struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan; int i, out = 0; + unsigned long flags; out += snprintf(buf + out, len - out, "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n", @@ -271,14 +267,14 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) cconn->cc_version.pv_minor); } - spin_lock(&osb->dc_task_lock); + spin_lock_irqsave(&osb->dc_task_lock, flags); out += snprintf(buf + out, len - out, "%10s => Pid: %d Count: %lu WakeSeq: %lu " "WorkSeq: %lu\n", "DownCnvt", (osb->dc_task ? task_pid_nr(osb->dc_task) : -1), osb->blocked_lock_count, osb->dc_wake_sequence, osb->dc_work_sequence); - spin_unlock(&osb->dc_task_lock); + spin_unlock_irqrestore(&osb->dc_task_lock, flags); spin_lock(&osb->osb_lock); out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:", @@ -609,8 +605,8 @@ static unsigned long long ocfs2_max_file_offset(unsigned int bbits, /* * We might be limited by page cache size. */ - if (bytes > PAGE_CACHE_SIZE) { - bytes = PAGE_CACHE_SIZE; + if (bytes > PAGE_SIZE) { + bytes = PAGE_SIZE; trim = 1; /* * Shift by 31 here so that we don't get larger than @@ -1204,6 +1200,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) /* Start this when the mount is almost sure of being successful */ ocfs2_orphan_scan_start(osb); + /* Create filecheck sysfile /sys/fs/ocfs2/<devname>/filecheck */ + ocfs2_filecheck_create_sysfs(sb); + return status; read_super_error: @@ -1280,6 +1279,8 @@ static int ocfs2_parse_options(struct super_block *sb, int status, user_stack = 0; char *p; u32 tmp; + int token, option; + substring_t args[MAX_OPT_ARGS]; trace_ocfs2_parse_options(is_remount, options ? options : "(none)"); @@ -1298,9 +1299,6 @@ static int ocfs2_parse_options(struct super_block *sb, } while ((p = strsep(&options, ",")) != NULL) { - int token, option; - substring_t args[MAX_OPT_ARGS]; - if (!*p) continue; @@ -1367,7 +1365,6 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->atime_quantum = option; break; case Opt_slot: - option = 0; if (match_int(&args[0], &option)) { status = 0; goto bail; @@ -1376,7 +1373,6 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->slot = (s16)option; break; case Opt_commit: - option = 0; if (match_int(&args[0], &option)) { status = 0; goto bail; @@ -1388,7 +1384,6 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->commit_interval = HZ * option; break; case Opt_localalloc: - option = 0; if (match_int(&args[0], &option)) { status = 0; goto bail; @@ -1612,33 +1607,25 @@ static int __init ocfs2_init(void) if (status < 0) goto out2; - ocfs2_wq = create_singlethread_workqueue("ocfs2_wq"); - if (!ocfs2_wq) { - status = -ENOMEM; - goto out3; - } - ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); if (!ocfs2_debugfs_root) { status = -ENOMEM; mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); - goto out4; + goto out3; } ocfs2_set_locking_protocol(); status = register_quota_format(&ocfs2_quota_format); if (status < 0) - goto out4; + goto out3; status = register_filesystem(&ocfs2_fs_type); if (!status) return 0; unregister_quota_format(&ocfs2_quota_format); -out4: - destroy_workqueue(ocfs2_wq); - debugfs_remove(ocfs2_debugfs_root); out3: + debugfs_remove(ocfs2_debugfs_root); ocfs2_free_mem_caches(); out2: exit_ocfs2_uptodate_cache(); @@ -1649,11 +1636,6 @@ out1: static void __exit ocfs2_exit(void) { - if (ocfs2_wq) { - flush_workqueue(ocfs2_wq); - destroy_workqueue(ocfs2_wq); - } - unregister_quota_format(&ocfs2_quota_format); debugfs_remove(ocfs2_debugfs_root); @@ -1671,6 +1653,7 @@ static void ocfs2_put_super(struct super_block *sb) ocfs2_sync_blockdev(sb); ocfs2_dismount_volume(sb, 0); + ocfs2_filecheck_remove_sysfs(sb); } static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -1726,8 +1709,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) ocfs2_inode_unlock(inode, 0); status = 0; bail: - if (inode) - iput(inode); + iput(inode); if (status) mlog_errno(status); @@ -1744,8 +1726,8 @@ static void ocfs2_inode_init_once(void *data) spin_lock_init(&oi->ip_lock); ocfs2_extent_map_init(&oi->vfs_inode); INIT_LIST_HEAD(&oi->ip_io_markers); + INIT_LIST_HEAD(&oi->ip_unwritten_list); oi->ip_dir_start_lookup = 0; - mutex_init(&oi->ip_unaligned_aio); init_rwsem(&oi->ip_alloc_sem); init_rwsem(&oi->ip_xattr_sem); mutex_init(&oi->ip_io_mutex); @@ -1771,7 +1753,7 @@ static int ocfs2_initialize_mem_caches(void) sizeof(struct ocfs2_inode_info), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), ocfs2_inode_init_once); ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache", sizeof(struct ocfs2_dquot), @@ -2348,6 +2330,12 @@ static int ocfs2_initialize_super(struct super_block *sb, } cleancache_init_shared_fs(sb); + osb->ocfs2_wq = create_singlethread_workqueue("ocfs2_wq"); + if (!osb->ocfs2_wq) { + status = -ENOMEM; + mlog_errno(status); + } + bail: return status; } @@ -2535,6 +2523,12 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) { /* This function assumes that the caller has the main osb resource */ + /* ocfs2_initializer_super have already created this workqueue */ + if (osb->ocfs2_wq) { + flush_workqueue(osb->ocfs2_wq); + destroy_workqueue(osb->ocfs2_wq); + } + ocfs2_free_slot_info(osb); kfree(osb->osb_orphan_wipes); diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h index b477d0b1c7b6..b023e4f3d740 100644 --- a/fs/ocfs2/super.h +++ b/fs/ocfs2/super.h @@ -26,8 +26,6 @@ #ifndef OCFS2_SUPER_H #define OCFS2_SUPER_H -extern struct workqueue_struct *ocfs2_wq; - int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, int node_num); diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index 66edce7ecfd7..6c2a3e3c521c 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -88,8 +88,7 @@ const struct address_space_operations ocfs2_fast_symlink_aops = { const struct inode_operations ocfs2_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, + .get_link = page_get_link, .getattr = ocfs2_getattr, .setattr = ocfs2_setattr, .setxattr = generic_setxattr, diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index ebfdea78659b..7d3d979f57d9 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -544,8 +544,7 @@ static inline const char *ocfs2_xattr_prefix(int name_index) if (name_index > 0 && name_index < OCFS2_XATTR_MAX) handler = ocfs2_xattr_handler_map[name_index]; - - return handler ? handler->prefix : NULL; + return handler ? xattr_prefix(handler) : NULL; } static u32 ocfs2_xattr_name_hash(struct inode *inode, @@ -884,14 +883,39 @@ static int ocfs2_xattr_value_truncate(struct inode *inode, return ret; } -static int ocfs2_xattr_list_entry(char *buffer, size_t size, - size_t *result, const char *prefix, +static int ocfs2_xattr_list_entry(struct super_block *sb, + char *buffer, size_t size, + size_t *result, int type, const char *name, int name_len) { char *p = buffer + *result; - int prefix_len = strlen(prefix); - int total_len = prefix_len + name_len + 1; + const char *prefix; + int prefix_len; + int total_len; + switch(type) { + case OCFS2_XATTR_INDEX_USER: + if (OCFS2_SB(sb)->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) + return 0; + break; + + case OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS: + case OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT: + if (!(sb->s_flags & MS_POSIXACL)) + return 0; + break; + + case OCFS2_XATTR_INDEX_TRUSTED: + if (!capable(CAP_SYS_ADMIN)) + return 0; + break; + } + + prefix = ocfs2_xattr_prefix(type); + if (!prefix) + return 0; + prefix_len = strlen(prefix); + total_len = prefix_len + name_len + 1; *result += total_len; /* we are just looking for how big our buffer needs to be */ @@ -914,23 +938,20 @@ static int ocfs2_xattr_list_entries(struct inode *inode, { size_t result = 0; int i, type, ret; - const char *prefix, *name; + const char *name; for (i = 0 ; i < le16_to_cpu(header->xh_count); i++) { struct ocfs2_xattr_entry *entry = &header->xh_entries[i]; type = ocfs2_xattr_get_type(entry); - prefix = ocfs2_xattr_prefix(type); - - if (prefix) { - name = (const char *)header + - le16_to_cpu(entry->xe_name_offset); + name = (const char *)header + + le16_to_cpu(entry->xe_name_offset); - ret = ocfs2_xattr_list_entry(buffer, buffer_size, - &result, prefix, name, - entry->xe_name_len); - if (ret) - return ret; - } + ret = ocfs2_xattr_list_entry(inode->i_sb, + buffer, buffer_size, + &result, type, name, + entry->xe_name_len); + if (ret) + return ret; } return result; @@ -2503,7 +2524,7 @@ static int ocfs2_xattr_free_block(struct inode *inode, mlog_errno(ret); goto out; } - mutex_lock(&xb_alloc_inode->i_mutex); + inode_lock(xb_alloc_inode); ret = ocfs2_inode_lock(xb_alloc_inode, &xb_alloc_bh, 1); if (ret < 0) { @@ -2528,7 +2549,7 @@ out_unlock: ocfs2_inode_unlock(xb_alloc_inode, 1); brelse(xb_alloc_bh); out_mutex: - mutex_unlock(&xb_alloc_inode->i_mutex); + inode_unlock(xb_alloc_inode); iput(xb_alloc_inode); out: brelse(blk_bh); @@ -3598,17 +3619,17 @@ int ocfs2_xattr_set(struct inode *inode, } } - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); if (ocfs2_truncate_log_needs_flush(osb)) { ret = __ocfs2_flush_truncate_log(osb); if (ret < 0) { - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); mlog_errno(ret); goto cleanup; } } - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis, &xbs, &ctxt, ref_meta, &credits); @@ -4033,32 +4054,30 @@ static int ocfs2_list_xattr_bucket(struct inode *inode, int ret = 0, type; struct ocfs2_xattr_tree_list *xl = (struct ocfs2_xattr_tree_list *)para; int i, block_off, new_offset; - const char *prefix, *name; + const char *name; for (i = 0 ; i < le16_to_cpu(bucket_xh(bucket)->xh_count); i++) { struct ocfs2_xattr_entry *entry = &bucket_xh(bucket)->xh_entries[i]; type = ocfs2_xattr_get_type(entry); - prefix = ocfs2_xattr_prefix(type); - if (prefix) { - ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, - bucket_xh(bucket), - i, - &block_off, - &new_offset); - if (ret) - break; + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, + bucket_xh(bucket), + i, + &block_off, + &new_offset); + if (ret) + break; - name = (const char *)bucket_block(bucket, block_off) + - new_offset; - ret = ocfs2_xattr_list_entry(xl->buffer, - xl->buffer_size, - &xl->result, - prefix, name, - entry->xe_name_len); - if (ret) - break; - } + name = (const char *)bucket_block(bucket, block_off) + + new_offset; + ret = ocfs2_xattr_list_entry(inode->i_sb, + xl->buffer, + xl->buffer_size, + &xl->result, + type, name, + entry->xe_name_len); + if (ret) + break; } return ret; @@ -5441,7 +5460,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode, return ret; } - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); if (ocfs2_truncate_log_needs_flush(osb)) { ret = __ocfs2_flush_truncate_log(osb); @@ -5485,7 +5504,7 @@ out_commit: out: ocfs2_schedule_truncate_log_flush(osb, 1); - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); if (meta_ac) ocfs2_free_alloc_context(meta_ac); @@ -7226,39 +7245,22 @@ int ocfs2_init_security_and_acl(struct inode *dir, leave: return ret; } + /* * 'security' attributes support */ -static size_t ocfs2_xattr_security_list(struct dentry *dentry, char *list, - size_t list_size, const char *name, - size_t name_len, int type) -{ - const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); - memcpy(list + prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; -} - -static int ocfs2_xattr_security_get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) +static int ocfs2_xattr_security_get(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) { - if (strcmp(name, "") == 0) - return -EINVAL; return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY, name, buffer, size); } -static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) +static int ocfs2_xattr_security_set(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { - if (strcmp(name, "") == 0) - return -EINVAL; - return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY, name, value, size, flags); } @@ -7311,7 +7313,6 @@ int ocfs2_init_security_set(handle_t *handle, const struct xattr_handler ocfs2_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, - .list = ocfs2_xattr_security_list, .get = ocfs2_xattr_security_get, .set = ocfs2_xattr_security_set, }; @@ -7319,46 +7320,24 @@ const struct xattr_handler ocfs2_xattr_security_handler = { /* * 'trusted' attributes support */ -static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list, - size_t list_size, const char *name, - size_t name_len, int type) -{ - const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - - if (!capable(CAP_SYS_ADMIN)) - return 0; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); - memcpy(list + prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; -} - -static int ocfs2_xattr_trusted_get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) +static int ocfs2_xattr_trusted_get(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) { - if (strcmp(name, "") == 0) - return -EINVAL; return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED, name, buffer, size); } -static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) +static int ocfs2_xattr_trusted_set(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { - if (strcmp(name, "") == 0) - return -EINVAL; - return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED, name, value, size, flags); } const struct xattr_handler ocfs2_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, - .list = ocfs2_xattr_trusted_list, .get = ocfs2_xattr_trusted_get, .set = ocfs2_xattr_trusted_set, }; @@ -7366,45 +7345,24 @@ const struct xattr_handler ocfs2_xattr_trusted_handler = { /* * 'user' attributes support */ -static size_t ocfs2_xattr_user_list(struct dentry *dentry, char *list, - size_t list_size, const char *name, - size_t name_len, int type) -{ - const size_t prefix_len = XATTR_USER_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); - - if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) - return 0; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_USER_PREFIX, prefix_len); - memcpy(list + prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; -} - -static int ocfs2_xattr_user_get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) +static int ocfs2_xattr_user_get(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) { struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); - if (strcmp(name, "") == 0) - return -EINVAL; if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) return -EOPNOTSUPP; return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_USER, name, buffer, size); } -static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) +static int ocfs2_xattr_user_set(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); - if (strcmp(name, "") == 0) - return -EINVAL; if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) return -EOPNOTSUPP; @@ -7414,7 +7372,6 @@ static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name, const struct xattr_handler ocfs2_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, - .list = ocfs2_xattr_user_list, .get = ocfs2_xattr_user_get, .set = ocfs2_xattr_user_set, }; |