diff options
Diffstat (limited to 'fs')
56 files changed, 609 insertions, 400 deletions
diff --git a/fs/afs/cell.c b/fs/afs/cell.c index f3d0bef16d78..6127f0fcd62c 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -514,6 +514,8 @@ static int afs_alloc_anon_key(struct afs_cell *cell) */ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell) { + struct hlist_node **p; + struct afs_cell *pcell; int ret; if (!cell->anonymous_key) { @@ -534,7 +536,18 @@ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell) return ret; mutex_lock(&net->proc_cells_lock); - list_add_tail(&cell->proc_link, &net->proc_cells); + for (p = &net->proc_cells.first; *p; p = &(*p)->next) { + pcell = hlist_entry(*p, struct afs_cell, proc_link); + if (strcmp(cell->name, pcell->name) < 0) + break; + } + + cell->proc_link.pprev = p; + cell->proc_link.next = *p; + rcu_assign_pointer(*p, &cell->proc_link.next); + if (cell->proc_link.next) + cell->proc_link.next->pprev = &cell->proc_link.next; + afs_dynroot_mkdir(net, cell); mutex_unlock(&net->proc_cells_lock); return 0; @@ -550,7 +563,7 @@ static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell) afs_proc_cell_remove(cell); mutex_lock(&net->proc_cells_lock); - list_del_init(&cell->proc_link); + hlist_del_rcu(&cell->proc_link); afs_dynroot_rmdir(net, cell); mutex_unlock(&net->proc_cells_lock); diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index 1cde710a8013..f29c6dade7f6 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -265,7 +265,7 @@ int afs_dynroot_populate(struct super_block *sb) return -ERESTARTSYS; net->dynroot_sb = sb; - list_for_each_entry(cell, &net->proc_cells, proc_link) { + hlist_for_each_entry(cell, &net->proc_cells, proc_link) { ret = afs_dynroot_mkdir(net, cell); if (ret < 0) goto error; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 871a228d7f37..34c02fdcc25f 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -242,7 +242,7 @@ struct afs_net { seqlock_t cells_lock; struct mutex proc_cells_lock; - struct list_head proc_cells; + struct hlist_head proc_cells; /* Known servers. Theoretically each fileserver can only be in one * cell, but in practice, people create aliases and subsets and there's @@ -320,7 +320,7 @@ struct afs_cell { struct afs_net *net; struct key *anonymous_key; /* anonymous user key for this cell */ struct work_struct manager; /* Manager for init/deinit/dns */ - struct list_head proc_link; /* /proc cell list link */ + struct hlist_node proc_link; /* /proc cell list link */ #ifdef CONFIG_AFS_FSCACHE struct fscache_cookie *cache; /* caching cookie */ #endif diff --git a/fs/afs/main.c b/fs/afs/main.c index e84fe822a960..107427688edd 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -87,7 +87,7 @@ static int __net_init afs_net_init(struct net *net_ns) timer_setup(&net->cells_timer, afs_cells_timer, 0); mutex_init(&net->proc_cells_lock); - INIT_LIST_HEAD(&net->proc_cells); + INIT_HLIST_HEAD(&net->proc_cells); seqlock_init(&net->fs_lock); net->fs_servers = RB_ROOT; diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 476dcbb79713..9101f62707af 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -33,9 +33,8 @@ static inline struct afs_net *afs_seq2net_single(struct seq_file *m) static int afs_proc_cells_show(struct seq_file *m, void *v) { struct afs_cell *cell = list_entry(v, struct afs_cell, proc_link); - struct afs_net *net = afs_seq2net(m); - if (v == &net->proc_cells) { + if (v == SEQ_START_TOKEN) { /* display header on line 1 */ seq_puts(m, "USE NAME\n"); return 0; @@ -50,12 +49,12 @@ static void *afs_proc_cells_start(struct seq_file *m, loff_t *_pos) __acquires(rcu) { rcu_read_lock(); - return seq_list_start_head(&afs_seq2net(m)->proc_cells, *_pos); + return seq_hlist_start_head_rcu(&afs_seq2net(m)->proc_cells, *_pos); } static void *afs_proc_cells_next(struct seq_file *m, void *v, loff_t *pos) { - return seq_list_next(v, &afs_seq2net(m)->proc_cells, pos); + return seq_hlist_next_rcu(v, &afs_seq2net(m)->proc_cells, pos); } static void afs_proc_cells_stop(struct seq_file *m, void *v) diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 35f2ae30f31f..77a83790a31f 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -690,8 +690,6 @@ static void afs_process_async_call(struct work_struct *work) } if (call->state == AFS_CALL_COMPLETE) { - call->reply[0] = NULL; - /* We have two refs to release - one from the alloc and one * queued with the work item - and we can't just deallocate the * call because the work item may be queued again. diff --git a/fs/buffer.c b/fs/buffer.c index 6f1ae3ac9789..109f55196866 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3060,11 +3060,6 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, */ bio = bio_alloc(GFP_NOIO, 1); - if (wbc) { - wbc_init_bio(wbc, bio); - wbc_account_io(wbc, bh->b_page, bh->b_size); - } - bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio_set_dev(bio, bh->b_bdev); bio->bi_write_hint = write_hint; @@ -3084,6 +3079,11 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, op_flags |= REQ_PRIO; bio_set_op_attrs(bio, op, op_flags); + if (wbc) { + wbc_init_bio(wbc, bio); + wbc_account_io(wbc, bh->b_page, bh->b_size); + } + submit_bio(bio); return 0; } diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index af2b17b21b94..95983c744164 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -343,7 +343,7 @@ try_again: trap = lock_rename(cache->graveyard, dir); /* do some checks before getting the grave dentry */ - if (rep->d_parent != dir) { + if (rep->d_parent != dir || IS_DEADDIR(d_inode(rep))) { /* the entry was probably culled when we dropped the parent dir * lock */ unlock_rename(cache->graveyard, dir); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 0c9ab62c3df4..9dcaed031843 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1553,6 +1553,7 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param, /* Flags */ #define MID_WAIT_CANCELLED 1 /* Cancelled while waiting for response */ +#define MID_DELETED 2 /* Mid has been dequeued/deleted */ /* Types of response buffer returned from SendReceive2 */ #define CIFS_NO_BUFFER 0 /* Response buffer not returned */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 7aa08dba4719..52d71b64c0c6 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -659,7 +659,15 @@ dequeue_mid(struct mid_q_entry *mid, bool malformed) mid->mid_state = MID_RESPONSE_RECEIVED; else mid->mid_state = MID_RESPONSE_MALFORMED; - list_del_init(&mid->qhead); + /* + * Trying to handle/dequeue a mid after the send_recv() + * function has finished processing it is a bug. + */ + if (mid->mid_flags & MID_DELETED) + printk_once(KERN_WARNING + "trying to dequeue a deleted mid\n"); + else + list_del_init(&mid->qhead); spin_unlock(&GlobalMid_Lock); } @@ -938,8 +946,7 @@ next_pdu: } else { mids[0] = server->ops->find_mid(server, buf); bufs[0] = buf; - if (mids[0]) - num_mids = 1; + num_mids = 1; if (!mids[0] || !mids[0]->receive) length = standard_receive3(server, mids[0]); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index d954ce36b473..89985a0a6819 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1477,7 +1477,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, } srch_inf->entries_in_buffer = 0; - srch_inf->index_of_last_entry = 0; + srch_inf->index_of_last_entry = 2; rc = SMB2_query_directory(xid, tcon, fid->persistent_fid, fid->volatile_fid, 0, srch_inf); diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 78f96fa3d7d9..b48f43963da6 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -142,7 +142,8 @@ void cifs_delete_mid(struct mid_q_entry *mid) { spin_lock(&GlobalMid_Lock); - list_del(&mid->qhead); + list_del_init(&mid->qhead); + mid->mid_flags |= MID_DELETED; spin_unlock(&GlobalMid_Lock); DeleteMidQEntry(mid); @@ -772,6 +773,11 @@ cifs_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst) return mid; } +static void +cifs_noop_callback(struct mid_q_entry *mid) +{ +} + int compound_send_recv(const unsigned int xid, struct cifs_ses *ses, const int flags, const int num_rqst, struct smb_rqst *rqst, @@ -826,8 +832,13 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, } midQ[i]->mid_state = MID_REQUEST_SUBMITTED; + /* + * We don't invoke the callback compounds unless it is the last + * request. + */ + if (i < num_rqst - 1) + midQ[i]->callback = cifs_noop_callback; } - cifs_in_send_inc(ses->server); rc = smb_send_rqst(ses->server, num_rqst, rqst, flags); cifs_in_send_dec(ses->server); @@ -908,6 +919,12 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, midQ[i]->resp_buf = NULL; } out: + /* + * This will dequeue all mids. After this it is important that the + * demultiplex_thread will not process any of these mids any futher. + * This is prevented above by using a noop callback that will not + * wake this thread except for the very last PDU. + */ for (i = 0; i < num_rqst; i++) cifs_delete_mid(midQ[i]); add_credits(ses->server, credits, optype); @@ -447,6 +447,7 @@ bool dax_lock_mapping_entry(struct page *page) xa_unlock_irq(&mapping->i_pages); break; } else if (IS_ERR(entry)) { + xa_unlock_irq(&mapping->i_pages); WARN_ON_ONCE(PTR_ERR(entry) != -EAGAIN); continue; } @@ -665,6 +666,8 @@ struct page *dax_layout_busy_page(struct address_space *mapping) while (index < end && pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) { + pgoff_t nr_pages = 1; + for (i = 0; i < pagevec_count(&pvec); i++) { struct page *pvec_ent = pvec.pages[i]; void *entry; @@ -679,8 +682,15 @@ struct page *dax_layout_busy_page(struct address_space *mapping) xa_lock_irq(&mapping->i_pages); entry = get_unlocked_mapping_entry(mapping, index, NULL); - if (entry) + if (entry) { page = dax_busy_page(entry); + /* + * Account for multi-order entries at + * the end of the pagevec. + */ + if (i + 1 >= pagevec_count(&pvec)) + nr_pages = 1UL << dax_radix_order(entry); + } put_unlocked_mapping_entry(mapping, index, entry); xa_unlock_irq(&mapping->i_pages); if (page) @@ -695,7 +705,7 @@ struct page *dax_layout_busy_page(struct address_space *mapping) */ pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); - index++; + index += nr_pages; if (page) break; @@ -1120,21 +1130,12 @@ static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry, { struct inode *inode = mapping->host; unsigned long vaddr = vmf->address; - vm_fault_t ret = VM_FAULT_NOPAGE; - struct page *zero_page; - pfn_t pfn; - - zero_page = ZERO_PAGE(0); - if (unlikely(!zero_page)) { - ret = VM_FAULT_OOM; - goto out; - } + pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr)); + vm_fault_t ret; - pfn = page_to_pfn_t(zero_page); dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE, false); ret = vmf_insert_mixed(vmf->vma, vaddr, pfn); -out: trace_dax_load_hole(inode, vmf, ret); return ret; } diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 7f7ee18fe179..e4bb9386c045 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1448,6 +1448,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) } inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); ei->i_flags = le32_to_cpu(raw_inode->i_flags); + ext2_set_inode_flags(inode); ei->i_faddr = le32_to_cpu(raw_inode->i_faddr); ei->i_frag_no = raw_inode->i_frag; ei->i_frag_size = raw_inode->i_fsize; @@ -1517,7 +1518,6 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); } brelse (bh); - ext2_set_inode_flags(inode); unlock_new_inode(inode); return inode; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index db7590178dfc..2aa62d58d8dd 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -374,13 +374,13 @@ static int io_submit_init_bio(struct ext4_io_submit *io, bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); if (!bio) return -ENOMEM; - wbc_init_bio(io->io_wbc, bio); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio_set_dev(bio, bh->b_bdev); bio->bi_end_io = ext4_end_bio; bio->bi_private = ext4_get_io_end(io->io_end); io->io_bio = bio; io->io_next_block = bh->b_blocknr; + wbc_init_bio(io->io_wbc, bio); return 0; } diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index defc2168de91..f58c0cacc531 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -682,6 +682,7 @@ int fat_count_free_clusters(struct super_block *sb) if (ops->ent_get(&fatent) == FAT_ENT_FREE) free++; } while (fat_ent_next(sbi, &fatent)); + cond_resched(); } sbi->free_clusters = free; sbi->free_clus_valid = 1; diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 83bfe04456b6..c550512ce335 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -70,20 +70,7 @@ void fscache_free_cookie(struct fscache_cookie *cookie) } /* - * initialise an cookie jar slab element prior to any use - */ -void fscache_cookie_init_once(void *_cookie) -{ - struct fscache_cookie *cookie = _cookie; - - memset(cookie, 0, sizeof(*cookie)); - spin_lock_init(&cookie->lock); - spin_lock_init(&cookie->stores_lock); - INIT_HLIST_HEAD(&cookie->backing_objects); -} - -/* - * Set the index key in a cookie. The cookie struct has space for a 12-byte + * Set the index key in a cookie. The cookie struct has space for a 16-byte * key plus length and hash, but if that's not big enough, it's instead a * pointer to a buffer containing 3 bytes of hash, 1 byte of length and then * the key data. @@ -93,20 +80,18 @@ static int fscache_set_key(struct fscache_cookie *cookie, { unsigned long long h; u32 *buf; + int bufs; int i; - cookie->key_len = index_key_len; + bufs = DIV_ROUND_UP(index_key_len, sizeof(*buf)); if (index_key_len > sizeof(cookie->inline_key)) { - buf = kzalloc(index_key_len, GFP_KERNEL); + buf = kcalloc(bufs, sizeof(*buf), GFP_KERNEL); if (!buf) return -ENOMEM; cookie->key = buf; } else { buf = (u32 *)cookie->inline_key; - buf[0] = 0; - buf[1] = 0; - buf[2] = 0; } memcpy(buf, index_key, index_key_len); @@ -116,7 +101,8 @@ static int fscache_set_key(struct fscache_cookie *cookie, */ h = (unsigned long)cookie->parent; h += index_key_len + cookie->type; - for (i = 0; i < (index_key_len + sizeof(u32) - 1) / sizeof(u32); i++) + + for (i = 0; i < bufs; i++) h += buf[i]; cookie->key_hash = h ^ (h >> 32); @@ -161,7 +147,7 @@ struct fscache_cookie *fscache_alloc_cookie( struct fscache_cookie *cookie; /* allocate and initialise a cookie */ - cookie = kmem_cache_alloc(fscache_cookie_jar, GFP_KERNEL); + cookie = kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL); if (!cookie) return NULL; @@ -192,6 +178,9 @@ struct fscache_cookie *fscache_alloc_cookie( cookie->netfs_data = netfs_data; cookie->flags = (1 << FSCACHE_COOKIE_NO_DATA_YET); cookie->type = def->type; + spin_lock_init(&cookie->lock); + spin_lock_init(&cookie->stores_lock); + INIT_HLIST_HEAD(&cookie->backing_objects); /* radix tree insertion won't use the preallocation pool unless it's * told it may not wait */ diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index f83328a7f048..d6209022e965 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -51,7 +51,6 @@ extern struct fscache_cache *fscache_select_cache_for_object( extern struct kmem_cache *fscache_cookie_jar; extern void fscache_free_cookie(struct fscache_cookie *); -extern void fscache_cookie_init_once(void *); extern struct fscache_cookie *fscache_alloc_cookie(struct fscache_cookie *, const struct fscache_cookie_def *, const void *, size_t, diff --git a/fs/fscache/main.c b/fs/fscache/main.c index 7dce110bf17d..30ad89db1efc 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -143,9 +143,7 @@ static int __init fscache_init(void) fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar", sizeof(struct fscache_cookie), - 0, - 0, - fscache_cookie_init_once); + 0, 0, NULL); if (!fscache_cookie_jar) { pr_notice("Failed to allocate a cookie jar\n"); ret = -ENOMEM; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 03128ed1f34e..84544a4f012d 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1057,7 +1057,7 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, } } release_metapath(&mp); - if (gfs2_is_jdata(ip)) + if (!gfs2_is_stuffed(ip) && gfs2_is_jdata(ip)) iomap->page_done = gfs2_iomap_journaled_page_done; return 0; diff --git a/fs/ioctl.c b/fs/ioctl.c index 3212c29235ce..2005529af560 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -230,7 +230,7 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, ret = -EXDEV; if (src_file.file->f_path.mnt != dst_file->f_path.mnt) goto fdput; - ret = do_clone_file_range(src_file.file, off, dst_file, destoff, olen); + ret = vfs_clone_file_range(src_file.file, off, dst_file, destoff, olen); fdput: fdput(src_file); return ret; diff --git a/fs/iomap.c b/fs/iomap.c index 74762b1ec233..ec15cf2ec696 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1051,6 +1051,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, } else { WARN_ON_ONCE(!PageUptodate(page)); iomap_page_create(inode, page); + set_page_dirty(page); } return length; @@ -1090,7 +1091,6 @@ int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) length -= ret; } - set_page_dirty(page); wait_for_stable_page(page); return VM_FAULT_LOCKED; out_unlock: diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 87bdf0f4cba1..902a7dd10e5c 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -285,10 +285,8 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent) sb->s_fs_info = c; ret = jffs2_parse_options(c, data); - if (ret) { - kfree(c); + if (ret) return -EINVAL; - } /* Initialize JFFS2 superblock locks, the further initialization will * be done later */ diff --git a/fs/namespace.c b/fs/namespace.c index 99186556f8d3..d86830c86ce8 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2642,6 +2642,7 @@ static long exact_copy_from_user(void *to, const void __user * from, if (!access_ok(VERIFY_READ, from, n)) return n; + current->kernel_uaccess_faults_ok++; while (n) { if (__get_user(c, f)) { memset(t, 0, n); @@ -2651,6 +2652,7 @@ static long exact_copy_from_user(void *to, const void __user * from, f++; n--; } + current->kernel_uaccess_faults_ok--; return n; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 55a099e47ba2..b53e76391e52 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -541,7 +541,8 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, __be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst, u64 dst_pos, u64 count) { - return nfserrno(do_clone_file_range(src, src_pos, dst, dst_pos, count)); + return nfserrno(vfs_clone_file_range(src, src_pos, dst, dst_pos, + count)); } ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst, diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index aaca0949fe53..826f0567ec43 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -584,9 +584,9 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, res->last_used = 0; - spin_lock(&dlm->spinlock); + spin_lock(&dlm->track_lock); list_add_tail(&res->tracking, &dlm->tracking_list); - spin_unlock(&dlm->spinlock); + spin_unlock(&dlm->track_lock); memset(res->lvb, 0, DLM_LVB_LEN); memset(res->refmap, 0, sizeof(res->refmap)); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 8e712b614e6e..933aac5da193 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -96,7 +96,9 @@ struct ocfs2_unblock_ctl { }; /* Lockdep class keys */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key lockdep_keys[OCFS2_NUM_LOCK_TYPES]; +#endif static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres, int new_level); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 7869622af22a..7a5ee145c733 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2946,6 +2946,7 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, if (map_end & (PAGE_SIZE - 1)) to = map_end & (PAGE_SIZE - 1); +retry: page = find_or_create_page(mapping, page_index, GFP_NOFS); if (!page) { ret = -ENOMEM; @@ -2954,11 +2955,18 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, } /* - * In case PAGE_SIZE <= CLUSTER_SIZE, This page - * can't be dirtied before we CoW it out. + * In case PAGE_SIZE <= CLUSTER_SIZE, we do not expect a dirty + * page, so write it back. */ - if (PAGE_SIZE <= OCFS2_SB(sb)->s_clustersize) - BUG_ON(PageDirty(page)); + if (PAGE_SIZE <= OCFS2_SB(sb)->s_clustersize) { + if (PageDirty(page)) { + /* + * write_on_page will unlock the page on return + */ + ret = write_one_page(page); + goto retry; + } + } if (!PageUptodate(page)) { ret = block_read_full_page(page, ocfs2_get_block); diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 296037afecdb..1cc797a08a5b 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -141,7 +141,7 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) } /* Try to use clone_file_range to clone up within the same fs */ - error = vfs_clone_file_range(old_file, 0, new_file, 0, len); + error = do_clone_file_range(old_file, 0, new_file, 0, len); if (!error) goto out; /* Couldn't clone, so now we try to copy the data */ diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index aeaefd2a551b..986313da0c88 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -240,8 +240,10 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) goto out_unlock; old_cred = ovl_override_creds(file_inode(file)->i_sb); + file_start_write(real.file); ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, ovl_iocb_to_rwf(iocb)); + file_end_write(real.file); revert_creds(old_cred); /* Update size */ diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index b6ac545b5a32..3b7ed5d2279c 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -504,7 +504,7 @@ static const struct inode_operations ovl_special_inode_operations = { .update_time = ovl_update_time, }; -const struct address_space_operations ovl_aops = { +static const struct address_space_operations ovl_aops = { /* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO */ .direct_IO = noop_direct_IO, }; diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index f28711846dd6..9c0ca6a7becf 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -686,7 +686,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, index = NULL; goto out; } - pr_warn_ratelimited("overlayfs: failed inode index lookup (ino=%lu, key=%*s, err=%i);\n" + pr_warn_ratelimited("overlayfs: failed inode index lookup (ino=%lu, key=%.*s, err=%i);\n" "overlayfs: mount with '-o index=off' to disable inodes index.\n", d_inode(origin)->i_ino, name.len, name.name, err); diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index f61839e1054c..a3c0d9584312 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -152,8 +152,8 @@ static inline int ovl_do_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { int err = vfs_setxattr(dentry, name, value, size, flags); - pr_debug("setxattr(%pd2, \"%s\", \"%*s\", 0x%x) = %i\n", - dentry, name, (int) size, (char *) value, flags, err); + pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, 0x%x) = %i\n", + dentry, name, min((int)size, 48), value, size, flags, err); return err; } diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 8cfb62cc8672..ace4fe4c39a9 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -683,7 +683,7 @@ static void ovl_cleanup_index(struct dentry *dentry) struct dentry *upperdentry = ovl_dentry_upper(dentry); struct dentry *index = NULL; struct inode *inode; - struct qstr name; + struct qstr name = { }; int err; err = ovl_get_index_name(lowerdentry, &name); @@ -726,6 +726,7 @@ static void ovl_cleanup_index(struct dentry *dentry) goto fail; out: + kfree(name.name); dput(index); return; diff --git a/fs/proc/base.c b/fs/proc/base.c index ccf86f16d9f0..7e9f07bf260d 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -407,6 +407,20 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, unsigned long *entries; int err; + /* + * The ability to racily run the kernel stack unwinder on a running task + * and then observe the unwinder output is scary; while it is useful for + * debugging kernel issues, it can also allow an attacker to leak kernel + * stack contents. + * Doing this in a manner that is at least safe from races would require + * some work to ensure that the remote task can not be scheduled; and + * even then, this would still expose the unwinder as local attack + * surface. + * Therefore, this interface is restricted to root. + */ + if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) + return -EACCES; + entries = kmalloc_array(MAX_STACK_TRACE_DEPTH, sizeof(*entries), GFP_KERNEL); if (!entries) diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index bbd1e357c23d..f4fd2e72add4 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -898,8 +898,22 @@ static struct platform_driver ramoops_driver = { }, }; -static void ramoops_register_dummy(void) +static inline void ramoops_unregister_dummy(void) { + platform_device_unregister(dummy); + dummy = NULL; + + kfree(dummy_data); + dummy_data = NULL; +} + +static void __init ramoops_register_dummy(void) +{ + /* + * Prepare a dummy platform data structure to carry the module + * parameters. If mem_size isn't set, then there are no module + * parameters, and we can skip this. + */ if (!mem_size) return; @@ -932,21 +946,28 @@ static void ramoops_register_dummy(void) if (IS_ERR(dummy)) { pr_info("could not create platform device: %ld\n", PTR_ERR(dummy)); + dummy = NULL; + ramoops_unregister_dummy(); } } static int __init ramoops_init(void) { + int ret; + ramoops_register_dummy(); - return platform_driver_register(&ramoops_driver); + ret = platform_driver_register(&ramoops_driver); + if (ret != 0) + ramoops_unregister_dummy(); + + return ret; } late_initcall(ramoops_init); static void __exit ramoops_exit(void) { platform_driver_unregister(&ramoops_driver); - platform_device_unregister(dummy); - kfree(dummy_data); + ramoops_unregister_dummy(); } module_exit(ramoops_exit); diff --git a/fs/read_write.c b/fs/read_write.c index 39b4a21dd933..8a2737f0d61d 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1818,8 +1818,8 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, } EXPORT_SYMBOL(vfs_clone_file_prep_inodes); -int vfs_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len) +int do_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, u64 len) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); @@ -1866,6 +1866,19 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in, return ret; } +EXPORT_SYMBOL(do_clone_file_range); + +int vfs_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, u64 len) +{ + int ret; + + file_start_write(file_out); + ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len); + file_end_write(file_out); + + return ret; +} EXPORT_SYMBOL(vfs_clone_file_range); /* diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index bf000c8aeffb..fec62e9dfbe6 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2337,8 +2337,8 @@ late_initcall(ubifs_init); static void __exit ubifs_exit(void) { - WARN_ON(list_empty(&ubifs_infos)); - WARN_ON(atomic_long_read(&ubifs_clean_zn_cnt) == 0); + WARN_ON(!list_empty(&ubifs_infos)); + WARN_ON(atomic_long_read(&ubifs_clean_zn_cnt) != 0); dbg_debugfs_exit(); ubifs_compressors_exit(); diff --git a/fs/xattr.c b/fs/xattr.c index daa732550088..0d6a6a4af861 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -948,17 +948,19 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, int err = 0; #ifdef CONFIG_FS_POSIX_ACL - if (inode->i_acl) { - err = xattr_list_one(&buffer, &remaining_size, - XATTR_NAME_POSIX_ACL_ACCESS); - if (err) - return err; - } - if (inode->i_default_acl) { - err = xattr_list_one(&buffer, &remaining_size, - XATTR_NAME_POSIX_ACL_DEFAULT); - if (err) - return err; + if (IS_POSIXACL(inode)) { + if (inode->i_acl) { + err = xattr_list_one(&buffer, &remaining_size, + XATTR_NAME_POSIX_ACL_ACCESS); + if (err) + return err; + } + if (inode->i_default_acl) { + err = xattr_list_one(&buffer, &remaining_size, + XATTR_NAME_POSIX_ACL_DEFAULT); + if (err) + return err; + } } #endif diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 1e671d4eb6fa..c6299f82a6e4 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -587,7 +587,7 @@ xfs_attr_leaf_addname( */ error = xfs_attr3_leaf_to_node(args); if (error) - goto out_defer_cancel; + return error; error = xfs_defer_finish(&args->trans); if (error) return error; @@ -675,7 +675,7 @@ xfs_attr_leaf_addname( error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (error) - goto out_defer_cancel; + return error; error = xfs_defer_finish(&args->trans); if (error) return error; @@ -693,9 +693,6 @@ xfs_attr_leaf_addname( error = xfs_attr3_leaf_clearflag(args); } return error; -out_defer_cancel: - xfs_defer_cancel(args->trans); - return error; } /* @@ -738,15 +735,12 @@ xfs_attr_leaf_removename( error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (error) - goto out_defer_cancel; + return error; error = xfs_defer_finish(&args->trans); if (error) return error; } return 0; -out_defer_cancel: - xfs_defer_cancel(args->trans); - return error; } /* @@ -864,7 +858,7 @@ restart: state = NULL; error = xfs_attr3_leaf_to_node(args); if (error) - goto out_defer_cancel; + goto out; error = xfs_defer_finish(&args->trans); if (error) goto out; @@ -888,7 +882,7 @@ restart: */ error = xfs_da3_split(state); if (error) - goto out_defer_cancel; + goto out; error = xfs_defer_finish(&args->trans); if (error) goto out; @@ -984,7 +978,7 @@ restart: if (retval && (state->path.active > 1)) { error = xfs_da3_join(state); if (error) - goto out_defer_cancel; + goto out; error = xfs_defer_finish(&args->trans); if (error) goto out; @@ -1013,9 +1007,6 @@ out: if (error) return error; return retval; -out_defer_cancel: - xfs_defer_cancel(args->trans); - goto out; } /* @@ -1107,7 +1098,7 @@ xfs_attr_node_removename( if (retval && (state->path.active > 1)) { error = xfs_da3_join(state); if (error) - goto out_defer_cancel; + goto out; error = xfs_defer_finish(&args->trans); if (error) goto out; @@ -1138,7 +1129,7 @@ xfs_attr_node_removename( error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (error) - goto out_defer_cancel; + goto out; error = xfs_defer_finish(&args->trans); if (error) goto out; @@ -1150,9 +1141,6 @@ xfs_attr_node_removename( out: xfs_da_state_free(state); return error; -out_defer_cancel: - xfs_defer_cancel(args->trans); - goto out; } /* diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index af094063e402..d89363c6b523 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -485,7 +485,7 @@ xfs_attr_rmtval_set( blkcnt, XFS_BMAPI_ATTRFORK, args->total, &map, &nmap); if (error) - goto out_defer_cancel; + return error; error = xfs_defer_finish(&args->trans); if (error) return error; @@ -553,9 +553,6 @@ xfs_attr_rmtval_set( } ASSERT(valuelen == 0); return 0; -out_defer_cancel: - xfs_defer_cancel(args->trans); - return error; } /* @@ -625,7 +622,7 @@ xfs_attr_rmtval_remove( error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, XFS_BMAPI_ATTRFORK, 1, &done); if (error) - goto out_defer_cancel; + return error; error = xfs_defer_finish(&args->trans); if (error) return error; @@ -638,7 +635,4 @@ xfs_attr_rmtval_remove( return error; } return 0; -out_defer_cancel: - xfs_defer_cancel(args->trans); - return error; } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 2760314fdf7f..a47670332326 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -673,7 +673,8 @@ xfs_bmap_extents_to_btree( ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); /* - * Make space in the inode incore. + * Make space in the inode incore. This needs to be undone if we fail + * to expand the root. */ xfs_iroot_realloc(ip, 1, whichfork); ifp->if_flags |= XFS_IFBROOT; @@ -711,16 +712,15 @@ xfs_bmap_extents_to_btree( args.minlen = args.maxlen = args.prod = 1; args.wasdel = wasdel; *logflagsp = 0; - if ((error = xfs_alloc_vextent(&args))) { - ASSERT(ifp->if_broot == NULL); - goto err1; - } + error = xfs_alloc_vextent(&args); + if (error) + goto out_root_realloc; if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) { - ASSERT(ifp->if_broot == NULL); error = -ENOSPC; - goto err1; + goto out_root_realloc; } + /* * Allocation can't fail, the space was reserved. */ @@ -732,9 +732,10 @@ xfs_bmap_extents_to_btree( xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0); if (!abp) { - error = -ENOSPC; - goto err2; + error = -EFSCORRUPTED; + goto out_unreserve_dquot; } + /* * Fill in the child block. */ @@ -775,11 +776,12 @@ xfs_bmap_extents_to_btree( *logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork); return 0; -err2: +out_unreserve_dquot: xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); -err1: +out_root_realloc: xfs_iroot_realloc(ip, -1, whichfork); XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + ASSERT(ifp->if_broot == NULL); xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return error; diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 059bc44c27e8..afbe336600e1 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1016,6 +1016,8 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) #define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */ #define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */ #define XFS_DIFLAG_FILESTREAM_BIT 14 /* use filestream allocator */ +/* Do not use bit 15, di_flags is legacy and unchanging now */ + #define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT) #define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT) #define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 30d1d60f1d46..09d9c8cfa4a0 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -415,6 +415,31 @@ xfs_dinode_verify_fork( return NULL; } +static xfs_failaddr_t +xfs_dinode_verify_forkoff( + struct xfs_dinode *dip, + struct xfs_mount *mp) +{ + if (!XFS_DFORK_Q(dip)) + return NULL; + + switch (dip->di_format) { + case XFS_DINODE_FMT_DEV: + if (dip->di_forkoff != (roundup(sizeof(xfs_dev_t), 8) >> 3)) + return __this_address; + break; + case XFS_DINODE_FMT_LOCAL: /* fall through ... */ + case XFS_DINODE_FMT_EXTENTS: /* fall through ... */ + case XFS_DINODE_FMT_BTREE: + if (dip->di_forkoff >= (XFS_LITINO(mp, dip->di_version) >> 3)) + return __this_address; + break; + default: + return __this_address; + } + return NULL; +} + xfs_failaddr_t xfs_dinode_verify( struct xfs_mount *mp, @@ -470,6 +495,11 @@ xfs_dinode_verify( if (mode && (flags & XFS_DIFLAG_REALTIME) && !mp->m_rtdev_targp) return __this_address; + /* check for illegal values of forkoff */ + fa = xfs_dinode_verify_forkoff(dip, mp); + if (fa) + return fa; + /* Do we have appropriate data fork formats for the mode? */ switch (mode & S_IFMT) { case S_IFIFO: diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index 036b5c7021eb..376bcb585ae6 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -17,7 +17,6 @@ #include "xfs_sb.h" #include "xfs_alloc.h" #include "xfs_rmap.h" -#include "xfs_alloc.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 5b3b177c0fc9..e386c9b0b4ab 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -126,6 +126,7 @@ xchk_inode_flags( { struct xfs_mount *mp = sc->mp; + /* di_flags are all taken, last bit cannot be used */ if (flags & ~XFS_DIFLAG_ANY) goto bad; @@ -172,8 +173,9 @@ xchk_inode_flags2( { struct xfs_mount *mp = sc->mp; + /* Unknown di_flags2 could be from a future kernel */ if (flags2 & ~XFS_DIFLAG2_ANY) - goto bad; + xchk_ino_set_warning(sc, ino); /* reflink flag requires reflink feature */ if ((flags2 & XFS_DIFLAG2_REFLINK) && diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index addbd74ecd8e..6de8d90041ff 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -702,13 +702,9 @@ xfs_bmap_punch_delalloc_range( struct xfs_iext_cursor icur; int error = 0; - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); - if (error) - goto out_unlock; - } + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + xfs_ilock(ip, XFS_ILOCK_EXCL); if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got)) goto out_unlock; @@ -1584,7 +1580,7 @@ xfs_swap_extent_rmap( tirec.br_blockcount, &irec, &nimaps, 0); if (error) - goto out_defer; + goto out; ASSERT(nimaps == 1); ASSERT(tirec.br_startoff == irec.br_startoff); trace_xfs_swap_extent_rmap_remap_piece(ip, &irec); @@ -1599,22 +1595,22 @@ xfs_swap_extent_rmap( /* Remove the mapping from the donor file. */ error = xfs_bmap_unmap_extent(tp, tip, &uirec); if (error) - goto out_defer; + goto out; /* Remove the mapping from the source file. */ error = xfs_bmap_unmap_extent(tp, ip, &irec); if (error) - goto out_defer; + goto out; /* Map the donor file's blocks into the source file. */ error = xfs_bmap_map_extent(tp, ip, &uirec); if (error) - goto out_defer; + goto out; /* Map the source file's blocks into the donor file. */ error = xfs_bmap_map_extent(tp, tip, &irec); if (error) - goto out_defer; + goto out; error = xfs_defer_finish(tpp); tp = *tpp; @@ -1636,8 +1632,6 @@ xfs_swap_extent_rmap( tip->i_d.di_flags2 = tip_flags2; return 0; -out_defer: - xfs_defer_cancel(tp); out: trace_xfs_swap_extent_rmap_error(ip, error, _RET_IP_); tip->i_d.di_flags2 = tip_flags2; diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 1c9d1398980b..12d8455bfbb2 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -532,6 +532,49 @@ xfs_buf_item_push( } /* + * Drop the buffer log item refcount and take appropriate action. This helper + * determines whether the bli must be freed or not, since a decrement to zero + * does not necessarily mean the bli is unused. + * + * Return true if the bli is freed, false otherwise. + */ +bool +xfs_buf_item_put( + struct xfs_buf_log_item *bip) +{ + struct xfs_log_item *lip = &bip->bli_item; + bool aborted; + bool dirty; + + /* drop the bli ref and return if it wasn't the last one */ + if (!atomic_dec_and_test(&bip->bli_refcount)) + return false; + + /* + * We dropped the last ref and must free the item if clean or aborted. + * If the bli is dirty and non-aborted, the buffer was clean in the + * transaction but still awaiting writeback from previous changes. In + * that case, the bli is freed on buffer writeback completion. + */ + aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) || + XFS_FORCED_SHUTDOWN(lip->li_mountp); + dirty = bip->bli_flags & XFS_BLI_DIRTY; + if (dirty && !aborted) + return false; + + /* + * The bli is aborted or clean. An aborted item may be in the AIL + * regardless of dirty state. For example, consider an aborted + * transaction that invalidated a dirty bli and cleared the dirty + * state. + */ + if (aborted) + xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); + xfs_buf_item_relse(bip->bli_buf); + return true; +} + +/* * Release the buffer associated with the buf log item. If there is no dirty * logged data associated with the buffer recorded in the buf log item, then * free the buf log item and remove the reference to it in the buffer. @@ -556,76 +599,42 @@ xfs_buf_item_unlock( { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; - bool aborted; - bool hold = !!(bip->bli_flags & XFS_BLI_HOLD); - bool dirty = !!(bip->bli_flags & XFS_BLI_DIRTY); + bool released; + bool hold = bip->bli_flags & XFS_BLI_HOLD; + bool stale = bip->bli_flags & XFS_BLI_STALE; #if defined(DEBUG) || defined(XFS_WARN) - bool ordered = !!(bip->bli_flags & XFS_BLI_ORDERED); + bool ordered = bip->bli_flags & XFS_BLI_ORDERED; + bool dirty = bip->bli_flags & XFS_BLI_DIRTY; #endif - aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags); - - /* Clear the buffer's association with this transaction. */ - bp->b_transp = NULL; - - /* - * The per-transaction state has been copied above so clear it from the - * bli. - */ - bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED); - - /* - * If the buf item is marked stale, then don't do anything. We'll - * unlock the buffer and free the buf item when the buffer is unpinned - * for the last time. - */ - if (bip->bli_flags & XFS_BLI_STALE) { - trace_xfs_buf_item_unlock_stale(bip); - ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); - if (!aborted) { - atomic_dec(&bip->bli_refcount); - return; - } - } - trace_xfs_buf_item_unlock(bip); /* - * If the buf item isn't tracking any data, free it, otherwise drop the - * reference we hold to it. If we are aborting the transaction, this may - * be the only reference to the buf item, so we free it anyway - * regardless of whether it is dirty or not. A dirty abort implies a - * shutdown, anyway. - * * The bli dirty state should match whether the blf has logged segments * except for ordered buffers, where only the bli should be dirty. */ ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) || (ordered && dirty && !xfs_buf_item_dirty_format(bip))); + ASSERT(!stale || (bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); /* - * Clean buffers, by definition, cannot be in the AIL. However, aborted - * buffers may be in the AIL regardless of dirty state. An aborted - * transaction that invalidates a buffer already in the AIL may have - * marked it stale and cleared the dirty state, for example. - * - * Therefore if we are aborting a buffer and we've just taken the last - * reference away, we have to check if it is in the AIL before freeing - * it. We need to free it in this case, because an aborted transaction - * has already shut the filesystem down and this is the last chance we - * will have to do so. + * Clear the buffer's association with this transaction and + * per-transaction state from the bli, which has been copied above. */ - if (atomic_dec_and_test(&bip->bli_refcount)) { - if (aborted) { - ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); - xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); - xfs_buf_item_relse(bp); - } else if (!dirty) - xfs_buf_item_relse(bp); - } + bp->b_transp = NULL; + bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED); - if (!hold) - xfs_buf_relse(bp); + /* + * Unref the item and unlock the buffer unless held or stale. Stale + * buffers remain locked until final unpin unless the bli is freed by + * the unref call. The latter implies shutdown because buffer + * invalidation dirties the bli and transaction. + */ + released = xfs_buf_item_put(bip); + if (hold || (stale && !released)) + return; + ASSERT(!stale || test_bit(XFS_LI_ABORTED, &lip->li_flags)); + xfs_buf_relse(bp); } /* diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 3f7d7b72e7e6..90f65f891fab 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -51,6 +51,7 @@ struct xfs_buf_log_item { int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); void xfs_buf_item_relse(struct xfs_buf *); +bool xfs_buf_item_put(struct xfs_buf_log_item *); void xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint); bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *); void xfs_buf_attach_iodone(struct xfs_buf *, diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index d957a46dc1cb..05db9540e459 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1563,7 +1563,7 @@ xfs_itruncate_extents_flags( error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, flags, XFS_ITRUNC_MAX_EXTENTS, &done); if (error) - goto out_bmap_cancel; + goto out; /* * Duplicate the transaction that has the permanent @@ -1599,14 +1599,6 @@ xfs_itruncate_extents_flags( out: *tpp = tp; return error; -out_bmap_cancel: - /* - * If the bunmapi call encounters an error, return to the caller where - * the transaction can be properly aborted. We just need to make sure - * we're not holding any resources that we were not when we came in. - */ - xfs_defer_cancel(tp); - goto out; } int diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index c3e74f9128e8..f48ffd7a8d3e 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -471,8 +471,18 @@ xfs_vn_get_link_inline( struct inode *inode, struct delayed_call *done) { + char *link; + ASSERT(XFS_I(inode)->i_df.if_flags & XFS_IFINLINE); - return XFS_I(inode)->i_df.if_u1.if_data; + + /* + * The VFS crashes on a NULL pointer, so return -EFSCORRUPTED if + * if_data is junk. + */ + link = XFS_I(inode)->i_df.if_u1.if_data; + if (!link) + return ERR_PTR(-EFSCORRUPTED); + return link; } STATIC int diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index a21dc61ec09e..1fc9e9042e0e 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1570,16 +1570,6 @@ xlog_find_zeroed( if (last_cycle != 0) { /* log completely written to */ xlog_put_bp(bp); return 0; - } else if (first_cycle != 1) { - /* - * If the cycle of the last block is zero, the cycle of - * the first block must be 1. If it's not, maybe we're - * not looking at a log... Bail out. - */ - xfs_warn(log->l_mp, - "Log inconsistent or not a log (last==0, first!=1)"); - error = -EINVAL; - goto bp_err; } /* we have a partially zeroed log */ diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 38f405415b88..42ea7bab9144 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -352,6 +352,47 @@ xfs_reflink_convert_cow( return error; } +/* + * Find the extent that maps the given range in the COW fork. Even if the extent + * is not shared we might have a preallocation for it in the COW fork. If so we + * use it that rather than trigger a new allocation. + */ +static int +xfs_find_trim_cow_extent( + struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, + bool *shared, + bool *found) +{ + xfs_fileoff_t offset_fsb = imap->br_startoff; + xfs_filblks_t count_fsb = imap->br_blockcount; + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec got; + bool trimmed; + + *found = false; + + /* + * If we don't find an overlapping extent, trim the range we need to + * allocate to fit the hole we found. + */ + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) || + got.br_startoff > offset_fsb) + return xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed); + + *shared = true; + if (isnullstartblock(got.br_startblock)) { + xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); + return 0; + } + + /* real extent found - no need to allocate */ + xfs_trim_extent(&got, offset_fsb, count_fsb); + *imap = got; + *found = true; + return 0; +} + /* Allocate all CoW reservations covering a range of blocks in a file. */ int xfs_reflink_allocate_cow( @@ -363,78 +404,64 @@ xfs_reflink_allocate_cow( struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = imap->br_startoff; xfs_filblks_t count_fsb = imap->br_blockcount; - struct xfs_bmbt_irec got; - struct xfs_trans *tp = NULL; + struct xfs_trans *tp; int nimaps, error = 0; - bool trimmed; + bool found; xfs_filblks_t resaligned; xfs_extlen_t resblks = 0; - struct xfs_iext_cursor icur; -retry: - ASSERT(xfs_is_reflink_inode(ip)); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(xfs_is_reflink_inode(ip)); - /* - * Even if the extent is not shared we might have a preallocation for - * it in the COW fork. If so use it. - */ - if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) && - got.br_startoff <= offset_fsb) { - *shared = true; - - /* If we have a real allocation in the COW fork we're done. */ - if (!isnullstartblock(got.br_startblock)) { - xfs_trim_extent(&got, offset_fsb, count_fsb); - *imap = got; - goto convert; - } + error = xfs_find_trim_cow_extent(ip, imap, shared, &found); + if (error || !*shared) + return error; + if (found) + goto convert; - xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); - } else { - error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed); - if (error || !*shared) - goto out; - } + resaligned = xfs_aligned_fsb_count(imap->br_startoff, + imap->br_blockcount, xfs_get_cowextsz_hint(ip)); + resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); - if (!tp) { - resaligned = xfs_aligned_fsb_count(imap->br_startoff, - imap->br_blockcount, xfs_get_cowextsz_hint(ip)); - resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); + xfs_iunlock(ip, *lockmode); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); + *lockmode = XFS_ILOCK_EXCL; + xfs_ilock(ip, *lockmode); - xfs_iunlock(ip, *lockmode); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); - *lockmode = XFS_ILOCK_EXCL; - xfs_ilock(ip, *lockmode); + if (error) + return error; - if (error) - return error; + error = xfs_qm_dqattach_locked(ip, false); + if (error) + goto out_trans_cancel; - error = xfs_qm_dqattach_locked(ip, false); - if (error) - goto out; - goto retry; + /* + * Check for an overlapping extent again now that we dropped the ilock. + */ + error = xfs_find_trim_cow_extent(ip, imap, shared, &found); + if (error || !*shared) + goto out_trans_cancel; + if (found) { + xfs_trans_cancel(tp); + goto convert; } error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, XFS_QMOPT_RES_REGBLKS); if (error) - goto out; + goto out_trans_cancel; xfs_trans_ijoin(tp, ip, 0); - nimaps = 1; - /* Allocate the entire reservation as unwritten blocks. */ + nimaps = 1; error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount, XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, resblks, imap, &nimaps); if (error) - goto out_trans_cancel; + goto out_unreserve; xfs_inode_set_cowblocks_tag(ip); - - /* Finish up. */ error = xfs_trans_commit(tp); if (error) return error; @@ -447,12 +474,12 @@ retry: return -ENOSPC; convert: return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb); -out_trans_cancel: + +out_unreserve: xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0, XFS_QMOPT_RES_REGBLKS); -out: - if (tp) - xfs_trans_cancel(tp); +out_trans_cancel: + xfs_trans_cancel(tp); return error; } @@ -666,14 +693,12 @@ xfs_reflink_end_cow( if (!del.br_blockcount) goto prev_extent; - ASSERT(!isnullstartblock(got.br_startblock)); - /* - * Don't remap unwritten extents; these are - * speculatively preallocated CoW extents that have been - * allocated but have not yet been involved in a write. + * Only remap real extent that contain data. With AIO + * speculatively preallocations can leak into the range we + * are called upon, and we need to skip them. */ - if (got.br_state == XFS_EXT_UNWRITTEN) + if (!xfs_bmap_is_real_extent(&got)) goto prev_extent; /* Unmap the old blocks in the data fork. */ @@ -1195,35 +1220,92 @@ retry: return 0; } +/* Unlock both inodes after they've been prepped for a range clone. */ +STATIC void +xfs_reflink_remap_unlock( + struct file *file_in, + struct file *file_out) +{ + struct inode *inode_in = file_inode(file_in); + struct xfs_inode *src = XFS_I(inode_in); + struct inode *inode_out = file_inode(file_out); + struct xfs_inode *dest = XFS_I(inode_out); + bool same_inode = (inode_in == inode_out); + + xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); + if (!same_inode) + xfs_iunlock(src, XFS_MMAPLOCK_SHARED); + inode_unlock(inode_out); + if (!same_inode) + inode_unlock_shared(inode_in); +} + /* - * Link a range of blocks from one file to another. + * If we're reflinking to a point past the destination file's EOF, we must + * zero any speculative post-EOF preallocations that sit between the old EOF + * and the destination file offset. */ -int -xfs_reflink_remap_range( +static int +xfs_reflink_zero_posteof( + struct xfs_inode *ip, + loff_t pos) +{ + loff_t isize = i_size_read(VFS_I(ip)); + + if (pos <= isize) + return 0; + + trace_xfs_zero_eof(ip, isize, pos - isize); + return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL, + &xfs_iomap_ops); +} + +/* + * Prepare two files for range cloning. Upon a successful return both inodes + * will have the iolock and mmaplock held, the page cache of the out file will + * be truncated, and any leases on the out file will have been broken. This + * function borrows heavily from xfs_file_aio_write_checks. + * + * The VFS allows partial EOF blocks to "match" for dedupe even though it hasn't + * checked that the bytes beyond EOF physically match. Hence we cannot use the + * EOF block in the source dedupe range because it's not a complete block match, + * hence can introduce a corruption into the file that has it's block replaced. + * + * In similar fashion, the VFS file cloning also allows partial EOF blocks to be + * "block aligned" for the purposes of cloning entire files. However, if the + * source file range includes the EOF block and it lands within the existing EOF + * of the destination file, then we can expose stale data from beyond the source + * file EOF in the destination file. + * + * XFS doesn't support partial block sharing, so in both cases we have check + * these cases ourselves. For dedupe, we can simply round the length to dedupe + * down to the previous whole block and ignore the partial EOF block. While this + * means we can't dedupe the last block of a file, this is an acceptible + * tradeoff for simplicity on implementation. + * + * For cloning, we want to share the partial EOF block if it is also the new EOF + * block of the destination file. If the partial EOF block lies inside the + * existing destination EOF, then we have to abort the clone to avoid exposing + * stale data in the destination file. Hence we reject these clone attempts with + * -EINVAL in this case. + */ +STATIC int +xfs_reflink_remap_prep( struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, - u64 len, + u64 *len, bool is_dedupe) { struct inode *inode_in = file_inode(file_in); struct xfs_inode *src = XFS_I(inode_in); struct inode *inode_out = file_inode(file_out); struct xfs_inode *dest = XFS_I(inode_out); - struct xfs_mount *mp = src->i_mount; bool same_inode = (inode_in == inode_out); - xfs_fileoff_t sfsbno, dfsbno; - xfs_filblks_t fsblen; - xfs_extlen_t cowextsize; + u64 blkmask = i_blocksize(inode_in) - 1; ssize_t ret; - if (!xfs_sb_version_hasreflink(&mp->m_sb)) - return -EOPNOTSUPP; - - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - /* Lock both files against IO */ ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out); if (ret) @@ -1245,33 +1327,115 @@ xfs_reflink_remap_range( goto out_unlock; ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out, - &len, is_dedupe); + len, is_dedupe); if (ret <= 0) goto out_unlock; + /* + * If the dedupe data matches, chop off the partial EOF block + * from the source file so we don't try to dedupe the partial + * EOF block. + */ + if (is_dedupe) { + *len &= ~blkmask; + } else if (*len & blkmask) { + /* + * The user is attempting to share a partial EOF block, + * if it's inside the destination EOF then reject it. + */ + if (pos_out + *len < i_size_read(inode_out)) { + ret = -EINVAL; + goto out_unlock; + } + } + /* Attach dquots to dest inode before changing block map */ ret = xfs_qm_dqattach(dest); if (ret) goto out_unlock; - trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); - /* - * Clear out post-eof preallocations because we don't have page cache - * backing the delayed allocations and they'll never get freed on - * their own. + * Zero existing post-eof speculative preallocations in the destination + * file. */ - if (xfs_can_free_eofblocks(dest, true)) { - ret = xfs_free_eofblocks(dest); - if (ret) - goto out_unlock; - } + ret = xfs_reflink_zero_posteof(dest, pos_out); + if (ret) + goto out_unlock; /* Set flags and remap blocks. */ ret = xfs_reflink_set_inode_flag(src, dest); if (ret) goto out_unlock; + /* Zap any page cache for the destination file's range. */ + truncate_inode_pages_range(&inode_out->i_data, pos_out, + PAGE_ALIGN(pos_out + *len) - 1); + + /* If we're altering the file contents... */ + if (!is_dedupe) { + /* + * ...update the timestamps (which will grab the ilock again + * from xfs_fs_dirty_inode, so we have to call it before we + * take the ilock). + */ + if (!(file_out->f_mode & FMODE_NOCMTIME)) { + ret = file_update_time(file_out); + if (ret) + goto out_unlock; + } + + /* + * ...clear the security bits if the process is not being run + * by root. This keeps people from modifying setuid and setgid + * binaries. + */ + ret = file_remove_privs(file_out); + if (ret) + goto out_unlock; + } + + return 1; +out_unlock: + xfs_reflink_remap_unlock(file_in, file_out); + return ret; +} + +/* + * Link a range of blocks from one file to another. + */ +int +xfs_reflink_remap_range( + struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + u64 len, + bool is_dedupe) +{ + struct inode *inode_in = file_inode(file_in); + struct xfs_inode *src = XFS_I(inode_in); + struct inode *inode_out = file_inode(file_out); + struct xfs_inode *dest = XFS_I(inode_out); + struct xfs_mount *mp = src->i_mount; + xfs_fileoff_t sfsbno, dfsbno; + xfs_filblks_t fsblen; + xfs_extlen_t cowextsize; + ssize_t ret; + + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return -EOPNOTSUPP; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + /* Prepare and then clone file data. */ + ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out, + &len, is_dedupe); + if (ret <= 0) + return ret; + + trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); + dfsbno = XFS_B_TO_FSBT(mp, pos_out); sfsbno = XFS_B_TO_FSBT(mp, pos_in); fsblen = XFS_B_TO_FSB(mp, len); @@ -1280,10 +1444,6 @@ xfs_reflink_remap_range( if (ret) goto out_unlock; - /* Zap any page cache for the destination file's range. */ - truncate_inode_pages_range(&inode_out->i_data, pos_out, - PAGE_ALIGN(pos_out + len) - 1); - /* * Carry the cowextsize hint from src to dest if we're sharing the * entire source file to the entire destination file, the source file @@ -1300,12 +1460,7 @@ xfs_reflink_remap_range( is_dedupe); out_unlock: - xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); - if (!same_inode) - xfs_iunlock(src, XFS_MMAPLOCK_SHARED); - inode_unlock(inode_out); - if (!same_inode) - inode_unlock_shared(inode_in); + xfs_reflink_remap_unlock(file_in, file_out); if (ret) trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); return ret; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index ad315e83bc02..3043e5ed6495 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -473,7 +473,6 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock); -DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push); DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index bedc5a5133a5..912b42f5fe4a 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -259,6 +259,14 @@ xfs_trans_alloc( struct xfs_trans *tp; int error; + /* + * Allocate the handle before we do our freeze accounting and setting up + * GFP_NOFS allocation context so that we avoid lockdep false positives + * by doing GFP_KERNEL allocations inside sb_start_intwrite(). + */ + tp = kmem_zone_zalloc(xfs_trans_zone, + (flags & XFS_TRANS_NOFS) ? KM_NOFS : KM_SLEEP); + if (!(flags & XFS_TRANS_NO_WRITECOUNT)) sb_start_intwrite(mp->m_super); @@ -270,8 +278,6 @@ xfs_trans_alloc( mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); atomic_inc(&mp->m_active_trans); - tp = kmem_zone_zalloc(xfs_trans_zone, - (flags & XFS_TRANS_NOFS) ? KM_NOFS : KM_SLEEP); tp->t_magic = XFS_TRANS_HEADER_MAGIC; tp->t_flags = flags; tp->t_mountp = mp; diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 15919f67a88f..286a287ac57a 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -322,49 +322,38 @@ xfs_trans_read_buf_map( } /* - * Release the buffer bp which was previously acquired with one of the - * xfs_trans_... buffer allocation routines if the buffer has not - * been modified within this transaction. If the buffer is modified - * within this transaction, do decrement the recursion count but do - * not release the buffer even if the count goes to 0. If the buffer is not - * modified within the transaction, decrement the recursion count and - * release the buffer if the recursion count goes to 0. + * Release a buffer previously joined to the transaction. If the buffer is + * modified within this transaction, decrement the recursion count but do not + * release the buffer even if the count goes to 0. If the buffer is not modified + * within the transaction, decrement the recursion count and release the buffer + * if the recursion count goes to 0. * - * If the buffer is to be released and it was not modified before - * this transaction began, then free the buf_log_item associated with it. + * If the buffer is to be released and it was not already dirty before this + * transaction began, then also free the buf_log_item associated with it. * - * If the transaction pointer is NULL, make this just a normal - * brelse() call. + * If the transaction pointer is NULL, this is a normal xfs_buf_relse() call. */ void xfs_trans_brelse( - xfs_trans_t *tp, - xfs_buf_t *bp) + struct xfs_trans *tp, + struct xfs_buf *bp) { - struct xfs_buf_log_item *bip; - int freed; + struct xfs_buf_log_item *bip = bp->b_log_item; - /* - * Default to a normal brelse() call if the tp is NULL. - */ - if (tp == NULL) { - ASSERT(bp->b_transp == NULL); + ASSERT(bp->b_transp == tp); + + if (!tp) { xfs_buf_relse(bp); return; } - ASSERT(bp->b_transp == tp); - bip = bp->b_log_item; + trace_xfs_trans_brelse(bip); ASSERT(bip->bli_item.li_type == XFS_LI_BUF); - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); - trace_xfs_trans_brelse(bip); - /* - * If the release is just for a recursive lock, - * then decrement the count and return. + * If the release is for a recursive lookup, then decrement the count + * and return. */ if (bip->bli_recur > 0) { bip->bli_recur--; @@ -372,64 +361,24 @@ xfs_trans_brelse( } /* - * If the buffer is dirty within this transaction, we can't + * If the buffer is invalidated or dirty in this transaction, we can't * release it until we commit. */ if (test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags)) return; - - /* - * If the buffer has been invalidated, then we can't release - * it until the transaction commits to disk unless it is re-dirtied - * as part of this transaction. This prevents us from pulling - * the item from the AIL before we should. - */ if (bip->bli_flags & XFS_BLI_STALE) return; - ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); - /* - * Free up the log item descriptor tracking the released item. + * Unlink the log item from the transaction and clear the hold flag, if + * set. We wouldn't want the next user of the buffer to get confused. */ + ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); xfs_trans_del_item(&bip->bli_item); + bip->bli_flags &= ~XFS_BLI_HOLD; - /* - * Clear the hold flag in the buf log item if it is set. - * We wouldn't want the next user of the buffer to - * get confused. - */ - if (bip->bli_flags & XFS_BLI_HOLD) { - bip->bli_flags &= ~XFS_BLI_HOLD; - } - - /* - * Drop our reference to the buf log item. - */ - freed = atomic_dec_and_test(&bip->bli_refcount); - - /* - * If the buf item is not tracking data in the log, then we must free it - * before releasing the buffer back to the free pool. - * - * If the fs has shutdown and we dropped the last reference, it may fall - * on us to release a (possibly dirty) bli if it never made it to the - * AIL (e.g., the aborted unpin already happened and didn't release it - * due to our reference). Since we're already shutdown and need - * ail_lock, just force remove from the AIL and release the bli here. - */ - if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) { - xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR); - xfs_buf_item_relse(bp); - } else if (!(bip->bli_flags & XFS_BLI_DIRTY)) { -/*** - ASSERT(bp->b_pincount == 0); -***/ - ASSERT(atomic_read(&bip->bli_refcount) == 0); - ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)); - ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF)); - xfs_buf_item_relse(bp); - } + /* drop the reference to the bli */ + xfs_buf_item_put(bip); bp->b_transp = NULL; xfs_buf_relse(bp); |