Diffstat (limited to 'fs')
322 files changed, 14531 insertions, 8255 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 3576123d8299..6ecf863bfa2f 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -625,7 +625,7 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma) inode = file_inode(vma->vm_file); - if (!mapping_cap_writeback_dirty(inode->i_mapping)) + if (!mapping_can_writeback(inode->i_mapping)) wbc.nr_to_write = 0; might_sleep(); diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 74df32be4c6a..e34fa20acf61 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -80,8 +80,10 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, if (ret) return ret; - if (v9ses->cache) - sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; + if (!v9ses->cache) { + sb->s_bdi->ra_pages = 0; + sb->s_bdi->io_pages = 0; + } sb->s_flags |= SB_ACTIVE | SB_DIRSYNC; if (!v9ses->cache) diff --git a/fs/Makefile b/fs/Makefile index 1c7b0e3f6daa..7bb2a05fda1f 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -13,7 +13,8 @@ obj-y := open.o read_write.o file_table.o super.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o splice.o sync.o utimes.o d_path.o \ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ - fs_types.o fs_context.o fs_parser.o fsopen.o init.o + fs_types.o fs_context.o fs_parser.o fsopen.o init.o \ + kernel_read_file.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o block_dev.o direct-io.o mpage.o @@ -37,7 +38,6 @@ obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FS_ENCRYPTION) += crypto/ obj-$(CONFIG_FS_VERITY) += verity/ obj-$(CONFIG_FILE_LOCKING) += locks.o -obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 5b79cdceefa0..52233fa6195f 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -18,8 +18,10 @@ static unsigned __read_mostly afs_cell_gc_delay = 10; static unsigned __read_mostly afs_cell_min_ttl = 10 * 60; static unsigned __read_mostly afs_cell_max_ttl = 24 * 60 * 60; +static atomic_t cell_debug_id; -static void afs_manage_cell(struct work_struct *); +static void afs_queue_cell_manager(struct afs_net *); +static void afs_manage_cell_work(struct work_struct *); static void afs_dec_cells_outstanding(struct afs_net *net) { @@ -37,19 +39,22 @@ static void afs_set_cell_timer(struct afs_net *net, time64_t delay) atomic_inc(&net->cells_outstanding); if (timer_reduce(&net->cells_timer, jiffies + delay * HZ)) afs_dec_cells_outstanding(net); + } else { + afs_queue_cell_manager(net); } } /* - * Look up and get an activation reference on a cell record under RCU - * conditions. The caller must hold the RCU read lock. + * Look up and get an activation reference on a cell record. The caller must + * hold net->cells_lock at least read-locked. */ -struct afs_cell *afs_lookup_cell_rcu(struct afs_net *net, - const char *name, unsigned int namesz) +static struct afs_cell *afs_find_cell_locked(struct afs_net *net, + const char *name, unsigned int namesz, + enum afs_cell_trace reason) { struct afs_cell *cell = NULL; struct rb_node *p; - int n, seq = 0, ret = 0; + int n; _enter("%*.*s", namesz, namesz, name); @@ -58,61 +63,48 @@ struct afs_cell *afs_lookup_cell_rcu(struct afs_net *net, if (namesz > AFS_MAXCELLNAME) return ERR_PTR(-ENAMETOOLONG); - do { - /* Unfortunately, rbtree walking doesn't give reliable results - * under just the RCU read lock, so we have to check for - * changes. 
- */ - if (cell) - afs_put_cell(net, cell); - cell = NULL; - ret = -ENOENT; - - read_seqbegin_or_lock(&net->cells_lock, &seq); - - if (!name) { - cell = rcu_dereference_raw(net->ws_cell); - if (cell) { - afs_get_cell(cell); - ret = 0; - break; - } - ret = -EDESTADDRREQ; - continue; - } + if (!name) { + cell = net->ws_cell; + if (!cell) + return ERR_PTR(-EDESTADDRREQ); + goto found; + } - p = rcu_dereference_raw(net->cells.rb_node); - while (p) { - cell = rb_entry(p, struct afs_cell, net_node); - - n = strncasecmp(cell->name, name, - min_t(size_t, cell->name_len, namesz)); - if (n == 0) - n = cell->name_len - namesz; - if (n < 0) { - p = rcu_dereference_raw(p->rb_left); - } else if (n > 0) { - p = rcu_dereference_raw(p->rb_right); - } else { - if (atomic_inc_not_zero(&cell->usage)) { - ret = 0; - break; - } - /* We want to repeat the search, this time with - * the lock properly locked. - */ - } - cell = NULL; - } + p = net->cells.rb_node; + while (p) { + cell = rb_entry(p, struct afs_cell, net_node); - } while (need_seqretry(&net->cells_lock, seq)); + n = strncasecmp(cell->name, name, + min_t(size_t, cell->name_len, namesz)); + if (n == 0) + n = cell->name_len - namesz; + if (n < 0) + p = p->rb_left; + else if (n > 0) + p = p->rb_right; + else + goto found; + } - done_seqretry(&net->cells_lock, seq); + return ERR_PTR(-ENOENT); - if (ret != 0 && cell) - afs_put_cell(net, cell); +found: + return afs_use_cell(cell, reason); +} - return ret == 0 ? cell : ERR_PTR(ret); +/* + * Look up and get an activation reference on a cell record. + */ +struct afs_cell *afs_find_cell(struct afs_net *net, + const char *name, unsigned int namesz, + enum afs_cell_trace reason) +{ + struct afs_cell *cell; + + down_read(&net->cells_lock); + cell = afs_find_cell_locked(net, name, namesz, reason); + up_read(&net->cells_lock); + return cell; } /* @@ -166,8 +158,9 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, cell->name[i] = tolower(name[i]); cell->name[i] = 0; - atomic_set(&cell->usage, 2); - INIT_WORK(&cell->manager, afs_manage_cell); + atomic_set(&cell->ref, 1); + atomic_set(&cell->active, 0); + INIT_WORK(&cell->manager, afs_manage_cell_work); cell->volumes = RB_ROOT; INIT_HLIST_HEAD(&cell->proc_volumes); seqlock_init(&cell->volume_lock); @@ -206,6 +199,9 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, cell->dns_source = vllist->source; cell->dns_status = vllist->status; smp_store_release(&cell->dns_lookup_count, 1); /* vs source/status */ + atomic_inc(&net->cells_outstanding); + cell->debug_id = atomic_inc_return(&cell_debug_id); + trace_afs_cell(cell->debug_id, 1, 0, afs_cell_trace_alloc); _leave(" = %p", cell); return cell; @@ -245,9 +241,7 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net, _enter("%s,%s", name, vllist); if (!excl) { - rcu_read_lock(); - cell = afs_lookup_cell_rcu(net, name, namesz); - rcu_read_unlock(); + cell = afs_find_cell(net, name, namesz, afs_cell_trace_use_lookup); if (!IS_ERR(cell)) goto wait_for_cell; } @@ -268,7 +262,7 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net, /* Find the insertion point and check to see if someone else added a * cell whilst we were allocating. 
*/ - write_seqlock(&net->cells_lock); + down_write(&net->cells_lock); pp = &net->cells.rb_node; parent = NULL; @@ -290,23 +284,26 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net, cell = candidate; candidate = NULL; + atomic_set(&cell->active, 2); + trace_afs_cell(cell->debug_id, atomic_read(&cell->ref), 2, afs_cell_trace_insert); rb_link_node_rcu(&cell->net_node, parent, pp); rb_insert_color(&cell->net_node, &net->cells); - atomic_inc(&net->cells_outstanding); - write_sequnlock(&net->cells_lock); + up_write(&net->cells_lock); - queue_work(afs_wq, &cell->manager); + afs_queue_cell(cell, afs_cell_trace_get_queue_new); wait_for_cell: + trace_afs_cell(cell->debug_id, atomic_read(&cell->ref), atomic_read(&cell->active), + afs_cell_trace_wait); _debug("wait_for_cell"); wait_var_event(&cell->state, ({ state = smp_load_acquire(&cell->state); /* vs error */ - state == AFS_CELL_ACTIVE || state == AFS_CELL_FAILED; + state == AFS_CELL_ACTIVE || state == AFS_CELL_REMOVED; })); /* Check the state obtained from the wait check. */ - if (state == AFS_CELL_FAILED) { + if (state == AFS_CELL_REMOVED) { ret = cell->error; goto error; } @@ -320,16 +317,17 @@ cell_already_exists: if (excl) { ret = -EEXIST; } else { - afs_get_cell(cursor); + afs_use_cell(cursor, afs_cell_trace_use_lookup); ret = 0; } - write_sequnlock(&net->cells_lock); - kfree(candidate); + up_write(&net->cells_lock); + if (candidate) + afs_put_cell(candidate, afs_cell_trace_put_candidate); if (ret == 0) goto wait_for_cell; goto error_noput; error: - afs_put_cell(net, cell); + afs_unuse_cell(net, cell, afs_cell_trace_unuse_lookup); error_noput: _leave(" = %d [error]", ret); return ERR_PTR(ret); @@ -374,15 +372,16 @@ int afs_cell_init(struct afs_net *net, const char *rootcell) } if (!test_and_set_bit(AFS_CELL_FL_NO_GC, &new_root->flags)) - afs_get_cell(new_root); + afs_use_cell(new_root, afs_cell_trace_use_pin); /* install the new cell */ - write_seqlock(&net->cells_lock); - old_root = rcu_access_pointer(net->ws_cell); - rcu_assign_pointer(net->ws_cell, new_root); - write_sequnlock(&net->cells_lock); + down_write(&net->cells_lock); + afs_see_cell(new_root, afs_cell_trace_see_ws); + old_root = net->ws_cell; + net->ws_cell = new_root; + up_write(&net->cells_lock); - afs_put_cell(net, old_root); + afs_unuse_cell(net, old_root, afs_cell_trace_unuse_ws); _leave(" = 0"); return 0; } @@ -488,18 +487,22 @@ out_wake: static void afs_cell_destroy(struct rcu_head *rcu) { struct afs_cell *cell = container_of(rcu, struct afs_cell, rcu); + struct afs_net *net = cell->net; + int u; _enter("%p{%s}", cell, cell->name); - ASSERTCMP(atomic_read(&cell->usage), ==, 0); + u = atomic_read(&cell->ref); + ASSERTCMP(u, ==, 0); + trace_afs_cell(cell->debug_id, u, atomic_read(&cell->active), afs_cell_trace_free); - afs_put_volume(cell->net, cell->root_volume, afs_volume_trace_put_cell_root); - afs_put_vlserverlist(cell->net, rcu_access_pointer(cell->vl_servers)); - afs_put_cell(cell->net, cell->alias_of); + afs_put_vlserverlist(net, rcu_access_pointer(cell->vl_servers)); + afs_unuse_cell(net, cell->alias_of, afs_cell_trace_unuse_alias); key_put(cell->anonymous_key); kfree(cell->name); kfree(cell); + afs_dec_cells_outstanding(net); _leave(" [destroyed]"); } @@ -532,18 +535,63 @@ void afs_cells_timer(struct timer_list *timer) /* * Get a reference on a cell record. 
*/ -struct afs_cell *afs_get_cell(struct afs_cell *cell) +struct afs_cell *afs_get_cell(struct afs_cell *cell, enum afs_cell_trace reason) { - atomic_inc(&cell->usage); + int u; + + if (atomic_read(&cell->ref) <= 0) + BUG(); + + u = atomic_inc_return(&cell->ref); + trace_afs_cell(cell->debug_id, u, atomic_read(&cell->active), reason); return cell; } /* * Drop a reference on a cell record. */ -void afs_put_cell(struct afs_net *net, struct afs_cell *cell) +void afs_put_cell(struct afs_cell *cell, enum afs_cell_trace reason) { + if (cell) { + unsigned int debug_id = cell->debug_id; + unsigned int u, a; + + a = atomic_read(&cell->active); + u = atomic_dec_return(&cell->ref); + trace_afs_cell(debug_id, u, a, reason); + if (u == 0) { + a = atomic_read(&cell->active); + WARN(a != 0, "Cell active count %u > 0\n", a); + call_rcu(&cell->rcu, afs_cell_destroy); + } + } +} + +/* + * Note a cell becoming more active. + */ +struct afs_cell *afs_use_cell(struct afs_cell *cell, enum afs_cell_trace reason) +{ + int u, a; + + if (atomic_read(&cell->ref) <= 0) + BUG(); + + u = atomic_read(&cell->ref); + a = atomic_inc_return(&cell->active); + trace_afs_cell(cell->debug_id, u, a, reason); + return cell; +} + +/* + * Record a cell becoming less active. When the active counter reaches 1, it + * is scheduled for destruction, but may get reactivated. + */ +void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_trace reason) +{ + unsigned int debug_id = cell->debug_id; time64_t now, expire_delay; + int u, a; if (!cell) return; @@ -556,11 +604,35 @@ void afs_put_cell(struct afs_net *net, struct afs_cell *cell) if (cell->vl_servers->nr_servers) expire_delay = afs_cell_gc_delay; - if (atomic_dec_return(&cell->usage) > 1) - return; + u = atomic_read(&cell->ref); + a = atomic_dec_return(&cell->active); + trace_afs_cell(debug_id, u, a, reason); + WARN_ON(a == 0); + if (a == 1) + /* 'cell' may now be garbage collected. */ + afs_set_cell_timer(net, expire_delay); +} + +/* + * Note that a cell has been seen. + */ +void afs_see_cell(struct afs_cell *cell, enum afs_cell_trace reason) +{ + int u, a; + + u = atomic_read(&cell->ref); + a = atomic_read(&cell->active); + trace_afs_cell(cell->debug_id, u, a, reason); +} - /* 'cell' may now be garbage collected. */ - afs_set_cell_timer(net, expire_delay); +/* + * Queue a cell for management, giving the workqueue a ref to hold. + */ +void afs_queue_cell(struct afs_cell *cell, enum afs_cell_trace reason) +{ + afs_get_cell(cell, reason); + if (!queue_work(afs_wq, &cell->manager)) + afs_put_cell(cell, afs_cell_trace_put_queue_fail); } /* @@ -660,12 +732,10 @@ static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell) * Manage a cell record, initialising and destroying it, maintaining its DNS * records. 
*/ -static void afs_manage_cell(struct work_struct *work) +static void afs_manage_cell(struct afs_cell *cell) { - struct afs_cell *cell = container_of(work, struct afs_cell, manager); struct afs_net *net = cell->net; - bool deleted; - int ret, usage; + int ret, active; _enter("%s", cell->name); @@ -674,14 +744,19 @@ again: switch (cell->state) { case AFS_CELL_INACTIVE: case AFS_CELL_FAILED: - write_seqlock(&net->cells_lock); - usage = 1; - deleted = atomic_try_cmpxchg_relaxed(&cell->usage, &usage, 0); - if (deleted) + down_write(&net->cells_lock); + active = 1; + if (atomic_try_cmpxchg_relaxed(&cell->active, &active, 0)) { rb_erase(&cell->net_node, &net->cells); - write_sequnlock(&net->cells_lock); - if (deleted) + trace_afs_cell(cell->debug_id, atomic_read(&cell->ref), 0, + afs_cell_trace_unuse_delete); + smp_store_release(&cell->state, AFS_CELL_REMOVED); + } + up_write(&net->cells_lock); + if (cell->state == AFS_CELL_REMOVED) { + wake_up_var(&cell->state); goto final_destruction; + } if (cell->state == AFS_CELL_FAILED) goto done; smp_store_release(&cell->state, AFS_CELL_UNSET); @@ -703,7 +778,7 @@ again: goto again; case AFS_CELL_ACTIVE: - if (atomic_read(&cell->usage) > 1) { + if (atomic_read(&cell->active) > 1) { if (test_and_clear_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags)) { ret = afs_update_cell(cell); if (ret < 0) @@ -716,13 +791,16 @@ again: goto again; case AFS_CELL_DEACTIVATING: - if (atomic_read(&cell->usage) > 1) + if (atomic_read(&cell->active) > 1) goto reverse_deactivation; afs_deactivate_cell(net, cell); smp_store_release(&cell->state, AFS_CELL_INACTIVE); wake_up_var(&cell->state); goto again; + case AFS_CELL_REMOVED: + goto done; + default: break; } @@ -748,9 +826,18 @@ done: return; final_destruction: - call_rcu(&cell->rcu, afs_cell_destroy); - afs_dec_cells_outstanding(net); - _leave(" [destruct %d]", atomic_read(&net->cells_outstanding)); + /* The root volume is pinning the cell */ + afs_put_volume(cell->net, cell->root_volume, afs_volume_trace_put_cell_root); + cell->root_volume = NULL; + afs_put_cell(cell, afs_cell_trace_put_destroy); +} + +static void afs_manage_cell_work(struct work_struct *work) +{ + struct afs_cell *cell = container_of(work, struct afs_cell, manager); + + afs_manage_cell(cell); + afs_put_cell(cell, afs_cell_trace_put_queue_work); } /* @@ -779,26 +866,29 @@ void afs_manage_cells(struct work_struct *work) * lack of use and cells whose DNS results have expired and dispatch * their managers. 
*/ - read_seqlock_excl(&net->cells_lock); + down_read(&net->cells_lock); for (cursor = rb_first(&net->cells); cursor; cursor = rb_next(cursor)) { struct afs_cell *cell = rb_entry(cursor, struct afs_cell, net_node); - unsigned usage; + unsigned active; bool sched_cell = false; - usage = atomic_read(&cell->usage); - _debug("manage %s %u", cell->name, usage); + active = atomic_read(&cell->active); + trace_afs_cell(cell->debug_id, atomic_read(&cell->ref), + active, afs_cell_trace_manage); - ASSERTCMP(usage, >=, 1); + ASSERTCMP(active, >=, 1); if (purging) { - if (test_and_clear_bit(AFS_CELL_FL_NO_GC, &cell->flags)) - usage = atomic_dec_return(&cell->usage); - ASSERTCMP(usage, ==, 1); + if (test_and_clear_bit(AFS_CELL_FL_NO_GC, &cell->flags)) { + active = atomic_dec_return(&cell->active); + trace_afs_cell(cell->debug_id, atomic_read(&cell->ref), + active, afs_cell_trace_unuse_pin); + } } - if (usage == 1) { + if (active == 1) { struct afs_vlserver_list *vllist; time64_t expire_at = cell->last_inactive; @@ -821,10 +911,10 @@ void afs_manage_cells(struct work_struct *work) } if (sched_cell) - queue_work(afs_wq, &cell->manager); + afs_queue_cell(cell, afs_cell_trace_get_queue_manage); } - read_sequnlock_excl(&net->cells_lock); + up_read(&net->cells_lock); /* Update the timer on the way out. We have to pass an increment on * cells_outstanding in the namespace that we are in to the timer or @@ -854,11 +944,11 @@ void afs_cell_purge(struct afs_net *net) _enter(""); - write_seqlock(&net->cells_lock); - ws = rcu_access_pointer(net->ws_cell); - RCU_INIT_POINTER(net->ws_cell, NULL); - write_sequnlock(&net->cells_lock); - afs_put_cell(net, ws); + down_write(&net->cells_lock); + ws = net->ws_cell; + net->ws_cell = NULL; + up_write(&net->cells_lock); + afs_unuse_cell(net, ws, afs_cell_trace_unuse_ws); _debug("del timer"); if (del_timer_sync(&net->cells_timer)) diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index 7b784af604fd..db832cc931c8 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -123,9 +123,9 @@ static int afs_probe_cell_name(struct dentry *dentry) len--; } - cell = afs_lookup_cell_rcu(net, name, len); + cell = afs_find_cell(net, name, len, afs_cell_trace_use_probe); if (!IS_ERR(cell)) { - afs_put_cell(net, cell); + afs_unuse_cell(net, cell, afs_cell_trace_unuse_probe); return 0; } @@ -179,7 +179,6 @@ static struct dentry *afs_lookup_atcell(struct dentry *dentry) struct afs_cell *cell; struct afs_net *net = afs_d2net(dentry); struct dentry *ret; - unsigned int seq = 0; char *name; int len; @@ -191,17 +190,13 @@ static struct dentry *afs_lookup_atcell(struct dentry *dentry) if (!name) goto out_p; - rcu_read_lock(); - do { - read_seqbegin_or_lock(&net->cells_lock, &seq); - cell = rcu_dereference_raw(net->ws_cell); - if (cell) { - len = cell->name_len; - memcpy(name, cell->name, len + 1); - } - } while (need_seqretry(&net->cells_lock, seq)); - done_seqretry(&net->cells_lock, seq); - rcu_read_unlock(); + down_read(&net->cells_lock); + cell = net->ws_cell; + if (cell) { + len = cell->name_len; + memcpy(name, cell->name, len + 1); + } + up_read(&net->cells_lock); ret = ERR_PTR(-ENOENT); if (!cell) diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 1d13d2e882ad..0fe8844b4bee 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -810,14 +810,32 @@ void afs_evict_inode(struct inode *inode) static void afs_setattr_success(struct afs_operation *op) { - struct inode *inode = &op->file[0].vnode->vfs_inode; + struct afs_vnode_param *vp = &op->file[0]; + struct inode *inode = &vp->vnode->vfs_inode; + loff_t 
old_i_size = i_size_read(inode); + + op->setattr.old_i_size = old_i_size; + afs_vnode_commit_status(op, vp); + /* inode->i_size has now been changed. */ + + if (op->setattr.attr->ia_valid & ATTR_SIZE) { + loff_t size = op->setattr.attr->ia_size; + if (size > old_i_size) + pagecache_isize_extended(inode, old_i_size, size); + } +} + +static void afs_setattr_edit_file(struct afs_operation *op) +{ + struct afs_vnode_param *vp = &op->file[0]; + struct inode *inode = &vp->vnode->vfs_inode; - afs_vnode_commit_status(op, &op->file[0]); if (op->setattr.attr->ia_valid & ATTR_SIZE) { - loff_t i_size = inode->i_size, size = op->setattr.attr->ia_size; - if (size > i_size) - pagecache_isize_extended(inode, i_size, size); - truncate_pagecache(inode, size); + loff_t size = op->setattr.attr->ia_size; + loff_t i_size = op->setattr.old_i_size; + + if (size < i_size) + truncate_pagecache(inode, size); } } @@ -825,6 +843,7 @@ static const struct afs_operation_ops afs_setattr_operation = { .issue_afs_rpc = afs_fs_setattr, .issue_yfs_rpc = yfs_fs_setattr, .success = afs_setattr_success, + .edit_dir = afs_setattr_edit_file, }; /* @@ -863,11 +882,16 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr) if (S_ISREG(vnode->vfs_inode.i_mode)) filemap_write_and_wait(vnode->vfs_inode.i_mapping); + /* Prevent any new writebacks from starting whilst we do this. */ + down_write(&vnode->validate_lock); + op = afs_alloc_operation(((attr->ia_valid & ATTR_FILE) ? afs_file_key(attr->ia_file) : NULL), vnode->volume); - if (IS_ERR(op)) - return PTR_ERR(op); + if (IS_ERR(op)) { + ret = PTR_ERR(op); + goto out_unlock; + } afs_op_set_vnode(op, 0, vnode); op->setattr.attr = attr; @@ -880,5 +904,10 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr) op->file[0].update_ctime = 1; op->ops = &afs_setattr_operation; - return afs_do_sync_operation(op); + ret = afs_do_sync_operation(op); + +out_unlock: + up_write(&vnode->validate_lock); + _leave(" = %d", ret); + return ret; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 18042b7dab6a..81b0485fd22a 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -263,11 +263,11 @@ struct afs_net { /* Cell database */ struct rb_root cells; - struct afs_cell __rcu *ws_cell; + struct afs_cell *ws_cell; struct work_struct cells_manager; struct timer_list cells_timer; atomic_t cells_outstanding; - seqlock_t cells_lock; + struct rw_semaphore cells_lock; struct mutex cells_alias_lock; struct mutex proc_cells_lock; @@ -326,6 +326,7 @@ enum afs_cell_state { AFS_CELL_DEACTIVATING, AFS_CELL_INACTIVE, AFS_CELL_FAILED, + AFS_CELL_REMOVED, }; /* @@ -363,7 +364,8 @@ struct afs_cell { #endif time64_t dns_expiry; /* Time AFSDB/SRV record expires */ time64_t last_inactive; /* Time of last drop of usage count */ - atomic_t usage; + atomic_t ref; /* Struct refcount */ + atomic_t active; /* Active usage counter */ unsigned long flags; #define AFS_CELL_FL_NO_GC 0 /* The cell was added manually, don't auto-gc */ #define AFS_CELL_FL_DO_LOOKUP 1 /* DNS lookup requested */ @@ -373,6 +375,7 @@ struct afs_cell { enum dns_record_source dns_source:8; /* Latest source of data from lookup */ enum dns_lookup_status dns_status:8; /* Latest status of data from lookup */ unsigned int dns_lookup_count; /* Counter of DNS lookups */ + unsigned int debug_id; /* The volumes belonging to this cell */ struct rb_root volumes; /* Tree of volumes on this server */ @@ -812,6 +815,7 @@ struct afs_operation { } store; struct { struct iattr *attr; + loff_t old_i_size; } setattr; struct afs_acl *acl; struct yfs_acl 
*yacl; @@ -916,11 +920,16 @@ static inline bool afs_cb_is_broken(unsigned int cb_break, * cell.c */ extern int afs_cell_init(struct afs_net *, const char *); -extern struct afs_cell *afs_lookup_cell_rcu(struct afs_net *, const char *, unsigned); +extern struct afs_cell *afs_find_cell(struct afs_net *, const char *, unsigned, + enum afs_cell_trace); extern struct afs_cell *afs_lookup_cell(struct afs_net *, const char *, unsigned, const char *, bool); -extern struct afs_cell *afs_get_cell(struct afs_cell *); -extern void afs_put_cell(struct afs_net *, struct afs_cell *); +extern struct afs_cell *afs_use_cell(struct afs_cell *, enum afs_cell_trace); +extern void afs_unuse_cell(struct afs_net *, struct afs_cell *, enum afs_cell_trace); +extern struct afs_cell *afs_get_cell(struct afs_cell *, enum afs_cell_trace); +extern void afs_see_cell(struct afs_cell *, enum afs_cell_trace); +extern void afs_put_cell(struct afs_cell *, enum afs_cell_trace); +extern void afs_queue_cell(struct afs_cell *, enum afs_cell_trace); extern void afs_manage_cells(struct work_struct *); extern void afs_cells_timer(struct timer_list *); extern void __net_exit afs_cell_purge(struct afs_net *); diff --git a/fs/afs/main.c b/fs/afs/main.c index 31b472f7c734..accdd8970e7c 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -78,7 +78,7 @@ static int __net_init afs_net_init(struct net *net_ns) mutex_init(&net->socket_mutex); net->cells = RB_ROOT; - seqlock_init(&net->cells_lock); + init_rwsem(&net->cells_lock); INIT_WORK(&net->cells_manager, afs_manage_cells); timer_setup(&net->cells_timer, afs_cells_timer, 0); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 79bc5f1338ed..052dab2f5c03 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -88,7 +88,7 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) ctx->force = true; } if (ctx->cell) { - afs_put_cell(ctx->net, ctx->cell); + afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_mntpt); ctx->cell = NULL; } if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) { @@ -124,7 +124,7 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) char *buf; if (src_as->cell) - ctx->cell = afs_get_cell(src_as->cell); + ctx->cell = afs_use_cell(src_as->cell, afs_cell_trace_use_mntpt); if (size < 2 || size > PAGE_SIZE - 1) return -EINVAL; diff --git a/fs/afs/proc.c b/fs/afs/proc.c index e8babb62ed44..065a28bfa3f1 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -38,7 +38,7 @@ static int afs_proc_cells_show(struct seq_file *m, void *v) if (v == SEQ_START_TOKEN) { /* display header on line 1 */ - seq_puts(m, "USE TTL SV ST NAME\n"); + seq_puts(m, "USE ACT TTL SV ST NAME\n"); return 0; } @@ -46,10 +46,11 @@ static int afs_proc_cells_show(struct seq_file *m, void *v) vllist = rcu_dereference(cell->vl_servers); /* display one cell per line on subsequent lines */ - seq_printf(m, "%3u %6lld %2u %2u %s\n", - atomic_read(&cell->usage), + seq_printf(m, "%3u %3u %6lld %2u %2u %s\n", + atomic_read(&cell->ref), + atomic_read(&cell->active), cell->dns_expiry - ktime_get_real_seconds(), - vllist->nr_servers, + vllist ? 
vllist->nr_servers : 0, cell->state, cell->name); return 0; @@ -128,7 +129,7 @@ static int afs_proc_cells_write(struct file *file, char *buf, size_t size) } if (test_and_set_bit(AFS_CELL_FL_NO_GC, &cell->flags)) - afs_put_cell(net, cell); + afs_unuse_cell(net, cell, afs_cell_trace_unuse_no_pin); } else { goto inval; } @@ -154,13 +155,11 @@ static int afs_proc_rootcell_show(struct seq_file *m, void *v) struct afs_net *net; net = afs_seq2net_single(m); - if (rcu_access_pointer(net->ws_cell)) { - rcu_read_lock(); - cell = rcu_dereference(net->ws_cell); - if (cell) - seq_printf(m, "%s\n", cell->name); - rcu_read_unlock(); - } + down_read(&net->cells_lock); + cell = net->ws_cell; + if (cell) + seq_printf(m, "%s\n", cell->name); + up_read(&net->cells_lock); return 0; } diff --git a/fs/afs/server.c b/fs/afs/server.c index e82e452e2612..684a2b02b9ff 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -550,7 +550,12 @@ void afs_manage_servers(struct work_struct *work) _debug("manage %pU %u", &server->uuid, active); - ASSERTIFCMP(purging, active, ==, 0); + if (purging) { + trace_afs_server(server, atomic_read(&server->ref), + active, afs_server_trace_purging); + if (active != 0) + pr_notice("Can't purge s=%08x\n", server->debug_id); + } if (active == 0) { time64_t expire_at = server->unuse_time; diff --git a/fs/afs/super.c b/fs/afs/super.c index b552357b1d13..6c5900df6aa5 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -294,7 +294,8 @@ static int afs_parse_source(struct fs_context *fc, struct fs_parameter *param) cellnamesz, cellnamesz, cellname ?: ""); return PTR_ERR(cell); } - afs_put_cell(ctx->net, ctx->cell); + afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_parse); + afs_see_cell(cell, afs_cell_trace_see_source); ctx->cell = cell; } @@ -389,8 +390,9 @@ static int afs_validate_fc(struct fs_context *fc) _debug("switch to alias"); key_put(ctx->key); ctx->key = NULL; - cell = afs_get_cell(ctx->cell->alias_of); - afs_put_cell(ctx->net, ctx->cell); + cell = afs_use_cell(ctx->cell->alias_of, + afs_cell_trace_use_fc_alias); + afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_fc); ctx->cell = cell; goto reget_key; } @@ -456,7 +458,6 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx) ret = super_setup_bdi(sb); if (ret) return ret; - sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; /* allocate the root inode and dentry */ if (as->dyn_root) { @@ -508,7 +509,7 @@ static struct afs_super_info *afs_alloc_sbi(struct fs_context *fc) if (ctx->dyn_root) { as->dyn_root = true; } else { - as->cell = afs_get_cell(ctx->cell); + as->cell = afs_use_cell(ctx->cell, afs_cell_trace_use_sbi); as->volume = afs_get_volume(ctx->volume, afs_volume_trace_get_alloc_sbi); } @@ -521,7 +522,7 @@ static void afs_destroy_sbi(struct afs_super_info *as) if (as) { struct afs_net *net = afs_net(as->net_ns); afs_put_volume(net, as->volume, afs_volume_trace_put_destroy_sbi); - afs_put_cell(net, as->cell); + afs_unuse_cell(net, as->cell, afs_cell_trace_unuse_sbi); put_net(as->net_ns); kfree(as); } @@ -607,7 +608,7 @@ static void afs_free_fc(struct fs_context *fc) afs_destroy_sbi(fc->s_fs_info); afs_put_volume(ctx->net, ctx->volume, afs_volume_trace_put_free_fc); - afs_put_cell(ctx->net, ctx->cell); + afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_fc); key_put(ctx->key); kfree(ctx); } @@ -634,9 +635,7 @@ static int afs_init_fs_context(struct fs_context *fc) ctx->net = afs_net(fc->net_ns); /* Default to the workstation cell. 
*/ - rcu_read_lock(); - cell = afs_lookup_cell_rcu(ctx->net, NULL, 0); - rcu_read_unlock(); + cell = afs_find_cell(ctx->net, NULL, 0, afs_cell_trace_use_fc); if (IS_ERR(cell)) cell = NULL; ctx->cell = cell; diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c index 5082ef04e99c..f04a80e4f5c3 100644 --- a/fs/afs/vl_alias.c +++ b/fs/afs/vl_alias.c @@ -177,7 +177,7 @@ static int afs_compare_cell_roots(struct afs_cell *cell) is_alias: rcu_read_unlock(); - cell->alias_of = afs_get_cell(p); + cell->alias_of = afs_use_cell(p, afs_cell_trace_use_alias); return 1; } @@ -247,18 +247,18 @@ static int afs_query_for_alias(struct afs_cell *cell, struct key *key) continue; if (p->root_volume) continue; /* Ignore cells that have a root.cell volume. */ - afs_get_cell(p); + afs_use_cell(p, afs_cell_trace_use_check_alias); mutex_unlock(&cell->net->proc_cells_lock); if (afs_query_for_alias_one(cell, key, p) != 0) goto is_alias; if (mutex_lock_interruptible(&cell->net->proc_cells_lock) < 0) { - afs_put_cell(cell->net, p); + afs_unuse_cell(cell->net, p, afs_cell_trace_unuse_check_alias); return -ERESTARTSYS; } - afs_put_cell(cell->net, p); + afs_unuse_cell(cell->net, p, afs_cell_trace_unuse_check_alias); } mutex_unlock(&cell->net->proc_cells_lock); diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c index c0458c903b31..488e58490b16 100644 --- a/fs/afs/vl_rotate.c +++ b/fs/afs/vl_rotate.c @@ -45,7 +45,7 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc) cell->dns_expiry <= ktime_get_real_seconds()) { dns_lookup_count = smp_load_acquire(&cell->dns_lookup_count); set_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags); - queue_work(afs_wq, &cell->manager); + afs_queue_cell(cell, afs_cell_trace_get_queue_dns); if (cell->dns_source == DNS_RECORD_UNAVAILABLE) { if (wait_var_event_interruptible( diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 9bc0509e3634..f84194b791d3 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -83,7 +83,7 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params, volume->vid = vldb->vid[params->type]; volume->update_at = ktime_get_real_seconds() + afs_volume_record_life; - volume->cell = afs_get_cell(params->cell); + volume->cell = afs_get_cell(params->cell, afs_cell_trace_get_vol); volume->type = params->type; volume->type_force = params->force; volume->name_len = vldb->name_len; @@ -106,7 +106,7 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params, return volume; error_1: - afs_put_cell(params->net, volume->cell); + afs_put_cell(volume->cell, afs_cell_trace_put_vol); kfree(volume); error_0: return ERR_PTR(ret); @@ -228,7 +228,7 @@ static void afs_destroy_volume(struct afs_net *net, struct afs_volume *volume) afs_remove_volume_from_cell(volume); afs_put_serverlist(net, rcu_access_pointer(volume->servers)); - afs_put_cell(net, volume->cell); + afs_put_cell(volume->cell, afs_cell_trace_put_vol); trace_afs_volume(volume->vid, atomic_read(&volume->usage), afs_volume_trace_free); kfree_rcu(volume, rcu); diff --git a/fs/afs/write.c b/fs/afs/write.c index 4b2265cb1891..da12abd6db21 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -738,11 +738,21 @@ static int afs_writepages_region(struct address_space *mapping, int afs_writepages(struct address_space *mapping, struct writeback_control *wbc) { + struct afs_vnode *vnode = AFS_FS_I(mapping->host); pgoff_t start, end, next; int ret; _enter(""); + /* We have to be careful as we can end up racing with setattr() + * truncating the pagecache since the caller doesn't take a lock here + * to prevent it. 
+ */ + if (wbc->sync_mode == WB_SYNC_ALL) + down_read(&vnode->validate_lock); + else if (!down_read_trylock(&vnode->validate_lock)) + return 0; + if (wbc->range_cyclic) { start = mapping->writeback_index; end = -1; @@ -762,6 +772,7 @@ int afs_writepages(struct address_space *mapping, ret = afs_writepages_region(mapping, wbc, start, end, &next); } + up_read(&vnode->validate_lock); _leave(" = %d", ret); return ret; } @@ -1489,12 +1489,8 @@ static ssize_t aio_setup_rw(int rw, const struct iocb *iocb, *iovec = NULL; return ret; } -#ifdef CONFIG_COMPAT - if (compat) - return compat_import_iovec(rw, buf, len, UIO_FASTIOV, iovec, - iter); -#endif - return import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter); + + return __import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter, compat); } static inline void aio_rw_done(struct kiocb *req, ssize_t ret) diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c index 75105f45c51a..322b7dfb4ea0 100644 --- a/fs/autofs/dev-ioctl.c +++ b/fs/autofs/dev-ioctl.c @@ -8,6 +8,7 @@ #include <linux/compat.h> #include <linux/syscalls.h> #include <linux/magic.h> +#include <linux/nospec.h> #include "autofs_i.h" @@ -563,7 +564,7 @@ out: static ioctl_fn lookup_dev_ioctl(unsigned int cmd) { - static ioctl_fn _ioctls[] = { + static const ioctl_fn _ioctls[] = { autofs_dev_ioctl_version, autofs_dev_ioctl_protover, autofs_dev_ioctl_protosubver, @@ -581,7 +582,10 @@ static ioctl_fn lookup_dev_ioctl(unsigned int cmd) }; unsigned int idx = cmd_idx(cmd); - return (idx >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[idx]; + if (idx >= ARRAY_SIZE(_ioctls)) + return NULL; + idx = array_index_nospec(idx, ARRAY_SIZE(_ioctls)); + return _ioctls[idx]; } /* ioctl dispatcher */ diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c index 74c886f7c51c..5ced859dac53 100644 --- a/fs/autofs/waitq.c +++ b/fs/autofs/waitq.c @@ -53,7 +53,7 @@ static int autofs_write(struct autofs_sb_info *sbi, mutex_lock(&sbi->pipe_mutex); while (bytes) { - wr = kernel_write(file, data, bytes, &file->f_pos); + wr = __kernel_write(file, data, bytes, NULL); if (wr <= 0) break; data += wr; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 13d053982dd7..b6b3d052ca86 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -13,6 +13,7 @@ #include <linux/module.h> #include <linux/kernel.h> #include <linux/fs.h> +#include <linux/log2.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/errno.h> @@ -309,7 +310,10 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, * Grow the stack manually; some architectures have a limit on how * far ahead a user-space access may be in order to grow the stack. 
*/ + if (mmap_read_lock_killable(mm)) + return -EINTR; vma = find_extend_vma(mm, bprm->p); + mmap_read_unlock(mm); if (!vma) return -EFAULT; @@ -421,6 +425,26 @@ static int elf_read(struct file *file, void *buf, size_t len, loff_t pos) return 0; } +static unsigned long maximum_alignment(struct elf_phdr *cmds, int nr) +{ + unsigned long alignment = 0; + int i; + + for (i = 0; i < nr; i++) { + if (cmds[i].p_type == PT_LOAD) { + unsigned long p_align = cmds[i].p_align; + + /* skip non-power of two alignments as invalid */ + if (!is_power_of_2(p_align)) + continue; + alignment = max(alignment, p_align); + } + } + + /* ensure we align to at least one page */ + return ELF_PAGEALIGN(alignment); +} + /** * load_elf_phdrs() - load ELF program headers * @elf_ex: ELF header of the binary whose program headers should be loaded @@ -1008,6 +1032,7 @@ out_free_interp: int elf_prot, elf_flags; unsigned long k, vaddr; unsigned long total_size = 0; + unsigned long alignment; if (elf_ppnt->p_type != PT_LOAD) continue; @@ -1086,6 +1111,9 @@ out_free_interp: load_bias = ELF_ET_DYN_BASE; if (current->flags & PF_RANDOMIZE) load_bias += arch_mmap_rnd(); + alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); + if (alignment) + load_bias &= ~(alignment - 1); elf_flags |= MAP_FIXED; } else load_bias = 0; @@ -1389,126 +1417,6 @@ out: * Jeremy Fitzhardinge <jeremy@sw.oz.au> */ -/* - * The purpose of always_dump_vma() is to make sure that special kernel mappings - * that are useful for post-mortem analysis are included in every core dump. - * In that way we ensure that the core dump is fully interpretable later - * without matching up the same kernel and hardware config to see what PC values - * meant. These special mappings include - vDSO, vsyscall, and other - * architecture specific mappings - */ -static bool always_dump_vma(struct vm_area_struct *vma) -{ - /* Any vsyscall mappings? */ - if (vma == get_gate_vma(vma->vm_mm)) - return true; - - /* - * Assume that all vmas with a .name op should always be dumped. - * If this changes, a new vm_ops field can easily be added. - */ - if (vma->vm_ops && vma->vm_ops->name && vma->vm_ops->name(vma)) - return true; - - /* - * arch_vma_name() returns non-NULL for special architecture mappings, - * such as vDSO sections. - */ - if (arch_vma_name(vma)) - return true; - - return false; -} - -/* - * Decide what to dump of a segment, part, all or none. - */ -static unsigned long vma_dump_size(struct vm_area_struct *vma, - unsigned long mm_flags) -{ -#define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type)) - - /* always dump the vdso and vsyscall sections */ - if (always_dump_vma(vma)) - goto whole; - - if (vma->vm_flags & VM_DONTDUMP) - return 0; - - /* support for DAX */ - if (vma_is_dax(vma)) { - if ((vma->vm_flags & VM_SHARED) && FILTER(DAX_SHARED)) - goto whole; - if (!(vma->vm_flags & VM_SHARED) && FILTER(DAX_PRIVATE)) - goto whole; - return 0; - } - - /* Hugetlb memory check */ - if (is_vm_hugetlb_page(vma)) { - if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED)) - goto whole; - if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE)) - goto whole; - return 0; - } - - /* Do not dump I/O mapped devices or special mappings */ - if (vma->vm_flags & VM_IO) - return 0; - - /* By default, dump shared memory if mapped from an anonymous file. */ - if (vma->vm_flags & VM_SHARED) { - if (file_inode(vma->vm_file)->i_nlink == 0 ? - FILTER(ANON_SHARED) : FILTER(MAPPED_SHARED)) - goto whole; - return 0; - } - - /* Dump segments that have been written to. 
*/ - if (vma->anon_vma && FILTER(ANON_PRIVATE)) - goto whole; - if (vma->vm_file == NULL) - return 0; - - if (FILTER(MAPPED_PRIVATE)) - goto whole; - - /* - * If this looks like the beginning of a DSO or executable mapping, - * check for an ELF header. If we find one, dump the first page to - * aid in determining what was mapped here. - */ - if (FILTER(ELF_HEADERS) && - vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) { - u32 __user *header = (u32 __user *) vma->vm_start; - u32 word; - /* - * Doing it this way gets the constant folded by GCC. - */ - union { - u32 cmp; - char elfmag[SELFMAG]; - } magic; - BUILD_BUG_ON(SELFMAG != sizeof word); - magic.elfmag[EI_MAG0] = ELFMAG0; - magic.elfmag[EI_MAG1] = ELFMAG1; - magic.elfmag[EI_MAG2] = ELFMAG2; - magic.elfmag[EI_MAG3] = ELFMAG3; - if (unlikely(get_user(word, header))) - word = 0; - if (word == magic.cmp) - return PAGE_SIZE; - } - -#undef FILTER - - return 0; - -whole: - return vma->vm_end - vma->vm_start; -} - /* An ELF note in memory */ struct memelfnote { @@ -2220,32 +2128,6 @@ static void free_note_info(struct elf_note_info *info) #endif -static struct vm_area_struct *first_vma(struct task_struct *tsk, - struct vm_area_struct *gate_vma) -{ - struct vm_area_struct *ret = tsk->mm->mmap; - - if (ret) - return ret; - return gate_vma; -} -/* - * Helper function for iterating across a vma list. It ensures that the caller - * will visit `gate_vma' prior to terminating the search. - */ -static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, - struct vm_area_struct *gate_vma) -{ - struct vm_area_struct *ret; - - ret = this_vma->vm_next; - if (ret) - return ret; - if (this_vma == gate_vma) - return NULL; - return gate_vma; -} - static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, elf_addr_t e_shoff, int segs) { @@ -2272,9 +2154,8 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, static int elf_core_dump(struct coredump_params *cprm) { int has_dumped = 0; - int segs, i; - size_t vma_data_size = 0; - struct vm_area_struct *vma, *gate_vma; + int vma_count, segs, i; + size_t vma_data_size; struct elfhdr elf; loff_t offset = 0, dataoff; struct elf_note_info info = { }; @@ -2282,30 +2163,16 @@ static int elf_core_dump(struct coredump_params *cprm) struct elf_shdr *shdr4extnum = NULL; Elf_Half e_phnum; elf_addr_t e_shoff; - elf_addr_t *vma_filesz = NULL; + struct core_vma_metadata *vma_meta; + + if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size)) + return 0; - /* - * We no longer stop all VM operations. - * - * This is because those proceses that could possibly change map_count - * or the mmap / vma pages are now blocked in do_exit on current - * finishing this core dump. - * - * Only ptrace can touch these memory addresses, but it doesn't change - * the map_count or the pages allocated. So no possibility of crashing - * exists while dumping the mm->vm_next areas to the core file. - */ - /* * The number of segs are recored into ELF header as 16bit value. * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here. */ - segs = current->mm->map_count; - segs += elf_core_extra_phdrs(); - - gate_vma = get_gate_vma(current->mm); - if (gate_vma != NULL) - segs++; + segs = vma_count + elf_core_extra_phdrs(); /* for notes section */ segs++; @@ -2343,24 +2210,6 @@ static int elf_core_dump(struct coredump_params *cprm) dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - /* - * Zero vma process will get ZERO_SIZE_PTR here. 
- * Let coredump continue for register state at least. - */ - vma_filesz = kvmalloc(array_size(sizeof(*vma_filesz), (segs - 1)), - GFP_KERNEL); - if (!vma_filesz) - goto end_coredump; - - for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; - vma = next_vma(vma, gate_vma)) { - unsigned long dump_size; - - dump_size = vma_dump_size(vma, cprm->mm_flags); - vma_filesz[i++] = dump_size; - vma_data_size += dump_size; - } - offset += vma_data_size; offset += elf_core_extra_data_size(); e_shoff = offset; @@ -2381,21 +2230,23 @@ static int elf_core_dump(struct coredump_params *cprm) goto end_coredump; /* Write program headers for segments dump */ - for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; - vma = next_vma(vma, gate_vma)) { + for (i = 0; i < vma_count; i++) { + struct core_vma_metadata *meta = vma_meta + i; struct elf_phdr phdr; phdr.p_type = PT_LOAD; phdr.p_offset = offset; - phdr.p_vaddr = vma->vm_start; + phdr.p_vaddr = meta->start; phdr.p_paddr = 0; - phdr.p_filesz = vma_filesz[i++]; - phdr.p_memsz = vma->vm_end - vma->vm_start; + phdr.p_filesz = meta->dump_size; + phdr.p_memsz = meta->end - meta->start; offset += phdr.p_filesz; - phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0; - if (vma->vm_flags & VM_WRITE) + phdr.p_flags = 0; + if (meta->flags & VM_READ) + phdr.p_flags |= PF_R; + if (meta->flags & VM_WRITE) phdr.p_flags |= PF_W; - if (vma->vm_flags & VM_EXEC) + if (meta->flags & VM_EXEC) phdr.p_flags |= PF_X; phdr.p_align = ELF_EXEC_PAGESIZE; @@ -2417,28 +2268,11 @@ static int elf_core_dump(struct coredump_params *cprm) if (!dump_skip(cprm, dataoff - cprm->pos)) goto end_coredump; - for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; - vma = next_vma(vma, gate_vma)) { - unsigned long addr; - unsigned long end; + for (i = 0; i < vma_count; i++) { + struct core_vma_metadata *meta = vma_meta + i; - end = vma->vm_start + vma_filesz[i++]; - - for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) { - struct page *page; - int stop; - - page = get_dump_page(addr); - if (page) { - void *kaddr = kmap(page); - stop = !dump_emit(cprm, kaddr, PAGE_SIZE); - kunmap(page); - put_page(page); - } else - stop = !dump_skip(cprm, PAGE_SIZE); - if (stop) - goto end_coredump; - } + if (!dump_user_range(cprm, meta->start, meta->dump_size)) + goto end_coredump; } dump_truncate(cprm); @@ -2453,7 +2287,7 @@ static int elf_core_dump(struct coredump_params *cprm) end_coredump: free_note_info(&info); kfree(shdr4extnum); - kvfree(vma_filesz); + kvfree(vma_meta); kfree(phdr4note); return has_dumped; } diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 50f845702b92..be4062b8ba75 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1215,76 +1215,6 @@ struct elf_prstatus_fdpic int pr_fpvalid; /* True if math co-processor being used. */ }; -/* - * Decide whether a segment is worth dumping; default is yes to be - * sure (missing info is worse than too much; etc). - * Personally I'd include everything, and use the coredump limit... - * - * I think we should skip something. But I am not sure how. H.J. - */ -static int maydump(struct vm_area_struct *vma, unsigned long mm_flags) -{ - int dump_ok; - - /* Do not dump I/O mapped devices or special mappings */ - if (vma->vm_flags & VM_IO) { - kdcore("%08lx: %08lx: no (IO)", vma->vm_start, vma->vm_flags); - return 0; - } - - /* If we may not read the contents, don't allow us to dump - * them either. "dump_write()" can't handle it anyway. 
- */ - if (!(vma->vm_flags & VM_READ)) { - kdcore("%08lx: %08lx: no (!read)", vma->vm_start, vma->vm_flags); - return 0; - } - - /* support for DAX */ - if (vma_is_dax(vma)) { - if (vma->vm_flags & VM_SHARED) { - dump_ok = test_bit(MMF_DUMP_DAX_SHARED, &mm_flags); - kdcore("%08lx: %08lx: %s (DAX shared)", vma->vm_start, - vma->vm_flags, dump_ok ? "yes" : "no"); - } else { - dump_ok = test_bit(MMF_DUMP_DAX_PRIVATE, &mm_flags); - kdcore("%08lx: %08lx: %s (DAX private)", vma->vm_start, - vma->vm_flags, dump_ok ? "yes" : "no"); - } - return dump_ok; - } - - /* By default, dump shared memory if mapped from an anonymous file. */ - if (vma->vm_flags & VM_SHARED) { - if (file_inode(vma->vm_file)->i_nlink == 0) { - dump_ok = test_bit(MMF_DUMP_ANON_SHARED, &mm_flags); - kdcore("%08lx: %08lx: %s (share)", vma->vm_start, - vma->vm_flags, dump_ok ? "yes" : "no"); - return dump_ok; - } - - dump_ok = test_bit(MMF_DUMP_MAPPED_SHARED, &mm_flags); - kdcore("%08lx: %08lx: %s (share)", vma->vm_start, - vma->vm_flags, dump_ok ? "yes" : "no"); - return dump_ok; - } - -#ifdef CONFIG_MMU - /* By default, if it hasn't been written to, don't write it out */ - if (!vma->anon_vma) { - dump_ok = test_bit(MMF_DUMP_MAPPED_PRIVATE, &mm_flags); - kdcore("%08lx: %08lx: %s (!anon)", vma->vm_start, - vma->vm_flags, dump_ok ? "yes" : "no"); - return dump_ok; - } -#endif - - dump_ok = test_bit(MMF_DUMP_ANON_PRIVATE, &mm_flags); - kdcore("%08lx: %08lx: %s", vma->vm_start, vma->vm_flags, - dump_ok ? "yes" : "no"); - return dump_ok; -} - /* An ELF note in memory */ struct memelfnote { @@ -1524,54 +1454,21 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, /* * dump the segments for an MMU process */ -static bool elf_fdpic_dump_segments(struct coredump_params *cprm) +static bool elf_fdpic_dump_segments(struct coredump_params *cprm, + struct core_vma_metadata *vma_meta, + int vma_count) { - struct vm_area_struct *vma; + int i; - for (vma = current->mm->mmap; vma; vma = vma->vm_next) { -#ifdef CONFIG_MMU - unsigned long addr; -#endif + for (i = 0; i < vma_count; i++) { + struct core_vma_metadata *meta = vma_meta + i; - if (!maydump(vma, cprm->mm_flags)) - continue; - -#ifdef CONFIG_MMU - for (addr = vma->vm_start; addr < vma->vm_end; - addr += PAGE_SIZE) { - bool res; - struct page *page = get_dump_page(addr); - if (page) { - void *kaddr = kmap(page); - res = dump_emit(cprm, kaddr, PAGE_SIZE); - kunmap(page); - put_page(page); - } else { - res = dump_skip(cprm, PAGE_SIZE); - } - if (!res) - return false; - } -#else - if (!dump_emit(cprm, (void *) vma->vm_start, - vma->vm_end - vma->vm_start)) + if (!dump_user_range(cprm, meta->start, meta->dump_size)) return false; -#endif } return true; } -static size_t elf_core_vma_data_size(unsigned long mm_flags) -{ - struct vm_area_struct *vma; - size_t size = 0; - - for (vma = current->mm->mmap; vma; vma = vma->vm_next) - if (maydump(vma, mm_flags)) - size += vma->vm_end - vma->vm_start; - return size; -} - /* * Actual dumper * @@ -1582,9 +1479,8 @@ static size_t elf_core_vma_data_size(unsigned long mm_flags) static int elf_fdpic_core_dump(struct coredump_params *cprm) { int has_dumped = 0; - int segs; + int vma_count, segs; int i; - struct vm_area_struct *vma; struct elfhdr *elf = NULL; loff_t offset = 0, dataoff; struct memelfnote psinfo_note, auxv_note; @@ -1598,18 +1494,8 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) elf_addr_t e_shoff; struct core_thread *ct; struct elf_thread_status *tmp; - - /* - * We no longer stop all VM operations. 
- * - * This is because those proceses that could possibly change map_count - * or the mmap / vma pages are now blocked in do_exit on current - * finishing this core dump. - * - * Only ptrace can touch these memory addresses, but it doesn't change - * the map_count or the pages allocated. So no possibility of crashing - * exists while dumping the mm->vm_next areas to the core file. - */ + struct core_vma_metadata *vma_meta = NULL; + size_t vma_data_size; /* alloc memory for large data structures: too large to be on stack */ elf = kmalloc(sizeof(*elf), GFP_KERNEL); @@ -1619,6 +1505,9 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) if (!psinfo) goto end_coredump; + if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size)) + goto end_coredump; + for (ct = current->mm->core_state->dumper.next; ct; ct = ct->next) { tmp = elf_dump_thread_status(cprm->siginfo->si_signo, @@ -1638,8 +1527,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) tmp->next = thread_list; thread_list = tmp; - segs = current->mm->map_count; - segs += elf_core_extra_phdrs(); + segs = vma_count + elf_core_extra_phdrs(); /* for notes section */ segs++; @@ -1684,7 +1572,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) /* Page-align dumped data */ dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - offset += elf_core_vma_data_size(cprm->mm_flags); + offset += vma_data_size; offset += elf_core_extra_data_size(); e_shoff = offset; @@ -1704,23 +1592,26 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) goto end_coredump; /* write program headers for segments dump */ - for (vma = current->mm->mmap; vma; vma = vma->vm_next) { + for (i = 0; i < vma_count; i++) { + struct core_vma_metadata *meta = vma_meta + i; struct elf_phdr phdr; size_t sz; - sz = vma->vm_end - vma->vm_start; + sz = meta->end - meta->start; phdr.p_type = PT_LOAD; phdr.p_offset = offset; - phdr.p_vaddr = vma->vm_start; + phdr.p_vaddr = meta->start; phdr.p_paddr = 0; - phdr.p_filesz = maydump(vma, cprm->mm_flags) ? sz : 0; + phdr.p_filesz = meta->dump_size; phdr.p_memsz = sz; offset += phdr.p_filesz; - phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0; - if (vma->vm_flags & VM_WRITE) + phdr.p_flags = 0; + if (meta->flags & VM_READ) + phdr.p_flags |= PF_R; + if (meta->flags & VM_WRITE) phdr.p_flags |= PF_W; - if (vma->vm_flags & VM_EXEC) + if (meta->flags & VM_EXEC) phdr.p_flags |= PF_X; phdr.p_align = ELF_EXEC_PAGESIZE; @@ -1752,7 +1643,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) if (!dump_skip(cprm, dataoff - cprm->pos)) goto end_coredump; - if (!elf_fdpic_dump_segments(cprm)) + if (!elf_fdpic_dump_segments(cprm, vma_meta, vma_count)) goto end_coredump; if (!elf_core_write_extra_data(cprm)) @@ -1776,6 +1667,7 @@ end_coredump: thread_list = thread_list->next; kfree(tmp); } + kvfree(vma_meta); kfree(phdr4note); kfree(elf); kfree(psinfo); diff --git a/fs/block_dev.c b/fs/block_dev.c index 8ae833e00443..9e84b1928b94 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -103,6 +103,35 @@ void invalidate_bdev(struct block_device *bdev) } EXPORT_SYMBOL(invalidate_bdev); +/* + * Drop all buffers & page cache for given bdev range. This function bails + * with error if bdev has other exclusive owner (such as filesystem). 
+ */ +int truncate_bdev_range(struct block_device *bdev, fmode_t mode, + loff_t lstart, loff_t lend) +{ + struct block_device *claimed_bdev = NULL; + int err; + + /* + * If we don't hold exclusive handle for the device, upgrade to it + * while we discard the buffer cache to avoid discarding buffers + * under live filesystem. + */ + if (!(mode & FMODE_EXCL)) { + claimed_bdev = bdev->bd_contains; + err = bd_prepare_to_claim(bdev, claimed_bdev, + truncate_bdev_range); + if (err) + return err; + } + truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend); + if (claimed_bdev) + bd_abort_claiming(bdev, claimed_bdev, truncate_bdev_range); + return 0; +} +EXPORT_SYMBOL(truncate_bdev_range); + static void set_init_blocksize(struct block_device *bdev) { bdev->bd_inode->i_blkbits = blksize_bits(bdev_logical_block_size(bdev)); @@ -862,7 +891,7 @@ static int bdev_set(struct inode *inode, void *data) return 0; } -struct block_device *bdget(dev_t dev) +static struct block_device *bdget(dev_t dev) { struct block_device *bdev; struct inode *inode; @@ -876,11 +905,11 @@ struct block_device *bdget(dev_t dev) bdev = &BDEV_I(inode)->bdev; if (inode->i_state & I_NEW) { + spin_lock_init(&bdev->bd_size_lock); bdev->bd_contains = NULL; bdev->bd_super = NULL; bdev->bd_inode = inode; bdev->bd_part_count = 0; - bdev->bd_invalidated = 0; inode->i_mode = S_IFBLK; inode->i_rdev = dev; inode->i_bdev = bdev; @@ -891,8 +920,6 @@ struct block_device *bdget(dev_t dev) return bdev; } -EXPORT_SYMBOL(bdget); - /** * bdgrab -- Grab a reference to an already referenced block device * @bdev: Block device to grab a reference to. @@ -904,6 +931,11 @@ struct block_device *bdgrab(struct block_device *bdev) } EXPORT_SYMBOL(bdgrab); +struct block_device *bdget_part(struct hd_struct *part) +{ + return bdget(part_devt(part)); +} + long nr_blockdev_pages(void) { struct inode *inode; @@ -1290,6 +1322,7 @@ static void check_disk_size_change(struct gendisk *disk, { loff_t disk_size, bdev_size; + spin_lock(&bdev->bd_size_lock); disk_size = (loff_t)get_capacity(disk) << 9; bdev_size = i_size_read(bdev->bd_inode); if (disk_size != bdev_size) { @@ -1299,85 +1332,51 @@ static void check_disk_size_change(struct gendisk *disk, disk->disk_name, bdev_size, disk_size); } i_size_write(bdev->bd_inode, disk_size); - if (bdev_size > disk_size && __invalidate_device(bdev, false)) + } + spin_unlock(&bdev->bd_size_lock); + + if (bdev_size > disk_size) { + if (__invalidate_device(bdev, false)) pr_warn("VFS: busy inodes on resized disk %s\n", disk->disk_name); } - bdev->bd_invalidated = 0; } /** - * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back - * @disk: struct gendisk to be revalidated + * revalidate_disk_size - checks for disk size change and adjusts bdev size. + * @disk: struct gendisk to check + * @verbose: if %true log a message about a size change if there is any * - * This routine is a wrapper for lower-level driver's revalidate_disk - * call-backs. It is used to do common pre and post operations needed - * for all revalidate_disk operations. + * This routine checks to see if the bdev size does not match the disk size + * and adjusts it if it differs. When shrinking the bdev size, its all caches + * are freed. 
*/ -int revalidate_disk(struct gendisk *disk) +void revalidate_disk_size(struct gendisk *disk, bool verbose) { - int ret = 0; - - if (disk->fops->revalidate_disk) - ret = disk->fops->revalidate_disk(disk); + struct block_device *bdev; /* * Hidden disks don't have associated bdev so there's no point in - * revalidating it. + * revalidating them. */ - if (!(disk->flags & GENHD_FL_HIDDEN)) { - struct block_device *bdev = bdget_disk(disk, 0); - - if (!bdev) - return ret; + if (disk->flags & GENHD_FL_HIDDEN) + return; - mutex_lock(&bdev->bd_mutex); - check_disk_size_change(disk, bdev, ret == 0); - mutex_unlock(&bdev->bd_mutex); + bdev = bdget_disk(disk, 0); + if (bdev) { + check_disk_size_change(disk, bdev, verbose); bdput(bdev); } - return ret; } -EXPORT_SYMBOL(revalidate_disk); +EXPORT_SYMBOL(revalidate_disk_size); -/* - * This routine checks whether a removable media has been changed, - * and invalidates all buffer-cache-entries in that case. This - * is a relatively slow routine, so we have to try to minimize using - * it. Thus it is called only upon a 'mount' or 'open'. This - * is the best way of combining speed and utility, I think. - * People changing diskettes in the middle of an operation deserve - * to lose :-) - */ -int check_disk_change(struct block_device *bdev) +void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors) { - struct gendisk *disk = bdev->bd_disk; - const struct block_device_operations *bdops = disk->fops; - unsigned int events; - - events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE | - DISK_EVENT_EJECT_REQUEST); - if (!(events & DISK_EVENT_MEDIA_CHANGE)) - return 0; - - if (__invalidate_device(bdev, true)) - pr_warn("VFS: busy inodes on changed media %s\n", - disk->disk_name); - bdev->bd_invalidated = 1; - if (bdops->revalidate_disk) - bdops->revalidate_disk(bdev->bd_disk); - return 1; -} - -EXPORT_SYMBOL(check_disk_change); - -void bd_set_size(struct block_device *bdev, loff_t size) -{ - inode_lock(bdev->bd_inode); - i_size_write(bdev->bd_inode, size); - inode_unlock(bdev->bd_inode); + spin_lock(&bdev->bd_size_lock); + i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); + spin_unlock(&bdev->bd_size_lock); } -EXPORT_SYMBOL(bd_set_size); +EXPORT_SYMBOL(bd_set_nr_sectors); static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); @@ -1388,6 +1387,8 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate) lockdep_assert_held(&bdev->bd_mutex); + clear_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); + rescan: ret = blk_drop_partitions(bdev); if (ret) @@ -1446,22 +1447,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder, struct gendisk *disk; int ret; int partno; - int perm = 0; bool first_open = false, unblock_events = true, need_restart; - if (mode & FMODE_READ) - perm |= MAY_READ; - if (mode & FMODE_WRITE) - perm |= MAY_WRITE; - /* - * hooks: /n/, see "layering violations". - */ - if (!for_part) { - ret = devcgroup_inode_permission(bdev->bd_inode, perm); - if (ret != 0) - return ret; - } - restart: need_restart = false; ret = -ENXIO; @@ -1514,7 +1501,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder, } if (!ret) { - bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); + bd_set_nr_sectors(bdev, get_capacity(disk)); set_init_blocksize(bdev); } @@ -1524,7 +1511,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder, * The latter is necessary to prevent ghost * partitions on a removed medium. 
*/ - if (bdev->bd_invalidated && + if (test_bit(GD_NEED_PART_SCAN, &disk->state) && (!ret || ret == -ENOMEDIUM)) bdev_disk_changed(bdev, ret == -ENOMEDIUM); @@ -1542,7 +1529,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder, ret = -ENXIO; goto out_clear; } - bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); + bd_set_nr_sectors(bdev, bdev->bd_part->nr_sects); set_init_blocksize(bdev); } @@ -1554,7 +1541,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder, if (bdev->bd_disk->fops->open) ret = bdev->bd_disk->fops->open(bdev, mode); /* the same as first opener case, read comment there */ - if (bdev->bd_invalidated && + if (test_bit(GD_NEED_PART_SCAN, &disk->state) && (!ret || ret == -ENOMEDIUM)) bdev_disk_changed(bdev, ret == -ENOMEDIUM); if (ret) @@ -1632,16 +1619,27 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder, * RETURNS: * 0 on success, -errno on failure. */ -int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) +static int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) { - int res; + int ret, perm = 0; - res =__blkdev_get(bdev, mode, holder, 0); - if (res) - bdput(bdev); - return res; + if (mode & FMODE_READ) + perm |= MAY_READ; + if (mode & FMODE_WRITE) + perm |= MAY_WRITE; + ret = devcgroup_inode_permission(bdev->bd_inode, perm); + if (ret) + goto bdput; + + ret =__blkdev_get(bdev, mode, holder, 0); + if (ret) + goto bdput; + return 0; + +bdput: + bdput(bdev); + return ret; } -EXPORT_SYMBOL(blkdev_get); /** * blkdev_get_by_path - open a block device by name @@ -1889,7 +1887,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) if (bdev_read_only(I_BDEV(bd_inode))) return -EPERM; - if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode)) + if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev)) return -ETXTBSY; if (!iov_iter_count(from)) @@ -1969,7 +1967,6 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, loff_t len) { struct block_device *bdev = I_BDEV(bdev_file_inode(file)); - struct address_space *mapping; loff_t end = start + len - 1; loff_t isize; int error; @@ -1997,8 +1994,9 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, return -EINVAL; /* Invalidate the page cache, including dirty pages. */ - mapping = bdev->bd_inode->i_mapping; - truncate_inode_pages_range(mapping, start, end); + error = truncate_bdev_range(bdev, file->f_mode, start, end); + if (error) + return error; switch (mode) { case FALLOC_FL_ZERO_RANGE: @@ -2025,7 +2023,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, * the caller will be given -EBUSY. The third argument is * inclusive, so the rounding here is safe. 
*/ - return invalidate_inode_pages2_range(mapping, + return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping, start >> PAGE_SHIFT, end >> PAGE_SHIFT); } diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 575636f6491e..68b95ad82126 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -14,6 +14,7 @@ config BTRFS_FS select LZO_DECOMPRESS select ZSTD_COMPRESS select ZSTD_DECOMPRESS + select FS_IOMAP select RAID6_PQ select XOR_BLOCKS select SRCU diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index ea1c28ccb44f..b3268f4ea5f3 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -2997,7 +2997,6 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, while (!list_empty(&pending_edge)) { struct btrfs_backref_node *upper; struct btrfs_backref_node *lower; - struct rb_node *rb_node; edge = list_first_entry(&pending_edge, struct btrfs_backref_edge, list[UPPER]); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index ea8aaf36647e..c0f1d6818df7 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1766,16 +1766,10 @@ static void link_block_group(struct btrfs_block_group *cache) { struct btrfs_space_info *space_info = cache->space_info; int index = btrfs_bg_flags_to_raid_index(cache->flags); - bool first = false; down_write(&space_info->groups_sem); - if (list_empty(&space_info->block_groups[index])) - first = true; list_add_tail(&cache->list, &space_info->block_groups[index]); up_write(&space_info->groups_sem); - - if (first) - btrfs_sysfs_add_block_group_type(cache); } static struct btrfs_block_group *btrfs_create_block_group_cache( @@ -1873,7 +1867,7 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) return ret; } -static int read_block_group_item(struct btrfs_block_group *cache, +static void read_block_group_item(struct btrfs_block_group *cache, struct btrfs_path *path, const struct btrfs_key *key) { @@ -1887,8 +1881,6 @@ static int read_block_group_item(struct btrfs_block_group *cache, sizeof(bgi)); cache->used = btrfs_stack_block_group_used(&bgi); cache->flags = btrfs_stack_block_group_flags(&bgi); - - return 0; } static int read_one_block_group(struct btrfs_fs_info *info, @@ -1907,9 +1899,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, if (!cache) return -ENOMEM; - ret = read_block_group_item(cache, path, key); - if (ret < 0) - goto error; + read_block_group_item(cache, path, key); set_free_space_tree_thresholds(cache); @@ -2035,8 +2025,18 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) btrfs_release_path(path); } - rcu_read_lock(); - list_for_each_entry_rcu(space_info, &info->space_info, list) { + list_for_each_entry(space_info, &info->space_info, list) { + int i; + + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + if (list_empty(&space_info->block_groups[i])) + continue; + cache = list_first_entry(&space_info->block_groups[i], + struct btrfs_block_group, + list); + btrfs_sysfs_add_block_group_type(cache); + } + if (!(btrfs_get_alloc_profile(info, space_info->flags) & (BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_RAID1_MASK | @@ -2056,7 +2056,6 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) list) inc_block_group_ro(cache, 1); } - rcu_read_unlock(); btrfs_init_global_block_rsv(info); ret = check_chunk_block_group_mappings(info); @@ -2097,12 +2096,16 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) return; while (!list_empty(&trans->new_bgs)) { + int index; + block_group = list_first_entry(&trans->new_bgs, struct btrfs_block_group, 
bg_list); if (ret) goto next; + index = btrfs_bg_flags_to_raid_index(block_group->flags); + ret = insert_block_group_item(trans, block_group); if (ret) btrfs_abort_transaction(trans, ret); @@ -2111,6 +2114,16 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) if (ret) btrfs_abort_transaction(trans, ret); add_block_group_free_space(trans, block_group); + + /* + * If we restriped during balance, we may have added a new raid + * type, so now add the sysfs entries when it is safe to do so. + * We don't have to worry about locking here as it's handled in + * btrfs_sysfs_add_block_group_type. + */ + if (block_group->space_info->block_group_kobjs[index] == NULL) + btrfs_sysfs_add_block_group_type(block_group); + /* Already aborted the transaction if it failed. */ next: btrfs_delayed_refs_rsv_release(fs_info, 1); @@ -2785,7 +2798,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) * finished yet (no block group item in the extent tree * yet, etc). If this is the case, wait for all free * space endio workers to finish and retry. This is a - * a very rare case so no need for a more efficient and + * very rare case so no need for a more efficient and * complex approach. */ if (ret == -ENOENT) { @@ -2961,6 +2974,13 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, space_info, -ram_bytes); if (delalloc) cache->delalloc_bytes += num_bytes; + + /* + * Compression can use less space than we reserved, so wake + * tickets if that happens + */ + if (num_bytes < ram_bytes) + btrfs_try_granting_tickets(cache->fs_info, space_info); } spin_unlock(&cache->lock); spin_unlock(&space_info->lock); @@ -2994,6 +3014,8 @@ void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, if (delalloc) cache->delalloc_bytes -= num_bytes; spin_unlock(&cache->lock); + + btrfs_try_granting_tickets(cache->fs_info, space_info); spin_unlock(&space_info->lock); } @@ -3002,12 +3024,10 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) struct list_head *head = &info->space_info; struct btrfs_space_info *found; - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) { + list_for_each_entry(found, head, list) { if (found->flags & BTRFS_BLOCK_GROUP_METADATA) found->force_alloc = CHUNK_ALLOC_FORCE; } - rcu_read_unlock(); } static int should_alloc_chunk(struct btrfs_fs_info *fs_info, @@ -3338,14 +3358,6 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) } spin_unlock(&info->block_group_cache_lock); - /* - * Now that all the block groups are freed, go through and free all the - * space_info structs. This is only called during the final stages of - * unmount, and so we know nobody is using them. We call - * synchronize_rcu() once before we start, just to be on the safe side. - */ - synchronize_rcu(); - btrfs_release_global_block_rsv(info); while (!list_empty(&info->space_info)) { diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index c47b6c6fea9f..92dd86bceae3 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -21,14 +21,18 @@ * new data the application may have written before commit. */ enum { - BTRFS_INODE_ORDERED_DATA_CLOSE, + BTRFS_INODE_FLUSH_ON_CLOSE, BTRFS_INODE_DUMMY, BTRFS_INODE_IN_DEFRAG, BTRFS_INODE_HAS_ASYNC_EXTENT, + /* + * Always set under the VFS' inode lock, otherwise it can cause races + * during fsync (we start as a fast fsync and then end up in a full + * fsync racing with ordered extent completion). 
+ */ BTRFS_INODE_NEEDS_FULL_SYNC, BTRFS_INODE_COPY_EVERYTHING, BTRFS_INODE_IN_DELALLOC_LIST, - BTRFS_INODE_READDIO_NEED_LOCK, BTRFS_INODE_HAS_PROPS, BTRFS_INODE_SNAPSHOT_FLUSH, }; @@ -212,6 +216,11 @@ struct btrfs_inode { struct inode vfs_inode; }; +static inline u32 btrfs_inode_sectorsize(const struct btrfs_inode *inode) +{ + return inode->root->fs_info->sectorsize; +} + static inline struct btrfs_inode *BTRFS_I(const struct inode *inode) { return container_of(inode, struct btrfs_inode, vfs_inode); @@ -324,23 +333,6 @@ struct btrfs_dio_private { u8 csums[]; }; -/* - * Disable DIO read nolock optimization, so new dio readers will be forced - * to grab i_mutex. It is used to avoid the endless truncate due to - * nonlocked dio read. - */ -static inline void btrfs_inode_block_unlocked_dio(struct btrfs_inode *inode) -{ - set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags); - smp_mb(); -} - -static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode) -{ - smp_mb__before_atomic(); - clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags); -} - /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 1ab56a734e70..eeface30facd 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -29,41 +29,6 @@ #include "extent_io.h" #include "extent_map.h" -int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, - unsigned long *total_in, unsigned long *total_out); -int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); -int zlib_decompress(struct list_head *ws, unsigned char *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, - size_t destlen); -struct list_head *zlib_alloc_workspace(unsigned int level); -void zlib_free_workspace(struct list_head *ws); -struct list_head *zlib_get_workspace(unsigned int level); - -int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, - unsigned long *total_in, unsigned long *total_out); -int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); -int lzo_decompress(struct list_head *ws, unsigned char *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, - size_t destlen); -struct list_head *lzo_alloc_workspace(unsigned int level); -void lzo_free_workspace(struct list_head *ws); - -int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, - unsigned long *total_in, unsigned long *total_out); -int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); -int zstd_decompress(struct list_head *ws, unsigned char *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, - size_t destlen); -void zstd_init_workspace_manager(void); -void zstd_cleanup_workspace_manager(void); -struct list_head *zstd_alloc_workspace(unsigned int level); -void zstd_free_workspace(struct list_head *ws); -struct list_head *zstd_get_workspace(unsigned int level); -void zstd_put_workspace(struct list_head *ws); - static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; const char* btrfs_compress_type2str(enum btrfs_compression_type type) diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 9f3dbe372631..8001b700ea3a 100644 
--- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -144,4 +144,39 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len); int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end); +int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out); +int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); +int zlib_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, unsigned long start_byte, size_t srclen, + size_t destlen); +struct list_head *zlib_alloc_workspace(unsigned int level); +void zlib_free_workspace(struct list_head *ws); +struct list_head *zlib_get_workspace(unsigned int level); + +int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out); +int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); +int lzo_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, unsigned long start_byte, size_t srclen, + size_t destlen); +struct list_head *lzo_alloc_workspace(unsigned int level); +void lzo_free_workspace(struct list_head *ws); + +int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out); +int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); +int zstd_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, unsigned long start_byte, size_t srclen, + size_t destlen); +void zstd_init_workspace_manager(void); +void zstd_cleanup_workspace_manager(void); +struct list_head *zstd_alloc_workspace(unsigned int level); +void zstd_free_workspace(struct list_head *ws); +struct list_head *zstd_get_workspace(unsigned int level); +void zstd_put_workspace(struct list_head *ws); + #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index cd392da69b81..113da62dc17f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -198,7 +198,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, btrfs_node_key(buf, &disk_key, 0); cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid, - &disk_key, level, buf->start, 0); + &disk_key, level, buf->start, 0, + BTRFS_NESTING_NEW_ROOT); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -957,7 +958,8 @@ static struct extent_buffer *alloc_tree_block_no_bg_flush( const struct btrfs_disk_key *disk_key, int level, u64 hint, - u64 empty_size) + u64 empty_size, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *ret; @@ -986,7 +988,7 @@ static struct extent_buffer *alloc_tree_block_no_bg_flush( ret = btrfs_alloc_tree_block(trans, root, parent_start, root->root_key.objectid, disk_key, level, - hint, empty_size); + hint, empty_size, nest); trans->can_flush_pending_bgs = true; return ret; @@ -1009,7 +1011,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, struct extent_buffer **cow_ret, - u64 search_start, u64 empty_size) + u64 search_start, u64 empty_size, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_disk_key disk_key; @@ -1040,7 +1043,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, 
parent_start = parent->start; cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key, - level, search_start, empty_size); + level, search_start, empty_size, nest); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -1061,6 +1064,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); return ret; } @@ -1068,6 +1073,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { ret = btrfs_reloc_cow_block(trans, root, buf, cow); if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); return ret; } @@ -1100,6 +1107,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (last_ref) { ret = tree_mod_log_free_eb(buf); if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); return ret; } @@ -1446,7 +1455,8 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, - struct extent_buffer **cow_ret) + struct extent_buffer **cow_ret, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; u64 search_start; @@ -1485,7 +1495,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, */ btrfs_qgroup_trace_subtree_after_cow(trans, root, buf); ret = __btrfs_cow_block(trans, root, buf, parent, - parent_slot, cow_ret, search_start, 0); + parent_slot, cow_ret, search_start, 0, nest); trace_btrfs_cow_block(root, buf, *cow_ret); @@ -1657,7 +1667,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, err = __btrfs_cow_block(trans, root, cur, parent, i, &cur, search_start, min(16 * blocksize, - (end_slot - i) * blocksize)); + (end_slot - i) * blocksize), + BTRFS_NESTING_COW); if (err) { btrfs_tree_unlock(cur); free_extent_buffer(cur); @@ -1855,7 +1866,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_lock(child); btrfs_set_lock_blocking_write(child); - ret = btrfs_cow_block(trans, root, child, mid, 0, &child); + ret = btrfs_cow_block(trans, root, child, mid, 0, &child, + BTRFS_NESTING_COW); if (ret) { btrfs_tree_unlock(child); free_extent_buffer(child); @@ -1891,10 +1903,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, left = NULL; if (left) { - btrfs_tree_lock(left); + __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); btrfs_set_lock_blocking_write(left); wret = btrfs_cow_block(trans, root, left, - parent, pslot - 1, &left); + parent, pslot - 1, &left, + BTRFS_NESTING_LEFT_COW); if (wret) { ret = wret; goto enospc; @@ -1906,10 +1919,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, right = NULL; if (right) { - btrfs_tree_lock(right); + __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); btrfs_set_lock_blocking_write(right); wret = btrfs_cow_block(trans, root, right, - parent, pslot + 1, &right); + parent, pslot + 1, &right, + BTRFS_NESTING_RIGHT_COW); if (wret) { ret = wret; goto enospc; @@ -2069,7 +2083,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, if (left) { u32 left_nr; - btrfs_tree_lock(left); + __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); btrfs_set_lock_blocking_write(left); left_nr = btrfs_header_nritems(left); @@ -2077,7 
+2091,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, wret = 1; } else { ret = btrfs_cow_block(trans, root, left, parent, - pslot - 1, &left); + pslot - 1, &left, + BTRFS_NESTING_LEFT_COW); if (ret) wret = 1; else { @@ -2123,7 +2138,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, if (right) { u32 right_nr; - btrfs_tree_lock(right); + __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); btrfs_set_lock_blocking_write(right); right_nr = btrfs_header_nritems(right); @@ -2132,7 +2147,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, } else { ret = btrfs_cow_block(trans, root, right, parent, pslot + 1, - &right); + &right, BTRFS_NESTING_RIGHT_COW); if (ret) wret = 1; else { @@ -2601,7 +2616,7 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, * We don't know the level of the root node until we actually * have it read locked */ - b = btrfs_read_lock_root_node(root); + b = __btrfs_read_lock_root_node(root, p->recurse); level = btrfs_header_level(b); if (level > write_lock_level) goto out; @@ -2740,11 +2755,13 @@ again: btrfs_set_path_blocking(p); if (last_level) err = btrfs_cow_block(trans, root, b, NULL, 0, - &b); + &b, + BTRFS_NESTING_COW); else err = btrfs_cow_block(trans, root, b, p->nodes[level + 1], - p->slots[level + 1], &b); + p->slots[level + 1], &b, + BTRFS_NESTING_COW); if (err) { ret = err; goto done; @@ -2875,7 +2892,8 @@ cow_done: } else { if (!btrfs_tree_read_lock_atomic(b)) { btrfs_set_path_blocking(p); - btrfs_tree_read_lock(b); + __btrfs_tree_read_lock(b, BTRFS_NESTING_NORMAL, + p->recurse); } p->locks[level] = BTRFS_READ_LOCK; } @@ -3164,6 +3182,58 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, } /* + * Check key order of two sibling extent buffers. + * + * Return true if something is wrong. + * Return false if everything is fine. + * + * Tree-checker only works inside one tree block, thus the following + * corruption can not be detected by tree-checker: + * + * Leaf @left | Leaf @right + * -------------------------------------------------------------- + * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 | + * + * Key f6 in leaf @left itself is valid, but not valid when the next + * key in leaf @right is 7. + * This can only be checked at tree block merge time. + * And since tree checker has ensured all key order in each tree block + * is correct, we only need to bother the last key of @left and the first + * key of @right. + */ +static bool check_sibling_keys(struct extent_buffer *left, + struct extent_buffer *right) +{ + struct btrfs_key left_last; + struct btrfs_key right_first; + int level = btrfs_header_level(left); + int nr_left = btrfs_header_nritems(left); + int nr_right = btrfs_header_nritems(right); + + /* No key to check in one of the tree blocks */ + if (!nr_left || !nr_right) + return false; + + if (level) { + btrfs_node_key_to_cpu(left, &left_last, nr_left - 1); + btrfs_node_key_to_cpu(right, &right_first, 0); + } else { + btrfs_item_key_to_cpu(left, &left_last, nr_left - 1); + btrfs_item_key_to_cpu(right, &right_first, 0); + } + + if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) { + btrfs_crit(left->fs_info, +"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)", + left_last.objectid, left_last.type, + left_last.offset, right_first.objectid, + right_first.type, right_first.offset); + return true; + } + return false; +} + +/* * try to push data from one node into the next node left in the * tree. 
* @@ -3207,6 +3277,12 @@ static int push_node_left(struct btrfs_trans_handle *trans, } else push_items = min(src_nritems - 8, push_items); + /* dst is the left eb, src is the middle eb */ + if (check_sibling_keys(dst, src)) { + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + return ret; + } ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items); if (ret) { btrfs_abort_transaction(trans, ret); @@ -3275,6 +3351,12 @@ static int balance_node_right(struct btrfs_trans_handle *trans, if (max_push < push_items) push_items = max_push; + /* dst is the right eb, src is the middle eb */ + if (check_sibling_keys(src, dst)) { + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + return ret; + } ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems); BUG_ON(ret < 0); memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), @@ -3331,7 +3413,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_node_key(lower, &lower_key, 0); c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level, - root->node->start, 0); + root->node->start, 0, + BTRFS_NESTING_NEW_ROOT); if (IS_ERR(c)) return PTR_ERR(c); @@ -3461,7 +3544,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_node_key(c, &disk_key, mid); split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level, - c->start, 0); + c->start, 0, BTRFS_NESTING_SPLIT); if (IS_ERR(split)) return PTR_ERR(split); @@ -3730,7 +3813,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (IS_ERR(right)) return 1; - btrfs_tree_lock(right); + __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); btrfs_set_lock_blocking_write(right); free_space = btrfs_leaf_free_space(right); @@ -3739,7 +3822,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root /* cow and double check */ ret = btrfs_cow_block(trans, root, right, upper, - slot + 1, &right); + slot + 1, &right, BTRFS_NESTING_RIGHT_COW); if (ret) goto out_unlock; @@ -3751,6 +3834,12 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (left_nritems == 0) goto out_unlock; + if (check_sibling_keys(left, right)) { + ret = -EUCLEAN; + btrfs_tree_unlock(right); + free_extent_buffer(right); + return ret; + } if (path->slots[0] == left_nritems && !empty) { /* Key greater than all keys in the leaf, right neighbor has * enough room for it and we're not emptying our leaf to delete @@ -3963,7 +4052,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root if (IS_ERR(left)) return 1; - btrfs_tree_lock(left); + __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); btrfs_set_lock_blocking_write(left); free_space = btrfs_leaf_free_space(left); @@ -3974,7 +4063,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root /* cow and double check */ ret = btrfs_cow_block(trans, root, left, - path->nodes[1], slot - 1, &left); + path->nodes[1], slot - 1, &left, + BTRFS_NESTING_LEFT_COW); if (ret) { /* we hit -ENOSPC, but it isn't fatal here */ if (ret == -ENOSPC) @@ -3988,6 +4078,10 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root goto out; } + if (check_sibling_keys(left, right)) { + ret = -EUCLEAN; + goto out; + } return __push_leaf_left(path, min_data_size, empty, left, free_space, right_nritems, max_slot); @@ -4236,8 +4330,18 @@ again: else btrfs_item_key(l, &disk_key, mid); + /* + * We have to use BTRFS_NESTING_NEW_ROOT here if we've done a double + * split, because we're only allowed
to have MAX_LOCKDEP_SUBCLASSES + * subclasses, which is 8 at the time of this patch, and we've maxed it + * out. In the future we could add a + * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just + * use BTRFS_NESTING_NEW_ROOT. + */ right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0, - l->start, 0); + l->start, 0, num_doubles ? + BTRFS_NESTING_NEW_ROOT : + BTRFS_NESTING_SPLIT); if (IS_ERR(right)) return PTR_ERR(right); @@ -4482,9 +4586,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, return ret; path->slots[0]++; - setup_items_for_insert(root, path, new_key, &item_size, - item_size, item_size + - sizeof(struct btrfs_item), 1); + setup_items_for_insert(root, path, new_key, &item_size, 1); leaf = path->nodes[0]; memcpy_extent_buffer(leaf, btrfs_item_ptr_offset(leaf, path->slots[0]), @@ -4657,14 +4759,20 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) } } -/* - * this is a helper for btrfs_insert_empty_items, the main goal here is - * to save stack depth by doing the bulk of the work in a function - * that doesn't call btrfs_search_slot +/** + * setup_items_for_insert - Helper called before inserting one or more items + * to a leaf. Main purpose is to save stack depth by doing the bulk of the work + * in a function that doesn't call btrfs_search_slot + * + * @root: root we are inserting items to + * @path: points to the leaf/slot where we are going to insert new items + * @cpu_key: array of keys for items to be inserted + * @data_size: size of the body of each item we are going to insert + * @nr: size of @cpu_key/@data_size arrays */ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *cpu_key, u32 *data_size, - u32 total_data, u32 total_size, int nr) + int nr) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_item *item; @@ -4675,6 +4783,12 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, struct extent_buffer *leaf; int slot; struct btrfs_map_token token; + u32 total_size; + u32 total_data = 0; + + for (i = 0; i < nr; i++) + total_data += data_size[i]; + total_size = total_data + (nr * sizeof(struct btrfs_item)); if (path->slots[0] == 0) { btrfs_cpu_key_to_disk(&disk_key, cpu_key); @@ -4701,7 +4815,8 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, if (old_data < data_end) { btrfs_print_leaf(leaf); - btrfs_crit(fs_info, "slot %d old_data %d data_end %d", + btrfs_crit(fs_info, + "item at slot %d with data offset %u beyond data end of leaf %u", slot, old_data, data_end); BUG(); } @@ -4734,8 +4849,8 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); btrfs_set_item_key(leaf, &disk_key, slot + i); item = btrfs_item_nr(slot + i); - btrfs_set_token_item_offset(&token, item, data_end - data_size[i]); data_end -= data_size[i]; + btrfs_set_token_item_offset(&token, item, data_end); btrfs_set_token_item_size(&token, item, data_size[i]); } @@ -4777,8 +4892,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, slot = path->slots[0]; BUG_ON(slot < 0); - setup_items_for_insert(root, path, cpu_key, data_size, - total_data, total_size, nr); + setup_items_for_insert(root, path, cpu_key, data_size, nr); return 0; } @@ -5115,7 +5229,7 @@ again: slot--; /* * check this node pointer against the min_trans parameters. - * If it is too old, old, skip to the next one. + * If it is too old, skip to the next one. 
*/ while (slot < nritems) { u64 gen; @@ -5379,7 +5493,9 @@ again: } if (!ret) { btrfs_set_path_blocking(path); - btrfs_tree_read_lock(next); + __btrfs_tree_read_lock(next, + BTRFS_NESTING_RIGHT, + path->recurse); } next_rw_lock = BTRFS_READ_LOCK; } @@ -5414,7 +5530,9 @@ again: ret = btrfs_try_tree_read_lock(next); if (!ret) { btrfs_set_path_blocking(path); - btrfs_tree_read_lock(next); + __btrfs_tree_read_lock(next, + BTRFS_NESTING_RIGHT, + path->recurse); } next_rw_lock = BTRFS_READ_LOCK; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9a72896bed2e..aac3d6f4e35b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -374,6 +374,7 @@ struct btrfs_path { unsigned int search_commit_root:1; unsigned int need_commit_sem:1; unsigned int skip_release_on_error:1; + unsigned int recurse:1; }; #define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \ sizeof(struct btrfs_item)) @@ -494,7 +495,7 @@ enum btrfs_orphan_cleanup_state { ORPHAN_CLEANUP_DONE = 2, }; -void btrfs_init_async_reclaim_work(struct work_struct *work); +void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info); /* fs_info */ struct reloc_control; @@ -541,11 +542,6 @@ enum { /* Used to record internally whether fs has been frozen */ BTRFS_FS_FROZEN, /* - * Indicate that a whole-filesystem exclusive operation is running - * (device replace, resize, device add/delete, balance) - */ - BTRFS_FS_EXCL_OP, - /* * Indicate that balance has been set up from the ioctl and is in the * main phase. The fs_info::balance_ctl is initialized. * Set and cleared while holding fs_info::balance_mutex. @@ -565,6 +561,19 @@ enum { BTRFS_FS_DISCARD_RUNNING, }; +/* + * Exclusive operations (device replace, resize, device add/remove, balance) + */ +enum btrfs_exclusive_operation { + BTRFS_EXCLOP_NONE, + BTRFS_EXCLOP_BALANCE, + BTRFS_EXCLOP_DEV_ADD, + BTRFS_EXCLOP_DEV_REMOVE, + BTRFS_EXCLOP_DEV_REPLACE, + BTRFS_EXCLOP_RESIZE, + BTRFS_EXCLOP_SWAP_ACTIVATE, +}; + struct btrfs_fs_info { u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; unsigned long flags; @@ -912,6 +921,7 @@ struct btrfs_fs_info { /* Used to reclaim the metadata space in the background. */ struct work_struct async_reclaim_work; + struct work_struct async_data_reclaim_work; spinlock_t unused_bgs_lock; struct list_head unused_bgs; @@ -935,6 +945,9 @@ struct btrfs_fs_info { */ int send_in_progress; + /* Type of exclusive operation running */ + unsigned long exclusive_operation; + #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; struct rb_root block_tree; @@ -1181,24 +1194,40 @@ struct btrfs_root { #endif }; -struct btrfs_clone_extent_info { +/* + * Structure that conveys information about an extent that is going to replace + * all the extents in a file range. + */ +struct btrfs_replace_extent_info { u64 disk_offset; u64 disk_len; u64 data_offset; u64 data_len; u64 file_offset; + /* Pointer to a file extent item of type regular or prealloc. */ char *extent_buf; - u32 item_size; + /* + * Set to true when attempting to replace a file range with a new extent + * described by this structure, set to false when attempting to clone an + * existing extent into a file range. + */ + bool is_new_extent; + /* Meaningful only if is_new_extent is true. */ + int qgroup_reserved; + /* + * Meaningful only if is_new_extent is true. + * Used to track how many extent items we have already inserted in a + * subvolume tree that refer to the extent described by this structure, + * so that we know when to create a new delayed ref or update an existing + * one. 
+ */ + int insertions; }; struct btrfs_file_private { void *filldir_buf; }; -static inline u32 btrfs_inode_sectorsize(const struct inode *inode) -{ - return btrfs_sb(inode->i_sb)->sectorsize; -} static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info) { @@ -1391,6 +1420,16 @@ static inline void btrfs_init_map_token(struct btrfs_map_token *token, #define cpu_to_le8(v) (v) #define __le8 u8 +static inline u8 get_unaligned_le8(const void *p) +{ + return *(u8 *)p; +} + +static inline void put_unaligned_le8(u8 val, void *p) +{ + *(u8 *)p = val; +} + #define read_eb_member(eb, ptr, type, member, result) (\ read_extent_buffer(eb, (char *)(result), \ ((unsigned long)(ptr)) + \ @@ -1449,27 +1488,25 @@ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ static inline u##bits btrfs_##name(const struct extent_buffer *eb) \ { \ const type *p = page_address(eb->pages[0]); \ - u##bits res = le##bits##_to_cpu(p->member); \ - return res; \ + return get_unaligned_le##bits(&p->member); \ } \ static inline void btrfs_set_##name(const struct extent_buffer *eb, \ u##bits val) \ { \ type *p = page_address(eb->pages[0]); \ - p->member = cpu_to_le##bits(val); \ + put_unaligned_le##bits(val, &p->member); \ } #define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ static inline u##bits btrfs_##name(const type *s) \ { \ - return le##bits##_to_cpu(s->member); \ + return get_unaligned_le##bits(&s->member); \ } \ static inline void btrfs_set_##name(type *s, u##bits val) \ { \ - s->member = cpu_to_le##bits(val); \ + put_unaligned_le##bits(val, &s->member); \ } - static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb, struct btrfs_dev_item *s) { @@ -2524,7 +2561,8 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, const struct btrfs_disk_key *key, int level, u64 hint, - u64 empty_size); + u64 empty_size, + enum btrfs_lock_nesting nest); void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, @@ -2592,6 +2630,8 @@ enum btrfs_reserve_flush_enum { * * Can be interruped by fatal signal. 
*/ + BTRFS_RESERVE_FLUSH_DATA, + BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE, BTRFS_RESERVE_FLUSH_ALL, /* @@ -2619,7 +2659,7 @@ enum btrfs_flush_state { int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, bool use_global_rsv); -void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, +void btrfs_subvolume_release_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv); void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); @@ -2651,8 +2691,6 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, struct btrfs_path *path, const struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); -struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); -struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, int lowest_level, u64 min_trans); @@ -2665,7 +2703,8 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, - struct extent_buffer **cow_ret); + struct extent_buffer **cow_ret, + enum btrfs_lock_nesting nest); int btrfs_copy_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, @@ -2713,7 +2752,7 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans, void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *cpu_key, u32 *data_size, - u32 total_data, u32 total_size, int nr); + int nr); int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, void *data, u32 data_size); int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, @@ -2930,6 +2969,10 @@ void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size); u64 btrfs_file_extent_end(const struct btrfs_path *path); /* inode.c */ +blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags); +int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, int mirror); struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, u64 start, u64 len); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, @@ -2956,7 +2999,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u32 min_type); int btrfs_start_delalloc_snapshot(struct btrfs_root *root); -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr); +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr); int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, unsigned int extra_bits, struct extent_state **cached_state); @@ -3017,6 +3060,7 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end); void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, u64 end, int uptodate); extern const struct dentry_operations btrfs_dentry_operations; +ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter); /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); @@ -3031,6 +3075,9 @@ void btrfs_get_block_group_info(struct list_head *groups_list, struct btrfs_ioctl_space_info *space); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, 
struct btrfs_ioctl_balance_args *bargs); +bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type); +void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); /* file.c */ int __init btrfs_auto_defrag_init(void); @@ -3053,9 +3100,9 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, int drop_cache); -int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, +int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, const u64 start, const u64 end, - struct btrfs_clone_extent_info *clone_info, + struct btrfs_replace_extent_info *extent_info, struct btrfs_trans_handle **trans_out); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 start, u64 end); @@ -3536,9 +3583,7 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) /* Sanity test specific functions */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -void btrfs_test_inode_set_ops(struct inode *inode); void btrfs_test_destroy_inode(struct inode *inode); - static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) { return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 0e354e9e57d0..bacee09b7bfd 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -115,126 +115,15 @@ int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; - u64 used; - int ret = 0; - int need_commit = 2; - int have_pinned_space; + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA; /* Make sure bytes are sectorsize aligned */ bytes = ALIGN(bytes, fs_info->sectorsize); - if (btrfs_is_free_space_inode(inode)) { - need_commit = 0; - ASSERT(current->journal_info); - } - -again: - /* Make sure we have enough space to handle the data first */ - spin_lock(&data_sinfo->lock); - used = btrfs_space_info_used(data_sinfo, true); - - if (used + bytes > data_sinfo->total_bytes) { - struct btrfs_trans_handle *trans; - - /* - * If we don't have enough free bytes in this space then we need - * to alloc a new chunk. - */ - if (!data_sinfo->full) { - u64 alloc_target; - - data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; - spin_unlock(&data_sinfo->lock); - - alloc_target = btrfs_data_alloc_profile(fs_info); - /* - * It is ugly that we don't call nolock join - * transaction for the free space inode case here. - * But it is safe because we only do the data space - * reservation for the free space cache in the - * transaction context, the common join transaction - * just increase the counter of the current transaction - * handler, doesn't try to acquire the trans_lock of - * the fs. - */ - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - ret = btrfs_chunk_alloc(trans, alloc_target, - CHUNK_ALLOC_NO_FORCE); - btrfs_end_transaction(trans); - if (ret < 0) { - if (ret != -ENOSPC) - return ret; - else { - have_pinned_space = 1; - goto commit_trans; - } - } - - goto again; - } - - /* - * If we don't have enough pinned space to deal with this - * allocation, and no removed chunk in current transaction, - * don't bother committing the transaction. 
- */ - have_pinned_space = __percpu_counter_compare( - &data_sinfo->total_bytes_pinned, - used + bytes - data_sinfo->total_bytes, - BTRFS_TOTAL_BYTES_PINNED_BATCH); - spin_unlock(&data_sinfo->lock); - - /* Commit the current transaction and try again */ -commit_trans: - if (need_commit) { - need_commit--; - - if (need_commit > 0) { - btrfs_start_delalloc_roots(fs_info, -1); - btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, - (u64)-1); - } - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - if (have_pinned_space >= 0 || - test_bit(BTRFS_TRANS_HAVE_FREE_BGS, - &trans->transaction->flags) || - need_commit > 0) { - ret = btrfs_commit_transaction(trans); - if (ret) - return ret; - /* - * The cleaner kthread might still be doing iput - * operations. Wait for it to finish so that - * more space is released. We don't need to - * explicitly run the delayed iputs here because - * the commit_transaction would have woken up - * the cleaner. - */ - ret = btrfs_wait_on_delayed_iputs(fs_info); - if (ret) - return ret; - goto again; - } else { - btrfs_end_transaction(trans); - } - } - - trace_btrfs_space_reservation(fs_info, - "space_info:enospc", - data_sinfo->flags, bytes, 1); - return -ENOSPC; - } - btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes); - spin_unlock(&data_sinfo->lock); + if (btrfs_is_free_space_inode(inode)) + flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE; - return 0; + return btrfs_reserve_data_bytes(fs_info, bytes, flush); } int btrfs_check_data_free_space(struct btrfs_inode *inode, @@ -277,9 +166,7 @@ void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info, ASSERT(IS_ALIGNED(len, fs_info->sectorsize)); data_sinfo = fs_info->data_sinfo; - spin_lock(&data_sinfo->lock); - btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, -len); - spin_unlock(&data_sinfo->lock); + btrfs_space_info_free_bytes_may_use(fs_info, data_sinfo, len); } /* diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index bf1595a42a98..5aba81e16113 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -627,8 +627,7 @@ static int btrfs_delayed_inode_reserve_metadata( */ if (!src_rsv || (!trans->bytes_reserved && src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { - ret = btrfs_qgroup_reserve_meta_prealloc(root, - fs_info->nodesize, true); + ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true); if (ret < 0) return ret; ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, @@ -769,8 +768,7 @@ static int btrfs_batch_insert_items(struct btrfs_root *root, } /* insert the keys of the items */ - setup_items_for_insert(root, path, keys, data_size, - total_data_size, total_size, nitems); + setup_items_for_insert(root, path, keys, data_size, nitems); /* insert the dir index items */ slot = path->slots[0]; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index db93909b25e0..4a0243cb9d97 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -64,10 +64,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret); -static void btrfs_dev_replace_update_device_in_mapping_tree( - struct btrfs_fs_info *fs_info, - struct btrfs_device *srcdev, - struct btrfs_device *tgtdev); static int btrfs_dev_replace_kthread(void *data); int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) @@ -224,13 +220,12 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, { struct btrfs_device *device; struct block_device *bdev; - struct list_head *devices; struct rcu_string 
*name; u64 devid = BTRFS_DEV_REPLACE_DEVID; int ret = 0; *device_out = NULL; - if (fs_info->fs_devices->seeding) { + if (srcdev->fs_devices->seeding) { btrfs_err(fs_info, "the filesystem is a seed filesystem!"); return -EINVAL; } @@ -244,8 +239,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, sync_blockdev(bdev); - devices = &fs_info->fs_devices->devices; - list_for_each_entry(device, devices, dev_list) { + list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { if (device->bdev == bdev) { btrfs_err(fs_info, "target device is in the filesystem!"); @@ -512,7 +506,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); up_write(&dev_replace->rwsem); - ret = btrfs_sysfs_add_devices_dir(tgt_device->fs_devices, tgt_device); + ret = btrfs_sysfs_add_device(tgt_device); if (ret) btrfs_err(fs_info, "kobj add dev failed %d", ret); @@ -599,6 +593,63 @@ static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info) wake_up(&fs_info->dev_replace.replace_wait); } +/* + * When finishing the device replace, before swapping the source device with the + * target device we must update the chunk allocation state in the target device, + * as it is empty because replace works by directly copying the chunks and not + * through the normal chunk allocation path. + */ +static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev, + struct btrfs_device *tgtdev) +{ + struct extent_state *cached_state = NULL; + u64 start = 0; + u64 found_start; + u64 found_end; + int ret = 0; + + lockdep_assert_held(&srcdev->fs_info->chunk_mutex); + + while (!find_first_extent_bit(&srcdev->alloc_state, start, + &found_start, &found_end, + CHUNK_ALLOCATED, &cached_state)) { + ret = set_extent_bits(&tgtdev->alloc_state, found_start, + found_end, CHUNK_ALLOCATED); + if (ret) + break; + start = found_end + 1; + } + + free_extent_state(cached_state); + return ret; +} + +static void btrfs_dev_replace_update_device_in_mapping_tree( + struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev, + struct btrfs_device *tgtdev) +{ + struct extent_map_tree *em_tree = &fs_info->mapping_tree; + struct extent_map *em; + struct map_lookup *map; + u64 start = 0; + int i; + + write_lock(&em_tree->lock); + do { + em = lookup_extent_mapping(em_tree, start, (u64)-1); + if (!em) + break; + map = em->map_lookup; + for (i = 0; i < map->num_stripes; i++) + if (srcdev == map->stripes[i].dev) + map->stripes[i].dev = tgtdev; + start = em->start + em->len; + free_extent_map(em); + } while (start); + write_unlock(&em_tree->lock); +} + static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret) { @@ -630,7 +681,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, * flush all outstanding I/O and inode extent mappings before the * copy operation is declared as being finished */ - ret = btrfs_start_delalloc_roots(fs_info, -1); + ret = btrfs_start_delalloc_roots(fs_info, U64_MAX); if (ret) { mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; @@ -673,8 +724,14 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, dev_replace->time_stopped = ktime_get_real_seconds(); dev_replace->item_needs_writeback = 1; - /* replace old device with new one in mapping tree */ + /* + * Update allocation state in the new device and replace the old device + * with the new one in the mapping tree. 
+ */ if (!scrub_ret) { + scrub_ret = btrfs_set_target_alloc_state(src_device, tgt_device); + if (scrub_ret) + goto error; btrfs_dev_replace_update_device_in_mapping_tree(fs_info, src_device, tgt_device); @@ -685,6 +742,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, btrfs_dev_name(src_device), src_device->devid, rcu_str_deref(tgt_device->name), scrub_ret); +error: up_write(&dev_replace->rwsem); mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); @@ -743,9 +801,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&fs_info->fs_devices->device_list_mutex); /* replace the sysfs entry */ - btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, src_device); + btrfs_sysfs_remove_device(src_device); btrfs_sysfs_update_devid(tgt_device); - btrfs_rm_dev_replace_free_srcdev(src_device); + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state)) + btrfs_scratch_superblocks(fs_info, src_device->bdev, + src_device->name->str); /* write back the superblocks */ trans = btrfs_start_transaction(root, 0); @@ -754,33 +814,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); - return 0; -} - -static void btrfs_dev_replace_update_device_in_mapping_tree( - struct btrfs_fs_info *fs_info, - struct btrfs_device *srcdev, - struct btrfs_device *tgtdev) -{ - struct extent_map_tree *em_tree = &fs_info->mapping_tree; - struct extent_map *em; - struct map_lookup *map; - u64 start = 0; - int i; + btrfs_rm_dev_replace_free_srcdev(src_device); - write_lock(&em_tree->lock); - do { - em = lookup_extent_mapping(em_tree, start, (u64)-1); - if (!em) - break; - map = em->map_lookup; - for (i = 0; i < map->num_stripes; i++) - if (srcdev == map->stripes[i].dev) - map->stripes[i].dev = tgtdev; - start = em->start + em->len; - free_extent_map(em); - } while (start); - write_unlock(&em_tree->lock); + return 0; } /* @@ -983,7 +1019,7 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) * should never allow both to start and pause. We don't want to allow * dev-replace to start anyway. 
*/ - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) { down_write(&dev_replace->rwsem); dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; @@ -1020,7 +1056,7 @@ static int btrfs_dev_replace_kthread(void *data) ret = btrfs_dev_replace_finishing(fs_info, ret); WARN_ON(ret && ret != -ECANCELED); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); return 0; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index abf86b202b43..8e3438672a82 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -50,7 +50,6 @@ BTRFS_SUPER_FLAG_METADUMP |\ BTRFS_SUPER_FLAG_METADUMP_V2) -static const struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, @@ -205,53 +204,6 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, #endif /* - * extents on the btree inode are pretty simple, there's one extent - * that covers the entire device - */ -struct extent_map *btree_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 len) -{ - struct extent_map_tree *em_tree = &inode->extent_tree; - struct extent_map *em; - int ret; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); - if (em) { - read_unlock(&em_tree->lock); - goto out; - } - read_unlock(&em_tree->lock); - - em = alloc_extent_map(); - if (!em) { - em = ERR_PTR(-ENOMEM); - goto out; - } - em->start = 0; - em->len = (u64)-1; - em->block_len = (u64)-1; - em->block_start = 0; - - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); - if (ret == -EEXIST) { - free_extent_map(em); - em = lookup_extent_mapping(em_tree, start, len); - if (!em) - em = ERR_PTR(-EIO); - } else if (ret) { - free_extent_map(em); - em = ERR_PTR(ret); - } - write_unlock(&em_tree->lock); - -out: - return em; -} - -/* * Compute the csum of a btree block and store the result to provided buffer. */ static void csum_tree_block(struct extent_buffer *buf, u8 *result) @@ -545,38 +497,35 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) static int check_tree_block_fsid(struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; u8 fsid[BTRFS_FSID_SIZE]; - int ret = 1; + u8 *metadata_uuid; read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid), BTRFS_FSID_SIZE); - while (fs_devices) { - u8 *metadata_uuid; + /* + * Checking the incompat flag is only valid for the current fs. For + * seed devices it's forbidden to have their uuid changed so reading + * ->fsid in this case is fine + */ + if (btrfs_fs_incompat(fs_info, METADATA_UUID)) + metadata_uuid = fs_devices->metadata_uuid; + else + metadata_uuid = fs_devices->fsid; - /* - * Checking the incompat flag is only valid for the current - * fs. 
For seed devices it's forbidden to have their uuid - * changed so reading ->fsid in this case is fine - */ - if (fs_devices == fs_info->fs_devices && - btrfs_fs_incompat(fs_info, METADATA_UUID)) - metadata_uuid = fs_devices->metadata_uuid; - else - metadata_uuid = fs_devices->fsid; + if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) + return 0; - if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) { - ret = 0; - break; - } - fs_devices = fs_devices->seed; - } - return ret; + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) + if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE)) + return 0; + + return 1; } -static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, - u64 phy_offset, struct page *page, - u64 start, u64 end, int mirror) +int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, + int mirror) { u64 found_start; int found_level; @@ -636,16 +585,15 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, csum_tree_block(eb, result); if (memcmp_extent_buffer(eb, result, 0, csum_size)) { - u32 val; - u32 found = 0; - - memcpy(&found, result, csum_size); + u8 val[BTRFS_CSUM_SIZE] = { 0 }; read_extent_buffer(eb, &val, 0, csum_size); btrfs_warn_rl(fs_info, - "%s checksum verify failed on %llu wanted %x found %x level %d", + "%s checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d", fs_info->sb->s_id, eb->start, - val, found, btrfs_header_level(eb)); + CSUM_FMT_VALUE(csum_size, val), + CSUM_FMT_VALUE(csum_size, result), + btrfs_header_level(eb)); ret = -EUCLEAN; goto err; } @@ -865,9 +813,8 @@ static int check_async_write(struct btrfs_fs_info *fs_info, return 1; } -static blk_status_t btree_submit_bio_hook(struct inode *inode, struct bio *bio, - int mirror_num, - unsigned long bio_flags) +blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int async = check_async_write(fs_info, BTRFS_I(inode)); @@ -952,11 +899,6 @@ static int btree_writepages(struct address_space *mapping, return btree_write_cache_pages(mapping, wbc); } -static int btree_readpage(struct file *file, struct page *page) -{ - return extent_read_full_page(page, btree_get_extent, 0); -} - static int btree_releasepage(struct page *page, gfp_t gfp_flags) { if (PageWriteback(page) || PageDirty(page)) @@ -996,7 +938,6 @@ static int btree_set_page_dirty(struct page *page) } static const struct address_space_operations btree_aops = { - .readpage = btree_readpage, .writepages = btree_writepages, .releasepage = btree_releasepage, .invalidatepage = btree_invalidatepage, @@ -1209,7 +1150,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = 0; - leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0); + leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0, + BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); leaf = NULL; @@ -1281,7 +1223,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, */ leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID, - NULL, 0, 0, 0); + NULL, 0, 0, 0, BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { btrfs_put_root(root); return ERR_CAST(leaf); @@ -1506,10 +1448,12 @@ void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info) struct btrfs_root *root; while 
(!list_empty(&fs_info->allocated_roots)) { + char buf[BTRFS_ROOT_NAME_BUF_LEN]; + root = list_first_entry(&fs_info->allocated_roots, struct btrfs_root, leak_list); - btrfs_err(fs_info, "leaked root %llu-%llu refcount %d", - root->root_key.objectid, root->root_key.offset, + btrfs_err(fs_info, "leaked root %s refcount %d", + btrfs_root_name(root->root_key.objectid, buf), refcount_read(&root->refs)); while (refcount_read(&root->refs) > 1) btrfs_put_root(root); @@ -2116,12 +2060,10 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, - IO_TREE_INODE_IO, inode); + IO_TREE_BTREE_INODE_IO, inode); BTRFS_I(inode)->io_tree.track_uptodate = false; extent_map_tree_init(&BTRFS_I(inode)->extent_tree); - BTRFS_I(inode)->io_tree.ops = &btree_extent_io_ops; - BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root); memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key)); set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); @@ -2627,18 +2569,17 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) level = btrfs_super_root_level(sb); tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb), generation, level, NULL); - if (IS_ERR(tree_root->node) || - !extent_buffer_uptodate(tree_root->node)) { + if (IS_ERR(tree_root->node)) { handle_error = true; + ret = PTR_ERR(tree_root->node); + tree_root->node = NULL; + btrfs_warn(fs_info, "couldn't read tree root"); + continue; - if (IS_ERR(tree_root->node)) { - ret = PTR_ERR(tree_root->node); - tree_root->node = NULL; - } else if (!extent_buffer_uptodate(tree_root->node)) { - ret = -EUCLEAN; - } - - btrfs_warn(fs_info, "failed to read tree root"); + } else if (!extent_buffer_uptodate(tree_root->node)) { + handle_error = true; + ret = -EIO; + btrfs_warn(fs_info, "error while reading tree root"); continue; } @@ -2754,7 +2695,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) fs_info->check_integrity_print_mask = 0; #endif btrfs_init_balance(fs_info); - btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work); + btrfs_init_async_reclaim_work(fs_info); spin_lock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT; @@ -2929,7 +2870,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device } /* - * Verify the type first, if that or the the checksum value are + * Verify the type first, if that or the checksum value are * corrupted, we'll find out */ csum_type = btrfs_super_csum_type(disk_super); @@ -3091,8 +3032,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_sb_buffer; } - sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK; - sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super); sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE); @@ -3483,8 +3422,12 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, return ERR_CAST(page); super = page_address(page); - if (btrfs_super_bytenr(super) != bytenr || - btrfs_super_magic(super) != BTRFS_MAGIC) { + if (btrfs_super_magic(super) != BTRFS_MAGIC) { + btrfs_release_disk_super(super); + return ERR_PTR(-ENODATA); + } + + if (btrfs_super_bytenr(super) != bytenr) { btrfs_release_disk_super(super); return ERR_PTR(-EINVAL); } @@ -4057,6 +4000,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_cleanup_defrag_inodes(fs_info); cancel_work_sync(&fs_info->async_reclaim_work); + 
cancel_work_sync(&fs_info->async_data_reclaim_work); /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); @@ -4688,9 +4632,3 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) return 0; } - -static const struct extent_io_ops btree_extent_io_ops = { - /* mandatory callbacks */ - .submit_bio_hook = btree_submit_bio_hook, - .readpage_end_io_hook = btree_readpage_end_io_hook, -}; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 00dc39d47ed3..fee69ced58b4 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -76,7 +76,11 @@ void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); - +int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, + int mirror); +blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); #endif @@ -123,9 +127,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid); int btree_lock_page_hook(struct page *page, void *data, void (*flush_fn)(void *)); -struct extent_map *btree_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 len); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); int __init btrfs_end_io_wq_init(void); void __cold btrfs_end_io_wq_exit(void); diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 219a09a2b734..9800a8306368 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -40,6 +40,7 @@ struct io_failure_record; enum { IO_TREE_FS_PINNED_EXTENTS, IO_TREE_FS_EXCLUDED_EXTENTS, + IO_TREE_BTREE_INODE_IO, IO_TREE_INODE_IO, IO_TREE_INODE_IO_FAILURE, IO_TREE_RELOC_BLOCKS, @@ -48,6 +49,7 @@ enum { IO_TREE_INODE_FILE_EXTENT, IO_TREE_LOG_CSUM_RANGE, IO_TREE_SELFTEST, + IO_TREE_DEVICE_ALLOC_STATE, }; struct extent_io_tree { @@ -61,7 +63,6 @@ struct extent_io_tree { u8 owner; spinlock_t lock; - const struct extent_io_ops *ops; }; struct extent_state { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 780b9c9a98fe..3b21fee13e77 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1177,7 +1177,22 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, num_bytes, parent, root_objectid, owner, offset, 1); if (ret == 0) { - BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); + /* + * We're adding refs to a tree block we already own, this + * should not happen at all. + */ + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + btrfs_crit(trans->fs_info, +"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu", + bytenr, num_bytes, root_objectid); + if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) { + WARN_ON(1); + btrfs_crit(trans->fs_info, + "path->slots[0]=%d path->nodes[0]:", path->slots[0]); + btrfs_print_leaf(path->nodes[0]); + } + return -EUCLEAN; + } update_inline_extent_backref(path, iref, refs_to_add, extent_op, NULL); } else if (ret == -ENOENT) { @@ -1397,6 +1412,9 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, /* * __btrfs_inc_extent_ref - insert backreference for a given extent * + * The counterpart is in __btrfs_free_extent(), with examples and more details + * how it works. 
+ * * @trans: Handle of transaction * * @node: The delayed ref node used to get the bytenr/length for @@ -2849,11 +2867,10 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, len -= to_add; } spin_unlock(&global_rsv->lock); - /* Add to any tickets we may have */ - if (len) - btrfs_try_granting_tickets(fs_info, - space_info); } + /* Add to any tickets we may have */ + if (!readonly && return_free_space && len) + btrfs_try_granting_tickets(fs_info, space_info); spin_unlock(&space_info->lock); } @@ -2935,6 +2952,65 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) return 0; } +/* + * Drop one or more refs of @node. + * + * 1. Locate the extent refs. + * It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item. + * Locate it, then reduce the refs number or remove the ref line completely. + * + * 2. Update the refs count in EXTENT/METADATA_ITEM + * + * Inline backref case: + * + * in extent tree we have: + * + * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82 + * refs 2 gen 6 flags DATA + * extent data backref root FS_TREE objectid 258 offset 0 count 1 + * extent data backref root FS_TREE objectid 257 offset 0 count 1 + * + * This function gets called with: + * + * node->bytenr = 13631488 + * node->num_bytes = 1048576 + * root_objectid = FS_TREE + * owner_objectid = 257 + * owner_offset = 0 + * refs_to_drop = 1 + * + * Then we should get some like: + * + * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82 + * refs 1 gen 6 flags DATA + * extent data backref root FS_TREE objectid 258 offset 0 count 1 + * + * Keyed backref case: + * + * in extent tree we have: + * + * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24 + * refs 754 gen 6 flags DATA + * [...] + * item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28 + * extent data backref root FS_TREE objectid 866 offset 0 count 1 + * + * This function get called with: + * + * node->bytenr = 13631488 + * node->num_bytes = 1048576 + * root_objectid = FS_TREE + * owner_objectid = 866 + * owner_offset = 0 + * refs_to_drop = 1 + * + * Then we should get some like: + * + * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24 + * refs 753 gen 6 flags DATA + * + * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed. + */ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner_objectid, @@ -2967,7 +3043,15 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, path->leave_spinning = 1; is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; - BUG_ON(!is_data && refs_to_drop != 1); + + if (!is_data && refs_to_drop != 1) { + btrfs_crit(info, +"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u", + node->bytenr, refs_to_drop); + ret = -EINVAL; + btrfs_abort_transaction(trans, ret); + goto out; + } if (is_data) skinny_metadata = false; @@ -2976,6 +3060,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, parent, root_objectid, owner_objectid, owner_offset); if (ret == 0) { + /* + * Either the inline backref or the SHARED_DATA_REF/ + * SHARED_BLOCK_REF is found + * + * Here is a quick path to locate EXTENT/METADATA_ITEM. + * It's possible the EXTENT/METADATA_ITEM is near current slot. 
+ */ extent_slot = path->slots[0]; while (extent_slot >= 0) { btrfs_item_key_to_cpu(path->nodes[0], &key, @@ -2992,13 +3083,21 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, found_extent = 1; break; } + + /* Quick path didn't find the EXTEMT/METADATA_ITEM */ if (path->slots[0] - extent_slot > 5) break; extent_slot--; } if (!found_extent) { - BUG_ON(iref); + if (iref) { + btrfs_crit(info, +"invalid iref, no EXTENT/METADATA_ITEM found but has inline extent ref"); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } + /* Must be SHARED_* item, remove the backref first */ ret = remove_extent_backref(trans, path, NULL, refs_to_drop, is_data, &last_ref); @@ -3009,6 +3108,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_release_path(path); path->leave_spinning = 1; + /* Slow path to locate EXTENT/METADATA_ITEM */ key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; @@ -3083,19 +3183,26 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && key.type == BTRFS_EXTENT_ITEM_KEY) { struct btrfs_tree_block_info *bi; - BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); + if (item_size < sizeof(*ei) + sizeof(*bi)) { + btrfs_crit(info, +"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %lu", + key.objectid, key.type, key.offset, + owner_objectid, item_size, + sizeof(*ei) + sizeof(*bi)); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } bi = (struct btrfs_tree_block_info *)(ei + 1); WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); } refs = btrfs_extent_refs(leaf, ei); if (refs < refs_to_drop) { - btrfs_err(info, - "trying to drop %d refs but we only have %Lu for bytenr %Lu", + btrfs_crit(info, + "trying to drop %d refs but we only have %llu for bytenr %llu", refs_to_drop, refs, bytenr); - ret = -EINVAL; - btrfs_abort_transaction(trans, ret); - goto out; + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; } refs -= refs_to_drop; @@ -3107,7 +3214,12 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, * be updated by remove_extent_backref */ if (iref) { - BUG_ON(!found_extent); + if (!found_extent) { + btrfs_crit(info, +"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found"); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } } else { btrfs_set_extent_refs(leaf, ei, refs); btrfs_mark_buffer_dirty(leaf); @@ -3122,13 +3234,39 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } } else { + /* In this branch refs == 1 */ if (found_extent) { - BUG_ON(is_data && refs_to_drop != - extent_data_ref_count(path, iref)); + if (is_data && refs_to_drop != + extent_data_ref_count(path, iref)) { + btrfs_crit(info, + "invalid refs_to_drop, current refs %u refs_to_drop %u", + extent_data_ref_count(path, iref), + refs_to_drop); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } if (iref) { - BUG_ON(path->slots[0] != extent_slot); + if (path->slots[0] != extent_slot) { + btrfs_crit(info, +"invalid iref, extent item key (%llu %u %llu) doesn't have wanted iref", + key.objectid, key.type, + key.offset); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } } else { - BUG_ON(path->slots[0] != extent_slot + 1); + /* + * No inline ref, we must be at SHARED_* item, + * And it's single ref, it must be: + * | extent_slot ||extent_slot + 1| + * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ] + */ + if (path->slots[0] != extent_slot + 1) { + 
btrfs_crit(info, + "invalid SHARED_* item, previous item is not EXTENT/METADATA_ITEM"); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } path->slots[0] = extent_slot; num_to_del = 2; } @@ -3169,6 +3307,19 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, out: btrfs_free_path(path); return ret; +err_dump: + /* + * Leaf dump can take up a lot of log buffer, so we only do full leaf + * dump for debug build. + */ + if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) { + btrfs_crit(info, "path->slots[0]=%d extent_slot=%d", + path->slots[0], extent_slot); + btrfs_print_leaf(path->nodes[0]); + } + + btrfs_free_path(path); + return -EUCLEAN; } /* @@ -3918,11 +4069,12 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, * |- Push harder to find free extents * |- If not found, re-iterate all block groups */ -static noinline int find_free_extent(struct btrfs_fs_info *fs_info, +static noinline int find_free_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 empty_size, u64 hint_byte_orig, struct btrfs_key *ins, u64 flags, int delalloc) { + struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; int cache_block_group_error = 0; struct btrfs_block_group *block_group = NULL; @@ -3954,7 +4106,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, ins->objectid = 0; ins->offset = 0; - trace_find_free_extent(fs_info, num_bytes, empty_size, flags); + trace_find_free_extent(root, num_bytes, empty_size, flags); space_info = btrfs_find_space_info(fs_info, flags); if (!space_info) { @@ -4203,7 +4355,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, flags = get_alloc_profile_by_root(root, is_data); again: WARN_ON(num_bytes < fs_info->sectorsize); - ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size, + ret = find_free_extent(root, ram_bytes, num_bytes, empty_size, hint_byte, ins, flags, delalloc); if (!ret && !is_data) { btrfs_dec_block_group_reservations(fs_info, ins->objectid); @@ -4504,7 +4656,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, static struct extent_buffer * btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, int level, u64 owner) + u64 bytenr, int level, u64 owner, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *buf; @@ -4527,7 +4680,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, } btrfs_set_buffer_lockdep_class(owner, buf, level); - btrfs_tree_lock(buf); + __btrfs_tree_lock(buf, nest); btrfs_clean_tree_block(buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); @@ -4573,7 +4726,8 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, const struct btrfs_disk_key *key, int level, u64 hint, - u64 empty_size) + u64 empty_size, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key ins; @@ -4589,7 +4743,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (btrfs_is_testing(fs_info)) { buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, - level, root_objectid); + level, root_objectid, nest); if (!IS_ERR(buf)) root->alloc_bytenr += blocksize; return buf; @@ -4606,7 +4760,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, goto out_unuse; buf = btrfs_init_new_buffer(trans, root, ins.objectid, level, - root_objectid); + root_objectid, 
nest); if (IS_ERR(buf)) { ret = PTR_ERR(buf); goto out_free_reserved; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a940edb1e64f..60f5f68d892d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -160,19 +160,20 @@ static int add_extent_changeset(struct extent_state *state, unsigned bits, return ret; } -static int __must_check submit_one_bio(struct bio *bio, int mirror_num, - unsigned long bio_flags) +int __must_check submit_one_bio(struct bio *bio, int mirror_num, + unsigned long bio_flags) { blk_status_t ret = 0; struct extent_io_tree *tree = bio->bi_private; bio->bi_private = NULL; - if (tree->ops) - ret = tree->ops->submit_bio_hook(tree->private_data, bio, - mirror_num, bio_flags); + if (is_data_inode(tree->private_data)) + ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num, + bio_flags); else - btrfsic_submit_bio(bio); + ret = btrfs_submit_metadata_bio(tree->private_data, bio, + mirror_num, bio_flags); return blk_status_to_errno(ret); } @@ -280,7 +281,6 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info, { tree->fs_info = fs_info; tree->state = RB_ROOT; - tree->ops = NULL; tree->dirty_bytes = 0; spin_lock_init(&tree->lock); tree->private_data = private_data; @@ -2819,8 +2819,6 @@ static void end_bio_extent_readpage(struct bio *bio) struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - bool data_inode = btrfs_ino(BTRFS_I(inode)) - != BTRFS_BTREE_INODE_OBJECTID; btrfs_debug(fs_info, "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", @@ -2851,9 +2849,12 @@ static void end_bio_extent_readpage(struct bio *bio) mirror = io_bio->mirror_num; if (likely(uptodate)) { - ret = tree->ops->readpage_end_io_hook(io_bio, offset, - page, start, end, - mirror); + if (is_data_inode(inode)) + ret = btrfs_verify_data_csum(io_bio, offset, page, + start, end, mirror); + else + ret = btrfs_validate_metadata_buffer(io_bio, + offset, page, start, end, mirror); if (ret) uptodate = 0; else @@ -2866,7 +2867,7 @@ static void end_bio_extent_readpage(struct bio *bio) if (likely(uptodate)) goto readpage_ok; - if (data_inode) { + if (is_data_inode(inode)) { /* * The generic bio_readpage_error handles errors the @@ -2881,7 +2882,7 @@ static void end_bio_extent_readpage(struct bio *bio) if (!btrfs_submit_read_repair(inode, bio, offset, page, start - page_offset(page), start, end, mirror, - tree->ops->submit_bio_hook)) { + btrfs_submit_data_bio)) { uptodate = !bio->bi_status; offset += len; continue; @@ -3053,7 +3054,6 @@ static int submit_extent_page(unsigned int opf, else contig = bio_end_sector(bio) == sector; - ASSERT(tree->ops); if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags)) can_merge = false; @@ -3110,8 +3110,7 @@ void set_page_extent_mapped(struct page *page) static struct extent_map * __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, - u64 start, u64 len, get_extent_t *get_extent, - struct extent_map **em_cached) + u64 start, u64 len, struct extent_map **em_cached) { struct extent_map *em; @@ -3127,7 +3126,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, *em_cached = NULL; } - em = get_extent(BTRFS_I(inode), page, pg_offset, start, len); + em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len); if (em_cached && !IS_ERR_OR_NULL(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); @@ -3142,12 +3141,9 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, * XXX JDM: This 
needs looking at to ensure proper page locking * return 0 on success, otherwise return error */ -static int __do_readpage(struct page *page, - get_extent_t *get_extent, - struct extent_map **em_cached, - struct bio **bio, int mirror_num, - unsigned long *bio_flags, unsigned int read_flags, - u64 *prev_em_start) +int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, + struct bio **bio, unsigned long *bio_flags, + unsigned int read_flags, u64 *prev_em_start) { struct inode *inode = page->mapping->host; u64 start = page_offset(page); @@ -3209,7 +3205,7 @@ static int __do_readpage(struct page *page, break; } em = __get_extent_map(inode, page, pg_offset, cur, - end - cur + 1, get_extent, em_cached); + end - cur + 1, em_cached); if (IS_ERR_OR_NULL(em)) { SetPageError(page); unlock_extent(tree, cur, end); @@ -3241,7 +3237,7 @@ static int __do_readpage(struct page *page, /* * If we have a file range that points to a compressed extent - * and it's followed by a consecutive file range that points to + * and it's followed by a consecutive file range that points * to the same compressed extent (possibly with a different * offset and/or length, so it either points to the whole extent * or only part of it), we must make sure we do not submit a @@ -3325,7 +3321,7 @@ static int __do_readpage(struct page *page, ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, page, offset, disk_io_size, pg_offset, bio, - end_bio_extent_readpage, mirror_num, + end_bio_extent_readpage, 0, *bio_flags, this_bio_flag, force_bio_submit); @@ -3362,44 +3358,12 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); for (index = 0; index < nr_pages; index++) { - __do_readpage(pages[index], btrfs_get_extent, em_cached, - bio, 0, bio_flags, REQ_RAHEAD, prev_em_start); + btrfs_do_readpage(pages[index], em_cached, bio, bio_flags, + REQ_RAHEAD, prev_em_start); put_page(pages[index]); } } -static int __extent_read_full_page(struct page *page, - get_extent_t *get_extent, - struct bio **bio, int mirror_num, - unsigned long *bio_flags, - unsigned int read_flags) -{ - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - u64 start = page_offset(page); - u64 end = start + PAGE_SIZE - 1; - int ret; - - btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - - ret = __do_readpage(page, get_extent, NULL, bio, mirror_num, - bio_flags, read_flags, NULL); - return ret; -} - -int extent_read_full_page(struct page *page, get_extent_t *get_extent, - int mirror_num) -{ - struct bio *bio = NULL; - unsigned long bio_flags = 0; - int ret; - - ret = __extent_read_full_page(page, get_extent, &bio, mirror_num, - &bio_flags, 0); - if (bio) - ret = submit_one_bio(bio, mirror_num, bio_flags); - return ret; -} - static void update_nr_written(struct writeback_control *wbc, unsigned long nr_written) { @@ -4552,7 +4516,7 @@ next: * helper function for fiemap, which doesn't want to see any holes. 
* This maps until we find something past 'last' */ -static struct extent_map *get_extent_skip_holes(struct inode *inode, +static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode, u64 offset, u64 last) { u64 sectorsize = btrfs_inode_sectorsize(inode); @@ -4567,7 +4531,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode, if (len == 0) break; len = ALIGN(len, sectorsize); - em = btrfs_get_extent_fiemap(BTRFS_I(inode), offset, len); + em = btrfs_get_extent_fiemap(inode, offset, len); if (IS_ERR_OR_NULL(em)) return em; @@ -4696,7 +4660,7 @@ static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo, return ret; } -int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, +int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { int ret = 0; @@ -4707,12 +4671,12 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 last; u64 last_for_get_extent = 0; u64 disko = 0; - u64 isize = i_size_read(inode); + u64 isize = i_size_read(&inode->vfs_inode); struct btrfs_key found_key; struct extent_map *em = NULL; struct extent_state *cached_state = NULL; struct btrfs_path *path; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct fiemap_cache cache = { 0 }; struct ulist *roots; struct ulist *tmp_ulist; @@ -4743,8 +4707,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, * lookup the last file extent. We're not using i_size here * because there might be preallocation past i_size */ - ret = btrfs_lookup_file_extent(NULL, root, path, - btrfs_ino(BTRFS_I(inode)), -1, 0); + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1, + 0); if (ret < 0) { goto out_free_ulist; } else { @@ -4758,7 +4722,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, found_type = found_key.type; /* No extents, but there might be delalloc bits */ - if (found_key.objectid != btrfs_ino(BTRFS_I(inode)) || + if (found_key.objectid != btrfs_ino(inode) || found_type != BTRFS_EXTENT_DATA_KEY) { /* have to trust i_size as the end */ last = (u64)-1; @@ -4784,7 +4748,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, last_for_get_extent = isize; } - lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, + lock_extent_bits(&inode->io_tree, start, start + len - 1, &cached_state); em = get_extent_skip_holes(inode, start, last_for_get_extent); @@ -4853,8 +4817,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, * then we're just getting a count and we can skip the * lookup stuff. 
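The extent_fiemap() rework above (taking a struct btrfs_inode and only consulting btrfs_check_shared() when extent sharing has to be reported) sits behind the FS_IOC_FIEMAP ioctl. As a point of reference, a minimal userspace consumer of that interface might look like the sketch below; the 32-extent buffer and the command-line file argument are arbitrary choices for the demo, not anything this patch series prescribes.

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	struct fiemap *fm;
	unsigned int i;
	int fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;

	/* Room for up to 32 extents in a single call. */
	fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
	if (!fm)
		return 1;
	fm->fm_start = 0;
	fm->fm_length = FIEMAP_MAX_OFFSET;	/* map the whole file */
	fm->fm_extent_count = 32;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
		perror("FS_IOC_FIEMAP");
		return 1;
	}
	for (i = 0; i < fm->fm_mapped_extents; i++)
		printf("logical %llu physical %llu len %llu%s\n",
		       (unsigned long long)fm->fm_extents[i].fe_logical,
		       (unsigned long long)fm->fm_extents[i].fe_physical,
		       (unsigned long long)fm->fm_extents[i].fe_length,
		       (fm->fm_extents[i].fe_flags & FIEMAP_EXTENT_SHARED) ?
				" (shared)" : "");
	free(fm);
	close(fd);
	return 0;
}

The FIEMAP_EXTENT_SHARED reporting in the output is the part fed by the btrfs_check_shared() call in this hunk.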
*/ - ret = btrfs_check_shared(root, - btrfs_ino(BTRFS_I(inode)), + ret = btrfs_check_shared(root, btrfs_ino(inode), bytenr, roots, tmp_ulist); if (ret < 0) goto out_free; @@ -4898,7 +4861,7 @@ out_free: ret = emit_last_fiemap_cache(fieinfo, &cache); free_extent_map(em); out: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, + unlock_extent_cached(&inode->io_tree, start, start + len - 1, &cached_state); out_free_ulist: @@ -4990,7 +4953,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, rwlock_init(&eb->lock); atomic_set(&eb->blocking_readers, 0); eb->blocking_writers = 0; - eb->lock_nested = false; + eb->lock_recursed = false; init_waitqueue_head(&eb->write_lock_wq); init_waitqueue_head(&eb->read_lock_wq); @@ -5574,20 +5537,19 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) } ClearPageError(page); - err = __extent_read_full_page(page, - btree_get_extent, &bio, - mirror_num, &bio_flags, - REQ_META); + err = submit_extent_page(REQ_OP_READ | REQ_META, NULL, + page, page_offset(page), PAGE_SIZE, 0, + &bio, end_bio_extent_readpage, + mirror_num, 0, 0, false); if (err) { - ret = err; /* - * We use &bio in above __extent_read_full_page, - * so we ensure that if it returns error, the - * current page fails to add itself to bio and - * it's been unlocked. - * - * We must dec io_pages by ourselves. + * We failed to submit the bio so it's the + * caller's responsibility to perform cleanup + * i.e unlock page/set error bit. */ + ret = err; + SetPageError(page); + unlock_page(page); atomic_dec(&eb->io_pages); } } else { @@ -5622,6 +5584,36 @@ unlock_exit: return ret; } +static bool report_eb_range(const struct extent_buffer *eb, unsigned long start, + unsigned long len) +{ + btrfs_warn(eb->fs_info, + "access to eb bytenr %llu len %lu out of range start %lu len %lu", + eb->start, eb->len, start, len); + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + + return true; +} + +/* + * Check if the [start, start + len) range is valid before reading/writing + * the eb. + * NOTE: @start and @len are offset inside the eb, not logical address. + * + * Caller should not touch the dst/src memory if this function returns error. 
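check_eb_range() leans on check_add_overflow() rather than the open-coded comparisons it replaces because start + len can wrap and sneak past a plain "start + len > eb->len" test. A small userspace illustration of the same pattern, using the compiler builtin that the kernel helper is built on (buffer length and sample values are made up for the demo):

#include <stdbool.h>
#include <stdio.h>

/* Same contract as check_eb_range(): false == range OK, true == reject. */
static bool range_out_of_bounds(unsigned long start, unsigned long len,
				unsigned long buf_len)
{
	unsigned long end;

	/* __builtin_add_overflow() is what check_add_overflow() wraps. */
	if (__builtin_add_overflow(start, len, &end))
		return true;
	return end > buf_len;
}

int main(void)
{
	unsigned long buf_len = 16384;

	/* Fine: bytes 0..4095 of a 16K buffer. */
	printf("%d\n", range_out_of_bounds(0, 4096, buf_len));
	/*
	 * A naive "start + len > buf_len" would wrap to a small value here
	 * and wrongly accept the range; the overflow check rejects it.
	 */
	printf("%d\n", range_out_of_bounds(8, (unsigned long)-4, buf_len));
	return 0;
}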
+ */ +static inline int check_eb_range(const struct extent_buffer *eb, + unsigned long start, unsigned long len) +{ + unsigned long offset; + + /* start, start + len should not go beyond eb->len nor overflow */ + if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len)) + return report_eb_range(eb, start, len); + + return false; +} + void read_extent_buffer(const struct extent_buffer *eb, void *dstv, unsigned long start, unsigned long len) { @@ -5632,12 +5624,8 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, char *dst = (char *)dstv; unsigned long i = start >> PAGE_SHIFT; - if (start + len > eb->len) { - WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n", - eb->start, eb->len, start, len); - memset(dst, 0, len); + if (check_eb_range(eb, start, len)) return; - } offset = offset_in_page(start); @@ -5702,8 +5690,8 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, unsigned long i = start >> PAGE_SHIFT; int ret = 0; - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); + if (check_eb_range(eb, start, len)) + return -EINVAL; offset = offset_in_page(start); @@ -5756,8 +5744,8 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, char *src = (char *)srcv; unsigned long i = start >> PAGE_SHIFT; - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); + if (check_eb_range(eb, start, len)) + return; offset = offset_in_page(start); @@ -5785,8 +5773,8 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, char *kaddr; unsigned long i = start >> PAGE_SHIFT; - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); + if (check_eb_range(eb, start, len)) + return; offset = offset_in_page(start); @@ -5830,6 +5818,10 @@ void copy_extent_buffer(const struct extent_buffer *dst, char *kaddr; unsigned long i = dst_offset >> PAGE_SHIFT; + if (check_eb_range(dst, dst_offset, len) || + check_eb_range(src, src_offset, len)) + return; + WARN_ON(src->len != dst_len); offset = offset_in_page(dst_offset); @@ -6019,25 +6011,15 @@ void memcpy_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - struct btrfs_fs_info *fs_info = dst->fs_info; size_t cur; size_t dst_off_in_page; size_t src_off_in_page; unsigned long dst_i; unsigned long src_i; - if (src_offset + len > dst->len) { - btrfs_err(fs_info, - "memmove bogus src_offset %lu move len %lu dst len %lu", - src_offset, len, dst->len); - BUG(); - } - if (dst_offset + len > dst->len) { - btrfs_err(fs_info, - "memmove bogus dst_offset %lu move len %lu dst len %lu", - dst_offset, len, dst->len); - BUG(); - } + if (check_eb_range(dst, dst_offset, len) || + check_eb_range(dst, src_offset, len)) + return; while (len > 0) { dst_off_in_page = offset_in_page(dst_offset); @@ -6064,7 +6046,6 @@ void memmove_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - struct btrfs_fs_info *fs_info = dst->fs_info; size_t cur; size_t dst_off_in_page; size_t src_off_in_page; @@ -6073,18 +6054,9 @@ void memmove_extent_buffer(const struct extent_buffer *dst, unsigned long dst_i; unsigned long src_i; - if (src_offset + len > dst->len) { - btrfs_err(fs_info, - "memmove bogus src_offset %lu move len %lu len %lu", - src_offset, len, dst->len); - BUG(); - } - if (dst_offset + len > dst->len) { - btrfs_err(fs_info, - "memmove bogus dst_offset %lu move len %lu len %lu", - 
dst_offset, len, dst->len); - BUG(); - } + if (check_eb_range(dst, dst_offset, len) || + check_eb_range(dst, src_offset, len)) + return; if (dst_offset < src_offset) { memcpy_extent_buffer(dst, dst_offset, src_offset, len); return; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 30794ae58498..f39d02e7f7ef 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -74,18 +74,6 @@ typedef blk_status_t (submit_bio_hook_t)(struct inode *inode, struct bio *bio, typedef blk_status_t (extent_submit_bio_start_t)(void *private_data, struct bio *bio, u64 bio_offset); -struct extent_io_ops { - /* - * The following callbacks must be always defined, the function - * pointer will be called unconditionally. - */ - submit_bio_hook_t *submit_bio_hook; - int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset, - struct page *page, u64 start, u64 end, - int mirror); -}; - - #define INLINE_EXTENT_BUFFER_PAGES 16 #define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_SIZE) struct extent_buffer { @@ -102,7 +90,7 @@ struct extent_buffer { int blocking_writers; atomic_t blocking_readers; - bool lock_nested; + bool lock_recursed; /* >= 0 if eb belongs to a log tree, -1 otherwise */ short log_index; @@ -193,8 +181,11 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); -int extent_read_full_page(struct page *page, get_extent_t *get_extent, - int mirror_num); +int __must_check submit_one_bio(struct bio *bio, int mirror_num, + unsigned long bio_flags); +int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, + struct bio **bio, unsigned long *bio_flags, + unsigned int read_flags, u64 *prev_em_start); int extent_write_full_page(struct page *page, struct writeback_control *wbc); int extent_write_locked_range(struct inode *inode, u64 start, u64 end, int mode); @@ -203,7 +194,7 @@ int extent_writepages(struct address_space *mapping, int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc); void extent_readahead(struct readahead_control *rac); -int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, +int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); void set_page_extent_mapped(struct page *page); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 7d5ec71615b8..8f4f2bd6d9b9 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -318,8 +318,8 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, if (page_offsets) offset = page_offset(bvec.bv_page) + bvec.bv_offset; - count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, - csum, nblocks); + count = btrfs_find_ordered_sum(BTRFS_I(inode), offset, + disk_bytenr, csum, nblocks); if (count) goto found; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4507c3d09399..0ff659455b1e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1057,11 +1057,7 @@ delete_extent_item: if (btrfs_comp_cpu_keys(&key, &slot_key) > 0) path->slots[0]++; } - setup_items_for_insert(root, path, &key, - &extent_item_size, - extent_item_size, - sizeof(struct btrfs_item) + - extent_item_size, 1); + setup_items_for_insert(root, path, &key, &extent_item_size, 1); *key_inserted = 1; } @@ -1477,9 +1473,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, int ret = 0; start_pos = round_down(pos, fs_info->sectorsize); - last_pos = 
start_pos - + round_up(pos + write_bytes - start_pos, - fs_info->sectorsize) - 1; + last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1; if (start_pos < inode->vfs_inode.i_size) { struct btrfs_ordered_extent *ordered; @@ -1497,8 +1491,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, unlock_page(pages[i]); put_page(pages[i]); } - btrfs_start_ordered_extent(&inode->vfs_inode, - ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); return -EAGAIN; } @@ -1872,7 +1865,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) loff_t endbyte; int err; - written = generic_file_direct_write(iocb, from); + written = btrfs_direct_IO(iocb, from); if (written < 0 || !iov_iter_count(from)) return written; @@ -2025,7 +2018,40 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, atomic_inc(&BTRFS_I(inode)->sync_writers); if (iocb->ki_flags & IOCB_DIRECT) { + /* + * 1. We must always clear IOCB_DSYNC in order to not deadlock + * in iomap, as it calls generic_write_sync() in this case. + * 2. If we are async, we can call iomap_dio_complete() either + * in + * + * 2.1. A worker thread from the last bio completed. In this + * case we need to mark the btrfs_dio_data that it is + * async in order to call generic_write_sync() properly. + * This is handled by setting BTRFS_DIO_SYNC_STUB in the + * current->journal_info. + * 2.2 The submitter context, because all IO completed + * before we exited iomap_dio_rw(). In this case we can + * just re-set the IOCB_DSYNC on the iocb and we'll do + * the sync below. If our ->end_io() gets called and + * current->journal_info is set, then we know we're in + * our current context and we will clear + * current->journal_info to indicate that we need to + * sync below. + */ + if (sync) { + ASSERT(current->journal_info == NULL); + iocb->ki_flags &= ~IOCB_DSYNC; + current->journal_info = BTRFS_DIO_SYNC_STUB; + } num_written = __btrfs_direct_write(iocb, from); + + /* + * As stated above, we cleared journal_info, so we need to do + * the sync ourselves. + */ + if (sync && current->journal_info == NULL) + iocb->ki_flags |= IOCB_DSYNC; + current->journal_info = NULL; } else { num_written = btrfs_buffered_write(iocb, from); if (num_written > 0) @@ -2065,12 +2091,12 @@ int btrfs_release_file(struct inode *inode, struct file *filp) filp->private_data = NULL; /* - * ordered_data_close is set by setattr when we are about to truncate - * a file from a non-zero size to a zero size. This tries to - * flush down new bytes that may have been written if the - * application were using truncate to replace a file in place. + * Set by setattr when we are about to truncate a file from a non-zero + * size to a zero size. This tries to flush down new bytes that may + * have been written if the application were using truncate to replace + * a file in place. */ - if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, + if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE, &BTRFS_I(inode)->runtime_flags)) filemap_flush(inode->i_mapping); return 0; @@ -2116,20 +2142,24 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct btrfs_trans_handle *trans; struct btrfs_log_ctx ctx; int ret = 0, err; + u64 len; + bool full_sync; trace_btrfs_sync_file(file, datasync); btrfs_init_log_ctx(&ctx, inode); /* - * Set the range to full if the NO_HOLES feature is not enabled. - * This is to avoid missing file extent items representing holes after - * replaying the log. 
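On the last_pos change in lock_and_cleanup_extent_if_need() at the top of this hunk: start_pos is round_down(pos, sectorsize) and therefore sector aligned, so adding it inside or outside the round_up() gives the same value, which is why the shorter expression is equivalent. A throwaway userspace check of that identity, assuming a power-of-two sector size of 4096 purely for illustration:

#include <assert.h>
#include <stdio.h>

#define SZ 4096UL	/* stand-in for fs_info->sectorsize */

static unsigned long round_down_sz(unsigned long x) { return x & ~(SZ - 1); }
static unsigned long round_up_sz(unsigned long x) { return (x + SZ - 1) & ~(SZ - 1); }

int main(void)
{
	unsigned long pos, write_bytes;

	for (pos = 0; pos < 3 * SZ; pos += 511) {
		for (write_bytes = 1; write_bytes < 3 * SZ; write_bytes += 777) {
			unsigned long start_pos = round_down_sz(pos);
			unsigned long old_last = start_pos +
				round_up_sz(pos + write_bytes - start_pos) - 1;
			unsigned long new_last = round_up_sz(pos + write_bytes) - 1;

			/* Holds because start_pos is a multiple of SZ. */
			assert(old_last == new_last);
		}
	}
	printf("old and new last_pos expressions agree\n");
	return 0;
}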
+ * Always set the range to a full range, otherwise we can get into + * several problems, from missing file extent items to represent holes + * when not using the NO_HOLES feature, to log tree corruption due to + * races between hole detection during logging and completion of ordered + * extents outside the range, to missing checksums due to ordered extents + * for which we flushed only a subset of their pages. */ - if (!btrfs_fs_incompat(fs_info, NO_HOLES)) { - start = 0; - end = LLONG_MAX; - } + start = 0; + end = LLONG_MAX; + len = (u64)LLONG_MAX + 1; /* * We write the dirty pages in the range and wait until they complete @@ -2153,19 +2183,12 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) atomic_inc(&root->log_batch); /* - * If the inode needs a full sync, make sure we use a full range to - * avoid log tree corruption, due to hole detection racing with ordered - * extent completion for adjacent ranges and races between logging and - * completion of ordered extents for adjancent ranges - both races - * could lead to file extent items in the log with overlapping ranges. - * Do this while holding the inode lock, to avoid races with other - * tasks. + * Always check for the full sync flag while holding the inode's lock, + * to avoid races with other tasks. The flag must be either set all the + * time during logging or always off all the time while logging. */ - if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags)) { - start = 0; - end = LLONG_MAX; - } + full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); /* * Before we acquired the inode's lock, someone may have dirtied more @@ -2196,20 +2219,42 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * We have to do this here to avoid the priority inversion of waiting on * IO of a lower priority task while holding a transaction open. * - * Also, the range length can be represented by u64, we have to do the - * typecasts to avoid signed overflow if it's [0, LLONG_MAX]. + * For a full fsync we wait for the ordered extents to complete while + * for a fast fsync we wait just for writeback to complete, and then + * attach the ordered extents to the transaction so that a transaction + * commit waits for their completion, to avoid data loss if we fsync, + * the current transaction commits before the ordered extents complete + * and a power failure happens right after that. */ - ret = btrfs_wait_ordered_range(inode, start, (u64)end - (u64)start + 1); - if (ret) { - up_write(&BTRFS_I(inode)->dio_sem); - inode_unlock(inode); - goto out; + if (full_sync) { + ret = btrfs_wait_ordered_range(inode, start, len); + } else { + /* + * Get our ordered extents as soon as possible to avoid doing + * checksum lookups in the csum tree, and use instead the + * checksums attached to the ordered extents. + */ + btrfs_get_ordered_extents_for_logging(BTRFS_I(inode), + &ctx.ordered_extents); + ret = filemap_fdatawait_range(inode->i_mapping, start, end); } + + if (ret) + goto out_release_extents; + atomic_inc(&root->log_batch); + /* + * If we are doing a fast fsync we can not bail out if the inode's + * last_trans is <= then the last committed transaction, because we only + * update the last_trans of the inode during ordered extent completion, + * and for a fast fsync we don't wait for that, we only wait for the + * writeback to complete. 
+ */ smp_mb(); if (btrfs_inode_in_log(BTRFS_I(inode), fs_info->generation) || - BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed) { + (BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed && + (full_sync || list_empty(&ctx.ordered_extents)))) { /* * We've had everything committed since the last time we were * modified so clear this flag in case it was set for whatever @@ -2225,9 +2270,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * checked called fsync. */ ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err); - up_write(&BTRFS_I(inode)->dio_sem); - inode_unlock(inode); - goto out; + goto out_release_extents; } /* @@ -2244,12 +2287,11 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - up_write(&BTRFS_I(inode)->dio_sem); - inode_unlock(inode); - goto out; + goto out_release_extents; } - ret = btrfs_log_dentry_safe(trans, dentry, start, end, &ctx); + ret = btrfs_log_dentry_safe(trans, dentry, &ctx); + btrfs_release_log_ctx_extents(&ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. */ ret = 1; @@ -2276,6 +2318,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out; } } + if (!full_sync) { + ret = btrfs_wait_ordered_range(inode, start, len); + if (ret) { + btrfs_end_transaction(trans); + goto out; + } + } ret = btrfs_commit_transaction(trans); } else { ret = btrfs_end_transaction(trans); @@ -2286,6 +2335,12 @@ out: if (!ret) ret = err; return ret > 0 ? -EIO : ret; + +out_release_extents: + btrfs_release_log_ctx_extents(&ctx); + up_write(&BTRFS_I(inode)->dio_sem); + inode_unlock(inode); + goto out; } static const struct vm_operations_struct btrfs_file_vm_ops = { @@ -2481,7 +2536,8 @@ static int btrfs_punch_hole_lock_range(struct inode *inode, lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, cached_state); - ordered = btrfs_lookup_first_ordered_extent(inode, lockend); + ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), + lockend); /* * We need to make sure we have no ordered extents in this range @@ -2509,11 +2565,11 @@ static int btrfs_punch_hole_lock_range(struct inode *inode, return 0; } -static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, +static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_path *path, - struct btrfs_clone_extent_info *clone_info, - const u64 clone_len) + struct btrfs_replace_extent_info *extent_info, + const u64 replace_len) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; @@ -2522,51 +2578,69 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, struct btrfs_key key; int slot; struct btrfs_ref ref = { 0 }; - u64 ref_offset; int ret; - if (clone_len == 0) + if (replace_len == 0) return 0; - if (clone_info->disk_offset == 0 && + if (extent_info->disk_offset == 0 && btrfs_fs_incompat(fs_info, NO_HOLES)) return 0; key.objectid = btrfs_ino(BTRFS_I(inode)); key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = clone_info->file_offset; + key.offset = extent_info->file_offset; ret = btrfs_insert_empty_item(trans, root, path, &key, - clone_info->item_size); + sizeof(struct btrfs_file_extent_item)); if (ret) return ret; leaf = path->nodes[0]; slot = path->slots[0]; - write_extent_buffer(leaf, clone_info->extent_buf, + write_extent_buffer(leaf, extent_info->extent_buf, 
btrfs_item_ptr_offset(leaf, slot), - clone_info->item_size); + sizeof(struct btrfs_file_extent_item)); extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - btrfs_set_file_extent_offset(leaf, extent, clone_info->data_offset); - btrfs_set_file_extent_num_bytes(leaf, extent, clone_len); + ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE); + btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset); + btrfs_set_file_extent_num_bytes(leaf, extent, replace_len); + if (extent_info->is_new_extent) + btrfs_set_file_extent_generation(leaf, extent, trans->transid); btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), - clone_info->file_offset, clone_len); + extent_info->file_offset, replace_len); if (ret) return ret; /* If it's a hole, nothing more needs to be done. */ - if (clone_info->disk_offset == 0) + if (extent_info->disk_offset == 0) return 0; - inode_add_bytes(inode, clone_len); - btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, - clone_info->disk_offset, - clone_info->disk_len, 0); - ref_offset = clone_info->file_offset - clone_info->data_offset; - btrfs_init_data_ref(&ref, root->root_key.objectid, - btrfs_ino(BTRFS_I(inode)), ref_offset); - ret = btrfs_inc_extent_ref(trans, &ref); + inode_add_bytes(inode, replace_len); + + if (extent_info->is_new_extent && extent_info->insertions == 0) { + key.objectid = extent_info->disk_offset; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = extent_info->disk_len; + ret = btrfs_alloc_reserved_file_extent(trans, root, + btrfs_ino(BTRFS_I(inode)), + extent_info->file_offset, + extent_info->qgroup_reserved, + &key); + } else { + u64 ref_offset; + + btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, + extent_info->disk_offset, + extent_info->disk_len, 0); + ref_offset = extent_info->file_offset - extent_info->data_offset; + btrfs_init_data_ref(&ref, root->root_key.objectid, + btrfs_ino(BTRFS_I(inode)), ref_offset); + ret = btrfs_inc_extent_ref(trans, &ref); + } + + extent_info->insertions++; return ret; } @@ -2574,15 +2648,15 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, /* * The respective range must have been previously locked, as well as the inode. * The end offset is inclusive (last byte of the range). - * @clone_info is NULL for fallocate's hole punching and non-NULL for extent - * cloning. - * When cloning, we don't want to end up in a state where we dropped extents - * without inserting a new one, so we must abort the transaction to avoid a - * corruption. + * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing + * the file range with an extent. + * When not punching a hole, we don't want to end up in a state where we dropped + * extents without inserting a new one, so we must abort the transaction to avoid + * a corruption. 
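For orientation: when @extent_info is NULL this function is driven by hole punching, i.e. the fallocate() path handled by btrfs_punch_hole() further down, which passes NULL here. A minimal userspace trigger for that case could look like the following (file name and sizes are arbitrary):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR | O_CREAT, 0644);

	if (fd < 0)
		return 1;
	if (ftruncate(fd, 1 << 20))		/* 1 MiB file */
		return 1;
	/*
	 * Punch a 64K hole at offset 128K; KEEP_SIZE is required together
	 * with PUNCH_HOLE. On btrfs this lands in btrfs_punch_hole() and
	 * from there in btrfs_replace_file_extents() with a NULL
	 * extent_info.
	 */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      128 << 10, 64 << 10))
		perror("fallocate");
	close(fd);
	return 0;
}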
*/ -int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, +int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, const u64 start, const u64 end, - struct btrfs_clone_extent_info *clone_info, + struct btrfs_replace_extent_info *extent_info, struct btrfs_trans_handle **trans_out) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2611,10 +2685,10 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, /* * 1 - update the inode * 1 - removing the extents in the range - * 1 - adding the hole extent if no_holes isn't set or if we are cloning - * an extent + * 1 - adding the hole extent if no_holes isn't set or if we are + * replacing the range with a new extent */ - if (!btrfs_fs_incompat(fs_info, NO_HOLES) || clone_info) + if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info) rsv_count = 3; else rsv_count = 2; @@ -2644,14 +2718,15 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, * returned by __btrfs_drop_extents() without having * changed anything in the file. */ - if (clone_info && ret && ret != -EOPNOTSUPP) + if (extent_info && !extent_info->is_new_extent && + ret && ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, ret); break; } trans->block_rsv = &fs_info->trans_block_rsv; - if (!clone_info && cur_offset < drop_end && + if (!extent_info && cur_offset < drop_end && cur_offset < ino_size) { ret = fill_holes(trans, BTRFS_I(inode), path, cur_offset, drop_end); @@ -2665,7 +2740,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, btrfs_abort_transaction(trans, ret); break; } - } else if (!clone_info && cur_offset < drop_end) { + } else if (!extent_info && cur_offset < drop_end) { /* * We are past the i_size here, but since we didn't * insert holes we need to clear the mapped area so we @@ -2685,18 +2760,18 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, } } - if (clone_info && drop_end > clone_info->file_offset) { - u64 clone_len = drop_end - clone_info->file_offset; + if (extent_info && drop_end > extent_info->file_offset) { + u64 replace_len = drop_end - extent_info->file_offset; - ret = btrfs_insert_clone_extent(trans, inode, path, - clone_info, clone_len); + ret = btrfs_insert_replace_extent(trans, inode, path, + extent_info, replace_len); if (ret) { btrfs_abort_transaction(trans, ret); break; } - clone_info->data_len -= clone_len; - clone_info->data_offset += clone_len; - clone_info->file_offset += clone_len; + extent_info->data_len -= replace_len; + extent_info->data_offset += replace_len; + extent_info->file_offset += replace_len; } cur_offset = drop_end; @@ -2720,7 +2795,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, BUG_ON(ret); /* shouldn't happen */ trans->block_rsv = rsv; - if (!clone_info) { + if (!extent_info) { ret = find_first_non_hole(inode, &cur_offset, &len); if (unlikely(ret < 0)) break; @@ -2739,7 +2814,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, * than 16Mb would force the full fsync any way (when * try_release_extent_mapping() is invoked during page cache truncation. */ - if (clone_info) + if (extent_info && !extent_info->is_new_extent) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); @@ -2765,7 +2840,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, * (because it's useless) or if it represents a 0 bytes range (when * cur_offset == drop_end). 
*/ - if (!clone_info && cur_offset < ino_size && cur_offset < drop_end) { + if (!extent_info && cur_offset < ino_size && cur_offset < drop_end) { ret = fill_holes(trans, BTRFS_I(inode), path, cur_offset, drop_end); if (ret) { @@ -2773,7 +2848,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, btrfs_abort_transaction(trans, ret); goto out_trans; } - } else if (!clone_info && cur_offset < drop_end) { + } else if (!extent_info && cur_offset < drop_end) { /* See the comment in the loop above for the reasoning here. */ ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode), cur_offset, drop_end - cur_offset); @@ -2783,9 +2858,9 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, } } - if (clone_info) { - ret = btrfs_insert_clone_extent(trans, inode, path, clone_info, - clone_info->data_len); + if (extent_info) { + ret = btrfs_insert_replace_extent(trans, inode, path, extent_info, + extent_info->data_len); if (ret) { btrfs_abort_transaction(trans, ret); goto out_trans; @@ -2840,9 +2915,9 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out_only_mutex; } - lockstart = round_up(offset, btrfs_inode_sectorsize(inode)); + lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode))); lockend = round_down(offset + len, - btrfs_inode_sectorsize(inode)) - 1; + btrfs_inode_sectorsize(BTRFS_I(inode))) - 1; same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset)) == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)); /* @@ -2927,7 +3002,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out; } - ret = btrfs_punch_hole_range(inode, path, lockstart, lockend, NULL, + ret = btrfs_replace_file_extents(inode, path, lockstart, lockend, NULL, &trans); btrfs_free_path(path); if (ret) @@ -3044,7 +3119,7 @@ enum { RANGE_BOUNDARY_HOLE, }; -static int btrfs_zero_range_check_range_boundary(struct inode *inode, +static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode, u64 offset) { const u64 sectorsize = btrfs_inode_sectorsize(inode); @@ -3052,7 +3127,7 @@ static int btrfs_zero_range_check_range_boundary(struct inode *inode, int ret; offset = round_down(offset, sectorsize); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize); if (IS_ERR(em)) return PTR_ERR(em); @@ -3077,7 +3152,7 @@ static int btrfs_zero_range(struct inode *inode, struct extent_changeset *data_reserved = NULL; int ret; u64 alloc_hint = 0; - const u64 sectorsize = btrfs_inode_sectorsize(inode); + const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode)); u64 alloc_start = round_down(offset, sectorsize); u64 alloc_end = round_up(offset + len, sectorsize); u64 bytes_to_reserve = 0; @@ -3167,7 +3242,8 @@ static int btrfs_zero_range(struct inode *inode, * to cover them. 
*/ if (!IS_ALIGNED(offset, sectorsize)) { - ret = btrfs_zero_range_check_range_boundary(inode, offset); + ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode), + offset); if (ret < 0) goto out; if (ret == RANGE_BOUNDARY_HOLE) { @@ -3183,7 +3259,7 @@ static int btrfs_zero_range(struct inode *inode, } if (!IS_ALIGNED(offset + len, sectorsize)) { - ret = btrfs_zero_range_check_range_boundary(inode, + ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode), offset + len); if (ret < 0) goto out; @@ -3258,7 +3334,7 @@ static long btrfs_fallocate(struct file *file, int mode, u64 locked_end; u64 actual_end = 0; struct extent_map *em; - int blocksize = btrfs_inode_sectorsize(inode); + int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode)); int ret; alloc_start = round_down(offset, blocksize); @@ -3340,7 +3416,8 @@ static long btrfs_fallocate(struct file *file, int mode, */ lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state); - ordered = btrfs_lookup_first_ordered_extent(inode, locked_end); + ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), + locked_end); if (ordered && ordered->file_offset + ordered->num_bytes > alloc_start && @@ -3541,9 +3618,26 @@ static int btrfs_file_open(struct inode *inode, struct file *filp) return generic_file_open(inode, filp); } +static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + ssize_t ret = 0; + + if (iocb->ki_flags & IOCB_DIRECT) { + struct inode *inode = file_inode(iocb->ki_filp); + + inode_lock_shared(inode); + ret = btrfs_direct_IO(iocb, to); + inode_unlock_shared(inode); + if (ret < 0) + return ret; + } + + return generic_file_buffered_read(iocb, to, ret); +} + const struct file_operations btrfs_file_operations = { .llseek = btrfs_file_llseek, - .read_iter = generic_file_read_iter, + .read_iter = btrfs_file_read_iter, .splice_read = generic_file_splice_read, .write_iter = btrfs_file_write_iter, .splice_write = iter_file_splice_write, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index dc82fd0c80cb..af0013d3df63 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -413,8 +413,6 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation) { - __le64 *val; - io_ctl_map_page(io_ctl, 1); /* @@ -429,14 +427,13 @@ static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation) io_ctl->size -= sizeof(u64) * 2; } - val = io_ctl->cur; - *val = cpu_to_le64(generation); + put_unaligned_le64(generation, io_ctl->cur); io_ctl->cur += sizeof(u64); } static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) { - __le64 *gen; + u64 cache_gen; /* * Skip the crc area. 
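The free-space-cache changes above switch from dereferencing __le64 pointers to put_unaligned_le64()/get_unaligned_le64(), since the cursor into the cache page is not guaranteed to be naturally aligned for a u64. In userspace terms the accessors amount to something like the byte-wise, endian-explicit helpers below (hand-rolled here only for illustration):

#include <stdint.h>
#include <stdio.h>

/* Same contract as the kernel helpers: little endian, any alignment. */
static void put_le64(uint64_t val, void *p)
{
	uint8_t *b = p;
	int i;

	for (i = 0; i < 8; i++)
		b[i] = (uint8_t)(val >> (8 * i));
}

static uint64_t get_le64(const void *p)
{
	const uint8_t *b = p;
	uint64_t val = 0;
	int i;

	for (i = 0; i < 8; i++)
		val |= (uint64_t)b[i] << (8 * i);
	return val;
}

int main(void)
{
	uint8_t page[32];
	/* Deliberately misaligned cursor, as io_ctl->cur can be. */
	uint8_t *cur = page + 3;

	put_le64(0x1122334455667788ULL, cur);
	printf("generation read back: %#llx\n",
	       (unsigned long long)get_le64(cur));
	return 0;
}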
If we don't check crcs then we just have a 64bit @@ -451,11 +448,11 @@ static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) io_ctl->size -= sizeof(u64) * 2; } - gen = io_ctl->cur; - if (le64_to_cpu(*gen) != generation) { + cache_gen = get_unaligned_le64(io_ctl->cur); + if (cache_gen != generation) { btrfs_err_rl(io_ctl->fs_info, "space cache generation (%llu) does not match inode (%llu)", - *gen, generation); + cache_gen, generation); io_ctl_unmap_page(io_ctl); return -EIO; } @@ -525,8 +522,8 @@ static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes, return -ENOSPC; entry = io_ctl->cur; - entry->offset = cpu_to_le64(offset); - entry->bytes = cpu_to_le64(bytes); + put_unaligned_le64(offset, &entry->offset); + put_unaligned_le64(bytes, &entry->bytes); entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP : BTRFS_FREE_SPACE_EXTENT; io_ctl->cur += sizeof(struct btrfs_free_space_entry); @@ -599,8 +596,8 @@ static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl, } e = io_ctl->cur; - entry->offset = le64_to_cpu(e->offset); - entry->bytes = le64_to_cpu(e->bytes); + entry->offset = get_unaligned_le64(&e->offset); + entry->bytes = get_unaligned_le64(&e->bytes); *type = e->type; io_ctl->cur += sizeof(struct btrfs_free_space_entry); io_ctl->size -= sizeof(struct btrfs_free_space_entry); @@ -1353,7 +1350,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, /* * at this point the pages are under IO and we're happy, - * The caller is responsible for waiting on them and updating the + * The caller is responsible for waiting on them and updating * the cache and the inode */ io_ctl->entries = entries; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9570458aa847..936c3137c646 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6,7 +6,6 @@ #include <crypto/hash.h> #include <linux/kernel.h> #include <linux/bio.h> -#include <linux/buffer_head.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/pagemap.h> @@ -31,6 +30,7 @@ #include <linux/swap.h> #include <linux/migrate.h> #include <linux/sched/mm.h> +#include <linux/iomap.h> #include <asm/unaligned.h> #include "misc.h" #include "ctree.h" @@ -59,9 +59,10 @@ struct btrfs_iget_args { struct btrfs_dio_data { u64 reserve; - u64 unsubmitted_oe_range_start; - u64 unsubmitted_oe_range_end; - int overwrite; + loff_t length; + ssize_t submitted; + struct extent_changeset *data_reserved; + bool sync; }; static const struct inode_operations btrfs_dir_inode_operations; @@ -70,7 +71,6 @@ static const struct inode_operations btrfs_special_inode_operations; static const struct inode_operations btrfs_file_inode_operations; static const struct address_space_operations btrfs_aops; static const struct file_operations btrfs_dir_file_operations; -static const struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; struct kmem_cache *btrfs_trans_handle_cachep; @@ -140,13 +140,6 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, static int btrfs_dirty_inode(struct inode *inode); -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -void btrfs_test_inode_set_ops(struct inode *inode) -{ - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; -} -#endif - static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, const struct qstr *qstr) @@ -2183,9 +2176,8 @@ static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio, * * c-3) otherwise: async submit */ -static 
blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, - int mirror_num, - unsigned long bio_flags) +blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2245,16 +2237,15 @@ out: * given a list of ordered sums record them in the inode. This happens * at IO completion time based on sums calculated at bio submission time. */ -static noinline int add_pending_csums(struct btrfs_trans_handle *trans, - struct inode *inode, struct list_head *list) +static int add_pending_csums(struct btrfs_trans_handle *trans, + struct list_head *list) { struct btrfs_ordered_sum *sum; int ret; list_for_each_entry(sum, list, list) { trans->adding_csums = true; - ret = btrfs_csum_file_blocks(trans, - BTRFS_I(inode)->root->fs_info->csum_root, sum); + ret = btrfs_csum_file_blocks(trans, trans->fs_info->csum_root, sum); trans->adding_csums = false; if (ret) return ret; @@ -2357,7 +2348,7 @@ again: unlock_extent_cached(&inode->io_tree, page_start, page_end, &cached_state); unlock_page(page); - btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); goto again; } @@ -2548,7 +2539,6 @@ static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info, } static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, - struct inode *inode, struct btrfs_ordered_extent *oe) { struct btrfs_file_extent_item stack_fi; @@ -2568,8 +2558,9 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); /* Encryption and other encoding is reserved and all 0 */ - return insert_reserved_file_extent(trans, BTRFS_I(inode), oe->file_offset, - &stack_fi, oe->qgroup_rsv); + return insert_reserved_file_extent(trans, BTRFS_I(oe->inode), + oe->file_offset, &stack_fi, + oe->qgroup_rsv); } /* @@ -2666,8 +2657,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) logical_len); } else { BUG_ON(root == fs_info->tree_root); - ret = insert_ordered_extent_file_extent(trans, inode, - ordered_extent); + ret = insert_ordered_extent_file_extent(trans, ordered_extent); if (!ret) { clear_reserved_extent = false; btrfs_release_delalloc_bytes(fs_info, @@ -2683,7 +2673,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - ret = add_pending_csums(trans, inode, &ordered_extent->list); + ret = add_pending_csums(trans, &ordered_extent->list); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -2752,7 +2742,7 @@ out: * This needs to be done to make sure anybody waiting knows we are done * updating everything for this ordered extent. 
*/ - btrfs_remove_ordered_extent(inode, ordered_extent); + btrfs_remove_ordered_extent(BTRFS_I(inode), ordered_extent); /* once for us */ btrfs_put_ordered_extent(ordered_extent); @@ -2772,8 +2762,8 @@ static void finish_ordered_fn(struct btrfs_work *work) void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, u64 end, int uptodate) { - struct inode *inode = page->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_ordered_extent *ordered_extent = NULL; struct btrfs_workqueue *wq; @@ -2784,7 +2774,7 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, end - start + 1, uptodate)) return; - if (btrfs_is_free_space_inode(BTRFS_I(inode))) + if (btrfs_is_free_space_inode(inode)) wq = fs_info->endio_freespace_worker; else wq = fs_info->endio_write_workers; @@ -2833,9 +2823,8 @@ zeroit: * if there's a match, we allow the bio to finish. If not, the code in * extent_io.c will try to find good copies for us. */ -static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, - u64 phy_offset, struct page *page, - u64 start, u64 end, int mirror) +int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, int mirror) { size_t offset = start - page_offset(page); struct inode *inode = page->mapping->host; @@ -3055,7 +3044,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) if (ret == -ENOENT && root == fs_info->tree_root) { struct btrfs_root *dead_root; - struct btrfs_fs_info *fs_info = root->fs_info; int is_dead_root = 0; /* @@ -3395,7 +3383,6 @@ cache_acl: switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_mapping->a_ops = &btrfs_aops; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; break; @@ -4051,7 +4038,7 @@ out_end_trans: err = ret; inode->i_flags |= S_DEAD; out_release: - btrfs_subvolume_release_metadata(fs_info, &block_rsv); + btrfs_subvolume_release_metadata(root, &block_rsv); out_up_write: up_write(&fs_info->subvol_sem); if (err) { @@ -4583,7 +4570,7 @@ again: &cached_state); unlock_page(page); put_page(page); - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); goto again; } @@ -4848,19 +4835,16 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) /* * We're truncating a file that used to have good data down to - * zero. Make sure it gets into the ordered flush list so that - * any new writes get down to disk quickly. + * zero. Make sure any new writes to the file get on disk + * on close. 
*/ if (newsize == 0) - set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, + set_bit(BTRFS_INODE_FLUSH_ON_CLOSE, &BTRFS_I(inode)->runtime_flags); truncate_setsize(inode, newsize); - /* Disable nonlocked read DIO to avoid the endless truncate */ - btrfs_inode_block_unlocked_dio(BTRFS_I(inode)); inode_dio_wait(inode); - btrfs_inode_resume_unlocked_dio(BTRFS_I(inode)); ret = btrfs_truncate(inode, newsize == oldsize); if (ret && inode->i_nlink) { @@ -5305,15 +5289,15 @@ static void inode_tree_add(struct inode *inode) spin_unlock(&root->inode_lock); } -static void inode_tree_del(struct inode *inode) +static void inode_tree_del(struct btrfs_inode *inode) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; int empty = 0; spin_lock(&root->inode_lock); - if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { - rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); - RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); + if (!RB_EMPTY_NODE(&inode->rb_node)) { + rb_erase(&inode->rb_node, &root->inode_tree); + RB_CLEAR_NODE(&inode->rb_node); empty = RB_EMPTY_ROOT(&root->inode_tree); } spin_unlock(&root->inode_lock); @@ -6311,7 +6295,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, if (err) goto out_unlock; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; d_instantiate_new(dentry, inode); out_unlock: @@ -6374,7 +6357,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, drop_inode = 1; } else { struct dentry *parent = dentry->d_parent; - int ret; err = btrfs_update_inode(trans, root, inode); if (err) @@ -6389,12 +6371,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, goto fail; } d_instantiate(dentry, inode); - ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent, - true, NULL); - if (ret == BTRFS_NEED_TRANS_COMMIT) { - err = btrfs_commit_transaction(trans); - trans = NULL; - } + btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent); } fail: @@ -6540,8 +6517,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, u64 start, u64 len) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - int ret; - int err = 0; + int ret = 0; u64 extent_start = 0; u64 extent_end = 0; u64 objectid = btrfs_ino(inode); @@ -6569,7 +6545,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, } em = alloc_extent_map(); if (!em) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } em->start = EXTENT_MAP_HOLE; @@ -6579,7 +6555,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, path = btrfs_alloc_path(); if (!path) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } @@ -6592,14 +6568,16 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, */ path->leave_spinning = 1; + path->recurse = btrfs_is_free_space_inode(inode); + ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0); if (ret < 0) { - err = ret; goto out; } else if (ret > 0) { if (path->slots[0] == 0) goto not_found; path->slots[0]--; + ret = 0; } leaf = path->nodes[0]; @@ -6625,7 +6603,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, extent_type == BTRFS_FILE_EXTENT_PREALLOC) { /* Only regular file could have regular/prealloc extent */ if (!S_ISREG(inode->vfs_inode.i_mode)) { - err = -EUCLEAN; + ret = -EUCLEAN; btrfs_crit(fs_info, "regular/prealloc extent found for non-regular inode %llu", btrfs_ino(inode)); @@ -6643,12 +6621,11 @@ next: path->slots[0]++; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); - if (ret < 0) { - err = ret; + if (ret < 0) goto out; - } else 
if (ret > 0) { + else if (ret > 0) goto not_found; - } + leaf = path->nodes[0]; } btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); @@ -6699,10 +6676,8 @@ next: BTRFS_COMPRESS_NONE) { ret = uncompress_inline(path, page, pg_offset, extent_offset, item); - if (ret) { - err = ret; + if (ret) goto out; - } } else { map = kmap(page); read_extent_buffer(leaf, map + pg_offset, ptr, @@ -6726,29 +6701,28 @@ not_found: em->len = len; em->block_start = EXTENT_MAP_HOLE; insert: + ret = 0; btrfs_release_path(path); if (em->start > start || extent_map_end(em) <= start) { btrfs_err(fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]", em->start, em->len, start, len); - err = -EIO; + ret = -EIO; goto out; } - err = 0; write_lock(&em_tree->lock); - err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); write_unlock(&em_tree->lock); out: btrfs_free_path(path); trace_btrfs_get_extent(root, inode, em); - if (err) { + if (ret) { free_extent_map(em); - return ERR_PTR(err); + return ERR_PTR(ret); } - BUG_ON(!em); /* Error is always set */ return em; } @@ -7111,7 +7085,7 @@ out: } static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, - struct extent_state **cached_state, int writing) + struct extent_state **cached_state, bool writing) { struct btrfs_ordered_extent *ordered; int ret = 0; @@ -7160,7 +7134,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, */ if (writing || test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); else ret = -ENOTBLK; btrfs_put_ordered_extent(ordered); @@ -7249,30 +7223,7 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, } -static int btrfs_get_blocks_direct_read(struct extent_map *em, - struct buffer_head *bh_result, - struct inode *inode, - u64 start, u64 len) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - - if (em->block_start == EXTENT_MAP_HOLE || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - return -ENOENT; - - len = min(len, em->len - (start - em->start)); - - bh_result->b_blocknr = (em->block_start + (start - em->start)) >> - inode->i_blkbits; - bh_result->b_size = len; - bh_result->b_bdev = fs_info->fs_devices->latest_bdev; - set_buffer_mapped(bh_result); - - return 0; -} - static int btrfs_get_blocks_direct_write(struct extent_map **map, - struct buffer_head *bh_result, struct inode *inode, struct btrfs_dio_data *dio_data, u64 start, u64 len) @@ -7333,7 +7284,6 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, } /* this will cow the extent */ - len = bh_result->b_size; free_extent_map(em); *map = em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); if (IS_ERR(em)) { @@ -7344,64 +7294,88 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, len = min(len, em->len - (start - em->start)); skip_cow: - bh_result->b_blocknr = (em->block_start + (start - em->start)) >> - inode->i_blkbits; - bh_result->b_size = len; - bh_result->b_bdev = fs_info->fs_devices->latest_bdev; - set_buffer_mapped(bh_result); - - if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - set_buffer_new(bh_result); - /* * Need to update the i_size under the extent lock so buffered * readers will get the updated i_size when we unlock. 
*/ - if (!dio_data->overwrite && start + len > i_size_read(inode)) + if (start + len > i_size_read(inode)) i_size_write(inode, start + len); - WARN_ON(dio_data->reserve < len); dio_data->reserve -= len; - dio_data->unsubmitted_oe_range_end = start + len; - current->journal_info = dio_data; out: return ret; } -static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, + loff_t length, unsigned int flags, struct iomap *iomap, + struct iomap *srcmap) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em; struct extent_state *cached_state = NULL; struct btrfs_dio_data *dio_data = NULL; - u64 start = iblock << inode->i_blkbits; u64 lockstart, lockend; - u64 len = bh_result->b_size; + const bool write = !!(flags & IOMAP_WRITE); int ret = 0; + u64 len = length; + bool unlock_extents = false; + bool sync = (current->journal_info == BTRFS_DIO_SYNC_STUB); + + /* + * We used current->journal_info here to see if we were sync, but + * there's a lot of tests in the enospc machinery to not do flushing if + * we have a journal_info set, so we need to clear this out and re-set + * it in iomap_end. + */ + ASSERT(current->journal_info == NULL || + current->journal_info == BTRFS_DIO_SYNC_STUB); + current->journal_info = NULL; - if (!create) + if (!write) len = min_t(u64, len, fs_info->sectorsize); lockstart = start; lockend = start + len - 1; - if (current->journal_info) { - /* - * Need to pull our outstanding extents and set journal_info to NULL so - * that anything that needs to check if there's a transaction doesn't get - * confused. - */ - dio_data = current->journal_info; - current->journal_info = NULL; + /* + * The generic stuff only does filemap_write_and_wait_range, which + * isn't enough if we've written compressed pages to this area, so we + * need to flush the dirty pages again to make absolutely sure that any + * outstanding dirty pages are on disk. + */ + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) { + ret = filemap_fdatawrite_range(inode->i_mapping, start, + start + length - 1); + if (ret) + return ret; + } + + dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS); + if (!dio_data) + return -ENOMEM; + + dio_data->sync = sync; + dio_data->length = length; + if (write) { + dio_data->reserve = round_up(length, fs_info->sectorsize); + ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), + &dio_data->data_reserved, + start, dio_data->reserve); + if (ret) { + extent_changeset_free(dio_data->data_reserved); + kfree(dio_data); + return ret; + } } + iomap->private = dio_data; + /* * If this errors out it's because we couldn't invalidate pagecache for * this range and we need to fallback to buffered. 
*/ - if (lock_extent_direct(inode, lockstart, lockend, &cached_state, - create)) { + if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) { ret = -ENOTBLK; goto err; } @@ -7433,35 +7407,47 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, goto unlock_err; } - if (create) { - ret = btrfs_get_blocks_direct_write(&em, bh_result, inode, - dio_data, start, len); + len = min(len, em->len - (start - em->start)); + if (write) { + ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, + start, len); if (ret < 0) goto unlock_err; - - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, - lockend, &cached_state); + unlock_extents = true; + /* Recalc len in case the new em is smaller than requested */ + len = min(len, em->len - (start - em->start)); } else { - ret = btrfs_get_blocks_direct_read(em, bh_result, inode, - start, len); - /* Can be negative only if we read from a hole */ - if (ret < 0) { - ret = 0; - free_extent_map(em); - goto unlock_err; - } /* * We need to unlock only the end area that we aren't using. * The rest is going to be unlocked by the endio routine. */ - lockstart = start + bh_result->b_size; - if (lockstart < lockend) { - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - lockstart, lockend, &cached_state); - } else { - free_extent_state(cached_state); - } + lockstart = start + len; + if (lockstart < lockend) + unlock_extents = true; + } + + if (unlock_extents) + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + lockstart, lockend, &cached_state); + else + free_extent_state(cached_state); + + /* + * Translate extent map information to iomap. + * We trim the extents (and move the addr) even though iomap code does + * that, since we have locked only the parts we are performing I/O in. + */ + if ((em->block_start == EXTENT_MAP_HOLE) || + (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) { + iomap->addr = IOMAP_NULL_ADDR; + iomap->type = IOMAP_HOLE; + } else { + iomap->addr = em->block_start + (start - em->start); + iomap->type = IOMAP_MAPPED; } + iomap->offset = start; + iomap->bdev = fs_info->fs_devices->latest_bdev; + iomap->length = len; free_extent_map(em); @@ -7471,8 +7457,63 @@ unlock_err: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); err: - if (dio_data) - current->journal_info = dio_data; + if (dio_data) { + btrfs_delalloc_release_space(BTRFS_I(inode), + dio_data->data_reserved, start, + dio_data->reserve, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve); + extent_changeset_free(dio_data->data_reserved); + kfree(dio_data); + } + return ret; +} + +static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, + ssize_t written, unsigned int flags, struct iomap *iomap) +{ + int ret = 0; + struct btrfs_dio_data *dio_data = iomap->private; + size_t submitted = dio_data->submitted; + const bool write = !!(flags & IOMAP_WRITE); + + if (!write && (iomap->type == IOMAP_HOLE)) { + /* If reading from a hole, unlock and return */ + unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1); + goto out; + } + + if (submitted < length) { + pos += submitted; + length -= submitted; + if (write) + __endio_write_update_ordered(BTRFS_I(inode), pos, + length, false); + else + unlock_extent(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1); + ret = -ENOTBLK; + } + + if (write) { + if (dio_data->reserve) + btrfs_delalloc_release_space(BTRFS_I(inode), + dio_data->data_reserved, pos, + dio_data->reserve, true); + 
btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length); + extent_changeset_free(dio_data->data_reserved); + } +out: + /* + * We're all done, we can re-set the current->journal_info now safely + * for our endio. + */ + if (dio_data->sync) { + ASSERT(current->journal_info == NULL); + current->journal_info = BTRFS_DIO_SYNC_STUB; + } + kfree(dio_data); + iomap->private = NULL; + return ret; } @@ -7496,7 +7537,7 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) dip->logical_offset + dip->bytes - 1); } - dio_end_io(dip->dio_bio); + bio_endio(dip->dio_bio); kfree(dip); } @@ -7730,24 +7771,11 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; dip->dio_bio = dio_bio; refcount_set(&dip->refs, 1); - - if (write) { - struct btrfs_dio_data *dio_data = current->journal_info; - - /* - * Setting range start and end to the same value means that - * no cleanup will happen in btrfs_direct_IO - */ - dio_data->unsubmitted_oe_range_end = dip->logical_offset + - dip->bytes; - dio_data->unsubmitted_oe_range_start = - dio_data->unsubmitted_oe_range_end; - } return dip; } -static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, - loff_t file_offset) +static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, + struct bio *dio_bio, loff_t file_offset) { const bool write = (bio_op(dio_bio) == REQ_OP_WRITE); const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); @@ -7764,6 +7792,7 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, int ret; blk_status_t status; struct btrfs_io_geometry geom; + struct btrfs_dio_data *dio_data = iomap->private; dip = btrfs_create_dio_private(dio_bio, inode, file_offset); if (!dip) { @@ -7772,8 +7801,8 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, file_offset + dio_bio->bi_iter.bi_size - 1); } dio_bio->bi_status = BLK_STS_RESOURCE; - dio_end_io(dio_bio); - return; + bio_endio(dio_bio); + return BLK_QC_T_NONE; } if (!write && csum) { @@ -7844,15 +7873,17 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, goto out_err; } + dio_data->submitted += clone_len; clone_offset += clone_len; start_sector += clone_len >> 9; file_offset += clone_len; } while (submit_len > 0); - return; + return BLK_QC_T_NONE; out_err: dip->dio_bio->bi_status = status; btrfs_dio_private_put(dip); + return BLK_QC_T_NONE; } static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, @@ -7888,37 +7919,59 @@ out: return retval; } -static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +static inline int btrfs_maybe_fsync_end_io(struct kiocb *iocb, ssize_t size, + int error, unsigned flags) +{ + /* + * Now if we're still in the context of our submitter we know we can't + * safely run generic_write_sync(), so clear our flag here so that the + * caller knows to follow up with a sync. 
+ */ + if (current->journal_info == BTRFS_DIO_SYNC_STUB) { + current->journal_info = NULL; + return error; + } + + if (error) + return error; + + if (size) { + iocb->ki_flags |= IOCB_DSYNC; + return generic_write_sync(iocb, size); + } + + return 0; +} + +static const struct iomap_ops btrfs_dio_iomap_ops = { + .iomap_begin = btrfs_dio_iomap_begin, + .iomap_end = btrfs_dio_iomap_end, +}; + +static const struct iomap_dio_ops btrfs_dio_ops = { + .submit_io = btrfs_submit_direct, +}; + +static const struct iomap_dio_ops btrfs_sync_dops = { + .submit_io = btrfs_submit_direct, + .end_io = btrfs_maybe_fsync_end_io, +}; + +ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_dio_data dio_data = { 0 }; struct extent_changeset *data_reserved = NULL; loff_t offset = iocb->ki_pos; size_t count = 0; - int flags = 0; - bool wakeup = true; bool relock = false; ssize_t ret; if (check_direct_IO(fs_info, iter, offset)) return 0; - inode_dio_begin(inode); - - /* - * The generic stuff only does filemap_write_and_wait_range, which - * isn't enough if we've written compressed pages to this area, so - * we need to flush the dirty pages again to make absolutely sure - * that any outstanding dirty pages are on disk. - */ count = iov_iter_count(iter); - if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - filemap_fdatawrite_range(inode->i_mapping, offset, - offset + count - 1); - if (iov_iter_rw(iter) == WRITE) { /* * If the write DIO is beyond the EOF, we need update @@ -7926,66 +7979,29 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) * not unlock the i_mutex at this case. */ if (offset + count <= inode->i_size) { - dio_data.overwrite = 1; inode_unlock(inode); relock = true; } - ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, - offset, count); - if (ret) - goto out; - - /* - * We need to know how many extents we reserved so that we can - * do the accounting properly if we go over the number we - * originally calculated. Abuse current->journal_info for this. - */ - dio_data.reserve = round_up(count, - fs_info->sectorsize); - dio_data.unsubmitted_oe_range_start = (u64)offset; - dio_data.unsubmitted_oe_range_end = (u64)offset; - current->journal_info = &dio_data; down_read(&BTRFS_I(inode)->dio_sem); - } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, - &BTRFS_I(inode)->runtime_flags)) { - inode_dio_end(inode); - flags = DIO_LOCKING | DIO_SKIP_HOLES; - wakeup = false; } - ret = __blockdev_direct_IO(iocb, inode, - fs_info->fs_devices->latest_bdev, - iter, btrfs_get_blocks_direct, NULL, - btrfs_submit_direct, flags); - if (iov_iter_rw(iter) == WRITE) { + /* + * We have are actually a sync iocb, so we need our fancy endio to know + * if we need to sync. 
+ */ + if (current->journal_info) + ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, + &btrfs_sync_dops, is_sync_kiocb(iocb)); + else + ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, + &btrfs_dio_ops, is_sync_kiocb(iocb)); + + if (ret == -ENOTBLK) + ret = 0; + + if (iov_iter_rw(iter) == WRITE) up_read(&BTRFS_I(inode)->dio_sem); - current->journal_info = NULL; - if (ret < 0 && ret != -EIOCBQUEUED) { - if (dio_data.reserve) - btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, offset, dio_data.reserve, - true); - /* - * On error we might have left some ordered extents - * without submitting corresponding bios for them, so - * cleanup them up to avoid other tasks getting them - * and waiting for them to complete forever. - */ - if (dio_data.unsubmitted_oe_range_start < - dio_data.unsubmitted_oe_range_end) - __endio_write_update_ordered(BTRFS_I(inode), - dio_data.unsubmitted_oe_range_start, - dio_data.unsubmitted_oe_range_end - - dio_data.unsubmitted_oe_range_start, - false); - } else if (ret >= 0 && (size_t)ret < count) - btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, - offset, count - (size_t)ret, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), count); - } -out: - if (wakeup) - inode_dio_end(inode); + if (relock) inode_lock(inode); @@ -8002,12 +8018,24 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret) return ret; - return extent_fiemap(inode, fieinfo, start, len); + return extent_fiemap(BTRFS_I(inode), fieinfo, start, len); } int btrfs_readpage(struct file *file, struct page *page) { - return extent_read_full_page(page, btrfs_get_extent, 0); + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + u64 start = page_offset(page); + u64 end = start + PAGE_SIZE - 1; + unsigned long bio_flags = 0; + struct bio *bio = NULL; + int ret; + + btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); + + ret = btrfs_do_readpage(page, NULL, &bio, &bio_flags, 0, NULL); + if (bio) + ret = submit_one_bio(bio, 0, bio_flags); + return ret; } static int btrfs_writepage(struct page *page, struct writeback_control *wbc) @@ -8091,15 +8119,15 @@ static int btrfs_migratepage(struct address_space *mapping, static void btrfs_invalidatepage(struct page *page, unsigned int offset, unsigned int length) { - struct inode *inode = page->mapping->host; - struct extent_io_tree *tree; + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct extent_io_tree *tree = &inode->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; u64 page_start = page_offset(page); u64 page_end = page_start + PAGE_SIZE - 1; u64 start; u64 end; - int inode_evicting = inode->i_state & I_FREEING; + int inode_evicting = inode->vfs_inode.i_state & I_FREEING; /* * we have the page locked, so new writeback can't start, @@ -8110,7 +8138,6 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, */ wait_on_page_writeback(page); - tree = &BTRFS_I(inode)->io_tree; if (offset) { btrfs_releasepage(page, GFP_NOFS); return; @@ -8120,8 +8147,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, lock_extent_bits(tree, page_start, page_end, &cached_state); again: start = page_start; - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, - page_end - start + 1); + ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1); if (ordered) { end = min(page_end, ordered->file_offset + ordered->num_bytes - 1); @@ -8142,7 +8168,7 @@ again: struct btrfs_ordered_inode_tree 
*tree; u64 new_len; - tree = &BTRFS_I(inode)->ordered_tree; + tree = &inode->ordered_tree; spin_lock_irq(&tree->lock); set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); @@ -8181,7 +8207,7 @@ again: * bit of its io_tree, and free the qgroup reserved data space. * Since the IO will never happen for this page. */ - btrfs_qgroup_free_data(BTRFS_I(inode), NULL, page_start, PAGE_SIZE); + btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE); if (!inode_evicting) { clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | @@ -8283,7 +8309,7 @@ again: unlock_extent_cached(io_tree, page_start, page_end, &cached_state); unlock_page(page); - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); goto again; } @@ -8614,21 +8640,21 @@ void btrfs_free_inode(struct inode *inode) kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } -void btrfs_destroy_inode(struct inode *inode) +void btrfs_destroy_inode(struct inode *vfs_inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_extent *ordered; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_inode *inode = BTRFS_I(vfs_inode); + struct btrfs_root *root = inode->root; - WARN_ON(!hlist_empty(&inode->i_dentry)); - WARN_ON(inode->i_data.nrpages); - WARN_ON(BTRFS_I(inode)->block_rsv.reserved); - WARN_ON(BTRFS_I(inode)->block_rsv.size); - WARN_ON(BTRFS_I(inode)->outstanding_extents); - WARN_ON(BTRFS_I(inode)->delalloc_bytes); - WARN_ON(BTRFS_I(inode)->new_delalloc_bytes); - WARN_ON(BTRFS_I(inode)->csum_bytes); - WARN_ON(BTRFS_I(inode)->defrag_bytes); + WARN_ON(!hlist_empty(&vfs_inode->i_dentry)); + WARN_ON(vfs_inode->i_data.nrpages); + WARN_ON(inode->block_rsv.reserved); + WARN_ON(inode->block_rsv.size); + WARN_ON(inode->outstanding_extents); + WARN_ON(inode->delalloc_bytes); + WARN_ON(inode->new_delalloc_bytes); + WARN_ON(inode->csum_bytes); + WARN_ON(inode->defrag_bytes); /* * This can happen where we create an inode, but somebody else also @@ -8643,7 +8669,7 @@ void btrfs_destroy_inode(struct inode *inode) if (!ordered) break; else { - btrfs_err(fs_info, + btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup", ordered->file_offset, ordered->num_bytes); btrfs_remove_ordered_extent(inode, ordered); @@ -8651,11 +8677,11 @@ void btrfs_destroy_inode(struct inode *inode) btrfs_put_ordered_extent(ordered); } } - btrfs_qgroup_check_reserved_leak(BTRFS_I(inode)); + btrfs_qgroup_check_reserved_leak(inode); inode_tree_del(inode); - btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); - btrfs_inode_clear_file_extent_range(BTRFS_I(inode), 0, (u64)-1); - btrfs_put_root(BTRFS_I(inode)->root); + btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); + btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1); + btrfs_put_root(inode->root); } int btrfs_drop_inode(struct inode *inode) @@ -8780,27 +8806,19 @@ static int btrfs_rename_exchange(struct inode *old_dir, struct inode *new_inode = new_dentry->d_inode; struct inode *old_inode = old_dentry->d_inode; struct timespec64 ctime = current_time(old_inode); - struct dentry *parent; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); u64 new_ino = btrfs_ino(BTRFS_I(new_inode)); u64 old_idx = 0; u64 new_idx = 0; int ret; + int ret2; bool root_log_pinned = false; bool dest_log_pinned = false; - struct btrfs_log_ctx ctx_root; - struct btrfs_log_ctx ctx_dest; - bool sync_log_root = false; - bool sync_log_dest = false; - bool commit_transaction = false; /* we only 
allow rename subvolume link between subvolumes */ if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) return -EXDEV; - btrfs_init_log_ctx(&ctx_root, old_inode); - btrfs_init_log_ctx(&ctx_dest, new_inode); - /* close the race window with snapshot create/destroy ioctl */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID || new_ino == BTRFS_FIRST_FREE_OBJECTID) @@ -8942,30 +8960,14 @@ static int btrfs_rename_exchange(struct inode *old_dir, BTRFS_I(new_inode)->dir_index = new_idx; if (root_log_pinned) { - parent = new_dentry->d_parent; - ret = btrfs_log_new_name(trans, BTRFS_I(old_inode), - BTRFS_I(old_dir), parent, - false, &ctx_root); - if (ret == BTRFS_NEED_LOG_SYNC) - sync_log_root = true; - else if (ret == BTRFS_NEED_TRANS_COMMIT) - commit_transaction = true; - ret = 0; + btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), + new_dentry->d_parent); btrfs_end_log_trans(root); root_log_pinned = false; } if (dest_log_pinned) { - if (!commit_transaction) { - parent = old_dentry->d_parent; - ret = btrfs_log_new_name(trans, BTRFS_I(new_inode), - BTRFS_I(new_dir), parent, - false, &ctx_dest); - if (ret == BTRFS_NEED_LOG_SYNC) - sync_log_dest = true; - else if (ret == BTRFS_NEED_TRANS_COMMIT) - commit_transaction = true; - ret = 0; - } + btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir), + old_dentry->d_parent); btrfs_end_log_trans(dest); dest_log_pinned = false; } @@ -8998,46 +9000,13 @@ out_fail: dest_log_pinned = false; } } - if (!ret && sync_log_root && !commit_transaction) { - ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, - &ctx_root); - if (ret) - commit_transaction = true; - } - if (!ret && sync_log_dest && !commit_transaction) { - ret = btrfs_sync_log(trans, BTRFS_I(new_inode)->root, - &ctx_dest); - if (ret) - commit_transaction = true; - } - if (commit_transaction) { - /* - * We may have set commit_transaction when logging the new name - * in the destination root, in which case we left the source - * root context in the list of log contextes. So make sure we - * remove it to avoid invalid memory accesses, since the context - * was allocated in our stack frame. - */ - if (sync_log_root) { - mutex_lock(&root->log_mutex); - list_del_init(&ctx_root.list); - mutex_unlock(&root->log_mutex); - } - ret = btrfs_commit_transaction(trans); - } else { - int ret2; - - ret2 = btrfs_end_transaction(trans); - ret = ret ? ret : ret2; - } + ret2 = btrfs_end_transaction(trans); + ret = ret ? 
ret : ret2; out_notrans: if (new_ino == BTRFS_FIRST_FREE_OBJECTID || old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); - ASSERT(list_empty(&ctx_root.list)); - ASSERT(list_empty(&ctx_dest.list)); - return ret; } @@ -9105,11 +9074,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *old_inode = d_inode(old_dentry); u64 index = 0; int ret; + int ret2; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); bool log_pinned = false; - struct btrfs_log_ctx ctx; - bool sync_log = false; - bool commit_transaction = false; if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -9259,17 +9226,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, BTRFS_I(old_inode)->dir_index = index; if (log_pinned) { - struct dentry *parent = new_dentry->d_parent; - - btrfs_init_log_ctx(&ctx, old_inode); - ret = btrfs_log_new_name(trans, BTRFS_I(old_inode), - BTRFS_I(old_dir), parent, - false, &ctx); - if (ret == BTRFS_NEED_LOG_SYNC) - sync_log = true; - else if (ret == BTRFS_NEED_TRANS_COMMIT) - commit_transaction = true; - ret = 0; + btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), + new_dentry->d_parent); btrfs_end_log_trans(root); log_pinned = false; } @@ -9306,23 +9264,8 @@ out_fail: btrfs_end_log_trans(root); log_pinned = false; } - if (!ret && sync_log) { - ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, &ctx); - if (ret) - commit_transaction = true; - } else if (sync_log) { - mutex_lock(&root->log_mutex); - list_del(&ctx.list); - mutex_unlock(&root->log_mutex); - } - if (commit_transaction) { - ret = btrfs_commit_transaction(trans); - } else { - int ret2; - - ret2 = btrfs_end_transaction(trans); - ret = ret ? ret : ret2; - } + ret2 = btrfs_end_transaction(trans); + ret = ret ? ret : ret2; out_notrans: if (old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); @@ -9388,7 +9331,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. 
*/ -static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot) +static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot) { struct btrfs_inode *binode; struct inode *inode; @@ -9428,9 +9371,11 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot) list_add_tail(&work->list, &works); btrfs_queue_work(root->fs_info->flush_workers, &work->work); - ret++; - if (nr != -1 && ret >= nr) - goto out; + if (*nr != U64_MAX) { + (*nr)--; + if (*nr == 0) + goto out; + } cond_resched(); spin_lock(&root->delalloc_lock); } @@ -9455,18 +9400,15 @@ out: int btrfs_start_delalloc_snapshot(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - int ret; + u64 nr = U64_MAX; if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) return -EROFS; - ret = start_delalloc_inodes(root, -1, true); - if (ret > 0) - ret = 0; - return ret; + return start_delalloc_inodes(root, &nr, true); } -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr) +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr) { struct btrfs_root *root; struct list_head splice; @@ -9489,15 +9431,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr) &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); - ret = start_delalloc_inodes(root, nr, false); + ret = start_delalloc_inodes(root, &nr, false); btrfs_put_root(root); if (ret < 0) goto out; - - if (nr != -1) { - nr -= ret; - WARN_ON(nr < 0); - } spin_lock(&fs_info->delalloc_root_lock); } spin_unlock(&fs_info->delalloc_root_lock); @@ -9568,7 +9505,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) @@ -9633,11 +9569,15 @@ out_unlock: return err; } -static int insert_prealloc_file_extent(struct btrfs_trans_handle *trans, +static struct btrfs_trans_handle *insert_prealloc_file_extent( + struct btrfs_trans_handle *trans_in, struct inode *inode, struct btrfs_key *ins, u64 file_offset) { struct btrfs_file_extent_item stack_fi; + struct btrfs_replace_extent_info extent_info; + struct btrfs_trans_handle *trans = trans_in; + struct btrfs_path *path; u64 start = ins->objectid; u64 len = ins->offset; int ret; @@ -9654,10 +9594,40 @@ static int insert_prealloc_file_extent(struct btrfs_trans_handle *trans, ret = btrfs_qgroup_release_data(BTRFS_I(inode), file_offset, len); if (ret < 0) - return ret; - return insert_reserved_file_extent(trans, BTRFS_I(inode), file_offset, - &stack_fi, ret); + return ERR_PTR(ret); + + if (trans) { + ret = insert_reserved_file_extent(trans, BTRFS_I(inode), + file_offset, &stack_fi, ret); + if (ret) + return ERR_PTR(ret); + return trans; + } + + extent_info.disk_offset = start; + extent_info.disk_len = len; + extent_info.data_offset = 0; + extent_info.data_len = len; + extent_info.file_offset = file_offset; + extent_info.extent_buf = (char *)&stack_fi; + extent_info.is_new_extent = true; + extent_info.qgroup_reserved = ret; + extent_info.insertions = 0; + + path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); + + ret = btrfs_replace_file_extents(inode, path, file_offset, + file_offset + len - 1, &extent_info, + &trans); + btrfs_free_path(path); + if (ret) + return ERR_PTR(ret); + + return trans; } + static int 
__btrfs_prealloc_file_range(struct inode *inode, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint, @@ -9680,14 +9650,6 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, if (trans) own_trans = false; while (num_bytes > 0) { - if (own_trans) { - trans = btrfs_start_transaction(root, 3); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; - } - } - cur_bytes = min_t(u64, num_bytes, SZ_256M); cur_bytes = max(cur_bytes, min_size); /* @@ -9699,11 +9661,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, cur_bytes = min(cur_bytes, last_alloc); ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes, min_size, 0, *alloc_hint, &ins, 1, 0); - if (ret) { - if (own_trans) - btrfs_end_transaction(trans); + if (ret) break; - } /* * We've reserved this space, and thus converted it from @@ -9716,13 +9675,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, btrfs_dec_block_group_reservations(fs_info, ins.objectid); last_alloc = ins.offset; - ret = insert_prealloc_file_extent(trans, inode, &ins, cur_offset); - if (ret) { + trans = insert_prealloc_file_extent(trans, inode, &ins, cur_offset); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); - btrfs_abort_transaction(trans, ret); - if (own_trans) - btrfs_end_transaction(trans); break; } @@ -9785,8 +9742,10 @@ next: break; } - if (own_trans) + if (own_trans) { btrfs_end_transaction(trans); + trans = NULL; + } } if (clear_offset < end) btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset, @@ -9865,7 +9824,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; ret = btrfs_init_inode_security(trans, inode, dir, NULL); if (ret) @@ -10072,14 +10030,14 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, /* * Balance or device remove/replace/resize can move stuff around from - * under us. The EXCL_OP flag makes sure they aren't running/won't run - * concurrently while we are mapping the swap extents, and - * fs_info->swapfile_pins prevents them from running while the swap file - * is active and moving the extents. Note that this also prevents a - * concurrent device add which isn't actually necessary, but it's not + * under us. The exclop protection makes sure they aren't running/won't + * run concurrently while we are mapping the swap extents, and + * fs_info->swapfile_pins prevents them from running while the swap + * file is active and moving the extents. Note that this also prevents + * a concurrent device add which isn't actually necessary, but it's not * really worth the trouble to allow it. 
*/ - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) { btrfs_warn(fs_info, "cannot activate swapfile while exclusive operation is running"); return -EBUSY; @@ -10225,7 +10183,7 @@ out: if (ret) btrfs_swap_deactivate(file); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); if (ret) return ret; @@ -10283,12 +10241,6 @@ static const struct file_operations btrfs_dir_file_operations = { .fsync = btrfs_sync_file, }; -static const struct extent_io_ops btrfs_extent_io_ops = { - /* mandatory callbacks */ - .submit_bio_hook = btrfs_submit_bio_hook, - .readpage_end_io_hook = btrfs_readpage_end_io_hook, -}; - /* * btrfs doesn't support the bmap operation because swapfiles * use bmap to make a mapping of extents in the file. They assume @@ -10306,7 +10258,7 @@ static const struct address_space_operations btrfs_aops = { .writepage = btrfs_writepage, .writepages = btrfs_writepages, .readahead = btrfs_readahead, - .direct_IO = btrfs_direct_IO, + .direct_IO = noop_direct_IO, .invalidatepage = btrfs_invalidatepage, .releasepage = btrfs_releasepage, #ifdef CONFIG_MIGRATION diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2d9109d9e98f..ab408a23ba32 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -378,6 +378,18 @@ static int check_xflags(unsigned int flags) return 0; } +bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type) +{ + return !cmpxchg(&fs_info->exclusive_operation, BTRFS_EXCLOP_NONE, type); +} + +void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) +{ + WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE); + sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation"); +} + /* * Set the xflags from the internal inode flags. The remaining items of fsxattr * are zeroed. 
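[Editor's note] The two helpers added above (btrfs_exclop_start/btrfs_exclop_finish) replace the old BTRFS_FS_EXCL_OP bit with a single slot claimed via cmpxchg and released with WRITE_ONCE. The following is only a minimal userspace model of that claim/release pattern, not kernel code: the BTRFS_EXCLOP_* names are taken from this patch, but their numeric ordering, the exclop_start()/exclop_finish() helpers and the C11-atomics/stdio scaffolding are invented for the sketch.

/* Userspace sketch of the "single exclusive operation" pattern (assumptions as noted above). */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum btrfs_exclusive_operation {
	BTRFS_EXCLOP_NONE,
	BTRFS_EXCLOP_BALANCE,
	BTRFS_EXCLOP_DEV_ADD,
	BTRFS_EXCLOP_DEV_REMOVE,
	BTRFS_EXCLOP_DEV_REPLACE,
	BTRFS_EXCLOP_RESIZE,
	BTRFS_EXCLOP_SWAP_ACTIVATE,
};

static _Atomic int exclusive_operation = BTRFS_EXCLOP_NONE;

static bool exclop_start(enum btrfs_exclusive_operation type)
{
	int expected = BTRFS_EXCLOP_NONE;

	/* Succeeds only if no other exclusive operation is currently running. */
	return atomic_compare_exchange_strong(&exclusive_operation, &expected, type);
}

static void exclop_finish(void)
{
	/* Plain release; the kernel version additionally notifies sysfs. */
	atomic_store(&exclusive_operation, BTRFS_EXCLOP_NONE);
}

int main(void)
{
	if (!exclop_start(BTRFS_EXCLOP_RESIZE)) {
		fprintf(stderr, "another exclusive operation is running\n");
		return 1;
	}
	/* A second operation is refused while the first one is still active. */
	printf("balance while resizing: %s\n",
	       exclop_start(BTRFS_EXCLOP_BALANCE) ? "started" : "refused");
	exclop_finish();
	return 0;
}

The ioctl hunks that follow show the same call shape: claim the slot, bail out with BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS on failure, and call btrfs_exclop_finish() on the way out.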
@@ -618,7 +630,7 @@ static noinline int create_subvol(struct inode *dir, trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - btrfs_subvolume_release_metadata(fs_info, &block_rsv); + btrfs_subvolume_release_metadata(root, &block_rsv); goto fail_free; } trans->block_rsv = &block_rsv; @@ -628,7 +640,8 @@ static noinline int create_subvol(struct inode *dir, if (ret) goto fail; - leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0); + leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0, + BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); goto fail; @@ -742,7 +755,7 @@ fail: kfree(root_item); trans->block_rsv = NULL; trans->bytes_reserved = 0; - btrfs_subvolume_release_metadata(fs_info, &block_rsv); + btrfs_subvolume_release_metadata(root, &block_rsv); err = btrfs_commit_transaction(trans); if (err && !ret) @@ -856,7 +869,7 @@ fail: if (ret && pending_snapshot->snap) pending_snapshot->snap->anon_dev = 0; btrfs_put_root(pending_snapshot->snap); - btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv); + btrfs_subvolume_release_metadata(root, &pending_snapshot->block_rsv); free_pending: if (pending_snapshot->anon_dev) free_anon_bdev(pending_snapshot->anon_dev); @@ -1306,7 +1319,7 @@ again: break; unlock_page(page); - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); lock_page(page); /* @@ -1638,7 +1651,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (ret) return ret; - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_RESIZE)) { mnt_drop_write_file(file); return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } @@ -1752,7 +1765,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, out_free: kfree(vol_args); out: - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); mnt_drop_write_file(file); return ret; } @@ -3126,7 +3139,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; vol_args = memdup_user(arg, sizeof(*vol_args)); @@ -3143,7 +3156,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) kfree(vol_args); out: - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); return ret; } @@ -3172,7 +3185,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) goto out; } - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; goto out; } @@ -3183,7 +3196,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; ret = btrfs_rm_device(fs_info, vol_args->name, 0); } - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); if (!ret) { if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) @@ -3214,7 +3227,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) if (ret) return ret; - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; goto out_drop_write; } @@ -3232,7 +3245,7 @@ static long 
btrfs_ioctl_rm_dev(struct file *file, void __user *arg) btrfs_info(fs_info, "disk deleted %s", vol_args->name); kfree(vol_args); out: - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); out_drop_write: mnt_drop_write_file(file); @@ -3462,15 +3475,12 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *tmp; info = NULL; - rcu_read_lock(); - list_for_each_entry_rcu(tmp, &fs_info->space_info, - list) { + list_for_each_entry(tmp, &fs_info->space_info, list) { if (tmp->flags == types[i]) { info = tmp; break; } } - rcu_read_unlock(); if (!info) continue; @@ -3518,15 +3528,12 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, break; info = NULL; - rcu_read_lock(); - list_for_each_entry_rcu(tmp, &fs_info->space_info, - list) { + list_for_each_entry(tmp, &fs_info->space_info, list) { if (tmp->flags == types[i]) { info = tmp; break; } } - rcu_read_unlock(); if (!info) continue; @@ -3736,11 +3743,11 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info, ret = -EROFS; goto out; } - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } else { ret = btrfs_dev_replace_by_ioctl(fs_info, p); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); } break; case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS: @@ -3951,7 +3958,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) return ret; again: - if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { mutex_lock(&fs_info->balance_mutex); need_unlock = true; goto locked; @@ -3997,7 +4004,6 @@ again: } locked: - BUG_ON(!test_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); if (arg) { bargs = memdup_user(arg, sizeof(*bargs)); @@ -4052,10 +4058,10 @@ locked: do_balance: /* - * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP goes to - * btrfs_balance. bctl is freed in reset_balance_state, or, if - * restriper was paused all the way until unmount, in free_fs_info. - * The flag should be cleared after reset_balance_state. + * Ownership of bctl and exclusive operation goes to btrfs_balance. + * bctl is freed in reset_balance_state, or, if restriper was paused + * all the way until unmount, in free_fs_info. The flag should be + * cleared after reset_balance_state. */ need_unlock = false; @@ -4074,7 +4080,7 @@ out_bargs: out_unlock: mutex_unlock(&fs_info->balance_mutex); if (need_unlock) - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); out: mnt_drop_write_file(file); return ret; @@ -4897,7 +4903,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SYNC: { int ret; - ret = btrfs_start_delalloc_roots(fs_info, -1); + ret = btrfs_start_delalloc_roots(fs_info, U64_MAX); if (ret) return ret; ret = btrfs_sync_fs(inode->i_sb, 1); diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index f75612e18a82..66e02ebdd340 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -57,8 +57,8 @@ * performance reasons. * * - * Lock nesting - * ------------ + * Lock recursion + * -------------- * * A write operation on a tree might indirectly start a look up on the same * tree. This can happen when btrfs_cow_block locks the tree and needs to @@ -201,7 +201,7 @@ void btrfs_set_lock_blocking_read(struct extent_buffer *eb) * lock, but it won't change to or away from us. 
If we have the write * lock, we are the owner and it'll never change. */ - if (eb->lock_nested && current->pid == eb->lock_owner) + if (eb->lock_recursed && current->pid == eb->lock_owner) return; btrfs_assert_tree_read_locked(eb); atomic_inc(&eb->blocking_readers); @@ -225,7 +225,7 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb) * lock, but it won't change to or away from us. If we have the write * lock, we are the owner and it'll never change. */ - if (eb->lock_nested && current->pid == eb->lock_owner) + if (eb->lock_recursed && current->pid == eb->lock_owner) return; if (eb->blocking_writers == 0) { btrfs_assert_spinning_writers_put(eb); @@ -244,7 +244,8 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb) * * The rwlock is held upon exit. */ -void btrfs_tree_read_lock(struct extent_buffer *eb) +void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest, + bool recurse) { u64 start_ns = 0; @@ -263,8 +264,9 @@ again: * depends on this as it may be called on a partly * (write-)locked tree. */ - BUG_ON(eb->lock_nested); - eb->lock_nested = true; + WARN_ON(!recurse); + BUG_ON(eb->lock_recursed); + eb->lock_recursed = true; read_unlock(&eb->lock); trace_btrfs_tree_read_lock(eb, start_ns); return; @@ -279,6 +281,11 @@ again: trace_btrfs_tree_read_lock(eb, start_ns); } +void btrfs_tree_read_lock(struct extent_buffer *eb) +{ + __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, false); +} + /* * Lock extent buffer for read, optimistically expecting that there are no * contending blocking writers. If there are, don't wait. @@ -362,11 +369,11 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb) /* * if we're nested, we have the write lock. No new locking * is needed as long as we are the lock owner. - * The write unlock will do a barrier for us, and the lock_nested + * The write unlock will do a barrier for us, and the lock_recursed * field only matters to the lock owner. */ - if (eb->lock_nested && current->pid == eb->lock_owner) { - eb->lock_nested = false; + if (eb->lock_recursed && current->pid == eb->lock_owner) { + eb->lock_recursed = false; return; } btrfs_assert_tree_read_locked(eb); @@ -388,11 +395,11 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) /* * if we're nested, we have the write lock. No new locking * is needed as long as we are the lock owner. - * The write unlock will do a barrier for us, and the lock_nested + * The write unlock will do a barrier for us, and the lock_recursed * field only matters to the lock owner. */ - if (eb->lock_nested && current->pid == eb->lock_owner) { - eb->lock_nested = false; + if (eb->lock_recursed && current->pid == eb->lock_owner) { + eb->lock_recursed = false; return; } btrfs_assert_tree_read_locked(eb); @@ -409,7 +416,7 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) * * The rwlock is held for write upon exit. */ -void btrfs_tree_lock(struct extent_buffer *eb) +void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest) __acquires(&eb->lock) { u64 start_ns = 0; @@ -434,6 +441,11 @@ again: trace_btrfs_tree_lock(eb, start_ns); } +void btrfs_tree_lock(struct extent_buffer *eb) +{ + __btrfs_tree_lock(eb, BTRFS_NESTING_NORMAL); +} + /* * Release the write lock, either blocking or spinning (ie. there's no need * for an explicit blocking unlock, like btrfs_tree_read_unlock_blocking). 
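[Editor's note] The locking.c hunks above keep btrfs_tree_lock()/btrfs_tree_read_lock() as thin wrappers that forward BTRFS_NESTING_NORMAL to the new __btrfs_tree_lock()/__btrfs_tree_read_lock() entry points, so existing callers are untouched while new callers can name an explicit lockdep subclass. The sketch below only illustrates that wrapper pattern and is not kernel code: pthread_rwlock_t stands in for the extent buffer lock, the NESTING_* names are simplified stand-ins for the BTRFS_NESTING_* classes, and the class is merely recorded, since lockdep subclasses have no userspace equivalent.

/* Userspace sketch of "extended entry point + compatibility wrapper" (assumptions as noted above). */
#include <pthread.h>
#include <stdio.h>

enum lock_nesting {
	NESTING_NORMAL,
	NESTING_COW,
	NESTING_LEFT,
	NESTING_RIGHT,
};

struct buffer_lock {
	pthread_rwlock_t lock;
	enum lock_nesting last_class;	/* debugging aid only */
};

static void __buffer_write_lock(struct buffer_lock *b, enum lock_nesting nest)
{
	pthread_rwlock_wrlock(&b->lock);
	b->last_class = nest;
}

/* Legacy-style entry point: same behaviour, default nesting class. */
static void buffer_write_lock(struct buffer_lock *b)
{
	__buffer_write_lock(b, NESTING_NORMAL);
}

static void buffer_write_unlock(struct buffer_lock *b)
{
	pthread_rwlock_unlock(&b->lock);
}

int main(void)
{
	struct buffer_lock b = { .lock = PTHREAD_RWLOCK_INITIALIZER };

	buffer_write_lock(&b);			/* old callers: implicit NORMAL class */
	buffer_write_unlock(&b);

	__buffer_write_lock(&b, NESTING_COW);	/* new callers pick a class explicitly */
	printf("locked with class %d\n", b.last_class);
	buffer_write_unlock(&b);
	return 0;
}

The locking.h hunk later in this series defines the real class list (BTRFS_NESTING_NORMAL, _COW, _LEFT, _RIGHT, the *_COW variants, _SPLIT and _NEW_ROOT) and static_asserts that it stays within MAX_LOCKDEP_SUBCLASSES.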
@@ -552,13 +564,14 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) * * Return: root extent buffer with read lock held */ -struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) +struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root, + bool recurse) { struct extent_buffer *eb; while (1) { eb = btrfs_root_node(root); - btrfs_tree_read_lock(eb); + __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, recurse); if (eb == root->node) break; btrfs_tree_read_unlock(eb); diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index d715846c10b8..3ea81ed3320b 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -16,11 +16,81 @@ #define BTRFS_WRITE_LOCK_BLOCKING 3 #define BTRFS_READ_LOCK_BLOCKING 4 +/* + * We are limited in number of subclasses by MAX_LOCKDEP_SUBCLASSES, which at + * the time of this patch is 8, which is how many we use. Keep this in mind if + * you decide you want to add another subclass. + */ +enum btrfs_lock_nesting { + BTRFS_NESTING_NORMAL, + + /* + * When we COW a block we are holding the lock on the original block, + * and since our lockdep maps are rootid+level, this confuses lockdep + * when we lock the newly allocated COW'd block. Handle this by having + * a subclass for COW'ed blocks so that lockdep doesn't complain. + */ + BTRFS_NESTING_COW, + + /* + * Oftentimes we need to lock adjacent nodes on the same level while + * still holding the lock on the original node we searched to, such as + * for searching forward or for split/balance. + * + * Because of this we need to indicate to lockdep that this is + * acceptable by having a different subclass for each of these + * operations. + */ + BTRFS_NESTING_LEFT, + BTRFS_NESTING_RIGHT, + + /* + * When splitting we will be holding a lock on the left/right node when + * we need to cow that node, thus we need a new set of subclasses for + * these two operations. + */ + BTRFS_NESTING_LEFT_COW, + BTRFS_NESTING_RIGHT_COW, + + /* + * When splitting we may push nodes to the left or right, but still use + * the subsequent nodes in our path, keeping our locks on those adjacent + * blocks. Thus when we go to allocate a new split block we've already + * used up all of our available subclasses, so this subclass exists to + * handle this case where we need to allocate a new split block. + */ + BTRFS_NESTING_SPLIT, + + /* + * When promoting a new block to a root we need to have a special + * subclass so we don't confuse lockdep, as it will appear that we are + * locking a higher level node before a lower level one. Copying also + * has this problem as it appears we're locking the same block again + * when we make a snapshot of an existing root. + */ + BTRFS_NESTING_NEW_ROOT, + + /* + * We are limited to MAX_LOCKDEP_SUBLCLASSES number of subclasses, so + * add this in here and add a static_assert to keep us from going over + * the limit. As of this writing we're limited to 8, and we're + * definitely using 8, hence this check to keep us from messing up in + * the future. 
+ */ + BTRFS_NESTING_MAX, +}; + +static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES, + "too many lock subclasses defined"); + struct btrfs_path; +void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest); void btrfs_tree_lock(struct extent_buffer *eb); void btrfs_tree_unlock(struct extent_buffer *eb); +void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest, + bool recurse); void btrfs_tree_read_lock(struct extent_buffer *eb); void btrfs_tree_read_unlock(struct extent_buffer *eb); void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb); @@ -29,6 +99,14 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb); int btrfs_try_tree_read_lock(struct extent_buffer *eb); int btrfs_try_tree_write_lock(struct extent_buffer *eb); int btrfs_tree_read_lock_atomic(struct extent_buffer *eb); +struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); +struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root, + bool recurse); + +static inline struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) +{ + return __btrfs_read_lock_root_node(root, false); +} #ifdef CONFIG_BTRFS_DEBUG static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index ebac13389e7e..87bac9ecdf4c 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -212,11 +212,12 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset refcount_set(&entry->refs, 1); init_waitqueue_head(&entry->wait); INIT_LIST_HEAD(&entry->list); + INIT_LIST_HEAD(&entry->log_list); INIT_LIST_HEAD(&entry->root_extent_list); INIT_LIST_HEAD(&entry->work_list); init_completion(&entry->completion); - trace_btrfs_ordered_extent_add(&inode->vfs_inode, entry); + trace_btrfs_ordered_extent_add(inode, entry); spin_lock_irq(&tree->lock); node = tree_insert(&tree->tree, file_offset, @@ -377,17 +378,16 @@ out: * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used * to make sure this function only returns 1 once for a given ordered extent. */ -int btrfs_dec_test_ordered_pending(struct inode *inode, +int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size, int uptodate) { - struct btrfs_ordered_inode_tree *tree; + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; unsigned long flags; int ret; - tree = &BTRFS_I(inode)->ordered_tree; spin_lock_irqsave(&tree->lock, flags); if (cached && *cached) { entry = *cached; @@ -408,7 +408,7 @@ have_entry: } if (io_size > entry->bytes_left) { - btrfs_crit(BTRFS_I(inode)->root->fs_info, + btrfs_crit(inode->root->fs_info, "bad ordered accounting left %llu size %llu", entry->bytes_left, io_size); } @@ -441,10 +441,11 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) struct list_head *cur; struct btrfs_ordered_sum *sum; - trace_btrfs_ordered_extent_put(entry->inode, entry); + trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry); if (refcount_dec_and_test(&entry->refs)) { ASSERT(list_empty(&entry->root_extent_list)); + ASSERT(list_empty(&entry->log_list)); ASSERT(RB_EMPTY_NODE(&entry->rb_node)); if (entry->inode) btrfs_add_delayed_iput(entry->inode); @@ -462,14 +463,14 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) * remove an ordered extent from the tree. 
No references are dropped * and waiters are woken up. */ -void btrfs_remove_ordered_extent(struct inode *inode, +void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_inode_tree *tree; - struct btrfs_inode *btrfs_inode = BTRFS_I(inode); struct btrfs_root *root = btrfs_inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *node; + bool pending; /* This is paired with btrfs_add_ordered_extent. */ spin_lock(&btrfs_inode->lock); @@ -491,13 +492,41 @@ void btrfs_remove_ordered_extent(struct inode *inode, if (tree->last == node) tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags); spin_unlock_irq(&tree->lock); + /* + * The current running transaction is waiting on us, we need to let it + * know that we're complete and wake it up. + */ + if (pending) { + struct btrfs_transaction *trans; + + /* + * The checks for trans are just a formality, it should be set, + * but if it isn't we don't want to deref/assert under the spin + * lock, so be nice and check if trans is set, but ASSERT() so + * if it isn't set a developer will notice. + */ + spin_lock(&fs_info->trans_lock); + trans = fs_info->running_transaction; + if (trans) + refcount_inc(&trans->use_count); + spin_unlock(&fs_info->trans_lock); + + ASSERT(trans); + if (trans) { + if (atomic_dec_and_test(&trans->pending_ordered)) + wake_up(&trans->pending_wait); + btrfs_put_transaction(trans); + } + } + spin_lock(&root->ordered_extent_lock); list_del_init(&entry->root_extent_list); root->nr_ordered_extents--; - trace_btrfs_ordered_extent_remove(inode, entry); + trace_btrfs_ordered_extent_remove(btrfs_inode, entry); if (!root->nr_ordered_extents) { spin_lock(&fs_info->ordered_root_lock); @@ -514,7 +543,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) struct btrfs_ordered_extent *ordered; ordered = container_of(work, struct btrfs_ordered_extent, flush_work); - btrfs_start_ordered_extent(ordered->inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); complete(&ordered->completion); } @@ -620,12 +649,11 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, * in the extent, and it waits on the io completion code to insert * metadata into the btree corresponding to the extent */ -void btrfs_start_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry, - int wait) +void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) { u64 start = entry->file_offset; u64 end = start + entry->num_bytes - 1; + struct btrfs_inode *inode = BTRFS_I(entry->inode); trace_btrfs_ordered_extent_start(inode, entry); @@ -635,7 +663,7 @@ void btrfs_start_ordered_extent(struct inode *inode, * for the flusher thread to find them */ if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) - filemap_fdatawrite_range(inode->i_mapping, start, end); + filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); if (wait) { wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); @@ -679,7 +707,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) end = orig_end; while (1) { - ordered = btrfs_lookup_first_ordered_extent(inode, end); + ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end); if (!ordered) break; if (ordered->file_offset > orig_end) { @@ -690,7 +718,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, 
u64 len) btrfs_put_ordered_extent(ordered); break; } - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); end = ordered->file_offset; /* * If the ordered extent had an error save the error but don't @@ -775,17 +803,45 @@ out: } /* + * Adds all ordered extents to the given list. The list ends up sorted by the + * file_offset of the ordered extents. + */ +void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, + struct list_head *list) +{ + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; + struct rb_node *n; + + ASSERT(inode_is_locked(&inode->vfs_inode)); + + spin_lock_irq(&tree->lock); + for (n = rb_first(&tree->tree); n; n = rb_next(n)) { + struct btrfs_ordered_extent *ordered; + + ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); + + if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) + continue; + + ASSERT(list_empty(&ordered->log_list)); + list_add_tail(&ordered->log_list, list); + refcount_inc(&ordered->refs); + } + spin_unlock_irq(&tree->lock); +} + +/* * lookup and return any extent before 'file_offset'. NULL is returned * if none is found */ struct btrfs_ordered_extent * -btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) +btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - tree = &BTRFS_I(inode)->ordered_tree; + tree = &inode->ordered_tree; spin_lock_irq(&tree->lock); node = tree_search(tree, file_offset); if (!node) @@ -803,20 +859,21 @@ out: * try to find a checksum. This is used because we allow pages to * be reclaimed before their checksum is actually put into the btree */ -int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, - u8 *sum, int len) +int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset, + u64 disk_bytenr, u8 *sum, int len) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_ordered_sum *ordered_sum; struct btrfs_ordered_extent *ordered; - struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; unsigned long num_sectors; unsigned long i; u32 sectorsize = btrfs_inode_sectorsize(inode); + const u8 blocksize_bits = inode->vfs_inode.i_sb->s_blocksize_bits; const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); int index = 0; - ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), offset); + ordered = btrfs_lookup_ordered_extent(inode, offset); if (!ordered) return 0; @@ -824,10 +881,8 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { if (disk_bytenr >= ordered_sum->bytenr && disk_bytenr < ordered_sum->bytenr + ordered_sum->len) { - i = (disk_bytenr - ordered_sum->bytenr) >> - inode->i_sb->s_blocksize_bits; - num_sectors = ordered_sum->len >> - inode->i_sb->s_blocksize_bits; + i = (disk_bytenr - ordered_sum->bytenr) >> blocksize_bits; + num_sectors = ordered_sum->len >> blocksize_bits; num_sectors = min_t(int, len - index, num_sectors - i); memcpy(sum + index, ordered_sum->sums + i * csum_size, num_sectors * csum_size); @@ -883,7 +938,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, break; } unlock_extent_cached(&inode->io_tree, start, end, cachedp); - btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1); + 
btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); } } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index d61ea9c880a3..c3a2325e64a4 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -56,6 +56,12 @@ enum { BTRFS_ORDERED_TRUNCATED, /* Regular IO for COW */ BTRFS_ORDERED_REGULAR, + /* Used during fsync to track already logged extents */ + BTRFS_ORDERED_LOGGED, + /* We have already logged all the csums of the ordered extent */ + BTRFS_ORDERED_LOGGED_CSUM, + /* We wait for this extent to complete in the current transaction */ + BTRFS_ORDERED_PENDING, }; struct btrfs_ordered_extent { @@ -104,6 +110,9 @@ struct btrfs_ordered_extent { /* list of checksums for insertion when the extent io is done */ struct list_head list; + /* used for fast fsyncs */ + struct list_head log_list; + /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ wait_queue_head_t wait; @@ -142,9 +151,9 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) } void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); -void btrfs_remove_ordered_extent(struct inode *inode, +void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry); -int btrfs_dec_test_ordered_pending(struct inode *inode, +int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size, int uptodate); int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, @@ -165,17 +174,18 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, u64 file_offset); -void btrfs_start_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry, int wait); +void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait); int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * -btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); +btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset); struct btrfs_ordered_extent *btrfs_lookup_ordered_range( struct btrfs_inode *inode, u64 file_offset, u64 len); -int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, - u8 *sum, int len); +void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, + struct list_head *list); +int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset, + u64 disk_bytenr, u8 *sum, int len); u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, const u64 range_start, const u64 range_len); void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 80567c11ec12..7695c4783d33 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -7,6 +7,44 @@ #include "disk-io.h" #include "print-tree.h" +struct root_name_map { + u64 id; + char name[16]; +}; + +static const struct root_name_map root_map[] = { + { BTRFS_ROOT_TREE_OBJECTID, "ROOT_TREE" }, + { BTRFS_EXTENT_TREE_OBJECTID, "EXTENT_TREE" }, + { BTRFS_CHUNK_TREE_OBJECTID, "CHUNK_TREE" }, + { BTRFS_DEV_TREE_OBJECTID, "DEV_TREE" }, + { BTRFS_FS_TREE_OBJECTID, "FS_TREE" }, + { BTRFS_CSUM_TREE_OBJECTID, "CSUM_TREE" }, + { BTRFS_TREE_LOG_OBJECTID, "TREE_LOG" }, + { BTRFS_QUOTA_TREE_OBJECTID, "QUOTA_TREE" }, + { BTRFS_UUID_TREE_OBJECTID, "UUID_TREE" }, + { BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" }, + { 
BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" }, +}; + +const char *btrfs_root_name(u64 objectid, char *buf) +{ + int i; + + if (objectid == BTRFS_TREE_RELOC_OBJECTID) { + snprintf(buf, BTRFS_ROOT_NAME_BUF_LEN, + "TREE_RELOC offset=%llu", objectid); + return buf; + } + + for (i = 0; i < ARRAY_SIZE(root_map); i++) { + if (root_map[i].id == objectid) + return root_map[i].name; + } + + snprintf(buf, BTRFS_ROOT_NAME_BUF_LEN, "%llu", objectid); + return buf; +} + static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk) { int num_stripes = btrfs_chunk_num_stripes(eb, chunk); diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h index e6bb38fd75ad..78b99385a503 100644 --- a/fs/btrfs/print-tree.h +++ b/fs/btrfs/print-tree.h @@ -6,7 +6,11 @@ #ifndef BTRFS_PRINT_TREE_H #define BTRFS_PRINT_TREE_H +/* Buffer size to contain tree name and possibly additional data (offset) */ +#define BTRFS_ROOT_NAME_BUF_LEN 48 + void btrfs_print_leaf(struct extent_buffer *l); void btrfs_print_tree(struct extent_buffer *c, bool follow); +const char *btrfs_root_name(u64 objectid, char *buf); #endif diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index c0f350c3a0cf..580899bdb991 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2315,7 +2315,7 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, * Update qgroup rfer/excl counters. * Rfer update is easy, codes can explain themselves. * - * Excl update is tricky, the update is split into 2 part. + * Excl update is tricky, the update is split into 2 parts. * Part 1: Possible exclusive <-> sharing detect: * | A | !A | * ------------------------------------- diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 243a2e44526e..9d4f5316a7e8 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -767,31 +767,39 @@ static void reada_start_machine_worker(struct btrfs_work *work) kfree(rmw); } -static void __reada_start_machine(struct btrfs_fs_info *fs_info) +/* Try to start up to 10k READA requests for a group of devices */ +static int reada_start_for_fsdevs(struct btrfs_fs_devices *fs_devices) { - struct btrfs_device *device; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; u64 enqueued; u64 total = 0; - int i; + struct btrfs_device *device; -again: do { enqueued = 0; - mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { if (atomic_read(&device->reada_in_flight) < MAX_IN_FLIGHT) enqueued += reada_start_machine_dev(device); } - mutex_unlock(&fs_devices->device_list_mutex); total += enqueued; } while (enqueued && total < 10000); - if (fs_devices->seed) { - fs_devices = fs_devices->seed; - goto again; - } + return total; +} + +static void __reada_start_machine(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; + int i; + u64 enqueued = 0; + + mutex_lock(&fs_devices->device_list_mutex); + + enqueued += reada_start_for_fsdevs(fs_devices); + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) + enqueued += reada_start_for_fsdevs(seed_devs); + + mutex_unlock(&fs_devices->device_list_mutex); if (enqueued == 0) return; diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 5cd02514cf4d..99aa87c08912 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -45,7 +45,7 @@ out: return ret; } -static int copy_inline_to_page(struct inode *inode, +static int copy_inline_to_page(struct btrfs_inode *inode, const u64 file_offset, char *inline_data, const u64 size, @@ -58,6 +58,7 @@ static int 
copy_inline_to_page(struct inode *inode, char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0); struct extent_changeset *data_reserved = NULL; struct page *page = NULL; + struct address_space *mapping = inode->vfs_inode.i_mapping; int ret; ASSERT(IS_ALIGNED(file_offset, block_size)); @@ -68,24 +69,23 @@ static int copy_inline_to_page(struct inode *inode, * reservation here. Also we must not do the reservation while holding * a transaction open, otherwise we would deadlock. */ - ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, - file_offset, block_size); + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset, + block_size); if (ret) goto out; - page = find_or_create_page(inode->i_mapping, file_offset >> PAGE_SHIFT, - btrfs_alloc_write_mask(inode->i_mapping)); + page = find_or_create_page(mapping, file_offset >> PAGE_SHIFT, + btrfs_alloc_write_mask(mapping)); if (!page) { ret = -ENOMEM; goto out_unlock; } set_page_extent_mapped(page); - clear_extent_bit(&BTRFS_I(inode)->io_tree, file_offset, range_end, + clear_extent_bit(&inode->io_tree, file_offset, range_end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, NULL); - ret = btrfs_set_extent_delalloc(BTRFS_I(inode), file_offset, range_end, - 0, NULL); + ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL); if (ret) goto out_unlock; @@ -134,9 +134,9 @@ out_unlock: put_page(page); } if (ret) - btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, - file_offset, block_size, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), block_size); + btrfs_delalloc_release_space(inode, data_reserved, file_offset, + block_size, true); + btrfs_delalloc_release_extents(inode, block_size); out: extent_changeset_free(data_reserved); @@ -167,8 +167,8 @@ static int clone_copy_inline_extent(struct inode *dst, struct btrfs_key key; if (new_key->offset > 0) { - ret = copy_inline_to_page(dst, new_key->offset, inline_data, - size, datal, comp_type); + ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + inline_data, size, datal, comp_type); goto out; } @@ -194,7 +194,7 @@ static int clone_copy_inline_extent(struct inode *dst, * inline extent's data to the page. */ ASSERT(key.offset > 0); - ret = copy_inline_to_page(dst, new_key->offset, + ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, inline_data, size, datal, comp_type); goto out; @@ -213,8 +213,8 @@ static int clone_copy_inline_extent(struct inode *dst, BTRFS_FILE_EXTENT_INLINE) goto copy_inline_extent; - ret = copy_inline_to_page(dst, new_key->offset, inline_data, - size, datal, comp_type); + ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + inline_data, size, datal, comp_type); goto out; } @@ -231,8 +231,8 @@ copy_inline_extent: * clone. Deal with all these cases by copying the inline extent * data into the respective page at the destination inode. 
*/ - ret = copy_inline_to_page(dst, new_key->offset, inline_data, - size, datal, comp_type); + ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + inline_data, size, datal, comp_type); goto out; } @@ -439,7 +439,7 @@ process_slot: if (type == BTRFS_FILE_EXTENT_REG || type == BTRFS_FILE_EXTENT_PREALLOC) { - struct btrfs_clone_extent_info clone_info; + struct btrfs_replace_extent_info clone_info; /* * a | --- range to clone ---| b @@ -462,8 +462,8 @@ process_slot: clone_info.data_len = datal; clone_info.file_offset = new_key.offset; clone_info.extent_buf = buf; - clone_info.item_size = size; - ret = btrfs_punch_hole_range(inode, path, drop_start, + clone_info.is_new_extent = false; + ret = btrfs_replace_file_extents(inode, path, drop_start, new_key.offset + datal - 1, &clone_info, &trans); if (ret) @@ -520,6 +520,8 @@ process_slot: ret = -EINTR; goto out; } + + cond_resched(); } ret = 0; @@ -533,7 +535,7 @@ process_slot: btrfs_release_path(path); path->leave_spinning = 0; - ret = btrfs_punch_hole_range(inode, path, last_dest_end, + ret = btrfs_replace_file_extents(inode, path, last_dest_end, destoff + len - 1, NULL, &trans); if (ret) goto out; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 4ba1ab9cc76d..3602806d71bd 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1206,7 +1206,8 @@ again: } if (cow) { - ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); + ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb, + BTRFS_NESTING_COW); BUG_ON(ret); } btrfs_set_lock_blocking_write(eb); @@ -1274,7 +1275,8 @@ again: btrfs_tree_lock(eb); if (cow) { ret = btrfs_cow_block(trans, dest, eb, parent, - slot, &eb); + slot, &eb, + BTRFS_NESTING_COW); BUG_ON(ret); } btrfs_set_lock_blocking_write(eb); @@ -1781,7 +1783,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, * relocated and the block is tree root. 
*/ leaf = btrfs_lock_root_node(root); - ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf); + ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf, + BTRFS_NESTING_COW); btrfs_tree_unlock(leaf); free_extent_buffer(leaf); if (ret < 0) @@ -2308,7 +2311,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, if (!node->eb) { ret = btrfs_cow_block(trans, root, eb, upper->eb, - slot, &eb); + slot, &eb, BTRFS_NESTING_COW); btrfs_tree_unlock(eb); free_extent_buffer(eb); if (ret < 0) { diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index c89697486366..702dc5441f03 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -512,11 +512,20 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, if (ret && qgroup_num_bytes) btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); + if (!ret) { + spin_lock(&rsv->lock); + rsv->qgroup_rsv_reserved += qgroup_num_bytes; + spin_unlock(&rsv->lock); + } return ret; } -void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, +void btrfs_subvolume_release_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv) { - btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL); + struct btrfs_fs_info *fs_info = root->fs_info; + u64 qgroup_to_release; + + btrfs_block_rsv_release(fs_info, rsv, (u64)-1, &qgroup_to_release); + btrfs_qgroup_convert_reserved_meta(root, qgroup_to_release); } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 354ab9985a34..cf63f1e27a27 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -835,7 +835,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) int success; bool full_stripe_locked; unsigned int nofs_flag; - static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); BUG_ON(sblock_to_check->page_count < 1); @@ -969,14 +969,14 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) spin_lock(&sctx->stat_lock); sctx->stat.read_errors++; spin_unlock(&sctx->stat_lock); - if (__ratelimit(&_rs)) + if (__ratelimit(&rs)) scrub_print_warning("i/o error", sblock_to_check); btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); } else if (sblock_bad->checksum_error) { spin_lock(&sctx->stat_lock); sctx->stat.csum_errors++; spin_unlock(&sctx->stat_lock); - if (__ratelimit(&_rs)) + if (__ratelimit(&rs)) scrub_print_warning("checksum error", sblock_to_check); btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); @@ -984,7 +984,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) spin_lock(&sctx->stat_lock); sctx->stat.verify_errors++; spin_unlock(&sctx->stat_lock); - if (__ratelimit(&_rs)) + if (__ratelimit(&rs)) scrub_print_warning("checksum/header error", sblock_to_check); if (sblock_bad->generation_error) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d9813a5b075a..340c76a12ce1 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -122,8 +122,6 @@ struct send_ctx { struct file_ra_state ra; - char *read_buf; - /* * We process inodes by their increasing order, so if before an * incremental send we reverse the parent/child relationship of @@ -278,11 +276,6 @@ enum btrfs_compare_tree_result { BTRFS_COMPARE_TREE_CHANGED, BTRFS_COMPARE_TREE_SAME, }; -typedef int (*btrfs_changed_cb_t)(struct btrfs_path *left_path, - struct btrfs_path *right_path, - struct btrfs_key *key, - enum btrfs_compare_tree_result result, - void *ctx); __cold static void inconsistent_snapshot_error(struct send_ctx *sctx, @@ 
-584,8 +577,8 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) return -EOVERFLOW; hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size); - hdr->tlv_type = cpu_to_le16(attr); - hdr->tlv_len = cpu_to_le16(len); + put_unaligned_le16(attr, &hdr->tlv_type); + put_unaligned_le16(len, &hdr->tlv_len); memcpy(hdr + 1, data, len); sctx->send_size += total_len; @@ -695,7 +688,7 @@ static int begin_cmd(struct send_ctx *sctx, int cmd) sctx->send_size += sizeof(*hdr); hdr = (struct btrfs_cmd_header *)sctx->send_buf; - hdr->cmd = cpu_to_le16(cmd); + put_unaligned_le16(cmd, &hdr->cmd); return 0; } @@ -707,17 +700,17 @@ static int send_cmd(struct send_ctx *sctx) u32 crc; hdr = (struct btrfs_cmd_header *)sctx->send_buf; - hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr)); - hdr->crc = 0; + put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len); + put_unaligned_le32(0, &hdr->crc); crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); - hdr->crc = cpu_to_le32(crc); + put_unaligned_le32(crc, &hdr->crc); ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, &sctx->send_off); sctx->total_send_size += sctx->send_size; - sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size; + sctx->cmd_send_size[get_unaligned_le16(&hdr->cmd)] += sctx->send_size; sctx->send_size = 0; return ret; @@ -3813,6 +3806,72 @@ static int update_ref_path(struct send_ctx *sctx, struct recorded_ref *ref) } /* + * When processing the new references for an inode we may orphanize an existing + * directory inode because its old name conflicts with one of the new references + * of the current inode. Later, when processing another new reference of our + * inode, we might need to orphanize another inode, but the path we have in the + * reference reflects the pre-orphanization name of the directory we previously + * orphanized. For example: + * + * parent snapshot looks like: + * + * . (ino 256) + * |----- f1 (ino 257) + * |----- f2 (ino 258) + * |----- d1/ (ino 259) + * |----- d2/ (ino 260) + * + * send snapshot looks like: + * + * . (ino 256) + * |----- d1 (ino 258) + * |----- f2/ (ino 259) + * |----- f2_link/ (ino 260) + * | |----- f1 (ino 257) + * | + * |----- d2 (ino 258) + * + * When processing inode 257 we compute the name for inode 259 as "d1", and we + * cache it in the name cache. Later when we start processing inode 258, when + * collecting all its new references we set a full path of "d1/d2" for its new + * reference with name "d2". When we start processing the new references we + * start by processing the new reference with name "d1", and this results in + * orphanizing inode 259, since its old reference causes a conflict. Then we + * move on the next new reference, with name "d2", and we find out we must + * orphanize inode 260, as its old reference conflicts with ours - but for the + * orphanization we use a source path corresponding to the path we stored in the + * new reference, which is "d1/d2" and not "o259-6-0/d2" - this makes the + * receiver fail since the path component "d1/" no longer exists, it was renamed + * to "o259-6-0/" when processing the previous new reference. So in this case we + * must recompute the path in the new reference and use it for the new + * orphanization operation. 
+ */ +static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref) +{ + char *name; + int ret; + + name = kmemdup(ref->name, ref->name_len, GFP_KERNEL); + if (!name) + return -ENOMEM; + + fs_path_reset(ref->full_path); + ret = get_cur_path(sctx, ref->dir, ref->dir_gen, ref->full_path); + if (ret < 0) + goto out; + + ret = fs_path_add(ref->full_path, name, ref->name_len); + if (ret < 0) + goto out; + + /* Update the reference's base name pointer. */ + set_ref_path(ref, ref->full_path); +out: + kfree(name); + return ret; +} + +/* * This does all the move/link/unlink/rmdir magic. */ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) @@ -3880,52 +3939,56 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) goto out; } + /* + * Before doing any rename and link operations, do a first pass on the + * new references to orphanize any unprocessed inodes that may have a + * reference that conflicts with one of the new references of the current + * inode. This needs to happen first because a new reference may conflict + * with the old reference of a parent directory, so we must make sure + * that the path used for link and rename commands don't use an + * orphanized name when an ancestor was not yet orphanized. + * + * Example: + * + * Parent snapshot: + * + * . (ino 256) + * |----- testdir/ (ino 259) + * | |----- a (ino 257) + * | + * |----- b (ino 258) + * + * Send snapshot: + * + * . (ino 256) + * |----- testdir_2/ (ino 259) + * | |----- a (ino 260) + * | + * |----- testdir (ino 257) + * |----- b (ino 257) + * |----- b2 (ino 258) + * + * Processing the new reference for inode 257 with name "b" may happen + * before processing the new reference with name "testdir". If so, we + * must make sure that by the time we send a link command to create the + * hard link "b", inode 259 was already orphanized, since the generated + * path in "valid_path" already contains the orphanized name for 259. + * We are processing inode 257, so only later when processing 259 we do + * the rename operation to change its temporary (orphanized) name to + * "testdir_2". + */ list_for_each_entry(cur, &sctx->new_refs, list) { - /* - * We may have refs where the parent directory does not exist - * yet. This happens if the parent directories inum is higher - * than the current inum. To handle this case, we create the - * parent directory out of order. But we need to check if this - * did already happen before due to other refs in the same dir. - */ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; - if (ret == inode_state_will_create) { - ret = 0; - /* - * First check if any of the current inodes refs did - * already create the dir. - */ - list_for_each_entry(cur2, &sctx->new_refs, list) { - if (cur == cur2) - break; - if (cur2->dir == cur->dir) { - ret = 1; - break; - } - } - - /* - * If that did not happen, check if a previous inode - * did already create the dir. - */ - if (!ret) - ret = did_create_dir(sctx, cur->dir); - if (ret < 0) - goto out; - if (!ret) { - ret = send_create_inode(sctx, cur->dir); - if (ret < 0) - goto out; - } - } + if (ret == inode_state_will_create) + continue; /* - * Check if this new ref would overwrite the first ref of - * another unprocessed inode. If yes, orphanize the - * overwritten inode. If we find an overwritten ref that is - * not the first ref, simply unlink it. + * Check if this new ref would overwrite the first ref of another + * unprocessed inode. If yes, orphanize the overwritten inode. 
+ * If we find an overwritten ref that is not the first ref, + * simply unlink it. */ ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen, cur->name, cur->name_len, @@ -3942,6 +4005,12 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) struct name_cache_entry *nce; struct waiting_dir_move *wdm; + if (orphanized_dir) { + ret = refresh_ref_path(sctx, cur); + if (ret < 0) + goto out; + } + ret = orphanize_inode(sctx, ow_inode, ow_gen, cur->full_path); if (ret < 0) @@ -4004,6 +4073,49 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) } } + } + + list_for_each_entry(cur, &sctx->new_refs, list) { + /* + * We may have refs where the parent directory does not exist + * yet. This happens if the parent directories inum is higher + * than the current inum. To handle this case, we create the + * parent directory out of order. But we need to check if this + * did already happen before due to other refs in the same dir. + */ + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); + if (ret < 0) + goto out; + if (ret == inode_state_will_create) { + ret = 0; + /* + * First check if any of the current inodes refs did + * already create the dir. + */ + list_for_each_entry(cur2, &sctx->new_refs, list) { + if (cur == cur2) + break; + if (cur2->dir == cur->dir) { + ret = 1; + break; + } + } + + /* + * If that did not happen, check if a previous inode + * did already create the dir. + */ + if (!ret) + ret = did_create_dir(sctx, cur->dir); + if (ret < 0) + goto out; + if (!ret) { + ret = send_create_inode(sctx, cur->dir); + if (ret < 0) + goto out; + } + } + if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) { ret = wait_for_dest_dir_move(sctx, cur, is_orphan); if (ret < 0) @@ -4799,7 +4911,25 @@ out: return ret; } -static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) +static inline u64 max_send_read_size(const struct send_ctx *sctx) +{ + return sctx->send_max_size - SZ_16K; +} + +static int put_data_header(struct send_ctx *sctx, u32 len) +{ + struct btrfs_tlv_header *hdr; + + if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len) + return -EOVERFLOW; + hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size); + put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type); + put_unaligned_le16(len, &hdr->tlv_len); + sctx->send_size += sizeof(*hdr); + return 0; +} + +static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) { struct btrfs_root *root = sctx->send_root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -4809,21 +4939,16 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) pgoff_t index = offset >> PAGE_SHIFT; pgoff_t last_index; unsigned pg_offset = offset_in_page(offset); - ssize_t ret = 0; + int ret; + + ret = put_data_header(sctx, len); + if (ret) + return ret; inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); if (IS_ERR(inode)) return PTR_ERR(inode); - if (offset + len > i_size_read(inode)) { - if (offset > i_size_read(inode)) - len = 0; - else - len = offset - i_size_read(inode); - } - if (len == 0) - goto out; - last_index = (offset + len - 1) >> PAGE_SHIFT; /* initial readahead */ @@ -4864,16 +4989,16 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) } addr = kmap(page); - memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len); + memcpy(sctx->send_buf + sctx->send_size, addr + pg_offset, + cur_len); kunmap(page); unlock_page(page); put_page(page); index++; pg_offset = 0; len -= cur_len; - ret += cur_len; + sctx->send_size += 
cur_len; } -out: iput(inode); return ret; } @@ -4887,7 +5012,6 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p; - ssize_t num_read = 0; p = fs_path_alloc(); if (!p) @@ -4895,13 +5019,6 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len); - num_read = fill_read_buf(sctx, offset, len); - if (num_read <= 0) { - if (num_read < 0) - ret = num_read; - goto out; - } - ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); if (ret < 0) goto out; @@ -4912,16 +5029,16 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); - TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read); + ret = put_file_data(sctx, offset, len); + if (ret < 0) + goto out; ret = send_cmd(sctx); tlv_put_failure: out: fs_path_free(p); - if (ret < 0) - return ret; - return num_read; + return ret; } /* @@ -5033,8 +5150,8 @@ out: static int send_hole(struct send_ctx *sctx, u64 end) { struct fs_path *p = NULL; + u64 read_size = max_send_read_size(sctx); u64 offset = sctx->cur_inode_last_extent; - u64 len; int ret = 0; /* @@ -5061,16 +5178,19 @@ static int send_hole(struct send_ctx *sctx, u64 end) ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); if (ret < 0) goto tlv_put_failure; - memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE); while (offset < end) { - len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE); + u64 len = min(end - offset, read_size); ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); if (ret < 0) break; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); - TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len); + ret = put_data_header(sctx, len); + if (ret < 0) + break; + memset(sctx->send_buf + sctx->send_size, 0, len); + sctx->send_size += len; ret = send_cmd(sctx); if (ret < 0) break; @@ -5086,23 +5206,20 @@ static int send_extent_data(struct send_ctx *sctx, const u64 offset, const u64 len) { + u64 read_size = max_send_read_size(sctx); u64 sent = 0; if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) return send_update_extent(sctx, offset, len); while (sent < len) { - u64 size = len - sent; + u64 size = min(len - sent, read_size); int ret; - if (size > BTRFS_SEND_READ_SIZE) - size = BTRFS_SEND_READ_SIZE; ret = send_write(sctx, offset + sent, size); if (ret < 0) return ret; - if (!ret) - break; - sent += ret; + sent += size; } return 0; } @@ -5402,51 +5519,29 @@ static int send_write_or_clone(struct send_ctx *sctx, struct clone_root *clone_root) { int ret = 0; - struct btrfs_file_extent_item *ei; u64 offset = key->offset; - u64 len; - u8 type; + u64 end; u64 bs = sctx->send_root->fs_info->sb->s_blocksize; - ei = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - type = btrfs_file_extent_type(path->nodes[0], ei); - if (type == BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_ram_bytes(path->nodes[0], ei); - /* - * it is possible the inline item won't cover the whole page, - * but there may be items after this page. 
Make - * sure to send the whole thing - */ - len = PAGE_ALIGN(len); - } else { - len = btrfs_file_extent_num_bytes(path->nodes[0], ei); - } - - if (offset >= sctx->cur_inode_size) { - ret = 0; - goto out; - } - if (offset + len > sctx->cur_inode_size) - len = sctx->cur_inode_size - offset; - if (len == 0) { - ret = 0; - goto out; - } + end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size); + if (offset >= end) + return 0; - if (clone_root && IS_ALIGNED(offset + len, bs)) { + if (clone_root && IS_ALIGNED(end, bs)) { + struct btrfs_file_extent_item *ei; u64 disk_byte; u64 data_offset; + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei); data_offset = btrfs_file_extent_offset(path->nodes[0], ei); ret = clone_range(sctx, clone_root, disk_byte, data_offset, - offset, len); + offset, end - offset); } else { - ret = send_extent_data(sctx, offset, len); + ret = send_extent_data(sctx, offset, end - offset); } - sctx->cur_inode_next_write_offset = offset + len; -out: + sctx->cur_inode_next_write_offset = end; return ret; } @@ -6692,8 +6787,7 @@ static int tree_compare_item(struct btrfs_path *left_path, * If it detects a change, it aborts immediately. */ static int btrfs_compare_trees(struct btrfs_root *left_root, - struct btrfs_root *right_root, - btrfs_changed_cb_t changed_cb, void *ctx) + struct btrfs_root *right_root, void *ctx) { struct btrfs_fs_info *fs_info = left_root->fs_info; int ret; @@ -6960,8 +7054,7 @@ static int send_subvol(struct send_ctx *sctx) goto out; if (sctx->parent_root) { - ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, - changed_cb, sctx); + ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, sctx); if (ret < 0) goto out; ret = finish_inode_if_needed(sctx, 1); @@ -7087,7 +7180,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) u32 i; u64 *clone_sources_tmp = NULL; int clone_sources_to_rollback = 0; - unsigned alloc_size; + size_t alloc_size; int sort_clone_roots = 0; if (!capable(CAP_SYS_ADMIN)) @@ -7169,25 +7262,20 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) goto out; } - sctx->read_buf = kvmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL); - if (!sctx->read_buf) { - ret = -ENOMEM; - goto out; - } - sctx->pending_dir_moves = RB_ROOT; sctx->waiting_dir_moves = RB_ROOT; sctx->orphan_dirs = RB_ROOT; - alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1); - - sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL); + sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), + arg->clone_sources_count + 1, + GFP_KERNEL); if (!sctx->clone_roots) { ret = -ENOMEM; goto out; } - alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources); + alloc_size = array_size(sizeof(*arg->clone_sources), + arg->clone_sources_count); if (arg->clone_sources_count) { clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL); @@ -7378,7 +7466,6 @@ out: kvfree(sctx->clone_roots); kvfree(sctx->send_buf); - kvfree(sctx->read_buf); name_cache_free(sctx); diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index ead397f7034f..de91488b7cd0 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -13,7 +13,6 @@ #define BTRFS_SEND_STREAM_VERSION 1 #define BTRFS_SEND_BUF_SIZE SZ_64K -#define BTRFS_SEND_READ_SIZE (48 * SZ_1K) enum btrfs_tlv_type { BTRFS_TLV_U8, diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 475968ccbd1d..64099565ab8f 100644 --- a/fs/btrfs/space-info.c +++ 
b/fs/btrfs/space-info.c @@ -175,10 +175,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) struct list_head *head = &info->space_info; struct btrfs_space_info *found; - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) + list_for_each_entry(found, head, list) found->full = 0; - rcu_read_unlock(); } static int create_space_info(struct btrfs_fs_info *info, u64 flags) @@ -213,7 +211,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) if (ret) return ret; - list_add_rcu(&space_info->list, &info->space_info); + list_add(&space_info->list, &info->space_info); if (flags & BTRFS_BLOCK_GROUP_DATA) info->data_sinfo = space_info; @@ -290,22 +288,13 @@ struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) { - if (found->flags & flags) { - rcu_read_unlock(); + list_for_each_entry(found, head, list) { + if (found->flags & flags) return found; - } } - rcu_read_unlock(); return NULL; } -static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) -{ - return (global->size << 1); -} - static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, enum btrfs_reserve_flush_enum flush) @@ -476,28 +465,6 @@ again: up_read(&info->groups_sem); } -static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, - unsigned long nr_pages, int nr_items) -{ - struct super_block *sb = fs_info->sb; - - if (down_read_trylock(&sb->s_umount)) { - writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); - up_read(&sb->s_umount); - } else { - /* - * We needn't worry the filesystem going from r/w to r/o though - * we don't acquire ->s_umount mutex, because the filesystem - * should guarantee the delalloc inodes list be empty after - * the filesystem is readonly(all dirty pages are written to - * the disk). - */ - btrfs_start_delalloc_roots(fs_info, nr_items); - if (!current->journal_info) - btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1); - } -} - static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, u64 to_reclaim) { @@ -516,25 +483,33 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, /* * shrink metadata reservation for delalloc */ -static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, - u64 orig, bool wait_ordered) +static void shrink_delalloc(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 to_reclaim, bool wait_ordered) { - struct btrfs_space_info *space_info; struct btrfs_trans_handle *trans; u64 delalloc_bytes; u64 dio_bytes; - u64 async_pages; u64 items; long time_left; - unsigned long nr_pages; int loops; /* Calc the number of the pages we need flush for space reservation */ - items = calc_reclaim_items_nr(fs_info, to_reclaim); - to_reclaim = items * EXTENT_SIZE_PER_ITEM; + if (to_reclaim == U64_MAX) { + items = U64_MAX; + } else { + /* + * to_reclaim is set to however much metadata we need to + * reclaim, but reclaiming that much data doesn't really track + * exactly, so increase the amount to reclaim by 2x in order to + * make sure we're flushing enough delalloc to hopefully reclaim + * some metadata reservations. 
+ */ + items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2; + to_reclaim = items * EXTENT_SIZE_PER_ITEM; + } trans = (struct btrfs_trans_handle *)current->journal_info; - space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); delalloc_bytes = percpu_counter_sum_positive( &fs_info->delalloc_bytes); @@ -557,37 +532,17 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, loops = 0; while ((delalloc_bytes || dio_bytes) && loops < 3) { - nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; - - /* - * Triggers inode writeback for up to nr_pages. This will invoke - * ->writepages callback and trigger delalloc filling - * (btrfs_run_delalloc_range()). - */ - btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items); + btrfs_start_delalloc_roots(fs_info, items); - /* - * We need to wait for the compressed pages to start before - * we continue. - */ - async_pages = atomic_read(&fs_info->async_delalloc_pages); - if (!async_pages) - goto skip_async; - - /* - * Calculate how many compressed pages we want to be written - * before we continue. I.e if there are more async pages than we - * require wait_event will wait until nr_pages are written. - */ - if (async_pages <= nr_pages) - async_pages = 0; - else - async_pages -= nr_pages; + loops++; + if (wait_ordered && !trans) { + btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); + } else { + time_left = schedule_timeout_killable(1); + if (time_left) + break; + } - wait_event(fs_info->async_submit_wait, - atomic_read(&fs_info->async_delalloc_pages) <= - (int)async_pages); -skip_async: spin_lock(&space_info->lock); if (list_empty(&space_info->tickets) && list_empty(&space_info->priority_tickets)) { @@ -596,14 +551,6 @@ skip_async: } spin_unlock(&space_info->lock); - loops++; - if (wait_ordered && !trans) { - btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); - } else { - time_left = schedule_timeout_killable(1); - if (time_left) - break; - } delalloc_bytes = percpu_counter_sum_positive( &fs_info->delalloc_bytes); dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); @@ -628,8 +575,8 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv; struct btrfs_trans_handle *trans; - u64 bytes_needed; u64 reclaim_bytes = 0; + u64 bytes_needed = 0; u64 cur_free_bytes = 0; trans = (struct btrfs_trans_handle *)current->journal_info; @@ -649,7 +596,8 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, else if (!list_empty(&space_info->tickets)) ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); - bytes_needed = (ticket) ? ticket->bytes : 0; + if (ticket) + bytes_needed = ticket->bytes; if (bytes_needed > cur_free_bytes) bytes_needed -= cur_free_bytes; @@ -676,8 +624,10 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, goto commit; /* - * See if there is some space in the delayed insertion reservation for - * this reservation. + * See if there is some space in the delayed insertion reserve for this + * reservation. If the space_info's don't match (like for DATA or + * SYSTEM) then just go enospc, reclaiming this space won't recover any + * space to satisfy those reservations. 
*/ if (space_info != delayed_rsv->space_info) goto enospc; @@ -742,7 +692,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; case FLUSH_DELALLOC: case FLUSH_DELALLOC_WAIT: - shrink_delalloc(fs_info, num_bytes * 2, num_bytes, + shrink_delalloc(fs_info, space_info, num_bytes, state == FLUSH_DELALLOC_WAIT); break; case FLUSH_DELAYED_REFS_NR: @@ -767,7 +717,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; } ret = btrfs_chunk_alloc(trans, - btrfs_metadata_alloc_profile(fs_info), + btrfs_get_alloc_profile(fs_info, space_info->flags), (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE); btrfs_end_transaction(trans); @@ -1037,9 +987,132 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) } while (flush_state <= COMMIT_TRANS); } -void btrfs_init_async_reclaim_work(struct work_struct *work) +/* + * FLUSH_DELALLOC_WAIT: + * Space is freed from flushing delalloc in one of two ways. + * + * 1) compression is on and we allocate less space than we reserved + * 2) we are overwriting existing space + * + * For #1 that extra space is reclaimed as soon as the delalloc pages are + * COWed, by way of btrfs_add_reserved_bytes() which adds the actual extent + * length to ->bytes_reserved, and subtracts the reserved space from + * ->bytes_may_use. + * + * For #2 this is trickier. Once the ordered extent runs we will drop the + * extent in the range we are overwriting, which creates a delayed ref for + * that freed extent. This however is not reclaimed until the transaction + * commits, thus the next stages. + * + * RUN_DELAYED_IPUTS + * If we are freeing inodes, we want to make sure all delayed iputs have + * completed, because they could have been on an inode with i_nlink == 0, and + * thus have been truncated and freed up space. But again this space is not + * immediately re-usable, it comes in the form of a delayed ref, which must be + * run and then the transaction must be committed. + * + * FLUSH_DELAYED_REFS + * The above two cases generate delayed refs that will affect + * ->total_bytes_pinned. However this counter can be inconsistent with + * reality if there are outstanding delayed refs. This is because we adjust + * the counter based solely on the current set of delayed refs and disregard + * any on-disk state which might include more refs. So for example, if we + * have an extent with 2 references, but we only drop 1, we'll see that there + * is a negative delayed ref count for the extent and assume that the space + * will be freed, and thus increase ->total_bytes_pinned. + * + * Running the delayed refs gives us the actual real view of what will be + * freed at the transaction commit time. This stage will not actually free + * space for us, it just makes sure that may_commit_transaction() has all of + * the information it needs to make the right decision. + * + * COMMIT_TRANS + * This is where we reclaim all of the pinned space generated by the previous + * two stages. We will not commit the transaction if we don't think we're + * likely to satisfy our request, which means if our current free space + + * total_bytes_pinned < reservation we will not commit. This is why the + * previous states are actually important, to make sure we know for sure + * whether committing the transaction will allow us to make progress. 
+ * + * ALLOC_CHUNK_FORCE + * For data we start with alloc chunk force, however we could have been full + * before, and then the transaction commit could have freed new block groups, + * so if we now have space to allocate do the force chunk allocation. + */ +static const enum btrfs_flush_state data_flush_states[] = { + FLUSH_DELALLOC_WAIT, + RUN_DELAYED_IPUTS, + FLUSH_DELAYED_REFS, + COMMIT_TRANS, + ALLOC_CHUNK_FORCE, +}; + +static void btrfs_async_reclaim_data_space(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + u64 last_tickets_id; + int flush_state = 0; + + fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work); + space_info = fs_info->data_sinfo; + + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets)) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + last_tickets_id = space_info->tickets_id; + spin_unlock(&space_info->lock); + + while (!space_info->full) { + flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE); + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets)) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + last_tickets_id = space_info->tickets_id; + spin_unlock(&space_info->lock); + } + + while (flush_state < ARRAY_SIZE(data_flush_states)) { + flush_space(fs_info, space_info, U64_MAX, + data_flush_states[flush_state]); + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets)) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + + if (last_tickets_id == space_info->tickets_id) { + flush_state++; + } else { + last_tickets_id = space_info->tickets_id; + flush_state = 0; + } + + if (flush_state >= ARRAY_SIZE(data_flush_states)) { + if (space_info->full) { + if (maybe_fail_all_tickets(fs_info, space_info)) + flush_state = 0; + else + space_info->flush = 0; + } else { + flush_state = 0; + } + } + spin_unlock(&space_info->lock); + } +} + +void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info) { - INIT_WORK(work, btrfs_async_reclaim_metadata_space); + INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space); + INIT_WORK(&fs_info->async_data_reclaim_work, btrfs_async_reclaim_data_space); } static const enum btrfs_flush_state priority_flush_states[] = { @@ -1089,6 +1162,21 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, } while (flush_state < states_nr); } +static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket) +{ + while (!space_info->full) { + flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE); + spin_lock(&space_info->lock); + if (ticket->bytes == 0) { + spin_unlock(&space_info->lock); + return; + } + spin_unlock(&space_info->lock); + } +} + static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, struct reserve_ticket *ticket) @@ -1141,6 +1229,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, int ret; switch (flush) { + case BTRFS_RESERVE_FLUSH_DATA: case BTRFS_RESERVE_FLUSH_ALL: case BTRFS_RESERVE_FLUSH_ALL_STEAL: wait_reserve_ticket(fs_info, space_info, ticket); @@ -1155,6 +1244,9 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, evict_flush_states, ARRAY_SIZE(evict_flush_states)); break; + case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE: + priority_reclaim_data_space(fs_info, space_info, ticket); + break; default: ASSERT(0); break; @@ -1214,11 
+1306,11 @@ static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush) * regain reservations will be made and this will fail if there is not enough * space already. */ -static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 orig_bytes, - enum btrfs_reserve_flush_enum flush) +static int __reserve_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 orig_bytes, + enum btrfs_reserve_flush_enum flush) { + struct work_struct *async_work; struct reserve_ticket ticket; u64 used; int ret = 0; @@ -1227,6 +1319,11 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, ASSERT(orig_bytes); ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); + if (flush == BTRFS_RESERVE_FLUSH_DATA) + async_work = &fs_info->async_data_reclaim_work; + else + async_work = &fs_info->async_reclaim_work; + spin_lock(&space_info->lock); ret = -ENOSPC; used = btrfs_space_info_used(space_info, true); @@ -1268,7 +1365,8 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, init_waitqueue_head(&ticket.wait); ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL); if (flush == BTRFS_RESERVE_FLUSH_ALL || - flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) { + flush == BTRFS_RESERVE_FLUSH_ALL_STEAL || + flush == BTRFS_RESERVE_FLUSH_DATA) { list_add_tail(&ticket.list, &space_info->tickets); if (!space_info->flush) { space_info->flush = 1; @@ -1276,8 +1374,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, space_info->flags, orig_bytes, flush, "enospc"); - queue_work(system_unbound_wq, - &fs_info->async_reclaim_work); + queue_work(system_unbound_wq, async_work); } } else { list_add_tail(&ticket.list, @@ -1329,8 +1426,7 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; int ret; - ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info, - orig_bytes, flush); + ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush); if (ret == -ENOSPC && unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { if (block_rsv != global_rsv && @@ -1348,3 +1444,32 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root, } return ret; } + +/** + * btrfs_reserve_data_bytes - try to reserve data bytes for an allocation + * @fs_info - the filesystem + * @bytes - the number of bytes we need + * @flush - how we are allowed to flush + * + * This will reserve bytes from the data space info. If there is not enough + * space then we will attempt to flush space as specified by flush. 
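As a rough illustration of how the interface documented here is meant to be consumed (a sketch under assumptions, not code from this series: example_reserve_for_write is a hypothetical caller, and the release helper named in the comment is only a suggestion):

	static int example_reserve_for_write(struct btrfs_fs_info *fs_info, u64 len)
	{
		int ret;

		/*
		 * Ticketed data reservation; may queue the async data reclaim
		 * work. Per the ASSERT in btrfs_reserve_data_bytes(), the
		 * caller must not have a transaction joined when using
		 * BTRFS_RESERVE_FLUSH_DATA.
		 */
		ret = btrfs_reserve_data_bytes(fs_info, len, BTRFS_RESERVE_FLUSH_DATA);
		if (ret)
			return ret;	/* -ENOSPC once all data flush states are exhausted */

		/*
		 * ... set up the delalloc range ...  If that later fails, the
		 * caller is expected to hand the bytes back, e.g. with
		 * btrfs_free_reserved_data_space_noquota().
		 */
		return 0;
	}
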
+ */ +int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, + enum btrfs_reserve_flush_enum flush) +{ + struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; + int ret; + + ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA || + flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE); + ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA); + + ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush); + if (ret == -ENOSPC) { + trace_btrfs_space_reservation(fs_info, "space_info:enospc", + data_sinfo->flags, bytes, 1); + if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) + btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0); + } + return ret; +} diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index c3c64019950a..5646393b928c 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -149,5 +149,7 @@ static inline void btrfs_space_info_free_bytes_may_use( btrfs_try_granting_tickets(fs_info, space_info); spin_unlock(&space_info->lock); } +int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, + enum btrfs_reserve_flush_enum flush); #endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index 079b059818e9..c46be27be700 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -7,16 +7,6 @@ #include "ctree.h" -static inline u8 get_unaligned_le8(const void *p) -{ - return *(u8 *)p; -} - -static inline void put_unaligned_le8(u8 val, void *p) -{ - *(u8 *)p = val; -} - static bool check_setget_bounds(const struct extent_buffer *eb, const void *ptr, unsigned off, int size) { diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 25967ecaaf0a..8840a4fa81eb 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1871,6 +1871,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) * the filesystem is busy. 
*/ cancel_work_sync(&fs_info->async_reclaim_work); + cancel_work_sync(&fs_info->async_data_reclaim_work); btrfs_discard_cleanup(fs_info); @@ -2163,8 +2164,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) u64 thresh = 0; int mixed = 0; - rcu_read_lock(); - list_for_each_entry_rcu(found, &fs_info->space_info, list) { + list_for_each_entry(found, &fs_info->space_info, list) { if (found->flags & BTRFS_BLOCK_GROUP_DATA) { int i; @@ -2193,8 +2193,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) total_used += found->disk_used; } - rcu_read_unlock(); - buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor); buf->f_blocks >>= bits; buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index c8df2edafd85..279d9262b676 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -14,6 +14,7 @@ #include "ctree.h" #include "discard.h" #include "disk-io.h" +#include "send.h" #include "transaction.h" #include "sysfs.h" #include "volumes.h" @@ -321,9 +322,17 @@ static ssize_t supported_checksums_show(struct kobject *kobj, } BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show); +static ssize_t send_stream_version_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", BTRFS_SEND_STREAM_VERSION); +} +BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show); + static struct attribute *btrfs_supported_static_feature_attrs[] = { BTRFS_ATTR_PTR(static_feature, rmdir_subvol), BTRFS_ATTR_PTR(static_feature, supported_checksums), + BTRFS_ATTR_PTR(static_feature, send_stream_version), NULL }; @@ -809,6 +818,42 @@ static ssize_t btrfs_checksum_show(struct kobject *kobj, BTRFS_ATTR(, checksum, btrfs_checksum_show); +static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + const char *str; + + switch (READ_ONCE(fs_info->exclusive_operation)) { + case BTRFS_EXCLOP_NONE: + str = "none\n"; + break; + case BTRFS_EXCLOP_BALANCE: + str = "balance\n"; + break; + case BTRFS_EXCLOP_DEV_ADD: + str = "device add\n"; + break; + case BTRFS_EXCLOP_DEV_REMOVE: + str = "device remove\n"; + break; + case BTRFS_EXCLOP_DEV_REPLACE: + str = "device replace\n"; + break; + case BTRFS_EXCLOP_RESIZE: + str = "resize\n"; + break; + case BTRFS_EXCLOP_SWAP_ACTIVATE: + str = "swap activate\n"; + break; + default: + str = "UNKNOWN\n"; + break; + } + return scnprintf(buf, PAGE_SIZE, "%s", str); +} +BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show); + static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, label), BTRFS_ATTR_PTR(, nodesize), @@ -817,6 +862,7 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, quota_override), BTRFS_ATTR_PTR(, metadata_uuid), BTRFS_ATTR_PTR(, checksum), + BTRFS_ATTR_PTR(, exclusive_operation), NULL, }; @@ -935,12 +981,24 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) } } +static void btrfs_sysfs_remove_fs_devices(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *seed; + + list_for_each_entry(device, &fs_devices->devices, dev_list) + btrfs_sysfs_remove_device(device); + + list_for_each_entry(seed, &fs_devices->seed_list, seed_list) { + list_for_each_entry(device, &seed->devices, dev_list) + btrfs_sysfs_remove_device(device); + } +} + void btrfs_sysfs_remove_mounted(struct btrfs_fs_info 
*fs_info) { struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj; - btrfs_reset_fs_info_ptr(fs_info); - sysfs_remove_link(fsid_kobj, "bdi"); if (fs_info->space_info_kobj) { @@ -964,7 +1022,7 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) addrm_unknown_feature_attrs(fs_info, false); sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group); sysfs_remove_files(fsid_kobj, btrfs_attrs); - btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, NULL); + btrfs_sysfs_remove_fs_devices(fs_info->fs_devices); } static const char * const btrfs_feature_set_names[FEAT_MAX] = { @@ -973,7 +1031,7 @@ static const char * const btrfs_feature_set_names[FEAT_MAX] = { [FEAT_INCOMPAT] = "incompat", }; -const char * const btrfs_feature_set_name(enum btrfs_feature_set set) +const char *btrfs_feature_set_name(enum btrfs_feature_set set) { return btrfs_feature_set_names[set]; } @@ -1079,17 +1137,38 @@ void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache) rkobj->flags = cache->flags; kobject_init(&rkobj->kobj, &btrfs_raid_ktype); + + /* + * We call this either on mount, or if we've created a block group for a + * new index type while running (i.e. when restriping). The running + * case is tricky because we could race with other threads, so we need + * to have this check to make sure we didn't already init the kobject. + * + * We don't have to protect on the free side because it only happens on + * unmount. + */ + spin_lock(&space_info->lock); + if (space_info->block_group_kobjs[index]) { + spin_unlock(&space_info->lock); + kobject_put(&rkobj->kobj); + return; + } else { + space_info->block_group_kobjs[index] = &rkobj->kobj; + } + spin_unlock(&space_info->lock); + ret = kobject_add(&rkobj->kobj, &space_info->kobj, "%s", btrfs_bg_type_to_raid_name(rkobj->flags)); memalloc_nofs_restore(nofs_flag); if (ret) { + spin_lock(&space_info->lock); + space_info->block_group_kobjs[index] = NULL; + spin_unlock(&space_info->lock); kobject_put(&rkobj->kobj); btrfs_warn(fs_info, "failed to add kobject for block cache, ignoring"); return; } - - space_info->block_group_kobjs[index] = &rkobj->kobj; } /* @@ -1151,48 +1230,30 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, return 0; } -/* when one_device is NULL, it removes all device links */ - -int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *one_device) +void btrfs_sysfs_remove_device(struct btrfs_device *device) { struct hd_struct *disk; struct kobject *disk_kobj; + struct kobject *devices_kobj; - if (!fs_devices->devices_kobj) - return -EINVAL; - - if (one_device) { - if (one_device->bdev) { - disk = one_device->bdev->bd_part; - disk_kobj = &part_to_dev(disk)->kobj; - sysfs_remove_link(fs_devices->devices_kobj, - disk_kobj->name); - } - - kobject_del(&one_device->devid_kobj); - kobject_put(&one_device->devid_kobj); - - wait_for_completion(&one_device->kobj_unregister); + /* + * Seed fs_devices devices_kobj aren't used, fetch kobject from the + * fs_info::fs_devices. 
+ */ + devices_kobj = device->fs_info->fs_devices->devices_kobj; + ASSERT(devices_kobj); - return 0; + if (device->bdev) { + disk = device->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; + sysfs_remove_link(devices_kobj, disk_kobj->name); } - list_for_each_entry(one_device, &fs_devices->devices, dev_list) { - - if (one_device->bdev) { - disk = one_device->bdev->bd_part; - disk_kobj = &part_to_dev(disk)->kobj; - sysfs_remove_link(fs_devices->devices_kobj, - disk_kobj->name); - } - kobject_del(&one_device->devid_kobj); - kobject_put(&one_device->devid_kobj); - - wait_for_completion(&one_device->kobj_unregister); + if (device->devid_kobj.state_initialized) { + kobject_del(&device->devid_kobj); + kobject_put(&device->devid_kobj); + wait_for_completion(&device->kobj_unregister); } - - return 0; } static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj, @@ -1273,44 +1334,80 @@ static struct kobj_type devid_ktype = { .release = btrfs_release_devid_kobj, }; -int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *one_device) +int btrfs_sysfs_add_device(struct btrfs_device *device) { - int error = 0; - struct btrfs_device *dev; + int ret; unsigned int nofs_flag; + struct kobject *devices_kobj; + struct kobject *devinfo_kobj; - nofs_flag = memalloc_nofs_save(); - list_for_each_entry(dev, &fs_devices->devices, dev_list) { + /* + * Make sure we use the fs_info::fs_devices to fetch the kobjects even + * for the seed fs_devices + */ + devices_kobj = device->fs_info->fs_devices->devices_kobj; + devinfo_kobj = device->fs_info->fs_devices->devinfo_kobj; + ASSERT(devices_kobj); + ASSERT(devinfo_kobj); - if (one_device && one_device != dev) - continue; + nofs_flag = memalloc_nofs_save(); - if (dev->bdev) { - struct hd_struct *disk; - struct kobject *disk_kobj; + if (device->bdev) { + struct hd_struct *disk; + struct kobject *disk_kobj; - disk = dev->bdev->bd_part; - disk_kobj = &part_to_dev(disk)->kobj; + disk = device->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; - error = sysfs_create_link(fs_devices->devices_kobj, - disk_kobj, disk_kobj->name); - if (error) - break; + ret = sysfs_create_link(devices_kobj, disk_kobj, disk_kobj->name); + if (ret) { + btrfs_warn(device->fs_info, + "creating sysfs device link for devid %llu failed: %d", + device->devid, ret); + goto out; } + } - init_completion(&dev->kobj_unregister); - error = kobject_init_and_add(&dev->devid_kobj, &devid_ktype, - fs_devices->devinfo_kobj, "%llu", - dev->devid); - if (error) { - kobject_put(&dev->devid_kobj); - break; - } + init_completion(&device->kobj_unregister); + ret = kobject_init_and_add(&device->devid_kobj, &devid_ktype, + devinfo_kobj, "%llu", device->devid); + if (ret) { + kobject_put(&device->devid_kobj); + btrfs_warn(device->fs_info, + "devinfo init for devid %llu failed: %d", + device->devid, ret); } + +out: memalloc_nofs_restore(nofs_flag); + return ret; +} - return error; +static int btrfs_sysfs_add_fs_devices(struct btrfs_fs_devices *fs_devices) +{ + int ret; + struct btrfs_device *device; + struct btrfs_fs_devices *seed; + + list_for_each_entry(device, &fs_devices->devices, dev_list) { + ret = btrfs_sysfs_add_device(device); + if (ret) + goto fail; + } + + list_for_each_entry(seed, &fs_devices->seed_list, seed_list) { + list_for_each_entry(device, &seed->devices, dev_list) { + ret = btrfs_sysfs_add_device(device); + if (ret) + goto fail; + } + } + + return 0; + +fail: + btrfs_sysfs_remove_fs_devices(fs_devices); + return ret; } void 
btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action) @@ -1324,8 +1421,8 @@ void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action) &disk_to_dev(bdev->bd_disk)->kobj); } -void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices, - const u8 *fsid) +void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices) + { char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; @@ -1333,7 +1430,7 @@ void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices, * Sprouting changes fsid of the mounted filesystem, rename the fsid * directory */ - snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fsid); + snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fs_devices->fsid); if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf)) btrfs_warn(fs_devices->fs_info, "sysfs: failed to create fsid for sprout"); @@ -1400,15 +1497,13 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) struct btrfs_fs_devices *fs_devs = fs_info->fs_devices; struct kobject *fsid_kobj = &fs_devs->fsid_kobj; - btrfs_set_fs_info_ptr(fs_info); - - error = btrfs_sysfs_add_devices_dir(fs_devs, NULL); + error = btrfs_sysfs_add_fs_devices(fs_devs); if (error) return error; error = sysfs_create_files(fsid_kobj, btrfs_attrs); if (error) { - btrfs_sysfs_remove_devices_dir(fs_devs, NULL); + btrfs_sysfs_remove_fs_devices(fs_devs); return error; } @@ -1626,12 +1721,16 @@ void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, { struct btrfs_fs_devices *fs_devs; struct kobject *fsid_kobj; - u64 features; - int ret; + u64 __maybe_unused features; + int __maybe_unused ret; if (!fs_info) return; + /* + * See 14e46e04958df74 and e410e34fad913dd, feature bit updates are not + * safe when called from some contexts (eg. balance) + */ features = get_features(fs_info, set); ASSERT(bit & supported_feature_masks[set]); diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index cf839c46a131..bacef43f7267 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -13,15 +13,12 @@ enum btrfs_feature_set { }; char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); -const char * const btrfs_feature_set_name(enum btrfs_feature_set set); -int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *one_device); -int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *one_device); +const char *btrfs_feature_set_name(enum btrfs_feature_set set); +int btrfs_sysfs_add_device(struct btrfs_device *device); +void btrfs_sysfs_remove_device(struct btrfs_device *device); int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); -void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices, - const u8 *fsid); +void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices); void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, u64 bit, enum btrfs_feature_set set); void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action); diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index a1b9f9b5978e..df54cdfdc250 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -60,8 +60,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) key.type = BTRFS_EXTENT_CSUM_KEY; key.offset = 0; - setup_items_for_insert(root, path, &key, &value_len, value_len, - value_len + sizeof(struct btrfs_item), 1); + 
setup_items_for_insert(root, path, &key, &value_len, 1); item = btrfs_item_nr(0); write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0), value_len); diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 894a63a92236..e6719f7db386 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -33,8 +33,7 @@ static void insert_extent(struct btrfs_root *root, u64 start, u64 len, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = start; - setup_items_for_insert(root, &path, &key, &value_len, value_len, - value_len + sizeof(struct btrfs_item), 1); + setup_items_for_insert(root, &path, &key, &value_len, 1); fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, 1); btrfs_set_file_extent_type(leaf, fi, type); @@ -64,8 +63,7 @@ static void insert_inode_item_key(struct btrfs_root *root) key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - setup_items_for_insert(root, &path, &key, &value_len, value_len, - value_len + sizeof(struct btrfs_item), 1); + setup_items_for_insert(root, &path, &key, &value_len, 1); } /* @@ -951,7 +949,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } BTRFS_I(inode)->root = root; - btrfs_test_inode_set_ops(inode); /* [BTRFS_MAX_EXTENT_SIZE] */ ret = btrfs_set_extent_delalloc(BTRFS_I(inode), 0, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index d2fc292ac61b..52ada47aff50 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -292,6 +292,8 @@ loop: } cur_trans->fs_info = fs_info; + atomic_set(&cur_trans->pending_ordered, 0); + init_waitqueue_head(&cur_trans->pending_wait); atomic_set(&cur_trans->num_writers, 1); extwriter_counter_init(cur_trans, type); init_waitqueue_head(&cur_trans->writer_wait); @@ -1182,7 +1184,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) eb = btrfs_lock_root_node(fs_info->tree_root); ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, - 0, &eb); + 0, &eb, BTRFS_NESTING_COW); btrfs_tree_unlock(eb); free_extent_buffer(eb); @@ -1587,7 +1589,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_set_root_otransid(new_root_item, trans->transid); old = btrfs_lock_root_node(root); - ret = btrfs_cow_block(trans, root, old, NULL, 0, &old); + ret = btrfs_cow_block(trans, root, old, NULL, 0, &old, + BTRFS_NESTING_COW); if (ret) { btrfs_tree_unlock(old); free_extent_buffer(old); @@ -2165,6 +2168,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_wait_delalloc_flush(trans); + /* + * Wait for all ordered extents started by a fast fsync that joined this + * transaction. Otherwise if this transaction commits before the ordered + * extents complete we lose logged data after a power failure. + */ + wait_event(cur_trans->pending_wait, + atomic_read(&cur_trans->pending_ordered) == 0); + btrfs_scrub_pause(fs_info); /* * Ok now we need to make sure to block out any other joins while we diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index d60b055b8695..858d9153a1cd 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -85,6 +85,13 @@ struct btrfs_transaction { spinlock_t dropped_roots_lock; struct btrfs_delayed_ref_root delayed_refs; struct btrfs_fs_info *fs_info; + + /* + * Number of ordered extents the transaction must wait for before + * committing. These are ordered extents started by a fast fsync. 
+ */ + atomic_t pending_ordered; + wait_queue_head_t pending_wait; }; #define __TRANS_FREEZABLE (1U << 0) @@ -105,6 +112,7 @@ struct btrfs_transaction { #define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH) #define BTRFS_SEND_TRANS_STUB ((void *)1) +#define BTRFS_DIO_SYNC_STUB ((void *)2) struct btrfs_trans_handle { u64 transid; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 7b1fee630f97..f0ffd5ee77bd 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1035,7 +1035,7 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { struct btrfs_fs_info *fs_info = leaf->fs_info; - struct btrfs_root_item ri; + struct btrfs_root_item ri = { 0 }; const u64 valid_root_flags = BTRFS_ROOT_SUBVOL_RDONLY | BTRFS_ROOT_SUBVOL_DEAD; int ret; @@ -1044,14 +1044,21 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, if (ret < 0) return ret; - if (btrfs_item_size_nr(leaf, slot) != sizeof(ri)) { + if (btrfs_item_size_nr(leaf, slot) != sizeof(ri) && + btrfs_item_size_nr(leaf, slot) != btrfs_legacy_root_item_size()) { generic_err(leaf, slot, - "invalid root item size, have %u expect %zu", - btrfs_item_size_nr(leaf, slot), sizeof(ri)); + "invalid root item size, have %u expect %zu or %u", + btrfs_item_size_nr(leaf, slot), sizeof(ri), + btrfs_legacy_root_item_size()); } + /* + * For legacy root item, the members starting at generation_v2 will be + * all filled with 0. + * And since we allow generation_v2 as 0, it will still pass the check. + */ read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot), - sizeof(ri)); + btrfs_item_size_nr(leaf, slot)); /* Generation related */ if (btrfs_root_generation(&ri) > diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 39da9db35278..56cbc1706b6f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -96,8 +96,6 @@ enum { static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, int inode_only, - const loff_t start, - const loff_t end, struct btrfs_log_ctx *ctx); static int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -176,7 +174,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans, atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); - if (ctx) { + if (ctx && !ctx->logging_new_name) { int index = root->log_transid % 2; list_add_tail(&ctx->list, &root->log_ctxs[index]); ctx->log_transid = root->log_transid; @@ -215,9 +213,7 @@ static int join_running_log_trans(struct btrfs_root *root) */ void btrfs_pin_log_trans(struct btrfs_root *root) { - mutex_lock(&root->log_mutex); atomic_inc(&root->log_writers); - mutex_unlock(&root->log_mutex); } /* @@ -3615,6 +3611,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, * search and this search we'll not find the key again and can just * bail.
*/ +search: ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); if (ret != 0) goto done; @@ -3634,6 +3631,13 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, if (min_key.objectid != ino || min_key.type != key_type) goto done; + + if (need_resched()) { + btrfs_release_path(path); + cond_resched(); + goto search; + } + ret = overwrite_item(trans, log, dst_path, src, i, &min_key); if (ret) { @@ -4082,10 +4086,14 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) static int log_extent_csums(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_root *log_root, - const struct extent_map *em) + const struct extent_map *em, + struct btrfs_log_ctx *ctx) { + struct btrfs_ordered_extent *ordered; u64 csum_offset; u64 csum_len; + u64 mod_start = em->mod_start; + u64 mod_len = em->mod_len; LIST_HEAD(ordered_sums); int ret = 0; @@ -4094,13 +4102,71 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, em->block_start == EXTENT_MAP_HOLE) return 0; + list_for_each_entry(ordered, &ctx->ordered_extents, log_list) { + const u64 ordered_end = ordered->file_offset + ordered->num_bytes; + const u64 mod_end = mod_start + mod_len; + struct btrfs_ordered_sum *sums; + + if (mod_len == 0) + break; + + if (ordered_end <= mod_start) + continue; + if (mod_end <= ordered->file_offset) + break; + + /* + * We are going to copy all the csums on this ordered extent, so + * go ahead and adjust mod_start and mod_len in case this ordered + * extent has already been logged. + */ + if (ordered->file_offset > mod_start) { + if (ordered_end >= mod_end) + mod_len = ordered->file_offset - mod_start; + /* + * If we have this case + * + * |--------- logged extent ---------| + * |----- ordered extent ----| + * + * Just don't mess with mod_start and mod_len, we'll + * just end up logging more csums than we need and it + * will be ok. + */ + } else { + if (ordered_end < mod_end) { + mod_len = mod_end - ordered_end; + mod_start = ordered_end; + } else { + mod_len = 0; + } + } + + /* + * To keep us from looping for the above case of an ordered + * extent that falls inside of the logged extent. + */ + if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags)) + continue; + + list_for_each_entry(sums, &ordered->list, list) { + ret = log_csums(trans, inode, log_root, sums); + if (ret) + return ret; + } + } + + /* We're done, found all csums in the ordered extents. */ + if (mod_len == 0) + return 0; + /* If we're compressed we have to save the entire range of csums. */ if (em->compress_type) { csum_offset = 0; csum_len = max(em->block_len, em->orig_block_len); } else { - csum_offset = em->mod_start - em->start; - csum_len = em->mod_len; + csum_offset = mod_start - em->start; + csum_len = mod_len; } /* block start is already adjusted for the file extent offset. 
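To make the mod_start/mod_len trimming above concrete, consider a logged extent with mod_start 0 and mod_len 256K while ctx->ordered_extents holds one already-logged ordered extent covering [0, 64K): the else branch applies and leaves mod_start at 64K and mod_len at 192K, so only the csums for the uncovered tail are copied from the csum tree. A standalone sketch of just that arithmetic (example_trim is a hypothetical helper, not part of the patch):

	static u64 example_trim(void)
	{
		u64 mod_start = 0, mod_len = SZ_256K;		/* logged extent */
		const u64 ordered_start = 0;			/* ordered extent ... */
		const u64 ordered_end = ordered_start + SZ_64K;	/* ... already logged */
		const u64 mod_end = mod_start + mod_len;

		if (ordered_start > mod_start) {
			if (ordered_end >= mod_end)
				mod_len = ordered_start - mod_start;
			/* partial overlap at the tail: leave the range alone */
		} else if (ordered_end < mod_end) {
			mod_len = mod_end - ordered_end;	/* 192K */
			mod_start = ordered_end;		/* 64K  */
		} else {
			mod_len = 0;				/* fully covered */
		}
		return mod_len;
	}
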
*/ @@ -4140,7 +4206,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, int ret; int extent_inserted = 0; - ret = log_extent_csums(trans, inode, log, em); + ret = log_extent_csums(trans, inode, log, em, ctx); if (ret) return ret; @@ -4342,10 +4408,10 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_path *path, - struct btrfs_log_ctx *ctx, - const u64 start, - const u64 end) + struct btrfs_log_ctx *ctx) { + struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_extent *tmp; struct extent_map *em, *n; struct list_head extents; struct extent_map_tree *tree = &inode->extent_tree; @@ -4359,23 +4425,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, test_gen = root->fs_info->last_trans_committed; list_for_each_entry_safe(em, n, &tree->modified_extents, list) { - /* - * Skip extents outside our logging range. It's important to do - * it for correctness because if we don't ignore them, we may - * log them before their ordered extent completes, and therefore - * we could log them without logging their respective checksums - * (the checksum items are added to the csum tree at the very - * end of btrfs_finish_ordered_io()). Also leave such extents - * outside of our range in the list, since we may have another - * ranged fsync in the near future that needs them. If an extent - * outside our range corresponds to a hole, log it to avoid - * leaving gaps between extents (fsck will complain when we are - * not using the NO_HOLES feature). - */ - if ((em->start > end || em->start + em->len <= start) && - em->block_start != EXTENT_MAP_HOLE) - continue; - list_del_init(&em->list); /* * Just an arbitrary number, this can be really CPU intensive @@ -4434,8 +4483,32 @@ process: btrfs_release_path(path); if (!ret) ret = btrfs_log_prealloc_extents(trans, inode, path); + if (ret) + return ret; - return ret; + /* + * We have logged all extents successfully, now make sure the commit of + * the current transaction waits for the ordered extents to complete + * before it commits and wipes out the log trees, otherwise we would + * lose data if an ordered extents completes after the transaction + * commits and a power failure happens after the transaction commit. + */ + list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { + list_del_init(&ordered->log_list); + set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags); + + if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { + spin_lock_irq(&inode->ordered_tree.lock); + if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { + set_bit(BTRFS_ORDERED_PENDING, &ordered->flags); + atomic_inc(&trans->transaction->pending_ordered); + } + spin_unlock_irq(&inode->ordered_tree.lock); + } + btrfs_put_ordered_extent(ordered); + } + + return 0; } static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, @@ -4841,7 +4914,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, ret = btrfs_log_inode(trans, root, BTRFS_I(inode), LOG_OTHER_INODE_ALL, - 0, LLONG_MAX, ctx); + ctx); btrfs_add_delayed_iput(inode); } } @@ -4883,7 +4956,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * Check the inode's logged_trans only instead of * btrfs_inode_in_log(). This is because the last_log_commit of * the inode is not updated when we only log that it exists and - * and it has the full sync bit set (see btrfs_log_inode()). + * it has the full sync bit set (see btrfs_log_inode()). 
*/ if (BTRFS_I(inode)->logged_trans == trans->transid) { spin_unlock(&BTRFS_I(inode)->lock); @@ -4899,7 +4972,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * log with the new name before we unpin it. */ ret = btrfs_log_inode(trans, root, BTRFS_I(inode), - LOG_OTHER_INODE, 0, LLONG_MAX, ctx); + LOG_OTHER_INODE, ctx); if (ret) { btrfs_add_delayed_iput(inode); continue; @@ -5112,8 +5185,6 @@ next_key: static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, int inode_only, - const loff_t start, - const loff_t end, struct btrfs_log_ctx *ctx) { struct btrfs_path *path; @@ -5292,7 +5363,7 @@ log_extents: } if (fast_search) { ret = btrfs_log_changed_extents(trans, root, inode, dst_path, - ctx, start, end); + ctx); if (ret) { err = ret; goto out_unlock; @@ -5301,31 +5372,8 @@ log_extents: struct extent_map *em, *n; write_lock(&em_tree->lock); - /* - * We can't just remove every em if we're called for a ranged - * fsync - that is, one that doesn't cover the whole possible - * file range (0 to LLONG_MAX). This is because we can have - * em's that fall outside the range we're logging and therefore - * their ordered operations haven't completed yet - * (btrfs_finish_ordered_io() not invoked yet). This means we - * didn't get their respective file extent item in the fs/subvol - * tree yet, and need to let the next fast fsync (one which - * consults the list of modified extent maps) find the em so - * that it logs a matching file extent item and waits for the - * respective ordered operation to complete (if it's still - * running). - * - * Removing every em outside the range we're logging would make - * the next fast fsync not log their matching file extent items, - * therefore making us lose data after a log replay. - */ - list_for_each_entry_safe(em, n, &em_tree->modified_extents, - list) { - const u64 mod_end = em->mod_start + em->mod_len - 1; - - if (em->mod_start >= start && mod_end <= end) - list_del_init(&em->list); - } + list_for_each_entry_safe(em, n, &em_tree->modified_extents, list) + list_del_init(&em->list); write_unlock(&em_tree->lock); } @@ -5339,19 +5387,34 @@ log_extents: } /* - * Don't update last_log_commit if we logged that an inode exists after - * it was loaded to memory (full_sync bit set). - * This is to prevent data loss when we do a write to the inode, then - * the inode gets evicted after all delalloc was flushed, then we log - * it exists (due to a rename for example) and then fsync it. This last - * fsync would do nothing (not logging the extents previously written). + * If we are logging that an ancestor inode exists as part of logging a + * new name from a link or rename operation, don't mark the inode as + * logged - otherwise if an explicit fsync is made against an ancestor, + * the fsync considers the inode in the log and doesn't sync the log, + * resulting in the ancestor missing after a power failure unless the + * log was synced as part of an fsync against any other unrelated inode. + * So keep it simple for this case and just don't flag the ancestors as + * logged. 
*/ - spin_lock(&inode->lock); - inode->logged_trans = trans->transid; - if (inode_only != LOG_INODE_EXISTS || - !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) - inode->last_log_commit = inode->last_sub_trans; - spin_unlock(&inode->lock); + if (!ctx || + !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name && + &inode->vfs_inode != ctx->inode)) { + spin_lock(&inode->lock); + inode->logged_trans = trans->transid; + /* + * Don't update last_log_commit if we logged that an inode exists + * after it was loaded to memory (full_sync bit set). + * This is to prevent data loss when we do a write to the inode, + * then the inode gets evicted after all delalloc was flushed, + * then we log it exists (due to a rename for example) and then + * fsync it. This last fsync would do nothing (not logging the + * extents previously written). + */ + if (inode_only != LOG_INODE_EXISTS || + !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) + inode->last_log_commit = inode->last_sub_trans; + spin_unlock(&inode->lock); + } out_unlock: mutex_unlock(&inode->log_mutex); @@ -5591,7 +5654,7 @@ process_leaf: if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) log_mode = LOG_INODE_ALL; ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), - log_mode, 0, LLONG_MAX, ctx); + log_mode, ctx); if (!ret && btrfs_must_commit_transaction(trans, BTRFS_I(di_inode))) ret = 1; @@ -5735,7 +5798,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, if (ctx) ctx->log_new_dentries = false; ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode), - LOG_INODE_ALL, 0, LLONG_MAX, ctx); + LOG_INODE_ALL, ctx); if (!ret && btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode))) ret = 1; @@ -5786,8 +5849,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, if (BTRFS_I(inode)->generation > last_committed) ret = btrfs_log_inode(trans, root, BTRFS_I(inode), - LOG_INODE_EXISTS, - 0, LLONG_MAX, ctx); + LOG_INODE_EXISTS, ctx); btrfs_add_delayed_iput(inode); if (ret) return ret; @@ -5842,7 +5904,7 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, if (inode->generation > fs_info->last_trans_committed) { ret = btrfs_log_inode(trans, root, inode, - LOG_INODE_EXISTS, 0, LLONG_MAX, ctx); + LOG_INODE_EXISTS, ctx); if (ret) break; } @@ -5950,8 +6012,6 @@ out: static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct dentry *parent, - const loff_t start, - const loff_t end, int inode_only, struct btrfs_log_ctx *ctx) { @@ -6004,7 +6064,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (ret) goto end_no_trans; - ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx); + ret = btrfs_log_inode(trans, root, inode, inode_only, ctx); if (ret) goto end_trans; @@ -6100,15 +6160,13 @@ end_no_trans: */ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct dentry *dentry, - const loff_t start, - const loff_t end, struct btrfs_log_ctx *ctx) { struct dentry *parent = dget_parent(dentry); int ret; ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent, - start, end, LOG_INODE_ALL, ctx); + LOG_INODE_ALL, ctx); dput(parent); return ret; @@ -6371,26 +6429,13 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, /* * Call this after adding a new name for a file and it will properly * update the log to reflect the new name. - * - * @ctx can not be NULL when @sync_log is false, and should be NULL when it's - * true (because it's not used). 
- * - * Return value depends on whether @sync_log is true or false. - * When true: returns BTRFS_NEED_TRANS_COMMIT if the transaction needs to be - * committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT - * otherwise. - * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need to - * to sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log, - * or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be - * committed (without attempting to sync the log). */ -int btrfs_log_new_name(struct btrfs_trans_handle *trans, +void btrfs_log_new_name(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_inode *old_dir, - struct dentry *parent, - bool sync_log, struct btrfs_log_ctx *ctx) + struct dentry *parent) { struct btrfs_fs_info *fs_info = trans->fs_info; - int ret; + struct btrfs_log_ctx ctx; /* * this will force the logging code to walk the dentry chain @@ -6405,34 +6450,17 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, */ if (inode->logged_trans <= fs_info->last_trans_committed && (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed)) - return sync_log ? BTRFS_DONT_NEED_TRANS_COMMIT : - BTRFS_DONT_NEED_LOG_SYNC; - - if (sync_log) { - struct btrfs_log_ctx ctx2; - - btrfs_init_log_ctx(&ctx2, &inode->vfs_inode); - ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, - LOG_INODE_EXISTS, &ctx2); - if (ret == BTRFS_NO_LOG_SYNC) - return BTRFS_DONT_NEED_TRANS_COMMIT; - else if (ret) - return BTRFS_NEED_TRANS_COMMIT; - - ret = btrfs_sync_log(trans, inode->root, &ctx2); - if (ret) - return BTRFS_NEED_TRANS_COMMIT; - return BTRFS_DONT_NEED_TRANS_COMMIT; - } - - ASSERT(ctx); - ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, - LOG_INODE_EXISTS, ctx); - if (ret == BTRFS_NO_LOG_SYNC) - return BTRFS_DONT_NEED_LOG_SYNC; - else if (ret) - return BTRFS_NEED_TRANS_COMMIT; + return; - return BTRFS_NEED_LOG_SYNC; + btrfs_init_log_ctx(&ctx, &inode->vfs_inode); + ctx.logging_new_name = true; + /* + * We don't care about the return value. If we fail to log the new name + * then we know the next attempt to sync the log will fallback to a full + * transaction commit (due to a call to btrfs_set_log_full_commit()), so + * we don't need to worry about getting a log committed that has an + * inconsistent state after a rename operation. + */ + btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 132e43d29034..731bd9c029f5 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -16,8 +16,11 @@ struct btrfs_log_ctx { int log_ret; int log_transid; bool log_new_dentries; + bool logging_new_name; struct inode *inode; struct list_head list; + /* Only used for fast fsyncs. 
*/ + struct list_head ordered_extents; }; static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, @@ -26,8 +29,23 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, ctx->log_ret = 0; ctx->log_transid = 0; ctx->log_new_dentries = false; + ctx->logging_new_name = false; ctx->inode = inode; INIT_LIST_HEAD(&ctx->list); + INIT_LIST_HEAD(&ctx->ordered_extents); +} + +static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx) +{ + struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_extent *tmp; + + ASSERT(inode_is_locked(ctx->inode)); + + list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { + list_del_init(&ordered->log_list); + btrfs_put_ordered_extent(ordered); + } } static inline void btrfs_set_log_full_commit(struct btrfs_trans_handle *trans) @@ -49,8 +67,6 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, int btrfs_recover_log_trees(struct btrfs_root *tree_root); int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct dentry *dentry, - const loff_t start, - const loff_t end, struct btrfs_log_ctx *ctx); int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -67,16 +83,8 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, int for_rename); void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, struct btrfs_inode *dir); -/* Return values for btrfs_log_new_name() */ -enum { - BTRFS_DONT_NEED_TRANS_COMMIT, - BTRFS_NEED_TRANS_COMMIT, - BTRFS_DONT_NEED_LOG_SYNC, - BTRFS_NEED_LOG_SYNC, -}; -int btrfs_log_new_name(struct btrfs_trans_handle *trans, +void btrfs_log_new_name(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_inode *old_dir, - struct dentry *parent, - bool sync_log, struct btrfs_log_ctx *ctx); + struct dentry *parent); #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 117b43367629..58b9c419a2b6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -291,8 +291,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * balance_mutex * * - * Exclusive operations, BTRFS_FS_EXCL_OP - * ====================================== + * Exclusive operations + * ==================== * * Maintains the exclusivity of the following operations that apply to the * whole filesystem and cannot run in parallel. @@ -318,11 +318,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * - system power-cycle and filesystem mounted as read-only * - filesystem or device errors leading to forced read-only * - * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations. - * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set. + * The status of exclusive operation is set and cleared atomically. + * During the course of Paused state, fs_info::exclusive_operation remains set. * A device operation in Paused or Running state can be canceled or resumed * either by ioctl (Balance only) or when remounted as read-write. - * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or + * The exclusive status is cleared when the device operation is canceled or * completed. 
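The protocol described above amounts to a try-lock around whole-filesystem operations: an ioctl that needs exclusivity attempts to set fs_info::exclusive_operation before doing any work and clears it when done. A usage sketch based on the btrfs_exclop_start()/btrfs_exclop_finish() pairing used elsewhere in this series (example_dev_add_ioctl is a hypothetical name, and the error value is simplified; the real ioctls return a dedicated btrfs error code for this case):

	static long example_dev_add_ioctl(struct btrfs_fs_info *fs_info)
	{
		/* Fails if balance, resize, replace etc. is already running. */
		if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD))
			return -EBUSY;

		/* ... add the device, commit the transaction ... */

		btrfs_exclop_finish(fs_info);
		return 0;
	}
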
*/ @@ -356,6 +356,7 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, INIT_LIST_HEAD(&fs_devs->devices); INIT_LIST_HEAD(&fs_devs->alloc_list); INIT_LIST_HEAD(&fs_devs->fs_list); + INIT_LIST_HEAD(&fs_devs->seed_list); if (fsid) memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); @@ -406,7 +407,7 @@ void __exit btrfs_cleanup_fs_uuids(void) * Returned struct is not linked onto any lists and must be destroyed using * btrfs_free_device. */ -static struct btrfs_device *__alloc_device(void) +static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info) { struct btrfs_device *dev; @@ -433,7 +434,8 @@ static struct btrfs_device *__alloc_device(void) btrfs_device_data_ordered_init(dev); INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); - extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL); + extent_io_tree_init(fs_info, &dev->alloc_state, + IO_TREE_DEVICE_ALLOC_STATE, NULL); return dev; } @@ -593,8 +595,6 @@ static int btrfs_free_stale_devices(const char *path, btrfs_free_device(device); ret = 0; - if (fs_devices->num_devices == 0) - break; } mutex_unlock(&fs_devices->device_list_mutex); @@ -941,16 +941,18 @@ static noinline struct btrfs_device *device_list_add(const char *path, bdput(path_bdev); mutex_unlock(&fs_devices->device_list_mutex); btrfs_warn_in_rcu(device->fs_info, - "duplicate device fsid:devid for %pU:%llu old:%s new:%s", - disk_super->fsid, devid, - rcu_str_deref(device->name), path); + "duplicate device %s devid %llu generation %llu scanned by %s (%d)", + path, devid, found_transid, + current->comm, + task_pid_nr(current)); return ERR_PTR(-EEXIST); } bdput(path_bdev); btrfs_info_in_rcu(device->fs_info, - "device fsid %pU devid %llu moved old:%s new:%s", - disk_super->fsid, devid, - rcu_str_deref(device->name), path); + "devid %llu device path %s changed to %s scanned by %s (%d)", + devid, rcu_str_deref(device->name), + path, current->comm, + task_pid_nr(current)); } name = rcu_string_strdup(path, GFP_NOFS); @@ -1035,28 +1037,21 @@ error: return ERR_PTR(ret); } -/* - * After we have read the system tree and know devids belonging to - * this filesystem, remove the device which does not belong there. - */ -void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step) +static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, + int step, struct btrfs_device **latest_dev) { struct btrfs_device *device, *next; - struct btrfs_device *latest_dev = NULL; - mutex_lock(&uuid_mutex); -again: /* This is the initialized path, it is safe to release the devices. */ list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { - if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, - &device->dev_state)) { + if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) { if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, - &device->dev_state) && + &device->dev_state) && !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) && - (!latest_dev || - device->generation > latest_dev->generation)) { - latest_dev = device; + (!*latest_dev || + device->generation > (*latest_dev)->generation)) { + *latest_dev = device; } continue; } @@ -1094,10 +1089,22 @@ again: btrfs_free_device(device); } - if (fs_devices->seed) { - fs_devices = fs_devices->seed; - goto again; - } +} + +/* + * After we have read the system tree and know devids belonging to this + * filesystem, remove the device which does not belong there. 
+ */ +void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step) +{ + struct btrfs_device *latest_dev = NULL; + struct btrfs_fs_devices *seed_dev; + + mutex_lock(&uuid_mutex); + __btrfs_free_extra_devids(fs_devices, step, &latest_dev); + + list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list) + __btrfs_free_extra_devids(seed_dev, step, &latest_dev); fs_devices->latest_bdev = latest_dev->bdev; @@ -1149,47 +1156,41 @@ static void btrfs_close_one_device(struct btrfs_device *device) ASSERT(atomic_read(&device->reada_in_flight) == 0); } -static int close_fs_devices(struct btrfs_fs_devices *fs_devices) +static void close_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device, *tmp; + lockdep_assert_held(&uuid_mutex); + if (--fs_devices->opened > 0) - return 0; + return; - mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) { + list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) btrfs_close_one_device(device); - } - mutex_unlock(&fs_devices->device_list_mutex); WARN_ON(fs_devices->open_devices); WARN_ON(fs_devices->rw_devices); fs_devices->opened = 0; fs_devices->seeding = false; - - return 0; + fs_devices->fs_info = NULL; } -int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) +void btrfs_close_devices(struct btrfs_fs_devices *fs_devices) { - struct btrfs_fs_devices *seed_devices = NULL; - int ret; + LIST_HEAD(list); + struct btrfs_fs_devices *tmp; mutex_lock(&uuid_mutex); - ret = close_fs_devices(fs_devices); - if (!fs_devices->opened) { - seed_devices = fs_devices->seed; - fs_devices->seed = NULL; - } - mutex_unlock(&uuid_mutex); + close_fs_devices(fs_devices); + if (!fs_devices->opened) + list_splice_init(&fs_devices->seed_list, &list); - while (seed_devices) { - fs_devices = seed_devices; - seed_devices = fs_devices->seed; + list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) { close_fs_devices(fs_devices); + list_del(&fs_devices->seed_list); free_fs_devices(fs_devices); } - return ret; + mutex_unlock(&uuid_mutex); } static int open_fs_devices(struct btrfs_fs_devices *fs_devices, @@ -1197,17 +1198,23 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, { struct btrfs_device *device; struct btrfs_device *latest_dev = NULL; + struct btrfs_device *tmp_device; flags |= FMODE_EXCL; - list_for_each_entry(device, &fs_devices->devices, dev_list) { - /* Just open everything we can; ignore failures here */ - if (btrfs_open_one_device(fs_devices, device, flags, holder)) - continue; + list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, + dev_list) { + int ret; - if (!latest_dev || - device->generation > latest_dev->generation) + ret = btrfs_open_one_device(fs_devices, device, flags, holder); + if (ret == 0 && + (!latest_dev || device->generation > latest_dev->generation)) { latest_dev = device; + } else if (ret == -ENODATA) { + fs_devices->num_devices--; + list_del(&device->dev_list); + btrfs_free_device(device); + } } if (fs_devices->open_devices == 0) return -EINVAL; @@ -1961,16 +1968,13 @@ static struct btrfs_device * btrfs_find_next_active_device( * this_dev) which is active. 
*/ void __cold btrfs_assign_next_active_device(struct btrfs_device *device, - struct btrfs_device *this_dev) + struct btrfs_device *next_device) { struct btrfs_fs_info *fs_info = device->fs_info; - struct btrfs_device *next_device; - if (this_dev) - next_device = this_dev; - else + if (!next_device) next_device = btrfs_find_next_active_device(fs_info->fs_devices, - device); + device); ASSERT(next_device); if (fs_info->sb->s_bdev && @@ -1999,9 +2003,9 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) return num_devices; } -static void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, - struct block_device *bdev, - const char *device_path) +void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, + struct block_device *bdev, + const char *device_path) { struct btrfs_super_block *disk_super; int copy_num; @@ -2040,7 +2044,7 @@ static void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, } int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, - u64 devid) + u64 devid) { struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; @@ -2144,7 +2148,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, if (device->bdev) { cur_devices->open_devices--; /* remove sysfs entry */ - btrfs_sysfs_remove_devices_dir(fs_devices, device); + btrfs_sysfs_remove_device(device); } num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; @@ -2165,14 +2169,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, btrfs_free_device(device); if (cur_devices->open_devices == 0) { - while (fs_devices) { - if (fs_devices->seed == cur_devices) { - fs_devices->seed = cur_devices->seed; - break; - } - fs_devices = fs_devices->seed; - } - cur_devices->seed = NULL; + list_del_init(&cur_devices->seed_list); close_fs_devices(cur_devices); free_fs_devices(cur_devices); } @@ -2221,14 +2218,9 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) { - struct btrfs_fs_info *fs_info = srcdev->fs_info; struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; - if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) { - /* zero out the old super if it is writable */ - btrfs_scratch_superblocks(fs_info, srcdev->bdev, - srcdev->name->str); - } + mutex_lock(&uuid_mutex); btrfs_close_bdev(srcdev); synchronize_rcu(); @@ -2236,8 +2228,6 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) /* if this is no devs we rather delete the fs_devices */ if (!fs_devices->num_devices) { - struct btrfs_fs_devices *tmp_fs_devices; - /* * On a mounted FS, num_devices can't be zero unless it's a * seed. 
In case of a seed device being replaced, the replace @@ -2246,18 +2236,11 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) */ ASSERT(fs_devices->seeding); - tmp_fs_devices = fs_info->fs_devices; - while (tmp_fs_devices) { - if (tmp_fs_devices->seed == fs_devices) { - tmp_fs_devices->seed = fs_devices->seed; - break; - } - tmp_fs_devices = tmp_fs_devices->seed; - } - fs_devices->seed = NULL; + list_del_init(&fs_devices->seed_list); close_fs_devices(fs_devices); free_fs_devices(fs_devices); } + mutex_unlock(&uuid_mutex); } void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) @@ -2266,7 +2249,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) mutex_lock(&fs_devices->device_list_mutex); - btrfs_sysfs_remove_devices_dir(fs_devices, tgtdev); + btrfs_sysfs_remove_device(tgtdev); if (tgtdev->bdev) fs_devices->open_devices--; @@ -2375,10 +2358,20 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) if (!fs_devices->seeding) return -EINVAL; + /* + * Private copy of the seed devices, anchored at + * fs_info->fs_devices->seed_list + */ seed_devices = alloc_fs_devices(NULL, NULL); if (IS_ERR(seed_devices)) return PTR_ERR(seed_devices); + /* + * It's necessary to retain a copy of the original seed fs_devices in + * fs_uuids so that filesystems which have been seeded can successfully + * reference the seed device from open_seed_devices. This also supports + * multiple fs seed. + */ old_devices = clone_fs_devices(fs_devices); if (IS_ERR(old_devices)) { kfree(seed_devices); @@ -2399,16 +2392,12 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) list_for_each_entry(device, &seed_devices->devices, dev_list) device->fs_devices = seed_devices; - mutex_lock(&fs_info->chunk_mutex); - list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); - mutex_unlock(&fs_info->chunk_mutex); - fs_devices->seeding = false; fs_devices->num_devices = 0; fs_devices->open_devices = 0; fs_devices->missing_devices = 0; fs_devices->rotating = false; - fs_devices->seed = seed_devices; + list_add(&seed_devices->seed_list, &fs_devices->seed_list); generate_random_uuid(fs_devices->fsid); memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE); @@ -2511,7 +2500,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path u64 orig_super_num_devices; int seeding_dev = 0; int ret = 0; - bool unlocked = false; + bool locked = false; if (sb_rdonly(sb) && !fs_devices->seeding) return -EROFS; @@ -2525,20 +2514,20 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path seeding_dev = 1; down_write(&sb->s_umount); mutex_lock(&uuid_mutex); + locked = true; } - filemap_write_and_wait(bdev->bd_inode->i_mapping); + sync_blockdev(bdev); - mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) { + rcu_read_lock(); + list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { if (device->bdev == bdev) { ret = -EEXIST; - mutex_unlock( - &fs_devices->device_list_mutex); + rcu_read_unlock(); goto error; } } - mutex_unlock(&fs_devices->device_list_mutex); + rcu_read_unlock(); device = btrfs_alloc_device(fs_info, NULL, NULL); if (IS_ERR(device)) { @@ -2613,9 +2602,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path btrfs_set_super_num_devices(fs_info->super_copy, orig_super_num_devices + 1); - /* add sysfs device entry */ - btrfs_sysfs_add_devices_dir(fs_devices, device); - /* * we've got more storage, clear 
any full flags on the space * infos @@ -2623,6 +2609,10 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path btrfs_clear_space_info_full(fs_info); mutex_unlock(&fs_info->chunk_mutex); + + /* Add sysfs device entry */ + btrfs_sysfs_add_device(device); + mutex_unlock(&fs_devices->device_list_mutex); if (seeding_dev) { @@ -2648,8 +2638,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path goto error_sysfs; } - btrfs_sysfs_update_sprout_fsid(fs_devices, - fs_info->fs_devices->fsid); + /* + * fs_devices now represents the newly sprouted filesystem and + * its fsid has been changed by btrfs_prepare_sprout + */ + btrfs_sysfs_update_sprout_fsid(fs_devices); } ret = btrfs_commit_transaction(trans); @@ -2657,7 +2650,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (seeding_dev) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); - unlocked = true; + locked = false; if (ret) /* transaction commit */ return ret; @@ -2692,7 +2685,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path return ret; error_sysfs: - btrfs_sysfs_remove_devices_dir(fs_devices, device); + btrfs_sysfs_remove_device(device); mutex_lock(&fs_info->fs_devices->device_list_mutex); mutex_lock(&fs_info->chunk_mutex); list_del_rcu(&device->dev_list); @@ -2718,7 +2711,7 @@ error_free_device: btrfs_free_device(device); error: blkdev_put(bdev, FMODE_EXCL); - if (seeding_dev && !unlocked) { + if (locked) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); } @@ -4045,7 +4038,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, /* * rw_devices will not change at the moment, device add/delete/replace - * are excluded by EXCL_OP + * are exclusive */ num_devices = fs_info->fs_devices->rw_devices; @@ -4181,7 +4174,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, if ((ret && ret != -ECANCELED && ret != -ENOSPC) || balance_need_close(fs_info)) { reset_balance_state(fs_info); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); } wake_up(&fs_info->balance_wait_q); @@ -4192,7 +4185,7 @@ out: reset_balance_state(fs_info); else kfree(bctl); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); return ret; } @@ -4294,7 +4287,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) * is in a paused state and must have fs_info::balance_ctl properly * set up. 
*/ - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) btrfs_warn(fs_info, "balance: cannot set exclusive op status, resume manually"); @@ -4376,7 +4369,7 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) if (fs_info->balance_ctl) { reset_balance_state(fs_info); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); btrfs_info(fs_info, "balance: canceled"); } } @@ -6461,11 +6454,21 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, bool seed) { struct btrfs_device *device; + struct btrfs_fs_devices *seed_devs; + + if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (device->devid == devid && + (!uuid || memcmp(device->uuid, uuid, + BTRFS_UUID_SIZE) == 0)) + return device; + } + } - while (fs_devices) { + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { if (!fsid || - !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { - list_for_each_entry(device, &fs_devices->devices, + !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { + list_for_each_entry(device, &seed_devs->devices, dev_list) { if (device->devid == devid && (!uuid || memcmp(device->uuid, uuid, @@ -6473,11 +6476,8 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, return device; } } - if (seed) - fs_devices = fs_devices->seed; - else - return NULL; } + return NULL; } @@ -6532,7 +6532,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, if (WARN_ON(!devid && !fs_info)) return ERR_PTR(-EINVAL); - dev = __alloc_device(); + dev = __alloc_device(fs_info); if (IS_ERR(dev)) return dev; @@ -6728,13 +6728,11 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, lockdep_assert_held(&uuid_mutex); ASSERT(fsid); - fs_devices = fs_info->fs_devices->seed; - while (fs_devices) { + /* This will match only for multi-device seed fs */ + list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list) if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) return fs_devices; - fs_devices = fs_devices->seed; - } fs_devices = find_fsid(fsid, NULL); if (!fs_devices) { @@ -6750,6 +6748,10 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, return fs_devices; } + /* + * Upon first call for a seed fs fsid, just create a private copy of the + * respective fs_devices and anchor it at fs_info->fs_devices->seed_list + */ fs_devices = clone_fs_devices(fs_devices); if (IS_ERR(fs_devices)) return fs_devices; @@ -6757,20 +6759,17 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder); if (ret) { free_fs_devices(fs_devices); - fs_devices = ERR_PTR(ret); - goto out; + return ERR_PTR(ret); } if (!fs_devices->seeding) { close_fs_devices(fs_devices); free_fs_devices(fs_devices); - fs_devices = ERR_PTR(-EINVAL); - goto out; + return ERR_PTR(-EINVAL); } - fs_devices->seed = fs_info->fs_devices->seed; - fs_info->fs_devices->seed = fs_devices; -out: + list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list); + return fs_devices; } @@ -7189,17 +7188,22 @@ error: void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; struct btrfs_device *device; - while (fs_devices) { 
- mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) + fs_devices->fs_info = fs_info; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) + device->fs_info = fs_info; + + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { + list_for_each_entry(device, &seed_devs->devices, dev_list) device->fs_info = fs_info; - mutex_unlock(&fs_devices->device_list_mutex); - fs_devices = fs_devices->seed; + seed_devs->fs_info = fs_info; } + mutex_unlock(&fs_devices->device_list_mutex); } static u64 btrfs_dev_stats_value(const struct extent_buffer *eb, @@ -7225,17 +7229,53 @@ static void btrfs_set_dev_stats_value(struct extent_buffer *eb, sizeof(val)); } -int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) +static int btrfs_device_init_dev_stats(struct btrfs_device *device, + struct btrfs_path *path) { - struct btrfs_key key; - struct btrfs_root *dev_root = fs_info->dev_root; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_dev_stats_item *ptr; struct extent_buffer *eb; - int slot; - int ret = 0; + struct btrfs_key key; + int item_size; + int i, ret, slot; + + key.objectid = BTRFS_DEV_STATS_OBJECTID; + key.type = BTRFS_PERSISTENT_ITEM_KEY; + key.offset = device->devid; + ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0); + if (ret) { + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) + btrfs_dev_stat_set(device, i, 0); + device->dev_stats_valid = 1; + btrfs_release_path(path); + return ret < 0 ? ret : 0; + } + slot = path->slots[0]; + eb = path->nodes[0]; + item_size = btrfs_item_size_nr(eb, slot); + + ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item); + + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { + if (item_size >= (1 + i) * sizeof(__le64)) + btrfs_dev_stat_set(device, i, + btrfs_dev_stats_value(eb, ptr, i)); + else + btrfs_dev_stat_set(device, i, 0); + } + + device->dev_stats_valid = 1; + btrfs_dev_stat_print_on_load(device); + btrfs_release_path(path); + + return 0; +} + +int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; struct btrfs_device *device; struct btrfs_path *path = NULL; - int i; + int ret = 0; path = btrfs_alloc_path(); if (!path) @@ -7243,43 +7283,22 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { - int item_size; - struct btrfs_dev_stats_item *ptr; - - key.objectid = BTRFS_DEV_STATS_OBJECTID; - key.type = BTRFS_PERSISTENT_ITEM_KEY; - key.offset = device->devid; - ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); - if (ret) { - for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) - btrfs_dev_stat_set(device, i, 0); - device->dev_stats_valid = 1; - btrfs_release_path(path); - continue; - } - slot = path->slots[0]; - eb = path->nodes[0]; - item_size = btrfs_item_size_nr(eb, slot); - - ptr = btrfs_item_ptr(eb, slot, - struct btrfs_dev_stats_item); - - for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { - if (item_size >= (1 + i) * sizeof(__le64)) - btrfs_dev_stat_set(device, i, - btrfs_dev_stats_value(eb, ptr, i)); - else - btrfs_dev_stat_set(device, i, 0); + ret = btrfs_device_init_dev_stats(device, path); + if (ret) + goto out; + } + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { + list_for_each_entry(device, &seed_devs->devices, dev_list) { + ret = 
btrfs_device_init_dev_stats(device, path); + if (ret) + goto out; } - - device->dev_stats_valid = 1; - btrfs_dev_stat_print_on_load(device); - btrfs_release_path(path); } +out: mutex_unlock(&fs_devices->device_list_mutex); btrfs_free_path(path); - return ret < 0 ? ret : 0; + return ret; } static int update_dev_stat_item(struct btrfs_trans_handle *trans, @@ -7496,24 +7515,6 @@ void btrfs_commit_device_sizes(struct btrfs_transaction *trans) mutex_unlock(&trans->fs_info->chunk_mutex); } -void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info) -{ - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - while (fs_devices) { - fs_devices->fs_info = fs_info; - fs_devices = fs_devices->seed; - } -} - -void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) -{ - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - while (fs_devices) { - fs_devices->fs_info = NULL; - fs_devices = fs_devices->seed; - } -} - /* * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10. */ @@ -7594,8 +7595,11 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, /* It's possible this device is a dummy for seed device */ if (dev->disk_total_bytes == 0) { - dev = btrfs_find_device(fs_info->fs_devices->seed, devid, NULL, - NULL, false); + struct btrfs_fs_devices *devs; + + devs = list_first_entry(&fs_info->fs_devices->seed_list, + struct btrfs_fs_devices, seed_list); + dev = btrfs_find_device(devs, devid, NULL, NULL, false); if (!dev) { btrfs_err(fs_info, "failed to find seed devid %llu", devid); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 5eea93916fbf..bf27ac07d315 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -58,7 +58,7 @@ struct btrfs_device { struct btrfs_fs_devices *fs_devices; struct btrfs_fs_info *fs_info; - struct rcu_string *name; + struct rcu_string __rcu *name; u64 generation; @@ -246,7 +246,7 @@ struct btrfs_fs_devices { */ struct list_head alloc_list; - struct btrfs_fs_devices *seed; + struct list_head seed_list; bool seeding; int opened; @@ -435,7 +435,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, void *holder); int btrfs_forget_devices(const char *path); -int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); +void btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step); void btrfs_assign_next_active_device(struct btrfs_device *device, struct btrfs_device *this_dev); @@ -569,10 +569,11 @@ static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags) void btrfs_commit_device_sizes(struct btrfs_transaction *trans); struct list_head * __attribute_const__ btrfs_get_fs_uuids(void); -void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); -void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, struct btrfs_device *failing_dev); +void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, + struct block_device *bdev, + const char *device_path); int btrfs_bg_type_to_factor(u64 flags); const char *btrfs_bg_type_to_raid_name(u64 flags); diff --git a/fs/buffer.c b/fs/buffer.c index 50bbc99e3d96..23f645657488 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -842,13 +842,13 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, struct buffer_head *bh, *head; gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT; long offset; - struct mem_cgroup *memcg; + struct mem_cgroup 
*memcg, *old_memcg; if (retry) gfp |= __GFP_NOFAIL; memcg = get_mem_cgroup_from_page(page); - memalloc_use_memcg(memcg); + old_memcg = set_active_memcg(memcg); head = NULL; offset = PAGE_SIZE; @@ -867,7 +867,7 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, set_bh_page(bh, page, offset); } out: - memalloc_unuse_memcg(); + set_active_memcg(old_memcg); mem_cgroup_put(memcg); return head; /* @@ -2771,16 +2771,6 @@ int nobh_writepage(struct page *page, get_block_t *get_block, /* Is the page fully outside i_size? (truncate in progress) */ offset = i_size & (PAGE_SIZE-1); if (page->index >= end_index+1 || !offset) { - /* - * The page may have dirty, unmapped buffers. For example, - * they may have been added in ext3_writepage(). Make them - * freeable here, so the page does not leak. - */ -#if 0 - /* Not really sure about this - do we need this ? */ - if (page->mapping->a_ops->invalidatepage) - page->mapping->a_ops->invalidatepage(page, offset); -#endif unlock_page(page); return 0; /* don't care */ } @@ -2975,12 +2965,6 @@ int block_write_full_page(struct page *page, get_block_t *get_block, /* Is the page fully outside i_size? (truncate in progress) */ offset = i_size & (PAGE_SIZE-1); if (page->index >= end_index+1 || !offset) { - /* - * The page may have dirty, unmapped buffers. For example, - * they may have been added in ext3_writepage(). Make them - * freeable here, so the page does not leak. - */ - do_invalidatepage(page, 0, PAGE_SIZE); unlock_page(page); return 0; /* don't care */ } diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 32f90dc82c84..d44df8f95bcd 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1208,7 +1208,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, rqst[1].rq_iov = si_iov; rqst[1].rq_nvec = 1; - len = sizeof(ea) + ea_name_len + ea_value_len + 1; + len = sizeof(*ea) + ea_name_len + ea_value_len + 1; ea = kzalloc(len, GFP_KERNEL); if (ea == NULL) { rc = -ENOMEM; diff --git a/fs/compat.c b/fs/compat.c deleted file mode 100644 index 436d228cf71c..000000000000 --- a/fs/compat.c +++ /dev/null @@ -1,132 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/fs/compat.c - * - * Kernel compatibililty routines for e.g. 32 bit syscall support - * on 64 bit kernels. - * - * Copyright (C) 2002 Stephen Rothwell, IBM Corporation - * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com) - * Copyright (C) 1998 Eddie C. 
Dost (ecd@skynet.be) - * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs - * Copyright (C) 2003 Pavel Machek (pavel@ucw.cz) - */ - -#include <linux/compat.h> -#include <linux/nfs4_mount.h> -#include <linux/syscalls.h> -#include <linux/slab.h> -#include <linux/uaccess.h> -#include "internal.h" - -struct compat_nfs_string { - compat_uint_t len; - compat_uptr_t data; -}; - -static inline void compat_nfs_string(struct nfs_string *dst, - struct compat_nfs_string *src) -{ - dst->data = compat_ptr(src->data); - dst->len = src->len; -} - -struct compat_nfs4_mount_data_v1 { - compat_int_t version; - compat_int_t flags; - compat_int_t rsize; - compat_int_t wsize; - compat_int_t timeo; - compat_int_t retrans; - compat_int_t acregmin; - compat_int_t acregmax; - compat_int_t acdirmin; - compat_int_t acdirmax; - struct compat_nfs_string client_addr; - struct compat_nfs_string mnt_path; - struct compat_nfs_string hostname; - compat_uint_t host_addrlen; - compat_uptr_t host_addr; - compat_int_t proto; - compat_int_t auth_flavourlen; - compat_uptr_t auth_flavours; -}; - -static int do_nfs4_super_data_conv(void *raw_data) -{ - int version = *(compat_uint_t *) raw_data; - - if (version == 1) { - struct compat_nfs4_mount_data_v1 *raw = raw_data; - struct nfs4_mount_data *real = raw_data; - - /* copy the fields backwards */ - real->auth_flavours = compat_ptr(raw->auth_flavours); - real->auth_flavourlen = raw->auth_flavourlen; - real->proto = raw->proto; - real->host_addr = compat_ptr(raw->host_addr); - real->host_addrlen = raw->host_addrlen; - compat_nfs_string(&real->hostname, &raw->hostname); - compat_nfs_string(&real->mnt_path, &raw->mnt_path); - compat_nfs_string(&real->client_addr, &raw->client_addr); - real->acdirmax = raw->acdirmax; - real->acdirmin = raw->acdirmin; - real->acregmax = raw->acregmax; - real->acregmin = raw->acregmin; - real->retrans = raw->retrans; - real->timeo = raw->timeo; - real->wsize = raw->wsize; - real->rsize = raw->rsize; - real->flags = raw->flags; - real->version = raw->version; - } - - return 0; -} - -#define NFS4_NAME "nfs4" - -COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name, - const char __user *, dir_name, - const char __user *, type, compat_ulong_t, flags, - const void __user *, data) -{ - char *kernel_type; - void *options; - char *kernel_dev; - int retval; - - kernel_type = copy_mount_string(type); - retval = PTR_ERR(kernel_type); - if (IS_ERR(kernel_type)) - goto out; - - kernel_dev = copy_mount_string(dev_name); - retval = PTR_ERR(kernel_dev); - if (IS_ERR(kernel_dev)) - goto out1; - - options = copy_mount_options(data); - retval = PTR_ERR(options); - if (IS_ERR(options)) - goto out2; - - if (kernel_type && options) { - if (!strcmp(kernel_type, NFS4_NAME)) { - retval = -EINVAL; - if (do_nfs4_super_data_conv(options)) - goto out3; - } - } - - retval = do_mount(kernel_dev, dir_name, kernel_type, flags, options); - - out3: - kfree(options); - out2: - kfree(kernel_dev); - out1: - kfree(kernel_type); - out: - return retval; -} diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index ca2273727225..b0983e2a4e2c 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1168,7 +1168,7 @@ EXPORT_SYMBOL(configfs_depend_item); /* * Release the dependent linkage. This is much simpler than - * configfs_depend_item() because we know that that the client driver is + * configfs_depend_item() because we know that the client driver is * pinned, thus the subsystem is pinned, and therefore configfs is pinned. 
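The smb2_set_ea() fix above (sizeof(ea) becoming sizeof(*ea)) corrects a classic allocation-sizing slip: the buffer must be sized after the pointed-to structure, not the pointer. A minimal standalone illustration, using a made-up struct rather than the real SMB2 EA layout:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct demo_ea {                /* illustrative only, not the SMB2 structure */
        unsigned int name_len;
        unsigned int value_len;
        char data[];
};

int main(void)
{
        size_t name_len = 4, value_len = 10;
        size_t wrong = sizeof(struct demo_ea *) + name_len + value_len + 1;
        size_t right = sizeof(struct demo_ea) + name_len + value_len + 1;
        struct demo_ea *ea;

        /* sizeof(pointer) is 4 or 8 bytes; sizeof(struct) is the header size. */
        printf("wrong=%zu right=%zu\n", wrong, right);

        ea = calloc(1, right);
        if (!ea)
                return 1;
        ea->name_len = name_len;
        ea->value_len = value_len;
        memcpy(ea->data, "user", name_len);
        free(ea);
        return 0;
}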
*/ void configfs_undepend_item(struct config_item *target) diff --git a/fs/configfs/file.c b/fs/configfs/file.c index fb65b706cc0d..1f0270229d7b 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -267,7 +267,7 @@ flush_write_buffer(struct file *file, struct configfs_buffer *buffer, size_t cou * There is no easy way for us to know if userspace is only doing a partial * write, so we don't support them. We expect the entire buffer to come * on the first write. - * Hint: if you're writing a value, first read the file, modify only the + * Hint: if you're writing a value, first read the file, modify only * the value you're changing, then write entire buffer back. */ diff --git a/fs/coredump.c b/fs/coredump.c index 76e7c10edfc0..0cd9056d79cc 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -840,17 +840,17 @@ int dump_emit(struct coredump_params *cprm, const void *addr, int nr) ssize_t n; if (cprm->written + nr > cprm->limit) return 0; - while (nr) { - if (dump_interrupted()) - return 0; - n = __kernel_write(file, addr, nr, &pos); - if (n <= 0) - return 0; - file->f_pos = pos; - cprm->written += n; - cprm->pos += n; - nr -= n; - } + + + if (dump_interrupted()) + return 0; + n = __kernel_write(file, addr, nr, &pos); + if (n != nr) + return 0; + file->f_pos = pos; + cprm->written += n; + cprm->pos += n; + return 1; } EXPORT_SYMBOL(dump_emit); @@ -876,6 +876,40 @@ int dump_skip(struct coredump_params *cprm, size_t nr) } EXPORT_SYMBOL(dump_skip); +#ifdef CONFIG_ELF_CORE +int dump_user_range(struct coredump_params *cprm, unsigned long start, + unsigned long len) +{ + unsigned long addr; + + for (addr = start; addr < start + len; addr += PAGE_SIZE) { + struct page *page; + int stop; + + /* + * To avoid having to allocate page tables for virtual address + * ranges that have never been used yet, and also to make it + * easy to generate sparse core files, use a helper that returns + * NULL when encountering an empty page table entry that would + * otherwise have been filled with the zero page. + */ + page = get_dump_page(addr); + if (page) { + void *kaddr = kmap(page); + + stop = !dump_emit(cprm, kaddr, PAGE_SIZE); + kunmap(page); + put_page(page); + } else { + stop = !dump_skip(cprm, PAGE_SIZE); + } + if (stop) + return 0; + } + return 1; +} +#endif + int dump_align(struct coredump_params *cprm, int align) { unsigned mod = cprm->pos & (align - 1); @@ -902,3 +936,183 @@ void dump_truncate(struct coredump_params *cprm) } } EXPORT_SYMBOL(dump_truncate); + +/* + * The purpose of always_dump_vma() is to make sure that special kernel mappings + * that are useful for post-mortem analysis are included in every core dump. + * In that way we ensure that the core dump is fully interpretable later + * without matching up the same kernel and hardware config to see what PC values + * meant. These special mappings include - vDSO, vsyscall, and other + * architecture specific mappings + */ +static bool always_dump_vma(struct vm_area_struct *vma) +{ + /* Any vsyscall mappings? */ + if (vma == get_gate_vma(vma->vm_mm)) + return true; + + /* + * Assume that all vmas with a .name op should always be dumped. + * If this changes, a new vm_ops field can easily be added. + */ + if (vma->vm_ops && vma->vm_ops->name && vma->vm_ops->name(vma)) + return true; + + /* + * arch_vma_name() returns non-NULL for special architecture mappings, + * such as vDSO sections. + */ + if (arch_vma_name(vma)) + return true; + + return false; +} + +/* + * Decide how much of @vma's contents should be included in a core dump. 
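dump_user_range() above emits a page when get_dump_page() returns one and otherwise skips it, which is what lets never-touched address ranges become holes in a sparse core file. A hedged userspace sketch of the same emit-or-skip idea, using lseek() to leave the hole (an analogy, not the kernel's dump_emit()/dump_skip()):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define DEMO_PAGE_SIZE 4096

/* Pretend pages 0 and 2 hold data and page 1 was never touched. */
static const char *fake_page(int idx)
{
        static char buf[DEMO_PAGE_SIZE];

        if (idx == 1)
                return NULL;            /* like get_dump_page() returning NULL */
        memset(buf, 'A' + idx, DEMO_PAGE_SIZE);
        return buf;
}

int main(void)
{
        int fd = open("sparse-core.demo", O_CREAT | O_TRUNC | O_WRONLY, 0600);
        int i;

        if (fd < 0)
                return 1;
        for (i = 0; i < 3; i++) {
                const char *p = fake_page(i);

                if (p) {                                /* "dump_emit" */
                        if (write(fd, p, DEMO_PAGE_SIZE) != DEMO_PAGE_SIZE)
                                return 1;
                } else {                                /* "dump_skip" */
                        if (lseek(fd, DEMO_PAGE_SIZE, SEEK_CUR) < 0)
                                return 1;
                }
        }
        close(fd);
        return 0;
}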
+ */ +static unsigned long vma_dump_size(struct vm_area_struct *vma, + unsigned long mm_flags) +{ +#define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type)) + + /* always dump the vdso and vsyscall sections */ + if (always_dump_vma(vma)) + goto whole; + + if (vma->vm_flags & VM_DONTDUMP) + return 0; + + /* support for DAX */ + if (vma_is_dax(vma)) { + if ((vma->vm_flags & VM_SHARED) && FILTER(DAX_SHARED)) + goto whole; + if (!(vma->vm_flags & VM_SHARED) && FILTER(DAX_PRIVATE)) + goto whole; + return 0; + } + + /* Hugetlb memory check */ + if (is_vm_hugetlb_page(vma)) { + if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED)) + goto whole; + if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE)) + goto whole; + return 0; + } + + /* Do not dump I/O mapped devices or special mappings */ + if (vma->vm_flags & VM_IO) + return 0; + + /* By default, dump shared memory if mapped from an anonymous file. */ + if (vma->vm_flags & VM_SHARED) { + if (file_inode(vma->vm_file)->i_nlink == 0 ? + FILTER(ANON_SHARED) : FILTER(MAPPED_SHARED)) + goto whole; + return 0; + } + + /* Dump segments that have been written to. */ + if ((!IS_ENABLED(CONFIG_MMU) || vma->anon_vma) && FILTER(ANON_PRIVATE)) + goto whole; + if (vma->vm_file == NULL) + return 0; + + if (FILTER(MAPPED_PRIVATE)) + goto whole; + + /* + * If this is the beginning of an executable file mapping, + * dump the first page to aid in determining what was mapped here. + */ + if (FILTER(ELF_HEADERS) && + vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ) && + (READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0) + return PAGE_SIZE; + +#undef FILTER + + return 0; + +whole: + return vma->vm_end - vma->vm_start; +} + +static struct vm_area_struct *first_vma(struct task_struct *tsk, + struct vm_area_struct *gate_vma) +{ + struct vm_area_struct *ret = tsk->mm->mmap; + + if (ret) + return ret; + return gate_vma; +} + +/* + * Helper function for iterating across a vma list. It ensures that the caller + * will visit `gate_vma' prior to terminating the search. + */ +static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, + struct vm_area_struct *gate_vma) +{ + struct vm_area_struct *ret; + + ret = this_vma->vm_next; + if (ret) + return ret; + if (this_vma == gate_vma) + return NULL; + return gate_vma; +} + +/* + * Under the mmap_lock, take a snapshot of relevant information about the task's + * VMAs. + */ +int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, + struct core_vma_metadata **vma_meta, + size_t *vma_data_size_ptr) +{ + struct vm_area_struct *vma, *gate_vma; + struct mm_struct *mm = current->mm; + int i; + size_t vma_data_size = 0; + + /* + * Once the stack expansion code is fixed to not change VMA bounds + * under mmap_lock in read mode, this can be changed to take the + * mmap_lock in read mode. + */ + if (mmap_write_lock_killable(mm)) + return -EINTR; + + gate_vma = get_gate_vma(mm); + *vma_count = mm->map_count + (gate_vma ? 
1 : 0); + + *vma_meta = kvmalloc_array(*vma_count, sizeof(**vma_meta), GFP_KERNEL); + if (!*vma_meta) { + mmap_write_unlock(mm); + return -ENOMEM; + } + + for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; + vma = next_vma(vma, gate_vma), i++) { + struct core_vma_metadata *m = (*vma_meta) + i; + + m->start = vma->vm_start; + m->end = vma->vm_end; + m->flags = vma->vm_flags; + m->dump_size = vma_dump_size(vma, cprm->mm_flags); + + vma_data_size += m->dump_size; + } + + mmap_write_unlock(mm); + + if (WARN_ON(i != *vma_count)) + return -EFAULT; + + *vma_data_size_ptr = vma_data_size; + return 0; +} diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 9212325763b0..4ef3f714046a 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -343,9 +343,11 @@ void fscrypt_msg(const struct inode *inode, const char *level, va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - if (inode) + if (inode && inode->i_ino) printk("%sfscrypt (%s, inode %lu): %pV\n", level, inode->i_sb->s_id, inode->i_ino, &vaf); + else if (inode) + printk("%sfscrypt (%s): %pV\n", level, inode->i_sb->s_id, &vaf); else printk("%sfscrypt: %pV\n", level, &vaf); va_end(args); diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 011830f84d8d..1fbe6c24d705 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -61,15 +61,6 @@ struct fscrypt_nokey_name { */ #define FSCRYPT_NOKEY_NAME_MAX offsetofend(struct fscrypt_nokey_name, sha256) -static void fscrypt_do_sha256(const u8 *data, unsigned int data_len, u8 *result) -{ - struct sha256_state sctx; - - sha256_init(&sctx); - sha256_update(&sctx, data, data_len); - sha256_final(&sctx, result); -} - static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) { if (str->len == 1 && str->name[0] == '.') @@ -242,11 +233,11 @@ static int base64_decode(const char *src, int len, u8 *dst) return cp - dst; } -bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, - u32 max_len, u32 *encrypted_len_ret) +bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, + u32 orig_len, u32 max_len, + u32 *encrypted_len_ret) { - const struct fscrypt_info *ci = inode->i_crypt_info; - int padding = 4 << (fscrypt_policy_flags(&ci->ci_policy) & + int padding = 4 << (fscrypt_policy_flags(policy) & FSCRYPT_POLICY_FLAGS_PAD_MASK); u32 encrypted_len; @@ -260,8 +251,6 @@ bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, /** * fscrypt_fname_alloc_buffer() - allocate a buffer for presented filenames - * @inode: inode of the parent directory (for regular filenames) - * or of the symlink (for symlink targets) * @max_encrypted_len: maximum length of encrypted filenames the buffer will be * used to present * @crypto_str: (output) buffer to allocate @@ -271,8 +260,7 @@ bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, * * Return: 0 on success, -errno on failure */ -int fscrypt_fname_alloc_buffer(const struct inode *inode, - u32 max_encrypted_len, +int fscrypt_fname_alloc_buffer(u32 max_encrypted_len, struct fscrypt_str *crypto_str) { const u32 max_encoded_len = BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX); @@ -369,9 +357,9 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode, } else { memcpy(nokey_name.bytes, iname->name, sizeof(nokey_name.bytes)); /* Compute strong hash of remaining part of name. 
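The fscrypt_nokey_name handling above keeps presented names bounded: a short ciphertext name is encoded whole, while a long one keeps only a fixed prefix plus a SHA-256 of the remainder. A simplified sketch of that size calculation; the prefix budget below is illustrative, and the dirhash field and base64 step are deliberately ignored:

#include <stddef.h>
#include <stdio.h>

#define DEMO_SHA256_DIGEST_SIZE 32
#define DEMO_PREFIX_BYTES       149     /* assumed prefix budget, for illustration */

static size_t nokey_name_size(size_t ciphertext_len)
{
        if (ciphertext_len <= DEMO_PREFIX_BYTES)
                return ciphertext_len;          /* encode it verbatim */
        /* keep a prefix plus a digest of the rest */
        return DEMO_PREFIX_BYTES + DEMO_SHA256_DIGEST_SIZE;
}

int main(void)
{
        printf("len 40  -> %zu bytes to encode\n", nokey_name_size(40));
        printf("len 600 -> %zu bytes to encode\n", nokey_name_size(600));
        return 0;
}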
*/ - fscrypt_do_sha256(&iname->name[sizeof(nokey_name.bytes)], - iname->len - sizeof(nokey_name.bytes), - nokey_name.sha256); + sha256(&iname->name[sizeof(nokey_name.bytes)], + iname->len - sizeof(nokey_name.bytes), + nokey_name.sha256); size = FSCRYPT_NOKEY_NAME_MAX; } oname->len = base64_encode((const u8 *)&nokey_name, size, oname->name); @@ -394,9 +382,9 @@ EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); * directory's encryption key, then @iname is the plaintext, so we encrypt it to * get the disk_name. * - * Else, for keyless @lookup operations, @iname is the presented ciphertext, so - * we decode it to get the fscrypt_nokey_name. Non-@lookup operations will be - * impossible in this case, so we fail them with ENOKEY. + * Else, for keyless @lookup operations, @iname should be a no-key name, so we + * decode it to get the struct fscrypt_nokey_name. Non-@lookup operations will + * be impossible in this case, so we fail them with ENOKEY. * * If successful, fscrypt_free_filename() must be called later to clean up. * @@ -421,7 +409,8 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, return ret; if (fscrypt_has_encryption_key(dir)) { - if (!fscrypt_fname_encrypted_size(dir, iname->len, + if (!fscrypt_fname_encrypted_size(&dir->i_crypt_info->ci_policy, + iname->len, dir->i_sb->s_cop->max_namelen, &fname->crypto_buf.len)) return -ENAMETOOLONG; @@ -440,7 +429,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, } if (!lookup) return -ENOKEY; - fname->is_ciphertext_name = true; + fname->is_nokey_name = true; /* * We don't have the key and we are doing a lookup; decode the @@ -499,7 +488,7 @@ bool fscrypt_match_name(const struct fscrypt_name *fname, { const struct fscrypt_nokey_name *nokey_name = (const void *)fname->crypto_buf.name; - u8 sha256[SHA256_DIGEST_SIZE]; + u8 digest[SHA256_DIGEST_SIZE]; if (likely(fname->disk_name.name)) { if (de_name_len != fname->disk_name.len) @@ -510,9 +499,9 @@ bool fscrypt_match_name(const struct fscrypt_name *fname, return false; if (memcmp(de_name, nokey_name->bytes, sizeof(nokey_name->bytes))) return false; - fscrypt_do_sha256(&de_name[sizeof(nokey_name->bytes)], - de_name_len - sizeof(nokey_name->bytes), sha256); - return !memcmp(sha256, nokey_name->sha256, sizeof(sha256)); + sha256(&de_name[sizeof(nokey_name->bytes)], + de_name_len - sizeof(nokey_name->bytes), digest); + return !memcmp(digest, nokey_name->sha256, sizeof(digest)); } EXPORT_SYMBOL_GPL(fscrypt_match_name); @@ -541,7 +530,7 @@ EXPORT_SYMBOL_GPL(fscrypt_fname_siphash); * Validate dentries in encrypted directories to make sure we aren't potentially * caching stale dentries after a key has been added. */ -static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) +int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) { struct dentry *dir; int err; @@ -549,17 +538,17 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) /* * Plaintext names are always valid, since fscrypt doesn't support - * reverting to ciphertext names without evicting the directory's inode + * reverting to no-key names without evicting the directory's inode * -- which implies eviction of the dentries in the directory. */ - if (!(dentry->d_flags & DCACHE_ENCRYPTED_NAME)) + if (!(dentry->d_flags & DCACHE_NOKEY_NAME)) return 1; /* - * Ciphertext name; valid if the directory's key is still unavailable. + * No-key name; valid if the directory's key is still unavailable. 
* - * Although fscrypt forbids rename() on ciphertext names, we still must - * use dget_parent() here rather than use ->d_parent directly. That's + * Although fscrypt forbids rename() on no-key names, we still must use + * dget_parent() here rather than use ->d_parent directly. That's * because a corrupted fs image may contain directory hard links, which * the VFS handles by moving the directory's dentry tree in the dcache * each time ->lookup() finds the directory and it already has a dentry @@ -580,6 +569,7 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) return valid; } +EXPORT_SYMBOL_GPL(fscrypt_d_revalidate); const struct dentry_operations fscrypt_d_ops = { .d_revalidate = fscrypt_d_revalidate, diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 8117a61b6f55..4f5806a3b73d 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -97,7 +97,6 @@ static inline const u8 *fscrypt_context_nonce(const union fscrypt_context *ctx) return NULL; } -#undef fscrypt_policy union fscrypt_policy { u8 version; struct fscrypt_policy_v1 v1; @@ -292,8 +291,9 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num, /* fname.c */ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, u8 *out, unsigned int olen); -bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, - u32 max_len, u32 *encrypted_len_ret); +bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, + u32 orig_len, u32 max_len, + u32 *encrypted_len_ret); extern const struct dentry_operations fscrypt_d_ops; /* hkdf.c */ @@ -572,6 +572,9 @@ int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key); int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, const struct fscrypt_master_key *mk); +void fscrypt_hash_inode_number(struct fscrypt_info *ci, + const struct fscrypt_master_key *mk); + /* keysetup_v1.c */ void fscrypt_put_direct_key(struct fscrypt_direct_key *dk); @@ -590,5 +593,6 @@ bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, int fscrypt_policy_from_context(union fscrypt_policy *policy_u, const union fscrypt_context *ctx_u, int ctx_size); +const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir); #endif /* _FSCRYPT_PRIVATE_H */ diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 09fb8aa0f2e9..20b0df47fe6a 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -60,8 +60,8 @@ int __fscrypt_prepare_link(struct inode *inode, struct inode *dir, if (err) return err; - /* ... in case we looked up ciphertext name before key was added */ - if (dentry->d_flags & DCACHE_ENCRYPTED_NAME) + /* ... in case we looked up no-key name before key was added */ + if (dentry->d_flags & DCACHE_NOKEY_NAME) return -ENOKEY; if (!fscrypt_has_permitted_context(dir, inode)) @@ -85,9 +85,8 @@ int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) return err; - /* ... in case we looked up ciphertext name(s) before key was added */ - if ((old_dentry->d_flags | new_dentry->d_flags) & - DCACHE_ENCRYPTED_NAME) + /* ... 
in case we looked up no-key name(s) before key was added */ + if ((old_dentry->d_flags | new_dentry->d_flags) & DCACHE_NOKEY_NAME) return -ENOKEY; if (old_dir != new_dir) { @@ -114,9 +113,9 @@ int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, if (err && err != -ENOENT) return err; - if (fname->is_ciphertext_name) { + if (fname->is_nokey_name) { spin_lock(&dentry->d_lock); - dentry->d_flags |= DCACHE_ENCRYPTED_NAME; + dentry->d_flags |= DCACHE_NOKEY_NAME; spin_unlock(&dentry->d_lock); d_set_d_op(dentry, &fscrypt_d_ops); } @@ -166,26 +165,51 @@ int fscrypt_prepare_setflags(struct inode *inode, return 0; } -int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len, - unsigned int max_len, - struct fscrypt_str *disk_link) +/** + * fscrypt_prepare_symlink() - prepare to create a possibly-encrypted symlink + * @dir: directory in which the symlink is being created + * @target: plaintext symlink target + * @len: length of @target excluding null terminator + * @max_len: space the filesystem has available to store the symlink target + * @disk_link: (out) the on-disk symlink target being prepared + * + * This function computes the size the symlink target will require on-disk, + * stores it in @disk_link->len, and validates it against @max_len. An + * encrypted symlink may be longer than the original. + * + * Additionally, @disk_link->name is set to @target if the symlink will be + * unencrypted, but left NULL if the symlink will be encrypted. For encrypted + * symlinks, the filesystem must call fscrypt_encrypt_symlink() to create the + * on-disk target later. (The reason for the two-step process is that some + * filesystems need to know the size of the symlink target before creating the + * inode, e.g. to determine whether it will be a "fast" or "slow" symlink.) + * + * Return: 0 on success, -ENAMETOOLONG if the symlink target is too long, + * -ENOKEY if the encryption key is missing, or another -errno code if a problem + * occurred while setting up the encryption key. + */ +int fscrypt_prepare_symlink(struct inode *dir, const char *target, + unsigned int len, unsigned int max_len, + struct fscrypt_str *disk_link) { - int err; + const union fscrypt_policy *policy; /* * To calculate the size of the encrypted symlink target we need to know * the amount of NUL padding, which is determined by the flags set in * the encryption policy which will be inherited from the directory. - * The easiest way to get access to this is to just load the directory's - * fscrypt_info, since we'll need it to create the dir_entry anyway. - * - * Note: in test_dummy_encryption mode, @dir may be unencrypted. */ - err = fscrypt_get_encryption_info(dir); - if (err) - return err; - if (!fscrypt_has_encryption_key(dir)) - return -ENOKEY; + policy = fscrypt_policy_to_inherit(dir); + if (policy == NULL) { + /* Not encrypted */ + disk_link->name = (unsigned char *)target; + disk_link->len = len + 1; + if (disk_link->len > max_len) + return -ENAMETOOLONG; + return 0; + } + if (IS_ERR(policy)) + return PTR_ERR(policy); /* * Calculate the size of the encrypted symlink and verify it won't @@ -198,7 +222,7 @@ int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len, * counting it (even though it is meaningless for ciphertext) is simpler * for now since filesystems will assume it is there and subtract it. 
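fscrypt_fname_encrypted_size(), which the symlink path above now calls with the inherited policy, derives NUL padding from the policy flags (4 << (flags & PAD_MASK), i.e. 4, 8, 16 or 32 bytes) and clamps the result to the space the filesystem has. A hedged sketch of that pad-and-clamp arithmetic; the 16-byte minimum and the exact rounding are assumptions made for the example:

#include <stdint.h>
#include <stdio.h>

#define DEMO_PAD_MASK 0x3       /* assumed two-bit padding field */

static uint32_t round_up_pow2(uint32_t len, uint32_t pad)
{
        return (len + pad - 1) & ~(pad - 1);
}

static int encrypted_name_size(uint32_t orig_len, uint32_t max_len,
                               uint8_t policy_flags, uint32_t *out_len)
{
        uint32_t padding = 4u << (policy_flags & DEMO_PAD_MASK); /* 4, 8, 16 or 32 */
        uint32_t len;

        if (orig_len > max_len)
                return 0;
        len = orig_len < 16 ? 16 : orig_len;    /* assumed cipher-block minimum */
        len = round_up_pow2(len, padding);
        *out_len = len < max_len ? len : max_len;
        return 1;
}

int main(void)
{
        uint32_t n;

        if (encrypted_name_size(5, 255, 0x3, &n))       /* pad-32 policy */
                printf("5-byte target -> %u on-disk bytes\n", n);
        return 0;
}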
*/ - if (!fscrypt_fname_encrypted_size(dir, len, + if (!fscrypt_fname_encrypted_size(policy, len, max_len - sizeof(struct fscrypt_symlink_data), &disk_link->len)) return -ENAMETOOLONG; @@ -207,7 +231,7 @@ int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len, disk_link->name = NULL; return 0; } -EXPORT_SYMBOL_GPL(__fscrypt_prepare_symlink); +EXPORT_SYMBOL_GPL(fscrypt_prepare_symlink); int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, unsigned int len, struct fscrypt_str *disk_link) @@ -217,9 +241,13 @@ int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, struct fscrypt_symlink_data *sd; unsigned int ciphertext_len; - err = fscrypt_require_key(inode); - if (err) - return err; + /* + * fscrypt_prepare_new_inode() should have already set up the new + * symlink inode's encryption key. We don't wait until now to do it, + * since we may be in a filesystem transaction now. + */ + if (WARN_ON_ONCE(!fscrypt_has_encryption_key(inode))) + return -ENOKEY; if (disk_link->name) { /* filesystem-provided buffer */ @@ -319,7 +347,7 @@ const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, if (cstr.len + sizeof(*sd) - 1 > max_size) return ERR_PTR(-EUCLEAN); - err = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr); + err = fscrypt_fname_alloc_buffer(cstr.len, &pstr); if (err) return ERR_PTR(err); diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index faa25541ccb6..89bffa82ed74 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -106,7 +106,7 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci) crypto_cfg.data_unit_size = sb->s_blocksize; crypto_cfg.dun_bytes = fscrypt_get_dun_bytes(ci); num_devs = fscrypt_get_num_devices(sb); - devs = kmalloc_array(num_devs, sizeof(*devs), GFP_NOFS); + devs = kmalloc_array(num_devs, sizeof(*devs), GFP_KERNEL); if (!devs) return -ENOMEM; fscrypt_get_devices(sb, num_devs, devs); @@ -135,9 +135,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, struct fscrypt_blk_crypto_key *blk_key; int err; int i; - unsigned int flags; - blk_key = kzalloc(struct_size(blk_key, devs, num_devs), GFP_NOFS); + blk_key = kzalloc(struct_size(blk_key, devs, num_devs), GFP_KERNEL); if (!blk_key) return -ENOMEM; @@ -166,10 +165,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, } queue_refs++; - flags = memalloc_nofs_save(); err = blk_crypto_start_using_key(&blk_key->base, blk_key->devs[i]); - memalloc_nofs_restore(flags); if (err) { fscrypt_err(inode, "error %d starting to use blk-crypto", err); diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index e74f239c4428..53cc552a7b8f 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -817,6 +817,7 @@ static int check_for_busy_inodes(struct super_block *sb, struct list_head *pos; size_t busy_count = 0; unsigned long ino; + char ino_str[50] = ""; spin_lock(&mk->mk_decrypted_inodes_lock); @@ -838,11 +839,15 @@ static int check_for_busy_inodes(struct super_block *sb, } spin_unlock(&mk->mk_decrypted_inodes_lock); + /* If the inode is currently being created, ino may still be 0. 
*/ + if (ino) + snprintf(ino_str, sizeof(ino_str), ", including ino %lu", ino); + fscrypt_warn(NULL, - "%s: %zu inode(s) still busy after removing key with %s %*phN, including ino %lu", + "%s: %zu inode(s) still busy after removing key with %s %*phN%s", sb->s_id, busy_count, master_key_spec_type(&mk->mk_spec), master_key_spec_len(&mk->mk_spec), (u8 *)&mk->mk_spec.u, - ino); + ino_str); return -EBUSY; } diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index fea6226afc2b..d3c3e5d9b41f 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -10,6 +10,7 @@ #include <crypto/skcipher.h> #include <linux/key.h> +#include <linux/random.h> #include "fscrypt_private.h" @@ -222,6 +223,16 @@ int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, return 0; } +void fscrypt_hash_inode_number(struct fscrypt_info *ci, + const struct fscrypt_master_key *mk) +{ + WARN_ON(ci->ci_inode->i_ino == 0); + WARN_ON(!mk->mk_ino_hash_key_initialized); + + ci->ci_hashed_ino = (u32)siphash_1u64(ci->ci_inode->i_ino, + &mk->mk_ino_hash_key); +} + static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_info *ci, struct fscrypt_master_key *mk) { @@ -254,13 +265,20 @@ unlock: return err; } - ci->ci_hashed_ino = (u32)siphash_1u64(ci->ci_inode->i_ino, - &mk->mk_ino_hash_key); + /* + * New inodes may not have an inode number assigned yet. + * Hashing their inode number is delayed until later. + */ + if (ci->ci_inode->i_ino == 0) + WARN_ON(!(ci->ci_inode->i_state & I_CREATING)); + else + fscrypt_hash_inode_number(ci, mk); return 0; } static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, - struct fscrypt_master_key *mk) + struct fscrypt_master_key *mk, + bool need_dirhash_key) { int err; @@ -306,7 +324,7 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, return err; /* Derive a secret dirhash key for directories that need it. */ - if (S_ISDIR(ci->ci_inode->i_mode) && IS_CASEFOLDED(ci->ci_inode)) { + if (need_dirhash_key) { err = fscrypt_derive_dirhash_key(ci, mk); if (err) return err; @@ -326,6 +344,7 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, * key being removed with a new inode starting to use it. 
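fscrypt_hash_inode_number() and the i_ino == 0 handling above are a small deferred-initialization pattern: a brand-new inode may not have an inode number yet, so the number-derived hash is computed later, once the number is assigned. A userspace sketch of that shape only; the toy mixer below stands in for the keyed SipHash and is NOT a real hash:

#include <stdint.h>
#include <stdio.h>

struct demo_info {
        uint64_t ino;           /* 0 means "not assigned yet" */
        uint32_t hashed_ino;
        int hashed;
};

/* Stand-in for siphash_1u64(); illustrative only. */
static uint32_t toy_mix(uint64_t ino)
{
        return (uint32_t)(ino * 2654435761u);
}

static void setup_key(struct demo_info *ci)
{
        if (ci->ino == 0)
                return;                 /* defer until the number exists */
        ci->hashed_ino = toy_mix(ci->ino);
        ci->hashed = 1;
}

static void assign_ino(struct demo_info *ci, uint64_t ino)
{
        ci->ino = ino;
        if (!ci->hashed)
                setup_key(ci);          /* the delayed part of key setup */
}

int main(void)
{
        struct demo_info ci = { 0 };

        setup_key(&ci);                 /* too early: no inode number yet */
        assign_ino(&ci, 12345);         /* now the hash can be computed */
        printf("hashed_ino=%u\n", ci.hashed_ino);
        return 0;
}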
*/ static int setup_file_encryption_key(struct fscrypt_info *ci, + bool need_dirhash_key, struct key **master_key_ret) { struct key *key; @@ -400,7 +419,7 @@ static int setup_file_encryption_key(struct fscrypt_info *ci, err = fscrypt_setup_v1_file_key(ci, mk->mk_secret.raw); break; case FSCRYPT_POLICY_V2: - err = fscrypt_setup_v2_file_key(ci, mk); + err = fscrypt_setup_v2_file_key(ci, mk, need_dirhash_key); break; default: WARN_ON(1); @@ -454,57 +473,28 @@ static void put_crypt_info(struct fscrypt_info *ci) kmem_cache_free(fscrypt_info_cachep, ci); } -int fscrypt_get_encryption_info(struct inode *inode) +static int +fscrypt_setup_encryption_info(struct inode *inode, + const union fscrypt_policy *policy, + const u8 nonce[FSCRYPT_FILE_NONCE_SIZE], + bool need_dirhash_key) { struct fscrypt_info *crypt_info; - union fscrypt_context ctx; struct fscrypt_mode *mode; struct key *master_key = NULL; int res; - if (fscrypt_has_encryption_key(inode)) - return 0; - res = fscrypt_initialize(inode->i_sb->s_cop->flags); if (res) return res; - res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); - if (res < 0) { - const union fscrypt_context *dummy_ctx = - fscrypt_get_dummy_context(inode->i_sb); - - if (IS_ENCRYPTED(inode) || !dummy_ctx) { - fscrypt_warn(inode, - "Error %d getting encryption context", - res); - return res; - } - /* Fake up a context for an unencrypted directory */ - res = fscrypt_context_size(dummy_ctx); - memcpy(&ctx, dummy_ctx, res); - } - - crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_NOFS); + crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_KERNEL); if (!crypt_info) return -ENOMEM; crypt_info->ci_inode = inode; - - res = fscrypt_policy_from_context(&crypt_info->ci_policy, &ctx, res); - if (res) { - fscrypt_warn(inode, - "Unrecognized or corrupt encryption context"); - goto out; - } - - memcpy(crypt_info->ci_nonce, fscrypt_context_nonce(&ctx), - FSCRYPT_FILE_NONCE_SIZE); - - if (!fscrypt_supported_policy(&crypt_info->ci_policy, inode)) { - res = -EINVAL; - goto out; - } + crypt_info->ci_policy = *policy; + memcpy(crypt_info->ci_nonce, nonce, FSCRYPT_FILE_NONCE_SIZE); mode = select_encryption_mode(&crypt_info->ci_policy, inode); if (IS_ERR(mode)) { @@ -514,13 +504,14 @@ int fscrypt_get_encryption_info(struct inode *inode) WARN_ON(mode->ivsize > FSCRYPT_MAX_IV_SIZE); crypt_info->ci_mode = mode; - res = setup_file_encryption_key(crypt_info, &master_key); + res = setup_file_encryption_key(crypt_info, need_dirhash_key, + &master_key); if (res) goto out; /* - * Multiple tasks may race to set ->i_crypt_info, so use - * cmpxchg_release(). This pairs with the smp_load_acquire() in + * For existing inodes, multiple tasks may race to set ->i_crypt_info. + * So use cmpxchg_release(). This pairs with the smp_load_acquire() in * fscrypt_get_info(). I.e., here we publish ->i_crypt_info with a * RELEASE barrier so that other tasks can ACQUIRE it. */ @@ -550,14 +541,113 @@ out: up_read(&mk->mk_secret_sem); key_put(master_key); } + put_crypt_info(crypt_info); + return res; +} + +/** + * fscrypt_get_encryption_info() - set up an inode's encryption key + * @inode: the inode to set up the key for. Must be encrypted. + * + * Set up ->i_crypt_info, if it hasn't already been done. + * + * Note: unless ->i_crypt_info is already set, this isn't %GFP_NOFS-safe. So + * generally this shouldn't be called from within a filesystem transaction. + * + * Return: 0 if ->i_crypt_info was set or was already set, *or* if the + * encryption key is unavailable. 
(Use fscrypt_has_encryption_key() to + * distinguish these cases.) Also can return another -errno code. + */ +int fscrypt_get_encryption_info(struct inode *inode) +{ + int res; + union fscrypt_context ctx; + union fscrypt_policy policy; + + if (fscrypt_has_encryption_key(inode)) + return 0; + + res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (res < 0) { + fscrypt_warn(inode, "Error %d getting encryption context", res); + return res; + } + + res = fscrypt_policy_from_context(&policy, &ctx, res); + if (res) { + fscrypt_warn(inode, + "Unrecognized or corrupt encryption context"); + return res; + } + + if (!fscrypt_supported_policy(&policy, inode)) + return -EINVAL; + + res = fscrypt_setup_encryption_info(inode, &policy, + fscrypt_context_nonce(&ctx), + IS_CASEFOLDED(inode) && + S_ISDIR(inode->i_mode)); if (res == -ENOKEY) res = 0; - put_crypt_info(crypt_info); return res; } EXPORT_SYMBOL(fscrypt_get_encryption_info); /** + * fscrypt_prepare_new_inode() - prepare to create a new inode in a directory + * @dir: a possibly-encrypted directory + * @inode: the new inode. ->i_mode must be set already. + * ->i_ino doesn't need to be set yet. + * @encrypt_ret: (output) set to %true if the new inode will be encrypted + * + * If the directory is encrypted, set up its ->i_crypt_info in preparation for + * encrypting the name of the new file. Also, if the new inode will be + * encrypted, set up its ->i_crypt_info and set *encrypt_ret=true. + * + * This isn't %GFP_NOFS-safe, and therefore it should be called before starting + * any filesystem transaction to create the inode. For this reason, ->i_ino + * isn't required to be set yet, as the filesystem may not have set it yet. + * + * This doesn't persist the new inode's encryption context. That still needs to + * be done later by calling fscrypt_set_context(). + * + * Return: 0 on success, -ENOKEY if the encryption key is missing, or another + * -errno code + */ +int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode, + bool *encrypt_ret) +{ + const union fscrypt_policy *policy; + u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; + + policy = fscrypt_policy_to_inherit(dir); + if (policy == NULL) + return 0; + if (IS_ERR(policy)) + return PTR_ERR(policy); + + if (WARN_ON_ONCE(inode->i_mode == 0)) + return -EINVAL; + + /* + * Only regular files, directories, and symlinks are encrypted. + * Special files like device nodes and named pipes aren't. 
+ */ + if (!S_ISREG(inode->i_mode) && + !S_ISDIR(inode->i_mode) && + !S_ISLNK(inode->i_mode)) + return 0; + + *encrypt_ret = true; + + get_random_bytes(nonce, FSCRYPT_FILE_NONCE_SIZE); + return fscrypt_setup_encryption_info(inode, policy, nonce, + IS_CASEFOLDED(dir) && + S_ISDIR(inode->i_mode)); +} +EXPORT_SYMBOL_GPL(fscrypt_prepare_new_inode); + +/** * fscrypt_put_encryption_info() - free most of an inode's fscrypt data * @inode: an inode being evicted * diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c index a3cb52572b05..2762c5350432 100644 --- a/fs/crypto/keysetup_v1.c +++ b/fs/crypto/keysetup_v1.c @@ -60,7 +60,7 @@ static int derive_key_aes(const u8 *master_key, goto out; } crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); - req = skcipher_request_alloc(tfm, GFP_NOFS); + req = skcipher_request_alloc(tfm, GFP_KERNEL); if (!req) { res = -ENOMEM; goto out; @@ -99,7 +99,7 @@ find_and_lock_process_key(const char *prefix, const struct user_key_payload *ukp; const struct fscrypt_key *payload; - description = kasprintf(GFP_NOFS, "%s%*phN", prefix, + description = kasprintf(GFP_KERNEL, "%s%*phN", prefix, FSCRYPT_KEY_DESCRIPTOR_SIZE, descriptor); if (!description) return ERR_PTR(-ENOMEM); @@ -228,7 +228,7 @@ fscrypt_get_direct_key(const struct fscrypt_info *ci, const u8 *raw_key) return dk; /* Nope, allocate one. */ - dk = kzalloc(sizeof(*dk), GFP_NOFS); + dk = kzalloc(sizeof(*dk), GFP_KERNEL); if (!dk) return ERR_PTR(-ENOMEM); refcount_set(&dk->dk_refcount, 1); @@ -272,7 +272,7 @@ static int setup_v1_file_key_derived(struct fscrypt_info *ci, * This cannot be a stack buffer because it will be passed to the * scatterlist crypto API during derive_key_aes(). */ - derived_key = kmalloc(ci->ci_mode->keysize, GFP_NOFS); + derived_key = kmalloc(ci->ci_mode->keysize, GFP_KERNEL); if (!derived_key) return -ENOMEM; diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 2d73fd39ad96..4441d9944b9e 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -32,6 +32,14 @@ bool fscrypt_policies_equal(const union fscrypt_policy *policy1, return !memcmp(policy1, policy2, fscrypt_policy_size(policy1)); } +static const union fscrypt_policy * +fscrypt_get_dummy_policy(struct super_block *sb) +{ + if (!sb->s_cop->get_dummy_policy) + return NULL; + return sb->s_cop->get_dummy_policy(sb); +} + static bool fscrypt_valid_enc_modes(u32 contents_mode, u32 filenames_mode) { if (contents_mode == FSCRYPT_MODE_AES_256_XTS && @@ -192,10 +200,15 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy, 32, 32)) return false; + /* + * IV_INO_LBLK_32 hashes the inode number, so in principle it can + * support any ino_bits. However, currently the inode number is gotten + * from inode::i_ino which is 'unsigned long'. So for now the + * implementation limit is 32 bits. + */ if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) && - /* This uses hashed inode numbers, so ino_bits doesn't matter. 
*/ !supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_32", - INT_MAX, 32)) + 32, 32)) return false; if (memchr_inv(policy->__reserved, 0, sizeof(policy->__reserved))) { @@ -231,18 +244,19 @@ bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, } /** - * fscrypt_new_context_from_policy() - create a new fscrypt_context from - * an fscrypt_policy + * fscrypt_new_context() - create a new fscrypt_context * @ctx_u: output context * @policy_u: input policy + * @nonce: nonce to use * * Create an fscrypt_context for an inode that is being assigned the given - * encryption policy. A new nonce is randomly generated. + * encryption policy. @nonce must be a new random nonce. * * Return: the size of the new context in bytes. */ -static int fscrypt_new_context_from_policy(union fscrypt_context *ctx_u, - const union fscrypt_policy *policy_u) +static int fscrypt_new_context(union fscrypt_context *ctx_u, + const union fscrypt_policy *policy_u, + const u8 nonce[FSCRYPT_FILE_NONCE_SIZE]) { memset(ctx_u, 0, sizeof(*ctx_u)); @@ -260,7 +274,7 @@ static int fscrypt_new_context_from_policy(union fscrypt_context *ctx_u, memcpy(ctx->master_key_descriptor, policy->master_key_descriptor, sizeof(ctx->master_key_descriptor)); - get_random_bytes(ctx->nonce, sizeof(ctx->nonce)); + memcpy(ctx->nonce, nonce, FSCRYPT_FILE_NONCE_SIZE); return sizeof(*ctx); } case FSCRYPT_POLICY_V2: { @@ -276,7 +290,7 @@ static int fscrypt_new_context_from_policy(union fscrypt_context *ctx_u, memcpy(ctx->master_key_identifier, policy->master_key_identifier, sizeof(ctx->master_key_identifier)); - get_random_bytes(ctx->nonce, sizeof(ctx->nonce)); + memcpy(ctx->nonce, nonce, FSCRYPT_FILE_NONCE_SIZE); return sizeof(*ctx); } } @@ -372,6 +386,7 @@ static int fscrypt_get_policy(struct inode *inode, union fscrypt_policy *policy) static int set_encryption_policy(struct inode *inode, const union fscrypt_policy *policy) { + u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; union fscrypt_context ctx; int ctxsize; int err; @@ -409,7 +424,8 @@ static int set_encryption_policy(struct inode *inode, return -EINVAL; } - ctxsize = fscrypt_new_context_from_policy(&ctx, policy); + get_random_bytes(nonce, FSCRYPT_FILE_NONCE_SIZE); + ctxsize = fscrypt_new_context(&ctx, policy, nonce); return inode->i_sb->s_cop->set_context(inode, &ctx, ctxsize, NULL); } @@ -620,86 +636,99 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) } EXPORT_SYMBOL(fscrypt_has_permitted_context); +/* + * Return the encryption policy that new files in the directory will inherit, or + * NULL if none, or an ERR_PTR() on error. If the directory is encrypted, also + * ensure that its key is set up, so that the new filename can be encrypted. + */ +const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir) +{ + int err; + + if (IS_ENCRYPTED(dir)) { + err = fscrypt_require_key(dir); + if (err) + return ERR_PTR(err); + return &dir->i_crypt_info->ci_policy; + } + + return fscrypt_get_dummy_policy(dir->i_sb); +} + /** - * fscrypt_inherit_context() - Sets a child context from its parent - * @parent: Parent inode from which the context is inherited. - * @child: Child inode that inherits the context from @parent. - * @fs_data: private data given by FS. 
- * @preload: preload child i_crypt_info if true + * fscrypt_set_context() - Set the fscrypt context of a new inode + * @inode: a new inode + * @fs_data: private data given by FS and passed to ->set_context() + * + * This should be called after fscrypt_prepare_new_inode(), generally during a + * filesystem transaction. Everything here must be %GFP_NOFS-safe. * * Return: 0 on success, -errno on failure */ -int fscrypt_inherit_context(struct inode *parent, struct inode *child, - void *fs_data, bool preload) +int fscrypt_set_context(struct inode *inode, void *fs_data) { + struct fscrypt_info *ci = inode->i_crypt_info; union fscrypt_context ctx; int ctxsize; - struct fscrypt_info *ci; - int res; - - res = fscrypt_get_encryption_info(parent); - if (res < 0) - return res; - ci = fscrypt_get_info(parent); - if (ci == NULL) + /* fscrypt_prepare_new_inode() should have set up the key already. */ + if (WARN_ON_ONCE(!ci)) return -ENOKEY; - ctxsize = fscrypt_new_context_from_policy(&ctx, &ci->ci_policy); - BUILD_BUG_ON(sizeof(ctx) != FSCRYPT_SET_CONTEXT_MAX_SIZE); - res = parent->i_sb->s_cop->set_context(child, &ctx, ctxsize, fs_data); - if (res) - return res; - return preload ? fscrypt_get_encryption_info(child): 0; + ctxsize = fscrypt_new_context(&ctx, &ci->ci_policy, ci->ci_nonce); + + /* + * This may be the first time the inode number is available, so do any + * delayed key setup that requires the inode number. + */ + if (ci->ci_policy.version == FSCRYPT_POLICY_V2 && + (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) { + const struct fscrypt_master_key *mk = + ci->ci_master_key->payload.data[0]; + + fscrypt_hash_inode_number(ci, mk); + } + + return inode->i_sb->s_cop->set_context(inode, &ctx, ctxsize, fs_data); } -EXPORT_SYMBOL(fscrypt_inherit_context); +EXPORT_SYMBOL_GPL(fscrypt_set_context); /** * fscrypt_set_test_dummy_encryption() - handle '-o test_dummy_encryption' * @sb: the filesystem on which test_dummy_encryption is being specified - * @arg: the argument to the test_dummy_encryption option. - * If no argument was specified, then @arg->from == NULL. - * @dummy_ctx: the filesystem's current dummy context (input/output, see below) + * @arg: the argument to the test_dummy_encryption option. May be NULL. + * @dummy_policy: the filesystem's current dummy policy (input/output, see + * below) * * Handle the test_dummy_encryption mount option by creating a dummy encryption - * context, saving it in @dummy_ctx, and adding the corresponding dummy - * encryption key to the filesystem. If the @dummy_ctx is already set, then + * policy, saving it in @dummy_policy, and adding the corresponding dummy + * encryption key to the filesystem. If the @dummy_policy is already set, then * instead validate that it matches @arg. Don't support changing it via * remount, as that is difficult to do safely. * - * The reason we use an fscrypt_context rather than an fscrypt_policy is because - * we mustn't generate a new nonce each time we access a dummy-encrypted - * directory, as that would change the way filenames are encrypted. - * - * Return: 0 on success (dummy context set, or the same context is already set); - * -EEXIST if a different dummy context is already set; + * Return: 0 on success (dummy policy set, or the same policy is already set); + * -EEXIST if a different dummy policy is already set; * or another -errno value. 
*/ -int fscrypt_set_test_dummy_encryption(struct super_block *sb, - const substring_t *arg, - struct fscrypt_dummy_context *dummy_ctx) +int fscrypt_set_test_dummy_encryption(struct super_block *sb, const char *arg, + struct fscrypt_dummy_policy *dummy_policy) { - const char *argstr = "v2"; - const char *argstr_to_free = NULL; struct fscrypt_key_specifier key_spec = { 0 }; int version; - union fscrypt_context *ctx = NULL; + union fscrypt_policy *policy = NULL; int err; - if (arg->from) { - argstr = argstr_to_free = match_strdup(arg); - if (!argstr) - return -ENOMEM; - } + if (!arg) + arg = "v2"; - if (!strcmp(argstr, "v1")) { - version = FSCRYPT_CONTEXT_V1; + if (!strcmp(arg, "v1")) { + version = FSCRYPT_POLICY_V1; key_spec.type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR; memset(key_spec.u.descriptor, 0x42, FSCRYPT_KEY_DESCRIPTOR_SIZE); - } else if (!strcmp(argstr, "v2")) { - version = FSCRYPT_CONTEXT_V2; + } else if (!strcmp(arg, "v2")) { + version = FSCRYPT_POLICY_V2; key_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER; /* key_spec.u.identifier gets filled in when adding the key */ } else { @@ -707,21 +736,8 @@ int fscrypt_set_test_dummy_encryption(struct super_block *sb, goto out; } - if (dummy_ctx->ctx) { - /* - * Note: if we ever make test_dummy_encryption support - * specifying other encryption settings, such as the encryption - * modes, we'll need to compare those settings here. - */ - if (dummy_ctx->ctx->version == version) - err = 0; - else - err = -EEXIST; - goto out; - } - - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) { + policy = kzalloc(sizeof(*policy), GFP_KERNEL); + if (!policy) { err = -ENOMEM; goto out; } @@ -730,18 +746,18 @@ int fscrypt_set_test_dummy_encryption(struct super_block *sb, if (err) goto out; - ctx->version = version; - switch (ctx->version) { - case FSCRYPT_CONTEXT_V1: - ctx->v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; - ctx->v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; - memcpy(ctx->v1.master_key_descriptor, key_spec.u.descriptor, + policy->version = version; + switch (policy->version) { + case FSCRYPT_POLICY_V1: + policy->v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; + policy->v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; + memcpy(policy->v1.master_key_descriptor, key_spec.u.descriptor, FSCRYPT_KEY_DESCRIPTOR_SIZE); break; - case FSCRYPT_CONTEXT_V2: - ctx->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; - ctx->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; - memcpy(ctx->v2.master_key_identifier, key_spec.u.identifier, + case FSCRYPT_POLICY_V2: + policy->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; + policy->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; + memcpy(policy->v2.master_key_identifier, key_spec.u.identifier, FSCRYPT_KEY_IDENTIFIER_SIZE); break; default: @@ -749,12 +765,19 @@ int fscrypt_set_test_dummy_encryption(struct super_block *sb, err = -EINVAL; goto out; } - dummy_ctx->ctx = ctx; - ctx = NULL; + + if (dummy_policy->policy) { + if (fscrypt_policies_equal(policy, dummy_policy->policy)) + err = 0; + else + err = -EEXIST; + goto out; + } + dummy_policy->policy = policy; + policy = NULL; err = 0; out: - kfree(ctx); - kfree(argstr_to_free); + kfree(policy); return err; } EXPORT_SYMBOL_GPL(fscrypt_set_test_dummy_encryption); @@ -771,10 +794,16 @@ EXPORT_SYMBOL_GPL(fscrypt_set_test_dummy_encryption); void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep, struct super_block *sb) { - const union fscrypt_context *ctx = fscrypt_get_dummy_context(sb); + 
const union fscrypt_policy *policy = fscrypt_get_dummy_policy(sb); + int vers; - if (!ctx) + if (!policy) return; - seq_printf(seq, "%ctest_dummy_encryption=v%d", sep, ctx->version); + + vers = policy->version; + if (vers == FSCRYPT_POLICY_V1) /* Handle numbering quirk */ + vers = 1; + + seq_printf(seq, "%ctest_dummy_encryption=v%d", sep, vers); } EXPORT_SYMBOL_GPL(fscrypt_show_test_dummy_encryption); diff --git a/fs/d_path.c b/fs/d_path.c index 0f1fc1743302..a69e2cd36e6e 100644 --- a/fs/d_path.c +++ b/fs/d_path.c @@ -102,6 +102,8 @@ restart: if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { struct mount *parent = READ_ONCE(mnt->mnt_parent); + struct mnt_namespace *mnt_ns; + /* Escaped? */ if (dentry != vfsmnt->mnt_root) { bptr = *buffer; @@ -116,7 +118,9 @@ restart: vfsmnt = &mnt->mnt; continue; } - if (is_mounted(vfsmnt) && !is_anon_ns(mnt->mnt_ns)) + mnt_ns = READ_ONCE(mnt->mnt_ns); + /* open-coded is_mounted() to use local mnt_ns */ + if (!IS_ERR_OR_NULL(mnt_ns) && !is_anon_ns(mnt_ns)) error = 1; // absolute root else error = 2; // detached or not attached yet @@ -559,8 +559,11 @@ fallback: } /** - * dax_layout_busy_page - find first pinned page in @mapping + * dax_layout_busy_page_range - find first pinned page in @mapping * @mapping: address space to scan for a page with ref count > 1 + * @start: Starting offset. Page containing 'start' is included. + * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX, + * pages from 'start' till the end of file are included. * * DAX requires ZONE_DEVICE mapped pages. These pages are never * 'onlined' to the page allocator so they are considered idle when @@ -573,12 +576,15 @@ fallback: * to be able to run unmap_mapping_range() and subsequently not race * mapping_mapped() becoming true. */ -struct page *dax_layout_busy_page(struct address_space *mapping) +struct page *dax_layout_busy_page_range(struct address_space *mapping, + loff_t start, loff_t end) { - XA_STATE(xas, &mapping->i_pages, 0); void *entry; unsigned int scanned = 0; struct page *page = NULL; + pgoff_t start_idx = start >> PAGE_SHIFT; + pgoff_t end_idx; + XA_STATE(xas, &mapping->i_pages, start_idx); /* * In the 'limited' case get_user_pages() for dax is disabled. @@ -589,6 +595,11 @@ struct page *dax_layout_busy_page(struct address_space *mapping) if (!dax_mapping(mapping) || !mapping_mapped(mapping)) return NULL; + /* If end == LLONG_MAX, all pages from start to till end of file */ + if (end == LLONG_MAX) + end_idx = ULONG_MAX; + else + end_idx = end >> PAGE_SHIFT; /* * If we race get_user_pages_fast() here either we'll see the * elevated page count in the iteration and wait, or @@ -596,15 +607,15 @@ struct page *dax_layout_busy_page(struct address_space *mapping) * against is no longer mapped in the page tables and bail to the * get_user_pages() slow path. The slow path is protected by * pte_lock() and pmd_lock(). New references are not taken without - * holding those locks, and unmap_mapping_range() will not zero the + * holding those locks, and unmap_mapping_pages() will not zero the * pte or pmd without holding the respective lock, so we are * guaranteed to either see new references or prevent new * references from being established. 
*/ - unmap_mapping_range(mapping, 0, 0, 0); + unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0); xas_lock_irq(&xas); - xas_for_each(&xas, entry, ULONG_MAX) { + xas_for_each(&xas, entry, end_idx) { if (WARN_ON_ONCE(!xa_is_value(entry))) continue; if (unlikely(dax_is_locked(entry))) @@ -625,6 +636,12 @@ struct page *dax_layout_busy_page(struct address_space *mapping) xas_unlock_irq(&xas); return page; } +EXPORT_SYMBOL_GPL(dax_layout_busy_page_range); + +struct page *dax_layout_busy_page(struct address_space *mapping) +{ + return dax_layout_busy_page_range(mapping, 0, LLONG_MAX); +} EXPORT_SYMBOL_GPL(dax_layout_busy_page); static int __dax_invalidate_entry(struct address_space *mapping, @@ -1037,18 +1054,18 @@ static vm_fault_t dax_load_hole(struct xa_state *xas, return ret; } -int dax_iomap_zero(loff_t pos, unsigned offset, unsigned size, - struct iomap *iomap) +s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap) { sector_t sector = iomap_sector(iomap, pos & PAGE_MASK); pgoff_t pgoff; long rc, id; void *kaddr; bool page_aligned = false; - + unsigned offset = offset_in_page(pos); + unsigned size = min_t(u64, PAGE_SIZE - offset, length); if (IS_ALIGNED(sector << SECTOR_SHIFT, PAGE_SIZE) && - IS_ALIGNED(size, PAGE_SIZE)) + (size == PAGE_SIZE)) page_aligned = true; rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff); @@ -1058,8 +1075,7 @@ int dax_iomap_zero(loff_t pos, unsigned offset, unsigned size, id = dax_read_lock(); if (page_aligned) - rc = dax_zero_page_range(iomap->dax_dev, pgoff, - size >> PAGE_SHIFT); + rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1); else rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL); if (rc < 0) { @@ -1072,7 +1088,7 @@ int dax_iomap_zero(loff_t pos, unsigned offset, unsigned size, dax_flush(iomap->dax_dev, kaddr + offset, size); } dax_read_unlock(id); - return 0; + return size; } static loff_t diff --git a/fs/direct-io.c b/fs/direct-io.c index 183299892465..d53fa92a1ab6 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -386,25 +386,6 @@ static void dio_bio_end_io(struct bio *bio) spin_unlock_irqrestore(&dio->bio_lock, flags); } -/** - * dio_end_io - handle the end io action for the given bio - * @bio: The direct io bio thats being completed - * - * This is meant to be called by any filesystem that uses their own dio_submit_t - * so that the DIO specific endio actions are dealt with after the filesystem - * has done it's completion work. - */ -void dio_end_io(struct bio *bio) -{ - struct dio *dio = bio->bi_private; - - if (dio->is_async) - dio_bio_end_aio(bio); - else - dio_bio_end_io(bio); -} -EXPORT_SYMBOL_GPL(dio_end_io); - static inline void dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, struct block_device *bdev, @@ -1165,22 +1146,13 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, * the early prefetch in the caller enough time. */ - if (align & blocksize_mask) { - if (bdev) - blkbits = blksize_bits(bdev_logical_block_size(bdev)); - blocksize_mask = (1 << blkbits) - 1; - if (align & blocksize_mask) - goto out; - } - /* watch out for a 0 len io from a tricksy fs */ if (iov_iter_rw(iter) == READ && !count) return 0; dio = kmem_cache_alloc(dio_cache, GFP_KERNEL); - retval = -ENOMEM; if (!dio) - goto out; + return -ENOMEM; /* * Believe it or not, zeroing out the page array caused a .5% * performance regression in a database benchmark. 
So, we take @@ -1189,32 +1161,32 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, memset(dio, 0, offsetof(struct dio, pages)); dio->flags = flags; - if (dio->flags & DIO_LOCKING) { - if (iov_iter_rw(iter) == READ) { - struct address_space *mapping = - iocb->ki_filp->f_mapping; - - /* will be released by direct_io_worker */ - inode_lock(inode); - - retval = filemap_write_and_wait_range(mapping, offset, - end - 1); - if (retval) { - inode_unlock(inode); - kmem_cache_free(dio_cache, dio); - goto out; - } - } + if (dio->flags & DIO_LOCKING && iov_iter_rw(iter) == READ) { + /* will be released by direct_io_worker */ + inode_lock(inode); } /* Once we sampled i_size check for reads beyond EOF */ dio->i_size = i_size_read(inode); if (iov_iter_rw(iter) == READ && offset >= dio->i_size) { - if (dio->flags & DIO_LOCKING) - inode_unlock(inode); - kmem_cache_free(dio_cache, dio); retval = 0; - goto out; + goto fail_dio; + } + + if (align & blocksize_mask) { + if (bdev) + blkbits = blksize_bits(bdev_logical_block_size(bdev)); + blocksize_mask = (1 << blkbits) - 1; + if (align & blocksize_mask) + goto fail_dio; + } + + if (dio->flags & DIO_LOCKING && iov_iter_rw(iter) == READ) { + struct address_space *mapping = iocb->ki_filp->f_mapping; + + retval = filemap_write_and_wait_range(mapping, offset, end - 1); + if (retval) + goto fail_dio; } /* @@ -1258,14 +1230,8 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, */ retval = sb_init_dio_done_wq(dio->inode->i_sb); } - if (retval) { - /* - * We grab i_mutex only for reads so we don't have - * to release it here - */ - kmem_cache_free(dio_cache, dio); - goto out; - } + if (retval) + goto fail_dio; } /* @@ -1368,7 +1334,13 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, } else BUG_ON(retval != -EIOCBQUEUED); -out: + return retval; + +fail_dio: + if (dio->flags & DIO_LOCKING && iov_iter_rw(iter) == READ) + inode_unlock(inode); + + kmem_cache_free(dio_cache, dio); return retval; } diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig index f82a4952769d..ee92634196a8 100644 --- a/fs/dlm/Kconfig +++ b/fs/dlm/Kconfig @@ -4,6 +4,7 @@ menuconfig DLM depends on INET depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n) select IP_SCTP + select SRCU help A general purpose distributed lock manager for kernel or userspace applications. 
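
The new "select SRCU" line above backs the fs/dlm/lowcomms.c rework further below, where the connection hash moves from a mutex-protected list to SRCU-protected readers with a plain spinlock serializing writers. What follows is a minimal sketch of that lookup pattern under assumed names (example_node, example_hash, example_srcu and the helpers are illustrative, not actual dlm symbols); the real code additionally tracks an othercon peer per connection and frees entries through call_rcu() rather than synchronize_srcu().

    /*
     * Sketch of an SRCU-protected hash lookup: readers walk the list under
     * srcu_read_lock() and may sleep; writers add and remove entries under
     * a spinlock using the _rcu list helpers.
     */
    #include <linux/rculist.h>
    #include <linux/slab.h>
    #include <linux/spinlock.h>
    #include <linux/srcu.h>

    #define EXAMPLE_HASH_SIZE 32

    struct example_node {
        int nodeid;
        struct hlist_node list;
    };

    static struct hlist_head example_hash[EXAMPLE_HASH_SIZE];
    static DEFINE_SPINLOCK(example_lock);   /* serializes writers only */
    DEFINE_STATIC_SRCU(example_srcu);       /* protects lockless readers */

    /* Reader: no spinlock taken, safe to call from workqueue context. */
    static struct example_node *example_find(int nodeid)
    {
        struct example_node *n, *found = NULL;
        int idx = srcu_read_lock(&example_srcu);

        hlist_for_each_entry_rcu(n, &example_hash[nodeid % EXAMPLE_HASH_SIZE],
                                 list) {
            if (n->nodeid == nodeid) {
                found = n;
                break;
            }
        }
        srcu_read_unlock(&example_srcu, idx);
        return found;
    }

    /* Writer: publish a new entry under the lock with the _rcu helper. */
    static void example_add(struct example_node *n)
    {
        spin_lock(&example_lock);
        hlist_add_head_rcu(&n->list,
                           &example_hash[n->nodeid % EXAMPLE_HASH_SIZE]);
        spin_unlock(&example_lock);
    }

    /* Removal: unlink under the lock, free only after a grace period. */
    static void example_del(struct example_node *n)
    {
        spin_lock(&example_lock);
        hlist_del_rcu(&n->list);
        spin_unlock(&example_lock);
        synchronize_srcu(&example_srcu);    /* wait for readers to drain */
        kfree(n);
    }

Keeping readers off the old connections mutex is what lets the receive and send workqueues look up connections concurrently; the writer-side cost is re-checking for a racing insert under the spinlock, which is exactly what the new nodeid2con() below does before publishing a freshly allocated connection.
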
diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 47f0b98b707f..49c5f9407098 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -125,7 +125,7 @@ static ssize_t cluster_cluster_name_store(struct config_item *item, CONFIGFS_ATTR(cluster_, cluster_name); static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, - int *info_field, int check_zero, + int *info_field, bool (*check_cb)(unsigned int x), const char *buf, size_t len) { unsigned int x; @@ -137,7 +137,7 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, if (rc) return rc; - if (check_zero && !x) + if (check_cb && check_cb(x)) return -EINVAL; *cl_field = x; @@ -146,13 +146,13 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, return len; } -#define CLUSTER_ATTR(name, check_zero) \ +#define CLUSTER_ATTR(name, check_cb) \ static ssize_t cluster_##name##_store(struct config_item *item, \ const char *buf, size_t len) \ { \ struct dlm_cluster *cl = config_item_to_cluster(item); \ return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name, \ - check_zero, buf, len); \ + check_cb, buf, len); \ } \ static ssize_t cluster_##name##_show(struct config_item *item, char *buf) \ { \ @@ -161,20 +161,30 @@ static ssize_t cluster_##name##_show(struct config_item *item, char *buf) \ } \ CONFIGFS_ATTR(cluster_, name); -CLUSTER_ATTR(tcp_port, 1); -CLUSTER_ATTR(buffer_size, 1); -CLUSTER_ATTR(rsbtbl_size, 1); -CLUSTER_ATTR(recover_timer, 1); -CLUSTER_ATTR(toss_secs, 1); -CLUSTER_ATTR(scan_secs, 1); -CLUSTER_ATTR(log_debug, 0); -CLUSTER_ATTR(log_info, 0); -CLUSTER_ATTR(protocol, 0); -CLUSTER_ATTR(mark, 0); -CLUSTER_ATTR(timewarn_cs, 1); -CLUSTER_ATTR(waitwarn_us, 0); -CLUSTER_ATTR(new_rsb_count, 0); -CLUSTER_ATTR(recover_callbacks, 0); +static bool dlm_check_zero(unsigned int x) +{ + return !x; +} + +static bool dlm_check_buffer_size(unsigned int x) +{ + return (x < DEFAULT_BUFFER_SIZE); +} + +CLUSTER_ATTR(tcp_port, dlm_check_zero); +CLUSTER_ATTR(buffer_size, dlm_check_buffer_size); +CLUSTER_ATTR(rsbtbl_size, dlm_check_zero); +CLUSTER_ATTR(recover_timer, dlm_check_zero); +CLUSTER_ATTR(toss_secs, dlm_check_zero); +CLUSTER_ATTR(scan_secs, dlm_check_zero); +CLUSTER_ATTR(log_debug, NULL); +CLUSTER_ATTR(log_info, NULL); +CLUSTER_ATTR(protocol, NULL); +CLUSTER_ATTR(mark, NULL); +CLUSTER_ATTR(timewarn_cs, dlm_check_zero); +CLUSTER_ATTR(waitwarn_us, NULL); +CLUSTER_ATTR(new_rsb_count, NULL); +CLUSTER_ATTR(recover_callbacks, NULL); static struct configfs_attribute *cluster_attrs[] = { [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port, @@ -221,6 +231,7 @@ struct dlm_space { struct list_head members; struct mutex members_lock; int members_count; + struct dlm_nodes *nds; }; struct dlm_comms { @@ -430,6 +441,7 @@ static struct config_group *make_space(struct config_group *g, const char *name) INIT_LIST_HEAD(&sp->members); mutex_init(&sp->members_lock); sp->members_count = 0; + sp->nds = nds; return &sp->group; fail: @@ -451,6 +463,7 @@ static void drop_space(struct config_group *g, struct config_item *i) static void release_space(struct config_item *i) { struct dlm_space *sp = config_item_to_space(i); + kfree(sp->nds); kfree(sp); } @@ -857,18 +870,22 @@ int dlm_comm_seq(int nodeid, uint32_t *seq) return 0; } -int dlm_comm_mark(int nodeid, unsigned int *mark) +void dlm_comm_mark(int nodeid, unsigned int *mark) { struct dlm_comm *cm; cm = get_comm(nodeid); - if (!cm) - return -ENOENT; + if (!cm) { + *mark = dlm_config.ci_mark; + return; + } - *mark = cm->mark; - put_comm(cm); + if (cm->mark) + 
*mark = cm->mark; + else + *mark = dlm_config.ci_mark; - return 0; + put_comm(cm); } int dlm_our_nodeid(void) @@ -889,7 +906,6 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) /* Config file defaults */ #define DEFAULT_TCP_PORT 21064 -#define DEFAULT_BUFFER_SIZE 4096 #define DEFAULT_RSBTBL_SIZE 1024 #define DEFAULT_RECOVER_TIMER 5 #define DEFAULT_TOSS_SECS 10 diff --git a/fs/dlm/config.h b/fs/dlm/config.h index f62996cad561..c210250a2581 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -12,6 +12,8 @@ #ifndef __CONFIG_DOT_H__ #define __CONFIG_DOT_H__ +#define DEFAULT_BUFFER_SIZE 4096 + struct dlm_config_node { int nodeid; int weight; @@ -46,7 +48,7 @@ void dlm_config_exit(void); int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, int *count_out); int dlm_comm_seq(int nodeid, uint32_t *seq); -int dlm_comm_mark(int nodeid, unsigned int *mark); +void dlm_comm_mark(int nodeid, unsigned int *mark); int dlm_our_nodeid(void); int dlm_our_addr(struct sockaddr_storage *addr, int num); diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 5050fe05769b..79f56f16bc2c 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -65,40 +65,6 @@ #define MAX_SEND_MSG_COUNT 25 #define DLM_SHUTDOWN_WAIT_TIMEOUT msecs_to_jiffies(10000) -struct cbuf { - unsigned int base; - unsigned int len; - unsigned int mask; -}; - -static void cbuf_add(struct cbuf *cb, int n) -{ - cb->len += n; -} - -static int cbuf_data(struct cbuf *cb) -{ - return ((cb->base + cb->len) & cb->mask); -} - -static void cbuf_init(struct cbuf *cb, int size) -{ - cb->base = cb->len = 0; - cb->mask = size-1; -} - -static void cbuf_eat(struct cbuf *cb, int n) -{ - cb->len -= n; - cb->base += n; - cb->base &= cb->mask; -} - -static bool cbuf_empty(struct cbuf *cb) -{ - return cb->len == 0; -} - struct connection { struct socket *sock; /* NULL if not connected */ uint32_t nodeid; /* So we know who we are in the list */ @@ -117,8 +83,6 @@ struct connection { int (*rx_action) (struct connection *); /* What to do when active */ void (*connect_action) (struct connection *); /* What to do to connect */ void (*shutdown_action)(struct connection *con); /* What to do to shutdown */ - struct page *rx_page; - struct cbuf cb; int retries; #define MAX_CONNECT_RETRIES 3 struct hlist_node list; @@ -126,6 +90,10 @@ struct connection { struct work_struct rwork; /* Receive workqueue */ struct work_struct swork; /* Send workqueue */ wait_queue_head_t shutdown_wait; /* wait for graceful shutdown */ + unsigned char *rx_buf; + int rx_buflen; + int rx_leftover; + struct rcu_head rcu; }; #define sock2con(x) ((struct connection *)(x)->sk_user_data) @@ -167,8 +135,8 @@ static struct workqueue_struct *recv_workqueue; static struct workqueue_struct *send_workqueue; static struct hlist_head connection_hash[CONN_HASH_SIZE]; -static DEFINE_MUTEX(connections_lock); -static struct kmem_cache *con_cache; +static DEFINE_SPINLOCK(connections_lock); +DEFINE_STATIC_SRCU(connections_srcu); static void process_recv_sockets(struct work_struct *work); static void process_send_sockets(struct work_struct *work); @@ -184,15 +152,20 @@ static inline int nodeid_hash(int nodeid) static struct connection *__find_con(int nodeid) { - int r; + int r, idx; struct connection *con; r = nodeid_hash(nodeid); - hlist_for_each_entry(con, &connection_hash[r], list) { - if (con->nodeid == nodeid) + idx = srcu_read_lock(&connections_srcu); + hlist_for_each_entry_rcu(con, &connection_hash[r], list) { + if (con->nodeid == nodeid) { + srcu_read_unlock(&connections_srcu, idx); 
return con; + } } + srcu_read_unlock(&connections_srcu, idx); + return NULL; } @@ -200,21 +173,25 @@ static struct connection *__find_con(int nodeid) * If 'allocation' is zero then we don't attempt to create a new * connection structure for this node. */ -static struct connection *__nodeid2con(int nodeid, gfp_t alloc) +static struct connection *nodeid2con(int nodeid, gfp_t alloc) { - struct connection *con = NULL; + struct connection *con, *tmp; int r; con = __find_con(nodeid); if (con || !alloc) return con; - con = kmem_cache_zalloc(con_cache, alloc); + con = kzalloc(sizeof(*con), alloc); if (!con) return NULL; - r = nodeid_hash(nodeid); - hlist_add_head(&con->list, &connection_hash[r]); + con->rx_buflen = dlm_config.ci_buffer_size; + con->rx_buf = kmalloc(con->rx_buflen, GFP_NOFS); + if (!con->rx_buf) { + kfree(con); + return NULL; + } con->nodeid = nodeid; mutex_init(&con->sock_mutex); @@ -233,31 +210,41 @@ static struct connection *__nodeid2con(int nodeid, gfp_t alloc) con->rx_action = zerocon->rx_action; } + r = nodeid_hash(nodeid); + + spin_lock(&connections_lock); + /* Because multiple workqueues/threads calls this function it can + * race on multiple cpu's. Instead of locking hot path __find_con() + * we just check in rare cases of recently added nodes again + * under protection of connections_lock. If this is the case we + * abort our connection creation and return the existing connection. + */ + tmp = __find_con(nodeid); + if (tmp) { + spin_unlock(&connections_lock); + kfree(con->rx_buf); + kfree(con); + return tmp; + } + + hlist_add_head_rcu(&con->list, &connection_hash[r]); + spin_unlock(&connections_lock); + return con; } /* Loop round all connections */ static void foreach_conn(void (*conn_func)(struct connection *c)) { - int i; - struct hlist_node *n; + int i, idx; struct connection *con; + idx = srcu_read_lock(&connections_srcu); for (i = 0; i < CONN_HASH_SIZE; i++) { - hlist_for_each_entry_safe(con, n, &connection_hash[i], list) + hlist_for_each_entry_rcu(con, &connection_hash[i], list) conn_func(con); } -} - -static struct connection *nodeid2con(int nodeid, gfp_t allocation) -{ - struct connection *con; - - mutex_lock(&connections_lock); - con = __nodeid2con(nodeid, allocation); - mutex_unlock(&connections_lock); - - return con; + srcu_read_unlock(&connections_srcu, idx); } static struct dlm_node_addr *find_node_addr(int nodeid) @@ -614,11 +601,8 @@ static void close_connection(struct connection *con, bool and_other, /* Will only re-enter once. 
*/ close_connection(con->othercon, false, true, true); } - if (con->rx_page) { - __free_page(con->rx_page); - con->rx_page = NULL; - } + con->rx_leftover = 0; con->retries = 0; mutex_unlock(&con->sock_mutex); clear_bit(CF_CLOSING, &con->flags); @@ -672,16 +656,33 @@ static void dlm_tcp_shutdown(struct connection *con) shutdown_connection(con); } +static int con_realloc_receive_buf(struct connection *con, int newlen) +{ + unsigned char *newbuf; + + newbuf = kmalloc(newlen, GFP_NOFS); + if (!newbuf) + return -ENOMEM; + + /* copy any leftover from last receive */ + if (con->rx_leftover) + memmove(newbuf, con->rx_buf, con->rx_leftover); + + /* swap to new buffer space */ + kfree(con->rx_buf); + con->rx_buflen = newlen; + con->rx_buf = newbuf; + + return 0; +} + /* Data received from remote end */ static int receive_from_sock(struct connection *con) { - int ret = 0; - struct msghdr msg = {}; - struct kvec iov[2]; - unsigned len; - int r; int call_again_soon = 0; - int nvec; + struct msghdr msg; + struct kvec iov; + int ret, buflen; mutex_lock(&con->sock_mutex); @@ -689,71 +690,55 @@ static int receive_from_sock(struct connection *con) ret = -EAGAIN; goto out_close; } + if (con->nodeid == 0) { ret = -EINVAL; goto out_close; } - if (con->rx_page == NULL) { - /* - * This doesn't need to be atomic, but I think it should - * improve performance if it is. - */ - con->rx_page = alloc_page(GFP_ATOMIC); - if (con->rx_page == NULL) + /* realloc if we get new buffer size to read out */ + buflen = dlm_config.ci_buffer_size; + if (con->rx_buflen != buflen && con->rx_leftover <= buflen) { + ret = con_realloc_receive_buf(con, buflen); + if (ret < 0) goto out_resched; - cbuf_init(&con->cb, PAGE_SIZE); } - /* - * iov[0] is the bit of the circular buffer between the current end - * point (cb.base + cb.len) and the end of the buffer. 
- */ - iov[0].iov_len = con->cb.base - cbuf_data(&con->cb); - iov[0].iov_base = page_address(con->rx_page) + cbuf_data(&con->cb); - iov[1].iov_len = 0; - nvec = 1; - - /* - * iov[1] is the bit of the circular buffer between the start of the - * buffer and the start of the currently used section (cb.base) + /* calculate new buffer parameter regarding last receive and + * possible leftover bytes */ - if (cbuf_data(&con->cb) >= con->cb.base) { - iov[0].iov_len = PAGE_SIZE - cbuf_data(&con->cb); - iov[1].iov_len = con->cb.base; - iov[1].iov_base = page_address(con->rx_page); - nvec = 2; - } - len = iov[0].iov_len + iov[1].iov_len; - iov_iter_kvec(&msg.msg_iter, READ, iov, nvec, len); + iov.iov_base = con->rx_buf + con->rx_leftover; + iov.iov_len = con->rx_buflen - con->rx_leftover; - r = ret = sock_recvmsg(con->sock, &msg, MSG_DONTWAIT | MSG_NOSIGNAL); + memset(&msg, 0, sizeof(msg)); + msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; + ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len, + msg.msg_flags); if (ret <= 0) goto out_close; - else if (ret == len) + else if (ret == iov.iov_len) call_again_soon = 1; - cbuf_add(&con->cb, ret); - ret = dlm_process_incoming_buffer(con->nodeid, - page_address(con->rx_page), - con->cb.base, con->cb.len, - PAGE_SIZE); - if (ret < 0) { - log_print("lowcomms err %d: addr=%p, base=%u, len=%u, read=%d", - ret, page_address(con->rx_page), con->cb.base, - con->cb.len, r); - cbuf_eat(&con->cb, r); - } else { - cbuf_eat(&con->cb, ret); - } + /* new buflen according readed bytes and leftover from last receive */ + buflen = ret + con->rx_leftover; + ret = dlm_process_incoming_buffer(con->nodeid, con->rx_buf, buflen); + if (ret < 0) + goto out_close; - if (cbuf_empty(&con->cb) && !call_again_soon) { - __free_page(con->rx_page); - con->rx_page = NULL; + /* calculate leftover bytes from process and put it into begin of + * the receive buffer, so next receive we have the full message + * at the start address of the receive buffer. + */ + con->rx_leftover = buflen - ret; + if (con->rx_leftover) { + memmove(con->rx_buf, con->rx_buf + ret, + con->rx_leftover); + call_again_soon = true; } if (call_again_soon) goto out_resched; + mutex_unlock(&con->sock_mutex); return 0; @@ -791,13 +776,11 @@ static int accept_from_sock(struct connection *con) int nodeid; struct connection *newcon; struct connection *addcon; + unsigned int mark; - mutex_lock(&connections_lock); if (!dlm_allow_conn) { - mutex_unlock(&connections_lock); return -1; } - mutex_unlock(&connections_lock); mutex_lock_nested(&con->sock_mutex, 0); @@ -830,6 +813,9 @@ static int accept_from_sock(struct connection *con) return -1; } + dlm_comm_mark(nodeid, &mark); + sock_set_mark(newsock->sk, mark); + log_print("got connection from %d", nodeid); /* Check to see if we already have a connection to this node. 
This @@ -847,13 +833,24 @@ static int accept_from_sock(struct connection *con) struct connection *othercon = newcon->othercon; if (!othercon) { - othercon = kmem_cache_zalloc(con_cache, GFP_NOFS); + othercon = kzalloc(sizeof(*othercon), GFP_NOFS); if (!othercon) { log_print("failed to allocate incoming socket"); mutex_unlock(&newcon->sock_mutex); result = -ENOMEM; goto accept_err; } + + othercon->rx_buflen = dlm_config.ci_buffer_size; + othercon->rx_buf = kmalloc(othercon->rx_buflen, GFP_NOFS); + if (!othercon->rx_buf) { + mutex_unlock(&newcon->sock_mutex); + kfree(othercon); + log_print("failed to allocate incoming socket receive buffer"); + result = -ENOMEM; + goto accept_err; + } + othercon->nodeid = nodeid; othercon->rx_action = receive_from_sock; mutex_init(&othercon->sock_mutex); @@ -975,6 +972,8 @@ static void sctp_connect_to_sock(struct connection *con) return; } + dlm_comm_mark(con->nodeid, &mark); + mutex_lock(&con->sock_mutex); /* Some odd races can cause double-connects, ignore them */ @@ -999,11 +998,6 @@ static void sctp_connect_to_sock(struct connection *con) if (result < 0) goto socket_err; - /* set skb mark */ - result = dlm_comm_mark(con->nodeid, &mark); - if (result < 0) - goto bind_err; - sock_set_mark(sock->sk, mark); con->rx_action = receive_from_sock; @@ -1076,6 +1070,8 @@ static void tcp_connect_to_sock(struct connection *con) return; } + dlm_comm_mark(con->nodeid, &mark); + mutex_lock(&con->sock_mutex); if (con->retries++ > MAX_CONNECT_RETRIES) goto out; @@ -1090,11 +1086,6 @@ static void tcp_connect_to_sock(struct connection *con) if (result < 0) goto out_err; - /* set skb mark */ - result = dlm_comm_mark(con->nodeid, &mark); - if (result < 0) - goto out_err; - sock_set_mark(sock->sk, mark); memset(&saddr, 0, sizeof(saddr)); @@ -1238,6 +1229,14 @@ static void init_local(void) } } +static void deinit_local(void) +{ + int i; + + for (i = 0; i < dlm_local_count; i++) + kfree(dlm_local_addr[i]); +} + /* Initialise SCTP socket and bind to all interfaces */ static int sctp_listen_for_all(void) { @@ -1546,13 +1545,6 @@ static void process_send_sockets(struct work_struct *work) send_to_sock(con); } - -/* Discard all entries on the write queues */ -static void clean_writequeues(void) -{ - foreach_conn(clean_one_writequeue); -} - static void work_stop(void) { if (recv_workqueue) @@ -1608,26 +1600,34 @@ static void shutdown_conn(struct connection *con) con->shutdown_action(con); } +static void connection_release(struct rcu_head *rcu) +{ + struct connection *con = container_of(rcu, struct connection, rcu); + + kfree(con->rx_buf); + kfree(con); +} + static void free_conn(struct connection *con) { close_connection(con, true, true, true); - if (con->othercon) - kmem_cache_free(con_cache, con->othercon); - hlist_del(&con->list); - kmem_cache_free(con_cache, con); + spin_lock(&connections_lock); + hlist_del_rcu(&con->list); + spin_unlock(&connections_lock); + if (con->othercon) { + clean_one_writequeue(con->othercon); + call_rcu(&con->othercon->rcu, connection_release); + } + clean_one_writequeue(con); + call_rcu(&con->rcu, connection_release); } static void work_flush(void) { - int ok; + int ok, idx; int i; - struct hlist_node *n; struct connection *con; - if (recv_workqueue) - flush_workqueue(recv_workqueue); - if (send_workqueue) - flush_workqueue(send_workqueue); do { ok = 1; foreach_conn(stop_conn); @@ -1635,9 +1635,10 @@ static void work_flush(void) flush_workqueue(recv_workqueue); if (send_workqueue) flush_workqueue(send_workqueue); + idx = 
srcu_read_lock(&connections_srcu); for (i = 0; i < CONN_HASH_SIZE && ok; i++) { - hlist_for_each_entry_safe(con, n, - &connection_hash[i], list) { + hlist_for_each_entry_rcu(con, &connection_hash[i], + list) { ok &= test_bit(CF_READ_PENDING, &con->flags); ok &= test_bit(CF_WRITE_PENDING, &con->flags); if (con->othercon) { @@ -1648,6 +1649,7 @@ static void work_flush(void) } } } + srcu_read_unlock(&connections_srcu, idx); } while (!ok); } @@ -1656,16 +1658,18 @@ void dlm_lowcomms_stop(void) /* Set all the flags to prevent any socket activity. */ - mutex_lock(&connections_lock); dlm_allow_conn = 0; - mutex_unlock(&connections_lock); + + if (recv_workqueue) + flush_workqueue(recv_workqueue); + if (send_workqueue) + flush_workqueue(send_workqueue); + foreach_conn(shutdown_conn); work_flush(); - clean_writequeues(); foreach_conn(free_conn); work_stop(); - - kmem_cache_destroy(con_cache); + deinit_local(); } int dlm_lowcomms_start(void) @@ -1684,16 +1688,9 @@ int dlm_lowcomms_start(void) goto fail; } - error = -ENOMEM; - con_cache = kmem_cache_create("dlm_conn", sizeof(struct connection), - __alignof__(struct connection), 0, - NULL); - if (!con_cache) - goto fail; - error = work_start(); if (error) - goto fail_destroy; + goto fail; dlm_allow_conn = 1; @@ -1710,12 +1707,8 @@ int dlm_lowcomms_start(void) fail_unlisten: dlm_allow_conn = 0; con = nodeid2con(0,0); - if (con) { - close_connection(con, false, true, true); - kmem_cache_free(con_cache, con); - } -fail_destroy: - kmem_cache_destroy(con_cache); + if (con) + free_conn(con); fail: return error; } diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 921322d133e3..fde3a6afe4be 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -22,114 +22,84 @@ * into packets and sends them to the comms layer. */ +#include <asm/unaligned.h> + #include "dlm_internal.h" #include "lowcomms.h" #include "config.h" #include "lock.h" #include "midcomms.h" - -static void copy_from_cb(void *dst, const void *base, unsigned offset, - unsigned len, unsigned limit) -{ - unsigned copy = len; - - if ((copy + offset) > limit) - copy = limit - offset; - memcpy(dst, base + offset, copy); - len -= copy; - if (len) - memcpy(dst + copy, base, len); -} - /* * Called from the low-level comms layer to process a buffer of * commands. - * - * Only complete messages are processed here, any "spare" bytes from - * the end of a buffer are saved and tacked onto the front of the next - * message that comes in. I doubt this will happen very often but we - * need to be able to cope with it and I don't want the task to be waiting - * for packets to come in when there is useful work to be done. */ -int dlm_process_incoming_buffer(int nodeid, const void *base, - unsigned offset, unsigned len, unsigned limit) +int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) { - union { - unsigned char __buf[DLM_INBUF_LEN]; - /* this is to force proper alignment on some arches */ - union dlm_packet p; - } __tmp; - union dlm_packet *p = &__tmp.p; - int ret = 0; - int err = 0; + const unsigned char *ptr = buf; + const struct dlm_header *hd; uint16_t msglen; - uint32_t lockspace; - - while (len > sizeof(struct dlm_header)) { - - /* Copy just the header to check the total length. The - message may wrap around the end of the buffer back to the - start, so we need to use a temp buffer and copy_from_cb. 
*/ - - copy_from_cb(p, base, offset, sizeof(struct dlm_header), - limit); - - msglen = le16_to_cpu(p->header.h_length); - lockspace = p->header.h_lockspace; + int ret = 0; - err = -EINVAL; - if (msglen < sizeof(struct dlm_header)) - break; - if (p->header.h_cmd == DLM_MSG) { - if (msglen < sizeof(struct dlm_message)) - break; - } else { - if (msglen < sizeof(struct dlm_rcom)) - break; - } - err = -E2BIG; - if (msglen > dlm_config.ci_buffer_size) { - log_print("message size %d from %d too big, buf len %d", - msglen, nodeid, len); - break; + while (len >= sizeof(struct dlm_header)) { + hd = (struct dlm_header *)ptr; + + /* no message should be more than this otherwise we + * cannot deliver this message to upper layers + */ + msglen = get_unaligned_le16(&hd->h_length); + if (msglen > DEFAULT_BUFFER_SIZE) { + log_print("received invalid length header: %u, will abort message parsing", + msglen); + return -EBADMSG; } - err = 0; - - /* If only part of the full message is contained in this - buffer, then do nothing and wait for lowcomms to call - us again later with more data. We return 0 meaning - we've consumed none of the input buffer. */ + /* caller will take care that leftover + * will be parsed next call with more data + */ if (msglen > len) break; - /* Allocate a larger temp buffer if the full message won't fit - in the buffer on the stack (which should work for most - ordinary messages). */ - - if (msglen > sizeof(__tmp) && p == &__tmp.p) { - p = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS); - if (p == NULL) - return ret; - } + switch (hd->h_cmd) { + case DLM_MSG: + if (msglen < sizeof(struct dlm_message)) { + log_print("dlm msg too small: %u, will skip this message", + msglen); + goto skip; + } - copy_from_cb(p, base, offset, msglen, limit); + break; + case DLM_RCOM: + if (msglen < sizeof(struct dlm_rcom)) { + log_print("dlm rcom msg too small: %u, will skip this message", + msglen); + goto skip; + } - BUG_ON(lockspace != p->header.h_lockspace); + break; + default: + log_print("unsupported h_cmd received: %u, will skip this message", + hd->h_cmd); + goto skip; + } + /* for aligned memory access, we just copy current message + * to begin of the buffer which contains already parsed buffer + * data and should provide align access for upper layers + * because the start address of the buffer has a aligned + * address. This memmove can be removed when the upperlayer + * is capable of unaligned memory access. + */ + memmove(buf, ptr, msglen); + dlm_receive_buffer((union dlm_packet *)buf, nodeid); + +skip: ret += msglen; - offset += msglen; - offset &= (limit - 1); len -= msglen; - - dlm_receive_buffer(p, nodeid); + ptr += msglen; } - if (p != &__tmp.p) - kfree(p); - - return err ? 
err : ret; + return ret; } diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h index 2e122e81c8d0..61e90a921849 100644 --- a/fs/dlm/midcomms.h +++ b/fs/dlm/midcomms.h @@ -12,8 +12,7 @@ #ifndef __MIDCOMMS_DOT_H__ #define __MIDCOMMS_DOT_H__ -int dlm_process_incoming_buffer(int nodeid, const void *base, unsigned offset, - unsigned len, unsigned limit); +int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int buflen); #endif /* __MIDCOMMS_DOT_H__ */ diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index e338c407cb75..67f68d48d60c 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -62,7 +62,7 @@ static int user_cmd(struct sk_buff *skb, struct genl_info *info) return 0; } -static const struct genl_ops dlm_nl_ops[] = { +static const struct genl_small_ops dlm_nl_ops[] = { { .cmd = DLM_CMD_HELLO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, @@ -73,8 +73,8 @@ static const struct genl_ops dlm_nl_ops[] = { static struct genl_family family __ro_after_init = { .name = DLM_GENL_NAME, .version = DLM_GENL_VERSION, - .ops = dlm_nl_ops, - .n_ops = ARRAY_SIZE(dlm_nl_ops), + .small_ops = dlm_nl_ops, + .n_small_ops = ARRAY_SIZE(dlm_nl_ops), .module = THIS_MODULE, }; diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 28bb5689333a..15880a68faad 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -141,6 +141,9 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, name[len + EFI_VARIABLE_GUID_LEN+1] = '\0'; + /* replace invalid slashes like kobject_set_name_vargs does for /sys/firmware/efi/vars. */ + strreplace(name, '/', '!'); + inode = efivarfs_get_inode(sb, d_inode(root), S_IFREG | 0644, 0, is_removable); if (!inode) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 459ecb42cbd3..347be146884c 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -224,7 +224,7 @@ submit_bio_retry: bio_set_dev(bio, sb->s_bdev); bio->bi_iter.bi_sector = (sector_t)blknr << LOG_SECTORS_PER_BLOCK; - bio->bi_opf = REQ_OP_READ; + bio->bi_opf = REQ_OP_READ | (ra ? 
REQ_RAHEAD : 0); } err = bio_add_page(bio, page, PAGE_SIZE, 0); diff --git a/fs/erofs/super.c b/fs/erofs/super.c index ddaa516c008a..b9a09806512a 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -211,9 +211,7 @@ static void erofs_default_options(struct erofs_fs_context *ctx) enum { Opt_user_xattr, - Opt_nouser_xattr, Opt_acl, - Opt_noacl, Opt_cache_strategy, Opt_err }; diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index c8c381eadcd6..5bde77d70852 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -473,8 +473,6 @@ static int erofs_xattr_generic_get(const struct xattr_handler *handler, return -EOPNOTSUPP; break; case EROFS_XATTR_INDEX_TRUSTED: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; break; case EROFS_XATTR_INDEX_SECURITY: break; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 6c939def00f9..50912a5420b4 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -135,6 +135,7 @@ struct z_erofs_decompress_frontend { struct z_erofs_collector clt; struct erofs_map_blocks map; + bool readahead; /* used for applying cache strategy on the fly */ bool backmost; erofs_off_t headoffset; @@ -153,8 +154,7 @@ static DEFINE_MUTEX(z_pagemap_global_lock); static void preload_compressed_pages(struct z_erofs_collector *clt, struct address_space *mc, - enum z_erofs_cache_alloctype type, - struct list_head *pagepool) + enum z_erofs_cache_alloctype type) { const struct z_erofs_pcluster *pcl = clt->pcl; const unsigned int clusterpages = BIT(pcl->clusterbits); @@ -562,8 +562,7 @@ static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe, } static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, - struct page *page, - struct list_head *pagepool) + struct page *page) { struct inode *const inode = fe->inode; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); @@ -620,8 +619,7 @@ restart_now: else cache_strategy = DONTALLOC; - preload_compressed_pages(clt, MNGD_MAPPING(sbi), - cache_strategy, pagepool); + preload_compressed_pages(clt, MNGD_MAPPING(sbi), cache_strategy); hitted: /* @@ -653,7 +651,7 @@ retry: /* should allocate an additional staging page for pagevec */ if (err == -EAGAIN) { struct page *const newpage = - erofs_allocpage(pagepool, GFP_NOFS | __GFP_NOFAIL); + alloc_page(GFP_NOFS | __GFP_NOFAIL); newpage->mapping = Z_EROFS_MAPPING_STAGING; err = z_erofs_attach_page(clt, newpage, @@ -1151,7 +1149,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, } static void z_erofs_submit_queue(struct super_block *sb, - z_erofs_next_pcluster_t owned_head, + struct z_erofs_decompress_frontend *f, struct list_head *pagepool, struct z_erofs_decompressqueue *fgq, bool *force_fg) @@ -1160,6 +1158,7 @@ static void z_erofs_submit_queue(struct super_block *sb, z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; void *bi_private; + z_erofs_next_pcluster_t owned_head = f->clt.owned_head; /* since bio will be NULL, no need to initialize last_index */ pgoff_t last_index; unsigned int nr_bios = 0; @@ -1193,7 +1192,6 @@ static void z_erofs_submit_queue(struct super_block *sb, do { struct page *page; - int err; page = pickup_page_for_submission(pcl, i++, pagepool, MNGD_MAPPING(sbi), @@ -1216,11 +1214,12 @@ submit_bio_retry: LOG_SECTORS_PER_BLOCK; bio->bi_private = bi_private; bio->bi_opf = REQ_OP_READ; + if (f->readahead) + bio->bi_opf |= REQ_RAHEAD; ++nr_bios; } - err = bio_add_page(bio, page, PAGE_SIZE, 0); - if (err < PAGE_SIZE) + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) goto submit_bio_retry; 
last_index = cur; @@ -1248,14 +1247,14 @@ submit_bio_retry: } static void z_erofs_runqueue(struct super_block *sb, - struct z_erofs_collector *clt, + struct z_erofs_decompress_frontend *f, struct list_head *pagepool, bool force_fg) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; - if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) + if (f->clt.owned_head == Z_EROFS_PCLUSTER_TAIL) return; - z_erofs_submit_queue(sb, clt->owned_head, pagepool, io, &force_fg); + z_erofs_submit_queue(sb, f, pagepool, io, &force_fg); /* handle bypass queue (no i/o pclusters) immediately */ z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool); @@ -1282,11 +1281,11 @@ static int z_erofs_readpage(struct file *file, struct page *page) f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT; - err = z_erofs_do_read_page(&f, page, &pagepool); + err = z_erofs_do_read_page(&f, page); (void)z_erofs_collector_end(&f.clt); /* if some compressed cluster ready, need submit them anyway */ - z_erofs_runqueue(inode->i_sb, &f.clt, &pagepool, true); + z_erofs_runqueue(inode->i_sb, &f, &pagepool, true); if (err) erofs_err(inode->i_sb, "failed to read, err [%d]", err); @@ -1299,25 +1298,20 @@ static int z_erofs_readpage(struct file *file, struct page *page) return err; } -static bool should_decompress_synchronously(struct erofs_sb_info *sbi, - unsigned int nr) -{ - return nr <= sbi->ctx.max_sync_decompress_pages; -} - static void z_erofs_readahead(struct readahead_control *rac) { struct inode *const inode = rac->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); - bool sync = should_decompress_synchronously(sbi, readahead_count(rac)); + unsigned int nr_pages = readahead_count(rac); + bool sync = (nr_pages <= sbi->ctx.max_sync_decompress_pages); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); struct page *page, *head = NULL; LIST_HEAD(pagepool); - trace_erofs_readpages(inode, readahead_index(rac), - readahead_count(rac), false); + trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false); + f.readahead = true; f.headoffset = readahead_pos(rac); while ((page = readahead_page(rac))) { @@ -1341,7 +1335,7 @@ static void z_erofs_readahead(struct readahead_control *rac) /* traversal in reverse order */ head = (void *)page_private(page); - err = z_erofs_do_read_page(&f, page, &pagepool); + err = z_erofs_do_read_page(&f, page); if (err) erofs_err(inode->i_sb, "readahead error at page %lu @ nid %llu", @@ -1351,7 +1345,7 @@ static void z_erofs_readahead(struct readahead_control *rac) (void)z_erofs_collector_end(&f.clt); - z_erofs_runqueue(inode->i_sb, &f.clt, &pagepool, sync); + z_erofs_runqueue(inode->i_sb, &f, &pagepool, sync); if (f.map.mpage) put_page(f.map.mpage); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 8107e06d7f6f..4df61129566d 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -218,8 +218,7 @@ struct eventpoll { struct file *file; /* used to optimize loop detection check */ - struct list_head visited_list_link; - int visited; + u64 gen; #ifdef CONFIG_NET_RX_BUSY_POLL /* used to track busy poll napi_id */ @@ -274,6 +273,8 @@ static long max_user_watches __read_mostly; */ static DEFINE_MUTEX(epmutex); +static u64 loop_check_gen = 0; + /* Used to check for epoll file descriptor inclusion loops */ static struct nested_calls poll_loop_ncalls; @@ -283,9 +284,6 @@ static struct kmem_cache *epi_cache __read_mostly; /* Slab cache used to allocate "struct eppoll_entry" */ static struct kmem_cache *pwq_cache __read_mostly; -/* Visited nodes during ep_loop_check(), so we can unset them when 
we finish */ -static LIST_HEAD(visited_list); - /* * List of files with newly added links, where we may need to limit the number * of emanating paths. Protected by the epmutex. @@ -1450,7 +1448,7 @@ static int reverse_path_check(void) static int ep_create_wakeup_source(struct epitem *epi) { - const char *name; + struct name_snapshot n; struct wakeup_source *ws; if (!epi->ep->ws) { @@ -1459,8 +1457,9 @@ static int ep_create_wakeup_source(struct epitem *epi) return -ENOMEM; } - name = epi->ffd.file->f_path.dentry->d_name.name; - ws = wakeup_source_register(NULL, name); + take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry); + ws = wakeup_source_register(NULL, n.name.name); + release_dentry_name_snapshot(&n); if (!ws) return -ENOMEM; @@ -1522,6 +1521,22 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, RCU_INIT_POINTER(epi->ws, NULL); } + /* Add the current item to the list of active epoll hook for this file */ + spin_lock(&tfile->f_lock); + list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links); + spin_unlock(&tfile->f_lock); + + /* + * Add the current item to the RB tree. All RB tree operations are + * protected by "mtx", and ep_insert() is called with "mtx" held. + */ + ep_rbtree_insert(ep, epi); + + /* now check if we've created too many backpaths */ + error = -EINVAL; + if (full_check && reverse_path_check()) + goto error_remove_epi; + /* Initialize the poll table using the queue callback */ epq.epi = epi; init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); @@ -1544,22 +1559,6 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, if (epi->nwait < 0) goto error_unregister; - /* Add the current item to the list of active epoll hook for this file */ - spin_lock(&tfile->f_lock); - list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links); - spin_unlock(&tfile->f_lock); - - /* - * Add the current item to the RB tree. All RB tree operations are - * protected by "mtx", and ep_insert() is called with "mtx" held. - */ - ep_rbtree_insert(ep, epi); - - /* now check if we've created too many backpaths */ - error = -EINVAL; - if (full_check && reverse_path_check()) - goto error_remove_epi; - /* We have to drop the new item inside our item list to keep track of it */ write_lock_irq(&ep->lock); @@ -1588,6 +1587,8 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, return 0; +error_unregister: + ep_unregister_pollwait(ep, epi); error_remove_epi: spin_lock(&tfile->f_lock); list_del_rcu(&epi->fllink); @@ -1595,9 +1596,6 @@ error_remove_epi: rb_erase_cached(&epi->rbn, &ep->rbr); -error_unregister: - ep_unregister_pollwait(ep, epi); - /* * We need to do this because an event could have been arrived on some * allocated wait queue. 
Note that we don't care about the ep->ovflist @@ -1972,13 +1970,12 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) struct epitem *epi; mutex_lock_nested(&ep->mtx, call_nests + 1); - ep->visited = 1; - list_add(&ep->visited_list_link, &visited_list); + ep->gen = loop_check_gen; for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (unlikely(is_file_epoll(epi->ffd.file))) { ep_tovisit = epi->ffd.file->private_data; - if (ep_tovisit->visited) + if (ep_tovisit->gen == loop_check_gen) continue; error = ep_call_nested(&poll_loop_ncalls, ep_loop_check_proc, epi->ffd.file, @@ -2019,18 +2016,8 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) */ static int ep_loop_check(struct eventpoll *ep, struct file *file) { - int ret; - struct eventpoll *ep_cur, *ep_next; - - ret = ep_call_nested(&poll_loop_ncalls, + return ep_call_nested(&poll_loop_ncalls, ep_loop_check_proc, file, ep, current); - /* clear visited list */ - list_for_each_entry_safe(ep_cur, ep_next, &visited_list, - visited_list_link) { - ep_cur->visited = 0; - list_del(&ep_cur->visited_list_link); - } - return ret; } static void clear_tfile_check_list(void) @@ -2195,11 +2182,13 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, goto error_tgt_fput; if (op == EPOLL_CTL_ADD) { if (!list_empty(&f.file->f_ep_links) || + ep->gen == loop_check_gen || is_file_epoll(tf.file)) { mutex_unlock(&ep->mtx); error = epoll_mutex_lock(&epmutex, 0, nonblock); if (error) goto error_tgt_fput; + loop_check_gen++; full_check = 1; if (is_file_epoll(tf.file)) { error = -ELOOP; @@ -2263,6 +2252,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, error_tgt_fput: if (full_check) { clear_tfile_check_list(); + loop_check_gen++; mutex_unlock(&epmutex); } diff --git a/fs/exec.c b/fs/exec.c index a91003e28eaa..547a2390baf5 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -23,6 +23,7 @@ * formats. 
*/ +#include <linux/kernel_read_file.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/fdtable.h> @@ -62,6 +63,7 @@ #include <linux/oom.h> #include <linux/compat.h> #include <linux/vmalloc.h> +#include <linux/io_uring.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> @@ -949,137 +951,6 @@ struct file *open_exec(const char *name) } EXPORT_SYMBOL(open_exec); -int kernel_read_file(struct file *file, void **buf, loff_t *size, - loff_t max_size, enum kernel_read_file_id id) -{ - loff_t i_size, pos; - ssize_t bytes = 0; - int ret; - - if (!S_ISREG(file_inode(file)->i_mode) || max_size < 0) - return -EINVAL; - - ret = deny_write_access(file); - if (ret) - return ret; - - ret = security_kernel_read_file(file, id); - if (ret) - goto out; - - i_size = i_size_read(file_inode(file)); - if (i_size <= 0) { - ret = -EINVAL; - goto out; - } - if (i_size > SIZE_MAX || (max_size > 0 && i_size > max_size)) { - ret = -EFBIG; - goto out; - } - - if (id != READING_FIRMWARE_PREALLOC_BUFFER) - *buf = vmalloc(i_size); - if (!*buf) { - ret = -ENOMEM; - goto out; - } - - pos = 0; - while (pos < i_size) { - bytes = kernel_read(file, *buf + pos, i_size - pos, &pos); - if (bytes < 0) { - ret = bytes; - goto out_free; - } - - if (bytes == 0) - break; - } - - if (pos != i_size) { - ret = -EIO; - goto out_free; - } - - ret = security_kernel_post_read_file(file, *buf, i_size, id); - if (!ret) - *size = pos; - -out_free: - if (ret < 0) { - if (id != READING_FIRMWARE_PREALLOC_BUFFER) { - vfree(*buf); - *buf = NULL; - } - } - -out: - allow_write_access(file); - return ret; -} -EXPORT_SYMBOL_GPL(kernel_read_file); - -int kernel_read_file_from_path(const char *path, void **buf, loff_t *size, - loff_t max_size, enum kernel_read_file_id id) -{ - struct file *file; - int ret; - - if (!path || !*path) - return -EINVAL; - - file = filp_open(path, O_RDONLY, 0); - if (IS_ERR(file)) - return PTR_ERR(file); - - ret = kernel_read_file(file, buf, size, max_size, id); - fput(file); - return ret; -} -EXPORT_SYMBOL_GPL(kernel_read_file_from_path); - -int kernel_read_file_from_path_initns(const char *path, void **buf, - loff_t *size, loff_t max_size, - enum kernel_read_file_id id) -{ - struct file *file; - struct path root; - int ret; - - if (!path || !*path) - return -EINVAL; - - task_lock(&init_task); - get_fs_root(init_task.fs, &root); - task_unlock(&init_task); - - file = file_open_root(root.dentry, root.mnt, path, O_RDONLY, 0); - path_put(&root); - if (IS_ERR(file)) - return PTR_ERR(file); - - ret = kernel_read_file(file, buf, size, max_size, id); - fput(file); - return ret; -} -EXPORT_SYMBOL_GPL(kernel_read_file_from_path_initns); - -int kernel_read_file_from_fd(int fd, void **buf, loff_t *size, loff_t max_size, - enum kernel_read_file_id id) -{ - struct fd f = fdget(fd); - int ret = -EBADF; - - if (!f.file) - goto out; - - ret = kernel_read_file(f.file, buf, size, max_size, id); -out: - fdput(f); - return ret; -} -EXPORT_SYMBOL_GPL(kernel_read_file_from_fd); - #if defined(CONFIG_HAVE_AOUT) || defined(CONFIG_BINFMT_FLAT) || \ defined(CONFIG_BINFMT_ELF_FDPIC) ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) @@ -1130,11 +1001,24 @@ static int exec_mmap(struct mm_struct *mm) } task_lock(tsk); - active_mm = tsk->active_mm; membarrier_exec_mmap(mm); - tsk->mm = mm; + + local_irq_disable(); + active_mm = tsk->active_mm; tsk->active_mm = mm; + tsk->mm = mm; + /* + * This prevents preemption while active_mm is being loaded and + * it and mm are being updated, which could cause problems 
for + * lazy tlb mm refcounting when these are updated by context + * switches. Not all architectures can handle irqs off over + * activate_mm yet. + */ + if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) + local_irq_enable(); activate_mm(active_mm, mm); + if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) + local_irq_enable(); tsk->mm->vmacache_seqnum = 0; vmacache_flush(tsk); task_unlock(tsk); @@ -1895,6 +1779,11 @@ static int bprm_execve(struct linux_binprm *bprm, struct files_struct *displaced; int retval; + /* + * Cancel any io_uring activity across execve + */ + io_uring_task_cancel(); + retval = unshare_files(&displaced); if (retval) return retval; diff --git a/fs/exfat/cache.c b/fs/exfat/cache.c index 03d0824fc368..5a2f119b7e8c 100644 --- a/fs/exfat/cache.c +++ b/fs/exfat/cache.c @@ -17,7 +17,6 @@ #include "exfat_raw.h" #include "exfat_fs.h" -#define EXFAT_CACHE_VALID 0 #define EXFAT_MAX_CACHE 16 struct exfat_cache { @@ -61,16 +60,6 @@ void exfat_cache_shutdown(void) kmem_cache_destroy(exfat_cachep); } -void exfat_cache_init_inode(struct inode *inode) -{ - struct exfat_inode_info *ei = EXFAT_I(inode); - - spin_lock_init(&ei->cache_lru_lock); - ei->nr_caches = 0; - ei->cache_valid_id = EXFAT_CACHE_VALID + 1; - INIT_LIST_HEAD(&ei->cache_lru); -} - static inline struct exfat_cache *exfat_cache_alloc(void) { return kmem_cache_alloc(exfat_cachep, GFP_NOFS); diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 95d717f8620c..c013fe931d9c 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -248,6 +248,8 @@ struct exfat_sb_info { struct rcu_head rcu; }; +#define EXFAT_CACHE_VALID 0 + /* * EXFAT file system inode in-memory data */ @@ -428,7 +430,6 @@ extern const struct dentry_operations exfat_utf8_dentry_ops; /* cache.c */ int exfat_cache_init(void); void exfat_cache_shutdown(void); -void exfat_cache_init_inode(struct inode *inode); void exfat_cache_inval_inode(struct inode *inode); int exfat_get_cluster(struct inode *inode, unsigned int cluster, unsigned int *fclus, unsigned int *dclus, diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 7f90204adef5..a6de17cac3df 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -611,8 +611,6 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info) ei->i_crtime = info->crtime; inode->i_atime = info->atime; - exfat_cache_init_inode(inode); - return 0; } diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index e73f20f66cb2..c94ac239f740 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -578,7 +578,8 @@ static int exfat_create(struct inode *dir, struct dentry *dentry, umode_t mode, i_pos = exfat_make_i_pos(&info); inode = exfat_build_inode(sb, &info, i_pos); - if (IS_ERR(inode)) + err = PTR_ERR_OR_ZERO(inode); + if (err) goto unlock; inode_inc_iversion(inode); @@ -745,10 +746,9 @@ static struct dentry *exfat_lookup(struct inode *dir, struct dentry *dentry, i_pos = exfat_make_i_pos(&info); inode = exfat_build_inode(sb, &info, i_pos); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); + err = PTR_ERR_OR_ZERO(inode); + if (err) goto unlock; - } i_mode = inode->i_mode; alias = d_find_alias(inode); @@ -890,10 +890,9 @@ static int exfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) i_pos = exfat_make_i_pos(&info); inode = exfat_build_inode(sb, &info, i_pos); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); + err = PTR_ERR_OR_ZERO(inode); + if (err) goto unlock; - } inode_inc_iversion(inode); inode->i_mtime = inode->i_atime = inode->i_ctime = diff --git a/fs/exfat/super.c b/fs/exfat/super.c 
index 3b6a1659892f..60b941ba557b 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -376,7 +376,6 @@ static int exfat_read_root(struct inode *inode) inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime = current_time(inode); exfat_truncate_atime(&inode->i_atime); - exfat_cache_init_inode(inode); return 0; } @@ -763,6 +762,10 @@ static void exfat_inode_init_once(void *foo) { struct exfat_inode_info *ei = (struct exfat_inode_info *)foo; + spin_lock_init(&ei->cache_lru_lock); + ei->nr_caches = 0; + ei->cache_valid_id = EXFAT_CACHE_VALID + 1; + INIT_LIST_HEAD(&ei->cache_lru); INIT_HLIST_NODE(&ei->i_hash_fat); inode_init_once(&ei->vfs_inode); } diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index fa9c951d3471..1f3f4326bf3c 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -189,7 +189,7 @@ static void group_adjust_blocks(struct super_block *sb, int group_no, /** * __rsv_window_dump() -- Dump the filesystem block allocation reservation map - * @rb_root: root of per-filesystem reservation rb tree + * @root: root of per-filesystem reservation rb tree * @verbose: verbose mode * @fn: function which wishes to dump the reservation map * @@ -282,7 +282,7 @@ goal_in_my_reservation(struct ext2_reserve_window *rsv, ext2_grpblk_t grp_goal, /** * search_reserve_window() - * @rb_root: root of reservation tree + * @root: root of reservation tree * @goal: target allocation block * * Find the reserved window which includes the goal, or the previous one @@ -859,7 +859,7 @@ static int find_next_reservable_window( * * failed: we failed to find a reservation window in this group * - * @rsv: the reservation + * @my_rsv: the reservation * * @grp_goal: The goal (group-relative). It is where the search for a * free reservable space should start from. diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 415c21f0e750..11c5c6fe75bb 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -36,7 +36,6 @@ #include <linux/iomap.h> #include <linux/namei.h> #include <linux/uio.h> -#include <linux/fiemap.h> #include "ext2.h" #include "acl.h" #include "xattr.h" diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 1d82336b1cd4..efe77cffc322 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -148,7 +148,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) } if (IS_ENCRYPTED(inode)) { - err = fscrypt_fname_alloc_buffer(inode, EXT4_NAME_LEN, &fstr); + err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN, &fstr); if (err < 0) return err; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 523e00d7b392..f9a692c0a66c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1401,7 +1401,7 @@ struct ext4_super_block { #define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ #ifdef CONFIG_FS_ENCRYPTION -#define DUMMY_ENCRYPTION_ENABLED(sbi) ((sbi)->s_dummy_enc_ctx.ctx != NULL) +#define DUMMY_ENCRYPTION_ENABLED(sbi) ((sbi)->s_dummy_enc_policy.policy != NULL) #else #define DUMMY_ENCRYPTION_ENABLED(sbi) (0) #endif @@ -1596,8 +1596,8 @@ struct ext4_sb_info { atomic_t s_warning_count; atomic_t s_msg_count; - /* Encryption context for '-o test_dummy_encryption' */ - struct fscrypt_dummy_context s_dummy_enc_ctx; + /* Encryption policy for '-o test_dummy_encryption' */ + struct fscrypt_dummy_policy s_dummy_enc_policy; /* * Barrier between writepages ops and changing any inode's JOURNAL_DATA diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index df25d38d6539..698ca4a4db5f 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -742,6 +742,53 @@ not_found: return 1; } +static int ext4_xattr_credits_for_new_inode(struct 
inode *dir, mode_t mode, + bool encrypt) +{ + struct super_block *sb = dir->i_sb; + int nblocks = 0; +#ifdef CONFIG_EXT4_FS_POSIX_ACL + struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT); + + if (IS_ERR(p)) + return PTR_ERR(p); + if (p) { + int acl_size = p->a_count * sizeof(ext4_acl_entry); + + nblocks += (S_ISDIR(mode) ? 2 : 1) * + __ext4_xattr_set_credits(sb, NULL /* inode */, + NULL /* block_bh */, acl_size, + true /* is_create */); + posix_acl_release(p); + } +#endif + +#ifdef CONFIG_SECURITY + { + int num_security_xattrs = 1; + +#ifdef CONFIG_INTEGRITY + num_security_xattrs++; +#endif + /* + * We assume that security xattrs are never more than 1k. + * In practice they are under 128 bytes. + */ + nblocks += num_security_xattrs * + __ext4_xattr_set_credits(sb, NULL /* inode */, + NULL /* block_bh */, 1024, + true /* is_create */); + } +#endif + if (encrypt) + nblocks += __ext4_xattr_set_credits(sb, + NULL /* inode */, + NULL /* block_bh */, + FSCRYPT_SET_CONTEXT_MAX_SIZE, + true /* is_create */); + return nblocks; +} + /* * There are two policies for allocating an inode. If the new inode is * a directory, then a forward search is made for a block group with both @@ -772,7 +819,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, ext4_group_t i; ext4_group_t flex_group; struct ext4_group_info *grp; - int encrypt = 0; + bool encrypt = false; /* Cannot create files in a deleted directory */ if (!dir || !dir->i_nlink) @@ -784,59 +831,6 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, if (unlikely(ext4_forced_shutdown(sbi))) return ERR_PTR(-EIO); - if ((IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) && - (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) && - !(i_flags & EXT4_EA_INODE_FL)) { - err = fscrypt_get_encryption_info(dir); - if (err) - return ERR_PTR(err); - if (!fscrypt_has_encryption_key(dir)) - return ERR_PTR(-ENOKEY); - encrypt = 1; - } - - if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) { -#ifdef CONFIG_EXT4_FS_POSIX_ACL - struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT); - - if (IS_ERR(p)) - return ERR_CAST(p); - if (p) { - int acl_size = p->a_count * sizeof(ext4_acl_entry); - - nblocks += (S_ISDIR(mode) ? 2 : 1) * - __ext4_xattr_set_credits(sb, NULL /* inode */, - NULL /* block_bh */, acl_size, - true /* is_create */); - posix_acl_release(p); - } -#endif - -#ifdef CONFIG_SECURITY - { - int num_security_xattrs = 1; - -#ifdef CONFIG_INTEGRITY - num_security_xattrs++; -#endif - /* - * We assume that security xattrs are never - * more than 1k. In practice they are under - * 128 bytes. 
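The new ext4_xattr_credits_for_new_inode() helper above uses the common "non-negative count or negative errno" return convention, so the caller can fold the estimate into its credit total with a single sign check (the call site appears in the __ext4_new_inode() hunk further below). A trimmed, self-contained sketch of the convention; the names and numbers are hypothetical, not the real ext4 credit math:

#include <errno.h>

/* Hypothetical stand-in: returns a credit count, or a negative errno. */
static int estimate_extra_credits(int nr_acl_entries, int encrypt)
{
	int credits = 1;		/* one security xattr, assumed small */

	if (nr_acl_entries < 0)
		return -EINVAL;		/* error from the ACL lookup */
	if (nr_acl_entries)
		credits += 2;		/* hypothetical cost of the default ACL */
	if (encrypt)
		credits++;		/* fscrypt context xattr */
	return credits;
}

/* Caller pattern, as in __ext4_new_inode(): */
int add_xattr_credits(int *nblocks, int nr_acl_entries, int encrypt)
{
	int ret = estimate_extra_credits(nr_acl_entries, encrypt);

	if (ret < 0)
		return ret;
	*nblocks += ret;
	return 0;
}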
- */ - nblocks += num_security_xattrs * - __ext4_xattr_set_credits(sb, NULL /* inode */, - NULL /* block_bh */, 1024, - true /* is_create */); - } -#endif - if (encrypt) - nblocks += __ext4_xattr_set_credits(sb, - NULL /* inode */, NULL /* block_bh */, - FSCRYPT_SET_CONTEXT_MAX_SIZE, - true /* is_create */); - } - ngroups = ext4_get_groups_count(sb); trace_ext4_request_inode(dir, mode); inode = new_inode(sb); @@ -866,10 +860,25 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, else ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID); + if (!(i_flags & EXT4_EA_INODE_FL)) { + err = fscrypt_prepare_new_inode(dir, inode, &encrypt); + if (err) + goto out; + } + err = dquot_initialize(inode); if (err) goto out; + if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) { + ret2 = ext4_xattr_credits_for_new_inode(dir, mode, encrypt); + if (ret2 < 0) { + err = ret2; + goto out; + } + nblocks += ret2; + } + if (!goal) goal = sbi->s_inode_goal; @@ -1162,7 +1171,7 @@ got: * prevent its deduplication. */ if (encrypt) { - err = fscrypt_inherit_context(dir, inode, handle, true); + err = fscrypt_set_context(inode, handle); if (err) goto fail_free_drop; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 153a9fbe1dd0..0d74615fcce3 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -663,8 +663,7 @@ static struct stats dx_show_leaf(struct inode *dir, /* Directory is encrypted */ res = fscrypt_fname_alloc_buffer( - dir, len, - &fname_crypto_str); + len, &fname_crypto_str); if (res) printk(KERN_WARNING "Error " "allocating crypto " @@ -1016,8 +1015,8 @@ static int htree_dirblock_to_tree(struct file *dir_file, brelse(bh); return err; } - err = fscrypt_fname_alloc_buffer(dir, EXT4_NAME_LEN, - &fname_crypto_str); + err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN, + &fname_crypto_str); if (err < 0) { brelse(bh); return err; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ea425b49b345..8b2736283481 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1104,7 +1104,7 @@ static void ext4_put_super(struct super_block *sb) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->s_blockgroup_lock); fs_put_dax(sbi->s_daxdev); - fscrypt_free_dummy_context(&sbi->s_dummy_enc_ctx); + fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); #ifdef CONFIG_UNICODE utf8_unload(sbi->s_encoding); #endif @@ -1392,10 +1392,9 @@ retry: return res; } -static const union fscrypt_context * -ext4_get_dummy_context(struct super_block *sb) +static const union fscrypt_policy *ext4_get_dummy_policy(struct super_block *sb) { - return EXT4_SB(sb)->s_dummy_enc_ctx.ctx; + return EXT4_SB(sb)->s_dummy_enc_policy.policy; } static bool ext4_has_stable_inodes(struct super_block *sb) @@ -1414,7 +1413,7 @@ static const struct fscrypt_operations ext4_cryptops = { .key_prefix = "ext4:", .get_context = ext4_get_context, .set_context = ext4_set_context, - .get_dummy_context = ext4_get_dummy_context, + .get_dummy_policy = ext4_get_dummy_policy, .empty_dir = ext4_empty_dir, .max_namelen = EXT4_NAME_LEN, .has_stable_inodes = ext4_has_stable_inodes, @@ -1888,12 +1887,13 @@ static int ext4_set_test_dummy_encryption(struct super_block *sb, * needed to allow it to be set or changed during remount. We do allow * it to be specified during remount, but only if there is no change. 
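In the ialloc.c hunks above, the hand-rolled encryption check (fscrypt_get_encryption_info() plus fscrypt_has_encryption_key()) is replaced by fscrypt_prepare_new_inode(), and fscrypt_inherit_context() by fscrypt_set_context(). A sketch of how the two calls sit in a new-inode path for a hypothetical filesystem (myfs_*), mirroring the ext4 usage rather than giving a complete implementation:

static int myfs_new_inode_crypto(struct inode *dir, struct inode *inode,
				 void *fs_data)
{
	bool encrypt = false;
	int err;

	/* Decides whether the child must be encrypted (encrypted parent or
	 * test_dummy_encryption) and prepares its key; sets *encrypt. */
	err = fscrypt_prepare_new_inode(dir, inode, &encrypt);
	if (err)
		return err;

	/* ... reserve credits, allocate and initialise the on-disk inode ... */

	if (encrypt) {
		/* Persist the encryption context xattr on the new inode. */
		err = fscrypt_set_context(inode, fs_data);
		if (err)
			return err;
	}
	return 0;
}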
*/ - if (is_remount && !sbi->s_dummy_enc_ctx.ctx) { + if (is_remount && !sbi->s_dummy_enc_policy.policy) { ext4_msg(sb, KERN_WARNING, "Can't set test_dummy_encryption on remount"); return -1; } - err = fscrypt_set_test_dummy_encryption(sb, arg, &sbi->s_dummy_enc_ctx); + err = fscrypt_set_test_dummy_encryption(sb, arg->from, + &sbi->s_dummy_enc_policy); if (err) { if (err == -EEXIST) ext4_msg(sb, KERN_WARNING, @@ -4935,7 +4935,7 @@ failed_mount: for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(get_qf_name(sb, sbi, i)); #endif - fscrypt_free_dummy_context(&sbi->s_dummy_enc_ctx); + fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); ext4_blkdev_remove(sbi); brelse(bh); out_fail: diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index bbd5e7e0632b..5b7ba8f71153 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -349,6 +349,7 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) { + DEFINE_READAHEAD(ractl, NULL, inode->i_mapping, index); struct page *page; index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT; @@ -358,8 +359,7 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode, if (page) put_page(page); else if (num_ra_pages > 1) - page_cache_readahead_unbounded(inode->i_mapping, NULL, - index, num_ra_pages, 0); + page_cache_ra_unbounded(&ractl, num_ra_pages, 0); page = read_mapping_page(inode->i_mapping, index, NULL); } return page; diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 217b290ae3a5..306413589827 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -160,7 +160,7 @@ static void *f2fs_acl_to_disk(struct f2fs_sb_info *sbi, return (void *)f2fs_acl; fail: - kvfree(f2fs_acl); + kfree(f2fs_acl); return ERR_PTR(-EINVAL); } @@ -190,7 +190,7 @@ static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type, acl = NULL; else acl = ERR_PTR(retval); - kvfree(value); + kfree(value); return acl; } @@ -240,7 +240,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, error = f2fs_setxattr(inode, name_index, "", value, size, ipage, 0); - kvfree(value); + kfree(value); if (!error) set_cached_acl(inode, type, acl); diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index ff807e14c891..023462e80e58 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -107,7 +107,7 @@ struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) return __get_meta_page(sbi, index, true); } -struct page *f2fs_get_meta_page_nofail(struct f2fs_sb_info *sbi, pgoff_t index) +struct page *f2fs_get_meta_page_retry(struct f2fs_sb_info *sbi, pgoff_t index) { struct page *page; int count = 0; @@ -243,6 +243,8 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, blkno * NAT_ENTRY_PER_BLOCK); break; case META_SIT: + if (unlikely(blkno >= TOTAL_SEGS(sbi))) + goto out; /* get sit block addr */ fio.new_blkaddr = current_sit_addr(sbi, blkno * SIT_ENTRY_PER_BLOCK); @@ -1047,8 +1049,12 @@ int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) get_pages(sbi, is_dir ? F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); retry: - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { + trace_f2fs_sync_dirty_inodes_exit(sbi->sb, is_dir, + get_pages(sbi, is_dir ? 
+ F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); return -EIO; + } spin_lock(&sbi->inode_lock[type]); @@ -1619,11 +1625,16 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_flush_sit_entries(sbi, cpc); + /* save inmem log status */ + f2fs_save_inmem_curseg(sbi); + err = do_checkpoint(sbi, cpc); if (err) f2fs_release_discard_addrs(sbi); else f2fs_clear_prefree_segments(sbi, cpc); + + f2fs_restore_inmem_curseg(sbi); stop: unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); @@ -1654,7 +1665,7 @@ void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi) } sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS - - NR_CURSEG_TYPE - __cp_payload(sbi)) * + NR_CURSEG_PERSIST_TYPE - __cp_payload(sbi)) * F2FS_ORPHANS_PER_BLOCK; } diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 1dfb126a0cb2..14262e0f1cd6 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -17,6 +17,33 @@ #include "node.h" #include <trace/events/f2fs.h> +static struct kmem_cache *cic_entry_slab; +static struct kmem_cache *dic_entry_slab; + +static void *page_array_alloc(struct inode *inode, int nr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned int size = sizeof(struct page *) * nr; + + if (likely(size <= sbi->page_array_slab_size)) + return kmem_cache_zalloc(sbi->page_array_slab, GFP_NOFS); + return f2fs_kzalloc(sbi, size, GFP_NOFS); +} + +static void page_array_free(struct inode *inode, void *pages, int nr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned int size = sizeof(struct page *) * nr; + + if (!pages) + return; + + if (likely(size <= sbi->page_array_slab_size)) + kmem_cache_free(sbi->page_array_slab, pages); + else + kfree(pages); +} + struct f2fs_compress_ops { int (*init_compress_ctx)(struct compress_ctx *cc); void (*destroy_compress_ctx)(struct compress_ctx *cc); @@ -130,19 +157,16 @@ struct page *f2fs_compress_control_page(struct page *page) int f2fs_init_compress_ctx(struct compress_ctx *cc) { - struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); - - if (cc->nr_rpages) + if (cc->rpages) return 0; - cc->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) << - cc->log_cluster_size, GFP_NOFS); + cc->rpages = page_array_alloc(cc->inode, cc->cluster_size); return cc->rpages ? 
0 : -ENOMEM; } void f2fs_destroy_compress_ctx(struct compress_ctx *cc) { - kfree(cc->rpages); + page_array_free(cc->inode, cc->rpages, cc->cluster_size); cc->rpages = NULL; cc->nr_rpages = 0; cc->nr_cpages = 0; @@ -382,16 +406,17 @@ static int zstd_init_decompress_ctx(struct decompress_io_ctx *dic) ZSTD_DStream *stream; void *workspace; unsigned int workspace_size; + unsigned int max_window_size = + MAX_COMPRESS_WINDOW_SIZE(dic->log_cluster_size); - workspace_size = ZSTD_DStreamWorkspaceBound(MAX_COMPRESS_WINDOW_SIZE); + workspace_size = ZSTD_DStreamWorkspaceBound(max_window_size); workspace = f2fs_kvmalloc(F2FS_I_SB(dic->inode), workspace_size, GFP_NOFS); if (!workspace) return -ENOMEM; - stream = ZSTD_initDStream(MAX_COMPRESS_WINDOW_SIZE, - workspace, workspace_size); + stream = ZSTD_initDStream(max_window_size, workspace, workspace_size); if (!stream) { printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_initDStream failed\n", KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, @@ -554,13 +579,29 @@ static void f2fs_compress_free_page(struct page *page) mempool_free(page, compress_page_pool); } +#define MAX_VMAP_RETRIES 3 + +static void *f2fs_vmap(struct page **pages, unsigned int count) +{ + int i; + void *buf = NULL; + + for (i = 0; i < MAX_VMAP_RETRIES; i++) { + buf = vm_map_ram(pages, count, -1); + if (buf) + break; + vm_unmap_aliases(); + } + return buf; +} + static int f2fs_compress_pages(struct compress_ctx *cc) { - struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); struct f2fs_inode_info *fi = F2FS_I(cc->inode); const struct f2fs_compress_ops *cops = f2fs_cops[fi->i_compress_algorithm]; - unsigned int max_len, nr_cpages; + unsigned int max_len, new_nr_cpages; + struct page **new_cpages; int i, ret; trace_f2fs_compress_pages_start(cc->inode, cc->cluster_idx, @@ -575,8 +616,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc) max_len = COMPRESS_HEADER_SIZE + cc->clen; cc->nr_cpages = DIV_ROUND_UP(max_len, PAGE_SIZE); - cc->cpages = f2fs_kzalloc(sbi, sizeof(struct page *) * - cc->nr_cpages, GFP_NOFS); + cc->cpages = page_array_alloc(cc->inode, cc->nr_cpages); if (!cc->cpages) { ret = -ENOMEM; goto destroy_compress_ctx; @@ -590,13 +630,13 @@ static int f2fs_compress_pages(struct compress_ctx *cc) } } - cc->rbuf = vmap(cc->rpages, cc->cluster_size, VM_MAP, PAGE_KERNEL_RO); + cc->rbuf = f2fs_vmap(cc->rpages, cc->cluster_size); if (!cc->rbuf) { ret = -ENOMEM; goto out_free_cpages; } - cc->cbuf = vmap(cc->cpages, cc->nr_cpages, VM_MAP, PAGE_KERNEL); + cc->cbuf = f2fs_vmap(cc->cpages, cc->nr_cpages); if (!cc->cbuf) { ret = -ENOMEM; goto out_vunmap_rbuf; @@ -618,16 +658,28 @@ static int f2fs_compress_pages(struct compress_ctx *cc) for (i = 0; i < COMPRESS_DATA_RESERVED_SIZE; i++) cc->cbuf->reserved[i] = cpu_to_le32(0); - nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE); + new_nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE); + + /* Now we're going to cut unnecessary tail pages */ + new_cpages = page_array_alloc(cc->inode, new_nr_cpages); + if (!new_cpages) { + ret = -ENOMEM; + goto out_vunmap_cbuf; + } /* zero out any unused part of the last page */ memset(&cc->cbuf->cdata[cc->clen], 0, - (nr_cpages * PAGE_SIZE) - (cc->clen + COMPRESS_HEADER_SIZE)); + (new_nr_cpages * PAGE_SIZE) - + (cc->clen + COMPRESS_HEADER_SIZE)); - vunmap(cc->cbuf); - vunmap(cc->rbuf); + vm_unmap_ram(cc->cbuf, cc->nr_cpages); + vm_unmap_ram(cc->rbuf, cc->cluster_size); - for (i = nr_cpages; i < cc->nr_cpages; i++) { + for (i = 0; i < cc->nr_cpages; i++) { + if (i < new_nr_cpages) { + 
new_cpages[i] = cc->cpages[i]; + continue; + } f2fs_compress_free_page(cc->cpages[i]); cc->cpages[i] = NULL; } @@ -635,22 +687,24 @@ static int f2fs_compress_pages(struct compress_ctx *cc) if (cops->destroy_compress_ctx) cops->destroy_compress_ctx(cc); - cc->nr_cpages = nr_cpages; + page_array_free(cc->inode, cc->cpages, cc->nr_cpages); + cc->cpages = new_cpages; + cc->nr_cpages = new_nr_cpages; trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx, cc->clen, ret); return 0; out_vunmap_cbuf: - vunmap(cc->cbuf); + vm_unmap_ram(cc->cbuf, cc->nr_cpages); out_vunmap_rbuf: - vunmap(cc->rbuf); + vm_unmap_ram(cc->rbuf, cc->cluster_size); out_free_cpages: for (i = 0; i < cc->nr_cpages; i++) { if (cc->cpages[i]) f2fs_compress_free_page(cc->cpages[i]); } - kfree(cc->cpages); + page_array_free(cc->inode, cc->cpages, cc->nr_cpages); cc->cpages = NULL; destroy_compress_ctx: if (cops->destroy_compress_ctx) @@ -677,7 +731,7 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) if (bio->bi_status || PageError(page)) dic->failed = true; - if (refcount_dec_not_one(&dic->ref)) + if (atomic_dec_return(&dic->pending_pages)) return; trace_f2fs_decompress_pages_start(dic->inode, dic->cluster_idx, @@ -689,8 +743,7 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) goto out_free_dic; } - dic->tpages = f2fs_kzalloc(sbi, sizeof(struct page *) * - dic->cluster_size, GFP_NOFS); + dic->tpages = page_array_alloc(dic->inode, dic->cluster_size); if (!dic->tpages) { ret = -ENOMEM; goto out_free_dic; @@ -715,13 +768,13 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) goto out_free_dic; } - dic->rbuf = vmap(dic->tpages, dic->cluster_size, VM_MAP, PAGE_KERNEL); + dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size); if (!dic->rbuf) { ret = -ENOMEM; goto destroy_decompress_ctx; } - dic->cbuf = vmap(dic->cpages, dic->nr_cpages, VM_MAP, PAGE_KERNEL_RO); + dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages); if (!dic->cbuf) { ret = -ENOMEM; goto out_vunmap_rbuf; @@ -738,15 +791,15 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) ret = cops->decompress_pages(dic); out_vunmap_cbuf: - vunmap(dic->cbuf); + vm_unmap_ram(dic->cbuf, dic->nr_cpages); out_vunmap_rbuf: - vunmap(dic->rbuf); + vm_unmap_ram(dic->rbuf, dic->cluster_size); destroy_decompress_ctx: if (cops->destroy_decompress_ctx) cops->destroy_decompress_ctx(dic); out_free_dic: if (verity) - refcount_set(&dic->ref, dic->nr_cpages); + atomic_set(&dic->pending_pages, dic->nr_cpages); if (!verity) f2fs_decompress_end_io(dic->rpages, dic->cluster_size, ret, false); @@ -1029,6 +1082,7 @@ bool f2fs_compress_write_end(struct inode *inode, void *fsdata, { struct compress_ctx cc = { + .inode = inode, .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, .cluster_size = F2FS_I(inode)->i_cluster_size, .rpages = fsdata, @@ -1132,7 +1186,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, */ down_read(&sbi->node_write); } else if (!f2fs_trylock_op(sbi)) { - return -EAGAIN; + goto out_free; } set_new_dnode(&dn, cc->inode, NULL, NULL, 0); @@ -1155,15 +1209,14 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, fio.version = ni.version; - cic = f2fs_kzalloc(sbi, sizeof(struct compress_io_ctx), GFP_NOFS); + cic = kmem_cache_zalloc(cic_entry_slab, GFP_NOFS); if (!cic) goto out_put_dnode; cic->magic = F2FS_COMPRESSED_PAGE_MAGIC; cic->inode = inode; - refcount_set(&cic->ref, cc->nr_cpages); - cic->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) << - 
cc->log_cluster_size, GFP_NOFS); + atomic_set(&cic->pending_pages, cc->nr_cpages); + cic->rpages = page_array_alloc(cc->inode, cc->cluster_size); if (!cic->rpages) goto out_put_cic; @@ -1257,11 +1310,13 @@ unlock_continue: spin_unlock(&fi->i_size_lock); f2fs_put_rpages(cc); + page_array_free(cc->inode, cc->cpages, cc->nr_cpages); + cc->cpages = NULL; f2fs_destroy_compress_ctx(cc); return 0; out_destroy_crypt: - kfree(cic->rpages); + page_array_free(cc->inode, cic->rpages, cc->cluster_size); for (--i; i >= 0; i--) fscrypt_finalize_bounce_page(&cc->cpages[i]); @@ -1271,7 +1326,7 @@ out_destroy_crypt: f2fs_put_page(cc->cpages[i], 1); } out_put_cic: - kfree(cic); + kmem_cache_free(cic_entry_slab, cic); out_put_dnode: f2fs_put_dnode(&dn); out_unlock_op: @@ -1279,6 +1334,9 @@ out_unlock_op: up_read(&sbi->node_write); else f2fs_unlock_op(sbi); +out_free: + page_array_free(cc->inode, cc->cpages, cc->nr_cpages); + cc->cpages = NULL; return -EAGAIN; } @@ -1296,7 +1354,7 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page) dec_page_count(sbi, F2FS_WB_DATA); - if (refcount_dec_not_one(&cic->ref)) + if (atomic_dec_return(&cic->pending_pages)) return; for (i = 0; i < cic->nr_rpages; i++) { @@ -1305,8 +1363,8 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page) end_page_writeback(cic->rpages[i]); } - kfree(cic->rpages); - kfree(cic); + page_array_free(cic->inode, cic->rpages, cic->nr_rpages); + kmem_cache_free(cic_entry_slab, cic); } static int f2fs_write_raw_pages(struct compress_ctx *cc, @@ -1388,9 +1446,6 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, struct writeback_control *wbc, enum iostat_type io_type) { - struct f2fs_inode_info *fi = F2FS_I(cc->inode); - const struct f2fs_compress_ops *cops = - f2fs_cops[fi->i_compress_algorithm]; int err; *submitted = 0; @@ -1405,9 +1460,6 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, err = f2fs_write_compressed_pages(cc, submitted, wbc, io_type); - cops->destroy_compress_ctx(cc); - kfree(cc->cpages); - cc->cpages = NULL; if (!err) return 0; f2fs_bug_on(F2FS_I_SB(cc->inode), err != -EAGAIN); @@ -1424,25 +1476,23 @@ destroy_out: struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) { - struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); struct decompress_io_ctx *dic; pgoff_t start_idx = start_idx_of_cluster(cc); int i; - dic = f2fs_kzalloc(sbi, sizeof(struct decompress_io_ctx), GFP_NOFS); + dic = kmem_cache_zalloc(dic_entry_slab, GFP_NOFS); if (!dic) return ERR_PTR(-ENOMEM); - dic->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) << - cc->log_cluster_size, GFP_NOFS); + dic->rpages = page_array_alloc(cc->inode, cc->cluster_size); if (!dic->rpages) { - kfree(dic); + kmem_cache_free(dic_entry_slab, dic); return ERR_PTR(-ENOMEM); } dic->magic = F2FS_COMPRESSED_PAGE_MAGIC; dic->inode = cc->inode; - refcount_set(&dic->ref, cc->nr_cpages); + atomic_set(&dic->pending_pages, cc->nr_cpages); dic->cluster_idx = cc->cluster_idx; dic->cluster_size = cc->cluster_size; dic->log_cluster_size = cc->log_cluster_size; @@ -1453,8 +1503,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) dic->rpages[i] = cc->rpages[i]; dic->nr_rpages = cc->cluster_size; - dic->cpages = f2fs_kzalloc(sbi, sizeof(struct page *) * - dic->nr_cpages, GFP_NOFS); + dic->cpages = page_array_alloc(dic->inode, dic->nr_cpages); if (!dic->cpages) goto out_free; @@ -1489,7 +1538,7 @@ void f2fs_free_dic(struct decompress_io_ctx *dic) continue; f2fs_compress_free_page(dic->tpages[i]); } - kfree(dic->tpages); + 
page_array_free(dic->inode, dic->tpages, dic->cluster_size); } if (dic->cpages) { @@ -1498,11 +1547,11 @@ void f2fs_free_dic(struct decompress_io_ctx *dic) continue; f2fs_compress_free_page(dic->cpages[i]); } - kfree(dic->cpages); + page_array_free(dic->inode, dic->cpages, dic->nr_cpages); } - kfree(dic->rpages); - kfree(dic); + page_array_free(dic->inode, dic->rpages, dic->nr_rpages); + kmem_cache_free(dic_entry_slab, dic); } void f2fs_decompress_end_io(struct page **rpages, @@ -1530,3 +1579,76 @@ unlock: unlock_page(rpage); } } + +int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + char slab_name[32]; + + sprintf(slab_name, "f2fs_page_array_entry-%u:%u", MAJOR(dev), MINOR(dev)); + + sbi->page_array_slab_size = sizeof(struct page *) << + F2FS_OPTION(sbi).compress_log_size; + + sbi->page_array_slab = f2fs_kmem_cache_create(slab_name, + sbi->page_array_slab_size); + if (!sbi->page_array_slab) + return -ENOMEM; + return 0; +} + +void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) +{ + kmem_cache_destroy(sbi->page_array_slab); +} + +static int __init f2fs_init_cic_cache(void) +{ + cic_entry_slab = f2fs_kmem_cache_create("f2fs_cic_entry", + sizeof(struct compress_io_ctx)); + if (!cic_entry_slab) + return -ENOMEM; + return 0; +} + +static void f2fs_destroy_cic_cache(void) +{ + kmem_cache_destroy(cic_entry_slab); +} + +static int __init f2fs_init_dic_cache(void) +{ + dic_entry_slab = f2fs_kmem_cache_create("f2fs_dic_entry", + sizeof(struct decompress_io_ctx)); + if (!dic_entry_slab) + return -ENOMEM; + return 0; +} + +static void f2fs_destroy_dic_cache(void) +{ + kmem_cache_destroy(dic_entry_slab); +} + +int __init f2fs_init_compress_cache(void) +{ + int err; + + err = f2fs_init_cic_cache(); + if (err) + goto out; + err = f2fs_init_dic_cache(); + if (err) + goto free_cic; + return 0; +free_cic: + f2fs_destroy_cic_cache(); +out: + return -ENOMEM; +} + +void f2fs_destroy_compress_cache(void) +{ + f2fs_destroy_dic_cache(); + f2fs_destroy_cic_cache(); +} diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 73683e58a08d..be4da52604ed 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -202,7 +202,7 @@ static void f2fs_verify_bio(struct bio *bio) dic = (struct decompress_io_ctx *)page_private(page); if (dic) { - if (refcount_dec_not_one(&dic->ref)) + if (atomic_dec_return(&dic->pending_pages)) continue; f2fs_verify_pages(dic->rpages, dic->cluster_size); @@ -517,7 +517,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, zero_user_segment(page, 0, PAGE_SIZE); SetPagePrivate(page); - set_page_private(page, (unsigned long)DUMMY_WRITTEN_PAGE); + set_page_private(page, DUMMY_WRITTEN_PAGE); lock_page(page); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) f2fs_bug_on(sbi, 1); @@ -1416,7 +1416,7 @@ alloc: set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); old_blkaddr = dn->data_blkaddr; f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr, - &sum, seg_type, NULL); + &sum, seg_type, NULL); if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) invalidate_mapping_pages(META_MAPPING(sbi), old_blkaddr, old_blkaddr); @@ -1803,10 +1803,6 @@ static int get_data_block_dio(struct inode *inode, sector_t iblock, static int get_data_block_bmap(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - /* Block number less than F2FS MAX BLOCKS */ - if (unlikely(iblock >= F2FS_I_SB(inode)->max_file_blocks)) - return -EFBIG; - return __get_data_block(inode, iblock, bh_result, create, F2FS_GET_BLOCK_BMAP, NULL, 
NO_CHECK_TYPE, create); @@ -2272,8 +2268,8 @@ submit_and_realloc: if (IS_ERR(bio)) { ret = PTR_ERR(bio); dic->failed = true; - if (refcount_sub_and_test(dic->nr_cpages - i, - &dic->ref)) { + if (!atomic_sub_return(dic->nr_cpages - i, + &dic->pending_pages)) { f2fs_decompress_end_io(dic->rpages, cc->cluster_size, true, false); @@ -3133,6 +3129,8 @@ next: retry = 0; } } + if (f2fs_compressed_file(inode)) + f2fs_destroy_compress_ctx(&cc); #endif if (retry) { index = 0; @@ -3574,7 +3572,7 @@ static void f2fs_dio_end_io(struct bio *bio) bio->bi_private = dio->orig_private; bio->bi_end_io = dio->orig_end_io; - kvfree(dio); + kfree(dio); bio_endio(bio); } @@ -3673,12 +3671,18 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) err); if (!do_opu) set_inode_flag(inode, FI_UPDATE_WRITE); + } else if (err == -EIOCBQUEUED) { + f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, + count - iov_iter_count(iter)); } else if (err < 0) { f2fs_write_failed(mapping, offset + count); } } else { if (err > 0) f2fs_update_iostat(sbi, APP_DIRECT_READ_IO, err); + else if (err == -EIOCBQUEUED) + f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_READ_IO, + count - iov_iter_count(iter)); } out: @@ -3807,11 +3811,16 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) filemap_write_and_wait(mapping); - if (f2fs_compressed_file(inode)) - blknr = f2fs_bmap_compress(inode, block); + /* Block number less than F2FS MAX BLOCKS */ + if (unlikely(block >= F2FS_I_SB(inode)->max_file_blocks)) + goto out; - if (!get_data_block_bmap(inode, block, &tmp, 0)) - blknr = tmp.b_blocknr; + if (f2fs_compressed_file(inode)) { + blknr = f2fs_bmap_compress(inode, block); + } else { + if (!get_data_block_bmap(inode, block, &tmp, 0)) + blknr = tmp.b_blocknr; + } out: trace_f2fs_bmap(inode, block, blknr); return blknr; @@ -3874,6 +3883,83 @@ int f2fs_migrate_page(struct address_space *mapping, #endif #ifdef CONFIG_SWAP +static int check_swap_activate_fast(struct swap_info_struct *sis, + struct file *swap_file, sector_t *span) +{ + struct address_space *mapping = swap_file->f_mapping; + struct inode *inode = mapping->host; + sector_t cur_lblock; + sector_t last_lblock; + sector_t pblock; + sector_t lowest_pblock = -1; + sector_t highest_pblock = 0; + int nr_extents = 0; + unsigned long nr_pblocks; + unsigned long len; + int ret; + + /* + * Map all the blocks into the extent list. This code doesn't try + * to be very smart. 
+ */ + cur_lblock = 0; + last_lblock = logical_to_blk(inode, i_size_read(inode)); + len = i_size_read(inode); + + while (cur_lblock <= last_lblock && cur_lblock < sis->max) { + struct buffer_head map_bh; + pgoff_t next_pgofs; + + cond_resched(); + + memset(&map_bh, 0, sizeof(struct buffer_head)); + map_bh.b_size = len - cur_lblock; + + ret = get_data_block(inode, cur_lblock, &map_bh, 0, + F2FS_GET_BLOCK_FIEMAP, &next_pgofs); + if (ret) + goto err_out; + + /* hole */ + if (!buffer_mapped(&map_bh)) + goto err_out; + + pblock = map_bh.b_blocknr; + nr_pblocks = logical_to_blk(inode, map_bh.b_size); + + if (cur_lblock + nr_pblocks >= sis->max) + nr_pblocks = sis->max - cur_lblock; + + if (cur_lblock) { /* exclude the header page */ + if (pblock < lowest_pblock) + lowest_pblock = pblock; + if (pblock + nr_pblocks - 1 > highest_pblock) + highest_pblock = pblock + nr_pblocks - 1; + } + + /* + * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks + */ + ret = add_swap_extent(sis, cur_lblock, nr_pblocks, pblock); + if (ret < 0) + goto out; + nr_extents += ret; + cur_lblock += nr_pblocks; + } + ret = nr_extents; + *span = 1 + highest_pblock - lowest_pblock; + if (cur_lblock == 0) + cur_lblock = 1; /* force Empty message */ + sis->max = cur_lblock; + sis->pages = cur_lblock - 1; + sis->highest_bit = cur_lblock - 1; +out: + return ret; +err_out: + pr_err("swapon: swapfile has holes\n"); + return -EINVAL; +} + /* Copied from generic_swapfile_activate() to check any holes */ static int check_swap_activate(struct swap_info_struct *sis, struct file *swap_file, sector_t *span) @@ -3890,6 +3976,9 @@ static int check_swap_activate(struct swap_info_struct *sis, int nr_extents = 0; int ret; + if (PAGE_SIZE == F2FS_BLKSIZE) + return check_swap_activate_fast(sis, swap_file, span); + blkbits = inode->i_blkbits; blocks_per_page = PAGE_SIZE >> blkbits; @@ -3989,7 +4078,7 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, if (ret) return ret; - if (f2fs_disable_compressed_file(inode)) + if (!f2fs_disable_compressed_file(inode)) return -EINVAL; ret = check_swap_activate(sis, file, span); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 4276c0f79beb..a8357fd4f5fa 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -131,7 +131,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->inline_inode = atomic_read(&sbi->inline_inode); si->inline_dir = atomic_read(&sbi->inline_dir); si->compr_inode = atomic_read(&sbi->compr_inode); - si->compr_blocks = atomic_read(&sbi->compr_blocks); + si->compr_blocks = atomic64_read(&sbi->compr_blocks); si->append = sbi->im[APPEND_INO].ino_num; si->update = sbi->im[UPDATE_INO].ino_num; si->orphans = sbi->im[ORPHAN_INO].ino_num; @@ -164,7 +164,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) / 2; si->util_invalid = 50 - si->util_free - si->util_valid; - for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_NODE; i++) { + for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); si->curseg[i] = curseg->segno; si->cursec[i] = GET_SEC_FROM_SEG(sbi, curseg->segno); @@ -342,7 +342,7 @@ static int stat_show(struct seq_file *s, void *v) si->inline_inode); seq_printf(s, " - Inline_dentry Inode: %u\n", si->inline_dir); - seq_printf(s, " - Compressed Inode: %u, Blocks: %u\n", + seq_printf(s, " - Compressed Inode: %u, Blocks: %llu\n", si->compr_inode, si->compr_blocks); seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n", 
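The compress.c and data.c hunks above replace the refcount_t reference counts with plain atomic_t pending_pages counters: each per-page completion decrements the counter, and only the call that reaches zero goes on to decompress the cluster (or finish writeback). A standalone C sketch of that "last completer does the work" pattern, with hypothetical names; note that the kernel's atomic_dec_return() yields the new value while C11's atomic_fetch_sub() yields the previous one, hence the slightly different test:

#include <stdatomic.h>
#include <stdio.h>

struct batch {
	atomic_int pending_pages;	/* initialised to the page count */
};

static void page_done(struct batch *b, int idx)
{
	/* Previous value 1 means this call took the counter to zero. */
	if (atomic_fetch_sub(&b->pending_pages, 1) != 1)
		return;			/* other pages still in flight */

	printf("page %d completed last: run the end-of-I/O step\n", idx);
}

int main(void)
{
	struct batch b = { .pending_pages = 3 };

	/* In the kernel these calls arrive from bio completion handlers. */
	for (int i = 0; i < 3; i++)
		page_done(&b, i);
	return 0;
}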
si->orphans, si->append, si->update); @@ -393,6 +393,14 @@ static int stat_show(struct seq_file *s, void *v) si->dirty_seg[CURSEG_COLD_NODE], si->full_seg[CURSEG_COLD_NODE], si->valid_blks[CURSEG_COLD_NODE]); + seq_printf(s, " - Pinned file: %8d %8d %8d\n", + si->curseg[CURSEG_COLD_DATA_PINNED], + si->cursec[CURSEG_COLD_DATA_PINNED], + si->curzone[CURSEG_COLD_DATA_PINNED]); + seq_printf(s, " - ATGC data: %8d %8d %8d\n", + si->curseg[CURSEG_ALL_DATA_ATGC], + si->cursec[CURSEG_ALL_DATA_ATGC], + si->curzone[CURSEG_ALL_DATA_ATGC]); seq_printf(s, "\n - Valid: %d\n - Dirty: %d\n", si->main_area_segs - si->dirty_count - si->prefree_count - si->free_segs, @@ -542,7 +550,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->inline_inode, 0); atomic_set(&sbi->inline_dir, 0); atomic_set(&sbi->compr_inode, 0); - atomic_set(&sbi->compr_blocks, 0); + atomic64_set(&sbi->compr_blocks, 0); atomic_set(&sbi->inplace_count, 0); for (i = META_CP; i < META_MAX; i++) atomic_set(&sbi->meta_count[i], 0); @@ -566,7 +574,7 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) list_del(&si->stat_list); mutex_unlock(&f2fs_stat_mutex); - kvfree(si); + kfree(si); } void __init f2fs_create_root_stats(void) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 069f498af1e3..4b9ef8bbfa4a 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -75,21 +75,22 @@ int f2fs_init_casefolded_name(const struct inode *dir, struct f2fs_filename *fname) { #ifdef CONFIG_UNICODE - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + struct super_block *sb = dir->i_sb; + struct f2fs_sb_info *sbi = F2FS_SB(sb); if (IS_CASEFOLDED(dir)) { fname->cf_name.name = f2fs_kmalloc(sbi, F2FS_NAME_LEN, GFP_NOFS); if (!fname->cf_name.name) return -ENOMEM; - fname->cf_name.len = utf8_casefold(sbi->s_encoding, + fname->cf_name.len = utf8_casefold(sb->s_encoding, fname->usr_fname, fname->cf_name.name, F2FS_NAME_LEN); if ((int)fname->cf_name.len <= 0) { kfree(fname->cf_name.name); fname->cf_name.name = NULL; - if (f2fs_has_strict_mode(sbi)) + if (sb_has_strict_encoding(sb)) return -EINVAL; /* fall back to treating name as opaque byte sequence */ } @@ -111,7 +112,7 @@ static int __f2fs_setup_filename(const struct inode *dir, #ifdef CONFIG_FS_ENCRYPTION fname->crypto_buf = crypt_name->crypto_buf; #endif - if (crypt_name->is_ciphertext_name) { + if (crypt_name->is_nokey_name) { /* hash was decoded from the no-key name */ fname->hash = cpu_to_le32(crypt_name->hash); } else { @@ -190,21 +191,15 @@ static unsigned long dir_block_index(unsigned int level, static struct f2fs_dir_entry *find_in_block(struct inode *dir, struct page *dentry_page, const struct f2fs_filename *fname, - int *max_slots, - struct page **res_page) + int *max_slots) { struct f2fs_dentry_block *dentry_blk; - struct f2fs_dir_entry *de; struct f2fs_dentry_ptr d; dentry_blk = (struct f2fs_dentry_block *)page_address(dentry_page); make_dentry_ptr_block(dir, &d, dentry_blk); - de = f2fs_find_target_dentry(&d, fname, max_slots); - if (de) - *res_page = dentry_page; - - return de; + return f2fs_find_target_dentry(&d, fname, max_slots); } #ifdef CONFIG_UNICODE @@ -215,8 +210,8 @@ static struct f2fs_dir_entry *find_in_block(struct inode *dir, static bool f2fs_match_ci_name(const struct inode *dir, const struct qstr *name, const u8 *de_name, u32 de_name_len) { - const struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); - const struct unicode_map *um = sbi->s_encoding; + const struct super_block *sb = dir->i_sb; + const struct unicode_map *um = sb->s_encoding; struct qstr entry = QSTR_INIT(de_name, 
de_name_len); int res; @@ -226,7 +221,7 @@ static bool f2fs_match_ci_name(const struct inode *dir, const struct qstr *name, * In strict mode, ignore invalid names. In non-strict mode, * fall back to treating them as opaque byte sequences. */ - if (f2fs_has_strict_mode(sbi) || name->len != entry.len) + if (sb_has_strict_encoding(sb) || name->len != entry.len) return false; return !memcmp(name->name, entry.name, name->len); } @@ -330,10 +325,11 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, } } - de = find_in_block(dir, dentry_page, fname, &max_slots, - res_page); - if (de) + de = find_in_block(dir, dentry_page, fname, &max_slots); + if (de) { + *res_page = dentry_page; break; + } if (max_slots >= s) room = true; @@ -357,16 +353,15 @@ struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, unsigned int max_depth; unsigned int level; + *res_page = NULL; + if (f2fs_has_inline_dentry(dir)) { - *res_page = NULL; de = f2fs_find_in_inline_dir(dir, fname, res_page); goto out; } - if (npages == 0) { - *res_page = NULL; + if (npages == 0) goto out; - } max_depth = F2FS_I(dir)->i_current_depth; if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) { @@ -377,7 +372,6 @@ struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, } for (level = 0; level < max_depth; level++) { - *res_page = NULL; de = find_in_level(dir, level, fname, res_page); if (de || IS_ERR(*res_page)) break; @@ -537,7 +531,7 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, goto put_error; if (IS_ENCRYPTED(inode)) { - err = fscrypt_inherit_context(dir, inode, page, false); + err = fscrypt_set_context(inode, page); if (err) goto put_error; } @@ -1032,7 +1026,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) if (err) goto out; - err = fscrypt_fname_alloc_buffer(inode, F2FS_NAME_LEN, &fstr); + err = fscrypt_fname_alloc_buffer(F2FS_NAME_LEN, &fstr); if (err < 0) goto out; } @@ -1107,75 +1101,8 @@ const struct file_operations f2fs_dir_operations = { }; #ifdef CONFIG_UNICODE -static int f2fs_d_compare(const struct dentry *dentry, unsigned int len, - const char *str, const struct qstr *name) -{ - const struct dentry *parent = READ_ONCE(dentry->d_parent); - const struct inode *dir = READ_ONCE(parent->d_inode); - const struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); - struct qstr entry = QSTR_INIT(str, len); - char strbuf[DNAME_INLINE_LEN]; - int res; - - if (!dir || !IS_CASEFOLDED(dir)) - goto fallback; - - /* - * If the dentry name is stored in-line, then it may be concurrently - * modified by a rename. If this happens, the VFS will eventually retry - * the lookup, so it doesn't matter what ->d_compare() returns. - * However, it's unsafe to call utf8_strncasecmp() with an unstable - * string. Therefore, we have to copy the name into a temporary buffer. 
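The f2fs-private casefolding ->d_compare()/->d_hash() being deleted here are replaced, just below, by the library helpers generic_ci_d_compare() and generic_ci_d_hash(), which work from the unicode map now kept in the VFS super_block (sb->s_encoding) rather than in f2fs_sb_info. Sketch of the resulting wiring for a hypothetical filesystem; only the ops table is filesystem code, the helpers live in fs/libfs.c:

static const struct dentry_operations myfs_dentry_ops = {
	.d_hash		= generic_ci_d_hash,
	.d_compare	= generic_ci_d_compare,
};

/* installed at mount time, e.g. sb->s_d_op = &myfs_dentry_ops, after the
 * filesystem has loaded its encoding into sb->s_encoding */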
- */ - if (len <= DNAME_INLINE_LEN - 1) { - memcpy(strbuf, str, len); - strbuf[len] = 0; - entry.name = strbuf; - /* prevent compiler from optimizing out the temporary buffer */ - barrier(); - } - - res = utf8_strncasecmp(sbi->s_encoding, name, &entry); - if (res >= 0) - return res; - - if (f2fs_has_strict_mode(sbi)) - return -EINVAL; -fallback: - if (len != name->len) - return 1; - return !!memcmp(str, name->name, len); -} - -static int f2fs_d_hash(const struct dentry *dentry, struct qstr *str) -{ - struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); - const struct unicode_map *um = sbi->s_encoding; - const struct inode *inode = READ_ONCE(dentry->d_inode); - unsigned char *norm; - int len, ret = 0; - - if (!inode || !IS_CASEFOLDED(inode)) - return 0; - - norm = f2fs_kmalloc(sbi, PATH_MAX, GFP_ATOMIC); - if (!norm) - return -ENOMEM; - - len = utf8_casefold(um, str, norm, PATH_MAX); - if (len < 0) { - if (f2fs_has_strict_mode(sbi)) - ret = -EINVAL; - goto out; - } - str->hash = full_name_hash(dentry, norm, len); -out: - kvfree(norm); - return ret; -} - const struct dentry_operations f2fs_dentry_ops = { - .d_hash = f2fs_d_hash, - .d_compare = f2fs_d_compare, + .d_hash = generic_ci_d_hash, + .d_compare = generic_ci_d_compare, }; #endif diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 686c68b98610..3ebf976a682d 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -58,6 +58,29 @@ struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root, return re; } +struct rb_node **f2fs_lookup_rb_tree_ext(struct f2fs_sb_info *sbi, + struct rb_root_cached *root, + struct rb_node **parent, + unsigned long long key, bool *leftmost) +{ + struct rb_node **p = &root->rb_root.rb_node; + struct rb_entry *re; + + while (*p) { + *parent = *p; + re = rb_entry(*parent, struct rb_entry, rb_node); + + if (key < re->key) { + p = &(*p)->rb_left; + } else { + p = &(*p)->rb_right; + *leftmost = false; + } + } + + return p; +} + struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, struct rb_root_cached *root, struct rb_node **parent, @@ -166,7 +189,7 @@ lookup_neighbors: } bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, - struct rb_root_cached *root) + struct rb_root_cached *root, bool check_key) { #ifdef CONFIG_F2FS_CHECK_FS struct rb_node *cur = rb_first_cached(root), *next; @@ -183,13 +206,23 @@ bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, cur_re = rb_entry(cur, struct rb_entry, rb_node); next_re = rb_entry(next, struct rb_entry, rb_node); + if (check_key) { + if (cur_re->key > next_re->key) { + f2fs_info(sbi, "inconsistent rbtree, " + "cur(%llu) next(%llu)", + cur_re->key, next_re->key); + return false; + } + goto next; + } + if (cur_re->ofs + cur_re->len > next_re->ofs) { f2fs_info(sbi, "inconsistent rbtree, cur(%u, %u) next(%u, %u)", cur_re->ofs, cur_re->len, next_re->ofs, next_re->len); return false; } - +next: cur = next; } #endif diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d9e52a7f3702..cb700d797296 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -98,6 +98,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_RESERVE_ROOT 0x01000000 #define F2FS_MOUNT_DISABLE_CHECKPOINT 0x02000000 #define F2FS_MOUNT_NORECOVERY 0x04000000 +#define F2FS_MOUNT_ATGC 0x08000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@ -138,7 +139,7 @@ struct f2fs_mount_info { int fsync_mode; /* fsync policy */ int fs_mode; /* fs mode: LFS or ADAPTIVE */ int 
bggc_mode; /* bggc mode: off, on or sync */ - struct fscrypt_dummy_context dummy_enc_ctx; /* test dummy encryption */ + struct fscrypt_dummy_policy dummy_enc_policy; /* test dummy encryption */ block_t unusable_cap_perc; /* percentage for cap */ block_t unusable_cap; /* Amount of space allowed to be * unusable when disabling checkpoint @@ -612,8 +613,13 @@ enum { struct rb_entry { struct rb_node rb_node; /* rb node located in rb-tree */ - unsigned int ofs; /* start offset of the entry */ - unsigned int len; /* length of the entry */ + union { + struct { + unsigned int ofs; /* start offset of the entry */ + unsigned int len; /* length of the entry */ + }; + unsigned long long key; /* 64-bits key */ + } __packed; }; struct extent_info { @@ -801,7 +807,7 @@ struct f2fs_inode_info { struct timespec64 i_disk_time[4];/* inode disk times */ /* for file compress */ - u64 i_compr_blocks; /* # of compressed blocks */ + atomic_t i_compr_blocks; /* # of compressed blocks */ unsigned char i_compress_algorithm; /* algorithm type */ unsigned char i_log_cluster_size; /* log of cluster size */ unsigned int i_cluster_size; /* cluster size */ @@ -973,7 +979,9 @@ static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode, */ #define NR_CURSEG_DATA_TYPE (3) #define NR_CURSEG_NODE_TYPE (3) -#define NR_CURSEG_TYPE (NR_CURSEG_DATA_TYPE + NR_CURSEG_NODE_TYPE) +#define NR_CURSEG_INMEM_TYPE (2) +#define NR_CURSEG_PERSIST_TYPE (NR_CURSEG_DATA_TYPE + NR_CURSEG_NODE_TYPE) +#define NR_CURSEG_TYPE (NR_CURSEG_INMEM_TYPE + NR_CURSEG_PERSIST_TYPE) enum { CURSEG_HOT_DATA = 0, /* directory entry blocks */ @@ -982,8 +990,11 @@ enum { CURSEG_HOT_NODE, /* direct node blocks of directory files */ CURSEG_WARM_NODE, /* direct node blocks of normal files */ CURSEG_COLD_NODE, /* indirect node blocks */ - NO_CHECK_TYPE, - CURSEG_COLD_DATA_PINNED,/* cold data for pinned file */ + NR_PERSISTENT_LOG, /* number of persistent log */ + CURSEG_COLD_DATA_PINNED = NR_PERSISTENT_LOG, + /* pinned file that needs consecutive block address */ + CURSEG_ALL_DATA_ATGC, /* SSR alloctor in hot/warm/cold data area */ + NO_CHECK_TYPE, /* number of persistent & inmem log */ }; struct flush_cmd { @@ -1209,6 +1220,7 @@ struct f2fs_dev_info { #ifdef CONFIG_BLK_DEV_ZONED unsigned int nr_blkz; /* Total number of zones */ unsigned long *blkz_seq; /* Bitmap indicating sequential zones */ + block_t *zone_capacity_blocks; /* Array of zone capacity in blks */ #endif }; @@ -1228,6 +1240,18 @@ struct inode_management { unsigned long ino_num; /* number of entries */ }; +/* for GC_AT */ +struct atgc_management { + bool atgc_enabled; /* ATGC is enabled or not */ + struct rb_root_cached root; /* root of victim rb-tree */ + struct list_head victim_list; /* linked with all victim entries */ + unsigned int victim_count; /* victim count in rb-tree */ + unsigned int candidate_ratio; /* candidate ratio */ + unsigned int max_candidate_count; /* max candidate count */ + unsigned int age_weight; /* age weight, vblock_weight = 100 - age_weight */ + unsigned long long age_threshold; /* age threshold */ +}; + /* For s_flag in struct f2fs_sb_info */ enum { SBI_IS_DIRTY, /* dirty flag for checkpoint */ @@ -1260,6 +1284,7 @@ enum { GC_NORMAL, GC_IDLE_CB, GC_IDLE_GREEDY, + GC_IDLE_AT, GC_URGENT_HIGH, GC_URGENT_LOW, }; @@ -1303,9 +1328,9 @@ enum fsync_mode { #define DUMMY_WRITTEN_PAGE ((unsigned long)-2) #define IS_ATOMIC_WRITTEN_PAGE(page) \ - (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE) + (page_private(page) == ATOMIC_WRITTEN_PAGE) #define 
IS_DUMMY_WRITTEN_PAGE(page) \ - (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) + (page_private(page) == DUMMY_WRITTEN_PAGE) #ifdef CONFIG_F2FS_IO_TRACE #define IS_IO_TRACED_PAGE(page) \ @@ -1315,13 +1340,6 @@ enum fsync_mode { #define IS_IO_TRACED_PAGE(page) (0) #endif -#ifdef CONFIG_FS_ENCRYPTION -#define DUMMY_ENCRYPTION_ENABLED(sbi) \ - (unlikely(F2FS_OPTION(sbi).dummy_enc_ctx.ctx != NULL)) -#else -#define DUMMY_ENCRYPTION_ENABLED(sbi) (0) -#endif - /* For compression */ enum compress_algorithm_type { COMPRESS_LZO, @@ -1366,7 +1384,7 @@ struct compress_io_ctx { struct inode *inode; /* inode the context belong to */ struct page **rpages; /* pages store raw data in cluster */ unsigned int nr_rpages; /* total page number in rpages */ - refcount_t ref; /* referrence count of raw page */ + atomic_t pending_pages; /* in-flight compressed page count */ }; /* decompress io context for read IO path */ @@ -1385,7 +1403,7 @@ struct decompress_io_ctx { struct compress_data *cbuf; /* virtual mapped address on cpages */ size_t rlen; /* valid data length in rbuf */ size_t clen; /* valid data length in cbuf */ - refcount_t ref; /* referrence count of compressed page */ + atomic_t pending_pages; /* in-flight compressed page count */ bool failed; /* indicate IO error during decompression */ void *private; /* payload buffer for specified decompression algorithm */ void *private2; /* extra payload buffer */ @@ -1394,7 +1412,7 @@ struct decompress_io_ctx { #define NULL_CLUSTER ((unsigned int)(~0)) #define MIN_COMPRESS_LOG_SIZE 2 #define MAX_COMPRESS_LOG_SIZE 8 -#define MAX_COMPRESS_WINDOW_SIZE ((PAGE_SIZE) << MAX_COMPRESS_LOG_SIZE) +#define MAX_COMPRESS_WINDOW_SIZE(log_size) ((PAGE_SIZE) << (log_size)) struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ @@ -1404,10 +1422,6 @@ struct f2fs_sb_info { int valid_super_block; /* valid super block no */ unsigned long s_flag; /* flags for sbi */ struct mutex writepages; /* mutex for writepages() */ -#ifdef CONFIG_UNICODE - struct unicode_map *s_encoding; - __u16 s_encoding_flags; -#endif #ifdef CONFIG_BLK_DEV_ZONED unsigned int blocks_per_blkz; /* F2FS blocks per zone */ @@ -1515,6 +1529,7 @@ struct f2fs_sb_info { * race between GC and GC or CP */ struct f2fs_gc_kthread *gc_thread; /* GC thread */ + struct atgc_management am; /* atgc management */ unsigned int cur_victim_sec; /* current victim section num */ unsigned int gc_mode; /* current GC state */ unsigned int next_victim_seg[2]; /* next segment in victim section */ @@ -1551,7 +1566,7 @@ struct f2fs_sb_info { atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ atomic_t compr_inode; /* # of compressed inodes */ - atomic_t compr_blocks; /* # of compressed blocks */ + atomic64_t compr_blocks; /* # of compressed blocks */ atomic_t vw_cnt; /* # of volatile writes */ atomic_t max_aw_cnt; /* max # of atomic writes */ atomic_t max_vw_cnt; /* max # of volatile writes */ @@ -1600,6 +1615,11 @@ struct f2fs_sb_info { struct kmem_cache *inline_xattr_slab; /* inline xattr entry */ unsigned int inline_xattr_slab_size; /* default inline xattr slab size */ + +#ifdef CONFIG_F2FS_FS_COMPRESSION + struct kmem_cache *page_array_slab; /* page array entry */ + unsigned int page_array_slab_size; /* default page array slab size */ +#endif }; struct f2fs_private_dio { @@ -3332,6 +3352,11 @@ block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi); int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable); void 
f2fs_release_discard_addrs(struct f2fs_sb_info *sbi); int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); +void f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi); +void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi); +void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi); +void f2fs_get_new_segment(struct f2fs_sb_info *sbi, + unsigned int *newseg, bool new_sec, int dir); void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, unsigned int start, unsigned int end); void f2fs_allocate_new_segment(struct f2fs_sb_info *sbi, int type); @@ -3350,7 +3375,8 @@ void f2fs_outplace_write_data(struct dnode_of_data *dn, int f2fs_inplace_write_data(struct f2fs_io_info *fio); void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, - bool recover_curseg, bool recover_newaddr); + bool recover_curseg, bool recover_newaddr, + bool from_gc); void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, block_t old_addr, block_t new_addr, unsigned char version, bool recover_curseg, @@ -3378,6 +3404,10 @@ void f2fs_destroy_segment_manager_caches(void); int f2fs_rw_hint_to_seg_type(enum rw_hint hint); enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, enum page_type type, enum temp_type temp); +unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi, + unsigned int segno); +unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, + unsigned int segno); /* * checkpoint.c @@ -3385,7 +3415,7 @@ enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io); struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); -struct page *f2fs_get_meta_page_nofail(struct f2fs_sb_info *sbi, pgoff_t index); +struct page *f2fs_get_meta_page_retry(struct f2fs_sb_info *sbi, pgoff_t index); struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index); bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); @@ -3493,6 +3523,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, unsigned int segno); void f2fs_build_gc_manager(struct f2fs_sb_info *sbi); int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count); +int __init f2fs_create_garbage_collection_cache(void); +void f2fs_destroy_garbage_collection_cache(void); /* * recovery.c @@ -3528,7 +3560,8 @@ struct f2fs_stat_info { int nr_discard_cmd; unsigned int undiscard_blks; int inline_xattr, inline_inode, inline_dir, append, update, orphans; - int compr_inode, compr_blocks; + int compr_inode; + unsigned long long compr_blocks; int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; @@ -3613,9 +3646,9 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) (atomic_dec(&F2FS_I_SB(inode)->compr_inode)); \ } while (0) #define stat_add_compr_blocks(inode, blocks) \ - (atomic_add(blocks, &F2FS_I_SB(inode)->compr_blocks)) + (atomic64_add(blocks, &F2FS_I_SB(inode)->compr_blocks)) #define stat_sub_compr_blocks(inode, blocks) \ - (atomic_sub(blocks, &F2FS_I_SB(inode)->compr_blocks)) + (atomic64_sub(blocks, &F2FS_I_SB(inode)->compr_blocks)) #define stat_inc_meta_count(sbi, blkaddr) \ do { \ if (blkaddr < SIT_I(sbi)->sit_base_addr) \ @@ -3794,6 +3827,10 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); */ struct 
rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root, struct rb_entry *cached_re, unsigned int ofs); +struct rb_node **f2fs_lookup_rb_tree_ext(struct f2fs_sb_info *sbi, + struct rb_root_cached *root, + struct rb_node **parent, + unsigned long long key, bool *left_most); struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, struct rb_root_cached *root, struct rb_node **parent, @@ -3804,7 +3841,7 @@ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, struct rb_node ***insert_p, struct rb_node **insert_parent, bool force, bool *leftmost); bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, - struct rb_root_cached *root); + struct rb_root_cached *root, bool check_key); unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); void f2fs_init_extent_tree(struct inode *inode, struct page *ipage); void f2fs_drop_extent_tree(struct inode *inode); @@ -3890,6 +3927,10 @@ void f2fs_decompress_end_io(struct page **rpages, int f2fs_init_compress_ctx(struct compress_ctx *cc); void f2fs_destroy_compress_ctx(struct compress_ctx *cc); void f2fs_init_compress_info(struct f2fs_sb_info *sbi); +int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi); +void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi); +int __init f2fs_init_compress_cache(void); +void f2fs_destroy_compress_cache(void); #else static inline bool f2fs_is_compressed_page(struct page *page) { return false; } static inline bool f2fs_is_compress_backend_ready(struct inode *inode) @@ -3906,6 +3947,10 @@ static inline struct page *f2fs_compress_control_page(struct page *page) } static inline int f2fs_init_compress_mempool(void) { return 0; } static inline void f2fs_destroy_compress_mempool(void) { } +static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return 0; } +static inline void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { } +static inline int __init f2fs_init_compress_cache(void) { return 0; } +static inline void f2fs_destroy_compress_cache(void) { } #endif static inline void set_compress_context(struct inode *inode) @@ -3924,24 +3969,21 @@ static inline void set_compress_context(struct inode *inode) f2fs_mark_inode_dirty_sync(inode, true); } -static inline u64 f2fs_disable_compressed_file(struct inode *inode) +static inline bool f2fs_disable_compressed_file(struct inode *inode) { struct f2fs_inode_info *fi = F2FS_I(inode); if (!f2fs_compressed_file(inode)) - return 0; - if (S_ISREG(inode->i_mode)) { - if (get_dirty_pages(inode)) - return 1; - if (fi->i_compr_blocks) - return fi->i_compr_blocks; - } + return true; + if (S_ISREG(inode->i_mode) && + (get_dirty_pages(inode) || atomic_read(&fi->i_compr_blocks))) + return false; fi->i_flags &= ~F2FS_COMPR_FL; stat_dec_compr_inode(inode); clear_inode_flag(inode, FI_COMPRESSED_FILE); f2fs_mark_inode_dirty_sync(inode, true); - return 0; + return true; } #define F2FS_FEATURE_FUNCS(name, flagname) \ @@ -4022,22 +4064,6 @@ static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi) return F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS; } -static inline bool f2fs_may_encrypt(struct inode *dir, struct inode *inode) -{ -#ifdef CONFIG_FS_ENCRYPTION - struct f2fs_sb_info *sbi = F2FS_I_SB(dir); - umode_t mode = inode->i_mode; - - /* - * If the directory encrypted or dummy encryption enabled, - * then we should encrypt the inode. 
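f2fs_disable_compressed_file(), just above, now returns a bool meaning "compression is off and the caller may proceed", instead of a non-zero busy value, which is why the data.c and file.c call sites in this diff flip to if (!f2fs_disable_compressed_file(inode)). A hypothetical illustration of the old and new conventions:

#include <stdbool.h>

struct obj {
	unsigned long long busy_blocks;
	unsigned int flags;
};
#define FEATURE_FL	0x1

/* old style: zero on success, non-zero (here, a block count) when busy */
unsigned long long feature_disable_old(struct obj *o)
{
	if (o->busy_blocks)
		return o->busy_blocks;
	o->flags &= ~FEATURE_FL;
	return 0;
}

/* new style: true when disabled, false when the caller must bail out */
bool feature_disable_new(struct obj *o)
{
	if (o->busy_blocks)
		return false;
	o->flags &= ~FEATURE_FL;
	return true;
}

/* callers change from "if (feature_disable(o)) fail;"
 *                to   "if (!feature_disable(o)) fail;" */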
- */ - if (IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) - return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)); -#endif - return false; -} - static inline bool f2fs_may_compress(struct inode *inode) { if (IS_SWAPFILE(inode) || f2fs_is_pinned_file(inode) || @@ -4051,16 +4077,17 @@ static inline void f2fs_i_compr_blocks_update(struct inode *inode, u64 blocks, bool add) { int diff = F2FS_I(inode)->i_cluster_size - blocks; + struct f2fs_inode_info *fi = F2FS_I(inode); /* don't update i_compr_blocks if saved blocks were released */ - if (!add && !F2FS_I(inode)->i_compr_blocks) + if (!add && !atomic_read(&fi->i_compr_blocks)) return; if (add) { - F2FS_I(inode)->i_compr_blocks += diff; + atomic_add(diff, &fi->i_compr_blocks); stat_add_compr_blocks(inode, diff); } else { - F2FS_I(inode)->i_compr_blocks -= diff; + atomic_sub(diff, &fi->i_compr_blocks); stat_sub_compr_blocks(inode, diff); } f2fs_mark_inode_dirty_sync(inode, true); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 8a422400e824..ee861c6d9ff0 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -376,32 +376,15 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) return f2fs_do_sync_file(file, start, end, datasync, false); } -static pgoff_t __get_first_dirty_index(struct address_space *mapping, - pgoff_t pgofs, int whence) -{ - struct page *page; - int nr_pages; - - if (whence != SEEK_DATA) - return 0; - - /* find first dirty page index */ - nr_pages = find_get_pages_tag(mapping, &pgofs, PAGECACHE_TAG_DIRTY, - 1, &page); - if (!nr_pages) - return ULONG_MAX; - pgofs = page->index; - put_page(page); - return pgofs; -} - -static bool __found_offset(struct f2fs_sb_info *sbi, block_t blkaddr, - pgoff_t dirty, pgoff_t pgofs, int whence) +static bool __found_offset(struct address_space *mapping, block_t blkaddr, + pgoff_t index, int whence) { switch (whence) { case SEEK_DATA: - if ((blkaddr == NEW_ADDR && dirty == pgofs) || - __is_valid_data_blkaddr(blkaddr)) + if (__is_valid_data_blkaddr(blkaddr)) + return true; + if (blkaddr == NEW_ADDR && + xa_get_mark(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY)) return true; break; case SEEK_HOLE: @@ -417,7 +400,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) struct inode *inode = file->f_mapping->host; loff_t maxbytes = inode->i_sb->s_maxbytes; struct dnode_of_data dn; - pgoff_t pgofs, end_offset, dirty; + pgoff_t pgofs, end_offset; loff_t data_ofs = offset; loff_t isize; int err = 0; @@ -429,16 +412,13 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) goto fail; /* handle inline data case */ - if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) { - if (whence == SEEK_HOLE) - data_ofs = isize; + if (f2fs_has_inline_data(inode) && whence == SEEK_HOLE) { + data_ofs = isize; goto found; } pgofs = (pgoff_t)(offset >> PAGE_SHIFT); - dirty = __get_first_dirty_index(inode->i_mapping, pgofs, whence); - for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_SHIFT) { set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, pgofs, LOOKUP_NODE); @@ -471,7 +451,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) goto fail; } - if (__found_offset(F2FS_I_SB(inode), blkaddr, dirty, + if (__found_offset(file->f_mapping, blkaddr, pgofs, whence)) { f2fs_put_dnode(&dn); goto found; @@ -564,7 +544,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) bool compressed_cluster = false; int cluster_index = 0, valid_blocks = 0; int 
cluster_size = F2FS_I(dn->inode)->i_cluster_size; - bool released = !F2FS_I(dn->inode)->i_compr_blocks; + bool released = !atomic_read(&F2FS_I(dn->inode)->i_compr_blocks); if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) base = get_extra_isize(dn->inode); @@ -753,11 +733,14 @@ int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock) return err; #ifdef CONFIG_F2FS_FS_COMPRESSION - if (from != free_from) + if (from != free_from) { err = f2fs_truncate_partial_cluster(inode, from, lock); + if (err) + return err; + } #endif - return err; + return 0; } int f2fs_truncate(struct inode *inode) @@ -1656,13 +1639,14 @@ next_alloc: } down_write(&sbi->pin_sem); - map.m_seg_type = CURSEG_COLD_DATA_PINNED; f2fs_lock_op(sbi); - f2fs_allocate_new_segment(sbi, CURSEG_COLD_DATA); + f2fs_allocate_new_segment(sbi, CURSEG_COLD_DATA_PINNED); f2fs_unlock_op(sbi); + map.m_seg_type = CURSEG_COLD_DATA_PINNED; err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); + up_write(&sbi->pin_sem); done += map.m_len; @@ -1828,7 +1812,7 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) if ((iflags ^ masked_flags) & F2FS_COMPR_FL) { if (masked_flags & F2FS_COMPR_FL) { - if (f2fs_disable_compressed_file(inode)) + if (!f2fs_disable_compressed_file(inode)) return -EINVAL; } if (iflags & F2FS_NOCOMP_FL) @@ -1836,6 +1820,8 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) if (iflags & F2FS_COMPR_FL) { if (!f2fs_may_compress(inode)) return -EINVAL; + if (S_ISREG(inode->i_mode) && inode->i_size) + return -EINVAL; set_compress_context(inode); } @@ -2783,6 +2769,9 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, if (IS_ENCRYPTED(src) || IS_ENCRYPTED(dst)) return -EOPNOTSUPP; + if (pos_out < 0 || pos_in < 0) + return -EINVAL; + if (src == dst) { if (pos_in == pos_out) return 0; @@ -3258,7 +3247,7 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) if (ret) goto out; - if (f2fs_disable_compressed_file(inode)) { + if (!f2fs_disable_compressed_file(inode)) { ret = -EOPNOTSUPP; goto out; } @@ -3385,7 +3374,7 @@ static int f2fs_ioc_getfslabel(struct file *filp, unsigned long arg) min(FSLABEL_MAX, count))) err = -EFAULT; - kvfree(vbuf); + kfree(vbuf); return err; } @@ -3436,7 +3425,7 @@ static int f2fs_get_compress_blocks(struct file *filp, unsigned long arg) if (!f2fs_compressed_file(inode)) return -EINVAL; - blocks = F2FS_I(inode)->i_compr_blocks; + blocks = atomic_read(&F2FS_I(inode)->i_compr_blocks); return put_user(blocks, (u64 __user *)arg); } @@ -3521,7 +3510,8 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) inode_lock(inode); writecount = atomic_read(&inode->i_writecount); - if ((filp->f_mode & FMODE_WRITE && writecount != 1) || writecount) { + if ((filp->f_mode & FMODE_WRITE && writecount != 1) || + (!(filp->f_mode & FMODE_WRITE) && writecount)) { ret = -EBUSY; goto out; } @@ -3540,7 +3530,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, true); - if (!F2FS_I(inode)->i_compr_blocks) + if (!atomic_read(&F2FS_I(inode)->i_compr_blocks)) goto out; down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); @@ -3588,14 +3578,15 @@ out: if (ret >= 0) { ret = put_user(released_blocks, (u64 __user *)arg); - } else if (released_blocks && F2FS_I(inode)->i_compr_blocks) { + } else if (released_blocks && + atomic_read(&F2FS_I(inode)->i_compr_blocks)) { set_sbi_flag(sbi, SBI_NEED_FSCK); 
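This series also inverts the return convention of f2fs_disable_compressed_file(): it now returns a bool that is true when compression is off (or has just been dropped) and false when it cannot be dropped because the regular file still has dirty pages or remaining compressed blocks, which is why the callers above change to if (!f2fs_disable_compressed_file(inode)). A standalone sketch of the new contract (the demo_* names and plain userspace types are illustrative only, not part of the patch):

#include <stdbool.h>
#include <stdio.h>

struct demo_inode {
	bool compressed;	/* F2FS_COMPR_FL / FI_COMPRESSED_FILE set */
	bool regular;		/* S_ISREG(inode->i_mode) */
	unsigned dirty_pages;	/* get_dirty_pages(inode) */
	unsigned compr_blocks;	/* atomic_read(&fi->i_compr_blocks) */
};

/*
 * true  -> compression is (now) disabled, the caller may proceed
 * false -> cannot disable yet: dirty pages or compressed blocks remain
 */
static bool demo_disable_compressed_file(struct demo_inode *inode)
{
	if (!inode->compressed)
		return true;
	if (inode->regular && (inode->dirty_pages || inode->compr_blocks))
		return false;
	inode->compressed = false;	/* clear the flags, update stats */
	return true;
}

int main(void)
{
	struct demo_inode file = {
		.compressed = true, .regular = true, .compr_blocks = 8,
	};

	/* caller pattern after this patch: a false return is the error case */
	if (!demo_disable_compressed_file(&file))
		printf("cannot disable compression: blocks still compressed\n");
	return 0;
}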
f2fs_warn(sbi, "%s: partial blocks were released i_ino=%lx " - "iblocks=%llu, released=%u, compr_blocks=%llu, " + "iblocks=%llu, released=%u, compr_blocks=%u, " "run fsck to fix.", __func__, inode->i_ino, inode->i_blocks, released_blocks, - F2FS_I(inode)->i_compr_blocks); + atomic_read(&F2FS_I(inode)->i_compr_blocks)); } return ret; @@ -3683,7 +3674,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) if (ret) return ret; - if (F2FS_I(inode)->i_compr_blocks) + if (atomic_read(&F2FS_I(inode)->i_compr_blocks)) goto out; f2fs_balance_fs(F2FS_I_SB(inode), true); @@ -3747,14 +3738,15 @@ out: if (ret >= 0) { ret = put_user(reserved_blocks, (u64 __user *)arg); - } else if (reserved_blocks && F2FS_I(inode)->i_compr_blocks) { + } else if (reserved_blocks && + atomic_read(&F2FS_I(inode)->i_compr_blocks)) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: partial blocks were released i_ino=%lx " - "iblocks=%llu, reserved=%u, compr_blocks=%llu, " + "iblocks=%llu, reserved=%u, compr_blocks=%u, " "run fsck to fix.", __func__, inode->i_ino, inode->i_blocks, reserved_blocks, - F2FS_I(inode)->i_compr_blocks); + atomic_read(&F2FS_I(inode)->i_compr_blocks)); } return ret; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 11b4adde9baf..05641a1e36cc 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -21,6 +21,8 @@ #include "gc.h" #include <trace/events/f2fs.h> +static struct kmem_cache *victim_entry_slab; + static unsigned int count_bits(const unsigned long *addr, unsigned int offset, unsigned int len); @@ -150,7 +152,7 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(gc_th->f2fs_gc_task)) { err = PTR_ERR(gc_th->f2fs_gc_task); - kvfree(gc_th); + kfree(gc_th); sbi->gc_thread = NULL; } out: @@ -163,13 +165,22 @@ void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi) if (!gc_th) return; kthread_stop(gc_th->f2fs_gc_task); - kvfree(gc_th); + kfree(gc_th); sbi->gc_thread = NULL; } static int select_gc_type(struct f2fs_sb_info *sbi, int gc_type) { - int gc_mode = (gc_type == BG_GC) ? 
GC_CB : GC_GREEDY; + int gc_mode; + + if (gc_type == BG_GC) { + if (sbi->am.atgc_enabled) + gc_mode = GC_AT; + else + gc_mode = GC_CB; + } else { + gc_mode = GC_GREEDY; + } switch (sbi->gc_mode) { case GC_IDLE_CB: @@ -179,7 +190,11 @@ static int select_gc_type(struct f2fs_sb_info *sbi, int gc_type) case GC_URGENT_HIGH: gc_mode = GC_GREEDY; break; + case GC_IDLE_AT: + gc_mode = GC_AT; + break; } + return gc_mode; } @@ -193,6 +208,11 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, p->dirty_bitmap = dirty_i->dirty_segmap[type]; p->max_search = dirty_i->nr_dirty[type]; p->ofs_unit = 1; + } else if (p->alloc_mode == AT_SSR) { + p->gc_mode = GC_GREEDY; + p->dirty_bitmap = dirty_i->dirty_segmap[type]; + p->max_search = dirty_i->nr_dirty[type]; + p->ofs_unit = 1; } else { p->gc_mode = select_gc_type(sbi, gc_type); p->ofs_unit = sbi->segs_per_sec; @@ -212,6 +232,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, */ if (gc_type != FG_GC && (sbi->gc_mode != GC_URGENT_HIGH) && + (p->gc_mode != GC_AT && p->alloc_mode != AT_SSR) && p->max_search > sbi->max_victim_search) p->max_search = sbi->max_victim_search; @@ -229,10 +250,16 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi, /* SSR allocates in a segment unit */ if (p->alloc_mode == SSR) return sbi->blocks_per_seg; + else if (p->alloc_mode == AT_SSR) + return UINT_MAX; + + /* LFS */ if (p->gc_mode == GC_GREEDY) return 2 * sbi->blocks_per_seg * p->ofs_unit; else if (p->gc_mode == GC_CB) return UINT_MAX; + else if (p->gc_mode == GC_AT) + return UINT_MAX; else /* No other gc_mode */ return 0; } @@ -266,13 +293,14 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) unsigned char age = 0; unsigned char u; unsigned int i; + unsigned int usable_segs_per_sec = f2fs_usable_segs_in_sec(sbi, segno); - for (i = 0; i < sbi->segs_per_sec; i++) + for (i = 0; i < usable_segs_per_sec; i++) mtime += get_seg_entry(sbi, start + i)->mtime; vblocks = get_valid_blocks(sbi, segno, true); - mtime = div_u64(mtime, sbi->segs_per_sec); - vblocks = div_u64(vblocks, sbi->segs_per_sec); + mtime = div_u64(mtime, usable_segs_per_sec); + vblocks = div_u64(vblocks, usable_segs_per_sec); u = (vblocks * 100) >> sbi->log_blocks_per_seg; @@ -297,8 +325,11 @@ static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, /* alloc_mode == LFS */ if (p->gc_mode == GC_GREEDY) return get_valid_blocks(sbi, segno, true); - else + else if (p->gc_mode == GC_CB) return get_cb_cost(sbi, segno); + + f2fs_bug_on(sbi, 1); + return 0; } static unsigned int count_bits(const unsigned long *addr, @@ -313,6 +344,273 @@ static unsigned int count_bits(const unsigned long *addr, return sum; } +static struct victim_entry *attach_victim_entry(struct f2fs_sb_info *sbi, + unsigned long long mtime, unsigned int segno, + struct rb_node *parent, struct rb_node **p, + bool left_most) +{ + struct atgc_management *am = &sbi->am; + struct victim_entry *ve; + + ve = f2fs_kmem_cache_alloc(victim_entry_slab, GFP_NOFS); + + ve->mtime = mtime; + ve->segno = segno; + + rb_link_node(&ve->rb_node, parent, p); + rb_insert_color_cached(&ve->rb_node, &am->root, left_most); + + list_add_tail(&ve->list, &am->victim_list); + + am->victim_count++; + + return ve; +} + +static void insert_victim_entry(struct f2fs_sb_info *sbi, + unsigned long long mtime, unsigned int segno) +{ + struct atgc_management *am = &sbi->am; + struct rb_node **p; + struct rb_node *parent = NULL; + bool left_most = true; + + p = f2fs_lookup_rb_tree_ext(sbi, &am->root, &parent, 
mtime, &left_most); + attach_victim_entry(sbi, mtime, segno, parent, p, left_most); +} + +static void add_victim_entry(struct f2fs_sb_info *sbi, + struct victim_sel_policy *p, unsigned int segno) +{ + struct sit_info *sit_i = SIT_I(sbi); + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start = GET_SEG_FROM_SEC(sbi, secno); + unsigned long long mtime = 0; + unsigned int i; + + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + if (p->gc_mode == GC_AT && + get_valid_blocks(sbi, segno, true) == 0) + return; + + if (p->alloc_mode == AT_SSR && + get_seg_entry(sbi, segno)->ckpt_valid_blocks == 0) + return; + } + + for (i = 0; i < sbi->segs_per_sec; i++) + mtime += get_seg_entry(sbi, start + i)->mtime; + mtime = div_u64(mtime, sbi->segs_per_sec); + + /* Handle if the system time has changed by the user */ + if (mtime < sit_i->min_mtime) + sit_i->min_mtime = mtime; + if (mtime > sit_i->max_mtime) + sit_i->max_mtime = mtime; + if (mtime < sit_i->dirty_min_mtime) + sit_i->dirty_min_mtime = mtime; + if (mtime > sit_i->dirty_max_mtime) + sit_i->dirty_max_mtime = mtime; + + /* don't choose young section as candidate */ + if (sit_i->dirty_max_mtime - mtime < p->age_threshold) + return; + + insert_victim_entry(sbi, mtime, segno); +} + +static struct rb_node *lookup_central_victim(struct f2fs_sb_info *sbi, + struct victim_sel_policy *p) +{ + struct atgc_management *am = &sbi->am; + struct rb_node *parent = NULL; + bool left_most; + + f2fs_lookup_rb_tree_ext(sbi, &am->root, &parent, p->age, &left_most); + + return parent; +} + +static void atgc_lookup_victim(struct f2fs_sb_info *sbi, + struct victim_sel_policy *p) +{ + struct sit_info *sit_i = SIT_I(sbi); + struct atgc_management *am = &sbi->am; + struct rb_root_cached *root = &am->root; + struct rb_node *node; + struct rb_entry *re; + struct victim_entry *ve; + unsigned long long total_time; + unsigned long long age, u, accu; + unsigned long long max_mtime = sit_i->dirty_max_mtime; + unsigned long long min_mtime = sit_i->dirty_min_mtime; + unsigned int sec_blocks = BLKS_PER_SEC(sbi); + unsigned int vblocks; + unsigned int dirty_threshold = max(am->max_candidate_count, + am->candidate_ratio * + am->victim_count / 100); + unsigned int age_weight = am->age_weight; + unsigned int cost; + unsigned int iter = 0; + + if (max_mtime < min_mtime) + return; + + max_mtime += 1; + total_time = max_mtime - min_mtime; + + accu = div64_u64(ULLONG_MAX, total_time); + accu = min_t(unsigned long long, div_u64(accu, 100), + DEFAULT_ACCURACY_CLASS); + + node = rb_first_cached(root); +next: + re = rb_entry_safe(node, struct rb_entry, rb_node); + if (!re) + return; + + ve = (struct victim_entry *)re; + + if (ve->mtime >= max_mtime || ve->mtime < min_mtime) + goto skip; + + /* age = 10000 * x% * 60 */ + age = div64_u64(accu * (max_mtime - ve->mtime), total_time) * + age_weight; + + vblocks = get_valid_blocks(sbi, ve->segno, true); + f2fs_bug_on(sbi, !vblocks || vblocks == sec_blocks); + + /* u = 10000 * x% * 40 */ + u = div64_u64(accu * (sec_blocks - vblocks), sec_blocks) * + (100 - age_weight); + + f2fs_bug_on(sbi, age + u >= UINT_MAX); + + cost = UINT_MAX - (age + u); + iter++; + + if (cost < p->min_cost || + (cost == p->min_cost && age > p->oldest_age)) { + p->min_cost = cost; + p->oldest_age = age; + p->min_segno = ve->segno; + } +skip: + if (iter < dirty_threshold) { + node = rb_next(node); + goto next; + } +} + +/* + * select candidates around source section in range of + * [target - dirty_threshold, target + dirty_threshold] + */ +static void 
atssr_lookup_victim(struct f2fs_sb_info *sbi, + struct victim_sel_policy *p) +{ + struct sit_info *sit_i = SIT_I(sbi); + struct atgc_management *am = &sbi->am; + struct rb_node *node; + struct rb_entry *re; + struct victim_entry *ve; + unsigned long long age; + unsigned long long max_mtime = sit_i->dirty_max_mtime; + unsigned long long min_mtime = sit_i->dirty_min_mtime; + unsigned int seg_blocks = sbi->blocks_per_seg; + unsigned int vblocks; + unsigned int dirty_threshold = max(am->max_candidate_count, + am->candidate_ratio * + am->victim_count / 100); + unsigned int cost; + unsigned int iter = 0; + int stage = 0; + + if (max_mtime < min_mtime) + return; + max_mtime += 1; +next_stage: + node = lookup_central_victim(sbi, p); +next_node: + re = rb_entry_safe(node, struct rb_entry, rb_node); + if (!re) { + if (stage == 0) + goto skip_stage; + return; + } + + ve = (struct victim_entry *)re; + + if (ve->mtime >= max_mtime || ve->mtime < min_mtime) + goto skip_node; + + age = max_mtime - ve->mtime; + + vblocks = get_seg_entry(sbi, ve->segno)->ckpt_valid_blocks; + f2fs_bug_on(sbi, !vblocks); + + /* rare case */ + if (vblocks == seg_blocks) + goto skip_node; + + iter++; + + age = max_mtime - abs(p->age - age); + cost = UINT_MAX - vblocks; + + if (cost < p->min_cost || + (cost == p->min_cost && age > p->oldest_age)) { + p->min_cost = cost; + p->oldest_age = age; + p->min_segno = ve->segno; + } +skip_node: + if (iter < dirty_threshold) { + if (stage == 0) + node = rb_prev(node); + else if (stage == 1) + node = rb_next(node); + goto next_node; + } +skip_stage: + if (stage < 1) { + stage++; + iter = 0; + goto next_stage; + } +} +static void lookup_victim_by_age(struct f2fs_sb_info *sbi, + struct victim_sel_policy *p) +{ + f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, + &sbi->am.root, true)); + + if (p->gc_mode == GC_AT) + atgc_lookup_victim(sbi, p); + else if (p->alloc_mode == AT_SSR) + atssr_lookup_victim(sbi, p); + else + f2fs_bug_on(sbi, 1); +} + +static void release_victim_entry(struct f2fs_sb_info *sbi) +{ + struct atgc_management *am = &sbi->am; + struct victim_entry *ve, *tmp; + + list_for_each_entry_safe(ve, tmp, &am->victim_list, list) { + list_del(&ve->list); + kmem_cache_free(victim_entry_slab, ve); + am->victim_count--; + } + + am->root = RB_ROOT_CACHED; + + f2fs_bug_on(sbi, am->victim_count); + f2fs_bug_on(sbi, !list_empty(&am->victim_list)); +} + /* * This function is called from two paths. * One is garbage collection and the other is SSR segment selection. @@ -322,25 +620,37 @@ static unsigned int count_bits(const unsigned long *addr, * which has minimum valid blocks and removes it from dirty seglist. 
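The cost computed in atgc_lookup_victim() above blends a normalized age term (weighted by DEF_GC_THREAD_AGE_WEIGHT, 60) with a normalized free-space term (the remaining 40), both scaled by an accuracy factor capped at DEFAULT_ACCURACY_CLASS (10000); the candidate with the smallest UINT_MAX - (age + u) wins, with ties going to the greater age. A minimal userspace rendering of that formula, using made-up sample numbers:

#include <stdint.h>
#include <stdio.h>

#define ACCURACY_CLASS	10000ULL	/* DEFAULT_ACCURACY_CLASS */
#define AGE_WEIGHT	60ULL		/* DEF_GC_THREAD_AGE_WEIGHT */

/* ATGC cost of one candidate section; lower is better (illustrative only) */
static unsigned int demo_atgc_cost(uint64_t min_mtime, uint64_t max_mtime,
				   uint64_t mtime, uint64_t vblocks,
				   uint64_t sec_blocks)
{
	uint64_t total_time = max_mtime + 1 - min_mtime;
	uint64_t accu = UINT64_MAX / total_time / 100;
	uint64_t age, u;

	if (accu > ACCURACY_CLASS)
		accu = ACCURACY_CLASS;

	/* older sections score higher on the age term (60% weight) */
	age = accu * (max_mtime + 1 - mtime) / total_time * AGE_WEIGHT;
	/* emptier sections score higher on the utilization term (40% weight) */
	u = accu * (sec_blocks - vblocks) / sec_blocks * (100 - AGE_WEIGHT);

	return (unsigned int)(UINT32_MAX - (age + u));
}

int main(void)
{
	/* 512-block sections: an old, half-empty one beats a young, full one */
	printf("old/empty : %u\n", demo_atgc_cost(1000, 2000, 1100, 256, 512));
	printf("young/full: %u\n", demo_atgc_cost(1000, 2000, 1900, 480, 512));
	return 0;
}

The AT_SSR variant above reuses the same candidate tree but walks outward from the node closest to the source data's age in two stages (rb_prev first, then rb_next from the central node), bounded by the same candidate threshold.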
*/ static int get_victim_by_default(struct f2fs_sb_info *sbi, - unsigned int *result, int gc_type, int type, char alloc_mode) + unsigned int *result, int gc_type, int type, + char alloc_mode, unsigned long long age) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct sit_info *sm = SIT_I(sbi); struct victim_sel_policy p; unsigned int secno, last_victim; unsigned int last_segment; - unsigned int nsearched = 0; + unsigned int nsearched; + bool is_atgc; int ret = 0; mutex_lock(&dirty_i->seglist_lock); last_segment = MAIN_SECS(sbi) * sbi->segs_per_sec; p.alloc_mode = alloc_mode; - select_policy(sbi, gc_type, type, &p); + p.age = age; + p.age_threshold = sbi->am.age_threshold; +retry: + select_policy(sbi, gc_type, type, &p); p.min_segno = NULL_SEGNO; + p.oldest_age = 0; p.min_cost = get_max_cost(sbi, &p); + is_atgc = (p.gc_mode == GC_AT || p.alloc_mode == AT_SSR); + nsearched = 0; + + if (is_atgc) + SIT_I(sbi)->dirty_min_mtime = ULLONG_MAX; + if (*result != NULL_SEGNO) { if (!get_valid_blocks(sbi, *result, false)) { ret = -ENODATA; @@ -421,11 +731,16 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, /* Don't touch checkpointed data */ if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) && get_ckpt_valid_blocks(sbi, segno) && - p.alloc_mode != SSR)) + p.alloc_mode == LFS)) goto next; if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) goto next; + if (is_atgc) { + add_victim_entry(sbi, &p, segno); + goto next; + } + cost = get_gc_cost(sbi, segno, &p); if (p.min_cost > cost) { @@ -444,6 +759,19 @@ next: break; } } + + /* get victim for GC_AT/AT_SSR */ + if (is_atgc) { + lookup_victim_by_age(sbi, &p); + release_victim_entry(sbi); + } + + if (is_atgc && p.min_segno == NULL_SEGNO && + sm->elapsed_time < p.age_threshold) { + p.age_threshold = 0; + goto retry; + } + if (p.min_segno != NULL_SEGNO) { got_it: *result = (p.min_segno / p.ofs_unit) * p.ofs_unit; @@ -536,6 +864,7 @@ static int gc_node_segment(struct f2fs_sb_info *sbi, int phase = 0; bool fggc = (gc_type == FG_GC); int submitted = 0; + unsigned int usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno); start_addr = START_BLOCK(sbi, segno); @@ -545,7 +874,7 @@ next_step: if (fggc && phase == 2) atomic_inc(&sbi->wb_sync_req[NODE]); - for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { + for (off = 0; off < usable_blks_in_seg; off++, entry++) { nid_t nid = le32_to_cpu(entry->nid); struct page *node_page; struct node_info ni; @@ -791,6 +1120,8 @@ static int move_data_block(struct inode *inode, block_t bidx, block_t newaddr; int err = 0; bool lfs_mode = f2fs_lfs_mode(fio.sbi); + int type = fio.sbi->am.atgc_enabled ? 
+ CURSEG_ALL_DATA_ATGC : CURSEG_COLD_DATA; /* do not read out */ page = f2fs_grab_cache_page(inode->i_mapping, bidx, false); @@ -877,7 +1208,7 @@ static int move_data_block(struct inode *inode, block_t bidx, } f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, - &sum, CURSEG_COLD_DATA, NULL); + &sum, type, NULL); fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi), newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS); @@ -927,7 +1258,7 @@ put_page_out: recover_block: if (err) f2fs_do_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr, - true, true); + true, true, true); up_out: if (lfs_mode) up_write(&fio.sbi->io_order_lock); @@ -1033,13 +1364,14 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, int off; int phase = 0; int submitted = 0; + unsigned int usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno); start_addr = START_BLOCK(sbi, segno); next_step: entry = sum; - for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { + for (off = 0; off < usable_blks_in_seg; off++, entry++) { struct page *data_page; struct inode *inode; struct node_info dni; /* dnode info for the data */ @@ -1182,7 +1514,7 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, down_write(&sit_i->sentry_lock); ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type, - NO_CHECK_TYPE, LFS); + NO_CHECK_TYPE, LFS, 0); up_write(&sit_i->sentry_lock); return ret; } @@ -1204,6 +1536,17 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, if (__is_large_section(sbi)) end_segno = rounddown(end_segno, sbi->segs_per_sec); + /* + * zone-capacity can be less than zone-size in zoned devices, + * resulting in less than expected usable segments in the zone, + * calculate the end segno in the zone which can be garbage collected + */ + if (f2fs_sb_has_blkzoned(sbi)) + end_segno -= sbi->segs_per_sec - + f2fs_usable_segs_in_sec(sbi, segno); + + sanity_check_seg_type(sbi, get_seg_entry(sbi, segno)->type); + /* readahead multi ssa blocks those have contiguous address */ if (__is_large_section(sbi)) f2fs_ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), @@ -1356,7 +1699,8 @@ gc_more: goto stop; seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type); - if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec) + if (gc_type == FG_GC && + seg_freed == f2fs_usable_segs_in_sec(sbi, segno)) sec_freed++; total_freed += seg_freed; @@ -1413,6 +1757,37 @@ stop: return ret; } +int __init f2fs_create_garbage_collection_cache(void) +{ + victim_entry_slab = f2fs_kmem_cache_create("f2fs_victim_entry", + sizeof(struct victim_entry)); + if (!victim_entry_slab) + return -ENOMEM; + return 0; +} + +void f2fs_destroy_garbage_collection_cache(void) +{ + kmem_cache_destroy(victim_entry_slab); +} + +static void init_atgc_management(struct f2fs_sb_info *sbi) +{ + struct atgc_management *am = &sbi->am; + + if (test_opt(sbi, ATGC) && + SIT_I(sbi)->elapsed_time >= DEF_GC_THREAD_AGE_THRESHOLD) + am->atgc_enabled = true; + + am->root = RB_ROOT_CACHED; + INIT_LIST_HEAD(&am->victim_list); + am->victim_count = 0; + + am->candidate_ratio = DEF_GC_THREAD_CANDIDATE_RATIO; + am->max_candidate_count = DEF_GC_THREAD_MAX_CANDIDATE_COUNT; + am->age_weight = DEF_GC_THREAD_AGE_WEIGHT; +} + void f2fs_build_gc_manager(struct f2fs_sb_info *sbi) { DIRTY_I(sbi)->v_ops = &default_v_ops; @@ -1423,6 +1798,8 @@ void f2fs_build_gc_manager(struct f2fs_sb_info *sbi) if (f2fs_is_multi_device(sbi) && !__is_large_section(sbi)) SIT_I(sbi)->last_victim[ALLOC_NEXT] = GET_SEGNO(sbi, FDEV(0).end_blk) + 1; + + 
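init_atgc_management() above only sets am->atgc_enabled when the ATGC mount option is active and the filesystem has already aged past DEF_GC_THREAD_AGE_THRESHOLD (7 days); the candidate_ratio/max_candidate_count pair then bounds how many tree entries each ATGC/AT_SSR lookup pass examines, via dirty_threshold = max(max_candidate_count, candidate_ratio * victim_count / 100). A tiny illustration of that bound, with sample victim counts only:

#include <stdio.h>

#define CANDIDATE_RATIO		20	/* DEF_GC_THREAD_CANDIDATE_RATIO */
#define MAX_CANDIDATE_COUNT	10	/* DEF_GC_THREAD_MAX_CANDIDATE_COUNT */

/* how many victim_entry nodes one lookup pass will examine at most */
static unsigned int demo_dirty_threshold(unsigned int victim_count)
{
	unsigned int by_ratio = CANDIDATE_RATIO * victim_count / 100;

	return by_ratio > MAX_CANDIDATE_COUNT ? by_ratio : MAX_CANDIDATE_COUNT;
}

int main(void)
{
	printf("1000 candidates -> examine up to %u\n", demo_dirty_threshold(1000));
	printf("  20 candidates -> examine up to %u\n", demo_dirty_threshold(20));
	return 0;
}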
init_atgc_management(sbi); } static int free_segment_range(struct f2fs_sb_info *sbi, @@ -1450,7 +1827,7 @@ static int free_segment_range(struct f2fs_sb_info *sbi, mutex_unlock(&DIRTY_I(sbi)->seglist_lock); /* Move out cursegs from the target range */ - for (type = CURSEG_HOT_DATA; type < NR_CURSEG_TYPE; type++) + for (type = CURSEG_HOT_DATA; type < NR_CURSEG_PERSIST_TYPE; type++) f2fs_allocate_segment_for_resize(sbi, type, start, end); /* do GC to move out valid blocks in the range */ diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index db3c61046aa4..0c8dae12dc51 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -14,6 +14,14 @@ #define DEF_GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */ #define DEF_GC_THREAD_MAX_SLEEP_TIME 60000 #define DEF_GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */ + +/* choose candidates from sections which has age of more than 7 days */ +#define DEF_GC_THREAD_AGE_THRESHOLD (60 * 60 * 24 * 7) +#define DEF_GC_THREAD_CANDIDATE_RATIO 20 /* select 20% oldest sections as candidates */ +#define DEF_GC_THREAD_MAX_CANDIDATE_COUNT 10 /* select at most 10 sections as candidates */ +#define DEF_GC_THREAD_AGE_WEIGHT 60 /* age weight */ +#define DEFAULT_ACCURACY_CLASS 10000 /* accuracy class */ + #define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */ #define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ @@ -41,16 +49,69 @@ struct gc_inode_list { struct radix_tree_root iroot; }; +struct victim_info { + unsigned long long mtime; /* mtime of section */ + unsigned int segno; /* section No. */ +}; + +struct victim_entry { + struct rb_node rb_node; /* rb node located in rb-tree */ + union { + struct { + unsigned long long mtime; /* mtime of section */ + unsigned int segno; /* segment No. */ + }; + struct victim_info vi; /* victim info */ + }; + struct list_head list; +}; + /* * inline functions */ + +/* + * On a Zoned device zone-capacity can be less than zone-size and if + * zone-capacity is not aligned to f2fs segment size(2MB), then the segment + * starting just before zone-capacity has some blocks spanning across the + * zone-capacity, these blocks are not usable. + * Such spanning segments can be in free list so calculate the sum of usable + * blocks in currently free segments including normal and spanning segments. 
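As the comment above explains, when zone-capacity is smaller than zone-size the segment that straddles the capacity limit is only partially usable and anything past it is not usable at all. A rough standalone sketch of that per-segment clamp, mirroring the f2fs_usable_zone_blks_in_seg() helper added later in this series (the 512-block/2MB segment size and the sample capacity are illustrative assumptions):

#include <stdint.h>
#include <stdio.h>

#define BLOCKS_PER_SEG	512u	/* 2MB segments of 4KB blocks */

/*
 * Usable blocks of one segment inside a sequential zone whose capacity is
 * smaller than its size: full before the capacity limit, clamped when the
 * segment spans the limit, zero beyond it.
 */
static uint32_t demo_usable_blks_in_seg(uint64_t seg_start, uint64_t cap_end)
{
	if (seg_start >= cap_end)
		return 0;
	if (seg_start + BLOCKS_PER_SEG > cap_end)
		return (uint32_t)(cap_end - seg_start);
	return BLOCKS_PER_SEG;
}

int main(void)
{
	uint64_t cap_end = 1280;	/* zone capacity: 2.5 segments */
	uint64_t seg;

	for (seg = 0; seg < 4; seg++)
		printf("segment %llu: %u usable blocks\n",
		       (unsigned long long)seg,
		       demo_usable_blks_in_seg(seg * BLOCKS_PER_SEG, cap_end));
	return 0;
}

free_segs_blk_count_zoned() below sums these per-segment values over every free segment, and free_user_blocks() subtracts the overprovisioned blocks from that sum, returning zero when overprovisioning exceeds the free space.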
+ */ +static inline block_t free_segs_blk_count_zoned(struct f2fs_sb_info *sbi) +{ + block_t free_seg_blks = 0; + struct free_segmap_info *free_i = FREE_I(sbi); + int j; + + spin_lock(&free_i->segmap_lock); + for (j = 0; j < MAIN_SEGS(sbi); j++) + if (!test_bit(j, free_i->free_segmap)) + free_seg_blks += f2fs_usable_blks_in_seg(sbi, j); + spin_unlock(&free_i->segmap_lock); + + return free_seg_blks; +} + +static inline block_t free_segs_blk_count(struct f2fs_sb_info *sbi) +{ + if (f2fs_sb_has_blkzoned(sbi)) + return free_segs_blk_count_zoned(sbi); + + return free_segments(sbi) << sbi->log_blocks_per_seg; +} + static inline block_t free_user_blocks(struct f2fs_sb_info *sbi) { - if (free_segments(sbi) < overprovision_segments(sbi)) + block_t free_blks, ovp_blks; + + free_blks = free_segs_blk_count(sbi); + ovp_blks = overprovision_segments(sbi) << sbi->log_blocks_per_seg; + + if (free_blks < ovp_blks) return 0; - else - return (free_segments(sbi) - overprovision_segments(sbi)) - << sbi->log_blocks_per_seg; + + return free_blks - ovp_blks; } static inline block_t limit_invalid_user_blocks(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 102df444f623..70384e31788d 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -524,7 +524,7 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, !f2fs_has_inline_xattr(dir)) F2FS_I(dir)->i_inline_xattr_size = 0; - kvfree(backup_dentry); + kfree(backup_dentry); return 0; recover: lock_page(ipage); @@ -535,7 +535,7 @@ recover: set_page_dirty(ipage); f2fs_put_page(ipage, 1); - kvfree(backup_dentry); + kfree(backup_dentry); return err; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 66969ae852b9..657db2fb6739 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -287,11 +287,19 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } + if ((fi->i_flags & F2FS_CASEFOLD_FL) && !f2fs_sb_has_casefold(sbi)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: inode (ino=%lx) has casefold flag, but casefold feature is off", + __func__, inode->i_ino); + return false; + } + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_compression(sbi) && fi->i_flags & F2FS_COMPR_FL && F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_log_cluster_size)) { if (ri->i_compress_algorithm >= COMPRESS_MAX) { + set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported " "compress algorithm: %u, run fsck to fix", __func__, inode->i_ino, @@ -300,6 +308,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) } if (le64_to_cpu(ri->i_compr_blocks) > SECTOR_TO_BLOCK(inode->i_blocks)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: inode (ino=%lx) has inconsistent " "i_compr_blocks:%llu, i_blocks:%llu, run fsck to fix", __func__, inode->i_ino, @@ -309,6 +318,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) } if (ri->i_log_cluster_size < MIN_COMPRESS_LOG_SIZE || ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE) { + set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported " "log cluster size: %u, run fsck to fix", __func__, inode->i_ino, @@ -442,7 +452,8 @@ static int do_read_inode(struct inode *inode) (fi->i_flags & F2FS_COMPR_FL)) { if (F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_log_cluster_size)) { - fi->i_compr_blocks = le64_to_cpu(ri->i_compr_blocks); + atomic_set(&fi->i_compr_blocks, + le64_to_cpu(ri->i_compr_blocks)); fi->i_compress_algorithm = ri->i_compress_algorithm; 
fi->i_log_cluster_size = ri->i_log_cluster_size; fi->i_cluster_size = 1 << fi->i_log_cluster_size; @@ -460,7 +471,7 @@ static int do_read_inode(struct inode *inode) stat_inc_inline_inode(inode); stat_inc_inline_dir(inode); stat_inc_compr_inode(inode); - stat_add_compr_blocks(inode, F2FS_I(inode)->i_compr_blocks); + stat_add_compr_blocks(inode, atomic_read(&fi->i_compr_blocks)); return 0; } @@ -619,7 +630,8 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, i_log_cluster_size)) { ri->i_compr_blocks = - cpu_to_le64(F2FS_I(inode)->i_compr_blocks); + cpu_to_le64(atomic_read( + &F2FS_I(inode)->i_compr_blocks)); ri->i_compress_algorithm = F2FS_I(inode)->i_compress_algorithm; ri->i_log_cluster_size = @@ -768,7 +780,8 @@ no_delete: stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); stat_dec_compr_inode(inode); - stat_sub_compr_blocks(inode, F2FS_I(inode)->i_compr_blocks); + stat_sub_compr_blocks(inode, + atomic_read(&F2FS_I(inode)->i_compr_blocks)); if (likely(!f2fs_cp_error(sbi) && !is_sbi_flag_set(sbi, SBI_CP_DISABLED))) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 84e4bbc1a64d..8fa37d1434de 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -28,6 +28,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) nid_t ino; struct inode *inode; bool nid_free = false; + bool encrypt = false; int xattr_size = 0; int err; @@ -69,13 +70,17 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns, F2FS_DEF_PROJID); + err = fscrypt_prepare_new_inode(dir, inode, &encrypt); + if (err) + goto fail_drop; + err = dquot_initialize(inode); if (err) goto fail_drop; set_inode_flag(inode, FI_NEW_INODE); - if (f2fs_may_encrypt(dir, inode)) + if (encrypt) f2fs_set_encrypted_inode(inode); if (f2fs_sb_has_extra_attr(sbi)) { @@ -707,7 +712,7 @@ out_f2fs_handle_failed_inode: f2fs_handle_failed_inode(inode); out_free_encrypted_link: if (disk_link.name != (unsigned char *)symname) - kvfree(disk_link.name); + kfree(disk_link.name); return err; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index cb1b5b61a1da..d5d8ce077f29 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -109,7 +109,7 @@ static void clear_node_page_dirty(struct page *page) static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid) { - return f2fs_get_meta_page_nofail(sbi, current_nat_addr(sbi, nid)); + return f2fs_get_meta_page(sbi, current_nat_addr(sbi, nid)); } static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) @@ -3105,9 +3105,6 @@ static int init_node_manager(struct f2fs_sb_info *sbi) nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP); version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP); - if (!version_bitmap) - return -EFAULT; - nm_i->nat_bitmap = kmemdup(version_bitmap, nm_i->bitmap_size, GFP_KERNEL); if (!nm_i->nat_bitmap) @@ -3257,7 +3254,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) kvfree(nm_i->nat_bitmap_mir); #endif sbi->nm_info = NULL; - kvfree(nm_i); + kfree(nm_i); } int __init f2fs_create_node_manager_caches(void) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e247a5ef3713..1596502f7375 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -189,7 +189,7 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) f2fs_trace_pid(page); - f2fs_set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE); + f2fs_set_page_private(page, 
ATOMIC_WRITTEN_PAGE); new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); @@ -728,7 +728,7 @@ init_thread: "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { err = PTR_ERR(fcc->f2fs_issue_flush); - kvfree(fcc); + kfree(fcc); SM_I(sbi)->fcc_info = NULL; return err; } @@ -747,7 +747,7 @@ void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) kthread_stop(flush_thread); } if (free) { - kvfree(fcc); + kfree(fcc); SM_I(sbi)->fcc_info = NULL; } } @@ -759,6 +759,9 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi) if (!f2fs_is_multi_device(sbi)) return 0; + if (test_opt(sbi, NOBARRIER)) + return 0; + for (i = 1; i < sbi->s_ndevs; i++) { if (!f2fs_test_bit(i, (char *)&sbi->dirty_device)) continue; @@ -859,20 +862,22 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned short valid_blocks, ckpt_valid_blocks; + unsigned int usable_blocks; if (segno == NULL_SEGNO || IS_CURSEG(sbi, segno)) return; + usable_blocks = f2fs_usable_blks_in_seg(sbi, segno); mutex_lock(&dirty_i->seglist_lock); valid_blocks = get_valid_blocks(sbi, segno, false); ckpt_valid_blocks = get_ckpt_valid_blocks(sbi, segno); if (valid_blocks == 0 && (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) || - ckpt_valid_blocks == sbi->blocks_per_seg)) { + ckpt_valid_blocks == usable_blocks)) { __locate_dirty_segment(sbi, segno, PRE); __remove_dirty_segment(sbi, segno, DIRTY); - } else if (valid_blocks < sbi->blocks_per_seg) { + } else if (valid_blocks < usable_blocks) { __locate_dirty_segment(sbi, segno, DIRTY); } else { /* Recovery routine with SSR needs this */ @@ -915,9 +920,11 @@ block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi) for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { se = get_seg_entry(sbi, segno); if (IS_NODESEG(se->type)) - holes[NODE] += sbi->blocks_per_seg - se->valid_blocks; + holes[NODE] += f2fs_usable_blks_in_seg(sbi, segno) - + se->valid_blocks; else - holes[DATA] += sbi->blocks_per_seg - se->valid_blocks; + holes[DATA] += f2fs_usable_blks_in_seg(sbi, segno) - + se->valid_blocks; } mutex_unlock(&dirty_i->seglist_lock); @@ -1521,7 +1528,7 @@ retry: goto next; if (unlikely(dcc->rbtree_check)) f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, - &dcc->root)); + &dcc->root, false)); blk_start_plug(&plug); list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); @@ -1958,7 +1965,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) mutex_lock(&dirty_i->seglist_lock); for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi)) - __set_test_and_free(sbi, segno); + __set_test_and_free(sbi, segno, false); mutex_unlock(&dirty_i->seglist_lock); } @@ -2101,7 +2108,7 @@ init_thread: "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(dcc->f2fs_issue_discard)) { err = PTR_ERR(dcc->f2fs_issue_discard); - kvfree(dcc); + kfree(dcc); SM_I(sbi)->dcc_info = NULL; return err; } @@ -2125,7 +2132,7 @@ static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) if (unlikely(atomic_read(&dcc->discard_cmd_cnt))) f2fs_issue_discard_timeout(sbi); - kvfree(dcc); + kfree(dcc); SM_I(sbi)->dcc_info = NULL; } @@ -2150,6 +2157,39 @@ static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type, __mark_sit_entry_dirty(sbi, segno); } +static inline unsigned long long get_segment_mtime(struct f2fs_sb_info *sbi, + block_t blkaddr) +{ + unsigned int segno = GET_SEGNO(sbi, blkaddr); + + if (segno == 
NULL_SEGNO) + return 0; + return get_seg_entry(sbi, segno)->mtime; +} + +static void update_segment_mtime(struct f2fs_sb_info *sbi, block_t blkaddr, + unsigned long long old_mtime) +{ + struct seg_entry *se; + unsigned int segno = GET_SEGNO(sbi, blkaddr); + unsigned long long ctime = get_mtime(sbi, false); + unsigned long long mtime = old_mtime ? old_mtime : ctime; + + if (segno == NULL_SEGNO) + return; + + se = get_seg_entry(sbi, segno); + + if (!se->mtime) + se->mtime = mtime; + else + se->mtime = div_u64(se->mtime * se->valid_blocks + mtime, + se->valid_blocks + 1); + + if (ctime > SIT_I(sbi)->max_mtime) + SIT_I(sbi)->max_mtime = ctime; +} + static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) { struct seg_entry *se; @@ -2167,12 +2207,9 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); f2fs_bug_on(sbi, (new_vblocks < 0 || - (new_vblocks > sbi->blocks_per_seg))); + (new_vblocks > f2fs_usable_blks_in_seg(sbi, segno)))); se->valid_blocks = new_vblocks; - se->mtime = get_mtime(sbi, false); - if (se->mtime > SIT_I(sbi)->max_mtime) - SIT_I(sbi)->max_mtime = se->mtime; /* Update valid block bitmap */ if (del > 0) { @@ -2265,6 +2302,7 @@ void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) /* add it into sit main buffer */ down_write(&sit_i->sentry_lock); + update_segment_mtime(sbi, addr, 0); update_sit_entry(sbi, addr, -1); /* add it into dirty seglist */ @@ -2344,7 +2382,9 @@ int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) */ struct page *f2fs_get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno) { - return f2fs_get_meta_page_nofail(sbi, GET_SUM_BLOCK(sbi, segno)); + if (unlikely(f2fs_cp_error(sbi))) + return ERR_PTR(-EIO); + return f2fs_get_meta_page_retry(sbi, GET_SUM_BLOCK(sbi, segno)); } void f2fs_update_meta_page(struct f2fs_sb_info *sbi, @@ -2389,9 +2429,9 @@ static void write_current_sum_page(struct f2fs_sb_info *sbi, f2fs_put_page(page, 1); } -static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) +static int is_next_segment_free(struct f2fs_sb_info *sbi, + struct curseg_info *curseg, int type) { - struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int segno = curseg->segno + 1; struct free_segmap_info *free_i = FREE_I(sbi); @@ -2495,7 +2535,9 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) { struct curseg_info *curseg = CURSEG_I(sbi, type); struct summary_footer *sum_footer; + unsigned short seg_type = curseg->seg_type; + curseg->inited = true; curseg->segno = curseg->next_segno; curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno); curseg->next_blkoff = 0; @@ -2503,24 +2545,36 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) sum_footer = &(curseg->sum_blk->footer); memset(sum_footer, 0, sizeof(struct summary_footer)); - if (IS_DATASEG(type)) + + sanity_check_seg_type(sbi, seg_type); + + if (IS_DATASEG(seg_type)) SET_SUM_TYPE(sum_footer, SUM_TYPE_DATA); - if (IS_NODESEG(type)) + if (IS_NODESEG(seg_type)) SET_SUM_TYPE(sum_footer, SUM_TYPE_NODE); - __set_sit_entry_type(sbi, type, curseg->segno, modified); + __set_sit_entry_type(sbi, seg_type, curseg->segno, modified); } static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) { + struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned short seg_type = curseg->seg_type; + + sanity_check_seg_type(sbi, seg_type); + /* if segs_per_sec is large than 1, we need to keep original policy. 
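update_segment_mtime() above keeps a per-segment average modification time instead of stamping the segment with the newest write: each block folds its mtime into a running mean over the blocks already counted, and the GC path passes the source segment's mtime through as old_mtime so relocated data keeps its age. A minimal sketch of the averaging step (demo types, not kernel code):

#include <stdint.h>
#include <stdio.h>

struct demo_seg {
	uint64_t mtime;		/* running average modification time */
	uint32_t valid_blocks;	/* blocks already folded into the average */
};

/* fold one more block, written (or relocated) with 'mtime', into the mean */
static void demo_update_segment_mtime(struct demo_seg *se, uint64_t mtime)
{
	if (!se->mtime)
		se->mtime = mtime;
	else
		se->mtime = (se->mtime * se->valid_blocks + mtime) /
			    (se->valid_blocks + 1);
}

int main(void)
{
	struct demo_seg se = { .mtime = 1000, .valid_blocks = 3 };

	/* one block written at time 2000 only nudges the average: 1250 */
	demo_update_segment_mtime(&se, 2000);
	printf("new average mtime: %llu\n", (unsigned long long)se.mtime);
	return 0;
}

This per-segment age is what feeds sit_i->dirty_min_mtime/dirty_max_mtime and, through add_victim_entry(), the ATGC age term.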
*/ if (__is_large_section(sbi)) - return CURSEG_I(sbi, type)->segno; + return curseg->segno; + + /* inmem log may not locate on any segment after mount */ + if (!curseg->inited) + return 0; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) return 0; if (test_opt(sbi, NOHEAP) && - (type == CURSEG_HOT_DATA || IS_NODESEG(type))) + (seg_type == CURSEG_HOT_DATA || IS_NODESEG(seg_type))) return 0; if (SIT_I(sbi)->last_victim[ALLOC_NEXT]) @@ -2530,7 +2584,7 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE) return 0; - return CURSEG_I(sbi, type)->segno; + return curseg->segno; } /* @@ -2540,12 +2594,14 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) { struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned short seg_type = curseg->seg_type; unsigned int segno = curseg->segno; int dir = ALLOC_LEFT; - write_sum_page(sbi, curseg->sum_blk, + if (curseg->inited) + write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, segno)); - if (type == CURSEG_WARM_DATA || type == CURSEG_COLD_DATA) + if (seg_type == CURSEG_WARM_DATA || seg_type == CURSEG_COLD_DATA) dir = ALLOC_RIGHT; if (test_opt(sbi, NOHEAP)) @@ -2594,7 +2650,7 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, * This function always allocates a used segment(from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks */ -static void change_curseg(struct f2fs_sb_info *sbi, int type) +static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -2602,8 +2658,10 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type) struct f2fs_summary_block *sum_node; struct page *sum_page; - write_sum_page(sbi, curseg->sum_blk, - GET_SUM_BLOCK(sbi, curseg->segno)); + if (flush) + write_sum_page(sbi, curseg->sum_blk, + GET_SUM_BLOCK(sbi, curseg->segno)); + __set_test_and_inuse(sbi, new_segno); mutex_lock(&dirty_i->seglist_lock); @@ -2616,29 +2674,139 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type) __next_free_blkoff(sbi, curseg, 0); sum_page = f2fs_get_sum_page(sbi, new_segno); - f2fs_bug_on(sbi, IS_ERR(sum_page)); + if (IS_ERR(sum_page)) { + /* GC won't be able to use stale summary pages by cp_error */ + memset(curseg->sum_blk, 0, SUM_ENTRY_SIZE); + return; + } sum_node = (struct f2fs_summary_block *)page_address(sum_page); memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); f2fs_put_page(sum_page, 1); } -static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) +static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, + int alloc_mode, unsigned long long age); + +static void get_atssr_segment(struct f2fs_sb_info *sbi, int type, + int target_type, int alloc_mode, + unsigned long long age) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + + curseg->seg_type = target_type; + + if (get_ssr_segment(sbi, type, alloc_mode, age)) { + struct seg_entry *se = get_seg_entry(sbi, curseg->next_segno); + + curseg->seg_type = se->type; + change_curseg(sbi, type, true); + } else { + /* allocate cold segment by default */ + curseg->seg_type = CURSEG_COLD_DATA; + new_curseg(sbi, type, true); + } + stat_inc_seg_type(sbi, curseg); +} + +static void __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi) +{ + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC); + + if 
(!sbi->am.atgc_enabled) + return; + + down_read(&SM_I(sbi)->curseg_lock); + + mutex_lock(&curseg->curseg_mutex); + down_write(&SIT_I(sbi)->sentry_lock); + + get_atssr_segment(sbi, CURSEG_ALL_DATA_ATGC, CURSEG_COLD_DATA, SSR, 0); + + up_write(&SIT_I(sbi)->sentry_lock); + mutex_unlock(&curseg->curseg_mutex); + + up_read(&SM_I(sbi)->curseg_lock); + +} +void f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi) +{ + __f2fs_init_atgc_curseg(sbi); +} + +static void __f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + + mutex_lock(&curseg->curseg_mutex); + if (!curseg->inited) + goto out; + + if (get_valid_blocks(sbi, curseg->segno, false)) { + write_sum_page(sbi, curseg->sum_blk, + GET_SUM_BLOCK(sbi, curseg->segno)); + } else { + mutex_lock(&DIRTY_I(sbi)->seglist_lock); + __set_test_and_free(sbi, curseg->segno, true); + mutex_unlock(&DIRTY_I(sbi)->seglist_lock); + } +out: + mutex_unlock(&curseg->curseg_mutex); +} + +void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi) +{ + __f2fs_save_inmem_curseg(sbi, CURSEG_COLD_DATA_PINNED); + + if (sbi->am.atgc_enabled) + __f2fs_save_inmem_curseg(sbi, CURSEG_ALL_DATA_ATGC); +} + +static void __f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + + mutex_lock(&curseg->curseg_mutex); + if (!curseg->inited) + goto out; + if (get_valid_blocks(sbi, curseg->segno, false)) + goto out; + + mutex_lock(&DIRTY_I(sbi)->seglist_lock); + __set_test_and_inuse(sbi, curseg->segno); + mutex_unlock(&DIRTY_I(sbi)->seglist_lock); +out: + mutex_unlock(&curseg->curseg_mutex); +} + +void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi) +{ + __f2fs_restore_inmem_curseg(sbi, CURSEG_COLD_DATA_PINNED); + + if (sbi->am.atgc_enabled) + __f2fs_restore_inmem_curseg(sbi, CURSEG_ALL_DATA_ATGC); +} + +static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, + int alloc_mode, unsigned long long age) { struct curseg_info *curseg = CURSEG_I(sbi, type); const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; unsigned segno = NULL_SEGNO; + unsigned short seg_type = curseg->seg_type; int i, cnt; bool reversed = false; + sanity_check_seg_type(sbi, seg_type); + /* f2fs_need_SSR() already forces to do this */ - if (!v_ops->get_victim(sbi, &segno, BG_GC, type, SSR)) { + if (!v_ops->get_victim(sbi, &segno, BG_GC, seg_type, alloc_mode, age)) { curseg->next_segno = segno; return 1; } /* For node segments, let's do SSR more intensively */ - if (IS_NODESEG(type)) { - if (type >= CURSEG_WARM_NODE) { + if (IS_NODESEG(seg_type)) { + if (seg_type >= CURSEG_WARM_NODE) { reversed = true; i = CURSEG_COLD_NODE; } else { @@ -2646,7 +2814,7 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) } cnt = NR_CURSEG_NODE_TYPE; } else { - if (type >= CURSEG_WARM_DATA) { + if (seg_type >= CURSEG_WARM_DATA) { reversed = true; i = CURSEG_COLD_DATA; } else { @@ -2656,9 +2824,9 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) } for (; cnt-- > 0; reversed ? 
i-- : i++) { - if (i == type) + if (i == seg_type) continue; - if (!v_ops->get_victim(sbi, &segno, BG_GC, i, SSR)) { + if (!v_ops->get_victim(sbi, &segno, BG_GC, i, alloc_mode, age)) { curseg->next_segno = segno; return 1; } @@ -2687,13 +2855,15 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, if (force) new_curseg(sbi, type, true); else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && - type == CURSEG_WARM_NODE) + curseg->seg_type == CURSEG_WARM_NODE) new_curseg(sbi, type, false); - else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type) && + else if (curseg->alloc_type == LFS && + is_next_segment_free(sbi, curseg, type) && likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) new_curseg(sbi, type, false); - else if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type)) - change_curseg(sbi, type); + else if (f2fs_need_SSR(sbi) && + get_ssr_segment(sbi, type, SSR, 0)) + change_curseg(sbi, type, true); else new_curseg(sbi, type, false); @@ -2714,8 +2884,8 @@ void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, if (segno < start || segno > end) goto unlock; - if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type)) - change_curseg(sbi, type); + if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type, SSR, 0)) + change_curseg(sbi, type, true); else new_curseg(sbi, type, true); @@ -2738,11 +2908,15 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type) struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int old_segno; + if (!curseg->inited) + goto alloc; + if (!curseg->next_blkoff && !get_valid_blocks(sbi, curseg->segno, false) && !get_ckpt_valid_blocks(sbi, curseg->segno)) return; +alloc: old_segno = curseg->segno; SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true); locate_dirty_segment(sbi, old_segno); @@ -2806,7 +2980,7 @@ next: mutex_lock(&dcc->cmd_lock); if (unlikely(dcc->rbtree_check)) f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, - &dcc->root)); + &dcc->root, false)); dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, NULL, start, @@ -2930,12 +3104,11 @@ out: return err; } -static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) +static bool __has_curseg_space(struct f2fs_sb_info *sbi, + struct curseg_info *curseg) { - struct curseg_info *curseg = CURSEG_I(sbi, type); - if (curseg->next_blkoff < sbi->blocks_per_seg) - return true; - return false; + return curseg->next_blkoff < f2fs_usable_blks_in_seg(sbi, + curseg->segno); } int f2fs_rw_hint_to_seg_type(enum rw_hint hint) @@ -3075,8 +3248,13 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) if (fio->type == DATA) { struct inode *inode = fio->page->mapping->host; - if (is_cold_data(fio->page) || file_is_cold(inode) || - f2fs_compressed_file(inode)) + if (is_cold_data(fio->page)) { + if (fio->sbi->am.atgc_enabled) + return CURSEG_ALL_DATA_ATGC; + else + return CURSEG_COLD_DATA; + } + if (file_is_cold(inode) || f2fs_compressed_file(inode)) return CURSEG_COLD_DATA; if (file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || @@ -3126,27 +3304,25 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); - bool put_pin_sem = false; - - if (type == CURSEG_COLD_DATA) { - /* GC during CURSEG_COLD_DATA_PINNED allocation */ - if (down_read_trylock(&sbi->pin_sem)) { - put_pin_sem = true; - } else { - type = CURSEG_WARM_DATA; - curseg = CURSEG_I(sbi, type); - } - } else if (type == CURSEG_COLD_DATA_PINNED) { - type = 
CURSEG_COLD_DATA; - } + unsigned long long old_mtime; + bool from_gc = (type == CURSEG_ALL_DATA_ATGC); + struct seg_entry *se = NULL; down_read(&SM_I(sbi)->curseg_lock); mutex_lock(&curseg->curseg_mutex); down_write(&sit_i->sentry_lock); + if (from_gc) { + f2fs_bug_on(sbi, GET_SEGNO(sbi, old_blkaddr) == NULL_SEGNO); + se = get_seg_entry(sbi, GET_SEGNO(sbi, old_blkaddr)); + sanity_check_seg_type(sbi, se->type); + f2fs_bug_on(sbi, IS_NODESEG(se->type)); + } *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + f2fs_bug_on(sbi, curseg->next_blkoff >= sbi->blocks_per_seg); + f2fs_wait_discard_bio(sbi, *new_blkaddr); /* @@ -3160,6 +3336,14 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, stat_inc_block_count(sbi, curseg); + if (from_gc) { + old_mtime = get_segment_mtime(sbi, old_blkaddr); + } else { + update_segment_mtime(sbi, old_blkaddr, 0); + old_mtime = 0; + } + update_segment_mtime(sbi, *new_blkaddr, old_mtime); + /* * SIT information should be updated before segment allocation, * since SSR needs latest valid block information. @@ -3168,9 +3352,13 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) update_sit_entry(sbi, old_blkaddr, -1); - if (!__has_curseg_space(sbi, type)) - sit_i->s_ops->allocate_segment(sbi, type, false); - + if (!__has_curseg_space(sbi, curseg)) { + if (from_gc) + get_atssr_segment(sbi, type, se->type, + AT_SSR, se->mtime); + else + sit_i->s_ops->allocate_segment(sbi, type, false); + } /* * segment dirty status should be updated after segment allocation, * so we just need to update status only one time after previous @@ -3204,9 +3392,6 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, mutex_unlock(&curseg->curseg_mutex); up_read(&SM_I(sbi)->curseg_lock); - - if (put_pin_sem) - up_read(&sbi->pin_sem); } static void update_device_state(struct f2fs_io_info *fio) @@ -3355,7 +3540,8 @@ static inline int __f2fs_get_curseg(struct f2fs_sb_info *sbi, void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, - bool recover_curseg, bool recover_newaddr) + bool recover_curseg, bool recover_newaddr, + bool from_gc) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg; @@ -3400,17 +3586,22 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, /* change the current segment */ if (segno != curseg->segno) { curseg->next_segno = segno; - change_curseg(sbi, type); + change_curseg(sbi, type, true); } curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); __add_sum_entry(sbi, type, sum); - if (!recover_curseg || recover_newaddr) + if (!recover_curseg || recover_newaddr) { + if (!from_gc) + update_segment_mtime(sbi, new_blkaddr, 0); update_sit_entry(sbi, new_blkaddr, 1); + } if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) { invalidate_mapping_pages(META_MAPPING(sbi), old_blkaddr, old_blkaddr); + if (!from_gc) + update_segment_mtime(sbi, old_blkaddr, 0); update_sit_entry(sbi, old_blkaddr, -1); } @@ -3422,7 +3613,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (recover_curseg) { if (old_cursegno != curseg->segno) { curseg->next_segno = old_cursegno; - change_curseg(sbi, type); + change_curseg(sbi, type, true); } curseg->next_blkoff = old_blkoff; } @@ -3442,7 +3633,7 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, set_summary(&sum, dn->nid, dn->ofs_in_node, version); f2fs_do_replace_block(sbi, 
&sum, old_addr, new_addr, - recover_curseg, recover_newaddr); + recover_curseg, recover_newaddr, false); f2fs_update_data_blkaddr(dn, new_addr); } @@ -3574,7 +3765,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type - CURSEG_HOT_DATA]); if (__exist_node_summaries(sbi)) - blk_addr = sum_blk_addr(sbi, NR_CURSEG_TYPE, type); + blk_addr = sum_blk_addr(sbi, NR_CURSEG_PERSIST_TYPE, type); else blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type); } else { @@ -3652,8 +3843,9 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) } if (__exist_node_summaries(sbi)) - f2fs_ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_TYPE, type), - NR_CURSEG_TYPE - type, META_CP, true); + f2fs_ra_meta_pages(sbi, + sum_blk_addr(sbi, NR_CURSEG_PERSIST_TYPE, type), + NR_CURSEG_PERSIST_TYPE - type, META_CP, true); for (; type <= CURSEG_COLD_NODE; type++) { err = read_normal_summaries(sbi, type); @@ -3781,7 +3973,7 @@ int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, static struct page *get_current_sit_page(struct f2fs_sb_info *sbi, unsigned int segno) { - return f2fs_get_meta_page_nofail(sbi, current_sit_addr(sbi, segno)); + return f2fs_get_meta_page(sbi, current_sit_addr(sbi, segno)); } static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, @@ -4155,14 +4347,14 @@ static int build_curseg(struct f2fs_sb_info *sbi) struct curseg_info *array; int i; - array = f2fs_kzalloc(sbi, array_size(NR_CURSEG_TYPE, sizeof(*array)), - GFP_KERNEL); + array = f2fs_kzalloc(sbi, array_size(NR_CURSEG_TYPE, + sizeof(*array)), GFP_KERNEL); if (!array) return -ENOMEM; SM_I(sbi)->curseg_array = array; - for (i = 0; i < NR_CURSEG_TYPE; i++) { + for (i = 0; i < NO_CHECK_TYPE; i++) { mutex_init(&array[i].curseg_mutex); array[i].sum_blk = f2fs_kzalloc(sbi, PAGE_SIZE, GFP_KERNEL); if (!array[i].sum_blk) @@ -4172,8 +4364,15 @@ static int build_curseg(struct f2fs_sb_info *sbi) sizeof(struct f2fs_journal), GFP_KERNEL); if (!array[i].journal) return -ENOMEM; + if (i < NR_PERSISTENT_LOG) + array[i].seg_type = CURSEG_HOT_DATA + i; + else if (i == CURSEG_COLD_DATA_PINNED) + array[i].seg_type = CURSEG_COLD_DATA; + else if (i == CURSEG_ALL_DATA_ATGC) + array[i].seg_type = CURSEG_COLD_DATA; array[i].segno = NULL_SEGNO; array[i].next_blkoff = 0; + array[i].inited = false; } return restore_curseg_summaries(sbi); } @@ -4294,9 +4493,12 @@ static void init_free_segmap(struct f2fs_sb_info *sbi) { unsigned int start; int type; + struct seg_entry *sentry; for (start = 0; start < MAIN_SEGS(sbi); start++) { - struct seg_entry *sentry = get_seg_entry(sbi, start); + if (f2fs_usable_blks_in_seg(sbi, start) == 0) + continue; + sentry = get_seg_entry(sbi, start); if (!sentry->valid_blocks) __set_free(sbi, start); else @@ -4316,7 +4518,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct free_segmap_info *free_i = FREE_I(sbi); unsigned int segno = 0, offset = 0, secno; - block_t valid_blocks; + block_t valid_blocks, usable_blks_in_seg; block_t blks_per_sec = BLKS_PER_SEC(sbi); while (1) { @@ -4326,9 +4528,10 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) break; offset = segno + 1; valid_blocks = get_valid_blocks(sbi, segno, false); - if (valid_blocks == sbi->blocks_per_seg || !valid_blocks) + usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno); + if (valid_blocks == usable_blks_in_seg || !valid_blocks) continue; - if (valid_blocks > sbi->blocks_per_seg) { + if (valid_blocks > 
usable_blks_in_seg) { f2fs_bug_on(sbi, 1); continue; } @@ -4408,11 +4611,13 @@ static int sanity_check_curseg(struct f2fs_sb_info *sbi) * In LFS/SSR curseg, .next_blkoff should point to an unused blkaddr; * In LFS curseg, all blkaddr after .next_blkoff should be unused. */ - for (i = 0; i < NO_CHECK_TYPE; i++) { + for (i = 0; i < NR_PERSISTENT_LOG; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); struct seg_entry *se = get_seg_entry(sbi, curseg->segno); unsigned int blkofs = curseg->next_blkoff; + sanity_check_seg_type(sbi, curseg->seg_type); + if (f2fs_test_bit(blkofs, se->cur_valid_map)) goto out; @@ -4637,7 +4842,7 @@ int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi) { int i, ret; - for (i = 0; i < NO_CHECK_TYPE; i++) { + for (i = 0; i < NR_PERSISTENT_LOG; i++) { ret = fix_curseg_write_pointer(sbi, i); if (ret) return ret; @@ -4678,6 +4883,101 @@ int f2fs_check_write_pointer(struct f2fs_sb_info *sbi) return 0; } + +static bool is_conv_zone(struct f2fs_sb_info *sbi, unsigned int zone_idx, + unsigned int dev_idx) +{ + if (!bdev_is_zoned(FDEV(dev_idx).bdev)) + return true; + return !test_bit(zone_idx, FDEV(dev_idx).blkz_seq); +} + +/* Return the zone index in the given device */ +static unsigned int get_zone_idx(struct f2fs_sb_info *sbi, unsigned int secno, + int dev_idx) +{ + block_t sec_start_blkaddr = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, secno)); + + return (sec_start_blkaddr - FDEV(dev_idx).start_blk) >> + sbi->log_blocks_per_blkz; +} + +/* + * Return the usable segments in a section based on the zone's + * corresponding zone capacity. Zone is equal to a section. + */ +static inline unsigned int f2fs_usable_zone_segs_in_sec( + struct f2fs_sb_info *sbi, unsigned int segno) +{ + unsigned int dev_idx, zone_idx, unusable_segs_in_sec; + + dev_idx = f2fs_target_device_index(sbi, START_BLOCK(sbi, segno)); + zone_idx = get_zone_idx(sbi, GET_SEC_FROM_SEG(sbi, segno), dev_idx); + + /* Conventional zone's capacity is always equal to zone size */ + if (is_conv_zone(sbi, zone_idx, dev_idx)) + return sbi->segs_per_sec; + + /* + * If the zone_capacity_blocks array is NULL, then zone capacity + * is equal to the zone size for all zones + */ + if (!FDEV(dev_idx).zone_capacity_blocks) + return sbi->segs_per_sec; + + /* Get the segment count beyond zone capacity block */ + unusable_segs_in_sec = (sbi->blocks_per_blkz - + FDEV(dev_idx).zone_capacity_blocks[zone_idx]) >> + sbi->log_blocks_per_seg; + return sbi->segs_per_sec - unusable_segs_in_sec; +} + +/* + * Return the number of usable blocks in a segment. The number of blocks + * returned is always equal to the number of blocks in a segment for + * segments fully contained within a sequential zone capacity or a + * conventional zone. For segments partially contained in a sequential + * zone capacity, the number of usable blocks up to the zone capacity + * is returned. 0 is returned in all other cases. + */ +static inline unsigned int f2fs_usable_zone_blks_in_seg( + struct f2fs_sb_info *sbi, unsigned int segno) +{ + block_t seg_start, sec_start_blkaddr, sec_cap_blkaddr; + unsigned int zone_idx, dev_idx, secno; + + secno = GET_SEC_FROM_SEG(sbi, segno); + seg_start = START_BLOCK(sbi, segno); + dev_idx = f2fs_target_device_index(sbi, seg_start); + zone_idx = get_zone_idx(sbi, secno, dev_idx); + + /* + * Conventional zone's capacity is always equal to zone size, + * so, blocks per segment is unchanged. 
+ */ + if (is_conv_zone(sbi, zone_idx, dev_idx)) + return sbi->blocks_per_seg; + + if (!FDEV(dev_idx).zone_capacity_blocks) + return sbi->blocks_per_seg; + + sec_start_blkaddr = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, secno)); + sec_cap_blkaddr = sec_start_blkaddr + + FDEV(dev_idx).zone_capacity_blocks[zone_idx]; + + /* + * If segment starts before zone capacity and spans beyond + * zone capacity, then usable blocks are from seg start to + * zone capacity. If the segment starts after the zone capacity, + * then there are no usable blocks. + */ + if (seg_start >= sec_cap_blkaddr) + return 0; + if (seg_start + sbi->blocks_per_seg > sec_cap_blkaddr) + return sec_cap_blkaddr - seg_start; + + return sbi->blocks_per_seg; +} #else int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi) { @@ -4688,7 +4988,36 @@ int f2fs_check_write_pointer(struct f2fs_sb_info *sbi) { return 0; } + +static inline unsigned int f2fs_usable_zone_blks_in_seg(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + return 0; +} + +static inline unsigned int f2fs_usable_zone_segs_in_sec(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + return 0; +} #endif +unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + if (f2fs_sb_has_blkzoned(sbi)) + return f2fs_usable_zone_blks_in_seg(sbi, segno); + + return sbi->blocks_per_seg; +} + +unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + if (f2fs_sb_has_blkzoned(sbi)) + return f2fs_usable_zone_segs_in_sec(sbi, segno); + + return sbi->segs_per_sec; +} /* * Update min, max modified time for cost-benefit GC algorithm @@ -4715,6 +5044,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi) sit_i->min_mtime = mtime; } sit_i->max_mtime = get_mtime(sbi, false); + sit_i->dirty_max_mtime = 0; up_write(&sit_i->sentry_lock); } @@ -4830,7 +5160,7 @@ static void destroy_dirty_segmap(struct f2fs_sb_info *sbi) destroy_victim_secmap(sbi); SM_I(sbi)->dirty_info = NULL; - kvfree(dirty_i); + kfree(dirty_i); } static void destroy_curseg(struct f2fs_sb_info *sbi) @@ -4842,10 +5172,10 @@ static void destroy_curseg(struct f2fs_sb_info *sbi) return; SM_I(sbi)->curseg_array = NULL; for (i = 0; i < NR_CURSEG_TYPE; i++) { - kvfree(array[i].sum_blk); - kvfree(array[i].journal); + kfree(array[i].sum_blk); + kfree(array[i].journal); } - kvfree(array); + kfree(array); } static void destroy_free_segmap(struct f2fs_sb_info *sbi) @@ -4856,7 +5186,7 @@ static void destroy_free_segmap(struct f2fs_sb_info *sbi) SM_I(sbi)->free_info = NULL; kvfree(free_i->free_segmap); kvfree(free_i->free_secmap); - kvfree(free_i); + kfree(free_i); } static void destroy_sit_info(struct f2fs_sb_info *sbi) @@ -4868,7 +5198,7 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) if (sit_i->sentries) kvfree(sit_i->bitmap); - kvfree(sit_i->tmp_map); + kfree(sit_i->tmp_map); kvfree(sit_i->sentries); kvfree(sit_i->sec_entries); @@ -4880,7 +5210,7 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) kvfree(sit_i->sit_bitmap_mir); kvfree(sit_i->invalid_segmap); #endif - kvfree(sit_i); + kfree(sit_i); } void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi) @@ -4896,7 +5226,7 @@ void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi) destroy_free_segmap(sbi); destroy_sit_info(sbi); sbi->sm_info = NULL; - kvfree(sm_info); + kfree(sm_info); } int __init f2fs_create_segment_manager_caches(void) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 752b177073b2..e81eb0748e2a 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -16,13 +16,20 
@@ #define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB in maximum */ #define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */ +#define F2FS_MIN_META_SEGMENTS 8 /* SB + 2 (CP + SIT + NAT) + SSA */ /* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) ((segno) - (free_i)->start_segno) #define GET_R2L_SEGNO(free_i, segno) ((segno) + (free_i)->start_segno) #define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA) -#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE) +#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE && (t) <= CURSEG_COLD_NODE) + +static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, + unsigned short seg_type) +{ + f2fs_bug_on(sbi, seg_type >= NR_PERSISTENT_LOG); +} #define IS_HOT(t) ((t) == CURSEG_HOT_NODE || (t) == CURSEG_HOT_DATA) #define IS_WARM(t) ((t) == CURSEG_WARM_NODE || (t) == CURSEG_WARM_DATA) @@ -34,7 +41,9 @@ ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ - ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) + ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC)->segno)) #define IS_CURSEC(sbi, secno) \ (((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ @@ -48,7 +57,11 @@ ((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ (sbi)->segs_per_sec) || \ ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ - (sbi)->segs_per_sec)) \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC)->segno / \ + (sbi)->segs_per_sec)) #define MAIN_BLKADDR(sbi) \ (SM_I(sbi) ? SM_I(sbi)->main_blkaddr : \ @@ -132,20 +145,25 @@ enum { * In the victim_sel_policy->alloc_mode, there are two block allocation modes. * LFS writes data sequentially with cleaning operations. * SSR (Slack Space Recycle) reuses obsolete space without cleaning operations. + * AT_SSR (Age Threshold based Slack Space Recycle) merges fragments into + * fragmented segment which has similar aging degree. */ enum { LFS = 0, - SSR + SSR, + AT_SSR, }; /* * In the victim_sel_policy->gc_mode, there are two gc, aka cleaning, modes. * GC_CB is based on cost-benefit algorithm. * GC_GREEDY is based on greedy algorithm. + * GC_AT is based on age-threshold algorithm. */ enum { GC_CB = 0, GC_GREEDY, + GC_AT, ALLOC_NEXT, FLUSH_DEVICE, MAX_GC_POLICY, @@ -174,7 +192,10 @@ struct victim_sel_policy { unsigned int offset; /* last scanned bitmap offset */ unsigned int ofs_unit; /* bitmap search unit */ unsigned int min_cost; /* minimum cost */ + unsigned long long oldest_age; /* oldest age of segments having the same min cost */ unsigned int min_segno; /* segment # having min. cost */ + unsigned long long age; /* mtime of GCed section*/ + unsigned long long age_threshold;/* age threshold */ }; struct seg_entry { @@ -240,6 +261,8 @@ struct sit_info { unsigned long long mounted_time; /* mount time */ unsigned long long min_mtime; /* min. modification time */ unsigned long long max_mtime; /* max. 
modification time */ + unsigned long long dirty_min_mtime; /* rerange candidates in GC_AT */ + unsigned long long dirty_max_mtime; /* rerange candidates in GC_AT */ unsigned int last_victim[MAX_GC_POLICY]; /* last victim segment # */ }; @@ -278,7 +301,7 @@ struct dirty_seglist_info { /* victim selection function for cleaning and SSR */ struct victim_selection { int (*get_victim)(struct f2fs_sb_info *, unsigned int *, - int, int, char); + int, int, char, unsigned long long); }; /* for active log information */ @@ -288,10 +311,12 @@ struct curseg_info { struct rw_semaphore journal_rwsem; /* protect journal area */ struct f2fs_journal *journal; /* cached journal info */ unsigned char alloc_type; /* current allocation type */ + unsigned short seg_type; /* segment type like CURSEG_XXX_TYPE */ unsigned int segno; /* current segment number */ unsigned short next_blkoff; /* next block offset to write */ unsigned int zone; /* current zone number */ unsigned int next_segno; /* preallocated segment */ + bool inited; /* indicate inmem log is inited */ }; struct sit_entry_set { @@ -305,8 +330,6 @@ struct sit_entry_set { */ static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type) { - if (type == CURSEG_COLD_DATA_PINNED) - type = CURSEG_COLD_DATA; return (struct curseg_info *)(SM_I(sbi)->curseg_array + type); } @@ -411,6 +434,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; + unsigned int usable_segs = f2fs_usable_segs_in_sec(sbi, segno); spin_lock(&free_i->segmap_lock); clear_bit(segno, free_i->free_segmap); @@ -418,7 +442,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) next = find_next_bit(free_i->free_segmap, start_segno + sbi->segs_per_sec, start_segno); - if (next >= start_segno + sbi->segs_per_sec) { + if (next >= start_segno + usable_segs) { clear_bit(secno, free_i->free_secmap); free_i->free_sections++; } @@ -438,22 +462,23 @@ static inline void __set_inuse(struct f2fs_sb_info *sbi, } static inline void __set_test_and_free(struct f2fs_sb_info *sbi, - unsigned int segno) + unsigned int segno, bool inmem) { struct free_segmap_info *free_i = FREE_I(sbi); unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; + unsigned int usable_segs = f2fs_usable_segs_in_sec(sbi, segno); spin_lock(&free_i->segmap_lock); if (test_and_clear_bit(segno, free_i->free_segmap)) { free_i->free_segments++; - if (IS_CURSEC(sbi, secno)) + if (!inmem && IS_CURSEC(sbi, secno)) goto skip_free; next = find_next_bit(free_i->free_segmap, start_segno + sbi->segs_per_sec, start_segno); - if (next >= start_segno + sbi->segs_per_sec) { + if (next >= start_segno + usable_segs) { if (test_and_clear_bit(secno, free_i->free_secmap)) free_i->free_sections++; } @@ -500,7 +525,7 @@ static inline unsigned int free_segments(struct f2fs_sb_info *sbi) return FREE_I(sbi)->free_segments; } -static inline int reserved_segments(struct f2fs_sb_info *sbi) +static inline unsigned int reserved_segments(struct f2fs_sb_info *sbi) { return SM_I(sbi)->reserved_segments; } @@ -532,7 +557,7 @@ static inline int overprovision_segments(struct f2fs_sb_info *sbi) static inline int reserved_sections(struct f2fs_sb_info *sbi) { - return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi)); + return GET_SEC_FROM_SEG(sbi, reserved_segments(sbi)); } static inline bool 
has_curseg_enough_space(struct f2fs_sb_info *sbi) @@ -546,8 +571,8 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi) /* check current node segment */ for (i = CURSEG_HOT_NODE; i <= CURSEG_COLD_NODE; i++) { segno = CURSEG_I(sbi, i)->segno; - left_blocks = sbi->blocks_per_seg - - get_seg_entry(sbi, segno)->ckpt_valid_blocks; + left_blocks = f2fs_usable_blks_in_seg(sbi, segno) - + get_seg_entry(sbi, segno)->ckpt_valid_blocks; if (node_blocks > left_blocks) return false; @@ -555,7 +580,7 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi) /* check current data segment */ segno = CURSEG_I(sbi, CURSEG_HOT_DATA)->segno; - left_blocks = sbi->blocks_per_seg - + left_blocks = f2fs_usable_blks_in_seg(sbi, segno) - get_seg_entry(sbi, segno)->ckpt_valid_blocks; if (dent_blocks > left_blocks) return false; @@ -677,21 +702,22 @@ static inline int check_block_count(struct f2fs_sb_info *sbi, bool is_valid = test_bit_le(0, raw_sit->valid_map) ? true : false; int valid_blocks = 0; int cur_pos = 0, next_pos; + unsigned int usable_blks_per_seg = f2fs_usable_blks_in_seg(sbi, segno); /* check bitmap with valid block count */ do { if (is_valid) { next_pos = find_next_zero_bit_le(&raw_sit->valid_map, - sbi->blocks_per_seg, + usable_blks_per_seg, cur_pos); valid_blocks += next_pos - cur_pos; } else next_pos = find_next_bit_le(&raw_sit->valid_map, - sbi->blocks_per_seg, + usable_blks_per_seg, cur_pos); cur_pos = next_pos; is_valid = !is_valid; - } while (cur_pos < sbi->blocks_per_seg); + } while (cur_pos < usable_blks_per_seg); if (unlikely(GET_SIT_VBLOCKS(raw_sit) != valid_blocks)) { f2fs_err(sbi, "Mismatch valid blocks %d vs. %d", @@ -700,8 +726,13 @@ static inline int check_block_count(struct f2fs_sb_info *sbi, return -EFSCORRUPTED; } + if (usable_blks_per_seg < sbi->blocks_per_seg) + f2fs_bug_on(sbi, find_next_bit_le(&raw_sit->valid_map, + sbi->blocks_per_seg, + usable_blks_per_seg) != sbi->blocks_per_seg); + /* check segment usage, and check boundary of a given segment number */ - if (unlikely(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg + if (unlikely(GET_SIT_VBLOCKS(raw_sit) > usable_blks_per_seg || segno > TOTAL_SEGS(sbi) - 1)) { f2fs_err(sbi, "Wrong valid blocks %d or segno %u", GET_SIT_VBLOCKS(raw_sit), segno); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index dfa072fa8081..0c958fed3392 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -146,6 +146,7 @@ enum { Opt_compress_algorithm, Opt_compress_log_size, Opt_compress_extension, + Opt_atgc, Opt_err, }; @@ -213,6 +214,7 @@ static match_table_t f2fs_tokens = { {Opt_compress_algorithm, "compress_algorithm=%s"}, {Opt_compress_log_size, "compress_log_size=%u"}, {Opt_compress_extension, "compress_extension=%s"}, + {Opt_atgc, "atgc"}, {Opt_err, NULL}, }; @@ -433,12 +435,12 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb, * needed to allow it to be set or changed during remount. We do allow * it to be specified during remount, but only if there is no change. 
*/ - if (is_remount && !F2FS_OPTION(sbi).dummy_enc_ctx.ctx) { + if (is_remount && !F2FS_OPTION(sbi).dummy_enc_policy.policy) { f2fs_warn(sbi, "Can't set test_dummy_encryption on remount"); return -EINVAL; } err = fscrypt_set_test_dummy_encryption( - sb, arg, &F2FS_OPTION(sbi).dummy_enc_ctx); + sb, arg->from, &F2FS_OPTION(sbi).dummy_enc_policy); if (err) { if (err == -EEXIST) f2fs_warn(sbi, @@ -580,7 +582,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) case Opt_active_logs: if (args->from && match_int(args, &arg)) return -EINVAL; - if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE) + if (arg != 2 && arg != 4 && + arg != NR_CURSEG_PERSIST_TYPE) return -EINVAL; F2FS_OPTION(sbi).active_logs = arg; break; @@ -868,8 +871,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) #ifdef CONFIG_F2FS_FS_COMPRESSION case Opt_compress_algorithm: if (!f2fs_sb_has_compression(sbi)) { - f2fs_err(sbi, "Compression feature if off"); - return -EINVAL; + f2fs_info(sbi, "Image doesn't support compression"); + break; } name = match_strdup(&args[0]); if (!name) @@ -894,8 +897,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) break; case Opt_compress_log_size: if (!f2fs_sb_has_compression(sbi)) { - f2fs_err(sbi, "Compression feature is off"); - return -EINVAL; + f2fs_info(sbi, "Image doesn't support compression"); + break; } if (args->from && match_int(args, &arg)) return -EINVAL; @@ -909,8 +912,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) break; case Opt_compress_extension: if (!f2fs_sb_has_compression(sbi)) { - f2fs_err(sbi, "Compression feature is off"); - return -EINVAL; + f2fs_info(sbi, "Image doesn't support compression"); + break; } name = match_strdup(&args[0]); if (!name) @@ -938,6 +941,9 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) f2fs_info(sbi, "compression options not supported"); break; #endif + case Opt_atgc: + set_opt(sbi, ATGC); + break; default: f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", p); @@ -964,6 +970,17 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) return -EINVAL; } #endif + /* + * The BLKZONED feature indicates that the drive was formatted with + * zone alignment optimization. This is optional for host-aware + * devices, but mandatory for host-managed zoned block devices. + */ +#ifndef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_has_blkzoned(sbi)) { + f2fs_err(sbi, "Zoned block device support is not enabled"); + return -EINVAL; + } +#endif if (F2FS_IO_SIZE_BITS(sbi) && !f2fs_lfs_mode(sbi)) { f2fs_err(sbi, "Should set mode=lfs with %uKB-sized IO", @@ -1001,7 +1018,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) } /* Not pass down write hints if the number of active logs is lesser - * than NR_CURSEG_TYPE. + * than NR_CURSEG_PERSIST_TYPE. 
*/ if (F2FS_OPTION(sbi).active_logs != NR_CURSEG_TYPE) F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; @@ -1020,6 +1037,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) /* Initialize f2fs-specific inode info */ atomic_set(&fi->dirty_pages, 0); + atomic_set(&fi->i_compr_blocks, 0); init_rwsem(&fi->i_sem); spin_lock_init(&fi->i_size_lock); INIT_LIST_HEAD(&fi->dirty_list); @@ -1184,6 +1202,7 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) blkdev_put(FDEV(i).bdev, FMODE_EXCL); #ifdef CONFIG_BLK_DEV_ZONED kvfree(FDEV(i).blkz_seq); + kfree(FDEV(i).zone_capacity_blocks); #endif } kvfree(sbi->devs); @@ -1269,18 +1288,19 @@ static void f2fs_put_super(struct super_block *sb) kfree(sbi->raw_super); destroy_device_list(sbi); + f2fs_destroy_page_array_cache(sbi); f2fs_destroy_xattr_caches(sbi); mempool_destroy(sbi->write_io_dummy); #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) kfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif - fscrypt_free_dummy_context(&F2FS_OPTION(sbi).dummy_enc_ctx); + fscrypt_free_dummy_policy(&F2FS_OPTION(sbi).dummy_enc_policy); destroy_percpu_info(sbi); for (i = 0; i < NR_PAGE_TYPE; i++) kvfree(sbi->write_io[i]); #ifdef CONFIG_UNICODE - utf8_unload(sbi->s_encoding); + utf8_unload(sb->s_encoding); #endif kfree(sbi); } @@ -1634,13 +1654,16 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) #ifdef CONFIG_F2FS_FS_COMPRESSION f2fs_show_compress_options(seq, sbi->sb); #endif + + if (test_opt(sbi, ATGC)) + seq_puts(seq, ",atgc"); return 0; } static void default_options(struct f2fs_sb_info *sbi) { /* init some FS parameters */ - F2FS_OPTION(sbi).active_logs = NR_CURSEG_TYPE; + F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE; F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; @@ -1763,6 +1786,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); bool disable_checkpoint = test_opt(sbi, DISABLE_CHECKPOINT); bool no_io_align = !F2FS_IO_ALIGNED(sbi); + bool no_atgc = !test_opt(sbi, ATGC); bool checkpoint_changed; #ifdef CONFIG_QUOTA int i, j; @@ -1835,6 +1859,13 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } } #endif + /* disallow enable atgc dynamically */ + if (no_atgc == !!test_opt(sbi, ATGC)) { + err = -EINVAL; + f2fs_warn(sbi, "switch atgc option is not allowed"); + goto restore_opts; + } + /* disallow enable/disable extent_cache dynamically */ if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) { err = -EINVAL; @@ -2482,10 +2513,9 @@ static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, ctx, len, fs_data, XATTR_CREATE); } -static const union fscrypt_context * -f2fs_get_dummy_context(struct super_block *sb) +static const union fscrypt_policy *f2fs_get_dummy_policy(struct super_block *sb) { - return F2FS_OPTION(F2FS_SB(sb)).dummy_enc_ctx.ctx; + return F2FS_OPTION(F2FS_SB(sb)).dummy_enc_policy.policy; } static bool f2fs_has_stable_inodes(struct super_block *sb) @@ -2523,7 +2553,7 @@ static const struct fscrypt_operations f2fs_cryptops = { .key_prefix = "f2fs:", .get_context = f2fs_get_context, .set_context = f2fs_set_context, - .get_dummy_context = f2fs_get_dummy_context, + .get_dummy_policy = f2fs_get_dummy_policy, .empty_dir = f2fs_empty_dir, .max_namelen = F2FS_NAME_LEN, .has_stable_inodes = f2fs_has_stable_inodes, @@ -2680,10 +2710,8 @@ static inline bool 
sanity_check_area_boundary(struct f2fs_sb_info *sbi, } if (main_end_blkaddr > seg_end_blkaddr) { - f2fs_info(sbi, "Wrong MAIN_AREA boundary, start(%u) end(%u) block(%u)", - main_blkaddr, - segment0_blkaddr + - (segment_count << log_blocks_per_seg), + f2fs_info(sbi, "Wrong MAIN_AREA boundary, start(%u) end(%llu) block(%u)", + main_blkaddr, seg_end_blkaddr, segment_count_main << log_blocks_per_seg); return true; } else if (main_end_blkaddr < seg_end_blkaddr) { @@ -2701,10 +2729,8 @@ static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, err = __f2fs_commit_super(bh, NULL); res = err ? "failed" : "done"; } - f2fs_info(sbi, "Fix alignment : %s, start(%u) end(%u) block(%u)", - res, main_blkaddr, - segment0_blkaddr + - (segment_count << log_blocks_per_seg), + f2fs_info(sbi, "Fix alignment : %s, start(%u) end(%llu) block(%u)", + res, main_blkaddr, seg_end_blkaddr, segment_count_main << log_blocks_per_seg); if (err) return true; @@ -2715,7 +2741,7 @@ static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, static int sanity_check_raw_super(struct f2fs_sb_info *sbi, struct buffer_head *bh) { - block_t segment_count, segs_per_sec, secs_per_zone; + block_t segment_count, segs_per_sec, secs_per_zone, segment_count_main; block_t total_sections, blocks_per_seg; struct f2fs_super_block *raw_super = (struct f2fs_super_block *) (bh->b_data + F2FS_SUPER_OFFSET); @@ -2786,6 +2812,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, } segment_count = le32_to_cpu(raw_super->segment_count); + segment_count_main = le32_to_cpu(raw_super->segment_count_main); segs_per_sec = le32_to_cpu(raw_super->segs_per_sec); secs_per_zone = le32_to_cpu(raw_super->secs_per_zone); total_sections = le32_to_cpu(raw_super->section_count); @@ -2799,14 +2826,19 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return -EFSCORRUPTED; } - if (total_sections > segment_count || - total_sections < F2FS_MIN_SEGMENTS || + if (total_sections > segment_count_main || total_sections < 1 || segs_per_sec > segment_count || !segs_per_sec) { f2fs_info(sbi, "Invalid segment/section count (%u, %u x %u)", segment_count, total_sections, segs_per_sec); return -EFSCORRUPTED; } + if (segment_count_main != total_sections * segs_per_sec) { + f2fs_info(sbi, "Invalid segment/section count (%u != %u * %u)", + segment_count_main, total_sections, segs_per_sec); + return -EFSCORRUPTED; + } + if ((segment_count / segs_per_sec) < total_sections) { f2fs_info(sbi, "Small segment_count (%u < %u * %u)", segment_count, segs_per_sec, total_sections); @@ -2832,6 +2864,12 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, segment_count, dev_seg_count); return -EFSCORRUPTED; } + } else { + if (__F2FS_HAS_FEATURE(raw_super, F2FS_FEATURE_BLKZONED) && + !bdev_is_zoned(sbi->sb->s_bdev)) { + f2fs_info(sbi, "Zoned block device path is missing"); + return -EFSCORRUPTED; + } } if (secs_per_zone > total_sections || !secs_per_zone) { @@ -2907,7 +2945,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count); - if (unlikely(fsmeta < F2FS_MIN_SEGMENTS || + if (unlikely(fsmeta < F2FS_MIN_META_SEGMENTS || ovp_segments == 0 || reserved_segments == 0)) { f2fs_err(sbi, "Wrong layout: check mkfs.f2fs version"); return 1; @@ -2995,7 +3033,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) cp_payload = __cp_payload(sbi); if (cp_pack_start_sum < cp_payload + 1 || cp_pack_start_sum > blocks_per_seg - 1 - - 
NR_CURSEG_TYPE) { + NR_CURSEG_PERSIST_TYPE) { f2fs_err(sbi, "Wrong cp_pack_start_sum: %u", cp_pack_start_sum); return 1; @@ -3088,13 +3126,26 @@ static int init_percpu_info(struct f2fs_sb_info *sbi) } #ifdef CONFIG_BLK_DEV_ZONED + +struct f2fs_report_zones_args { + struct f2fs_dev_info *dev; + bool zone_cap_mismatch; +}; + static int f2fs_report_zone_cb(struct blk_zone *zone, unsigned int idx, - void *data) + void *data) { - struct f2fs_dev_info *dev = data; + struct f2fs_report_zones_args *rz_args = data; + + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + return 0; + + set_bit(idx, rz_args->dev->blkz_seq); + rz_args->dev->zone_capacity_blocks[idx] = zone->capacity >> + F2FS_LOG_SECTORS_PER_BLOCK; + if (zone->len != zone->capacity && !rz_args->zone_cap_mismatch) + rz_args->zone_cap_mismatch = true; - if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) - set_bit(idx, dev->blkz_seq); return 0; } @@ -3102,6 +3153,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) { struct block_device *bdev = FDEV(devi).bdev; sector_t nr_sectors = bdev->bd_part->nr_sects; + struct f2fs_report_zones_args rep_zone_arg; int ret; if (!f2fs_sb_has_blkzoned(sbi)) @@ -3127,12 +3179,26 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) if (!FDEV(devi).blkz_seq) return -ENOMEM; - /* Get block zones type */ + /* Get block zones type and zone-capacity */ + FDEV(devi).zone_capacity_blocks = f2fs_kzalloc(sbi, + FDEV(devi).nr_blkz * sizeof(block_t), + GFP_KERNEL); + if (!FDEV(devi).zone_capacity_blocks) + return -ENOMEM; + + rep_zone_arg.dev = &FDEV(devi); + rep_zone_arg.zone_cap_mismatch = false; + ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES, f2fs_report_zone_cb, - &FDEV(devi)); + &rep_zone_arg); if (ret < 0) return ret; + if (!rep_zone_arg.zone_cap_mismatch) { + kfree(FDEV(devi).zone_capacity_blocks); + FDEV(devi).zone_capacity_blocks = NULL; + } + return 0; } #endif @@ -3329,7 +3395,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) static int f2fs_setup_casefold(struct f2fs_sb_info *sbi) { #ifdef CONFIG_UNICODE - if (f2fs_sb_has_casefold(sbi) && !sbi->s_encoding) { + if (f2fs_sb_has_casefold(sbi) && !sbi->sb->s_encoding) { const struct f2fs_sb_encodings *encoding_info; struct unicode_map *encoding; __u16 encoding_flags; @@ -3360,8 +3426,8 @@ static int f2fs_setup_casefold(struct f2fs_sb_info *sbi) "%s-%s with flags 0x%hx", encoding_info->name, encoding_info->version?:"\b", encoding_flags); - sbi->s_encoding = encoding; - sbi->s_encoding_flags = encoding_flags; + sbi->sb->s_encoding = encoding; + sbi->sb->s_encoding_flags = encoding_flags; sbi->sb->s_d_op = &f2fs_dentry_ops; } #else @@ -3440,18 +3506,6 @@ try_onemore: sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid, sizeof(raw_super->uuid)); - /* - * The BLKZONED feature indicates that the drive was formatted with - * zone alignment optimization. This is optional for host-aware - * devices, but mandatory for host-managed zoned block devices. 
- */ -#ifndef CONFIG_BLK_DEV_ZONED - if (f2fs_sb_has_blkzoned(sbi)) { - f2fs_err(sbi, "Zoned block device support is not enabled"); - err = -EOPNOTSUPP; - goto free_sb_buf; - } -#endif default_options(sbi); /* parse mount options */ options = kstrdup((const char *)data, GFP_KERNEL); @@ -3566,13 +3620,16 @@ try_onemore: err = f2fs_init_xattr_caches(sbi); if (err) goto free_io_dummy; + err = f2fs_init_page_array_cache(sbi); + if (err) + goto free_xattr_cache; /* get an inode for meta space */ sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); if (IS_ERR(sbi->meta_inode)) { f2fs_err(sbi, "Failed to read F2FS meta data inode"); err = PTR_ERR(sbi->meta_inode); - goto free_xattr_cache; + goto free_page_array_cache; } err = f2fs_get_valid_checkpoint(sbi); @@ -3762,6 +3819,8 @@ try_onemore: } reset_checkpoint: + f2fs_init_inmem_curseg(sbi); + /* f2fs_recover_fsync_data() cleared this already */ clear_sbi_flag(sbi, SBI_POR_DOING); @@ -3846,6 +3905,8 @@ free_meta_inode: make_bad_inode(sbi->meta_inode); iput(sbi->meta_inode); sbi->meta_inode = NULL; +free_page_array_cache: + f2fs_destroy_page_array_cache(sbi); free_xattr_cache: f2fs_destroy_xattr_caches(sbi); free_io_dummy: @@ -3857,14 +3918,14 @@ free_bio_info: kvfree(sbi->write_io[i]); #ifdef CONFIG_UNICODE - utf8_unload(sbi->s_encoding); + utf8_unload(sb->s_encoding); #endif free_options: #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) kfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif - fscrypt_free_dummy_context(&F2FS_OPTION(sbi).dummy_enc_ctx); + fscrypt_free_dummy_policy(&F2FS_OPTION(sbi).dummy_enc_policy); kvfree(options); free_sb_buf: kfree(raw_super); @@ -3967,9 +4028,12 @@ static int __init init_f2fs_fs(void) err = f2fs_create_extent_cache(); if (err) goto free_checkpoint_caches; - err = f2fs_init_sysfs(); + err = f2fs_create_garbage_collection_cache(); if (err) goto free_extent_cache; + err = f2fs_init_sysfs(); + if (err) + goto free_garbage_collection_cache; err = register_shrinker(&f2fs_shrinker_info); if (err) goto free_sysfs; @@ -3989,7 +4053,12 @@ static int __init init_f2fs_fs(void) err = f2fs_init_compress_mempool(); if (err) goto free_bioset; + err = f2fs_init_compress_cache(); + if (err) + goto free_compress_mempool; return 0; +free_compress_mempool: + f2fs_destroy_compress_mempool(); free_bioset: f2fs_destroy_bioset(); free_bio_enrty_cache: @@ -4003,6 +4072,8 @@ free_shrinker: unregister_shrinker(&f2fs_shrinker_info); free_sysfs: f2fs_exit_sysfs(); +free_garbage_collection_cache: + f2fs_destroy_garbage_collection_cache(); free_extent_cache: f2fs_destroy_extent_cache(); free_checkpoint_caches: @@ -4019,6 +4090,7 @@ fail: static void __exit exit_f2fs_fs(void) { + f2fs_destroy_compress_cache(); f2fs_destroy_compress_mempool(); f2fs_destroy_bioset(); f2fs_destroy_bio_entry_cache(); @@ -4027,6 +4099,7 @@ static void __exit exit_f2fs_fs(void) unregister_filesystem(&f2fs_fs_type); unregister_shrinker(&f2fs_shrinker_info); f2fs_exit_sysfs(); + f2fs_destroy_garbage_collection_cache(); f2fs_destroy_extent_cache(); f2fs_destroy_checkpoint_caches(); f2fs_destroy_segment_manager_caches(); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 88ed9969cc86..ec77ccfea923 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -176,12 +176,14 @@ static ssize_t encoding_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { #ifdef CONFIG_UNICODE + struct super_block *sb = sbi->sb; + if (f2fs_sb_has_casefold(sbi)) return snprintf(buf, PAGE_SIZE, "%s (%d.%d.%d)\n", - sbi->s_encoding->charset, - (sbi->s_encoding->version >> 16) & 0xff, - 
(sbi->s_encoding->version >> 8) & 0xff, - sbi->s_encoding->version & 0xff); + sb->s_encoding->charset, + (sb->s_encoding->version >> 16) & 0xff, + (sb->s_encoding->version >> 8) & 0xff, + sb->s_encoding->version & 0xff); #endif return sprintf(buf, "(none)"); } @@ -375,12 +377,17 @@ out: return count; } if (!strcmp(a->attr.name, "gc_idle")) { - if (t == GC_IDLE_CB) + if (t == GC_IDLE_CB) { sbi->gc_mode = GC_IDLE_CB; - else if (t == GC_IDLE_GREEDY) + } else if (t == GC_IDLE_GREEDY) { sbi->gc_mode = GC_IDLE_GREEDY; - else + } else if (t == GC_IDLE_AT) { + if (!sbi->am.atgc_enabled) + return -EINVAL; + sbi->gc_mode = GC_AT; + } else { sbi->gc_mode = GC_NORMAL; + } return count; } @@ -968,4 +975,5 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) } kobject_del(&sbi->s_kobj); kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); } diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index 9eb0dba851e8..054ec852b5ea 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -228,6 +228,7 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) { + DEFINE_READAHEAD(ractl, NULL, inode->i_mapping, index); struct page *page; index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT; @@ -237,8 +238,7 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode, if (page) put_page(page); else if (num_ra_pages > 1) - page_cache_readahead_unbounded(inode->i_mapping, NULL, - index, num_ra_pages, 0); + page_cache_ra_unbounded(&ractl, num_ra_pages, 0); page = read_mapping_page(inode->i_mapping, index, NULL); } return page; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 1b0736ce0918..65afcc3cc68a 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -39,7 +39,7 @@ static void xattr_free(struct f2fs_sb_info *sbi, void *xattr_addr, if (is_inline) kmem_cache_free(sbi->inline_xattr_slab, xattr_addr); else - kvfree(xattr_addr); + kfree(xattr_addr); } static int f2fs_xattr_generic_get(const struct xattr_handler *handler, @@ -425,7 +425,7 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, *base_addr = txattr_addr; return 0; fail: - kvfree(txattr_addr); + kfree(txattr_addr); return err; } @@ -610,7 +610,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) } error = buffer_size - rest; cleanup: - kvfree(base_addr); + kfree(base_addr); return error; } @@ -750,7 +750,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (!error && S_ISDIR(inode->i_mode)) set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); exit: - kvfree(base_addr); + kfree(base_addr); return error; } diff --git a/fs/file.c b/fs/file.c index 21c0893f2f1d..4559b5fec3bd 100644 --- a/fs/file.c +++ b/fs/file.c @@ -21,6 +21,7 @@ #include <linux/rcupdate.h> #include <linux/close_range.h> #include <net/sock.h> +#include <linux/io_uring.h> unsigned int sysctl_nr_open __read_mostly = 1024*1024; unsigned int sysctl_nr_open_min = BITS_PER_LONG; @@ -452,6 +453,7 @@ void exit_files(struct task_struct *tsk) struct files_struct * files = tsk->files; if (files) { + io_uring_files_cancel(files); task_lock(tsk); tsk->files = NULL; task_unlock(tsk); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 58b27e4070a3..e6005c78bfa9 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2321,7 +2321,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) wb = locked_inode_to_wb_and_lock_list(inode); - WARN(bdi_cap_writeback_dirty(wb->bdi) && + WARN((wb->bdi->capabilities & BDI_CAP_WRITEBACK) && !test_bit(WB_registered, 
&wb->state), "bdi-%s not registered\n", bdi_dev_name(wb->bdi)); @@ -2346,7 +2346,8 @@ void __mark_inode_dirty(struct inode *inode, int flags) * to make sure background write-back happens * later. */ - if (bdi_cap_writeback_dirty(wb->bdi) && wakeup_bdi) + if (wakeup_bdi && + (wb->bdi->capabilities & BDI_CAP_WRITEBACK)) wb_wakeup_delayed(wb); return; } @@ -2581,7 +2582,7 @@ int write_inode_now(struct inode *inode, int sync) .range_end = LLONG_MAX, }; - if (!mapping_cap_writeback_dirty(inode->i_mapping)) + if (!mapping_can_writeback(inode->i_mapping)) wbc.nr_to_write = 0; might_sleep(); diff --git a/fs/fs_parser.c b/fs/fs_parser.c index ab53e42a874a..68b0148f4bb8 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -189,7 +189,7 @@ out: } EXPORT_SYMBOL(fs_lookup_param); -int fs_param_bad_value(struct p_log *log, struct fs_parameter *param) +static int fs_param_bad_value(struct p_log *log, struct fs_parameter *param) { return inval_plog(log, "Bad value for '%s'", param->key); } diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 774b2618018a..40ce9a1c12e5 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -8,7 +8,7 @@ config FUSE_FS There's also a companion library: libfuse2. This library is available from the FUSE homepage: - <http://fuse.sourceforge.net/> + <https://github.com/libfuse/> although chances are your distribution already has that library installed if you've installed the "fuse" package itself. @@ -38,3 +38,17 @@ config VIRTIO_FS If you want to share files between guests or with the host, answer Y or M. + +config FUSE_DAX + bool "Virtio Filesystem Direct Host Memory Access support" + default y + select INTERVAL_TREE + depends on VIRTIO_FS + depends on FS_DAX + depends on DAX_DRIVER + help + This allows bypassing guest page cache and allows mapping host page + cache directly in guest address space. + + If you want to allow mounting a Virtio Filesystem with the "dax" + option, answer Y. 
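/*
 * A minimal sketch of what the "dax" mount option described in the help text
 * above enables for a guest application. The mount point /mnt/virtiofs and
 * the file name are hypothetical, and the program assumes the virtio-fs
 * instance was mounted with -o dax; with CONFIG_FUSE_DAX the mapping below
 * is then backed by the host-provided DAX window rather than the guest page
 * cache.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/mnt/virtiofs/example.txt";	/* hypothetical dax-mounted file */
	struct stat st;
	char *p;
	int fd;

	fd = open(path, O_RDWR);
	if (fd < 0) {
		perror(path);
		return 1;
	}
	if (fstat(fd, &st) < 0 || st.st_size < 16) {
		fprintf(stderr, "%s: need a regular file of at least 16 bytes\n", path);
		close(fd);
		return 1;
	}

	/* Map the first page of the file shared and writable. */
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		close(fd);
		return 1;
	}

	printf("first bytes: %.16s\n", p);	/* loads come straight from the shared mapping */
	memcpy(p, "hello from guest", 16);	/* stores go to the same memory */

	munmap(p, 4096);
	close(fd);
	return 0;
}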
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 3e8cebfb59b7..8c7021fb2cd4 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -7,5 +7,7 @@ obj-$(CONFIG_FUSE_FS) += fuse.o obj-$(CONFIG_CUSE) += cuse.o obj-$(CONFIG_VIRTIO_FS) += virtiofs.o -fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o -virtiofs-y += virtio_fs.o +fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o +fuse-$(CONFIG_FUSE_DAX) += dax.o + +virtiofs-y := virtio_fs.o diff --git a/fs/fuse/control.c b/fs/fuse/control.c index a1303ad303ba..cc7e94d73c6c 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -164,6 +164,7 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file, { unsigned val; struct fuse_conn *fc; + struct fuse_mount *fm; ssize_t ret; ret = fuse_conn_limit_write(file, buf, count, ppos, &val, @@ -174,18 +175,27 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file, if (!fc) goto out; + down_read(&fc->killsb); spin_lock(&fc->bg_lock); fc->congestion_threshold = val; - if (fc->sb) { + + /* + * Get any fuse_mount belonging to this fuse_conn; s_bdi is + * shared between all of them + */ + + if (!list_empty(&fc->mounts)) { + fm = list_first_entry(&fc->mounts, struct fuse_mount, fc_entry); if (fc->num_background < fc->congestion_threshold) { - clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); - clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); + clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC); + clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC); } else { - set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); - set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); + set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC); + set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC); } } spin_unlock(&fc->bg_lock); + up_read(&fc->killsb); fuse_conn_put(fc); out: return ret; diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 2cc17816d7b1..45082269e698 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -57,6 +57,7 @@ struct cuse_conn { struct list_head list; /* linked on cuse_conntbl */ + struct fuse_mount fm; /* Dummy mount referencing fc */ struct fuse_conn fc; /* fuse connection */ struct cdev *cdev; /* associated character device */ struct device *dev; /* device representing @cdev */ @@ -134,7 +135,7 @@ static int cuse_open(struct inode *inode, struct file *file) * Generic permission check is already done against the chrdev * file, proceed to open. 
*/ - rc = fuse_do_open(&cc->fc, 0, file, 0); + rc = fuse_do_open(&cc->fm, 0, file, 0); if (rc) fuse_conn_put(&cc->fc); return rc; @@ -143,10 +144,10 @@ static int cuse_open(struct inode *inode, struct file *file) static int cuse_release(struct inode *inode, struct file *file) { struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_mount *fm = ff->fm; fuse_sync_release(NULL, ff, file->f_flags); - fuse_conn_put(fc); + fuse_conn_put(fm->fc); return 0; } @@ -155,7 +156,7 @@ static long cuse_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fuse_file *ff = file->private_data; - struct cuse_conn *cc = fc_to_cc(ff->fc); + struct cuse_conn *cc = fc_to_cc(ff->fm->fc); unsigned int flags = 0; if (cc->unrestricted_ioctl) @@ -168,7 +169,7 @@ static long cuse_file_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fuse_file *ff = file->private_data; - struct cuse_conn *cc = fc_to_cc(ff->fc); + struct cuse_conn *cc = fc_to_cc(ff->fm->fc); unsigned int flags = FUSE_IOCTL_COMPAT; if (cc->unrestricted_ioctl) @@ -313,9 +314,10 @@ struct cuse_init_args { * required data structures for it. Please read the comment at the * top of this file for high level overview. */ -static void cuse_process_init_reply(struct fuse_conn *fc, +static void cuse_process_init_reply(struct fuse_mount *fm, struct fuse_args *args, int error) { + struct fuse_conn *fc = fm->fc; struct cuse_init_args *ia = container_of(args, typeof(*ia), ap.args); struct fuse_args_pages *ap = &ia->ap; struct cuse_conn *cc = fc_to_cc(fc), *pos; @@ -424,7 +426,7 @@ static int cuse_send_init(struct cuse_conn *cc) { int rc; struct page *page; - struct fuse_conn *fc = &cc->fc; + struct fuse_mount *fm = &cc->fm; struct cuse_init_args *ia; struct fuse_args_pages *ap; @@ -460,7 +462,7 @@ static int cuse_send_init(struct cuse_conn *cc) ia->desc.length = ap->args.out_args[1].size; ap->args.end = cuse_process_init_reply; - rc = fuse_simple_background(fc, &ap->args, GFP_KERNEL); + rc = fuse_simple_background(fm, &ap->args, GFP_KERNEL); if (rc) { kfree(ia); err_free_page: @@ -506,7 +508,8 @@ static int cuse_channel_open(struct inode *inode, struct file *file) * Limit the cuse channel to requests that can * be represented in file->f_cred->user_ns. */ - fuse_conn_init(&cc->fc, file->f_cred->user_ns, &fuse_dev_fiq_ops, NULL); + fuse_conn_init(&cc->fc, &cc->fm, file->f_cred->user_ns, + &fuse_dev_fiq_ops, NULL); fud = fuse_dev_alloc_install(&cc->fc); if (!fud) { diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c new file mode 100644 index 000000000000..ff99ab2a3c43 --- /dev/null +++ b/fs/fuse/dax.c @@ -0,0 +1,1365 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * dax: direct host memory access + * Copyright (C) 2020 Red Hat, Inc. + */ + +#include "fuse_i.h" + +#include <linux/delay.h> +#include <linux/dax.h> +#include <linux/uio.h> +#include <linux/pfn_t.h> +#include <linux/iomap.h> +#include <linux/interval_tree.h> + +/* + * Default memory range size. A power of 2 so it agrees with common FUSE_INIT + * map_alignment values 4KB and 64KB. + */ +#define FUSE_DAX_SHIFT 21 +#define FUSE_DAX_SZ (1 << FUSE_DAX_SHIFT) +#define FUSE_DAX_PAGES (FUSE_DAX_SZ / PAGE_SIZE) + +/* Number of ranges reclaimer will try to free in one invocation */ +#define FUSE_DAX_RECLAIM_CHUNK (10) + +/* + * Dax memory reclaim threshold in percetage of total ranges. 
When free + * number of free ranges drops below this threshold, reclaim can trigger + * Default is 20% + */ +#define FUSE_DAX_RECLAIM_THRESHOLD (20) + +/** Translation information for file offsets to DAX window offsets */ +struct fuse_dax_mapping { + /* Pointer to inode where this memory range is mapped */ + struct inode *inode; + + /* Will connect in fcd->free_ranges to keep track of free memory */ + struct list_head list; + + /* For interval tree in file/inode */ + struct interval_tree_node itn; + + /* Will connect in fc->busy_ranges to keep track busy memory */ + struct list_head busy_list; + + /** Position in DAX window */ + u64 window_offset; + + /** Length of mapping, in bytes */ + loff_t length; + + /* Is this mapping read-only or read-write */ + bool writable; + + /* reference count when the mapping is used by dax iomap. */ + refcount_t refcnt; +}; + +/* Per-inode dax map */ +struct fuse_inode_dax { + /* Semaphore to protect modifications to the dmap tree */ + struct rw_semaphore sem; + + /* Sorted rb tree of struct fuse_dax_mapping elements */ + struct rb_root_cached tree; + unsigned long nr; +}; + +struct fuse_conn_dax { + /* DAX device */ + struct dax_device *dev; + + /* Lock protecting accessess to members of this structure */ + spinlock_t lock; + + /* List of memory ranges which are busy */ + unsigned long nr_busy_ranges; + struct list_head busy_ranges; + + /* Worker to free up memory ranges */ + struct delayed_work free_work; + + /* Wait queue for a dax range to become free */ + wait_queue_head_t range_waitq; + + /* DAX Window Free Ranges */ + long nr_free_ranges; + struct list_head free_ranges; + + unsigned long nr_ranges; +}; + +static inline struct fuse_dax_mapping * +node_to_dmap(struct interval_tree_node *node) +{ + if (!node) + return NULL; + + return container_of(node, struct fuse_dax_mapping, itn); +} + +static struct fuse_dax_mapping * +alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode); + +static void +__kick_dmap_free_worker(struct fuse_conn_dax *fcd, unsigned long delay_ms) +{ + unsigned long free_threshold; + + /* If number of free ranges are below threshold, start reclaim */ + free_threshold = max_t(unsigned long, fcd->nr_ranges * FUSE_DAX_RECLAIM_THRESHOLD / 100, + 1); + if (fcd->nr_free_ranges < free_threshold) + queue_delayed_work(system_long_wq, &fcd->free_work, + msecs_to_jiffies(delay_ms)); +} + +static void kick_dmap_free_worker(struct fuse_conn_dax *fcd, + unsigned long delay_ms) +{ + spin_lock(&fcd->lock); + __kick_dmap_free_worker(fcd, delay_ms); + spin_unlock(&fcd->lock); +} + +static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn_dax *fcd) +{ + struct fuse_dax_mapping *dmap; + + spin_lock(&fcd->lock); + dmap = list_first_entry_or_null(&fcd->free_ranges, + struct fuse_dax_mapping, list); + if (dmap) { + list_del_init(&dmap->list); + WARN_ON(fcd->nr_free_ranges <= 0); + fcd->nr_free_ranges--; + } + spin_unlock(&fcd->lock); + + kick_dmap_free_worker(fcd, 0); + return dmap; +} + +/* This assumes fcd->lock is held */ +static void __dmap_remove_busy_list(struct fuse_conn_dax *fcd, + struct fuse_dax_mapping *dmap) +{ + list_del_init(&dmap->busy_list); + WARN_ON(fcd->nr_busy_ranges == 0); + fcd->nr_busy_ranges--; +} + +static void dmap_remove_busy_list(struct fuse_conn_dax *fcd, + struct fuse_dax_mapping *dmap) +{ + spin_lock(&fcd->lock); + __dmap_remove_busy_list(fcd, dmap); + spin_unlock(&fcd->lock); +} + +/* This assumes fcd->lock is held */ +static void __dmap_add_to_free_pool(struct fuse_conn_dax *fcd, + struct 
fuse_dax_mapping *dmap) +{ + list_add_tail(&dmap->list, &fcd->free_ranges); + fcd->nr_free_ranges++; + wake_up(&fcd->range_waitq); +} + +static void dmap_add_to_free_pool(struct fuse_conn_dax *fcd, + struct fuse_dax_mapping *dmap) +{ + /* Return fuse_dax_mapping to free list */ + spin_lock(&fcd->lock); + __dmap_add_to_free_pool(fcd, dmap); + spin_unlock(&fcd->lock); +} + +static int fuse_setup_one_mapping(struct inode *inode, unsigned long start_idx, + struct fuse_dax_mapping *dmap, bool writable, + bool upgrade) +{ + struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_conn_dax *fcd = fm->fc->dax; + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_setupmapping_in inarg; + loff_t offset = start_idx << FUSE_DAX_SHIFT; + FUSE_ARGS(args); + ssize_t err; + + WARN_ON(fcd->nr_free_ranges < 0); + + /* Ask fuse daemon to setup mapping */ + memset(&inarg, 0, sizeof(inarg)); + inarg.foffset = offset; + inarg.fh = -1; + inarg.moffset = dmap->window_offset; + inarg.len = FUSE_DAX_SZ; + inarg.flags |= FUSE_SETUPMAPPING_FLAG_READ; + if (writable) + inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE; + args.opcode = FUSE_SETUPMAPPING; + args.nodeid = fi->nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + err = fuse_simple_request(fm, &args); + if (err < 0) + return err; + dmap->writable = writable; + if (!upgrade) { + /* + * We don't take a refernce on inode. inode is valid right now + * and when inode is going away, cleanup logic should first + * cleanup dmap entries. + */ + dmap->inode = inode; + dmap->itn.start = dmap->itn.last = start_idx; + /* Protected by fi->dax->sem */ + interval_tree_insert(&dmap->itn, &fi->dax->tree); + fi->dax->nr++; + spin_lock(&fcd->lock); + list_add_tail(&dmap->busy_list, &fcd->busy_ranges); + fcd->nr_busy_ranges++; + spin_unlock(&fcd->lock); + } + return 0; +} + +static int fuse_send_removemapping(struct inode *inode, + struct fuse_removemapping_in *inargp, + struct fuse_removemapping_one *remove_one) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_mount *fm = get_fuse_mount(inode); + FUSE_ARGS(args); + + args.opcode = FUSE_REMOVEMAPPING; + args.nodeid = fi->nodeid; + args.in_numargs = 2; + args.in_args[0].size = sizeof(*inargp); + args.in_args[0].value = inargp; + args.in_args[1].size = inargp->count * sizeof(*remove_one); + args.in_args[1].value = remove_one; + return fuse_simple_request(fm, &args); +} + +static int dmap_removemapping_list(struct inode *inode, unsigned int num, + struct list_head *to_remove) +{ + struct fuse_removemapping_one *remove_one, *ptr; + struct fuse_removemapping_in inarg; + struct fuse_dax_mapping *dmap; + int ret, i = 0, nr_alloc; + + nr_alloc = min_t(unsigned int, num, FUSE_REMOVEMAPPING_MAX_ENTRY); + remove_one = kmalloc_array(nr_alloc, sizeof(*remove_one), GFP_NOFS); + if (!remove_one) + return -ENOMEM; + + ptr = remove_one; + list_for_each_entry(dmap, to_remove, list) { + ptr->moffset = dmap->window_offset; + ptr->len = dmap->length; + ptr++; + i++; + num--; + if (i >= nr_alloc || num == 0) { + memset(&inarg, 0, sizeof(inarg)); + inarg.count = i; + ret = fuse_send_removemapping(inode, &inarg, + remove_one); + if (ret) + goto out; + ptr = remove_one; + i = 0; + } + } +out: + kfree(remove_one); + return ret; +} + +/* + * Cleanup dmap entry and add back to free list. This should be called with + * fcd->lock held. 
+ */ +static void dmap_reinit_add_to_free_pool(struct fuse_conn_dax *fcd, + struct fuse_dax_mapping *dmap) +{ + pr_debug("fuse: freeing memory range start_idx=0x%lx end_idx=0x%lx window_offset=0x%llx length=0x%llx\n", + dmap->itn.start, dmap->itn.last, dmap->window_offset, + dmap->length); + __dmap_remove_busy_list(fcd, dmap); + dmap->inode = NULL; + dmap->itn.start = dmap->itn.last = 0; + __dmap_add_to_free_pool(fcd, dmap); +} + +/* + * Free inode dmap entries whose range falls inside [start, end]. + * Does not take any locks. At this point of time it should only be + * called from evict_inode() path where we know all dmap entries can be + * reclaimed. + */ +static void inode_reclaim_dmap_range(struct fuse_conn_dax *fcd, + struct inode *inode, + loff_t start, loff_t end) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_dax_mapping *dmap, *n; + int err, num = 0; + LIST_HEAD(to_remove); + unsigned long start_idx = start >> FUSE_DAX_SHIFT; + unsigned long end_idx = end >> FUSE_DAX_SHIFT; + struct interval_tree_node *node; + + while (1) { + node = interval_tree_iter_first(&fi->dax->tree, start_idx, + end_idx); + if (!node) + break; + dmap = node_to_dmap(node); + /* inode is going away. There should not be any users of dmap */ + WARN_ON(refcount_read(&dmap->refcnt) > 1); + interval_tree_remove(&dmap->itn, &fi->dax->tree); + num++; + list_add(&dmap->list, &to_remove); + } + + /* Nothing to remove */ + if (list_empty(&to_remove)) + return; + + WARN_ON(fi->dax->nr < num); + fi->dax->nr -= num; + err = dmap_removemapping_list(inode, num, &to_remove); + if (err && err != -ENOTCONN) { + pr_warn("Failed to removemappings. start=0x%llx end=0x%llx\n", + start, end); + } + spin_lock(&fcd->lock); + list_for_each_entry_safe(dmap, n, &to_remove, list) { + list_del_init(&dmap->list); + dmap_reinit_add_to_free_pool(fcd, dmap); + } + spin_unlock(&fcd->lock); +} + +static int dmap_removemapping_one(struct inode *inode, + struct fuse_dax_mapping *dmap) +{ + struct fuse_removemapping_one forget_one; + struct fuse_removemapping_in inarg; + + memset(&inarg, 0, sizeof(inarg)); + inarg.count = 1; + memset(&forget_one, 0, sizeof(forget_one)); + forget_one.moffset = dmap->window_offset; + forget_one.len = dmap->length; + + return fuse_send_removemapping(inode, &inarg, &forget_one); +} + +/* + * It is called from evict_inode() and by that time inode is going away. So + * this function does not take any locks like fi->dax->sem for traversing + * that fuse inode interval tree. If that lock is taken then lock validator + * complains of deadlock situation w.r.t fs_reclaim lock. + */ +void fuse_dax_inode_cleanup(struct inode *inode) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + /* + * fuse_evict_inode() has already called truncate_inode_pages_final() + * before we arrive here. So we should not have to worry about any + * pages/exception entries still associated with inode. 
+	 */
+	inode_reclaim_dmap_range(fc->dax, inode, 0, -1);
+	WARN_ON(fi->dax->nr);
+}
+
+static void fuse_fill_iomap_hole(struct iomap *iomap, loff_t length)
+{
+	iomap->addr = IOMAP_NULL_ADDR;
+	iomap->length = length;
+	iomap->type = IOMAP_HOLE;
+}
+
+static void fuse_fill_iomap(struct inode *inode, loff_t pos, loff_t length,
+			    struct iomap *iomap, struct fuse_dax_mapping *dmap,
+			    unsigned int flags)
+{
+	loff_t offset, len;
+	loff_t i_size = i_size_read(inode);
+
+	offset = pos - (dmap->itn.start << FUSE_DAX_SHIFT);
+	len = min(length, dmap->length - offset);
+
+	/* If length is beyond end of file, truncate further */
+	if (pos + len > i_size)
+		len = i_size - pos;
+
+	if (len > 0) {
+		iomap->addr = dmap->window_offset + offset;
+		iomap->length = len;
+		if (flags & IOMAP_FAULT)
+			iomap->length = ALIGN(len, PAGE_SIZE);
+		iomap->type = IOMAP_MAPPED;
+		/*
+		 * increase refcnt so that reclaim code knows this dmap is in
+		 * use. This assumes fi->dax->sem mutex is held either
+		 * shared/exclusive.
+		 */
+		refcount_inc(&dmap->refcnt);
+
+		/* iomap->private should be NULL */
+		WARN_ON_ONCE(iomap->private);
+		iomap->private = dmap;
+	} else {
+		/* Mapping beyond end of file is hole */
+		fuse_fill_iomap_hole(iomap, length);
+	}
+}
+
+static int fuse_setup_new_dax_mapping(struct inode *inode, loff_t pos,
+				      loff_t length, unsigned int flags,
+				      struct iomap *iomap)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_conn_dax *fcd = fc->dax;
+	struct fuse_dax_mapping *dmap, *alloc_dmap = NULL;
+	int ret;
+	bool writable = flags & IOMAP_WRITE;
+	unsigned long start_idx = pos >> FUSE_DAX_SHIFT;
+	struct interval_tree_node *node;
+
+	/*
+	 * Can't do inline reclaim in fault path. We call
+	 * dax_layout_busy_page() before we free a range. And
+	 * fuse_wait_dax_page() drops fi->i_mmap_sem lock and requires it.
+	 * In fault path we enter with fi->i_mmap_sem held and can't drop
+	 * it. Also in fault path we hold fi->i_mmap_sem shared and not
+	 * exclusive, so that creates further issues with fuse_wait_dax_page().
+	 * Hence return -EAGAIN and fuse_dax_fault() will wait for a memory
+	 * range to become free and retry.
+	 */
+	if (flags & IOMAP_FAULT) {
+		alloc_dmap = alloc_dax_mapping(fcd);
+		if (!alloc_dmap)
+			return -EAGAIN;
+	} else {
+		alloc_dmap = alloc_dax_mapping_reclaim(fcd, inode);
+		if (IS_ERR(alloc_dmap))
+			return PTR_ERR(alloc_dmap);
+	}
+
+	/* If we are here, we should have memory allocated */
+	if (WARN_ON(!alloc_dmap))
+		return -EIO;
+
+	/*
+	 * Take write lock so that only one caller can try to setup mapping
+	 * and others wait.
+	 */
+	down_write(&fi->dax->sem);
+	/*
+	 * We dropped lock. Check again if somebody else setup
+	 * mapping already.
+	 */
+	node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
+	if (node) {
+		dmap = node_to_dmap(node);
+		fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
+		dmap_add_to_free_pool(fcd, alloc_dmap);
+		up_write(&fi->dax->sem);
+		return 0;
+	}
+
+	/* Setup one mapping */
+	ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, alloc_dmap,
+				     writable, false);
+	if (ret < 0) {
+		dmap_add_to_free_pool(fcd, alloc_dmap);
+		up_write(&fi->dax->sem);
+		return ret;
+	}
+	fuse_fill_iomap(inode, pos, length, iomap, alloc_dmap, flags);
+	up_write(&fi->dax->sem);
+	return 0;
+}
+
+static int fuse_upgrade_dax_mapping(struct inode *inode, loff_t pos,
+				    loff_t length, unsigned int flags,
+				    struct iomap *iomap)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_dax_mapping *dmap;
+	int ret;
+	unsigned long idx = pos >> FUSE_DAX_SHIFT;
+	struct interval_tree_node *node;
+
+	/*
+	 * Take exclusive lock so that only one caller can try to setup
+	 * mapping and others wait.
+	 */
+	down_write(&fi->dax->sem);
+	node = interval_tree_iter_first(&fi->dax->tree, idx, idx);
+
+	/* We are holding either inode lock or i_mmap_sem, and that should
+	 * ensure that dmap can't be truncated. We are holding a reference
+	 * on dmap and that should make sure it can't be reclaimed. So dmap
+	 * should still be there in tree despite the fact we dropped and
+	 * re-acquired the fi->dax->sem lock.
+	 */
+	ret = -EIO;
+	if (WARN_ON(!node))
+		goto out_err;
+
+	dmap = node_to_dmap(node);
+
+	/* We took an extra reference on dmap to make sure it's not reclaimed.
+	 * Now we hold fi->dax->sem lock and that reference is not needed
+	 * anymore. Drop it.
+	 */
+	if (refcount_dec_and_test(&dmap->refcnt)) {
+		/* refcount should not hit 0. This object only goes
+		 * away when fuse connection goes away
+		 */
+		WARN_ON_ONCE(1);
+	}
+
+	/* Maybe another thread already upgraded mapping while we were not
+	 * holding lock.
+	 */
+	if (dmap->writable) {
+		ret = 0;
+		goto out_fill_iomap;
+	}
+
+	ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, dmap, true,
+				     true);
+	if (ret < 0)
+		goto out_err;
+out_fill_iomap:
+	fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
+out_err:
+	up_write(&fi->dax->sem);
+	return ret;
+}
+
+/* This is just for DAX and the mapping is ephemeral, do not use it for other
+ * purposes since there is no block device with a permanent mapping.
+ */
+static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
+			    unsigned int flags, struct iomap *iomap,
+			    struct iomap *srcmap)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_dax_mapping *dmap;
+	bool writable = flags & IOMAP_WRITE;
+	unsigned long start_idx = pos >> FUSE_DAX_SHIFT;
+	struct interval_tree_node *node;
+
+	/* We don't support FIEMAP */
+	if (WARN_ON(flags & IOMAP_REPORT))
+		return -EIO;
+
+	iomap->offset = pos;
+	iomap->flags = 0;
+	iomap->bdev = NULL;
+	iomap->dax_dev = fc->dax->dev;
+
+	/*
+	 * Both read/write and mmap path can race here. So we need something
+	 * to make sure that if we are setting up a mapping, the other path
+	 * waits.
+	 *
+	 * For now, use a semaphore for this. It probably needs to be
+	 * optimized later.
+	 */
+	down_read(&fi->dax->sem);
+	node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
+	if (node) {
+		dmap = node_to_dmap(node);
+		if (writable && !dmap->writable) {
+			/* Upgrade read-only mapping to read-write.
+			 * This will require exclusive fi->dax->sem lock as we
+			 * don't want two threads to be trying to do this
+			 * simultaneously for the same dmap. So drop shared
+			 * lock and acquire exclusive lock.
+			 *
+			 * Before dropping fi->dax->sem lock, take reference
+			 * on dmap so that it's not freed by range reclaim.
+			 */
+			refcount_inc(&dmap->refcnt);
+			up_read(&fi->dax->sem);
+			pr_debug("%s: Upgrading mapping at offset 0x%llx length 0x%llx\n",
+				 __func__, pos, length);
+			return fuse_upgrade_dax_mapping(inode, pos, length,
+							flags, iomap);
+		} else {
+			fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
+			up_read(&fi->dax->sem);
+			return 0;
+		}
+	} else {
+		up_read(&fi->dax->sem);
+		pr_debug("%s: no mapping at offset 0x%llx length 0x%llx\n",
+			 __func__, pos, length);
+		if (pos >= i_size_read(inode))
+			goto iomap_hole;
+
+		return fuse_setup_new_dax_mapping(inode, pos, length, flags,
+						  iomap);
+	}
+
+	/*
+	 * If a read beyond end of file happens, fs code seems to return
+	 * it as hole
+	 */
+iomap_hole:
+	fuse_fill_iomap_hole(iomap, length);
+	pr_debug("%s returning hole mapping. pos=0x%llx length_asked=0x%llx length_returned=0x%llx\n",
+		 __func__, pos, length, iomap->length);
+	return 0;
+}
+
+static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+			  ssize_t written, unsigned int flags,
+			  struct iomap *iomap)
+{
+	struct fuse_dax_mapping *dmap = iomap->private;
+
+	if (dmap) {
+		if (refcount_dec_and_test(&dmap->refcnt)) {
+			/* refcount should not hit 0. This object only goes
+			 * away when fuse connection goes away
+			 */
+			WARN_ON_ONCE(1);
+		}
+	}
+
+	/* DAX writes beyond end-of-file aren't handled using iomap, so the
+	 * file size is unchanged and there is nothing to do here.
+	 */
+	return 0;
+}
+
+static const struct iomap_ops fuse_iomap_ops = {
+	.iomap_begin = fuse_iomap_begin,
+	.iomap_end = fuse_iomap_end,
+};
+
+static void fuse_wait_dax_page(struct inode *inode)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+
+	up_write(&fi->i_mmap_sem);
+	schedule();
+	down_write(&fi->i_mmap_sem);
+}
+
+/* Should be called with fi->i_mmap_sem lock held exclusively */
+static int __fuse_dax_break_layouts(struct inode *inode, bool *retry,
+				    loff_t start, loff_t end)
+{
+	struct page *page;
+
+	page = dax_layout_busy_page_range(inode->i_mapping, start, end);
+	if (!page)
+		return 0;
+
+	*retry = true;
+	return ___wait_var_event(&page->_refcount,
+			atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
+			0, 0, fuse_wait_dax_page(inode));
+}
+
+/* dmap_end == 0 leads to unmapping of whole file */
+int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start,
+			   u64 dmap_end)
+{
+	bool retry;
+	int ret;
+
+	do {
+		retry = false;
+		ret = __fuse_dax_break_layouts(inode, &retry, dmap_start,
+					       dmap_end);
+	} while (ret == 0 && retry);
+
+	return ret;
+}
+
+ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	ssize_t ret;
+
+	if (iocb->ki_flags & IOCB_NOWAIT) {
+		if (!inode_trylock_shared(inode))
+			return -EAGAIN;
+	} else {
+		inode_lock_shared(inode);
+	}
+
+	ret = dax_iomap_rw(iocb, to, &fuse_iomap_ops);
+	inode_unlock_shared(inode);
+
+	/* TODO file_accessed(iocb->f_filp) */
+	return ret;
+}
+
+static bool file_extending_write(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+
+	return (iov_iter_rw(from) == WRITE &&
+		((iocb->ki_pos) >= i_size_read(inode) ||
+		 (iocb->ki_pos + iov_iter_count(from) > i_size_read(inode))));
+}
+
+static ssize_t fuse_dax_direct_write(struct kiocb *iocb,
struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); + ssize_t ret; + + ret = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); + if (ret < 0) + return ret; + + fuse_invalidate_attr(inode); + fuse_write_update_size(inode, iocb->ki_pos); + return ret; +} + +ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock(inode)) + return -EAGAIN; + } else { + inode_lock(inode); + } + + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto out; + + ret = file_remove_privs(iocb->ki_filp); + if (ret) + goto out; + /* TODO file_update_time() but we don't want metadata I/O */ + + /* Do not use dax for file extending writes as write and on + * disk i_size increase are not atomic otherwise. + */ + if (file_extending_write(iocb, from)) + ret = fuse_dax_direct_write(iocb, from); + else + ret = dax_iomap_rw(iocb, from, &fuse_iomap_ops); + +out: + inode_unlock(inode); + + if (ret > 0) + ret = generic_write_sync(iocb, ret); + return ret; +} + +static int fuse_dax_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + + struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + + return dax_writeback_mapping_range(mapping, fc->dax->dev, wbc); +} + +static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, + enum page_entry_size pe_size, bool write) +{ + vm_fault_t ret; + struct inode *inode = file_inode(vmf->vma->vm_file); + struct super_block *sb = inode->i_sb; + pfn_t pfn; + int error = 0; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_conn_dax *fcd = fc->dax; + bool retry = false; + + if (write) + sb_start_pagefault(sb); +retry: + if (retry && !(fcd->nr_free_ranges > 0)) + wait_event(fcd->range_waitq, (fcd->nr_free_ranges > 0)); + + /* + * We need to serialize against not only truncate but also against + * fuse dax memory range reclaim. While a range is being reclaimed, + * we do not want any read/write/mmap to make progress and try + * to populate page cache or access memory we are trying to free. 
+ */ + down_read(&get_fuse_inode(inode)->i_mmap_sem); + ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops); + if ((ret & VM_FAULT_ERROR) && error == -EAGAIN) { + error = 0; + retry = true; + up_read(&get_fuse_inode(inode)->i_mmap_sem); + goto retry; + } + + if (ret & VM_FAULT_NEEDDSYNC) + ret = dax_finish_sync_fault(vmf, pe_size, pfn); + up_read(&get_fuse_inode(inode)->i_mmap_sem); + + if (write) + sb_end_pagefault(sb); + + return ret; +} + +static vm_fault_t fuse_dax_fault(struct vm_fault *vmf) +{ + return __fuse_dax_fault(vmf, PE_SIZE_PTE, + vmf->flags & FAULT_FLAG_WRITE); +} + +static vm_fault_t fuse_dax_huge_fault(struct vm_fault *vmf, + enum page_entry_size pe_size) +{ + return __fuse_dax_fault(vmf, pe_size, vmf->flags & FAULT_FLAG_WRITE); +} + +static vm_fault_t fuse_dax_page_mkwrite(struct vm_fault *vmf) +{ + return __fuse_dax_fault(vmf, PE_SIZE_PTE, true); +} + +static vm_fault_t fuse_dax_pfn_mkwrite(struct vm_fault *vmf) +{ + return __fuse_dax_fault(vmf, PE_SIZE_PTE, true); +} + +static const struct vm_operations_struct fuse_dax_vm_ops = { + .fault = fuse_dax_fault, + .huge_fault = fuse_dax_huge_fault, + .page_mkwrite = fuse_dax_page_mkwrite, + .pfn_mkwrite = fuse_dax_pfn_mkwrite, +}; + +int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma) +{ + file_accessed(file); + vma->vm_ops = &fuse_dax_vm_ops; + vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + return 0; +} + +static int dmap_writeback_invalidate(struct inode *inode, + struct fuse_dax_mapping *dmap) +{ + int ret; + loff_t start_pos = dmap->itn.start << FUSE_DAX_SHIFT; + loff_t end_pos = (start_pos + FUSE_DAX_SZ - 1); + + ret = filemap_fdatawrite_range(inode->i_mapping, start_pos, end_pos); + if (ret) { + pr_debug("fuse: filemap_fdatawrite_range() failed. err=%d start_pos=0x%llx, end_pos=0x%llx\n", + ret, start_pos, end_pos); + return ret; + } + + ret = invalidate_inode_pages2_range(inode->i_mapping, + start_pos >> PAGE_SHIFT, + end_pos >> PAGE_SHIFT); + if (ret) + pr_debug("fuse: invalidate_inode_pages2_range() failed err=%d\n", + ret); + + return ret; +} + +static int reclaim_one_dmap_locked(struct inode *inode, + struct fuse_dax_mapping *dmap) +{ + int ret; + struct fuse_inode *fi = get_fuse_inode(inode); + + /* + * igrab() was done to make sure inode won't go under us, and this + * further avoids the race with evict(). + */ + ret = dmap_writeback_invalidate(inode, dmap); + if (ret) + return ret; + + /* Remove dax mapping from inode interval tree now */ + interval_tree_remove(&dmap->itn, &fi->dax->tree); + fi->dax->nr--; + + /* It is possible that umount/shutdown has killed the fuse connection + * and worker thread is trying to reclaim memory in parallel. Don't + * warn in that case. + */ + ret = dmap_removemapping_one(inode, dmap); + if (ret && ret != -ENOTCONN) { + pr_warn("Failed to remove mapping. offset=0x%llx len=0x%llx ret=%d\n", + dmap->window_offset, dmap->length, ret); + } + return 0; +} + +/* Find first mapped dmap for an inode and return file offset. Caller needs + * to hold fi->dax->sem lock either shared or exclusive. + */ +static struct fuse_dax_mapping *inode_lookup_first_dmap(struct inode *inode) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_dax_mapping *dmap; + struct interval_tree_node *node; + + for (node = interval_tree_iter_first(&fi->dax->tree, 0, -1); node; + node = interval_tree_iter_next(node, 0, -1)) { + dmap = node_to_dmap(node); + /* still in use. 
*/ + if (refcount_read(&dmap->refcnt) > 1) + continue; + + return dmap; + } + + return NULL; +} + +/* + * Find first mapping in the tree and free it and return it. Do not add + * it back to free pool. + */ +static struct fuse_dax_mapping * +inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode, + bool *retry) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_dax_mapping *dmap; + u64 dmap_start, dmap_end; + unsigned long start_idx; + int ret; + struct interval_tree_node *node; + + down_write(&fi->i_mmap_sem); + + /* Lookup a dmap and corresponding file offset to reclaim. */ + down_read(&fi->dax->sem); + dmap = inode_lookup_first_dmap(inode); + if (dmap) { + start_idx = dmap->itn.start; + dmap_start = start_idx << FUSE_DAX_SHIFT; + dmap_end = dmap_start + FUSE_DAX_SZ - 1; + } + up_read(&fi->dax->sem); + + if (!dmap) + goto out_mmap_sem; + /* + * Make sure there are no references to inode pages using + * get_user_pages() + */ + ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end); + if (ret) { + pr_debug("fuse: fuse_dax_break_layouts() failed. err=%d\n", + ret); + dmap = ERR_PTR(ret); + goto out_mmap_sem; + } + + down_write(&fi->dax->sem); + node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx); + /* Range already got reclaimed by somebody else */ + if (!node) { + if (retry) + *retry = true; + goto out_write_dmap_sem; + } + + dmap = node_to_dmap(node); + /* still in use. */ + if (refcount_read(&dmap->refcnt) > 1) { + dmap = NULL; + if (retry) + *retry = true; + goto out_write_dmap_sem; + } + + ret = reclaim_one_dmap_locked(inode, dmap); + if (ret < 0) { + dmap = ERR_PTR(ret); + goto out_write_dmap_sem; + } + + /* Clean up dmap. Do not add back to free list */ + dmap_remove_busy_list(fcd, dmap); + dmap->inode = NULL; + dmap->itn.start = dmap->itn.last = 0; + + pr_debug("fuse: %s: inline reclaimed memory range. inode=%p, window_offset=0x%llx, length=0x%llx\n", + __func__, inode, dmap->window_offset, dmap->length); + +out_write_dmap_sem: + up_write(&fi->dax->sem); +out_mmap_sem: + up_write(&fi->i_mmap_sem); + return dmap; +} + +static struct fuse_dax_mapping * +alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode) +{ + struct fuse_dax_mapping *dmap; + struct fuse_inode *fi = get_fuse_inode(inode); + + while (1) { + bool retry = false; + + dmap = alloc_dax_mapping(fcd); + if (dmap) + return dmap; + + dmap = inode_inline_reclaim_one_dmap(fcd, inode, &retry); + /* + * Either we got a mapping or it is an error, return in both + * the cases. + */ + if (dmap) + return dmap; + + /* If we could not reclaim a mapping because it + * had a reference or some other temporary failure, + * Try again. We want to give up inline reclaim only + * if there is no range assigned to this node. Otherwise + * if a deadlock is possible if we sleep with fi->i_mmap_sem + * held and worker to free memory can't make progress due + * to unavailability of fi->i_mmap_sem lock. So sleep + * only if fi->dax->nr=0 + */ + if (retry) + continue; + /* + * There are no mappings which can be reclaimed. Wait for one. + * We are not holding fi->dax->sem. So it is possible + * that range gets added now. But as we are not holding + * fi->i_mmap_sem, worker should still be able to free up + * a range and wake us up. 
+ */ + if (!fi->dax->nr && !(fcd->nr_free_ranges > 0)) { + if (wait_event_killable_exclusive(fcd->range_waitq, + (fcd->nr_free_ranges > 0))) { + return ERR_PTR(-EINTR); + } + } + } +} + +static int lookup_and_reclaim_dmap_locked(struct fuse_conn_dax *fcd, + struct inode *inode, + unsigned long start_idx) +{ + int ret; + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_dax_mapping *dmap; + struct interval_tree_node *node; + + /* Find fuse dax mapping at file offset inode. */ + node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx); + + /* Range already got cleaned up by somebody else */ + if (!node) + return 0; + dmap = node_to_dmap(node); + + /* still in use. */ + if (refcount_read(&dmap->refcnt) > 1) + return 0; + + ret = reclaim_one_dmap_locked(inode, dmap); + if (ret < 0) + return ret; + + /* Cleanup dmap entry and add back to free list */ + spin_lock(&fcd->lock); + dmap_reinit_add_to_free_pool(fcd, dmap); + spin_unlock(&fcd->lock); + return ret; +} + +/* + * Free a range of memory. + * Locking: + * 1. Take fi->i_mmap_sem to block dax faults. + * 2. Take fi->dax->sem to protect interval tree and also to make sure + * read/write can not reuse a dmap which we might be freeing. + */ +static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd, + struct inode *inode, + unsigned long start_idx, + unsigned long end_idx) +{ + int ret; + struct fuse_inode *fi = get_fuse_inode(inode); + loff_t dmap_start = start_idx << FUSE_DAX_SHIFT; + loff_t dmap_end = (dmap_start + FUSE_DAX_SZ) - 1; + + down_write(&fi->i_mmap_sem); + ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end); + if (ret) { + pr_debug("virtio_fs: fuse_dax_break_layouts() failed. err=%d\n", + ret); + goto out_mmap_sem; + } + + down_write(&fi->dax->sem); + ret = lookup_and_reclaim_dmap_locked(fcd, inode, start_idx); + up_write(&fi->dax->sem); +out_mmap_sem: + up_write(&fi->i_mmap_sem); + return ret; +} + +static int try_to_free_dmap_chunks(struct fuse_conn_dax *fcd, + unsigned long nr_to_free) +{ + struct fuse_dax_mapping *dmap, *pos, *temp; + int ret, nr_freed = 0; + unsigned long start_idx = 0, end_idx = 0; + struct inode *inode = NULL; + + /* Pick first busy range and free it for now*/ + while (1) { + if (nr_freed >= nr_to_free) + break; + + dmap = NULL; + spin_lock(&fcd->lock); + + if (!fcd->nr_busy_ranges) { + spin_unlock(&fcd->lock); + return 0; + } + + list_for_each_entry_safe(pos, temp, &fcd->busy_ranges, + busy_list) { + /* skip this range if it's in use. */ + if (refcount_read(&pos->refcnt) > 1) + continue; + + inode = igrab(pos->inode); + /* + * This inode is going away. That will free + * up all the ranges anyway, continue to + * next range. + */ + if (!inode) + continue; + /* + * Take this element off list and add it tail. If + * this element can't be freed, it will help with + * selecting new element in next iteration of loop. 
+			 */
+			dmap = pos;
+			list_move_tail(&dmap->busy_list, &fcd->busy_ranges);
+			start_idx = end_idx = dmap->itn.start;
+			break;
+		}
+		spin_unlock(&fcd->lock);
+		if (!dmap)
+			return 0;
+
+		ret = lookup_and_reclaim_dmap(fcd, inode, start_idx, end_idx);
+		iput(inode);
+		if (ret)
+			return ret;
+		nr_freed++;
+	}
+	return 0;
+}
+
+static void fuse_dax_free_mem_worker(struct work_struct *work)
+{
+	int ret;
+	struct fuse_conn_dax *fcd = container_of(work, struct fuse_conn_dax,
+						 free_work.work);
+	ret = try_to_free_dmap_chunks(fcd, FUSE_DAX_RECLAIM_CHUNK);
+	if (ret) {
+		pr_debug("fuse: try_to_free_dmap_chunks() failed with err=%d\n",
+			 ret);
+	}
+
+	/* If number of free ranges is still below threshold, requeue */
+	kick_dmap_free_worker(fcd, 1);
+}
+
+static void fuse_free_dax_mem_ranges(struct list_head *mem_list)
+{
+	struct fuse_dax_mapping *range, *temp;
+
+	/* Free All allocated elements */
+	list_for_each_entry_safe(range, temp, mem_list, list) {
+		list_del(&range->list);
+		if (!list_empty(&range->busy_list))
+			list_del(&range->busy_list);
+		kfree(range);
+	}
+}
+
+void fuse_dax_conn_free(struct fuse_conn *fc)
+{
+	if (fc->dax) {
+		fuse_free_dax_mem_ranges(&fc->dax->free_ranges);
+		kfree(fc->dax);
+	}
+}
+
+static int fuse_dax_mem_range_init(struct fuse_conn_dax *fcd)
+{
+	long nr_pages, nr_ranges;
+	void *kaddr;
+	pfn_t pfn;
+	struct fuse_dax_mapping *range;
+	int ret, id;
+	size_t dax_size = -1;
+	unsigned long i;
+
+	init_waitqueue_head(&fcd->range_waitq);
+	INIT_LIST_HEAD(&fcd->free_ranges);
+	INIT_LIST_HEAD(&fcd->busy_ranges);
+	INIT_DELAYED_WORK(&fcd->free_work, fuse_dax_free_mem_worker);
+
+	id = dax_read_lock();
+	nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size), &kaddr,
+				     &pfn);
+	dax_read_unlock(id);
+	if (nr_pages < 0) {
+		pr_debug("dax_direct_access() returned %ld\n", nr_pages);
+		return nr_pages;
+	}
+
+	nr_ranges = nr_pages/FUSE_DAX_PAGES;
+	pr_debug("%s: dax mapped %ld pages. nr_ranges=%ld\n",
+		 __func__, nr_pages, nr_ranges);
+
+	for (i = 0; i < nr_ranges; i++) {
+		range = kzalloc(sizeof(struct fuse_dax_mapping), GFP_KERNEL);
+		ret = -ENOMEM;
+		if (!range)
+			goto out_err;
+
+		/* TODO: This offset only works if virtio-fs driver is not
+		 * having some memory hidden at the beginning.
This needs + * better handling + */ + range->window_offset = i * FUSE_DAX_SZ; + range->length = FUSE_DAX_SZ; + INIT_LIST_HEAD(&range->busy_list); + refcount_set(&range->refcnt, 1); + list_add_tail(&range->list, &fcd->free_ranges); + } + + fcd->nr_free_ranges = nr_ranges; + fcd->nr_ranges = nr_ranges; + return 0; +out_err: + /* Free All allocated elements */ + fuse_free_dax_mem_ranges(&fcd->free_ranges); + return ret; +} + +int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev) +{ + struct fuse_conn_dax *fcd; + int err; + + if (!dax_dev) + return 0; + + fcd = kzalloc(sizeof(*fcd), GFP_KERNEL); + if (!fcd) + return -ENOMEM; + + spin_lock_init(&fcd->lock); + fcd->dev = dax_dev; + err = fuse_dax_mem_range_init(fcd); + if (err) { + kfree(fcd); + return err; + } + + fc->dax = fcd; + return 0; +} + +bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + + fi->dax = NULL; + if (fc->dax) { + fi->dax = kzalloc(sizeof(*fi->dax), GFP_KERNEL_ACCOUNT); + if (!fi->dax) + return false; + + init_rwsem(&fi->dax->sem); + fi->dax->tree = RB_ROOT_CACHED; + } + + return true; +} + +static const struct address_space_operations fuse_dax_file_aops = { + .writepages = fuse_dax_writepages, + .direct_IO = noop_direct_IO, + .set_page_dirty = noop_set_page_dirty, + .invalidatepage = noop_invalidatepage, +}; + +void fuse_dax_inode_init(struct inode *inode) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + + if (!fc->dax) + return; + + inode->i_flags |= S_DAX; + inode->i_data.a_ops = &fuse_dax_file_aops; +} + +bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment) +{ + if (fc->dax && (map_alignment > FUSE_DAX_SHIFT)) { + pr_warn("FUSE: map_alignment %u incompatible with dax mem range size %u\n", + map_alignment, FUSE_DAX_SZ); + return false; + } + return true; +} + +void fuse_dax_cancel_work(struct fuse_conn *fc) +{ + struct fuse_conn_dax *fcd = fc->dax; + + if (fcd) + cancel_delayed_work_sync(&fcd->free_work); + +} +EXPORT_SYMBOL_GPL(fuse_dax_cancel_work); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 02b3c36b3676..588f8d1240aa 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -40,20 +40,21 @@ static struct fuse_dev *fuse_get_dev(struct file *file) return READ_ONCE(file->private_data); } -static void fuse_request_init(struct fuse_req *req) +static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req) { INIT_LIST_HEAD(&req->list); INIT_LIST_HEAD(&req->intr_entry); init_waitqueue_head(&req->waitq); refcount_set(&req->count, 1); __set_bit(FR_PENDING, &req->flags); + req->fm = fm; } -static struct fuse_req *fuse_request_alloc(gfp_t flags) +static struct fuse_req *fuse_request_alloc(struct fuse_mount *fm, gfp_t flags) { struct fuse_req *req = kmem_cache_zalloc(fuse_req_cachep, flags); if (req) - fuse_request_init(req); + fuse_request_init(fm, req); return req; } @@ -100,10 +101,11 @@ static void fuse_drop_waiting(struct fuse_conn *fc) } } -static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req); +static void fuse_put_request(struct fuse_req *req); -static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background) +static struct fuse_req *fuse_get_req(struct fuse_mount *fm, bool for_background) { + struct fuse_conn *fc = fm->fc; struct fuse_req *req; int err; atomic_inc(&fc->num_waiting); @@ -125,7 +127,7 @@ static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background) if (fc->conn_error) goto out; - req = fuse_request_alloc(GFP_KERNEL); + req 
= fuse_request_alloc(fm, GFP_KERNEL); err = -ENOMEM; if (!req) { if (for_background) @@ -143,7 +145,7 @@ static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background) if (unlikely(req->in.h.uid == ((uid_t)-1) || req->in.h.gid == ((gid_t)-1))) { - fuse_put_request(fc, req); + fuse_put_request(req); return ERR_PTR(-EOVERFLOW); } return req; @@ -153,8 +155,10 @@ static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background) return ERR_PTR(err); } -static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_put_request(struct fuse_req *req) { + struct fuse_conn *fc = req->fm->fc; + if (refcount_dec_and_test(&req->count)) { if (test_bit(FR_BACKGROUND, &req->flags)) { /* @@ -273,8 +277,10 @@ static void flush_bg_queue(struct fuse_conn *fc) * the 'end' callback is called if given, else the reference to the * request is released */ -void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req) +void fuse_request_end(struct fuse_req *req) { + struct fuse_mount *fm = req->fm; + struct fuse_conn *fc = fm->fc; struct fuse_iqueue *fiq = &fc->iq; if (test_and_set_bit(FR_FINISHED, &req->flags)) @@ -309,9 +315,9 @@ void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req) wake_up(&fc->blocked_waitq); } - if (fc->num_background == fc->congestion_threshold && fc->sb) { - clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); - clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); + if (fc->num_background == fc->congestion_threshold && fm->sb) { + clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC); + clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC); } fc->num_background--; fc->active_background--; @@ -323,14 +329,16 @@ void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req) } if (test_bit(FR_ASYNC, &req->flags)) - req->args->end(fc, req->args, req->out.h.error); + req->args->end(fm, req->args, req->out.h.error); put_request: - fuse_put_request(fc, req); + fuse_put_request(req); } EXPORT_SYMBOL_GPL(fuse_request_end); -static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) +static int queue_interrupt(struct fuse_req *req) { + struct fuse_iqueue *fiq = &req->fm->fc->iq; + spin_lock(&fiq->lock); /* Check for we've sent request to interrupt this req */ if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) { @@ -357,8 +365,9 @@ static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) return 0; } -static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) +static void request_wait_answer(struct fuse_req *req) { + struct fuse_conn *fc = req->fm->fc; struct fuse_iqueue *fiq = &fc->iq; int err; @@ -373,7 +382,7 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) /* matches barrier in fuse_dev_do_read() */ smp_mb__after_atomic(); if (test_bit(FR_SENT, &req->flags)) - queue_interrupt(fiq, req); + queue_interrupt(req); } if (!test_bit(FR_FORCE, &req->flags)) { @@ -402,9 +411,9 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) wait_event(req->waitq, test_bit(FR_FINISHED, &req->flags)); } -static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) +static void __fuse_request_send(struct fuse_req *req) { - struct fuse_iqueue *fiq = &fc->iq; + struct fuse_iqueue *fiq = &req->fm->fc->iq; BUG_ON(test_bit(FR_BACKGROUND, &req->flags)); spin_lock(&fiq->lock); @@ -418,7 +427,7 @@ static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) __fuse_get_request(req); queue_request_and_unlock(fiq, req); - 
request_wait_answer(fc, req); + request_wait_answer(req); /* Pairs with smp_wmb() in fuse_request_end() */ smp_rmb(); } @@ -457,8 +466,10 @@ static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args) } } -static void fuse_force_creds(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_force_creds(struct fuse_req *req) { + struct fuse_conn *fc = req->fm->fc; + req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid()); req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid()); req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); @@ -473,23 +484,24 @@ static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args) __set_bit(FR_ASYNC, &req->flags); } -ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) +ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args) { + struct fuse_conn *fc = fm->fc; struct fuse_req *req; ssize_t ret; if (args->force) { atomic_inc(&fc->num_waiting); - req = fuse_request_alloc(GFP_KERNEL | __GFP_NOFAIL); + req = fuse_request_alloc(fm, GFP_KERNEL | __GFP_NOFAIL); if (!args->nocreds) - fuse_force_creds(fc, req); + fuse_force_creds(req); __set_bit(FR_WAITING, &req->flags); __set_bit(FR_FORCE, &req->flags); } else { WARN_ON(args->nocreds); - req = fuse_get_req(fc, false); + req = fuse_get_req(fm, false); if (IS_ERR(req)) return PTR_ERR(req); } @@ -500,20 +512,21 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) if (!args->noreply) __set_bit(FR_ISREPLY, &req->flags); - __fuse_request_send(fc, req); + __fuse_request_send(req); ret = req->out.h.error; if (!ret && args->out_argvar) { BUG_ON(args->out_numargs == 0); ret = args->out_args[args->out_numargs - 1].size; } - fuse_put_request(fc, req); + fuse_put_request(req); return ret; } -static bool fuse_request_queue_background(struct fuse_conn *fc, - struct fuse_req *req) +static bool fuse_request_queue_background(struct fuse_req *req) { + struct fuse_mount *fm = req->fm; + struct fuse_conn *fc = fm->fc; bool queued = false; WARN_ON(!test_bit(FR_BACKGROUND, &req->flags)); @@ -527,9 +540,9 @@ static bool fuse_request_queue_background(struct fuse_conn *fc, fc->num_background++; if (fc->num_background == fc->max_background) fc->blocked = 1; - if (fc->num_background == fc->congestion_threshold && fc->sb) { - set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); - set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); + if (fc->num_background == fc->congestion_threshold && fm->sb) { + set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC); + set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC); } list_add_tail(&req->list, &fc->bg_queue); flush_bg_queue(fc); @@ -540,28 +553,28 @@ static bool fuse_request_queue_background(struct fuse_conn *fc, return queued; } -int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args, +int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args, gfp_t gfp_flags) { struct fuse_req *req; if (args->force) { WARN_ON(!args->nocreds); - req = fuse_request_alloc(gfp_flags); + req = fuse_request_alloc(fm, gfp_flags); if (!req) return -ENOMEM; __set_bit(FR_BACKGROUND, &req->flags); } else { WARN_ON(args->nocreds); - req = fuse_get_req(fc, true); + req = fuse_get_req(fm, true); if (IS_ERR(req)) return PTR_ERR(req); } fuse_args_to_req(req, args); - if (!fuse_request_queue_background(fc, req)) { - fuse_put_request(fc, req); + if (!fuse_request_queue_background(req)) { + fuse_put_request(req); return -ENOTCONN; } @@ -569,14 +582,14 @@ int fuse_simple_background(struct fuse_conn *fc, struct 
fuse_args *args, } EXPORT_SYMBOL_GPL(fuse_simple_background); -static int fuse_simple_notify_reply(struct fuse_conn *fc, +static int fuse_simple_notify_reply(struct fuse_mount *fm, struct fuse_args *args, u64 unique) { struct fuse_req *req; - struct fuse_iqueue *fiq = &fc->iq; + struct fuse_iqueue *fiq = &fm->fc->iq; int err = 0; - req = fuse_get_req(fc, false); + req = fuse_get_req(fm, false); if (IS_ERR(req)) return PTR_ERR(req); @@ -591,7 +604,7 @@ static int fuse_simple_notify_reply(struct fuse_conn *fc, } else { err = -ENODEV; spin_unlock(&fiq->lock); - fuse_put_request(fc, req); + fuse_put_request(req); } return err; @@ -785,15 +798,16 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) struct page *newpage; struct pipe_buffer *buf = cs->pipebufs; + get_page(oldpage); err = unlock_request(cs->req); if (err) - return err; + goto out_put_old; fuse_copy_finish(cs); err = pipe_buf_confirm(cs->pipe, buf); if (err) - return err; + goto out_put_old; BUG_ON(!cs->nr_segs); cs->currbuf = buf; @@ -833,7 +847,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL); if (err) { unlock_page(newpage); - return err; + goto out_put_old; } get_page(newpage); @@ -852,14 +866,19 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (err) { unlock_page(newpage); put_page(newpage); - return err; + goto out_put_old; } unlock_page(oldpage); + /* Drop ref for ap->pages[] array */ put_page(oldpage); cs->len = 0; - return 0; + err = 0; +out_put_old: + /* Drop ref obtained in this function */ + put_page(oldpage); + return err; out_fallback_unlock: unlock_page(newpage); @@ -868,10 +887,10 @@ out_fallback: cs->offset = buf->offset; err = lock_request(cs->req); - if (err) - return err; + if (!err) + err = 1; - return 1; + goto out_put_old; } static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, @@ -883,14 +902,16 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, if (cs->nr_segs >= cs->pipe->max_usage) return -EIO; + get_page(page); err = unlock_request(cs->req); - if (err) + if (err) { + put_page(page); return err; + } fuse_copy_finish(cs); buf = cs->pipebufs; - get_page(page); buf->page = page; buf->offset = offset; buf->len = count; @@ -1250,7 +1271,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, /* SETXATTR is special, since it may contain too large data */ if (args->opcode == FUSE_SETXATTR) req->out.h.error = -E2BIG; - fuse_request_end(fc, req); + fuse_request_end(req); goto restart; } spin_lock(&fpq->lock); @@ -1284,8 +1305,8 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, /* matches barrier in request_wait_answer() */ smp_mb__after_atomic(); if (test_bit(FR_INTERRUPTED, &req->flags)) - queue_interrupt(fiq, req); - fuse_put_request(fc, req); + queue_interrupt(req); + fuse_put_request(req); return reqsize; @@ -1293,7 +1314,7 @@ out_end: if (!test_bit(FR_PRIVATE, &req->flags)) list_del_init(&req->list); spin_unlock(&fpq->lock); - fuse_request_end(fc, req); + fuse_request_end(req); return err; err_unlock: @@ -1416,11 +1437,8 @@ static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size, fuse_copy_finish(cs); down_read(&fc->killsb); - err = -ENOENT; - if (fc->sb) { - err = fuse_reverse_inval_inode(fc->sb, outarg.ino, - outarg.off, outarg.len); - } + err = fuse_reverse_inval_inode(fc, outarg.ino, + outarg.off, outarg.len); up_read(&fc->killsb); return 
err; @@ -1466,9 +1484,7 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, buf[outarg.namelen] = 0; down_read(&fc->killsb); - err = -ENOENT; - if (fc->sb) - err = fuse_reverse_inval_entry(fc->sb, outarg.parent, 0, &name); + err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name); up_read(&fc->killsb); kfree(buf); return err; @@ -1516,10 +1532,7 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, buf[outarg.namelen] = 0; down_read(&fc->killsb); - err = -ENOENT; - if (fc->sb) - err = fuse_reverse_inval_entry(fc->sb, outarg.parent, - outarg.child, &name); + err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name); up_read(&fc->killsb); kfree(buf); return err; @@ -1561,10 +1574,7 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, down_read(&fc->killsb); err = -ENOENT; - if (!fc->sb) - goto out_up_killsb; - - inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid); + inode = fuse_ilookup(fc, nodeid, NULL); if (!inode) goto out_up_killsb; @@ -1621,7 +1631,7 @@ struct fuse_retrieve_args { struct fuse_notify_retrieve_in inarg; }; -static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_args *args, +static void fuse_retrieve_end(struct fuse_mount *fm, struct fuse_args *args, int error) { struct fuse_retrieve_args *ra = @@ -1631,7 +1641,7 @@ static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_args *args, kfree(ra); } -static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, +static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, struct fuse_notify_retrieve_out *outarg) { int err; @@ -1642,6 +1652,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, unsigned int offset; size_t total_len = 0; unsigned int num_pages; + struct fuse_conn *fc = fm->fc; struct fuse_retrieve_args *ra; size_t args_size = sizeof(*ra); struct fuse_args_pages *ap; @@ -1703,9 +1714,9 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, args->in_args[0].value = &ra->inarg; args->in_args[1].size = total_len; - err = fuse_simple_notify_reply(fc, args, outarg->notify_unique); + err = fuse_simple_notify_reply(fm, args, outarg->notify_unique); if (err) - fuse_retrieve_end(fc, args, err); + fuse_retrieve_end(fm, args, err); return err; } @@ -1714,7 +1725,9 @@ static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size, struct fuse_copy_state *cs) { struct fuse_notify_retrieve_out outarg; + struct fuse_mount *fm; struct inode *inode; + u64 nodeid; int err; err = -EINVAL; @@ -1729,14 +1742,12 @@ static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size, down_read(&fc->killsb); err = -ENOENT; - if (fc->sb) { - u64 nodeid = outarg.nodeid; + nodeid = outarg.nodeid; - inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid); - if (inode) { - err = fuse_retrieve(fc, inode, &outarg); - iput(inode); - } + inode = fuse_ilookup(fc, nodeid, &fm); + if (inode) { + err = fuse_retrieve(fm, inode, &outarg); + iput(inode); } up_read(&fc->killsb); @@ -1875,9 +1886,9 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, else if (oh.error == -ENOSYS) fc->no_interrupt = 1; else if (oh.error == -EAGAIN) - err = queue_interrupt(&fc->iq, req); + err = queue_interrupt(req); - fuse_put_request(fc, req); + fuse_put_request(req); goto copy_finish; } @@ -1907,7 +1918,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, list_del_init(&req->list); spin_unlock(&fpq->lock); - fuse_request_end(fc, req); + fuse_request_end(req); out: return err ? 
err : nbytes; @@ -2045,7 +2056,7 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait) } /* Abort all requests on the given list (pending or processing) */ -static void end_requests(struct fuse_conn *fc, struct list_head *head) +static void end_requests(struct list_head *head) { while (!list_empty(head)) { struct fuse_req *req; @@ -2053,7 +2064,7 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head) req->out.h.error = -ECONNABORTED; clear_bit(FR_SENT, &req->flags); list_del_init(&req->list); - fuse_request_end(fc, req); + fuse_request_end(req); } } @@ -2148,7 +2159,7 @@ void fuse_abort_conn(struct fuse_conn *fc) wake_up_all(&fc->blocked_waitq); spin_unlock(&fc->lock); - end_requests(fc, &to_end); + end_requests(&to_end); } else { spin_unlock(&fc->lock); } @@ -2178,7 +2189,7 @@ int fuse_dev_release(struct inode *inode, struct file *file) list_splice_init(&fpq->processing[i], &to_end); spin_unlock(&fpq->lock); - end_requests(fc, &to_end); + end_requests(&to_end); /* Are we the last open device? */ if (atomic_dec_and_test(&fc->dev_count)) { diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 26f028bc760b..ff7dbeb16f88 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -10,6 +10,7 @@ #include <linux/pagemap.h> #include <linux/file.h> +#include <linux/fs_context.h> #include <linux/sched.h> #include <linux/namei.h> #include <linux/slab.h> @@ -196,7 +197,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) { struct inode *inode; struct dentry *parent; - struct fuse_conn *fc; + struct fuse_mount *fm; struct fuse_inode *fi; int ret; @@ -218,27 +219,29 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) if (flags & LOOKUP_RCU) goto out; - fc = get_fuse_conn(inode); + fm = get_fuse_mount(inode); forget = fuse_alloc_forget(); ret = -ENOMEM; if (!forget) goto out; - attr_version = fuse_get_attr_version(fc); + attr_version = fuse_get_attr_version(fm->fc); parent = dget_parent(entry); - fuse_lookup_init(fc, &args, get_node_id(d_inode(parent)), + fuse_lookup_init(fm->fc, &args, get_node_id(d_inode(parent)), &entry->d_name, &outarg); - ret = fuse_simple_request(fc, &args); + ret = fuse_simple_request(fm, &args); dput(parent); /* Zero nodeid is same as -ENOENT */ if (!ret && !outarg.nodeid) ret = -ENOENT; if (!ret) { fi = get_fuse_inode(inode); - if (outarg.nodeid != get_node_id(inode)) { - fuse_queue_forget(fc, forget, outarg.nodeid, 1); + if (outarg.nodeid != get_node_id(inode) || + (bool) IS_AUTOMOUNT(inode) != (bool) (outarg.attr.flags & FUSE_ATTR_SUBMOUNT)) { + fuse_queue_forget(fm->fc, forget, + outarg.nodeid, 1); goto invalid; } spin_lock(&fi->lock); @@ -298,6 +301,79 @@ static int fuse_dentry_delete(const struct dentry *dentry) return time_before64(fuse_dentry_time(dentry), get_jiffies_64()); } +/* + * Create a fuse_mount object with a new superblock (with path->dentry + * as the root), and return that mount so it can be auto-mounted on + * @path. 
+ */ +static struct vfsmount *fuse_dentry_automount(struct path *path) +{ + struct fs_context *fsc; + struct fuse_mount *parent_fm = get_fuse_mount_super(path->mnt->mnt_sb); + struct fuse_conn *fc = parent_fm->fc; + struct fuse_mount *fm; + struct vfsmount *mnt; + struct fuse_inode *mp_fi = get_fuse_inode(d_inode(path->dentry)); + struct super_block *sb; + int err; + + fsc = fs_context_for_submount(path->mnt->mnt_sb->s_type, path->dentry); + if (IS_ERR(fsc)) { + err = PTR_ERR(fsc); + goto out; + } + + err = -ENOMEM; + fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL); + if (!fm) + goto out_put_fsc; + + refcount_set(&fm->count, 1); + fsc->s_fs_info = fm; + sb = sget_fc(fsc, NULL, set_anon_super_fc); + if (IS_ERR(sb)) { + err = PTR_ERR(sb); + fuse_mount_put(fm); + goto out_put_fsc; + } + fm->fc = fuse_conn_get(fc); + + /* Initialize superblock, making @mp_fi its root */ + err = fuse_fill_super_submount(sb, mp_fi); + if (err) + goto out_put_sb; + + sb->s_flags |= SB_ACTIVE; + fsc->root = dget(sb->s_root); + /* We are done configuring the superblock, so unlock it */ + up_write(&sb->s_umount); + + down_write(&fc->killsb); + list_add_tail(&fm->fc_entry, &fc->mounts); + up_write(&fc->killsb); + + /* Create the submount */ + mnt = vfs_create_mount(fsc); + if (IS_ERR(mnt)) { + err = PTR_ERR(mnt); + goto out_put_fsc; + } + mntget(mnt); + put_fs_context(fsc); + return mnt; + +out_put_sb: + /* + * Only jump here when fsc->root is NULL and sb is still locked + * (otherwise put_fs_context() will put the superblock) + */ + deactivate_locked_super(sb); +out_put_fsc: + put_fs_context(fsc); +out: + return ERR_PTR(err); +} + const struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, .d_delete = fuse_dentry_delete, @@ -305,6 +381,7 @@ const struct dentry_operations fuse_dentry_operations = { .d_init = fuse_dentry_init, .d_release = fuse_dentry_release, #endif + .d_automount = fuse_dentry_automount, }; const struct dentry_operations fuse_root_dentry_operations = { @@ -329,7 +406,7 @@ bool fuse_invalid_attr(struct fuse_attr *attr) int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name, struct fuse_entry_out *outarg, struct inode **inode) { - struct fuse_conn *fc = get_fuse_conn_super(sb); + struct fuse_mount *fm = get_fuse_mount_super(sb); FUSE_ARGS(args); struct fuse_forget_link *forget; u64 attr_version; @@ -346,10 +423,10 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name if (!forget) goto out; - attr_version = fuse_get_attr_version(fc); + attr_version = fuse_get_attr_version(fm->fc); - fuse_lookup_init(fc, &args, nodeid, name, outarg); - err = fuse_simple_request(fc, &args); + fuse_lookup_init(fm->fc, &args, nodeid, name, outarg); + err = fuse_simple_request(fm, &args); /* Zero nodeid is same as -ENOENT, but with valid timeout */ if (err || !outarg->nodeid) goto out_put_forget; @@ -365,7 +442,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name attr_version); err = -ENOMEM; if (!*inode) { - fuse_queue_forget(fc, forget, outarg->nodeid, 1); + fuse_queue_forget(fm->fc, forget, outarg->nodeid, 1); goto out; } err = 0; @@ -434,7 +511,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, { int err; struct inode *inode; - struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); struct fuse_forget_link *forget; struct fuse_create_in inarg; @@ -452,11 +529,11 @@ static int fuse_create_open(struct inode *dir, struct 
dentry *entry, goto out_err; err = -ENOMEM; - ff = fuse_file_alloc(fc); + ff = fuse_file_alloc(fm); if (!ff) goto out_put_forget_req; - if (!fc->dont_mask) + if (!fm->fc->dont_mask) mode &= ~current_umask(); flags &= ~O_NOCTTY; @@ -477,7 +554,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, args.out_args[0].value = &outentry; args.out_args[1].size = sizeof(outopen); args.out_args[1].value = &outopen; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err) goto out_free_ff; @@ -494,7 +571,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, if (!inode) { flags &= ~(O_CREAT | O_EXCL | O_TRUNC); fuse_sync_release(NULL, ff, flags); - fuse_queue_forget(fc, forget, outentry.nodeid, 1); + fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1); err = -ENOMEM; goto out_err; } @@ -567,7 +644,7 @@ no_open: /* * Code shared between mknod, mkdir, symlink and link */ -static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args, +static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, struct inode *dir, struct dentry *entry, umode_t mode) { @@ -586,7 +663,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args, args->out_numargs = 1; args->out_args[0].size = sizeof(outarg); args->out_args[0].value = &outarg; - err = fuse_simple_request(fc, args); + err = fuse_simple_request(fm, args); if (err) goto out_put_forget_req; @@ -600,7 +677,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args, inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, &outarg.attr, entry_attr_timeout(&outarg), 0); if (!inode) { - fuse_queue_forget(fc, forget, outarg.nodeid, 1); + fuse_queue_forget(fm->fc, forget, outarg.nodeid, 1); return -ENOMEM; } kfree(forget); @@ -628,10 +705,10 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode, dev_t rdev) { struct fuse_mknod_in inarg; - struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); - if (!fc->dont_mask) + if (!fm->fc->dont_mask) mode &= ~current_umask(); memset(&inarg, 0, sizeof(inarg)); @@ -644,7 +721,7 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode, args.in_args[0].value = &inarg; args.in_args[1].size = entry->d_name.len + 1; args.in_args[1].value = entry->d_name.name; - return create_new_entry(fc, &args, dir, entry, mode); + return create_new_entry(fm, &args, dir, entry, mode); } static int fuse_create(struct inode *dir, struct dentry *entry, umode_t mode, @@ -656,10 +733,10 @@ static int fuse_create(struct inode *dir, struct dentry *entry, umode_t mode, static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode) { struct fuse_mkdir_in inarg; - struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); - if (!fc->dont_mask) + if (!fm->fc->dont_mask) mode &= ~current_umask(); memset(&inarg, 0, sizeof(inarg)); @@ -671,13 +748,13 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode) args.in_args[0].value = &inarg; args.in_args[1].size = entry->d_name.len + 1; args.in_args[1].value = entry->d_name.name; - return create_new_entry(fc, &args, dir, entry, S_IFDIR); + return create_new_entry(fm, &args, dir, entry, S_IFDIR); } static int fuse_symlink(struct inode *dir, struct dentry *entry, const char *link) { - struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_mount *fm = get_fuse_mount(dir); unsigned len = strlen(link) + 
1; FUSE_ARGS(args); @@ -687,7 +764,7 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry, args.in_args[0].value = entry->d_name.name; args.in_args[1].size = len; args.in_args[1].value = link; - return create_new_entry(fc, &args, dir, entry, S_IFLNK); + return create_new_entry(fm, &args, dir, entry, S_IFLNK); } void fuse_update_ctime(struct inode *inode) @@ -701,7 +778,7 @@ void fuse_update_ctime(struct inode *inode) static int fuse_unlink(struct inode *dir, struct dentry *entry) { int err; - struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); args.opcode = FUSE_UNLINK; @@ -709,13 +786,13 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) args.in_numargs = 1; args.in_args[0].size = entry->d_name.len + 1; args.in_args[0].value = entry->d_name.name; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (!err) { struct inode *inode = d_inode(entry); struct fuse_inode *fi = get_fuse_inode(inode); spin_lock(&fi->lock); - fi->attr_version = atomic64_inc_return(&fc->attr_version); + fi->attr_version = atomic64_inc_return(&fm->fc->attr_version); /* * If i_nlink == 0 then unlink doesn't make sense, yet this can * happen if userspace filesystem is careless. It would be @@ -737,7 +814,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) static int fuse_rmdir(struct inode *dir, struct dentry *entry) { int err; - struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); args.opcode = FUSE_RMDIR; @@ -745,7 +822,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) args.in_numargs = 1; args.in_args[0].size = entry->d_name.len + 1; args.in_args[0].value = entry->d_name.name; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (!err) { clear_nlink(d_inode(entry)); fuse_dir_changed(dir); @@ -761,7 +838,7 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, { int err; struct fuse_rename2_in inarg; - struct fuse_conn *fc = get_fuse_conn(olddir); + struct fuse_mount *fm = get_fuse_mount(olddir); FUSE_ARGS(args); memset(&inarg, 0, argsize); @@ -776,7 +853,7 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, args.in_args[1].value = oldent->d_name.name; args.in_args[2].size = newent->d_name.len + 1; args.in_args[2].value = newent->d_name.name; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (!err) { /* ctime changes */ fuse_invalidate_attr(d_inode(oldent)); @@ -847,7 +924,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, int err; struct fuse_link_in inarg; struct inode *inode = d_inode(entry); - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); memset(&inarg, 0, sizeof(inarg)); @@ -858,7 +935,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, args.in_args[0].value = &inarg; args.in_args[1].size = newent->d_name.len + 1; args.in_args[1].value = newent->d_name.name; - err = create_new_entry(fc, &args, newdir, newent, inode->i_mode); + err = create_new_entry(fm, &args, newdir, newent, inode->i_mode); /* Contrary to "normal" filesystems it can happen that link makes two "logical" inodes point to the same "physical" inode. 
We invalidate the attributes of the old one, so it @@ -869,7 +946,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, struct fuse_inode *fi = get_fuse_inode(inode); spin_lock(&fi->lock); - fi->attr_version = atomic64_inc_return(&fc->attr_version); + fi->attr_version = atomic64_inc_return(&fm->fc->attr_version); if (likely(inode->i_nlink < UINT_MAX)) inc_nlink(inode); spin_unlock(&fi->lock); @@ -926,11 +1003,11 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, int err; struct fuse_getattr_in inarg; struct fuse_attr_out outarg; - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); u64 attr_version; - attr_version = fuse_get_attr_version(fc); + attr_version = fuse_get_attr_version(fm->fc); memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); @@ -949,7 +1026,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (!err) { if (fuse_invalid_attr(&outarg.attr) || (inode->i_mode ^ outarg.attr.mode) & S_IFMT) { @@ -1002,7 +1079,7 @@ int fuse_update_attributes(struct inode *inode, struct file *file) STATX_BASIC_STATS & ~STATX_ATIME, 0); } -int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, +int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, u64 child_nodeid, struct qstr *name) { int err = -ENOTDIR; @@ -1010,7 +1087,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, struct dentry *dir; struct dentry *entry; - parent = ilookup5(sb, parent_nodeid, fuse_inode_eq, &parent_nodeid); + parent = fuse_ilookup(fc, parent_nodeid, NULL); if (!parent) return -ENOENT; @@ -1102,14 +1179,14 @@ int fuse_allow_current_process(struct fuse_conn *fc) static int fuse_access(struct inode *inode, int mask) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_access_in inarg; int err; BUG_ON(mask & MAY_NOT_BLOCK); - if (fc->no_access) + if (fm->fc->no_access) return 0; memset(&inarg, 0, sizeof(inarg)); @@ -1119,9 +1196,9 @@ static int fuse_access(struct inode *inode, int mask) args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { - fc->no_access = 1; + fm->fc->no_access = 1; err = 0; } return err; @@ -1209,7 +1286,7 @@ static int fuse_permission(struct inode *inode, int mask) static int fuse_readlink_page(struct inode *inode, struct page *page) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_page_desc desc = { .length = PAGE_SIZE - 1 }; struct fuse_args_pages ap = { .num_pages = 1, @@ -1226,7 +1303,7 @@ static int fuse_readlink_page(struct inode *inode, struct page *page) ap.args.page_zeroing = true; ap.args.out_numargs = 1; ap.args.out_args[0].size = desc.length; - res = fuse_simple_request(fc, &ap.args); + res = fuse_simple_request(fm, &ap.args); fuse_invalidate_atime(inode); @@ -1454,7 +1531,7 @@ static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_args *args, */ int fuse_flush_times(struct inode *inode, struct fuse_file *ff) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_setattr_in inarg; 
struct fuse_attr_out outarg; @@ -1465,7 +1542,7 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff) inarg.valid = FATTR_MTIME; inarg.mtime = inode->i_mtime.tv_sec; inarg.mtimensec = inode->i_mtime.tv_nsec; - if (fc->minor >= 23) { + if (fm->fc->minor >= 23) { inarg.valid |= FATTR_CTIME; inarg.ctime = inode->i_ctime.tv_sec; inarg.ctimensec = inode->i_ctime.tv_nsec; @@ -1474,9 +1551,9 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff) inarg.valid |= FATTR_FH; inarg.fh = ff->fh; } - fuse_setattr_fill(fc, &args, inode, &inarg, &outarg); + fuse_setattr_fill(fm->fc, &args, inode, &inarg, &outarg); - return fuse_simple_request(fc, &args); + return fuse_simple_request(fm, &args); } /* @@ -1491,7 +1568,8 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, struct file *file) { struct inode *inode = d_inode(dentry); - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_conn *fc = fm->fc; struct fuse_inode *fi = get_fuse_inode(inode); FUSE_ARGS(args); struct fuse_setattr_in inarg; @@ -1501,6 +1579,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, loff_t oldsize; int err; bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode); + bool fault_blocked = false; if (!fc->default_permissions) attr->ia_valid |= ATTR_FORCE; @@ -1509,6 +1588,22 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, if (err) return err; + if (attr->ia_valid & ATTR_SIZE) { + if (WARN_ON(!S_ISREG(inode->i_mode))) + return -EIO; + is_truncate = true; + } + + if (FUSE_IS_DAX(inode) && is_truncate) { + down_write(&fi->i_mmap_sem); + fault_blocked = true; + err = fuse_dax_break_layouts(inode, 0, 0); + if (err) { + up_write(&fi->i_mmap_sem); + return err; + } + } + if (attr->ia_valid & ATTR_OPEN) { /* This is coming from open(..., ... 
| O_TRUNC); */ WARN_ON(!(attr->ia_valid & ATTR_SIZE)); @@ -1521,17 +1616,11 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, */ i_size_write(inode, 0); truncate_pagecache(inode, 0); - return 0; + goto out; } file = NULL; } - if (attr->ia_valid & ATTR_SIZE) { - if (WARN_ON(!S_ISREG(inode->i_mode))) - return -EIO; - is_truncate = true; - } - /* Flush dirty data/metadata before non-truncate SETATTR */ if (is_wb && S_ISREG(inode->i_mode) && attr->ia_valid & @@ -1566,7 +1655,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, inarg.lock_owner = fuse_lock_owner_id(fc, current->files); } fuse_setattr_fill(fc, &args, inode, &inarg, &outarg); - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err) { if (err == -EINTR) fuse_invalidate_attr(inode); @@ -1614,6 +1703,10 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, } clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); +out: + if (fault_blocked) + up_write(&fi->i_mmap_sem); + return 0; error: @@ -1621,6 +1714,9 @@ error: fuse_release_nowrite(inode); clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + + if (fault_blocked) + up_write(&fi->i_mmap_sem); return err; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 6611ef3269a8..c03034e8c152 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -32,7 +32,7 @@ static struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags, return pages; } -static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, +static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, struct file *file, int opcode, struct fuse_open_out *outargp) { struct fuse_open_in inarg; @@ -40,7 +40,7 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, memset(&inarg, 0, sizeof(inarg)); inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY); - if (!fc->atomic_o_trunc) + if (!fm->fc->atomic_o_trunc) inarg.flags &= ~O_TRUNC; args.opcode = opcode; args.nodeid = nodeid; @@ -51,7 +51,7 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, args.out_args[0].size = sizeof(*outargp); args.out_args[0].value = outargp; - return fuse_simple_request(fc, &args); + return fuse_simple_request(fm, &args); } struct fuse_release_args { @@ -60,7 +60,7 @@ struct fuse_release_args { struct inode *inode; }; -struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) +struct fuse_file *fuse_file_alloc(struct fuse_mount *fm) { struct fuse_file *ff; @@ -68,7 +68,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) if (unlikely(!ff)) return NULL; - ff->fc = fc; + ff->fm = fm; ff->release_args = kzalloc(sizeof(*ff->release_args), GFP_KERNEL_ACCOUNT); if (!ff->release_args) { @@ -82,7 +82,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) RB_CLEAR_NODE(&ff->polled_node); init_waitqueue_head(&ff->poll_wait); - ff->kh = atomic64_inc_return(&fc->khctr); + ff->kh = atomic64_inc_return(&fm->fc->khctr); return ff; } @@ -100,7 +100,7 @@ static struct fuse_file *fuse_file_get(struct fuse_file *ff) return ff; } -static void fuse_release_end(struct fuse_conn *fc, struct fuse_args *args, +static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args, int error) { struct fuse_release_args *ra = container_of(args, typeof(*ra), args); @@ -114,29 +114,30 @@ static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir) if (refcount_dec_and_test(&ff->count)) { struct fuse_args *args = &ff->release_args->args; - if (isdir ? ff->fc->no_opendir : ff->fc->no_open) { + if (isdir ? 
ff->fm->fc->no_opendir : ff->fm->fc->no_open) { /* Do nothing when client does not implement 'open' */ - fuse_release_end(ff->fc, args, 0); + fuse_release_end(ff->fm, args, 0); } else if (sync) { - fuse_simple_request(ff->fc, args); - fuse_release_end(ff->fc, args, 0); + fuse_simple_request(ff->fm, args); + fuse_release_end(ff->fm, args, 0); } else { args->end = fuse_release_end; - if (fuse_simple_background(ff->fc, args, + if (fuse_simple_background(ff->fm, args, GFP_KERNEL | __GFP_NOFAIL)) - fuse_release_end(ff->fc, args, -ENOTCONN); + fuse_release_end(ff->fm, args, -ENOTCONN); } kfree(ff); } } -int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, +int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, bool isdir) { + struct fuse_conn *fc = fm->fc; struct fuse_file *ff; int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; - ff = fuse_file_alloc(fc); + ff = fuse_file_alloc(fm); if (!ff) return -ENOMEM; @@ -147,7 +148,7 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, struct fuse_open_out outarg; int err; - err = fuse_send_open(fc, nodeid, file, opcode, &outarg); + err = fuse_send_open(fm, nodeid, file, opcode, &outarg); if (!err) { ff->fh = outarg.fh; ff->open_flags = outarg.open_flags; @@ -216,27 +217,40 @@ void fuse_finish_open(struct inode *inode, struct file *file) int fuse_open_common(struct inode *inode, struct file *file, bool isdir) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_conn *fc = fm->fc; int err; bool is_wb_truncate = (file->f_flags & O_TRUNC) && fc->atomic_o_trunc && fc->writeback_cache; + bool dax_truncate = (file->f_flags & O_TRUNC) && + fc->atomic_o_trunc && FUSE_IS_DAX(inode); err = generic_file_open(inode, file); if (err) return err; - if (is_wb_truncate) { + if (is_wb_truncate || dax_truncate) { inode_lock(inode); fuse_set_nowrite(inode); } - err = fuse_do_open(fc, get_node_id(inode), file, isdir); + if (dax_truncate) { + down_write(&get_fuse_inode(inode)->i_mmap_sem); + err = fuse_dax_break_layouts(inode, 0, 0); + if (err) + goto out; + } + err = fuse_do_open(fm, get_node_id(inode), file, isdir); if (!err) fuse_finish_open(inode, file); - if (is_wb_truncate) { +out: + if (dax_truncate) + up_write(&get_fuse_inode(inode)->i_mmap_sem); + + if (is_wb_truncate | dax_truncate) { fuse_release_nowrite(inode); inode_unlock(inode); } @@ -247,7 +261,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, int flags, int opcode) { - struct fuse_conn *fc = ff->fc; + struct fuse_conn *fc = ff->fm->fc; struct fuse_release_args *ra = ff->release_args; /* Inode is NULL on error path of fuse_create_open() */ @@ -285,7 +299,7 @@ void fuse_release_common(struct file *file, bool isdir) if (ff->flock) { ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; - ra->inarg.lock_owner = fuse_lock_owner_id(ff->fc, + ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc, (fl_owner_t) file); } /* Hold inode until release is finished */ @@ -300,7 +314,7 @@ void fuse_release_common(struct file *file, bool isdir) * synchronous RELEASE is allowed (and desirable) in this case * because the server can be trusted not to screw up. 
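The O_TRUNC handling for DAX inodes in fuse_open_common() above uses the same serialization pattern as the setattr and fallocate hunks in this series: take fi->i_mmap_sem exclusively to hold off page faults, then wait for existing DAX mappings to be torn down before the size changes. A minimal sketch of that pattern under those assumptions (the helper name is hypothetical; only fuse_dax_break_layouts() and fi->i_mmap_sem come from the patch):

static int example_dax_size_change_begin(struct inode *inode)
{
	struct fuse_inode *fi = get_fuse_inode(inode);
	int err;

	/* The fault path takes i_mmap_sem, so writers block new faults here. */
	down_write(&fi->i_mmap_sem);

	/* Wait until no DAX mapping still pins the old layout. */
	err = fuse_dax_break_layouts(inode, 0, 0);
	if (err)
		up_write(&fi->i_mmap_sem);

	/* On success the caller performs the size change, then up_write()s. */
	return err;
}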
*/ - fuse_file_put(ff, ff->fc->destroy, isdir); + fuse_file_put(ff, ff->fm->fc->destroy, isdir); } static int fuse_open(struct inode *inode, struct file *file) @@ -443,7 +457,7 @@ static void fuse_sync_writes(struct inode *inode) static int fuse_flush(struct file *file, fl_owner_t id) { struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_file *ff = file->private_data; struct fuse_flush_in inarg; FUSE_ARGS(args); @@ -465,12 +479,12 @@ static int fuse_flush(struct file *file, fl_owner_t id) return err; err = 0; - if (fc->no_flush) + if (fm->fc->no_flush) goto inval_attr_out; memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; - inarg.lock_owner = fuse_lock_owner_id(fc, id); + inarg.lock_owner = fuse_lock_owner_id(fm->fc, id); args.opcode = FUSE_FLUSH; args.nodeid = get_node_id(inode); args.in_numargs = 1; @@ -478,9 +492,9 @@ static int fuse_flush(struct file *file, fl_owner_t id) args.in_args[0].value = &inarg; args.force = true; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { - fc->no_flush = 1; + fm->fc->no_flush = 1; err = 0; } @@ -489,7 +503,7 @@ inval_attr_out: * In memory i_blocks is not maintained by fuse, if writeback cache is * enabled, i_blocks from cached attr may not be accurate. */ - if (!err && fc->writeback_cache) + if (!err && fm->fc->writeback_cache) fuse_invalidate_attr(inode); return err; } @@ -498,7 +512,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, int datasync, int opcode) { struct inode *inode = file->f_mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_file *ff = file->private_data; FUSE_ARGS(args); struct fuse_fsync_in inarg; @@ -511,7 +525,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; - return fuse_simple_request(fc, &args); + return fuse_simple_request(fm, &args); } static int fuse_fsync(struct file *file, loff_t start, loff_t end, @@ -686,7 +700,7 @@ static void fuse_io_free(struct fuse_io_args *ia) kfree(ia); } -static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_args *args, +static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args, int err) { struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args); @@ -715,7 +729,7 @@ static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_args *args, fuse_io_free(ia); } -static ssize_t fuse_async_req_send(struct fuse_conn *fc, +static ssize_t fuse_async_req_send(struct fuse_mount *fm, struct fuse_io_args *ia, size_t num_bytes) { ssize_t err; @@ -729,9 +743,9 @@ static ssize_t fuse_async_req_send(struct fuse_conn *fc, ia->ap.args.end = fuse_aio_complete_req; ia->ap.args.may_block = io->should_dirty; - err = fuse_simple_background(fc, &ia->ap.args, GFP_KERNEL); + err = fuse_simple_background(fm, &ia->ap.args, GFP_KERNEL); if (err) - fuse_aio_complete_req(fc, &ia->ap.args, err); + fuse_aio_complete_req(fm, &ia->ap.args, err); return num_bytes; } @@ -741,18 +755,18 @@ static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count, { struct file *file = ia->io->iocb->ki_filp; struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_mount *fm = ff->fm; fuse_read_args_fill(ia, file, pos, count, FUSE_READ); if (owner != NULL) { ia->read.in.read_flags |= 
FUSE_READ_LOCKOWNER; - ia->read.in.lock_owner = fuse_lock_owner_id(fc, owner); + ia->read.in.lock_owner = fuse_lock_owner_id(fm->fc, owner); } if (ia->io->async) - return fuse_async_req_send(fc, ia, count); + return fuse_async_req_send(fm, ia, count); - return fuse_simple_request(fc, &ia->ap.args); + return fuse_simple_request(fm, &ia->ap.args); } static void fuse_read_update_size(struct inode *inode, loff_t size, @@ -798,7 +812,7 @@ static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, static int fuse_do_readpage(struct file *file, struct page *page) { struct inode *inode = page->mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); loff_t pos = page_offset(page); struct fuse_page_desc desc = { .length = PAGE_SIZE }; struct fuse_io_args ia = { @@ -818,14 +832,14 @@ static int fuse_do_readpage(struct file *file, struct page *page) */ fuse_wait_on_page_writeback(inode, page->index); - attr_ver = fuse_get_attr_version(fc); + attr_ver = fuse_get_attr_version(fm->fc); /* Don't overflow end offset */ if (pos + (desc.length - 1) == LLONG_MAX) desc.length--; fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ); - res = fuse_simple_request(fc, &ia.ap.args); + res = fuse_simple_request(fm, &ia.ap.args); if (res < 0) return res; /* @@ -855,7 +869,7 @@ static int fuse_readpage(struct file *file, struct page *page) return err; } -static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_args *args, +static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, int err) { int i; @@ -899,7 +913,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_args *args, static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) { struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_mount *fm = ff->fm; struct fuse_args_pages *ap = &ia->ap; loff_t pos = page_offset(ap->pages[0]); size_t count = ap->num_pages << PAGE_SHIFT; @@ -918,18 +932,18 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) WARN_ON((loff_t) (pos + count) < 0); fuse_read_args_fill(ia, file, pos, count, FUSE_READ); - ia->read.attr_ver = fuse_get_attr_version(fc); - if (fc->async_read) { + ia->read.attr_ver = fuse_get_attr_version(fm->fc); + if (fm->fc->async_read) { ia->ff = fuse_file_get(ff); ap->args.end = fuse_readpages_end; - err = fuse_simple_background(fc, &ap->args, GFP_KERNEL); + err = fuse_simple_background(fm, &ap->args, GFP_KERNEL); if (!err) return; } else { - res = fuse_simple_request(fc, &ap->args); + res = fuse_simple_request(fm, &ap->args); err = res < 0 ? 
res : 0; } - fuse_readpages_end(fc, &ap->args, err); + fuse_readpages_end(fm, &ap->args, err); } static void fuse_readahead(struct readahead_control *rac) @@ -1000,7 +1014,7 @@ static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff, args->opcode = FUSE_WRITE; args->nodeid = ff->nodeid; args->in_numargs = 2; - if (ff->fc->minor < 9) + if (ff->fm->fc->minor < 9) args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; else args->in_args[0].size = sizeof(ia->write.in); @@ -1029,7 +1043,7 @@ static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos, struct kiocb *iocb = ia->io->iocb; struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_mount *fm = ff->fm; struct fuse_write_in *inarg = &ia->write.in; ssize_t err; @@ -1037,13 +1051,13 @@ static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos, inarg->flags = fuse_write_flags(iocb); if (owner != NULL) { inarg->write_flags |= FUSE_WRITE_LOCKOWNER; - inarg->lock_owner = fuse_lock_owner_id(fc, owner); + inarg->lock_owner = fuse_lock_owner_id(fm->fc, owner); } if (ia->io->async) - return fuse_async_req_send(fc, ia, count); + return fuse_async_req_send(fm, ia, count); - err = fuse_simple_request(fc, &ia->ap.args); + err = fuse_simple_request(fm, &ia->ap.args); if (!err && ia->write.out.size > count) err = -EIO; @@ -1074,7 +1088,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, struct fuse_args_pages *ap = &ia->ap; struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_mount *fm = ff->fm; unsigned int offset, i; int err; @@ -1084,7 +1098,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, fuse_write_args_fill(ia, ff, pos, count); ia->write.in.flags = fuse_write_flags(iocb); - err = fuse_simple_request(fc, &ap->args); + err = fuse_simple_request(fm, &ap->args); if (!err && ia->write.out.size > count) err = -EIO; @@ -1399,7 +1413,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, struct file *file = io->iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_conn *fc = ff->fm->fc; size_t nmax = write ? 
fc->max_write : fc->max_read; loff_t pos = *ppos; size_t count = iov_iter_count(iter); @@ -1539,10 +1553,14 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); - if (is_bad_inode(file_inode(file))) + if (is_bad_inode(inode)) return -EIO; + if (FUSE_IS_DAX(inode)) + return fuse_dax_read_iter(iocb, to); + if (!(ff->open_flags & FOPEN_DIRECT_IO)) return fuse_cache_read_iter(iocb, to); else @@ -1553,10 +1571,14 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); - if (is_bad_inode(file_inode(file))) + if (is_bad_inode(inode)) return -EIO; + if (FUSE_IS_DAX(inode)) + return fuse_dax_write_iter(iocb, from); + if (!(ff->open_flags & FOPEN_DIRECT_IO)) return fuse_cache_write_iter(iocb, from); else @@ -1578,7 +1600,7 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa) kfree(wpa); } -static void fuse_writepage_finish(struct fuse_conn *fc, +static void fuse_writepage_finish(struct fuse_mount *fm, struct fuse_writepage_args *wpa) { struct fuse_args_pages *ap = &wpa->ia.ap; @@ -1596,7 +1618,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, } /* Called under fi->lock, may release and reacquire it */ -static void fuse_send_writepage(struct fuse_conn *fc, +static void fuse_send_writepage(struct fuse_mount *fm, struct fuse_writepage_args *wpa, loff_t size) __releases(fi->lock) __acquires(fi->lock) @@ -1622,10 +1644,10 @@ __acquires(fi->lock) args->force = true; args->nocreds = true; - err = fuse_simple_background(fc, args, GFP_ATOMIC); + err = fuse_simple_background(fm, args, GFP_ATOMIC); if (err == -ENOMEM) { spin_unlock(&fi->lock); - err = fuse_simple_background(fc, args, GFP_NOFS | __GFP_NOFAIL); + err = fuse_simple_background(fm, args, GFP_NOFS | __GFP_NOFAIL); spin_lock(&fi->lock); } @@ -1638,7 +1660,7 @@ __acquires(fi->lock) out_free: fi->writectr--; rb_erase(&wpa->writepages_entry, &fi->writepages); - fuse_writepage_finish(fc, wpa); + fuse_writepage_finish(fm, wpa); spin_unlock(&fi->lock); /* After fuse_writepage_finish() aux request list is private */ @@ -1662,7 +1684,7 @@ void fuse_flush_writepages(struct inode *inode) __releases(fi->lock) __acquires(fi->lock) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_inode *fi = get_fuse_inode(inode); loff_t crop = i_size_read(inode); struct fuse_writepage_args *wpa; @@ -1671,7 +1693,7 @@ __acquires(fi->lock) wpa = list_entry(fi->queued_writes.next, struct fuse_writepage_args, queue_entry); list_del_init(&wpa->queue_entry); - fuse_send_writepage(fc, wpa, crop); + fuse_send_writepage(fm, wpa, crop); } } @@ -1712,7 +1734,7 @@ static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa) WARN_ON(fuse_insert_writeback(root, wpa)); } -static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args, +static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, int error) { struct fuse_writepage_args *wpa = @@ -1724,7 +1746,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args, spin_lock(&fi->lock); rb_erase(&wpa->writepages_entry, &fi->writepages); while (wpa->next) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_write_in *inarg = 
&wpa->ia.write.in; struct fuse_writepage_args *next = wpa->next; @@ -1756,10 +1778,10 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args, * no invocations of fuse_writepage_end() while we're in * fuse_set_nowrite..fuse_release_nowrite section. */ - fuse_send_writepage(fc, next, inarg->offset + inarg->size); + fuse_send_writepage(fm, next, inarg->offset + inarg->size); } fi->writectr--; - fuse_writepage_finish(fc, wpa); + fuse_writepage_finish(fm, wpa); spin_unlock(&fi->lock); fuse_writepage_free(wpa); } @@ -2317,6 +2339,10 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) { struct fuse_file *ff = file->private_data; + /* DAX mmap is superior to direct_io mmap */ + if (FUSE_IS_DAX(file_inode(file))) + return fuse_dax_mmap(file, vma); + if (ff->open_flags & FOPEN_DIRECT_IO) { /* Can't provide the coherency needed for MAP_SHARED */ if (vma->vm_flags & VM_MAYSHARE) @@ -2395,7 +2421,7 @@ static void fuse_lk_fill(struct fuse_args *args, struct file *file, static int fuse_getlk(struct file *file, struct file_lock *fl) { struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_lk_in inarg; struct fuse_lk_out outarg; @@ -2405,9 +2431,9 @@ static int fuse_getlk(struct file *file, struct file_lock *fl) args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (!err) - err = convert_fuse_file_lock(fc, &outarg.lk, fl); + err = convert_fuse_file_lock(fm->fc, &outarg.lk, fl); return err; } @@ -2415,12 +2441,12 @@ static int fuse_getlk(struct file *file, struct file_lock *fl) static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) { struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_lk_in inarg; int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK; struct pid *pid = fl->fl_type != F_UNLCK ? task_tgid(current) : NULL; - pid_t pid_nr = pid_nr_ns(pid, fc->pid_ns); + pid_t pid_nr = pid_nr_ns(pid, fm->fc->pid_ns); int err; if (fl->fl_lmops && fl->fl_lmops->lm_grant) { @@ -2433,7 +2459,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) return 0; fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg); - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); /* locking is restartable */ if (err == -EINTR) @@ -2487,13 +2513,13 @@ static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl) static sector_t fuse_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_bmap_in inarg; struct fuse_bmap_out outarg; int err; - if (!inode->i_sb->s_bdev || fc->no_bmap) + if (!inode->i_sb->s_bdev || fm->fc->no_bmap) return 0; memset(&inarg, 0, sizeof(inarg)); @@ -2507,9 +2533,9 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) - fc->no_bmap = 1; + fm->fc->no_bmap = 1; return err ? 
0 : outarg.block; } @@ -2517,7 +2543,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_file *ff = file->private_data; FUSE_ARGS(args); struct fuse_lseek_in inarg = { @@ -2528,7 +2554,7 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) struct fuse_lseek_out outarg; int err; - if (fc->no_lseek) + if (fm->fc->no_lseek) goto fallback; args.opcode = FUSE_LSEEK; @@ -2539,10 +2565,10 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err) { if (err == -ENOSYS) { - fc->no_lseek = 1; + fm->fc->no_lseek = 1; goto fallback; } return err; @@ -2728,7 +2754,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, unsigned int flags) { struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_mount *fm = ff->fm; struct fuse_ioctl_in inarg = { .fh = ff->fh, .cmd = cmd, @@ -2761,12 +2787,12 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); err = -ENOMEM; - ap.pages = fuse_pages_alloc(fc->max_pages, GFP_KERNEL, &ap.descs); + ap.pages = fuse_pages_alloc(fm->fc->max_pages, GFP_KERNEL, &ap.descs); iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); if (!ap.pages || !iov_page) goto out; - fuse_page_descs_length_init(ap.descs, 0, fc->max_pages); + fuse_page_descs_length_init(ap.descs, 0, fm->fc->max_pages); /* * If restricted, initialize IO parameters as encoded in @cmd. 
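The fuse_direct_IO() hunk further down drops the old up-front short-read truncation and instead clamps the iterator to i_size around the request, restoring it before returning. A hedged, standalone illustration of that iov_iter round-trip (this helper is not part of the patch, omits the fuse_round_up() rounding the real code applies, and assumes the caller has already returned early for offset >= i_size):

static size_t example_clamp_iter_to_eof(struct iov_iter *iter, loff_t offset,
					loff_t i_size)
{
	size_t count = iov_iter_count(iter);
	size_t shortened = 0;

	if (offset + count > i_size) {
		/* Don't ask the server for bytes past EOF. */
		iov_iter_truncate(iter, i_size - offset);
		shortened = count - iov_iter_count(iter);
	}

	/* ... submit the I/O for iov_iter_count(iter) bytes here ... */

	/* Give the caller back its original view of the iterator. */
	iov_iter_reexpand(iter, iov_iter_count(iter) + shortened);
	return shortened;
}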
@@ -2811,7 +2837,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, /* make sure there are enough buffer pages and init request with them */ err = -ENOMEM; - if (max_pages > fc->max_pages) + if (max_pages > fm->fc->max_pages) goto out; while (ap.num_pages < max_pages) { ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); @@ -2848,7 +2874,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, ap.args.out_pages = true; ap.args.out_argvar = true; - transferred = fuse_simple_request(fc, &ap.args); + transferred = fuse_simple_request(fm, &ap.args); err = transferred; if (transferred < 0) goto out; @@ -2876,7 +2902,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, goto out; vaddr = kmap_atomic(ap.pages[0]); - err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr, + err = fuse_copy_ioctl_iovec(fm->fc, iov_page, vaddr, transferred, in_iovs + out_iovs, (flags & FUSE_IOCTL_COMPAT) != 0); kunmap_atomic(vaddr); @@ -2886,11 +2912,11 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, in_iov = iov_page; out_iov = in_iov + in_iovs; - err = fuse_verify_ioctl_iov(fc, in_iov, in_iovs); + err = fuse_verify_ioctl_iov(fm->fc, in_iov, in_iovs); if (err) goto out; - err = fuse_verify_ioctl_iov(fc, out_iov, out_iovs); + err = fuse_verify_ioctl_iov(fm->fc, out_iov, out_iovs); if (err) goto out; @@ -3000,13 +3026,13 @@ static void fuse_register_polled_file(struct fuse_conn *fc, __poll_t fuse_file_poll(struct file *file, poll_table *wait) { struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_mount *fm = ff->fm; struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh }; struct fuse_poll_out outarg; FUSE_ARGS(args); int err; - if (fc->no_poll) + if (fm->fc->no_poll) return DEFAULT_POLLMASK; poll_wait(file, &ff->poll_wait, wait); @@ -3018,7 +3044,7 @@ __poll_t fuse_file_poll(struct file *file, poll_table *wait) */ if (waitqueue_active(&ff->poll_wait)) { inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY; - fuse_register_polled_file(fc, ff); + fuse_register_polled_file(fm->fc, ff); } args.opcode = FUSE_POLL; @@ -3029,12 +3055,12 @@ __poll_t fuse_file_poll(struct file *file, poll_table *wait) args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (!err) return demangle_poll(outarg.revents); if (err == -ENOSYS) { - fc->no_poll = 1; + fm->fc->no_poll = 1; return DEFAULT_POLLMASK; } return EPOLLERR; @@ -3091,11 +3117,10 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) ssize_t ret = 0; struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; - bool async_dio = ff->fc->async_dio; loff_t pos = 0; struct inode *inode; loff_t i_size; - size_t count = iov_iter_count(iter); + size_t count = iov_iter_count(iter), shortened = 0; loff_t offset = iocb->ki_pos; struct fuse_io_priv *io; @@ -3103,17 +3128,9 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) inode = file->f_mapping->host; i_size = i_size_read(inode); - if ((iov_iter_rw(iter) == READ) && (offset > i_size)) + if ((iov_iter_rw(iter) == READ) && (offset >= i_size)) return 0; - /* optimization for short read */ - if (async_dio && iov_iter_rw(iter) != WRITE && offset + count > i_size) { - if (offset >= i_size) - return 0; - iov_iter_truncate(iter, fuse_round_up(ff->fc, i_size - offset)); - count = iov_iter_count(iter); - } - io = kmalloc(sizeof(struct fuse_io_priv), 
GFP_KERNEL); if (!io) return -ENOMEM; @@ -3129,15 +3146,22 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) * By default, we want to optimize all I/Os with async request * submission to the client filesystem if supported. */ - io->async = async_dio; + io->async = ff->fm->fc->async_dio; io->iocb = iocb; io->blocking = is_sync_kiocb(iocb); + /* optimization for short read */ + if (io->async && !io->write && offset + count > i_size) { + iov_iter_truncate(iter, fuse_round_up(ff->fm->fc, i_size - offset)); + shortened = count - iov_iter_count(iter); + count -= shortened; + } + /* * We cannot asynchronously extend the size of a file. * In such case the aio will behave exactly like sync io. */ - if ((offset + count > i_size) && iov_iter_rw(iter) == WRITE) + if ((offset + count > i_size) && io->write) io->blocking = true; if (io->async && io->blocking) { @@ -3155,6 +3179,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } else { ret = __fuse_direct_read(io, iter, &pos); } + iov_iter_reexpand(iter, iov_iter_count(iter) + shortened); if (io->async) { bool blocking = io->blocking; @@ -3197,7 +3222,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, struct fuse_file *ff = file->private_data; struct inode *inode = file_inode(file); struct fuse_inode *fi = get_fuse_inode(inode); - struct fuse_conn *fc = ff->fc; + struct fuse_mount *fm = ff->fm; FUSE_ARGS(args); struct fuse_fallocate_in inarg = { .fh = ff->fh, @@ -3209,14 +3234,23 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) || (mode & FALLOC_FL_PUNCH_HOLE); + bool block_faults = FUSE_IS_DAX(inode) && lock_inode; + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; - if (fc->no_fallocate) + if (fm->fc->no_fallocate) return -EOPNOTSUPP; if (lock_inode) { inode_lock(inode); + if (block_faults) { + down_write(&fi->i_mmap_sem); + err = fuse_dax_break_layouts(inode, 0, 0); + if (err) + goto out; + } + if (mode & FALLOC_FL_PUNCH_HOLE) { loff_t endbyte = offset + length - 1; @@ -3241,9 +3275,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { - fc->no_fallocate = 1; + fm->fc->no_fallocate = 1; err = -EOPNOTSUPP; } if (err) @@ -3253,7 +3287,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE)) { bool changed = fuse_write_update_size(inode, offset + length); - if (changed && fc->writeback_cache) + if (changed && fm->fc->writeback_cache) file_update_time(file); } @@ -3266,6 +3300,9 @@ out: if (!(mode & FALLOC_FL_KEEP_SIZE)) clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + if (block_faults) + up_write(&fi->i_mmap_sem); + if (lock_inode) inode_unlock(inode); @@ -3281,7 +3318,8 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); struct fuse_inode *fi_out = get_fuse_inode(inode_out); - struct fuse_conn *fc = ff_in->fc; + struct fuse_mount *fm = ff_in->fm; + struct fuse_conn *fc = fm->fc; FUSE_ARGS(args); struct fuse_copy_file_range_in inarg = { .fh_in = ff_in->fh, @@ -3350,7 +3388,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, args.out_numargs = 1; args.out_args[0].size = 
sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { fc->no_copy_file_range = 1; err = -EOPNOTSUPP; @@ -3405,6 +3443,7 @@ static const struct file_operations fuse_file_operations = { .release = fuse_release, .fsync = fuse_fsync, .lock = fuse_file_lock, + .get_unmapped_area = thp_get_unmapped_area, .flock = fuse_file_flock, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, @@ -3440,4 +3479,7 @@ void fuse_init_file_inode(struct inode *inode) fi->writectr = 0; init_waitqueue_head(&fi->page_waitq); fi->writepages = RB_ROOT; + + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_inode_init(inode); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 740a8a7d7ae6..d51598017d13 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -148,6 +148,20 @@ struct fuse_inode { /** Lock to protect write related fields */ spinlock_t lock; + + /** + * Can't take inode lock in fault path (leads to circular dependency). + * Introduce another semaphore which can be taken in fault path and + * then other filesystem paths can take this to block faults. + */ + struct rw_semaphore i_mmap_sem; + +#ifdef CONFIG_FUSE_DAX + /* + * Dax specific inode data + */ + struct fuse_inode_dax *dax; +#endif }; /** FUSE inode state bits */ @@ -161,12 +175,13 @@ enum { }; struct fuse_conn; +struct fuse_mount; struct fuse_release_args; /** FUSE specific file data */ struct fuse_file { /** Fuse connection for this file */ - struct fuse_conn *fc; + struct fuse_mount *fm; /* Argument space reserved for release */ struct fuse_release_args *release_args; @@ -252,7 +267,7 @@ struct fuse_args { bool may_block:1; struct fuse_in_arg in_args[3]; struct fuse_arg out_args[2]; - void (*end)(struct fuse_conn *fc, struct fuse_args *args, int error); + void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error); }; struct fuse_args_pages { @@ -360,6 +375,9 @@ struct fuse_req { /** virtio-fs's physically contiguous buffer for in and out args */ void *argbuf; #endif + + /** fuse_mount this request belongs to */ + struct fuse_mount *fm; }; struct fuse_iqueue; @@ -482,11 +500,15 @@ struct fuse_fs_context { bool destroy:1; bool no_control:1; bool no_force_umount:1; - bool no_mount_options:1; + bool legacy_opts_show:1; + bool dax:1; unsigned int max_read; unsigned int blksize; const char *subtype; + /* DAX device, may be NULL */ + struct dax_device *dax_dev; + /* fuse_dev pointer to fill in, should contain NULL on entry */ void **fudptr; }; @@ -494,9 +516,9 @@ struct fuse_fs_context { /** * A Fuse connection. * - * This structure is created, when the filesystem is mounted, and is - * destroyed, when the client device is closed and the filesystem is - * unmounted. + * This structure is created, when the root filesystem is mounted, and + * is destroyed, when the client device is closed and the last + * fuse_mount is destroyed. 
*/ struct fuse_conn { /** Lock protecting accessess to members of this structure */ @@ -610,6 +632,9 @@ struct fuse_conn { /** cache READLINK responses in page cache */ unsigned cache_symlinks:1; + /* show legacy mount options */ + unsigned int legacy_opts_show:1; + /* * The following bitfields are only for optimization purposes * and hence races in setting them will not cause malfunction @@ -717,8 +742,8 @@ struct fuse_conn { /** Do not allow MNT_FORCE umount */ unsigned int no_force_umount:1; - /* Do not show mount options */ - unsigned int no_mount_options:1; + /* Auto-mount submounts announced by the server */ + unsigned int auto_submounts:1; /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -726,10 +751,10 @@ struct fuse_conn { /** Negotiated minor version */ unsigned minor; - /** Entry on the fuse_conn_list */ + /** Entry on the fuse_mount_list */ struct list_head entry; - /** Device ID from super block */ + /** Device ID from the root super block */ dev_t dev; /** Dentries in the control filesystem */ @@ -747,24 +772,70 @@ struct fuse_conn { /** Called on final put */ void (*release)(struct fuse_conn *); - /** Super block for this connection. */ - struct super_block *sb; - - /** Read/write semaphore to hold when accessing sb. */ + /** + * Read/write semaphore to hold when accessing the sb of any + * fuse_mount belonging to this connection + */ struct rw_semaphore killsb; /** List of device instances belonging to this connection */ struct list_head devices; + +#ifdef CONFIG_FUSE_DAX + /* Dax specific conn data, non-NULL if DAX is enabled */ + struct fuse_conn_dax *dax; +#endif + + /** List of filesystems using this connection */ + struct list_head mounts; }; -static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) +/* + * Represents a mounted filesystem, potentially a submount. + * + * This object allows sharing a fuse_conn between separate mounts to + * allow submounts with dedicated superblocks and thus separate device + * IDs. + */ +struct fuse_mount { + /* Underlying (potentially shared) connection to the FUSE server */ + struct fuse_conn *fc; + + /* Refcount */ + refcount_t count; + + /* + * Super block for this connection (fc->killsb must be held when + * accessing this). + */ + struct super_block *sb; + + /* Entry on fc->mounts */ + struct list_head fc_entry; +}; + +static inline struct fuse_mount *get_fuse_mount_super(struct super_block *sb) { return sb->s_fs_info; } +static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) +{ + struct fuse_mount *fm = get_fuse_mount_super(sb); + + return fm ? fm->fc : NULL; +} + +static inline struct fuse_mount *get_fuse_mount(struct inode *inode) +{ + return get_fuse_mount_super(inode->i_sb); +} + static inline struct fuse_conn *get_fuse_conn(struct inode *inode) { - return get_fuse_conn_super(inode->i_sb); + struct fuse_mount *fm = get_fuse_mount(inode); + + return fm ? fm->fc : NULL; } static inline struct fuse_inode *get_fuse_inode(struct inode *inode) @@ -794,11 +865,6 @@ extern const struct dentry_operations fuse_dentry_operations; extern const struct dentry_operations fuse_root_dentry_operations; /** - * Inode to nodeid comparison. 
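With the split described above, per-superblock state moves into struct fuse_mount while negotiated and shared state stays on struct fuse_conn, and the accessors added in this hunk make that indirection explicit. A minimal sketch (not from the patch) of how a filesystem path now reaches both objects:

static bool example_uses_writeback(struct inode *inode)
{
	struct fuse_mount *fm = get_fuse_mount(inode);	/* sb->s_fs_info */
	struct fuse_conn *fc = fm->fc;			/* shared, refcounted */

	/*
	 * Negotiated feature bits still live on the connection, so every
	 * submount sharing this fuse_conn sees the same setting.
	 */
	return fc->writeback_cache;
}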
- */ -int fuse_inode_eq(struct inode *inode, void *_nodeidp); - -/** * Get a filled in inode */ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, @@ -848,7 +914,7 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, */ int fuse_open_common(struct inode *inode, struct file *file, bool isdir); -struct fuse_file *fuse_file_alloc(struct fuse_conn *fc); +struct fuse_file *fuse_file_alloc(struct fuse_mount *fm); void fuse_file_free(struct fuse_file *ff); void fuse_finish_open(struct inode *inode, struct file *file); @@ -916,14 +982,14 @@ void __exit fuse_ctl_cleanup(void); /** * Simple request sending that does request allocation and freeing */ -ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args); -int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args, +ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args); +int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args, gfp_t gfp_flags); /** * End a finished request */ -void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req); +void fuse_request_end(struct fuse_req *req); /* Abort all requests */ void fuse_abort_conn(struct fuse_conn *fc); @@ -949,7 +1015,8 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); /** * Initialize fuse_conn */ -void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns, +void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, + struct user_namespace *user_ns, const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv); /** @@ -957,11 +1024,21 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns, */ void fuse_conn_put(struct fuse_conn *fc); +/** + * Acquire reference to fuse_mount + */ +struct fuse_mount *fuse_mount_get(struct fuse_mount *fm); + +/** + * Release reference to fuse_mount + */ +void fuse_mount_put(struct fuse_mount *fm); + struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc); struct fuse_dev *fuse_dev_alloc(void); void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc); void fuse_dev_free(struct fuse_dev *fud); -void fuse_send_init(struct fuse_conn *fc); +void fuse_send_init(struct fuse_mount *fm); /** * Fill in superblock and initialize fuse connection @@ -970,12 +1047,26 @@ void fuse_send_init(struct fuse_conn *fc); */ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx); -/** - * Disassociate fuse connection from superblock and kill the superblock +/* + * Fill in superblock for submounts + * @sb: partially-initialized superblock to fill in + * @parent_fi: The fuse_inode of the parent filesystem where this submount is + * mounted + */ +int fuse_fill_super_submount(struct super_block *sb, + struct fuse_inode *parent_fi); + +/* + * Remove the mount from the connection * - * Calls kill_anon_super(), do not use with bdev mounts. + * Returns whether this was the last mount */ -void fuse_kill_sb_anon(struct super_block *sb); +bool fuse_mount_remove(struct fuse_mount *fm); + +/* + * Shut down the connection (possibly sending DESTROY request). + */ +void fuse_conn_destroy(struct fuse_mount *fm); /** * Add connection to control filesystem @@ -1011,9 +1102,19 @@ void fuse_set_nowrite(struct inode *inode); void fuse_release_nowrite(struct inode *inode); /** + * Scan all fuse_mounts belonging to fc to find the first where + * ilookup5() returns a result. Return that result and the + * respective fuse_mount in *fm (unless fm is NULL). + * + * The caller must hold fc->killsb. 
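fuse_ilookup(), declared just below, replaces direct ilookup5() calls on a single superblock, since a nodeid may now resolve on any of the connection's submounts. A hedged sketch of a caller, assuming it takes fc->killsb for reading the way the reverse-invalidation paths do (the helper name is hypothetical):

static int example_inval_node_attr(struct fuse_conn *fc, u64 nodeid)
{
	struct inode *inode;

	down_read(&fc->killsb);		/* keeps every fm->sb on fc->mounts alive */
	inode = fuse_ilookup(fc, nodeid, NULL);
	if (inode)
		fuse_invalidate_attr(inode);
	up_read(&fc->killsb);

	if (!inode)
		return -ENOENT;

	iput(inode);			/* drop the reference ilookup5() took */
	return 0;
}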
+ */ +struct inode *fuse_ilookup(struct fuse_conn *fc, u64 nodeid, + struct fuse_mount **fm); + +/** * File-system tells the kernel to invalidate cache for the given node id. */ -int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, +int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, loff_t offset, loff_t len); /** @@ -1026,10 +1127,10 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, * - is a file or oan empty directory * then the dentry is unhashed (d_delete()). */ -int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, +int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, u64 child_nodeid, struct qstr *name); -int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, +int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, bool isdir); /** @@ -1093,4 +1194,20 @@ unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args); u64 fuse_get_unique(struct fuse_iqueue *fiq); void fuse_free_conn(struct fuse_conn *fc); +/* dax.c */ + +#define FUSE_IS_DAX(inode) (IS_ENABLED(CONFIG_FUSE_DAX) && IS_DAX(inode)) + +ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to); +ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from); +int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma); +int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start, u64 dmap_end); +int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev); +void fuse_dax_conn_free(struct fuse_conn *fc); +bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi); +void fuse_dax_inode_init(struct inode *inode); +void fuse_dax_inode_cleanup(struct inode *inode); +bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment); +void fuse_dax_cancel_work(struct fuse_conn *fc); + #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index bba747520e9b..1a47afc95f80 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -85,14 +85,22 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) fi->orig_ino = 0; fi->state = 0; mutex_init(&fi->mutex); + init_rwsem(&fi->i_mmap_sem); spin_lock_init(&fi->lock); fi->forget = fuse_alloc_forget(); - if (!fi->forget) { - kmem_cache_free(fuse_inode_cachep, fi); - return NULL; - } + if (!fi->forget) + goto out_free; + + if (IS_ENABLED(CONFIG_FUSE_DAX) && !fuse_dax_inode_alloc(sb, fi)) + goto out_free_forget; return &fi->inode; + +out_free_forget: + kfree(fi->forget); +out_free: + kmem_cache_free(fuse_inode_cachep, fi); + return NULL; } static void fuse_free_inode(struct inode *inode) @@ -101,6 +109,9 @@ static void fuse_free_inode(struct inode *inode) mutex_destroy(&fi->mutex); kfree(fi->forget); +#ifdef CONFIG_FUSE_DAX + kfree(fi->dax); +#endif kmem_cache_free(fuse_inode_cachep, fi); } @@ -112,8 +123,14 @@ static void fuse_evict_inode(struct inode *inode) clear_inode(inode); if (inode->i_sb->s_flags & SB_ACTIVE) { struct fuse_conn *fc = get_fuse_conn(inode); - fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup); - fi->forget = NULL; + + if (FUSE_IS_DAX(inode)) + fuse_dax_inode_cleanup(inode); + if (fi->nlookup) { + fuse_queue_forget(fc, fi->forget, fi->nodeid, + fi->nlookup); + fi->forget = NULL; + } } if (S_ISREG(inode->i_mode) && !is_bad_inode(inode)) { WARN_ON(!list_empty(&fi->write_files)); @@ -268,7 +285,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) BUG(); } -int fuse_inode_eq(struct inode *inode, void *_nodeidp) +static int 
fuse_inode_eq(struct inode *inode, void *_nodeidp) { u64 nodeid = *(u64 *) _nodeidp; if (get_node_id(inode) == nodeid) @@ -292,7 +309,26 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, struct fuse_inode *fi; struct fuse_conn *fc = get_fuse_conn_super(sb); - retry: + /* + * Auto mount points get their node id from the submount root, which is + * not a unique identifier within this filesystem. + * + * To avoid conflicts, do not place submount points into the inode hash + * table. + */ + if (fc->auto_submounts && (attr->flags & FUSE_ATTR_SUBMOUNT) && + S_ISDIR(attr->mode)) { + inode = new_inode(sb); + if (!inode) + return NULL; + + fuse_init_inode(inode, attr); + get_fuse_inode(inode)->nodeid = nodeid; + inode->i_flags |= S_AUTOMOUNT; + goto done; + } + +retry: inode = iget5_locked(sb, nodeid, fuse_inode_eq, fuse_inode_set, &nodeid); if (!inode) return NULL; @@ -310,7 +346,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, iput(inode); goto retry; } - +done: fi = get_fuse_inode(inode); spin_lock(&fi->lock); fi->nlookup++; @@ -320,16 +356,37 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, return inode; } -int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, +struct inode *fuse_ilookup(struct fuse_conn *fc, u64 nodeid, + struct fuse_mount **fm) +{ + struct fuse_mount *fm_iter; + struct inode *inode; + + WARN_ON(!rwsem_is_locked(&fc->killsb)); + list_for_each_entry(fm_iter, &fc->mounts, fc_entry) { + if (!fm_iter->sb) + continue; + + inode = ilookup5(fm_iter->sb, nodeid, fuse_inode_eq, &nodeid); + if (inode) { + if (fm) + *fm = fm_iter; + return inode; + } + } + + return NULL; +} + +int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, loff_t offset, loff_t len) { - struct fuse_conn *fc = get_fuse_conn_super(sb); struct fuse_inode *fi; struct inode *inode; pgoff_t pg_start; pgoff_t pg_end; - inode = ilookup5(sb, nodeid, fuse_inode_eq, &nodeid); + inode = fuse_ilookup(fc, nodeid, NULL); if (!inode) return -ENOENT; @@ -379,28 +436,23 @@ static void fuse_umount_begin(struct super_block *sb) fuse_abort_conn(fc); } -static void fuse_send_destroy(struct fuse_conn *fc) +static void fuse_send_destroy(struct fuse_mount *fm) { - if (fc->conn_init) { + if (fm->fc->conn_init) { FUSE_ARGS(args); args.opcode = FUSE_DESTROY; args.force = true; args.nocreds = true; - fuse_simple_request(fc, &args); + fuse_simple_request(fm, &args); } } static void fuse_put_super(struct super_block *sb) { - struct fuse_conn *fc = get_fuse_conn_super(sb); + struct fuse_mount *fm = get_fuse_mount_super(sb); - mutex_lock(&fuse_mutex); - list_del(&fc->entry); - fuse_ctl_remove_conn(fc); - mutex_unlock(&fuse_mutex); - - fuse_conn_put(fc); + fuse_mount_put(fm); } static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr) @@ -420,12 +472,12 @@ static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; - struct fuse_conn *fc = get_fuse_conn_super(sb); + struct fuse_mount *fm = get_fuse_mount_super(sb); FUSE_ARGS(args); struct fuse_statfs_out outarg; int err; - if (!fuse_allow_current_process(fc)) { + if (!fuse_allow_current_process(fm->fc)) { buf->f_type = FUSE_SUPER_MAGIC; return 0; } @@ -437,7 +489,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fc, &args); + err = 
fuse_simple_request(fm, &args); if (!err) convert_fuse_statfs(buf, &outarg.st); return err; @@ -573,19 +625,25 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root) struct super_block *sb = root->d_sb; struct fuse_conn *fc = get_fuse_conn_super(sb); - if (fc->no_mount_options) - return 0; + if (fc->legacy_opts_show) { + seq_printf(m, ",user_id=%u", + from_kuid_munged(fc->user_ns, fc->user_id)); + seq_printf(m, ",group_id=%u", + from_kgid_munged(fc->user_ns, fc->group_id)); + if (fc->default_permissions) + seq_puts(m, ",default_permissions"); + if (fc->allow_other) + seq_puts(m, ",allow_other"); + if (fc->max_read != ~0) + seq_printf(m, ",max_read=%u", fc->max_read); + if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE) + seq_printf(m, ",blksize=%lu", sb->s_blocksize); + } +#ifdef CONFIG_FUSE_DAX + if (fc->dax) + seq_puts(m, ",dax"); +#endif - seq_printf(m, ",user_id=%u", from_kuid_munged(fc->user_ns, fc->user_id)); - seq_printf(m, ",group_id=%u", from_kgid_munged(fc->user_ns, fc->group_id)); - if (fc->default_permissions) - seq_puts(m, ",default_permissions"); - if (fc->allow_other) - seq_puts(m, ",allow_other"); - if (fc->max_read != ~0) - seq_printf(m, ",max_read=%u", fc->max_read); - if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE) - seq_printf(m, ",blksize=%lu", sb->s_blocksize); return 0; } @@ -615,7 +673,8 @@ static void fuse_pqueue_init(struct fuse_pqueue *fpq) fpq->connected = 1; } -void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns, +void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, + struct user_namespace *user_ns, const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv) { memset(fc, 0, sizeof(*fc)); @@ -642,6 +701,11 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns, fc->pid_ns = get_pid_ns(task_active_pid_ns(current)); fc->user_ns = get_user_ns(user_ns); fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; + + INIT_LIST_HEAD(&fc->mounts); + list_add(&fm->fc_entry, &fc->mounts); + fm->fc = fc; + refcount_set(&fm->count, 1); } EXPORT_SYMBOL_GPL(fuse_conn_init); @@ -650,6 +714,8 @@ void fuse_conn_put(struct fuse_conn *fc) if (refcount_dec_and_test(&fc->count)) { struct fuse_iqueue *fiq = &fc->iq; + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_conn_free(fc); if (fiq->ops->release) fiq->ops->release(fiq); put_pid_ns(fc->pid_ns); @@ -666,6 +732,23 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc) } EXPORT_SYMBOL_GPL(fuse_conn_get); +void fuse_mount_put(struct fuse_mount *fm) +{ + if (refcount_dec_and_test(&fm->count)) { + if (fm->fc) + fuse_conn_put(fm->fc); + kfree(fm); + } +} +EXPORT_SYMBOL_GPL(fuse_mount_put); + +struct fuse_mount *fuse_mount_get(struct fuse_mount *fm) +{ + refcount_inc(&fm->count); + return fm; +} +EXPORT_SYMBOL_GPL(fuse_mount_get); + static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) { struct fuse_attr attr; @@ -895,14 +978,16 @@ struct fuse_init_args { struct fuse_init_out out; }; -static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args, +static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, int error) { + struct fuse_conn *fc = fm->fc; struct fuse_init_args *ia = container_of(args, typeof(*ia), args); struct fuse_init_out *arg = &ia->out; + bool ok = true; if (error || arg->major != FUSE_KERNEL_VERSION) - fc->conn_error = 1; + ok = false; else { unsigned long ra_pages; @@ -950,11 +1035,11 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args, if (arg->flags & 
FUSE_HANDLE_KILLPRIV) fc->handle_killpriv = 1; if (arg->time_gran && arg->time_gran <= 1000000000) - fc->sb->s_time_gran = arg->time_gran; + fm->sb->s_time_gran = arg->time_gran; if ((arg->flags & FUSE_POSIX_ACL)) { fc->default_permissions = 1; fc->posix_acl = 1; - fc->sb->s_xattr = fuse_acl_xattr_handlers; + fm->sb->s_xattr = fuse_acl_xattr_handlers; } if (arg->flags & FUSE_CACHE_SYMLINKS) fc->cache_symlinks = 1; @@ -965,14 +1050,19 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args, min_t(unsigned int, FUSE_MAX_MAX_PAGES, max_t(unsigned int, arg->max_pages, 1)); } + if (IS_ENABLED(CONFIG_FUSE_DAX) && + arg->flags & FUSE_MAP_ALIGNMENT && + !fuse_dax_check_alignment(fc, arg->map_alignment)) { + ok = false; + } } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; fc->no_flock = 1; } - fc->sb->s_bdi->ra_pages = - min(fc->sb->s_bdi->ra_pages, ra_pages); + fm->sb->s_bdi->ra_pages = + min(fm->sb->s_bdi->ra_pages, ra_pages); fc->minor = arg->minor; fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; fc->max_write = max_t(unsigned, 4096, fc->max_write); @@ -980,11 +1070,16 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args, } kfree(ia); + if (!ok) { + fc->conn_init = 0; + fc->conn_error = 1; + } + fuse_set_initialized(fc); wake_up_all(&fc->blocked_waitq); } -void fuse_send_init(struct fuse_conn *fc) +void fuse_send_init(struct fuse_mount *fm) { struct fuse_init_args *ia; @@ -992,7 +1087,7 @@ void fuse_send_init(struct fuse_conn *fc) ia->in.major = FUSE_KERNEL_VERSION; ia->in.minor = FUSE_KERNEL_MINOR_VERSION; - ia->in.max_readahead = fc->sb->s_bdi->ra_pages * PAGE_SIZE; + ia->in.max_readahead = fm->sb->s_bdi->ra_pages * PAGE_SIZE; ia->in.flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | @@ -1003,6 +1098,13 @@ void fuse_send_init(struct fuse_conn *fc) FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA; +#ifdef CONFIG_FUSE_DAX + if (fm->fc->dax) + ia->in.flags |= FUSE_MAP_ALIGNMENT; +#endif + if (fm->fc->auto_submounts) + ia->in.flags |= FUSE_SUBMOUNTS; + ia->args.opcode = FUSE_INIT; ia->args.in_numargs = 1; ia->args.in_args[0].size = sizeof(ia->in); @@ -1018,8 +1120,8 @@ void fuse_send_init(struct fuse_conn *fc) ia->args.nocreds = true; ia->args.end = process_init_reply; - if (fuse_simple_background(fc, &ia->args, GFP_KERNEL) != 0) - process_init_reply(fc, &ia->args, -ENOTCONN); + if (fuse_simple_background(fm, &ia->args, GFP_KERNEL) != 0) + process_init_reply(fm, &ia->args, -ENOTCONN); } EXPORT_SYMBOL_GPL(fuse_send_init); @@ -1049,9 +1151,9 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) if (err) return err; - sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; /* fuse does it's own writeback accounting */ - sb->s_bdi->capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT; + sb->s_bdi->capabilities &= ~BDI_CAP_WRITEBACK_ACCT; + sb->s_bdi->capabilities |= BDI_CAP_STRICTLIMIT; /* * For a single fuse filesystem use max 1% of dirty + @@ -1130,10 +1232,92 @@ void fuse_dev_free(struct fuse_dev *fud) } EXPORT_SYMBOL_GPL(fuse_dev_free); +static void fuse_fill_attr_from_inode(struct fuse_attr *attr, + const struct fuse_inode *fi) +{ + *attr = (struct fuse_attr){ + .ino = fi->inode.i_ino, + .size = fi->inode.i_size, + .blocks = fi->inode.i_blocks, + .atime = fi->inode.i_atime.tv_sec, + .mtime = fi->inode.i_mtime.tv_sec, + 
.ctime = fi->inode.i_ctime.tv_sec, + .atimensec = fi->inode.i_atime.tv_nsec, + .mtimensec = fi->inode.i_mtime.tv_nsec, + .ctimensec = fi->inode.i_ctime.tv_nsec, + .mode = fi->inode.i_mode, + .nlink = fi->inode.i_nlink, + .uid = fi->inode.i_uid.val, + .gid = fi->inode.i_gid.val, + .rdev = fi->inode.i_rdev, + .blksize = 1u << fi->inode.i_blkbits, + }; +} + +static void fuse_sb_defaults(struct super_block *sb) +{ + sb->s_magic = FUSE_SUPER_MAGIC; + sb->s_op = &fuse_super_operations; + sb->s_xattr = fuse_xattr_handlers; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_time_gran = 1; + sb->s_export_op = &fuse_export_operations; + sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE; + if (sb->s_user_ns != &init_user_ns) + sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER; + sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); + + /* + * If we are not in the initial user namespace posix + * acls must be translated. + */ + if (sb->s_user_ns != &init_user_ns) + sb->s_xattr = fuse_no_acl_xattr_handlers; +} + +int fuse_fill_super_submount(struct super_block *sb, + struct fuse_inode *parent_fi) +{ + struct fuse_mount *fm = get_fuse_mount_super(sb); + struct super_block *parent_sb = parent_fi->inode.i_sb; + struct fuse_attr root_attr; + struct inode *root; + + fuse_sb_defaults(sb); + fm->sb = sb; + + WARN_ON(sb->s_bdi != &noop_backing_dev_info); + sb->s_bdi = bdi_get(parent_sb->s_bdi); + + sb->s_xattr = parent_sb->s_xattr; + sb->s_time_gran = parent_sb->s_time_gran; + sb->s_blocksize = parent_sb->s_blocksize; + sb->s_blocksize_bits = parent_sb->s_blocksize_bits; + sb->s_subtype = kstrdup(parent_sb->s_subtype, GFP_KERNEL); + if (parent_sb->s_subtype && !sb->s_subtype) + return -ENOMEM; + + fuse_fill_attr_from_inode(&root_attr, parent_fi); + root = fuse_iget(sb, parent_fi->nodeid, 0, &root_attr, 0, 0); + /* + * This inode is just a duplicate, so it is not looked up and + * its nlookup should not be incremented. fuse_iget() does + * that, though, so undo it here. + */ + get_fuse_inode(root)->nlookup--; + sb->s_d_op = &fuse_dentry_operations; + sb->s_root = d_make_root(root); + if (!sb->s_root) + return -ENOMEM; + + return 0; +} + int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) { struct fuse_dev *fud = NULL; - struct fuse_conn *fc = get_fuse_conn_super(sb); + struct fuse_mount *fm = get_fuse_mount_super(sb); + struct fuse_conn *fc = fm->fc; struct inode *root; struct dentry *root_dentry; int err; @@ -1142,7 +1326,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) if (sb->s_flags & SB_MANDLOCK) goto err; - sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); + fuse_sb_defaults(sb); if (ctx->is_bdev) { #ifdef CONFIG_BLOCK @@ -1157,32 +1341,21 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) sb->s_subtype = ctx->subtype; ctx->subtype = NULL; - sb->s_magic = FUSE_SUPER_MAGIC; - sb->s_op = &fuse_super_operations; - sb->s_xattr = fuse_xattr_handlers; - sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_time_gran = 1; - sb->s_export_op = &fuse_export_operations; - sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE; - if (sb->s_user_ns != &init_user_ns) - sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER; - - /* - * If we are not in the initial user namespace posix - * acls must be translated. 
- */ - if (sb->s_user_ns != &init_user_ns) - sb->s_xattr = fuse_no_acl_xattr_handlers; + if (IS_ENABLED(CONFIG_FUSE_DAX)) { + err = fuse_dax_conn_alloc(fc, ctx->dax_dev); + if (err) + goto err; + } if (ctx->fudptr) { err = -ENOMEM; fud = fuse_dev_alloc_install(fc); if (!fud) - goto err; + goto err_free_dax; } fc->dev = sb->s_dev; - fc->sb = sb; + fm->sb = sb; err = fuse_bdi_init(fc, sb); if (err) goto err_dev_free; @@ -1196,11 +1369,11 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) fc->allow_other = ctx->allow_other; fc->user_id = ctx->user_id; fc->group_id = ctx->group_id; - fc->max_read = max_t(unsigned, 4096, ctx->max_read); + fc->legacy_opts_show = ctx->legacy_opts_show; + fc->max_read = max_t(unsigned int, 4096, ctx->max_read); fc->destroy = ctx->destroy; fc->no_control = ctx->no_control; fc->no_force_umount = ctx->no_force_umount; - fc->no_mount_options = ctx->no_mount_options; err = -ENOMEM; root = fuse_get_root_inode(sb, ctx->rootmode); @@ -1233,6 +1406,9 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) err_dev_free: if (fud) fuse_dev_free(fud); + err_free_dax: + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_conn_free(fc); err: return err; } @@ -1244,6 +1420,7 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) struct file *file; int err; struct fuse_conn *fc; + struct fuse_mount *fm; err = -EINVAL; file = fget(ctx->fd); @@ -1264,9 +1441,16 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) if (!fc) goto err_fput; - fuse_conn_init(fc, sb->s_user_ns, &fuse_dev_fiq_ops, NULL); + fm = kzalloc(sizeof(*fm), GFP_KERNEL); + if (!fm) { + kfree(fc); + goto err_fput; + } + + fuse_conn_init(fc, fm, sb->s_user_ns, &fuse_dev_fiq_ops, NULL); fc->release = fuse_free_conn; - sb->s_fs_info = fc; + + sb->s_fs_info = fm; err = fuse_fill_super_common(sb, ctx); if (err) @@ -1277,11 +1461,11 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) * CPUs after this */ fput(file); - fuse_send_init(get_fuse_conn_super(sb)); + fuse_send_init(get_fuse_mount_super(sb)); return 0; err_put_conn: - fuse_conn_put(fc); + fuse_mount_put(fm); sb->s_fs_info = NULL; err_fput: fput(file); @@ -1325,6 +1509,7 @@ static int fuse_init_fs_context(struct fs_context *fc) ctx->max_read = ~0; ctx->blksize = FUSE_DEFAULT_BLKSIZE; + ctx->legacy_opts_show = true; #ifdef CONFIG_BLOCK if (fc->fs_type == &fuseblk_fs_type) { @@ -1338,29 +1523,52 @@ static int fuse_init_fs_context(struct fs_context *fc) return 0; } -static void fuse_sb_destroy(struct super_block *sb) +bool fuse_mount_remove(struct fuse_mount *fm) { - struct fuse_conn *fc = get_fuse_conn_super(sb); + struct fuse_conn *fc = fm->fc; + bool last = false; - if (fc) { - if (fc->destroy) - fuse_send_destroy(fc); + down_write(&fc->killsb); + list_del_init(&fm->fc_entry); + if (list_empty(&fc->mounts)) + last = true; + up_write(&fc->killsb); - fuse_abort_conn(fc); - fuse_wait_aborted(fc); + return last; +} +EXPORT_SYMBOL_GPL(fuse_mount_remove); - down_write(&fc->killsb); - fc->sb = NULL; - up_write(&fc->killsb); +void fuse_conn_destroy(struct fuse_mount *fm) +{ + struct fuse_conn *fc = fm->fc; + + if (fc->destroy) + fuse_send_destroy(fm); + + fuse_abort_conn(fc); + fuse_wait_aborted(fc); + + if (!list_empty(&fc->entry)) { + mutex_lock(&fuse_mutex); + list_del(&fc->entry); + fuse_ctl_remove_conn(fc); + mutex_unlock(&fuse_mutex); } } +EXPORT_SYMBOL_GPL(fuse_conn_destroy); -void fuse_kill_sb_anon(struct super_block *sb) +static void 
fuse_kill_sb_anon(struct super_block *sb) { - fuse_sb_destroy(sb); + struct fuse_mount *fm = get_fuse_mount_super(sb); + bool last; + + if (fm) { + last = fuse_mount_remove(fm); + if (last) + fuse_conn_destroy(fm); + } kill_anon_super(sb); } -EXPORT_SYMBOL_GPL(fuse_kill_sb_anon); static struct file_system_type fuse_fs_type = { .owner = THIS_MODULE, @@ -1375,7 +1583,14 @@ MODULE_ALIAS_FS("fuse"); #ifdef CONFIG_BLOCK static void fuse_kill_sb_blk(struct super_block *sb) { - fuse_sb_destroy(sb); + struct fuse_mount *fm = get_fuse_mount_super(sb); + bool last; + + if (fm) { + last = fuse_mount_remove(fm); + if (last) + fuse_conn_destroy(fm); + } kill_block_super(sb); } diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 90e3f01bd796..3b5e91045871 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -252,7 +252,7 @@ retry: static void fuse_force_forget(struct file *file, u64 nodeid) { struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_forget_in inarg; FUSE_ARGS(args); @@ -266,7 +266,7 @@ static void fuse_force_forget(struct file *file, u64 nodeid) args.force = true; args.noreply = true; - fuse_simple_request(fc, &args); + fuse_simple_request(fm, &args); /* ignore errors */ } @@ -320,7 +320,7 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) ssize_t res; struct page *page; struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_io_args ia = {}; struct fuse_args_pages *ap = &ia.ap; struct fuse_page_desc desc = { .length = PAGE_SIZE }; @@ -337,7 +337,7 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) ap->pages = &page; ap->descs = &desc; if (plus) { - attr_version = fuse_get_attr_version(fc); + attr_version = fuse_get_attr_version(fm->fc); fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, FUSE_READDIRPLUS); } else { @@ -345,7 +345,7 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) FUSE_READDIR); } locked = fuse_lock_inode(inode); - res = fuse_simple_request(fc, &ap->args); + res = fuse_simple_request(fm, &ap->args); fuse_unlock_inode(inode, locked); if (res >= 0) { if (!res) { diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 104f35de5270..21a9e534417c 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -5,12 +5,17 @@ */ #include <linux/fs.h> +#include <linux/dax.h> +#include <linux/pci.h> +#include <linux/pfn_t.h> #include <linux/module.h> #include <linux/virtio.h> #include <linux/virtio_fs.h> #include <linux/delay.h> #include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/highmem.h> +#include <linux/uio.h> #include "fuse_i.h" /* List of virtio-fs device instances and a lock for the list. 
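
fuse_kill_sb_anon() and fuse_kill_sb_blk() above now share one shape: detach the fuse_mount from its fuse_conn with fuse_mount_remove() and tear the connection down only if that was the last mount, which is what lets several superblocks share a connection. The sketch below models only that rule in userspace C; the struct layouts, the list walk and the helper names are simplified stand-ins for the kernel's killsb-protected list.

/* Userspace model of fuse_mount_remove()/fuse_conn_destroy(); simplified types. */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct conn;

struct mount {
	struct mount *next;
	struct conn *conn;
};

struct conn {
	struct mount *mounts;     /* kernel: fc->mounts, protected by fc->killsb */
	bool destroyed;
};

/* Unlink one mount; report whether it was the last one on the connection. */
static bool mount_remove(struct mount *m)
{
	struct mount **pp = &m->conn->mounts;

	while (*pp && *pp != m)
		pp = &(*pp)->next;
	if (*pp)
		*pp = m->next;
	return m->conn->mounts == NULL;   /* last mount gone? */
}

/* Reached only for the last mount, mirroring fuse_conn_destroy(). */
static void conn_destroy(struct conn *c)
{
	c->destroyed = true;
	printf("connection torn down\n");
}

static void kill_sb(struct mount *m)
{
	struct conn *c = m->conn;

	if (mount_remove(m))
		conn_destroy(c);
	free(m);
}

int main(void)
{
	struct conn c = { 0 };
	struct mount *a = calloc(1, sizeof(*a)), *b = calloc(1, sizeof(*b));

	a->conn = b->conn = &c;
	a->next = b;
	c.mounts = a;

	kill_sb(b);          /* not last: connection stays up */
	kill_sb(a);          /* last mount: connection torn down */
	return c.destroyed ? 0 : 1;
}

In the kernel the list manipulation happens under down_write(&fc->killsb) with list_del_init(); the model keeps just the destroy-on-last-mount decision.
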
Also provides @@ -24,6 +29,8 @@ enum { VQ_REQUEST }; +#define VQ_NAME_LEN 24 + /* Per-virtqueue state */ struct virtio_fs_vq { spinlock_t lock; @@ -36,7 +43,7 @@ struct virtio_fs_vq { bool connected; long in_flight; struct completion in_flight_zero; /* No inflight requests */ - char name[24]; + char name[VQ_NAME_LEN]; } ____cacheline_aligned_in_smp; /* A virtio-fs device instance */ @@ -47,6 +54,12 @@ struct virtio_fs { struct virtio_fs_vq *vqs; unsigned int nvqs; /* number of virtqueues */ unsigned int num_request_queues; /* number of request queues */ + struct dax_device *dax_dev; + + /* DAX memory window where file contents are mapped */ + void *window_kaddr; + phys_addr_t window_phys_addr; + size_t window_len; }; struct virtio_fs_forget_req { @@ -69,6 +82,44 @@ struct virtio_fs_req_work { static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, struct fuse_req *req, bool in_flight); +enum { + OPT_DAX, +}; + +static const struct fs_parameter_spec virtio_fs_parameters[] = { + fsparam_flag("dax", OPT_DAX), + {} +}; + +static int virtio_fs_parse_param(struct fs_context *fc, + struct fs_parameter *param) +{ + struct fs_parse_result result; + struct fuse_fs_context *ctx = fc->fs_private; + int opt; + + opt = fs_parse(fc, virtio_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case OPT_DAX: + ctx->dax = 1; + break; + default: + return -EINVAL; + } + + return 0; +} + +static void virtio_fs_free_fc(struct fs_context *fc) +{ + struct fuse_fs_context *ctx = fc->fs_private; + + kfree(ctx); +} + static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq) { struct virtio_fs *fs = vq->vdev->priv; @@ -289,7 +340,6 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work) struct fuse_req *req; struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, dispatch_work.work); - struct fuse_conn *fc = fsvq->fud->fc; int ret; pr_debug("virtio-fs: worker %s called.\n", __func__); @@ -304,7 +354,7 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work) list_del_init(&req->list); spin_unlock(&fsvq->lock); - fuse_request_end(fc, req); + fuse_request_end(req); } /* Dispatch pending requests */ @@ -335,7 +385,7 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work) spin_unlock(&fsvq->lock); pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n", ret); - fuse_request_end(fc, req); + fuse_request_end(req); } } } @@ -495,7 +545,6 @@ static void virtio_fs_request_complete(struct fuse_req *req, struct virtio_fs_vq *fsvq) { struct fuse_pqueue *fpq = &fsvq->fud->pq; - struct fuse_conn *fc = fsvq->fud->fc; struct fuse_args *args; struct fuse_args_pages *ap; unsigned int len, i, thislen; @@ -528,7 +577,7 @@ static void virtio_fs_request_complete(struct fuse_req *req, clear_bit(FR_SENT, &req->flags); spin_unlock(&fpq->lock); - fuse_request_end(fc, req); + fuse_request_end(req); spin_lock(&fsvq->lock); dec_in_flight_req(fsvq); spin_unlock(&fsvq->lock); @@ -596,6 +645,26 @@ static void virtio_fs_vq_done(struct virtqueue *vq) schedule_work(&fsvq->done_work); } +static void virtio_fs_init_vq(struct virtio_fs_vq *fsvq, char *name, + int vq_type) +{ + strncpy(fsvq->name, name, VQ_NAME_LEN); + spin_lock_init(&fsvq->lock); + INIT_LIST_HEAD(&fsvq->queued_reqs); + INIT_LIST_HEAD(&fsvq->end_reqs); + init_completion(&fsvq->in_flight_zero); + + if (vq_type == VQ_REQUEST) { + INIT_WORK(&fsvq->done_work, virtio_fs_requests_done_work); + INIT_DELAYED_WORK(&fsvq->dispatch_work, + virtio_fs_request_dispatch_work); + } else { + 
INIT_WORK(&fsvq->done_work, virtio_fs_hiprio_done_work); + INIT_DELAYED_WORK(&fsvq->dispatch_work, + virtio_fs_hiprio_dispatch_work); + } +} + /* Initialize virtqueues */ static int virtio_fs_setup_vqs(struct virtio_device *vdev, struct virtio_fs *fs) @@ -611,7 +680,7 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev, if (fs->num_request_queues == 0) return -EINVAL; - fs->nvqs = 1 + fs->num_request_queues; + fs->nvqs = VQ_REQUEST + fs->num_request_queues; fs->vqs = kcalloc(fs->nvqs, sizeof(fs->vqs[VQ_HIPRIO]), GFP_KERNEL); if (!fs->vqs) return -ENOMEM; @@ -625,29 +694,17 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev, goto out; } + /* Initialize the hiprio/forget request virtqueue */ callbacks[VQ_HIPRIO] = virtio_fs_vq_done; - snprintf(fs->vqs[VQ_HIPRIO].name, sizeof(fs->vqs[VQ_HIPRIO].name), - "hiprio"); + virtio_fs_init_vq(&fs->vqs[VQ_HIPRIO], "hiprio", VQ_HIPRIO); names[VQ_HIPRIO] = fs->vqs[VQ_HIPRIO].name; - INIT_WORK(&fs->vqs[VQ_HIPRIO].done_work, virtio_fs_hiprio_done_work); - INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].queued_reqs); - INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].end_reqs); - INIT_DELAYED_WORK(&fs->vqs[VQ_HIPRIO].dispatch_work, - virtio_fs_hiprio_dispatch_work); - init_completion(&fs->vqs[VQ_HIPRIO].in_flight_zero); - spin_lock_init(&fs->vqs[VQ_HIPRIO].lock); /* Initialize the requests virtqueues */ for (i = VQ_REQUEST; i < fs->nvqs; i++) { - spin_lock_init(&fs->vqs[i].lock); - INIT_WORK(&fs->vqs[i].done_work, virtio_fs_requests_done_work); - INIT_DELAYED_WORK(&fs->vqs[i].dispatch_work, - virtio_fs_request_dispatch_work); - INIT_LIST_HEAD(&fs->vqs[i].queued_reqs); - INIT_LIST_HEAD(&fs->vqs[i].end_reqs); - init_completion(&fs->vqs[i].in_flight_zero); - snprintf(fs->vqs[i].name, sizeof(fs->vqs[i].name), - "requests.%u", i - VQ_REQUEST); + char vq_name[VQ_NAME_LEN]; + + snprintf(vq_name, VQ_NAME_LEN, "requests.%u", i - VQ_REQUEST); + virtio_fs_init_vq(&fs->vqs[i], vq_name, VQ_REQUEST); callbacks[i] = virtio_fs_vq_done; names[i] = fs->vqs[i].name; } @@ -676,6 +733,130 @@ static void virtio_fs_cleanup_vqs(struct virtio_device *vdev, vdev->config->del_vqs(vdev); } +/* Map a window offset to a page frame number. The window offset will have + * been produced by .iomap_begin(), which maps a file offset to a window + * offset. + */ +static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, + long nr_pages, void **kaddr, pfn_t *pfn) +{ + struct virtio_fs *fs = dax_get_private(dax_dev); + phys_addr_t offset = PFN_PHYS(pgoff); + size_t max_nr_pages = fs->window_len/PAGE_SIZE - pgoff; + + if (kaddr) + *kaddr = fs->window_kaddr + offset; + if (pfn) + *pfn = phys_to_pfn_t(fs->window_phys_addr + offset, + PFN_DEV | PFN_MAP); + return nr_pages > max_nr_pages ? 
max_nr_pages : nr_pages; +} + +static size_t virtio_fs_copy_from_iter(struct dax_device *dax_dev, + pgoff_t pgoff, void *addr, + size_t bytes, struct iov_iter *i) +{ + return copy_from_iter(addr, bytes, i); +} + +static size_t virtio_fs_copy_to_iter(struct dax_device *dax_dev, + pgoff_t pgoff, void *addr, + size_t bytes, struct iov_iter *i) +{ + return copy_to_iter(addr, bytes, i); +} + +static int virtio_fs_zero_page_range(struct dax_device *dax_dev, + pgoff_t pgoff, size_t nr_pages) +{ + long rc; + void *kaddr; + + rc = dax_direct_access(dax_dev, pgoff, nr_pages, &kaddr, NULL); + if (rc < 0) + return rc; + memset(kaddr, 0, nr_pages << PAGE_SHIFT); + dax_flush(dax_dev, kaddr, nr_pages << PAGE_SHIFT); + return 0; +} + +static const struct dax_operations virtio_fs_dax_ops = { + .direct_access = virtio_fs_direct_access, + .copy_from_iter = virtio_fs_copy_from_iter, + .copy_to_iter = virtio_fs_copy_to_iter, + .zero_page_range = virtio_fs_zero_page_range, +}; + +static void virtio_fs_cleanup_dax(void *data) +{ + struct dax_device *dax_dev = data; + + kill_dax(dax_dev); + put_dax(dax_dev); +} + +static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs) +{ + struct virtio_shm_region cache_reg; + struct dev_pagemap *pgmap; + bool have_cache; + + if (!IS_ENABLED(CONFIG_FUSE_DAX)) + return 0; + + /* Get cache region */ + have_cache = virtio_get_shm_region(vdev, &cache_reg, + (u8)VIRTIO_FS_SHMCAP_ID_CACHE); + if (!have_cache) { + dev_notice(&vdev->dev, "%s: No cache capability\n", __func__); + return 0; + } + + if (!devm_request_mem_region(&vdev->dev, cache_reg.addr, cache_reg.len, + dev_name(&vdev->dev))) { + dev_warn(&vdev->dev, "could not reserve region addr=0x%llx len=0x%llx\n", + cache_reg.addr, cache_reg.len); + return -EBUSY; + } + + dev_notice(&vdev->dev, "Cache len: 0x%llx @ 0x%llx\n", cache_reg.len, + cache_reg.addr); + + pgmap = devm_kzalloc(&vdev->dev, sizeof(*pgmap), GFP_KERNEL); + if (!pgmap) + return -ENOMEM; + + pgmap->type = MEMORY_DEVICE_FS_DAX; + + /* Ideally we would directly use the PCI BAR resource but + * devm_memremap_pages() wants its own copy in pgmap. So + * initialize a struct resource from scratch (only the start + * and end fields will be used). + */ + pgmap->range = (struct range) { + .start = (phys_addr_t) cache_reg.addr, + .end = (phys_addr_t) cache_reg.addr + cache_reg.len - 1, + }; + pgmap->nr_range = 1; + + fs->window_kaddr = devm_memremap_pages(&vdev->dev, pgmap); + if (IS_ERR(fs->window_kaddr)) + return PTR_ERR(fs->window_kaddr); + + fs->window_phys_addr = (phys_addr_t) cache_reg.addr; + fs->window_len = (phys_addr_t) cache_reg.len; + + dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len 0x%llx\n", + __func__, fs->window_kaddr, cache_reg.addr, cache_reg.len); + + fs->dax_dev = alloc_dax(fs, NULL, &virtio_fs_dax_ops, 0); + if (IS_ERR(fs->dax_dev)) + return PTR_ERR(fs->dax_dev); + + return devm_add_action_or_reset(&vdev->dev, virtio_fs_cleanup_dax, + fs->dax_dev); +} + static int virtio_fs_probe(struct virtio_device *vdev) { struct virtio_fs *fs; @@ -697,6 +878,10 @@ static int virtio_fs_probe(struct virtio_device *vdev) /* TODO vq affinity */ + ret = virtio_fs_setup_dax(vdev, fs); + if (ret < 0) + goto out_vqs; + /* Bring the device online in case the filesystem is mounted and * requests need to be sent before we return. 
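
virtio_fs_direct_access() above turns a page offset inside the shared-memory cache window into a mapped address plus a physical address, clamping the page count so the range never runs past the window. The userspace sketch below keeps only that offset arithmetic; PAGE_SIZE, the dax_window struct and the added bounds guard are assumptions of the model, and pfn_t plus the devm plumbing are dropped.

/* Userspace sketch of the window-offset math in virtio_fs_direct_access(). */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096ul

struct dax_window {
	uint8_t *kaddr;        /* mapped base of the cache window */
	uint64_t phys_addr;    /* bus/physical base of the window */
	uint64_t len;          /* window length in bytes */
};

/*
 * Translate (pgoff, nr_pages) into an address inside the window and clamp
 * nr_pages so the range stays within window->len.  The early return for an
 * out-of-window pgoff is a guard added for this standalone sketch; the
 * kernel relies on its callers for that.
 */
static long window_direct_access(const struct dax_window *w, uint64_t pgoff,
				 long nr_pages, void **kaddr, uint64_t *phys)
{
	uint64_t offset = pgoff * PAGE_SIZE;
	uint64_t max_nr_pages;

	if (offset >= w->len)
		return 0;
	max_nr_pages = w->len / PAGE_SIZE - pgoff;

	if (kaddr)
		*kaddr = w->kaddr + offset;
	if (phys)
		*phys = w->phys_addr + offset;
	return nr_pages > (long)max_nr_pages ? (long)max_nr_pages : nr_pages;
}

int main(void)
{
	static uint8_t backing[16 * PAGE_SIZE];
	struct dax_window w = { backing, 0x100000000ull, sizeof(backing) };
	void *kaddr;
	uint64_t phys;
	long got = window_direct_access(&w, 14, 8, &kaddr, &phys);

	printf("got %ld pages at window offset %#lx\n", got, 14 * PAGE_SIZE);
	return got == 2 ? 0 : 1;    /* only 2 pages left in a 16-page window */
}
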
*/ @@ -833,18 +1018,37 @@ __releases(fiq->lock) spin_unlock(&fiq->lock); } +/* Count number of scatter-gather elements required */ +static unsigned int sg_count_fuse_pages(struct fuse_page_desc *page_descs, + unsigned int num_pages, + unsigned int total_len) +{ + unsigned int i; + unsigned int this_len; + + for (i = 0; i < num_pages && total_len; i++) { + this_len = min(page_descs[i].length, total_len); + total_len -= this_len; + } + + return i; +} + /* Return the number of scatter-gather list elements required */ static unsigned int sg_count_fuse_req(struct fuse_req *req) { struct fuse_args *args = req->args; struct fuse_args_pages *ap = container_of(args, typeof(*ap), args); - unsigned int total_sgs = 1 /* fuse_in_header */; + unsigned int size, total_sgs = 1 /* fuse_in_header */; if (args->in_numargs - args->in_pages) total_sgs += 1; - if (args->in_pages) - total_sgs += ap->num_pages; + if (args->in_pages) { + size = args->in_args[args->in_numargs - 1].size; + total_sgs += sg_count_fuse_pages(ap->descs, ap->num_pages, + size); + } if (!test_bit(FR_ISREPLY, &req->flags)) return total_sgs; @@ -854,8 +1058,11 @@ static unsigned int sg_count_fuse_req(struct fuse_req *req) if (args->out_numargs - args->out_pages) total_sgs += 1; - if (args->out_pages) - total_sgs += ap->num_pages; + if (args->out_pages) { + size = args->out_args[args->out_numargs - 1].size; + total_sgs += sg_count_fuse_pages(ap->descs, ap->num_pages, + size); + } return total_sgs; } @@ -1071,24 +1278,28 @@ static const struct fuse_iqueue_ops virtio_fs_fiq_ops = { .release = virtio_fs_fiq_release, }; -static int virtio_fs_fill_super(struct super_block *sb) +static inline void virtio_fs_ctx_set_defaults(struct fuse_fs_context *ctx) { - struct fuse_conn *fc = get_fuse_conn_super(sb); + ctx->rootmode = S_IFDIR; + ctx->default_permissions = 1; + ctx->allow_other = 1; + ctx->max_read = UINT_MAX; + ctx->blksize = 512; + ctx->destroy = true; + ctx->no_control = true; + ctx->no_force_umount = true; +} + +static int virtio_fs_fill_super(struct super_block *sb, struct fs_context *fsc) +{ + struct fuse_mount *fm = get_fuse_mount_super(sb); + struct fuse_conn *fc = fm->fc; struct virtio_fs *fs = fc->iq.priv; + struct fuse_fs_context *ctx = fsc->fs_private; unsigned int i; int err; - struct fuse_fs_context ctx = { - .rootmode = S_IFDIR, - .default_permissions = 1, - .allow_other = 1, - .max_read = UINT_MAX, - .blksize = 512, - .destroy = true, - .no_control = true, - .no_force_umount = true, - .no_mount_options = true, - }; + virtio_fs_ctx_set_defaults(ctx); mutex_lock(&virtio_fs_mutex); /* After holding mutex, make sure virtiofs device is still there. @@ -1112,8 +1323,10 @@ static int virtio_fs_fill_super(struct super_block *sb) } /* virtiofs allocates and installs its own fuse devices */ - ctx.fudptr = NULL; - err = fuse_fill_super_common(sb, &ctx); + ctx->fudptr = NULL; + if (ctx->dax) + ctx->dax_dev = fs->dax_dev; + err = fuse_fill_super_common(sb, ctx); if (err < 0) goto err_free_fuse_devs; @@ -1125,7 +1338,7 @@ static int virtio_fs_fill_super(struct super_block *sb) /* Previous unmount will stop all queues. 
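
sg_count_fuse_pages() above stops counting scatter-gather entries once the accumulated page-descriptor lengths cover the size of the last in/out argument, so a request no longer reserves one entry per attached page regardless of length. Here is a self-contained userspace copy of that loop with a tiny check; struct fuse_page_desc is reduced to the length field the loop reads.

/* Userspace copy of the counting loop in sg_count_fuse_pages(). */
#include <stdio.h>

struct page_desc {
	unsigned int length;    /* bytes used within this page */
};

static unsigned int sg_count_pages(const struct page_desc *descs,
				   unsigned int num_pages,
				   unsigned int total_len)
{
	unsigned int i;

	for (i = 0; i < num_pages && total_len; i++) {
		unsigned int this_len = descs[i].length < total_len ?
					descs[i].length : total_len;
		total_len -= this_len;
	}
	return i;    /* number of sg elements needed for total_len bytes */
}

int main(void)
{
	struct page_desc descs[4] = { {4096}, {4096}, {4096}, {4096} };

	/* 6000 bytes span two pages even though four are attached. */
	printf("%u\n", sg_count_pages(descs, 4, 6000));   /* prints 2 */
	return 0;
}
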
Start these again */ virtio_fs_start_all_queues(fs); - fuse_send_init(fc); + fuse_send_init(fm); mutex_unlock(&virtio_fs_mutex); return 0; @@ -1136,18 +1349,17 @@ err: return err; } -static void virtio_kill_sb(struct super_block *sb) +static void virtio_fs_conn_destroy(struct fuse_mount *fm) { - struct fuse_conn *fc = get_fuse_conn_super(sb); - struct virtio_fs *vfs; - struct virtio_fs_vq *fsvq; - - /* If mount failed, we can still be called without any fc */ - if (!fc) - return fuse_kill_sb_anon(sb); + struct fuse_conn *fc = fm->fc; + struct virtio_fs *vfs = fc->iq.priv; + struct virtio_fs_vq *fsvq = &vfs->vqs[VQ_HIPRIO]; - vfs = fc->iq.priv; - fsvq = &vfs->vqs[VQ_HIPRIO]; + /* Stop dax worker. Soon evict_inodes() will be called which + * will free all memory ranges belonging to all inodes. + */ + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_cancel_work(fc); /* Stop forget queue. Soon destroy will be sent */ spin_lock(&fsvq->lock); @@ -1155,9 +1367,9 @@ static void virtio_kill_sb(struct super_block *sb) spin_unlock(&fsvq->lock); virtio_fs_drain_all_queues(vfs); - fuse_kill_sb_anon(sb); + fuse_conn_destroy(fm); - /* fuse_kill_sb_anon() must have sent destroy. Stop all queues + /* fuse_conn_destroy() must have sent destroy. Stop all queues * and drain one more time and free fuse devices. Freeing fuse * devices will drop their reference on fuse_conn and that in * turn will drop its reference on virtio_fs object. @@ -1167,12 +1379,27 @@ static void virtio_kill_sb(struct super_block *sb) virtio_fs_free_devs(vfs); } +static void virtio_kill_sb(struct super_block *sb) +{ + struct fuse_mount *fm = get_fuse_mount_super(sb); + bool last; + + /* If mount failed, we can still be called without any fc */ + if (fm) { + last = fuse_mount_remove(fm); + if (last) + virtio_fs_conn_destroy(fm); + } + kill_anon_super(sb); +} + static int virtio_fs_test_super(struct super_block *sb, struct fs_context *fsc) { - struct fuse_conn *fc = fsc->s_fs_info; + struct fuse_mount *fsc_fm = fsc->s_fs_info; + struct fuse_mount *sb_fm = get_fuse_mount_super(sb); - return fc->iq.priv == get_fuse_conn_super(sb)->iq.priv; + return fsc_fm->fc->iq.priv == sb_fm->fc->iq.priv; } static int virtio_fs_set_super(struct super_block *sb, @@ -1182,7 +1409,7 @@ static int virtio_fs_set_super(struct super_block *sb, err = get_anon_bdev(&sb->s_dev); if (!err) - fuse_conn_get(fsc->s_fs_info); + fuse_mount_get(fsc->s_fs_info); return err; } @@ -1192,6 +1419,7 @@ static int virtio_fs_get_tree(struct fs_context *fsc) struct virtio_fs *fs; struct super_block *sb; struct fuse_conn *fc; + struct fuse_mount *fm; int err; /* This gets a reference on virtio_fs object. 
This ptr gets installed @@ -1212,19 +1440,29 @@ static int virtio_fs_get_tree(struct fs_context *fsc) return -ENOMEM; } - fuse_conn_init(fc, get_user_ns(current_user_ns()), &virtio_fs_fiq_ops, - fs); + fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL); + if (!fm) { + mutex_lock(&virtio_fs_mutex); + virtio_fs_put(fs); + mutex_unlock(&virtio_fs_mutex); + kfree(fc); + return -ENOMEM; + } + + fuse_conn_init(fc, fm, get_user_ns(current_user_ns()), + &virtio_fs_fiq_ops, fs); fc->release = fuse_free_conn; fc->delete_stale = true; + fc->auto_submounts = true; - fsc->s_fs_info = fc; + fsc->s_fs_info = fm; sb = sget_fc(fsc, virtio_fs_test_super, virtio_fs_set_super); - fuse_conn_put(fc); + fuse_mount_put(fm); if (IS_ERR(sb)) return PTR_ERR(sb); if (!sb->s_root) { - err = virtio_fs_fill_super(sb); + err = virtio_fs_fill_super(sb, fsc); if (err) { deactivate_locked_super(sb); return err; @@ -1239,11 +1477,19 @@ static int virtio_fs_get_tree(struct fs_context *fsc) } static const struct fs_context_operations virtio_fs_context_ops = { + .free = virtio_fs_free_fc, + .parse_param = virtio_fs_parse_param, .get_tree = virtio_fs_get_tree, }; static int virtio_fs_init_fs_context(struct fs_context *fsc) { + struct fuse_fs_context *ctx; + + ctx = kzalloc(sizeof(struct fuse_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + fsc->fs_private = ctx; fsc->ops = &virtio_fs_context_ops; return 0; } diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 20d052e08b3b..371bdcbc7233 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -14,12 +14,12 @@ int fuse_setxattr(struct inode *inode, const char *name, const void *value, size_t size, int flags) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_setxattr_in inarg; int err; - if (fc->no_setxattr) + if (fm->fc->no_setxattr) return -EOPNOTSUPP; memset(&inarg, 0, sizeof(inarg)); @@ -34,9 +34,9 @@ int fuse_setxattr(struct inode *inode, const char *name, const void *value, args.in_args[1].value = name; args.in_args[2].size = size; args.in_args[2].value = value; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { - fc->no_setxattr = 1; + fm->fc->no_setxattr = 1; err = -EOPNOTSUPP; } if (!err) { @@ -49,13 +49,13 @@ int fuse_setxattr(struct inode *inode, const char *name, const void *value, ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, size_t size) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_getxattr_in inarg; struct fuse_getxattr_out outarg; ssize_t ret; - if (fc->no_getxattr) + if (fm->fc->no_getxattr) return -EOPNOTSUPP; memset(&inarg, 0, sizeof(inarg)); @@ -77,11 +77,11 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; } - ret = fuse_simple_request(fc, &args); + ret = fuse_simple_request(fm, &args); if (!ret && !size) ret = min_t(ssize_t, outarg.size, XATTR_SIZE_MAX); if (ret == -ENOSYS) { - fc->no_getxattr = 1; + fm->fc->no_getxattr = 1; ret = -EOPNOTSUPP; } return ret; @@ -107,16 +107,16 @@ static int fuse_verify_xattr_list(char *list, size_t size) ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) { struct inode *inode = d_inode(entry); - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_getxattr_in inarg; struct fuse_getxattr_out outarg; 
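
The xattr conversions above route requests through fuse_mount but keep the existing fallback pattern: each operation first tests a per-connection no_*xattr bit, and when the server answers -ENOSYS it sets that bit so later calls return -EOPNOTSUPP without another round trip. The sketch below models that caching in userspace; the request function is a stub and the single no_getxattr flag stands in for the family of bits on fuse_conn.

/* Userspace sketch of the ENOSYS-caching pattern used by the fuse xattr ops. */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct conn {
	bool no_getxattr;    /* kernel: fc->no_getxattr */
};

/* Stub for fuse_simple_request(); pretend the server lacks GETXATTR. */
static int send_request(struct conn *fc)
{
	(void)fc;
	return -ENOSYS;
}

static int do_getxattr(struct conn *fc)
{
	int err;

	if (fc->no_getxattr)
		return -EOPNOTSUPP;      /* cached: skip the round trip */

	err = send_request(fc);
	if (err == -ENOSYS) {
		fc->no_getxattr = true;  /* remember for every later call */
		err = -EOPNOTSUPP;
	}
	return err;
}

int main(void)
{
	struct conn fc = { false };

	printf("first:  %d\n", do_getxattr(&fc));   /* hits the server once */
	printf("second: %d\n", do_getxattr(&fc));   /* answered locally */
	return 0;
}
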
ssize_t ret; - if (!fuse_allow_current_process(fc)) + if (!fuse_allow_current_process(fm->fc)) return -EACCES; - if (fc->no_listxattr) + if (fm->fc->no_listxattr) return -EOPNOTSUPP; memset(&inarg, 0, sizeof(inarg)); @@ -136,13 +136,13 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; } - ret = fuse_simple_request(fc, &args); + ret = fuse_simple_request(fm, &args); if (!ret && !size) ret = min_t(ssize_t, outarg.size, XATTR_LIST_MAX); if (ret > 0 && size) ret = fuse_verify_xattr_list(list, ret); if (ret == -ENOSYS) { - fc->no_listxattr = 1; + fm->fc->no_listxattr = 1; ret = -EOPNOTSUPP; } return ret; @@ -150,11 +150,11 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) int fuse_removexattr(struct inode *inode, const char *name) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); int err; - if (fc->no_removexattr) + if (fm->fc->no_removexattr) return -EOPNOTSUPP; args.opcode = FUSE_REMOVEXATTR; @@ -162,9 +162,9 @@ int fuse_removexattr(struct inode *inode, const char *name) args.in_numargs = 1; args.in_args[0].size = strlen(name) + 1; args.in_args[0].value = name; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { - fc->no_removexattr = 1; + fm->fc->no_removexattr = 1; err = -EOPNOTSUPP; } if (!err) { diff --git a/fs/inode.c b/fs/inode.c index 72c4c347afb7..9d78c37b00b8 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -181,6 +181,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode) mapping->a_ops = &empty_aops; mapping->host = inode; mapping->flags = 0; + if (sb->s_type->fs_flags & FS_THP_SUPPORT) + __set_bit(AS_THP_SUPPORT, &mapping->flags); mapping->wb_err = 0; atomic_set(&mapping->i_mmap_writable, 0); #ifdef CONFIG_READ_ONLY_THP_FOR_FS diff --git a/fs/internal.h b/fs/internal.h index 10517ece4516..a7cd0f64faa4 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -82,9 +82,6 @@ int may_linkat(struct path *link); /* * namespace.c */ -extern void *copy_mount_options(const void __user *); -extern char *copy_mount_string(const void __user *); - extern struct vfsmount *lookup_mnt(const struct path *); extern int finish_automount(struct vfsmount *, struct path *); diff --git a/fs/io-wq.c b/fs/io-wq.c index 414beb543883..7cb3b4cb9b11 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -17,6 +17,8 @@ #include <linux/rculist_nulls.h> #include <linux/fs_struct.h> #include <linux/task_work.h> +#include <linux/blk-cgroup.h> +#include <linux/audit.h> #include "io-wq.h" @@ -26,9 +28,8 @@ enum { IO_WORKER_F_UP = 1, /* up and active */ IO_WORKER_F_RUNNING = 2, /* account as running */ IO_WORKER_F_FREE = 4, /* worker on free list */ - IO_WORKER_F_EXITING = 8, /* worker exiting */ - IO_WORKER_F_FIXED = 16, /* static idle worker */ - IO_WORKER_F_BOUND = 32, /* is doing bounded work */ + IO_WORKER_F_FIXED = 8, /* static idle worker */ + IO_WORKER_F_BOUND = 16, /* is doing bounded work */ }; enum { @@ -57,9 +58,13 @@ struct io_worker { struct rcu_head rcu; struct mm_struct *mm; +#ifdef CONFIG_BLK_CGROUP + struct cgroup_subsys_state *blkcg_css; +#endif const struct cred *cur_creds; const struct cred *saved_creds; struct files_struct *restore_files; + struct nsproxy *restore_nsproxy; struct fs_struct *restore_fs; }; @@ -87,7 +92,7 @@ enum { */ struct io_wqe { struct { - spinlock_t lock; + raw_spinlock_t lock; struct io_wq_work_list work_list; unsigned long hash_map; unsigned flags; 
@@ -148,11 +153,12 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) if (current->files != worker->restore_files) { __acquire(&wqe->lock); - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); dropped_lock = true; task_lock(current); current->files = worker->restore_files; + current->nsproxy = worker->restore_nsproxy; task_unlock(current); } @@ -166,7 +172,7 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) if (worker->mm) { if (!dropped_lock) { __acquire(&wqe->lock); - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); dropped_lock = true; } __set_current_state(TASK_RUNNING); @@ -175,6 +181,13 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) worker->mm = NULL; } +#ifdef CONFIG_BLK_CGROUP + if (worker->blkcg_css) { + kthread_associate_blkcg(NULL); + worker->blkcg_css = NULL; + } +#endif + return dropped_lock; } @@ -200,7 +213,6 @@ static void io_worker_exit(struct io_worker *worker) { struct io_wqe *wqe = worker->wqe; struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker); - unsigned nr_workers; /* * If we're not at zero, someone else is holding a brief reference @@ -220,23 +232,19 @@ static void io_worker_exit(struct io_worker *worker) worker->flags = 0; preempt_enable(); - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); hlist_nulls_del_rcu(&worker->nulls_node); list_del_rcu(&worker->all_list); if (__io_worker_unuse(wqe, worker)) { __release(&wqe->lock); - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); } acct->nr_workers--; - nr_workers = wqe->acct[IO_WQ_ACCT_BOUND].nr_workers + - wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers; - spin_unlock_irq(&wqe->lock); - - /* all workers gone, wq exit can proceed */ - if (!nr_workers && refcount_dec_and_test(&wqe->wq->refs)) - complete(&wqe->wq->done); + raw_spin_unlock_irq(&wqe->lock); kfree_rcu(worker, rcu); + if (refcount_dec_and_test(&wqe->wq->refs)) + complete(&wqe->wq->done); } static inline bool io_wqe_run_queue(struct io_wqe *wqe) @@ -318,6 +326,7 @@ static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker) worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING); worker->restore_files = current->files; + worker->restore_nsproxy = current->nsproxy; worker->restore_fs = current->fs; io_wqe_inc_running(wqe, worker); } @@ -421,14 +430,10 @@ static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work) mmput(worker->mm); worker->mm = NULL; } - if (!work->mm) - return; - if (mmget_not_zero(work->mm)) { - kthread_use_mm(work->mm); - worker->mm = work->mm; - /* hang on to this mm */ - work->mm = NULL; + if (mmget_not_zero(work->identity->mm)) { + kthread_use_mm(work->identity->mm); + worker->mm = work->identity->mm; return; } @@ -436,12 +441,25 @@ static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work) work->flags |= IO_WQ_WORK_CANCEL; } +static inline void io_wq_switch_blkcg(struct io_worker *worker, + struct io_wq_work *work) +{ +#ifdef CONFIG_BLK_CGROUP + if (!(work->flags & IO_WQ_WORK_BLKCG)) + return; + if (work->identity->blkcg_css != worker->blkcg_css) { + kthread_associate_blkcg(work->identity->blkcg_css); + worker->blkcg_css = work->identity->blkcg_css; + } +#endif +} + static void io_wq_switch_creds(struct io_worker *worker, struct io_wq_work *work) { - const struct cred *old_creds = override_creds(work->creds); + const struct cred *old_creds = override_creds(work->identity->creds); - worker->cur_creds = work->creds; + worker->cur_creds = 
work->identity->creds; if (worker->saved_creds) put_cred(old_creds); /* creds set by previous switch */ else @@ -451,18 +469,26 @@ static void io_wq_switch_creds(struct io_worker *worker, static void io_impersonate_work(struct io_worker *worker, struct io_wq_work *work) { - if (work->files && current->files != work->files) { + if ((work->flags & IO_WQ_WORK_FILES) && + current->files != work->identity->files) { task_lock(current); - current->files = work->files; + current->files = work->identity->files; + current->nsproxy = work->identity->nsproxy; task_unlock(current); } - if (work->fs && current->fs != work->fs) - current->fs = work->fs; - if (work->mm != worker->mm) + if ((work->flags & IO_WQ_WORK_FS) && current->fs != work->identity->fs) + current->fs = work->identity->fs; + if ((work->flags & IO_WQ_WORK_MM) && work->identity->mm != worker->mm) io_wq_switch_mm(worker, work); - if (worker->cur_creds != work->creds) + if ((work->flags & IO_WQ_WORK_CREDS) && + worker->cur_creds != work->identity->creds) io_wq_switch_creds(worker, work); - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->fsize; + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->identity->fsize; + io_wq_switch_blkcg(worker, work); +#ifdef CONFIG_AUDIT + current->loginuid = work->identity->loginuid; + current->sessionid = work->identity->sessionid; +#endif } static void io_assign_current_work(struct io_worker *worker, @@ -475,6 +501,11 @@ static void io_assign_current_work(struct io_worker *worker, cond_resched(); } +#ifdef CONFIG_AUDIT + current->loginuid = KUIDT_INIT(AUDIT_UID_UNSET); + current->sessionid = AUDIT_SID_UNSET; +#endif + spin_lock_irq(&worker->lock); worker->cur_work = work; spin_unlock_irq(&worker->lock); @@ -504,7 +535,7 @@ get_next: else if (!wq_list_empty(&wqe->work_list)) wqe->flags |= IO_WQE_FLAG_STALLED; - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); if (!work) break; io_assign_current_work(worker, work); @@ -538,17 +569,17 @@ get_next: io_wqe_enqueue(wqe, linked); if (hash != -1U && !next_hashed) { - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); wqe->hash_map &= ~BIT_ULL(hash); wqe->flags &= ~IO_WQE_FLAG_STALLED; /* skip unnecessary unlock-lock wqe->lock */ if (!work) goto get_next; - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); } } while (work); - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); } while (1); } @@ -563,7 +594,7 @@ static int io_wqe_worker(void *data) while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) { set_current_state(TASK_INTERRUPTIBLE); loop: - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); if (io_wqe_run_queue(wqe)) { __set_current_state(TASK_RUNNING); io_worker_handle_work(worker); @@ -574,7 +605,7 @@ loop: __release(&wqe->lock); goto loop; } - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); if (signal_pending(current)) flush_signals(current); if (schedule_timeout(WORKER_IDLE_TIMEOUT)) @@ -586,11 +617,11 @@ loop: } if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) { - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); if (!wq_list_empty(&wqe->work_list)) io_worker_handle_work(worker); else - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); } io_worker_exit(worker); @@ -630,14 +661,14 @@ void io_wq_worker_sleeping(struct task_struct *tsk) worker->flags &= ~IO_WORKER_F_RUNNING; - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); io_wqe_dec_running(wqe, worker); - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); } static bool create_io_worker(struct 
io_wq *wq, struct io_wqe *wqe, int index) { - struct io_wqe_acct *acct =&wqe->acct[index]; + struct io_wqe_acct *acct = &wqe->acct[index]; struct io_worker *worker; worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node); @@ -655,8 +686,9 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) kfree(worker); return false; } + kthread_bind_mask(worker->task, cpumask_of_node(wqe->node)); - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); list_add_tail_rcu(&worker->all_list, &wqe->all_list); worker->flags |= IO_WORKER_F_FREE; @@ -665,11 +697,12 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) if (!acct->nr_workers && (worker->flags & IO_WORKER_F_BOUND)) worker->flags |= IO_WORKER_F_FIXED; acct->nr_workers++; - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); if (index == IO_WQ_ACCT_UNBOUND) atomic_inc(&wq->user->processes); + refcount_inc(&wq->refs); wake_up_process(worker->task); return true; } @@ -685,28 +718,63 @@ static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index) return acct->nr_workers < acct->max_workers; } +static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data) +{ + send_sig(SIGINT, worker->task, 1); + return false; +} + +/* + * Iterate the passed in list and call the specific function for each + * worker that isn't exiting + */ +static bool io_wq_for_each_worker(struct io_wqe *wqe, + bool (*func)(struct io_worker *, void *), + void *data) +{ + struct io_worker *worker; + bool ret = false; + + list_for_each_entry_rcu(worker, &wqe->all_list, all_list) { + if (io_worker_get(worker)) { + /* no task if node is/was offline */ + if (worker->task) + ret = func(worker, data); + io_worker_release(worker); + if (ret) + break; + } + } + + return ret; +} + +static bool io_wq_worker_wake(struct io_worker *worker, void *data) +{ + wake_up_process(worker->task); + return false; +} + /* * Manager thread. Tasked with creating new workers, if we need them. 
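
io_wq_for_each_worker(), moved earlier in the file above so the manager can reuse it, walks the workers of a node, takes a temporary reference on each, skips workers whose task is gone (node was or is offline), and stops as soon as the callback returns true. The userspace model below keeps that get/call/release discipline over a plain array; the refcount helpers and the array stand in for io_worker_get()/io_worker_release() and the RCU list.

/* Userspace model of the get/call/release walk in io_wq_for_each_worker(). */
#include <stdbool.h>
#include <stdio.h>

struct worker {
	int refs;          /* 0 means the worker is already exiting */
	bool has_task;     /* kernel: worker->task may be NULL if node offline */
	int id;
};

static bool worker_get(struct worker *w)
{
	if (w->refs == 0)
		return false;       /* exiting: skip it */
	w->refs++;
	return true;
}

static void worker_release(struct worker *w)
{
	w->refs--;
}

/* Call func on each live worker until it asks us to stop by returning true. */
static bool for_each_worker(struct worker *workers, int nr,
			    bool (*func)(struct worker *, void *), void *data)
{
	bool ret = false;

	for (int i = 0; i < nr; i++) {
		if (!worker_get(&workers[i]))
			continue;
		if (workers[i].has_task)
			ret = func(&workers[i], data);
		worker_release(&workers[i]);
		if (ret)
			break;
	}
	return ret;
}

static bool print_and_stop_at(struct worker *w, void *data)
{
	printf("visiting worker %d\n", w->id);
	return w->id == *(int *)data;    /* stop once we reach this id */
}

int main(void)
{
	struct worker ws[3] = { {1, true, 0}, {0, true, 1}, {1, true, 2} };
	int stop = 2;

	return for_each_worker(ws, 3, print_and_stop_at, &stop) ? 0 : 1;
}
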
*/ static int io_wq_manager(void *data) { struct io_wq *wq = data; - int workers_to_create = num_possible_nodes(); int node; /* create fixed workers */ - refcount_set(&wq->refs, workers_to_create); + refcount_set(&wq->refs, 1); for_each_node(node) { if (!node_online(node)) continue; - if (!create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND)) - goto err; - workers_to_create--; + if (create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND)) + continue; + set_bit(IO_WQ_BIT_ERROR, &wq->state); + set_bit(IO_WQ_BIT_EXIT, &wq->state); + goto out; } - while (workers_to_create--) - refcount_dec(&wq->refs); - complete(&wq->done); while (!kthread_should_stop()) { @@ -720,12 +788,12 @@ static int io_wq_manager(void *data) if (!node_online(node)) continue; - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND)) fork_worker[IO_WQ_ACCT_BOUND] = true; if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND)) fork_worker[IO_WQ_ACCT_UNBOUND] = true; - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); if (fork_worker[IO_WQ_ACCT_BOUND]) create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND); if (fork_worker[IO_WQ_ACCT_UNBOUND]) @@ -738,12 +806,18 @@ static int io_wq_manager(void *data) if (current->task_works) task_work_run(); - return 0; -err: - set_bit(IO_WQ_BIT_ERROR, &wq->state); - set_bit(IO_WQ_BIT_EXIT, &wq->state); - if (refcount_sub_and_test(workers_to_create, &wq->refs)) +out: + if (refcount_dec_and_test(&wq->refs)) { complete(&wq->done); + return 0; + } + /* if ERROR is set and we get here, we have workers to wake */ + if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) { + rcu_read_lock(); + for_each_node(node) + io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL); + rcu_read_unlock(); + } return 0; } @@ -821,10 +895,10 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) } work_flags = work->flags; - spin_lock_irqsave(&wqe->lock, flags); + raw_spin_lock_irqsave(&wqe->lock, flags); io_wqe_insert_work(wqe, work); wqe->flags &= ~IO_WQE_FLAG_STALLED; - spin_unlock_irqrestore(&wqe->lock, flags); + raw_spin_unlock_irqrestore(&wqe->lock, flags); if ((work_flags & IO_WQ_WORK_CONCURRENT) || !atomic_read(&acct->nr_running)) @@ -850,37 +924,6 @@ void io_wq_hash_work(struct io_wq_work *work, void *val) work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT)); } -static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data) -{ - send_sig(SIGINT, worker->task, 1); - return false; -} - -/* - * Iterate the passed in list and call the specific function for each - * worker that isn't exiting - */ -static bool io_wq_for_each_worker(struct io_wqe *wqe, - bool (*func)(struct io_worker *, void *), - void *data) -{ - struct io_worker *worker; - bool ret = false; - - list_for_each_entry_rcu(worker, &wqe->all_list, all_list) { - if (io_worker_get(worker)) { - /* no task if node is/was offline */ - if (worker->task) - ret = func(worker, data); - io_worker_release(worker); - if (ret) - break; - } - } - - return ret; -} - void io_wq_cancel_all(struct io_wq *wq) { int node; @@ -951,13 +994,13 @@ static void io_wqe_cancel_pending_work(struct io_wqe *wqe, unsigned long flags; retry: - spin_lock_irqsave(&wqe->lock, flags); + raw_spin_lock_irqsave(&wqe->lock, flags); wq_list_for_each(node, prev, &wqe->work_list) { work = container_of(node, struct io_wq_work, list); if (!match->fn(work, match->data)) continue; io_wqe_remove_pending(wqe, work, prev); - spin_unlock_irqrestore(&wqe->lock, flags); + raw_spin_unlock_irqrestore(&wqe->lock, flags); 
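
The manager rework above also changes the lifetime rule for the workqueue: wq->refs starts at 1 for the manager, every worker created adds a reference, and whichever path drops the count to zero completes wq->done so teardown can proceed. The following single-threaded model reduces the refcount_t and the completion to an integer and a flag; it is only meant to show the last-reference handover, not the concurrency.

/* Single-threaded model of the io-wq refcount/completion handover. */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct wq {
	int refs;        /* kernel: refcount_t wq->refs */
	bool done;       /* kernel: complete(&wq->done) was reached */
};

static void wq_put(struct wq *wq)
{
	assert(wq->refs > 0);
	if (--wq->refs == 0) {
		wq->done = true;           /* last reference signals waiters */
		printf("wq done completed\n");
	}
}

static void create_worker(struct wq *wq)
{
	wq->refs++;                        /* kernel: refcount_inc(&wq->refs) */
}

static void worker_exit(struct wq *wq)
{
	wq_put(wq);                        /* kernel: tail of io_worker_exit() */
}

int main(void)
{
	struct wq wq = { .refs = 1 };      /* manager's own reference */

	create_worker(&wq);
	create_worker(&wq);

	wq_put(&wq);                       /* manager exits first ... */
	worker_exit(&wq);
	worker_exit(&wq);                  /* ... last worker completes done */

	return wq.done ? 0 : 1;
}
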
io_run_cancel(work, wqe); match->nr_pending++; if (!match->cancel_all) @@ -966,7 +1009,7 @@ retry: /* not safe to continue after unlock */ goto retry; } - spin_unlock_irqrestore(&wqe->lock, flags); + raw_spin_unlock_irqrestore(&wqe->lock, flags); } static void io_wqe_cancel_running_work(struct io_wqe *wqe, @@ -1074,7 +1117,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) } atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0); wqe->wq = wq; - spin_lock_init(&wqe->lock); + raw_spin_lock_init(&wqe->lock); INIT_WQ_LIST(&wqe->work_list); INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0); INIT_LIST_HEAD(&wqe->all_list); @@ -1113,12 +1156,6 @@ bool io_wq_get(struct io_wq *wq, struct io_wq_data *data) return refcount_inc_not_zero(&wq->use_refs); } -static bool io_wq_worker_wake(struct io_worker *worker, void *data) -{ - wake_up_process(worker->task); - return false; -} - static void __io_wq_destroy(struct io_wq *wq) { int node; diff --git a/fs/io-wq.h b/fs/io-wq.h index ddaf9614cf9b..be21c500c925 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -1,6 +1,8 @@ #ifndef INTERNAL_IO_WQ_H #define INTERNAL_IO_WQ_H +#include <linux/io_uring.h> + struct io_wq; enum { @@ -10,6 +12,12 @@ enum { IO_WQ_WORK_NO_CANCEL = 8, IO_WQ_WORK_CONCURRENT = 16, + IO_WQ_WORK_FILES = 32, + IO_WQ_WORK_FS = 64, + IO_WQ_WORK_MM = 128, + IO_WQ_WORK_CREDS = 256, + IO_WQ_WORK_BLKCG = 512, + IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */ }; @@ -85,11 +93,7 @@ static inline void wq_list_del(struct io_wq_work_list *list, struct io_wq_work { struct io_wq_work_node list; - struct files_struct *files; - struct mm_struct *mm; - const struct cred *creds; - struct fs_struct *fs; - unsigned long fsize; + struct io_identity *identity; unsigned flags; }; diff --git a/fs/io_uring.c b/fs/io_uring.c index 3790c7fe9fee..02dc81622081 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -79,6 +79,9 @@ #include <linux/splice.h> #include <linux/task_work.h> #include <linux/pagemap.h> +#include <linux/io_uring.h> +#include <linux/blk-cgroup.h> +#include <linux/audit.h> #define CREATE_TRACE_POINTS #include <trace/events/io_uring.h> @@ -98,6 +101,8 @@ #define IORING_MAX_FILES_TABLE (1U << IORING_FILE_TABLE_SHIFT) #define IORING_FILE_TABLE_MASK (IORING_MAX_FILES_TABLE - 1) #define IORING_MAX_FIXED_FILES (64 * IORING_MAX_FILES_TABLE) +#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ + IORING_REGISTER_LAST + IORING_OP_LAST) struct io_uring { u32 head ____cacheline_aligned_in_smp; @@ -187,6 +192,7 @@ struct io_mapped_ubuf { size_t len; struct bio_vec *bvec; unsigned int nr_bvecs; + unsigned long acct_pages; }; struct fixed_file_table { @@ -205,7 +211,7 @@ struct fixed_file_data { struct fixed_file_table *table; struct io_ring_ctx *ctx; - struct percpu_ref *cur_refs; + struct fixed_file_ref_node *node; struct percpu_ref refs; struct completion done; struct list_head ref_list; @@ -219,6 +225,27 @@ struct io_buffer { __u16 bid; }; +struct io_restriction { + DECLARE_BITMAP(register_op, IORING_REGISTER_LAST); + DECLARE_BITMAP(sqe_op, IORING_OP_LAST); + u8 sqe_flags_allowed; + u8 sqe_flags_required; + bool registered; +}; + +struct io_sq_data { + refcount_t refs; + struct mutex lock; + + /* ctx's that are using this sqd */ + struct list_head ctx_list; + struct list_head ctx_new_list; + struct mutex ctx_lock; + + struct task_struct *thread; + struct wait_queue_head wait; +}; + struct io_ring_ctx { struct { struct percpu_ref refs; @@ -231,6 +258,7 @@ struct io_ring_ctx { unsigned int cq_overflow_flushed: 1; unsigned int 
drain_next: 1; unsigned int eventfd_async: 1; + unsigned int restricted: 1; /* * Ring buffer of indices into array of io_uring_sqe, which is @@ -264,9 +292,25 @@ struct io_ring_ctx { /* IO offload */ struct io_wq *io_wq; - struct task_struct *sqo_thread; /* if using sq thread polling */ - struct mm_struct *sqo_mm; - wait_queue_head_t sqo_wait; + + /* + * For SQPOLL usage - we hold a reference to the parent task, so we + * have access to the ->files + */ + struct task_struct *sqo_task; + + /* Only used for accounting purposes */ + struct mm_struct *mm_account; + +#ifdef CONFIG_BLK_CGROUP + struct cgroup_subsys_state *sqo_blkcg_css; +#endif + + struct io_sq_data *sq_data; /* if using sq thread polling */ + + struct wait_queue_head sqo_sq_wait; + struct wait_queue_entry sqo_wait_entry; + struct list_head sqd_list; /* * If used, fixed file set. Writers must ensure that ->refs is dead, @@ -275,8 +319,6 @@ struct io_ring_ctx { */ struct fixed_file_data *file_data; unsigned nr_user_files; - int ring_fd; - struct file *ring_file; /* if used, fixed mapped user buffers */ unsigned nr_user_bufs; @@ -286,6 +328,11 @@ struct io_ring_ctx { const struct cred *creds; +#ifdef CONFIG_AUDIT + kuid_t loginuid; + unsigned int sessionid; +#endif + struct completion ref_comp; struct completion sq_thread_comp; @@ -338,6 +385,7 @@ struct io_ring_ctx { struct llist_head file_put_llist; struct work_struct exit_work; + struct io_restriction restrictions; }; /* @@ -392,13 +440,16 @@ struct io_cancel { struct io_timeout { struct file *file; - u64 addr; - int flags; u32 off; u32 target_seq; struct list_head list; }; +struct io_timeout_rem { + struct file *file; + u64 addr; +}; + struct io_rw { /* NOTE: kiocb has the file as the first member, so don't do it here */ struct kiocb kiocb; @@ -514,15 +565,6 @@ struct io_async_rw { struct wait_page_queue wpq; }; -struct io_async_ctx { - union { - struct io_async_rw rw; - struct io_async_msghdr msg; - struct io_async_connect connect; - struct io_timeout_data timeout; - }; -}; - enum { REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, @@ -538,13 +580,11 @@ enum { REQ_F_NOWAIT_BIT, REQ_F_LINK_TIMEOUT_BIT, REQ_F_ISREG_BIT, - REQ_F_COMP_LOCKED_BIT, REQ_F_NEED_CLEANUP_BIT, REQ_F_POLLED_BIT, REQ_F_BUFFER_SELECTED_BIT, REQ_F_NO_FILE_TABLE_BIT, REQ_F_WORK_INITIALIZED_BIT, - REQ_F_TASK_PINNED_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -578,8 +618,6 @@ enum { REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), /* regular file */ REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), - /* completion under lock */ - REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT), /* needs cleanup */ REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), /* already went through poll handler */ @@ -590,8 +628,6 @@ enum { REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT), /* io_wq_work is initialized */ REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT), - /* req->task is refcounted */ - REQ_F_TASK_PINNED = BIT(REQ_F_TASK_PINNED_BIT), }; struct async_poll { @@ -614,6 +650,7 @@ struct io_kiocb { struct io_sync sync; struct io_cancel cancel; struct io_timeout timeout; + struct io_timeout_rem timeout_rem; struct io_connect connect; struct io_sr_msg sr_msg; struct io_open open; @@ -629,7 +666,8 @@ struct io_kiocb { struct io_completion compl; }; - struct io_async_ctx *io; + /* opcode allocated if it needs to store data for async defer */ + void *async_data; u8 opcode; /* polled IO has completed */ u8 iopoll_completed; @@ -697,10 +735,6 @@ struct 
io_submit_state { }; struct io_op_def { - /* needs req->io allocated for deferral/async */ - unsigned async_ctx : 1; - /* needs current->mm setup, does mm access */ - unsigned needs_mm : 1; /* needs req->file assigned */ unsigned needs_file : 1; /* don't fail if file grab fails */ @@ -711,44 +745,51 @@ struct io_op_def { unsigned unbound_nonreg_file : 1; /* opcode is not supported by this kernel */ unsigned not_supported : 1; - /* needs file table */ - unsigned file_table : 1; - /* needs ->fs */ - unsigned needs_fs : 1; /* set if opcode supports polled "wait" */ unsigned pollin : 1; unsigned pollout : 1; /* op supports buffer selection */ unsigned buffer_select : 1; + /* needs rlimit(RLIMIT_FSIZE) assigned */ unsigned needs_fsize : 1; + /* must always have async data allocated */ + unsigned needs_async_data : 1; + /* size of async data needed, if any */ + unsigned short async_size; + unsigned work_flags; }; static const struct io_op_def io_op_defs[] = { [IORING_OP_NOP] = {}, [IORING_OP_READV] = { - .async_ctx = 1, - .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, + .needs_async_data = 1, + .async_size = sizeof(struct io_async_rw), + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_WRITEV] = { - .async_ctx = 1, - .needs_mm = 1, .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, .pollout = 1, .needs_fsize = 1, + .needs_async_data = 1, + .async_size = sizeof(struct io_async_rw), + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_FSYNC] = { .needs_file = 1, + .work_flags = IO_WQ_WORK_BLKCG, }, [IORING_OP_READ_FIXED] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, + .async_size = sizeof(struct io_async_rw), + .work_flags = IO_WQ_WORK_BLKCG, }, [IORING_OP_WRITE_FIXED] = { .needs_file = 1, @@ -756,6 +797,8 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, .needs_fsize = 1, + .async_size = sizeof(struct io_async_rw), + .work_flags = IO_WQ_WORK_BLKCG, }, [IORING_OP_POLL_ADD] = { .needs_file = 1, @@ -764,115 +807,123 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_POLL_REMOVE] = {}, [IORING_OP_SYNC_FILE_RANGE] = { .needs_file = 1, + .work_flags = IO_WQ_WORK_BLKCG, }, [IORING_OP_SENDMSG] = { - .async_ctx = 1, - .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, - .needs_fs = 1, .pollout = 1, + .needs_async_data = 1, + .async_size = sizeof(struct io_async_msghdr), + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | + IO_WQ_WORK_FS, }, [IORING_OP_RECVMSG] = { - .async_ctx = 1, - .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, - .needs_fs = 1, .pollin = 1, .buffer_select = 1, + .needs_async_data = 1, + .async_size = sizeof(struct io_async_msghdr), + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | + IO_WQ_WORK_FS, }, [IORING_OP_TIMEOUT] = { - .async_ctx = 1, - .needs_mm = 1, + .needs_async_data = 1, + .async_size = sizeof(struct io_timeout_data), + .work_flags = IO_WQ_WORK_MM, }, [IORING_OP_TIMEOUT_REMOVE] = {}, [IORING_OP_ACCEPT] = { - .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, - .file_table = 1, .pollin = 1, + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES, }, [IORING_OP_ASYNC_CANCEL] = {}, [IORING_OP_LINK_TIMEOUT] = { - .async_ctx = 1, - .needs_mm = 1, + .needs_async_data = 1, + .async_size = sizeof(struct io_timeout_data), + .work_flags = IO_WQ_WORK_MM, }, [IORING_OP_CONNECT] = { - .async_ctx = 1, - .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, + .needs_async_data = 1, + .async_size = sizeof(struct 
io_async_connect), + .work_flags = IO_WQ_WORK_MM, }, [IORING_OP_FALLOCATE] = { .needs_file = 1, .needs_fsize = 1, + .work_flags = IO_WQ_WORK_BLKCG, }, [IORING_OP_OPENAT] = { - .file_table = 1, - .needs_fs = 1, + .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG | + IO_WQ_WORK_FS, }, [IORING_OP_CLOSE] = { .needs_file = 1, .needs_file_no_error = 1, - .file_table = 1, + .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG, }, [IORING_OP_FILES_UPDATE] = { - .needs_mm = 1, - .file_table = 1, + .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_MM, }, [IORING_OP_STATX] = { - .needs_mm = 1, - .needs_fs = 1, - .file_table = 1, + .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_MM | + IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG, }, [IORING_OP_READ] = { - .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, + .async_size = sizeof(struct io_async_rw), + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_WRITE] = { - .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, .needs_fsize = 1, + .async_size = sizeof(struct io_async_rw), + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_FADVISE] = { .needs_file = 1, + .work_flags = IO_WQ_WORK_BLKCG, }, [IORING_OP_MADVISE] = { - .needs_mm = 1, + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_SEND] = { - .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_RECV] = { - .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_OPENAT2] = { - .file_table = 1, - .needs_fs = 1, + .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_FS | + IO_WQ_WORK_BLKCG, }, [IORING_OP_EPOLL_CTL] = { .unbound_nonreg_file = 1, - .file_table = 1, + .work_flags = IO_WQ_WORK_FILES, }, [IORING_OP_SPLICE] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, + .work_flags = IO_WQ_WORK_BLKCG, }, [IORING_OP_PROVIDE_BUFFERS] = {}, [IORING_OP_REMOVE_BUFFERS] = {}, @@ -892,21 +943,18 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2, struct io_comp_state *cs); static void io_cqring_fill_event(struct io_kiocb *req, long res); static void io_put_req(struct io_kiocb *req); +static void io_put_req_deferred(struct io_kiocb *req, int nr); static void io_double_put_req(struct io_kiocb *req); -static void __io_double_put_req(struct io_kiocb *req); static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); static void __io_queue_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_files_update *ip, unsigned nr_args); -static int io_prep_work_files(struct io_kiocb *req); static void __io_clean_op(struct io_kiocb *req); -static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, - int fd, struct file **out_file, bool fixed); -static void __io_queue_sqe(struct io_kiocb *req, - const struct io_uring_sqe *sqe, - struct io_comp_state *cs); +static struct file *io_file_get(struct io_submit_state *state, + struct io_kiocb *req, int fd, bool fixed); +static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs); static void io_file_put_work(struct work_struct *work); static ssize_t io_import_iovec(int rw, struct io_kiocb *req, @@ -933,14 +981,6 @@ struct sock *io_uring_get_socket(struct file *file) } EXPORT_SYMBOL(io_uring_get_socket); -static void io_get_req_task(struct io_kiocb 
*req) -{ - if (req->flags & REQ_F_TASK_PINNED) - return; - get_task_struct(req->task); - req->flags |= REQ_F_TASK_PINNED; -} - static inline void io_clean_op(struct io_kiocb *req) { if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED | @@ -948,13 +988,6 @@ static inline void io_clean_op(struct io_kiocb *req) __io_clean_op(req); } -/* not idempotent -- it doesn't clear REQ_F_TASK_PINNED */ -static void __io_put_req_task(struct io_kiocb *req) -{ - if (req->flags & REQ_F_TASK_PINNED) - put_task_struct(req->task); -} - static void io_sq_thread_drop_mm(void) { struct mm_struct *mm = current->mm; @@ -969,9 +1002,10 @@ static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) { if (!current->mm) { if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL) || - !mmget_not_zero(ctx->sqo_mm))) + !ctx->sqo_task->mm || + !mmget_not_zero(ctx->sqo_task->mm))) return -EFAULT; - kthread_use_mm(ctx->sqo_mm); + kthread_use_mm(ctx->sqo_task->mm); } return 0; @@ -980,11 +1014,31 @@ static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, struct io_kiocb *req) { - if (!io_op_defs[req->opcode].needs_mm) + if (!(io_op_defs[req->opcode].work_flags & IO_WQ_WORK_MM)) return 0; return __io_sq_thread_acquire_mm(ctx); } +static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx, + struct cgroup_subsys_state **cur_css) + +{ +#ifdef CONFIG_BLK_CGROUP + /* puts the old one when swapping */ + if (*cur_css != ctx->sqo_blkcg_css) { + kthread_associate_blkcg(ctx->sqo_blkcg_css); + *cur_css = ctx->sqo_blkcg_css; + } +#endif +} + +static void io_sq_thread_unassociate_blkcg(void) +{ +#ifdef CONFIG_BLK_CGROUP + kthread_associate_blkcg(NULL); +#endif +} + static inline void req_set_fail_links(struct io_kiocb *req) { if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK) @@ -992,16 +1046,48 @@ static inline void req_set_fail_links(struct io_kiocb *req) } /* + * None of these are dereferenced, they are simply used to check if any of + * them have changed. If we're under current and check they are still the + * same, we're fine to grab references to them for actual out-of-line use. + */ +static void io_init_identity(struct io_identity *id) +{ + id->files = current->files; + id->mm = current->mm; +#ifdef CONFIG_BLK_CGROUP + rcu_read_lock(); + id->blkcg_css = blkcg_css(); + rcu_read_unlock(); +#endif + id->creds = current_cred(); + id->nsproxy = current->nsproxy; + id->fs = current->fs; + id->fsize = rlimit(RLIMIT_FSIZE); +#ifdef CONFIG_AUDIT + id->loginuid = current->loginuid; + id->sessionid = current->sessionid; +#endif + refcount_set(&id->count, 1); +} + +/* * Note: must call io_req_init_async() for the first time you * touch any members of io_wq_work. 
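
io_init_identity() above snapshots raw pointers to the submitting task's files, mm, creds, nsproxy and fs without pinning them; as its comment says, they are only compared later, and io_grab_identity() in the next hunks takes real references only when the task state still matches, falling back to a copied identity otherwise. The sketch below models that compare-then-grab step for one field in userspace; the files struct, the counter and the helper names are illustrative, not the kernel types.

/* Userspace sketch of the snapshot / compare / grab idea behind io_identity. */
#include <stdbool.h>
#include <stdio.h>

struct files {               /* stand-in for struct files_struct */
	int count;           /* reference count */
	int id;
};

struct identity {
	struct files *files;  /* snapshot: raw pointer, no reference held */
	bool files_grabbed;
};

/* Snapshot the submitter's state; nothing is pinned yet. */
static void init_identity(struct identity *id, struct files *current_files)
{
	id->files = current_files;
	id->files_grabbed = false;
}

/*
 * Later, before going async: only if the task's files are still the ones we
 * snapshotted may we take a reference and use them out of line.
 */
static bool grab_identity(struct identity *id, struct files *current_files)
{
	if (id->files != current_files)
		return false;          /* changed: caller must copy/refresh */
	id->files->count++;
	id->files_grabbed = true;
	return true;
}

int main(void)
{
	struct files f1 = { 1, 1 }, f2 = { 1, 2 };
	struct identity id;

	init_identity(&id, &f1);
	printf("same task state: %d\n", grab_identity(&id, &f1));  /* 1 */

	init_identity(&id, &f1);
	printf("state changed:   %d\n", grab_identity(&id, &f2));  /* 0 */
	return 0;
}
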
*/ static inline void io_req_init_async(struct io_kiocb *req) { + struct io_uring_task *tctx = current->io_uring; + if (req->flags & REQ_F_WORK_INITIALIZED) return; memset(&req->work, 0, sizeof(req->work)); req->flags |= REQ_F_WORK_INITIALIZED; + + /* Grab a ref if this isn't our static identity */ + req->work.identity = tctx->identity; + if (tctx->identity != &tctx->__identity) + refcount_inc(&req->work.identity->count); } static inline bool io_async_submit(struct io_ring_ctx *ctx) @@ -1054,7 +1140,8 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) goto err; ctx->flags = p->flags; - init_waitqueue_head(&ctx->sqo_wait); + init_waitqueue_head(&ctx->sqo_sq_wait); + INIT_LIST_HEAD(&ctx->sqd_list); init_waitqueue_head(&ctx->cq_wait); INIT_LIST_HEAD(&ctx->cq_overflow_list); init_completion(&ctx->ref_comp); @@ -1106,76 +1193,195 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) } } -/* - * Returns true if we need to defer file table putting. This can only happen - * from the error path with REQ_F_COMP_LOCKED set. - */ -static bool io_req_clean_work(struct io_kiocb *req) +static void io_put_identity(struct io_uring_task *tctx, struct io_kiocb *req) +{ + if (req->work.identity == &tctx->__identity) + return; + if (refcount_dec_and_test(&req->work.identity->count)) + kfree(req->work.identity); +} + +static void io_req_clean_work(struct io_kiocb *req) { if (!(req->flags & REQ_F_WORK_INITIALIZED)) - return false; + return; req->flags &= ~REQ_F_WORK_INITIALIZED; - if (req->work.mm) { - mmdrop(req->work.mm); - req->work.mm = NULL; + if (req->work.flags & IO_WQ_WORK_MM) { + mmdrop(req->work.identity->mm); + req->work.flags &= ~IO_WQ_WORK_MM; } - if (req->work.creds) { - put_cred(req->work.creds); - req->work.creds = NULL; +#ifdef CONFIG_BLK_CGROUP + if (req->work.flags & IO_WQ_WORK_BLKCG) { + css_put(req->work.identity->blkcg_css); + req->work.flags &= ~IO_WQ_WORK_BLKCG; } - if (req->work.fs) { - struct fs_struct *fs = req->work.fs; - - if (req->flags & REQ_F_COMP_LOCKED) - return true; +#endif + if (req->work.flags & IO_WQ_WORK_CREDS) { + put_cred(req->work.identity->creds); + req->work.flags &= ~IO_WQ_WORK_CREDS; + } + if (req->work.flags & IO_WQ_WORK_FS) { + struct fs_struct *fs = req->work.identity->fs; - spin_lock(&req->work.fs->lock); + spin_lock(&req->work.identity->fs->lock); if (--fs->users) fs = NULL; - spin_unlock(&req->work.fs->lock); + spin_unlock(&req->work.identity->fs->lock); if (fs) free_fs_struct(fs); - req->work.fs = NULL; + req->work.flags &= ~IO_WQ_WORK_FS; } - return false; + io_put_identity(req->task->io_uring, req); +} + +/* + * Create a private copy of io_identity, since some fields don't match + * the current context. + */ +static bool io_identity_cow(struct io_kiocb *req) +{ + struct io_uring_task *tctx = current->io_uring; + const struct cred *creds = NULL; + struct io_identity *id; + + if (req->work.flags & IO_WQ_WORK_CREDS) + creds = req->work.identity->creds; + + id = kmemdup(req->work.identity, sizeof(*id), GFP_KERNEL); + if (unlikely(!id)) { + req->work.flags |= IO_WQ_WORK_CANCEL; + return false; + } + + /* + * We can safely just re-init the creds we copied Either the field + * matches the current one, or we haven't grabbed it yet. The only + * exception is ->creds, through registered personalities, so handle + * that one separately. + */ + io_init_identity(id); + if (creds) + req->work.identity->creds = creds; + + /* add one for this request */ + refcount_inc(&id->count); + + /* drop old identity, assign new one. 
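In the hunk above, io_req_init_async() points req->work.identity at the submitting task's current identity and only takes a reference when that identity is not the task's embedded __identity; io_put_identity() mirrors this on teardown. A compact userspace model of that scheme (an assumed simplification, not the kernel structures):

#include <stdlib.h>
#include <stdatomic.h>

struct identity {
	atomic_int count;
	/* captured task state (files, mm, creds, ...) would live here */
};

struct task_ctx {
	struct identity __identity;  /* embedded default, never freed */
	struct identity *identity;   /* may later point at a COW copy */
};

static void attach_identity(struct task_ctx *tctx, struct identity **slot)
{
	*slot = tctx->identity;
	if (tctx->identity != &tctx->__identity)   /* only copies are refcounted */
		atomic_fetch_add(&tctx->identity->count, 1);
}

static void drop_identity(struct task_ctx *tctx, struct identity *id)
{
	if (id == &tctx->__identity)
		return;
	if (atomic_fetch_sub(&id->count, 1) == 1)
		free(id);
}

int main(void)
{
	struct task_ctx tctx = { .identity = &tctx.__identity };
	struct identity *slot;

	attach_identity(&tctx, &slot);  /* default identity: no refcount taken */
	drop_identity(&tctx, slot);
	return 0;
}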
one ref for req, one for tctx */ + if (req->work.identity != tctx->identity && + refcount_sub_and_test(2, &req->work.identity->count)) + kfree(req->work.identity); + + req->work.identity = id; + tctx->identity = id; + return true; +} + +static bool io_grab_identity(struct io_kiocb *req) +{ + const struct io_op_def *def = &io_op_defs[req->opcode]; + struct io_identity *id = req->work.identity; + struct io_ring_ctx *ctx = req->ctx; + + if (def->needs_fsize && id->fsize != rlimit(RLIMIT_FSIZE)) + return false; + + if (!(req->work.flags & IO_WQ_WORK_FILES) && + (def->work_flags & IO_WQ_WORK_FILES) && + !(req->flags & REQ_F_NO_FILE_TABLE)) { + if (id->files != current->files || + id->nsproxy != current->nsproxy) + return false; + atomic_inc(&id->files->count); + get_nsproxy(id->nsproxy); + req->flags |= REQ_F_INFLIGHT; + + spin_lock_irq(&ctx->inflight_lock); + list_add(&req->inflight_entry, &ctx->inflight_list); + spin_unlock_irq(&ctx->inflight_lock); + req->work.flags |= IO_WQ_WORK_FILES; + } +#ifdef CONFIG_BLK_CGROUP + if (!(req->work.flags & IO_WQ_WORK_BLKCG) && + (def->work_flags & IO_WQ_WORK_BLKCG)) { + rcu_read_lock(); + if (id->blkcg_css != blkcg_css()) { + rcu_read_unlock(); + return false; + } + /* + * This should be rare, either the cgroup is dying or the task + * is moving cgroups. Just punt to root for the handful of ios. + */ + if (css_tryget_online(id->blkcg_css)) + req->work.flags |= IO_WQ_WORK_BLKCG; + rcu_read_unlock(); + } +#endif + if (!(req->work.flags & IO_WQ_WORK_CREDS)) { + if (id->creds != current_cred()) + return false; + get_cred(id->creds); + req->work.flags |= IO_WQ_WORK_CREDS; + } +#ifdef CONFIG_AUDIT + if (!uid_eq(current->loginuid, id->loginuid) || + current->sessionid != id->sessionid) + return false; +#endif + if (!(req->work.flags & IO_WQ_WORK_FS) && + (def->work_flags & IO_WQ_WORK_FS)) { + if (current->fs != id->fs) + return false; + spin_lock(&id->fs->lock); + if (!id->fs->in_exec) { + id->fs->users++; + req->work.flags |= IO_WQ_WORK_FS; + } else { + req->work.flags |= IO_WQ_WORK_CANCEL; + } + spin_unlock(&current->fs->lock); + } + + return true; } static void io_prep_async_work(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; + struct io_ring_ctx *ctx = req->ctx; + struct io_identity *id; io_req_init_async(req); + id = req->work.identity; if (req->flags & REQ_F_ISREG) { - if (def->hash_reg_file || (req->ctx->flags & IORING_SETUP_IOPOLL)) + if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL)) io_wq_hash_work(&req->work, file_inode(req->file)); } else { if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; } - if (!req->work.mm && def->needs_mm) { - mmgrab(current->mm); - req->work.mm = current->mm; - } - if (!req->work.creds) - req->work.creds = get_current_cred(); - if (!req->work.fs && def->needs_fs) { - spin_lock(&current->fs->lock); - if (!current->fs->in_exec) { - req->work.fs = current->fs; - req->work.fs->users++; - } else { - req->work.flags |= IO_WQ_WORK_CANCEL; - } - spin_unlock(&current->fs->lock); + + /* ->mm can never change on us */ + if (!(req->work.flags & IO_WQ_WORK_MM) && + (def->work_flags & IO_WQ_WORK_MM)) { + mmgrab(id->mm); + req->work.flags |= IO_WQ_WORK_MM; } - if (def->needs_fsize) - req->work.fsize = rlimit(RLIMIT_FSIZE); - else - req->work.fsize = RLIM_INFINITY; + + /* if we fail grabbing identity, we must COW, regrab, and retry */ + if (io_grab_identity(req)) + return; + + if (!io_identity_cow(req)) + return; + + /* can't fail at this point */ + if (!io_grab_identity(req)) + WARN_ON(1); } static
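io_prep_async_work() above now follows a grab/COW/regrab pattern: io_grab_identity() fails if any captured field no longer matches the submitting task, io_identity_cow() re-snapshots current state into a fresh refcounted copy, and the second grab is expected to succeed. A tiny model of that control flow (hypothetical single-field identity):

#include <assert.h>
#include <stdbool.h>

struct ident { int files_token; };
static int current_files_token = 1;

static bool grab(const struct ident *id)
{
	return id->files_token == current_files_token;
}

static bool cow(struct ident *id)
{
	id->files_token = current_files_token;   /* re-init from "current" */
	return true;
}

static void prep_async_work(struct ident *id)
{
	if (grab(id))
		return;
	if (!cow(id))
		return;
	assert(grab(id));   /* cannot fail right after a fresh COW */
}

int main(void)
{
	struct ident id = { .files_token = 0 };   /* stale snapshot */

	prep_async_work(&id);
	return grab(&id) ? 0 : 1;
}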
void io_prep_async_link(struct io_kiocb *req) @@ -1213,27 +1419,49 @@ static void io_queue_async_work(struct io_kiocb *req) static void io_kill_timeout(struct io_kiocb *req) { + struct io_timeout_data *io = req->async_data; int ret; - ret = hrtimer_try_to_cancel(&req->io->timeout.timer); + ret = hrtimer_try_to_cancel(&io->timer); if (ret != -1) { atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); list_del_init(&req->timeout.list); - req->flags |= REQ_F_COMP_LOCKED; io_cqring_fill_event(req, 0); - io_put_req(req); + io_put_req_deferred(req, 1); } } -static void io_kill_timeouts(struct io_ring_ctx *ctx) +static bool io_task_match(struct io_kiocb *req, struct task_struct *tsk) +{ + struct io_ring_ctx *ctx = req->ctx; + + if (!tsk || req->task == tsk) + return true; + if (ctx->flags & IORING_SETUP_SQPOLL) { + if (ctx->sq_data && req->task == ctx->sq_data->thread) + return true; + } + return false; +} + +/* + * Returns true if we found and killed one or more timeouts + */ +static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk) { struct io_kiocb *req, *tmp; + int canceled = 0; spin_lock_irq(&ctx->completion_lock); - list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) - io_kill_timeout(req); + list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { + if (io_task_match(req, tsk)) { + io_kill_timeout(req); + canceled++; + } + } spin_unlock_irq(&ctx->completion_lock); + return canceled != 0; } static void __io_queue_deferred(struct io_ring_ctx *ctx) @@ -1251,8 +1479,7 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx) if (link) { __io_queue_linked_timeout(link); /* drop submission reference */ - link->flags |= REQ_F_COMP_LOCKED; - io_put_req(link); + io_put_req_deferred(link, 1); } kfree(de); } while (!list_empty(&ctx->defer_list)); @@ -1284,6 +1511,13 @@ static void io_commit_cqring(struct io_ring_ctx *ctx) __io_queue_deferred(ctx); } +static inline bool io_sqring_full(struct io_ring_ctx *ctx) +{ + struct io_rings *r = ctx->rings; + + return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == r->sq_ring_entries; +} + static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; @@ -1317,8 +1551,8 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) { if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); - if (waitqueue_active(&ctx->sqo_wait)) - wake_up(&ctx->sqo_wait); + if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait)) + wake_up(&ctx->sq_data->wait); if (io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); } @@ -1332,12 +1566,25 @@ static void io_cqring_mark_overflow(struct io_ring_ctx *ctx) } } +static inline bool io_match_files(struct io_kiocb *req, + struct files_struct *files) +{ + if (!files) + return true; + if ((req->flags & REQ_F_WORK_INITIALIZED) && + (req->work.flags & IO_WQ_WORK_FILES)) + return req->work.identity->files == files; + return false; +} + /* Returns true if there are no backlogged entries after the flush */ -static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) +static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, + struct task_struct *tsk, + struct files_struct *files) { struct io_rings *rings = ctx->rings; + struct io_kiocb *req, *tmp; struct io_uring_cqe *cqe; - struct io_kiocb *req; unsigned long flags; LIST_HEAD(list); @@ -1356,13 +1603,16 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) ctx->cq_overflow_flushed = 1; cqe = NULL; - while 
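The new io_sqring_full() helper above compares the free-running SQ tail minus the cached head against the ring size, which stays correct across 32-bit wraparound. A short standalone demonstration of that unsigned arithmetic:

#include <stdio.h>
#include <stdint.h>

static int ring_full(uint32_t tail, uint32_t cached_head, uint32_t entries)
{
	return tail - cached_head == entries;
}

int main(void)
{
	uint32_t entries = 8;
	/* indices near the 32-bit wrap point still subtract cleanly */
	uint32_t head = 0xfffffffcu, tail = head + entries;

	printf("used=%u full=%d\n", tail - head, ring_full(tail, head, entries));
	return 0;
}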
(!list_empty(&ctx->cq_overflow_list)) { + list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) { + if (tsk && req->task != tsk) + continue; + if (!io_match_files(req, files)) + continue; + cqe = io_get_cqring(ctx); if (!cqe && !force) break; - req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb, - compl.list); list_move(&req->compl.list, &list); if (cqe) { WRITE_ONCE(cqe->user_data, req->user_data); @@ -1406,7 +1656,12 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) WRITE_ONCE(cqe->user_data, req->user_data); WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, cflags); - } else if (ctx->cq_overflow_flushed) { + } else if (ctx->cq_overflow_flushed || req->task->io_uring->in_idle) { + /* + * If we're in ring overflow flush mode, or in task cancel mode, + * then we cannot store the request for later flushing, we need + * to drop it on the floor. + */ WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); } else { @@ -1452,13 +1707,19 @@ static void io_submit_flush_completions(struct io_comp_state *cs) req = list_first_entry(&cs->list, struct io_kiocb, compl.list); list_del(&req->compl.list); __io_cqring_fill_event(req, req->result, req->compl.cflags); - if (!(req->flags & REQ_F_LINK_HEAD)) { - req->flags |= REQ_F_COMP_LOCKED; - io_put_req(req); - } else { + + /* + * io_free_req() doesn't care about completion_lock unless one + * of these flags is set. REQ_F_WORK_INITIALIZED is in the list + * because of a potential deadlock with req->work.fs->lock + */ + if (req->flags & (REQ_F_FAIL_LINK|REQ_F_LINK_TIMEOUT + |REQ_F_WORK_INITIALIZED)) { spin_unlock_irq(&ctx->completion_lock); io_put_req(req); spin_lock_irq(&ctx->completion_lock); + } else { + io_put_req(req); } } io_commit_cqring(ctx); @@ -1509,10 +1770,8 @@ static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx) static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx, struct io_submit_state *state) { - gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; - struct io_kiocb *req; - if (!state->free_reqs) { + gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; size_t sz; int ret; @@ -1529,14 +1788,11 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx, goto fallback; ret = 1; } - state->free_reqs = ret - 1; - req = state->reqs[ret - 1]; - } else { - state->free_reqs--; - req = state->reqs[state->free_reqs]; + state->free_reqs = ret; } - return req; + state->free_reqs--; + return state->reqs[state->free_reqs]; fallback: return io_get_fallback_req(ctx); } @@ -1550,23 +1806,30 @@ static inline void io_put_file(struct io_kiocb *req, struct file *file, fput(file); } -static bool io_dismantle_req(struct io_kiocb *req) +static void io_dismantle_req(struct io_kiocb *req) { io_clean_op(req); - if (req->io) - kfree(req->io); + if (req->async_data) + kfree(req->async_data); if (req->file) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); - return io_req_clean_work(req); + io_req_clean_work(req); } -static void __io_free_req_finish(struct io_kiocb *req) +static void __io_free_req(struct io_kiocb *req) { + struct io_uring_task *tctx = req->task->io_uring; struct io_ring_ctx *ctx = req->ctx; - __io_put_req_task(req); + io_dismantle_req(req); + + percpu_counter_dec(&tctx->inflight); + if (tctx->in_idle) + wake_up(&tctx->wait); + put_task_struct(req->task); + if (likely(!io_is_fallback_req(req))) kmem_cache_free(req_cachep, req); else @@ -1574,50 +1837,18 @@ static void __io_free_req_finish(struct io_kiocb *req) percpu_ref_put(&ctx->refs); } -static void 
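The io_alloc_req() change above hands requests out of a per-submit batch by decrementing state->free_reqs, refilling the batch only when it runs dry (the slab refill itself sits in unchanged context not shown here). A rough userspace model of the batching (cleanup and error handling omitted):

#include <stdio.h>
#include <stdlib.h>

#define BATCH 8

struct state {
	void *reqs[BATCH];
	unsigned free_reqs;
};

static void *alloc_req(struct state *st)
{
	if (!st->free_reqs) {
		for (unsigned i = 0; i < BATCH; i++)   /* stands in for bulk alloc */
			st->reqs[i] = malloc(64);
		st->free_reqs = BATCH;
	}
	st->free_reqs--;
	return st->reqs[st->free_reqs];
}

int main(void)
{
	struct state st = { .free_reqs = 0 };
	void *a = alloc_req(&st), *b = alloc_req(&st);

	printf("left in batch: %u (%p %p)\n", st.free_reqs, a, b);
	return 0;
}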
io_req_task_file_table_put(struct callback_head *cb) -{ - struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); - struct fs_struct *fs = req->work.fs; - - spin_lock(&req->work.fs->lock); - if (--fs->users) - fs = NULL; - spin_unlock(&req->work.fs->lock); - if (fs) - free_fs_struct(fs); - req->work.fs = NULL; - __io_free_req_finish(req); -} - -static void __io_free_req(struct io_kiocb *req) -{ - if (!io_dismantle_req(req)) { - __io_free_req_finish(req); - } else { - int ret; - - init_task_work(&req->task_work, io_req_task_file_table_put); - ret = task_work_add(req->task, &req->task_work, TWA_RESUME); - if (unlikely(ret)) { - struct task_struct *tsk; - - tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, 0); - } - } -} - static bool io_link_cancel_timeout(struct io_kiocb *req) { + struct io_timeout_data *io = req->async_data; struct io_ring_ctx *ctx = req->ctx; int ret; - ret = hrtimer_try_to_cancel(&req->io->timeout.timer); + ret = hrtimer_try_to_cancel(&io->timer); if (ret != -1) { io_cqring_fill_event(req, -ECANCELED); io_commit_cqring(ctx); req->flags &= ~REQ_F_LINK_HEAD; - io_put_req(req); + io_put_req_deferred(req, 1); return true; } @@ -1636,7 +1867,6 @@ static bool __io_kill_linked_timeout(struct io_kiocb *req) return false; list_del_init(&link->link_list); - link->flags |= REQ_F_COMP_LOCKED; wake_ev = io_link_cancel_timeout(link); req->flags &= ~REQ_F_LINK_TIMEOUT; return wake_ev; @@ -1645,17 +1875,12 @@ static bool __io_kill_linked_timeout(struct io_kiocb *req) static void io_kill_linked_timeout(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + unsigned long flags; bool wake_ev; - if (!(req->flags & REQ_F_COMP_LOCKED)) { - unsigned long flags; - - spin_lock_irqsave(&ctx->completion_lock, flags); - wake_ev = __io_kill_linked_timeout(req); - spin_unlock_irqrestore(&ctx->completion_lock, flags); - } else { - wake_ev = __io_kill_linked_timeout(req); - } + spin_lock_irqsave(&ctx->completion_lock, flags); + wake_ev = __io_kill_linked_timeout(req); + spin_unlock_irqrestore(&ctx->completion_lock, flags); if (wake_ev) io_cqring_ev_posted(ctx); @@ -1695,28 +1920,29 @@ static void __io_fail_links(struct io_kiocb *req) trace_io_uring_fail_link(req, link); io_cqring_fill_event(link, -ECANCELED); - link->flags |= REQ_F_COMP_LOCKED; - __io_double_put_req(link); - req->flags &= ~REQ_F_LINK_TIMEOUT; + + /* + * It's ok to free under spinlock as they're not linked anymore, + * but avoid REQ_F_WORK_INITIALIZED because it may deadlock on + * work.fs->lock. 
+ */ + if (link->flags & REQ_F_WORK_INITIALIZED) + io_put_req_deferred(link, 2); + else + io_double_put_req(link); } io_commit_cqring(ctx); - io_cqring_ev_posted(ctx); } static void io_fail_links(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + unsigned long flags; - if (!(req->flags & REQ_F_COMP_LOCKED)) { - unsigned long flags; - - spin_lock_irqsave(&ctx->completion_lock, flags); - __io_fail_links(req); - spin_unlock_irqrestore(&ctx->completion_lock, flags); - } else { - __io_fail_links(req); - } + spin_lock_irqsave(&ctx->completion_lock, flags); + __io_fail_links(req); + spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); } @@ -1746,13 +1972,15 @@ static struct io_kiocb *io_req_find_next(struct io_kiocb *req) return __io_req_find_next(req); } -static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb, - bool twa_signal_ok) +static int io_req_task_work_add(struct io_kiocb *req, bool twa_signal_ok) { struct task_struct *tsk = req->task; struct io_ring_ctx *ctx = req->ctx; int ret, notify; + if (tsk->flags & PF_EXITING) + return -ESRCH; + /* * SQPOLL kernel thread doesn't need notification, just a wakeup. For * all other cases, use TWA_SIGNAL unconditionally to ensure we're @@ -1763,7 +1991,7 @@ static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb, if (!(ctx->flags & IORING_SETUP_SQPOLL) && twa_signal_ok) notify = TWA_SIGNAL; - ret = task_work_add(tsk, cb, notify); + ret = task_work_add(tsk, &req->task_work, notify); if (!ret) wake_up_process(tsk); @@ -1787,8 +2015,10 @@ static void __io_req_task_cancel(struct io_kiocb *req, int error) static void io_req_task_cancel(struct callback_head *cb) { struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + struct io_ring_ctx *ctx = req->ctx; __io_req_task_cancel(req, -ECANCELED); + percpu_ref_put(&ctx->refs); } static void __io_req_task_submit(struct io_kiocb *req) @@ -1797,7 +2027,7 @@ static void __io_req_task_submit(struct io_kiocb *req) if (!__io_sq_thread_acquire_mm(ctx)) { mutex_lock(&ctx->uring_lock); - __io_queue_sqe(req, NULL, NULL); + __io_queue_sqe(req, NULL); mutex_unlock(&ctx->uring_lock); } else { __io_req_task_cancel(req, -EFAULT); @@ -1820,7 +2050,7 @@ static void io_req_task_queue(struct io_kiocb *req) init_task_work(&req->task_work, io_req_task_submit); percpu_ref_get(&req->ctx->refs); - ret = io_req_task_work_add(req, &req->task_work, true); + ret = io_req_task_work_add(req, true); if (unlikely(ret)) { struct task_struct *tsk; @@ -1874,6 +2104,9 @@ static void io_req_free_batch_finish(struct io_ring_ctx *ctx, if (rb->to_free) __io_req_free_batch_flush(ctx, rb); if (rb->task) { + struct io_uring_task *tctx = rb->task->io_uring; + + percpu_counter_sub(&tctx->inflight, rb->task_refs); put_task_struct_many(rb->task, rb->task_refs); rb->task = NULL; } @@ -1888,18 +2121,19 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req) if (req->flags & REQ_F_LINK_HEAD) io_queue_next(req); - if (req->flags & REQ_F_TASK_PINNED) { - if (req->task != rb->task) { - if (rb->task) - put_task_struct_many(rb->task, rb->task_refs); - rb->task = req->task; - rb->task_refs = 0; + if (req->task != rb->task) { + if (rb->task) { + struct io_uring_task *tctx = rb->task->io_uring; + + percpu_counter_sub(&tctx->inflight, rb->task_refs); + put_task_struct_many(rb->task, rb->task_refs); } - rb->task_refs++; - req->flags &= ~REQ_F_TASK_PINNED; + rb->task = req->task; + rb->task_refs = 0; } + rb->task_refs++; - 
WARN_ON_ONCE(io_dismantle_req(req)); + io_dismantle_req(req); rb->reqs[rb->to_free++] = req; if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) __io_req_free_batch_flush(req->ctx, rb); @@ -1926,6 +2160,34 @@ static void io_put_req(struct io_kiocb *req) io_free_req(req); } +static void io_put_req_deferred_cb(struct callback_head *cb) +{ + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + + io_free_req(req); +} + +static void io_free_req_deferred(struct io_kiocb *req) +{ + int ret; + + init_task_work(&req->task_work, io_put_req_deferred_cb); + ret = io_req_task_work_add(req, true); + if (unlikely(ret)) { + struct task_struct *tsk; + + tsk = io_wq_get_task(req->ctx->io_wq); + task_work_add(tsk, &req->task_work, 0); + wake_up_process(tsk); + } +} + +static inline void io_put_req_deferred(struct io_kiocb *req, int refs) +{ + if (refcount_sub_and_test(refs, &req->refs)) + io_free_req_deferred(req); +} + static struct io_wq_work *io_steal_work(struct io_kiocb *req) { struct io_kiocb *nxt; @@ -1942,17 +2204,6 @@ static struct io_wq_work *io_steal_work(struct io_kiocb *req) return nxt ? &nxt->work : NULL; } -/* - * Must only be used if we don't need to care about links, usually from - * within the completion handling itself. - */ -static void __io_double_put_req(struct io_kiocb *req) -{ - /* drop both submit and complete references */ - if (refcount_sub_and_test(2, &req->refs)) - __io_free_req(req); -} - static void io_double_put_req(struct io_kiocb *req) { /* drop both submit and complete references */ @@ -1973,7 +2224,7 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush) if (noflush && !list_empty(&ctx->cq_overflow_list)) return -1U; - io_cqring_overflow_flush(ctx, false); + io_cqring_overflow_flush(ctx, false, NULL, NULL); } /* See comment at the top of this file */ @@ -2010,6 +2261,12 @@ static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req) static inline bool io_run_task_work(void) { + /* + * Not safe to run on exiting task, and the task_work handling will + * not add work to such a task. 
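io_put_req_deferred() above lets callers that hold completion_lock drop their references without freeing inline: when the count hits zero, the free is queued as task_work and runs later in task context. A simplified, single-threaded model of that defer-the-free pattern (a plain list stands in for task_work):

#include <stdio.h>
#include <stdlib.h>
#include <stdatomic.h>

struct req {
	atomic_int refs;
	void (*cb)(struct req *);
	struct req *next;
};

static struct req *work_list;

static void free_req_cb(struct req *req)
{
	printf("freeing req %p\n", (void *)req);
	free(req);
}

static void put_req_deferred(struct req *req, int nr)
{
	if (atomic_fetch_sub(&req->refs, nr) == nr) {
		req->cb = free_req_cb;   /* like init_task_work() */
		req->next = work_list;   /* like task_work_add()  */
		work_list = req;
	}
}

static void run_task_work(void)
{
	while (work_list) {
		struct req *req = work_list;

		work_list = req->next;
		req->cb(req);
	}
}

int main(void)
{
	struct req *req = calloc(1, sizeof(*req));

	atomic_store(&req->refs, 2);
	put_req_deferred(req, 2);   /* drops to zero, but free is deferred */
	run_task_work();
	return 0;
}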
+ */ + if (unlikely(current->flags & PF_EXITING)) + return false; if (current->task_works) { __set_current_state(TASK_RUNNING); task_work_run(); @@ -2283,13 +2540,17 @@ static bool io_resubmit_prep(struct io_kiocb *req, int error) goto end_req; } - ret = io_import_iovec(rw, req, &iovec, &iter, false); - if (ret < 0) - goto end_req; - ret = io_setup_async_rw(req, iovec, inline_vecs, &iter, false); - if (!ret) + if (!req->async_data) { + ret = io_import_iovec(rw, req, &iovec, &iter, false); + if (ret < 0) + goto end_req; + ret = io_setup_async_rw(req, iovec, inline_vecs, &iter, false); + if (!ret) + return true; + kfree(iovec); + } else { return true; - kfree(iovec); + } end_req: req_set_fail_links(req); io_req_complete(req, ret); @@ -2386,8 +2647,8 @@ static void io_iopoll_req_issued(struct io_kiocb *req) list_add_tail(&req->inflight_entry, &ctx->iopoll_list); if ((ctx->flags & IORING_SETUP_SQPOLL) && - wq_has_sleeper(&ctx->sqo_wait)) - wake_up(&ctx->sqo_wait); + wq_has_sleeper(&ctx->sq_data->wait)) + wake_up(&ctx->sq_data->wait); } static void __io_state_file_put(struct io_submit_state *state) @@ -2416,7 +2677,6 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd) if (state->file) { if (state->fd == fd) { state->has_refs--; - state->ios_left--; return state->file; } __io_state_file_put(state); @@ -2426,15 +2686,14 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd) return NULL; state->fd = fd; - state->ios_left--; - state->has_refs = state->ios_left; + state->has_refs = state->ios_left - 1; return state->file; } static bool io_bdev_nowait(struct block_device *bdev) { #ifdef CONFIG_BLOCK - return !bdev || queue_is_mq(bdev_get_queue(bdev)); + return !bdev || blk_queue_nowait(bdev_get_queue(bdev)); #else return true; #endif @@ -2476,8 +2735,7 @@ static bool io_file_supports_async(struct file *file, int rw) return file->f_op->write_iter != NULL; } -static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, - bool force_nonblock) +static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; struct kiocb *kiocb = &req->rw.kiocb; @@ -2512,12 +2770,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (kiocb->ki_flags & IOCB_NOWAIT) req->flags |= REQ_F_NOWAIT; - if (kiocb->ki_flags & IOCB_DIRECT) - io_get_req_task(req); - - if (force_nonblock) - kiocb->ki_flags |= IOCB_NOWAIT; - if (ctx->flags & IORING_SETUP_IOPOLL) { if (!(kiocb->ki_flags & IOCB_DIRECT) || !kiocb->ki_filp->f_op->iopoll) @@ -2526,7 +2778,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, kiocb->ki_flags |= IOCB_HIPRI; kiocb->ki_complete = io_complete_rw_iopoll; req->iopoll_completed = 0; - io_get_req_task(req); } else { if (kiocb->ki_flags & IOCB_HIPRI) return -EINVAL; @@ -2564,13 +2815,14 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_comp_state *cs) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); + struct io_async_rw *io = req->async_data; /* add previously done IO, if any */ - if (req->io && req->io->rw.bytes_done > 0) { + if (io && io->bytes_done > 0) { if (ret < 0) - ret = req->io->rw.bytes_done; + ret = io->bytes_done; else - ret += req->io->rw.bytes_done; + ret += io->bytes_done; } if (req->flags & REQ_F_CUR_POS) @@ -2587,18 +2839,12 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw, struct io_ring_ctx *ctx = req->ctx; size_t len = req->rw.len; struct io_mapped_ubuf *imu; - u16 index, 
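The __io_file_get() tweak above keeps the submit-state file cache but sets has_refs to ios_left - 1 and moves the ios_left accounting out of the helper, so consecutive submissions on the same fd still share one reference. A userspace model of that caching (the fgets counter stands in for fget()):

#include <stdio.h>

struct submit_state {
	int fd;          /* fd the cached "file" belongs to       */
	int has_refs;    /* extra references left to hand out     */
	int ios_left;    /* submissions remaining in this batch   */
	int fgets;       /* how many times we had to "fget"       */
};

static int state_file_get(struct submit_state *st, int fd)
{
	if (st->has_refs && st->fd == fd) {
		st->has_refs--;
		return fd;
	}
	st->fgets++;                      /* stands in for fget(fd) */
	st->fd = fd;
	st->has_refs = st->ios_left - 1;  /* grab enough for the rest of the batch */
	return fd;
}

int main(void)
{
	struct submit_state st = { .ios_left = 4 };

	for (int i = 0; i < 4; i++, st.ios_left--)
		state_file_get(&st, 3);
	printf("fget calls for 4 submissions on one fd: %d\n", st.fgets);
	return 0;
}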
buf_index; + u16 index, buf_index = req->buf_index; size_t offset; u64 buf_addr; - /* attempt to use fixed buffers without having provided iovecs */ - if (unlikely(!ctx->user_bufs)) - return -EFAULT; - - buf_index = req->buf_index; if (unlikely(buf_index >= ctx->nr_user_bufs)) return -EFAULT; - index = array_index_nospec(buf_index, ctx->nr_user_bufs); imu = &ctx->user_bufs[index]; buf_addr = req->rw.addr; @@ -2837,28 +3083,25 @@ static ssize_t __io_import_iovec(int rw, struct io_kiocb *req, return ret; } -#ifdef CONFIG_COMPAT - if (req->ctx->compat) - return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV, - iovec, iter); -#endif - - return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter); + return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter, + req->ctx->compat); } static ssize_t io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec, struct iov_iter *iter, bool needs_lock) { - if (!req->io) + struct io_async_rw *iorw = req->async_data; + + if (!iorw) return __io_import_iovec(rw, req, iovec, iter, needs_lock); *iovec = NULL; - return iov_iter_count(&req->io->rw.iter); + return iov_iter_count(&iorw->iter); } static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) { - return kiocb->ki_filp->f_mode & FMODE_STREAM ? NULL : &kiocb->ki_pos; + return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos; } /* @@ -2922,10 +3165,10 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, const struct iovec *fast_iov, struct iov_iter *iter) { - struct io_async_rw *rw = &req->io->rw; + struct io_async_rw *rw = req->async_data; memcpy(&rw->iter, iter, sizeof(*iter)); - rw->free_iovec = NULL; + rw->free_iovec = iovec; rw->bytes_done = 0; /* can only be fixed buffers, no need to do anything */ if (iter->type == ITER_BVEC) @@ -2942,33 +3185,33 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, memcpy(rw->fast_iov + iov_off, fast_iov + iov_off, sizeof(struct iovec) * iter->nr_segs); } else { - rw->free_iovec = iovec; req->flags |= REQ_F_NEED_CLEANUP; } } -static inline int __io_alloc_async_ctx(struct io_kiocb *req) +static inline int __io_alloc_async_data(struct io_kiocb *req) { - req->io = kmalloc(sizeof(*req->io), GFP_KERNEL); - return req->io == NULL; + WARN_ON_ONCE(!io_op_defs[req->opcode].async_size); + req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL); + return req->async_data == NULL; } -static int io_alloc_async_ctx(struct io_kiocb *req) +static int io_alloc_async_data(struct io_kiocb *req) { - if (!io_op_defs[req->opcode].async_ctx) + if (!io_op_defs[req->opcode].needs_async_data) return 0; - return __io_alloc_async_ctx(req); + return __io_alloc_async_data(req); } static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, const struct iovec *fast_iov, struct iov_iter *iter, bool force) { - if (!force && !io_op_defs[req->opcode].async_ctx) + if (!force && !io_op_defs[req->opcode].needs_async_data) return 0; - if (!req->io) { - if (__io_alloc_async_ctx(req)) + if (!req->async_data) { + if (__io_alloc_async_data(req)) return -ENOMEM; io_req_map_rw(req, iovec, fast_iov, iter); @@ -2976,29 +3219,28 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, return 0; } -static inline int io_rw_prep_async(struct io_kiocb *req, int rw, - bool force_nonblock) +static inline int io_rw_prep_async(struct io_kiocb *req, int rw) { - struct io_async_rw *iorw = &req->io->rw; - 
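io_import_fixed() above resolves req->buf_index into a previously registered buffer, bounds-checked with array_index_nospec(). For reference, this is the userspace side of that path, assuming liburing is available and using an arbitrary example file; it is not part of this patch:

#include <liburing.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/uio.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	char *buf = malloc(4096);
	struct iovec iov = { .iov_base = buf, .iov_len = 4096 };
	int fd = open("/etc/hostname", O_RDONLY);

	if (fd < 0 || io_uring_queue_init(4, &ring, 0))
		return 1;
	/* register the buffer; the kernel resolves buf_index in io_import_fixed() */
	if (io_uring_register_buffers(&ring, &iov, 1))
		return 1;

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read_fixed(sqe, fd, buf, 4096, 0, 0 /* buf_index */);
	io_uring_submit(&ring);

	if (io_uring_wait_cqe(&ring, &cqe) == 0) {
		printf("read %d bytes via fixed buffer\n", cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return 0;
}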
struct iovec *iov; + struct io_async_rw *iorw = req->async_data; + struct iovec *iov = iorw->fast_iov; ssize_t ret; - iorw->iter.iov = iov = iorw->fast_iov; - ret = __io_import_iovec(rw, req, &iov, &iorw->iter, !force_nonblock); + ret = __io_import_iovec(rw, req, &iov, &iorw->iter, false); if (unlikely(ret < 0)) return ret; - iorw->iter.iov = iov; - io_req_map_rw(req, iorw->iter.iov, iorw->fast_iov, &iorw->iter); + iorw->bytes_done = 0; + iorw->free_iovec = iov; + if (iov) + req->flags |= REQ_F_NEED_CLEANUP; return 0; } -static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, - bool force_nonblock) +static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { ssize_t ret; - ret = io_prep_rw(req, sqe, force_nonblock); + ret = io_prep_rw(req, sqe); if (ret) return ret; @@ -3006,9 +3248,9 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, return -EBADF; /* either don't need iovec imported or already have it */ - if (!req->io || req->flags & REQ_F_NEED_CLEANUP) + if (!req->async_data) return 0; - return io_rw_prep_async(req, READ, force_nonblock); + return io_rw_prep_async(req, READ); } /* @@ -3034,6 +3276,7 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, if (!wake_page_match(wpq, key)) return 0; + req->rw.kiocb.ki_flags &= ~IOCB_WAITQ; list_del_init(&wait->entry); init_task_work(&req->task_work, io_req_task_submit); @@ -3041,7 +3284,7 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, /* submit ref gets dropped, acquire a new one */ refcount_inc(&req->refs); - ret = io_req_task_work_add(req, &req->task_work, true); + ret = io_req_task_work_add(req, true); if (unlikely(ret)) { struct task_struct *tsk; @@ -3068,7 +3311,8 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, */ static bool io_rw_should_retry(struct io_kiocb *req) { - struct wait_page_queue *wait = &req->io->rw.wpq; + struct io_async_rw *rw = req->async_data; + struct wait_page_queue *wait = &rw->wpq; struct kiocb *kiocb = &req->rw.kiocb; /* never retry for NOWAIT, we just complete with -EAGAIN */ @@ -3091,9 +3335,8 @@ static bool io_rw_should_retry(struct io_kiocb *req) wait->wait.flags = 0; INIT_LIST_HEAD(&wait->wait.entry); kiocb->ki_flags |= IOCB_WAITQ; + kiocb->ki_flags &= ~IOCB_NOWAIT; kiocb->ki_waitq = wait; - - io_get_req_task(req); return true; } @@ -3113,11 +3356,13 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw.kiocb; struct iov_iter __iter, *iter = &__iter; + struct io_async_rw *rw = req->async_data; ssize_t io_size, ret, ret2; size_t iov_count; + bool no_async; - if (req->io) - iter = &req->io->rw.iter; + if (rw) + iter = &rw->iter; ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock); if (ret < 0) @@ -3130,9 +3375,13 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, /* Ensure we clear previously set non-block flag */ if (!force_nonblock) kiocb->ki_flags &= ~IOCB_NOWAIT; + else + kiocb->ki_flags |= IOCB_NOWAIT; + /* If the file doesn't support async, just async punt */ - if (force_nonblock && !io_file_supports_async(req->file, READ)) + no_async = force_nonblock && !io_file_supports_async(req->file, READ); + if (no_async) goto copy_iov; ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), iov_count); @@ -3155,10 +3404,8 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, goto done; /* some cases will consume bytes 
even on error returns */ iov_iter_revert(iter, iov_count - iov_iter_count(iter)); - ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); - if (ret) - goto out_free; - return -EAGAIN; + ret = 0; + goto copy_iov; } else if (ret < 0) { /* make sure -ERESTARTSYS -> -EINTR is done */ goto done; @@ -3176,12 +3423,15 @@ copy_iov: ret = ret2; goto out_free; } + if (no_async) + return -EAGAIN; + rw = req->async_data; /* it's copied and will be cleaned with ->io */ iovec = NULL; /* now use our persistent iterator, if we aren't already */ - iter = &req->io->rw.iter; + iter = &rw->iter; retry: - req->io->rw.bytes_done += ret; + rw->bytes_done += ret; /* if we can retry, do so with the callbacks armed */ if (!io_rw_should_retry(req)) { kiocb->ki_flags &= ~IOCB_WAITQ; @@ -3212,12 +3462,11 @@ out_free: return ret; } -static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, - bool force_nonblock) +static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { ssize_t ret; - ret = io_prep_rw(req, sqe, force_nonblock); + ret = io_prep_rw(req, sqe); if (ret) return ret; @@ -3225,9 +3474,9 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, return -EBADF; /* either don't need iovec imported or already have it */ - if (!req->io || req->flags & REQ_F_NEED_CLEANUP) + if (!req->async_data) return 0; - return io_rw_prep_async(req, WRITE, force_nonblock); + return io_rw_prep_async(req, WRITE); } static int io_write(struct io_kiocb *req, bool force_nonblock, @@ -3236,11 +3485,12 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw.kiocb; struct iov_iter __iter, *iter = &__iter; + struct io_async_rw *rw = req->async_data; size_t iov_count; ssize_t ret, ret2, io_size; - if (req->io) - iter = &req->io->rw.iter; + if (rw) + iter = &rw->iter; ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock); if (ret < 0) @@ -3251,7 +3501,9 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, /* Ensure we clear previously set non-block flag */ if (!force_nonblock) - req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT; + kiocb->ki_flags &= ~IOCB_NOWAIT; + else + kiocb->ki_flags |= IOCB_NOWAIT; /* If the file doesn't support async, just async punt */ if (force_nonblock && !io_file_supports_async(req->file, WRITE)) @@ -3323,10 +3575,7 @@ static int __io_splice_prep(struct io_kiocb *req, { struct io_splice* sp = &req->splice; unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; - int ret; - if (req->flags & REQ_F_NEED_CLEANUP) - return 0; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -3337,10 +3586,10 @@ static int __io_splice_prep(struct io_kiocb *req, if (unlikely(sp->flags & ~valid_flags)) return -EINVAL; - ret = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), &sp->file_in, - (sp->flags & SPLICE_F_FD_IN_FIXED)); - if (ret) - return ret; + sp->file_in = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), + (sp->flags & SPLICE_F_FD_IN_FIXED)); + if (!sp->file_in) + return -EBADF; req->flags |= REQ_F_NEED_CLEANUP; if (!S_ISREG(file_inode(sp->file_in)->i_mode)) { @@ -3508,8 +3757,6 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe const char __user *fname; int ret; - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) - return -EINVAL; if (unlikely(sqe->ioprio || sqe->buf_index)) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) @@ 
-3536,8 +3783,8 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { u64 flags, mode; - if (req->flags & REQ_F_NEED_CLEANUP) - return 0; + if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) + return -EINVAL; mode = READ_ONCE(sqe->len); flags = READ_ONCE(sqe->open_flags); req->open.how = build_open_how(flags, mode); @@ -3550,8 +3797,8 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) size_t len; int ret; - if (req->flags & REQ_F_NEED_CLEANUP) - return 0; + if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) + return -EINVAL; how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); len = READ_ONCE(sqe->len); if (len < OPEN_HOW_SIZE_VER0) @@ -3767,7 +4014,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, #if defined(CONFIG_EPOLL) if (sqe->ioprio || sqe->buf_index) return -EINVAL; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL))) return -EINVAL; req->epoll.epfd = READ_ONCE(sqe->fd); @@ -3834,7 +4081,7 @@ static int io_madvise(struct io_kiocb *req, bool force_nonblock) if (force_nonblock) return -EAGAIN; - ret = do_madvise(ma->addr, ma->len, ma->advice); + ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); if (ret < 0) req_set_fail_links(req); io_req_complete(req, ret); @@ -3882,7 +4129,7 @@ static int io_fadvise(struct io_kiocb *req, bool force_nonblock) static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL))) return -EINVAL; if (sqe->ioprio || sqe->buf_index) return -EINVAL; @@ -3938,8 +4185,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EBADF; req->close.fd = READ_ONCE(sqe->fd); - if ((req->file && req->file->f_op == &io_uring_fops) || - req->close.fd == req->ctx->ring_fd) + if ((req->file && req->file->f_op == &io_uring_fops)) return -EBADF; req->close.put_file = NULL; @@ -3969,7 +4215,7 @@ static int io_close(struct io_kiocb *req, bool force_nonblock, } /* No ->flush() or already async, safely close from here */ - ret = filp_close(close->put_file, req->work.files); + ret = filp_close(close->put_file, req->work.identity->files); if (ret < 0) req_set_fail_links(req); fput(close->put_file); @@ -4016,15 +4262,18 @@ static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock) static int io_setup_async_msg(struct io_kiocb *req, struct io_async_msghdr *kmsg) { - if (req->io) + struct io_async_msghdr *async_msg = req->async_data; + + if (async_msg) return -EAGAIN; - if (io_alloc_async_ctx(req)) { + if (io_alloc_async_data(req)) { if (kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); return -ENOMEM; } + async_msg = req->async_data; req->flags |= REQ_F_NEED_CLEANUP; - memcpy(&req->io->msg, kmsg, sizeof(*kmsg)); + memcpy(async_msg, kmsg, sizeof(*kmsg)); return -EAGAIN; } @@ -4039,8 +4288,8 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req, static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_async_msghdr *async_msg = req->async_data; struct io_sr_msg *sr = &req->sr_msg; - struct io_async_ctx *io = req->io; int ret; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) @@ -4055,13 +4304,9 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->msg_flags |= MSG_CMSG_COMPAT; #endif - if (!io || req->opcode == 
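The openat/openat2 prep changes above move the IOPOLL/SQPOLL rejection into the individual prep helpers. A minimal liburing example issuing IORING_OP_OPENAT on a default ring (neither IOPOLL nor SQPOLL), assuming liburing and an arbitrary example path:

#include <liburing.h>
#include <fcntl.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;

	if (io_uring_queue_init(4, &ring, 0))
		return 1;

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_openat(sqe, AT_FDCWD, "/etc/hostname", O_RDONLY, 0);
	io_uring_submit(&ring);

	if (io_uring_wait_cqe(&ring, &cqe) == 0) {
		printf("IORING_OP_OPENAT returned fd %d\n", cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return 0;
}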
IORING_OP_SEND) + if (!async_msg || !io_op_defs[req->opcode].needs_async_data) return 0; - /* iovec is already imported */ - if (req->flags & REQ_F_NEED_CLEANUP) - return 0; - - ret = io_sendmsg_copy_hdr(req, &io->msg); + ret = io_sendmsg_copy_hdr(req, async_msg); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; return ret; @@ -4079,9 +4324,9 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, if (unlikely(!sock)) return ret; - if (req->io) { - kmsg = &req->io->msg; - kmsg->msg.msg_name = &req->io->msg.addr; + if (req->async_data) { + kmsg = req->async_data; + kmsg->msg.msg_name = &kmsg->addr; /* if iov is set, it's allocated already */ if (!kmsg->iov) kmsg->iov = kmsg->fast_iov; @@ -4130,7 +4375,7 @@ static int io_send(struct io_kiocb *req, bool force_nonblock, ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter); if (unlikely(ret)) - return ret;; + return ret; msg.msg_name = NULL; msg.msg_control = NULL; @@ -4179,8 +4424,9 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req, sr->len); iomsg->iov = NULL; } else { - ret = import_iovec(READ, uiov, iov_len, UIO_FASTIOV, - &iomsg->iov, &iomsg->msg.msg_iter); + ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV, + &iomsg->iov, &iomsg->msg.msg_iter, + false); if (ret > 0) ret = 0; } @@ -4220,9 +4466,9 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, sr->len = iomsg->iov[0].iov_len; iomsg->iov = NULL; } else { - ret = compat_import_iovec(READ, uiov, len, UIO_FASTIOV, - &iomsg->iov, - &iomsg->msg.msg_iter); + ret = __import_iovec(READ, (struct iovec __user *)uiov, len, + UIO_FASTIOV, &iomsg->iov, + &iomsg->msg.msg_iter, true); if (ret < 0) return ret; } @@ -4268,8 +4514,8 @@ static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req) static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_async_msghdr *async_msg = req->async_data; struct io_sr_msg *sr = &req->sr_msg; - struct io_async_ctx *io = req->io; int ret; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) @@ -4285,13 +4531,9 @@ static int io_recvmsg_prep(struct io_kiocb *req, sr->msg_flags |= MSG_CMSG_COMPAT; #endif - if (!io || req->opcode == IORING_OP_RECV) - return 0; - /* iovec is already imported */ - if (req->flags & REQ_F_NEED_CLEANUP) + if (!async_msg || !io_op_defs[req->opcode].needs_async_data) return 0; - - ret = io_recvmsg_copy_hdr(req, &io->msg); + ret = io_recvmsg_copy_hdr(req, async_msg); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; return ret; @@ -4310,9 +4552,9 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, if (unlikely(!sock)) return ret; - if (req->io) { - kmsg = &req->io->msg; - kmsg->msg.msg_name = &req->io->msg.addr; + if (req->async_data) { + kmsg = req->async_data; + kmsg->msg.msg_name = &kmsg->addr; /* if iov is set, it's allocated already */ if (!kmsg->iov) kmsg->iov = kmsg->fast_iov; @@ -4454,7 +4696,7 @@ static int io_accept(struct io_kiocb *req, bool force_nonblock, static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_connect *conn = &req->connect; - struct io_async_ctx *io = req->io; + struct io_async_connect *io = req->async_data; if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) return -EINVAL; @@ -4468,22 +4710,22 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; return move_addr_to_kernel(conn->addr, conn->addr_len, - &io->connect.address); + &io->address); } static int io_connect(struct io_kiocb *req, bool force_nonblock, struct 
io_comp_state *cs) { - struct io_async_ctx __io, *io; + struct io_async_connect __io, *io; unsigned file_flags; int ret; - if (req->io) { - io = req->io; + if (req->async_data) { + io = req->async_data; } else { ret = move_addr_to_kernel(req->connect.addr, req->connect.addr_len, - &__io.connect.address); + &__io.address); if (ret) goto out; io = &__io; @@ -4491,16 +4733,17 @@ static int io_connect(struct io_kiocb *req, bool force_nonblock, file_flags = force_nonblock ? O_NONBLOCK : 0; - ret = __sys_connect_file(req->file, &io->connect.address, + ret = __sys_connect_file(req->file, &io->address, req->connect.addr_len, file_flags); if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { - if (req->io) + if (req->async_data) return -EAGAIN; - if (io_alloc_async_ctx(req)) { + if (io_alloc_async_data(req)) { ret = -ENOMEM; goto out; } - memcpy(&req->io->connect, &__io.connect, sizeof(__io.connect)); + io = req->async_data; + memcpy(req->async_data, &__io, sizeof(__io)); return -EAGAIN; } if (ret == -ERESTARTSYS) @@ -4608,7 +4851,7 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, * of executing it. We can't safely execute it anyway, as we may not * have the needed state needed for it anyway. */ - ret = io_req_task_work_add(req, &req->task_work, twa_signal_ok); + ret = io_req_task_work_add(req, twa_signal_ok); if (unlikely(ret)) { struct task_struct *tsk; @@ -4642,9 +4885,9 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll) static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req) { - /* pure poll stashes this in ->io, poll driven retry elsewhere */ + /* pure poll stashes this in ->async_data, poll driven retry elsewhere */ if (req->opcode == IORING_OP_POLL_ADD) - return (struct io_poll_iocb *) req->io; + return req->async_data; return req->apoll->double_poll; } @@ -4694,10 +4937,9 @@ static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt) hash_del(&req->hash_node); io_poll_complete(req, req->result, 0); - req->flags |= REQ_F_COMP_LOCKED; - *nxt = io_put_req_find_next(req); spin_unlock_irq(&ctx->completion_lock); + *nxt = io_put_req_find_next(req); io_cqring_ev_posted(ctx); } @@ -4724,6 +4966,8 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, if (mask && !(mask & poll->events)) return 0; + list_del_init(&wait->entry); + if (poll && poll->head) { bool done; @@ -4764,6 +5008,8 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, * for write). Setup a separate io_poll_iocb if this happens. 
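io_connect_prep()/io_connect() above copy the user sockaddr into io_async_connect at prep time and stash it in req->async_data, so an -EAGAIN retry can reuse the kernel-side copy. A small liburing usage sketch (assumed example address; the completion result may simply be a negative errno such as -ECONNREFUSED):

#include <liburing.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct sockaddr_in addr;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0 || io_uring_queue_init(4, &ring, 0))
		return 1;

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_port = htons(80);
	inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr);

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_connect(sqe, fd, (struct sockaddr *)&addr, sizeof(addr));
	io_uring_submit(&ring);

	if (io_uring_wait_cqe(&ring, &cqe) == 0) {
		printf("connect result: %d\n", cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return 0;
}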
*/ if (unlikely(poll->head)) { + struct io_poll_iocb *poll_one = poll; + /* already have a 2nd entry, fail a third attempt */ if (*poll_ptr) { pt->error = -EINVAL; @@ -4774,7 +5020,7 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, pt->error = -ENOMEM; return; } - io_init_poll_iocb(poll, req->poll.events, io_poll_double_wake); + io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake); refcount_inc(&req->refs); poll->wait.private = req; *poll_ptr = poll; @@ -4919,7 +5165,6 @@ static bool io_arm_poll_handler(struct io_kiocb *req) apoll->double_poll = NULL; req->flags |= REQ_F_POLLED; - io_get_req_task(req); req->apoll = apoll; INIT_HLIST_NODE(&req->hash_node); @@ -4928,6 +5173,12 @@ static bool io_arm_poll_handler(struct io_kiocb *req) mask |= POLLIN | POLLRDNORM; if (def->pollout) mask |= POLLOUT | POLLWRNORM; + + /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ + if ((req->opcode == IORING_OP_RECVMSG) && + (req->sr_msg.msg_flags & MSG_ERRQUEUE)) + mask &= ~POLLIN; + mask |= POLLERR | POLLPRI; ipt.pt._qproc = io_async_queue_proc; @@ -4986,15 +5237,17 @@ static bool io_poll_remove_one(struct io_kiocb *req) if (do_complete) { io_cqring_fill_event(req, -ECANCELED); io_commit_cqring(req->ctx); - req->flags |= REQ_F_COMP_LOCKED; req_set_fail_links(req); - io_put_req(req); + io_put_req_deferred(req, 1); } return do_complete; } -static void io_poll_remove_all(struct io_ring_ctx *ctx) +/* + * Returns true if we found and killed one or more poll requests + */ +static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk) { struct hlist_node *tmp; struct io_kiocb *req; @@ -5005,13 +5258,17 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx) struct hlist_head *list; list = &ctx->cancel_hash[i]; - hlist_for_each_entry_safe(req, tmp, list, hash_node) - posted += io_poll_remove_one(req); + hlist_for_each_entry_safe(req, tmp, list, hash_node) { + if (io_task_match(req, tsk)) + posted += io_poll_remove_one(req); + } } spin_unlock_irq(&ctx->completion_lock); if (posted) io_cqring_ev_posted(ctx); + + return posted != 0; } static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) @@ -5079,7 +5336,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, { struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); - __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->io); + __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data); } static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -5100,8 +5357,6 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe #endif poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP | (events & EPOLLEXCLUSIVE); - - io_get_req_task(req); return 0; } @@ -5140,16 +5395,10 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) unsigned long flags; spin_lock_irqsave(&ctx->completion_lock, flags); + list_del_init(&req->timeout.list); atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); - /* - * We could be racing with timeout deletion. If the list is empty, - * then timeout lookup already found it and will be handling it. 
- */ - if (!list_empty(&req->timeout.list)) - list_del_init(&req->timeout.list); - io_cqring_fill_event(req, -ETIME); io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); @@ -5162,18 +5411,17 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) static int __io_timeout_cancel(struct io_kiocb *req) { + struct io_timeout_data *io = req->async_data; int ret; - list_del_init(&req->timeout.list); - - ret = hrtimer_try_to_cancel(&req->io->timeout.timer); + ret = hrtimer_try_to_cancel(&io->timer); if (ret == -1) return -EALREADY; + list_del_init(&req->timeout.list); req_set_fail_links(req); - req->flags |= REQ_F_COMP_LOCKED; io_cqring_fill_event(req, -ECANCELED); - io_put_req(req); + io_put_req_deferred(req, 1); return 0; } @@ -5202,14 +5450,10 @@ static int io_timeout_remove_prep(struct io_kiocb *req, return -EINVAL; if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->len) - return -EINVAL; - - req->timeout.addr = READ_ONCE(sqe->addr); - req->timeout.flags = READ_ONCE(sqe->timeout_flags); - if (req->timeout.flags) + if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->timeout_flags) return -EINVAL; + req->timeout_rem.addr = READ_ONCE(sqe->addr); return 0; } @@ -5222,7 +5466,7 @@ static int io_timeout_remove(struct io_kiocb *req) int ret; spin_lock_irq(&ctx->completion_lock); - ret = io_timeout_cancel(ctx, req->timeout.addr); + ret = io_timeout_cancel(ctx, req->timeout_rem.addr); io_cqring_fill_event(req, ret); io_commit_cqring(ctx); @@ -5253,10 +5497,10 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, req->timeout.off = off; - if (!req->io && io_alloc_async_ctx(req)) + if (!req->async_data && io_alloc_async_data(req)) return -ENOMEM; - data = &req->io->timeout; + data = req->async_data; data->req = req; if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) @@ -5274,7 +5518,7 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, static int io_timeout(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct io_timeout_data *data = &req->io->timeout; + struct io_timeout_data *data = req->async_data; struct list_head *entry; u32 tail, off = req->timeout.off; @@ -5399,6 +5643,8 @@ static int io_async_cancel(struct io_kiocb *req) static int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + if (unlikely(req->ctx->flags & IORING_SETUP_SQPOLL)) + return -EINVAL; if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; if (sqe->ioprio || sqe->rw_flags) @@ -5435,118 +5681,86 @@ static int io_files_update(struct io_kiocb *req, bool force_nonblock, return 0; } -static int io_req_defer_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) +static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - ssize_t ret = 0; - - if (!sqe) - return 0; - - if (io_alloc_async_ctx(req)) - return -EAGAIN; - ret = io_prep_work_files(req); - if (unlikely(ret)) - return ret; - switch (req->opcode) { case IORING_OP_NOP: - break; + return 0; case IORING_OP_READV: case IORING_OP_READ_FIXED: case IORING_OP_READ: - ret = io_read_prep(req, sqe, true); - break; + return io_read_prep(req, sqe); case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: case IORING_OP_WRITE: - ret = io_write_prep(req, sqe, true); - break; + return io_write_prep(req, sqe); case IORING_OP_POLL_ADD: - ret = io_poll_add_prep(req, sqe); - break; + return io_poll_add_prep(req, sqe); case 
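The timeout hunks above keep the hrtimer state in req->async_data (struct io_timeout_data) and complete a pure timeout with -ETIME when it fires. A minimal liburing example arming a 100 ms timeout (assuming liburing is available):

#include <liburing.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct __kernel_timespec ts = { .tv_sec = 0, .tv_nsec = 100 * 1000 * 1000 };

	if (io_uring_queue_init(4, &ring, 0))
		return 1;

	sqe = io_uring_get_sqe(&ring);
	/* count == 0: pure timer, completes with -ETIME when it fires */
	io_uring_prep_timeout(sqe, &ts, 0, 0);
	io_uring_submit(&ring);

	if (io_uring_wait_cqe(&ring, &cqe) == 0) {
		printf("timeout completed with %d\n", cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return 0;
}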
IORING_OP_POLL_REMOVE: - ret = io_poll_remove_prep(req, sqe); - break; + return io_poll_remove_prep(req, sqe); case IORING_OP_FSYNC: - ret = io_prep_fsync(req, sqe); - break; + return io_prep_fsync(req, sqe); case IORING_OP_SYNC_FILE_RANGE: - ret = io_prep_sfr(req, sqe); - break; + return io_prep_sfr(req, sqe); case IORING_OP_SENDMSG: case IORING_OP_SEND: - ret = io_sendmsg_prep(req, sqe); - break; + return io_sendmsg_prep(req, sqe); case IORING_OP_RECVMSG: case IORING_OP_RECV: - ret = io_recvmsg_prep(req, sqe); - break; + return io_recvmsg_prep(req, sqe); case IORING_OP_CONNECT: - ret = io_connect_prep(req, sqe); - break; + return io_connect_prep(req, sqe); case IORING_OP_TIMEOUT: - ret = io_timeout_prep(req, sqe, false); - break; + return io_timeout_prep(req, sqe, false); case IORING_OP_TIMEOUT_REMOVE: - ret = io_timeout_remove_prep(req, sqe); - break; + return io_timeout_remove_prep(req, sqe); case IORING_OP_ASYNC_CANCEL: - ret = io_async_cancel_prep(req, sqe); - break; + return io_async_cancel_prep(req, sqe); case IORING_OP_LINK_TIMEOUT: - ret = io_timeout_prep(req, sqe, true); - break; + return io_timeout_prep(req, sqe, true); case IORING_OP_ACCEPT: - ret = io_accept_prep(req, sqe); - break; + return io_accept_prep(req, sqe); case IORING_OP_FALLOCATE: - ret = io_fallocate_prep(req, sqe); - break; + return io_fallocate_prep(req, sqe); case IORING_OP_OPENAT: - ret = io_openat_prep(req, sqe); - break; + return io_openat_prep(req, sqe); case IORING_OP_CLOSE: - ret = io_close_prep(req, sqe); - break; + return io_close_prep(req, sqe); case IORING_OP_FILES_UPDATE: - ret = io_files_update_prep(req, sqe); - break; + return io_files_update_prep(req, sqe); case IORING_OP_STATX: - ret = io_statx_prep(req, sqe); - break; + return io_statx_prep(req, sqe); case IORING_OP_FADVISE: - ret = io_fadvise_prep(req, sqe); - break; + return io_fadvise_prep(req, sqe); case IORING_OP_MADVISE: - ret = io_madvise_prep(req, sqe); - break; + return io_madvise_prep(req, sqe); case IORING_OP_OPENAT2: - ret = io_openat2_prep(req, sqe); - break; + return io_openat2_prep(req, sqe); case IORING_OP_EPOLL_CTL: - ret = io_epoll_ctl_prep(req, sqe); - break; + return io_epoll_ctl_prep(req, sqe); case IORING_OP_SPLICE: - ret = io_splice_prep(req, sqe); - break; + return io_splice_prep(req, sqe); case IORING_OP_PROVIDE_BUFFERS: - ret = io_provide_buffers_prep(req, sqe); - break; + return io_provide_buffers_prep(req, sqe); case IORING_OP_REMOVE_BUFFERS: - ret = io_remove_buffers_prep(req, sqe); - break; + return io_remove_buffers_prep(req, sqe); case IORING_OP_TEE: - ret = io_tee_prep(req, sqe); - break; - default: - printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", - req->opcode); - ret = -EINVAL; - break; + return io_tee_prep(req, sqe); } - return ret; + printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", + req->opcode); + return-EINVAL; +} + +static int io_req_defer_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + if (!sqe) + return 0; + if (io_alloc_async_data(req)) + return -EAGAIN; + return io_req_prep(req, sqe); } static u32 io_get_sequence(struct io_kiocb *req) @@ -5580,7 +5794,7 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) return 0; - if (!req->io) { + if (!req->async_data) { ret = io_req_defer_prep(req, sqe); if (ret) return ret; @@ -5606,10 +5820,24 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EIOCBQUEUED; } -static void 
__io_clean_op(struct io_kiocb *req) +static void io_req_drop_files(struct io_kiocb *req) { - struct io_async_ctx *io = req->io; + struct io_ring_ctx *ctx = req->ctx; + unsigned long flags; + spin_lock_irqsave(&ctx->inflight_lock, flags); + list_del(&req->inflight_entry); + if (waitqueue_active(&ctx->inflight_wait)) + wake_up(&ctx->inflight_wait); + spin_unlock_irqrestore(&ctx->inflight_lock, flags); + req->flags &= ~REQ_F_INFLIGHT; + put_files_struct(req->work.identity->files); + put_nsproxy(req->work.identity->nsproxy); + req->work.flags &= ~IO_WQ_WORK_FILES; +} + +static void __io_clean_op(struct io_kiocb *req) +{ if (req->flags & REQ_F_BUFFER_SELECTED) { switch (req->opcode) { case IORING_OP_READV: @@ -5632,39 +5860,39 @@ static void __io_clean_op(struct io_kiocb *req) case IORING_OP_READ: case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: - case IORING_OP_WRITE: - if (io->rw.free_iovec) - kfree(io->rw.free_iovec); + case IORING_OP_WRITE: { + struct io_async_rw *io = req->async_data; + if (io->free_iovec) + kfree(io->free_iovec); break; + } case IORING_OP_RECVMSG: - case IORING_OP_SENDMSG: - if (io->msg.iov != io->msg.fast_iov) - kfree(io->msg.iov); + case IORING_OP_SENDMSG: { + struct io_async_msghdr *io = req->async_data; + if (io->iov != io->fast_iov) + kfree(io->iov); break; + } case IORING_OP_SPLICE: case IORING_OP_TEE: io_put_file(req, req->splice.file_in, (req->splice.flags & SPLICE_F_FD_IN_FIXED)); break; + case IORING_OP_OPENAT: + case IORING_OP_OPENAT2: + if (req->open.filename) + putname(req->open.filename); + break; } req->flags &= ~REQ_F_NEED_CLEANUP; } - if (req->flags & REQ_F_INFLIGHT) { - struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - - spin_lock_irqsave(&ctx->inflight_lock, flags); - list_del(&req->inflight_entry); - if (waitqueue_active(&ctx->inflight_wait)) - wake_up(&ctx->inflight_wait); - spin_unlock_irqrestore(&ctx->inflight_lock, flags); - req->flags &= ~REQ_F_INFLIGHT; - } + if (req->flags & REQ_F_INFLIGHT) + io_req_drop_files(req); } -static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, - bool force_nonblock, struct io_comp_state *cs) +static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_ring_ctx *ctx = req->ctx; int ret; @@ -5676,221 +5904,89 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, case IORING_OP_READV: case IORING_OP_READ_FIXED: case IORING_OP_READ: - if (sqe) { - ret = io_read_prep(req, sqe, force_nonblock); - if (ret < 0) - break; - } ret = io_read(req, force_nonblock, cs); break; case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: case IORING_OP_WRITE: - if (sqe) { - ret = io_write_prep(req, sqe, force_nonblock); - if (ret < 0) - break; - } ret = io_write(req, force_nonblock, cs); break; case IORING_OP_FSYNC: - if (sqe) { - ret = io_prep_fsync(req, sqe); - if (ret < 0) - break; - } ret = io_fsync(req, force_nonblock); break; case IORING_OP_POLL_ADD: - if (sqe) { - ret = io_poll_add_prep(req, sqe); - if (ret) - break; - } ret = io_poll_add(req); break; case IORING_OP_POLL_REMOVE: - if (sqe) { - ret = io_poll_remove_prep(req, sqe); - if (ret < 0) - break; - } ret = io_poll_remove(req); break; case IORING_OP_SYNC_FILE_RANGE: - if (sqe) { - ret = io_prep_sfr(req, sqe); - if (ret < 0) - break; - } ret = io_sync_file_range(req, force_nonblock); break; case IORING_OP_SENDMSG: + ret = io_sendmsg(req, force_nonblock, cs); + break; case IORING_OP_SEND: - if (sqe) { - ret = io_sendmsg_prep(req, sqe); - if (ret < 0) - break; - } - if 
(req->opcode == IORING_OP_SENDMSG) - ret = io_sendmsg(req, force_nonblock, cs); - else - ret = io_send(req, force_nonblock, cs); + ret = io_send(req, force_nonblock, cs); break; case IORING_OP_RECVMSG: + ret = io_recvmsg(req, force_nonblock, cs); + break; case IORING_OP_RECV: - if (sqe) { - ret = io_recvmsg_prep(req, sqe); - if (ret) - break; - } - if (req->opcode == IORING_OP_RECVMSG) - ret = io_recvmsg(req, force_nonblock, cs); - else - ret = io_recv(req, force_nonblock, cs); + ret = io_recv(req, force_nonblock, cs); break; case IORING_OP_TIMEOUT: - if (sqe) { - ret = io_timeout_prep(req, sqe, false); - if (ret) - break; - } ret = io_timeout(req); break; case IORING_OP_TIMEOUT_REMOVE: - if (sqe) { - ret = io_timeout_remove_prep(req, sqe); - if (ret) - break; - } ret = io_timeout_remove(req); break; case IORING_OP_ACCEPT: - if (sqe) { - ret = io_accept_prep(req, sqe); - if (ret) - break; - } ret = io_accept(req, force_nonblock, cs); break; case IORING_OP_CONNECT: - if (sqe) { - ret = io_connect_prep(req, sqe); - if (ret) - break; - } ret = io_connect(req, force_nonblock, cs); break; case IORING_OP_ASYNC_CANCEL: - if (sqe) { - ret = io_async_cancel_prep(req, sqe); - if (ret) - break; - } ret = io_async_cancel(req); break; case IORING_OP_FALLOCATE: - if (sqe) { - ret = io_fallocate_prep(req, sqe); - if (ret) - break; - } ret = io_fallocate(req, force_nonblock); break; case IORING_OP_OPENAT: - if (sqe) { - ret = io_openat_prep(req, sqe); - if (ret) - break; - } ret = io_openat(req, force_nonblock); break; case IORING_OP_CLOSE: - if (sqe) { - ret = io_close_prep(req, sqe); - if (ret) - break; - } ret = io_close(req, force_nonblock, cs); break; case IORING_OP_FILES_UPDATE: - if (sqe) { - ret = io_files_update_prep(req, sqe); - if (ret) - break; - } ret = io_files_update(req, force_nonblock, cs); break; case IORING_OP_STATX: - if (sqe) { - ret = io_statx_prep(req, sqe); - if (ret) - break; - } ret = io_statx(req, force_nonblock); break; case IORING_OP_FADVISE: - if (sqe) { - ret = io_fadvise_prep(req, sqe); - if (ret) - break; - } ret = io_fadvise(req, force_nonblock); break; case IORING_OP_MADVISE: - if (sqe) { - ret = io_madvise_prep(req, sqe); - if (ret) - break; - } ret = io_madvise(req, force_nonblock); break; case IORING_OP_OPENAT2: - if (sqe) { - ret = io_openat2_prep(req, sqe); - if (ret) - break; - } ret = io_openat2(req, force_nonblock); break; case IORING_OP_EPOLL_CTL: - if (sqe) { - ret = io_epoll_ctl_prep(req, sqe); - if (ret) - break; - } ret = io_epoll_ctl(req, force_nonblock, cs); break; case IORING_OP_SPLICE: - if (sqe) { - ret = io_splice_prep(req, sqe); - if (ret < 0) - break; - } ret = io_splice(req, force_nonblock); break; case IORING_OP_PROVIDE_BUFFERS: - if (sqe) { - ret = io_provide_buffers_prep(req, sqe); - if (ret) - break; - } ret = io_provide_buffers(req, force_nonblock, cs); break; case IORING_OP_REMOVE_BUFFERS: - if (sqe) { - ret = io_remove_buffers_prep(req, sqe); - if (ret) - break; - } ret = io_remove_buffers(req, force_nonblock, cs); break; case IORING_OP_TEE: - if (sqe) { - ret = io_tee_prep(req, sqe); - if (ret < 0) - break; - } ret = io_tee(req, force_nonblock); break; default: @@ -5936,7 +6032,7 @@ static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work) if (!ret) { do { - ret = io_issue_sqe(req, NULL, false, NULL); + ret = io_issue_sqe(req, false, NULL); /* * We can get EAGAIN for polled IO even though we're * forcing a sync submission from here, since we can't @@ -5965,20 +6061,19 @@ static inline struct file *io_file_from_index(struct 
io_ring_ctx *ctx, return table->files[index & IORING_FILE_TABLE_MASK]; } -static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, - int fd, struct file **out_file, bool fixed) +static struct file *io_file_get(struct io_submit_state *state, + struct io_kiocb *req, int fd, bool fixed) { struct io_ring_ctx *ctx = req->ctx; struct file *file; if (fixed) { - if (unlikely(!ctx->file_data || - (unsigned) fd >= ctx->nr_user_files)) - return -EBADF; + if (unlikely((unsigned int)fd >= ctx->nr_user_files)) + return NULL; fd = array_index_nospec(fd, ctx->nr_user_files); file = io_file_from_index(ctx, fd); if (file) { - req->fixed_file_refs = ctx->file_data->cur_refs; + req->fixed_file_refs = &ctx->file_data->node->refs; percpu_ref_get(req->fixed_file_refs); } } else { @@ -5986,11 +6081,7 @@ static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, file = __io_file_get(state, fd); } - if (file || io_op_defs[req->opcode].needs_file_no_error) { - *out_file = file; - return 0; - } - return -EBADF; + return file; } static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, @@ -6002,46 +6093,10 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, if (unlikely(!fixed && io_async_submit(req->ctx))) return -EBADF; - return io_file_get(state, req, fd, &req->file, fixed); -} - -static int io_grab_files(struct io_kiocb *req) -{ - int ret = -EBADF; - struct io_ring_ctx *ctx = req->ctx; - - io_req_init_async(req); - - if (req->work.files || (req->flags & REQ_F_NO_FILE_TABLE)) + req->file = io_file_get(state, req, fd, fixed); + if (req->file || io_op_defs[req->opcode].needs_file_no_error) return 0; - if (!ctx->ring_file) - return -EBADF; - - rcu_read_lock(); - spin_lock_irq(&ctx->inflight_lock); - /* - * We use the f_ops->flush() handler to ensure that we can flush - * out work accessing these files if the fd is closed. Check if - * the fd has changed since we started down this path, and disallow - * this operation if it has. 
- */ - if (fcheck(ctx->ring_fd) == ctx->ring_file) { - list_add(&req->inflight_entry, &ctx->inflight_list); - req->flags |= REQ_F_INFLIGHT; - req->work.files = current->files; - ret = 0; - } - spin_unlock_irq(&ctx->inflight_lock); - rcu_read_unlock(); - - return ret; -} - -static inline int io_prep_work_files(struct io_kiocb *req) -{ - if (!io_op_defs[req->opcode].file_table) - return 0; - return io_grab_files(req); + return -EBADF; } static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) @@ -6088,7 +6143,7 @@ static void __io_queue_linked_timeout(struct io_kiocb *req) * we got a chance to setup the timer */ if (!list_empty(&req->link_list)) { - struct io_timeout_data *data = &req->io->timeout; + struct io_timeout_data *data = req->async_data; data->timer.function = io_link_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), @@ -6126,8 +6181,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) return nxt; } -static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_comp_state *cs) +static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs) { struct io_kiocb *linked_timeout; struct io_kiocb *nxt; @@ -6137,17 +6191,18 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, again: linked_timeout = io_prep_linked_timeout(req); - if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.creds && - req->work.creds != current_cred()) { + if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.identity->creds && + req->work.identity->creds != current_cred()) { if (old_creds) revert_creds(old_creds); - if (old_creds == req->work.creds) + if (old_creds == req->work.identity->creds) old_creds = NULL; /* restored original creds */ else - old_creds = override_creds(req->work.creds); + old_creds = override_creds(req->work.identity->creds); + req->work.flags |= IO_WQ_WORK_CREDS; } - ret = io_issue_sqe(req, sqe, true, cs); + ret = io_issue_sqe(req, true, cs); /* * We async punt it if the file wasn't marked NOWAIT, or if the file @@ -6156,9 +6211,6 @@ again: if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { if (!io_arm_poll_handler(req)) { punt: - ret = io_prep_work_files(req); - if (unlikely(ret)) - goto err; /* * Queued up for async execution, worker will release * submit reference when the iocb is actually submitted. 
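The __io_queue_sqe() hunk above switches per-request credentials from a bare struct cred to the new io_identity (req->work.identity->creds plus IO_WQ_WORK_CREDS), and the io_init_req() hunk that follows looks a personality id up in personality_idr and takes a reference on that identity. A minimal userspace sketch of the consumer side of this interface, not part of this patch, assuming liburing's io_uring_register_personality()/io_uring_unregister_personality() helpers and with error handling abbreviated:

#include <liburing.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int id, ret;

	if (io_uring_queue_init(8, &ring, 0) < 0)
		return 1;

	/* snapshot this task's current credentials; returns a personality id */
	id = io_uring_register_personality(&ring);
	if (id < 0)
		return 1;

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_nop(sqe);
	sqe->personality = id;	/* issue this SQE under the registered creds */

	io_uring_submit(&ring);
	ret = io_uring_wait_cqe(&ring, &cqe);
	if (!ret) {
		printf("nop: res=%d\n", cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}

	io_uring_unregister_personality(&ring, id);
	io_uring_queue_exit(&ring);
	return 0;
}

With this series, unregistering only drops the identity's refcount, so a request that already took its reference keeps valid credentials until it completes.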
@@ -6172,7 +6224,6 @@ punt: } if (unlikely(ret)) { -err: /* un-prep timeout, so it'll be killed as any other linked */ req->flags &= ~REQ_F_LINK_TIMEOUT; req_set_fail_links(req); @@ -6212,7 +6263,7 @@ fail_req: io_req_complete(req, ret); } } else if (req->flags & REQ_F_FORCE_ASYNC) { - if (!req->io) { + if (!req->async_data) { ret = io_req_defer_prep(req, sqe); if (unlikely(ret)) goto fail_req; @@ -6226,7 +6277,12 @@ fail_req: req->work.flags |= IO_WQ_WORK_CONCURRENT; io_queue_async_work(req); } else { - __io_queue_sqe(req, sqe, cs); + if (sqe) { + ret = io_req_prep(req, sqe); + if (unlikely(ret)) + goto fail_req; + } + __io_queue_sqe(req, cs); } } @@ -6274,7 +6330,6 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, return ret; } trace_io_uring_link(ctx, req, head); - io_get_req_task(req); list_add_tail(&req->link_list, &head->link_list); /* last request of a link, enqueue the link */ @@ -6323,9 +6378,6 @@ static void io_submit_state_start(struct io_submit_state *state, struct io_ring_ctx *ctx, unsigned int max_ios) { blk_start_plug(&state->plug); -#ifdef CONFIG_BLOCK - state->plug.nowait = true; -#endif state->comp.nr = 0; INIT_LIST_HEAD(&state->comp.list); state->comp.ctx = ctx; @@ -6382,6 +6434,32 @@ static inline void io_consume_sqe(struct io_ring_ctx *ctx) ctx->cached_sq_head++; } +/* + * Check SQE restrictions (opcode and flags). + * + * Returns 'true' if SQE is allowed, 'false' otherwise. + */ +static inline bool io_check_restriction(struct io_ring_ctx *ctx, + struct io_kiocb *req, + unsigned int sqe_flags) +{ + if (!ctx->restricted) + return true; + + if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) + return false; + + if ((sqe_flags & ctx->restrictions.sqe_flags_required) != + ctx->restrictions.sqe_flags_required) + return false; + + if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed | + ctx->restrictions.sqe_flags_required)) + return false; + + return true; +} + #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ IOSQE_IO_HARDLINK | IOSQE_ASYNC | \ IOSQE_BUFFER_SELECT) @@ -6391,11 +6469,11 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, struct io_submit_state *state) { unsigned int sqe_flags; - int id; + int id, ret; req->opcode = READ_ONCE(sqe->opcode); req->user_data = READ_ONCE(sqe->user_data); - req->io = NULL; + req->async_data = NULL; req->file = NULL; req->ctx = ctx; req->flags = 0; @@ -6415,17 +6493,26 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) return -EINVAL; + if (unlikely(!io_check_restriction(ctx, req, sqe_flags))) + return -EACCES; + if ((sqe_flags & IOSQE_BUFFER_SELECT) && !io_op_defs[req->opcode].buffer_select) return -EOPNOTSUPP; id = READ_ONCE(sqe->personality); if (id) { + struct io_identity *iod; + io_req_init_async(req); - req->work.creds = idr_find(&ctx->personality_idr, id); - if (unlikely(!req->work.creds)) + iod = idr_find(&ctx->personality_idr, id); + if (unlikely(!iod)) return -EINVAL; - get_cred(req->work.creds); + refcount_inc(&iod->count); + io_put_identity(current->io_uring, req); + get_cred(iod->creds); + req->work.identity = iod; + req->work.flags |= IO_WQ_WORK_CREDS; } /* same numerical values with corresponding REQ_F_*, safe to copy */ @@ -6434,11 +6521,12 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, if (!io_op_defs[req->opcode].needs_file) return 0; - return io_req_set_file(state, req, READ_ONCE(sqe->fd)); + ret = io_req_set_file(state, req, 
READ_ONCE(sqe->fd)); + state->ios_left--; + return ret; } -static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, - struct file *ring_file, int ring_fd) +static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) { struct io_submit_state state; struct io_kiocb *link = NULL; @@ -6447,7 +6535,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, /* if we have a backlog and couldn't flush it all, return BUSY */ if (test_bit(0, &ctx->sq_check_overflow)) { if (!list_empty(&ctx->cq_overflow_list) && - !io_cqring_overflow_flush(ctx, false)) + !io_cqring_overflow_flush(ctx, false, NULL, NULL)) return -EBUSY; } @@ -6457,10 +6545,10 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, if (!percpu_ref_tryget_many(&ctx->refs, nr)) return -EAGAIN; - io_submit_state_start(&state, ctx, nr); + percpu_counter_add(&current->io_uring->inflight, nr); + refcount_add(nr, &current->usage); - ctx->ring_fd = ring_fd; - ctx->ring_file = ring_file; + io_submit_state_start(&state, ctx, nr); for (i = 0; i < nr; i++) { const struct io_uring_sqe *sqe; @@ -6478,12 +6566,11 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, submitted = -EAGAIN; break; } - - err = io_init_req(ctx, req, sqe, &state); io_consume_sqe(ctx); /* will complete beyond this point, count as submitted */ submitted++; + err = io_init_req(ctx, req, sqe, &state); if (unlikely(err)) { fail_req: io_put_req(req); @@ -6500,8 +6587,12 @@ fail_req: if (unlikely(submitted != nr)) { int ref_used = (submitted == -EAGAIN) ? 0 : submitted; + struct io_uring_task *tctx = current->io_uring; + int unused = nr - ref_used; - percpu_ref_put_many(&ctx->refs, nr - ref_used); + percpu_ref_put_many(&ctx->refs, unused); + percpu_counter_sub(&tctx->inflight, unused); + put_task_struct_many(current, unused); } if (link) io_queue_link_head(link, &state.comp); @@ -6528,117 +6619,190 @@ static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx) spin_unlock_irq(&ctx->completion_lock); } -static int io_sq_thread(void *data) +static int io_sq_wake_function(struct wait_queue_entry *wqe, unsigned mode, + int sync, void *key) { - struct io_ring_ctx *ctx = data; - const struct cred *old_cred; - DEFINE_WAIT(wait); - unsigned long timeout; + struct io_ring_ctx *ctx = container_of(wqe, struct io_ring_ctx, sqo_wait_entry); + int ret; + + ret = autoremove_wake_function(wqe, mode, sync, key); + if (ret) { + unsigned long flags; + + spin_lock_irqsave(&ctx->completion_lock, flags); + ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; + spin_unlock_irqrestore(&ctx->completion_lock, flags); + } + return ret; +} + +enum sq_ret { + SQT_IDLE = 1, + SQT_SPIN = 2, + SQT_DID_WORK = 4, +}; + +static enum sq_ret __io_sq_thread(struct io_ring_ctx *ctx, + unsigned long start_jiffies, bool cap_entries) +{ + unsigned long timeout = start_jiffies + ctx->sq_thread_idle; + struct io_sq_data *sqd = ctx->sq_data; + unsigned int to_submit; int ret = 0; - complete(&ctx->sq_thread_comp); +again: + if (!list_empty(&ctx->iopoll_list)) { + unsigned nr_events = 0; + + mutex_lock(&ctx->uring_lock); + if (!list_empty(&ctx->iopoll_list) && !need_resched()) + io_do_iopoll(ctx, &nr_events, 0); + mutex_unlock(&ctx->uring_lock); + } + + to_submit = io_sqring_entries(ctx); - old_cred = override_creds(ctx->creds); + /* + * If submit got -EBUSY, flag us as needing the application + * to enter the kernel to reap and flush events.
+ */ + if (!to_submit || ret == -EBUSY || need_resched()) { + /* + * Drop cur_mm before scheduling, we can't hold it for + * long periods (or over schedule()). Do this before + * adding ourselves to the waitqueue, as the unuse/drop + * may sleep. + */ + io_sq_thread_drop_mm(); - timeout = jiffies + ctx->sq_thread_idle; - while (!kthread_should_park()) { - unsigned int to_submit; + /* + * We're polling. If we're within the defined idle + * period, then let us spin without work before going + * to sleep. The exception is if we got EBUSY doing + * more IO, we should wait for the application to + * reap events and wake us up. + */ + if (!list_empty(&ctx->iopoll_list) || need_resched() || + (!time_after(jiffies, timeout) && ret != -EBUSY && + !percpu_ref_is_dying(&ctx->refs))) + return SQT_SPIN; - if (!list_empty(&ctx->iopoll_list)) { - unsigned nr_events = 0; + prepare_to_wait(&sqd->wait, &ctx->sqo_wait_entry, + TASK_INTERRUPTIBLE); - mutex_lock(&ctx->uring_lock); - if (!list_empty(&ctx->iopoll_list) && !need_resched()) - io_do_iopoll(ctx, &nr_events, 0); - else - timeout = jiffies + ctx->sq_thread_idle; - mutex_unlock(&ctx->uring_lock); + /* + * While doing polled IO, before going to sleep, we need + * to check if there are new reqs added to iopoll_list, + * it is because reqs may have been punted to io worker + * and will be added to iopoll_list later, hence check + * the iopoll_list again. + */ + if ((ctx->flags & IORING_SETUP_IOPOLL) && + !list_empty_careful(&ctx->iopoll_list)) { + finish_wait(&sqd->wait, &ctx->sqo_wait_entry); + goto again; } to_submit = io_sqring_entries(ctx); + if (!to_submit || ret == -EBUSY) + return SQT_IDLE; + } + + finish_wait(&sqd->wait, &ctx->sqo_wait_entry); + io_ring_clear_wakeup_flag(ctx); + + /* if we're handling multiple rings, cap submit size for fairness */ + if (cap_entries && to_submit > 8) + to_submit = 8; + + mutex_lock(&ctx->uring_lock); + if (likely(!percpu_ref_is_dying(&ctx->refs))) + ret = io_submit_sqes(ctx, to_submit); + mutex_unlock(&ctx->uring_lock); + + if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait)) + wake_up(&ctx->sqo_sq_wait); + + return SQT_DID_WORK; +} + +static void io_sqd_init_new(struct io_sq_data *sqd) +{ + struct io_ring_ctx *ctx; + + while (!list_empty(&sqd->ctx_new_list)) { + ctx = list_first_entry(&sqd->ctx_new_list, struct io_ring_ctx, sqd_list); + init_wait(&ctx->sqo_wait_entry); + ctx->sqo_wait_entry.func = io_sq_wake_function; + list_move_tail(&ctx->sqd_list, &sqd->ctx_list); + complete(&ctx->sq_thread_comp); + } +} + +static int io_sq_thread(void *data) +{ + struct cgroup_subsys_state *cur_css = NULL; + const struct cred *old_cred = NULL; + struct io_sq_data *sqd = data; + struct io_ring_ctx *ctx; + unsigned long start_jiffies; + + start_jiffies = jiffies; + while (!kthread_should_stop()) { + enum sq_ret ret = 0; + bool cap_entries; /* - * If submit got -EBUSY, flag us as needing the application - * to enter the kernel to reap and flush events. + * Any changes to the sqd lists are synchronized through the + * kthread parking. This synchronizes the thread vs users, + * the users are synchronized on the sqd->ctx_lock. */ - if (!to_submit || ret == -EBUSY || need_resched()) { - /* - * Drop cur_mm before scheduling, we can't hold it for - * long periods (or over schedule()). Do this before - * adding ourselves to the waitqueue, as the unuse/drop - * may sleep. - */ - io_sq_thread_drop_mm(); + if (kthread_should_park()) + kthread_parkme(); - /* - * We're polling. 
If we're within the defined idle - * period, then let us spin without work before going - * to sleep. The exception is if we got EBUSY doing - * more IO, we should wait for the application to - * reap events and wake us up. - */ - if (!list_empty(&ctx->iopoll_list) || need_resched() || - (!time_after(jiffies, timeout) && ret != -EBUSY && - !percpu_ref_is_dying(&ctx->refs))) { - io_run_task_work(); - cond_resched(); - continue; - } + if (unlikely(!list_empty(&sqd->ctx_new_list))) + io_sqd_init_new(sqd); - prepare_to_wait(&ctx->sqo_wait, &wait, - TASK_INTERRUPTIBLE); + cap_entries = !list_is_singular(&sqd->ctx_list); - /* - * While doing polled IO, before going to sleep, we need - * to check if there are new reqs added to iopoll_list, - * it is because reqs may have been punted to io worker - * and will be added to iopoll_list later, hence check - * the iopoll_list again. - */ - if ((ctx->flags & IORING_SETUP_IOPOLL) && - !list_empty_careful(&ctx->iopoll_list)) { - finish_wait(&ctx->sqo_wait, &wait); - continue; + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { + if (current->cred != ctx->creds) { + if (old_cred) + revert_creds(old_cred); + old_cred = override_creds(ctx->creds); } + io_sq_thread_associate_blkcg(ctx, &cur_css); +#ifdef CONFIG_AUDIT + current->loginuid = ctx->loginuid; + current->sessionid = ctx->sessionid; +#endif - io_ring_set_wakeup_flag(ctx); + ret |= __io_sq_thread(ctx, start_jiffies, cap_entries); - to_submit = io_sqring_entries(ctx); - if (!to_submit || ret == -EBUSY) { - if (kthread_should_park()) { - finish_wait(&ctx->sqo_wait, &wait); - break; - } - if (io_run_task_work()) { - finish_wait(&ctx->sqo_wait, &wait); - io_ring_clear_wakeup_flag(ctx); - continue; - } - if (signal_pending(current)) - flush_signals(current); - schedule(); - finish_wait(&ctx->sqo_wait, &wait); + io_sq_thread_drop_mm(); + } - io_ring_clear_wakeup_flag(ctx); - ret = 0; + if (ret & SQT_SPIN) { + io_run_task_work(); + cond_resched(); + } else if (ret == SQT_IDLE) { + if (kthread_should_park()) continue; - } - finish_wait(&ctx->sqo_wait, &wait); - - io_ring_clear_wakeup_flag(ctx); + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) + io_ring_set_wakeup_flag(ctx); + schedule(); + start_jiffies = jiffies; + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) + io_ring_clear_wakeup_flag(ctx); } - - mutex_lock(&ctx->uring_lock); - if (likely(!percpu_ref_is_dying(&ctx->refs))) - ret = io_submit_sqes(ctx, to_submit, NULL, -1); - mutex_unlock(&ctx->uring_lock); - timeout = jiffies + ctx->sq_thread_idle; } io_run_task_work(); - io_sq_thread_drop_mm(); - revert_creds(old_cred); + if (cur_css) + io_sq_thread_unassociate_blkcg(); + if (old_cred) + revert_creds(old_cred); kthread_parkme(); @@ -6678,6 +6842,22 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, return autoremove_wake_function(curr, mode, wake_flags, key); } +static int io_run_task_work_sig(void) +{ + if (io_run_task_work()) + return 1; + if (!signal_pending(current)) + return 0; + if (current->jobctl & JOBCTL_TASK_WORK) { + spin_lock_irq(&current->sighand->siglock); + current->jobctl &= ~JOBCTL_TASK_WORK; + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); + return 1; + } + return -EINTR; +} + /* * Wait until events become available, if we don't already have some. The * application must reap them itself, as they reside on the shared cq ring.
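With the io_sq_data rework above, a single io_uring-sq kthread can service several rings: io_sq_thread() walks sqd->ctx_list, and __io_sq_thread() caps per-ring submissions for fairness when more than one ring is attached. The attach path (io_attach_sq_data(), further down) reuses an existing ring's thread when IORING_SETUP_ATTACH_WQ is set and params->wq_fd names that ring. A hedged userspace sketch, not part of this patch, assuming liburing's io_uring_queue_init_params() and a kernel with this series applied; note that SQPOLL still requires CAP_SYS_ADMIN at this point:

#include <liburing.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
	struct io_uring_params p1, p2;
	struct io_uring a, b;
	int ret;

	memset(&p1, 0, sizeof(p1));
	p1.flags = IORING_SETUP_SQPOLL;
	p1.sq_thread_idle = 2000;	/* ms of idle spinning before sleeping */
	ret = io_uring_queue_init_params(8, &a, &p1);
	if (ret < 0) {
		fprintf(stderr, "first ring: %d\n", ret);
		return 1;
	}

	/* second ring attaches to the first ring's SQPOLL thread */
	memset(&p2, 0, sizeof(p2));
	p2.flags = IORING_SETUP_SQPOLL | IORING_SETUP_ATTACH_WQ;
	p2.wq_fd = a.ring_fd;
	ret = io_uring_queue_init_params(8, &b, &p2);
	if (ret < 0) {
		fprintf(stderr, "attach: %d\n", ret);
		io_uring_queue_exit(&a);
		return 1;
	}

	printf("both rings are served by one io_uring-sq thread\n");
	io_uring_queue_exit(&b);
	io_uring_queue_exit(&a);
	return 0;
}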
@@ -6723,19 +6903,11 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, TASK_INTERRUPTIBLE); /* make sure we run task_work before checking for signals */ - if (io_run_task_work()) + ret = io_run_task_work_sig(); + if (ret > 0) continue; - if (signal_pending(current)) { - if (current->jobctl & JOBCTL_TASK_WORK) { - spin_lock_irq(&current->sighand->siglock); - current->jobctl &= ~JOBCTL_TASK_WORK; - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); - continue; - } - ret = -EINTR; + else if (ret < 0) break; - } if (io_should_wake(&iowq, false)) break; schedule(); @@ -6813,18 +6985,116 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) return 0; } -static void io_sq_thread_stop(struct io_ring_ctx *ctx) +static void io_put_sq_data(struct io_sq_data *sqd) { - if (ctx->sqo_thread) { - wait_for_completion(&ctx->sq_thread_comp); + if (refcount_dec_and_test(&sqd->refs)) { /* * The park is a bit of a work-around, without it we get * warning spews on shutdown with SQPOLL set and affinity * set to a single CPU. */ - kthread_park(ctx->sqo_thread); - kthread_stop(ctx->sqo_thread); - ctx->sqo_thread = NULL; + if (sqd->thread) { + kthread_park(sqd->thread); + kthread_stop(sqd->thread); + } + + kfree(sqd); + } +} + +static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p) +{ + struct io_ring_ctx *ctx_attach; + struct io_sq_data *sqd; + struct fd f; + + f = fdget(p->wq_fd); + if (!f.file) + return ERR_PTR(-ENXIO); + if (f.file->f_op != &io_uring_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + ctx_attach = f.file->private_data; + sqd = ctx_attach->sq_data; + if (!sqd) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + refcount_inc(&sqd->refs); + fdput(f); + return sqd; +} + +static struct io_sq_data *io_get_sq_data(struct io_uring_params *p) +{ + struct io_sq_data *sqd; + + if (p->flags & IORING_SETUP_ATTACH_WQ) + return io_attach_sq_data(p); + + sqd = kzalloc(sizeof(*sqd), GFP_KERNEL); + if (!sqd) + return ERR_PTR(-ENOMEM); + + refcount_set(&sqd->refs, 1); + INIT_LIST_HEAD(&sqd->ctx_list); + INIT_LIST_HEAD(&sqd->ctx_new_list); + mutex_init(&sqd->ctx_lock); + mutex_init(&sqd->lock); + init_waitqueue_head(&sqd->wait); + return sqd; +} + +static void io_sq_thread_unpark(struct io_sq_data *sqd) + __releases(&sqd->lock) +{ + if (!sqd->thread) + return; + kthread_unpark(sqd->thread); + mutex_unlock(&sqd->lock); +} + +static void io_sq_thread_park(struct io_sq_data *sqd) + __acquires(&sqd->lock) +{ + if (!sqd->thread) + return; + mutex_lock(&sqd->lock); + kthread_park(sqd->thread); +} + +static void io_sq_thread_stop(struct io_ring_ctx *ctx) +{ + struct io_sq_data *sqd = ctx->sq_data; + + if (sqd) { + if (sqd->thread) { + /* + * We may arrive here from the error branch in + * io_sq_offload_create() where the kthread is created + * without being waked up, thus wake it up now to make + * sure the wait will complete.
+ */ + wake_up_process(sqd->thread); + wait_for_completion(&ctx->sq_thread_comp); + + io_sq_thread_park(sqd); + } + + mutex_lock(&sqd->ctx_lock); + list_del(&ctx->sqd_list); + mutex_unlock(&sqd->ctx_lock); + + if (sqd->thread) { + finish_wait(&sqd->wait, &ctx->sqo_wait_entry); + io_sq_thread_unpark(sqd); + } + + io_put_sq_data(sqd); + ctx->sq_data = NULL; } } @@ -6935,13 +7205,13 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx) } #endif -static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables, - unsigned nr_files) +static int io_sqe_alloc_file_tables(struct fixed_file_data *file_data, + unsigned nr_tables, unsigned nr_files) { int i; for (i = 0; i < nr_tables; i++) { - struct fixed_file_table *table = &ctx->file_data->table[i]; + struct fixed_file_table *table = &file_data->table[i]; unsigned this_files; this_files = min(nr_files, IORING_MAX_FILES_TABLE); @@ -6956,7 +7226,7 @@ static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables, return 0; for (i = 0; i < nr_tables; i++) { - struct fixed_file_table *table = &ctx->file_data->table[i]; + struct fixed_file_table *table = &file_data->table[i]; kfree(table->files); } return 1; @@ -7118,11 +7388,11 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { __s32 __user *fds = (__s32 __user *) arg; - unsigned nr_tables; + unsigned nr_tables, i; struct file *file; - int fd, ret = 0; - unsigned i; + int fd, ret = -ENOMEM; struct fixed_file_ref_node *ref_node; + struct fixed_file_data *file_data; if (ctx->file_data) return -EBUSY; @@ -7131,60 +7401,44 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, if (nr_args > IORING_MAX_FIXED_FILES) return -EMFILE; - ctx->file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL); - if (!ctx->file_data) + file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL); + if (!file_data) return -ENOMEM; - ctx->file_data->ctx = ctx; - init_completion(&ctx->file_data->done); - INIT_LIST_HEAD(&ctx->file_data->ref_list); - spin_lock_init(&ctx->file_data->lock); + file_data->ctx = ctx; + init_completion(&file_data->done); + INIT_LIST_HEAD(&file_data->ref_list); + spin_lock_init(&file_data->lock); nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE); - ctx->file_data->table = kcalloc(nr_tables, - sizeof(struct fixed_file_table), - GFP_KERNEL); - if (!ctx->file_data->table) { - kfree(ctx->file_data); - ctx->file_data = NULL; - return -ENOMEM; - } + file_data->table = kcalloc(nr_tables, sizeof(*file_data->table), + GFP_KERNEL); + if (!file_data->table) + goto out_free; - if (percpu_ref_init(&ctx->file_data->refs, io_file_ref_kill, - PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { - kfree(ctx->file_data->table); - kfree(ctx->file_data); - ctx->file_data = NULL; - return -ENOMEM; - } + if (percpu_ref_init(&file_data->refs, io_file_ref_kill, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) + goto out_free; - if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) { - percpu_ref_exit(&ctx->file_data->refs); - kfree(ctx->file_data->table); - kfree(ctx->file_data); - ctx->file_data = NULL; - return -ENOMEM; - } + if (io_sqe_alloc_file_tables(file_data, nr_tables, nr_args)) + goto out_ref; + ctx->file_data = file_data; for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { struct fixed_file_table *table; unsigned index; - ret = -EFAULT; - if (copy_from_user(&fd, &fds[i], sizeof(fd))) - break; + if (copy_from_user(&fd, &fds[i], sizeof(fd))) { + ret = -EFAULT; + goto out_fput; + } /* allow sparse sets */ - if (fd == -1) { - 
ret = 0; + if (fd == -1) continue; - } - table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT]; - index = i & IORING_FILE_TABLE_MASK; file = fget(fd); - ret = -EBADF; if (!file) - break; + goto out_fput; /* * Don't allow io_uring instances to be registered. If UNIX @@ -7195,29 +7449,13 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, */ if (file->f_op == &io_uring_fops) { fput(file); - break; + goto out_fput; } - ret = 0; + table = &file_data->table[i >> IORING_FILE_TABLE_SHIFT]; + index = i & IORING_FILE_TABLE_MASK; table->files[index] = file; } - if (ret) { - for (i = 0; i < ctx->nr_user_files; i++) { - file = io_file_from_index(ctx, i); - if (file) - fput(file); - } - for (i = 0; i < nr_tables; i++) - kfree(ctx->file_data->table[i].files); - - percpu_ref_exit(&ctx->file_data->refs); - kfree(ctx->file_data->table); - kfree(ctx->file_data); - ctx->file_data = NULL; - ctx->nr_user_files = 0; - return ret; - } - ret = io_sqe_files_scm(ctx); if (ret) { io_sqe_files_unregister(ctx); @@ -7230,11 +7468,27 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return PTR_ERR(ref_node); } - ctx->file_data->cur_refs = &ref_node->refs; - spin_lock(&ctx->file_data->lock); - list_add(&ref_node->node, &ctx->file_data->ref_list); - spin_unlock(&ctx->file_data->lock); - percpu_ref_get(&ctx->file_data->refs); + file_data->node = ref_node; + spin_lock(&file_data->lock); + list_add(&ref_node->node, &file_data->ref_list); + spin_unlock(&file_data->lock); + percpu_ref_get(&file_data->refs); + return ret; +out_fput: + for (i = 0; i < ctx->nr_user_files; i++) { + file = io_file_from_index(ctx, i); + if (file) + fput(file); + } + for (i = 0; i < nr_tables; i++) + kfree(file_data->table[i].files); + ctx->nr_user_files = 0; +out_ref: + percpu_ref_exit(&file_data->refs); +out_free: + kfree(file_data->table); + kfree(file_data); + ctx->file_data = NULL; return ret; } @@ -7285,14 +7539,12 @@ static int io_queue_file_removal(struct fixed_file_data *data, struct file *file) { struct io_file_put *pfile; - struct percpu_ref *refs = data->cur_refs; - struct fixed_file_ref_node *ref_node; + struct fixed_file_ref_node *ref_node = data->node; pfile = kzalloc(sizeof(*pfile), GFP_KERNEL); if (!pfile) return -ENOMEM; - ref_node = container_of(refs, struct fixed_file_ref_node, refs); pfile->file = file; list_add(&pfile->list, &ref_node->file_list); @@ -7375,10 +7627,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, } if (needs_switch) { - percpu_ref_kill(data->cur_refs); + percpu_ref_kill(&data->node->refs); spin_lock(&data->lock); list_add(&ref_node->node, &data->ref_list); - data->cur_refs = &ref_node->refs; + data->node = ref_node; spin_unlock(&data->lock); percpu_ref_get(&ctx->file_data->refs); } else @@ -7459,20 +7711,76 @@ out_fput: return ret; } -static int io_sq_offload_start(struct io_ring_ctx *ctx, - struct io_uring_params *p) +static int io_uring_alloc_task_context(struct task_struct *task) +{ + struct io_uring_task *tctx; + int ret; + + tctx = kmalloc(sizeof(*tctx), GFP_KERNEL); + if (unlikely(!tctx)) + return -ENOMEM; + + ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL); + if (unlikely(ret)) { + kfree(tctx); + return ret; + } + + xa_init(&tctx->xa); + init_waitqueue_head(&tctx->wait); + tctx->last = NULL; + tctx->in_idle = 0; + io_init_identity(&tctx->__identity); + tctx->identity = &tctx->__identity; + task->io_uring = tctx; + return 0; +} + +void __io_uring_free(struct task_struct *tsk) +{ + struct io_uring_task *tctx = 
tsk->io_uring; + + WARN_ON_ONCE(!xa_empty(&tctx->xa)); + WARN_ON_ONCE(refcount_read(&tctx->identity->count) != 1); + if (tctx->identity != &tctx->__identity) + kfree(tctx->identity); + percpu_counter_destroy(&tctx->inflight); + kfree(tctx); + tsk->io_uring = NULL; +} + +static int io_sq_offload_create(struct io_ring_ctx *ctx, + struct io_uring_params *p) { int ret; if (ctx->flags & IORING_SETUP_SQPOLL) { + struct io_sq_data *sqd; + ret = -EPERM; if (!capable(CAP_SYS_ADMIN)) goto err; + sqd = io_get_sq_data(p); + if (IS_ERR(sqd)) { + ret = PTR_ERR(sqd); + goto err; + } + + ctx->sq_data = sqd; + io_sq_thread_park(sqd); + mutex_lock(&sqd->ctx_lock); + list_add(&ctx->sqd_list, &sqd->ctx_new_list); + mutex_unlock(&sqd->ctx_lock); + io_sq_thread_unpark(sqd); + ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle); if (!ctx->sq_thread_idle) ctx->sq_thread_idle = HZ; + if (sqd->thread) + goto done; + if (p->flags & IORING_SETUP_SQ_AFF) { int cpu = p->sq_thread_cpu; @@ -7482,25 +7790,27 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, if (!cpu_online(cpu)) goto err; - ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread, - ctx, cpu, - "io_uring-sq"); + sqd->thread = kthread_create_on_cpu(io_sq_thread, sqd, + cpu, "io_uring-sq"); } else { - ctx->sqo_thread = kthread_create(io_sq_thread, ctx, + sqd->thread = kthread_create(io_sq_thread, sqd, "io_uring-sq"); } - if (IS_ERR(ctx->sqo_thread)) { - ret = PTR_ERR(ctx->sqo_thread); - ctx->sqo_thread = NULL; + if (IS_ERR(sqd->thread)) { + ret = PTR_ERR(sqd->thread); + sqd->thread = NULL; goto err; } - wake_up_process(ctx->sqo_thread); + ret = io_uring_alloc_task_context(sqd->thread); + if (ret) + goto err; } else if (p->flags & IORING_SETUP_SQ_AFF) { /* Can't have SQ_AFF without SQPOLL */ ret = -EINVAL; goto err; } +done: ret = io_init_wq_offload(ctx, p); if (ret) goto err; @@ -7511,6 +7821,14 @@ err: return ret; } +static void io_sq_offload_start(struct io_ring_ctx *ctx) +{ + struct io_sq_data *sqd = ctx->sq_data; + + if ((ctx->flags & IORING_SETUP_SQPOLL) && sqd->thread) + wake_up_process(sqd->thread); +} + static inline void __io_unaccount_mem(struct user_struct *user, unsigned long nr_pages) { @@ -7542,11 +7860,11 @@ static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages, if (ctx->limit_mem) __io_unaccount_mem(ctx->user, nr_pages); - if (ctx->sqo_mm) { + if (ctx->mm_account) { if (acct == ACCT_LOCKED) - ctx->sqo_mm->locked_vm -= nr_pages; + ctx->mm_account->locked_vm -= nr_pages; else if (acct == ACCT_PINNED) - atomic64_sub(nr_pages, &ctx->sqo_mm->pinned_vm); + atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm); } } @@ -7561,11 +7879,11 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages, return ret; } - if (ctx->sqo_mm) { + if (ctx->mm_account) { if (acct == ACCT_LOCKED) - ctx->sqo_mm->locked_vm += nr_pages; + ctx->mm_account->locked_vm += nr_pages; else if (acct == ACCT_PINNED) - atomic64_add(nr_pages, &ctx->sqo_mm->pinned_vm); + atomic64_add(nr_pages, &ctx->mm_account->pinned_vm); } return 0; @@ -7645,7 +7963,8 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) for (j = 0; j < imu->nr_bvecs; j++) unpin_user_page(imu->bvec[j].bv_page); - io_unaccount_mem(ctx, imu->nr_bvecs, ACCT_PINNED); + if (imu->acct_pages) + io_unaccount_mem(ctx, imu->acct_pages, ACCT_PINNED); kvfree(imu->bvec); imu->nr_bvecs = 0; } @@ -7681,11 +8000,80 @@ static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst, return 0; } +/* + * Not super efficient, but this is just a registration time. 
And we do cache + * the last compound head, so generally we'll only do a full search if we don't + * match that one. + * + * We check if the given compound head page has already been accounted, to + * avoid double accounting it. This allows us to account the full size of the + * page, not just the constituent pages of a huge page. + */ +static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages, + int nr_pages, struct page *hpage) +{ + int i, j; + + /* check current page array */ + for (i = 0; i < nr_pages; i++) { + if (!PageCompound(pages[i])) + continue; + if (compound_head(pages[i]) == hpage) + return true; + } + + /* check previously registered pages */ + for (i = 0; i < ctx->nr_user_bufs; i++) { + struct io_mapped_ubuf *imu = &ctx->user_bufs[i]; + + for (j = 0; j < imu->nr_bvecs; j++) { + if (!PageCompound(imu->bvec[j].bv_page)) + continue; + if (compound_head(imu->bvec[j].bv_page) == hpage) + return true; + } + } + + return false; +} + +static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, + int nr_pages, struct io_mapped_ubuf *imu, + struct page **last_hpage) +{ + int i, ret; + + for (i = 0; i < nr_pages; i++) { + if (!PageCompound(pages[i])) { + imu->acct_pages++; + } else { + struct page *hpage; + + hpage = compound_head(pages[i]); + if (hpage == *last_hpage) + continue; + *last_hpage = hpage; + if (headpage_already_acct(ctx, pages, i, hpage)) + continue; + imu->acct_pages += page_size(hpage) >> PAGE_SHIFT; + } + } + + if (!imu->acct_pages) + return 0; + + ret = io_account_mem(ctx, imu->acct_pages, ACCT_PINNED); + if (ret) + imu->acct_pages = 0; + return ret; +} + static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { struct vm_area_struct **vmas = NULL; struct page **pages = NULL; + struct page *last_hpage = NULL; int i, j, got_pages = 0; int ret = -EINVAL; @@ -7728,10 +8116,6 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, start = ubuf >> PAGE_SHIFT; nr_pages = end - start; - ret = io_account_mem(ctx, nr_pages, ACCT_PINNED); - if (ret) - goto err; - ret = 0; if (!pages || nr_pages > got_pages) { kvfree(vmas); @@ -7743,7 +8127,6 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, GFP_KERNEL); if (!pages || !vmas) { ret = -ENOMEM; - io_unaccount_mem(ctx, nr_pages, ACCT_PINNED); goto err; } got_pages = nr_pages; @@ -7752,10 +8135,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec), GFP_KERNEL); ret = -ENOMEM; - if (!imu->bvec) { - io_unaccount_mem(ctx, nr_pages, ACCT_PINNED); + if (!imu->bvec) goto err; - } ret = 0; mmap_read_lock(current->mm); @@ -7784,7 +8165,13 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, */ if (pret > 0) unpin_user_pages(pages, pret); - io_unaccount_mem(ctx, nr_pages, ACCT_PINNED); + kvfree(imu->bvec); + goto err; + } + + ret = io_buffer_account_pin(ctx, pages, pret, imu, &last_hpage); + if (ret) { + unpin_user_pages(pages, pret); kvfree(imu->bvec); goto err; } @@ -7869,11 +8256,19 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_finish_async(ctx); io_sqe_buffer_unregister(ctx); - if (ctx->sqo_mm) { - mmdrop(ctx->sqo_mm); - ctx->sqo_mm = NULL; + + if (ctx->sqo_task) { + put_task_struct(ctx->sqo_task); + ctx->sqo_task = NULL; + mmdrop(ctx->mm_account); + ctx->mm_account = NULL; } +#ifdef CONFIG_BLK_CGROUP + if (ctx->sqo_blkcg_css) + css_put(ctx->sqo_blkcg_css); +#endif + 
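io_buffer_account_pin() and headpage_already_acct() above change how registered buffers are charged against locked/pinned memory: a compound (huge) page is charged once at its full size, and a head page that an earlier registration already paid for is not charged again. A small userspace sketch of the registration path this accounting applies to, not part of this patch, assuming liburing; MAP_HUGETLB is optional here and simply falls back to normal pages if no huge pages are available:

#include <liburing.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	struct iovec iov;
	size_t len = 2 * 1024 * 1024;	/* one 2MB huge page, if available */
	void *buf;
	int ret;

	if (io_uring_queue_init(8, &ring, 0) < 0)
		return 1;

	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (buf == MAP_FAILED)	/* no hugetlb pages reserved: use small pages */
		buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;

	iov.iov_base = buf;
	iov.iov_len = len;

	/* pins the pages and charges them against RLIMIT_MEMLOCK */
	ret = io_uring_register_buffers(&ring, &iov, 1);
	printf("register_buffers: %d\n", ret);

	if (!ret)
		io_uring_unregister_buffers(&ring);
	io_uring_queue_exit(&ring);
	return 0;
}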
io_sqe_files_unregister(ctx); io_eventfd_unregister(ctx); io_destroy_buffers(ctx); @@ -7908,8 +8303,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) * io_commit_cqring */ smp_rmb(); - if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head != - ctx->rings->sq_ring_entries) + if (!io_sqring_full(ctx)) mask |= EPOLLOUT | EPOLLWRNORM; if (io_cqring_events(ctx, false)) mask |= EPOLLIN | EPOLLRDNORM; @@ -7927,11 +8321,14 @@ static int io_uring_fasync(int fd, struct file *file, int on) static int io_remove_personalities(int id, void *p, void *data) { struct io_ring_ctx *ctx = data; - const struct cred *cred; + struct io_identity *iod; - cred = idr_remove(&ctx->personality_idr, id); - if (cred) - put_cred(cred); + iod = idr_remove(&ctx->personality_idr, id); + if (iod) { + put_cred(iod->creds); + if (refcount_dec_and_test(&iod->count)) + kfree(iod); + } return 0; } @@ -7948,7 +8345,7 @@ static void io_ring_exit_work(struct work_struct *work) */ do { if (ctx->rings) - io_cqring_overflow_flush(ctx, true); + io_cqring_overflow_flush(ctx, true, NULL, NULL); io_iopoll_try_reap_events(ctx); } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)); io_ring_ctx_free(ctx); @@ -7960,15 +8357,15 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) percpu_ref_kill(&ctx->refs); mutex_unlock(&ctx->uring_lock); - io_kill_timeouts(ctx); - io_poll_remove_all(ctx); + io_kill_timeouts(ctx, NULL); + io_poll_remove_all(ctx, NULL); if (ctx->io_wq) io_wq_cancel_all(ctx->io_wq); /* if we failed setting up the ctx, we might not have any rings */ if (ctx->rings) - io_cqring_overflow_flush(ctx, true); + io_cqring_overflow_flush(ctx, true, NULL, NULL); io_iopoll_try_reap_events(ctx); idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx); @@ -8003,7 +8400,8 @@ static bool io_wq_files_match(struct io_wq_work *work, void *data) { struct files_struct *files = data; - return work->files == files; + return !files || ((work->flags & IO_WQ_WORK_FILES) && + work->identity->files == files); } /* @@ -8024,12 +8422,6 @@ static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req) return false; } -static inline bool io_match_files(struct io_kiocb *req, - struct files_struct *files) -{ - return (req->flags & REQ_F_WORK_INITIALIZED) && req->work.files == files; -} - static bool io_match_link_files(struct io_kiocb *req, struct files_struct *files) { @@ -8145,11 +8537,14 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx, } } -static void io_uring_cancel_files(struct io_ring_ctx *ctx, +/* + * Returns true if we found and killed one or more files pinning requests + */ +static bool io_uring_cancel_files(struct io_ring_ctx *ctx, struct files_struct *files) { if (list_empty_careful(&ctx->inflight_list)) - return; + return false; io_cancel_defer_files(ctx, files); /* cancel all at once, should be faster than doing it one by one*/ @@ -8161,7 +8556,8 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, spin_lock_irq(&ctx->inflight_lock); list_for_each_entry(req, &ctx->inflight_list, inflight_entry) { - if (req->work.files != files) + if (files && (req->work.flags & IO_WQ_WORK_FILES) && + req->work.identity->files != files) continue; /* req is being completed, ignore */ if (!refcount_inc_not_zero(&req->refs)) @@ -8180,9 +8576,13 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, /* cancel this request, or head link requests */ io_attempt_cancel(ctx, cancel_req); io_put_req(cancel_req); + /* cancellations _may_ trigger task work */ + io_run_task_work(); 
schedule(); finish_wait(&ctx->inflight_wait, &wait); } + + return true; } static bool io_cancel_task_cb(struct io_wq_work *work, void *data) @@ -8190,21 +8590,192 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data) struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct task_struct *task = data; - return req->task == task; + return io_task_match(req, task); +} + +static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, + struct task_struct *task, + struct files_struct *files) +{ + bool ret; + + ret = io_uring_cancel_files(ctx, files); + if (!files) { + enum io_wq_cancel cret; + + cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, task, true); + if (cret != IO_WQ_CANCEL_NOTFOUND) + ret = true; + + /* SQPOLL thread does its own polling */ + if (!(ctx->flags & IORING_SETUP_SQPOLL)) { + while (!list_empty_careful(&ctx->iopoll_list)) { + io_iopoll_try_reap_events(ctx); + ret = true; + } + } + + ret |= io_poll_remove_all(ctx, task); + ret |= io_kill_timeouts(ctx, task); + } + + return ret; +} + +/* + * We need to iteratively cancel requests, in case a request has dependent + * hard links. These persist even for failure of cancelations, hence keep + * looping until none are found. + */ +static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, + struct files_struct *files) +{ + struct task_struct *task = current; + + if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) + task = ctx->sq_data->thread; + + io_cqring_overflow_flush(ctx, true, task, files); + + while (__io_uring_cancel_task_requests(ctx, task, files)) { + io_run_task_work(); + cond_resched(); + } +} + +/* + * Note that this task has used io_uring. We use it for cancelation purposes. + */ +static int io_uring_add_task_file(struct file *file) +{ + struct io_uring_task *tctx = current->io_uring; + + if (unlikely(!tctx)) { + int ret; + + ret = io_uring_alloc_task_context(current); + if (unlikely(ret)) + return ret; + tctx = current->io_uring; + } + if (tctx->last != file) { + void *old = xa_load(&tctx->xa, (unsigned long)file); + + if (!old) { + get_file(file); + xa_store(&tctx->xa, (unsigned long)file, file, GFP_KERNEL); + } + tctx->last = file; + } + + return 0; +} + +/* + * Remove this io_uring_file -> task mapping. + */ +static void io_uring_del_task_file(struct file *file) +{ + struct io_uring_task *tctx = current->io_uring; + + if (tctx->last == file) + tctx->last = NULL; + file = xa_erase(&tctx->xa, (unsigned long)file); + if (file) + fput(file); +} + +static void __io_uring_attempt_task_drop(struct file *file) +{ + struct file *old = xa_load(&current->io_uring->xa, (unsigned long)file); + + if (old == file) + io_uring_del_task_file(file); +} + +/* + * Drop task note for this file if we're the only ones that hold it after + * pending fput() + */ +static void io_uring_attempt_task_drop(struct file *file, bool exiting) +{ + if (!current->io_uring) + return; + /* + * fput() is pending, will be 2 if the only other ref is our potential + * task file note. If the task is exiting, drop regardless of count.
+ */ + if (!exiting && atomic_long_read(&file->f_count) != 2) + return; + + __io_uring_attempt_task_drop(file); +} + +void __io_uring_files_cancel(struct files_struct *files) +{ + struct io_uring_task *tctx = current->io_uring; + struct file *file; + unsigned long index; + + /* make sure overflow events are dropped */ + tctx->in_idle = true; + + xa_for_each(&tctx->xa, index, file) { + struct io_ring_ctx *ctx = file->private_data; + + io_uring_cancel_task_requests(ctx, files); + if (files) + io_uring_del_task_file(file); + } +} + +/* + * Find any io_uring fd that this task has registered or done IO on, and cancel + * requests. + */ +void __io_uring_task_cancel(void) +{ + struct io_uring_task *tctx = current->io_uring; + DEFINE_WAIT(wait); + s64 inflight; + + /* make sure overflow events are dropped */ + tctx->in_idle = true; + + do { + /* read completions before cancelations */ + inflight = percpu_counter_sum(&tctx->inflight); + if (!inflight) + break; + __io_uring_files_cancel(NULL); + + prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE); + + /* + * If we've seen completions, retry. This avoids a race where + * a completion comes in before we did prepare_to_wait(). + */ + if (inflight != percpu_counter_sum(&tctx->inflight)) + continue; + schedule(); + } while (1); + + finish_wait(&tctx->wait, &wait); + tctx->in_idle = false; } static int io_uring_flush(struct file *file, void *data) { struct io_ring_ctx *ctx = file->private_data; - io_uring_cancel_files(ctx, data); - /* * If the task is going away, cancel work it may have pending */ if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) - io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, current, true); + data = NULL; + io_uring_cancel_task_requests(ctx, data); + io_uring_attempt_task_drop(file, !data); return 0; } @@ -8278,6 +8849,25 @@ static unsigned long io_uring_nommu_get_unmapped_area(struct file *file, #endif /* !CONFIG_MMU */ +static void io_sqpoll_wait_sq(struct io_ring_ctx *ctx) +{ + DEFINE_WAIT(wait); + + do { + if (!io_sqring_full(ctx)) + break; + + prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE); + + if (!io_sqring_full(ctx)) + break; + + schedule(); + } while (!signal_pending(current)); + + finish_wait(&ctx->sqo_sq_wait, &wait); +} + SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, u32, min_complete, u32, flags, const sigset_t __user *, sig, size_t, sigsz) @@ -8289,7 +8879,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, io_run_task_work(); - if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP)) + if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | + IORING_ENTER_SQ_WAIT)) return -EINVAL; f = fdget(fd); @@ -8305,6 +8896,10 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (!percpu_ref_tryget(&ctx->refs)) goto out_fput; + ret = -EBADFD; + if (ctx->flags & IORING_SETUP_R_DISABLED) + goto out; + /* * For SQ polling, the thread will do all submissions and completions. 
* Just return the requested submit count, and wake the thread if @@ -8313,13 +8908,18 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, ret = 0; if (ctx->flags & IORING_SETUP_SQPOLL) { if (!list_empty_careful(&ctx->cq_overflow_list)) - io_cqring_overflow_flush(ctx, false); + io_cqring_overflow_flush(ctx, false, NULL, NULL); if (flags & IORING_ENTER_SQ_WAKEUP) - wake_up(&ctx->sqo_wait); + wake_up(&ctx->sq_data->wait); + if (flags & IORING_ENTER_SQ_WAIT) + io_sqpoll_wait_sq(ctx); submitted = to_submit; } else if (to_submit) { + ret = io_uring_add_task_file(f.file); + if (unlikely(ret)) + goto out; mutex_lock(&ctx->uring_lock); - submitted = io_submit_sqes(ctx, to_submit, f.file, fd); + submitted = io_submit_sqes(ctx, to_submit); mutex_unlock(&ctx->uring_lock); if (submitted != to_submit) @@ -8385,11 +8985,25 @@ static int io_uring_show_cred(int id, void *p, void *data) static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) { + struct io_sq_data *sq = NULL; + bool has_lock; int i; - mutex_lock(&ctx->uring_lock); + /* + * Avoid ABBA deadlock between the seq lock and the io_uring mutex, + * since fdinfo case grabs it in the opposite direction of normal use + * cases. If we fail to get the lock, we just don't iterate any + * structures that could be going away outside the io_uring mutex. + */ + has_lock = mutex_trylock(&ctx->uring_lock); + + if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) + sq = ctx->sq_data; + + seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1); + seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1); seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files); - for (i = 0; i < ctx->nr_user_files; i++) { + for (i = 0; has_lock && i < ctx->nr_user_files; i++) { struct fixed_file_table *table; struct file *f; @@ -8401,13 +9015,13 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) seq_printf(m, "%5u: <none>\n", i); } seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs); - for (i = 0; i < ctx->nr_user_bufs; i++) { + for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) { struct io_mapped_ubuf *buf = &ctx->user_bufs[i]; seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, (unsigned int) buf->len); } - if (!idr_is_empty(&ctx->personality_idr)) { + if (has_lock && !idr_is_empty(&ctx->personality_idr)) { seq_printf(m, "Personalities:\n"); idr_for_each(&ctx->personality_idr, io_uring_show_cred, m); } @@ -8422,7 +9036,8 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) req->task->task_works != NULL); } spin_unlock_irq(&ctx->completion_lock); - mutex_unlock(&ctx->uring_lock); + if (has_lock) + mutex_unlock(&ctx->uring_lock); } static void io_uring_show_fdinfo(struct seq_file *m, struct file *f) @@ -8520,6 +9135,7 @@ static int io_uring_get_fd(struct io_ring_ctx *ctx) file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx, O_RDWR | O_CLOEXEC); if (IS_ERR(file)) { +err_fd: put_unused_fd(ret); ret = PTR_ERR(file); goto err; @@ -8528,6 +9144,10 @@ static int io_uring_get_fd(struct io_ring_ctx *ctx) #if defined(CONFIG_UNIX) ctx->ring_sock->file = file; #endif + if (unlikely(io_uring_add_task_file(file))) { + file = ERR_PTR(-ENOMEM); + goto err_fd; + } fd_install(ret, file); return ret; err: @@ -8604,9 +9224,39 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, ctx->compat = in_compat_syscall(); ctx->user = user; ctx->creds = get_current_cred(); +#ifdef CONFIG_AUDIT + ctx->loginuid = current->loginuid; + ctx->sessionid = 
current->sessionid; +#endif + ctx->sqo_task = get_task_struct(current); + /* + * This is just grabbed for accounting purposes. When a process exits, + * the mm is exited and dropped before the files, hence we need to hang + * on to this mm purely for the purposes of being able to unaccount + * memory (locked/pinned vm). It's not used for anything else. + */ mmgrab(current->mm); - ctx->sqo_mm = current->mm; + ctx->mm_account = current->mm; + +#ifdef CONFIG_BLK_CGROUP + /* + * The sq thread will belong to the original cgroup it was inited in. + * If the cgroup goes offline (e.g. disabling the io controller), then + * issued bios will be associated with the closest cgroup later in the + * block layer. + */ + rcu_read_lock(); + ctx->sqo_blkcg_css = blkcg_css(); + ret = css_tryget_online(ctx->sqo_blkcg_css); + rcu_read_unlock(); + if (!ret) { + /* don't init against a dying cgroup, have the user try again */ + ctx->sqo_blkcg_css = NULL; + ret = -ENODEV; + goto err; + } +#endif /* * Account memory _before_ installing the file descriptor. Once @@ -8622,10 +9272,13 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, if (ret) goto err; - ret = io_sq_offload_start(ctx, p); + ret = io_sq_offload_create(ctx, p); if (ret) goto err; + if (!(p->flags & IORING_SETUP_R_DISABLED)) + io_sq_offload_start(ctx); + memset(&p->sq_off, 0, sizeof(p->sq_off)); p->sq_off.head = offsetof(struct io_rings, sq.head); p->sq_off.tail = offsetof(struct io_rings, sq.tail); @@ -8688,7 +9341,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | - IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ)) + IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | + IORING_SETUP_R_DISABLED)) return -EINVAL; return io_uring_create(entries, &p, params); @@ -8741,29 +9395,124 @@ out: static int io_register_personality(struct io_ring_ctx *ctx) { - const struct cred *creds = get_current_cred(); - int id; + struct io_identity *id; + int ret; + + id = kmalloc(sizeof(*id), GFP_KERNEL); + if (unlikely(!id)) + return -ENOMEM; + + io_init_identity(id); + id->creds = get_current_cred(); - id = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1, - USHRT_MAX, GFP_KERNEL); - if (id < 0) - put_cred(creds); - return id; + ret = idr_alloc_cyclic(&ctx->personality_idr, id, 1, USHRT_MAX, GFP_KERNEL); + if (ret < 0) { + put_cred(id->creds); + kfree(id); + } + return ret; } static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) { - const struct cred *old_creds; + struct io_identity *iod; - old_creds = idr_remove(&ctx->personality_idr, id); - if (old_creds) { - put_cred(old_creds); + iod = idr_remove(&ctx->personality_idr, id); + if (iod) { + put_cred(iod->creds); + if (refcount_dec_and_test(&iod->count)) + kfree(iod); return 0; } return -EINVAL; } +static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg, + unsigned int nr_args) +{ + struct io_uring_restriction *res; + size_t size; + int i, ret; + + /* Restrictions allowed only if rings started disabled */ + if (!(ctx->flags & IORING_SETUP_R_DISABLED)) + return -EBADFD; + + /* We allow only a single restrictions registration */ + if (ctx->restrictions.registered) + return -EBUSY; + + if (!arg || nr_args > IORING_MAX_RESTRICTIONS) + return -EINVAL; + + size = array_size(nr_args, sizeof(*res)); + if (size == SIZE_MAX) + return -EOVERFLOW; + + res = memdup_user(arg, size); + if (IS_ERR(res)) + return PTR_ERR(res); + + ret = 
0; + + for (i = 0; i < nr_args; i++) { + switch (res[i].opcode) { + case IORING_RESTRICTION_REGISTER_OP: + if (res[i].register_op >= IORING_REGISTER_LAST) { + ret = -EINVAL; + goto out; + } + + __set_bit(res[i].register_op, + ctx->restrictions.register_op); + break; + case IORING_RESTRICTION_SQE_OP: + if (res[i].sqe_op >= IORING_OP_LAST) { + ret = -EINVAL; + goto out; + } + + __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op); + break; + case IORING_RESTRICTION_SQE_FLAGS_ALLOWED: + ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags; + break; + case IORING_RESTRICTION_SQE_FLAGS_REQUIRED: + ctx->restrictions.sqe_flags_required = res[i].sqe_flags; + break; + default: + ret = -EINVAL; + goto out; + } + } + +out: + /* Reset all restrictions if an error happened */ + if (ret != 0) + memset(&ctx->restrictions, 0, sizeof(ctx->restrictions)); + else + ctx->restrictions.registered = true; + + kfree(res); + return ret; +} + +static int io_register_enable_rings(struct io_ring_ctx *ctx) +{ + if (!(ctx->flags & IORING_SETUP_R_DISABLED)) + return -EBADFD; + + if (ctx->restrictions.registered) + ctx->restricted = 1; + + ctx->flags &= ~IORING_SETUP_R_DISABLED; + + io_sq_offload_start(ctx); + + return 0; +} + static bool io_register_op_must_quiesce(int op) { switch (op) { @@ -8805,11 +9554,31 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, * after we've killed the percpu ref. */ mutex_unlock(&ctx->uring_lock); - ret = wait_for_completion_interruptible(&ctx->ref_comp); + do { + ret = wait_for_completion_interruptible(&ctx->ref_comp); + if (!ret) + break; + ret = io_run_task_work_sig(); + if (ret < 0) + break; + } while (1); + mutex_lock(&ctx->uring_lock); + if (ret) { percpu_ref_resurrect(&ctx->refs); - ret = -EINTR; + goto out_quiesce; + } + } + + if (ctx->restricted) { + if (opcode >= IORING_REGISTER_LAST) { + ret = -EINVAL; + goto out; + } + + if (!test_bit(opcode, ctx->restrictions.register_op)) { + ret = -EACCES; goto out; } } @@ -8873,15 +9642,25 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_unregister_personality(ctx, nr_args); break; + case IORING_REGISTER_ENABLE_RINGS: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_register_enable_rings(ctx); + break; + case IORING_REGISTER_RESTRICTIONS: + ret = io_register_restrictions(ctx, arg, nr_args); + break; default: ret = -EINVAL; break; } +out: if (io_register_op_must_quiesce(opcode)) { /* bring the ctx back to life */ percpu_ref_reinit(&ctx->refs); -out: +out_quiesce: reinit_completion(&ctx->ref_comp); } return ret; diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index bcfc288dba3f..8180061b9e16 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -22,18 +22,25 @@ #include "../internal.h" /* - * Structure allocated for each page when block size < PAGE_SIZE to track - * sub-page uptodate status and I/O completions. + * Structure allocated for each page or THP when block size < page size + * to track sub-page uptodate status and I/O completions. */ struct iomap_page { - atomic_t read_count; - atomic_t write_count; + atomic_t read_bytes_pending; + atomic_t write_bytes_pending; spinlock_t uptodate_lock; - DECLARE_BITMAP(uptodate, PAGE_SIZE / 512); + unsigned long uptodate[]; }; static inline struct iomap_page *to_iomap_page(struct page *page) { + /* + * per-block data is stored in the head page. Callers should + * not be dealing with tail pages (and if they are, they can + * call thp_head() first. 
+ */ + VM_BUG_ON_PGFLAGS(PageTail(page), page); + if (page_has_private(page)) return (struct iomap_page *)page_private(page); return NULL; @@ -45,20 +52,16 @@ static struct iomap_page * iomap_page_create(struct inode *inode, struct page *page) { struct iomap_page *iop = to_iomap_page(page); + unsigned int nr_blocks = i_blocks_per_page(inode, page); - if (iop || i_blocksize(inode) == PAGE_SIZE) + if (iop || nr_blocks <= 1) return iop; - iop = kmalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL); - atomic_set(&iop->read_count, 0); - atomic_set(&iop->write_count, 0); + iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)), + GFP_NOFS | __GFP_NOFAIL); spin_lock_init(&iop->uptodate_lock); - bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE); - - /* - * migrate_page_move_mapping() assumes that pages with private data have - * their count elevated by 1. - */ + if (PageUptodate(page)) + bitmap_fill(iop->uptodate, nr_blocks); attach_page_private(page, iop); return iop; } @@ -67,11 +70,14 @@ static void iomap_page_release(struct page *page) { struct iomap_page *iop = detach_page_private(page); + unsigned int nr_blocks = i_blocks_per_page(page->mapping->host, page); if (!iop) return; - WARN_ON_ONCE(atomic_read(&iop->read_count)); - WARN_ON_ONCE(atomic_read(&iop->write_count)); + WARN_ON_ONCE(atomic_read(&iop->read_bytes_pending)); + WARN_ON_ONCE(atomic_read(&iop->write_bytes_pending)); + WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) != + PageUptodate(page)); kfree(iop); } @@ -142,19 +148,11 @@ iomap_iop_set_range_uptodate(struct page *page, unsigned off, unsigned len) struct inode *inode = page->mapping->host; unsigned first = off >> inode->i_blkbits; unsigned last = (off + len - 1) >> inode->i_blkbits; - bool uptodate = true; unsigned long flags; - unsigned int i; spin_lock_irqsave(&iop->uptodate_lock, flags); - for (i = 0; i < PAGE_SIZE / i_blocksize(inode); i++) { - if (i >= first && i <= last) - set_bit(i, iop->uptodate); - else if (!test_bit(i, iop->uptodate)) - uptodate = false; - } - - if (uptodate) + bitmap_set(iop->uptodate, first, last - first + 1); + if (bitmap_full(iop->uptodate, i_blocks_per_page(inode, page))) SetPageUptodate(page); spin_unlock_irqrestore(&iop->uptodate_lock, flags); } @@ -172,13 +170,6 @@ iomap_set_range_uptodate(struct page *page, unsigned off, unsigned len) } static void -iomap_read_finish(struct iomap_page *iop, struct page *page) -{ - if (!iop || atomic_dec_and_test(&iop->read_count)) - unlock_page(page); -} - -static void iomap_read_page_end_io(struct bio_vec *bvec, int error) { struct page *page = bvec->bv_page; @@ -191,7 +182,8 @@ iomap_read_page_end_io(struct bio_vec *bvec, int error) iomap_set_range_uptodate(page, bvec->bv_offset, bvec->bv_len); } - iomap_read_finish(iop, page); + if (!iop || atomic_sub_and_test(bvec->bv_len, &iop->read_bytes_pending)) + unlock_page(page); } static void @@ -271,30 +263,19 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, } ctx->cur_page_in_bio = true; + if (iop) + atomic_add(plen, &iop->read_bytes_pending); - /* - * Try to merge into a previous segment if we can. 
- */ + /* Try to merge into a previous segment if we can */ sector = iomap_sector(iomap, pos); - if (ctx->bio && bio_end_sector(ctx->bio) == sector) + if (ctx->bio && bio_end_sector(ctx->bio) == sector) { + if (__bio_try_merge_page(ctx->bio, page, plen, poff, + &same_page)) + goto done; is_contig = true; - - if (is_contig && - __bio_try_merge_page(ctx->bio, page, plen, poff, &same_page)) { - if (!same_page && iop) - atomic_inc(&iop->read_count); - goto done; } - /* - * If we start a new segment we need to increase the read count, and we - * need to do so before submitting any previous full bio to make sure - * that we don't prematurely unlock the page. - */ - if (iop) - atomic_inc(&iop->read_count); - - if (!ctx->bio || !is_contig || bio_full(ctx->bio, plen)) { + if (!is_contig || bio_full(ctx->bio, plen)) { gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); gfp_t orig_gfp = gfp; int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -571,13 +552,13 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags, { struct iomap_page *iop = iomap_page_create(inode, page); loff_t block_size = i_blocksize(inode); - loff_t block_start = pos & ~(block_size - 1); - loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1); + loff_t block_start = round_down(pos, block_size); + loff_t block_end = round_up(pos + len, block_size); unsigned from = offset_in_page(pos), to = from + len, poff, plen; - int status; if (PageUptodate(page)) return 0; + ClearPageError(page); do { iomap_adjust_read_range(inode, iop, &block_start, @@ -594,14 +575,13 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags, if (WARN_ON_ONCE(flags & IOMAP_WRITE_F_UNSHARE)) return -EIO; zero_user_segments(page, poff, from, to, poff + plen); - iomap_set_range_uptodate(page, poff, plen); - continue; + } else { + int status = iomap_read_page_sync(block_start, page, + poff, plen, srcmap); + if (status) + return status; } - - status = iomap_read_page_sync(block_start, page, poff, plen, - srcmap); - if (status) - return status; + iomap_set_range_uptodate(page, poff, plen); } while ((block_start += plen) < block_end); return 0; @@ -685,9 +665,8 @@ iomap_set_page_dirty(struct page *page) } EXPORT_SYMBOL_GPL(iomap_set_page_dirty); -static int -__iomap_write_end(struct inode *inode, loff_t pos, unsigned len, - unsigned copied, struct page *page) +static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, + size_t copied, struct page *page) { flush_dcache_page(page); @@ -709,15 +688,15 @@ __iomap_write_end(struct inode *inode, loff_t pos, unsigned len, return copied; } -static int -iomap_write_end_inline(struct inode *inode, struct page *page, - struct iomap *iomap, loff_t pos, unsigned copied) +static size_t iomap_write_end_inline(struct inode *inode, struct page *page, + struct iomap *iomap, loff_t pos, size_t copied) { void *addr; WARN_ON_ONCE(!PageUptodate(page)); BUG_ON(pos + copied > PAGE_SIZE - offset_in_page(iomap->inline_data)); + flush_dcache_page(page); addr = kmap_atomic(page); memcpy(iomap->inline_data + pos, addr + pos, copied); kunmap_atomic(addr); @@ -726,13 +705,14 @@ iomap_write_end_inline(struct inode *inode, struct page *page, return copied; } -static int -iomap_write_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, - struct page *page, struct iomap *iomap, struct iomap *srcmap) +/* Returns the number of bytes copied. May be 0. Cannot be an errno. 
*/ +static size_t iomap_write_end(struct inode *inode, loff_t pos, size_t len, + size_t copied, struct page *page, struct iomap *iomap, + struct iomap *srcmap) { const struct iomap_page_ops *page_ops = iomap->page_ops; loff_t old_size = inode->i_size; - int ret; + size_t ret; if (srcmap->type == IOMAP_INLINE) { ret = iomap_write_end_inline(inode, page, iomap, pos, copied); @@ -811,13 +791,8 @@ again: copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); - flush_dcache_page(page); - - status = iomap_write_end(inode, pos, bytes, copied, page, iomap, + copied = iomap_write_end(inode, pos, bytes, copied, page, iomap, srcmap); - if (unlikely(status < 0)) - break; - copied = status; cond_resched(); @@ -891,11 +866,8 @@ iomap_unshare_actor(struct inode *inode, loff_t pos, loff_t length, void *data, status = iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap); - if (unlikely(status <= 0)) { - if (WARN_ON_ONCE(status == 0)) - return -EIO; - return status; - } + if (WARN_ON_ONCE(status == 0)) + return -EIO; cond_resched(); @@ -928,11 +900,13 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, } EXPORT_SYMBOL_GPL(iomap_file_unshare); -static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, - unsigned bytes, struct iomap *iomap, struct iomap *srcmap) +static s64 iomap_zero(struct inode *inode, loff_t pos, u64 length, + struct iomap *iomap, struct iomap *srcmap) { struct page *page; int status; + unsigned offset = offset_in_page(pos); + unsigned bytes = min_t(u64, PAGE_SIZE - offset, length); status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap, srcmap); if (status) @@ -944,38 +918,33 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, return iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap); } -static loff_t -iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, - void *data, struct iomap *iomap, struct iomap *srcmap) +static loff_t iomap_zero_range_actor(struct inode *inode, loff_t pos, + loff_t length, void *data, struct iomap *iomap, + struct iomap *srcmap) { bool *did_zero = data; loff_t written = 0; - int status; /* already zeroed? we're done. 
*/ if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) - return count; + return length; do { - unsigned offset, bytes; - - offset = offset_in_page(pos); - bytes = min_t(loff_t, PAGE_SIZE - offset, count); + s64 bytes; if (IS_DAX(inode)) - status = dax_iomap_zero(pos, offset, bytes, iomap); + bytes = dax_iomap_zero(pos, length, iomap); else - status = iomap_zero(inode, pos, offset, bytes, iomap, - srcmap); - if (status < 0) - return status; + bytes = iomap_zero(inode, pos, length, iomap, srcmap); + if (bytes < 0) + return bytes; pos += bytes; - count -= bytes; + length -= bytes; written += bytes; if (did_zero) *did_zero = true; - } while (count > 0); + } while (length > 0); return written; } @@ -1070,7 +1039,7 @@ EXPORT_SYMBOL_GPL(iomap_page_mkwrite); static void iomap_finish_page_writeback(struct inode *inode, struct page *page, - int error) + int error, unsigned int len) { struct iomap_page *iop = to_iomap_page(page); @@ -1079,10 +1048,10 @@ iomap_finish_page_writeback(struct inode *inode, struct page *page, mapping_set_error(inode->i_mapping, -EIO); } - WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE && !iop); - WARN_ON_ONCE(iop && atomic_read(&iop->write_count) <= 0); + WARN_ON_ONCE(i_blocks_per_page(inode, page) > 1 && !iop); + WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) <= 0); - if (!iop || atomic_dec_and_test(&iop->write_count)) + if (!iop || atomic_sub_and_test(len, &iop->write_bytes_pending)) end_page_writeback(page); } @@ -1116,7 +1085,8 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error) /* walk each page on bio, ending page IO on them */ bio_for_each_segment_all(bv, bio, iter_all) - iomap_finish_page_writeback(inode, bv->bv_page, error); + iomap_finish_page_writeback(inode, bv->bv_page, error, + bv->bv_len); bio_put(bio); } /* The ioend has been freed by bio_put() */ @@ -1332,8 +1302,8 @@ iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page, merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, poff, &same_page); - if (iop && !same_page) - atomic_inc(&iop->write_count); + if (iop) + atomic_add(len, &iop->write_bytes_pending); if (!merged) { if (bio_full(wpc->ioend->io_bio, len)) { @@ -1375,8 +1345,8 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, int error = 0, count = 0, i; LIST_HEAD(submit_list); - WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE && !iop); - WARN_ON_ONCE(iop && atomic_read(&iop->write_count) != 0); + WARN_ON_ONCE(i_blocks_per_page(inode, page) > 1 && !iop); + WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) != 0); /* * Walk through the page to find areas to write back. If we run off the diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index c1aafb2ab990..933f234d5bec 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -76,7 +76,7 @@ static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap, dio->submit.cookie = submit_bio(bio); } -static ssize_t iomap_dio_complete(struct iomap_dio *dio) +ssize_t iomap_dio_complete(struct iomap_dio *dio) { const struct iomap_dio_ops *dops = dio->dops; struct kiocb *iocb = dio->iocb; @@ -108,7 +108,7 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) * ->end_io() when necessary, otherwise a racing buffer read would cache * zeros from unwritten extents. 
*/ - if (!dio->error && + if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) { int err; err = invalidate_inode_pages2_range(inode->i_mapping, @@ -118,6 +118,7 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) dio_warn_stale_pagecache(iocb->ki_filp); } + inode_dio_end(file_inode(iocb->ki_filp)); /* * If this is a DSYNC write, make sure we push it to stable storage now * that we've written data. @@ -125,11 +126,11 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC)) ret = generic_write_sync(iocb, ret); - inode_dio_end(file_inode(iocb->ki_filp)); kfree(dio); return ret; } +EXPORT_SYMBOL_GPL(iomap_dio_complete); static void iomap_dio_complete_work(struct work_struct *work) { @@ -388,6 +389,16 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, return iomap_dio_bio_actor(inode, pos, length, dio, iomap); case IOMAP_INLINE: return iomap_dio_inline_actor(inode, pos, length, dio, iomap); + case IOMAP_DELALLOC: + /* + * DIO is not serialised against mmap() access at all, and so + * if the page_mkwrite occurs between the writeback and the + * iomap_apply() call in the DIO path, then it will see the + * DELALLOC block that the page-mkwrite allocated. + */ + pr_warn_ratelimited("Direct I/O collision with buffered writes! File: %pD4 Comm: %.20s\n", + dio->iocb->ki_filp, current->comm); + return -EIO; default: WARN_ON_ONCE(1); return -EIO; @@ -406,8 +417,8 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, * Returns -ENOTBLK In case of a page invalidation invalidation failure for * writes. The callers needs to fall back to buffered I/O in this case. */ -ssize_t -iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, +struct iomap_dio * +__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, bool wait_for_completion) { @@ -421,14 +432,14 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_dio *dio; if (!count) - return 0; + return NULL; if (WARN_ON(is_sync_kiocb(iocb) && !wait_for_completion)) - return -EIO; + return ERR_PTR(-EIO); dio = kmalloc(sizeof(*dio), GFP_KERNEL); if (!dio) - return -ENOMEM; + return ERR_PTR(-ENOMEM); dio->iocb = iocb; atomic_set(&dio->ref, 1); @@ -558,7 +569,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->wait_for_completion = wait_for_completion; if (!atomic_dec_and_test(&dio->ref)) { if (!wait_for_completion) - return -EIOCBQUEUED; + return ERR_PTR(-EIOCBQUEUED); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); @@ -574,10 +585,26 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, __set_current_state(TASK_RUNNING); } - return iomap_dio_complete(dio); + return dio; out_free_dio: kfree(dio); - return ret; + if (ret) + return ERR_PTR(ret); + return NULL; +} +EXPORT_SYMBOL_GPL(__iomap_dio_rw); + +ssize_t +iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, + const struct iomap_ops *ops, const struct iomap_dio_ops *dops, + bool wait_for_completion) +{ + struct iomap_dio *dio; + + dio = __iomap_dio_rw(iocb, iter, ops, dops, wait_for_completion); + if (IS_ERR_OR_NULL(dio)) + return PTR_ERR_OR_ZERO(dio); + return iomap_dio_complete(dio); } EXPORT_SYMBOL_GPL(iomap_dio_rw); diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index a2f5338a5ea1..176580f54af9 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -473,7 +473,7 @@ static int metapage_readpage(struct file *fp, struct page *page) struct inode *inode = 
page->mapping->host; struct bio *bio = NULL; int block_offset; - int blocks_per_page = PAGE_SIZE >> inode->i_blkbits; + int blocks_per_page = i_blocks_per_page(inode, page); sector_t page_start; /* address of page in fs blocks */ sector_t pblock; int xlen; diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c new file mode 100644 index 000000000000..90d255fbdd9b --- /dev/null +++ b/fs/kernel_read_file.c @@ -0,0 +1,189 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/fs.h> +#include <linux/fs_struct.h> +#include <linux/kernel_read_file.h> +#include <linux/security.h> +#include <linux/vmalloc.h> + +/** + * kernel_read_file() - read file contents into a kernel buffer + * + * @file file to read from + * @offset where to start reading from (see below). + * @buf pointer to a "void *" buffer for reading into (if + * *@buf is NULL, a buffer will be allocated, and + * @buf_size will be ignored) + * @buf_size size of buf, if already allocated. If @buf not + * allocated, this is the largest size to allocate. + * @file_size if non-NULL, the full size of @file will be + * written here. + * @id the kernel_read_file_id identifying the type of + * file contents being read (for LSMs to examine) + * + * @offset must be 0 unless both @buf and @file_size are non-NULL + * (i.e. the caller must be expecting to read partial file contents + * via an already-allocated @buf, in at most @buf_size chunks, and + * will be able to determine when the entire file was read by + * checking @file_size). This isn't a recommended way to read a + * file, though, since it is possible that the contents might + * change between calls to kernel_read_file(). + * + * Returns number of bytes read (no single read will be bigger + * than INT_MAX), or negative on error. + * + */ +int kernel_read_file(struct file *file, loff_t offset, void **buf, + size_t buf_size, size_t *file_size, + enum kernel_read_file_id id) +{ + loff_t i_size, pos; + size_t copied; + void *allocated = NULL; + bool whole_file; + int ret; + + if (offset != 0 && (!*buf || !file_size)) + return -EINVAL; + + if (!S_ISREG(file_inode(file)->i_mode)) + return -EINVAL; + + ret = deny_write_access(file); + if (ret) + return ret; + + i_size = i_size_read(file_inode(file)); + if (i_size <= 0) { + ret = -EINVAL; + goto out; + } + /* The file is too big for sane activities. */ + if (i_size > INT_MAX) { + ret = -EFBIG; + goto out; + } + /* The entire file cannot be read in one buffer. */ + if (!file_size && offset == 0 && i_size > buf_size) { + ret = -EFBIG; + goto out; + } + + whole_file = (offset == 0 && i_size <= buf_size); + ret = security_kernel_read_file(file, id, whole_file); + if (ret) + goto out; + + if (file_size) + *file_size = i_size; + + if (!*buf) + *buf = allocated = vmalloc(i_size); + if (!*buf) { + ret = -ENOMEM; + goto out; + } + + pos = offset; + copied = 0; + while (copied < buf_size) { + ssize_t bytes; + size_t wanted = min_t(size_t, buf_size - copied, + i_size - pos); + + bytes = kernel_read(file, *buf + copied, wanted, &pos); + if (bytes < 0) { + ret = bytes; + goto out_free; + } + + if (bytes == 0) + break; + copied += bytes; + } + + if (whole_file) { + if (pos != i_size) { + ret = -EIO; + goto out_free; + } + + ret = security_kernel_post_read_file(file, *buf, i_size, id); + } + +out_free: + if (ret < 0) { + if (allocated) { + vfree(*buf); + *buf = NULL; + } + } + +out: + allow_write_access(file); + return ret == 0 ? 
copied : ret; +} +EXPORT_SYMBOL_GPL(kernel_read_file); + +int kernel_read_file_from_path(const char *path, loff_t offset, void **buf, + size_t buf_size, size_t *file_size, + enum kernel_read_file_id id) +{ + struct file *file; + int ret; + + if (!path || !*path) + return -EINVAL; + + file = filp_open(path, O_RDONLY, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + + ret = kernel_read_file(file, offset, buf, buf_size, file_size, id); + fput(file); + return ret; +} +EXPORT_SYMBOL_GPL(kernel_read_file_from_path); + +int kernel_read_file_from_path_initns(const char *path, loff_t offset, + void **buf, size_t buf_size, + size_t *file_size, + enum kernel_read_file_id id) +{ + struct file *file; + struct path root; + int ret; + + if (!path || !*path) + return -EINVAL; + + task_lock(&init_task); + get_fs_root(init_task.fs, &root); + task_unlock(&init_task); + + file = file_open_root(root.dentry, root.mnt, path, O_RDONLY, 0); + path_put(&root); + if (IS_ERR(file)) + return PTR_ERR(file); + + ret = kernel_read_file(file, offset, buf, buf_size, file_size, id); + fput(file); + return ret; +} +EXPORT_SYMBOL_GPL(kernel_read_file_from_path_initns); + +int kernel_read_file_from_fd(int fd, loff_t offset, void **buf, + size_t buf_size, size_t *file_size, + enum kernel_read_file_id id) +{ + struct fd f = fdget(fd); + int ret = -EBADF; + + if (!f.file) + goto out; + + ret = kernel_read_file(f.file, offset, buf, buf_size, file_size, id); +out: + fdput(f); + return ret; +} +EXPORT_SYMBOL_GPL(kernel_read_file_from_fd); diff --git a/fs/libfs.c b/fs/libfs.c index e0d42e977d9a..fc34361c1489 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -20,6 +20,8 @@ #include <linux/fs_context.h> #include <linux/pseudo_fs.h> #include <linux/fsnotify.h> +#include <linux/unicode.h> +#include <linux/fscrypt.h> #include <linux/uaccess.h> @@ -1363,3 +1365,88 @@ bool is_empty_dir_inode(struct inode *inode) return (inode->i_fop == &empty_dir_operations) && (inode->i_op == &empty_dir_inode_operations); } + +#ifdef CONFIG_UNICODE +/* + * Determine if the name of a dentry should be casefolded. + * + * Return: if names will need casefolding + */ +static bool needs_casefold(const struct inode *dir) +{ + return IS_CASEFOLDED(dir) && dir->i_sb->s_encoding; +} + +/** + * generic_ci_d_compare - generic d_compare implementation for casefolding filesystems + * @dentry: dentry whose name we are checking against + * @len: len of name of dentry + * @str: str pointer to name of dentry + * @name: Name to compare against + * + * Return: 0 if names match, 1 if mismatch, or -ERRNO + */ +int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, + const char *str, const struct qstr *name) +{ + const struct dentry *parent = READ_ONCE(dentry->d_parent); + const struct inode *dir = READ_ONCE(parent->d_inode); + const struct super_block *sb = dentry->d_sb; + const struct unicode_map *um = sb->s_encoding; + struct qstr qstr = QSTR_INIT(str, len); + char strbuf[DNAME_INLINE_LEN]; + int ret; + + if (!dir || !needs_casefold(dir)) + goto fallback; + /* + * If the dentry name is stored in-line, then it may be concurrently + * modified by a rename. If this happens, the VFS will eventually retry + * the lookup, so it doesn't matter what ->d_compare() returns. + * However, it's unsafe to call utf8_strncasecmp() with an unstable + * string. Therefore, we have to copy the name into a temporary buffer. 
+ */ + if (len <= DNAME_INLINE_LEN - 1) { + memcpy(strbuf, str, len); + strbuf[len] = 0; + qstr.name = strbuf; + /* prevent compiler from optimizing out the temporary buffer */ + barrier(); + } + ret = utf8_strncasecmp(um, name, &qstr); + if (ret >= 0) + return ret; + + if (sb_has_strict_encoding(sb)) + return -EINVAL; +fallback: + if (len != name->len) + return 1; + return !!memcmp(str, name->name, len); +} +EXPORT_SYMBOL(generic_ci_d_compare); + +/** + * generic_ci_d_hash - generic d_hash implementation for casefolding filesystems + * @dentry: dentry of the parent directory + * @str: qstr of name whose hash we should fill in + * + * Return: 0 if hash was successful or unchanged, and -EINVAL on error + */ +int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) +{ + const struct inode *dir = READ_ONCE(dentry->d_inode); + struct super_block *sb = dentry->d_sb; + const struct unicode_map *um = sb->s_encoding; + int ret = 0; + + if (!dir || !needs_casefold(dir)) + return 0; + + ret = utf8_casefold_hash(um, dentry, str); + if (ret < 0 && sb_has_strict_encoding(sb)) + return -EINVAL; + return 0; +} +EXPORT_SYMBOL(generic_ci_d_hash); +#endif diff --git a/fs/namei.c b/fs/namei.c index e99e2a9da0f7..f1eb8ccd2be9 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -568,8 +568,8 @@ static bool path_connected(struct vfsmount *mnt, struct dentry *dentry) { struct super_block *sb = mnt->mnt_sb; - /* Bind mounts and multi-root filesystems can have disconnected paths */ - if (!(sb->s_iflags & SB_I_MULTIROOT) && (mnt->mnt_root == sb->s_root)) + /* Bind mounts can have disconnected paths */ + if (mnt->mnt_root == sb->s_root) return true; return is_subdir(dentry, mnt->mnt_root); diff --git a/fs/namespace.c b/fs/namespace.c index bae0e95b3713..294e05a13d17 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3072,10 +3072,10 @@ static void shrink_submounts(struct mount *mnt) } } -void *copy_mount_options(const void __user * data) +static void *copy_mount_options(const void __user * data) { char *copy; - unsigned size; + unsigned left, offset; if (!data) return NULL; @@ -3084,20 +3084,31 @@ void *copy_mount_options(const void __user * data) if (!copy) return ERR_PTR(-ENOMEM); - size = PAGE_SIZE - offset_in_page(data); + left = copy_from_user(copy, data, PAGE_SIZE); - if (copy_from_user(copy, data, size)) { + /* + * Not all architectures have an exact copy_from_user(). Resort to + * byte at a time. + */ + offset = PAGE_SIZE - left; + while (left) { + char c; + if (get_user(c, (const char __user *)data + offset)) + break; + copy[offset] = c; + left--; + offset++; + } + + if (left == PAGE_SIZE) { kfree(copy); return ERR_PTR(-EFAULT); } - if (size != PAGE_SIZE) { - if (copy_from_user(copy + size, data + size, PAGE_SIZE - size)) - memset(copy + size, 0, PAGE_SIZE - size); - } + return copy; } -char *copy_mount_string(const void __user *data) +static char *copy_mount_string(const void __user *data) { return data ? 
strndup_user(data, PATH_MAX) : NULL; } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index e732580fe47b..cb52db9a0cfb 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -579,6 +579,9 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); do { + if (entry->label) + entry->label->len = NFS4_MAXLABELLEN; + status = xdr_decode(desc, entry, &stream); if (status != 0) { if (status == -EAGAIN) diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 009987e69020..29ec8b09a52d 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -1040,6 +1040,65 @@ out_invalid_fh: } #if IS_ENABLED(CONFIG_NFS_V4) +struct compat_nfs_string { + compat_uint_t len; + compat_uptr_t data; +}; + +static inline void compat_nfs_string(struct nfs_string *dst, + struct compat_nfs_string *src) +{ + dst->data = compat_ptr(src->data); + dst->len = src->len; +} + +struct compat_nfs4_mount_data_v1 { + compat_int_t version; + compat_int_t flags; + compat_int_t rsize; + compat_int_t wsize; + compat_int_t timeo; + compat_int_t retrans; + compat_int_t acregmin; + compat_int_t acregmax; + compat_int_t acdirmin; + compat_int_t acdirmax; + struct compat_nfs_string client_addr; + struct compat_nfs_string mnt_path; + struct compat_nfs_string hostname; + compat_uint_t host_addrlen; + compat_uptr_t host_addr; + compat_int_t proto; + compat_int_t auth_flavourlen; + compat_uptr_t auth_flavours; +}; + +static void nfs4_compat_mount_data_conv(struct nfs4_mount_data *data) +{ + struct compat_nfs4_mount_data_v1 *compat = + (struct compat_nfs4_mount_data_v1 *)data; + + /* copy the fields backwards */ + data->auth_flavours = compat_ptr(compat->auth_flavours); + data->auth_flavourlen = compat->auth_flavourlen; + data->proto = compat->proto; + data->host_addr = compat_ptr(compat->host_addr); + data->host_addrlen = compat->host_addrlen; + compat_nfs_string(&data->hostname, &compat->hostname); + compat_nfs_string(&data->mnt_path, &compat->mnt_path); + compat_nfs_string(&data->client_addr, &compat->client_addr); + data->acdirmax = compat->acdirmax; + data->acdirmin = compat->acdirmin; + data->acregmax = compat->acregmax; + data->acregmin = compat->acregmin; + data->retrans = compat->retrans; + data->timeo = compat->timeo; + data->wsize = compat->wsize; + data->rsize = compat->rsize; + data->flags = compat->flags; + data->version = compat->version; +} + /* * Validate NFSv4 mount options */ @@ -1050,89 +1109,83 @@ static int nfs4_parse_monolithic(struct fs_context *fc, struct sockaddr *sap = (struct sockaddr *)&ctx->nfs_server.address; char *c; - if (data == NULL) - goto out_no_data; + if (!data) { + if (is_remount_fc(fc)) + goto done; + return nfs_invalf(fc, + "NFS4: mount program didn't pass any mount data"); + } ctx->version = 4; - switch (data->version) { - case 1: - if (data->host_addrlen > sizeof(ctx->nfs_server.address)) - goto out_no_address; - if (data->host_addrlen == 0) - goto out_no_address; - ctx->nfs_server.addrlen = data->host_addrlen; - if (copy_from_user(sap, data->host_addr, data->host_addrlen)) - return -EFAULT; - if (!nfs_verify_server_address(sap)) - goto out_no_address; - ctx->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port); - - if (data->auth_flavourlen) { - rpc_authflavor_t pseudoflavor; - if (data->auth_flavourlen > 1) - goto out_inval_auth; - if (copy_from_user(&pseudoflavor, - data->auth_flavours, - sizeof(pseudoflavor))) - return -EFAULT; - ctx->selected_flavor = pseudoflavor; - } else - ctx->selected_flavor = 
RPC_AUTH_UNIX; - - c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN); - if (IS_ERR(c)) - return PTR_ERR(c); - ctx->nfs_server.hostname = c; + if (data->version != 1) + return generic_parse_monolithic(fc, data); - c = strndup_user(data->mnt_path.data, NFS4_MAXPATHLEN); - if (IS_ERR(c)) - return PTR_ERR(c); - ctx->nfs_server.export_path = c; - dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", c); + if (in_compat_syscall()) + nfs4_compat_mount_data_conv(data); - c = strndup_user(data->client_addr.data, 16); - if (IS_ERR(c)) - return PTR_ERR(c); - ctx->client_address = c; - - /* - * Translate to nfs_fs_context, which nfs_fill_super - * can deal with. - */ + if (data->host_addrlen > sizeof(ctx->nfs_server.address)) + goto out_no_address; + if (data->host_addrlen == 0) + goto out_no_address; + ctx->nfs_server.addrlen = data->host_addrlen; + if (copy_from_user(sap, data->host_addr, data->host_addrlen)) + return -EFAULT; + if (!nfs_verify_server_address(sap)) + goto out_no_address; + ctx->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port); - ctx->flags = data->flags & NFS4_MOUNT_FLAGMASK; - ctx->rsize = data->rsize; - ctx->wsize = data->wsize; - ctx->timeo = data->timeo; - ctx->retrans = data->retrans; - ctx->acregmin = data->acregmin; - ctx->acregmax = data->acregmax; - ctx->acdirmin = data->acdirmin; - ctx->acdirmax = data->acdirmax; - ctx->nfs_server.protocol = data->proto; - nfs_validate_transport_protocol(ctx); - if (ctx->nfs_server.protocol == XPRT_TRANSPORT_UDP) - goto out_invalid_transport_udp; + if (data->auth_flavourlen) { + rpc_authflavor_t pseudoflavor; - break; - default: - goto generic; + if (data->auth_flavourlen > 1) + goto out_inval_auth; + if (copy_from_user(&pseudoflavor, data->auth_flavours, + sizeof(pseudoflavor))) + return -EFAULT; + ctx->selected_flavor = pseudoflavor; + } else { + ctx->selected_flavor = RPC_AUTH_UNIX; } + c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN); + if (IS_ERR(c)) + return PTR_ERR(c); + ctx->nfs_server.hostname = c; + + c = strndup_user(data->mnt_path.data, NFS4_MAXPATHLEN); + if (IS_ERR(c)) + return PTR_ERR(c); + ctx->nfs_server.export_path = c; + dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", c); + + c = strndup_user(data->client_addr.data, 16); + if (IS_ERR(c)) + return PTR_ERR(c); + ctx->client_address = c; + + /* + * Translate to nfs_fs_context, which nfs_fill_super + * can deal with. 
+ */ + + ctx->flags = data->flags & NFS4_MOUNT_FLAGMASK; + ctx->rsize = data->rsize; + ctx->wsize = data->wsize; + ctx->timeo = data->timeo; + ctx->retrans = data->retrans; + ctx->acregmin = data->acregmin; + ctx->acregmax = data->acregmax; + ctx->acdirmin = data->acdirmin; + ctx->acdirmax = data->acdirmax; + ctx->nfs_server.protocol = data->proto; + nfs_validate_transport_protocol(ctx); + if (ctx->nfs_server.protocol == XPRT_TRANSPORT_UDP) + goto out_invalid_transport_udp; +done: ctx->skip_reconfig_option_check = true; return 0; -generic: - return generic_parse_monolithic(fc, data); - -out_no_data: - if (is_remount_fc(fc)) { - ctx->skip_reconfig_option_check = true; - return 0; - } - return nfs_invalf(fc, "NFS4: mount program didn't pass any mount data"); - out_inval_auth: return nfs_invalf(fc, "NFS4: Invalid number of RPC auth flavours %d", data->auth_flavourlen); diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 142225f0af59..2b2211d1234e 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -356,7 +356,15 @@ static ssize_t _nfs42_proc_copy(struct file *src, truncate_pagecache_range(dst_inode, pos_dst, pos_dst + res->write_res.count); - + spin_lock(&dst_inode->i_lock); + NFS_I(dst_inode)->cache_validity |= (NFS_INO_REVAL_PAGECACHE | + NFS_INO_REVAL_FORCED | NFS_INO_INVALID_SIZE | + NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA); + spin_unlock(&dst_inode->i_lock); + spin_lock(&src_inode->i_lock); + NFS_I(src_inode)->cache_validity |= (NFS_INO_REVAL_PAGECACHE | + NFS_INO_REVAL_FORCED | NFS_INO_INVALID_ATIME); + spin_unlock(&src_inode->i_lock); status = res->write_res.count; out: if (args->sync) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index eb2401079b04..78c46a517fcf 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1200,13 +1200,6 @@ static void nfs_get_cache_cookie(struct super_block *sb, } #endif -static void nfs_set_readahead(struct backing_dev_info *bdi, - unsigned long iomax_pages) -{ - bdi->ra_pages = VM_READAHEAD_PAGES; - bdi->io_pages = iomax_pages; -} - int nfs_get_tree_common(struct fs_context *fc) { struct nfs_fs_context *ctx = nfs_fc2context(fc); @@ -1251,7 +1244,7 @@ int nfs_get_tree_common(struct fs_context *fc) MINOR(server->s_dev)); if (error) goto error_splat_super; - nfs_set_readahead(s->s_bdi, server->rpages); + s->s_bdi->io_pages = server->rpages; server->super = s; } diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 311e5ce80cfc..a07c39c94bbd 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -170,7 +170,7 @@ nfsd4_block_proc_getdeviceinfo(struct super_block *sb, struct nfs4_client *clp, struct nfsd4_getdeviceinfo *gdp) { - if (sb->s_bdev != sb->s_bdev->bd_contains) + if (bdev_is_partition(sb->s_bdev)) return nfserr_inval; return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp)); } @@ -382,7 +382,7 @@ nfsd4_scsi_proc_getdeviceinfo(struct super_block *sb, struct nfs4_client *clp, struct nfsd4_getdeviceinfo *gdp) { - if (sb->s_bdev != sb->s_bdev->bd_contains) + if (bdev_is_partition(sb->s_bdev)) return nfserr_inval; return nfserrno(nfsd4_block_get_device_info_scsi(sb, clp, gdp)); } diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index e516ae389ca5..5900879d5693 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -355,7 +355,7 @@ void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *bmap, /** * nilfs_bmap_assign - assign a new block number to a block * @bmap: bmap - * @bhp: pointer to buffer head + * @bh: pointer to buffer head * @blocknr: block number * @binfo: block information * diff --git 
a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index 86d4d850d130..025fb082575a 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -889,7 +889,7 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) * nilfs_cpfile_change_cpmode - change checkpoint mode * @cpfile: inode of checkpoint file * @cno: checkpoint number - * @status: mode of checkpoint + * @mode: mode of checkpoint * * Description: nilfs_change_cpmode() changes the mode of the checkpoint * specified by @cno. The mode @mode is NILFS_CHECKPOINT or NILFS_SNAPSHOT. @@ -930,12 +930,12 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) /** * nilfs_cpfile_get_stat - get checkpoint statistics * @cpfile: inode of checkpoint file - * @stat: pointer to a structure of checkpoint statistics + * @cpstat: pointer to a structure of checkpoint statistics * * Description: nilfs_cpfile_get_stat() returns information about checkpoints. * * Return Value: On success, 0 is returned, and checkpoints information is - * stored in the place pointed by @stat. On error, one of the following + * stored in the place pointed by @cpstat. On error, one of the following * negative error codes is returned. * * %-EIO - I/O error. diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index b175f1330408..171fb5cd427f 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -69,7 +69,6 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode, /** * nilfs_forget_buffer - discard dirty state - * @inode: owner inode of the buffer * @bh: buffer head of the buffer to be discarded */ void nilfs_forget_buffer(struct buffer_head *bh) diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 42ff67c0c14f..63722475e17e 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -546,13 +546,13 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, /** * nilfs_sufile_get_stat - get segment usage statistics * @sufile: inode of segment usage file - * @stat: pointer to a structure of segment usage statistics + * @sustat: pointer to a structure of segment usage statistics * * Description: nilfs_sufile_get_stat() returns information about segment * usage. * * Return Value: On success, 0 is returned, and segment usage information is - * stored in the place pointed by @stat. On error, one of the following + * stored in the place pointed by @sustat. On error, one of the following * negative error codes is returned. * * %-EIO - I/O error. diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index c942910a8649..9167884a61ec 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -531,6 +531,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, struct inode *dirid = fanotify_dfid_inode(mask, data, data_type, dir); const struct path *path = fsnotify_data_path(data, data_type); unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); + struct mem_cgroup *old_memcg; struct inode *child = NULL; bool name_event = false; @@ -580,7 +581,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, gfp |= __GFP_RETRY_MAYFAIL; /* Whoever is interested in the event, pays for the allocation. 
*/ - memalloc_use_memcg(group->memcg); + old_memcg = set_active_memcg(group->memcg); if (fanotify_is_perm_event(mask)) { event = fanotify_alloc_perm_event(path, gfp); @@ -608,7 +609,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, event->pid = get_pid(task_tgid(current)); out: - memalloc_unuse_memcg(); + set_active_memcg(old_memcg); return event; } diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index a65cf8c9f600..9ddcbadc98e2 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -66,6 +66,7 @@ static int inotify_one_event(struct fsnotify_group *group, u32 mask, int ret; int len = 0; int alloc_len = sizeof(struct inotify_event_info); + struct mem_cgroup *old_memcg; if ((inode_mark->mask & FS_EXCL_UNLINK) && path && d_unlinked(path->dentry)) @@ -87,9 +88,9 @@ static int inotify_one_event(struct fsnotify_group *group, u32 mask, * trigger OOM killer in the target monitoring memcg as it may have * security repercussion. */ - memalloc_use_memcg(group->memcg); + old_memcg = set_active_memcg(group->memcg); event = kmalloc(alloc_len, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); - memalloc_unuse_memcg(); + set_active_memcg(old_memcg); if (unlikely(!event)) { /* diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 9bb9f0952b18..caf563981532 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -1810,6 +1810,12 @@ int ntfs_read_inode_mount(struct inode *vi) brelse(bh); } + if (le32_to_cpu(m->bytes_allocated) != vol->mft_record_size) { + ntfs_error(sb, "Incorrect mft record size %u in superblock, should be %u.", + le32_to_cpu(m->bytes_allocated), vol->mft_record_size); + goto err_out; + } + /* Apply the mst fixups. */ if (post_read_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size)) { /* FIXME: Try to use the $MFTMirr now. */ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 4c1b90442d6f..78710788c237 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6013,7 +6013,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) goto out; } - /* Appending truncate log(TA) and and flushing truncate log(TF) are + /* Appending truncate log(TA) and flushing truncate log(TF) are * two separated transactions. They can be both committed but not * checkpointed. If crash occurs then, both two transaction will be * replayed with several already released to global bitmap clusters. 
@@ -7654,8 +7654,10 @@ out_mutex: * main_bm related locks for avoiding the current IO starve, then go to * trim the next group */ - if (ret >= 0 && group <= last_group) + if (ret >= 0 && group <= last_group) { + cond_resched(); goto next_group; + } out: range->len = trimmed * sb->s_blocksize; return ret; diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 89d13e0705fe..0179a73a3fa2 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1766,7 +1766,6 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, int sectsize; char *p = (char *)page; struct fd f; - struct inode *inode; ssize_t ret = -EINVAL; int live_threshold; @@ -1793,20 +1792,16 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, reg->hr_block_bytes == 0) goto out2; - inode = igrab(f.file->f_mapping->host); - if (inode == NULL) + if (!S_ISBLK(f.file->f_mapping->host->i_mode)) goto out2; - if (!S_ISBLK(inode->i_mode)) - goto out3; - - reg->hr_bdev = I_BDEV(f.file->f_mapping->host); - ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL); - if (ret) { + reg->hr_bdev = blkdev_get_by_dev(f.file->f_mapping->host->i_rdev, + FMODE_WRITE | FMODE_READ, NULL); + if (IS_ERR(reg->hr_bdev)) { + ret = PTR_ERR(reg->hr_bdev); reg->hr_bdev = NULL; - goto out3; + goto out2; } - inode = NULL; bdevname(reg->hr_bdev, reg->hr_dev_name); @@ -1909,16 +1904,13 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, config_item_name(®->hr_item), reg->hr_dev_name); out3: - iput(inode); + if (ret < 0) { + blkdev_put(reg->hr_bdev, FMODE_READ | FMODE_WRITE); + reg->hr_bdev = NULL; + } out2: fdput(f); out: - if (ret < 0) { - if (reg->hr_bdev) { - blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE); - reg->hr_bdev = NULL; - } - } return ret; } diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 720e9f94957e..fc8252a28cb1 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -677,7 +677,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, /* * Under certain conditions, the window slide code * might have reduced the number of bits available or - * disabled the the local alloc entirely. Re-check + * disabled the local alloc entirely. Re-check * here and return -ENOSPC if necessary. 
*/ status = -ENOSPC; diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index d07fb92b7253..955ecd4030f0 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -43,7 +43,8 @@ static bool ovl_must_copy_xattr(const char *name) !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); } -int ovl_copy_xattr(struct dentry *old, struct dentry *new) +int ovl_copy_xattr(struct super_block *sb, struct dentry *old, + struct dentry *new) { ssize_t list_size, size, value_size = 0; char *buf, *name, *value = NULL; @@ -81,7 +82,7 @@ int ovl_copy_xattr(struct dentry *old, struct dentry *new) } list_size -= slen; - if (ovl_is_private_xattr(name)) + if (ovl_is_private_xattr(sb, name)) continue; retry: size = vfs_getxattr(old, name, value, value_size); @@ -128,7 +129,8 @@ out: return error; } -static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) +static int ovl_copy_up_data(struct ovl_fs *ofs, struct path *old, + struct path *new, loff_t len) { struct file *old_file; struct file *new_file; @@ -218,7 +220,7 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) len -= bytes; } out: - if (!error) + if (!error && ovl_should_sync(ofs)) error = vfs_fsync(new_file, 0); fput(new_file); out_fput: @@ -354,7 +356,8 @@ int ovl_set_origin(struct dentry *dentry, struct dentry *lower, } /* Store file handle of @upper dir in @index dir entry */ -static int ovl_set_upper_fh(struct dentry *upper, struct dentry *index) +static int ovl_set_upper_fh(struct ovl_fs *ofs, struct dentry *upper, + struct dentry *index) { const struct ovl_fh *fh; int err; @@ -363,7 +366,7 @@ static int ovl_set_upper_fh(struct dentry *upper, struct dentry *index) if (IS_ERR(fh)) return PTR_ERR(fh); - err = ovl_do_setxattr(index, OVL_XATTR_UPPER, fh->buf, fh->fb.len, 0); + err = ovl_do_setxattr(ofs, index, OVL_XATTR_UPPER, fh->buf, fh->fb.len); kfree(fh); return err; @@ -408,7 +411,7 @@ static int ovl_create_index(struct dentry *dentry, struct dentry *origin, if (IS_ERR(temp)) goto free_name; - err = ovl_set_upper_fh(upper, temp); + err = ovl_set_upper_fh(OVL_FS(dentry->d_sb), upper, temp); if (err) goto out; @@ -484,6 +487,7 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) { + struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); int err; /* @@ -499,12 +503,13 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) upperpath.dentry = temp; ovl_path_lowerdata(c->dentry, &datapath); - err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size); + err = ovl_copy_up_data(ofs, &datapath, &upperpath, + c->stat.size); if (err) return err; } - err = ovl_copy_xattr(c->lowerpath.dentry, temp); + err = ovl_copy_xattr(c->dentry->d_sb, c->lowerpath.dentry, temp); if (err) return err; @@ -781,9 +786,33 @@ static bool ovl_need_meta_copy_up(struct dentry *dentry, umode_t mode, return true; } +static ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value) +{ + ssize_t res; + char *buf; + + res = vfs_getxattr(dentry, name, NULL, 0); + if (res == -ENODATA || res == -EOPNOTSUPP) + res = 0; + + if (res > 0) { + buf = kzalloc(res, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + res = vfs_getxattr(dentry, name, buf, res); + if (res < 0) + kfree(buf); + else + *value = buf; + } + return res; +} + /* Copy up data of an inode which was copied up metadata only in the past. 
*/ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) { + struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); struct path upperpath, datapath; int err; char *capability = NULL; @@ -799,12 +828,12 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) if (c->stat.size) { err = cap_size = ovl_getxattr(upperpath.dentry, XATTR_NAME_CAPS, - &capability, 0); - if (err < 0 && err != -ENODATA) + &capability); + if (cap_size < 0) goto out; } - err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size); + err = ovl_copy_up_data(ofs, &datapath, &upperpath, c->stat.size); if (err) goto out_free; @@ -813,14 +842,14 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) * don't want that to happen for normal copy-up operation. */ if (capability) { - err = ovl_do_setxattr(upperpath.dentry, XATTR_NAME_CAPS, - capability, cap_size, 0); + err = vfs_setxattr(upperpath.dentry, XATTR_NAME_CAPS, + capability, cap_size, 0); if (err) goto out_free; } - err = vfs_removexattr(upperpath.dentry, OVL_XATTR_METACOPY); + err = ovl_do_removexattr(ofs, upperpath.dentry, OVL_XATTR_METACOPY); if (err) goto out_free; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 1bba4813f9cb..28a075b5f5b2 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -394,7 +394,7 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, if (IS_ERR(opaquedir)) goto out_unlock; - err = ovl_copy_xattr(upper, opaquedir); + err = ovl_copy_xattr(dentry->d_sb, upper, opaquedir); if (err) goto out_cleanup; diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 0e696f72cf65..ed35be3fafc6 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -752,7 +752,7 @@ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb, goto out_err; } if (index) { - err = ovl_verify_origin(index, origin.dentry, false); + err = ovl_verify_origin(ofs, index, origin.dentry, false); if (err) goto out_err; } diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 0d940e29d62b..efccb7c1f9bc 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -136,6 +136,13 @@ static int ovl_real_fdget_meta(const struct file *file, struct fd *real, static int ovl_real_fdget(const struct file *file, struct fd *real) { + if (d_is_dir(file_dentry(file))) { + real->flags = 0; + real->file = ovl_dir_real_file(file, false); + + return PTR_ERR_OR_ZERO(real->file); + } + return ovl_real_fdget_meta(file, real, false); } @@ -331,6 +338,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) struct fd real; const struct cred *old_cred; ssize_t ret; + int ifl = iocb->ki_flags; if (!iov_iter_count(iter)) return 0; @@ -346,11 +354,14 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) if (ret) goto out_unlock; + if (!ovl_should_sync(OVL_FS(inode->i_sb))) + ifl &= ~(IOCB_DSYNC | IOCB_SYNC); + old_cred = ovl_override_creds(file_inode(file)->i_sb); if (is_sync_kiocb(iocb)) { file_start_write(real.file); ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, - ovl_iocb_to_rwf(iocb->ki_flags)); + ovl_iocb_to_rwf(ifl)); file_end_write(real.file); /* Update size */ ovl_copyattr(ovl_inode_real(inode), inode); @@ -370,6 +381,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) real.flags = 0; aio_req->orig_iocb = iocb; kiocb_clone(&aio_req->iocb, iocb, real.file); + aio_req->iocb.ki_flags = ifl; aio_req->iocb.ki_complete = ovl_aio_rw_complete; ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter); if (ret != -EIOCBQUEUED) @@ -433,6 +445,9 @@ static 
int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync) const struct cred *old_cred; int ret; + if (!ovl_should_sync(OVL_FS(file_inode(file)->i_sb))) + return 0; + ret = ovl_real_fdget_meta(file, &real, !datasync); if (ret) return ret; @@ -544,12 +559,28 @@ static long ovl_real_ioctl(struct file *file, unsigned int cmd, return ret; } +static unsigned int ovl_iflags_to_fsflags(unsigned int iflags) +{ + unsigned int flags = 0; + + if (iflags & S_SYNC) + flags |= FS_SYNC_FL; + if (iflags & S_APPEND) + flags |= FS_APPEND_FL; + if (iflags & S_IMMUTABLE) + flags |= FS_IMMUTABLE_FL; + if (iflags & S_NOATIME) + flags |= FS_NOATIME_FL; + + return flags; +} + static long ovl_ioctl_set_flags(struct file *file, unsigned int cmd, - unsigned long arg, unsigned int iflags) + unsigned long arg, unsigned int flags) { long ret; struct inode *inode = file_inode(file); - unsigned int old_iflags; + unsigned int oldflags; if (!inode_owner_or_capable(inode)) return -EACCES; @@ -561,10 +592,9 @@ static long ovl_ioctl_set_flags(struct file *file, unsigned int cmd, inode_lock(inode); /* Check the capability before cred override */ - ret = -EPERM; - old_iflags = READ_ONCE(inode->i_flags); - if (((iflags ^ old_iflags) & (S_APPEND | S_IMMUTABLE)) && - !capable(CAP_LINUX_IMMUTABLE)) + oldflags = ovl_iflags_to_fsflags(READ_ONCE(inode->i_flags)); + ret = vfs_ioc_setflags_prepare(inode, oldflags, flags); + if (ret) goto unlock; ret = ovl_maybe_copy_up(file_dentry(file), O_WRONLY); @@ -583,22 +613,6 @@ unlock: } -static unsigned int ovl_fsflags_to_iflags(unsigned int flags) -{ - unsigned int iflags = 0; - - if (flags & FS_SYNC_FL) - iflags |= S_SYNC; - if (flags & FS_APPEND_FL) - iflags |= S_APPEND; - if (flags & FS_IMMUTABLE_FL) - iflags |= S_IMMUTABLE; - if (flags & FS_NOATIME_FL) - iflags |= S_NOATIME; - - return iflags; -} - static long ovl_ioctl_set_fsflags(struct file *file, unsigned int cmd, unsigned long arg) { @@ -607,24 +621,23 @@ static long ovl_ioctl_set_fsflags(struct file *file, unsigned int cmd, if (get_user(flags, (int __user *) arg)) return -EFAULT; - return ovl_ioctl_set_flags(file, cmd, arg, - ovl_fsflags_to_iflags(flags)); + return ovl_ioctl_set_flags(file, cmd, arg, flags); } -static unsigned int ovl_fsxflags_to_iflags(unsigned int xflags) +static unsigned int ovl_fsxflags_to_fsflags(unsigned int xflags) { - unsigned int iflags = 0; + unsigned int flags = 0; if (xflags & FS_XFLAG_SYNC) - iflags |= S_SYNC; + flags |= FS_SYNC_FL; if (xflags & FS_XFLAG_APPEND) - iflags |= S_APPEND; + flags |= FS_APPEND_FL; if (xflags & FS_XFLAG_IMMUTABLE) - iflags |= S_IMMUTABLE; + flags |= FS_IMMUTABLE_FL; if (xflags & FS_XFLAG_NOATIME) - iflags |= S_NOATIME; + flags |= FS_NOATIME_FL; - return iflags; + return flags; } static long ovl_ioctl_set_fsxflags(struct file *file, unsigned int cmd, @@ -637,10 +650,10 @@ static long ovl_ioctl_set_fsxflags(struct file *file, unsigned int cmd, return -EFAULT; return ovl_ioctl_set_flags(file, cmd, arg, - ovl_fsxflags_to_iflags(fa.fsx_xflags)); + ovl_fsxflags_to_fsflags(fa.fsx_xflags)); } -static long ovl_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +long ovl_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { long ret; @@ -665,8 +678,8 @@ static long ovl_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return ret; } -static long ovl_compat_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) +#ifdef CONFIG_COMPAT +long ovl_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { switch (cmd) { case 
FS_IOC32_GETFLAGS: @@ -683,6 +696,7 @@ static long ovl_compat_ioctl(struct file *file, unsigned int cmd, return ovl_ioctl(file, cmd, arg); } +#endif enum ovl_copyop { OVL_COPY, @@ -784,7 +798,9 @@ const struct file_operations ovl_file_operations = { .fallocate = ovl_fallocate, .fadvise = ovl_fadvise, .unlocked_ioctl = ovl_ioctl, +#ifdef CONFIG_COMPAT .compat_ioctl = ovl_compat_ioctl, +#endif .splice_read = ovl_splice_read, .splice_write = ovl_splice_write, diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 8be6cd264f66..b584dca845ba 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -327,7 +327,7 @@ static const char *ovl_get_link(struct dentry *dentry, return p; } -bool ovl_is_private_xattr(const char *name) +bool ovl_is_private_xattr(struct super_block *sb, const char *name) { return strncmp(name, OVL_XATTR_PREFIX, sizeof(OVL_XATTR_PREFIX) - 1) == 0; @@ -391,15 +391,18 @@ int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, return res; } -static bool ovl_can_list(const char *s) +static bool ovl_can_list(struct super_block *sb, const char *s) { + /* Never list private (.overlay) */ + if (ovl_is_private_xattr(sb, s)) + return false; + /* List all non-trusted xatts */ if (strncmp(s, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) != 0) return true; - /* Never list trusted.overlay, list other trusted for superuser only */ - return !ovl_is_private_xattr(s) && - ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN); + /* list other trusted for superuser only */ + return ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN); } ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) @@ -425,7 +428,7 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) return -EIO; len -= slen; - if (!ovl_can_list(s)) { + if (!ovl_can_list(dentry->d_sb, s)) { res -= slen; memmove(s, s + slen, len); } else { @@ -722,8 +725,8 @@ static int ovl_set_nlink_common(struct dentry *dentry, if (WARN_ON(len >= sizeof(buf))) return -EIO; - return ovl_do_setxattr(ovl_dentry_upper(dentry), - OVL_XATTR_NLINK, buf, len, 0); + return ovl_do_setxattr(OVL_FS(inode->i_sb), ovl_dentry_upper(dentry), + OVL_XATTR_NLINK, buf, len); } int ovl_set_nlink_upper(struct dentry *dentry) @@ -736,7 +739,7 @@ int ovl_set_nlink_lower(struct dentry *dentry) return ovl_set_nlink_common(dentry, ovl_dentry_lower(dentry), "L%+i"); } -unsigned int ovl_get_nlink(struct dentry *lowerdentry, +unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry, struct dentry *upperdentry, unsigned int fallback) { @@ -748,7 +751,8 @@ unsigned int ovl_get_nlink(struct dentry *lowerdentry, if (!lowerdentry || !upperdentry || d_inode(lowerdentry)->i_nlink == 1) return fallback; - err = vfs_getxattr(upperdentry, OVL_XATTR_NLINK, &buf, sizeof(buf) - 1); + err = ovl_do_getxattr(ofs, upperdentry, OVL_XATTR_NLINK, + &buf, sizeof(buf) - 1); if (err < 0) goto fail; @@ -946,6 +950,7 @@ static struct inode *ovl_iget5(struct super_block *sb, struct inode *newinode, struct inode *ovl_get_inode(struct super_block *sb, struct ovl_inode_params *oip) { + struct ovl_fs *ofs = OVL_FS(sb); struct dentry *upperdentry = oip->upperdentry; struct ovl_path *lowerpath = oip->lowerpath; struct inode *realinode = upperdentry ? 
d_inode(upperdentry) : NULL; @@ -993,7 +998,8 @@ struct inode *ovl_get_inode(struct super_block *sb, /* Recalculate nlink for non-dir due to indexing */ if (!is_dir) - nlink = ovl_get_nlink(lowerdentry, upperdentry, nlink); + nlink = ovl_get_nlink(ofs, lowerdentry, upperdentry, + nlink); set_nlink(inode, nlink); ino = key->i_ino; } else { @@ -1009,7 +1015,7 @@ struct inode *ovl_get_inode(struct super_block *sb, ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev); ovl_inode_init(inode, oip, ino, fsid); - if (upperdentry && ovl_is_impuredir(upperdentry)) + if (upperdentry && ovl_is_impuredir(sb, upperdentry)) ovl_set_flag(OVL_IMPURE, inode); if (oip->index) @@ -1023,7 +1029,7 @@ struct inode *ovl_get_inode(struct super_block *sb, /* Check for non-merge dir that may have whiteouts */ if (is_dir) { if (((upperdentry && lowerdentry) || oip->numlower > 1) || - ovl_check_origin_xattr(upperdentry ?: lowerdentry)) { + ovl_check_origin_xattr(ofs, upperdentry ?: lowerdentry)) { ovl_set_flag(OVL_WHITEOUTS, inode); } } diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index f7d4358db637..a6162c4076db 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -30,8 +30,9 @@ static int ovl_check_redirect(struct dentry *dentry, struct ovl_lookup_data *d, { int res; char *buf; + struct ovl_fs *ofs = OVL_FS(d->sb); - buf = ovl_get_redirect_xattr(dentry, prelen + strlen(post)); + buf = ovl_get_redirect_xattr(ofs, dentry, prelen + strlen(post)); if (IS_ERR_OR_NULL(buf)) return PTR_ERR(buf); @@ -104,12 +105,13 @@ int ovl_check_fb_len(struct ovl_fb *fb, int fb_len) return 0; } -static struct ovl_fh *ovl_get_fh(struct dentry *dentry, const char *name) +static struct ovl_fh *ovl_get_fh(struct ovl_fs *ofs, struct dentry *dentry, + enum ovl_xattr ox) { int res, err; struct ovl_fh *fh = NULL; - res = vfs_getxattr(dentry, name, NULL, 0); + res = ovl_do_getxattr(ofs, dentry, ox, NULL, 0); if (res < 0) { if (res == -ENODATA || res == -EOPNOTSUPP) return NULL; @@ -123,7 +125,7 @@ static struct ovl_fh *ovl_get_fh(struct dentry *dentry, const char *name) if (!fh) return ERR_PTR(-ENOMEM); - res = vfs_getxattr(dentry, name, fh->buf, res); + res = ovl_do_getxattr(ofs, dentry, ox, fh->buf, res); if (res < 0) goto fail; @@ -186,9 +188,9 @@ struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt, return real; } -static bool ovl_is_opaquedir(struct dentry *dentry) +static bool ovl_is_opaquedir(struct super_block *sb, struct dentry *dentry) { - return ovl_check_dir_xattr(dentry, OVL_XATTR_OPAQUE); + return ovl_check_dir_xattr(sb, dentry, OVL_XATTR_OPAQUE); } static struct dentry *ovl_lookup_positive_unlocked(const char *name, @@ -251,7 +253,7 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, d->stop = true; goto put_and_out; } - err = ovl_check_metacopy_xattr(this); + err = ovl_check_metacopy_xattr(OVL_FS(d->sb), this); if (err < 0) goto out_err; @@ -271,7 +273,7 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, if (d->last) goto out; - if (ovl_is_opaquedir(this)) { + if (ovl_is_opaquedir(d->sb, this)) { d->stop = true; if (last_element) d->opaque = true; @@ -391,7 +393,7 @@ invalid: static int ovl_check_origin(struct ovl_fs *ofs, struct dentry *upperdentry, struct ovl_path **stackp) { - struct ovl_fh *fh = ovl_get_fh(upperdentry, OVL_XATTR_ORIGIN); + struct ovl_fh *fh = ovl_get_fh(ofs, upperdentry, OVL_XATTR_ORIGIN); int err; if (IS_ERR_OR_NULL(fh)) @@ -413,10 +415,10 @@ static int ovl_check_origin(struct ovl_fs *ofs, struct dentry 
*upperdentry, * Verify that @fh matches the file handle stored in xattr @name. * Return 0 on match, -ESTALE on mismatch, < 0 on error. */ -static int ovl_verify_fh(struct dentry *dentry, const char *name, - const struct ovl_fh *fh) +static int ovl_verify_fh(struct ovl_fs *ofs, struct dentry *dentry, + enum ovl_xattr ox, const struct ovl_fh *fh) { - struct ovl_fh *ofh = ovl_get_fh(dentry, name); + struct ovl_fh *ofh = ovl_get_fh(ofs, dentry, ox); int err = 0; if (!ofh) @@ -440,8 +442,9 @@ static int ovl_verify_fh(struct dentry *dentry, const char *name, * * Return 0 on match, -ESTALE on mismatch, -ENODATA on no xattr, < 0 on error. */ -int ovl_verify_set_fh(struct dentry *dentry, const char *name, - struct dentry *real, bool is_upper, bool set) +int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry, + enum ovl_xattr ox, struct dentry *real, bool is_upper, + bool set) { struct inode *inode; struct ovl_fh *fh; @@ -454,9 +457,9 @@ int ovl_verify_set_fh(struct dentry *dentry, const char *name, goto fail; } - err = ovl_verify_fh(dentry, name, fh); + err = ovl_verify_fh(ofs, dentry, ox, fh); if (set && err == -ENODATA) - err = ovl_do_setxattr(dentry, name, fh->buf, fh->fb.len, 0); + err = ovl_do_setxattr(ofs, dentry, ox, fh->buf, fh->fb.len); if (err) goto fail; @@ -481,7 +484,7 @@ struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index) if (!d_is_dir(index)) return dget(index); - fh = ovl_get_fh(index, OVL_XATTR_UPPER); + fh = ovl_get_fh(ofs, index, OVL_XATTR_UPPER); if (IS_ERR_OR_NULL(fh)) return ERR_CAST(fh); @@ -574,7 +577,7 @@ int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index) goto fail; } - err = ovl_verify_fh(upper, OVL_XATTR_ORIGIN, fh); + err = ovl_verify_fh(ofs, upper, OVL_XATTR_ORIGIN, fh); dput(upper); if (err) goto fail; @@ -585,7 +588,7 @@ int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index) if (err) goto fail; - if (ovl_get_nlink(origin.dentry, index, 0) == 0) + if (ovl_get_nlink(ofs, origin.dentry, index, 0) == 0) goto orphan; } @@ -741,7 +744,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, } /* Verify that dir index 'upper' xattr points to upper dir */ - err = ovl_verify_upper(index, upper, false); + err = ovl_verify_upper(ofs, index, upper, false); if (err) { if (err == -ESTALE) { pr_warn_ratelimited("suspected multiply redirected dir found (upper=%pd2, origin=%pd2, index=%pd2).\n", @@ -790,12 +793,12 @@ int ovl_path_next(int idx, struct dentry *dentry, struct path *path) } /* Fix missing 'origin' xattr */ -static int ovl_fix_origin(struct dentry *dentry, struct dentry *lower, - struct dentry *upper) +static int ovl_fix_origin(struct ovl_fs *ofs, struct dentry *dentry, + struct dentry *lower, struct dentry *upper) { int err; - if (ovl_check_origin_xattr(upper)) + if (ovl_check_origin_xattr(ofs, upper)) return 0; err = ovl_want_write(dentry); @@ -920,7 +923,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, * of lower dir and set upper parent "impure". 
*/ if (upperdentry && !ctr && !ofs->noxattr && d.is_dir) { - err = ovl_fix_origin(dentry, this, upperdentry); + err = ovl_fix_origin(ofs, dentry, this, upperdentry); if (err) { dput(this); goto out_put; @@ -939,7 +942,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (upperdentry && !ctr && ((d.is_dir && ovl_verify_lower(dentry->d_sb)) || (!d.is_dir && ofs->config.index && origin_path))) { - err = ovl_verify_origin(upperdentry, this, false); + err = ovl_verify_origin(ofs, upperdentry, this, false); if (err) { dput(this); if (d.is_dir) @@ -1060,13 +1063,13 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, ovl_dentry_set_upper_alias(dentry); else if (index) { upperdentry = dget(index); - upperredirect = ovl_get_redirect_xattr(upperdentry, 0); + upperredirect = ovl_get_redirect_xattr(ofs, upperdentry, 0); if (IS_ERR(upperredirect)) { err = PTR_ERR(upperredirect); upperredirect = NULL; goto out_free_oe; } - err = ovl_check_metacopy_xattr(upperdentry); + err = ovl_check_metacopy_xattr(ofs, upperdentry); if (err < 0) goto out_free_oe; uppermetacopy = err; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 29bc1ec699e7..f8880aa2ba0e 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -23,13 +23,16 @@ enum ovl_path_type { #define OVL_TYPE_ORIGIN(type) ((type) & __OVL_PATH_ORIGIN) #define OVL_XATTR_PREFIX XATTR_TRUSTED_PREFIX "overlay." -#define OVL_XATTR_OPAQUE OVL_XATTR_PREFIX "opaque" -#define OVL_XATTR_REDIRECT OVL_XATTR_PREFIX "redirect" -#define OVL_XATTR_ORIGIN OVL_XATTR_PREFIX "origin" -#define OVL_XATTR_IMPURE OVL_XATTR_PREFIX "impure" -#define OVL_XATTR_NLINK OVL_XATTR_PREFIX "nlink" -#define OVL_XATTR_UPPER OVL_XATTR_PREFIX "upper" -#define OVL_XATTR_METACOPY OVL_XATTR_PREFIX "metacopy" + +enum ovl_xattr { + OVL_XATTR_OPAQUE, + OVL_XATTR_REDIRECT, + OVL_XATTR_ORIGIN, + OVL_XATTR_IMPURE, + OVL_XATTR_NLINK, + OVL_XATTR_UPPER, + OVL_XATTR_METACOPY, +}; enum ovl_inode_flag { /* Pure upper dir that may contain non pure upper entries */ @@ -110,6 +113,12 @@ struct ovl_fh { #define OVL_FH_FID_OFFSET (OVL_FH_WIRE_OFFSET + \ offsetof(struct ovl_fb, fid)) +extern const char *ovl_xattr_table[]; +static inline const char *ovl_xattr(struct ovl_fs *ofs, enum ovl_xattr ox) +{ + return ovl_xattr_table[ox]; +} + static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry) { int err = vfs_rmdir(dir, dentry); @@ -170,17 +179,29 @@ static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry, return err; } -static inline int ovl_do_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +static inline ssize_t ovl_do_getxattr(struct ovl_fs *ofs, struct dentry *dentry, + enum ovl_xattr ox, void *value, + size_t size) +{ + const char *name = ovl_xattr(ofs, ox); + return vfs_getxattr(dentry, name, value, size); +} + +static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry, + enum ovl_xattr ox, const void *value, + size_t size) { - int err = vfs_setxattr(dentry, name, value, size, flags); - pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, 0x%x) = %i\n", - dentry, name, min((int)size, 48), value, size, flags, err); + const char *name = ovl_xattr(ofs, ox); + int err = vfs_setxattr(dentry, name, value, size, 0); + pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, 0) = %i\n", + dentry, name, min((int)size, 48), value, size, err); return err; } -static inline int ovl_do_removexattr(struct dentry *dentry, const char *name) +static inline int 
ovl_do_removexattr(struct ovl_fs *ofs, struct dentry *dentry, + enum ovl_xattr ox) { + const char *name = ovl_xattr(ofs, ox); int err = vfs_removexattr(dentry, name); pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err); return err; @@ -280,10 +301,11 @@ struct file *ovl_path_open(struct path *path, int flags); int ovl_copy_up_start(struct dentry *dentry, int flags); void ovl_copy_up_end(struct dentry *dentry); bool ovl_already_copied_up(struct dentry *dentry, int flags); -bool ovl_check_origin_xattr(struct dentry *dentry); -bool ovl_check_dir_xattr(struct dentry *dentry, const char *name); +bool ovl_check_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry); +bool ovl_check_dir_xattr(struct super_block *sb, struct dentry *dentry, + enum ovl_xattr ox); int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry, - const char *name, const void *value, size_t size, + enum ovl_xattr ox, const void *value, size_t size, int xerr); int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry); void ovl_set_flag(unsigned long flag, struct inode *inode); @@ -296,15 +318,15 @@ bool ovl_need_index(struct dentry *dentry); int ovl_nlink_start(struct dentry *dentry); void ovl_nlink_end(struct dentry *dentry); int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir); -int ovl_check_metacopy_xattr(struct dentry *dentry); +int ovl_check_metacopy_xattr(struct ovl_fs *ofs, struct dentry *dentry); bool ovl_is_metacopy_dentry(struct dentry *dentry); -char *ovl_get_redirect_xattr(struct dentry *dentry, int padding); -ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value, - size_t padding); +char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct dentry *dentry, + int padding); -static inline bool ovl_is_impuredir(struct dentry *dentry) +static inline bool ovl_is_impuredir(struct super_block *sb, + struct dentry *dentry) { - return ovl_check_dir_xattr(dentry, OVL_XATTR_IMPURE); + return ovl_check_dir_xattr(sb, dentry, OVL_XATTR_IMPURE); } /* @@ -365,8 +387,9 @@ struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt, bool connected); int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, struct dentry *upperdentry, struct ovl_path **stackp); -int ovl_verify_set_fh(struct dentry *dentry, const char *name, - struct dentry *real, bool is_upper, bool set); +int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry, + enum ovl_xattr ox, struct dentry *real, bool is_upper, + bool set); struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index); int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index); int ovl_get_index_name(struct dentry *origin, struct qstr *name); @@ -378,20 +401,22 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); bool ovl_lower_positive(struct dentry *dentry); -static inline int ovl_verify_origin(struct dentry *upper, +static inline int ovl_verify_origin(struct ovl_fs *ofs, struct dentry *upper, struct dentry *origin, bool set) { - return ovl_verify_set_fh(upper, OVL_XATTR_ORIGIN, origin, false, set); + return ovl_verify_set_fh(ofs, upper, OVL_XATTR_ORIGIN, origin, + false, set); } -static inline int ovl_verify_upper(struct dentry *index, - struct dentry *upper, bool set) +static inline int ovl_verify_upper(struct ovl_fs *ofs, struct dentry *index, + struct dentry *upper, bool set) { - return ovl_verify_set_fh(index, OVL_XATTR_UPPER, upper, true, set); + return ovl_verify_set_fh(ofs, index, OVL_XATTR_UPPER, upper, true, 
set); } /* readdir.c */ extern const struct file_operations ovl_dir_operations; +struct file *ovl_dir_real_file(const struct file *file, bool want_upper); int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list); void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list); void ovl_cache_free(struct list_head *list); @@ -404,7 +429,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs); /* inode.c */ int ovl_set_nlink_upper(struct dentry *dentry); int ovl_set_nlink_lower(struct dentry *dentry); -unsigned int ovl_get_nlink(struct dentry *lowerdentry, +unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry, struct dentry *upperdentry, unsigned int fallback); int ovl_setattr(struct dentry *dentry, struct iattr *attr); @@ -418,7 +443,7 @@ int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); struct posix_acl *ovl_get_acl(struct inode *inode, int type); int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags); -bool ovl_is_private_xattr(const char *name); +bool ovl_is_private_xattr(struct super_block *sb, const char *name); struct ovl_inode_params { struct inode *newinode; @@ -479,12 +504,15 @@ struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr); extern const struct file_operations ovl_file_operations; int __init ovl_aio_request_cache_init(void); void ovl_aio_request_cache_destroy(void); +long ovl_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +long ovl_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); /* copy_up.c */ int ovl_copy_up(struct dentry *dentry); int ovl_copy_up_with_data(struct dentry *dentry); int ovl_maybe_copy_up(struct dentry *dentry, int flags); -int ovl_copy_xattr(struct dentry *old, struct dentry *new); +int ovl_copy_xattr(struct super_block *sb, struct dentry *old, + struct dentry *new); int ovl_set_attr(struct dentry *upper, struct kstat *stat); struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper); int ovl_set_origin(struct dentry *dentry, struct dentry *lower, diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index b429c80879ee..1b5a2094df8e 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -17,6 +17,7 @@ struct ovl_config { bool nfs_export; int xino; bool metacopy; + bool ovl_volatile; }; struct ovl_sb { @@ -90,6 +91,11 @@ static inline struct ovl_fs *OVL_FS(struct super_block *sb) return (struct ovl_fs *)sb->s_fs_info; } +static inline bool ovl_should_sync(struct ovl_fs *ofs) +{ + return !ofs->config.ovl_volatile; +} + /* private information held for every overlayfs dentry */ struct ovl_entry { union { diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 6918b98faeb6..01620ebae1bd 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -606,6 +606,7 @@ static struct ovl_dir_cache *ovl_cache_get_impure(struct path *path) { int res; struct dentry *dentry = path->dentry; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct ovl_dir_cache *cache; cache = ovl_dir_cache(d_inode(dentry)); @@ -632,7 +633,7 @@ static struct ovl_dir_cache *ovl_cache_get_impure(struct path *path) * Removing the "impure" xattr is best effort. 
*/ if (!ovl_want_write(dentry)) { - ovl_do_removexattr(ovl_dentry_upper(dentry), + ovl_do_removexattr(ofs, ovl_dentry_upper(dentry), OVL_XATTR_IMPURE); ovl_drop_write(dentry); } @@ -839,7 +840,7 @@ out_unlock: return res; } -static struct file *ovl_dir_open_realfile(struct file *file, +static struct file *ovl_dir_open_realfile(const struct file *file, struct path *realpath) { struct file *res; @@ -852,16 +853,22 @@ static struct file *ovl_dir_open_realfile(struct file *file, return res; } -static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, - int datasync) +/* + * Like ovl_real_fdget(), returns upperfile if dir was copied up since open. + * Unlike ovl_real_fdget(), this caches upperfile in file->private_data. + * + * TODO: use same abstract type for file->private_data of dir and file so + * upperfile could also be cached for files as well. + */ +struct file *ovl_dir_real_file(const struct file *file, bool want_upper) { + struct ovl_dir_file *od = file->private_data; struct dentry *dentry = file->f_path.dentry; struct file *realfile = od->realfile; - /* Nothing to sync for lower */ if (!OVL_TYPE_UPPER(ovl_path_type(dentry))) - return 0; + return want_upper ? NULL : realfile; /* * Need to check if we started out being a lower dir, but got copied up @@ -880,7 +887,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, if (!od->upperfile) { if (IS_ERR(realfile)) { inode_unlock(inode); - return PTR_ERR(realfile); + return realfile; } smp_store_release(&od->upperfile, realfile); } else { @@ -893,6 +900,25 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, } } + return realfile; +} + +static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, + int datasync) +{ + struct file *realfile; + int err; + + if (!ovl_should_sync(OVL_FS(file->f_path.dentry->d_sb))) + return 0; + + realfile = ovl_dir_real_file(file, true); + err = PTR_ERR_OR_ZERO(realfile); + + /* Nothing to sync for lower */ + if (!realfile || err) + return err; + return vfs_fsync_range(realfile, start, end, datasync); } @@ -945,6 +971,10 @@ const struct file_operations ovl_dir_operations = { .llseek = ovl_dir_llseek, .fsync = ovl_dir_fsync, .release = ovl_dir_release, + .unlocked_ioctl = ovl_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ovl_compat_ioctl, +#endif }; int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) @@ -1051,7 +1081,9 @@ int ovl_check_d_type_supported(struct path *realpath) return rdd.d_type_supported; } -static void ovl_workdir_cleanup_recurse(struct path *path, int level) +#define OVL_INCOMPATDIR_NAME "incompat" + +static int ovl_workdir_cleanup_recurse(struct path *path, int level) { int err; struct inode *dir = path->dentry->d_inode; @@ -1065,6 +1097,19 @@ static void ovl_workdir_cleanup_recurse(struct path *path, int level) .root = &root, .is_lowest = false, }; + bool incompat = false; + + /* + * The "work/incompat" directory is treated specially - if it is not + * empty, instead of printing a generic error and mounting read-only, + * we will error about incompat features and fail the mount. + * + * When called from ovl_indexdir_cleanup(), path->dentry->d_name.name + * starts with '#'. 
+ */ + if (level == 2 && + !strcmp(path->dentry->d_name.name, OVL_INCOMPATDIR_NAME)) + incompat = true; err = ovl_dir_read(path, &rdd); if (err) @@ -1079,17 +1124,25 @@ static void ovl_workdir_cleanup_recurse(struct path *path, int level) continue; if (p->len == 2 && p->name[1] == '.') continue; + } else if (incompat) { + pr_err("overlay with incompat feature '%s' cannot be mounted\n", + p->name); + err = -EINVAL; + break; } dentry = lookup_one_len(p->name, path->dentry, p->len); if (IS_ERR(dentry)) continue; if (dentry->d_inode) - ovl_workdir_cleanup(dir, path->mnt, dentry, level); + err = ovl_workdir_cleanup(dir, path->mnt, dentry, level); dput(dentry); + if (err) + break; } inode_unlock(dir); out: ovl_cache_free(&list); + return err; } int ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, @@ -1106,9 +1159,10 @@ int ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, struct path path = { .mnt = mnt, .dentry = dentry }; inode_unlock(dir); - ovl_workdir_cleanup_recurse(&path, level + 1); + err = ovl_workdir_cleanup_recurse(&path, level + 1); inode_lock_nested(dir, I_MUTEX_PARENT); - err = ovl_cleanup(dir, dentry); + if (!err) + err = ovl_cleanup(dir, dentry); } return err; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 4b38141c2985..290983bcfbb3 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -264,6 +264,8 @@ static int ovl_sync_fs(struct super_block *sb, int wait) if (!ovl_upper_mnt(ofs)) return 0; + if (!ovl_should_sync(ofs)) + return 0; /* * Not called for sync(2) call or an emergency sync (SB_I_SKIP_SYNC). * All the super blocks will be iterated, including upper_sb. @@ -362,6 +364,8 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry) if (ofs->config.metacopy != ovl_metacopy_def) seq_printf(m, ",metacopy=%s", ofs->config.metacopy ? 
"on" : "off"); + if (ofs->config.ovl_volatile) + seq_puts(m, ",volatile"); return 0; } @@ -376,9 +380,11 @@ static int ovl_remount(struct super_block *sb, int *flags, char *data) if (*flags & SB_RDONLY && !sb_rdonly(sb)) { upper_sb = ovl_upper_mnt(ofs)->mnt_sb; - down_read(&upper_sb->s_umount); - ret = sync_filesystem(upper_sb); - up_read(&upper_sb->s_umount); + if (ovl_should_sync(ofs)) { + down_read(&upper_sb->s_umount); + ret = sync_filesystem(upper_sb); + up_read(&upper_sb->s_umount); + } } return ret; @@ -411,6 +417,7 @@ enum { OPT_XINO_AUTO, OPT_METACOPY_ON, OPT_METACOPY_OFF, + OPT_VOLATILE, OPT_ERR, }; @@ -429,6 +436,7 @@ static const match_table_t ovl_tokens = { {OPT_XINO_AUTO, "xino=auto"}, {OPT_METACOPY_ON, "metacopy=on"}, {OPT_METACOPY_OFF, "metacopy=off"}, + {OPT_VOLATILE, "volatile"}, {OPT_ERR, NULL} }; @@ -573,6 +581,10 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) metacopy_opt = true; break; + case OPT_VOLATILE: + config->ovl_volatile = true; + break; + default: pr_err("unrecognized mount option \"%s\" or missing value\n", p); @@ -595,6 +607,11 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) config->index = false; } + if (!config->upperdir && config->ovl_volatile) { + pr_info("option \"volatile\" is meaningless in a non-upper mount, ignoring it.\n"); + config->ovl_volatile = false; + } + err = ovl_parse_redirect_mode(config, config->redirect_mode); if (err) return err; @@ -705,8 +722,12 @@ retry: goto out_unlock; retried = true; - ovl_workdir_cleanup(dir, mnt, work, 0); + err = ovl_workdir_cleanup(dir, mnt, work, 0); dput(work); + if (err == -EINVAL) { + work = ERR_PTR(err); + goto out_unlock; + } goto retry; } @@ -1199,11 +1220,50 @@ out_unlock: return err; } +static struct dentry *ovl_lookup_or_create(struct dentry *parent, + const char *name, umode_t mode) +{ + size_t len = strlen(name); + struct dentry *child; + + inode_lock_nested(parent->d_inode, I_MUTEX_PARENT); + child = lookup_one_len(name, parent, len); + if (!IS_ERR(child) && !child->d_inode) + child = ovl_create_real(parent->d_inode, child, + OVL_CATTR(mode)); + inode_unlock(parent->d_inode); + dput(parent); + + return child; +} + +/* + * Creates $workdir/work/incompat/volatile/dirty file if it is not already + * present. + */ +static int ovl_create_volatile_dirty(struct ovl_fs *ofs) +{ + unsigned int ctr; + struct dentry *d = dget(ofs->workbasedir); + static const char *const volatile_path[] = { + OVL_WORKDIR_NAME, "incompat", "volatile", "dirty" + }; + const char *const *name = volatile_path; + + for (ctr = ARRAY_SIZE(volatile_path); ctr; ctr--, name++) { + d = ovl_lookup_or_create(d, *name, ctr > 1 ? 
S_IFDIR : S_IFREG); + if (IS_ERR(d)) + return PTR_ERR(d); + } + dput(d); + return 0; +} + static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, struct path *workpath) { struct vfsmount *mnt = ovl_upper_mnt(ofs); - struct dentry *temp; + struct dentry *temp, *workdir; bool rename_whiteout; bool d_type; int fh_type; @@ -1213,10 +1273,13 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, if (err) return err; - ofs->workdir = ovl_workdir_create(ofs, OVL_WORKDIR_NAME, false); - if (!ofs->workdir) + workdir = ovl_workdir_create(ofs, OVL_WORKDIR_NAME, false); + err = PTR_ERR(workdir); + if (IS_ERR_OR_NULL(workdir)) goto out; + ofs->workdir = workdir; + err = ovl_setup_trap(sb, ofs->workdir, &ofs->workdir_trap, "workdir"); if (err) goto out; @@ -1256,7 +1319,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, /* * Check if upper/work fs supports trusted.overlay.* xattr */ - err = ovl_do_setxattr(ofs->workdir, OVL_XATTR_OPAQUE, "0", 1, 0); + err = ovl_do_setxattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE, "0", 1); if (err) { ofs->noxattr = true; ofs->config.index = false; @@ -1264,7 +1327,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, pr_warn("upper fs does not support xattr, falling back to index=off and metacopy=off.\n"); err = 0; } else { - vfs_removexattr(ofs->workdir, OVL_XATTR_OPAQUE); + ovl_do_removexattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE); } /* @@ -1279,6 +1342,18 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, goto out; } + /* + * For volatile mount, create a incompat/volatile/dirty file to keep + * track of it. + */ + if (ofs->config.ovl_volatile) { + err = ovl_create_volatile_dirty(ofs); + if (err < 0) { + pr_err("Failed to create volatile/dirty file.\n"); + goto out; + } + } + /* Check if upper/work fs supports file handles */ fh_type = ovl_can_decode_fh(ofs->workdir->d_sb); if (ofs->config.index && !fh_type) { @@ -1347,6 +1422,7 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs, struct ovl_entry *oe, struct path *upperpath) { struct vfsmount *mnt = ovl_upper_mnt(ofs); + struct dentry *indexdir; int err; err = mnt_want_write(mnt); @@ -1354,8 +1430,8 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs, return err; /* Verify lower root is upper root origin */ - err = ovl_verify_origin(upperpath->dentry, oe->lowerstack[0].dentry, - true); + err = ovl_verify_origin(ofs, upperpath->dentry, + oe->lowerstack[0].dentry, true); if (err) { pr_err("failed to verify upper root origin\n"); goto out; @@ -1366,9 +1442,12 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs, ofs->workdir_trap = NULL; dput(ofs->workdir); ofs->workdir = NULL; - ofs->indexdir = ovl_workdir_create(ofs, OVL_INDEXDIR_NAME, true); - if (ofs->indexdir) { - ofs->workdir = dget(ofs->indexdir); + indexdir = ovl_workdir_create(ofs, OVL_INDEXDIR_NAME, true); + if (IS_ERR(indexdir)) { + err = PTR_ERR(indexdir); + } else if (indexdir) { + ofs->indexdir = indexdir; + ofs->workdir = dget(indexdir); err = ovl_setup_trap(sb, ofs->indexdir, &ofs->indexdir_trap, "indexdir"); @@ -1383,13 +1462,15 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs, * "trusted.overlay.upper" to indicate that index may have * directory entries. 
*/ - if (ovl_check_origin_xattr(ofs->indexdir)) { - err = ovl_verify_set_fh(ofs->indexdir, OVL_XATTR_ORIGIN, + if (ovl_check_origin_xattr(ofs, ofs->indexdir)) { + err = ovl_verify_set_fh(ofs, ofs->indexdir, + OVL_XATTR_ORIGIN, upperpath->dentry, true, false); if (err) pr_err("failed to verify index dir 'origin' xattr\n"); } - err = ovl_verify_upper(ofs->indexdir, upperpath->dentry, true); + err = ovl_verify_upper(ofs, ofs->indexdir, upperpath->dentry, + true); if (err) pr_err("failed to verify index dir 'upper' xattr\n"); @@ -1755,7 +1836,7 @@ static struct dentry *ovl_get_root(struct super_block *sb, ino = d_inode(upperdentry)->i_ino; fsid = 0; ovl_dentry_set_upper_alias(root); - if (ovl_is_impuredir(upperdentry)) + if (ovl_is_impuredir(sb, upperdentry)) ovl_set_flag(OVL_IMPURE, d_inode(root)); } diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 56c1f89f20c9..23f475627d07 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -544,11 +544,11 @@ void ovl_copy_up_end(struct dentry *dentry) ovl_inode_unlock(d_inode(dentry)); } -bool ovl_check_origin_xattr(struct dentry *dentry) +bool ovl_check_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry) { int res; - res = vfs_getxattr(dentry, OVL_XATTR_ORIGIN, NULL, 0); + res = ovl_do_getxattr(ofs, dentry, OVL_XATTR_ORIGIN, NULL, 0); /* Zero size value means "copied up but origin unknown" */ if (res >= 0) @@ -557,7 +557,8 @@ bool ovl_check_origin_xattr(struct dentry *dentry) return false; } -bool ovl_check_dir_xattr(struct dentry *dentry, const char *name) +bool ovl_check_dir_xattr(struct super_block *sb, struct dentry *dentry, + enum ovl_xattr ox) { int res; char val; @@ -565,15 +566,36 @@ bool ovl_check_dir_xattr(struct dentry *dentry, const char *name) if (!d_is_dir(dentry)) return false; - res = vfs_getxattr(dentry, name, &val, 1); + res = ovl_do_getxattr(OVL_FS(sb), dentry, ox, &val, 1); if (res == 1 && val == 'y') return true; return false; } +#define OVL_XATTR_OPAQUE_POSTFIX "opaque" +#define OVL_XATTR_REDIRECT_POSTFIX "redirect" +#define OVL_XATTR_ORIGIN_POSTFIX "origin" +#define OVL_XATTR_IMPURE_POSTFIX "impure" +#define OVL_XATTR_NLINK_POSTFIX "nlink" +#define OVL_XATTR_UPPER_POSTFIX "upper" +#define OVL_XATTR_METACOPY_POSTFIX "metacopy" + +#define OVL_XATTR_TAB_ENTRY(x) \ + [x] = OVL_XATTR_PREFIX x ## _POSTFIX + +const char *ovl_xattr_table[] = { + OVL_XATTR_TAB_ENTRY(OVL_XATTR_OPAQUE), + OVL_XATTR_TAB_ENTRY(OVL_XATTR_REDIRECT), + OVL_XATTR_TAB_ENTRY(OVL_XATTR_ORIGIN), + OVL_XATTR_TAB_ENTRY(OVL_XATTR_IMPURE), + OVL_XATTR_TAB_ENTRY(OVL_XATTR_NLINK), + OVL_XATTR_TAB_ENTRY(OVL_XATTR_UPPER), + OVL_XATTR_TAB_ENTRY(OVL_XATTR_METACOPY), +}; + int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry, - const char *name, const void *value, size_t size, + enum ovl_xattr ox, const void *value, size_t size, int xerr) { int err; @@ -582,10 +604,10 @@ int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry, if (ofs->noxattr) return xerr; - err = ovl_do_setxattr(upperdentry, name, value, size, 0); + err = ovl_do_setxattr(ofs, upperdentry, ox, value, size); if (err == -EOPNOTSUPP) { - pr_warn("cannot set %s xattr on upper\n", name); + pr_warn("cannot set %s xattr on upper\n", ovl_xattr(ofs, ox)); ofs->noxattr = true; return xerr; } @@ -845,7 +867,7 @@ err: } /* err < 0, 0 if no metacopy xattr, 1 if metacopy xattr found */ -int ovl_check_metacopy_xattr(struct dentry *dentry) +int ovl_check_metacopy_xattr(struct ovl_fs *ofs, struct dentry *dentry) { int res; @@ -853,7 +875,7 @@ int 
ovl_check_metacopy_xattr(struct dentry *dentry) if (!S_ISREG(d_inode(dentry)->i_mode)) return 0; - res = vfs_getxattr(dentry, OVL_XATTR_METACOPY, NULL, 0); + res = ovl_do_getxattr(ofs, dentry, OVL_XATTR_METACOPY, NULL, 0); if (res < 0) { if (res == -ENODATA || res == -EOPNOTSUPP) return 0; @@ -882,49 +904,27 @@ bool ovl_is_metacopy_dentry(struct dentry *dentry) return (oe->numlower > 1); } -ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value, - size_t padding) -{ - ssize_t res; - char *buf = NULL; - - res = vfs_getxattr(dentry, name, NULL, 0); - if (res < 0) { - if (res == -ENODATA || res == -EOPNOTSUPP) - return -ENODATA; - goto fail; - } - - if (res != 0) { - buf = kzalloc(res + padding, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - res = vfs_getxattr(dentry, name, buf, res); - if (res < 0) - goto fail; - } - *value = buf; - - return res; - -fail: - pr_warn_ratelimited("failed to get xattr %s: err=%zi)\n", - name, res); - kfree(buf); - return res; -} - -char *ovl_get_redirect_xattr(struct dentry *dentry, int padding) +char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct dentry *dentry, + int padding) { int res; char *s, *next, *buf = NULL; - res = ovl_getxattr(dentry, OVL_XATTR_REDIRECT, &buf, padding + 1); - if (res == -ENODATA) + res = ovl_do_getxattr(ofs, dentry, OVL_XATTR_REDIRECT, NULL, 0); + if (res == -ENODATA || res == -EOPNOTSUPP) return NULL; if (res < 0) - return ERR_PTR(res); + goto fail; + if (res == 0) + goto invalid; + + buf = kzalloc(res + padding + 1, GFP_KERNEL); + if (!buf) + return ERR_PTR(-ENOMEM); + + res = ovl_do_getxattr(ofs, dentry, OVL_XATTR_REDIRECT, buf, res); + if (res < 0) + goto fail; if (res == 0) goto invalid; @@ -943,6 +943,10 @@ char *ovl_get_redirect_xattr(struct dentry *dentry, int padding) invalid: pr_warn_ratelimited("invalid redirect (%s)\n", buf); res = -EINVAL; + goto err_free; +fail: + pr_warn_ratelimited("failed to get redirect (%i)\n", res); +err_free: kfree(buf); return ERR_PTR(res); } diff --git a/fs/pipe.c b/fs/pipe.c index 60dbee457143..0ac197658a2d 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -106,25 +106,6 @@ void pipe_double_lock(struct pipe_inode_info *pipe1, } } -/* Drop the inode semaphore and wait for a pipe event, atomically */ -void pipe_wait(struct pipe_inode_info *pipe) -{ - DEFINE_WAIT(rdwait); - DEFINE_WAIT(wrwait); - - /* - * Pipes are system-local resources, so sleeping on them - * is considered a noninteractive wait: - */ - prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE); - prepare_to_wait(&pipe->wr_wait, &wrwait, TASK_INTERRUPTIBLE); - pipe_unlock(pipe); - schedule(); - finish_wait(&pipe->rd_wait, &rdwait); - finish_wait(&pipe->wr_wait, &wrwait); - pipe_lock(pipe); -} - static void anon_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { @@ -913,19 +894,18 @@ int create_pipe_files(struct file **res, int flags) { struct inode *inode = get_pipe_inode(); struct file *f; + int error; if (!inode) return -ENFILE; if (flags & O_NOTIFICATION_PIPE) { -#ifdef CONFIG_WATCH_QUEUE - if (watch_queue_init(inode->i_pipe) < 0) { + error = watch_queue_init(inode->i_pipe); + if (error) { + free_pipe_info(inode->i_pipe); iput(inode); - return -ENOMEM; + return error; } -#else - return -ENOPKG; -#endif } f = alloc_file_pseudo(inode, pipe_mnt, "", @@ -1035,12 +1015,52 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes) return do_pipe2(fildes, 0); } +/* + * This is the stupid "wait for pipe to be readable or writable" + * model. 
+ * + * See pipe_read/write() for the proper kind of exclusive wait, + * but that requires that we wake up any other readers/writers + * if we then do not end up reading everything (ie the whole + * "wake_next_reader/writer" logic in pipe_read/write()). + */ +void pipe_wait_readable(struct pipe_inode_info *pipe) +{ + pipe_unlock(pipe); + wait_event_interruptible(pipe->rd_wait, pipe_readable(pipe)); + pipe_lock(pipe); +} + +void pipe_wait_writable(struct pipe_inode_info *pipe) +{ + pipe_unlock(pipe); + wait_event_interruptible(pipe->wr_wait, pipe_writable(pipe)); + pipe_lock(pipe); +} + +/* + * This depends on both the wait (here) and the wakeup (wake_up_partner) + * holding the pipe lock, so "*cnt" is stable and we know a wakeup cannot + * race with the count check and waitqueue prep. + * + * Normally in order to avoid races, you'd do the prepare_to_wait() first, + * then check the condition you're waiting for, and only then sleep. But + * because of the pipe lock, we can check the condition before being on + * the wait queue. + * + * We use the 'rd_wait' waitqueue for pipe partner waiting. + */ static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt) { + DEFINE_WAIT(rdwait); int cur = *cnt; while (cur == *cnt) { - pipe_wait(pipe); + prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE); + pipe_unlock(pipe); + schedule(); + finish_wait(&pipe->rd_wait, &rdwait); + pipe_lock(pipe); if (signal_pending(current)) break; } @@ -1050,7 +1070,6 @@ static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt) static void wake_up_partner(struct pipe_inode_info *pipe) { wake_up_interruptible_all(&pipe->rd_wait); - wake_up_interruptible_all(&pipe->wr_wait); } static int fifo_open(struct inode *inode, struct file *filp) diff --git a/fs/proc/base.c b/fs/proc/base.c index 617db4e0faa0..0f707003dda5 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1055,7 +1055,6 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, static int __set_oom_adj(struct file *file, int oom_adj, bool legacy) { - static DEFINE_MUTEX(oom_adj_mutex); struct mm_struct *mm = NULL; struct task_struct *task; int err = 0; @@ -1095,7 +1094,7 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy) struct task_struct *p = find_lock_task_mm(task); if (p) { - if (atomic_read(&p->mm->mm_users) > 1) { + if (test_bit(MMF_MULTIPROCESS, &p->mm->flags)) { mm = p->mm; mmgrab(mm); } @@ -1269,6 +1268,10 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, kuid_t kloginuid; int rv; + /* Don't let kthreads write their own loginuid */ + if (current->flags & PF_KTHREAD) + return -EPERM; + rcu_read_lock(); if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { rcu_read_unlock(); diff --git a/fs/proc/page.c b/fs/proc/page.c index f909243d4a66..9f1077d94cde 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -217,6 +217,9 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2); u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1); u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1); +#ifdef CONFIG_64BIT + u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2); +#endif return u; }; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 5066b0251ed8..217aa2705d5d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -520,16 +520,10 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, page = device_private_entry_to_page(swpent); } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap && 
pte_none(*pte))) { - page = find_get_entry(vma->vm_file->f_mapping, + page = xa_load(&vma->vm_file->f_mapping->i_pages, linear_page_index(vma, addr)); - if (!page) - return; - if (xa_is_value(page)) mss->swap += PAGE_SIZE; - else - put_page(page); - return; } @@ -653,6 +647,10 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_MERGEABLE)] = "mg", [ilog2(VM_UFFD_MISSING)]= "um", [ilog2(VM_UFFD_WP)] = "uw", +#ifdef CONFIG_ARM64_MTE + [ilog2(VM_MTE)] = "mt", + [ilog2(VM_MTE_ALLOWED)] = "", +#endif #ifdef CONFIG_ARCH_HAS_PKEYS /* These come out via ProtectionKey: */ [ilog2(VM_PKEY_BIT0)] = "", @@ -723,9 +721,21 @@ static const struct mm_walk_ops smaps_shmem_walk_ops = { .pte_hole = smaps_pte_hole, }; +/* + * Gather mem stats from @vma with the indicated beginning + * address @start, and keep them in @mss. + * + * Use vm_start of @vma as the beginning address if @start is 0. + */ static void smap_gather_stats(struct vm_area_struct *vma, - struct mem_size_stats *mss) + struct mem_size_stats *mss, unsigned long start) { + const struct mm_walk_ops *ops = &smaps_walk_ops; + + /* Invalid start */ + if (start >= vma->vm_end) + return; + #ifdef CONFIG_SHMEM /* In case of smaps_rollup, reset the value from previous vma */ mss->check_shmem_swap = false; @@ -742,18 +752,20 @@ static void smap_gather_stats(struct vm_area_struct *vma, */ unsigned long shmem_swapped = shmem_swap_usage(vma); - if (!shmem_swapped || (vma->vm_flags & VM_SHARED) || - !(vma->vm_flags & VM_WRITE)) { + if (!start && (!shmem_swapped || (vma->vm_flags & VM_SHARED) || + !(vma->vm_flags & VM_WRITE))) { mss->swap += shmem_swapped; } else { mss->check_shmem_swap = true; - walk_page_vma(vma, &smaps_shmem_walk_ops, mss); - return; + ops = &smaps_shmem_walk_ops; } } #endif /* mmap_lock is held in m_start */ - walk_page_vma(vma, &smaps_walk_ops, mss); + if (!start) + walk_page_vma(vma, ops, mss); + else + walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss); } #define SEQ_PUT_DEC(str, val) \ @@ -805,7 +817,7 @@ static int show_smap(struct seq_file *m, void *v) memset(&mss, 0, sizeof(mss)); - smap_gather_stats(vma, &mss); + smap_gather_stats(vma, &mss, 0); show_map_vma(m, vma); @@ -853,9 +865,73 @@ static int show_smaps_rollup(struct seq_file *m, void *v) hold_task_mempolicy(priv); - for (vma = priv->mm->mmap; vma; vma = vma->vm_next) { - smap_gather_stats(vma, &mss); + for (vma = priv->mm->mmap; vma;) { + smap_gather_stats(vma, &mss, 0); last_vma_end = vma->vm_end; + + /* + * Release mmap_lock temporarily if someone wants to + * access it for write request. + */ + if (mmap_lock_is_contended(mm)) { + mmap_read_unlock(mm); + ret = mmap_read_lock_killable(mm); + if (ret) { + release_task_mempolicy(priv); + goto out_put_mm; + } + + /* + * After dropping the lock, there are four cases to + * consider. See the following example for explanation. + * + * +------+------+-----------+ + * | VMA1 | VMA2 | VMA3 | + * +------+------+-----------+ + * | | | | + * 4k 8k 16k 400k + * + * Suppose we drop the lock after reading VMA2 due to + * contention, then we get: + * + * last_vma_end = 16k + * + * 1) VMA2 is freed, but VMA3 exists: + * + * find_vma(mm, 16k - 1) will return VMA3. + * In this case, just continue from VMA3. + * + * 2) VMA2 still exists: + * + * find_vma(mm, 16k - 1) will return VMA2. + * Iterate the loop like the original one. + * + * 3) No more VMAs can be found: + * + * find_vma(mm, 16k - 1) will return NULL. + * No more things to do, just break. 
+ * + * 4) (last_vma_end - 1) is the middle of a vma (VMA'): + * + * find_vma(mm, 16k - 1) will return VMA' whose range + * contains last_vma_end. + * Iterate VMA' from last_vma_end. + */ + vma = find_vma(mm, last_vma_end - 1); + /* Case 3 above */ + if (!vma) + break; + + /* Case 1 above */ + if (vma->vm_start >= last_vma_end) + continue; + + /* Case 4 above */ + if (vma->vm_end > last_vma_end) + smap_gather_stats(vma, &mss, last_vma_end); + } + /* Case 2 above */ + vma = vma->vm_next; } show_vma_header_prefix(m, priv->mm->mmap->vm_start, @@ -1168,24 +1244,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, count = -EINTR; goto out_mm; } - /* - * Avoid to modify vma->vm_flags - * without locked ops while the - * coredump reads the vm_flags. - */ - if (!mmget_still_valid(mm)) { - /* - * Silently return "count" - * like if get_task_mm() - * failed. FIXME: should this - * function have returned - * -ESRCH if get_task_mm() - * failed like if - * get_proc_task() fails? - */ - mmap_write_unlock(mm); - goto out_mm; - } for (vma = mm->mmap; vma; vma = vma->vm_next) { vma->vm_flags &= ~VM_SOFTDIRTY; vma_set_page_prot(vma); diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig index d1ceb76adb71..b59cd172b5f9 100644 --- a/fs/quota/Kconfig +++ b/fs/quota/Kconfig @@ -70,8 +70,3 @@ config QFMT_V2 config QUOTACTL bool default n - -config QUOTACTL_COMPAT - bool - depends on QUOTACTL && COMPAT_FOR_U64_ALIGNMENT - default y diff --git a/fs/quota/Makefile b/fs/quota/Makefile index f2b49d0f0287..9160639daffa 100644 --- a/fs/quota/Makefile +++ b/fs/quota/Makefile @@ -4,5 +4,4 @@ obj-$(CONFIG_QFMT_V1) += quota_v1.o obj-$(CONFIG_QFMT_V2) += quota_v2.o obj-$(CONFIG_QUOTA_TREE) += quota_tree.o obj-$(CONFIG_QUOTACTL) += quota.o kqid.o -obj-$(CONFIG_QUOTACTL_COMPAT) += compat.o obj-$(CONFIG_QUOTA_NETLINK_INTERFACE) += netlink.o diff --git a/fs/quota/compat.c b/fs/quota/compat.c deleted file mode 100644 index c30572857619..000000000000 --- a/fs/quota/compat.c +++ /dev/null @@ -1,120 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include <linux/syscalls.h> -#include <linux/compat.h> -#include <linux/quotaops.h> - -/* - * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64) - * and is necessary due to alignment problems. 
- */ -struct compat_if_dqblk { - compat_u64 dqb_bhardlimit; - compat_u64 dqb_bsoftlimit; - compat_u64 dqb_curspace; - compat_u64 dqb_ihardlimit; - compat_u64 dqb_isoftlimit; - compat_u64 dqb_curinodes; - compat_u64 dqb_btime; - compat_u64 dqb_itime; - compat_uint_t dqb_valid; -}; - -/* XFS structures */ -struct compat_fs_qfilestat { - compat_u64 dqb_bhardlimit; - compat_u64 qfs_nblks; - compat_uint_t qfs_nextents; -}; - -struct compat_fs_quota_stat { - __s8 qs_version; - __u16 qs_flags; - __s8 qs_pad; - struct compat_fs_qfilestat qs_uquota; - struct compat_fs_qfilestat qs_gquota; - compat_uint_t qs_incoredqs; - compat_int_t qs_btimelimit; - compat_int_t qs_itimelimit; - compat_int_t qs_rtbtimelimit; - __u16 qs_bwarnlimit; - __u16 qs_iwarnlimit; -}; - -COMPAT_SYSCALL_DEFINE4(quotactl32, unsigned int, cmd, - const char __user *, special, qid_t, id, - void __user *, addr) -{ - unsigned int cmds; - struct if_dqblk __user *dqblk; - struct compat_if_dqblk __user *compat_dqblk; - struct fs_quota_stat __user *fsqstat; - struct compat_fs_quota_stat __user *compat_fsqstat; - compat_uint_t data; - u16 xdata; - long ret; - - cmds = cmd >> SUBCMDSHIFT; - - switch (cmds) { - case Q_GETQUOTA: - dqblk = compat_alloc_user_space(sizeof(struct if_dqblk)); - compat_dqblk = addr; - ret = kernel_quotactl(cmd, special, id, dqblk); - if (ret) - break; - if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) || - get_user(data, &dqblk->dqb_valid) || - put_user(data, &compat_dqblk->dqb_valid)) - ret = -EFAULT; - break; - case Q_SETQUOTA: - dqblk = compat_alloc_user_space(sizeof(struct if_dqblk)); - compat_dqblk = addr; - ret = -EFAULT; - if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) || - get_user(data, &compat_dqblk->dqb_valid) || - put_user(data, &dqblk->dqb_valid)) - break; - ret = kernel_quotactl(cmd, special, id, dqblk); - break; - case Q_XGETQSTAT: - fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat)); - compat_fsqstat = addr; - ret = kernel_quotactl(cmd, special, id, fsqstat); - if (ret) - break; - ret = -EFAULT; - /* Copying qs_version, qs_flags, qs_pad */ - if (copy_in_user(compat_fsqstat, fsqstat, - offsetof(struct compat_fs_quota_stat, qs_uquota))) - break; - /* Copying qs_uquota */ - if (copy_in_user(&compat_fsqstat->qs_uquota, - &fsqstat->qs_uquota, - sizeof(compat_fsqstat->qs_uquota)) || - get_user(data, &fsqstat->qs_uquota.qfs_nextents) || - put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents)) - break; - /* Copying qs_gquota */ - if (copy_in_user(&compat_fsqstat->qs_gquota, - &fsqstat->qs_gquota, - sizeof(compat_fsqstat->qs_gquota)) || - get_user(data, &fsqstat->qs_gquota.qfs_nextents) || - put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents)) - break; - /* Copying the rest */ - if (copy_in_user(&compat_fsqstat->qs_incoredqs, - &fsqstat->qs_incoredqs, - sizeof(struct compat_fs_quota_stat) - - offsetof(struct compat_fs_quota_stat, qs_incoredqs)) || - get_user(xdata, &fsqstat->qs_iwarnlimit) || - put_user(xdata, &compat_fsqstat->qs_iwarnlimit)) - break; - ret = 0; - break; - default: - ret = kernel_quotactl(cmd, special, id, addr); - } - return ret; -} diff --git a/fs/quota/compat.h b/fs/quota/compat.h new file mode 100644 index 000000000000..ef7d1e12d650 --- /dev/null +++ b/fs/quota/compat.h @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/compat.h> + +struct compat_if_dqblk { + compat_u64 dqb_bhardlimit; + compat_u64 dqb_bsoftlimit; + compat_u64 dqb_curspace; + compat_u64 dqb_ihardlimit; + compat_u64 dqb_isoftlimit; + compat_u64 
dqb_curinodes; + compat_u64 dqb_btime; + compat_u64 dqb_itime; + compat_uint_t dqb_valid; +}; + +struct compat_fs_qfilestat { + compat_u64 dqb_bhardlimit; + compat_u64 qfs_nblks; + compat_uint_t qfs_nextents; +}; + +struct compat_fs_quota_stat { + __s8 qs_version; + __u16 qs_flags; + __s8 qs_pad; + struct compat_fs_qfilestat qs_uquota; + struct compat_fs_qfilestat qs_gquota; + compat_uint_t qs_incoredqs; + compat_int_t qs_btimelimit; + compat_int_t qs_itimelimit; + compat_int_t qs_rtbtimelimit; + __u16 qs_bwarnlimit; + __u16 qs_iwarnlimit; +}; diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 47f9e151988b..9af95c7a0bbe 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -19,6 +19,7 @@ #include <linux/types.h> #include <linux/writeback.h> #include <linux/nospec.h> +#include "compat.h" static int check_quotactl_permission(struct super_block *sb, int type, int cmd, qid_t id) @@ -211,8 +212,18 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id, if (ret) return ret; copy_to_if_dqblk(&idq, &fdq); - if (copy_to_user(addr, &idq, sizeof(idq))) - return -EFAULT; + + if (compat_need_64bit_alignment_fixup()) { + struct compat_if_dqblk __user *compat_dqblk = addr; + + if (copy_to_user(compat_dqblk, &idq, sizeof(*compat_dqblk))) + return -EFAULT; + if (put_user(idq.dqb_valid, &compat_dqblk->dqb_valid)) + return -EFAULT; + } else { + if (copy_to_user(addr, &idq, sizeof(idq))) + return -EFAULT; + } return 0; } @@ -277,8 +288,16 @@ static int quota_setquota(struct super_block *sb, int type, qid_t id, struct if_dqblk idq; struct kqid qid; - if (copy_from_user(&idq, addr, sizeof(idq))) - return -EFAULT; + if (compat_need_64bit_alignment_fixup()) { + struct compat_if_dqblk __user *compat_dqblk = addr; + + if (copy_from_user(&idq, compat_dqblk, sizeof(*compat_dqblk)) || + get_user(idq.dqb_valid, &compat_dqblk->dqb_valid)) + return -EFAULT; + } else { + if (copy_from_user(&idq, addr, sizeof(idq))) + return -EFAULT; + } if (!sb->s_qcop->set_dqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); @@ -382,6 +401,33 @@ static int quota_getstate(struct super_block *sb, int type, return 0; } +static int compat_copy_fs_qfilestat(struct compat_fs_qfilestat __user *to, + struct fs_qfilestat *from) +{ + if (copy_to_user(to, from, sizeof(*to)) || + put_user(from->qfs_nextents, &to->qfs_nextents)) + return -EFAULT; + return 0; +} + +static int compat_copy_fs_quota_stat(struct compat_fs_quota_stat __user *to, + struct fs_quota_stat *from) +{ + if (put_user(from->qs_version, &to->qs_version) || + put_user(from->qs_flags, &to->qs_flags) || + put_user(from->qs_pad, &to->qs_pad) || + compat_copy_fs_qfilestat(&to->qs_uquota, &from->qs_uquota) || + compat_copy_fs_qfilestat(&to->qs_gquota, &from->qs_gquota) || + put_user(from->qs_incoredqs, &to->qs_incoredqs) || + put_user(from->qs_btimelimit, &to->qs_btimelimit) || + put_user(from->qs_itimelimit, &to->qs_itimelimit) || + put_user(from->qs_rtbtimelimit, &to->qs_rtbtimelimit) || + put_user(from->qs_bwarnlimit, &to->qs_bwarnlimit) || + put_user(from->qs_iwarnlimit, &to->qs_iwarnlimit)) + return -EFAULT; + return 0; +} + static int quota_getxstate(struct super_block *sb, int type, void __user *addr) { struct fs_quota_stat fqs; @@ -390,9 +436,14 @@ static int quota_getxstate(struct super_block *sb, int type, void __user *addr) if (!sb->s_qcop->get_state) return -ENOSYS; ret = quota_getstate(sb, type, &fqs); - if (!ret && copy_to_user(addr, &fqs, sizeof(fqs))) + if (ret) + return ret; + + if (compat_need_64bit_alignment_fixup()) + return 
compat_copy_fs_quota_stat(addr, &fqs); + if (copy_to_user(addr, &fqs, sizeof(fqs))) return -EFAULT; - return ret; + return 0; } static int quota_getstatev(struct super_block *sb, int type, @@ -481,6 +532,14 @@ static inline u64 quota_btobb(u64 bytes) return (bytes + (1 << XFS_BB_SHIFT) - 1) >> XFS_BB_SHIFT; } +static inline s64 copy_from_xfs_dqblk_ts(const struct fs_disk_quota *d, + __s32 timer, __s8 timer_hi) +{ + if (d->d_fieldmask & FS_DQ_BIGTIME) + return (u32)timer | (s64)timer_hi << 32; + return timer; +} + static void copy_from_xfs_dqblk(struct qc_dqblk *dst, struct fs_disk_quota *src) { dst->d_spc_hardlimit = quota_bbtob(src->d_blk_hardlimit); @@ -489,14 +548,17 @@ static void copy_from_xfs_dqblk(struct qc_dqblk *dst, struct fs_disk_quota *src) dst->d_ino_softlimit = src->d_ino_softlimit; dst->d_space = quota_bbtob(src->d_bcount); dst->d_ino_count = src->d_icount; - dst->d_ino_timer = src->d_itimer; - dst->d_spc_timer = src->d_btimer; + dst->d_ino_timer = copy_from_xfs_dqblk_ts(src, src->d_itimer, + src->d_itimer_hi); + dst->d_spc_timer = copy_from_xfs_dqblk_ts(src, src->d_btimer, + src->d_btimer_hi); dst->d_ino_warns = src->d_iwarns; dst->d_spc_warns = src->d_bwarns; dst->d_rt_spc_hardlimit = quota_bbtob(src->d_rtb_hardlimit); dst->d_rt_spc_softlimit = quota_bbtob(src->d_rtb_softlimit); dst->d_rt_space = quota_bbtob(src->d_rtbcount); - dst->d_rt_spc_timer = src->d_rtbtimer; + dst->d_rt_spc_timer = copy_from_xfs_dqblk_ts(src, src->d_rtbtimer, + src->d_rtbtimer_hi); dst->d_rt_spc_warns = src->d_rtbwarns; dst->d_fieldmask = 0; if (src->d_fieldmask & FS_DQ_ISOFT) @@ -588,10 +650,26 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id, return sb->s_qcop->set_dqblk(sb, qid, &qdq); } +static inline void copy_to_xfs_dqblk_ts(const struct fs_disk_quota *d, + __s32 *timer_lo, __s8 *timer_hi, s64 timer) +{ + *timer_lo = timer; + if (d->d_fieldmask & FS_DQ_BIGTIME) + *timer_hi = timer >> 32; +} + +static inline bool want_bigtime(s64 timer) +{ + return timer > S32_MAX || timer < S32_MIN; +} + static void copy_to_xfs_dqblk(struct fs_disk_quota *dst, struct qc_dqblk *src, int type, qid_t id) { memset(dst, 0, sizeof(*dst)); + if (want_bigtime(src->d_ino_timer) || want_bigtime(src->d_spc_timer) || + want_bigtime(src->d_rt_spc_timer)) + dst->d_fieldmask |= FS_DQ_BIGTIME; dst->d_version = FS_DQUOT_VERSION; dst->d_id = id; if (type == USRQUOTA) @@ -606,14 +684,17 @@ static void copy_to_xfs_dqblk(struct fs_disk_quota *dst, struct qc_dqblk *src, dst->d_ino_softlimit = src->d_ino_softlimit; dst->d_bcount = quota_btobb(src->d_space); dst->d_icount = src->d_ino_count; - dst->d_itimer = src->d_ino_timer; - dst->d_btimer = src->d_spc_timer; + copy_to_xfs_dqblk_ts(dst, &dst->d_itimer, &dst->d_itimer_hi, + src->d_ino_timer); + copy_to_xfs_dqblk_ts(dst, &dst->d_btimer, &dst->d_btimer_hi, + src->d_spc_timer); dst->d_iwarns = src->d_ino_warns; dst->d_bwarns = src->d_spc_warns; dst->d_rtb_hardlimit = quota_btobb(src->d_rt_spc_hardlimit); dst->d_rtb_softlimit = quota_btobb(src->d_rt_spc_softlimit); dst->d_rtbcount = quota_btobb(src->d_rt_space); - dst->d_rtbtimer = src->d_rt_spc_timer; + copy_to_xfs_dqblk_ts(dst, &dst->d_rtbtimer, &dst->d_rtbtimer_hi, + src->d_rt_spc_timer); dst->d_rtbwarns = src->d_rt_spc_warns; } @@ -816,8 +897,8 @@ static struct super_block *quotactl_block(const char __user *special, int cmd) * calls. Maybe we need to add the process quotas etc. in the future, * but we probably should use rlimits for that. 
*/ -int kernel_quotactl(unsigned int cmd, const char __user *special, - qid_t id, void __user *addr) +SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, + qid_t, id, void __user *, addr) { uint cmds, type; struct super_block *sb = NULL; @@ -871,9 +952,3 @@ out: path_put(pathp); return ret; } - -SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, - qid_t, id, void __user *, addr) -{ - return kernel_quotactl(cmd, special, id, addr); -} diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index 58fc2a7c7fd1..e69a2bfdd81c 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -282,6 +282,7 @@ static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot) d->dqb_curspace = cpu_to_le64(m->dqb_curspace); d->dqb_btime = cpu_to_le64(m->dqb_btime); d->dqb_id = cpu_to_le32(from_kqid(&init_user_ns, dquot->dq_id)); + d->dqb_pad = 0; if (qtree_entry_unused(info, dp)) d->dqb_itime = cpu_to_le64(1); } diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 414695454956..355523f4a4bf 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -224,7 +224,7 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, if (!pages) goto out_free; - nr = find_get_pages(inode->i_mapping, &pgoff, lpages, pages); + nr = find_get_pages_contig(inode->i_mapping, pgoff, lpages, pages); if (nr != lpages) goto out_free_pages; /* leave if some pages were missing */ diff --git a/fs/read_write.c b/fs/read_write.c index 5db58b8c78d0..19f5c4bf75aa 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -538,6 +538,14 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t inc_syscw(current); return ret; } +/* + * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()", + * but autofs is one of the few internal kernel users that actually + * wants this _and_ can be built as a module. So we need to export + * this symbol for autofs, even though it really isn't appropriate + * for any other kernel modules. + */ +EXPORT_SYMBOL_GPL(__kernel_write); ssize_t kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) @@ -752,185 +760,6 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, return ret; } -/** - * rw_copy_check_uvector() - Copy an array of &struct iovec from userspace - * into the kernel and check that it is valid. - * - * @type: One of %CHECK_IOVEC_ONLY, %READ, or %WRITE. - * @uvector: Pointer to the userspace array. - * @nr_segs: Number of elements in userspace array. - * @fast_segs: Number of elements in @fast_pointer. - * @fast_pointer: Pointer to (usually small on-stack) kernel array. - * @ret_pointer: (output parameter) Pointer to a variable that will point to - * either @fast_pointer, a newly allocated kernel array, or NULL, - * depending on which array was used. - * - * This function copies an array of &struct iovec of @nr_segs from - * userspace into the kernel and checks that each element is valid (e.g. - * it does not point to a kernel address or cause overflow by being too - * large, etc.). - * - * As an optimization, the caller may provide a pointer to a small - * on-stack array in @fast_pointer, typically %UIO_FASTIOV elements long - * (the size of this array, or 0 if unused, should be given in @fast_segs). - * - * @ret_pointer will always point to the array that was used, so the - * caller must take care not to call kfree() on it e.g. in case the - * @fast_pointer array was used and it was allocated on the stack. 
- * - * Return: The total number of bytes covered by the iovec array on success - * or a negative error code on error. - */ -ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, - unsigned long nr_segs, unsigned long fast_segs, - struct iovec *fast_pointer, - struct iovec **ret_pointer) -{ - unsigned long seg; - ssize_t ret; - struct iovec *iov = fast_pointer; - - /* - * SuS says "The readv() function *may* fail if the iovcnt argument - * was less than or equal to 0, or greater than {IOV_MAX}. Linux has - * traditionally returned zero for zero segments, so... - */ - if (nr_segs == 0) { - ret = 0; - goto out; - } - - /* - * First get the "struct iovec" from user memory and - * verify all the pointers - */ - if (nr_segs > UIO_MAXIOV) { - ret = -EINVAL; - goto out; - } - if (nr_segs > fast_segs) { - iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL); - if (iov == NULL) { - ret = -ENOMEM; - goto out; - } - } - if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) { - ret = -EFAULT; - goto out; - } - - /* - * According to the Single Unix Specification we should return EINVAL - * if an element length is < 0 when cast to ssize_t or if the - * total length would overflow the ssize_t return value of the - * system call. - * - * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the - * overflow case. - */ - ret = 0; - for (seg = 0; seg < nr_segs; seg++) { - void __user *buf = iov[seg].iov_base; - ssize_t len = (ssize_t)iov[seg].iov_len; - - /* see if we we're about to use an invalid len or if - * it's about to overflow ssize_t */ - if (len < 0) { - ret = -EINVAL; - goto out; - } - if (type >= 0 - && unlikely(!access_ok(buf, len))) { - ret = -EFAULT; - goto out; - } - if (len > MAX_RW_COUNT - ret) { - len = MAX_RW_COUNT - ret; - iov[seg].iov_len = len; - } - ret += len; - } -out: - *ret_pointer = iov; - return ret; -} - -#ifdef CONFIG_COMPAT -ssize_t compat_rw_copy_check_uvector(int type, - const struct compat_iovec __user *uvector, unsigned long nr_segs, - unsigned long fast_segs, struct iovec *fast_pointer, - struct iovec **ret_pointer) -{ - compat_ssize_t tot_len; - struct iovec *iov = *ret_pointer = fast_pointer; - ssize_t ret = 0; - int seg; - - /* - * SuS says "The readv() function *may* fail if the iovcnt argument - * was less than or equal to 0, or greater than {IOV_MAX}. Linux has - * traditionally returned zero for zero segments, so... - */ - if (nr_segs == 0) - goto out; - - ret = -EINVAL; - if (nr_segs > UIO_MAXIOV) - goto out; - if (nr_segs > fast_segs) { - ret = -ENOMEM; - iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL); - if (iov == NULL) - goto out; - } - *ret_pointer = iov; - - ret = -EFAULT; - if (!access_ok(uvector, nr_segs*sizeof(*uvector))) - goto out; - - /* - * Single unix specification: - * We should -EINVAL if an element length is not >= 0 and fitting an - * ssize_t. - * - * In Linux, the total length is limited to MAX_RW_COUNT, there is - * no overflow possibility. - */ - tot_len = 0; - ret = -EINVAL; - for (seg = 0; seg < nr_segs; seg++) { - compat_uptr_t buf; - compat_ssize_t len; - - if (__get_user(len, &uvector->iov_len) || - __get_user(buf, &uvector->iov_base)) { - ret = -EFAULT; - goto out; - } - if (len < 0) /* size_t not fitting in compat_ssize_t .. 
*/ - goto out; - if (type >= 0 && - !access_ok(compat_ptr(buf), len)) { - ret = -EFAULT; - goto out; - } - if (len > MAX_RW_COUNT - tot_len) - len = MAX_RW_COUNT - tot_len; - tot_len += len; - iov->iov_base = compat_ptr(buf); - iov->iov_len = (compat_size_t) len; - uvector++; - iov++; - } - ret = tot_len; - -out: - return ret; -} -#endif - static ssize_t do_iter_read(struct file *file, struct iov_iter *iter, loff_t *pos, rwf_t flags) { @@ -1247,224 +1076,93 @@ SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec, return do_pwritev(fd, vec, vlen, pos, flags); } +/* + * Various compat syscalls. Note that they all pretend to take a native + * iovec - import_iovec will properly treat those as compat_iovecs based on + * in_compat_syscall(). + */ #ifdef CONFIG_COMPAT -static size_t compat_readv(struct file *file, - const struct compat_iovec __user *vec, - unsigned long vlen, loff_t *pos, rwf_t flags) -{ - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov = iovstack; - struct iov_iter iter; - ssize_t ret; - - ret = compat_import_iovec(READ, vec, vlen, UIO_FASTIOV, &iov, &iter); - if (ret >= 0) { - ret = do_iter_read(file, &iter, pos, flags); - kfree(iov); - } - if (ret > 0) - add_rchar(current, ret); - inc_syscr(current); - return ret; -} - -static size_t do_compat_readv(compat_ulong_t fd, - const struct compat_iovec __user *vec, - compat_ulong_t vlen, rwf_t flags) -{ - struct fd f = fdget_pos(fd); - ssize_t ret; - loff_t pos; - - if (!f.file) - return -EBADF; - pos = f.file->f_pos; - ret = compat_readv(f.file, vec, vlen, &pos, flags); - if (ret >= 0) - f.file->f_pos = pos; - fdput_pos(f); - return ret; - -} - -COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd, - const struct compat_iovec __user *,vec, - compat_ulong_t, vlen) -{ - return do_compat_readv(fd, vec, vlen, 0); -} - -static long do_compat_preadv64(unsigned long fd, - const struct compat_iovec __user *vec, - unsigned long vlen, loff_t pos, rwf_t flags) -{ - struct fd f; - ssize_t ret; - - if (pos < 0) - return -EINVAL; - f = fdget(fd); - if (!f.file) - return -EBADF; - ret = -ESPIPE; - if (f.file->f_mode & FMODE_PREAD) - ret = compat_readv(f.file, vec, vlen, &pos, flags); - fdput(f); - return ret; -} - #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64 COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, - const struct compat_iovec __user *,vec, + const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos) { - return do_compat_preadv64(fd, vec, vlen, pos, 0); + return do_preadv(fd, vec, vlen, pos, 0); } #endif COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, - const struct compat_iovec __user *,vec, + const struct iovec __user *, vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; - return do_compat_preadv64(fd, vec, vlen, pos, 0); + return do_preadv(fd, vec, vlen, pos, 0); } #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2 COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd, - const struct compat_iovec __user *,vec, + const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { if (pos == -1) - return do_compat_readv(fd, vec, vlen, flags); - - return do_compat_preadv64(fd, vec, vlen, pos, flags); + return do_readv(fd, vec, vlen, flags); + return do_preadv(fd, vec, vlen, pos, flags); } #endif COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd, - const struct compat_iovec __user *,vec, + const struct iovec __user *, vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; if 
(pos == -1) - return do_compat_readv(fd, vec, vlen, flags); - - return do_compat_preadv64(fd, vec, vlen, pos, flags); -} - -static size_t compat_writev(struct file *file, - const struct compat_iovec __user *vec, - unsigned long vlen, loff_t *pos, rwf_t flags) -{ - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov = iovstack; - struct iov_iter iter; - ssize_t ret; - - ret = compat_import_iovec(WRITE, vec, vlen, UIO_FASTIOV, &iov, &iter); - if (ret >= 0) { - file_start_write(file); - ret = do_iter_write(file, &iter, pos, flags); - file_end_write(file); - kfree(iov); - } - if (ret > 0) - add_wchar(current, ret); - inc_syscw(current); - return ret; -} - -static size_t do_compat_writev(compat_ulong_t fd, - const struct compat_iovec __user* vec, - compat_ulong_t vlen, rwf_t flags) -{ - struct fd f = fdget_pos(fd); - ssize_t ret; - loff_t pos; - - if (!f.file) - return -EBADF; - pos = f.file->f_pos; - ret = compat_writev(f.file, vec, vlen, &pos, flags); - if (ret >= 0) - f.file->f_pos = pos; - fdput_pos(f); - return ret; -} - -COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd, - const struct compat_iovec __user *, vec, - compat_ulong_t, vlen) -{ - return do_compat_writev(fd, vec, vlen, 0); -} - -static long do_compat_pwritev64(unsigned long fd, - const struct compat_iovec __user *vec, - unsigned long vlen, loff_t pos, rwf_t flags) -{ - struct fd f; - ssize_t ret; - - if (pos < 0) - return -EINVAL; - f = fdget(fd); - if (!f.file) - return -EBADF; - ret = -ESPIPE; - if (f.file->f_mode & FMODE_PWRITE) - ret = compat_writev(f.file, vec, vlen, &pos, flags); - fdput(f); - return ret; + return do_readv(fd, vec, vlen, flags); + return do_preadv(fd, vec, vlen, pos, flags); } #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64 COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, - const struct compat_iovec __user *,vec, + const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos) { - return do_compat_pwritev64(fd, vec, vlen, pos, 0); + return do_pwritev(fd, vec, vlen, pos, 0); } #endif COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, - const struct compat_iovec __user *,vec, + const struct iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; - return do_compat_pwritev64(fd, vec, vlen, pos, 0); + return do_pwritev(fd, vec, vlen, pos, 0); } #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2 COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd, - const struct compat_iovec __user *,vec, + const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { if (pos == -1) - return do_compat_writev(fd, vec, vlen, flags); - - return do_compat_pwritev64(fd, vec, vlen, pos, flags); + return do_writev(fd, vec, vlen, flags); + return do_pwritev(fd, vec, vlen, pos, flags); } #endif COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd, - const struct compat_iovec __user *,vec, + const struct iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; if (pos == -1) - return do_compat_writev(fd, vec, vlen, flags); - - return do_compat_pwritev64(fd, vec, vlen, pos, flags); + return do_writev(fd, vec, vlen, flags); + return do_pwritev(fd, vec, vlen, pos, flags); } - -#endif +#endif /* CONFIG_COMPAT */ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, loff_t max) diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 1509775da040..c76d563dec0e 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1551,11 +1551,7 @@ void 
reiserfs_read_locked_inode(struct inode *inode, * set version 1, version 2 could be used too, because stat data * key is the same in both versions */ - key.version = KEY_FORMAT_3_5; - key.on_disk_key.k_dir_id = dirino; - key.on_disk_key.k_objectid = inode->i_ino; - key.on_disk_key.k_offset = 0; - key.on_disk_key.k_type = 0; + _make_cpu_key(&key, KEY_FORMAT_3_5, dirino, inode->i_ino, 0, 0, 3); /* look for the object's stat data */ retval = search_item(inode->i_sb, &key, &path_to_sd); @@ -2163,7 +2159,8 @@ out_end_trans: out_inserted_sd: clear_nlink(inode); th->t_trans_id = 0; /* so the caller can't use this handle later */ - unlock_new_inode(inode); /* OK to do even if we hadn't locked it */ + if (inode->i_state & I_NEW) + unlock_new_inode(inode); iput(inode); return err; } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index a6bce5b1fb1d..1b9c7a387dc7 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1258,6 +1258,10 @@ static int reiserfs_parse_options(struct super_block *s, "turned on."); return 0; } + if (qf_names[qtype] != + REISERFS_SB(s)->s_qf_names[qtype]) + kfree(qf_names[qtype]); + qf_names[qtype] = NULL; if (*arg) { /* Some filename specified? */ if (REISERFS_SB(s)->s_qf_names[qtype] && strcmp(REISERFS_SB(s)->s_qf_names[qtype], @@ -1287,10 +1291,6 @@ static int reiserfs_parse_options(struct super_block *s, else *mount_options |= 1 << REISERFS_GRPQUOTA; } else { - if (qf_names[qtype] != - REISERFS_SB(s)->s_qf_names[qtype]) - kfree(qf_names[qtype]); - qf_names[qtype] = NULL; if (qtype == USRQUOTA) *mount_options &= ~(1 << REISERFS_USRQUOTA); else diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 28b241cd6987..fe63a7c3e0da 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -674,6 +674,13 @@ reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer, if (get_inode_sd_version(inode) == STAT_DATA_V1) return -EOPNOTSUPP; + /* + * priv_root needn't be initialized during mount so allow initial + * lookups to succeed. + */ + if (!REISERFS_SB(inode->i_sb)->priv_root) + return 0; + dentry = xattr_lookup(inode, name, XATTR_REPLACE); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); diff --git a/fs/romfs/super.c b/fs/romfs/super.c index e582d001f792..b1b7d3f5752f 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -356,6 +356,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) } i->i_mode = mode; + i->i_blocks = (i->i_size + 511) >> 9; unlock_new_inode(i); return i; diff --git a/fs/splice.c b/fs/splice.c index d7c8a7c4db07..70cc52af780b 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -33,7 +33,6 @@ #include <linux/security.h> #include <linux/gfp.h> #include <linux/socket.h> -#include <linux/compat.h> #include <linux/sched/signal.h> #include "internal.h" @@ -526,6 +525,22 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des return 1; } +/* We know we have a pipe buffer, but maybe it's empty? 
*/ +static inline bool eat_empty_buffer(struct pipe_inode_info *pipe) +{ + unsigned int tail = pipe->tail; + unsigned int mask = pipe->ring_size - 1; + struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + + if (unlikely(!buf->len)) { + pipe_buf_release(pipe, buf); + pipe->tail = tail+1; + return true; + } + + return false; +} + /** * splice_from_pipe_next - wait for some data to splice from * @pipe: pipe to splice from @@ -545,6 +560,7 @@ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_des if (signal_pending(current)) return -ERESTARTSYS; +repeat: while (pipe_empty(pipe->head, pipe->tail)) { if (!pipe->writers) return 0; @@ -563,9 +579,12 @@ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_des sd->need_wakeup = false; } - pipe_wait(pipe); + pipe_wait_readable(pipe); } + if (eat_empty_buffer(pipe)) + goto repeat; + return 1; } @@ -1077,7 +1096,7 @@ static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags) return -EAGAIN; if (signal_pending(current)) return -ERESTARTSYS; - pipe_wait(pipe); + pipe_wait_writable(pipe); } } @@ -1332,20 +1351,6 @@ static int vmsplice_type(struct fd f, int *type) * Currently we punt and implement it as a normal copy, see pipe_to_user(). * */ -static long do_vmsplice(struct file *f, struct iov_iter *iter, unsigned int flags) -{ - if (unlikely(flags & ~SPLICE_F_ALL)) - return -EINVAL; - - if (!iov_iter_count(iter)) - return 0; - - if (iov_iter_rw(iter) == WRITE) - return vmsplice_to_pipe(f, iter, flags); - else - return vmsplice_to_user(f, iter, flags); -} - SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov, unsigned long, nr_segs, unsigned int, flags) { @@ -1356,6 +1361,9 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov, struct fd f; int type; + if (unlikely(flags & ~SPLICE_F_ALL)) + return -EINVAL; + f = fdget(fd); error = vmsplice_type(f, &type); if (error) @@ -1363,40 +1371,21 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov, error = import_iovec(type, uiov, nr_segs, ARRAY_SIZE(iovstack), &iov, &iter); - if (error >= 0) { - error = do_vmsplice(f.file, &iter, flags); - kfree(iov); - } - fdput(f); - return error; -} - -#ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32, - unsigned int, nr_segs, unsigned int, flags) -{ - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov = iovstack; - struct iov_iter iter; - ssize_t error; - struct fd f; - int type; + if (error < 0) + goto out_fdput; - f = fdget(fd); - error = vmsplice_type(f, &type); - if (error) - return error; + if (!iov_iter_count(&iter)) + error = 0; + else if (iov_iter_rw(&iter) == WRITE) + error = vmsplice_to_pipe(f.file, &iter, flags); + else + error = vmsplice_to_user(f.file, &iter, flags); - error = compat_import_iovec(type, iov32, nr_segs, - ARRAY_SIZE(iovstack), &iov, &iter); - if (error >= 0) { - error = do_vmsplice(f.file, &iter, flags); - kfree(iov); - } + kfree(iov); +out_fdput: fdput(f); return error; } -#endif SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, @@ -1454,7 +1443,7 @@ static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) ret = -EAGAIN; break; } - pipe_wait(pipe); + pipe_wait_readable(pipe); } pipe_unlock(pipe); @@ -1493,7 +1482,7 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) ret = -ERESTARTSYS; break; } - pipe_wait(pipe); + pipe_wait_writable(pipe); } pipe_unlock(pipe); diff --git a/fs/super.c 
b/fs/super.c index 904459b35119..a51c2083cd6b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1256,6 +1256,8 @@ static int set_bdev_super(struct super_block *s, void *data) s->s_dev = s->s_bdev->bd_dev; s->s_bdi = bdi_get(s->s_bdev->bd_bdi); + if (blk_queue_stable_writes(s->s_bdev->bd_disk->queue)) + s->s_iflags |= SB_I_STABLE_WRITES; return 0; } diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index eb6897ab78e7..96d0da65e088 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -15,6 +15,7 @@ #include <linux/list.h> #include <linux/mutex.h> #include <linux/seq_file.h> +#include <linux/mm.h> #include "sysfs.h" @@ -707,3 +708,57 @@ int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid) return 0; } EXPORT_SYMBOL_GPL(sysfs_change_owner); + +/** + * sysfs_emit - scnprintf equivalent, aware of PAGE_SIZE buffer. + * @buf: start of PAGE_SIZE buffer. + * @fmt: format + * @...: optional arguments to @format + * + * + * Returns number of characters written to @buf. + */ +int sysfs_emit(char *buf, const char *fmt, ...) +{ + va_list args; + int len; + + if (WARN(!buf || offset_in_page(buf), + "invalid sysfs_emit: buf:%p\n", buf)) + return 0; + + va_start(args, fmt); + len = vscnprintf(buf, PAGE_SIZE, fmt, args); + va_end(args); + + return len; +} +EXPORT_SYMBOL_GPL(sysfs_emit); + +/** + * sysfs_emit_at - scnprintf equivalent, aware of PAGE_SIZE buffer. + * @buf: start of PAGE_SIZE buffer. + * @at: offset in @buf to start write in bytes + * @at must be >= 0 && < PAGE_SIZE + * @fmt: format + * @...: optional arguments to @fmt + * + * + * Returns number of characters written starting at &@buf[@at]. + */ +int sysfs_emit_at(char *buf, int at, const char *fmt, ...) +{ + va_list args; + int len; + + if (WARN(!buf || offset_in_page(buf) || at < 0 || at >= PAGE_SIZE, + "invalid sysfs_emit_at: buf:%p at:%d\n", buf, at)) + return 0; + + va_start(args, fmt); + len = vscnprintf(buf + at, PAGE_SIZE - at, fmt, args); + va_end(args); + + return len; +} +EXPORT_SYMBOL_GPL(sysfs_emit_at); diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c index cc5c0abfd536..b93b3cd10bfd 100644 --- a/fs/ubifs/auth.c +++ b/fs/ubifs/auth.c @@ -54,7 +54,7 @@ static int ubifs_hash_calc_hmac(const struct ubifs_info *c, const u8 *hash, * ubifs_prepare_auth_node - Prepare an authentication node * @c: UBIFS file-system description object * @node: the node to calculate a hash for - * @hash: input hash of previous nodes + * @inhash: input hash of previous nodes * * This function prepares an authentication node for writing onto flash. * It creates a HMAC from the given input hash and writes it to the node. 
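[Editor's note] The sysfs_emit()/sysfs_emit_at() helpers added to fs/sysfs/file.c above wrap vscnprintf() around the assumption that sysfs show() callbacks always receive a page-aligned, PAGE_SIZE buffer. A minimal sketch of a caller, not taken from this patch (the device attribute and its value are hypothetical):

/* Hypothetical sysfs attribute: sysfs hands show() a PAGE_SIZE buffer. */
static ssize_t example_show(struct device *dev, struct device_attribute *attr,
			    char *buf)
{
	int example_value = 42;	/* stand-in for real driver state */

	/* Bounded, PAGE_SIZE-aware formatting; returns bytes written. */
	return sysfs_emit(buf, "%d\n", example_value);
}
static DEVICE_ATTR_RO(example);

Compared with open-coded sprintf(), the helper also warns if the buffer pointer is not the start of a page, which catches callers that pass an offset buffer where sysfs_emit_at() should be used instead.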
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 31288d8fa2ce..ebff43f8009c 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -1123,6 +1123,7 @@ int dbg_check_dir(struct ubifs_info *c, const struct inode *dir) err = PTR_ERR(dent); if (err == -ENOENT) break; + kfree(pdent); return err; } diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 9d042942d8b2..155521e51ac5 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -81,19 +81,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, struct ubifs_inode *ui; bool encrypted = false; - if (IS_ENCRYPTED(dir)) { - err = fscrypt_get_encryption_info(dir); - if (err) { - ubifs_err(c, "fscrypt_get_encryption_info failed: %i", err); - return ERR_PTR(err); - } - - if (!fscrypt_has_encryption_key(dir)) - return ERR_PTR(-EPERM); - - encrypted = true; - } - inode = new_inode(c->vfs_sb); ui = ubifs_inode(inode); if (!inode) @@ -112,6 +99,12 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, current_time(inode); inode->i_mapping->nrpages = 0; + err = fscrypt_prepare_new_inode(dir, inode, &encrypted); + if (err) { + ubifs_err(c, "fscrypt_prepare_new_inode failed: %i", err); + goto out_iput; + } + switch (mode & S_IFMT) { case S_IFREG: inode->i_mapping->a_ops = &ubifs_file_address_operations; @@ -131,7 +124,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, case S_IFBLK: case S_IFCHR: inode->i_op = &ubifs_file_inode_operations; - encrypted = false; break; default: BUG(); @@ -151,9 +143,8 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, if (c->highest_inum >= INUM_WATERMARK) { spin_unlock(&c->cnt_lock); ubifs_err(c, "out of inode numbers"); - make_bad_inode(inode); - iput(inode); - return ERR_PTR(-EINVAL); + err = -EINVAL; + goto out_iput; } ubifs_warn(c, "running out of inode numbers (current %lu, max %u)", (unsigned long)c->highest_inum, INUM_WATERMARK); @@ -171,16 +162,19 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, spin_unlock(&c->cnt_lock); if (encrypted) { - err = fscrypt_inherit_context(dir, inode, &encrypted, true); + err = fscrypt_set_context(inode, NULL); if (err) { - ubifs_err(c, "fscrypt_inherit_context failed: %i", err); - make_bad_inode(inode); - iput(inode); - return ERR_PTR(err); + ubifs_err(c, "fscrypt_set_context failed: %i", err); + goto out_iput; } } return inode; + +out_iput: + make_bad_inode(inode); + iput(inode); + return ERR_PTR(err); } static int dbg_check_name(const struct ubifs_info *c, @@ -515,7 +509,7 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) if (err) return err; - err = fscrypt_fname_alloc_buffer(dir, UBIFS_MAX_NLEN, &fstr); + err = fscrypt_fname_alloc_buffer(UBIFS_MAX_NLEN, &fstr); if (err) return err; diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index 62cb3db44e6e..a4aaeea63893 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -57,10 +57,6 @@ /** * switch_gc_head - switch the garbage collection journal head. * @c: UBIFS file-system description object - * @buf: buffer to write - * @len: length of the buffer to write - * @lnum: LEB number written is returned here - * @offs: offset written is returned here * * This function switch the GC head to the next LEB which is reserved in * @c->gc_lnum. 
Returns %0 in case of success, %-EAGAIN if commit is required, diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 3df9be2c684c..4363d85a3fd4 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -134,7 +134,6 @@ static int setflags(struct inode *inode, int flags) return err; out_unlock: - ubifs_err(c, "can't modify inode %lu attributes", inode->i_ino); mutex_unlock(&ui->ui_mutex); ubifs_release_budget(c, &req); return err; diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 4a5b06f8d812..091c2ad8f211 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -894,6 +894,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode) if (err == -ENOENT) break; + kfree(pxent); goto out_release; } @@ -906,6 +907,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode) ubifs_err(c, "dead directory entry '%s', error %d", xent->name, err); ubifs_ro_mode(c, err); + kfree(pxent); kfree(xent); goto out_release; } @@ -936,8 +938,6 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode) inode->i_ino); release_head(c, BASEHD); - ubifs_add_auth_dirt(c, lnum); - if (last_reference) { err = ubifs_tnc_remove_ino(c, inode->i_ino); if (err) @@ -947,6 +947,8 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode) } else { union ubifs_key key; + ubifs_add_auth_dirt(c, lnum); + ino_key_init(c, &key, inode->i_ino); err = ubifs_tnc_add(c, &key, lnum, offs, ilen, hash); } @@ -1798,7 +1800,6 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode, u8 hash[UBIFS_HASH_ARR_SZ]; dbg_jnl("ino %lu, ino %lu", host->i_ino, inode->i_ino); - ubifs_assert(c, host->i_nlink > 0); ubifs_assert(c, inode->i_nlink > 0); ubifs_assert(c, mutex_is_locked(&host_ui->ui_mutex)); diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 2c294085ffed..0fb61956146d 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -173,6 +173,7 @@ int ubifs_add_orphan(struct ubifs_info *c, ino_t inum) err = PTR_ERR(xent); if (err == -ENOENT) break; + kfree(pxent); return err; } @@ -182,6 +183,7 @@ int ubifs_add_orphan(struct ubifs_info *c, ino_t inum) xattr_orphan = orphan_add(c, xattr_inum, orphan); if (IS_ERR(xattr_orphan)) { + kfree(pxent); kfree(xent); return PTR_ERR(xattr_orphan); } diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index b69ffac7e415..2f8d8f4f411a 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -931,8 +931,6 @@ out: * validate_ref - validate a reference node. * @c: UBIFS file-system description object * @ref: the reference node to validate - * @ref_lnum: LEB number of the reference node - * @ref_offs: reference node offset * * This function returns %1 if a bud reference already exists for the LEB. 
%0 is * returned if the reference node is new, otherwise %-EINVAL is returned if diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index a2420c900275..cb3acfb7dd1f 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1110,14 +1110,20 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options, break; } case Opt_auth_key: - c->auth_key_name = kstrdup(args[0].from, GFP_KERNEL); - if (!c->auth_key_name) - return -ENOMEM; + if (!is_remount) { + c->auth_key_name = kstrdup(args[0].from, + GFP_KERNEL); + if (!c->auth_key_name) + return -ENOMEM; + } break; case Opt_auth_hash_name: - c->auth_hash_name = kstrdup(args[0].from, GFP_KERNEL); - if (!c->auth_hash_name) - return -ENOMEM; + if (!is_remount) { + c->auth_hash_name = kstrdup(args[0].from, + GFP_KERNEL); + if (!c->auth_hash_name) + return -ENOMEM; + } break; case Opt_ignore: break; @@ -1141,6 +1147,18 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options, return 0; } +/* + * ubifs_release_options - release mount parameters which have been dumped. + * @c: UBIFS file-system description object + */ +static void ubifs_release_options(struct ubifs_info *c) +{ + kfree(c->auth_key_name); + c->auth_key_name = NULL; + kfree(c->auth_hash_name); + c->auth_hash_name = NULL; +} + /** * destroy_journal - destroy journal data structures. * @c: UBIFS file-system description object @@ -1313,7 +1331,7 @@ static int mount_ubifs(struct ubifs_info *c) err = ubifs_read_superblock(c); if (err) - goto out_free; + goto out_auth; c->probing = 0; @@ -1325,18 +1343,18 @@ static int mount_ubifs(struct ubifs_info *c) ubifs_err(c, "'compressor \"%s\" is not compiled in", ubifs_compr_name(c, c->default_compr)); err = -ENOTSUPP; - goto out_free; + goto out_auth; } err = init_constants_sb(c); if (err) - goto out_free; + goto out_auth; sz = ALIGN(c->max_idx_node_sz, c->min_io_size) * 2; c->cbuf = kmalloc(sz, GFP_NOFS); if (!c->cbuf) { err = -ENOMEM; - goto out_free; + goto out_auth; } err = alloc_wbufs(c); @@ -1611,6 +1629,8 @@ out_wbufs: free_wbufs(c); out_cbuf: kfree(c->cbuf); +out_auth: + ubifs_exit_authentication(c); out_free: kfree(c->write_reserve_buf); kfree(c->bu.buf); @@ -1650,8 +1670,7 @@ static void ubifs_umount(struct ubifs_info *c) ubifs_lpt_free(c, 0); ubifs_exit_authentication(c); - kfree(c->auth_key_name); - kfree(c->auth_hash_name); + ubifs_release_options(c); kfree(c->cbuf); kfree(c->rcvrd_mst_node); kfree(c->mst_node); @@ -2177,6 +2196,8 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) c->vi.vol_id); if (err) goto out_close; + sb->s_bdi->ra_pages = 0; + sb->s_bdi->io_pages = 0; sb->s_fs_info = c; sb->s_magic = UBIFS_SUPER_MAGIC; @@ -2219,6 +2240,7 @@ out_umount: out_unlock: mutex_unlock(&c->umount_mutex); out_close: + ubifs_release_options(c); ubi_close_volume(c->ubi); out: return err; diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index f609f6cdde70..894f1ab14616 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -360,7 +360,6 @@ static int lnc_add_directly(struct ubifs_info *c, struct ubifs_zbranch *zbr, /** * lnc_free - remove a leaf node from the leaf node cache. 
* @zbr: zbranch of leaf node - * @node: leaf node */ static void lnc_free(struct ubifs_zbranch *zbr) { @@ -2885,6 +2884,7 @@ int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum) err = PTR_ERR(xent); if (err == -ENOENT) break; + kfree(pxent); return err; } @@ -2898,6 +2898,7 @@ int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum) fname_len(&nm) = le16_to_cpu(xent->nlen); err = ubifs_tnc_remove_nm(c, &key1, &nm); if (err) { + kfree(pxent); kfree(xent); return err; } @@ -2906,6 +2907,7 @@ int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum) highest_ino_key(c, &key2, xattr_inum); err = ubifs_tnc_remove_range(c, &key1, &key2); if (err) { + kfree(pxent); kfree(xent); return err; } @@ -3466,7 +3468,7 @@ out_unlock: /** * dbg_check_inode_size - check if inode size is correct. * @c: UBIFS file-system description object - * @inum: inode number + * @inode: inode to check * @size: inode size * * This function makes sure that the inode size (@size) is correct and it does diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 9aefbb60074f..a0b9b349efe6 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -522,6 +522,7 @@ int ubifs_purge_xattrs(struct inode *host) xent->name, err); ubifs_ro_mode(c, err); kfree(pxent); + kfree(xent); return err; } @@ -531,6 +532,7 @@ int ubifs_purge_xattrs(struct inode *host) err = remove_xattr(c, host, xino, &nm); if (err) { kfree(pxent); + kfree(xent); iput(xino); ubifs_err(c, "cannot remove xattr, error %d", err); return err; diff --git a/fs/udf/directory.c b/fs/udf/directory.c index d9523013096f..73720320f0ab 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -34,7 +34,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, fibh->soffset = fibh->eoffset; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { - fi = udf_get_fileident(iinfo->i_ext.i_data - + fi = udf_get_fileident(iinfo->i_data - (iinfo->i_efe ? sizeof(struct extendedFileEntry) : sizeof(struct fileEntry)), diff --git a/fs/udf/file.c b/fs/udf/file.c index 628941a6b79a..ad8eefad27d7 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -50,7 +50,7 @@ static void __udf_adinicb_readpage(struct page *page) * So just sample it once and use the same value everywhere. */ kaddr = kmap_atomic(page); - memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, isize); + memcpy(kaddr, iinfo->i_data + iinfo->i_lenEAttr, isize); memset(kaddr + isize, 0, PAGE_SIZE - isize); flush_dcache_page(page); SetPageUptodate(page); @@ -76,8 +76,7 @@ static int udf_adinicb_writepage(struct page *page, BUG_ON(!PageLocked(page)); kaddr = kmap_atomic(page); - memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr, - i_size_read(inode)); + memcpy(iinfo->i_data + iinfo->i_lenEAttr, kaddr, i_size_read(inode)); SetPageUptodate(page); kunmap_atomic(kaddr); mark_inode_dirty(inode); @@ -215,7 +214,7 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return put_user(UDF_I(inode)->i_lenEAttr, (int __user *)arg); case UDF_GETEABLOCK: return copy_to_user((char __user *)arg, - UDF_I(inode)->i_ext.i_data, + UDF_I(inode)->i_data, UDF_I(inode)->i_lenEAttr) ? 
-EFAULT : 0; default: return -ENOIOCTLCMD; diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 0adb40718a5d..84ed23edebfd 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -67,16 +67,16 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode) iinfo->i_efe = 1; if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev) sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE; - iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize - - sizeof(struct extendedFileEntry), - GFP_KERNEL); + iinfo->i_data = kzalloc(inode->i_sb->s_blocksize - + sizeof(struct extendedFileEntry), + GFP_KERNEL); } else { iinfo->i_efe = 0; - iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize - - sizeof(struct fileEntry), - GFP_KERNEL); + iinfo->i_data = kzalloc(inode->i_sb->s_blocksize - + sizeof(struct fileEntry), + GFP_KERNEL); } - if (!iinfo->i_ext.i_data) { + if (!iinfo->i_data) { iput(inode); return ERR_PTR(-ENOMEM); } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index adaba8e8b326..bb89c3e43212 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -139,23 +139,26 @@ void udf_evict_inode(struct inode *inode) struct udf_inode_info *iinfo = UDF_I(inode); int want_delete = 0; - if (!inode->i_nlink && !is_bad_inode(inode)) { - want_delete = 1; - udf_setsize(inode, 0); - udf_update_inode(inode, IS_SYNC(inode)); + if (!is_bad_inode(inode)) { + if (!inode->i_nlink) { + want_delete = 1; + udf_setsize(inode, 0); + udf_update_inode(inode, IS_SYNC(inode)); + } + if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && + inode->i_size != iinfo->i_lenExtents) { + udf_warn(inode->i_sb, + "Inode %lu (mode %o) has inode size %llu different from extent length %llu. Filesystem need not be standards compliant.\n", + inode->i_ino, inode->i_mode, + (unsigned long long)inode->i_size, + (unsigned long long)iinfo->i_lenExtents); + } } truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); clear_inode(inode); - if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && - inode->i_size != iinfo->i_lenExtents) { - udf_warn(inode->i_sb, "Inode %lu (mode %o) has inode size %llu different from extent length %llu. 
Filesystem need not be standards compliant.\n", - inode->i_ino, inode->i_mode, - (unsigned long long)inode->i_size, - (unsigned long long)iinfo->i_lenExtents); - } - kfree(iinfo->i_ext.i_data); - iinfo->i_ext.i_data = NULL; + kfree(iinfo->i_data); + iinfo->i_data = NULL; udf_clear_extent_cache(inode); if (want_delete) { udf_free_inode(inode); @@ -285,14 +288,14 @@ int udf_expand_file_adinicb(struct inode *inode) kaddr = kmap_atomic(page); memset(kaddr + iinfo->i_lenAlloc, 0x00, PAGE_SIZE - iinfo->i_lenAlloc); - memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, + memcpy(kaddr, iinfo->i_data + iinfo->i_lenEAttr, iinfo->i_lenAlloc); flush_dcache_page(page); SetPageUptodate(page); kunmap_atomic(kaddr); } down_write(&iinfo->i_data_sem); - memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr, 0x00, + memset(iinfo->i_data + iinfo->i_lenEAttr, 0x00, iinfo->i_lenAlloc); iinfo->i_lenAlloc = 0; if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) @@ -308,8 +311,7 @@ int udf_expand_file_adinicb(struct inode *inode) lock_page(page); down_write(&iinfo->i_data_sem); kaddr = kmap_atomic(page); - memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr, - inode->i_size); + memcpy(iinfo->i_data + iinfo->i_lenEAttr, kaddr, inode->i_size); kunmap_atomic(kaddr); unlock_page(page); iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; @@ -396,8 +398,7 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, } mark_buffer_dirty_inode(dbh, inode); - memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr, 0, - iinfo->i_lenAlloc); + memset(iinfo->i_data + iinfo->i_lenEAttr, 0, iinfo->i_lenAlloc); iinfo->i_lenAlloc = 0; eloc.logicalBlockNum = *block; eloc.partitionReferenceNum = @@ -1260,7 +1261,7 @@ set_size: if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { down_write(&iinfo->i_data_sem); udf_clear_extent_cache(inode); - memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr + newsize, + memset(iinfo->i_data + iinfo->i_lenEAttr + newsize, 0x00, bsize - newsize - udf_file_entry_alloc_offset(inode)); iinfo->i_lenAlloc = newsize; @@ -1411,7 +1412,7 @@ reread: sizeof(struct extendedFileEntry)); if (ret) goto out; - memcpy(iinfo->i_ext.i_data, + memcpy(iinfo->i_data, bh->b_data + sizeof(struct extendedFileEntry), bs - sizeof(struct extendedFileEntry)); } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_FE)) { @@ -1420,7 +1421,7 @@ reread: ret = udf_alloc_i_data(inode, bs - sizeof(struct fileEntry)); if (ret) goto out; - memcpy(iinfo->i_ext.i_data, + memcpy(iinfo->i_data, bh->b_data + sizeof(struct fileEntry), bs - sizeof(struct fileEntry)); } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_USE)) { @@ -1433,7 +1434,7 @@ reread: sizeof(struct unallocSpaceEntry)); if (ret) goto out; - memcpy(iinfo->i_ext.i_data, + memcpy(iinfo->i_data, bh->b_data + sizeof(struct unallocSpaceEntry), bs - sizeof(struct unallocSpaceEntry)); return 0; @@ -1614,8 +1615,8 @@ out: static int udf_alloc_i_data(struct inode *inode, size_t size) { struct udf_inode_info *iinfo = UDF_I(inode); - iinfo->i_ext.i_data = kmalloc(size, GFP_KERNEL); - if (!iinfo->i_ext.i_data) + iinfo->i_data = kmalloc(size, GFP_KERNEL); + if (!iinfo->i_data) return -ENOMEM; return 0; } @@ -1706,7 +1707,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) use->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc); memcpy(bh->b_data + sizeof(struct unallocSpaceEntry), - iinfo->i_ext.i_data, inode->i_sb->s_blocksize - + iinfo->i_data, inode->i_sb->s_blocksize - sizeof(struct unallocSpaceEntry)); use->descTag.tagIdent = cpu_to_le16(TAG_IDENT_USE); crclen = 
sizeof(struct unallocSpaceEntry); @@ -1772,7 +1773,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) if (iinfo->i_efe == 0) { memcpy(bh->b_data + sizeof(struct fileEntry), - iinfo->i_ext.i_data, + iinfo->i_data, inode->i_sb->s_blocksize - sizeof(struct fileEntry)); fe->logicalBlocksRecorded = cpu_to_le64(lb_recorded); @@ -1791,7 +1792,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) crclen = sizeof(struct fileEntry); } else { memcpy(bh->b_data + sizeof(struct extendedFileEntry), - iinfo->i_ext.i_data, + iinfo->i_data, inode->i_sb->s_blocksize - sizeof(struct extendedFileEntry)); efe->objectSize = @@ -2087,7 +2088,7 @@ void udf_write_aext(struct inode *inode, struct extent_position *epos, struct udf_inode_info *iinfo = UDF_I(inode); if (!epos->bh) - ptr = iinfo->i_ext.i_data + epos->offset - + ptr = iinfo->i_data + epos->offset - udf_file_entry_alloc_offset(inode) + iinfo->i_lenEAttr; else @@ -2179,7 +2180,7 @@ int8_t udf_current_aext(struct inode *inode, struct extent_position *epos, if (!epos->bh) { if (!epos->offset) epos->offset = udf_file_entry_alloc_offset(inode); - ptr = iinfo->i_ext.i_data + epos->offset - + ptr = iinfo->i_data + epos->offset - udf_file_entry_alloc_offset(inode) + iinfo->i_lenEAttr; alen = udf_file_entry_alloc_offset(inode) + diff --git a/fs/udf/misc.c b/fs/udf/misc.c index 401e64cde1be..eab94527340d 100644 --- a/fs/udf/misc.c +++ b/fs/udf/misc.c @@ -52,9 +52,9 @@ struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size, uint16_t crclen; struct udf_inode_info *iinfo = UDF_I(inode); - ea = iinfo->i_ext.i_data; + ea = iinfo->i_data; if (iinfo->i_lenEAttr) { - ad = iinfo->i_ext.i_data + iinfo->i_lenEAttr; + ad = iinfo->i_data + iinfo->i_lenEAttr; } else { ad = ea; size += sizeof(struct extendedAttrHeaderDesc); @@ -153,7 +153,7 @@ struct genericFormat *udf_get_extendedattr(struct inode *inode, uint32_t type, uint32_t offset; struct udf_inode_info *iinfo = UDF_I(inode); - ea = iinfo->i_ext.i_data; + ea = iinfo->i_data; if (iinfo->i_lenEAttr) { struct extendedAttrHeaderDesc *eahd; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 77b6d89b9bcd..e169d8fe35b5 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -460,8 +460,7 @@ add: if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { block = dinfo->i_location.logicalBlockNum; fi = (struct fileIdentDesc *) - (dinfo->i_ext.i_data + - fibh->soffset - + (dinfo->i_data + fibh->soffset - udf_ext0_offset(dir) + dinfo->i_lenEAttr); } else { @@ -940,7 +939,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, mark_buffer_dirty_inode(epos.bh, inode); ea = epos.bh->b_data + udf_ext0_offset(inode); } else - ea = iinfo->i_ext.i_data + iinfo->i_lenEAttr; + ea = iinfo->i_data + iinfo->i_lenEAttr; eoffset = sb->s_blocksize - udf_ext0_offset(inode); pc = (struct pathComponent *)ea; @@ -1120,7 +1119,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, retval = -EIO; if (old_iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { dir_fi = udf_get_fileident( - old_iinfo->i_ext.i_data - + old_iinfo->i_data - (old_iinfo->i_efe ? 
sizeof(struct extendedFileEntry) : sizeof(struct fileEntry)), diff --git a/fs/udf/partition.c b/fs/udf/partition.c index 090baff83990..4cbf40575965 100644 --- a/fs/udf/partition.c +++ b/fs/udf/partition.c @@ -65,7 +65,7 @@ uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block, } if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { - loc = le32_to_cpu(((__le32 *)(iinfo->i_ext.i_data + + loc = le32_to_cpu(((__le32 *)(iinfo->i_data + vdata->s_start_offset))[block]); goto translate; } diff --git a/fs/udf/super.c b/fs/udf/super.c index 1c42f544096d..faf2017ada11 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -168,7 +168,7 @@ static void init_once(void *foo) { struct udf_inode_info *ei = (struct udf_inode_info *)foo; - ei->i_ext.i_data = NULL; + ei->i_data = NULL; inode_init_once(&ei->vfs_inode); } @@ -854,7 +854,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block) uint8_t *outstr; struct buffer_head *bh; uint16_t ident; - int ret = -ENOMEM; + int ret; struct timestamp *ts; outstr = kmalloc(128, GFP_NOFS); @@ -1006,18 +1006,10 @@ int udf_compute_nr_groups(struct super_block *sb, u32 partition) static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index) { struct udf_bitmap *bitmap; - int nr_groups; - int size; - - nr_groups = udf_compute_nr_groups(sb, index); - size = sizeof(struct udf_bitmap) + - (sizeof(struct buffer_head *) * nr_groups); - - if (size <= PAGE_SIZE) - bitmap = kzalloc(size, GFP_KERNEL); - else - bitmap = vzalloc(size); /* TODO: get rid of vzalloc */ + int nr_groups = udf_compute_nr_groups(sb, index); + bitmap = kvzalloc(struct_size(bitmap, s_block_bitmap, nr_groups), + GFP_KERNEL); if (!bitmap) return NULL; @@ -1210,7 +1202,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) vat20 = (struct virtualAllocationTable20 *)bh->b_data; } else { vat20 = (struct virtualAllocationTable20 *) - vati->i_ext.i_data; + vati->i_data; } map->s_type_specific.s_virtual.s_start_offset = @@ -1353,6 +1345,12 @@ static int udf_load_sparable_map(struct super_block *sb, (int)spm->numSparingTables); return -EIO; } + if (le32_to_cpu(spm->sizeSparingTable) > sb->s_blocksize) { + udf_err(sb, "error loading logical volume descriptor: " + "Too big sparing table size (%u)\n", + le32_to_cpu(spm->sizeSparingTable)); + return -EIO; + } for (i = 0; i < spm->numSparingTables; i++) { loc = le32_to_cpu(spm->locSparingTable[i]); @@ -1698,7 +1696,8 @@ static noinline int udf_process_sequence( "Pointers (max %u supported)\n", UDF_MAX_TD_NESTING); brelse(bh); - return -EIO; + ret = -EIO; + goto out; } vdp = (struct volDescPtr *)bh->b_data; @@ -1718,7 +1717,8 @@ static noinline int udf_process_sequence( curr = get_volume_descriptor_record(ident, bh, &data); if (IS_ERR(curr)) { brelse(bh); - return PTR_ERR(curr); + ret = PTR_ERR(curr); + goto out; } /* Descriptor we don't care about? 
*/ if (!curr) @@ -1740,28 +1740,31 @@ static noinline int udf_process_sequence( */ if (!data.vds[VDS_POS_PRIMARY_VOL_DESC].block) { udf_err(sb, "Primary Volume Descriptor not found!\n"); - return -EAGAIN; + ret = -EAGAIN; + goto out; } ret = udf_load_pvoldesc(sb, data.vds[VDS_POS_PRIMARY_VOL_DESC].block); if (ret < 0) - return ret; + goto out; if (data.vds[VDS_POS_LOGICAL_VOL_DESC].block) { ret = udf_load_logicalvol(sb, data.vds[VDS_POS_LOGICAL_VOL_DESC].block, fileset); if (ret < 0) - return ret; + goto out; } /* Now handle prevailing Partition Descriptors */ for (i = 0; i < data.num_part_descs; i++) { ret = udf_load_partdesc(sb, data.part_descs_loc[i].rec.block); if (ret < 0) - return ret; + goto out; } - - return 0; + ret = 0; +out: + kfree(data.part_descs_loc); + return ret; } /* diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index 25ff91c7e94a..c973db239604 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -122,7 +122,7 @@ static int udf_symlink_filler(struct file *file, struct page *page) down_read(&iinfo->i_data_sem); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { - symlink = iinfo->i_ext.i_data + iinfo->i_lenEAttr; + symlink = iinfo->i_data + iinfo->i_lenEAttr; } else { bh = sb_bread(inode->i_sb, pos); diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h index 4245d1f63258..06ff7006b822 100644 --- a/fs/udf/udf_i.h +++ b/fs/udf/udf_i.h @@ -45,11 +45,7 @@ struct udf_inode_info { unsigned i_strat4096 : 1; unsigned i_streamdir : 1; unsigned reserved : 25; - union { - struct short_ad *i_sad; - struct long_ad *i_lad; - __u8 *i_data; - } i_ext; + __u8 *i_data; struct kernel_lb_addr i_locStreamdir; __u64 i_lenStreams; struct rw_semaphore i_data_sem; diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c index 2a878b739115..dc25823bfed9 100644 --- a/fs/unicode/utf8-core.c +++ b/fs/unicode/utf8-core.c @@ -6,6 +6,7 @@ #include <linux/parser.h> #include <linux/errno.h> #include <linux/unicode.h> +#include <linux/stringhash.h> #include "utf8n.h" @@ -122,9 +123,29 @@ int utf8_casefold(const struct unicode_map *um, const struct qstr *str, } return -EINVAL; } - EXPORT_SYMBOL(utf8_casefold); +int utf8_casefold_hash(const struct unicode_map *um, const void *salt, + struct qstr *str) +{ + const struct utf8data *data = utf8nfdicf(um->version); + struct utf8cursor cur; + int c; + unsigned long hash = init_name_hash(salt); + + if (utf8ncursor(&cur, data, str->name, str->len) < 0) + return -EINVAL; + + while ((c = utf8byte(&cur))) { + if (c < 0) + return -EINVAL; + hash = partial_name_hash((unsigned char)c, hash); + } + str->hash = end_name_hash(hash); + return 0; +} +EXPORT_SYMBOL(utf8_casefold_hash); + int utf8_normalize(const struct unicode_map *um, const struct qstr *str, unsigned char *dest, size_t dlen) { diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 0e4a3837da52..000b457ad087 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -601,8 +601,6 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, /* the various vma->vm_userfaultfd_ctx still points to it */ mmap_write_lock(mm); - /* no task can run (and in turn coredump) yet */ - VM_WARN_ON(!mmget_still_valid(mm)); for (vma = mm->mmap; vma; vma = vma->vm_next) if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; @@ -842,7 +840,6 @@ static int userfaultfd_release(struct inode *inode, struct file *file) /* len == 0 means wake all */ struct userfaultfd_wake_range range = { .len = 0, }; unsigned long new_flags; - bool still_valid; WRITE_ONCE(ctx->released, true); 
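[Editor's note] The utf8_casefold_hash() helper added to fs/unicode/utf8-core.c above walks the case-folded form of a name while feeding partial_name_hash(), so a case-insensitive filesystem can hash names consistently with its casefolded comparisons. A sketch of a ->d_hash() caller, assuming the superblock keeps its unicode_map in s_encoding as casefold-capable filesystems do; this is illustrative, not code from the patch:

/* Sketch of a dentry ->d_hash() op built on utf8_casefold_hash(). */
static int example_ci_d_hash(const struct dentry *dentry, struct qstr *str)
{
	const struct unicode_map *um = dentry->d_sb->s_encoding;

	if (!um)
		return 0;	/* no encoding configured: keep the default hash */

	/* The dentry doubles as the salt, matching the init_name_hash() use above. */
	return utf8_casefold_hash(um, dentry, str);
}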
@@ -858,7 +855,6 @@ static int userfaultfd_release(struct inode *inode, struct file *file) * taking the mmap_lock for writing. */ mmap_write_lock(mm); - still_valid = mmget_still_valid(mm); prev = NULL; for (vma = mm->mmap; vma; vma = vma->vm_next) { cond_resched(); @@ -869,17 +865,15 @@ static int userfaultfd_release(struct inode *inode, struct file *file) continue; } new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP); - if (still_valid) { - prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end, - new_flags, vma->anon_vma, - vma->vm_file, vma->vm_pgoff, - vma_policy(vma), - NULL_VM_UFFD_CTX); - if (prev) - vma = prev; - else - prev = vma; - } + prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end, + new_flags, vma->anon_vma, + vma->vm_file, vma->vm_pgoff, + vma_policy(vma), + NULL_VM_UFFD_CTX); + if (prev) + vma = prev; + else + prev = vma; vma->vm_flags = new_flags; vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } @@ -1309,8 +1303,6 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, goto out; mmap_write_lock(mm); - if (!mmget_still_valid(mm)) - goto out_unlock; vma = find_vma_prev(mm, start, &prev); if (!vma) goto out_unlock; @@ -1511,8 +1503,6 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto out; mmap_write_lock(mm); - if (!mmget_still_valid(mm)) - goto out_unlock; vma = find_vma_prev(mm, start, &prev); if (!vma) goto out_unlock; diff --git a/fs/vboxsf/dir.c b/fs/vboxsf/dir.c index dd147b490982..4d569f14a8d8 100644 --- a/fs/vboxsf/dir.c +++ b/fs/vboxsf/dir.c @@ -134,7 +134,7 @@ try_next_entry: d_type = vboxsf_get_d_type(info->info.attr.mode); /* - * On 32 bit systems pos is 64 signed, while ino is 32 bit + * On 32-bit systems pos is 64-bit signed, while ino is 32-bit * unsigned so fake_ino may overflow, check for this. */ if ((ino_t)(ctx->pos + 1) != (u64)(ctx->pos + 1)) { diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c index 8fe03b4a0d2b..d7816c01a4f6 100644 --- a/fs/vboxsf/super.c +++ b/fs/vboxsf/super.c @@ -167,6 +167,8 @@ static int vboxsf_fill_super(struct super_block *sb, struct fs_context *fc) err = super_setup_bdi_name(sb, "vboxsf-%d", sbi->bdi_id); if (err) goto fail_free; + sb->s_bdi->ra_pages = 0; + sb->s_bdi->io_pages = 0; /* Turn source into a shfl_string and map the folder */ size = strlen(fc->source) + 1; @@ -384,7 +386,7 @@ fail_nomem: static int vboxsf_parse_monolithic(struct fs_context *fc, void *data) { - char *options = data; + unsigned char *options = data; if (options && options[0] == VBSF_MOUNT_SIGNATURE_BYTE_0 && options[1] == VBSF_MOUNT_SIGNATURE_BYTE_1 && diff --git a/fs/xattr.c b/fs/xattr.c index 386b45676d7e..cd7a563e8bcd 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -232,15 +232,15 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, } /** - * __vfs_setxattr_locked: set an extended attribute while holding the inode + * __vfs_setxattr_locked - set an extended attribute while holding the inode * lock * - * @dentry - object to perform setxattr on - * @name - xattr name to set - * @value - value to set @name to - * @size - size of @value - * @flags - flags to pass into filesystem operations - * @delegated_inode - on return, will contain an inode pointer that + * @dentry: object to perform setxattr on + * @name: xattr name to set + * @value: value to set @name to + * @size: size of @value + * @flags: flags to pass into filesystem operations + * @delegated_inode: on return, will contain an inode pointer that * a delegation was broken on, NULL if none. 
*/ int @@ -443,12 +443,12 @@ __vfs_removexattr(struct dentry *dentry, const char *name) EXPORT_SYMBOL(__vfs_removexattr); /** - * __vfs_removexattr_locked: set an extended attribute while holding the inode + * __vfs_removexattr_locked - set an extended attribute while holding the inode * lock * - * @dentry - object to perform setxattr on - * @name - name of xattr to remove - * @delegated_inode - on return, will contain an inode pointer that + * @dentry: object to perform setxattr on + * @name: name of xattr to remove + * @delegated_inode: on return, will contain an inode pointer that * a delegation was broken on, NULL if none. */ int diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index e685299eb3d2..9fac5ea8d0e4 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -22,6 +22,31 @@ config XFS_FS system of your root partition is compiled as a module, you'll need to use an initial ramdisk (initrd) to boot. +config XFS_SUPPORT_V4 + bool "Support deprecated V4 (crc=0) format" + depends on XFS_FS + default y + help + The V4 filesystem format lacks certain features that are supported + by the V5 format, such as metadata checksumming, strengthened + metadata verification, and the ability to store timestamps past the + year 2038. Because of this, the V4 format is deprecated. All users + should upgrade by backing up their files, reformatting, and restoring + from the backup. + + Administrators and users can detect a V4 filesystem by running + xfs_info against a filesystem mountpoint and checking for a string + beginning with "crc=". If the string "crc=0" is found, the + filesystem is a V4 filesystem. If no such string is found, please + upgrade xfsprogs to the latest version and try again. + + This option will become default N in September 2025. Support for the + V4 format will be removed entirely in September 2030. Distributors + can say N here to withdraw support earlier. + + To continue supporting the old V4 format (crc=0), say Y. + To close off an attack surface, say N. 
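[Editor's note] The new XFS_SUPPORT_V4 option above only decides whether the kernel keeps accepting the deprecated V4 (crc=0) on-disk format; the help text explains how to identify such filesystems with xfs_info. A hedged sketch of how a mount-time gate on the option could look (the helper below is illustrative, not the actual check introduced by the series):

/* Illustrative only: refuse V4 superblocks when V4 support is compiled out. */
static bool example_sb_format_supported(struct xfs_mount *mp)
{
	if (!IS_ENABLED(CONFIG_XFS_SUPPORT_V4) &&
	    !xfs_sb_version_hascrc(&mp->m_sb)) {
		xfs_warn(mp, "Deprecated V4 format (crc=0) is not supported.");
		return false;
	}
	return true;
}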
+ config XFS_QUOTA bool "XFS Quota support" depends on XFS_FS diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index e841ed781a25..e986b95d94c9 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -93,25 +93,3 @@ kmem_alloc_large(size_t size, xfs_km_flags_t flags) return ptr; return __kmem_vmalloc(size, flags); } - -void * -kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags) -{ - int retries = 0; - gfp_t lflags = kmem_flags_convert(flags); - void *ptr; - - trace_kmem_realloc(newsize, flags, _RET_IP_); - - do { - ptr = krealloc(old, newsize, lflags); - if (ptr || (flags & KM_MAYFAIL)) - return ptr; - if (!(++retries % 100)) - xfs_err(NULL, - "%s(%u) possible memory allocation deadlock size %zu in %s (mode:0x%x)", - current->comm, current->pid, - newsize, __func__, lflags); - congestion_wait(BLK_RW_ASYNC, HZ/50); - } while (1); -} diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 8e8555817e6d..38007117697e 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -59,7 +59,6 @@ kmem_flags_convert(xfs_km_flags_t flags) extern void *kmem_alloc(size_t, xfs_km_flags_t); extern void *kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags); extern void *kmem_alloc_large(size_t size, xfs_km_flags_t); -extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t); static inline void kmem_free(const void *ptr) { kvfree(ptr); @@ -72,12 +71,6 @@ kmem_zalloc(size_t size, xfs_km_flags_t flags) return kmem_alloc(size, flags | KM_ZERO); } -static inline void * -kmem_zalloc_large(size_t size, xfs_km_flags_t flags) -{ - return kmem_alloc_large(size, flags | KM_ZERO); -} - /* * Zone interfaces */ diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 8cf73fe4338e..9331f3516afa 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -333,6 +333,11 @@ xfs_agiblock_init( } for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); + if (xfs_sb_version_hasinobtcounts(&mp->m_sb)) { + agi->agi_iblocks = cpu_to_be32(1); + if (xfs_sb_version_hasfinobt(&mp->m_sb)) + agi->agi_fblocks = cpu_to_be32(1); + } } typedef void (*aghdr_init_work_f)(struct xfs_mount *mp, struct xfs_buf *bp, diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 2e055c079f39..fd8e6418a0d3 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -428,7 +428,7 @@ xfs_attr_set( */ if (XFS_IFORK_Q(dp) == 0) { int sf_size = sizeof(struct xfs_attr_sf_hdr) + - XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, + xfs_attr_sf_entsize_byname(args->namelen, args->valuelen); error = xfs_bmap_add_attrfork(dp, sf_size, rsvd); @@ -523,6 +523,14 @@ out_trans_cancel: * External routines when attribute list is inside the inode *========================================================================*/ +static inline int xfs_attr_sf_totsize(struct xfs_inode *dp) +{ + struct xfs_attr_shortform *sf; + + sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data; + return be16_to_cpu(sf->hdr.totsize); +} + /* * Add a name to the shortform attribute list structure * This is the external routine. 
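[Editor's note] With kmem_realloc() deleted from fs/xfs/kmem.c above, its retry-on-failure loop is gone and callers are expected to use krealloc() directly. A minimal sketch under the assumption that such callers want the same never-fail semantics (the GFP flags here are an assumption, not taken from these hunks):

/* Sketch: growing an in-core buffer without the removed kmem_realloc(). */
static void *example_grow_buffer(void *old, size_t new_size)
{
	/*
	 * GFP_NOFS avoids recursing into the filesystem under memory pressure;
	 * __GFP_NOFAIL preserves kmem_realloc()'s behaviour of never
	 * returning NULL to the caller.
	 */
	return krealloc(old, new_size, GFP_NOFS | __GFP_NOFAIL);
}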
@@ -555,8 +563,8 @@ xfs_attr_shortform_addname(xfs_da_args_t *args) args->valuelen >= XFS_ATTR_SF_ENTSIZE_MAX) return -ENOSPC; - newsize = XFS_ATTR_SF_TOTSIZE(args->dp); - newsize += XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen); + newsize = xfs_attr_sf_totsize(args->dp); + newsize += xfs_attr_sf_entsize_byname(args->namelen, args->valuelen); forkoff = xfs_attr_shortform_bytesfit(args->dp, newsize); if (!forkoff) diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 305d4bc07337..bb128db220ac 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -684,9 +684,9 @@ xfs_attr_sf_findname( sf = (struct xfs_attr_shortform *)args->dp->i_afp->if_u1.if_data; sfe = &sf->list[0]; end = sf->hdr.count; - for (i = 0; i < end; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), + for (i = 0; i < end; sfe = xfs_attr_sf_nextentry(sfe), base += size, i++) { - size = XFS_ATTR_SF_ENTSIZE(sfe); + size = xfs_attr_sf_entsize(sfe); if (!xfs_attr_match(args, sfe->namelen, sfe->nameval, sfe->flags)) continue; @@ -728,15 +728,15 @@ xfs_attr_shortform_add( ifp = dp->i_afp; ASSERT(ifp->if_flags & XFS_IFINLINE); - sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; + sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; if (xfs_attr_sf_findname(args, &sfe, NULL) == -EEXIST) ASSERT(0); offset = (char *)sfe - (char *)sf; - size = XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen); + size = xfs_attr_sf_entsize_byname(args->namelen, args->valuelen); xfs_idata_realloc(dp, size, XFS_ATTR_FORK); - sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; - sfe = (xfs_attr_sf_entry_t *)((char *)sf + offset); + sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; + sfe = (struct xfs_attr_sf_entry *)((char *)sf + offset); sfe->namelen = args->namelen; sfe->valuelen = args->valuelen; @@ -787,12 +787,12 @@ xfs_attr_shortform_remove( dp = args->dp; mp = dp->i_mount; - sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data; + sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data; error = xfs_attr_sf_findname(args, &sfe, &base); if (error != -EEXIST) return error; - size = XFS_ATTR_SF_ENTSIZE(sfe); + size = xfs_attr_sf_entsize(sfe); /* * Fix up the attribute fork data, covering the hole @@ -837,8 +837,8 @@ xfs_attr_shortform_remove( int xfs_attr_shortform_lookup(xfs_da_args_t *args) { - xfs_attr_shortform_t *sf; - xfs_attr_sf_entry_t *sfe; + struct xfs_attr_shortform *sf; + struct xfs_attr_sf_entry *sfe; int i; struct xfs_ifork *ifp; @@ -846,10 +846,10 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args) ifp = args->dp->i_afp; ASSERT(ifp->if_flags & XFS_IFINLINE); - sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; + sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; - sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { + sfe = xfs_attr_sf_nextentry(sfe), i++) { if (xfs_attr_match(args, sfe->namelen, sfe->nameval, sfe->flags)) return -EEXIST; @@ -873,10 +873,10 @@ xfs_attr_shortform_getvalue( int i; ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE); - sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data; + sf = (struct xfs_attr_shortform *)args->dp->i_afp->if_u1.if_data; sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; - sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { + sfe = xfs_attr_sf_nextentry(sfe), i++) { if (xfs_attr_match(args, sfe->namelen, sfe->nameval, sfe->flags)) return xfs_attr_copy_value(args, @@ -908,12 +908,12 @@ xfs_attr_shortform_to_leaf( dp = args->dp; ifp = dp->i_afp; - sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; + sf = 
(struct xfs_attr_shortform *)ifp->if_u1.if_data; size = be16_to_cpu(sf->hdr.totsize); tmpbuffer = kmem_alloc(size, 0); ASSERT(tmpbuffer != NULL); memcpy(tmpbuffer, ifp->if_u1.if_data, size); - sf = (xfs_attr_shortform_t *)tmpbuffer; + sf = (struct xfs_attr_shortform *)tmpbuffer; xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); xfs_bmap_local_to_extents_empty(args->trans, dp, XFS_ATTR_FORK); @@ -951,7 +951,7 @@ xfs_attr_shortform_to_leaf( ASSERT(error != -ENOSPC); if (error) goto out; - sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + sfe = xfs_attr_sf_nextentry(sfe); } error = 0; *leaf_bp = bp; @@ -992,9 +992,8 @@ xfs_attr_shortform_allfit( return 0; if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX) return 0; - bytes += sizeof(struct xfs_attr_sf_entry) - 1 - + name_loc->namelen - + be16_to_cpu(name_loc->valuelen); + bytes += xfs_attr_sf_entsize_byname(name_loc->namelen, + be16_to_cpu(name_loc->valuelen)); } if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) && (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && @@ -1039,7 +1038,7 @@ xfs_attr_shortform_verify( * xfs_attr_sf_entry is defined with a 1-byte variable * array at the end, so we must subtract that off. */ - if (((char *)sfep + sizeof(*sfep) - 1) >= endp) + if (((char *)sfep + sizeof(*sfep)) >= endp) return __this_address; /* Don't allow names with known bad length. */ @@ -1051,7 +1050,7 @@ xfs_attr_shortform_verify( * within the data buffer. The next entry starts after the * name component, so nextentry is an acceptable test. */ - next_sfep = XFS_ATTR_SF_NEXTENTRY(sfep); + next_sfep = xfs_attr_sf_nextentry(sfep); if ((char *)next_sfep > endp) return __this_address; diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 3f80cede7406..48d8e9caf86f 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -96,8 +96,6 @@ xfs_attr3_rmt_verify( { struct xfs_attr3_rmt_hdr *rmt = ptr; - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return __this_address; if (!xfs_verify_magic(bp, rmt->rm_magic)) return __this_address; if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid)) diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h index bb004fb7944a..37578b369d9b 100644 --- a/fs/xfs/libxfs/xfs_attr_sf.h +++ b/fs/xfs/libxfs/xfs_attr_sf.h @@ -13,7 +13,6 @@ * to fit into the literal area of the inode. */ typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t; -typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t; /* * We generate this then sort it, attr_list() must return things in hash-order. 
@@ -27,16 +26,26 @@ typedef struct xfs_attr_sf_sort { unsigned char *name; /* name value, pointer into buffer */ } xfs_attr_sf_sort_t; -#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen) /* space name/value uses */ \ - (((int)sizeof(xfs_attr_sf_entry_t)-1 + (nlen)+(vlen))) #define XFS_ATTR_SF_ENTSIZE_MAX /* max space for name&value */ \ ((1 << (NBBY*(int)sizeof(uint8_t))) - 1) -#define XFS_ATTR_SF_ENTSIZE(sfep) /* space an entry uses */ \ - ((int)sizeof(xfs_attr_sf_entry_t)-1 + (sfep)->namelen+(sfep)->valuelen) -#define XFS_ATTR_SF_NEXTENTRY(sfep) /* next entry in struct */ \ - ((xfs_attr_sf_entry_t *)((char *)(sfep) + XFS_ATTR_SF_ENTSIZE(sfep))) -#define XFS_ATTR_SF_TOTSIZE(dp) /* total space in use */ \ - (be16_to_cpu(((xfs_attr_shortform_t *) \ - ((dp)->i_afp->if_u1.if_data))->hdr.totsize)) + +/* space name/value uses */ +static inline int xfs_attr_sf_entsize_byname(uint8_t nlen, uint8_t vlen) +{ + return sizeof(struct xfs_attr_sf_entry) + nlen + vlen; +} + +/* space an entry uses */ +static inline int xfs_attr_sf_entsize(struct xfs_attr_sf_entry *sfep) +{ + return struct_size(sfep, nameval, sfep->namelen + sfep->valuelen); +} + +/* next entry in struct */ +static inline struct xfs_attr_sf_entry * +xfs_attr_sf_nextentry(struct xfs_attr_sf_entry *sfep) +{ + return (void *)sfep + xfs_attr_sf_entsize(sfep); +} #endif /* __XFS_ATTR_SF_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 1b0a01b06a05..d9a692484eae 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5046,20 +5046,25 @@ xfs_bmap_del_extent_real( flags = XFS_ILOG_CORE; if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) { - xfs_fsblock_t bno; xfs_filblks_t len; xfs_extlen_t mod; - bno = div_u64_rem(del->br_startblock, mp->m_sb.sb_rextsize, - &mod); - ASSERT(mod == 0); len = div_u64_rem(del->br_blockcount, mp->m_sb.sb_rextsize, &mod); ASSERT(mod == 0); - error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len); - if (error) - goto done; + if (!(bflags & XFS_BMAPI_REMAP)) { + xfs_fsblock_t bno; + + bno = div_u64_rem(del->br_startblock, + mp->m_sb.sb_rextsize, &mod); + ASSERT(mod == 0); + + error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len); + if (error) + goto done; + } + do_fx = 0; nblks = len * mp->m_sb.sb_rextsize; qfield = XFS_TRANS_DQ_RTBCOUNT; diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 059ac108b1b3..b876b44c0204 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -15,8 +15,8 @@ */ #define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */ #define XFS_ATTR_LEAF_MAGIC 0xfbee /* magic number: attribute leaf blks */ -#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */ -#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */ +#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */ +#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */ typedef struct xfs_da_blkinfo { __be32 forw; /* previous block in list */ @@ -35,8 +35,8 @@ typedef struct xfs_da_blkinfo { */ #define XFS_DA3_NODE_MAGIC 0x3ebe /* magic number: non-leaf blocks */ #define XFS_ATTR3_LEAF_MAGIC 0x3bee /* magic number: attribute leaf blks */ -#define XFS_DIR3_LEAF1_MAGIC 0x3df1 /* magic number: v2 dirlf single blks */ -#define XFS_DIR3_LEAFN_MAGIC 0x3dff /* magic number: v2 dirlf multi blks */ +#define XFS_DIR3_LEAF1_MAGIC 0x3df1 /* magic number: v3 dirlf single blks */ +#define XFS_DIR3_LEAFN_MAGIC 0x3dff /* magic number: v3 dirlf multi blks */ struct 
xfs_da3_blkinfo { /* @@ -61,7 +61,7 @@ struct xfs_da3_blkinfo { * Since we have duplicate keys, use a binary search but always follow * all match in the block, not just the first match found. */ -#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */ +#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */ typedef struct xfs_da_node_hdr { struct xfs_da_blkinfo info; /* block type, links, etc. */ @@ -579,7 +579,7 @@ xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp) /* * Entries are packed toward the top as tight as possible. */ -typedef struct xfs_attr_shortform { +struct xfs_attr_shortform { struct xfs_attr_sf_hdr { /* constant-structure header block */ __be16 totsize; /* total bytes in shortform list */ __u8 count; /* count of active entries */ @@ -589,9 +589,9 @@ typedef struct xfs_attr_shortform { uint8_t namelen; /* actual length of name (no NULL) */ uint8_t valuelen; /* actual length of value (no NULL) */ uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ - uint8_t nameval[1]; /* name & value bytes concatenated */ + uint8_t nameval[]; /* name & value bytes concatenated */ } list[1]; /* variable sized array */ -} xfs_attr_shortform_t; +}; typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */ __be16 base; /* base of free region */ @@ -746,14 +746,14 @@ xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx) */ static inline int xfs_attr_leaf_entsize_remote(int nlen) { - return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \ - XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1); + return round_up(sizeof(struct xfs_attr_leaf_name_remote) - 1 + + nlen, XFS_ATTR_LEAF_NAME_ALIGN); } static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen) { - return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) + - XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1); + return round_up(sizeof(struct xfs_attr_leaf_name_local) - 1 + + nlen + vlen, XFS_ATTR_LEAF_NAME_ALIGN); } static inline int xfs_attr_leaf_entsize_local_max(int bsize) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index d8f586256add..eff4a127188e 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -16,6 +16,8 @@ #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_log.h" /* * Deferred Operations in XFS @@ -186,8 +188,9 @@ xfs_defer_create_intent( { const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; - dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work, - dfp->dfp_count, sort); + if (!dfp->dfp_intent) + dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work, + dfp->dfp_count, sort); } /* @@ -312,22 +315,6 @@ xfs_defer_trans_roll( } /* - * Reset an already used dfops after finish. - */ -static void -xfs_defer_reset( - struct xfs_trans *tp) -{ - ASSERT(list_empty(&tp->t_dfops)); - - /* - * Low mode state transfers across transaction rolls to mirror dfops - * lifetime. Clear it now that dfops is reset. - */ - tp->t_flags &= ~XFS_TRANS_LOWMODE; -} - -/* * Free up any items left in the list. */ static void @@ -360,6 +347,58 @@ xfs_defer_cancel_list( } /* + * Prevent a log intent item from pinning the tail of the log by logging a + * done item to release the intent item; and then log a new intent item. + * The caller should provide a fresh transaction and roll it after we're done. 
+ */ +static int +xfs_defer_relog( + struct xfs_trans **tpp, + struct list_head *dfops) +{ + struct xlog *log = (*tpp)->t_mountp->m_log; + struct xfs_defer_pending *dfp; + xfs_lsn_t threshold_lsn = NULLCOMMITLSN; + + + ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES); + + list_for_each_entry(dfp, dfops, dfp_list) { + /* + * If the log intent item for this deferred op is not a part of + * the current log checkpoint, relog the intent item to keep + * the log tail moving forward. We're ok with this being racy + * because an incorrect decision means we'll be a little slower + * at pushing the tail. + */ + if (dfp->dfp_intent == NULL || + xfs_log_item_in_current_chkpt(dfp->dfp_intent)) + continue; + + /* + * Figure out where we need the tail to be in order to maintain + * the minimum required free space in the log. Only sample + * the log threshold once per call. + */ + if (threshold_lsn == NULLCOMMITLSN) { + threshold_lsn = xlog_grant_push_threshold(log, 0); + if (threshold_lsn == NULLCOMMITLSN) + break; + } + if (XFS_LSN_CMP(dfp->dfp_intent->li_lsn, threshold_lsn) >= 0) + continue; + + trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp); + XFS_STATS_INC((*tpp)->t_mountp, defer_relog); + dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, *tpp); + } + + if ((*tpp)->t_flags & XFS_TRANS_DIRTY) + return xfs_defer_trans_roll(tpp); + return 0; +} + +/* * Log an intent-done item for the first pending intent, and finish the work * items. */ @@ -390,6 +429,7 @@ xfs_defer_finish_one( list_add(li, &dfp->dfp_work); dfp->dfp_count++; dfp->dfp_done = NULL; + dfp->dfp_intent = NULL; xfs_defer_create_intent(tp, dfp, false); } @@ -428,13 +468,27 @@ xfs_defer_finish_noroll( /* Until we run out of pending work to finish... */ while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) { + /* + * Deferred items that are created in the process of finishing + * other deferred work items should be queued at the head of + * the pending list, which puts them ahead of the deferred work + * that was created by the caller. This keeps the number of + * pending work items to a minimum, which decreases the amount + * of time that any one intent item can stick around in memory, + * pinning the log tail. + */ xfs_defer_create_intents(*tp); - list_splice_tail_init(&(*tp)->t_dfops, &dop_pending); + list_splice_init(&(*tp)->t_dfops, &dop_pending); error = xfs_defer_trans_roll(tp); if (error) goto out_shutdown; + /* Possibly relog intent items to keep the log moving. */ + error = xfs_defer_relog(tp, &dop_pending); + if (error) + goto out_shutdown; + dfp = list_first_entry(&dop_pending, struct xfs_defer_pending, dfp_list); error = xfs_defer_finish_one(*tp, dfp); @@ -475,7 +529,10 @@ xfs_defer_finish( return error; } } - xfs_defer_reset(*tp); + + /* Reset LOWMODE now that we've finished all the dfops. */ + ASSERT(list_empty(&(*tp)->t_dfops)); + (*tp)->t_flags &= ~XFS_TRANS_LOWMODE; return 0; } @@ -549,6 +606,139 @@ xfs_defer_move( * that behavior. */ dtp->t_flags |= (stp->t_flags & XFS_TRANS_LOWMODE); + stp->t_flags &= ~XFS_TRANS_LOWMODE; +} + +/* + * Prepare a chain of fresh deferred ops work items to be completed later. Log + * recovery requires the ability to put off until later the actual finishing + * work so that it can process unfinished items recovered from the log in + * correct order. + * + * Create and log intent items for all the work that we're capturing so that we + * can be assured that the items will get replayed if the system goes down + * before log recovery gets a chance to finish the work it put off. 
The entire + * deferred ops state is transferred to the capture structure and the + * transaction is then ready for the caller to commit it. If there are no + * intent items to capture, this function returns NULL. + * + * If capture_ip is not NULL, the capture structure will obtain an extra + * reference to the inode. + */ +static struct xfs_defer_capture * +xfs_defer_ops_capture( + struct xfs_trans *tp, + struct xfs_inode *capture_ip) +{ + struct xfs_defer_capture *dfc; + + if (list_empty(&tp->t_dfops)) + return NULL; + + /* Create an object to capture the defer ops. */ + dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS); + INIT_LIST_HEAD(&dfc->dfc_list); + INIT_LIST_HEAD(&dfc->dfc_dfops); + + xfs_defer_create_intents(tp); + + /* Move the dfops chain and transaction state to the capture struct. */ + list_splice_init(&tp->t_dfops, &dfc->dfc_dfops); + dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE; + tp->t_flags &= ~XFS_TRANS_LOWMODE; + + /* Capture the remaining block reservations along with the dfops. */ + dfc->dfc_blkres = tp->t_blk_res - tp->t_blk_res_used; + dfc->dfc_rtxres = tp->t_rtx_res - tp->t_rtx_res_used; + + /* Preserve the log reservation size. */ + dfc->dfc_logres = tp->t_log_res; + + /* + * Grab an extra reference to this inode and attach it to the capture + * structure. + */ + if (capture_ip) { + ihold(VFS_I(capture_ip)); + dfc->dfc_capture_ip = capture_ip; + } + + return dfc; +} + +/* Release all resources that we used to capture deferred ops. */ +void +xfs_defer_ops_release( + struct xfs_mount *mp, + struct xfs_defer_capture *dfc) +{ + xfs_defer_cancel_list(mp, &dfc->dfc_dfops); + if (dfc->dfc_capture_ip) + xfs_irele(dfc->dfc_capture_ip); + kmem_free(dfc); +} + +/* + * Capture any deferred ops and commit the transaction. This is the last step + * needed to finish a log intent item that we recovered from the log. If any + * of the deferred ops operate on an inode, the caller must pass in that inode + * so that the reference can be transferred to the capture structure. The + * caller must hold ILOCK_EXCL on the inode, and must unlock it before calling + * xfs_defer_ops_continue. + */ +int +xfs_defer_ops_capture_and_commit( + struct xfs_trans *tp, + struct xfs_inode *capture_ip, + struct list_head *capture_list) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_defer_capture *dfc; + int error; + + ASSERT(!capture_ip || xfs_isilocked(capture_ip, XFS_ILOCK_EXCL)); + + /* If we don't capture anything, commit transaction and exit. */ + dfc = xfs_defer_ops_capture(tp, capture_ip); + if (!dfc) + return xfs_trans_commit(tp); + + /* Commit the transaction and add the capture structure to the list. */ + error = xfs_trans_commit(tp); + if (error) { + xfs_defer_ops_release(mp, dfc); + return error; + } + + list_add_tail(&dfc->dfc_list, capture_list); + return 0; +} + +/* + * Attach a chain of captured deferred ops to a new transaction and free the + * capture structure. If an inode was captured, it will be passed back to the + * caller with ILOCK_EXCL held and joined to the transaction with lockflags==0. + * The caller now owns the inode reference. + */ +void +xfs_defer_ops_continue( + struct xfs_defer_capture *dfc, + struct xfs_trans *tp, + struct xfs_inode **captured_ipp) +{ + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY)); + + /* Lock and join the captured inode to the new transaction. 
*/ + if (dfc->dfc_capture_ip) { + xfs_ilock(dfc->dfc_capture_ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dfc->dfc_capture_ip, 0); + } + *captured_ipp = dfc->dfc_capture_ip; + + /* Move captured dfops chain and state to the transaction. */ + list_splice_init(&dfc->dfc_dfops, &tp->t_dfops); + tp->t_flags |= dfc->dfc_tpflags; - xfs_defer_reset(stp); + kmem_free(dfc); } diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 6b2ca580f2b0..05472f71fffe 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -8,6 +8,7 @@ struct xfs_btree_cur; struct xfs_defer_op_type; +struct xfs_defer_capture; /* * Header for deferred operation list. @@ -63,4 +64,40 @@ extern const struct xfs_defer_op_type xfs_rmap_update_defer_type; extern const struct xfs_defer_op_type xfs_extent_free_defer_type; extern const struct xfs_defer_op_type xfs_agfl_free_defer_type; +/* + * This structure enables a dfops user to detach the chain of deferred + * operations from a transaction so that they can be continued later. + */ +struct xfs_defer_capture { + /* List of other capture structures. */ + struct list_head dfc_list; + + /* Deferred ops state saved from the transaction. */ + struct list_head dfc_dfops; + unsigned int dfc_tpflags; + + /* Block reservations for the data and rt devices. */ + unsigned int dfc_blkres; + unsigned int dfc_rtxres; + + /* Log reservation saved from the transaction. */ + unsigned int dfc_logres; + + /* + * An inode reference that must be maintained to complete the deferred + * work. + */ + struct xfs_inode *dfc_capture_ip; +}; + +/* + * Functions to capture a chain of deferred operations and continue them later. + * This doesn't normally happen except log recovery. + */ +int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp, + struct xfs_inode *capture_ip, struct list_head *capture_list); +void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp, + struct xfs_inode **captured_ipp); +void xfs_defer_ops_release(struct xfs_mount *mp, struct xfs_defer_capture *d); + #endif /* __XFS_DEFER_H__ */ diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index 5a2db00b9d5f..6766417d5ba4 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -69,6 +69,13 @@ xfs_dquot_verify( ddq_type != XFS_DQTYPE_GROUP) return __this_address; + if ((ddq->d_type & XFS_DQTYPE_BIGTIME) && + !xfs_sb_version_hasbigtime(&mp->m_sb)) + return __this_address; + + if ((ddq->d_type & XFS_DQTYPE_BIGTIME) && !ddq->d_id) + return __this_address; + if (id != -1 && id != be32_to_cpu(ddq->d_id)) return __this_address; @@ -288,3 +295,31 @@ const struct xfs_buf_ops xfs_dquot_buf_ra_ops = { .verify_read = xfs_dquot_buf_readahead_verify, .verify_write = xfs_dquot_buf_write_verify, }; + +/* Convert an on-disk timer value into an incore timer value. */ +time64_t +xfs_dquot_from_disk_ts( + struct xfs_disk_dquot *ddq, + __be32 dtimer) +{ + uint32_t t = be32_to_cpu(dtimer); + + if (t != 0 && (ddq->d_type & XFS_DQTYPE_BIGTIME)) + return xfs_dq_bigtime_to_unix(t); + + return t; +} + +/* Convert an incore timer value into an on-disk timer value. 
*/ +__be32 +xfs_dquot_to_disk_ts( + struct xfs_dquot *dqp, + time64_t timer) +{ + uint32_t t = timer; + + if (timer != 0 && (dqp->q_type & XFS_DQTYPE_BIGTIME)) + t = xfs_dq_unix_to_bigtime(timer); + + return cpu_to_be32(t); +} diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 31b7ece985bb..dd764da08f6f 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -449,10 +449,12 @@ xfs_sb_has_compat_feature( #define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */ #define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */ #define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */ +#define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3) /* inobt block counts */ #define XFS_SB_FEAT_RO_COMPAT_ALL \ (XFS_SB_FEAT_RO_COMPAT_FINOBT | \ XFS_SB_FEAT_RO_COMPAT_RMAPBT | \ - XFS_SB_FEAT_RO_COMPAT_REFLINK) + XFS_SB_FEAT_RO_COMPAT_REFLINK| \ + XFS_SB_FEAT_RO_COMPAT_INOBTCNT) #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL static inline bool xfs_sb_has_ro_compat_feature( @@ -465,10 +467,12 @@ xfs_sb_has_ro_compat_feature( #define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */ #define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */ #define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */ +#define XFS_SB_FEAT_INCOMPAT_BIGTIME (1 << 3) /* large timestamps */ #define XFS_SB_FEAT_INCOMPAT_ALL \ (XFS_SB_FEAT_INCOMPAT_FTYPE| \ XFS_SB_FEAT_INCOMPAT_SPINODES| \ - XFS_SB_FEAT_INCOMPAT_META_UUID) + XFS_SB_FEAT_INCOMPAT_META_UUID| \ + XFS_SB_FEAT_INCOMPAT_BIGTIME) #define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL static inline bool @@ -563,6 +567,23 @@ static inline bool xfs_sb_version_hasreflink(struct xfs_sb *sbp) (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_REFLINK); } +static inline bool xfs_sb_version_hasbigtime(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_BIGTIME); +} + +/* + * Inode btree block counter. We record the number of inobt and finobt blocks + * in the AGI header so that we can skip the finobt walk at mount time when + * setting up per-AG reservations. 
+ */ +static inline bool xfs_sb_version_hasinobtcounts(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && + (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT); +} + /* * end of superblock version macros */ @@ -765,6 +786,9 @@ typedef struct xfs_agi { __be32 agi_free_root; /* root of the free inode btree */ __be32 agi_free_level;/* levels in free inode btree */ + __be32 agi_iblocks; /* inobt blocks used */ + __be32 agi_fblocks; /* finobt blocks used */ + /* structure must be padded to 64 bit alignment */ } xfs_agi_t; @@ -785,7 +809,8 @@ typedef struct xfs_agi { #define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1) #define XFS_AGI_FREE_ROOT (1 << 11) #define XFS_AGI_FREE_LEVEL (1 << 12) -#define XFS_AGI_NUM_BITS_R2 13 +#define XFS_AGI_IBLOCKS (1 << 13) /* both inobt/finobt block counters */ +#define XFS_AGI_NUM_BITS_R2 14 /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log)) @@ -831,10 +856,87 @@ struct xfs_agfl { ASSERT(xfs_daddr_to_agno(mp, d) == \ xfs_daddr_to_agno(mp, (d) + (len) - 1))) -typedef struct xfs_timestamp { +/* + * XFS Timestamps + * ============== + * + * Traditional ondisk inode timestamps consist of signed 32-bit counters for + * seconds and nanoseconds; time zero is the Unix epoch, Jan 1 00:00:00 UTC + * 1970, which means that the timestamp epoch is the same as the Unix epoch. + * Therefore, the ondisk min and max defined here can be used directly to + * constrain the incore timestamps on a Unix system. Note that we actually + * encode a __be64 value on disk. + * + * When the bigtime feature is enabled, ondisk inode timestamps become an + * unsigned 64-bit nanoseconds counter. This means that the bigtime inode + * timestamp epoch is the start of the classic timestamp range, which is + * Dec 13 20:45:52 UTC 1901. Because the epochs are not the same, callers + * /must/ use the bigtime conversion functions when encoding and decoding raw + * timestamps. + */ +typedef __be64 xfs_timestamp_t; + +/* Legacy timestamp encoding format. */ +struct xfs_legacy_timestamp { __be32 t_sec; /* timestamp seconds */ __be32 t_nsec; /* timestamp nanoseconds */ -} xfs_timestamp_t; +}; + +/* + * Smallest possible ondisk seconds value with traditional timestamps. This + * corresponds exactly with the incore timestamp Dec 13 20:45:52 UTC 1901. + */ +#define XFS_LEGACY_TIME_MIN ((int64_t)S32_MIN) + +/* + * Largest possible ondisk seconds value with traditional timestamps. This + * corresponds exactly with the incore timestamp Jan 19 03:14:07 UTC 2038. + */ +#define XFS_LEGACY_TIME_MAX ((int64_t)S32_MAX) + +/* + * Smallest possible ondisk seconds value with bigtime timestamps. This + * corresponds (after conversion to a Unix timestamp) with the traditional + * minimum timestamp of Dec 13 20:45:52 UTC 1901. + */ +#define XFS_BIGTIME_TIME_MIN ((int64_t)0) + +/* + * Largest supported ondisk seconds value with bigtime timestamps. This + * corresponds (after conversion to a Unix timestamp) with an incore timestamp + * of Jul 2 20:20:24 UTC 2486. + * + * We round down the ondisk limit so that the bigtime quota and inode max + * timestamps will be the same. + */ +#define XFS_BIGTIME_TIME_MAX ((int64_t)((-1ULL / NSEC_PER_SEC) & ~0x3ULL)) + +/* + * Bigtime epoch is set exactly to the minimum time value that a traditional + * 32-bit timestamp can represent when using the Unix epoch as a reference. + * Hence the Unix epoch is at a fixed offset into the supported bigtime + * timestamp range. 
+ * + * The bigtime epoch also matches the minimum value an on-disk 32-bit XFS + * timestamp can represent so we will not lose any fidelity in converting + * to/from unix and bigtime timestamps. + * + * The following conversion factor converts a seconds counter from the Unix + * epoch to the bigtime epoch. + */ +#define XFS_BIGTIME_EPOCH_OFFSET (-(int64_t)S32_MIN) + +/* Convert a timestamp from the Unix epoch to the bigtime epoch. */ +static inline uint64_t xfs_unix_to_bigtime(time64_t unix_seconds) +{ + return (uint64_t)unix_seconds + XFS_BIGTIME_EPOCH_OFFSET; +} + +/* Convert a timestamp from the bigtime epoch to the Unix epoch. */ +static inline time64_t xfs_bigtime_to_unix(uint64_t ondisk_seconds) +{ + return (time64_t)ondisk_seconds - XFS_BIGTIME_EPOCH_OFFSET; +} /* * On-disk inode structure. @@ -1061,12 +1163,22 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) #define XFS_DIFLAG2_DAX_BIT 0 /* use DAX for this inode */ #define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */ #define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */ +#define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */ + #define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) #define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT) #define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT) +#define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT) #define XFS_DIFLAG2_ANY \ - (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE) + (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \ + XFS_DIFLAG2_BIGTIME) + +static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) +{ + return dip->di_version >= 3 && + (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_BIGTIME)); +} /* * Inode number format: @@ -1152,13 +1264,98 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) #define XFS_DQTYPE_USER 0x01 /* user dquot record */ #define XFS_DQTYPE_PROJ 0x02 /* project dquot record */ #define XFS_DQTYPE_GROUP 0x04 /* group dquot record */ +#define XFS_DQTYPE_BIGTIME 0x80 /* large expiry timestamps */ /* bitmask to determine if this is a user/group/project dquot */ #define XFS_DQTYPE_REC_MASK (XFS_DQTYPE_USER | \ XFS_DQTYPE_PROJ | \ XFS_DQTYPE_GROUP) -#define XFS_DQTYPE_ANY (XFS_DQTYPE_REC_MASK) +#define XFS_DQTYPE_ANY (XFS_DQTYPE_REC_MASK | \ + XFS_DQTYPE_BIGTIME) + +/* + * XFS Quota Timers + * ================ + * + * Traditional quota grace period expiration timers are an unsigned 32-bit + * seconds counter; time zero is the Unix epoch, Jan 1 00:00:00 UTC 1970. + * Note that an expiration value of zero means that the quota limit has not + * been reached, and therefore no expiration has been set. Therefore, the + * ondisk min and max defined here can be used directly to constrain the incore + * quota expiration timestamps on a Unix system. + * + * When bigtime is enabled, we trade two bits of precision to expand the + * expiration timeout range to match that of big inode timestamps. The min and + * max recorded here are the on-disk limits, not a Unix timestamp. + * + * The grace period for each quota type is stored in the root dquot (id = 0) + * and is applied to a non-root dquot when it exceeds the soft or hard limits. + * The lengths of quota grace periods are unsigned 32-bit quantities measured in + * units of seconds. A value of zero means to use the default period. + */ + +/* + * Smallest possible ondisk quota expiration value with traditional timestamps. 
+ * This corresponds exactly with the incore expiration Jan 1 00:00:01 UTC 1970. + */ +#define XFS_DQ_LEGACY_EXPIRY_MIN ((int64_t)1) + +/* + * Largest possible ondisk quota expiration value with traditional timestamps. + * This corresponds exactly with the incore expiration Feb 7 06:28:15 UTC 2106. + */ +#define XFS_DQ_LEGACY_EXPIRY_MAX ((int64_t)U32_MAX) + +/* + * Smallest possible ondisk quota expiration value with bigtime timestamps. + * This corresponds (after conversion to a Unix timestamp) with the incore + * expiration of Jan 1 00:00:04 UTC 1970. + */ +#define XFS_DQ_BIGTIME_EXPIRY_MIN (XFS_DQ_LEGACY_EXPIRY_MIN) + +/* + * Largest supported ondisk quota expiration value with bigtime timestamps. + * This corresponds (after conversion to a Unix timestamp) with an incore + * expiration of Jul 2 20:20:24 UTC 2486. + * + * The ondisk field supports values up to -1U, which corresponds to an incore + * expiration in 2514. This is beyond the maximum the bigtime inode timestamp, + * so we cap the maximum bigtime quota expiration to the max inode timestamp. + */ +#define XFS_DQ_BIGTIME_EXPIRY_MAX ((int64_t)4074815106U) + +/* + * The following conversion factors assist in converting a quota expiration + * timestamp between the incore and ondisk formats. + */ +#define XFS_DQ_BIGTIME_SHIFT (2) +#define XFS_DQ_BIGTIME_SLACK ((int64_t)(1ULL << XFS_DQ_BIGTIME_SHIFT) - 1) + +/* Convert an incore quota expiration timestamp to an ondisk bigtime value. */ +static inline uint32_t xfs_dq_unix_to_bigtime(time64_t unix_seconds) +{ + /* + * Round the expiration timestamp up to the nearest bigtime timestamp + * that we can store, to give users the most time to fix problems. + */ + return ((uint64_t)unix_seconds + XFS_DQ_BIGTIME_SLACK) >> + XFS_DQ_BIGTIME_SHIFT; +} + +/* Convert an ondisk bigtime quota expiration value to an incore timestamp. */ +static inline time64_t xfs_dq_bigtime_to_unix(uint32_t ondisk_seconds) +{ + return (time64_t)ondisk_seconds << XFS_DQ_BIGTIME_SHIFT; +} + +/* + * Default quota grace periods, ranging from zero (use the compiled defaults) + * to ~136 years. These are applied to a non-root dquot that has exceeded + * either limit. + */ +#define XFS_DQ_GRACE_MIN ((int64_t)0) +#define XFS_DQ_GRACE_MAX ((int64_t)U32_MAX) /* * This is the main portion of the on-disk representation of quota information diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 84bcffa87753..2a2e3cfd94f0 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -249,6 +249,7 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_SPINODES (1 << 18) /* sparse inode chunks */ #define XFS_FSOP_GEOM_FLAGS_RMAPBT (1 << 19) /* reverse mapping btree */ #define XFS_FSOP_GEOM_FLAGS_REFLINK (1 << 20) /* files can share blocks */ +#define XFS_FSOP_GEOM_FLAGS_BIGTIME (1 << 21) /* 64-bit nsec timestamps */ /* * Minimum and maximum sizes need for growth checks. diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index a6b37db55169..974e71bc4a3a 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2473,6 +2473,7 @@ xfs_ialloc_log_agi( offsetof(xfs_agi_t, agi_unlinked), offsetof(xfs_agi_t, agi_free_root), offsetof(xfs_agi_t, agi_free_level), + offsetof(xfs_agi_t, agi_iblocks), sizeof(xfs_agi_t) }; #ifdef DEBUG @@ -2806,6 +2807,10 @@ xfs_ialloc_setup_geometry( uint64_t icount; uint inodes; + igeo->new_diflags2 = 0; + if (xfs_sb_version_hasbigtime(&mp->m_sb)) + igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME; + /* Compute inode btree geometry. 
*/ igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog; igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1); diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 3c8aebc36e64..cc919a2ee870 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -67,6 +67,25 @@ xfs_finobt_set_root( XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL); } +/* Update the inode btree block counter for this btree. */ +static inline void +xfs_inobt_mod_blockcount( + struct xfs_btree_cur *cur, + int howmuch) +{ + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agi *agi = agbp->b_addr; + + if (!xfs_sb_version_hasinobtcounts(&cur->bc_mp->m_sb)) + return; + + if (cur->bc_btnum == XFS_BTNUM_FINO) + be32_add_cpu(&agi->agi_fblocks, howmuch); + else if (cur->bc_btnum == XFS_BTNUM_INO) + be32_add_cpu(&agi->agi_iblocks, howmuch); + xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_IBLOCKS); +} + STATIC int __xfs_inobt_alloc_block( struct xfs_btree_cur *cur, @@ -102,6 +121,7 @@ __xfs_inobt_alloc_block( new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno)); *stat = 1; + xfs_inobt_mod_blockcount(cur, 1); return 0; } @@ -134,6 +154,7 @@ __xfs_inobt_free_block( struct xfs_buf *bp, enum xfs_ag_resv_type resv) { + xfs_inobt_mod_blockcount(cur, -1); return xfs_free_extent(cur->bc_tp, XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1, &XFS_RMAP_OINFO_INOBT, resv); @@ -480,19 +501,29 @@ xfs_inobt_commit_staged_btree( { struct xfs_agi *agi = agbp->b_addr; struct xbtree_afakeroot *afake = cur->bc_ag.afake; + int fields; ASSERT(cur->bc_flags & XFS_BTREE_STAGING); if (cur->bc_btnum == XFS_BTNUM_INO) { + fields = XFS_AGI_ROOT | XFS_AGI_LEVEL; agi->agi_root = cpu_to_be32(afake->af_root); agi->agi_level = cpu_to_be32(afake->af_levels); - xfs_ialloc_log_agi(tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL); + if (xfs_sb_version_hasinobtcounts(&cur->bc_mp->m_sb)) { + agi->agi_iblocks = cpu_to_be32(afake->af_blocks); + fields |= XFS_AGI_IBLOCKS; + } + xfs_ialloc_log_agi(tp, agbp, fields); xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_inobt_ops); } else { + fields = XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL; agi->agi_free_root = cpu_to_be32(afake->af_root); agi->agi_free_level = cpu_to_be32(afake->af_levels); - xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREE_ROOT | - XFS_AGI_FREE_LEVEL); + if (xfs_sb_version_hasinobtcounts(&cur->bc_mp->m_sb)) { + agi->agi_fblocks = cpu_to_be32(afake->af_blocks); + fields |= XFS_AGI_IBLOCKS; + } + xfs_ialloc_log_agi(tp, agbp, fields); xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_finobt_ops); } } @@ -673,6 +704,28 @@ xfs_inobt_count_blocks( return error; } +/* Read finobt block count from AGI header. */ +static int +xfs_finobt_read_blocks( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_extlen_t *tree_blocks) +{ + struct xfs_buf *agbp; + struct xfs_agi *agi; + int error; + + error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); + if (error) + return error; + + agi = agbp->b_addr; + *tree_blocks = be32_to_cpu(agi->agi_fblocks); + xfs_trans_brelse(tp, agbp); + return 0; +} + /* * Figure out how many blocks to reserve and how many are used by this btree. 
*/ @@ -690,7 +743,11 @@ xfs_finobt_calc_reserves( if (!xfs_sb_version_hasfinobt(&mp->m_sb)) return 0; - error = xfs_inobt_count_blocks(mp, tp, agno, XFS_BTNUM_FINO, &tree_len); + if (xfs_sb_version_hasinobtcounts(&mp->m_sb)) + error = xfs_finobt_read_blocks(mp, tp, agno, &tree_len); + else + error = xfs_inobt_count_blocks(mp, tp, agno, XFS_BTNUM_FINO, + &tree_len); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c index 52451809c478..b4164256993d 100644 --- a/fs/xfs/libxfs/xfs_iext_tree.c +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -603,7 +603,7 @@ xfs_iext_realloc_root( if (new_size / sizeof(struct xfs_iext_rec) == RECS_PER_LEAF) new_size = NODE_SIZE; - new = kmem_realloc(ifp->if_u1.if_root, new_size, KM_NOFS); + new = krealloc(ifp->if_u1.if_root, new_size, GFP_NOFS | __GFP_NOFAIL); memset(new + ifp->if_bytes, 0, new_size - ifp->if_bytes); ifp->if_u1.if_root = new; cur->leaf = new; diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 8d5dd08eab75..c667c63f2cb0 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -157,6 +157,36 @@ xfs_imap_to_bp( return 0; } +static inline struct timespec64 xfs_inode_decode_bigtime(uint64_t ts) +{ + struct timespec64 tv; + uint32_t n; + + tv.tv_sec = xfs_bigtime_to_unix(div_u64_rem(ts, NSEC_PER_SEC, &n)); + tv.tv_nsec = n; + + return tv; +} + +/* Convert an ondisk timestamp to an incore timestamp. */ +struct timespec64 +xfs_inode_from_disk_ts( + struct xfs_dinode *dip, + const xfs_timestamp_t ts) +{ + struct timespec64 tv; + struct xfs_legacy_timestamp *lts; + + if (xfs_dinode_has_bigtime(dip)) + return xfs_inode_decode_bigtime(be64_to_cpu(ts)); + + lts = (struct xfs_legacy_timestamp *)&ts; + tv.tv_sec = (int)be32_to_cpu(lts->t_sec); + tv.tv_nsec = (int)be32_to_cpu(lts->t_nsec); + + return tv; +} + int xfs_inode_from_disk( struct xfs_inode *ip, @@ -211,12 +241,9 @@ xfs_inode_from_disk( * a time before epoch is converted to a time long after epoch * on 64 bit systems. */ - inode->i_atime.tv_sec = (int)be32_to_cpu(from->di_atime.t_sec); - inode->i_atime.tv_nsec = (int)be32_to_cpu(from->di_atime.t_nsec); - inode->i_mtime.tv_sec = (int)be32_to_cpu(from->di_mtime.t_sec); - inode->i_mtime.tv_nsec = (int)be32_to_cpu(from->di_mtime.t_nsec); - inode->i_ctime.tv_sec = (int)be32_to_cpu(from->di_ctime.t_sec); - inode->i_ctime.tv_nsec = (int)be32_to_cpu(from->di_ctime.t_nsec); + inode->i_atime = xfs_inode_from_disk_ts(from, from->di_atime); + inode->i_mtime = xfs_inode_from_disk_ts(from, from->di_mtime); + inode->i_ctime = xfs_inode_from_disk_ts(from, from->di_ctime); to->di_size = be64_to_cpu(from->di_size); to->di_nblocks = be64_to_cpu(from->di_nblocks); @@ -229,8 +256,7 @@ xfs_inode_from_disk( if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { inode_set_iversion_queried(inode, be64_to_cpu(from->di_changecount)); - to->di_crtime.tv_sec = be32_to_cpu(from->di_crtime.t_sec); - to->di_crtime.tv_nsec = be32_to_cpu(from->di_crtime.t_nsec); + to->di_crtime = xfs_inode_from_disk_ts(from, from->di_crtime); to->di_flags2 = be64_to_cpu(from->di_flags2); to->di_cowextsize = be32_to_cpu(from->di_cowextsize); } @@ -252,6 +278,25 @@ out_destroy_data_fork: return error; } +/* Convert an incore timestamp to an ondisk timestamp. 
*/ +static inline xfs_timestamp_t +xfs_inode_to_disk_ts( + struct xfs_inode *ip, + const struct timespec64 tv) +{ + struct xfs_legacy_timestamp *lts; + xfs_timestamp_t ts; + + if (xfs_inode_has_bigtime(ip)) + return cpu_to_be64(xfs_inode_encode_bigtime(tv)); + + lts = (struct xfs_legacy_timestamp *)&ts; + lts->t_sec = cpu_to_be32(tv.tv_sec); + lts->t_nsec = cpu_to_be32(tv.tv_nsec); + + return ts; +} + void xfs_inode_to_disk( struct xfs_inode *ip, @@ -271,12 +316,9 @@ xfs_inode_to_disk( to->di_projid_hi = cpu_to_be16(from->di_projid >> 16); memset(to->di_pad, 0, sizeof(to->di_pad)); - to->di_atime.t_sec = cpu_to_be32(inode->i_atime.tv_sec); - to->di_atime.t_nsec = cpu_to_be32(inode->i_atime.tv_nsec); - to->di_mtime.t_sec = cpu_to_be32(inode->i_mtime.tv_sec); - to->di_mtime.t_nsec = cpu_to_be32(inode->i_mtime.tv_nsec); - to->di_ctime.t_sec = cpu_to_be32(inode->i_ctime.tv_sec); - to->di_ctime.t_nsec = cpu_to_be32(inode->i_ctime.tv_nsec); + to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime); + to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime); + to->di_ctime = xfs_inode_to_disk_ts(ip, inode->i_ctime); to->di_nlink = cpu_to_be32(inode->i_nlink); to->di_gen = cpu_to_be32(inode->i_generation); to->di_mode = cpu_to_be16(inode->i_mode); @@ -295,8 +337,7 @@ xfs_inode_to_disk( if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { to->di_version = 3; to->di_changecount = cpu_to_be64(inode_peek_iversion(inode)); - to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.tv_sec); - to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.tv_nsec); + to->di_crtime = xfs_inode_to_disk_ts(ip, from->di_crtime); to->di_flags2 = cpu_to_be64(from->di_flags2); to->di_cowextsize = cpu_to_be32(from->di_cowextsize); to->di_ino = cpu_to_be64(ip->i_ino); @@ -310,58 +351,6 @@ xfs_inode_to_disk( } } -void -xfs_log_dinode_to_disk( - struct xfs_log_dinode *from, - struct xfs_dinode *to) -{ - to->di_magic = cpu_to_be16(from->di_magic); - to->di_mode = cpu_to_be16(from->di_mode); - to->di_version = from->di_version; - to->di_format = from->di_format; - to->di_onlink = 0; - to->di_uid = cpu_to_be32(from->di_uid); - to->di_gid = cpu_to_be32(from->di_gid); - to->di_nlink = cpu_to_be32(from->di_nlink); - to->di_projid_lo = cpu_to_be16(from->di_projid_lo); - to->di_projid_hi = cpu_to_be16(from->di_projid_hi); - memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); - - to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); - to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec); - to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec); - to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec); - to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec); - to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec); - - to->di_size = cpu_to_be64(from->di_size); - to->di_nblocks = cpu_to_be64(from->di_nblocks); - to->di_extsize = cpu_to_be32(from->di_extsize); - to->di_nextents = cpu_to_be32(from->di_nextents); - to->di_anextents = cpu_to_be16(from->di_anextents); - to->di_forkoff = from->di_forkoff; - to->di_aformat = from->di_aformat; - to->di_dmevmask = cpu_to_be32(from->di_dmevmask); - to->di_dmstate = cpu_to_be16(from->di_dmstate); - to->di_flags = cpu_to_be16(from->di_flags); - to->di_gen = cpu_to_be32(from->di_gen); - - if (from->di_version == 3) { - to->di_changecount = cpu_to_be64(from->di_changecount); - to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); - to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); - to->di_flags2 = cpu_to_be64(from->di_flags2); - to->di_cowextsize = 
cpu_to_be32(from->di_cowextsize); - to->di_ino = cpu_to_be64(from->di_ino); - to->di_lsn = cpu_to_be64(from->di_lsn); - memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); - uuid_copy(&to->di_uuid, &from->di_uuid); - to->di_flushiter = 0; - } else { - to->di_flushiter = cpu_to_be16(from->di_flushiter); - } -} - static xfs_failaddr_t xfs_dinode_verify_fork( struct xfs_dinode *dip, @@ -568,6 +557,11 @@ xfs_dinode_verify( if (fa) return fa; + /* bigtime iflag can only happen on bigtime filesystems */ + if (xfs_dinode_has_bigtime(dip) && + !xfs_sb_version_hasbigtime(&mp->m_sb)) + return __this_address; + return NULL; } diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 6b08b9d060c2..ef5eaf33d146 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -17,7 +17,7 @@ struct xfs_dinode; */ struct xfs_icdinode { uint16_t di_flushiter; /* incremented on flush */ - uint32_t di_projid; /* owner's project id */ + prid_t di_projid; /* owner's project id */ xfs_fsize_t di_size; /* number of bytes in file */ xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */ xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ @@ -32,6 +32,11 @@ struct xfs_icdinode { struct timespec64 di_crtime; /* time created */ }; +static inline bool xfs_icdinode_has_bigtime(const struct xfs_icdinode *icd) +{ + return icd->di_flags2 & XFS_DIFLAG2_BIGTIME; +} + /* * Inode location information. Stored in the inode and passed to * xfs_imap_to_bp() to get a buffer and dinode for a given inode. @@ -49,8 +54,6 @@ void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *); void xfs_inode_to_disk(struct xfs_inode *ip, struct xfs_dinode *to, xfs_lsn_t lsn); int xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from); -void xfs_log_dinode_to_disk(struct xfs_log_dinode *from, - struct xfs_dinode *to); xfs_failaddr_t xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino, struct xfs_dinode *dip); @@ -60,4 +63,12 @@ xfs_failaddr_t xfs_inode_validate_cowextsize(struct xfs_mount *mp, uint32_t cowextsize, uint16_t mode, uint16_t flags, uint64_t flags2); +static inline uint64_t xfs_inode_encode_bigtime(struct timespec64 tv) +{ + return xfs_unix_to_bigtime(tv.tv_sec) * NSEC_PER_SEC + tv.tv_nsec; +} + +struct timespec64 xfs_inode_from_disk_ts(struct xfs_dinode *dip, + const xfs_timestamp_t ts); + #endif /* __XFS_INODE_BUF_H__ */ diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 0cf853d42d62..7575de5cecb1 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -386,8 +386,8 @@ xfs_iroot_realloc( cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); new_max = cur_max + rec_diff; new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); - ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, - KM_NOFS); + ifp->if_broot = krealloc(ifp->if_broot, new_size, + GFP_NOFS | __GFP_NOFAIL); op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, ifp->if_broot_bytes); np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, @@ -496,8 +496,8 @@ xfs_idata_realloc( * in size so that it can be logged and stay on word boundaries. * We enforce that here. 
*/ - ifp->if_u1.if_data = kmem_realloc(ifp->if_u1.if_data, - roundup(new_size, 4), KM_NOFS); + ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, roundup(new_size, 4), + GFP_NOFS | __GFP_NOFAIL); ifp->if_bytes = new_size; } diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index e3400c9c71cd..8bd00da6d2a4 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -368,10 +368,13 @@ static inline int xfs_ilog_fdata(int w) * directly mirrors the xfs_dinode structure as it must contain all the same * information. */ -typedef struct xfs_ictimestamp { +typedef uint64_t xfs_ictimestamp_t; + +/* Legacy timestamp encoding format. */ +struct xfs_legacy_ictimestamp { int32_t t_sec; /* timestamp seconds */ int32_t t_nsec; /* timestamp nanoseconds */ -} xfs_ictimestamp_t; +}; /* * Define the format of the inode core that is logged. This structure must be diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 641132d0e39d..3cca2bfe714c 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -121,7 +121,6 @@ struct xlog_recover { void xlog_buf_readahead(struct xlog *log, xfs_daddr_t blkno, uint len, const struct xfs_buf_ops *ops); bool xlog_is_buffer_cancelled(struct xlog *log, xfs_daddr_t blkno, uint len); -void xlog_recover_iodone(struct xfs_buf *bp); void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type, uint64_t intent_id); diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index 076bdc7037ee..0f0af4e35032 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -23,7 +23,8 @@ typedef uint8_t xfs_dqtype_t; #define XFS_DQTYPE_STRINGS \ { XFS_DQTYPE_USER, "USER" }, \ { XFS_DQTYPE_PROJ, "PROJ" }, \ - { XFS_DQTYPE_GROUP, "GROUP" } + { XFS_DQTYPE_GROUP, "GROUP" }, \ + { XFS_DQTYPE_BIGTIME, "BIGTIME" } /* * flags for q_flags field in the dquot. @@ -143,4 +144,9 @@ extern int xfs_calc_dquots_per_chunk(unsigned int nbblks); extern void xfs_dqblk_repair(struct xfs_mount *mp, struct xfs_dqblk *dqb, xfs_dqid_t id, xfs_dqtype_t type); +struct xfs_dquot; +time64_t xfs_dquot_from_disk_ts(struct xfs_disk_dquot *ddq, + __be32 dtimer); +__be32 xfs_dquot_to_disk_ts(struct xfs_dquot *ddq, time64_t timer); + #endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 27c39268c31f..340c83f76c80 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -2505,12 +2505,15 @@ xfs_rmap_map_extent( int whichfork, struct xfs_bmbt_irec *PREV) { + enum xfs_rmap_intent_type type = XFS_RMAP_MAP; + if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) return; - __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? - XFS_RMAP_MAP_SHARED : XFS_RMAP_MAP, ip->i_ino, - whichfork, PREV); + if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip)) + type = XFS_RMAP_MAP_SHARED; + + __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV); } /* Unmap an extent out of a file. */ @@ -2521,12 +2524,15 @@ xfs_rmap_unmap_extent( int whichfork, struct xfs_bmbt_irec *PREV) { + enum xfs_rmap_intent_type type = XFS_RMAP_UNMAP; + if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) return; - __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? 
- XFS_RMAP_UNMAP_SHARED : XFS_RMAP_UNMAP, ip->i_ino, - whichfork, PREV); + if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip)) + type = XFS_RMAP_UNMAP_SHARED; + + __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV); } /* @@ -2543,12 +2549,15 @@ xfs_rmap_convert_extent( int whichfork, struct xfs_bmbt_irec *PREV) { + enum xfs_rmap_intent_type type = XFS_RMAP_CONVERT; + if (!xfs_rmap_update_is_needed(mp, whichfork)) return; - __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? - XFS_RMAP_CONVERT_SHARED : XFS_RMAP_CONVERT, ip->i_ino, - whichfork, PREV); + if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip)) + type = XFS_RMAP_CONVERT_SHARED; + + __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV); } /* Schedule the creation of an rmap for non-file data. */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 1d9fa8a300f1..6c1aba16113c 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1018,7 +1018,6 @@ xfs_rtalloc_query_range( struct xfs_mount *mp = tp->t_mountp; xfs_rtblock_t rtstart; xfs_rtblock_t rtend; - xfs_rtblock_t rem; int is_free; int error = 0; @@ -1027,13 +1026,12 @@ xfs_rtalloc_query_range( if (low_rec->ar_startext >= mp->m_sb.sb_rextents || low_rec->ar_startext == high_rec->ar_startext) return 0; - if (high_rec->ar_startext > mp->m_sb.sb_rextents) - high_rec->ar_startext = mp->m_sb.sb_rextents; + high_rec->ar_startext = min(high_rec->ar_startext, + mp->m_sb.sb_rextents - 1); /* Iterate the bitmap, looking for discrepancies. */ rtstart = low_rec->ar_startext; - rem = high_rec->ar_startext - rtstart; - while (rem) { + while (rtstart <= high_rec->ar_startext) { /* Is the first block free? */ error = xfs_rtcheck_range(mp, tp, rtstart, 1, 1, &rtend, &is_free); @@ -1042,7 +1040,7 @@ xfs_rtalloc_query_range( /* How long does the extent go for? 
*/ error = xfs_rtfind_forw(mp, tp, rtstart, - high_rec->ar_startext - 1, &rtend); + high_rec->ar_startext, &rtend); if (error) break; @@ -1055,7 +1053,6 @@ xfs_rtalloc_query_range( break; } - rem -= rtend - rtstart + 1; rtstart = rtend + 1; } diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index ae9aaf1f34bf..5aeafa59ed27 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -954,7 +954,7 @@ xfs_log_sb( struct xfs_trans *tp) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_buf *bp = xfs_trans_getsb(tp, mp); + struct xfs_buf *bp = xfs_trans_getsb(tp); mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree); @@ -1084,7 +1084,7 @@ xfs_sync_sb_buf( if (error) return error; - bp = xfs_trans_getsb(tp, mp); + bp = xfs_trans_getsb(tp); xfs_log_sb(tp); xfs_trans_bhold(tp, bp); xfs_trans_set_sync(tp); @@ -1166,6 +1166,8 @@ xfs_fs_geometry( geo->flags |= XFS_FSOP_GEOM_FLAGS_RMAPBT; if (xfs_sb_version_hasreflink(sbp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_REFLINK; + if (xfs_sb_version_hasbigtime(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_BIGTIME; if (xfs_sb_version_hassector(sbp)) geo->logsectsize = sbp->sb_logsectsize; else diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 708feb8eac76..c795ae47b3c9 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -176,6 +176,9 @@ struct xfs_ino_geometry { unsigned int ialloc_align; unsigned int agino_log; /* #bits for agino in inum */ + + /* precomputed value for di_flags2 */ + uint64_t new_diflags2; }; #endif /* __XFS_SHARED_H__ */ diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index b7e222befb08..90f1d5645052 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -132,6 +132,17 @@ xfs_trans_log_inode( } /* + * If we're updating the inode core or the timestamps and it's possible + * to upgrade this inode to bigtime format, do so now. + */ + if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) && + xfs_sb_version_hasbigtime(&ip->i_mount->m_sb) && + !xfs_inode_has_bigtime(ip)) { + ip->i_d.di_flags2 |= XFS_DIFLAG2_BIGTIME; + flags |= XFS_ILOG_CORE; + } + + /* * Record the specific change for fdatasync optimisation. This allows * fdatasync to skip log forces for inodes that are only timestamp * dirty. @@ -177,9 +188,9 @@ xfs_trans_log_inode( /* * Always OR in the bits from the ili_last_fields field. This is to - * coordinate with the xfs_iflush() and xfs_iflush_done() routines in - * the eventual clearing of the ili_fields bits. See the big comment in - * xfs_iflush() for an explanation of this coordination mechanism. + * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines + * in the eventual clearing of the ili_fields bits. See the big comment + * in xfs_iflush() for an explanation of this coordination mechanism. 
*/ iip->ili_fields |= (flags | iip->ili_last_fields | iversion_flags); spin_unlock(&iip->ili_lock); diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index e9bcf1faa183..ae8e2e0ac64a 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -781,6 +781,35 @@ xchk_agi_xref_icounts( xchk_block_xref_set_corrupt(sc, sc->sa.agi_bp); } +/* Check agi_[fi]blocks against tree size */ +static inline void +xchk_agi_xref_fiblocks( + struct xfs_scrub *sc) +{ + struct xfs_agi *agi = sc->sa.agi_bp->b_addr; + xfs_agblock_t blocks; + int error = 0; + + if (!xfs_sb_version_hasinobtcounts(&sc->mp->m_sb)) + return; + + if (sc->sa.ino_cur) { + error = xfs_btree_count_blocks(sc->sa.ino_cur, &blocks); + if (!xchk_should_check_xref(sc, &error, &sc->sa.ino_cur)) + return; + if (blocks != be32_to_cpu(agi->agi_iblocks)) + xchk_block_xref_set_corrupt(sc, sc->sa.agi_bp); + } + + if (sc->sa.fino_cur) { + error = xfs_btree_count_blocks(sc->sa.fino_cur, &blocks); + if (!xchk_should_check_xref(sc, &error, &sc->sa.fino_cur)) + return; + if (blocks != be32_to_cpu(agi->agi_fblocks)) + xchk_block_xref_set_corrupt(sc, sc->sa.agi_bp); + } +} + /* Cross-reference with the other btrees. */ STATIC void xchk_agi_xref( @@ -804,6 +833,7 @@ xchk_agi_xref( xchk_agi_xref_icounts(sc); xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); xchk_xref_is_not_shared(sc, agbno, 1); + xchk_agi_xref_fiblocks(sc); /* scrub teardown will take care of sc->sa for us */ } diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index bca2ab1d4be9..401f71579ce6 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -810,10 +810,34 @@ xrep_agi_calc_from_btrees( error = xfs_ialloc_count_inodes(cur, &count, &freecount); if (error) goto err; + if (xfs_sb_version_hasinobtcounts(&mp->m_sb)) { + xfs_agblock_t blocks; + + error = xfs_btree_count_blocks(cur, &blocks); + if (error) + goto err; + agi->agi_iblocks = cpu_to_be32(blocks); + } xfs_btree_del_cursor(cur, error); agi->agi_count = cpu_to_be32(count); agi->agi_freecount = cpu_to_be32(freecount); + + if (xfs_sb_version_hasfinobt(&mp->m_sb) && + xfs_sb_version_hasinobtcounts(&mp->m_sb)) { + xfs_agblock_t blocks; + + cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp, sc->sa.agno, + XFS_BTNUM_FINO); + if (error) + goto err; + error = xfs_btree_count_blocks(cur, &blocks); + if (error) + goto err; + xfs_btree_del_cursor(cur, error); + agi->agi_fblocks = cpu_to_be32(blocks); + } + return 0; err: xfs_btree_del_cursor(cur, error); diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index e56786f0a13c..653f3280e1c1 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -441,6 +441,20 @@ xchk_da_btree_block( goto out_freebp; } + /* + * If we've been handed a block that is below the dabtree root, does + * its hashval match what the parent block expected to see? 
+ */ + if (level > 0) { + struct xfs_da_node_entry *key; + + key = xchk_da_btree_node_entry(ds, level - 1); + if (be32_to_cpu(key->hashval) != blk->hashval) { + xchk_da_set_corrupt(ds, level); + goto out_freebp; + } + } + out: return error; out_freebp: diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 6d483ab29e63..3aa85b64de36 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -190,11 +190,30 @@ xchk_inode_flags2( if ((flags2 & XFS_DIFLAG2_DAX) && (flags2 & XFS_DIFLAG2_REFLINK)) goto bad; + /* no bigtime iflag without the bigtime feature */ + if (xfs_dinode_has_bigtime(dip) && + !xfs_sb_version_hasbigtime(&mp->m_sb)) + goto bad; + return; bad: xchk_ino_set_corrupt(sc, ino); } +static inline void +xchk_dinode_nsec( + struct xfs_scrub *sc, + xfs_ino_t ino, + struct xfs_dinode *dip, + const xfs_timestamp_t ts) +{ + struct timespec64 tv; + + tv = xfs_inode_from_disk_ts(dip, ts); + if (tv.tv_nsec < 0 || tv.tv_nsec >= NSEC_PER_SEC) + xchk_ino_set_corrupt(sc, ino); +} + /* Scrub all the ondisk inode fields. */ STATIC void xchk_dinode( @@ -293,12 +312,9 @@ xchk_dinode( } /* di_[amc]time.nsec */ - if (be32_to_cpu(dip->di_atime.t_nsec) >= NSEC_PER_SEC) - xchk_ino_set_corrupt(sc, ino); - if (be32_to_cpu(dip->di_mtime.t_nsec) >= NSEC_PER_SEC) - xchk_ino_set_corrupt(sc, ino); - if (be32_to_cpu(dip->di_ctime.t_nsec) >= NSEC_PER_SEC) - xchk_ino_set_corrupt(sc, ino); + xchk_dinode_nsec(sc, ino, dip, dip->di_atime); + xchk_dinode_nsec(sc, ino, dip, dip->di_mtime); + xchk_dinode_nsec(sc, ino, dip, dip->di_ctime); /* * di_size. xfs_dinode_verify checks for things that screw up @@ -403,8 +419,7 @@ xchk_dinode( } if (dip->di_version >= 3) { - if (be32_to_cpu(dip->di_crtime.t_nsec) >= NSEC_PER_SEC) - xchk_ino_set_corrupt(sc, ino); + xchk_dinode_nsec(sc, ino, dip, dip->di_crtime); xchk_inode_flags2(sc, dip, ino, mode, flags, flags2); xchk_inode_cowextsize(sc, dip, ino, mode, flags, flags2); diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index 5641ae512c9e..c08be5ede066 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -22,7 +22,7 @@ xchk_setup_symlink( struct xfs_inode *ip) { /* Allocate the buffer without the inode lock held. 
*/ - sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, 0); + sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, GFP_KERNEL); if (!sc->buf) return -ENOMEM; diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index d4c687b5cd06..c544951a0c07 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -192,7 +192,7 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (acl) { args.valuelen = XFS_ACL_SIZE(acl->a_count); - args.value = kmem_zalloc_large(args.valuelen, 0); + args.value = kvzalloc(args.valuelen, GFP_KERNEL); if (!args.value) return -ENOMEM; xfs_acl_to_disk(args.value, acl); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index b35611882ff9..55d126d4e096 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -544,7 +544,7 @@ xfs_discard_page( page, ip->i_ino, offset); error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - PAGE_SIZE / i_blocksize(inode)); + i_blocks_per_page(inode, page)); if (error && !XFS_FORCED_SHUTDOWN(mp)) xfs_alert(mp, "page discard unable to remove delalloc mapping."); out_invalidate: diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 50f922cad91a..8f8837fe21cf 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -61,7 +61,7 @@ xfs_attr_shortform_list( int error = 0; ASSERT(dp->i_afp != NULL); - sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data; + sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data; ASSERT(sf != NULL); if (!sf->hdr.count) return 0; @@ -96,7 +96,7 @@ xfs_attr_shortform_list( */ if (context->seen_enough) break; - sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + sfe = xfs_attr_sf_nextentry(sfe); } trace_xfs_attr_list_sf_all(context); return 0; @@ -136,7 +136,7 @@ xfs_attr_shortform_list( /* These are bytes, and both on-disk, don't endian-flip */ sbp->valuelen = sfe->valuelen; sbp->flags = sfe->flags; - sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + sfe = xfs_attr_sf_nextentry(sfe); sbp++; nsbuf++; } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index ec3691372e7c..9e16a4d0f97c 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -24,6 +24,7 @@ #include "xfs_error.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" +#include "xfs_quota.h" kmem_zone_t *xfs_bui_zone; kmem_zone_t *xfs_bud_zone; @@ -423,30 +424,26 @@ const struct xfs_defer_op_type xfs_bmap_update_defer_type = { STATIC int xfs_bui_item_recover( struct xfs_log_item *lip, - struct xfs_trans *parent_tp) + struct list_head *capture_list) { struct xfs_bmbt_irec irec; struct xfs_bui_log_item *buip = BUI_ITEM(lip); struct xfs_trans *tp; struct xfs_inode *ip = NULL; - struct xfs_mount *mp = parent_tp->t_mountp; + struct xfs_mount *mp = lip->li_mountp; struct xfs_map_extent *bmap; struct xfs_bud_log_item *budp; xfs_fsblock_t startblock_fsb; xfs_fsblock_t inode_fsb; xfs_filblks_t count; xfs_exntst_t state; - enum xfs_bmap_intent_type type; - bool op_ok; unsigned int bui_type; int whichfork; int error = 0; /* Only one mapping operation per BUI... */ - if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) { - xfs_bui_release(buip); + if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) return -EFSCORRUPTED; - } /* * First check the validity of the extent described by the @@ -457,76 +454,58 @@ xfs_bui_item_recover( XFS_FSB_TO_DADDR(mp, bmap->me_startblock)); inode_fsb = XFS_BB_TO_FSB(mp, XFS_FSB_TO_DADDR(mp, XFS_INO_TO_FSB(mp, bmap->me_owner))); - switch (bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) { + state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? 
+ XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; + switch (bui_type) { case XFS_BMAP_MAP: case XFS_BMAP_UNMAP: - op_ok = true; break; default: - op_ok = false; - break; + return -EFSCORRUPTED; } - if (!op_ok || startblock_fsb == 0 || + if (startblock_fsb == 0 || bmap->me_len == 0 || inode_fsb == 0 || startblock_fsb >= mp->m_sb.sb_dblocks || bmap->me_len >= mp->m_sb.sb_agblocks || inode_fsb >= mp->m_sb.sb_dblocks || - (bmap->me_flags & ~XFS_BMAP_EXTENT_FLAGS)) { - /* - * This will pull the BUI from the AIL and - * free the memory associated with it. - */ - xfs_bui_release(buip); + (bmap->me_flags & ~XFS_BMAP_EXTENT_FLAGS)) return -EFSCORRUPTED; - } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, - XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp); + /* Grab the inode. */ + error = xfs_iget(mp, NULL, bmap->me_owner, 0, 0, &ip); if (error) return error; - /* - * Recovery stashes all deferred ops during intent processing and - * finishes them on completion. Transfer current dfops state to this - * transaction and transfer the result back before we return. - */ - xfs_defer_move(tp, parent_tp); - budp = xfs_trans_get_bud(tp, buip); - /* Grab the inode. */ - error = xfs_iget(mp, tp, bmap->me_owner, 0, XFS_ILOCK_EXCL, &ip); + error = xfs_qm_dqattach(ip); if (error) - goto err_inode; + goto err_rele; if (VFS_I(ip)->i_nlink == 0) xfs_iflags_set(ip, XFS_IRECOVERY); - /* Process deferred bmap item. */ - state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? - XFS_EXT_UNWRITTEN : XFS_EXT_NORM; - whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; - bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; - switch (bui_type) { - case XFS_BMAP_MAP: - case XFS_BMAP_UNMAP: - type = bui_type; - break; - default: - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); - error = -EFSCORRUPTED; - goto err_inode; - } + /* Allocate transaction and do the work. */ + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp); + if (error) + goto err_rele; + + budp = xfs_trans_get_bud(tp, buip); + xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); count = bmap->me_len; - error = xfs_trans_log_finish_bmap_update(tp, budp, type, ip, whichfork, - bmap->me_startoff, bmap->me_startblock, &count, state); + error = xfs_trans_log_finish_bmap_update(tp, budp, bui_type, ip, + whichfork, bmap->me_startoff, bmap->me_startblock, + &count, state); if (error) - goto err_inode; + goto err_cancel; if (count > 0) { - ASSERT(type == XFS_BMAP_UNMAP); + ASSERT(bui_type == XFS_BMAP_UNMAP); irec.br_startblock = bmap->me_startblock; irec.br_blockcount = count; irec.br_startoff = bmap->me_startoff; @@ -534,20 +513,24 @@ xfs_bui_item_recover( xfs_bmap_unmap_extent(tp, ip, &irec); } - xfs_defer_move(parent_tp, tp); - error = xfs_trans_commit(tp); + /* + * Commit transaction, which frees the transaction and saves the inode + * for later replay activities. 
+ */ + error = xfs_defer_ops_capture_and_commit(tp, ip, capture_list); + if (error) + goto err_unlock; + xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_irele(ip); + return 0; - return error; - -err_inode: - xfs_defer_move(parent_tp, tp); +err_cancel: xfs_trans_cancel(tp); - if (ip) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_irele(ip); - } +err_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); +err_rele: + xfs_irele(ip); return error; } @@ -559,6 +542,32 @@ xfs_bui_item_match( return BUI_ITEM(lip)->bui_format.bui_id == intent_id; } +/* Relog an intent item to push the log tail forward. */ +static struct xfs_log_item * +xfs_bui_item_relog( + struct xfs_log_item *intent, + struct xfs_trans *tp) +{ + struct xfs_bud_log_item *budp; + struct xfs_bui_log_item *buip; + struct xfs_map_extent *extp; + unsigned int count; + + count = BUI_ITEM(intent)->bui_format.bui_nextents; + extp = BUI_ITEM(intent)->bui_format.bui_extents; + + tp->t_flags |= XFS_TRANS_DIRTY; + budp = xfs_trans_get_bud(tp, BUI_ITEM(intent)); + set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags); + + buip = xfs_bui_init(tp->t_mountp); + memcpy(buip->bui_format.bui_extents, extp, count * sizeof(*extp)); + atomic_set(&buip->bui_next_extent, count); + xfs_trans_add_item(tp, &buip->bui_item); + set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); + return &buip->bui_item; +} + static const struct xfs_item_ops xfs_bui_item_ops = { .iop_size = xfs_bui_item_size, .iop_format = xfs_bui_item_format, @@ -566,6 +575,7 @@ static const struct xfs_item_ops xfs_bui_item_ops = { .iop_release = xfs_bui_item_release, .iop_recover = xfs_bui_item_recover, .iop_match = xfs_bui_item_match, + .iop_relog = xfs_bui_item_relog, }; /* diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 5123f82f2477..f2a8a0e75e1f 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -946,6 +946,14 @@ xfs_free_file_space( startoffset_fsb = XFS_B_TO_FSB(mp, offset); endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); + /* We can only free complete realtime extents. */ + if (XFS_IS_REALTIME_INODE(ip)) { + xfs_extlen_t extsz = xfs_get_extsz_hint(ip); + + if ((startoffset_fsb | endoffset_fsb) & (extsz - 1)) + return -EINVAL; + } + /* * Need to zero the stuff we're not freeing, on disk. */ @@ -1139,6 +1147,14 @@ xfs_insert_file_space( trace_xfs_insert_file_space(ip); + /* We can only insert complete realtime extents. 
*/ + if (XFS_IS_REALTIME_INODE(ip)) { + xfs_extlen_t extsz = xfs_get_extsz_hint(ip); + + if ((stop_fsb | shift_fsb) & (extsz - 1)) + return -EINVAL; + } + error = xfs_bmap_can_insert_extents(ip, stop_fsb, shift_fsb); if (error) return error; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index d4cdcb6fb2fe..4e4cf91f4f9f 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -52,6 +52,15 @@ static kmem_zone_t *xfs_buf_zone; * b_lock (trylock due to inversion) */ +static int __xfs_buf_submit(struct xfs_buf *bp, bool wait); + +static inline int +xfs_buf_submit( + struct xfs_buf *bp) +{ + return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC)); +} + static inline int xfs_buf_is_vmapped( struct xfs_buf *bp) @@ -751,7 +760,7 @@ found: return 0; } -STATIC int +int _xfs_buf_read( xfs_buf_t *bp, xfs_buf_flags_t flags) @@ -759,7 +768,7 @@ _xfs_buf_read( ASSERT(!(flags & XBF_WRITE)); ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL); - bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD); + bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE); bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); return xfs_buf_submit(bp); @@ -1170,20 +1179,145 @@ xfs_buf_wait_unpin( set_current_state(TASK_RUNNING); } +static void +xfs_buf_ioerror_alert_ratelimited( + struct xfs_buf *bp) +{ + static unsigned long lasttime; + static struct xfs_buftarg *lasttarg; + + if (bp->b_target != lasttarg || + time_after(jiffies, (lasttime + 5*HZ))) { + lasttime = jiffies; + xfs_buf_ioerror_alert(bp, __this_address); + } + lasttarg = bp->b_target; +} + /* - * Buffer Utility Routines + * Account for this latest trip around the retry handler, and decide if + * we've failed enough times to constitute a permanent failure. */ +static bool +xfs_buf_ioerror_permanent( + struct xfs_buf *bp, + struct xfs_error_cfg *cfg) +{ + struct xfs_mount *mp = bp->b_mount; -void + if (cfg->max_retries != XFS_ERR_RETRY_FOREVER && + ++bp->b_retries > cfg->max_retries) + return true; + if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && + time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time)) + return true; + + /* At unmount we may treat errors differently */ + if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount) + return true; + + return false; +} + +/* + * On a sync write or shutdown we just want to stale the buffer and let the + * caller handle the error in bp->b_error appropriately. + * + * If the write was asynchronous then no one will be looking for the error. If + * this is the first failure of this type, clear the error state and write the + * buffer out again. This means we always retry an async write failure at least + * once, but we also need to set the buffer up to behave correctly now for + * repeated failures. + * + * If we get repeated async write failures, then we take action according to the + * error configuration we have been set up to use. + * + * Returns true if this function took care of error handling and the caller must + * not touch the buffer again. Return false if the caller should proceed with + * normal I/O completion handling. + */ +static bool +xfs_buf_ioend_handle_error( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_mount; + struct xfs_error_cfg *cfg; + + /* + * If we've already decided to shutdown the filesystem because of I/O + * errors, there's no point in giving this a retry. + */ + if (XFS_FORCED_SHUTDOWN(mp)) + goto out_stale; + + xfs_buf_ioerror_alert_ratelimited(bp); + + /* + * We're not going to bother about retrying this during recovery. 
+ * One strike! + */ + if (bp->b_flags & _XBF_LOGRECOVERY) { + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + return false; + } + + /* + * Synchronous writes will have callers process the error. + */ + if (!(bp->b_flags & XBF_ASYNC)) + goto out_stale; + + trace_xfs_buf_iodone_async(bp, _RET_IP_); + + cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error); + if (bp->b_last_error != bp->b_error || + !(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL))) { + bp->b_last_error = bp->b_error; + if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && + !bp->b_first_retry_time) + bp->b_first_retry_time = jiffies; + goto resubmit; + } + + /* + * Permanent error - we need to trigger a shutdown if we haven't already + * to indicate that inconsistency will result from this action. + */ + if (xfs_buf_ioerror_permanent(bp, cfg)) { + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + goto out_stale; + } + + /* Still considered a transient error. Caller will schedule retries. */ + if (bp->b_flags & _XBF_INODES) + xfs_buf_inode_io_fail(bp); + else if (bp->b_flags & _XBF_DQUOTS) + xfs_buf_dquot_io_fail(bp); + else + ASSERT(list_empty(&bp->b_li_list)); + xfs_buf_ioerror(bp, 0); + xfs_buf_relse(bp); + return true; + +resubmit: + xfs_buf_ioerror(bp, 0); + bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL); + xfs_buf_submit(bp); + return true; +out_stale: + xfs_buf_stale(bp); + bp->b_flags |= XBF_DONE; + bp->b_flags &= ~XBF_WRITE; + trace_xfs_buf_error_relse(bp, _RET_IP_); + return false; +} + +static void xfs_buf_ioend( struct xfs_buf *bp) { - bool read = bp->b_flags & XBF_READ; - trace_xfs_buf_iodone(bp, _RET_IP_); - bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); - /* * Pull in IO completion errors now. We are guaranteed to be running * single threaded, so we don't need the lock to read b_io_error. @@ -1191,39 +1325,47 @@ xfs_buf_ioend( if (!bp->b_error && bp->b_io_error) xfs_buf_ioerror(bp, bp->b_io_error); - if (read) { + if (bp->b_flags & XBF_READ) { if (!bp->b_error && bp->b_ops) bp->b_ops->verify_read(bp); if (!bp->b_error) bp->b_flags |= XBF_DONE; - xfs_buf_ioend_finish(bp); - return; - } + } else { + if (!bp->b_error) { + bp->b_flags &= ~XBF_WRITE_FAIL; + bp->b_flags |= XBF_DONE; + } - if (!bp->b_error) { - bp->b_flags &= ~XBF_WRITE_FAIL; - bp->b_flags |= XBF_DONE; - } + if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp)) + return; - /* - * If this is a log recovery buffer, we aren't doing transactional IO - * yet so we need to let it handle IO completions. - */ - if (bp->b_flags & _XBF_LOGRECOVERY) { - xlog_recover_iodone(bp); - return; - } + /* clear the retry state */ + bp->b_last_error = 0; + bp->b_retries = 0; + bp->b_first_retry_time = 0; - if (bp->b_flags & _XBF_INODES) { - xfs_buf_inode_iodone(bp); - return; - } + /* + * Note that for things like remote attribute buffers, there may + * not be a buffer log item here, so processing the buffer log + * item must remain optional. + */ + if (bp->b_log_item) + xfs_buf_item_done(bp); + + if (bp->b_flags & _XBF_INODES) + xfs_buf_inode_iodone(bp); + else if (bp->b_flags & _XBF_DQUOTS) + xfs_buf_dquot_iodone(bp); - if (bp->b_flags & _XBF_DQUOTS) { - xfs_buf_dquot_iodone(bp); - return; } - xfs_buf_iodone(bp); + + bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD | + _XBF_LOGRECOVERY); + + if (bp->b_flags & XBF_ASYNC) + xfs_buf_relse(bp); + else + complete(&bp->b_iowait); } static void @@ -1506,7 +1648,7 @@ xfs_buf_iowait( * safe to reference the buffer after a call to this function unless the caller * holds an additional reference itself. 
*/ -int +static int __xfs_buf_submit( struct xfs_buf *bp, bool wait) diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 755b652e695a..bfd2907e7bc4 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -249,6 +249,7 @@ int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, int flags, int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr, size_t numblks, int flags, struct xfs_buf **bpp, const struct xfs_buf_ops *ops); +int _xfs_buf_read(struct xfs_buf *bp, xfs_buf_flags_t flags); void xfs_buf_hold(struct xfs_buf *bp); /* Releasing Buffers */ @@ -269,28 +270,12 @@ static inline void xfs_buf_relse(xfs_buf_t *bp) /* Buffer Read and Write Routines */ extern int xfs_bwrite(struct xfs_buf *bp); -extern void xfs_buf_ioend(struct xfs_buf *bp); -static inline void xfs_buf_ioend_finish(struct xfs_buf *bp) -{ - if (bp->b_flags & XBF_ASYNC) - xfs_buf_relse(bp); - else - complete(&bp->b_iowait); -} extern void __xfs_buf_ioerror(struct xfs_buf *bp, int error, xfs_failaddr_t failaddr); #define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address) extern void xfs_buf_ioerror_alert(struct xfs_buf *bp, xfs_failaddr_t fa); void xfs_buf_ioend_fail(struct xfs_buf *); - -extern int __xfs_buf_submit(struct xfs_buf *bp, bool); -static inline int xfs_buf_submit(struct xfs_buf *bp) -{ - bool wait = bp->b_flags & XBF_ASYNC ? false : true; - return __xfs_buf_submit(bp, wait); -} - void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize); void __xfs_buf_mark_corrupt(struct xfs_buf *bp, xfs_failaddr_t fa); #define xfs_buf_mark_corrupt(bp) __xfs_buf_mark_corrupt((bp), __this_address) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 408d1b572d3f..0356f2e340a1 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -30,8 +30,6 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip) return container_of(lip, struct xfs_buf_log_item, bli_item); } -static void xfs_buf_item_done(struct xfs_buf *bp); - /* Is this log iovec plausibly large enough to contain the buffer log format? */ bool xfs_buf_log_check_iovec( @@ -463,7 +461,7 @@ xfs_buf_item_unpin( */ if (bip->bli_flags & XFS_BLI_STALE_INODE) { xfs_buf_item_done(bp); - xfs_iflush_done(bp); + xfs_buf_inode_iodone(bp); ASSERT(list_empty(&bp->b_li_list)); } else { xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR); @@ -956,153 +954,10 @@ xfs_buf_item_relse( xfs_buf_item_free(bip); } -/* - * Decide if we're going to retry the write after a failure, and prepare - * the buffer for retrying the write. - */ -static bool -xfs_buf_ioerror_fail_without_retry( - struct xfs_buf *bp) -{ - struct xfs_mount *mp = bp->b_mount; - static ulong lasttime; - static xfs_buftarg_t *lasttarg; - - /* - * If we've already decided to shutdown the filesystem because of - * I/O errors, there's no point in giving this a retry. 
- */ - if (XFS_FORCED_SHUTDOWN(mp)) - return true; - - if (bp->b_target != lasttarg || - time_after(jiffies, (lasttime + 5*HZ))) { - lasttime = jiffies; - xfs_buf_ioerror_alert(bp, __this_address); - } - lasttarg = bp->b_target; - - /* synchronous writes will have callers process the error */ - if (!(bp->b_flags & XBF_ASYNC)) - return true; - return false; -} - -static bool -xfs_buf_ioerror_retry( - struct xfs_buf *bp, - struct xfs_error_cfg *cfg) -{ - if ((bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) && - bp->b_last_error == bp->b_error) - return false; - - bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL); - bp->b_last_error = bp->b_error; - if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && - !bp->b_first_retry_time) - bp->b_first_retry_time = jiffies; - return true; -} - -/* - * Account for this latest trip around the retry handler, and decide if - * we've failed enough times to constitute a permanent failure. - */ -static bool -xfs_buf_ioerror_permanent( - struct xfs_buf *bp, - struct xfs_error_cfg *cfg) -{ - struct xfs_mount *mp = bp->b_mount; - - if (cfg->max_retries != XFS_ERR_RETRY_FOREVER && - ++bp->b_retries > cfg->max_retries) - return true; - if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && - time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time)) - return true; - - /* At unmount we may treat errors differently */ - if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount) - return true; - - return false; -} - -/* - * On a sync write or shutdown we just want to stale the buffer and let the - * caller handle the error in bp->b_error appropriately. - * - * If the write was asynchronous then no one will be looking for the error. If - * this is the first failure of this type, clear the error state and write the - * buffer out again. This means we always retry an async write failure at least - * once, but we also need to set the buffer up to behave correctly now for - * repeated failures. - * - * If we get repeated async write failures, then we take action according to the - * error configuration we have been set up to use. - * - * Multi-state return value: - * - * XBF_IOERROR_FINISH: clear IO error retry state and run callback completions - * XBF_IOERROR_DONE: resubmitted immediately, do not run any completions - * XBF_IOERROR_FAIL: transient error, run failure callback completions and then - * release the buffer - */ -enum { - XBF_IOERROR_FINISH, - XBF_IOERROR_DONE, - XBF_IOERROR_FAIL, -}; - -static int -xfs_buf_iodone_error( - struct xfs_buf *bp) -{ - struct xfs_mount *mp = bp->b_mount; - struct xfs_error_cfg *cfg; - - if (xfs_buf_ioerror_fail_without_retry(bp)) - goto out_stale; - - trace_xfs_buf_item_iodone_async(bp, _RET_IP_); - - cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error); - if (xfs_buf_ioerror_retry(bp, cfg)) { - xfs_buf_ioerror(bp, 0); - xfs_buf_submit(bp); - return XBF_IOERROR_DONE; - } - - /* - * Permanent error - we need to trigger a shutdown if we haven't already - * to indicate that inconsistency will result from this action. - */ - if (xfs_buf_ioerror_permanent(bp, cfg)) { - xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); - goto out_stale; - } - - /* Still considered a transient error. Caller will schedule retries. 
*/ - return XBF_IOERROR_FAIL; - -out_stale: - xfs_buf_stale(bp); - bp->b_flags |= XBF_DONE; - trace_xfs_buf_error_relse(bp, _RET_IP_); - return XBF_IOERROR_FINISH; -} - -static void +void xfs_buf_item_done( struct xfs_buf *bp) { - struct xfs_buf_log_item *bip = bp->b_log_item; - - if (!bip) - return; - /* * If we are forcibly shutting down, this may well be off the AIL * already. That's because we simulate the log-committed callbacks to @@ -1111,113 +966,12 @@ xfs_buf_item_done( * xfs_trans_ail_delete() takes care of these. * * Either way, AIL is useless if we're forcing a shutdown. + * + * Note that log recovery writes might have buffer items that are not on + * the AIL even when the file system is not shut down. */ - xfs_trans_ail_delete(&bip->bli_item, SHUTDOWN_CORRUPT_INCORE); - bp->b_log_item = NULL; - xfs_buf_item_free(bip); - xfs_buf_rele(bp); -} - -static inline void -xfs_buf_clear_ioerror_retry_state( - struct xfs_buf *bp) -{ - bp->b_last_error = 0; - bp->b_retries = 0; - bp->b_first_retry_time = 0; -} - -/* - * Inode buffer iodone callback function. - */ -void -xfs_buf_inode_iodone( - struct xfs_buf *bp) -{ - if (bp->b_error) { - struct xfs_log_item *lip; - int ret = xfs_buf_iodone_error(bp); - - if (ret == XBF_IOERROR_FINISH) - goto finish_iodone; - if (ret == XBF_IOERROR_DONE) - return; - ASSERT(ret == XBF_IOERROR_FAIL); - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { - set_bit(XFS_LI_FAILED, &lip->li_flags); - } - xfs_buf_ioerror(bp, 0); - xfs_buf_relse(bp); - return; - } - -finish_iodone: - xfs_buf_clear_ioerror_retry_state(bp); - xfs_buf_item_done(bp); - xfs_iflush_done(bp); - xfs_buf_ioend_finish(bp); -} - -/* - * Dquot buffer iodone callback function. - */ -void -xfs_buf_dquot_iodone( - struct xfs_buf *bp) -{ - if (bp->b_error) { - struct xfs_log_item *lip; - int ret = xfs_buf_iodone_error(bp); - - if (ret == XBF_IOERROR_FINISH) - goto finish_iodone; - if (ret == XBF_IOERROR_DONE) - return; - ASSERT(ret == XBF_IOERROR_FAIL); - spin_lock(&bp->b_mount->m_ail->ail_lock); - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { - xfs_set_li_failed(lip, bp); - } - spin_unlock(&bp->b_mount->m_ail->ail_lock); - xfs_buf_ioerror(bp, 0); - xfs_buf_relse(bp); - return; - } - -finish_iodone: - xfs_buf_clear_ioerror_retry_state(bp); - /* a newly allocated dquot buffer might have a log item attached */ - xfs_buf_item_done(bp); - xfs_dquot_done(bp); - xfs_buf_ioend_finish(bp); -} - -/* - * Dirty buffer iodone callback function. - * - * Note that for things like remote attribute buffers, there may not be a buffer - * log item here, so processing the buffer log item must remain be optional. - */ -void -xfs_buf_iodone( - struct xfs_buf *bp) -{ - if (bp->b_error) { - int ret = xfs_buf_iodone_error(bp); - - if (ret == XBF_IOERROR_FINISH) - goto finish_iodone; - if (ret == XBF_IOERROR_DONE) - return; - ASSERT(ret == XBF_IOERROR_FAIL); - ASSERT(list_empty(&bp->b_li_list)); - xfs_buf_ioerror(bp, 0); - xfs_buf_relse(bp); - return; - } - -finish_iodone: - xfs_buf_clear_ioerror_retry_state(bp); - xfs_buf_item_done(bp); - xfs_buf_ioend_finish(bp); + xfs_trans_ail_delete(&bp->b_log_item->bli_item, + (bp->b_flags & _XBF_LOGRECOVERY) ? 
0 : + SHUTDOWN_CORRUPT_INCORE); + xfs_buf_item_relse(bp); } diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 23507cbb4c41..50aa0f5ef959 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -50,12 +50,24 @@ struct xfs_buf_log_item { }; int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); +void xfs_buf_item_done(struct xfs_buf *bp); void xfs_buf_item_relse(struct xfs_buf *); bool xfs_buf_item_put(struct xfs_buf_log_item *); void xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint); bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *); void xfs_buf_inode_iodone(struct xfs_buf *); +void xfs_buf_inode_io_fail(struct xfs_buf *bp); +#ifdef CONFIG_XFS_QUOTA void xfs_buf_dquot_iodone(struct xfs_buf *); +void xfs_buf_dquot_io_fail(struct xfs_buf *bp); +#else +static inline void xfs_buf_dquot_iodone(struct xfs_buf *bp) +{ +} +static inline void xfs_buf_dquot_io_fail(struct xfs_buf *bp) +{ +} +#endif /* CONFIG_XFS_QUOTA */ void xfs_buf_iodone(struct xfs_buf *); bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec); diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 8f0457d67d77..d44e8b4a3391 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -414,7 +414,7 @@ xlog_recover_validate_buf_type( * * Write verifiers update the metadata LSN from log items attached to * the buffer. Therefore, initialize a bli purely to carry the LSN to - * the verifier. We'll clean it up in our ->iodone() callback. + * the verifier. */ if (bp->b_ops) { struct xfs_buf_log_item *bip; @@ -719,6 +719,8 @@ xlog_recover_get_buf_lsn( case XFS_ABTC_MAGIC: case XFS_RMAP_CRC_MAGIC: case XFS_REFC_CRC_MAGIC: + case XFS_FIBT_CRC_MAGIC: + case XFS_FIBT_MAGIC: case XFS_IBT_CRC_MAGIC: case XFS_IBT_MAGIC: { struct xfs_btree_block *btb = blk; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index bcd73b9c2994..1d95ed387d66 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -98,12 +98,33 @@ xfs_qm_adjust_dqlimits( xfs_dquot_set_prealloc_limits(dq); } +/* Set the expiration time of a quota's grace period. */ +time64_t +xfs_dquot_set_timeout( + struct xfs_mount *mp, + time64_t timeout) +{ + struct xfs_quotainfo *qi = mp->m_quotainfo; + + return clamp_t(time64_t, timeout, qi->qi_expiry_min, + qi->qi_expiry_max); +} + +/* Set the length of the default grace period. */ +time64_t +xfs_dquot_set_grace_period( + time64_t grace) +{ + return clamp_t(time64_t, grace, XFS_DQ_GRACE_MIN, XFS_DQ_GRACE_MAX); +} + /* * Determine if this quota counter is over either limit and set the quota * timers as appropriate. 
*/ static inline void xfs_qm_adjust_res_timer( + struct xfs_mount *mp, struct xfs_dquot_res *res, struct xfs_quota_limits *qlim) { @@ -112,7 +133,8 @@ xfs_qm_adjust_res_timer( if ((res->softlimit && res->count > res->softlimit) || (res->hardlimit && res->count > res->hardlimit)) { if (res->timer == 0) - res->timer = ktime_get_real_seconds() + qlim->time; + res->timer = xfs_dquot_set_timeout(mp, + ktime_get_real_seconds() + qlim->time); } else { if (res->timer == 0) res->warnings = 0; @@ -145,9 +167,9 @@ xfs_qm_adjust_dqtimers( ASSERT(dq->q_id); defq = xfs_get_defquota(qi, xfs_dquot_type(dq)); - xfs_qm_adjust_res_timer(&dq->q_blk, &defq->blk); - xfs_qm_adjust_res_timer(&dq->q_ino, &defq->ino); - xfs_qm_adjust_res_timer(&dq->q_rtb, &defq->rtb); + xfs_qm_adjust_res_timer(dq->q_mount, &dq->q_blk, &defq->blk); + xfs_qm_adjust_res_timer(dq->q_mount, &dq->q_ino, &defq->ino); + xfs_qm_adjust_res_timer(dq->q_mount, &dq->q_rtb, &defq->rtb); } /* @@ -201,6 +223,8 @@ xfs_qm_init_dquot_blk( d->dd_diskdq.d_version = XFS_DQUOT_VERSION; d->dd_diskdq.d_id = cpu_to_be32(curid); d->dd_diskdq.d_type = type; + if (curid > 0 && xfs_sb_version_hasbigtime(&mp->m_sb)) + d->dd_diskdq.d_type |= XFS_DQTYPE_BIGTIME; if (xfs_sb_version_hascrc(&mp->m_sb)) { uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid); xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), @@ -514,9 +538,9 @@ xfs_dquot_from_disk( dqp->q_ino.warnings = be16_to_cpu(ddqp->d_iwarns); dqp->q_rtb.warnings = be16_to_cpu(ddqp->d_rtbwarns); - dqp->q_blk.timer = be32_to_cpu(ddqp->d_btimer); - dqp->q_ino.timer = be32_to_cpu(ddqp->d_itimer); - dqp->q_rtb.timer = be32_to_cpu(ddqp->d_rtbtimer); + dqp->q_blk.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_btimer); + dqp->q_ino.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_itimer); + dqp->q_rtb.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_rtbtimer); /* * Reservation counters are defined as reservation plus current usage @@ -559,9 +583,9 @@ xfs_dquot_to_disk( ddqp->d_iwarns = cpu_to_be16(dqp->q_ino.warnings); ddqp->d_rtbwarns = cpu_to_be16(dqp->q_rtb.warnings); - ddqp->d_btimer = cpu_to_be32(dqp->q_blk.timer); - ddqp->d_itimer = cpu_to_be32(dqp->q_ino.timer); - ddqp->d_rtbtimer = cpu_to_be32(dqp->q_rtb.timer); + ddqp->d_btimer = xfs_dquot_to_disk_ts(dqp, dqp->q_blk.timer); + ddqp->d_itimer = xfs_dquot_to_disk_ts(dqp, dqp->q_ino.timer); + ddqp->d_rtbtimer = xfs_dquot_to_disk_ts(dqp, dqp->q_rtb.timer); } /* Allocate and initialize the dquot buffer for this in-core dquot. */ @@ -807,8 +831,8 @@ xfs_qm_dqget_checks( } /* - * Given the file system, id, and type (UDQUOT/GDQUOT), return a locked - * dquot, doing an allocation (if requested) as needed. + * Given the file system, id, and type (UDQUOT/GDQUOT/PDQUOT), return a + * locked dquot, doing an allocation (if requested) as needed. */ int xfs_qm_dqget( @@ -1107,7 +1131,7 @@ xfs_qm_dqflush_done( } void -xfs_dquot_done( +xfs_buf_dquot_iodone( struct xfs_buf *bp) { struct xfs_log_item *lip, *n; @@ -1118,6 +1142,18 @@ xfs_dquot_done( } } +void +xfs_buf_dquot_io_fail( + struct xfs_buf *bp) +{ + struct xfs_log_item *lip; + + spin_lock(&bp->b_mount->m_ail->ail_lock); + list_for_each_entry(lip, &bp->b_li_list, li_bio_list) + xfs_set_li_failed(lip, bp); + spin_unlock(&bp->b_mount->m_ail->ail_lock); +} + /* Check incore dquot for errors before we flush. 
*/ static xfs_failaddr_t xfs_qm_dqflush_check( @@ -1145,6 +1181,14 @@ xfs_qm_dqflush_check( !dqp->q_rtb.timer) return __this_address; + /* bigtime flag should never be set on root dquots */ + if (dqp->q_type & XFS_DQTYPE_BIGTIME) { + if (!xfs_sb_version_hasbigtime(&dqp->q_mount->m_sb)) + return __this_address; + if (dqp->q_id == 0) + return __this_address; + } + return NULL; } diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 282a65da93c7..f642884a6834 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -237,4 +237,7 @@ typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq, int xfs_qm_dqiterate(struct xfs_mount *mp, xfs_dqtype_t type, xfs_qm_dqiterate_fn iter_fn, void *priv); +time64_t xfs_dquot_set_timeout(struct xfs_mount *mp, time64_t timeout); +time64_t xfs_dquot_set_grace_period(time64_t grace); + #endif /* __XFS_DQUOT_H__ */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 6cb8cd11072a..6c11bfc3d452 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -585,10 +585,10 @@ const struct xfs_defer_op_type xfs_agfl_free_defer_type = { STATIC int xfs_efi_item_recover( struct xfs_log_item *lip, - struct xfs_trans *parent_tp) + struct list_head *capture_list) { struct xfs_efi_log_item *efip = EFI_ITEM(lip); - struct xfs_mount *mp = parent_tp->t_mountp; + struct xfs_mount *mp = lip->li_mountp; struct xfs_efd_log_item *efdp; struct xfs_trans *tp; struct xfs_extent *extp; @@ -608,14 +608,8 @@ xfs_efi_item_recover( if (startblock_fsb == 0 || extp->ext_len == 0 || startblock_fsb >= mp->m_sb.sb_dblocks || - extp->ext_len >= mp->m_sb.sb_agblocks) { - /* - * This will pull the EFI from the AIL and - * free the memory associated with it. - */ - xfs_efi_release(efip); + extp->ext_len >= mp->m_sb.sb_agblocks) return -EFSCORRUPTED; - } } error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); @@ -633,8 +627,7 @@ xfs_efi_item_recover( } - error = xfs_trans_commit(tp); - return error; + return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list); abort_error: xfs_trans_cancel(tp); @@ -649,6 +642,34 @@ xfs_efi_item_match( return EFI_ITEM(lip)->efi_format.efi_id == intent_id; } +/* Relog an intent item to push the log tail forward. 
*/ +static struct xfs_log_item * +xfs_efi_item_relog( + struct xfs_log_item *intent, + struct xfs_trans *tp) +{ + struct xfs_efd_log_item *efdp; + struct xfs_efi_log_item *efip; + struct xfs_extent *extp; + unsigned int count; + + count = EFI_ITEM(intent)->efi_format.efi_nextents; + extp = EFI_ITEM(intent)->efi_format.efi_extents; + + tp->t_flags |= XFS_TRANS_DIRTY; + efdp = xfs_trans_get_efd(tp, EFI_ITEM(intent), count); + efdp->efd_next_extent = count; + memcpy(efdp->efd_format.efd_extents, extp, count * sizeof(*extp)); + set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); + + efip = xfs_efi_init(tp->t_mountp, count); + memcpy(efip->efi_format.efi_extents, extp, count * sizeof(*extp)); + atomic_set(&efip->efi_next_extent, count); + xfs_trans_add_item(tp, &efip->efi_item); + set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags); + return &efip->efi_item; +} + static const struct xfs_item_ops xfs_efi_item_ops = { .iop_size = xfs_efi_item_size, .iop_format = xfs_efi_item_format, @@ -656,6 +677,7 @@ static const struct xfs_item_ops xfs_efi_item_ops = { .iop_release = xfs_efi_item_release, .iop_recover = xfs_efi_item_recover, .iop_match = xfs_efi_item_match, + .iop_relog = xfs_efi_item_relog, }; /* diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index a29f78a663ca..3d1b95124744 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1008,6 +1008,21 @@ xfs_file_fadvise( return ret; } +/* Does this file, inode, or mount want synchronous writes? */ +static inline bool xfs_file_sync_writes(struct file *filp) +{ + struct xfs_inode *ip = XFS_I(file_inode(filp)); + + if (ip->i_mount->m_flags & XFS_MOUNT_WSYNC) + return true; + if (filp->f_flags & (__O_SYNC | O_DSYNC)) + return true; + if (IS_SYNC(file_inode(filp))) + return true; + + return false; +} + STATIC loff_t xfs_file_remap_range( struct file *file_in, @@ -1065,7 +1080,7 @@ xfs_file_remap_range( if (ret) goto out_unlock; - if (mp->m_flags & XFS_MOUNT_WSYNC) + if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out)) xfs_log_force_inode(dest); out_unlock: xfs_iunlock2_io_mmap(src, dest); diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 1a88025e68a3..db23e455eb91 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -33,39 +33,7 @@ enum xfs_fstrm_alloc { /* * Allocation group filestream associations are tracked with per-ag atomic * counters. These counters allow xfs_filestream_pick_ag() to tell whether a - * particular AG already has active filestreams associated with it. The mount - * point's m_peraglock is used to protect these counters from per-ag array - * re-allocation during a growfs operation. When xfs_growfs_data_private() is - * about to reallocate the array, it calls xfs_filestream_flush() with the - * m_peraglock held in write mode. - * - * Since xfs_mru_cache_flush() guarantees that all the free functions for all - * the cache elements have finished executing before it returns, it's safe for - * the free functions to use the atomic counters without m_peraglock protection. - * This allows the implementation of xfs_fstrm_free_func() to be agnostic about - * whether it was called with the m_peraglock held in read mode, write mode or - * not held at all. The race condition this addresses is the following: - * - * - The work queue scheduler fires and pulls a filestream directory cache - * element off the LRU end of the cache for deletion, then gets pre-empted. 
- * - A growfs operation grabs the m_peraglock in write mode, flushes all the - * remaining items from the cache and reallocates the mount point's per-ag - * array, resetting all the counters to zero. - * - The work queue thread resumes and calls the free function for the element - * it started cleaning up earlier. In the process it decrements the - * filestreams counter for an AG that now has no references. - * - * With a shrinkfs feature, the above scenario could panic the system. - * - * All other uses of the following macros should be protected by either the - * m_peraglock held in read mode, or the cache's internal locking exposed by the - * interval between a call to xfs_mru_cache_lookup() and a call to - * xfs_mru_cache_done(). In addition, the m_peraglock must be held in read mode - * when new elements are added to the cache. - * - * Combined, these locking rules ensure that no associations will ever exist in - * the cache that reference per-ag array elements that have since been - * reallocated. + * particular AG already has active filestreams associated with it. */ int xfs_filestream_peek_ag( diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 4eebcec4aae6..9ce5e7d5bf8f 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -26,7 +26,7 @@ #include "xfs_rtalloc.h" /* Convert an xfs_fsmap to an fsmap. */ -void +static void xfs_fsmap_from_internal( struct fsmap *dest, struct xfs_fsmap *src) @@ -155,8 +155,7 @@ xfs_fsmap_owner_from_rmap( /* getfsmap query state */ struct xfs_getfsmap_info { struct xfs_fsmap_head *head; - xfs_fsmap_format_t formatter; /* formatting fn */ - void *format_arg; /* format buffer */ + struct fsmap *fsmap_recs; /* mapping records */ struct xfs_buf *agf_bp; /* AGF, for refcount queries */ xfs_daddr_t next_daddr; /* next daddr we expect */ u64 missing_owner; /* owner of holes */ @@ -224,6 +223,20 @@ xfs_getfsmap_is_shared( return 0; } +static inline void +xfs_getfsmap_format( + struct xfs_mount *mp, + struct xfs_fsmap *xfm, + struct xfs_getfsmap_info *info) +{ + struct fsmap *rec; + + trace_xfs_getfsmap_mapping(mp, xfm); + + rec = &info->fsmap_recs[info->head->fmh_entries++]; + xfs_fsmap_from_internal(rec, xfm); +} + /* * Format a reverse mapping for getfsmap, having translated rm_startblock * into the appropriate daddr units. @@ -256,6 +269,9 @@ xfs_getfsmap_helper( /* Are we just counting mappings? */ if (info->head->fmh_count == 0) { + if (info->head->fmh_entries == UINT_MAX) + return -ECANCELED; + if (rec_daddr > info->next_daddr) info->head->fmh_entries++; @@ -285,10 +301,7 @@ xfs_getfsmap_helper( fmr.fmr_offset = 0; fmr.fmr_length = rec_daddr - info->next_daddr; fmr.fmr_flags = FMR_OF_SPECIAL_OWNER; - error = info->formatter(&fmr, info->format_arg); - if (error) - return error; - info->head->fmh_entries++; + xfs_getfsmap_format(mp, &fmr, info); } if (info->last) @@ -320,11 +333,8 @@ xfs_getfsmap_helper( if (shared) fmr.fmr_flags |= FMR_OF_SHARED; } - error = info->formatter(&fmr, info->format_arg); - if (error) - return error; - info->head->fmh_entries++; + xfs_getfsmap_format(mp, &fmr, info); out: rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); if (info->next_daddr < rec_daddr) @@ -792,11 +802,11 @@ xfs_getfsmap_check_keys( #endif /* CONFIG_XFS_RT */ /* - * Get filesystem's extents as described in head, and format for - * output. 
Calls formatter to fill the user's buffer until all - * extents are mapped, until the passed-in head->fmh_count slots have - * been filled, or until the formatter short-circuits the loop, if it - * is tracking filled-in extents on its own. + * Get filesystem's extents as described in head, and format for output. Fills + * in the supplied records array until there are no more reverse mappings to + * return or head.fmh_entries == head.fmh_count. In the second case, this + * function returns -ECANCELED to indicate that more records would have been + * returned. * * Key to Confusion * ---------------- @@ -816,8 +826,7 @@ int xfs_getfsmap( struct xfs_mount *mp, struct xfs_fsmap_head *head, - xfs_fsmap_format_t formatter, - void *arg) + struct fsmap *fsmap_recs) { struct xfs_trans *tp = NULL; struct xfs_fsmap dkeys[2]; /* per-dev keys */ @@ -892,8 +901,7 @@ xfs_getfsmap( info.next_daddr = head->fmh_keys[0].fmr_physical + head->fmh_keys[0].fmr_length; - info.formatter = formatter; - info.format_arg = arg; + info.fsmap_recs = fsmap_recs; info.head = head; /* diff --git a/fs/xfs/xfs_fsmap.h b/fs/xfs/xfs_fsmap.h index c6c57739b862..a0775788e7b1 100644 --- a/fs/xfs/xfs_fsmap.h +++ b/fs/xfs/xfs_fsmap.h @@ -27,13 +27,9 @@ struct xfs_fsmap_head { struct xfs_fsmap fmh_keys[2]; /* low and high keys */ }; -void xfs_fsmap_from_internal(struct fsmap *dest, struct xfs_fsmap *src); void xfs_fsmap_to_internal(struct xfs_fsmap *dest, struct fsmap *src); -/* fsmap to userspace formatter - copy to user & advance pointer */ -typedef int (*xfs_fsmap_format_t)(struct xfs_fsmap *, void *); - int xfs_getfsmap(struct xfs_mount *mp, struct xfs_fsmap_head *head, - xfs_fsmap_format_t formatter, void *arg); + struct fsmap *out_recs); #endif /* __XFS_FSMAP_H__ */ diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 101028ebb571..deb99300d171 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -52,7 +52,6 @@ xfs_inode_alloc( XFS_STATS_INC(mp, vn_active); ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!xfs_isiflocked(ip)); ASSERT(ip->i_ino == 0); /* initialise the xfs inode */ @@ -123,7 +122,7 @@ void xfs_inode_free( struct xfs_inode *ip) { - ASSERT(!xfs_isiflocked(ip)); + ASSERT(!xfs_iflags_test(ip, XFS_IFLUSHING)); /* * Because we use RCU freeing we need to ensure the inode always @@ -1035,23 +1034,21 @@ xfs_reclaim_inode( if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) goto out; - if (!xfs_iflock_nowait(ip)) + if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING)) goto out_iunlock; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { xfs_iunpin_wait(ip); - /* xfs_iflush_abort() drops the flush lock */ xfs_iflush_abort(ip); goto reclaim; } if (xfs_ipincount(ip)) - goto out_ifunlock; + goto out_clear_flush; if (!xfs_inode_clean(ip)) - goto out_ifunlock; + goto out_clear_flush; - xfs_ifunlock(ip); + xfs_iflags_clear(ip, XFS_IFLUSHING); reclaim: - ASSERT(!xfs_isiflocked(ip)); /* * Because we use RCU freeing we need to ensure the inode always appears @@ -1101,8 +1098,8 @@ reclaim: __xfs_inode_free(ip); return; -out_ifunlock: - xfs_ifunlock(ip); +out_clear_flush: + xfs_iflags_clear(ip, XFS_IFLUSHING); out_iunlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); out: @@ -1211,7 +1208,7 @@ xfs_reclaim_inodes( while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { xfs_ail_push_all_sync(mp->m_ail); xfs_reclaim_inodes_ag(mp, &nr_to_scan); - }; + } } /* diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c06129cffba9..2bfbcf28b1bd 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -598,22 +598,6 @@ xfs_lock_two_inodes( } } 
-void -__xfs_iflock( - struct xfs_inode *ip) -{ - wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT); - DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); - - do { - prepare_to_wait_exclusive(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); - if (xfs_isiflocked(ip)) - io_schedule(); - } while (!xfs_iflock_nowait(ip)); - - finish_wait(wq, &wait.wq_entry); -} - STATIC uint _xfs_dic2xflags( uint16_t di_flags, @@ -714,6 +698,68 @@ out_unlock: return error; } +/* Propagate di_flags from a parent inode to a child inode. */ +static void +xfs_inode_inherit_flags( + struct xfs_inode *ip, + const struct xfs_inode *pip) +{ + unsigned int di_flags = 0; + umode_t mode = VFS_I(ip)->i_mode; + + if (S_ISDIR(mode)) { + if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) + di_flags |= XFS_DIFLAG_RTINHERIT; + if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { + di_flags |= XFS_DIFLAG_EXTSZINHERIT; + ip->i_d.di_extsize = pip->i_d.di_extsize; + } + if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) + di_flags |= XFS_DIFLAG_PROJINHERIT; + } else if (S_ISREG(mode)) { + if ((pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) && + xfs_sb_version_hasrealtime(&ip->i_mount->m_sb)) + di_flags |= XFS_DIFLAG_REALTIME; + if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { + di_flags |= XFS_DIFLAG_EXTSIZE; + ip->i_d.di_extsize = pip->i_d.di_extsize; + } + } + if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) && + xfs_inherit_noatime) + di_flags |= XFS_DIFLAG_NOATIME; + if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) && + xfs_inherit_nodump) + di_flags |= XFS_DIFLAG_NODUMP; + if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) && + xfs_inherit_sync) + di_flags |= XFS_DIFLAG_SYNC; + if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) && + xfs_inherit_nosymlinks) + di_flags |= XFS_DIFLAG_NOSYMLINKS; + if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) && + xfs_inherit_nodefrag) + di_flags |= XFS_DIFLAG_NODEFRAG; + if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) + di_flags |= XFS_DIFLAG_FILESTREAM; + + ip->i_d.di_flags |= di_flags; +} + +/* Propagate di_flags2 from a parent inode to a child inode. */ +static void +xfs_inode_inherit_flags2( + struct xfs_inode *ip, + const struct xfs_inode *pip) +{ + if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) { + ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + ip->i_d.di_cowextsize = pip->i_d.di_cowextsize; + } + if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) + ip->i_d.di_flags2 |= XFS_DIFLAG2_DAX; +} + /* * Allocate an inode on disk and return a copy of its in-core version. * The in-core inode is locked exclusively. 
Set mode, nlink, and rdev @@ -840,7 +886,7 @@ xfs_ialloc( if (xfs_sb_version_has_v3inode(&mp->m_sb)) { inode_set_iversion(inode, 1); - ip->i_d.di_flags2 = 0; + ip->i_d.di_flags2 = mp->m_ino_geo.new_diflags2; ip->i_d.di_cowextsize = 0; ip->i_d.di_crtime = tv; } @@ -857,54 +903,10 @@ xfs_ialloc( break; case S_IFREG: case S_IFDIR: - if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { - uint di_flags = 0; - - if (S_ISDIR(mode)) { - if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) - di_flags |= XFS_DIFLAG_RTINHERIT; - if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { - di_flags |= XFS_DIFLAG_EXTSZINHERIT; - ip->i_d.di_extsize = pip->i_d.di_extsize; - } - if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) - di_flags |= XFS_DIFLAG_PROJINHERIT; - } else if (S_ISREG(mode)) { - if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) - di_flags |= XFS_DIFLAG_REALTIME; - if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { - di_flags |= XFS_DIFLAG_EXTSIZE; - ip->i_d.di_extsize = pip->i_d.di_extsize; - } - } - if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) && - xfs_inherit_noatime) - di_flags |= XFS_DIFLAG_NOATIME; - if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) && - xfs_inherit_nodump) - di_flags |= XFS_DIFLAG_NODUMP; - if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) && - xfs_inherit_sync) - di_flags |= XFS_DIFLAG_SYNC; - if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) && - xfs_inherit_nosymlinks) - di_flags |= XFS_DIFLAG_NOSYMLINKS; - if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) && - xfs_inherit_nodefrag) - di_flags |= XFS_DIFLAG_NODEFRAG; - if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) - di_flags |= XFS_DIFLAG_FILESTREAM; - - ip->i_d.di_flags |= di_flags; - } - if (pip && (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY)) { - if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) { - ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; - ip->i_d.di_cowextsize = pip->i_d.di_cowextsize; - } - if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) - ip->i_d.di_flags2 |= XFS_DIFLAG2_DAX; - } + if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) + xfs_inode_inherit_flags(ip, pip); + if (pip && (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY)) + xfs_inode_inherit_flags2(ip, pip); /* FALLTHROUGH */ case S_IFLNK: ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; @@ -1532,17 +1534,10 @@ xfs_itruncate_extents_flags( if (error) goto out; - /* - * Duplicate the transaction that has the permanent - * reservation and commit the old transaction. - */ + /* free the just unmapped extents */ error = xfs_defer_finish(&tp); if (error) goto out; - - error = xfs_trans_roll_inode(&tp, ip); - if (error) - goto out; } if (whichfork == XFS_DATA_FORK) { @@ -2531,11 +2526,8 @@ retry: * valid, the wrong inode or stale. */ spin_lock(&ip->i_flags_lock); - if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) { - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); - return; - } + if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) + goto out_iflags_unlock; /* * Don't try to lock/unlock the current inode, but we _cannot_ skip the @@ -2552,16 +2544,14 @@ retry: } } ip->i_flags |= XFS_ISTALE; - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); /* - * If we can't get the flush lock, the inode is already attached. All + * If the inode is flushing, it is already attached to the buffer. All * we needed to do here is mark the inode stale so buffer IO completion * will remove it from the AIL. 
*/ iip = ip->i_itemp; - if (!xfs_iflock_nowait(ip)) { + if (__xfs_iflags_test(ip, XFS_IFLUSHING)) { ASSERT(!list_empty(&iip->ili_item.li_bio_list)); ASSERT(iip->ili_last_fields); goto out_iunlock; @@ -2573,10 +2563,12 @@ retry: * commit as the flock synchronises removal of the inode from the * cluster buffer against inode reclaim. */ - if (!iip || list_empty(&iip->ili_item.li_bio_list)) { - xfs_ifunlock(ip); + if (!iip || list_empty(&iip->ili_item.li_bio_list)) goto out_iunlock; - } + + __xfs_iflags_set(ip, XFS_IFLUSHING); + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); /* we have a dirty inode in memory that has not yet been flushed. */ spin_lock(&iip->ili_lock); @@ -2586,9 +2578,16 @@ retry: spin_unlock(&iip->ili_lock); ASSERT(iip->ili_last_fields); + if (ip != free_ip) + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return; + out_iunlock: if (ip != free_ip) xfs_iunlock(ip, XFS_ILOCK_EXCL); +out_iflags_unlock: + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); } /* @@ -2631,8 +2630,9 @@ xfs_ifree_cluster( /* * We obtain and lock the backing buffer first in the process - * here, as we have to ensure that any dirty inode that we - * can't get the flush lock on is attached to the buffer. + * here to ensure dirty inodes attached to the buffer remain in + * the flushing state while we mark them stale. + * * If we scan the in-memory inodes first, then buffer IO can * complete before we get a lock on it, and hence we may fail * to mark all the active inodes on the buffer stale. @@ -2717,7 +2717,7 @@ xfs_ifree( VFS_I(ip)->i_mode = 0; /* mark incore inode as free */ ip->i_d.di_flags = 0; - ip->i_d.di_flags2 = 0; + ip->i_d.di_flags2 = ip->i_mount->m_ino_geo.new_diflags2; ip->i_d.di_dmevmask = 0; ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; @@ -3443,7 +3443,7 @@ xfs_iflush( int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - ASSERT(xfs_isiflocked(ip)); + ASSERT(xfs_iflags_test(ip, XFS_IFLUSHING)); ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); ASSERT(iip->ili_item.li_buf == bp); @@ -3553,8 +3553,8 @@ xfs_iflush( * * What we do is move the bits to the ili_last_fields field. When * logging the inode, these bits are moved back to the ili_fields field. - * In the xfs_iflush_done() routine we clear ili_last_fields, since we - * know that the information those bits represent is permanently on + * In the xfs_buf_inode_iodone() routine we clear ili_last_fields, since + * we know that the information those bits represent is permanently on * disk. As long as the flush completes before the inode is logged * again, then both ili_fields and ili_last_fields will be cleared. */ @@ -3568,7 +3568,7 @@ flush_out: /* * Store the current LSN of the inode so that we can tell whether the - * item has moved in the AIL from xfs_iflush_done(). + * item has moved in the AIL from xfs_buf_inode_iodone(). */ xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, &iip->ili_item.li_lsn); @@ -3613,7 +3613,7 @@ xfs_iflush_cluster( /* * Quick and dirty check to avoid locks if possible. 
*/ - if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) + if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) continue; if (xfs_ipincount(ip)) continue; @@ -3627,7 +3627,7 @@ xfs_iflush_cluster( */ spin_lock(&ip->i_flags_lock); ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE)); - if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) { + if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) { spin_unlock(&ip->i_flags_lock); continue; } @@ -3635,24 +3635,17 @@ xfs_iflush_cluster( /* * ILOCK will pin the inode against reclaim and prevent * concurrent transactions modifying the inode while we are - * flushing the inode. + * flushing the inode. If we get the lock, set the flushing + * state before we drop the i_flags_lock. */ if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { spin_unlock(&ip->i_flags_lock); continue; } + __xfs_iflags_set(ip, XFS_IFLUSHING); spin_unlock(&ip->i_flags_lock); /* - * Skip inodes that are already flush locked as they have - * already been written to the buffer. - */ - if (!xfs_iflock_nowait(ip)) { - xfs_iunlock(ip, XFS_ILOCK_SHARED); - continue; - } - - /* * Abort flushing this inode if we are shut down because the * inode may not currently be in the AIL. This can occur when * log I/O failure unpins the inode without inserting into the @@ -3661,7 +3654,6 @@ xfs_iflush_cluster( */ if (XFS_FORCED_SHUTDOWN(mp)) { xfs_iunpin_wait(ip); - /* xfs_iflush_abort() drops the flush lock */ xfs_iflush_abort(ip); xfs_iunlock(ip, XFS_ILOCK_SHARED); error = -EIO; @@ -3670,7 +3662,7 @@ xfs_iflush_cluster( /* don't block waiting on a log force to unpin dirty inodes */ if (xfs_ipincount(ip)) { - xfs_ifunlock(ip); + xfs_iflags_clear(ip, XFS_IFLUSHING); xfs_iunlock(ip, XFS_ILOCK_SHARED); continue; } @@ -3678,7 +3670,7 @@ xfs_iflush_cluster( if (!xfs_inode_clean(ip)) error = xfs_iflush(ip, bp); else - xfs_ifunlock(ip); + xfs_iflags_clear(ip, XFS_IFLUSHING); xfs_iunlock(ip, XFS_ILOCK_SHARED); if (error) break; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index e9a8bb184d1f..751a3d1d7d84 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -194,6 +194,11 @@ static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip) return ip->i_cowfp && ip->i_cowfp->if_bytes; } +static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) +{ + return ip->i_d.di_flags2 & XFS_DIFLAG2_BIGTIME; +} + /* * Return the buftarg used for data allocations on a given inode. */ @@ -211,8 +216,7 @@ static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip) #define XFS_INEW (1 << __XFS_INEW_BIT) #define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */ #define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */ -#define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */ -#define XFS_IFLOCK (1 << __XFS_IFLOCK_BIT) +#define XFS_IFLUSHING (1 << 7) /* inode is being flushed */ #define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */ #define XFS_IPINNED (1 << __XFS_IPINNED_BIT) #define XFS_IEOFBLOCKS (1 << 9) /* has the preallocblocks tag set */ @@ -234,36 +238,6 @@ static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip) XFS_IDIRTY_RELEASE | XFS_ITRUNCATED) /* - * Synchronize processes attempting to flush the in-core inode back to disk. 
- */ - -static inline int xfs_isiflocked(struct xfs_inode *ip) -{ - return xfs_iflags_test(ip, XFS_IFLOCK); -} - -extern void __xfs_iflock(struct xfs_inode *ip); - -static inline int xfs_iflock_nowait(struct xfs_inode *ip) -{ - return !xfs_iflags_test_and_set(ip, XFS_IFLOCK); -} - -static inline void xfs_iflock(struct xfs_inode *ip) -{ - if (!xfs_iflock_nowait(ip)) - __xfs_iflock(ip); -} - -static inline void xfs_ifunlock(struct xfs_inode *ip) -{ - ASSERT(xfs_isiflocked(ip)); - xfs_iflags_clear(ip, XFS_IFLOCK); - smp_mb(); - wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT); -} - -/* * Flags for inode locking. * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) * 1<<16 - 1<<32-1 -- lockdep annotation (integers) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 6c65938cee1c..17e20a6d8b4e 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -295,6 +295,28 @@ xfs_inode_item_format_attr_fork( } } +/* + * Convert an incore timestamp to a log timestamp. Note that the log format + * specifies host endian format! + */ +static inline xfs_ictimestamp_t +xfs_inode_to_log_dinode_ts( + struct xfs_inode *ip, + const struct timespec64 tv) +{ + struct xfs_legacy_ictimestamp *lits; + xfs_ictimestamp_t its; + + if (xfs_inode_has_bigtime(ip)) + return xfs_inode_encode_bigtime(tv); + + lits = (struct xfs_legacy_ictimestamp *)&its; + lits->t_sec = tv.tv_sec; + lits->t_nsec = tv.tv_nsec; + + return its; +} + static void xfs_inode_to_log_dinode( struct xfs_inode *ip, @@ -313,12 +335,9 @@ xfs_inode_to_log_dinode( memset(to->di_pad, 0, sizeof(to->di_pad)); memset(to->di_pad3, 0, sizeof(to->di_pad3)); - to->di_atime.t_sec = inode->i_atime.tv_sec; - to->di_atime.t_nsec = inode->i_atime.tv_nsec; - to->di_mtime.t_sec = inode->i_mtime.tv_sec; - to->di_mtime.t_nsec = inode->i_mtime.tv_nsec; - to->di_ctime.t_sec = inode->i_ctime.tv_sec; - to->di_ctime.t_nsec = inode->i_ctime.tv_nsec; + to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime); + to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime); + to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode->i_ctime); to->di_nlink = inode->i_nlink; to->di_gen = inode->i_generation; to->di_mode = inode->i_mode; @@ -340,8 +359,7 @@ xfs_inode_to_log_dinode( if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { to->di_version = 3; to->di_changecount = inode_peek_iversion(inode); - to->di_crtime.t_sec = from->di_crtime.tv_sec; - to->di_crtime.t_nsec = from->di_crtime.tv_nsec; + to->di_crtime = xfs_inode_to_log_dinode_ts(ip, from->di_crtime); to->di_flags2 = from->di_flags2; to->di_cowextsize = from->di_cowextsize; to->di_ino = ip->i_ino; @@ -491,8 +509,7 @@ xfs_inode_item_push( (ip->i_flags & XFS_ISTALE)) return XFS_ITEM_PINNED; - /* If the inode is already flush locked, we're already flushing. */ - if (xfs_isiflocked(ip)) + if (xfs_iflags_test(ip, XFS_IFLUSHING)) return XFS_ITEM_FLUSHING; if (!xfs_buf_trylock(bp)) @@ -703,7 +720,7 @@ xfs_iflush_finish( iip->ili_last_fields = 0; iip->ili_flush_lsn = 0; spin_unlock(&iip->ili_lock); - xfs_ifunlock(iip->ili_inode); + xfs_iflags_clear(iip->ili_inode, XFS_IFLUSHING); if (drop_buffer) xfs_buf_rele(bp); } @@ -711,11 +728,11 @@ xfs_iflush_finish( /* * Inode buffer IO completion routine. It is responsible for removing inodes - * attached to the buffer from the AIL if they have not been re-logged, as well - * as completing the flush and unlocking the inode. + * attached to the buffer from the AIL if they have not been re-logged and + * completing the inode flush. 
*/ void -xfs_iflush_done( +xfs_buf_inode_iodone( struct xfs_buf *bp) { struct xfs_log_item *lip, *n; @@ -754,11 +771,21 @@ xfs_iflush_done( list_splice_tail(&flushed_inodes, &bp->b_li_list); } +void +xfs_buf_inode_io_fail( + struct xfs_buf *bp) +{ + struct xfs_log_item *lip; + + list_for_each_entry(lip, &bp->b_li_list, li_bio_list) + set_bit(XFS_LI_FAILED, &lip->li_flags); +} + /* - * This is the inode flushing abort routine. It is called from xfs_iflush when + * This is the inode flushing abort routine. It is called when * the filesystem is shutting down to clean up the inode state. It is * responsible for removing the inode item from the AIL if it has not been - * re-logged, and unlocking the inode's flush lock. + * re-logged and clearing the inode's flush state. */ void xfs_iflush_abort( @@ -790,7 +817,7 @@ xfs_iflush_abort( list_del_init(&iip->ili_item.li_bio_list); spin_unlock(&iip->ili_lock); } - xfs_ifunlock(ip); + xfs_iflags_clear(ip, XFS_IFLUSHING); if (bp) xfs_buf_rele(bp); } diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 048b5e7dee90..4b926e32831c 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -25,8 +25,8 @@ struct xfs_inode_log_item { * * We need atomic changes between inode dirtying, inode flushing and * inode completion, but these all hold different combinations of - * ILOCK and iflock and hence we need some other method of serialising - * updates to the flush state. + * ILOCK and IFLUSHING and hence we need some other method of + * serialising updates to the flush state. */ spinlock_t ili_lock; /* flush state lock */ unsigned int ili_last_fields; /* fields when flushed */ @@ -43,7 +43,6 @@ static inline int xfs_inode_clean(struct xfs_inode *ip) extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); extern void xfs_inode_item_destroy(struct xfs_inode *); -extern void xfs_iflush_done(struct xfs_buf *); extern void xfs_iflush_abort(struct xfs_inode *); extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, struct xfs_inode_log_format *); diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index 5e0d291835b3..cb44f7653f03 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -115,6 +115,82 @@ out_free_ip: return error; } +static inline bool xfs_log_dinode_has_bigtime(const struct xfs_log_dinode *ld) +{ + return ld->di_version >= 3 && + (ld->di_flags2 & XFS_DIFLAG2_BIGTIME); +} + +/* Convert a log timestamp to an ondisk timestamp. 
*/ +static inline xfs_timestamp_t +xfs_log_dinode_to_disk_ts( + struct xfs_log_dinode *from, + const xfs_ictimestamp_t its) +{ + struct xfs_legacy_timestamp *lts; + struct xfs_legacy_ictimestamp *lits; + xfs_timestamp_t ts; + + if (xfs_log_dinode_has_bigtime(from)) + return cpu_to_be64(its); + + lts = (struct xfs_legacy_timestamp *)&ts; + lits = (struct xfs_legacy_ictimestamp *)&its; + lts->t_sec = cpu_to_be32(lits->t_sec); + lts->t_nsec = cpu_to_be32(lits->t_nsec); + + return ts; +} + +STATIC void +xfs_log_dinode_to_disk( + struct xfs_log_dinode *from, + struct xfs_dinode *to) +{ + to->di_magic = cpu_to_be16(from->di_magic); + to->di_mode = cpu_to_be16(from->di_mode); + to->di_version = from->di_version; + to->di_format = from->di_format; + to->di_onlink = 0; + to->di_uid = cpu_to_be32(from->di_uid); + to->di_gid = cpu_to_be32(from->di_gid); + to->di_nlink = cpu_to_be32(from->di_nlink); + to->di_projid_lo = cpu_to_be16(from->di_projid_lo); + to->di_projid_hi = cpu_to_be16(from->di_projid_hi); + memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); + + to->di_atime = xfs_log_dinode_to_disk_ts(from, from->di_atime); + to->di_mtime = xfs_log_dinode_to_disk_ts(from, from->di_mtime); + to->di_ctime = xfs_log_dinode_to_disk_ts(from, from->di_ctime); + + to->di_size = cpu_to_be64(from->di_size); + to->di_nblocks = cpu_to_be64(from->di_nblocks); + to->di_extsize = cpu_to_be32(from->di_extsize); + to->di_nextents = cpu_to_be32(from->di_nextents); + to->di_anextents = cpu_to_be16(from->di_anextents); + to->di_forkoff = from->di_forkoff; + to->di_aformat = from->di_aformat; + to->di_dmevmask = cpu_to_be32(from->di_dmevmask); + to->di_dmstate = cpu_to_be16(from->di_dmstate); + to->di_flags = cpu_to_be16(from->di_flags); + to->di_gen = cpu_to_be32(from->di_gen); + + if (from->di_version == 3) { + to->di_changecount = cpu_to_be64(from->di_changecount); + to->di_crtime = xfs_log_dinode_to_disk_ts(from, + from->di_crtime); + to->di_flags2 = cpu_to_be64(from->di_flags2); + to->di_cowextsize = cpu_to_be32(from->di_cowextsize); + to->di_ino = cpu_to_be64(from->di_ino); + to->di_lsn = cpu_to_be64(from->di_lsn); + memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); + uuid_copy(&to->di_uuid, &from->di_uuid); + to->di_flushiter = 0; + } else { + to->di_flushiter = cpu_to_be16(from->di_flushiter); + } +} + STATIC int xlog_recover_inode_commit_pass2( struct xlog *log, diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 6f22a66777cd..3fbd98f61ea5 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -404,7 +404,7 @@ xfs_ioc_attr_list( context.cursor.offset)) return -EINVAL; - buffer = kmem_zalloc_large(bufsize, 0); + buffer = kvzalloc(bufsize, GFP_KERNEL); if (!buffer) return -ENOMEM; @@ -1190,7 +1190,8 @@ xfs_flags2diflags2( unsigned int xflags) { uint64_t di_flags2 = - (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK); + (ip->i_d.di_flags2 & (XFS_DIFLAG2_REFLINK | + XFS_DIFLAG2_BIGTIME)); if (xflags & FS_XFLAG_DAX) di_flags2 |= XFS_DIFLAG2_DAX; @@ -1690,7 +1691,7 @@ xfs_ioc_getbmap( if (bmx.bmv_count > ULONG_MAX / recsize) return -ENOMEM; - buf = kmem_zalloc_large(bmx.bmv_count * sizeof(*buf), 0); + buf = kvzalloc(bmx.bmv_count * sizeof(*buf), GFP_KERNEL); if (!buf) return -ENOMEM; @@ -1715,39 +1716,17 @@ out_free_buf: return error; } -struct getfsmap_info { - struct xfs_mount *mp; - struct fsmap_head __user *data; - unsigned int idx; - __u32 last_flags; -}; - -STATIC int -xfs_getfsmap_format(struct xfs_fsmap *xfm, void *priv) -{ - struct getfsmap_info *info = priv; - struct fsmap fm; - - 
trace_xfs_getfsmap_mapping(info->mp, xfm); - - info->last_flags = xfm->fmr_flags; - xfs_fsmap_from_internal(&fm, xfm); - if (copy_to_user(&info->data->fmh_recs[info->idx++], &fm, - sizeof(struct fsmap))) - return -EFAULT; - - return 0; -} - STATIC int xfs_ioc_getfsmap( struct xfs_inode *ip, struct fsmap_head __user *arg) { - struct getfsmap_info info = { NULL }; struct xfs_fsmap_head xhead = {0}; struct fsmap_head head; - bool aborted = false; + struct fsmap *recs; + unsigned int count; + __u32 last_flags = 0; + bool done = false; int error; if (copy_from_user(&head, arg, sizeof(struct fsmap_head))) @@ -1759,38 +1738,112 @@ xfs_ioc_getfsmap( sizeof(head.fmh_keys[1].fmr_reserved))) return -EINVAL; + /* + * Use an internal memory buffer so that we don't have to copy fsmap + * data to userspace while holding locks. Start by trying to allocate + * up to 128k for the buffer, but fall back to a single page if needed. + */ + count = min_t(unsigned int, head.fmh_count, + 131072 / sizeof(struct fsmap)); + recs = kvzalloc(count * sizeof(struct fsmap), GFP_KERNEL); + if (!recs) { + count = min_t(unsigned int, head.fmh_count, + PAGE_SIZE / sizeof(struct fsmap)); + recs = kvzalloc(count * sizeof(struct fsmap), GFP_KERNEL); + if (!recs) + return -ENOMEM; + } + xhead.fmh_iflags = head.fmh_iflags; - xhead.fmh_count = head.fmh_count; xfs_fsmap_to_internal(&xhead.fmh_keys[0], &head.fmh_keys[0]); xfs_fsmap_to_internal(&xhead.fmh_keys[1], &head.fmh_keys[1]); trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]); trace_xfs_getfsmap_high_key(ip->i_mount, &xhead.fmh_keys[1]); - info.mp = ip->i_mount; - info.data = arg; - error = xfs_getfsmap(ip->i_mount, &xhead, xfs_getfsmap_format, &info); - if (error == -ECANCELED) { - error = 0; - aborted = true; - } else if (error) - return error; + head.fmh_entries = 0; + do { + struct fsmap __user *user_recs; + struct fsmap *last_rec; + + user_recs = &arg->fmh_recs[head.fmh_entries]; + xhead.fmh_entries = 0; + xhead.fmh_count = min_t(unsigned int, count, + head.fmh_count - head.fmh_entries); + + /* Run query, record how many entries we got. */ + error = xfs_getfsmap(ip->i_mount, &xhead, recs); + switch (error) { + case 0: + /* + * There are no more records in the result set. Copy + * whatever we got to userspace and break out. + */ + done = true; + break; + case -ECANCELED: + /* + * The internal memory buffer is full. Copy whatever + * records we got to userspace and go again if we have + * not yet filled the userspace buffer. + */ + error = 0; + break; + default: + goto out_free; + } + head.fmh_entries += xhead.fmh_entries; + head.fmh_oflags = xhead.fmh_oflags; - /* If we didn't abort, set the "last" flag in the last fmx */ - if (!aborted && info.idx) { - info.last_flags |= FMR_OF_LAST; - if (copy_to_user(&info.data->fmh_recs[info.idx - 1].fmr_flags, - &info.last_flags, sizeof(info.last_flags))) - return -EFAULT; + /* + * If the caller wanted a record count or there aren't any + * new records to return, we're done. + */ + if (head.fmh_count == 0 || xhead.fmh_entries == 0) + break; + + /* Copy all the records we got out to userspace. */ + if (copy_to_user(user_recs, recs, + xhead.fmh_entries * sizeof(struct fsmap))) { + error = -EFAULT; + goto out_free; + } + + /* Remember the last record flags we copied to userspace. */ + last_rec = &recs[xhead.fmh_entries - 1]; + last_flags = last_rec->fmr_flags; + + /* Set up the low key for the next iteration. 
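
The rewrite above only changes how the kernel stages records (a bounce buffer of up to 128k instead of per-record copy_to_user() while holding locks); the userspace contract is unchanged: size fmh_count, call FS_IOC_GETFSMAP repeatedly, feed the last returned record back in as the new low key, and stop once a record carries FMR_OF_LAST or nothing comes back. A rough consumer sketch against the linux/fsmap.h UAPI, with /mnt as a placeholder mount point:

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fsmap.h>

	#define NR_RECS	128

	int main(void)
	{
		struct fsmap_head *head;
		int fd, done = 0;

		head = calloc(1, sizeof(*head) + NR_RECS * sizeof(struct fsmap));
		if (!head)
			return 1;
		head->fmh_count = NR_RECS;
		/* low key stays zero; high key covers the whole keyspace */
		head->fmh_keys[1].fmr_device = UINT32_MAX;
		head->fmh_keys[1].fmr_flags = UINT32_MAX;
		head->fmh_keys[1].fmr_physical = UINT64_MAX;
		head->fmh_keys[1].fmr_owner = UINT64_MAX;
		head->fmh_keys[1].fmr_offset = UINT64_MAX;

		fd = open("/mnt", O_RDONLY);
		if (fd < 0)
			return 1;

		while (!done) {
			unsigned int i;

			if (ioctl(fd, FS_IOC_GETFSMAP, head) < 0)
				return 1;
			if (!head->fmh_entries)
				break;
			for (i = 0; i < head->fmh_entries; i++) {
				struct fsmap *r = &head->fmh_recs[i];

				printf("dev %u phys %llu len %llu\n",
				       r->fmr_device,
				       (unsigned long long)r->fmr_physical,
				       (unsigned long long)r->fmr_length);
				if (r->fmr_flags & FMR_OF_LAST)
					done = 1;
			}
			/* continue the query after the last record we saw */
			head->fmh_keys[0] = head->fmh_recs[head->fmh_entries - 1];
		}
		close(fd);
		free(head);
		return 0;
	}
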
*/ + xfs_fsmap_to_internal(&xhead.fmh_keys[0], last_rec); + trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]); + } while (!done && head.fmh_entries < head.fmh_count); + + /* + * If there are no more records in the query result set and we're not + * in counting mode, mark the last record returned with the LAST flag. + */ + if (done && head.fmh_count > 0 && head.fmh_entries > 0) { + struct fsmap __user *user_rec; + + last_flags |= FMR_OF_LAST; + user_rec = &arg->fmh_recs[head.fmh_entries - 1]; + + if (copy_to_user(&user_rec->fmr_flags, &last_flags, + sizeof(last_flags))) { + error = -EFAULT; + goto out_free; + } } /* copy back header */ - head.fmh_entries = xhead.fmh_entries; - head.fmh_oflags = xhead.fmh_oflags; - if (copy_to_user(arg, &head, sizeof(struct fsmap_head))) - return -EFAULT; + if (copy_to_user(arg, &head, sizeof(struct fsmap_head))) { + error = -EFAULT; + goto out_free; + } - return 0; +out_free: + kmem_free(recs); + return error; } STATIC int diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 80a13c8561d8..5e165456da68 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -237,7 +237,7 @@ xfs_vn_create( umode_t mode, bool flags) { - return xfs_vn_mknod(dir, dentry, mode, 0); + return xfs_generic_create(dir, dentry, mode, 0, false); } STATIC int @@ -246,7 +246,7 @@ xfs_vn_mkdir( struct dentry *dentry, umode_t mode) { - return xfs_vn_mknod(dir, dentry, mode|S_IFDIR, 0); + return xfs_generic_create(dir, dentry, mode | S_IFDIR, 0, false); } STATIC struct dentry * diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index ab737fed7b12..ad1009778d33 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -123,7 +123,6 @@ typedef __u32 xfs_nlink_t; #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #define EFSBADCRC EBADMSG /* Bad CRC detected */ -#define SYNCHRONIZE() barrier() #define __return_address __builtin_return_address(0) /* diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index ad0c69ee8947..fa2d05e65ff1 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1475,14 +1475,14 @@ xlog_commit_record( } /* - * Push on the buffer cache code if we ever use more than 75% of the on-disk - * log space. This code pushes on the lsn which would supposedly free up - * the 25% which we want to leave free. We may need to adopt a policy which - * pushes on an lsn which is further along in the log once we reach the high - * water mark. In this manner, we would be creating a low water mark. + * Compute the LSN that we'd need to push the log tail towards in order to have + * (a) enough on-disk log space to log the number of bytes specified, (b) at + * least 25% of the log space free, and (c) at least 256 blocks free. If the + * log free space already meets all three thresholds, this function returns + * NULLCOMMITLSN. */ -STATIC void -xlog_grant_push_ail( +xfs_lsn_t +xlog_grant_push_threshold( struct xlog *log, int need_bytes) { @@ -1508,7 +1508,7 @@ xlog_grant_push_ail( free_threshold = max(free_threshold, (log->l_logBBsize >> 2)); free_threshold = max(free_threshold, 256); if (free_blocks >= free_threshold) - return; + return NULLCOMMITLSN; xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle, &threshold_block); @@ -1528,13 +1528,33 @@ xlog_grant_push_ail( if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0) threshold_lsn = last_sync_lsn; + return threshold_lsn; +} + +/* + * Push the tail of the log if we need to do so to maintain the free log space + * thresholds set out by xlog_grant_push_threshold. 
We may need to adopt a + * policy which pushes on an lsn which is further along in the log once we + * reach the high water mark. In this manner, we would be creating a low water + * mark. + */ +STATIC void +xlog_grant_push_ail( + struct xlog *log, + int need_bytes) +{ + xfs_lsn_t threshold_lsn; + + threshold_lsn = xlog_grant_push_threshold(log, need_bytes); + if (threshold_lsn == NULLCOMMITLSN || XLOG_FORCED_SHUTDOWN(log)) + return; + /* * Get the transaction layer to kick the dirty buffers out to * disk asynchronously. No point in trying to do this if * the filesystem is shutting down. */ - if (!XLOG_FORCED_SHUTDOWN(log)) - xfs_ail_push(log->l_ailp, threshold_lsn); + xfs_ail_push(log->l_ailp, threshold_lsn); } /* @@ -1604,9 +1624,7 @@ xlog_cksum( int i; int xheads; - xheads = size / XLOG_HEADER_CYCLE_SIZE; - if (size % XLOG_HEADER_CYCLE_SIZE) - xheads++; + xheads = DIV_ROUND_UP(size, XLOG_HEADER_CYCLE_SIZE); for (i = 1; i < xheads; i++) { crc = crc32c(crc, &xhdr[i].hic_xheader, diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 1412d6993f1e..58c3fcbec94a 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -141,4 +141,6 @@ void xfs_log_quiesce(struct xfs_mount *mp); bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t); bool xfs_log_in_recovery(struct xfs_mount *); +xfs_lsn_t xlog_grant_push_threshold(struct xlog *log, int need_bytes); + #endif /* __XFS_LOG_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index e2ec91b2d0f4..a8289adc1b29 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -265,32 +265,6 @@ xlog_header_check_mount( return 0; } -void -xlog_recover_iodone( - struct xfs_buf *bp) -{ - if (bp->b_error) { - /* - * We're not going to bother about retrying - * this during recovery. One strike! - */ - if (!XFS_FORCED_SHUTDOWN(bp->b_mount)) { - xfs_buf_ioerror_alert(bp, __this_address); - xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR); - } - } - - /* - * On v5 supers, a bli could be attached to update the metadata LSN. - * Clean it up. - */ - if (bp->b_log_item) - xfs_buf_item_relse(bp); - ASSERT(bp->b_log_item == NULL); - bp->b_flags &= ~_XBF_LOGRECOVERY; - xfs_buf_ioend_finish(bp); -} - /* * This routine finds (to an approximation) the first block in the physical * log which contains the given cycle. It uses a binary search algorithm. @@ -397,6 +371,19 @@ out: return error; } +static inline int +xlog_logrec_hblks(struct xlog *log, struct xlog_rec_header *rh) +{ + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + int h_size = be32_to_cpu(rh->h_size); + + if ((be32_to_cpu(rh->h_version) & XLOG_VERSION_2) && + h_size > XLOG_HEADER_CYCLE_SIZE) + return DIV_ROUND_UP(h_size, XLOG_HEADER_CYCLE_SIZE); + } + return 1; +} + /* * Potentially backup over partial log record write. * @@ -489,15 +476,7 @@ xlog_find_verify_log_record( * reset last_blk. Only when last_blk points in the middle of a log * record do we update last_blk. */ - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { - uint h_size = be32_to_cpu(head->h_size); - - xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE; - if (h_size % XLOG_HEADER_CYCLE_SIZE) - xhdrs++; - } else { - xhdrs = 1; - } + xhdrs = xlog_logrec_hblks(log, head); if (*last_blk - i + extra_bblks != BTOBB(be32_to_cpu(head->h_len)) + xhdrs) @@ -1184,22 +1163,7 @@ xlog_check_unmount_rec( * below. We won't want to clear the unmount record if there is one, so * we pass the lsn of the unmount record rather than the block after it. 
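
Several hunks in this file replace the same open-coded rounding (divide, then add one when there is a remainder) with DIV_ROUND_UP(), by way of the new xlog_logrec_hblks() helper. The two forms agree for the positive sizes involved; a quick self-contained check, with the macro spelled out since this sketch does not include kernel.h:

	#include <assert.h>

	#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

	static unsigned int open_coded(unsigned int size, unsigned int chunk)
	{
		unsigned int n = size / chunk;

		if (size % chunk)
			n++;
		return n;
	}

	int main(void)
	{
		unsigned int size;

		/* exercise an arbitrary 32k chunk size */
		for (size = 1; size <= 4 * 32768; size++)
			assert(DIV_ROUND_UP(size, 32768) == open_coded(size, 32768));
		return 0;
	}
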
*/ - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { - int h_size = be32_to_cpu(rhead->h_size); - int h_version = be32_to_cpu(rhead->h_version); - - if ((h_version & XLOG_VERSION_2) && - (h_size > XLOG_HEADER_CYCLE_SIZE)) { - hblks = h_size / XLOG_HEADER_CYCLE_SIZE; - if (h_size % XLOG_HEADER_CYCLE_SIZE) - hblks++; - } else { - hblks = 1; - } - } else { - hblks = 1; - } - + hblks = xlog_logrec_hblks(log, rhead); after_umount_blk = xlog_wrap_logbno(log, rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len))); @@ -2097,7 +2061,7 @@ xlog_recover_add_to_cont_trans( old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; old_len = item->ri_buf[item->ri_cnt-1].i_len; - ptr = kmem_realloc(old_ptr, len + old_len, 0); + ptr = krealloc(old_ptr, len + old_len, GFP_KERNEL | __GFP_NOFAIL); memcpy(&ptr[old_len], dp, len); item->ri_buf[item->ri_cnt-1].i_len += len; item->ri_buf[item->ri_cnt-1].i_addr = ptr; @@ -2470,44 +2434,66 @@ xlog_recover_process_data( /* Take all the collected deferred ops and finish them in order. */ static int xlog_finish_defer_ops( - struct xfs_trans *parent_tp) + struct xfs_mount *mp, + struct list_head *capture_list) { - struct xfs_mount *mp = parent_tp->t_mountp; + struct xfs_defer_capture *dfc, *next; struct xfs_trans *tp; - int64_t freeblks; - uint resblks; - int error; + struct xfs_inode *ip; + int error = 0; - /* - * We're finishing the defer_ops that accumulated as a result of - * recovering unfinished intent items during log recovery. We - * reserve an itruncate transaction because it is the largest - * permanent transaction type. Since we're the only user of the fs - * right now, take 93% (15/16) of the available free blocks. Use - * weird math to avoid a 64-bit division. - */ - freeblks = percpu_counter_sum(&mp->m_fdblocks); - if (freeblks <= 0) - return -ENOSPC; - resblks = min_t(int64_t, UINT_MAX, freeblks); - resblks = (resblks * 15) >> 4; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks, - 0, XFS_TRANS_RESERVE, &tp); - if (error) - return error; - /* transfer all collected dfops to this transaction */ - xfs_defer_move(tp, parent_tp); + list_for_each_entry_safe(dfc, next, capture_list, dfc_list) { + struct xfs_trans_res resv; + + /* + * Create a new transaction reservation from the captured + * information. Set logcount to 1 to force the new transaction + * to regrant every roll so that we can make forward progress + * in recovery no matter how full the log might be. + */ + resv.tr_logres = dfc->dfc_logres; + resv.tr_logcount = 1; + resv.tr_logflags = XFS_TRANS_PERM_LOG_RES; + + error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres, + dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp); + if (error) + return error; + + /* + * Transfer to this new transaction all the dfops we captured + * from recovering a single intent item. + */ + list_del_init(&dfc->dfc_list); + xfs_defer_ops_continue(dfc, tp, &ip); + + error = xfs_trans_commit(tp); + if (ip) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_irele(ip); + } + if (error) + return error; + } - return xfs_trans_commit(tp); + ASSERT(list_empty(capture_list)); + return 0; } -/* Is this log item a deferred action intent? */ -static inline bool xlog_item_is_intent(struct xfs_log_item *lip) +/* Release all the captured defer ops and capture structures in this list. 
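
The finish path above (and the abort path that follows) drains the capture list with list_for_each_entry_safe() plus list_del_init(), because each entry is consumed and freed mid-walk. Reduced to a self-contained userspace analog, with a hypothetical singly linked "capture" node standing in for struct xfs_defer_capture:

	#include <stdlib.h>

	struct capture {
		struct capture *next;
		int payload;
	};

	/* finish every captured item in order; stop at the first failure */
	static int finish_all(struct capture **list, int (*process)(int))
	{
		struct capture *c, *next;
		int error;

		for (c = *list; c; c = next) {
			next = c->next;		/* the "_safe" part: save the link */
			*list = next;		/* unlink, like list_del_init() */
			error = process(c->payload);
			free(c);
			if (error)
				return error;	/* caller falls back to abort_all() */
		}
		return 0;
	}

	/* error path: just unlink and release whatever is left */
	static void abort_all(struct capture **list)
	{
		struct capture *c, *next;

		for (c = *list; c; c = next) {
			next = c->next;
			free(c);
		}
		*list = NULL;
	}
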
*/ +static void +xlog_abort_defer_ops( + struct xfs_mount *mp, + struct list_head *capture_list) { - return lip->li_ops->iop_recover != NULL && - lip->li_ops->iop_match != NULL; -} + struct xfs_defer_capture *dfc; + struct xfs_defer_capture *next; + list_for_each_entry_safe(dfc, next, capture_list, dfc_list) { + list_del_init(&dfc->dfc_list); + xfs_defer_ops_release(mp, dfc); + } +} /* * When this is called, all of the log intent items which did not have * corresponding log done items should be in the AIL. What we do now @@ -2528,35 +2514,23 @@ STATIC int xlog_recover_process_intents( struct xlog *log) { - struct xfs_trans *parent_tp; + LIST_HEAD(capture_list); struct xfs_ail_cursor cur; struct xfs_log_item *lip; struct xfs_ail *ailp; - int error; + int error = 0; #if defined(DEBUG) || defined(XFS_WARN) xfs_lsn_t last_lsn; #endif - /* - * The intent recovery handlers commit transactions to complete recovery - * for individual intents, but any new deferred operations that are - * queued during that process are held off until the very end. The - * purpose of this transaction is to serve as a container for deferred - * operations. Each intent recovery handler must transfer dfops here - * before its local transaction commits, and we'll finish the entire - * list below. - */ - error = xfs_trans_alloc_empty(log->l_mp, &parent_tp); - if (error) - return error; - ailp = log->l_ailp; spin_lock(&ailp->ail_lock); - lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); #if defined(DEBUG) || defined(XFS_WARN) last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); #endif - while (lip != NULL) { + for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); + lip != NULL; + lip = xfs_trans_ail_cursor_next(ailp, &cur)) { /* * We're done when we see something other than an intent. * There should be no intents left in the AIL now. @@ -2578,26 +2552,29 @@ xlog_recover_process_intents( /* * NOTE: If your intent processing routine can create more - * deferred ops, you /must/ attach them to the transaction in - * this routine or else those subsequent intents will get + * deferred ops, you /must/ attach them to the capture list in + * the recover routine or else those subsequent intents will be * replayed in the wrong order! */ - if (!test_and_set_bit(XFS_LI_RECOVERED, &lip->li_flags)) { - spin_unlock(&ailp->ail_lock); - error = lip->li_ops->iop_recover(lip, parent_tp); - spin_lock(&ailp->ail_lock); - } + spin_unlock(&ailp->ail_lock); + error = lip->li_ops->iop_recover(lip, &capture_list); + spin_lock(&ailp->ail_lock); if (error) - goto out; - lip = xfs_trans_ail_cursor_next(ailp, &cur); + break; } -out: + xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->ail_lock); - if (!error) - error = xlog_finish_defer_ops(parent_tp); - xfs_trans_cancel(parent_tp); + if (error) + goto err; + error = xlog_finish_defer_ops(log->l_mp, &capture_list); + if (error) + goto err; + + return 0; +err: + xlog_abort_defer_ops(log->l_mp, &capture_list); return error; } @@ -2904,7 +2881,8 @@ STATIC int xlog_valid_rec_header( struct xlog *log, struct xlog_rec_header *rhead, - xfs_daddr_t blkno) + xfs_daddr_t blkno, + int bufsize) { int hlen; @@ -2920,10 +2898,14 @@ xlog_valid_rec_header( return -EFSCORRUPTED; } - /* LR body must have data or it wouldn't have been written */ + /* + * LR body must have data (or it wouldn't have been written) + * and h_len must not be greater than LR buffer size. 
+ */ hlen = be32_to_cpu(rhead->h_len); - if (XFS_IS_CORRUPT(log->l_mp, hlen <= 0 || hlen > INT_MAX)) + if (XFS_IS_CORRUPT(log->l_mp, hlen <= 0 || hlen > bufsize)) return -EFSCORRUPTED; + if (XFS_IS_CORRUPT(log->l_mp, blkno > log->l_logBBsize || blkno > INT_MAX)) return -EFSCORRUPTED; @@ -2984,9 +2966,6 @@ xlog_do_recovery_pass( goto bread_err1; rhead = (xlog_rec_header_t *)offset; - error = xlog_valid_rec_header(log, rhead, tail_blk); - if (error) - goto bread_err1; /* * xfsprogs has a bug where record length is based on lsunit but @@ -3001,30 +2980,22 @@ xlog_do_recovery_pass( */ h_size = be32_to_cpu(rhead->h_size); h_len = be32_to_cpu(rhead->h_len); - if (h_len > h_size) { - if (h_len <= log->l_mp->m_logbsize && - be32_to_cpu(rhead->h_num_logops) == 1) { - xfs_warn(log->l_mp, + if (h_len > h_size && h_len <= log->l_mp->m_logbsize && + rhead->h_num_logops == cpu_to_be32(1)) { + xfs_warn(log->l_mp, "invalid iclog size (%d bytes), using lsunit (%d bytes)", - h_size, log->l_mp->m_logbsize); - h_size = log->l_mp->m_logbsize; - } else { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, - log->l_mp); - error = -EFSCORRUPTED; - goto bread_err1; - } + h_size, log->l_mp->m_logbsize); + h_size = log->l_mp->m_logbsize; } - if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) && - (h_size > XLOG_HEADER_CYCLE_SIZE)) { - hblks = h_size / XLOG_HEADER_CYCLE_SIZE; - if (h_size % XLOG_HEADER_CYCLE_SIZE) - hblks++; + error = xlog_valid_rec_header(log, rhead, tail_blk, h_size); + if (error) + goto bread_err1; + + hblks = xlog_logrec_hblks(log, rhead); + if (hblks != 1) { kmem_free(hbp); hbp = xlog_alloc_buffer(log, hblks); - } else { - hblks = 1; } } else { ASSERT(log->l_sectBBsize == 1); @@ -3096,7 +3067,7 @@ xlog_do_recovery_pass( } rhead = (xlog_rec_header_t *)offset; error = xlog_valid_rec_header(log, rhead, - split_hblks ? blk_no : 0); + split_hblks ? blk_no : 0, h_size); if (error) goto bread_err2; @@ -3177,7 +3148,7 @@ xlog_do_recovery_pass( goto bread_err2; rhead = (xlog_rec_header_t *)offset; - error = xlog_valid_rec_header(log, rhead, blk_no); + error = xlog_valid_rec_header(log, rhead, blk_no, h_size); if (error) goto bread_err2; @@ -3294,14 +3265,14 @@ xlog_do_log_recovery( */ STATIC int xlog_do_recover( - struct xlog *log, - xfs_daddr_t head_blk, - xfs_daddr_t tail_blk) + struct xlog *log, + xfs_daddr_t head_blk, + xfs_daddr_t tail_blk) { - struct xfs_mount *mp = log->l_mp; - int error; - xfs_buf_t *bp; - xfs_sb_t *sbp; + struct xfs_mount *mp = log->l_mp; + struct xfs_buf *bp = mp->m_sb_bp; + struct xfs_sb *sbp = &mp->m_sb; + int error; trace_xfs_log_recover(log, head_blk, tail_blk); @@ -3315,9 +3286,8 @@ xlog_do_recover( /* * If IO errors happened during recovery, bail out. */ - if (XFS_FORCED_SHUTDOWN(mp)) { + if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - } /* * We now update the tail_lsn since much of the recovery has completed @@ -3331,16 +3301,12 @@ xlog_do_recover( xlog_assign_tail_lsn(mp); /* - * Now that we've finished replaying all buffer and inode - * updates, re-read in the superblock and reverify it. + * Now that we've finished replaying all buffer and inode updates, + * re-read the superblock and reverify it. 
*/ - bp = xfs_getsb(mp); - bp->b_flags &= ~(XBF_DONE | XBF_ASYNC); - ASSERT(!(bp->b_flags & XBF_WRITE)); - bp->b_flags |= XBF_READ; - bp->b_ops = &xfs_sb_buf_ops; - - error = xfs_buf_submit(bp); + xfs_buf_lock(bp); + xfs_buf_hold(bp); + error = _xfs_buf_read(bp, XBF_READ); if (error) { if (!XFS_FORCED_SHUTDOWN(mp)) { xfs_buf_ioerror_alert(bp, __this_address); @@ -3351,7 +3317,6 @@ xlog_do_recover( } /* Convert superblock from on-disk format */ - sbp = &mp->m_sb; xfs_sb_from_disk(sbp, bp->b_addr); xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index c8ae49a1e99c..150ee5cb8645 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -80,9 +80,9 @@ xfs_uuid_mount( } if (hole < 0) { - xfs_uuid_table = kmem_realloc(xfs_uuid_table, + xfs_uuid_table = krealloc(xfs_uuid_table, (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table), - 0); + GFP_KERNEL | __GFP_NOFAIL); hole = xfs_uuid_table_size++; } xfs_uuid_table[hole] = *uuid; @@ -1059,11 +1059,12 @@ xfs_unmountfs( * We can potentially deadlock here if we have an inode cluster * that has been freed has its buffer still pinned in memory because * the transaction is still sitting in a iclog. The stale inodes - * on that buffer will have their flush locks held until the - * transaction hits the disk and the callbacks run. the inode - * flush takes the flush lock unconditionally and with nothing to - * push out the iclog we will never get that unlocked. hence we - * need to force the log first. + * on that buffer will be pinned to the buffer until the + * transaction hits the disk and the callbacks run. Pushing the AIL will + * skip the stale inodes and may never see the pinned buffer, so + * nothing will push out the iclog and unpin the buffer. Hence we + * need to force the log here to ensure all items are flushed into the + * AIL before we go any further. */ xfs_log_force(mp, XFS_LOG_SYNC); @@ -1289,23 +1290,6 @@ xfs_mod_frextents( } /* - * xfs_getsb() is called to obtain the buffer for the superblock. - * The buffer is returned locked and read in from disk. - * The buffer should be released with a call to xfs_brelse(). - */ -struct xfs_buf * -xfs_getsb( - struct xfs_mount *mp) -{ - struct xfs_buf *bp = mp->m_sb_bp; - - xfs_buf_lock(bp); - xfs_buf_hold(bp); - ASSERT(bp->b_flags & XBF_DONE); - return bp; -} - -/* * Used to free the superblock along various error paths. 
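
The xfs_uuid_table hunk near the top of this file's diff is one of several converting kmem_realloc() to plain krealloc(); with __GFP_NOFAIL the call never returns NULL, so the grow-by-one logic needs no error branch. A userspace analog of the same growth step has to keep the check, since realloc() can fail:

	#include <stdlib.h>

	struct uuid16 { unsigned char b[16]; };	/* stand-in for uuid_t */

	static struct uuid16 *table;
	static int table_size;

	/* append *uuid in a freshly grown slot; returns its index or -1 on ENOMEM */
	static int table_add(const struct uuid16 *uuid)
	{
		struct uuid16 *tmp;

		tmp = realloc(table, (table_size + 1) * sizeof(*table));
		if (!tmp)
			return -1;
		table = tmp;
		table[table_size] = *uuid;
		return table_size++;
	}

The kernel version also reuses zeroed holes before growing; only the growth step is sketched here.
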
*/ void diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index a72cfcaa4ad1..dfa429b77ee2 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -410,7 +410,6 @@ extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved); extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta); -extern struct xfs_buf *xfs_getsb(xfs_mount_t *); extern int xfs_readsb(xfs_mount_t *, int); extern void xfs_freesb(xfs_mount_t *); extern bool xfs_fs_writable(struct xfs_mount *mp, int level); diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 5f04d8a5ab2a..0aa87c210104 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -15,6 +15,10 @@ "XFS: offsetof(" #structname ", " #member ") is wrong, " \ "expected " #off) +#define XFS_CHECK_VALUE(value, expected) \ + BUILD_BUG_ON_MSG((value) != (expected), \ + "XFS: value of " #value " is wrong, expected " #expected) + static inline void __init xfs_check_ondisk_structs(void) { @@ -23,7 +27,7 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_acl_entry, 12); XFS_CHECK_STRUCT_SIZE(struct xfs_agf, 224); XFS_CHECK_STRUCT_SIZE(struct xfs_agfl, 36); - XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 336); + XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 344); XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key, 8); XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block, 4); @@ -41,7 +45,8 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_rec, 12); XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_key, 20); XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_rec, 24); - XFS_CHECK_STRUCT_SIZE(struct xfs_timestamp, 8); + XFS_CHECK_STRUCT_SIZE(xfs_timestamp_t, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_legacy_timestamp, 8); XFS_CHECK_STRUCT_SIZE(xfs_alloc_key_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_alloc_ptr_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_alloc_rec_t, 8); @@ -84,12 +89,12 @@ xfs_check_ondisk_structs(void) XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8); XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9); XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40); - XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.totsize, 0); - XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.count, 2); - XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].namelen, 4); - XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].valuelen, 5); - XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].flags, 6); - XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].nameval, 7); + XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.totsize, 0); + XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.count, 2); + XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].namelen, 4); + XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].valuelen, 5); + XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].flags, 6); + XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].nameval, 7); XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12); XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8); @@ -121,7 +126,8 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_extent_64, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_log_dinode, 176); XFS_CHECK_STRUCT_SIZE(struct xfs_icreate_log, 28); - XFS_CHECK_STRUCT_SIZE(struct xfs_ictimestamp, 8); + XFS_CHECK_STRUCT_SIZE(xfs_ictimestamp_t, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_legacy_ictimestamp, 8); XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_32, 52); XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56); XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20); @@ -152,6 +158,20 @@ xfs_check_ondisk_structs(void) 
XFS_CHECK_STRUCT_SIZE(struct xfs_inumbers, 24); XFS_CHECK_STRUCT_SIZE(struct xfs_bulkstat_req, 64); XFS_CHECK_STRUCT_SIZE(struct xfs_inumbers_req, 64); + + /* + * Make sure the incore inode timestamp range corresponds to hand + * converted values based on the ondisk format specification. + */ + XFS_CHECK_VALUE(XFS_BIGTIME_TIME_MIN - XFS_BIGTIME_EPOCH_OFFSET, + XFS_LEGACY_TIME_MIN); + XFS_CHECK_VALUE(XFS_BIGTIME_TIME_MAX - XFS_BIGTIME_EPOCH_OFFSET, + 16299260424LL); + + /* Do the same with the incore quota expiration range. */ + XFS_CHECK_VALUE(XFS_DQ_BIGTIME_EXPIRY_MIN << XFS_DQ_BIGTIME_SHIFT, 4); + XFS_CHECK_VALUE(XFS_DQ_BIGTIME_EXPIRY_MAX << XFS_DQ_BIGTIME_SHIFT, + 16299260424LL); } #endif /* __XFS_ONDISK_H */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index be67570badf8..b2a9abee8b2b 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -249,7 +249,6 @@ xfs_qm_unmount_quotas( STATIC int xfs_qm_dqattach_one( struct xfs_inode *ip, - xfs_dqid_t id, xfs_dqtype_t type, bool doalloc, struct xfs_dquot **IO_idqpp) @@ -330,23 +329,23 @@ xfs_qm_dqattach_locked( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) { - error = xfs_qm_dqattach_one(ip, i_uid_read(VFS_I(ip)), - XFS_DQTYPE_USER, doalloc, &ip->i_udquot); + error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_USER, + doalloc, &ip->i_udquot); if (error) goto done; ASSERT(ip->i_udquot); } if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) { - error = xfs_qm_dqattach_one(ip, i_gid_read(VFS_I(ip)), - XFS_DQTYPE_GROUP, doalloc, &ip->i_gdquot); + error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_GROUP, + doalloc, &ip->i_gdquot); if (error) goto done; ASSERT(ip->i_gdquot); } if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) { - error = xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQTYPE_PROJ, + error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_PROJ, doalloc, &ip->i_pdquot); if (error) goto done; @@ -661,6 +660,17 @@ xfs_qm_init_quotainfo( /* Precalc some constants */ qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); qinf->qi_dqperchunk = xfs_calc_dquots_per_chunk(qinf->qi_dqchunklen); + if (xfs_sb_version_hasbigtime(&mp->m_sb)) { + qinf->qi_expiry_min = + xfs_dq_bigtime_to_unix(XFS_DQ_BIGTIME_EXPIRY_MIN); + qinf->qi_expiry_max = + xfs_dq_bigtime_to_unix(XFS_DQ_BIGTIME_EXPIRY_MAX); + } else { + qinf->qi_expiry_min = XFS_DQ_LEGACY_EXPIRY_MIN; + qinf->qi_expiry_max = XFS_DQ_LEGACY_EXPIRY_MAX; + } + trace_xfs_quota_expiry_range(mp, qinf->qi_expiry_min, + qinf->qi_expiry_max); mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD); @@ -879,6 +889,8 @@ xfs_qm_reset_dqcounts( ddq->d_bwarns = 0; ddq->d_iwarns = 0; ddq->d_rtbwarns = 0; + if (xfs_sb_version_hasbigtime(&mp->m_sb)) + ddq->d_type |= XFS_DQTYPE_BIGTIME; } if (xfs_sb_version_hascrc(&mp->m_sb)) { @@ -1650,6 +1662,7 @@ xfs_qm_vop_dqalloc( } if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) { + ASSERT(O_udqpp); if (!uid_eq(inode->i_uid, uid)) { /* * What we need is the dquot that has this uid, and @@ -1683,6 +1696,7 @@ xfs_qm_vop_dqalloc( } } if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { + ASSERT(O_gdqpp); if (!gid_eq(inode->i_gid, gid)) { xfs_iunlock(ip, lockflags); error = xfs_qm_dqget(mp, from_kgid(user_ns, gid), @@ -1700,9 +1714,10 @@ xfs_qm_vop_dqalloc( } } if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { + ASSERT(O_pdqpp); if (ip->i_d.di_projid != prid) { xfs_iunlock(ip, lockflags); - error = xfs_qm_dqget(mp, (xfs_dqid_t)prid, + error = xfs_qm_dqget(mp, prid, XFS_DQTYPE_PROJ, true, &pq); if (error) { ASSERT(error != -ENOENT); diff --git 
a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 9c078c35d924..e3dabab44097 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -65,6 +65,10 @@ struct xfs_quotainfo { struct xfs_def_quota qi_grp_default; struct xfs_def_quota qi_prj_default; struct shrinker qi_shrinker; + + /* Minimum and maximum quota expiration timestamp values. */ + time64_t qi_expiry_min; + time64_t qi_expiry_max; }; static inline struct radix_tree_root * diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 1c542b4a5220..ca1b57d291dc 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -479,13 +479,19 @@ xfs_setqlim_warns( static inline void xfs_setqlim_timer( + struct xfs_mount *mp, struct xfs_dquot_res *res, struct xfs_quota_limits *qlim, s64 timer) { - res->timer = timer; - if (qlim) - qlim->time = timer; + if (qlim) { + /* Set the length of the default grace period. */ + res->timer = xfs_dquot_set_grace_period(timer); + qlim->time = res->timer; + } else { + /* Set the grace period expiration on a quota. */ + res->timer = xfs_dquot_set_timeout(mp, timer); + } } /* @@ -574,7 +580,7 @@ xfs_qm_scall_setqlim( if (newlim->d_fieldmask & QC_SPC_WARNS) xfs_setqlim_warns(res, qlim, newlim->d_spc_warns); if (newlim->d_fieldmask & QC_SPC_TIMER) - xfs_setqlim_timer(res, qlim, newlim->d_spc_timer); + xfs_setqlim_timer(mp, res, qlim, newlim->d_spc_timer); /* Blocks on the realtime device. */ hard = (newlim->d_fieldmask & QC_RT_SPC_HARD) ? @@ -590,7 +596,7 @@ xfs_qm_scall_setqlim( if (newlim->d_fieldmask & QC_RT_SPC_WARNS) xfs_setqlim_warns(res, qlim, newlim->d_rt_spc_warns); if (newlim->d_fieldmask & QC_RT_SPC_TIMER) - xfs_setqlim_timer(res, qlim, newlim->d_rt_spc_timer); + xfs_setqlim_timer(mp, res, qlim, newlim->d_rt_spc_timer); /* Inodes */ hard = (newlim->d_fieldmask & QC_INO_HARD) ? 
@@ -606,7 +612,7 @@ xfs_qm_scall_setqlim( if (newlim->d_fieldmask & QC_INO_WARNS) xfs_setqlim_warns(res, qlim, newlim->d_ino_warns); if (newlim->d_fieldmask & QC_INO_TIMER) - xfs_setqlim_timer(res, qlim, newlim->d_ino_timer); + xfs_setqlim_timer(mp, res, qlim, newlim->d_ino_timer); if (id != 0) { /* diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 06b22e35fc90..5a62398940d0 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -108,8 +108,6 @@ extern void xfs_qm_mount_quotas(struct xfs_mount *); extern void xfs_qm_unmount(struct xfs_mount *); extern void xfs_qm_unmount_quotas(struct xfs_mount *); -void xfs_dquot_done(struct xfs_buf *); - #else static inline int xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid, @@ -151,12 +149,6 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, #define xfs_qm_mount_quotas(mp) #define xfs_qm_unmount(mp) #define xfs_qm_unmount_quotas(mp) - -static inline void xfs_dquot_done(struct xfs_buf *bp) -{ - return; -} - #endif /* CONFIG_XFS_QUOTA */ #define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \ diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index ca93b6488377..7529eb63ce94 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -424,7 +424,7 @@ const struct xfs_defer_op_type xfs_refcount_update_defer_type = { STATIC int xfs_cui_item_recover( struct xfs_log_item *lip, - struct xfs_trans *parent_tp) + struct list_head *capture_list) { struct xfs_bmbt_irec irec; struct xfs_cui_log_item *cuip = CUI_ITEM(lip); @@ -432,7 +432,7 @@ xfs_cui_item_recover( struct xfs_cud_log_item *cudp; struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; - struct xfs_mount *mp = parent_tp->t_mountp; + struct xfs_mount *mp = lip->li_mountp; xfs_fsblock_t startblock_fsb; xfs_fsblock_t new_fsb; xfs_extlen_t new_len; @@ -467,14 +467,8 @@ xfs_cui_item_recover( refc->pe_len == 0 || startblock_fsb >= mp->m_sb.sb_dblocks || refc->pe_len >= mp->m_sb.sb_agblocks || - (refc->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS)) { - /* - * This will pull the CUI from the AIL and - * free the memory associated with it. - */ - xfs_cui_release(cuip); + (refc->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS)) return -EFSCORRUPTED; - } } /* @@ -493,12 +487,7 @@ xfs_cui_item_recover( mp->m_refc_maxlevels * 2, 0, XFS_TRANS_RESERVE, &tp); if (error) return error; - /* - * Recovery stashes all deferred ops during intent processing and - * finishes them on completion. Transfer current dfops state to this - * transaction and transfer the result back before we return. - */ - xfs_defer_move(tp, parent_tp); + cudp = xfs_trans_get_cud(tp, cuip); for (i = 0; i < cuip->cui_format.cui_nextents; i++) { @@ -555,13 +544,10 @@ xfs_cui_item_recover( } xfs_refcount_finish_one_cleanup(tp, rcur, error); - xfs_defer_move(parent_tp, tp); - error = xfs_trans_commit(tp); - return error; + return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list); abort_error: xfs_refcount_finish_one_cleanup(tp, rcur, error); - xfs_defer_move(parent_tp, tp); xfs_trans_cancel(tp); return error; } @@ -574,6 +560,32 @@ xfs_cui_item_match( return CUI_ITEM(lip)->cui_format.cui_id == intent_id; } +/* Relog an intent item to push the log tail forward. 
*/ +static struct xfs_log_item * +xfs_cui_item_relog( + struct xfs_log_item *intent, + struct xfs_trans *tp) +{ + struct xfs_cud_log_item *cudp; + struct xfs_cui_log_item *cuip; + struct xfs_phys_extent *extp; + unsigned int count; + + count = CUI_ITEM(intent)->cui_format.cui_nextents; + extp = CUI_ITEM(intent)->cui_format.cui_extents; + + tp->t_flags |= XFS_TRANS_DIRTY; + cudp = xfs_trans_get_cud(tp, CUI_ITEM(intent)); + set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags); + + cuip = xfs_cui_init(tp->t_mountp, count); + memcpy(cuip->cui_format.cui_extents, extp, count * sizeof(*extp)); + atomic_set(&cuip->cui_next_extent, count); + xfs_trans_add_item(tp, &cuip->cui_item); + set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); + return &cuip->cui_item; +} + static const struct xfs_item_ops xfs_cui_item_ops = { .iop_size = xfs_cui_item_size, .iop_format = xfs_cui_item_format, @@ -581,6 +593,7 @@ static const struct xfs_item_ops xfs_cui_item_ops = { .iop_release = xfs_cui_item_release, .iop_recover = xfs_cui_item_recover, .iop_match = xfs_cui_item_match, + .iop_relog = xfs_cui_item_relog, }; /* diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index dc5b0753cd51..7adc996ca6e3 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -467,14 +467,14 @@ const struct xfs_defer_op_type xfs_rmap_update_defer_type = { STATIC int xfs_rui_item_recover( struct xfs_log_item *lip, - struct xfs_trans *parent_tp) + struct list_head *capture_list) { struct xfs_rui_log_item *ruip = RUI_ITEM(lip); struct xfs_map_extent *rmap; struct xfs_rud_log_item *rudp; struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; - struct xfs_mount *mp = parent_tp->t_mountp; + struct xfs_mount *mp = lip->li_mountp; xfs_fsblock_t startblock_fsb; enum xfs_rmap_intent_type type; xfs_exntst_t state; @@ -511,14 +511,8 @@ xfs_rui_item_recover( rmap->me_len == 0 || startblock_fsb >= mp->m_sb.sb_dblocks || rmap->me_len >= mp->m_sb.sb_agblocks || - (rmap->me_flags & ~XFS_RMAP_EXTENT_FLAGS)) { - /* - * This will pull the RUI from the AIL and - * free the memory associated with it. - */ - xfs_rui_release(ruip); + (rmap->me_flags & ~XFS_RMAP_EXTENT_FLAGS)) return -EFSCORRUPTED; - } } error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, @@ -573,8 +567,7 @@ xfs_rui_item_recover( } xfs_rmap_finish_one_cleanup(tp, rcur, error); - error = xfs_trans_commit(tp); - return error; + return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list); abort_error: xfs_rmap_finish_one_cleanup(tp, rcur, error); @@ -590,6 +583,32 @@ xfs_rui_item_match( return RUI_ITEM(lip)->rui_format.rui_id == intent_id; } +/* Relog an intent item to push the log tail forward. 
*/ +static struct xfs_log_item * +xfs_rui_item_relog( + struct xfs_log_item *intent, + struct xfs_trans *tp) +{ + struct xfs_rud_log_item *rudp; + struct xfs_rui_log_item *ruip; + struct xfs_map_extent *extp; + unsigned int count; + + count = RUI_ITEM(intent)->rui_format.rui_nextents; + extp = RUI_ITEM(intent)->rui_format.rui_extents; + + tp->t_flags |= XFS_TRANS_DIRTY; + rudp = xfs_trans_get_rud(tp, RUI_ITEM(intent)); + set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags); + + ruip = xfs_rui_init(tp->t_mountp, count); + memcpy(ruip->rui_format.rui_extents, extp, count * sizeof(*extp)); + atomic_set(&ruip->rui_next_extent, count); + xfs_trans_add_item(tp, &ruip->rui_item); + set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); + return &ruip->rui_item; +} + static const struct xfs_item_ops xfs_rui_item_ops = { .iop_size = xfs_rui_item_size, .iop_format = xfs_rui_item_format, @@ -597,6 +616,7 @@ static const struct xfs_item_ops xfs_rui_item_ops = { .iop_release = xfs_rui_item_release, .iop_recover = xfs_rui_item_recover, .iop_match = xfs_rui_item_match, + .iop_relog = xfs_rui_item_relog, }; /* diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 6209e7b6b895..ede1baf31413 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -18,7 +18,7 @@ #include "xfs_trans_space.h" #include "xfs_icache.h" #include "xfs_rtalloc.h" - +#include "xfs_sb.h" /* * Read and return the summary information for a given extent size, @@ -247,6 +247,9 @@ xfs_rtallocate_extent_block( end = XFS_BLOCKTOBIT(mp, bbno + 1) - 1; i <= end; i++) { + /* Make sure we don't scan off the end of the rt volume. */ + maxlen = min(mp->m_sb.sb_rextents, i + maxlen) - i; + /* * See if there's a free extent of maxlen starting at i. * If it's not so then next will contain the first non-free. @@ -442,6 +445,14 @@ xfs_rtallocate_extent_near( */ if (bno >= mp->m_sb.sb_rextents) bno = mp->m_sb.sb_rextents - 1; + + /* Make sure we don't run off the end of the rt volume. */ + maxlen = min(mp->m_sb.sb_rextents, bno + maxlen) - bno; + if (maxlen < minlen) { + *rtblock = NULLRTBLOCK; + return 0; + } + /* * Try the exact allocation first. */ @@ -767,8 +778,14 @@ xfs_growfs_rt_alloc( struct xfs_bmbt_irec map; /* block map output */ int nmap; /* number of block maps */ int resblks; /* space reservation */ + enum xfs_blft buf_type; struct xfs_trans *tp; + if (ip == mp->m_rsumip) + buf_type = XFS_BLFT_RTSUMMARY_BUF; + else + buf_type = XFS_BLFT_RTBITMAP_BUF; + /* * Allocate space to the file, as necessary. */ @@ -830,6 +847,9 @@ xfs_growfs_rt_alloc( mp->m_bsize, 0, &bp); if (error) goto out_trans_cancel; + + xfs_trans_buf_set_type(tp, bp, buf_type); + bp->b_ops = &xfs_rtbuf_ops; memset(bp->b_addr, 0, mp->m_sb.sb_blocksize); xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); /* @@ -862,7 +882,7 @@ xfs_alloc_rsum_cache( * lower bound on the minimum level with any free extents. We can * continue without the cache if it couldn't be allocated. */ - mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, 0); + mp->m_rsum_cache = kvzalloc(rbmblocks, GFP_KERNEL); if (!mp->m_rsum_cache) xfs_warn(mp, "could not allocate realtime summary cache"); } @@ -1004,23 +1024,29 @@ xfs_growfs_rt( /* * Lock out other callers by grabbing the bitmap inode lock. */ - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP); xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); /* - * Update the bitmap inode's size. + * Update the bitmap inode's size ondisk and incore. 
We need + * to update the incore size so that inode inactivation won't + * punch what it thinks are "posteof" blocks. */ mp->m_rbmip->i_d.di_size = nsbp->sb_rbmblocks * nsbp->sb_blocksize; + i_size_write(VFS_I(mp->m_rbmip), mp->m_rbmip->i_d.di_size); xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); /* * Get the summary inode into the transaction. */ - xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL); + xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM); xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL); /* - * Update the summary inode's size. + * Update the summary inode's size. We need to update the + * incore size so that inode inactivation won't punch what it + * thinks are "posteof" blocks. */ mp->m_rsumip->i_d.di_size = nmp->m_rsumsize; + i_size_write(VFS_I(mp->m_rsumip), mp->m_rsumip->i_d.di_size); xfs_trans_log_inode(tp, mp->m_rsumip, XFS_ILOG_CORE); /* * Copy summary data from old to new sizes. @@ -1076,7 +1102,13 @@ error_cancel: if (error) break; } + if (error) + goto out_free; + + /* Update secondary superblocks now the physical grow has completed */ + error = xfs_update_secondary_sbs(mp); +out_free: /* * Free the fake mp structure. */ diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index f70f1255220b..20e0534a772c 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -23,6 +23,7 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) uint64_t xs_xstrat_bytes = 0; uint64_t xs_write_bytes = 0; uint64_t xs_read_bytes = 0; + uint64_t defer_relog = 0; static const struct xstats_entry { char *desc; @@ -70,10 +71,13 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) xs_xstrat_bytes += per_cpu_ptr(stats, i)->s.xs_xstrat_bytes; xs_write_bytes += per_cpu_ptr(stats, i)->s.xs_write_bytes; xs_read_bytes += per_cpu_ptr(stats, i)->s.xs_read_bytes; + defer_relog += per_cpu_ptr(stats, i)->s.defer_relog; } len += scnprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n", xs_xstrat_bytes, xs_write_bytes, xs_read_bytes); + len += scnprintf(buf + len, PATH_MAX-len, "defer_relog %llu\n", + defer_relog); len += scnprintf(buf + len, PATH_MAX-len, "debug %u\n", #if defined(DEBUG) 1); diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index 34d704f703d2..43ffba74f045 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -137,6 +137,7 @@ struct __xfsstats { uint64_t xs_xstrat_bytes; uint64_t xs_write_bytes; uint64_t xs_read_bytes; + uint64_t defer_relog; }; #define xfsstats_offset(f) (offsetof(struct __xfsstats, f)/sizeof(uint32_t)) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 71ac6c1cdc36..d1b5f2d2a245 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -654,11 +654,11 @@ xfs_fs_destroy_inode( ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM)); /* - * We always use background reclaim here because even if the - * inode is clean, it still may be under IO and hence we have - * to take the flush lock. The background reclaim path handles - * this more efficiently than we can here, so simply let background - * reclaim tear down all inodes. + * We always use background reclaim here because even if the inode is + * clean, it still may be under IO and hence we have wait for IO + * completion to occur before we can reclaim the inode. The background + * reclaim path handles this more efficiently than we can here, so + * simply let background reclaim tear down all inodes. 
*/ xfs_inode_set_reclaim_tag(ip); } @@ -1234,25 +1234,12 @@ xfs_fc_parse_param( case Opt_nouuid: mp->m_flags |= XFS_MOUNT_NOUUID; return 0; - case Opt_ikeep: - mp->m_flags |= XFS_MOUNT_IKEEP; - return 0; - case Opt_noikeep: - mp->m_flags &= ~XFS_MOUNT_IKEEP; - return 0; case Opt_largeio: mp->m_flags |= XFS_MOUNT_LARGEIO; return 0; case Opt_nolargeio: mp->m_flags &= ~XFS_MOUNT_LARGEIO; return 0; - case Opt_attr2: - mp->m_flags |= XFS_MOUNT_ATTR2; - return 0; - case Opt_noattr2: - mp->m_flags &= ~XFS_MOUNT_ATTR2; - mp->m_flags |= XFS_MOUNT_NOATTR2; - return 0; case Opt_filestreams: mp->m_flags |= XFS_MOUNT_FILESTREAMS; return 0; @@ -1304,6 +1291,24 @@ xfs_fc_parse_param( xfs_mount_set_dax_mode(mp, result.uint_32); return 0; #endif + /* Following mount options will be removed in September 2025 */ + case Opt_ikeep: + xfs_warn(mp, "%s mount option is deprecated.", param->key); + mp->m_flags |= XFS_MOUNT_IKEEP; + return 0; + case Opt_noikeep: + xfs_warn(mp, "%s mount option is deprecated.", param->key); + mp->m_flags &= ~XFS_MOUNT_IKEEP; + return 0; + case Opt_attr2: + xfs_warn(mp, "%s mount option is deprecated.", param->key); + mp->m_flags |= XFS_MOUNT_ATTR2; + return 0; + case Opt_noattr2: + xfs_warn(mp, "%s mount option is deprecated.", param->key); + mp->m_flags &= ~XFS_MOUNT_ATTR2; + mp->m_flags |= XFS_MOUNT_NOATTR2; + return 0; default: xfs_warn(mp, "unknown mount option [%s].", param->key); return -EINVAL; @@ -1450,6 +1455,19 @@ xfs_fc_fill_super( if (error) goto out_free_sb; + /* V4 support is undergoing deprecation. */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) { +#ifdef CONFIG_XFS_SUPPORT_V4 + xfs_warn_once(mp, + "Deprecated V4 format (crc=0) will not be supported after September 2030."); +#else + xfs_warn(mp, + "Deprecated V4 format (crc=0) not supported by kernel."); + error = -EINVAL; + goto out_free_sb; +#endif + } + /* * XFS block mappings use 54 bits to store the logical block offset. * This should suffice to handle the maximum file size that the VFS @@ -1484,8 +1502,14 @@ xfs_fc_fill_super( sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_max_links = XFS_MAXLINK; sb->s_time_gran = 1; - sb->s_time_min = S32_MIN; - sb->s_time_max = S32_MAX; + if (xfs_sb_version_hasbigtime(&mp->m_sb)) { + sb->s_time_min = xfs_bigtime_to_unix(XFS_BIGTIME_TIME_MIN); + sb->s_time_max = xfs_bigtime_to_unix(XFS_BIGTIME_TIME_MAX); + } else { + sb->s_time_min = XFS_LEGACY_TIME_MIN; + sb->s_time_max = XFS_LEGACY_TIME_MAX; + } + trace_xfs_inode_timestamp_range(mp, sb->s_time_min, sb->s_time_max); sb->s_iflags |= SB_I_CGROUPWB; set_posix_acl_flag(sb); @@ -1494,6 +1518,10 @@ xfs_fc_fill_super( if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5) sb->s_flags |= SB_I_VERSION; + if (xfs_sb_version_hasbigtime(&mp->m_sb)) + xfs_warn(mp, + "EXPERIMENTAL big timestamp feature in use. Use at your own risk!"); + if (mp->m_flags & XFS_MOUNT_DAX_ALWAYS) { bool rtdev_is_dax = false, datadev_is_dax; @@ -1549,6 +1577,10 @@ xfs_fc_fill_super( goto out_filestream_unmount; } + if (xfs_sb_version_hasinobtcounts(&mp->m_sb)) + xfs_warn(mp, + "EXPERIMENTAL inode btree counters feature in use. 
Use at your own risk!"); + error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index 021ef96d0542..fac9de7ee6d0 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -50,13 +50,45 @@ xfs_panic_mask_proc_handler( } #endif /* CONFIG_PROC_FS */ +STATIC int +xfs_deprecate_irix_sgid_inherit_proc_handler( + struct ctl_table *ctl, + int write, + void *buffer, + size_t *lenp, + loff_t *ppos) +{ + if (write) { + printk_once(KERN_WARNING + "XFS: " "%s sysctl option is deprecated.\n", + ctl->procname); + } + return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); +} + +STATIC int +xfs_deprecate_irix_symlink_mode_proc_handler( + struct ctl_table *ctl, + int write, + void *buffer, + size_t *lenp, + loff_t *ppos) +{ + if (write) { + printk_once(KERN_WARNING + "XFS: " "%s sysctl option is deprecated.\n", + ctl->procname); + } + return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); +} + static struct ctl_table xfs_table[] = { { .procname = "irix_sgid_inherit", .data = &xfs_params.sgid_inherit.val, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = xfs_deprecate_irix_sgid_inherit_proc_handler, .extra1 = &xfs_params.sgid_inherit.min, .extra2 = &xfs_params.sgid_inherit.max }, @@ -65,7 +97,7 @@ static struct ctl_table xfs_table[] = { .data = &xfs_params.symlink_mode.val, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = xfs_deprecate_irix_symlink_mode_proc_handler, .extra1 = &xfs_params.symlink_mode.min, .extra2 = &xfs_params.symlink_mode.max }, diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index abb1d859f226..86951652d3ed 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -338,7 +338,7 @@ DEFINE_BUF_EVENT(xfs_buf_delwri_split); DEFINE_BUF_EVENT(xfs_buf_delwri_pushbuf); DEFINE_BUF_EVENT(xfs_buf_get_uncached); DEFINE_BUF_EVENT(xfs_buf_item_relse); -DEFINE_BUF_EVENT(xfs_buf_item_iodone_async); +DEFINE_BUF_EVENT(xfs_buf_iodone_async); DEFINE_BUF_EVENT(xfs_buf_error_relse); DEFINE_BUF_EVENT(xfs_buf_wait_buftarg); DEFINE_BUF_EVENT(xfs_trans_read_buf_shut); @@ -2533,6 +2533,7 @@ DEFINE_DEFER_PENDING_EVENT(xfs_defer_create_intent); DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent); #define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer); @@ -3676,7 +3677,6 @@ DEFINE_EVENT(xfs_kmem_class, name, \ DEFINE_KMEM_EVENT(kmem_alloc); DEFINE_KMEM_EVENT(kmem_alloc_io); DEFINE_KMEM_EVENT(kmem_alloc_large); -DEFINE_KMEM_EVENT(kmem_realloc); TRACE_EVENT(xfs_check_new_dalign, TP_PROTO(struct xfs_mount *mp, int new_dalign, xfs_ino_t calc_rootino), @@ -3844,6 +3844,32 @@ TRACE_EVENT(xfs_btree_bload_block, __entry->nr_records) ) +DECLARE_EVENT_CLASS(xfs_timestamp_range_class, + TP_PROTO(struct xfs_mount *mp, time64_t min, time64_t max), + TP_ARGS(mp, min, max), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(long long, min) + __field(long long, max) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->min = min; + __entry->max = max; + ), + TP_printk("dev %d:%d min %lld max %lld", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->min, + __entry->max) +) + +#define DEFINE_TIMESTAMP_RANGE_EVENT(name) \ +DEFINE_EVENT(xfs_timestamp_range_class, name, \ + TP_PROTO(struct xfs_mount *mp, long long 
min, long long max), \ + TP_ARGS(mp, min, max)) +DEFINE_TIMESTAMP_RANGE_EVENT(xfs_inode_timestamp_range); +DEFINE_TIMESTAMP_RANGE_EVENT(xfs_quota_expiry_range); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index ed72867b1a19..c94e71f741b6 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -468,7 +468,7 @@ xfs_trans_apply_sb_deltas( xfs_buf_t *bp; int whole = 0; - bp = xfs_trans_getsb(tp, tp->t_mountp); + bp = xfs_trans_getsb(tp); sbp = bp->b_addr; /* @@ -959,7 +959,7 @@ xfs_trans_cancel( struct xfs_log_item *lip; list_for_each_entry(lip, &tp->t_items, li_trans) - ASSERT(!(lip->li_type == XFS_LI_EFD)); + ASSERT(!xlog_item_is_intent_done(lip)); } #endif xfs_trans_unreserve_and_mod_sb(tp); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index b752501818d2..084658946cc8 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -55,14 +55,12 @@ struct xfs_log_item { #define XFS_LI_ABORTED 1 #define XFS_LI_FAILED 2 #define XFS_LI_DIRTY 3 /* log item dirty in transaction */ -#define XFS_LI_RECOVERED 4 /* log intent item has been recovered */ #define XFS_LI_FLAGS \ { (1 << XFS_LI_IN_AIL), "IN_AIL" }, \ { (1 << XFS_LI_ABORTED), "ABORTED" }, \ { (1 << XFS_LI_FAILED), "FAILED" }, \ - { (1 << XFS_LI_DIRTY), "DIRTY" }, \ - { (1 << XFS_LI_RECOVERED), "RECOVERED" } + { (1 << XFS_LI_DIRTY), "DIRTY" } struct xfs_item_ops { unsigned flags; @@ -74,10 +72,29 @@ struct xfs_item_ops { void (*iop_committing)(struct xfs_log_item *, xfs_lsn_t commit_lsn); void (*iop_release)(struct xfs_log_item *); xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t); - int (*iop_recover)(struct xfs_log_item *lip, struct xfs_trans *tp); + int (*iop_recover)(struct xfs_log_item *lip, + struct list_head *capture_list); bool (*iop_match)(struct xfs_log_item *item, uint64_t id); + struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent, + struct xfs_trans *tp); }; +/* Is this log item a deferred action intent? */ +static inline bool +xlog_item_is_intent(struct xfs_log_item *lip) +{ + return lip->li_ops->iop_recover != NULL && + lip->li_ops->iop_match != NULL; +} + +/* Is this a log intent-done item? */ +static inline bool +xlog_item_is_intent_done(struct xfs_log_item *lip) +{ + return lip->li_ops->iop_unpin == NULL && + lip->li_ops->iop_push == NULL; +} + /* * Release the log item as soon as committed. This is for items just logging * intents that never need to be written back in place. @@ -209,7 +226,7 @@ xfs_trans_read_buf( flags, bpp, ops); } -struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *); +struct xfs_buf *xfs_trans_getsb(struct xfs_trans *); void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *); void xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *); @@ -243,4 +260,12 @@ void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, extern kmem_zone_t *xfs_trans_zone; +static inline struct xfs_log_item * +xfs_trans_item_relog( + struct xfs_log_item *lip, + struct xfs_trans *tp) +{ + return lip->li_ops->iop_relog(lip, tp); +} + #endif /* __XFS_TRANS_H__ */ diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 11cd666cd99a..42d63b830cb9 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -166,50 +166,34 @@ xfs_trans_get_buf_map( } /* - * Get and lock the superblock buffer of this file system for the - * given transaction. - * - * We don't need to use incore_match() here, because the superblock - * buffer is a private buffer which we keep a pointer to in the - * mount structure. 
+ * Get and lock the superblock buffer for the given transaction. */ -xfs_buf_t * +struct xfs_buf * xfs_trans_getsb( - xfs_trans_t *tp, - struct xfs_mount *mp) + struct xfs_trans *tp) { - xfs_buf_t *bp; - struct xfs_buf_log_item *bip; + struct xfs_buf *bp = tp->t_mountp->m_sb_bp; /* - * Default to just trying to lock the superblock buffer - * if tp is NULL. + * Just increment the lock recursion count if the buffer is already + * attached to this transaction. */ - if (tp == NULL) - return xfs_getsb(mp); - - /* - * If the superblock buffer already has this transaction - * pointer in its b_fsprivate2 field, then we know we already - * have it locked. In this case we just increment the lock - * recursion count and return the buffer to the caller. - */ - bp = mp->m_sb_bp; if (bp->b_transp == tp) { - bip = bp->b_log_item; + struct xfs_buf_log_item *bip = bp->b_log_item; + ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_recur++; + trace_xfs_trans_getsb_recur(bip); - return bp; - } + } else { + xfs_buf_lock(bp); + xfs_buf_hold(bp); + _xfs_trans_bjoin(tp, bp, 1); - bp = xfs_getsb(mp); - if (bp == NULL) - return NULL; + trace_xfs_trans_getsb(bp->b_log_item); + } - _xfs_trans_bjoin(tp, bp, 1); - trace_xfs_trans_getsb(bp->b_log_item); return bp; } diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index c6ba7ef18e06..fe45b0c3970c 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -55,6 +55,12 @@ xfs_trans_log_dquot( { ASSERT(XFS_DQ_IS_LOCKED(dqp)); + /* Upgrade the dquot to bigtime format if possible. */ + if (dqp->q_id != 0 && + xfs_sb_version_hasbigtime(&tp->t_mountp->m_sb) && + !(dqp->q_type & XFS_DQTYPE_BIGTIME)) + dqp->q_type |= XFS_DQTYPE_BIGTIME; + tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &dqp->q_logitem.qli_item.li_flags); } @@ -215,36 +221,27 @@ xfs_trans_mod_dquot( } switch (field) { - - /* - * regular disk blk reservation - */ - case XFS_TRANS_DQ_RES_BLKS: + /* regular disk blk reservation */ + case XFS_TRANS_DQ_RES_BLKS: qtrx->qt_blk_res += delta; break; - /* - * inode reservation - */ - case XFS_TRANS_DQ_RES_INOS: + /* inode reservation */ + case XFS_TRANS_DQ_RES_INOS: qtrx->qt_ino_res += delta; break; - /* - * disk blocks used. - */ - case XFS_TRANS_DQ_BCOUNT: + /* disk blocks used. 
*/ + case XFS_TRANS_DQ_BCOUNT: qtrx->qt_bcount_delta += delta; break; - case XFS_TRANS_DQ_DELBCOUNT: + case XFS_TRANS_DQ_DELBCOUNT: qtrx->qt_delbcnt_delta += delta; break; - /* - * Inode Count - */ - case XFS_TRANS_DQ_ICOUNT: + /* Inode Count */ + case XFS_TRANS_DQ_ICOUNT: if (qtrx->qt_ino_res && delta > 0) { qtrx->qt_ino_res_used += delta; ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used); @@ -252,17 +249,13 @@ xfs_trans_mod_dquot( qtrx->qt_icount_delta += delta; break; - /* - * rtblk reservation - */ - case XFS_TRANS_DQ_RES_RTBLKS: + /* rtblk reservation */ + case XFS_TRANS_DQ_RES_RTBLKS: qtrx->qt_rtblk_res += delta; break; - /* - * rtblk count - */ - case XFS_TRANS_DQ_RTBCOUNT: + /* rtblk count */ + case XFS_TRANS_DQ_RTBCOUNT: if (qtrx->qt_rtblk_res && delta > 0) { qtrx->qt_rtblk_res_used += delta; ASSERT(qtrx->qt_rtblk_res >= qtrx->qt_rtblk_res_used); @@ -270,11 +263,11 @@ xfs_trans_mod_dquot( qtrx->qt_rtbcount_delta += delta; break; - case XFS_TRANS_DQ_DELRTBCOUNT: + case XFS_TRANS_DQ_DELRTBCOUNT: qtrx->qt_delrtb_delta += delta; break; - default: + default: ASSERT(0); } diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 8ec7c8f109d7..64cc2a9c38c8 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -24,6 +24,39 @@ #include "zonefs.h" +static inline int zonefs_zone_mgmt(struct inode *inode, + enum req_opf op) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + int ret; + + lockdep_assert_held(&zi->i_truncate_mutex); + + ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector, + zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS); + if (ret) { + zonefs_err(inode->i_sb, + "Zone management operation %s at %llu failed %d\n", + blk_op_str(op), zi->i_zsector, ret); + return ret; + } + + return 0; +} + +static inline void zonefs_i_size_write(struct inode *inode, loff_t isize) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + + i_size_write(inode, isize); + /* + * A full zone is no longer open/active and does not need + * explicit closing. + */ + if (isize >= zi->i_max_size) + zi->i_flags &= ~ZONEFS_ZONE_OPEN; +} + static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) @@ -302,6 +335,17 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, } /* + * If the filesystem is mounted with the explicit-open mount option, we + * need to clear the ZONEFS_ZONE_OPEN flag if the zone transitioned to + * the read-only or offline condition, to avoid attempting an explicit + * close of the zone when the inode file is closed. + */ + if ((sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) && + (zone->cond == BLK_ZONE_COND_OFFLINE || + zone->cond == BLK_ZONE_COND_READONLY)) + zi->i_flags &= ~ZONEFS_ZONE_OPEN; + + /* * If error=remount-ro was specified, any error result in remounting * the volume as read-only. */ @@ -315,7 +359,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, * invalid data. */ zonefs_update_stats(inode, data_size); - i_size_write(inode, data_size); + zonefs_i_size_write(inode, data_size); zi->i_wpoffset = data_size; return 0; @@ -328,7 +372,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, * eventually correct the file size and zonefs inode write pointer offset * (which can be out of sync with the drive due to partial write failures). 
*/ -static void zonefs_io_error(struct inode *inode, bool write) +static void __zonefs_io_error(struct inode *inode, bool write) { struct zonefs_inode_info *zi = ZONEFS_I(inode); struct super_block *sb = inode->i_sb; @@ -342,8 +386,6 @@ static void zonefs_io_error(struct inode *inode, bool write) }; int ret; - mutex_lock(&zi->i_truncate_mutex); - /* * Memory allocations in blkdev_report_zones() can trigger a memory * reclaim which may in turn cause a recursion into zonefs as well as @@ -359,7 +401,14 @@ static void zonefs_io_error(struct inode *inode, bool write) zonefs_err(sb, "Get inode %lu zone information failed %d\n", inode->i_ino, ret); memalloc_noio_restore(noio_flag); +} +static void zonefs_io_error(struct inode *inode, bool write) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + + mutex_lock(&zi->i_truncate_mutex); + __zonefs_io_error(inode, write); mutex_unlock(&zi->i_truncate_mutex); } @@ -397,13 +446,27 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize) if (isize == old_isize) goto unlock; - ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector, - zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS); - if (ret) { - zonefs_err(inode->i_sb, - "Zone management operation at %llu failed %d", - zi->i_zsector, ret); + ret = zonefs_zone_mgmt(inode, op); + if (ret) goto unlock; + + /* + * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set, + * take care of open zones. + */ + if (zi->i_flags & ZONEFS_ZONE_OPEN) { + /* + * Truncating a zone to EMPTY or FULL is the equivalent of + * closing the zone. For a truncation to 0, we need to + * re-open the zone to ensure new writes can be processed. + * For a truncation to the maximum file size, the zone is + * closed and writes cannot be accepted anymore, so clear + * the open flag. 
+ */ + if (!isize) + ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); + else + zi->i_flags &= ~ZONEFS_ZONE_OPEN; } zonefs_update_stats(inode, isize); @@ -584,7 +647,7 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, mutex_lock(&zi->i_truncate_mutex); if (i_size_read(inode) < iocb->ki_pos + size) { zonefs_update_stats(inode, iocb->ki_pos + size); - i_size_write(inode, iocb->ki_pos + size); + zonefs_i_size_write(inode, iocb->ki_pos + size); } mutex_unlock(&zi->i_truncate_mutex); } @@ -865,8 +928,128 @@ inode_unlock: return ret; } +static inline bool zonefs_file_use_exp_open(struct inode *inode, struct file *file) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); + + if (!(sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN)) + return false; + + if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) + return false; + + if (!(file->f_mode & FMODE_WRITE)) + return false; + + return true; +} + +static int zonefs_open_zone(struct inode *inode) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); + int ret = 0; + + mutex_lock(&zi->i_truncate_mutex); + + zi->i_wr_refcnt++; + if (zi->i_wr_refcnt == 1) { + + if (atomic_inc_return(&sbi->s_open_zones) > sbi->s_max_open_zones) { + atomic_dec(&sbi->s_open_zones); + ret = -EBUSY; + goto unlock; + } + + if (i_size_read(inode) < zi->i_max_size) { + ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); + if (ret) { + zi->i_wr_refcnt--; + atomic_dec(&sbi->s_open_zones); + goto unlock; + } + zi->i_flags |= ZONEFS_ZONE_OPEN; + } + } + +unlock: + mutex_unlock(&zi->i_truncate_mutex); + + return ret; +} + +static int zonefs_file_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = generic_file_open(inode, file); + if (ret) + return ret; + + if (zonefs_file_use_exp_open(inode, file)) + return zonefs_open_zone(inode); + + return 0; +} + +static void zonefs_close_zone(struct inode *inode) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + int ret = 0; + + mutex_lock(&zi->i_truncate_mutex); + zi->i_wr_refcnt--; + if (!zi->i_wr_refcnt) { + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); + struct super_block *sb = inode->i_sb; + + /* + * If the file zone is full, it is not open anymore and we only + * need to decrement the open count. + */ + if (!(zi->i_flags & ZONEFS_ZONE_OPEN)) + goto dec; + + ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); + if (ret) { + __zonefs_io_error(inode, false); + /* + * Leaving zones explicitly open may lead to a state + * where most zones cannot be written (zone resources + * exhausted). So take preventive action by remounting + * read-only. + */ + if (zi->i_flags & ZONEFS_ZONE_OPEN && + !(sb->s_flags & SB_RDONLY)) { + zonefs_warn(sb, "closing zone failed, remounting filesystem read-only\n"); + sb->s_flags |= SB_RDONLY; + } + } + zi->i_flags &= ~ZONEFS_ZONE_OPEN; +dec: + atomic_dec(&sbi->s_open_zones); + } + mutex_unlock(&zi->i_truncate_mutex); +} + +static int zonefs_file_release(struct inode *inode, struct file *file) +{ + /* + * If we explicitly open a zone we must close it again as well, but the + * zone management operation can fail (either due to an IO error or as + * the zone has gone offline or read-only). Make sure we don't fail the + * close(2) for user-space. 
+ */ + if (zonefs_file_use_exp_open(inode, file)) + zonefs_close_zone(inode); + + return 0; +} + static const struct file_operations zonefs_file_operations = { - .open = generic_file_open, + .open = zonefs_file_open, + .release = zonefs_file_release, .fsync = zonefs_file_fsync, .mmap = zonefs_file_mmap, .llseek = zonefs_file_llseek, @@ -890,6 +1073,7 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb) inode_init_once(&zi->i_vnode); mutex_init(&zi->i_truncate_mutex); init_rwsem(&zi->i_mmap_sem); + zi->i_wr_refcnt = 0; return &zi->i_vnode; } @@ -940,7 +1124,7 @@ static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf) enum { Opt_errors_ro, Opt_errors_zro, Opt_errors_zol, Opt_errors_repair, - Opt_err, + Opt_explicit_open, Opt_err, }; static const match_table_t tokens = { @@ -948,6 +1132,7 @@ static const match_table_t tokens = { { Opt_errors_zro, "errors=zone-ro"}, { Opt_errors_zol, "errors=zone-offline"}, { Opt_errors_repair, "errors=repair"}, + { Opt_explicit_open, "explicit-open" }, { Opt_err, NULL} }; @@ -984,6 +1169,9 @@ static int zonefs_parse_options(struct super_block *sb, char *options) sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK; sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_REPAIR; break; + case Opt_explicit_open: + sbi->s_mount_opts |= ZONEFS_MNTOPT_EXPLICIT_OPEN; + break; default: return -EINVAL; } @@ -1403,6 +1591,13 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) sbi->s_gid = GLOBAL_ROOT_GID; sbi->s_perm = 0640; sbi->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO; + sbi->s_max_open_zones = bdev_max_open_zones(sb->s_bdev); + atomic_set(&sbi->s_open_zones, 0); + if (!sbi->s_max_open_zones && + sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { + zonefs_info(sb, "No open zones limit. Ignoring explicit_open mount option\n"); + sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN; + } ret = zonefs_read_super(sb); if (ret) diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h index 55b39970acb2..51141907097c 100644 --- a/fs/zonefs/zonefs.h +++ b/fs/zonefs/zonefs.h @@ -38,6 +38,8 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone) return ZONEFS_ZTYPE_SEQ; } +#define ZONEFS_ZONE_OPEN (1 << 0) + /* * In-memory inode data. */ @@ -74,6 +76,10 @@ struct zonefs_inode_info { */ struct mutex i_truncate_mutex; struct rw_semaphore i_mmap_sem; + + /* guarded by i_truncate_mutex */ + unsigned int i_wr_refcnt; + unsigned int i_flags; }; static inline struct zonefs_inode_info *ZONEFS_I(struct inode *inode) @@ -154,6 +160,7 @@ enum zonefs_features { #define ZONEFS_MNTOPT_ERRORS_MASK \ (ZONEFS_MNTOPT_ERRORS_RO | ZONEFS_MNTOPT_ERRORS_ZRO | \ ZONEFS_MNTOPT_ERRORS_ZOL | ZONEFS_MNTOPT_ERRORS_REPAIR) +#define ZONEFS_MNTOPT_EXPLICIT_OPEN (1 << 4) /* Explicit open/close of zones on open/close */ /* * In-memory Super block information. @@ -175,6 +182,9 @@ struct zonefs_sb_info { loff_t s_blocks; loff_t s_used_blocks; + + unsigned int s_max_open_zones; + atomic_t s_open_zones; }; static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb) |
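As a usage illustration (not part of the diff above): the zonefs explicit-open changes are driven entirely by open()/close() on sequential zone files once the filesystem is mounted with the new "explicit-open" option. Below is a minimal user-space sketch under assumed conditions: a zonefs mount at /mnt/zonefs, a first sequential zone file named seq/0, and a 4 KiB logical block size are all hypothetical placeholders, and the REQ_OP_ZONE_OPEN/CLOSE behaviour described in the comments is only what the hunks above implement when ZONEFS_MNTOPT_EXPLICIT_OPEN is set.

#define _GNU_SOURCE		/* for O_DIRECT */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

#define BLKSZ 4096		/* assumed logical block size; query the device in real code */

int main(void)
{
	const char *path = "/mnt/zonefs/seq/0";	/* hypothetical zone file path */
	struct stat st;
	void *buf = NULL;
	int fd, ret = EXIT_FAILURE;

	/*
	 * With "-o explicit-open", this open() for writing triggers an explicit
	 * zone open (REQ_OP_ZONE_OPEN); EBUSY means the device's open-zone
	 * limit (s_max_open_zones) was reached.
	 */
	fd = open(path, O_WRONLY | O_DIRECT);
	if (fd < 0) {
		perror("open");
		return ret;
	}

	/* The file size of a sequential zone file is the zone write pointer. */
	if (fstat(fd, &st) < 0) {
		perror("fstat");
		goto out_close;
	}

	/* Sequential zone files are written with direct I/O: align the buffer. */
	if (posix_memalign(&buf, BLKSZ, BLKSZ)) {
		fprintf(stderr, "posix_memalign failed\n");
		goto out_close;
	}
	memset(buf, 0xab, BLKSZ);

	/* Sequential zones only accept writes at the write pointer. */
	if (pwrite(fd, buf, BLKSZ, st.st_size) != (ssize_t)BLKSZ)
		perror("pwrite");
	else
		ret = EXIT_SUCCESS;

	free(buf);
out_close:
	/* Last writer: close() issues an explicit zone close and frees the slot. */
	close(fd);
	return ret;
}

The point of the s_open_zones accounting added in zonefs_open_zone()/zonefs_close_zone() is visible here: a writer beyond the device's max-open-zones limit fails fast at open() with EBUSY instead of hitting write errors later once zone resources are exhausted.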