Diffstat (limited to 'fs')
505 files changed, 9820 insertions, 34948 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 61dbe52bb3a3..281a1ed03a04 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -637,7 +637,7 @@ static int v9fs_init_inode_cache(void)
 	v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache",
 					  sizeof(struct v9fs_inode),
 					  0, (SLAB_RECLAIM_ACCOUNT|
-					      SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+					      SLAB_ACCOUNT),
 					  v9fs_inode_init_once);
 	if (!v9fs_inode_cache)
 		return -ENOMEM;
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index bae330c2f0cf..abdbbaee5184 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -107,7 +107,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
 	p9_debug(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
 
-	if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
+	if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->c.flc_type != F_UNLCK) {
 		filemap_write_and_wait(inode->i_mapping);
 		invalidate_mapping_pages(&inode->i_data, 0, -1);
 	}
@@ -121,13 +121,12 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
 	struct p9_fid *fid;
 	uint8_t status = P9_LOCK_ERROR;
 	int res = 0;
-	unsigned char fl_type;
 	struct v9fs_session_info *v9ses;
 
 	fid = filp->private_data;
 	BUG_ON(fid == NULL);
 
-	BUG_ON((fl->fl_flags & FL_POSIX) != FL_POSIX);
+	BUG_ON((fl->c.flc_flags & FL_POSIX) != FL_POSIX);
 
 	res = locks_lock_file_wait(filp, fl);
 	if (res < 0)
@@ -136,7 +135,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
 	/* convert posix lock to p9 tlock args */
 	memset(&flock, 0, sizeof(flock));
 	/* map the lock type */
-	switch (fl->fl_type) {
+	switch (fl->c.flc_type) {
 	case F_RDLCK:
 		flock.type = P9_LOCK_TYPE_RDLCK;
 		break;
@@ -152,7 +151,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
 		flock.length = 0;
 	else
 		flock.length = fl->fl_end - fl->fl_start + 1;
-	flock.proc_id = fl->fl_pid;
+	flock.proc_id = fl->c.flc_pid;
 	flock.client_id = fid->clnt->name;
 	if (IS_SETLKW(cmd))
 		flock.flags = P9_LOCK_FLAGS_BLOCK;
@@ -207,12 +206,13 @@ out_unlock:
 	 * incase server returned error for lock request, revert
 	 * it locally
 	 */
-	if (res < 0 && fl->fl_type != F_UNLCK) {
-		fl_type = fl->fl_type;
-		fl->fl_type = F_UNLCK;
+	if (res < 0 && fl->c.flc_type != F_UNLCK) {
+		unsigned char type = fl->c.flc_type;
+
+		fl->c.flc_type = F_UNLCK;
 		/* Even if this fails we want to return the remote error */
 		locks_lock_file_wait(filp, fl);
-		fl->fl_type = fl_type;
+		fl->c.flc_type = type;
 	}
 	if (flock.client_id != fid->clnt->name)
 		kfree(flock.client_id);
@@ -234,7 +234,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
 	 * if we have a conflicting lock locally, no need to validate
 	 * with server
 	 */
-	if (fl->fl_type != F_UNLCK)
+	if (fl->c.flc_type != F_UNLCK)
 		return res;
 
 	/* convert posix lock to p9 tgetlock args */
@@ -245,7 +245,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
 		glock.length = 0;
 	else
 		glock.length = fl->fl_end - fl->fl_start + 1;
-	glock.proc_id = fl->fl_pid;
+	glock.proc_id = fl->c.flc_pid;
 	glock.client_id = fid->clnt->name;
 
 	res = p9_client_getlock_dotl(fid, &glock);
@@ -254,13 +254,13 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
 	/* map 9p lock type to os lock type */
 	switch (glock.type) {
 	case P9_LOCK_TYPE_RDLCK:
-		fl->fl_type = F_RDLCK;
+		fl->c.flc_type = F_RDLCK;
 		break;
 	case P9_LOCK_TYPE_WRLCK:
-		fl->fl_type = F_WRLCK;
+		fl->c.flc_type = F_WRLCK;
 		break;
 	case P9_LOCK_TYPE_UNLCK:
-		fl->fl_type = F_UNLCK;
+		fl->c.flc_type = F_UNLCK;
 		break;
 	}
 	if (glock.type != P9_LOCK_TYPE_UNLCK) {
@@ -269,7 +269,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
 			fl->fl_end = OFFSET_MAX;
 		else
 			fl->fl_end = glock.start + glock.length - 1;
-		fl->fl_pid = -glock.proc_id;
+		fl->c.flc_pid = -glock.proc_id;
 	}
 out:
 	if (glock.client_id != fid->clnt->name)
@@ -293,7 +293,7 @@ static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl)
 	p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n",
 		 filp, cmd, fl, filp);
 
-	if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
+	if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->c.flc_type != F_UNLCK) {
 		filemap_write_and_wait(inode->i_mapping);
 		invalidate_mapping_pages(&inode->i_data, 0, -1);
 	}
@@ -324,16 +324,16 @@ static int v9fs_file_flock_dotl(struct file *filp, int cmd,
 	p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n",
 		 filp, cmd, fl, filp);
 
-	if (!(fl->fl_flags & FL_FLOCK))
+	if (!(fl->c.flc_flags & FL_FLOCK))
 		goto out_err;
 
-	if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
+	if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->c.flc_type != F_UNLCK) {
 		filemap_write_and_wait(inode->i_mapping);
 		invalidate_mapping_pages(&inode->i_data, 0, -1);
 	}
 
 	/* Convert flock to posix lock */
-	fl->fl_flags |= FL_POSIX;
-	fl->fl_flags ^= FL_FLOCK;
+	fl->c.flc_flags |= FL_POSIX;
+	fl->c.flc_flags ^= FL_FLOCK;
 
 	if (IS_SETLK(cmd) | IS_SETLKW(cmd))
 		ret = v9fs_file_do_lock(filp, cmd, fl);
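[Editorial note] The fl_type/fl_flags/fl_pid churn above is the file_lock core split: the fields shared by POSIX locks, flocks and leases move into an embedded core struct reached through the new c member. A minimal userspace sketch of the layout change; the struct names here are illustrative stand-ins, not the kernel definitions:

#include <fcntl.h>   /* F_RDLCK, F_UNLCK */
#include <stdio.h>

/* Hypothetical stand-ins for struct file_lock_core / struct file_lock. */
struct file_lock_core_demo {
	unsigned char flc_type;   /* was file_lock.fl_type */
	unsigned int  flc_flags;  /* was file_lock.fl_flags */
	int           flc_pid;    /* was file_lock.fl_pid */
};

struct file_lock_demo {
	struct file_lock_core_demo c; /* generic part, shared with leases */
	long long fl_start, fl_end;   /* POSIX-range fields stay put */
};

int main(void)
{
	struct file_lock_demo fl = { .c = { .flc_type = F_RDLCK } };

	/* Call sites now reach the generic fields through the embedded core. */
	if (fl.c.flc_type != F_UNLCK)
		printf("lock type %d held\n", fl.c.flc_type);
	return 0;
}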
diff --git a/fs/Kconfig b/fs/Kconfig
index 89fdbefd1075..4bc7dd420874 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -162,7 +162,6 @@ menu "DOS/FAT/EXFAT/NT Filesystems"
 
 source "fs/fat/Kconfig"
 source "fs/exfat/Kconfig"
-source "fs/ntfs/Kconfig"
 source "fs/ntfs3/Kconfig"
 
 endmenu
@@ -174,6 +173,13 @@ source "fs/proc/Kconfig"
 source "fs/kernfs/Kconfig"
 source "fs/sysfs/Kconfig"
 
+config FS_PID
+	bool "Pseudo filesystem for process file descriptors"
+	depends on 64BIT
+	default y
+	help
+	  Pidfs implements advanced features for process file descriptors.
+
 config TMPFS
 	bool "Tmpfs virtual memory file system support (former shm fs)"
 	depends on SHMEM
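[Editorial note] With pidfs, a pidfd is backed by a small pseudo-filesystem rather than the anon-inode machinery. A hedged userspace probe of this, assuming a kernel built with FS_PID; the exact f_type magic reported is kernel-dependent:

#define _GNU_SOURCE
#include <sys/syscall.h>
#include <sys/statfs.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	/* pidfd_open(2) may lack a libc wrapper; go through syscall(). */
	int pidfd = syscall(SYS_pidfd_open, getpid(), 0);
	struct statfs sfs;

	if (pidfd < 0 || fstatfs(pidfd, &sfs) < 0) {
		perror("pidfd");
		return 1;
	}
	/* On a pidfs kernel this is pidfs's magic, not anon_inodefs's. */
	printf("pidfd fs magic: 0x%lx\n", (unsigned long)sfs.f_type);
	return 0;
}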
diff --git a/fs/Makefile b/fs/Makefile
index c09016257f05..6ecc9b0a53f2 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -15,7 +15,7 @@ obj-y :=	open.o read_write.o file_table.o super.o \
 		pnode.o splice.o sync.o utimes.o d_path.o \
 		stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
 		fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
-		kernel_read_file.o mnt_idmapping.o remap_range.o
+		kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o
 
 obj-$(CONFIG_BUFFER_HEAD)	+= buffer.o mpage.o
 obj-$(CONFIG_PROC_FS)		+= proc_namespace.o
@@ -91,7 +91,6 @@ obj-y				+= unicode/
 obj-$(CONFIG_SYSV_FS)		+= sysv/
 obj-$(CONFIG_SMBFS)		+= smb/
 obj-$(CONFIG_HPFS_FS)		+= hpfs/
-obj-$(CONFIG_NTFS_FS)		+= ntfs/
 obj-$(CONFIG_NTFS3_FS)		+= ntfs3/
 obj-$(CONFIG_UFS_FS)		+= ufs/
 obj-$(CONFIG_EFS_FS)		+= efs/
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index e8bfc38239cd..9354b14bbfe3 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -249,7 +249,7 @@ static int __init init_inodecache(void)
 	adfs_inode_cachep = kmem_cache_create("adfs_inode_cache",
 					     sizeof(struct adfs_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+						SLAB_ACCOUNT),
 					     init_once);
 	if (adfs_inode_cachep == NULL)
 		return -ENOMEM;
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 60685ec76d98..2e612834329a 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -105,6 +105,7 @@ struct affs_sb_info {
 	int work_queued;		/* non-zero delayed work is queued */
 	struct delayed_work sb_work;	/* superblock flush delayed work */
 	spinlock_t work_lock;		/* protects sb_work and work_queued */
+	struct rcu_head rcu;
 };
 
 #define AFFS_MOUNT_SF_INTL	0x0001 /* International filesystem. */
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 58b391446ae1..3c5821339609 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -130,8 +130,7 @@ static int __init init_inodecache(void)
 {
 	affs_inode_cachep = kmem_cache_create("affs_inode_cache",
 					     sizeof(struct affs_inode_info),
-					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+					     0, (SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT),
 					     init_once);
 	if (affs_inode_cachep == NULL)
 		return -ENOMEM;
@@ -640,7 +639,7 @@ static void affs_kill_sb(struct super_block *sb)
 		affs_brelse(sbi->s_root_bh);
 		kfree(sbi->s_prefix);
 		mutex_destroy(&sbi->s_bmlock);
-		kfree(sbi);
+		kfree_rcu(sbi, rcu);
 	}
 }
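[Editorial note] The affs change adds an rcu_head to the sb_info and swaps kfree() for kfree_rcu(), so lockless readers racing with unmount can never touch freed memory. A hedged userspace analog of the same shape, assuming liburcu is available (link with -lurcu); names here are mine:

/* Userspace analog using liburcu's call_rcu(). */
#include <urcu.h>
#include <stdlib.h>

struct sb_info_demo {
	int mounted;
	struct rcu_head rcu;    /* storage for the deferred free */
};

static void free_sb_info(struct rcu_head *head)
{
	free(caa_container_of(head, struct sb_info_demo, rcu));
}

static void kill_sb_demo(struct sb_info_demo *sbi)
{
	/* RCU readers may still hold a pointer to sbi; defer the free
	 * until all current read-side critical sections have ended. */
	call_rcu(&sbi->rcu, free_sb_info);
}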
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index c14533ef108f..8a67fc427e74 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -124,7 +124,7 @@ static void afs_dir_read_cleanup(struct afs_read *req)
 		if (xas_retry(&xas, folio))
 			continue;
 		BUG_ON(xa_is_value(folio));
-		ASSERTCMP(folio_file_mapping(folio), ==, mapping);
+		ASSERTCMP(folio->mapping, ==, mapping);
 
 		folio_put(folio);
 	}
@@ -202,12 +202,12 @@ static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req)
 		if (xas_retry(&xas, folio))
 			continue;
 
-		BUG_ON(folio_file_mapping(folio) != mapping);
+		BUG_ON(folio->mapping != mapping);
 
 		size = min_t(loff_t, folio_size(folio),
 			     req->actual_len - folio_pos(folio));
 		for (offset = 0; offset < size; offset += sizeof(*block)) {
 			block = kmap_local_folio(folio, offset);
-			pr_warn("[%02lx] %32phN\n", folio_index(folio) + offset, block);
+			pr_warn("[%02lx] %32phN\n", folio->index + offset, block);
 			kunmap_local(block);
 		}
 	}
@@ -233,7 +233,7 @@ static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req)
 		if (xas_retry(&xas, folio))
 			continue;
 
-		BUG_ON(folio_file_mapping(folio) != mapping);
+		BUG_ON(folio->mapping != mapping);
 
 		if (!afs_dir_check_folio(dvnode, folio, req->actual_len)) {
 			afs_dir_dump(dvnode, req);
@@ -474,6 +474,16 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode,
 			continue;
 		}
 
+		/* Don't expose silly rename entries to userspace. */
+		if (nlen > 6 &&
+		    dire->u.name[0] == '.' &&
+		    ctx->actor != afs_lookup_filldir &&
+		    ctx->actor != afs_lookup_one_filldir &&
+		    memcmp(dire->u.name, ".__afs", 6) == 0) {
+			ctx->pos = blkoff + next * sizeof(union afs_xdr_dirent);
+			continue;
+		}
+
 		/* found the next entry */
 		if (!dir_emit(ctx, dire->u.name, nlen,
 			      ntohl(dire->u.vnode),
@@ -708,6 +718,8 @@ static void afs_do_lookup_success(struct afs_operation *op)
 			break;
 		}
 
+		if (vp->scb.status.abort_code)
+			trace_afs_bulkstat_error(op, &vp->fid, i, vp->scb.status.abort_code);
 		if (!vp->scb.have_status && !vp->scb.have_error)
 			continue;
@@ -897,12 +909,16 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
 		afs_begin_vnode_operation(op);
 		afs_wait_for_operation(op);
 	}
-	inode = ERR_PTR(afs_op_error(op));
 
 out_op:
 	if (!afs_op_error(op)) {
-		inode = &op->file[1].vnode->netfs.inode;
-		op->file[1].vnode = NULL;
+		if (op->file[1].scb.status.abort_code) {
+			afs_op_accumulate_error(op, -ECONNABORTED,
+						op->file[1].scb.status.abort_code);
+		} else {
+			inode = &op->file[1].vnode->netfs.inode;
+			op->file[1].vnode = NULL;
+		}
 	}
 
 	if (op->file[0].scb.have_status)
@@ -2022,7 +2038,7 @@ static bool afs_dir_release_folio(struct folio *folio, gfp_t gfp_flags)
 {
 	struct afs_vnode *dvnode = AFS_FS_I(folio_inode(folio));
 
-	_enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, folio_index(folio));
+	_enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, folio->index);
 
 	folio_detach_private(folio);
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index d3bc4a2d7085..c4d2711e20ad 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -258,16 +258,7 @@ const struct inode_operations afs_dynroot_inode_operations = {
 	.lookup		= afs_dynroot_lookup,
 };
 
-/*
- * Dirs in the dynamic root don't need revalidation.
- */
-static int afs_dynroot_d_revalidate(struct dentry *dentry, unsigned int flags)
-{
-	return 1;
-}
-
 const struct dentry_operations afs_dynroot_dentry_operations = {
-	.d_revalidate	= afs_dynroot_d_revalidate,
 	.d_delete	= always_delete_dentry,
 	.d_release	= afs_d_release,
 	.d_automount	= afs_d_automount,
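[Editorial note] The readdir hunk above hides AFS silly-rename placeholders (".__afsXXXX") from ordinary directory listings while still letting the lookup filldirs see them. The filter itself reduces to a tiny prefix predicate, sketched standalone here:

#include <stdbool.h>
#include <string.h>
#include <stdio.h>

/* Same test as the diff: a name longer than 6 bytes starting ".__afs"
 * is an AFS silly-rename placeholder and should be skipped. */
static bool is_afs_silly_rename(const char *name, size_t nlen)
{
	return nlen > 6 && name[0] == '.' && memcmp(name, ".__afs", 6) == 0;
}

int main(void)
{
	printf("%d %d\n", is_afs_silly_rename(".__afs1234", 10),
	       is_afs_silly_rename(".profile", 8));   /* prints: 1 0 */
	return 0;
}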
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 3d33b221d9ca..ef2cc8f565d2 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -417,13 +417,17 @@ static void afs_add_open_mmap(struct afs_vnode *vnode)
 
 static void afs_drop_open_mmap(struct afs_vnode *vnode)
 {
-	if (!atomic_dec_and_test(&vnode->cb_nr_mmap))
+	if (atomic_add_unless(&vnode->cb_nr_mmap, -1, 1))
 		return;
 
 	down_write(&vnode->volume->open_mmaps_lock);
 
-	if (atomic_read(&vnode->cb_nr_mmap) == 0)
+	read_seqlock_excl(&vnode->cb_lock);
+	// the only place where ->cb_nr_mmap may hit 0
+	// see __afs_break_callback() for the other side...
+	if (atomic_dec_and_test(&vnode->cb_nr_mmap))
 		list_del_init(&vnode->cb_mmap_link);
+	read_sequnlock_excl(&vnode->cb_lock);
 
 	up_write(&vnode->volume->open_mmaps_lock);
 	flush_work(&vnode->cb_work);
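[Editorial note] The afs_drop_open_mmap() fix relies on atomic_add_unless(&v, -1, 1): decrement only if the counter isn't 1, so the final decrement to zero always happens with the locks held. A userspace analog of that primitive using a C11 CAS loop:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Analog of kernel atomic_add_unless(v, -1, 1): decrement v unless it is
 * currently 1; returns true if the decrement happened. */
static bool dec_unless_last(atomic_int *v)
{
	int old = atomic_load(v);

	while (old != 1) {
		if (atomic_compare_exchange_weak(v, &old, old - 1))
			return true;
	}
	return false;   /* last reference: caller drops it under the lock */
}

int main(void)
{
	atomic_int refs = 2;

	printf("%d\n", dec_unless_last(&refs)); /* 1: fast path, 2 -> 1 */
	printf("%d\n", dec_unless_last(&refs)); /* 0: slow, locked path */
	return 0;
}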
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 9c6dea3139f5..f0e96a35093f 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -93,13 +93,13 @@ static void afs_grant_locks(struct afs_vnode *vnode)
 	bool exclusive = (vnode->lock_type == AFS_LOCK_WRITE);
 
 	list_for_each_entry_safe(p, _p, &vnode->pending_locks, fl_u.afs.link) {
-		if (!exclusive && p->fl_type == F_WRLCK)
+		if (!exclusive && lock_is_write(p))
 			continue;
 
 		list_move_tail(&p->fl_u.afs.link, &vnode->granted_locks);
 		p->fl_u.afs.state = AFS_LOCK_GRANTED;
 		trace_afs_flock_op(vnode, p, afs_flock_op_grant);
-		wake_up(&p->fl_wait);
+		locks_wake_up(p);
 	}
 }
 
@@ -112,25 +112,24 @@ static void afs_next_locker(struct afs_vnode *vnode, int error)
 {
 	struct file_lock *p, *_p, *next = NULL;
 	struct key *key = vnode->lock_key;
-	unsigned int fl_type = F_RDLCK;
+	unsigned int type = F_RDLCK;
 
 	_enter("");
 
 	if (vnode->lock_type == AFS_LOCK_WRITE)
-		fl_type = F_WRLCK;
+		type = F_WRLCK;
 
 	list_for_each_entry_safe(p, _p, &vnode->pending_locks, fl_u.afs.link) {
 		if (error &&
-		    p->fl_type == fl_type &&
-		    afs_file_key(p->fl_file) == key) {
+		    p->c.flc_type == type &&
+		    afs_file_key(p->c.flc_file) == key) {
 			list_del_init(&p->fl_u.afs.link);
 			p->fl_u.afs.state = error;
-			wake_up(&p->fl_wait);
+			locks_wake_up(p);
 		}
 
 		/* Select the next locker to hand off to. */
-		if (next &&
-		    (next->fl_type == F_WRLCK || p->fl_type == F_RDLCK))
+		if (next && (lock_is_write(next) || lock_is_read(p)))
 			continue;
 		next = p;
 	}
@@ -142,7 +141,7 @@ static void afs_next_locker(struct afs_vnode *vnode, int error)
 		afs_set_lock_state(vnode, AFS_VNODE_LOCK_SETTING);
 		next->fl_u.afs.state = AFS_LOCK_YOUR_TRY;
 		trace_afs_flock_op(vnode, next, afs_flock_op_wake);
-		wake_up(&next->fl_wait);
+		locks_wake_up(next);
 	} else {
 		afs_set_lock_state(vnode, AFS_VNODE_LOCK_NONE);
 		trace_afs_flock_ev(vnode, NULL, afs_flock_no_lockers, 0);
@@ -166,7 +165,7 @@ static void afs_kill_lockers_enoent(struct afs_vnode *vnode)
 				     struct file_lock, fl_u.afs.link);
 		list_del_init(&p->fl_u.afs.link);
 		p->fl_u.afs.state = -ENOENT;
-		wake_up(&p->fl_wait);
+		locks_wake_up(p);
 	}
 
 	key_put(vnode->lock_key);
@@ -464,14 +463,14 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
 
 	_enter("{%llx:%llu},%llu-%llu,%u,%u",
 	       vnode->fid.vid, vnode->fid.vnode,
-	       fl->fl_start, fl->fl_end, fl->fl_type, mode);
+	       fl->fl_start, fl->fl_end, fl->c.flc_type, mode);
 
 	fl->fl_ops = &afs_lock_ops;
 	INIT_LIST_HEAD(&fl->fl_u.afs.link);
 	fl->fl_u.afs.state = AFS_LOCK_PENDING;
 
 	partial = (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX);
-	type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
+	type = lock_is_read(fl) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
 	if (mode == afs_flock_mode_write && partial)
 		type = AFS_LOCK_WRITE;
 
@@ -524,7 +523,7 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
 	}
 
 	if (vnode->lock_state == AFS_VNODE_LOCK_NONE &&
-	    !(fl->fl_flags & FL_SLEEP)) {
+	    !(fl->c.flc_flags & FL_SLEEP)) {
 		ret = -EAGAIN;
 		if (type == AFS_LOCK_READ) {
 			if (vnode->status.lock_count == -1)
@@ -621,7 +620,7 @@ skip_server_lock:
 	return 0;
 
 lock_is_contended:
-	if (!(fl->fl_flags & FL_SLEEP)) {
+	if (!(fl->c.flc_flags & FL_SLEEP)) {
 		list_del_init(&fl->fl_u.afs.link);
 		afs_next_locker(vnode, 0);
 		ret = -EAGAIN;
@@ -641,7 +640,7 @@ need_to_wait:
 	spin_unlock(&vnode->lock);
 
 	trace_afs_flock_ev(vnode, fl, afs_flock_waiting, 0);
-	ret = wait_event_interruptible(fl->fl_wait,
+	ret = wait_event_interruptible(fl->c.flc_wait,
 				       fl->fl_u.afs.state != AFS_LOCK_PENDING);
 	trace_afs_flock_ev(vnode, fl, afs_flock_waited, ret);
 
@@ -704,7 +703,8 @@ static int afs_do_unlk(struct file *file, struct file_lock *fl)
 	struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
 	int ret;
 
-	_enter("{%llx:%llu},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type);
+	_enter("{%llx:%llu},%u", vnode->fid.vid, vnode->fid.vnode,
+	       fl->c.flc_type);
 
 	trace_afs_flock_op(vnode, fl, afs_flock_op_unlock);
 
@@ -730,11 +730,11 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl)
 	if (vnode->lock_state == AFS_VNODE_LOCK_DELETED)
 		return -ENOENT;
 
-	fl->fl_type = F_UNLCK;
+	fl->c.flc_type = F_UNLCK;
 
 	/* check local lock records first */
 	posix_test_lock(file, fl);
-	if (fl->fl_type == F_UNLCK) {
+	if (lock_is_unlock(fl)) {
 		/* no local locks; consult the server */
 		ret = afs_fetch_status(vnode, key, false, NULL);
 		if (ret < 0)
@@ -743,18 +743,18 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl)
 		lock_count = READ_ONCE(vnode->status.lock_count);
 		if (lock_count != 0) {
 			if (lock_count > 0)
-				fl->fl_type = F_RDLCK;
+				fl->c.flc_type = F_RDLCK;
 			else
-				fl->fl_type = F_WRLCK;
+				fl->c.flc_type = F_WRLCK;
 			fl->fl_start = 0;
 			fl->fl_end = OFFSET_MAX;
-			fl->fl_pid = 0;
+			fl->c.flc_pid = 0;
 		}
 	}
 
 	ret = 0;
error:
-	_leave(" = %d [%hd]", ret, fl->fl_type);
+	_leave(" = %d [%hd]", ret, fl->c.flc_type);
 	return ret;
 }
 
@@ -769,7 +769,7 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl)
 
 	_enter("{%llx:%llu},%d,{t=%x,fl=%x,r=%Ld:%Ld}",
 	       vnode->fid.vid, vnode->fid.vnode, cmd,
-	       fl->fl_type, fl->fl_flags,
+	       fl->c.flc_type, fl->c.flc_flags,
 	       (long long) fl->fl_start, (long long) fl->fl_end);
 
 	if (IS_GETLK(cmd))
@@ -778,7 +778,7 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl)
 	fl->fl_u.afs.debug_id = atomic_inc_return(&afs_file_lock_debug_id);
 	trace_afs_flock_op(vnode, fl, afs_flock_op_lock);
 
-	if (fl->fl_type == F_UNLCK)
+	if (lock_is_unlock(fl))
 		ret = afs_do_unlk(file, fl);
 	else
 		ret = afs_do_setlk(file, fl);
@@ -804,7 +804,7 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl)
 
 	_enter("{%llx:%llu},%d,{t=%x,fl=%x}",
 	       vnode->fid.vid, vnode->fid.vnode, cmd,
-	       fl->fl_type, fl->fl_flags);
+	       fl->c.flc_type, fl->c.flc_flags);
 
 	/*
 	 * No BSD flocks over NFS allowed.
@@ -813,14 +813,14 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl)
 	 * Not sure whether that would be unique, though, or whether
 	 * that would break in other places.
 	 */
-	if (!(fl->fl_flags & FL_FLOCK))
+	if (!(fl->c.flc_flags & FL_FLOCK))
 		return -ENOLCK;
 
 	fl->fl_u.afs.debug_id = atomic_inc_return(&afs_file_lock_debug_id);
 	trace_afs_flock_op(vnode, fl, afs_flock_op_flock);
 
 	/* we're simulating flock() locks using posix locks on the server */
-	if (fl->fl_type == F_UNLCK)
+	if (lock_is_unlock(fl))
 		ret = afs_do_unlk(file, fl);
 	else
 		ret = afs_do_setlk(file, fl);
@@ -843,7 +843,7 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl)
 */
 static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl)
 {
-	struct afs_vnode *vnode = AFS_FS_I(file_inode(fl->fl_file));
+	struct afs_vnode *vnode = AFS_FS_I(file_inode(fl->c.flc_file));
 
 	_enter("");
 
@@ -861,7 +861,7 @@ static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl)
 */
 static void afs_fl_release_private(struct file_lock *fl)
 {
-	struct afs_vnode *vnode = AFS_FS_I(file_inode(fl->fl_file));
+	struct afs_vnode *vnode = AFS_FS_I(file_inode(fl->c.flc_file));
 
 	_enter("");
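[Editorial note] The lock_is_read()/lock_is_write()/lock_is_unlock() calls introduced above are thin predicates over the new flc_type field. Their bodies are not shown in this diff, so the rendering below is an assumption inferred from the call sites, using the same demo structs as the earlier sketch:

#include <fcntl.h>
#include <stdbool.h>

struct file_lock_core_demo { unsigned char flc_type; };
struct file_lock_demo { struct file_lock_core_demo c; };

/* Presumed shape of the helpers: each reads only the embedded core. */
static bool lock_is_read(const struct file_lock_demo *fl)
{ return fl->c.flc_type == F_RDLCK; }
static bool lock_is_write(const struct file_lock_demo *fl)
{ return fl->c.flc_type == F_WRLCK; }
static bool lock_is_unlock(const struct file_lock_demo *fl)
{ return fl->c.flc_type == F_UNLCK; }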
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 9c03fcf7ffaa..6ce5a612937c 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -321,8 +321,7 @@ struct afs_net {
 	struct list_head	fs_probe_slow;	/* List of afs_server to probe at 5m intervals */
 	struct hlist_head	fs_proc;	/* procfs servers list */
 
-	struct hlist_head	fs_addresses4;	/* afs_server (by lowest IPv4 addr) */
-	struct hlist_head	fs_addresses6;	/* afs_server (by lowest IPv6 addr) */
+	struct hlist_head	fs_addresses;	/* afs_server (by lowest IPv6 addr) */
 	seqlock_t		fs_addr_lock;	/* For fs_addresses[46] */
 
 	struct work_struct	fs_manager;
@@ -561,8 +560,7 @@ struct afs_server {
 	struct afs_server __rcu	*uuid_next;	/* Next server with same UUID */
 	struct afs_server	*uuid_prev;	/* Previous server with same UUID */
 	struct list_head	probe_link;	/* Link in net->fs_probe_list */
-	struct hlist_node	addr4_link;	/* Link in net->fs_addresses4 */
-	struct hlist_node	addr6_link;	/* Link in net->fs_addresses6 */
+	struct hlist_node	addr_link;	/* Link in net->fs_addresses6 */
 	struct hlist_node	proc_link;	/* Link in net->fs_proc */
 	struct list_head	volumes;	/* RCU list of afs_server_entry objects */
 	struct afs_server	*gc_next;	/* Next server in manager's list */
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 1b3bd21c168a..a14f6013e316 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -90,8 +90,7 @@ static int __net_init afs_net_init(struct net *net_ns)
 	INIT_LIST_HEAD(&net->fs_probe_slow);
 	INIT_HLIST_HEAD(&net->fs_proc);
 
-	INIT_HLIST_HEAD(&net->fs_addresses4);
-	INIT_HLIST_HEAD(&net->fs_addresses6);
+	INIT_HLIST_HEAD(&net->fs_addresses);
 	seqlock_init(&net->fs_addr_lock);
 
 	INIT_WORK(&net->fs_manager, afs_manage_servers);
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 3bd02571f30d..15eab053af6d 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -166,7 +166,7 @@ static int afs_proc_addr_prefs_show(struct seq_file *m, void *v)
 
 	if (!preflist) {
 		seq_puts(m, "NO PREFS\n");
-		return 0;
+		goto out;
 	}
 
 	seq_printf(m, "PROT SUBNET PRIOR (v=%u n=%u/%u/%u)\n",
@@ -191,7 +191,8 @@ static int afs_proc_addr_prefs_show(struct seq_file *m, void *v)
 		}
 	}
 
-	rcu_read_lock();
+out:
+	rcu_read_unlock();
 	return 0;
 }
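[Editorial note] The proc.c fix is twofold: the early return leaked the RCU read lock, and the exit path called rcu_read_lock() where it meant rcu_read_unlock(). The standard cure is a single unlock site that every path funnels through. A userspace rendering of the same shape with a pthread mutex:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* Every early exit goes through the one unlock site; no path can return
 * while still holding the lock, and no path can lock twice. */
static int show(int have_prefs)
{
	pthread_mutex_lock(&lock);
	if (!have_prefs) {
		puts("NO PREFS");
		goto out;        /* not "return 0" while holding the lock */
	}
	puts("...print the preference table...");
out:
	pthread_mutex_unlock(&lock);
	return 0;
}

int main(void) { return show(0); }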
diff --git a/fs/afs/server.c b/fs/afs/server.c
index e169121f603e..038f9d0ae3af 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -38,7 +38,7 @@ struct afs_server *afs_find_server(struct afs_net *net, const struct rxrpc_peer
 		seq++; /* 2 on the 1st/lockless path, otherwise odd */
 		read_seqbegin_or_lock(&net->fs_addr_lock, &seq);
 
-		hlist_for_each_entry_rcu(server, &net->fs_addresses6, addr6_link) {
+		hlist_for_each_entry_rcu(server, &net->fs_addresses, addr_link) {
 			estate = rcu_dereference(server->endpoint_state);
 			alist = estate->addresses;
 			for (i = 0; i < alist->nr_addrs; i++)
@@ -177,10 +177,8 @@ added_dup:
 	 * bit, but anything we might want to do gets messy and memory
 	 * intensive.
 	 */
-	if (alist->nr_ipv4 > 0)
-		hlist_add_head_rcu(&server->addr4_link, &net->fs_addresses4);
-	if (alist->nr_addrs > alist->nr_ipv4)
-		hlist_add_head_rcu(&server->addr6_link, &net->fs_addresses6);
+	if (alist->nr_addrs > 0)
+		hlist_add_head_rcu(&server->addr_link, &net->fs_addresses);
 
 	write_sequnlock(&net->fs_addr_lock);
 
@@ -511,10 +509,8 @@ static void afs_gc_servers(struct afs_net *net, struct afs_server *gc_list)
 
 			list_del(&server->probe_link);
 			hlist_del_rcu(&server->proc_link);
-			if (!hlist_unhashed(&server->addr4_link))
-				hlist_del_rcu(&server->addr4_link);
-			if (!hlist_unhashed(&server->addr6_link))
-				hlist_del_rcu(&server->addr6_link);
+			if (!hlist_unhashed(&server->addr_link))
+				hlist_del_rcu(&server->addr_link);
 		}
 		write_sequnlock(&net->fs_lock);
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 020ecd45e476..af3a3f57c1b3 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -353,7 +353,7 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key)
 {
 	struct afs_server_list *new, *old, *discard;
 	struct afs_vldb_entry *vldb;
-	char idbuf[16];
+	char idbuf[24];
 	int ret, idsz;
 
 	_enter("");
 
@@ -361,7 +361,7 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key)
 	/* We look up an ID by passing it as a decimal string in the
 	 * operation's name parameter.
 	 */
-	idsz = sprintf(idbuf, "%llu", volume->vid);
+	idsz = snprintf(idbuf, sizeof(idbuf), "%llu", volume->vid);
 
 	vldb = afs_vl_lookup_vldb(volume->cell, key, idbuf, idsz);
 	if (IS_ERR(vldb)) {
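[Editorial note] The volume.c change is a classic bounds fix: a u64 volume ID can need 20 decimal digits plus a NUL, which does not fit sprintf()'s 16-byte buffer. Demonstrated at the boundary:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

int main(void)
{
	/* UINT64_MAX is 20 decimal digits, so idbuf[16] could overflow;
	 * 24 bytes holds 20 digits plus the NUL with slack, and snprintf
	 * bounds the write either way. */
	char idbuf[24];
	int idsz = snprintf(idbuf, sizeof(idbuf), "%" PRIu64, UINT64_MAX);

	printf("%s (%d chars)\n", idbuf, idsz); /* 18446744073709551615 (20) */
	return 0;
}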
diff --git a/fs/aio.c b/fs/aio.c
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -589,13 +589,24 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
 
 void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
 {
-	struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, rw);
-	struct kioctx *ctx = req->ki_ctx;
+	struct aio_kiocb *req;
+	struct kioctx *ctx;
 	unsigned long flags;
 
+	/*
+	 * kiocb didn't come from aio or is neither a read nor a write, hence
+	 * ignore it.
+	 */
+	if (!(iocb->ki_flags & IOCB_AIO_RW))
+		return;
+
+	req = container_of(iocb, struct aio_kiocb, rw);
+
 	if (WARN_ON_ONCE(!list_empty(&req->ki_list)))
 		return;
 
+	ctx = req->ki_ctx;
+
 	spin_lock_irqsave(&ctx->ctx_lock, flags);
 	list_add_tail(&req->ki_list, &ctx->active_reqs);
 	req->ki_cancel = cancel;
@@ -1509,7 +1520,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
 	req->ki_complete = aio_complete_rw;
 	req->private = NULL;
 	req->ki_pos = iocb->aio_offset;
-	req->ki_flags = req->ki_filp->f_iocb_flags;
+	req->ki_flags = req->ki_filp->f_iocb_flags | IOCB_AIO_RW;
 	if (iocb->aio_flags & IOCB_FLAG_RESFD)
 		req->ki_flags |= IOCB_EVENTFD;
 	if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
diff --git a/fs/attr.c b/fs/attr.c
index 5a13f0c8495f..960a310581eb 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -16,8 +16,6 @@
 #include <linux/fcntl.h>
 #include <linux/filelock.h>
 #include <linux/security.h>
-#include <linux/evm.h>
-#include <linux/ima.h>
 
 #include "internal.h"
 
@@ -352,7 +350,7 @@ int may_setattr(struct mnt_idmap *idmap, struct inode *inode,
 EXPORT_SYMBOL(may_setattr);
 
 /**
- * notify_change - modify attributes of a filesytem object
+ * notify_change - modify attributes of a filesystem object
 * @idmap:	idmap of the mount the inode was found from
 * @dentry:	object affected
 * @attr:	new attributes
@@ -502,8 +500,7 @@ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry,
 
 	if (!error) {
 		fsnotify_change(dentry, ia_valid);
-		ima_inode_post_setattr(idmap, dentry);
-		evm_inode_post_setattr(dentry, ia_valid);
+		security_inode_post_setattr(idmap, dentry, ia_valid);
 	}
 
 	return error;
diff --git a/fs/backing-file.c b/fs/backing-file.c
index a681f38d84d8..740185198db3 100644
--- a/fs/backing-file.c
+++ b/fs/backing-file.c
@@ -325,9 +325,7 @@ EXPORT_SYMBOL_GPL(backing_file_mmap);
 
 static int __init backing_aio_init(void)
 {
-	backing_aio_cachep = kmem_cache_create("backing_aio",
-					       sizeof(struct backing_aio),
-					       0, SLAB_HWCACHE_ALIGN, NULL);
+	backing_aio_cachep = KMEM_CACHE(backing_aio, SLAB_HWCACHE_ALIGN);
 	if (!backing_aio_cachep)
 		return -ENOMEM;
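[Editorial note] The aio.c fix guards a container_of(): if the kiocb is not actually embedded in an aio_kiocb, rewinding the pointer lands in unrelated memory. The new IOCB_AIO_RW flag, set only in aio_prep_rw(), proves the embedding before the cast. A self-contained illustration of the hazard and the guard (all names here are demo stand-ins):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct kiocb_demo { int ki_flags; };
#define IOCB_AIO_RW_DEMO (1 << 0)   /* flag value is illustrative */

struct aio_kiocb_demo {
	struct kiocb_demo rw;       /* must really be the embedding struct... */
	void *ki_ctx;
};

static void set_cancel_fn(struct kiocb_demo *iocb)
{
	/* ...otherwise container_of() would fabricate a bogus aio_kiocb.
	 * The flag, set only where the embedding is created, proves it. */
	if (!(iocb->ki_flags & IOCB_AIO_RW_DEMO))
		return;

	struct aio_kiocb_demo *req =
		container_of(iocb, struct aio_kiocb_demo, rw);
	printf("ctx=%p\n", req->ki_ctx);
}

int main(void)
{
	struct aio_kiocb_demo req = { .rw = { .ki_flags = IOCB_AIO_RW_DEMO } };
	set_cancel_fn(&req.rw);     /* safe: flag proves the embedding */
	return 0;
}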
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 10704f2d3af5..fd3e175d8342 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -1715,7 +1715,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
 		 * This works without any other locks because this is the only
 		 * thread that removes items from the need_discard tree
 		 */
-		bch2_trans_unlock(trans);
+		bch2_trans_unlock_long(trans);
 		blkdev_issue_discard(ca->disk_sb.bdev,
 				     k.k->p.offset * ca->mi.bucket_size,
 				     ca->mi.bucket_size,
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index b4dc319bcb2b..569b97904da4 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -68,9 +68,11 @@ void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer
 
 void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
 {
-	prt_str(out, "bucket=");
-	bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p));
-	prt_str(out, " ");
+	if (bch2_dev_exists2(c, k.k->p.inode)) {
+		prt_str(out, "bucket=");
+		bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p));
+		prt_str(out, " ");
+	}
 
 	bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v);
 }
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index b80c6c9efd8c..69d0d60d50e3 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -1249,6 +1249,18 @@ static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c)
 	return stdio;
 }
 
+static inline unsigned metadata_replicas_required(struct bch_fs *c)
+{
+	return min(c->opts.metadata_replicas,
+		   c->opts.metadata_replicas_required);
+}
+
+static inline unsigned data_replicas_required(struct bch_fs *c)
+{
+	return min(c->opts.data_replicas,
+		   c->opts.data_replicas_required);
+}
+
 #define BKEY_PADDED_ONSTACK(key, pad)					\
 	struct { struct bkey_i key; __u64 key ## _pad[pad]; }
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 5467a8635be1..3ef338df82f5 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -2156,7 +2156,9 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
 		 * isn't monotonically increasing before FILTER_SNAPSHOTS, and
 		 * that's what we check against in extents mode:
 		 */
-		if (k.k->p.inode > end.inode)
+		if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS)
+			     ? bkey_gt(k.k->p, end)
+			     : k.k->p.inode > end.inode))
 			goto end;
 
 		if (iter->update_path &&
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index bed75c93c069..684397442338 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -92,7 +92,7 @@ static noinline void print_cycle(struct printbuf *out, struct lock_graph *g)
 			continue;
 
 		bch2_btree_trans_to_text(out, i->trans);
-		bch2_prt_task_backtrace(out, task, i == g->g ? 5 : 1);
+		bch2_prt_task_backtrace(out, task, i == g->g ? 5 : 1, GFP_NOWAIT);
 	}
 }
@@ -227,7 +227,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
 		prt_printf(&buf, "backtrace:");
 		prt_newline(&buf);
 		printbuf_indent_add(&buf, 2);
-		bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2);
+		bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT);
 		printbuf_indent_sub(&buf, 2);
 		prt_newline(&buf);
 	}
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 17a5938aa71a..4530b14ff2c3 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -280,7 +280,8 @@ retry:
 				      writepoint_ptr(&c->btree_write_point),
 				      &devs_have,
 				      res->nr_replicas,
-				      c->opts.metadata_replicas_required,
+				      min(res->nr_replicas,
+					  c->opts.metadata_replicas_required),
 				      watermark, 0, cl, &wp);
 	if (unlikely(ret))
 		return ERR_PTR(ret);
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index cadda9bbe4a4..7bdba8507fc9 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -627,7 +627,7 @@ restart:
 		prt_printf(&i->buf, "backtrace:");
 		prt_newline(&i->buf);
 		printbuf_indent_add(&i->buf, 2);
-		bch2_prt_task_backtrace(&i->buf, task, 0);
+		bch2_prt_task_backtrace(&i->buf, task, 0, GFP_KERNEL);
 		printbuf_indent_sub(&i->buf, 2);
 		prt_newline(&i->buf);
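[Editorial note] Several bcachefs hunks in this series apply the same clamp: a "replicas required" option larger than the configured replica count would make every allocation fail, so the effective requirement is min() of the two. The arithmetic, isolated:

#include <stdio.h>

#define min_u(a, b) ((a) < (b) ? (a) : (b))

/* A required count above the configured count is unsatisfiable;
 * clamping keeps a misconfigured filesystem writable. */
static unsigned replicas_required_demo(unsigned replicas, unsigned required)
{
	return min_u(replicas, required);
}

int main(void)
{
	/* misconfigured: 2 required but only 1 replica configured -> 1 */
	printf("%u\n", replicas_required_demo(1, 2));
	return 0;
}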
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index 73c12e565af5..27710cdd5710 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -303,18 +303,6 @@ void bch2_readahead(struct readahead_control *ractl)
 	darray_exit(&readpages_iter.folios);
 }
 
-static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio,
-			      subvol_inum inum, struct folio *folio)
-{
-	bch2_folio_create(folio, __GFP_NOFAIL);
-
-	rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
-	rbio->bio.bi_iter.bi_sector = folio_sector(folio);
-	BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
-
-	bch2_trans_run(c, (bchfs_read(trans, rbio, inum, NULL), 0));
-}
-
 static void bch2_read_single_folio_end_io(struct bio *bio)
 {
 	complete(bio->bi_private);
@@ -329,6 +317,9 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
 	int ret;
 	DECLARE_COMPLETION_ONSTACK(done);
 
+	if (!bch2_folio_create(folio, GFP_KERNEL))
+		return -ENOMEM;
+
 	bch2_inode_opts_get(&opts, c, &inode->ei_inode);
 
 	rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read),
@@ -336,7 +327,11 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
 	rbio->bio.bi_private = &done;
 	rbio->bio.bi_end_io = bch2_read_single_folio_end_io;
-	__bchfs_readfolio(c, rbio, inode_inum(inode), folio);
+	rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
+	rbio->bio.bi_iter.bi_sector = folio_sector(folio);
+	BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
+
+	bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0));
 	wait_for_completion(&done);
 
 	ret = blk_status_to_errno(rbio->bio.bi_status);
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
index e3b219e19e10..33cb6da3a5ad 100644
--- a/fs/bcachefs/fs-io-direct.c
+++ b/fs/bcachefs/fs-io-direct.c
@@ -88,6 +88,8 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
 		return ret;
 
 	shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
+	if (shorten >= iter->count)
+		shorten = 0;
 	iter->count -= shorten;
 
 	bio = bio_alloc_bioset(NULL,
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index dc52918d06ef..8c70123b6a0c 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -79,7 +79,7 @@ void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
 			continue;
 
 		bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0,
-						    REQ_OP_FLUSH,
+						    REQ_OP_WRITE|REQ_PREFLUSH,
 						    GFP_KERNEL,
 						    &c->nocow_flush_bioset),
 				   struct nocow_flush, bio);
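[Editorial note] The fs-io-direct.c guard handles unsigned underflow: rounding the returned byte count up to the block size can exceed the iterator's remaining count, and subtracting then wraps to a huge value. A standalone demonstration of the wrap and the guard:

#include <stdio.h>
#include <stddef.h>

#define round_up(x, y) ((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
	/* A short read (ret < count) rounded up to the block size can pass
	 * count when count isn't block-aligned; the difference then wraps. */
	size_t count = 4000, ret = 512, block = 4096;
	size_t shorten = count - round_up(ret, block);  /* wraps: huge value */

	if (shorten >= count)   /* the added guard resets it instead */
		shorten = 0;
	printf("shorten=%zu\n", shorten);               /* prints 0 */
	return 0;
}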
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index 3a4c24c28e7f..3dc8630ff9fe 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -455,6 +455,7 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
 	if (IS_ERR(victim))
 		return PTR_ERR(victim);
 
+	dir = d_inode(path.dentry);
 	if (victim->d_sb->s_fs_info != c) {
 		ret = -EXDEV;
 		goto err;
@@ -463,14 +464,13 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
 		ret = -ENOENT;
 		goto err;
 	}
-	dir = d_inode(path.dentry);
 	ret = __bch2_unlink(dir, victim, true);
 	if (!ret) {
 		fsnotify_rmdir(dir, victim);
 		d_delete(victim);
 	}
-	inode_unlock(dir);
err:
+	inode_unlock(dir);
 	dput(victim);
 	path_put(&path);
 	return ret;
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index ec419b8e2c43..77ae65542db9 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -435,7 +435,7 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
 		bch2_subvol_is_ro(c, inode->ei_subvol) ?:
 		__bch2_link(c, inode, dir, dentry);
 	if (unlikely(ret))
-		return ret;
+		return bch2_err_class(ret);
 
 	ihold(&inode->v);
 	d_instantiate(dentry, &inode->v);
@@ -487,8 +487,9 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
 	struct bch_inode_info *dir= to_bch_ei(vdir);
 	struct bch_fs *c = dir->v.i_sb->s_fs_info;
 
-	return bch2_subvol_is_ro(c, dir->ei_subvol) ?:
+	int ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
 		__bch2_unlink(vdir, dentry, false);
+	return bch2_err_class(ret);
 }
 
 static int bch2_symlink(struct mnt_idmap *idmap,
@@ -523,7 +524,7 @@ static int bch2_symlink(struct mnt_idmap *idmap,
 	return 0;
err:
 	iput(&inode->v);
-	return ret;
+	return bch2_err_class(ret);
 }
 
 static int bch2_mkdir(struct mnt_idmap *idmap,
@@ -641,7 +642,7 @@ err:
 			     src_inode,
 			     dst_inode);
 
-	return ret;
+	return bch2_err_class(ret);
 }
 
 static void bch2_setattr_copy(struct mnt_idmap *idmap,
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 4f0ecd605675..6a760777bafb 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -119,22 +119,19 @@ static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
 	if (!ret)
 		*snapshot = iter.pos.snapshot;
err:
-	bch_err_msg(trans->c, ret, "fetching inode %llu:%u", inode_nr, *snapshot);
 	bch2_trans_iter_exit(trans, &iter);
 	return ret;
 }
 
-static int __lookup_dirent(struct btree_trans *trans,
+static int lookup_dirent_in_snapshot(struct btree_trans *trans,
 			   struct bch_hash_info hash_info,
 			   subvol_inum dir, struct qstr *name,
-			   u64 *target, unsigned *type)
+			   u64 *target, unsigned *type, u32 snapshot)
 {
 	struct btree_iter iter;
 	struct bkey_s_c_dirent d;
-	int ret;
-
-	ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc,
-			       &hash_info, dir, name, 0);
+	int ret = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
+					       &hash_info, dir, name, 0, snapshot);
 	if (ret)
 		return ret;
 
@@ -225,15 +222,16 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
 	struct bch_inode_unpacked root_inode;
 	struct bch_hash_info root_hash_info;
 
-	ret = lookup_inode(trans, root_inum.inum, &root_inode, &snapshot);
+	u32 root_inode_snapshot = snapshot;
+	ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot);
 	bch_err_msg(c, ret, "looking up root inode");
 	if (ret)
 		return ret;
 
 	root_hash_info = bch2_hash_info_init(c, &root_inode);
 
-	ret = __lookup_dirent(trans, root_hash_info, root_inum,
-			      &lostfound_str, &inum, &d_type);
+	ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
+					&lostfound_str, &inum, &d_type, snapshot);
 	if (bch2_err_matches(ret, ENOENT))
 		goto create_lostfound;
 
@@ -250,7 +248,10 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
 	 * The bch2_check_dirents pass has already run, dangling dirents
 	 * shouldn't exist here:
 	 */
-	return lookup_inode(trans, inum, lostfound, &snapshot);
+	ret = lookup_inode(trans, inum, lostfound, &snapshot);
+	bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
+		    inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
+	return ret;
 
create_lostfound:
 	/*
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index ef3a53f9045a..2c098ac017b3 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -1564,6 +1564,7 @@ CLOSURE_CALLBACK(bch2_write)
 	BUG_ON(!op->write_point.v);
 	BUG_ON(bkey_eq(op->pos, POS_MAX));
 
+	op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas);
 	op->start_time = local_clock();
 	bch2_keylist_init(&op->insert_keys, op->inline_keys);
 	wbio_init(bio)->put_bio = false;
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index d71d26e39521..bc890776eb57 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -233,7 +233,7 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t
 		prt_str(&pbuf, "entry size: ");
 		prt_human_readable_u64(&pbuf, vstruct_bytes(buf->data));
 		prt_newline(&pbuf);
-		bch2_prt_task_backtrace(&pbuf, current, 1);
+		bch2_prt_task_backtrace(&pbuf, current, 1, GFP_NOWAIT);
 		trace_journal_entry_close(c, pbuf.buf);
 		printbuf_exit(&pbuf);
 	}
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 04a1e79a5ed3..47805193f18c 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1478,6 +1478,8 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w)
 		c->opts.foreground_target;
 	unsigned i, replicas = 0, replicas_want =
 		READ_ONCE(c->opts.metadata_replicas);
+	unsigned replicas_need = min_t(unsigned, replicas_want,
+				       READ_ONCE(c->opts.metadata_replicas_required));
 
 	rcu_read_lock();
retry:
@@ -1526,7 +1528,7 @@ done:
 
 	BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);
 
-	return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
+	return replicas >= replicas_need ? 0 : -EROFS;
 }
 
 static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
@@ -1988,7 +1990,8 @@ CLOSURE_CALLBACK(bch2_journal_write)
 		percpu_ref_get(&ca->io_ref);
 
 		bio = ca->journal.bio;
-		bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
+		bio_reset(bio, ca->disk_sb.bdev,
+			  REQ_OP_WRITE|REQ_PREFLUSH);
 		bio->bi_end_io  = journal_write_endio;
 		bio->bi_private = ca;
 		closure_bio_submit(bio, cl);
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index 820d25e19e5f..c33dca641575 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -205,7 +205,7 @@ void bch2_journal_space_available(struct journal *j)
 
 	j->can_discard = can_discard;
 
-	if (nr_online < c->opts.metadata_replicas_required) {
+	if (nr_online < metadata_replicas_required(c)) {
 		ret = JOURNAL_ERR_insufficient_devices;
 		goto out;
 	}
@@ -892,9 +892,11 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
 					journal_seq_pin(j, seq)->devs);
 		seq++;
 
-		spin_unlock(&j->lock);
-		ret = bch2_mark_replicas(c, &replicas.e);
-		spin_lock(&j->lock);
+		if (replicas.e.nr_devs) {
+			spin_unlock(&j->lock);
+			ret = bch2_mark_replicas(c, &replicas.e);
+			spin_lock(&j->lock);
+		}
 	}
 	spin_unlock(&j->lock);
err:
diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h
index b2be565bb8f2..64df11ab422b 100644
--- a/fs/bcachefs/mean_and_variance.h
+++ b/fs/bcachefs/mean_and_variance.h
@@ -17,7 +17,7 @@
 * Rust and rustc has issues with u128.
 */
-#if defined(__SIZEOF_INT128__) && defined(__KERNEL__)
+#if defined(__SIZEOF_INT128__) && defined(__KERNEL__) && !defined(CONFIG_PARISC)
 
 typedef struct {
 	unsigned __int128 v;
diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c
index accf246c3233..b27d22925929 100644
--- a/fs/bcachefs/printbuf.c
+++ b/fs/bcachefs/printbuf.c
@@ -56,6 +56,7 @@ void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args)
 
 		va_copy(args2, args);
 		len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args2);
+		va_end(args2);
 	} while (len + 1 >= printbuf_remaining(out) &&
 		 !bch2_printbuf_make_room(out, len + 1));
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 9127d0e3ca2f..21e13bb4335b 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -577,8 +577,9 @@ u64 bch2_recovery_passes_from_stable(u64 v)
 
 static bool check_version_upgrade(struct bch_fs *c)
 {
-	unsigned latest_compatible = bch2_latest_compatible_version(c->sb.version);
 	unsigned latest_version	= bcachefs_metadata_version_current;
+	unsigned latest_compatible = min(latest_version,
+					 bch2_latest_compatible_version(c->sb.version));
 	unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
 	unsigned new_version = 0;
 
@@ -597,7 +598,7 @@ static bool check_version_upgrade(struct bch_fs *c)
 			new_version = latest_version;
 			break;
 		case BCH_VERSION_UPGRADE_none:
-			new_version = old_version;
+			new_version = min(old_version, latest_version);
 			break;
 		}
 	}
@@ -774,7 +775,7 @@ int bch2_fs_recovery(struct bch_fs *c)
 		goto err;
 	}
 
-	if (!(c->opts.nochanges && c->opts.norecovery)) {
+	if (!c->opts.nochanges) {
 		mutex_lock(&c->sb_lock);
 		bool write_sb = false;
 
@@ -804,7 +805,7 @@ int bch2_fs_recovery(struct bch_fs *c)
 		if (bch2_check_version_downgrade(c)) {
 			struct printbuf buf = PRINTBUF;
 
-			prt_str(&buf, "Version downgrade required:\n");
+			prt_str(&buf, "Version downgrade required:");
 
 			__le64 passes = ext->recovery_passes_required[0];
 			bch2_sb_set_downgrade(c,
@@ -812,7 +813,7 @@ int bch2_fs_recovery(struct bch_fs *c)
 				      BCH_VERSION_MINOR(c->sb.version));
 			passes = ext->recovery_passes_required[0] & ~passes;
 			if (passes) {
-				prt_str(&buf, " running recovery passes: ");
+				prt_str(&buf, "\n running recovery passes: ");
 				prt_bitflags(&buf, bch2_recovery_passes,
 					     bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
 			}
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index a45354d2acde..eff5ce18c69c 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -421,7 +421,7 @@ void bch2_dev_errors_reset(struct bch_dev *ca)
 	m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
 	for (unsigned i = 0; i < ARRAY_SIZE(m->errors_at_reset); i++)
 		m->errors_at_reset[i] = cpu_to_le64(atomic64_read(&ca->errors[i]));
-	m->errors_reset_time = ktime_get_real_seconds();
+	m->errors_reset_time = cpu_to_le64(ktime_get_real_seconds());
 
 	bch2_write_super(c);
 	mutex_unlock(&c->sb_lock);
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 45f67e8b29eb..ac6ba04d5521 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -728,7 +728,7 @@ static int check_snapshot(struct btree_trans *trans,
 		return 0;
 
 	memset(&s, 0, sizeof(s));
-	memcpy(&s, k.v, bkey_val_bytes(k.k));
+	memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k)));
 
 	id = le32_to_cpu(s.parent);
 	if (id) {
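[Editorial note] The printbuf.c one-liner plugs a va_list leak: each va_copy() must be paired with va_end(), including inside a retry loop. The same copy-format-end-retry shape in a standalone, growable sprintf:

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

/* Growable vsnprintf: the va_list is re-copied on every retry, and every
 * copy gets a matching va_end() — the line the fix adds. */
static char *vstrdup_printf(const char *fmt, va_list args)
{
	size_t size = 16;
	char *buf = malloc(size);
	int len;

	for (;;) {
		va_list args2;

		va_copy(args2, args);
		len = vsnprintf(buf, size, fmt, args2);
		va_end(args2);
		if (len < 0 || (size_t)len < size)
			return buf;
		size = (size_t)len + 1;
		buf = realloc(buf, size);
	}
}

static char *strdup_printf(const char *fmt, ...)
{
	va_list args;
	char *s;

	va_start(args, fmt);
	s = vstrdup_printf(fmt, args);
	va_end(args);
	return s;
}

int main(void)
{
	char *s = strdup_printf("hello %s %d", "world", 42);
	puts(s);
	free(s);
	return 0;
}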
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index 89fdb7c21134..fcaa5a888744 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -160,21 +160,16 @@ static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, s
 }
 
 static __always_inline int
-bch2_hash_lookup(struct btree_trans *trans,
+bch2_hash_lookup_in_snapshot(struct btree_trans *trans,
 		 struct btree_iter *iter,
 		 const struct bch_hash_desc desc,
 		 const struct bch_hash_info *info,
 		 subvol_inum inum, const void *key,
-		 unsigned flags)
+		 unsigned flags, u32 snapshot)
 {
 	struct bkey_s_c k;
-	u32 snapshot;
 	int ret;
 
-	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
-	if (ret)
-		return ret;
-
 	for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
 			   SPOS(inum.inum, desc.hash_key(info, key), snapshot),
 			   POS(inum.inum, U64_MAX),
@@ -195,6 +190,19 @@ bch2_hash_lookup(struct btree_trans *trans,
 }
 
 static __always_inline int
+bch2_hash_lookup(struct btree_trans *trans,
+		 struct btree_iter *iter,
+		 const struct bch_hash_desc desc,
+		 const struct bch_hash_info *info,
+		 subvol_inum inum, const void *key,
+		 unsigned flags)
+{
+	u32 snapshot;
+	return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
+	       bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot);
+}
+
+static __always_inline int
 bch2_hash_hole(struct btree_trans *trans,
 	       struct btree_iter *iter,
 	       const struct bch_hash_desc desc,
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index d60c7d27a047..bd64eb68e84a 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -142,8 +142,8 @@ void bch2_sb_field_delete(struct bch_sb_handle *sb,
 void bch2_free_super(struct bch_sb_handle *sb)
 {
 	kfree(sb->bio);
-	if (!IS_ERR_OR_NULL(sb->bdev_handle))
-		bdev_release(sb->bdev_handle);
+	if (!IS_ERR_OR_NULL(sb->s_bdev_file))
+		fput(sb->s_bdev_file);
 	kfree(sb->holder);
 	kfree(sb->sb_name);
 
@@ -704,22 +704,22 @@ retry:
 	if (!opt_get(*opts, nochanges))
 		sb->mode |= BLK_OPEN_WRITE;
 
-	sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
-	if (IS_ERR(sb->bdev_handle) &&
-	    PTR_ERR(sb->bdev_handle) == -EACCES &&
+	sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
+	if (IS_ERR(sb->s_bdev_file) &&
+	    PTR_ERR(sb->s_bdev_file) == -EACCES &&
 	    opt_get(*opts, read_only)) {
 		sb->mode &= ~BLK_OPEN_WRITE;
 
-		sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
-		if (!IS_ERR(sb->bdev_handle))
+		sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
+		if (!IS_ERR(sb->s_bdev_file))
 			opt_set(*opts, nochanges, true);
 	}
 
-	if (IS_ERR(sb->bdev_handle)) {
-		ret = PTR_ERR(sb->bdev_handle);
-		goto out;
+	if (IS_ERR(sb->s_bdev_file)) {
+		ret = PTR_ERR(sb->s_bdev_file);
+		goto err;
 	}
 
-	sb->bdev = sb->bdev_handle->bdev;
+	sb->bdev = file_bdev(sb->s_bdev_file);
 
 	ret = bch2_sb_realloc(sb, 0);
 	if (ret) {
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index b9911402b175..6b23e11825e6 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -1428,10 +1428,10 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
 
 		required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
 			       ? c->opts.metadata_replicas
-			       : c->opts.metadata_replicas_required,
+			       : metadata_replicas_required(c),
 			       !(flags & BCH_FORCE_IF_DATA_DEGRADED)
 			       ? c->opts.data_replicas
-			       : c->opts.data_replicas_required);
+			       : data_replicas_required(c));
 
 		return nr_rw >= required;
 	case BCH_MEMBER_STATE_failed:
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
index 0e5a14fc8e7f..ec784d975f66 100644
--- a/fs/bcachefs/super_types.h
+++ b/fs/bcachefs/super_types.h
@@ -4,7 +4,7 @@
 
 struct bch_sb_handle {
 	struct bch_sb		*sb;
-	struct bdev_handle	*bdev_handle;
+	struct file		*s_bdev_file;
 	struct block_device	*bdev;
 	char			*sb_name;
 	struct bio		*bio;
diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c
index b1c867aa2b58..9220d7de10db 100644
--- a/fs/bcachefs/thread_with_file.c
+++ b/fs/bcachefs/thread_with_file.c
@@ -53,9 +53,9 @@ int bch2_run_thread_with_file(struct thread_with_file *thr,
 	if (ret)
 		goto err;
 
-	fd_install(fd, file);
 	get_task_struct(thr->task);
 	wake_up_process(thr->task);
+	fd_install(fd, file);
 	return fd;
err:
 	if (fd >= 0)
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index a135136adeee..3a32faa86b5c 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -272,14 +272,14 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines)
 	console_unlock();
 }
 
-int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr)
+int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr,
+			gfp_t gfp)
 {
#ifdef CONFIG_STACKTRACE
 	unsigned nr_entries = 0;
-	int ret = 0;
 
 	stack->nr = 0;
-	ret = darray_make_room(stack, 32);
+	int ret = darray_make_room_gfp(stack, 32, gfp);
 	if (ret)
 		return ret;
 
@@ -289,7 +289,7 @@ int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigne
 	do {
 		nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, skipnr + 1);
 	} while (nr_entries == stack->size &&
-		 !(ret = darray_make_room(stack, stack->size * 2)));
+		 !(ret = darray_make_room_gfp(stack, stack->size * 2, gfp)));
 
 	stack->nr = nr_entries;
 	up_read(&task->signal->exec_update_lock);
@@ -308,10 +308,10 @@ void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack)
 	}
 }
 
-int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task, unsigned skipnr)
+int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task, unsigned skipnr, gfp_t gfp)
 {
 	bch_stacktrace stack = { 0 };
-	int ret = bch2_save_backtrace(&stack, task, skipnr + 1);
+	int ret = bch2_save_backtrace(&stack, task, skipnr + 1, gfp);
 
 	bch2_prt_backtrace(out, &stack);
 	darray_exit(&stack);
@@ -418,14 +418,15 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
 			bch2_quantiles_update(&stats->quantiles, duration);
 	}
 
-	if (time_after64(end, stats->last_event)) {
+	if (stats->last_event && time_after64(end, stats->last_event)) {
 		freq = end - stats->last_event;
 		mean_and_variance_update(&stats->freq_stats, freq);
 		mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq);
 		stats->max_freq = max(stats->max_freq, freq);
 		stats->min_freq = min(stats->min_freq, freq);
-		stats->last_event = end;
 	}
+
+	stats->last_event = end;
 }
 
 static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index df67bf55fe2b..b414736d59a5 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -348,9 +348,9 @@ void bch2_prt_u64_base2(struct printbuf *, u64);
 void bch2_print_string_as_lines(const char *prefix, const char *lines);
 
 typedef DARRAY(unsigned long) bch_stacktrace;
-int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned);
+int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t);
 void bch2_prt_backtrace(struct printbuf *, bch_stacktrace *);
-int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *, unsigned);
+int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *, unsigned, gfp_t);
 
 static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev)
 {
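[Editorial note] The thread_with_file.c reordering is a publish-after-init fix: fd_install() makes the file visible to userspace, so every reference the thread depends on must be taken before that point, or a racing close() can drop the file first. The same ordering discipline in a userspace sketch with a release/acquire pair:

#include <stdatomic.h>
#include <stdio.h>

struct worker { int ready; };
static _Atomic(struct worker *) published;

/* Publishing the handle is the point of no return: finish all setup and
 * take all needed references before the store that makes it visible. */
static void start_worker(struct worker *w)
{
	w->ready = 1;   /* setup first */
	atomic_store_explicit(&published, w, memory_order_release);
}

int main(void)
{
	static struct worker w;

	start_worker(&w);
	struct worker *p =
		atomic_load_explicit(&published, memory_order_acquire);
	printf("ready=%d\n", p->ready);
	return 0;
}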
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 2b4dda047450..d76f406d3b2e 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -435,8 +435,7 @@ befs_init_inodecache(void)
 {
 	befs_inode_cachep = kmem_cache_create_usercopy("befs_inode_cache",
 				sizeof(struct befs_inode_info), 0,
-				(SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
-					SLAB_ACCOUNT),
+				SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
 				offsetof(struct befs_inode_info,
 					i_data.symlink),
 				sizeof_field(struct befs_inode_info,
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 355957dbce39..db81570c9637 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -259,7 +259,7 @@ static int __init init_inodecache(void)
 	bfs_inode_cachep = kmem_cache_create("bfs_inode_cache",
 					     sizeof(struct bfs_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+						SLAB_ACCOUNT),
 					     init_once);
 	if (bfs_inode_cachep == NULL)
 		return -ENOMEM;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index fefc642541cb..1920ed69279b 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -320,7 +320,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
 	else
 		executable_stack = EXSTACK_DEFAULT;
 
-	if (stack_size == 0) {
+	if (stack_size == 0 && interp_params.flags & ELF_FDPIC_FLAG_PRESENT) {
 		stack_size = interp_params.stack_size;
 		if (interp_params.flags & ELF_FDPIC_FLAG_EXEC_STACK)
 			executable_stack = EXSTACK_ENABLE_X;
diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c
index 1925a0919ca6..79026917db19 100644
--- a/fs/btrfs/accessors.c
+++ b/fs/btrfs/accessors.c
@@ -5,7 +5,8 @@
 
 #include <asm/unaligned.h>
 #include "messages.h"
-#include "ctree.h"
+#include "extent_io.h"
+#include "fs.h"
 #include "accessors.h"
 
 static bool check_setget_bounds(const struct extent_buffer *eb,
@@ -63,8 +64,8 @@ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token,		\
 	const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \
 	const unsigned long oil = get_eb_offset_in_folio(token->eb,	\
 							 member_offset);\
-	const int unit_size = folio_size(token->eb->folios[0]);	\
-	const int unit_shift = folio_shift(token->eb->folios[0]);	\
+	const int unit_size = token->eb->folio_size;			\
+	const int unit_shift = token->eb->folio_shift;			\
 	const int size = sizeof(u##bits);				\
 	u8 lebytes[sizeof(u##bits)];					\
 	const int part = unit_size - oil;				\
@@ -94,7 +95,7 @@ u##bits btrfs_get_##bits(const struct extent_buffer *eb,		\
 	const unsigned long idx = get_eb_folio_index(eb, member_offset);\
 	const unsigned long oil = get_eb_offset_in_folio(eb,		\
 							 member_offset);\
-	const int unit_size = folio_size(eb->folios[0]);		\
+	const int unit_size = eb->folio_size;				\
 	char *kaddr = folio_address(eb->folios[idx]);			\
 	const int size = sizeof(u##bits);				\
 	const int part = unit_size - oil;				\
@@ -117,8 +118,8 @@ void btrfs_set_token_##bits(struct btrfs_map_token *token,		\
 	const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \
 	const unsigned long oil = get_eb_offset_in_folio(token->eb,	\
 							 member_offset);\
-	const int unit_size = folio_size(token->eb->folios[0]);	\
-	const int unit_shift = folio_shift(token->eb->folios[0]);	\
+	const int unit_size = token->eb->folio_size;			\
+	const int unit_shift = token->eb->folio_shift;			\
 	const int size = sizeof(u##bits);				\
 	u8 lebytes[sizeof(u##bits)];					\
 	const int part = unit_size - oil;				\
@@ -151,7 +152,7 @@ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr,	\
 	const unsigned long idx = get_eb_folio_index(eb, member_offset);\
 	const unsigned long oil = get_eb_offset_in_folio(eb,		\
 							 member_offset);\
-	const int unit_size = folio_size(eb->folios[0]);		\
+	const int unit_size = eb->folio_size;				\
 	char *kaddr = folio_address(eb->folios[idx]);			\
 	const int size = sizeof(u##bits);				\
 	const int part = unit_size - oil;				\
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index ed7aa32972ad..6fce3e8d3dac 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -3,8 +3,17 @@
#ifndef BTRFS_ACCESSORS_H
#define BTRFS_ACCESSORS_H
 
-#include <linux/stddef.h>
 #include <asm/unaligned.h>
+#include <linux/stddef.h>
+#include <linux/types.h>
+#include <linux/align.h>
+#include <linux/build_bug.h>
+#include <linux/compiler.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <uapi/linux/btrfs_tree.h>
+
+struct extent_buffer;
 
 struct btrfs_map_token {
 	struct extent_buffer *eb;
@@ -844,45 +853,6 @@ static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
 	write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
 }
 
-static inline void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
-						  const struct btrfs_disk_balance_args *disk)
-{
-	memset(cpu, 0, sizeof(*cpu));
-
-	cpu->profiles = le64_to_cpu(disk->profiles);
-	cpu->usage = le64_to_cpu(disk->usage);
-	cpu->devid = le64_to_cpu(disk->devid);
-	cpu->pstart = le64_to_cpu(disk->pstart);
-	cpu->pend = le64_to_cpu(disk->pend);
-	cpu->vstart = le64_to_cpu(disk->vstart);
-	cpu->vend = le64_to_cpu(disk->vend);
-	cpu->target = le64_to_cpu(disk->target);
-	cpu->flags = le64_to_cpu(disk->flags);
-	cpu->limit = le64_to_cpu(disk->limit);
-	cpu->stripes_min = le32_to_cpu(disk->stripes_min);
-	cpu->stripes_max = le32_to_cpu(disk->stripes_max);
-}
-
-static inline void btrfs_cpu_balance_args_to_disk(
-				struct btrfs_disk_balance_args *disk,
-				const struct btrfs_balance_args *cpu)
-{
-	memset(disk, 0, sizeof(*disk));
-
-	disk->profiles = cpu_to_le64(cpu->profiles);
-	disk->usage = cpu_to_le64(cpu->usage);
-	disk->devid = cpu_to_le64(cpu->devid);
-	disk->pstart = cpu_to_le64(cpu->pstart);
-	disk->pend = cpu_to_le64(cpu->pend);
-	disk->vstart = cpu_to_le64(cpu->vstart);
-	disk->vend = cpu_to_le64(cpu->vend);
-	disk->target = cpu_to_le64(cpu->target);
-	disk->flags = cpu_to_le64(cpu->flags);
-	disk->limit = cpu_to_le64(cpu->limit);
-	disk->stripes_min = cpu_to_le32(cpu->stripes_min);
-	disk->stripes_max = cpu_to_le32(cpu->stripes_max);
-}
-
 /* struct btrfs_super_block */
 BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
 BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
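[Editorial note] The accessors.c change caches folio_size()/folio_shift() of folios[0] as extent_buffer fields, so the setget hot paths do shift-and-mask arithmetic instead of repeated function calls. The arithmetic, isolated with demo names:

#include <stdio.h>

/* Cached once at buffer setup instead of recomputed per accessor call. */
struct extent_buffer_demo {
	unsigned int folio_size;    /* 1 << folio_shift */
	unsigned int folio_shift;
};

int main(void)
{
	struct extent_buffer_demo eb = { .folio_shift = 12 };
	eb.folio_size = 1u << eb.folio_shift;

	unsigned long member_offset = 5000;
	/* folio index and offset-in-folio fall out of shift/mask */
	printf("idx=%lu off=%lu\n",
	       member_offset >> eb.folio_shift,
	       member_offset & (eb.folio_size - 1));
	return 0;
}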
rcu); int btrfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); @@ -13,6 +20,10 @@ int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, #else +#include <linux/errno.h> + +struct btrfs_trans_handle; + #define btrfs_get_acl NULL #define btrfs_set_acl NULL static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 9e261aac671e..361a866c1995 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -11,7 +11,6 @@ #include <linux/freezer.h> #include <trace/events/btrfs.h> #include "async-thread.h" -#include "ctree.h" enum { WORK_DONE_BIT, diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 62b8a0d57898..04c2f3175828 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -7,11 +7,14 @@ #ifndef BTRFS_ASYNC_THREAD_H #define BTRFS_ASYNC_THREAD_H +#include <linux/compiler_types.h> #include <linux/workqueue.h> +#include <linux/list.h> struct btrfs_fs_info; struct btrfs_workqueue; struct btrfs_work; + typedef void (*btrfs_func_t)(struct btrfs_work *arg); typedef void (*btrfs_ordered_func_t)(struct btrfs_work *arg, bool); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index beed7e459dab..c1e6a5bbeeaf 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -198,10 +198,7 @@ static struct kmem_cache *btrfs_prelim_ref_cache; int __init btrfs_prelim_ref_init(void) { btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref", - sizeof(struct prelim_ref), - 0, - SLAB_MEM_SPREAD, - NULL); + sizeof(struct prelim_ref), 0, 0, NULL); if (!btrfs_prelim_ref_cache) return -ENOMEM; return 0; @@ -1036,8 +1033,6 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx, slot = path->slots[0]; item_size = btrfs_item_size(leaf, slot); - BUG_ON(item_size < sizeof(*ei)); - ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); if (ctx->check_extent_item) { @@ -1435,8 +1430,10 @@ again: if (ret < 0) goto out; if (ret == 0) { - /* This shouldn't happen, indicates a bug or fs corruption. */ - ASSERT(ret != 0); + /* + * Key with offset -1 found, there would have to exist an extent + * item with such offset, but this is out of the valid range. + */ ret = -EUCLEAN; goto out; } @@ -2225,6 +2222,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; + if (ret == 0) { + /* + * Key with offset -1 found, there would have to exist an extent + * item with such offset, but this is out of the valid range. 
+ */ + return -EUCLEAN; + } ret = btrfs_previous_extent_item(extent_root, path, 0); if (ret) { @@ -2247,7 +2251,6 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, eb = path->nodes[0]; item_size = btrfs_item_size(eb, path->slots[0]); - BUG_ON(item_size < sizeof(*ei)); ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); flags = btrfs_extent_flags(eb, ei); @@ -2850,6 +2853,16 @@ struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_inf return ret; } +static void btrfs_backref_iter_release(struct btrfs_backref_iter *iter) +{ + iter->bytenr = 0; + iter->item_ptr = 0; + iter->cur_ptr = 0; + iter->end_ptr = 0; + btrfs_release_path(iter->path); + memset(&iter->cur_key, 0, sizeof(iter->cur_key)); +} + int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) { struct btrfs_fs_info *fs_info = iter->fs_info; @@ -2868,6 +2881,10 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) if (ret < 0) return ret; if (ret == 0) { + /* + * Key with offset -1 found, there would have to exist an extent + * item with such offset, but this is out of the valid range. + */ ret = -EUCLEAN; goto release; } @@ -2938,6 +2955,14 @@ release: return ret; } +static bool btrfs_backref_iter_is_inline_ref(struct btrfs_backref_iter *iter) +{ + if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY || + iter->cur_key.type == BTRFS_METADATA_ITEM_KEY) + return true; + return false; +} + /* * Go to the next backref item of current bytenr, can be either inlined or * keyed. @@ -2950,7 +2975,7 @@ release: */ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter) { - struct extent_buffer *eb = btrfs_backref_get_eb(iter); + struct extent_buffer *eb = iter->path->nodes[0]; struct btrfs_root *extent_root; struct btrfs_path *path = iter->path; struct btrfs_extent_inline_ref *iref; @@ -3038,6 +3063,19 @@ struct btrfs_backref_node *btrfs_backref_alloc_node( return node; } +void btrfs_backref_free_node(struct btrfs_backref_cache *cache, + struct btrfs_backref_node *node) +{ + if (node) { + ASSERT(list_empty(&node->list)); + ASSERT(list_empty(&node->lower)); + ASSERT(node->eb == NULL); + cache->nr_nodes--; + btrfs_put_root(node->root); + kfree(node); + } +} + struct btrfs_backref_edge *btrfs_backref_alloc_edge( struct btrfs_backref_cache *cache) { @@ -3049,6 +3087,52 @@ struct btrfs_backref_edge *btrfs_backref_alloc_edge( return edge; } +void btrfs_backref_free_edge(struct btrfs_backref_cache *cache, + struct btrfs_backref_edge *edge) +{ + if (edge) { + cache->nr_edges--; + kfree(edge); + } +} + +void btrfs_backref_unlock_node_buffer(struct btrfs_backref_node *node) +{ + if (node->locked) { + btrfs_tree_unlock(node->eb); + node->locked = 0; + } +} + +void btrfs_backref_drop_node_buffer(struct btrfs_backref_node *node) +{ + if (node->eb) { + btrfs_backref_unlock_node_buffer(node); + free_extent_buffer(node->eb); + node->eb = NULL; + } +} + +/* + * Drop the backref node from cache without cleaning up its children + * edges. + * + * This can only be called on node without parent edges. + * The children edges are still kept as is. 
+ */ +void btrfs_backref_drop_node(struct btrfs_backref_cache *tree, + struct btrfs_backref_node *node) +{ + ASSERT(list_empty(&node->upper)); + + btrfs_backref_drop_node_buffer(node); + list_del_init(&node->list); + list_del_init(&node->lower); + if (!RB_EMPTY_NODE(&node->rb_node)) + rb_erase(&node->rb_node, &tree->rb_root); + btrfs_backref_free_node(tree, node); +} + /* * Drop the backref node from cache, also cleaning up all its * upper edges and any uncached nodes in the path. @@ -3120,6 +3204,19 @@ void btrfs_backref_release_cache(struct btrfs_backref_cache *cache) ASSERT(!cache->nr_edges); } +void btrfs_backref_link_edge(struct btrfs_backref_edge *edge, + struct btrfs_backref_node *lower, + struct btrfs_backref_node *upper, + int link_which) +{ + ASSERT(upper && lower && upper->level == lower->level + 1); + edge->node[LOWER] = lower; + edge->node[UPPER] = upper; + if (link_which & LINK_LOWER) + list_add_tail(&edge->list[LOWER], &lower->upper); + if (link_which & LINK_UPPER) + list_add_tail(&edge->list[UPPER], &upper->lower); +} /* * Handle direct tree backref * @@ -3428,7 +3525,7 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans, int type; cond_resched(); - eb = btrfs_backref_get_eb(iter); + eb = iter->path->nodes[0]; key.objectid = iter->bytenr; if (btrfs_backref_iter_is_inline_ref(iter)) { diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index ab4ca0eda605..e8c22cccb5c1 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -6,11 +6,23 @@ #ifndef BTRFS_BACKREF_H #define BTRFS_BACKREF_H -#include <linux/btrfs.h> +#include <linux/types.h> +#include <linux/rbtree.h> +#include <linux/list.h> +#include <linux/slab.h> +#include <uapi/linux/btrfs.h> +#include <uapi/linux/btrfs_tree.h> #include "messages.h" -#include "ulist.h" +#include "locking.h" #include "disk-io.h" #include "extent_io.h" +#include "ctree.h" + +struct extent_inode_elem; +struct ulist; +struct btrfs_extent_item; +struct btrfs_trans_handle; +struct btrfs_fs_info; /* * Used by implementations of iterate_extent_inodes_t (see definition below) to @@ -271,22 +283,6 @@ struct btrfs_backref_iter { struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info); -static inline void btrfs_backref_iter_free(struct btrfs_backref_iter *iter) -{ - if (!iter) - return; - btrfs_free_path(iter->path); - kfree(iter); -} - -static inline struct extent_buffer *btrfs_backref_get_eb( - struct btrfs_backref_iter *iter) -{ - if (!iter) - return NULL; - return iter->path->nodes[0]; -} - /* * For metadata with EXTENT_ITEM key (non-skinny) case, the first inline data * is btrfs_tree_block_info, without a btrfs_extent_inline_ref header. 
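As a companion to the backref-cache helpers that the hunks above move out of line, here is a hedged user-space sketch (editor's illustration, not part of the patch) of the invariant btrfs_backref_link_edge() maintains: an edge records both endpoint nodes and can be threaded onto the lower node's upper list, the upper node's lower list, or both, depending on the LINK_* flags. The kernel's list primitives are re-implemented minimally; all type and field names are simplified stand-ins.

#include <assert.h>
#include <stdio.h>

/* Minimal user-space re-implementation of the kernel's doubly linked list. */
struct list_head { struct list_head *prev, *next; };
static void list_init(struct list_head *h) { h->prev = h->next = h; }
static void list_add_tail(struct list_head *n, struct list_head *h)
{
	n->prev = h->prev; n->next = h; h->prev->next = n; h->prev = n;
}

enum { LOWER, UPPER };
#define LINK_LOWER (1 << 0)
#define LINK_UPPER (1 << 1)

/* Simplified stand-ins for btrfs_backref_node and btrfs_backref_edge. */
struct node { int level; struct list_head upper, lower; };
struct edge { struct node *node[2]; struct list_head list[2]; };

/* Mirrors the logic of btrfs_backref_link_edge() from the hunk above. */
static void link_edge(struct edge *e, struct node *lo, struct node *up, int which)
{
	assert(up->level == lo->level + 1);
	e->node[LOWER] = lo;
	e->node[UPPER] = up;
	if (which & LINK_LOWER)
		list_add_tail(&e->list[LOWER], &lo->upper);
	if (which & LINK_UPPER)
		list_add_tail(&e->list[UPPER], &up->lower);
}

int main(void)
{
	struct node leaf = { .level = 0 }, parent = { .level = 1 };
	struct edge e;

	list_init(&leaf.upper); list_init(&leaf.lower);
	list_init(&parent.upper); list_init(&parent.lower);
	link_edge(&e, &leaf, &parent, LINK_LOWER | LINK_UPPER);
	/* The edge is now reachable from both endpoints. */
	printf("leaf has parent edge: %d\n", leaf.upper.next == &e.list[LOWER]);
	printf("parent has child edge: %d\n", parent.lower.next == &e.list[UPPER]);
	return 0;
}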
@@ -306,25 +302,6 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr); int btrfs_backref_iter_next(struct btrfs_backref_iter *iter); -static inline bool btrfs_backref_iter_is_inline_ref( - struct btrfs_backref_iter *iter) -{ - if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY || - iter->cur_key.type == BTRFS_METADATA_ITEM_KEY) - return true; - return false; -} - -static inline void btrfs_backref_iter_release(struct btrfs_backref_iter *iter) -{ - iter->bytenr = 0; - iter->item_ptr = 0; - iter->cur_ptr = 0; - iter->end_ptr = 0; - btrfs_release_path(iter->path); - memset(&iter->cur_key, 0, sizeof(iter->cur_key)); -} - /* * Backref cache related structures * @@ -452,83 +429,22 @@ struct btrfs_backref_edge *btrfs_backref_alloc_edge( #define LINK_LOWER (1 << 0) #define LINK_UPPER (1 << 1) -static inline void btrfs_backref_link_edge(struct btrfs_backref_edge *edge, - struct btrfs_backref_node *lower, - struct btrfs_backref_node *upper, - int link_which) -{ - ASSERT(upper && lower && upper->level == lower->level + 1); - edge->node[LOWER] = lower; - edge->node[UPPER] = upper; - if (link_which & LINK_LOWER) - list_add_tail(&edge->list[LOWER], &lower->upper); - if (link_which & LINK_UPPER) - list_add_tail(&edge->list[UPPER], &upper->lower); -} - -static inline void btrfs_backref_free_node(struct btrfs_backref_cache *cache, - struct btrfs_backref_node *node) -{ - if (node) { - ASSERT(list_empty(&node->list)); - ASSERT(list_empty(&node->lower)); - ASSERT(node->eb == NULL); - cache->nr_nodes--; - btrfs_put_root(node->root); - kfree(node); - } -} - -static inline void btrfs_backref_free_edge(struct btrfs_backref_cache *cache, - struct btrfs_backref_edge *edge) -{ - if (edge) { - cache->nr_edges--; - kfree(edge); - } -} - -static inline void btrfs_backref_unlock_node_buffer( - struct btrfs_backref_node *node) -{ - if (node->locked) { - btrfs_tree_unlock(node->eb); - node->locked = 0; - } -} -static inline void btrfs_backref_drop_node_buffer( - struct btrfs_backref_node *node) -{ - if (node->eb) { - btrfs_backref_unlock_node_buffer(node); - free_extent_buffer(node->eb); - node->eb = NULL; - } -} - -/* - * Drop the backref node from cache without cleaning up its children - * edges. - * - * This can only be called on node without parent edges. - * The children edges are still kept as is. 
- */ -static inline void btrfs_backref_drop_node(struct btrfs_backref_cache *tree, - struct btrfs_backref_node *node) -{ - ASSERT(list_empty(&node->upper)); - - btrfs_backref_drop_node_buffer(node); - list_del_init(&node->list); - list_del_init(&node->lower); - if (!RB_EMPTY_NODE(&node->rb_node)) - rb_erase(&node->rb_node, &tree->rb_root); - btrfs_backref_free_node(tree, node); -} +void btrfs_backref_link_edge(struct btrfs_backref_edge *edge, + struct btrfs_backref_node *lower, + struct btrfs_backref_node *upper, + int link_which); +void btrfs_backref_free_node(struct btrfs_backref_cache *cache, + struct btrfs_backref_node *node); +void btrfs_backref_free_edge(struct btrfs_backref_cache *cache, + struct btrfs_backref_edge *edge); +void btrfs_backref_unlock_node_buffer(struct btrfs_backref_node *node); +void btrfs_backref_drop_node_buffer(struct btrfs_backref_node *node); void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache, struct btrfs_backref_node *node); +void btrfs_backref_drop_node(struct btrfs_backref_cache *tree, + struct btrfs_backref_node *node); void btrfs_backref_release_cache(struct btrfs_backref_cache *cache); diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 928f512cdb4a..477f350a8bd0 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -11,7 +11,6 @@ #include "raid56.h" #include "async-thread.h" #include "dev-replace.h" -#include "rcu-string.h" #include "zoned.h" #include "file-item.h" #include "raid-stripe-tree.h" @@ -509,8 +508,6 @@ static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, if (!bioc) { /* Single mirror read/write fast path. */ btrfs_bio(bio)->mirror_num = mirror_num; - if (bio_op(bio) != REQ_OP_READ) - btrfs_bio(bio)->orig_physical = smap->physical; bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT; if (bio_op(bio) != REQ_OP_READ) btrfs_bio(bio)->orig_physical = smap->physical; @@ -611,8 +608,20 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free) static bool should_async_write(struct btrfs_bio *bbio) { + bool auto_csum_mode = true; + +#ifdef CONFIG_BTRFS_DEBUG + struct btrfs_fs_devices *fs_devices = bbio->fs_info->fs_devices; + enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode); + + if (csum_mode == BTRFS_OFFLOAD_CSUM_FORCE_OFF) + return false; + + auto_csum_mode = (csum_mode == BTRFS_OFFLOAD_CSUM_AUTO); +#endif + /* Submit synchronously if the checksum implementation is fast. 
*/ - if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags)) + if (auto_csum_mode && test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags)) return false; /* diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index bbaed317161a..d9dd5276093d 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -7,12 +7,14 @@ #ifndef BTRFS_BIO_H #define BTRFS_BIO_H +#include <linux/types.h> #include <linux/bio.h> #include <linux/workqueue.h> #include "tree-checker.h" struct btrfs_bio; struct btrfs_fs_info; +struct btrfs_inode; #define BTRFS_BIO_INLINE_CSUM_SIZE 64 diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index a9be9ac99222..5f7587ca1ca7 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -418,7 +418,7 @@ struct btrfs_caching_control *btrfs_get_caching_control( return ctl; } -void btrfs_put_caching_control(struct btrfs_caching_control *ctl) +static void btrfs_put_caching_control(struct btrfs_caching_control *ctl) { if (refcount_dec_and_test(&ctl->count)) kfree(ctl); @@ -1063,7 +1063,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, bool remove_rsv = false; block_group = btrfs_lookup_block_group(fs_info, map->start); - BUG_ON(!block_group); + if (!block_group) + return -ENOENT; + BUG_ON(!block_group->ro); trace_btrfs_remove_block_group(block_group); @@ -1429,7 +1431,7 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans, * group in pinned_extents before we were able to clear the whole block * group range from pinned_extents. This means that task can lookup for * the block group after we unpinned it from pinned_extents and removed - * it, leading to a BUG_ON() at unpin_extent_range(). + * it, leading to an error at unpin_extent_range(). */ mutex_lock(&fs_info->unused_bg_unpin_mutex); if (prev_trans) { @@ -1455,6 +1457,7 @@ out: */ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) { + LIST_HEAD(retry_list); struct btrfs_block_group *block_group; struct btrfs_space_info *space_info; struct btrfs_trans_handle *trans; @@ -1476,6 +1479,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_lock(&fs_info->unused_bgs_lock); while (!list_empty(&fs_info->unused_bgs)) { + u64 used; int trimming; block_group = list_first_entry(&fs_info->unused_bgs, @@ -1511,22 +1515,68 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) goto next; } + spin_lock(&space_info->lock); spin_lock(&block_group->lock); - if (block_group->reserved || block_group->pinned || - block_group->used || block_group->ro || + if (btrfs_is_block_group_used(block_group) || block_group->ro || list_is_singular(&block_group->list)) { /* * We want to bail if we made new allocations or have * outstanding allocations in this block group. We do * the ro check in case balance is currently acting on * this block group. + * + * Also bail out if this is the only block group for its + * type, because otherwise we would lose profile + * information from fs_info->avail_*_alloc_bits and the + * next block group of this type would be created with a + * "single" profile (even if we're in a raid fs) because + * fs_info->avail_*_alloc_bits would be 0. 
+ */ + trace_btrfs_skip_unused_block_group(block_group); + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + up_write(&space_info->groups_sem); + goto next; + } + + /* + * The block group may be unused but there may be space reserved + * accounting with the existence of that block group, that is, + * space_info->bytes_may_use was incremented by a task but no + * space was yet allocated from the block group by the task. + * That space may or may not be allocated, as we are generally + * pessimistic about space reservation for metadata as well as + * for data when using compression (as we reserve space based on + * the worst case, when data can't be compressed, and before + * actually attempting compression, before starting writeback). + * + * So check if the total space of the space_info minus the size + * of this block group is less than the used space of the + * space_info - if that's the case, then it means we have tasks + * that might be relying on the block group in order to allocate + * extents, and add back the block group to the unused list when + * we finish, so that we retry later in case no tasks ended up + * needing to allocate extents from the block group. + */ + used = btrfs_space_info_used(space_info, true); + if (space_info->total_bytes - block_group->length < used) { + /* + * Add a reference for the list, compensate for the ref + * drop under the "next" label for the + * fs_info->unused_bgs list. */ + btrfs_get_block_group(block_group); + list_add_tail(&block_group->bg_list, &retry_list); + trace_btrfs_skip_unused_block_group(block_group); spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); up_write(&space_info->groups_sem); goto next; } + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); /* We don't want to force the issue, only flip if it's ok. */ ret = inc_block_group_ro(block_group, 0); @@ -1650,12 +1700,16 @@ next: btrfs_put_block_group(block_group); spin_lock(&fs_info->unused_bgs_lock); } + list_splice_tail(&retry_list, &fs_info->unused_bgs); spin_unlock(&fs_info->unused_bgs_lock); mutex_unlock(&fs_info->reclaim_bgs_lock); return; flip_async: btrfs_end_transaction(trans); + spin_lock(&fs_info->unused_bgs_lock); + list_splice_tail(&retry_list, &fs_info->unused_bgs); + spin_unlock(&fs_info->unused_bgs_lock); mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_put_block_group(block_group); btrfs_discard_punt_unused_bgs_list(fs_info); @@ -2684,6 +2738,37 @@ next: btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info); list_del_init(&block_group->bg_list); clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags); + + /* + * If the block group is still unused, add it to the list of + * unused block groups. The block group may have been created in + * order to satisfy a space reservation, in which case the + * extent allocation only happens later. But often we don't + * actually need to allocate space that we previously reserved, + * so the block group may become unused for a long time. For + * example for metadata we generally reserve space for a worst + * possible scenario, but then don't end up allocating all that + * space or none at all (due to no need to COW, extent buffers + * were already COWed in the current transaction and still + * unwritten, tree heights lower than the maximum possible + * height, etc). 
For data we generally reserve the exact amount + * of space we are going to allocate later, the exception is + * when using compression, as we must reserve space based on the + * uncompressed data size, because the compression is only done + * when writeback is triggered and we don't know how much space we + * are actually going to need, so we reserve the uncompressed + * size because the data may be incompressible in the worst case. + */ + if (ret == 0) { + bool used; + + spin_lock(&block_group->lock); + used = btrfs_is_block_group_used(block_group); + spin_unlock(&block_group->lock); + + if (!used) + btrfs_mark_bg_unused(block_group); + } } btrfs_trans_release_chunk_metadata(trans); }
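A brief user-space sketch (editor's illustration, not part of the patch) of the deletion policy implemented above: btrfs_is_block_group_used(), added in the header hunk below, keeps a group alive while any bytes are used, reserved or pinned, and even an unused group is re-queued on the retry list when the space_info minus this group could not cover what is already used or reserved. All names and fields are simplified stand-ins; the kernel sums more counters via btrfs_space_info_used().

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins for the kernel structures. */
struct space_info { unsigned long long total_bytes, bytes_used, bytes_may_use; };
struct block_group { unsigned long long length, used, reserved, pinned; bool ro; };

/* Mirrors btrfs_is_block_group_used(): any accounted byte keeps it alive. */
static bool bg_is_used(const struct block_group *bg)
{
	return bg->used > 0 || bg->reserved > 0 || bg->pinned > 0;
}

enum decision { DELETE_NOW, RETRY_LATER, SKIP };

static enum decision classify(const struct space_info *si, const struct block_group *bg)
{
	if (bg_is_used(bg) || bg->ro)
		return SKIP;
	/*
	 * If the space_info minus this group could not cover what is
	 * already used or reserved, tasks may still need to allocate
	 * from it, so keep it on a retry list instead of deleting it.
	 */
	if (si->total_bytes - bg->length < si->bytes_used + si->bytes_may_use)
		return RETRY_LATER;
	return DELETE_NOW;
}

int main(void)
{
	struct space_info si = { .total_bytes = 10ULL << 30,
				 .bytes_used = 8ULL << 30,
				 .bytes_may_use = 1ULL << 30 };
	struct block_group bg = { .length = 2ULL << 30 };

	/* 10G - 2G = 8G is less than 9G in use: expect RETRY_LATER (1). */
	printf("decision: %d\n", classify(&si, &bg));
	return 0;
}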
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index c4a1f01cc1c2..85e2d4cd12dc 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -3,9 +3,22 @@ #ifndef BTRFS_BLOCK_GROUP_H #define BTRFS_BLOCK_GROUP_H +#include <linux/atomic.h> +#include <linux/mutex.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/refcount.h> +#include <linux/wait.h> +#include <linux/sizes.h> +#include <linux/rwsem.h> +#include <linux/rbtree.h> +#include <uapi/linux/btrfs_tree.h> #include "free-space-cache.h" struct btrfs_chunk_map; +struct btrfs_fs_info; +struct btrfs_inode; +struct btrfs_trans_handle; enum btrfs_disk_cache_state { BTRFS_DC_WRITTEN, @@ -257,6 +270,13 @@ static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) return (block_group->start + block_group->length); } +static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg) +{ + lockdep_assert_held(&bg->lock); + + return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0); +} + static inline bool btrfs_is_block_group_data_only( struct btrfs_block_group *block_group) { @@ -290,7 +310,6 @@ void btrfs_wait_nocow_writers(struct btrfs_block_group *bg); void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache, u64 num_bytes); int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait); -void btrfs_put_caching_control(struct btrfs_caching_control *ctl); struct btrfs_caching_control *btrfs_get_caching_control( struct btrfs_block_group *cache); int btrfs_add_new_free_space(struct btrfs_block_group *block_group, diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index ceb5f586a2d5..95c174f9fd4f 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -6,7 +6,6 @@ #include "space-info.h" #include "transaction.h" #include "block-group.h" -#include "disk-io.h" #include "fs.h" #include "accessors.h" @@ -494,7 +493,7 @@ struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans, block_rsv = get_block_rsv(trans, root); - if (unlikely(block_rsv->size == 0)) + if (unlikely(btrfs_block_rsv_size(block_rsv) == 0)) goto try_reserve; again: ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize); diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index b0bd12b8652f..1f53b967d069 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -3,8 +3,15 @@ #ifndef BTRFS_BLOCK_RSV_H #define BTRFS_BLOCK_RSV_H +#include <linux/types.h> +#include <linux/compiler.h> +#include <linux/spinlock.h> + struct btrfs_trans_handle; struct btrfs_root; +struct btrfs_space_info; +struct btrfs_block_rsv; +struct btrfs_fs_info; enum btrfs_reserve_flush_enum; /* @@ -101,4 +108,36 @@ static inline bool btrfs_block_rsv_full(const struct btrfs_block_rsv *rsv) return data_race(rsv->full); } +/* + * Get the reserved amount of a block reserve in a context where getting a stale + * value is acceptable, instead of accessing it directly and triggering a data race + * warning from KCSAN. + */ +static inline u64 btrfs_block_rsv_reserved(struct btrfs_block_rsv *rsv) +{ + u64 ret; + + spin_lock(&rsv->lock); + ret = rsv->reserved; + spin_unlock(&rsv->lock); + + return ret; +} + +/* + * Get the size of a block reserve in a context where getting a stale value is + * acceptable, instead of accessing it directly and triggering a data race warning + * from KCSAN. + */ +static inline u64 btrfs_block_rsv_size(struct btrfs_block_rsv *rsv) +{ + u64 ret; + + spin_lock(&rsv->lock); + ret = rsv->size; + spin_unlock(&rsv->lock); + + return ret; +} + #endif /* BTRFS_BLOCK_RSV_H */ diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 7f7c5a92d2b8..100020ca4658 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -8,13 +8,32 @@ #include <linux/hash.h> #include <linux/refcount.h> +#include <linux/spinlock.h> +#include <linux/mutex.h> +#include <linux/rwsem.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/compiler.h> #include <linux/fscrypt.h> +#include <linux/lockdep.h> +#include <uapi/linux/btrfs_tree.h> #include <trace/events/btrfs.h> +#include "block-rsv.h" +#include "btrfs_inode.h" #include "extent_map.h" #include "extent_io.h" +#include "extent-io-tree.h" #include "ordered-data.h" #include "delayed-inode.h" +struct extent_state; +struct posix_acl; +struct iov_iter; +struct writeback_control; +struct btrfs_root; +struct btrfs_fs_info; +struct btrfs_trans_handle; + /* * Since we search a directory based on f_pos (struct dir_context::pos) we have * to start at 2 since '.' and '..' have f_pos of 0 and 1 respectively, so @@ -41,7 +60,6 @@ enum { */ BTRFS_INODE_NEEDS_FULL_SYNC, BTRFS_INODE_COPY_EVERYTHING, - BTRFS_INODE_IN_DELALLOC_LIST, BTRFS_INODE_HAS_PROPS, BTRFS_INODE_SNAPSHOT_FLUSH, /* @@ -428,7 +446,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, u64 *ram_bytes, bool nowait, bool strict); -void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode); +void btrfs_del_delalloc_inode(struct btrfs_inode *inode); struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); int btrfs_unlink_inode(struct btrfs_trans_handle *trans, @@ -490,8 +508,7 @@ struct inode *btrfs_iget_path(struct super_block *s, u64 ino, struct btrfs_root *root, struct btrfs_path *path); struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 len); + struct page *page, u64 start, u64 len); int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 193168214eeb..b2b94009959d 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -25,8 +25,6 @@ #include "misc.h" #include "ctree.h" #include "fs.h" -#include "disk-io.h" -#include "transaction.h" #include "btrfs_inode.h" #include "bio.h" #include "ordered-data.h" @@ -34,8 +32,7 @@ #include "extent_io.h" #include "extent_map.h" #include "subpage.h" -#include "zoned.h" -#include "file-item.h" +#include "messages.h" #include "super.h" static struct bio_set btrfs_compressed_bioset; @@ -141,16 +138,16 @@ static int
compression_decompress_bio(struct list_head *ws, } static int compression_decompress(int type, struct list_head *ws, - const u8 *data_in, struct page *dest_page, - unsigned long start_byte, size_t srclen, size_t destlen) + const u8 *data_in, struct page *dest_page, + unsigned long dest_pgoff, size_t srclen, size_t destlen) { switch (type) { case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page, - start_byte, srclen, destlen); + dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_page, - start_byte, srclen, destlen); + dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page, - start_byte, srclen, destlen); + dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_NONE: default: /* @@ -284,7 +281,7 @@ static void end_bbio_comprssed_read(struct btrfs_bio *bbio) static noinline void end_compressed_writeback(const struct compressed_bio *cb) { struct inode *inode = &cb->bbio.inode->vfs_inode; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); unsigned long index = cb->start >> PAGE_SHIFT; unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; struct folio_batch fbatch; @@ -415,7 +412,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, struct compressed_bio *cb, int *memstall, unsigned long *pflags) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); unsigned long end_index; struct bio *orig_bio = &cb->orig_bbio->bio; u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size; @@ -441,7 +438,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, * This makes readahead less effective, so here disable readahead for * subpage for now, until full compressed write is supported. */ - if (btrfs_sb(inode->i_sb)->sectorsize < PAGE_SIZE) + if (fs_info->sectorsize < PAGE_SIZE) return 0; end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; @@ -1037,14 +1034,23 @@ static int btrfs_decompress_bio(struct compressed_bio *cb) * start_byte tells us the offset into the compressed data we're interested in */ int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, - unsigned long start_byte, size_t srclen, size_t destlen) + unsigned long dest_pgoff, size_t srclen, size_t destlen) { + struct btrfs_fs_info *fs_info = page_to_fs_info(dest_page); struct list_head *workspace; + const u32 sectorsize = fs_info->sectorsize; int ret; + /* + * The full destination page range should not exceed the page size. + * And the @destlen should not exceed sectorsize, as this is only called for + * inline file extents, which should not exceed sectorsize. + */ + ASSERT(dest_pgoff + destlen <= PAGE_SIZE && destlen <= sectorsize); + workspace = get_workspace(type, 0); ret = compression_decompress(type, workspace, data_in, dest_page, - start_byte, srclen, destlen); + dest_pgoff, srclen, destlen); put_workspace(type, workspace); return ret; @@ -1470,11 +1476,6 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, /* * Compression heuristic. * - * For now is's a naive and optimistic 'return true', we'll extend the logic to - * quickly (compared to direct compression) detect data characteristics - * (compressible/incompressible) to avoid wasting CPU time on incompressible - * data. 
- * * The following types of analysis can be performed: * - detect mostly zero data * - detect data with low "byte set" size (text, etc) diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 93cc92974dee..4691a84ca838 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -7,10 +7,18 @@ #define BTRFS_COMPRESSION_H #include <linux/sizes.h> +#include <linux/mm.h> +#include <linux/list.h> +#include <linux/workqueue.h> +#include <linux/wait.h> #include "bio.h" +struct address_space; +struct page; +struct inode; struct btrfs_inode; struct btrfs_ordered_extent; +struct btrfs_bio; /* * We want to make sure that amount of RAM required to uncompress an extent is @@ -32,8 +40,6 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); #define BTRFS_ZLIB_DEFAULT_LEVEL 3 -struct page; - struct compressed_bio { /* Number of compressed pages in the array */ unsigned int nr_pages; @@ -148,7 +154,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zlib_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen); struct list_head *zlib_alloc_workspace(unsigned int level); void zlib_free_workspace(struct list_head *ws); @@ -159,7 +165,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int lzo_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen); struct list_head *lzo_alloc_workspace(unsigned int level); void lzo_free_workspace(struct list_head *ws); @@ -169,7 +175,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zstd_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen); void zstd_init_workspace_manager(void); void zstd_cleanup_workspace_manager(void); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e65e012bac55..aaf53fd84358 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -820,7 +820,7 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot, } while (low < high) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; unsigned long oil; unsigned long offset; struct btrfs_disk_key *tmp; @@ -4280,6 +4280,10 @@ void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans, /* * Given a key and some data, insert items into the tree. * This does all the path init required, making room in the tree if needed. 
+ * + * Returns: 0 on success + * -EEXIST if the first key already exists + * < 0 on other errors */ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -5082,9 +5086,7 @@ int btrfs_previous_extent_item(struct btrfs_root *root, int __init btrfs_ctree_init(void) { - btrfs_path_cachep = kmem_cache_create("btrfs_path", - sizeof(struct btrfs_path), 0, - SLAB_MEM_SPREAD, NULL); + btrfs_path_cachep = KMEM_CACHE(btrfs_path, 0); if (!btrfs_path_cachep) return -ENOMEM; return 0; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 70e828d33177..c03c58246033 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -7,25 +7,24 @@ #define BTRFS_CTREE_H #include <linux/pagemap.h> +#include <linux/spinlock.h> +#include <linux/rbtree.h> +#include <linux/mutex.h> +#include <linux/wait.h> +#include <linux/list.h> +#include <linux/atomic.h> +#include <linux/xarray.h> +#include <linux/refcount.h> +#include <uapi/linux/btrfs_tree.h> #include "locking.h" #include "fs.h" #include "accessors.h" +#include "extent-io-tree.h" +struct extent_buffer; +struct btrfs_block_rsv; struct btrfs_trans_handle; -struct btrfs_transaction; -struct btrfs_pending_snapshot; -struct btrfs_delayed_ref_root; -struct btrfs_space_info; struct btrfs_block_group; -struct btrfs_ordered_sum; -struct btrfs_ref; -struct btrfs_bio; -struct btrfs_ioctl_encoded_io_args; -struct btrfs_device; -struct btrfs_fs_devices; -struct btrfs_balance_control; -struct btrfs_delayed_root; -struct reloc_control; /* Read ahead values for struct btrfs_path.reada */ enum { @@ -478,8 +477,7 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) return mapping_gfp_constraint(mapping, ~__GFP_FS); } -int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, - u64 start, u64 end); +void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 *actual_bytes); int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index c276b136ab63..f015fa1b6301 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -6,7 +6,6 @@ #include <linux/sched.h> #include "ctree.h" #include "disk-io.h" -#include "print-tree.h" #include "transaction.h" #include "locking.h" #include "accessors.h" @@ -521,7 +520,7 @@ static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, * keep_locks set and lowest_level is 1, regardless of the value of * path->slots[1]. */ - BUG_ON(path->locks[1] == 0); + ASSERT(path->locks[1] != 0); ret = btrfs_realloc_node(trans, root, path->nodes[1], 0, &last_ret, @@ -810,7 +809,7 @@ static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info, static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, u32 extent_thresh, u64 newer_than, bool locked) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_map *next; bool ret = false; @@ -861,20 +860,21 @@ out: * NOTE: Caller should also wait for page writeback after the cluster is * prepared, here we don't do writeback wait for each page. 
*/ -static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t index) +static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t index) { struct address_space *mapping = inode->vfs_inode.i_mapping; gfp_t mask = btrfs_alloc_write_mask(mapping); u64 page_start = (u64)index << PAGE_SHIFT; u64 page_end = page_start + PAGE_SIZE - 1; struct extent_state *cached_state = NULL; - struct page *page; + struct folio *folio; int ret; again: - page = find_or_create_page(mapping, index, mask); - if (!page) - return ERR_PTR(-ENOMEM); + folio = __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); + if (IS_ERR(folio)) + return folio; /* * Since we can defragment files opened read-only, we can encounter @@ -884,16 +884,16 @@ again: * executables that explicitly enable them, so this isn't very * restrictive. */ - if (PageCompound(page)) { - unlock_page(page); - put_page(page); + if (folio_test_large(folio)) { + folio_unlock(folio); + folio_put(folio); return ERR_PTR(-ETXTBSY); } - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return ERR_PTR(ret); } @@ -908,17 +908,17 @@ again: if (!ordered) break; - unlock_page(page); + folio_unlock(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); - lock_page(page); + folio_lock(folio); /* - * We unlocked the page above, so we need check if it was + * We unlocked the folio above, so we need to check if it was * released or not. */ - if (page->mapping != mapping || !PagePrivate(page)) { - unlock_page(page); - put_page(page); + if (folio->mapping != mapping || !folio->private) { + folio_unlock(folio); + folio_put(folio); goto again; } }
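The loop above ends with a pattern worth spelling out: the folio lock must be dropped before waiting on an ordered extent, and once it is re-taken the folio may have been truncated or released, so folio->mapping and folio->private are re-checked and the whole preparation restarts on mismatch. A user-space sketch of that control flow (editor's illustration, not part of the patch; all types and helpers are stubs):

#include <stdbool.h>
#include <stdio.h>

struct folio_stub { void *mapping; void *private; bool locked; };

/* Stubs standing in for the ordered-extent lookup and wait. */
static int pending_ordered = 2;
static bool lookup_ordered(void) { return pending_ordered > 0; }
static void wait_ordered(void)   { pending_ordered--; }

static void lock_stub(struct folio_stub *f)   { f->locked = true; }
static void unlock_stub(struct folio_stub *f) { f->locked = false; }

/* Returns true once the folio is locked with no ordered extent pending,
 * false if the folio was released while unlocked and we must start over. */
static bool prepare(struct folio_stub *f, void *mapping)
{
	lock_stub(f);
	while (lookup_ordered()) {
		/* Cannot sleep holding the folio lock: drop it first. */
		unlock_stub(f);
		wait_ordered();
		lock_stub(f);
		/* The folio was unlocked: re-validate before trusting it. */
		if (f->mapping != mapping || !f->private) {
			unlock_stub(f);
			return false; /* caller jumps back to "again:" */
		}
	}
	return true;
}

int main(void)
{
	int dummy_map, dummy_priv;
	struct folio_stub f = { .mapping = &dummy_map, .private = &dummy_priv };

	printf("prepared: %d\n", prepare(&f, &dummy_map));
	return 0;
}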
@@ -927,21 +927,21 @@ again: * Now the page range has no ordered extent any more. Read the page to * make it uptodate. */ - if (!PageUptodate(page)) { - btrfs_read_folio(NULL, page_folio(page)); - lock_page(page); - if (page->mapping != mapping || !PagePrivate(page)) { - unlock_page(page); - put_page(page); + if (!folio_test_uptodate(folio)) { + btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (folio->mapping != mapping || !folio->private) { + folio_unlock(folio); + folio_put(folio); goto again; } - if (!PageUptodate(page)) { - unlock_page(page); - put_page(page); + if (!folio_test_uptodate(folio)) { + folio_unlock(folio); + folio_put(folio); return ERR_PTR(-EIO); } } - return page; + return folio; } struct defrag_target_range { @@ -1046,7 +1046,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, goto add; /* Skip too large extent */ - if (range_len >= extent_thresh) + if (em->len >= extent_thresh) goto next; /* @@ -1162,7 +1162,7 @@ static_assert(PAGE_ALIGNED(CLUSTER_SIZE)); */ static int defrag_one_locked_target(struct btrfs_inode *inode, struct defrag_target_range *target, - struct page **pages, int nr_pages, + struct folio **folios, int nr_pages, struct extent_state **cached_state) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -1171,7 +1171,7 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, const u64 len = target->len; unsigned long last_index = (start + len - 1) >> PAGE_SHIFT; unsigned long start_index = start >> PAGE_SHIFT; - unsigned long first_index = page_index(pages[0]); + unsigned long first_index = folios[0]->index; int ret = 0; int i; @@ -1188,8 +1188,8 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, /* Update the page status */ for (i = start_index - first_index; i <= last_index - first_index; i++) { - ClearPageChecked(pages[i]); - btrfs_folio_clamp_set_dirty(fs_info, page_folio(pages[i]), start, len); + folio_clear_checked(folios[i]); + btrfs_folio_clamp_set_dirty(fs_info, folios[i], start, len); } btrfs_delalloc_release_extents(inode, len); extent_changeset_free(data_reserved); @@ -1205,7 +1205,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, struct defrag_target_range *entry; struct defrag_target_range *tmp; LIST_HEAD(target_list); - struct page **pages; + struct folio **folios; const u32 sectorsize = inode->root->fs_info->sectorsize; u64 last_index = (start + len - 1) >> PAGE_SHIFT; u64 start_index = start >> PAGE_SHIFT; @@ -1216,21 +1216,21 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE); ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize)); - pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); - if (!pages) + folios = kcalloc(nr_pages, sizeof(struct folio *), GFP_NOFS); + if (!folios) return -ENOMEM; /* Prepare all pages */ for (i = 0; i < nr_pages; i++) { - pages[i] = defrag_prepare_one_page(inode, start_index + i); - if (IS_ERR(pages[i])) { - ret = PTR_ERR(pages[i]); - pages[i] = NULL; - goto free_pages; + folios[i] = defrag_prepare_one_folio(inode, start_index + i); + if (IS_ERR(folios[i])) { + ret = PTR_ERR(folios[i]); + nr_pages = i; + goto free_folios; } } for (i = 0; i < nr_pages; i++) - wait_on_page_writeback(pages[i]); + folio_wait_writeback(folios[i]); /* Lock the pages range */ lock_extent(&inode->io_tree, start_index << PAGE_SHIFT, @@ -1250,7 +1250,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, goto unlock_extent; list_for_each_entry(entry, &target_list, list) { - ret =
defrag_one_locked_target(inode, entry, folios, nr_pages, &cached_state); if (ret < 0) break; @@ -1264,14 +1264,12 @@ unlock_extent: unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT, (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, &cached_state); -free_pages: +free_folios: for (i = 0; i < nr_pages; i++) { - if (pages[i]) { - unlock_page(pages[i]); - put_page(pages[i]); - } + folio_unlock(folios[i]); + folio_put(folios[i]); } - kfree(pages); + kfree(folios); return ret; } @@ -1366,7 +1364,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); unsigned long sectors_defragged = 0; u64 isize = i_size_read(inode); u64 cur; @@ -1512,9 +1510,7 @@ void __cold btrfs_auto_defrag_exit(void) int __init btrfs_auto_defrag_init(void) { btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", - sizeof(struct inode_defrag), 0, - SLAB_MEM_SPREAD, - NULL); + sizeof(struct inode_defrag), 0, 0, NULL); if (!btrfs_inode_defrag_cachep) return -ENOMEM; diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h index 5a62763528d1..878528e086fb 100644 --- a/fs/btrfs/defrag.h +++ b/fs/btrfs/defrag.h @@ -3,6 +3,16 @@ #ifndef BTRFS_DEFRAG_H #define BTRFS_DEFRAG_H +#include <linux/types.h> +#include <linux/compiler_types.h> + +struct inode; +struct file_ra_state; +struct btrfs_fs_info; +struct btrfs_root; +struct btrfs_trans_handle; +struct btrfs_ioctl_defrag_range_args; + int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag); diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 2833e8ef4c09..b3527efd0b4b 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -6,9 +6,7 @@ #include "block-rsv.h" #include "btrfs_inode.h" #include "space-info.h" -#include "transaction.h" #include "qgroup.h" -#include "block-group.h" #include "fs.h" /* @@ -245,7 +243,6 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv = &inode->block_rsv; u64 reserve_size = 0; u64 qgroup_rsv_size = 0; - u64 csum_leaves; unsigned outstanding_extents; lockdep_assert_held(&inode->lock); @@ -260,10 +257,12 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, outstanding_extents); reserve_size += btrfs_calc_metadata_size(fs_info, 1); } - csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, - inode->csum_bytes); - reserve_size += btrfs_calc_insert_metadata_size(fs_info, - csum_leaves); + if (!(inode->flags & BTRFS_INODE_NODATASUM)) { + u64 csum_leaves; + + csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes); + reserve_size += btrfs_calc_insert_metadata_size(fs_info, csum_leaves); + } /* * For qgroup rsv, the calculation is very simple: * account one nodesize for each outstanding extent @@ -278,14 +277,20 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, spin_unlock(&block_rsv->lock); } -static void calc_inode_reservations(struct btrfs_fs_info *fs_info, +static void calc_inode_reservations(struct btrfs_inode *inode, u64 num_bytes, u64 disk_num_bytes, u64 *meta_reserve, u64 *qgroup_reserve) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 nr_extents = count_max_extents(fs_info, num_bytes); - u64 csum_leaves = 
btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes); + u64 csum_leaves; u64 inode_update = btrfs_calc_metadata_size(fs_info, 1); + if (inode->flags & BTRFS_INODE_NODATASUM) + csum_leaves = 0; + else + csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes); + *meta_reserve = btrfs_calc_insert_metadata_size(fs_info, nr_extents + csum_leaves); @@ -337,7 +342,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, * everything out and try again, which is bad. This way we just * over-reserve slightly, and clean up the mess when we are done. */ - calc_inode_reservations(fs_info, num_bytes, disk_num_bytes, + calc_inode_reservations(inode, num_bytes, disk_num_bytes, &meta_reserve, &qgroup_reserve); ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true, noflush); @@ -359,7 +364,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, nr_extents = count_max_extents(fs_info, num_bytes); spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, nr_extents); - inode->csum_bytes += disk_num_bytes; + if (!(inode->flags & BTRFS_INODE_NODATASUM)) + inode->csum_bytes += disk_num_bytes; btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); @@ -393,7 +399,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, num_bytes = ALIGN(num_bytes, fs_info->sectorsize); spin_lock(&inode->lock); - inode->csum_bytes -= num_bytes; + if (!(inode->flags & BTRFS_INODE_NODATASUM)) + inode->csum_bytes -= num_bytes; btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h index c5d573f2366e..ce4f889e4f17 100644 --- a/fs/btrfs/delalloc-space.h +++ b/fs/btrfs/delalloc-space.h @@ -3,7 +3,11 @@ #ifndef BTRFS_DELALLOC_SPACE_H #define BTRFS_DELALLOC_SPACE_H +#include <linux/types.h> + struct extent_changeset; +struct btrfs_inode; +struct btrfs_fs_info; int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); int btrfs_check_data_free_space(struct btrfs_inode *inode, diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 08102883f560..dd6f566a383f 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -28,11 +28,7 @@ static struct kmem_cache *delayed_node_cache; int __init btrfs_delayed_inode_init(void) { - delayed_node_cache = kmem_cache_create("btrfs_delayed_node", - sizeof(struct btrfs_delayed_node), - 0, - SLAB_MEM_SPREAD, - NULL); + delayed_node_cache = KMEM_CACHE(btrfs_delayed_node, 0); if (!delayed_node_cache) return -ENOMEM; return 0; @@ -43,6 +39,17 @@ void __cold btrfs_delayed_inode_exit(void) kmem_cache_destroy(delayed_node_cache); } +void btrfs_init_delayed_root(struct btrfs_delayed_root *delayed_root) +{ + atomic_set(&delayed_root->items, 0); + atomic_set(&delayed_root->items_seq, 0); + delayed_root->nodes = 0; + spin_lock_init(&delayed_root->lock); + init_waitqueue_head(&delayed_root->wait); + INIT_LIST_HEAD(&delayed_root->node_list); + INIT_LIST_HEAD(&delayed_root->prepare_list); +} + static inline void btrfs_init_delayed_node( struct btrfs_delayed_node *delayed_node, struct btrfs_root *root, u64 inode_id) @@ -430,8 +437,6 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) delayed_root = delayed_node->root->fs_info->delayed_root; - BUG_ON(!delayed_root); - if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM) root = &delayed_node->ins_root; else @@ -980,7 +985,7 @@ static void btrfs_release_delayed_inode(struct 
btrfs_delayed_node *delayed_node) if (delayed_node && test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { - BUG_ON(!delayed_node->root); + ASSERT(delayed_node->root); clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); delayed_node->count--; diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 5cceb31bbd16..64e115d97499 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -7,15 +7,23 @@ #ifndef BTRFS_DELAYED_INODE_H #define BTRFS_DELAYED_INODE_H +#include <linux/types.h> #include <linux/rbtree.h> #include <linux/spinlock.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/wait.h> +#include <linux/fs.h> #include <linux/atomic.h> #include <linux/refcount.h> #include "ctree.h" +struct btrfs_disk_key; +struct btrfs_fs_info; +struct btrfs_inode; +struct btrfs_root; +struct btrfs_trans_handle; + enum btrfs_delayed_item_type { BTRFS_DELAYED_INSERTION_ITEM, BTRFS_DELAYED_DELETION_ITEM @@ -98,18 +106,7 @@ struct btrfs_delayed_item { char data[] __counted_by(data_len); }; -static inline void btrfs_init_delayed_root( - struct btrfs_delayed_root *delayed_root) -{ - atomic_set(&delayed_root->items, 0); - atomic_set(&delayed_root->items_seq, 0); - delayed_root->nodes = 0; - spin_lock_init(&delayed_root->lock); - init_waitqueue_head(&delayed_root->wait); - INIT_LIST_HEAD(&delayed_root->node_list); - INIT_LIST_HEAD(&delayed_root->prepare_list); -} - +void btrfs_init_delayed_root(struct btrfs_delayed_root *delayed_root); int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, const char *name, int name_len, struct btrfs_inode *dir, diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 891ea2fa263c..e44e62cf76bc 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -1004,6 +1004,52 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info, INIT_LIST_HEAD(&ref->add_list); } +void btrfs_init_generic_ref(struct btrfs_ref *generic_ref, int action, u64 bytenr, + u64 len, u64 parent, u64 owning_root) +{ + generic_ref->action = action; + generic_ref->bytenr = bytenr; + generic_ref->len = len; + generic_ref->parent = parent; + generic_ref->owning_root = owning_root; +} + +void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 root, + u64 mod_root, bool skip_qgroup) +{ +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + /* If @real_root not set, use @root as fallback */ + generic_ref->real_root = mod_root ?: root; +#endif + generic_ref->tree_ref.level = level; + generic_ref->tree_ref.ref_root = root; + generic_ref->type = BTRFS_REF_METADATA; + if (skip_qgroup || !(is_fstree(root) && + (!mod_root || is_fstree(mod_root)))) + generic_ref->skip_qgroup = true; + else + generic_ref->skip_qgroup = false; + +} + +void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ref_root, u64 ino, + u64 offset, u64 mod_root, bool skip_qgroup) +{ +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + /* If @real_root not set, use @root as fallback */ + generic_ref->real_root = mod_root ?: ref_root; +#endif + generic_ref->data_ref.ref_root = ref_root; + generic_ref->data_ref.ino = ino; + generic_ref->data_ref.offset = offset; + generic_ref->type = BTRFS_REF_DATA; + if (skip_qgroup || !(is_fstree(ref_root) && + (!mod_root || is_fstree(mod_root)))) + generic_ref->skip_qgroup = true; + else + generic_ref->skip_qgroup = false; +} + /* * add a delayed tree ref. 
This does all of the accounting required * to make sure the delayed ref is eventually processed before this @@ -1220,6 +1266,25 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, return 0; } +void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) +{ + if (refcount_dec_and_test(&ref->refs)) { + WARN_ON(!RB_EMPTY_NODE(&ref->ref_node)); + switch (ref->type) { + case BTRFS_TREE_BLOCK_REF_KEY: + case BTRFS_SHARED_BLOCK_REF_KEY: + kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + case BTRFS_SHARED_DATA_REF_KEY: + kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); + break; + default: + BUG(); + } + } +} + /* * This does a simple search for the head node for a given extent. Returns the * head node if found, or NULL if not. @@ -1242,31 +1307,19 @@ void __cold btrfs_delayed_ref_exit(void) int __init btrfs_delayed_ref_init(void) { - btrfs_delayed_ref_head_cachep = kmem_cache_create( - "btrfs_delayed_ref_head", - sizeof(struct btrfs_delayed_ref_head), 0, - SLAB_MEM_SPREAD, NULL); + btrfs_delayed_ref_head_cachep = KMEM_CACHE(btrfs_delayed_ref_head, 0); if (!btrfs_delayed_ref_head_cachep) goto fail; - btrfs_delayed_tree_ref_cachep = kmem_cache_create( - "btrfs_delayed_tree_ref", - sizeof(struct btrfs_delayed_tree_ref), 0, - SLAB_MEM_SPREAD, NULL); + btrfs_delayed_tree_ref_cachep = KMEM_CACHE(btrfs_delayed_tree_ref, 0); if (!btrfs_delayed_tree_ref_cachep) goto fail; - btrfs_delayed_data_ref_cachep = kmem_cache_create( - "btrfs_delayed_data_ref", - sizeof(struct btrfs_delayed_data_ref), 0, - SLAB_MEM_SPREAD, NULL); + btrfs_delayed_data_ref_cachep = KMEM_CACHE(btrfs_delayed_data_ref, 0); if (!btrfs_delayed_data_ref_cachep) goto fail; - btrfs_delayed_extent_op_cachep = kmem_cache_create( - "btrfs_delayed_extent_op", - sizeof(struct btrfs_delayed_extent_op), 0, - SLAB_MEM_SPREAD, NULL); + btrfs_delayed_extent_op_cachep = KMEM_CACHE(btrfs_delayed_extent_op, 0); if (!btrfs_delayed_extent_op_cachep) goto fail; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 62d679d40f4f..b291147cb8ab 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -6,7 +6,17 @@ #ifndef BTRFS_DELAYED_REF_H #define BTRFS_DELAYED_REF_H +#include <linux/types.h> #include <linux/refcount.h> +#include <linux/list.h> +#include <linux/rbtree.h> +#include <linux/mutex.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <uapi/linux/btrfs_tree.h> + +struct btrfs_trans_handle; +struct btrfs_fs_info; /* these are the possible values of struct btrfs_delayed_ref_node->action */ enum btrfs_delayed_ref_action { @@ -308,53 +318,12 @@ static inline u64 btrfs_calc_delayed_ref_csum_bytes(const struct btrfs_fs_info * return btrfs_calc_metadata_size(fs_info, num_csum_items); } -static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref, - int action, u64 bytenr, u64 len, - u64 parent, u64 owning_root) -{ - generic_ref->action = action; - generic_ref->bytenr = bytenr; - generic_ref->len = len; - generic_ref->parent = parent; - generic_ref->owning_root = owning_root; -} - -static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, - u64 root, u64 mod_root, bool skip_qgroup) -{ -#ifdef CONFIG_BTRFS_FS_REF_VERIFY - /* If @real_root not set, use @root as fallback */ - generic_ref->real_root = mod_root ?: root; -#endif - generic_ref->tree_ref.level = level; - generic_ref->tree_ref.ref_root = root; - generic_ref->type = BTRFS_REF_METADATA; - if (skip_qgroup || !(is_fstree(root) && - (!mod_root || 
is_fstree(mod_root)))) - generic_ref->skip_qgroup = true; - else - generic_ref->skip_qgroup = false; - -} - -static inline void btrfs_init_data_ref(struct btrfs_ref *generic_ref, - u64 ref_root, u64 ino, u64 offset, u64 mod_root, - bool skip_qgroup) -{ -#ifdef CONFIG_BTRFS_FS_REF_VERIFY - /* If @real_root not set, use @root as fallback */ - generic_ref->real_root = mod_root ?: ref_root; -#endif - generic_ref->data_ref.ref_root = ref_root; - generic_ref->data_ref.ino = ino; - generic_ref->data_ref.offset = offset; - generic_ref->type = BTRFS_REF_DATA; - if (skip_qgroup || !(is_fstree(ref_root) && - (!mod_root || is_fstree(mod_root)))) - generic_ref->skip_qgroup = true; - else - generic_ref->skip_qgroup = false; -} +void btrfs_init_generic_ref(struct btrfs_ref *generic_ref, int action, u64 bytenr, + u64 len, u64 parent, u64 owning_root); +void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 root, + u64 mod_root, bool skip_qgroup); +void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ref_root, u64 ino, + u64 offset, u64 mod_root, bool skip_qgroup); static inline struct btrfs_delayed_extent_op * btrfs_alloc_delayed_extent_op(void) @@ -369,24 +338,7 @@ btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op) kmem_cache_free(btrfs_delayed_extent_op_cachep, op); } -static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) -{ - if (refcount_dec_and_test(&ref->refs)) { - WARN_ON(!RB_EMPTY_NODE(&ref->ref_node)); - switch (ref->type) { - case BTRFS_TREE_BLOCK_REF_KEY: - case BTRFS_SHARED_BLOCK_REF_KEY: - kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); - break; - case BTRFS_EXTENT_DATA_REF_KEY: - case BTRFS_SHARED_DATA_REF_KEY: - kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); - break; - default: - BUG(); - } - } -} +void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref); static inline u64 btrfs_ref_head_to_space_flags( struct btrfs_delayed_ref_head *head_ref) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 1502d664c892..7696beec4c21 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -11,10 +11,8 @@ #include <linux/math64.h> #include "misc.h" #include "ctree.h" -#include "extent_map.h" #include "disk-io.h" #include "transaction.h" -#include "print-tree.h" #include "volumes.h" #include "async-thread.h" #include "dev-replace.h" @@ -246,7 +244,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct block_device *bdev; u64 devid = BTRFS_DEV_REPLACE_DEVID; int ret = 0; @@ -257,13 +255,13 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return -EINVAL; } - bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE, + bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE, fs_info->bdev_holder, NULL); - if (IS_ERR(bdev_handle)) { + if (IS_ERR(bdev_file)) { btrfs_err(fs_info, "target device %s is invalid!", device_path); - return PTR_ERR(bdev_handle); + return PTR_ERR(bdev_file); } - bdev = bdev_handle->bdev; + bdev = file_bdev(bdev_file); if (!btrfs_check_device_zone_type(fs_info, bdev)) { btrfs_err(fs_info, @@ -314,7 +312,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, device->commit_bytes_used = device->bytes_used; device->fs_info = fs_info; device->bdev = bdev; - device->bdev_handle = bdev_handle; + device->bdev_file = bdev_file; 
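/*
 * [Editor's aside, not part of the patch] The dev-replace hunk above swaps
 * the old struct bdev_handle for a struct file returned by
 * bdev_file_open_by_path(); the block device is then reached via
 * file_bdev() and the error path becomes a plain fput(), as the modified
 * "error:" label below shows. A hedged kernel-style sketch of the pairing
 * (not compilable on its own; the validity check is a hypothetical
 * placeholder for checks like btrfs_check_device_zone_type()):
 *
 *	struct file *bdev_file;
 *	struct block_device *bdev;
 *
 *	bdev_file = bdev_file_open_by_path(path, BLK_OPEN_WRITE, holder, NULL);
 *	if (IS_ERR(bdev_file))
 *		return PTR_ERR(bdev_file);
 *	bdev = file_bdev(bdev_file);
 *	if (!device_is_usable(bdev)) {	// hypothetical check
 *		fput(bdev_file);	// one reference, one release
 *		return -EINVAL;
 *	}
 */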
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); device->dev_stats_valid = 1; @@ -335,7 +333,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return 0; error: - bdev_release(bdev_handle); + fput(bdev_file); return ret; } @@ -725,6 +723,23 @@ leave: return ret; } +static int btrfs_check_replace_dev_names(struct btrfs_ioctl_dev_replace_args *args) +{ + if (args->start.srcdevid == 0) { + if (memchr(args->start.srcdev_name, 0, + sizeof(args->start.srcdev_name)) == NULL) + return -ENAMETOOLONG; + } else { + args->start.srcdev_name[0] = 0; + } + + if (memchr(args->start.tgtdev_name, 0, + sizeof(args->start.tgtdev_name)) == NULL) + return -ENAMETOOLONG; + + return 0; +} + int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args) { @@ -737,10 +752,9 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, default: return -EINVAL; } - - if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') || - args->start.tgtdev_name[0] == '\0') - return -EINVAL; + ret = btrfs_check_replace_dev_names(args); + if (ret < 0) + return ret; ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name, args->start.srcdevid, @@ -984,8 +998,7 @@ error: btrfs_sysfs_remove_device(src_device); btrfs_sysfs_update_devid(tgt_device); if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state)) - btrfs_scratch_superblocks(fs_info, src_device->bdev, - src_device->name->str); + btrfs_scratch_superblocks(fs_info, src_device); /* write back the superblocks */ trans = btrfs_start_transaction(root, 0); diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 675082ccec89..23e480efe5e6 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -6,11 +6,15 @@ #ifndef BTRFS_DEV_REPLACE_H #define BTRFS_DEV_REPLACE_H +#include <linux/types.h> +#include <linux/compiler_types.h> + struct btrfs_ioctl_dev_replace_args; struct btrfs_fs_info; struct btrfs_trans_handle; struct btrfs_dev_replace; struct btrfs_block_group; +struct btrfs_device; int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info); int btrfs_run_dev_replace(struct btrfs_trans_handle *trans); diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h index e40a226373d7..00b3d83d7569 100644 --- a/fs/btrfs/dir-item.h +++ b/fs/btrfs/dir-item.h @@ -3,9 +3,15 @@ #ifndef BTRFS_DIR_ITEM_H #define BTRFS_DIR_ITEM_H +#include <linux/types.h> #include <linux/crc32c.h> struct fscrypt_str; +struct btrfs_fs_info; +struct btrfs_key; +struct btrfs_path; +struct btrfs_root; +struct btrfs_trans_handle; int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, const struct fscrypt_str *name); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c6907d533fe8..3df5477d48a8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -29,7 +29,6 @@ #include "tree-log.h" #include "free-space-cache.h" #include "free-space-tree.h" -#include "rcu-string.h" #include "dev-replace.h" #include "raid56.h" #include "sysfs.h" @@ -193,7 +192,7 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, struct folio *folio = eb->folios[i]; u64 start = max_t(u64, eb->start, folio_pos(folio)); u64 end = min_t(u64, eb->start + eb->len, - folio_pos(folio) + folio_size(folio)); + folio_pos(folio) + eb->folio_size); u32 len = end - start; ret = btrfs_repair_io_failure(fs_info, 0, start, len, @@ -498,15 +497,15 @@ static int btree_migrate_folio(struct address_space *mapping, static int 
btree_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct btrfs_fs_info *fs_info; int ret; if (wbc->sync_mode == WB_SYNC_NONE) { + struct btrfs_fs_info *fs_info; if (wbc->for_kupdate) return 0; - fs_info = BTRFS_I(mapping->host)->root->fs_info; + fs_info = inode_to_fs_info(mapping->host); /* this is a bit racy, but that's ok */ ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes, BTRFS_DIRTY_METADATA_THRESH, @@ -529,11 +528,12 @@ static void btree_invalidate_folio(struct folio *folio, size_t offset, size_t length) { struct extent_io_tree *tree; - tree = &BTRFS_I(folio->mapping->host)->io_tree; + + tree = &folio_to_inode(folio)->io_tree; extent_invalidate_folio(tree, folio, offset); btree_release_folio(folio, GFP_NOFS); if (folio_get_private(folio)) { - btrfs_warn(BTRFS_I(folio->mapping->host)->root->fs_info, + btrfs_warn(folio_to_fs_info(folio), "folio private not zero on folio %llu", (unsigned long long)folio_pos(folio)); folio_detach_private(folio); @@ -544,7 +544,7 @@ static void btree_invalidate_folio(struct folio *folio, size_t offset, static bool btree_dirty_folio(struct address_space *mapping, struct folio *folio) { - struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); struct btrfs_subpage_info *spi = fs_info->subpage_info; struct btrfs_subpage *subpage; struct extent_buffer *eb; @@ -1244,6 +1244,7 @@ void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info) btrfs_err(fs_info, "leaked root %s refcount %d", btrfs_root_name(&root->root_key, buf), refcount_read(&root->refs)); + WARN_ON_ONCE(1); while (refcount_read(&root->refs) > 1) btrfs_put_root(root); btrfs_put_root(root); @@ -1307,12 +1308,12 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) * * @objectid: root id * @anon_dev: preallocated anonymous block device number for new roots, - * pass 0 for new allocation. + * pass NULL for a new allocation. * @check_ref: whether to check root item references, If true, return -ENOENT * for orphan roots */ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info, - u64 objectid, dev_t anon_dev, + u64 objectid, dev_t *anon_dev, bool check_ref) { struct btrfs_root *root; @@ -1336,8 +1337,17 @@ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info, again: root = btrfs_lookup_fs_root(fs_info, objectid); if (root) { - /* Shouldn't get preallocated anon_dev for cached roots */ - ASSERT(!anon_dev); + /* + * Some other caller may have read out the newly inserted + * subvolume already (for things like backref walk etc). Not + * that common but still possible. In that case, we just need + * to free the anon_dev. + */ + if (unlikely(anon_dev && *anon_dev)) { + free_anon_bdev(*anon_dev); + *anon_dev = 0; + } + if (check_ref && btrfs_root_refs(&root->root_item) == 0) { btrfs_put_root(root); return ERR_PTR(-ENOENT); @@ -1357,7 +1367,7 @@ again: goto fail; } - ret = btrfs_init_fs_root(root, anon_dev); + ret = btrfs_init_fs_root(root, anon_dev ? *anon_dev : 0); if (ret) goto fail; @@ -1393,7 +1403,7 @@ fail: * root's anon_dev to 0 to avoid a double free, once by btrfs_put_root() * and once again by our caller. 
*/ - if (anon_dev) + if (anon_dev && *anon_dev) root->anon_dev = 0; btrfs_put_root(root); return ERR_PTR(ret); @@ -1409,7 +1419,7 @@ fail: struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, u64 objectid, bool check_ref) { - return btrfs_get_root_ref(fs_info, objectid, 0, check_ref); + return btrfs_get_root_ref(fs_info, objectid, NULL, check_ref); } /* @@ -1417,11 +1427,11 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, * the anonymous block device id * * @objectid: tree objectid - * @anon_dev: if zero, allocate a new anonymous block device or use the - * parameter value + * @anon_dev: if NULL, allocate a new anonymous block device; otherwise + * use the device number it points to */ struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, - u64 objectid, dev_t anon_dev) + u64 objectid, dev_t *anon_dev) { return btrfs_get_root_ref(fs_info, objectid, anon_dev, true); } @@ -2230,7 +2240,7 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) struct btrfs_key location; int ret; - BUG_ON(!fs_info->tree_root); + ASSERT(fs_info->tree_root); ret = load_global_roots(tree_root); if (ret) @@ -2830,6 +2840,7 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block int ret; fs_info->sb = sb; + /* Temporary fixed values for block size until we read the superblock. */ sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE; sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE); @@ -3347,6 +3358,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super); sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE); + /* Update the values for the current filesystem. */ sb->s_blocksize = sectorsize; sb->s_blocksize_bits = blksize_bits(sectorsize); memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE); @@ -4617,7 +4629,7 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) struct inode *inode = NULL; btrfs_inode = list_first_entry(&splice, struct btrfs_inode, delalloc_inodes); - __btrfs_del_delalloc_inode(root, btrfs_inode); + btrfs_del_delalloc_inode(btrfs_inode); spin_unlock(&root->delalloc_lock); /* @@ -4916,7 +4928,14 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) goto error; - BUG_ON(ret == 0); /* Corruption */ + if (ret == 0) { + /* + * Key with offset -1 found: a root with such an id would have + * to exist, but that is outside the valid range.
+ */ + ret = -EUCLEAN; + goto error; + } if (path->slots[0] > 0) { slot = path->slots[0] - 1; l = path->nodes[0]; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 9413726b329b..76eb53fe7a11 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -6,6 +6,22 @@ #ifndef BTRFS_DISK_IO_H #define BTRFS_DISK_IO_H +#include <linux/sizes.h> +#include <linux/compiler_types.h> +#include "ctree.h" +#include "fs.h" + +struct block_device; +struct super_block; +struct extent_buffer; +struct btrfs_device; +struct btrfs_fs_devices; +struct btrfs_fs_info; +struct btrfs_super_block; +struct btrfs_trans_handle; +struct btrfs_tree_parent_check; +struct btrfs_transaction; + #define BTRFS_SUPER_MIRROR_MAX 3 #define BTRFS_SUPER_MIRROR_SHIFT 12 @@ -25,10 +41,6 @@ static inline u64 btrfs_sb_offset(int mirror) return BTRFS_SUPER_INFO_OFFSET; } -struct btrfs_device; -struct btrfs_fs_devices; -struct btrfs_tree_parent_check; - void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info); void btrfs_init_fs_info(struct btrfs_fs_info *fs_info); struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, @@ -61,7 +73,7 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info); struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, u64 objectid, bool check_ref); struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, - u64 objectid, dev_t anon_dev); + u64 objectid, dev_t *anon_dev); struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 objectid); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 744a02b7fd67..8398d345ec5b 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -5,7 +5,6 @@ #include "ctree.h" #include "disk-io.h" #include "btrfs_inode.h" -#include "print-tree.h" #include "export.h" #include "accessors.h" #include "super.h" @@ -174,8 +173,15 @@ struct dentry *btrfs_get_parent(struct dentry *child) ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto fail; + if (ret == 0) { + /* + * Key with offset of -1 found: an inode with such a number or + * a root with such an id would have to exist.
+ */ + ret = -EUCLEAN; + goto fail; + } - BUG_ON(ret == 0); /* Key with offset of -1 found */ if (path->slots[0] == 0) { ret = -ENOENT; goto fail; @@ -215,7 +221,7 @@ static int btrfs_get_name(struct dentry *parent, char *name, { struct inode *inode = d_inode(child); struct inode *dir = d_inode(parent); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_path *path; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_inode_ref *iref; diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h index eba6bc4f5a61..464582273af9 100644 --- a/fs/btrfs/export.h +++ b/fs/btrfs/export.h @@ -4,6 +4,10 @@ #define BTRFS_EXPORT_H #include <linux/exportfs.h> +#include <linux/types.h> + +struct dentry; +struct super_block; extern const struct export_operations btrfs_export_ops; diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index e3ee5449cc4a..c09b428823d7 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -6,7 +6,6 @@ #include "ctree.h" #include "extent-io-tree.h" #include "btrfs_inode.h" -#include "misc.h" static struct kmem_cache *extent_state_cache; @@ -48,6 +47,7 @@ static inline void btrfs_extent_state_leak_debug_check(void) extent_state_in_tree(state), refcount_read(&state->refs)); list_del(&state->leak_list); + WARN_ON_ONCE(1); kmem_cache_free(extent_state_cache, state); } } @@ -1883,8 +1883,8 @@ void __cold extent_state_free_cachep(void) int __init extent_state_init_cachep(void) { extent_state_cache = kmem_cache_create("btrfs_extent_state", - sizeof(struct extent_state), 0, - SLAB_MEM_SPREAD, NULL); + sizeof(struct extent_state), 0, 0, + NULL); if (!extent_state_cache) return -ENOMEM; diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index ebe6390d65e9..9d3a52d8f59a 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -3,9 +3,16 @@ #ifndef BTRFS_EXTENT_IO_TREE_H #define BTRFS_EXTENT_IO_TREE_H +#include <linux/rbtree.h> +#include <linux/spinlock.h> +#include <linux/refcount.h> +#include <linux/list.h> +#include <linux/wait.h> #include "misc.h" struct extent_changeset; +struct btrfs_fs_info; +struct btrfs_inode; /* Bits for the extent state */ enum { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f396aba92c57..beedd6ed64d3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -18,7 +18,7 @@ #include <linux/crc32c.h> #include "ctree.h" #include "extent-tree.h" -#include "tree-log.h" +#include "transaction.h" #include "disk-io.h" #include "print-tree.h" #include "volumes.h" @@ -26,14 +26,11 @@ #include "locking.h" #include "free-space-cache.h" #include "free-space-tree.h" -#include "sysfs.h" #include "qgroup.h" #include "ref-verify.h" #include "space-info.h" #include "block-rsv.h" -#include "delalloc-space.h" #include "discard.h" -#include "rcu-string.h" #include "zoned.h" #include "dev-replace.h" #include "fs.h" @@ -1260,7 +1257,8 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, u64 bytes_left, end; u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT); - if (WARN_ON(start != aligned_start)) { + /* Adjust the range to be aligned to 512B sectors if necessary. 
*/ + if (start != aligned_start) { len -= aligned_start - start; len = round_down(len, 1 << SECTOR_SHIFT); start = aligned_start; @@ -2398,7 +2396,14 @@ static noinline int check_committed_ref(struct btrfs_root *root, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) goto out; - BUG_ON(ret == 0); /* Corruption */ + if (ret == 0) { + /* + * Key with offset -1 found: an extent item with such an offset + * would have to exist, but that is outside the valid range. + */ + ret = -EUCLEAN; + goto out; + } ret = -ENOENT; if (path->slots[0] == 0) @@ -2779,6 +2784,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, u64 total_unpinned = 0; u64 empty_cluster = 0; bool readonly; + int ret = 0; while (start <= end) { readonly = false; @@ -2788,7 +2794,11 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, btrfs_put_block_group(cache); total_unpinned = 0; cache = btrfs_lookup_block_group(fs_info, start); - BUG_ON(!cache); /* Logic error */ + if (cache == NULL) { + /* Logic error, something removed the block group. */ + ret = -EUCLEAN; + goto out; + } cluster = fetch_cluster_info(fs_info, cache->space_info, @@ -2857,7 +2867,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, if (cache) btrfs_put_block_group(cache); - return 0; +out: + return ret; } int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) @@ -2887,7 +2898,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) end + 1 - start, NULL); clear_extent_dirty(unpin, start, end, &cached_state); - unpin_extent_range(fs_info, start, end, true); + ret = unpin_extent_range(fs_info, start, end, true); + BUG_ON(ret); mutex_unlock(&fs_info->unused_bg_unpin_mutex); free_extent_state(cached_state); cond_resched(); @@ -3446,16 +3458,17 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, u64 parent, int last_ref) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_ref generic_ref = { 0 }; struct btrfs_block_group *bg; int ret; - btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, - buf->start, buf->len, parent, btrfs_header_owner(buf)); - btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), - root_id, 0, false); if (root_id != BTRFS_TREE_LOG_OBJECTID) { + struct btrfs_ref generic_ref = { 0 }; + + btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, + buf->start, buf->len, parent, + btrfs_header_owner(buf)); + btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), + root_id, 0, false); btrfs_ref_tree_mod(fs_info, &generic_ref); ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL); BUG_ON(ret); /* -ENOMEM */ @@ -4298,6 +4311,42 @@ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, return 0; } +static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl) +{ + if (ffe_ctl->for_treelog) { + spin_lock(&fs_info->treelog_bg_lock); + if (fs_info->treelog_bg) + ffe_ctl->hint_byte = fs_info->treelog_bg; + spin_unlock(&fs_info->treelog_bg_lock); + } else if (ffe_ctl->for_data_reloc) { + spin_lock(&fs_info->relocation_bg_lock); + if (fs_info->data_reloc_bg) + ffe_ctl->hint_byte = fs_info->data_reloc_bg; + spin_unlock(&fs_info->relocation_bg_lock); + } else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) { + struct btrfs_block_group *block_group; + + spin_lock(&fs_info->zone_active_bgs_lock); + list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { + /* + * No lock is OK here because avail is monotonically + * decreasing, and this is just a
hint. + */ + u64 avail = block_group->zone_capacity - block_group->alloc_offset; + + if (block_group_bits(block_group, ffe_ctl->flags) && + avail >= ffe_ctl->num_bytes) { + ffe_ctl->hint_byte = block_group->start; + break; + } + } + spin_unlock(&fs_info->zone_active_bgs_lock); + } + + return 0; +} + static int prepare_allocation(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl, struct btrfs_space_info *space_info, @@ -4308,19 +4357,7 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, return prepare_allocation_clustered(fs_info, ffe_ctl, space_info, ins); case BTRFS_EXTENT_ALLOC_ZONED: - if (ffe_ctl->for_treelog) { - spin_lock(&fs_info->treelog_bg_lock); - if (fs_info->treelog_bg) - ffe_ctl->hint_byte = fs_info->treelog_bg; - spin_unlock(&fs_info->treelog_bg_lock); - } - if (ffe_ctl->for_data_reloc) { - spin_lock(&fs_info->relocation_bg_lock); - if (fs_info->data_reloc_bg) - ffe_ctl->hint_byte = fs_info->data_reloc_bg; - spin_unlock(&fs_info->relocation_bg_lock); - } - return 0; + return prepare_allocation_zoned(fs_info, ffe_ctl); default: BUG(); } @@ -4925,7 +4962,7 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 root_objectid = root->root_key.objectid; u64 owning_root = root_objectid; - BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); + ASSERT(root_objectid != BTRFS_TREE_LOG_OBJECTID); if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root)) owning_root = root->relocation_src_root; @@ -6142,10 +6179,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, return ret; } -int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, - u64 start, u64 end) +/* + * Unpin the extent range in an error context and don't add the space back. + * Errors are not propagated further. 
+ */ +void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end) { - return unpin_extent_range(fs_info, start, end, false); + unpin_extent_range(fs_info, start, end, false); } /* diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 2e066035ccee..af9f8800d5ac 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -3,11 +3,21 @@ #ifndef BTRFS_EXTENT_TREE_H #define BTRFS_EXTENT_TREE_H +#include <linux/types.h> #include "misc.h" #include "block-group.h" +#include "locking.h" +struct extent_buffer; struct btrfs_free_cluster; +struct btrfs_fs_info; +struct btrfs_root; +struct btrfs_path; +struct btrfs_ref; +struct btrfs_disk_key; struct btrfs_delayed_ref_head; +struct btrfs_delayed_ref_root; +struct btrfs_extent_inline_ref; enum btrfs_extent_allocation_policy { BTRFS_EXTENT_ALLOC_CLUSTERED, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cfd2967f04a2..7441245b1ceb 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -14,7 +14,6 @@ #include <linux/pagevec.h> #include <linux/prefetch.h> #include <linux/fsverity.h> -#include "misc.h" #include "extent_io.h" #include "extent-io-tree.h" #include "extent_map.h" @@ -22,7 +21,6 @@ #include "btrfs_inode.h" #include "bio.h" #include "locking.h" -#include "rcu-string.h" #include "backref.h" #include "disk-io.h" #include "subpage.h" @@ -78,10 +76,11 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) eb = list_first_entry(&fs_info->allocated_ebs, struct extent_buffer, leak_list); pr_err( - "BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n", + "BTRFS: buffer leak start %llu len %u refs %d bflags %lu owner %llu\n", eb->start, eb->len, atomic_read(&eb->refs), eb->bflags, btrfs_header_owner(eb)); list_del(&eb->leak_list); + WARN_ON_ONCE(1); kmem_cache_free(extent_buffer_cache, eb); } spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); @@ -147,8 +146,8 @@ static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret) int __init extent_buffer_init_cachep(void) { extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", - sizeof(struct extent_buffer), 0, - SLAB_MEM_SPREAD, NULL); + sizeof(struct extent_buffer), 0, 0, + NULL); if (!extent_buffer_cache) return -ENOMEM; @@ -207,7 +206,7 @@ static void __process_pages_contig(struct address_space *mapping, struct page *locked_page, u64 start, u64 end, unsigned long page_ops) { - struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); pgoff_t start_index = start >> PAGE_SHIFT; pgoff_t end_index = end >> PAGE_SHIFT; pgoff_t index = start_index; @@ -251,7 +250,7 @@ static noinline int lock_delalloc_pages(struct inode *inode, u64 start, u64 end) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct address_space *mapping = inode->i_mapping; pgoff_t start_index = start >> PAGE_SHIFT; pgoff_t end_index = end >> PAGE_SHIFT; @@ -323,7 +322,7 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, struct page *locked_page, u64 *start, u64 *end) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; const u64 orig_start = *start; const u64 orig_end = *end; @@ -433,7 +432,7 @@ static bool btrfs_verify_page(struct page *page, u64 start) static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) { 
- struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct btrfs_fs_info *fs_info = page_to_fs_info(page); struct folio *folio = page_folio(page); ASSERT(page_offset(page) <= start && @@ -462,16 +461,15 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) */ static void end_bbio_data_write(struct btrfs_bio *bbio) { + struct btrfs_fs_info *fs_info = bbio->fs_info; struct bio *bio = &bbio->bio; int error = blk_status_to_errno(bio->bi_status); struct folio_iter fi; + const u32 sectorsize = fs_info->sectorsize; ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_folio_all(fi, bio) { struct folio *folio = fi.folio; - struct inode *inode = folio->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - const u32 sectorsize = fs_info->sectorsize; u64 start = folio_pos(folio) + fi.offset; u32 len = fi.length; @@ -593,22 +591,17 @@ static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) */ static void end_bbio_data_read(struct btrfs_bio *bbio) { + struct btrfs_fs_info *fs_info = bbio->fs_info; struct bio *bio = &bbio->bio; struct processed_extent processed = { 0 }; struct folio_iter fi; - /* - * The offset to the beginning of a bio, since one bio can never be - * larger than UINT_MAX, u32 here is enough. - */ - u32 bio_offset = 0; + const u32 sectorsize = fs_info->sectorsize; ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_folio_all(fi, &bbio->bio) { bool uptodate = !bio->bi_status; struct folio *folio = fi.folio; struct inode *inode = folio->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - const u32 sectorsize = fs_info->sectorsize; u64 start; u64 end; u32 len; @@ -667,10 +660,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) end_page_read(folio_page(folio, 0), uptodate, start, len); endio_readpage_release_extent(&processed, BTRFS_I(inode), start, end, uptodate); - - ASSERT(bio_offset + len > bio_offset); - bio_offset += len; - } /* Release the last extent */ endio_readpage_release_extent(&processed, NULL, 0, 0, false); @@ -738,6 +727,8 @@ static int alloc_eb_folio_array(struct extent_buffer *eb, gfp_t extra_gfp) for (int i = 0; i < num_pages; i++) eb->folios[i] = page_folio(page_array[i]); + eb->folio_size = PAGE_SIZE; + eb->folio_shift = PAGE_SHIFT; return 0; } @@ -827,7 +818,7 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, u64 disk_bytenr, struct page *page, size_t size, unsigned long pg_offset) { - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct btrfs_inode *inode = page_to_inode(page); ASSERT(pg_offset + size <= PAGE_SIZE); ASSERT(bio_ctrl->end_io_func); @@ -936,17 +927,21 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb, int set_page_extent_mapped(struct page *page) { - struct folio *folio = page_folio(page); + return set_folio_extent_mapped(page_folio(page)); +} + +int set_folio_extent_mapped(struct folio *folio) +{ struct btrfs_fs_info *fs_info; - ASSERT(page->mapping); + ASSERT(folio->mapping); if (folio_test_private(folio)) return 0; - fs_info = btrfs_sb(page->mapping->host->i_sb); + fs_info = folio_to_fs_info(folio); - if (btrfs_is_subpage(fs_info, page->mapping)) + if (btrfs_is_subpage(fs_info, folio->mapping)) return btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA); folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE); @@ -963,20 +958,21 @@ void clear_page_extent_mapped(struct page *page) if (!folio_test_private(folio)) return; - fs_info = btrfs_sb(page->mapping->host->i_sb); + fs_info = 
page_to_fs_info(page); if (btrfs_is_subpage(fs_info, page->mapping)) return btrfs_detach_subpage(fs_info, folio); folio_detach_private(folio); } -static struct extent_map * -__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, +static struct extent_map *__get_extent_map(struct inode *inode, struct page *page, u64 start, u64 len, struct extent_map **em_cached) { struct extent_map *em; - if (em_cached && *em_cached) { + ASSERT(em_cached); + + if (*em_cached) { em = *em_cached; if (extent_map_in_tree(em) && start >= em->start && start < extent_map_end(em)) { @@ -988,8 +984,8 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, *em_cached = NULL; } - em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len); - if (em_cached && !IS_ERR(em)) { + em = btrfs_get_extent(BTRFS_I(inode), page, start, len); + if (!IS_ERR(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); *em_cached = em; @@ -1007,7 +1003,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start) { struct inode *inode = page->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); u64 start = page_offset(page); const u64 end = start + PAGE_SIZE - 1; u64 cur = start; @@ -1018,7 +1014,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, int ret = 0; size_t pg_offset = 0; size_t iosize; - size_t blocksize = inode->i_sb->s_blocksize; + size_t blocksize = fs_info->sectorsize; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; ret = set_page_extent_mapped(page); @@ -1051,8 +1047,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, end_page_read(page, true, cur, iosize); break; } - em = __get_extent_map(inode, page, pg_offset, cur, - end - cur + 1, em_cached); + em = __get_extent_map(inode, page, cur, end - cur + 1, em_cached); if (IS_ERR(em)) { unlock_extent(tree, cur, end, NULL); end_page_read(page, false, cur, end + 1 - cur); @@ -1157,15 +1152,18 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, int btrfs_read_folio(struct file *file, struct folio *folio) { struct page *page = &folio->page; - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct btrfs_inode *inode = page_to_inode(page); u64 start = page_offset(page); u64 end = start + PAGE_SIZE - 1; struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ }; + struct extent_map *em_cached = NULL; int ret; btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - ret = btrfs_do_readpage(page, NULL, &bio_ctrl, NULL); + ret = btrfs_do_readpage(page, &em_cached, &bio_ctrl, NULL); + free_extent_map(em_cached); + /* * If btrfs_do_readpage() failed we will want to submit the assembled * bio to do the cleanup. 
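/*
 * A sketch of the caller contract the readpage hunks above establish:
 * __get_extent_map() now asserts a non-NULL cache pointer, so every caller,
 * as btrfs_read_folio() does above, owns and frees the cached map. Variable
 * names follow the diff; this is illustrative, not part of the patch.
 */
struct extent_map *em_cached = NULL;
struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ };
int ret;

/*
 * Each lookup may stash a referenced extent map in em_cached so that
 * adjacent sectors of the same extent skip the extent tree search.
 */
ret = btrfs_do_readpage(page, &em_cached, &bio_ctrl, NULL);

/* The caller owns the last cached reference; freeing NULL is a no-op. */
free_extent_map(em_cached);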
@@ -1180,9 +1178,11 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start) { - struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host); + struct btrfs_inode *inode = page_to_inode(pages[0]); int index; + ASSERT(em_cached); + btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); for (index = 0; index < nr_pages; index++) { @@ -1371,7 +1371,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, continue; } - em = btrfs_get_extent(inode, NULL, 0, cur, len); + em = btrfs_get_extent(inode, NULL, cur, len); if (IS_ERR(em)) { ret = PTR_ERR_OR_ZERO(em); goto out_error; @@ -1739,10 +1739,10 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, folio_lock(folio); folio_clear_dirty_for_io(folio); folio_start_writeback(folio); - ret = bio_add_folio(&bbio->bio, folio, folio_size(folio), 0); + ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0); ASSERT(ret); wbc_account_cgroup_owner(wbc, folio_page(folio, 0), - folio_size(folio)); + eb->folio_size); wbc->nr_to_write -= folio_nr_pages(folio); folio_unlock(folio); } @@ -1766,7 +1766,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, */ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) { - struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct btrfs_fs_info *fs_info = page_to_fs_info(page); struct folio *folio = page_folio(page); int submitted = 0; u64 page_start = page_offset(page); @@ -1857,7 +1857,7 @@ static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx) if (!folio_test_private(folio)) return 0; - if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) + if (page_to_fs_info(page)->nodesize < PAGE_SIZE) return submit_eb_subpage(page, wbc); spin_lock(&mapping->i_private_lock); @@ -1915,7 +1915,7 @@ int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc) { struct btrfs_eb_write_context ctx = { .wbc = wbc }; - struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; + struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); int ret = 0; int done = 0; int nr_to_write_done = 0; @@ -2203,7 +2203,7 @@ void extent_write_locked_range(struct inode *inode, struct page *locked_page, bool found_error = false; int ret = 0; struct address_space *mapping = inode->i_mapping; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); const u32 sectorsize = fs_info->sectorsize; loff_t i_size = i_size_read(inode); u64 cur = start; @@ -2309,7 +2309,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree, struct extent_state *cached_state = NULL; u64 start = folio_pos(folio); u64 end = start + folio_size(folio) - 1; - size_t blocksize = folio->mapping->host->i_sb->s_blocksize; + size_t blocksize = folio_to_fs_info(folio)->sectorsize; /* This function is only called for the btree inode */ ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO); @@ -2378,7 +2378,7 @@ int try_release_extent_mapping(struct page *page, gfp_t mask) struct extent_map *em; u64 start = page_offset(page); u64 end = start + PAGE_SIZE - 1; - struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host); + struct btrfs_inode *btrfs_inode = page_to_inode(page); struct extent_io_tree *tree = &btrfs_inode->io_tree; struct extent_map_tree *map = &btrfs_inode->extent_tree; @@ -2453,12 +2453,65 @@ next: return try_release_extent_state(tree, page, mask); } 
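/*
 * The fiemap rework below buffers ready entries in a fixed-size array and
 * asks extent_fiemap() to flush and restart the search when the array fills
 * up. A compact userspace model of just that buffering protocol, with all
 * names hypothetical; the kernel code additionally unlocks the file range
 * and releases the btree path before flushing, which is the point of the
 * change:
 */
struct model_entry {
	unsigned long long offset;
	unsigned long long len;
};

struct model_cache {
	struct model_entry *entries;	/* fixed-size buffer of ready entries */
	int entries_size;		/* capacity */
	int entries_pos;		/* next free slot */
	unsigned long long next_search_offset;
};

/* Stands in for BTRFS_FIEMAP_FLUSH_CACHE: negative, distinct from errnos. */
#define MODEL_CACHE_FULL (-4096)

/*
 * Queue one ready entry. When the buffer is full, record where the caller
 * must resume searching (the end of the last stored entry, so that nothing
 * written into the gap meanwhile is missed) and request a flush.
 */
static int model_emit(struct model_cache *c, unsigned long long offset,
		      unsigned long long len)
{
	if (c->entries_pos == c->entries_size) {
		const struct model_entry *last = &c->entries[c->entries_size - 1];

		c->next_search_offset = last->offset + last->len;
		return MODEL_CACHE_FULL;
	}
	c->entries[c->entries_pos].offset = offset;
	c->entries[c->entries_pos].len = len;
	c->entries_pos++;
	return 0;
}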
+struct btrfs_fiemap_entry { + u64 offset; + u64 phys; + u64 len; + u32 flags; +}; + /* - * To cache previous fiemap extent + * Indicate to the caller of emit_fiemap_extent() that it needs to unlock the + * file range from the inode's io tree, unlock the subvolume tree search path, + * flush the fiemap cache, relock the file range and re-search the subvolume + * tree. The value here is something negative that can't be confused with a + * valid errno value and is different from 1 because that's also a return + * value from fiemap_fill_next_extent() and also it's often used to mean some + * btree search did not find a key, so make it some distinct negative value. + */ +#define BTRFS_FIEMAP_FLUSH_CACHE (-(MAX_ERRNO + 1)) + +/* + * Used to: + * + * - Cache the next entry to be emitted to the fiemap buffer, so that we can + * merge extents that are contiguous and can be grouped as a single one; * - * Will be used for merging fiemap extent + * - Store extents ready to be written to the fiemap buffer in an intermediary + * buffer. This intermediary buffer is to ensure that in case the fiemap + * buffer is memory mapped to the fiemap target file, we don't deadlock + * during btrfs_page_mkwrite(). This is because during fiemap we are locking + * an extent range in order to prevent races with delalloc flushing and + * ordered extent completion, which is needed in order to reliably detect + * delalloc in holes and prealloc extents. And this can lead to a deadlock + * if the fiemap buffer is memory mapped to the file we are running fiemap + * against (a silly, useless in practice scenario, but possible) because + * btrfs_page_mkwrite() will try to lock the same extent range. */ struct fiemap_cache { + /* An array of ready fiemap entries. */ + struct btrfs_fiemap_entry *entries; + /* Number of entries in the entries array. */ + int entries_size; + /* Index of the next entry in the entries array to write to. */ + int entries_pos; + /* + * Once the entries array is full, this indicates what's the offset for + * the next file extent item we must search for in the inode's subvolume + * tree after unlocking the extent range in the inode's io tree and + * releasing the search path. + */ + u64 next_search_offset; + /* + * This matches struct fiemap_extent_info::fi_mapped_extents: we use it + * to count the extents we emit ourselves and to stop instead of relying + * on fiemap_fill_next_extent(), because we buffer ready fiemap entries + * in the @entries array, and we want to stop as soon as we hit the max + * amount of extents to map, not just to save time but also to make the + * logic at extent_fiemap() simpler. + */ + unsigned int extents_mapped; + /* Fields for the cached extent (unsubmitted, not yet ready). */ u64 offset; u64 phys; u64 len; @@ -2466,6 +2519,28 @@ struct fiemap_cache { bool cached; }; +static int flush_fiemap_cache(struct fiemap_extent_info *fieinfo, + struct fiemap_cache *cache) +{ + for (int i = 0; i < cache->entries_pos; i++) { + struct btrfs_fiemap_entry *entry = &cache->entries[i]; + int ret; + + ret = fiemap_fill_next_extent(fieinfo, entry->offset, + entry->phys, entry->len, + entry->flags); + /* + * Ignore 1 (reached max entries) because we keep track of that + * ourselves in emit_fiemap_extent(). + */ + if (ret < 0) + return ret; + } + cache->entries_pos = 0; + + return 0; +} + /* * Helper to submit fiemap extent.
* @@ -2480,7 +2555,8 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, struct fiemap_cache *cache, u64 offset, u64 phys, u64 len, u32 flags) { - int ret = 0; + struct btrfs_fiemap_entry *entry; + u64 cache_end; /* Set at the end of extent_fiemap(). */ ASSERT((flags & FIEMAP_EXTENT_LAST) == 0); @@ -2489,15 +2565,104 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, goto assign; /* - * Sanity check, extent_fiemap() should have ensured that new - * fiemap extent won't overlap with cached one. - * Not recoverable. + * When iterating the extents of the inode, at extent_fiemap(), we may + * find an extent that starts at an offset behind the end offset of the + * previous extent we processed. This happens if fiemap is called + * without FIEMAP_FLAG_SYNC and there are ordered extents completing + * after we had to unlock the file range, release the search path, emit + * the fiemap extents stored in the buffer (cache->entries array) and + * then lock the remainder of the range and re-search the btree. + * + * For example, we are in leaf X processing its last item, which is the + * file extent item for file range [512K, 1M[, and after + * btrfs_next_leaf() releases the path, there's an ordered extent that + * completes for the file range [768K, 2M[, and that results in trimming + * the file extent item so that it now corresponds to the file range + * [512K, 768K[ and a new file extent item is inserted for the file + * range [768K, 2M[, which may end up as the last item of leaf X or as + * the first item of the next leaf - in either case btrfs_next_leaf() + * will leave us with a path pointing to the new extent item, for the + * file range [768K, 2M[, since that's the first key that follows the + * last one we processed. So in order not to report overlapping extents + * to user space, we trim the length of the previously cached extent and + * emit it. - * NOTE: Physical address can overlap, due to compression + * Upon calling btrfs_next_leaf() we may also find an extent with an + * offset smaller than or equal to cache->offset, and this happens + * when we had a hole or prealloc extent with several delalloc ranges in + * it, but after btrfs_next_leaf() released the path, delalloc was + * flushed and the resulting ordered extents were completed, so we can + * now have found a file extent item for an offset that is smaller than + * or equal to what we have in cache->offset. We deal with this as + * described below. */ - if (cache->offset + cache->len > offset) { - WARN_ON(1); - return -EINVAL; + cache_end = cache->offset + cache->len; + if (cache_end > offset) { + if (offset == cache->offset) { + /* + * We cached a delalloc range (found in the io tree) for + * a hole or prealloc extent and we have now found a + * file extent item for the same offset. What we have + * now is more recent and up to date, so discard what + * we had in the cache and use what we have just found. + */ + goto assign; + } else if (offset > cache->offset) { + /* + * The extent range we previously found ends after the + * offset of the file extent item we found and that + * offset falls somewhere in the middle of that previous + * extent range. So adjust the range we previously found + * to end at the offset of the file extent item we have + * just found, since this extent is more up to date. + * Emit that adjusted range and cache the file extent + * item we have just found.
This corresponds to the case + * where a previously found file extent item was split + * due to an ordered extent completing. + */ + cache->len = offset - cache->offset; + goto emit; + } else { + const u64 range_end = offset + len; + + /* + * The offset of the file extent item we have just found + * is behind the cached offset. This means we were + * processing a hole or prealloc extent for which we + * have found delalloc ranges (in the io tree), so what + * we have in the cache is the last delalloc range we + * found while the file extent item we found can be + * either for a whole delalloc range we previously + * emitted or only a part of that range. + * + * We have two cases here: + * + * 1) The file extent item's range ends at or behind the + * cached extent's end. In this case just ignore the + * current file extent item because we don't want to + * overlap with previous ranges that may have been + * emitted already; + * + * 2) The file extent item starts behind the currently + * cached extent but its end offset goes beyond the + * end offset of the cached extent. We don't want to + * overlap with a previous range that may have been + * emitted already, so we emit the currently cached + * extent and then partially store the current file + * extent item's range in the cache, for the subrange + * going from the cached extent's end to the end of the + * file extent item. + */ + if (range_end <= cache_end) + return 0; + + if (!(flags & (FIEMAP_EXTENT_ENCODED | FIEMAP_EXTENT_DELALLOC))) + phys += cache_end - offset; + + offset = cache_end; + len = range_end - cache_end; + goto emit; + } } /* @@ -2517,12 +2682,37 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, return 0; } +emit: /* Not mergeable, need to submit cached one */ + + if (cache->entries_pos == cache->entries_size) { + /* + * We will need to re-search from the end offset of the last + * stored extent and not from the current offset, because after + * unlocking the range and releasing the path, if there's a hole + * between that end offset and this current offset, a new extent + * may have been inserted due to a new write, so we don't want + * to miss it. + */ + entry = &cache->entries[cache->entries_size - 1]; + cache->next_search_offset = entry->offset + entry->len; + cache->cached = false; + + return BTRFS_FIEMAP_FLUSH_CACHE; + } + + entry = &cache->entries[cache->entries_pos]; + entry->offset = cache->offset; + entry->phys = cache->phys; + entry->len = cache->len; + entry->flags = cache->flags; + cache->entries_pos++; + cache->extents_mapped++; + + if (cache->extents_mapped == fieinfo->fi_extents_max) { + cache->cached = false; + return 1; + } assign: cache->cached = true; cache->offset = offset; @@ -2562,7 +2752,7 @@ static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo, static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path) { - struct extent_buffer *clone; + struct extent_buffer *clone = path->nodes[0]; struct btrfs_key key; int slot; int ret; @@ -2571,29 +2761,45 @@ static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *p if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) return 0; + /* + * Add a temporary extra ref to an already cloned extent buffer to + * prevent btrfs_next_leaf() from freeing it; we want to reuse it to + * avoid the cost of allocating a new one.
+ */ + ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags)); + atomic_inc(&clone->refs); + ret = btrfs_next_leaf(inode->root, path); if (ret != 0) - return ret; + goto out; /* * Don't bother with cloning if there are no more file extent items for * our inode. */ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) - return 1; + if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) { + ret = 1; + goto out; + } /* See the comment at fiemap_search_slot() about why we clone. */ - clone = btrfs_clone_extent_buffer(path->nodes[0]); - if (!clone) - return -ENOMEM; + copy_extent_buffer_full(clone, path->nodes[0]); + /* + * Important to preserve the start field, for the optimizations when + * checking if extents are shared (see extent_fiemap()). + */ + clone->start = path->nodes[0]->start; slot = path->slots[0]; btrfs_release_path(path); path->nodes[0] = clone; path->slots[0] = slot; +out: + if (ret) + free_extent_buffer(clone); - return 0; + return ret; } /* @@ -2648,8 +2854,8 @@ static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path * neighbour leaf). * We also need the private clone because holding a read lock on an * extent buffer of the subvolume's b+tree will make lockdep unhappy - * when we call fiemap_fill_next_extent(), because that may cause a page - * fault when filling the user space buffer with fiemap data. + * when we check if extents are shared, as backref walking may need to + * lock the same leaf we are processing. */ clone = btrfs_clone_extent_buffer(path->nodes[0]); if (!clone) @@ -2873,24 +3079,29 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, struct btrfs_backref_share_check_ctx *backref_ctx; u64 last_extent_end; u64 prev_extent_end; - u64 lockstart; - u64 lockend; + u64 range_start; + u64 range_end; + const u64 sectorsize = inode->root->fs_info->sectorsize; bool stopped = false; int ret; + cache.entries_size = PAGE_SIZE / sizeof(struct btrfs_fiemap_entry); + cache.entries = kmalloc_array(cache.entries_size, + sizeof(struct btrfs_fiemap_entry), + GFP_KERNEL); backref_ctx = btrfs_alloc_backref_share_check_ctx(); path = btrfs_alloc_path(); - if (!backref_ctx || !path) { + if (!cache.entries || !backref_ctx || !path) { ret = -ENOMEM; goto out; } - lockstart = round_down(start, inode->root->fs_info->sectorsize); - lockend = round_up(start + len, inode->root->fs_info->sectorsize); - prev_extent_end = lockstart; +restart: + range_start = round_down(start, sectorsize); + range_end = round_up(start + len, sectorsize); + prev_extent_end = range_start; - btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); - lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + lock_extent(&inode->io_tree, range_start, range_end, &cached_state); ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end); if (ret < 0) @@ -2898,7 +3109,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, btrfs_release_path(path); path->reada = READA_FORWARD; - ret = fiemap_search_slot(inode, path, lockstart); + ret = fiemap_search_slot(inode, path, range_start); if (ret < 0) { goto out_unlock; } else if (ret > 0) { @@ -2910,7 +3121,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, goto check_eof_delalloc; } - while (prev_extent_end < lockend) { + while (prev_extent_end < range_end) { struct extent_buffer *leaf = path->nodes[0]; struct btrfs_file_extent_item *ei; struct 
btrfs_key key; @@ -2933,19 +3144,19 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, * The first iteration can leave us at an extent item that ends * before our range's start. Move to the next item. */ - if (extent_end <= lockstart) + if (extent_end <= range_start) goto next_item; backref_ctx->curr_leaf_bytenr = leaf->start; /* We have an implicit hole (NO_HOLES feature enabled). */ if (prev_extent_end < key.offset) { - const u64 range_end = min(key.offset, lockend) - 1; + const u64 hole_end = min(key.offset, range_end) - 1; ret = fiemap_process_hole(inode, fieinfo, &cache, &delalloc_cached_state, backref_ctx, 0, 0, 0, - prev_extent_end, range_end); + prev_extent_end, hole_end); if (ret < 0) { goto out_unlock; } else if (ret > 0) { @@ -2955,7 +3166,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, /* We've reached the end of the fiemap range, stop. */ - if (key.offset >= lockend) { + if (key.offset >= range_end) { stopped = true; break; } @@ -3016,7 +3227,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, if (ret < 0) { goto out_unlock; } else if (ret > 0) { - /* emit_fiemap_extent() told us to stop. */ stopped = true; break; } @@ -3039,23 +3250,13 @@ next_item: } check_eof_delalloc: - /* - * Release (and free) the path before emitting any final entries to - * fiemap_fill_next_extent() to keep lockdep happy. This is because - * once we find no more file extent items exist, we may have a - * non-cloned leaf, and fiemap_fill_next_extent() can trigger page - * faults when copying data to the user space buffer. - */ - btrfs_free_path(path); - path = NULL; - - if (!stopped && prev_extent_end < lockend) { + if (!stopped && prev_extent_end < range_end) { ret = fiemap_process_hole(inode, fieinfo, &cache, &delalloc_cached_state, backref_ctx, - 0, 0, 0, prev_extent_end, lockend - 1); + 0, 0, 0, prev_extent_end, range_end - 1); if (ret < 0) goto out_unlock; - prev_extent_end = lockend; + prev_extent_end = range_end; } if (cache.cached && cache.offset + cache.len >= last_extent_end) { @@ -3079,13 +3280,39 @@ check_eof_delalloc: } } - ret = emit_last_fiemap_cache(fieinfo, &cache); - out_unlock: - unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); - btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + unlock_extent(&inode->io_tree, range_start, range_end, &cached_state); + + if (ret == BTRFS_FIEMAP_FLUSH_CACHE) { + btrfs_release_path(path); + ret = flush_fiemap_cache(fieinfo, &cache); + if (ret) + goto out; + len -= cache.next_search_offset - start; + start = cache.next_search_offset; + goto restart; + } else if (ret < 0) { + goto out; + } + + /* + * Must free the path before emitting to the fiemap buffer because we + * may have a non-cloned leaf and if the fiemap buffer is memory mapped + * to a file, a write into it (through btrfs_page_mkwrite()) may trigger + * waiting for an ordered extent that, in order to complete, needs to + * modify that leaf, therefore leading to a deadlock. + */ + btrfs_free_path(path); + path = NULL; + + ret = flush_fiemap_cache(fieinfo, &cache); + if (ret) + goto out; + + ret = emit_last_fiemap_cache(fieinfo, &cache); out: free_extent_state(delalloc_cached_state); + kfree(cache.entries); btrfs_free_backref_share_ctx(backref_ctx); btrfs_free_path(path); return ret; @@ -3534,7 +3761,7 @@ retry: /* For now, we should only have single-page folios for the btree inode.
*/ ASSERT(folio_nr_pages(existing_folio) == 1); - if (folio_size(existing_folio) != folio_size(eb->folios[0])) { + if (folio_size(existing_folio) != eb->folio_size) { folio_unlock(existing_folio); folio_put(existing_folio); return -EAGAIN; @@ -3677,6 +3904,8 @@ reallocate: * and free the allocated page. */ folio = eb->folios[i]; + eb->folio_size = folio_size(folio); + eb->folio_shift = folio_shift(folio); spin_lock(&mapping->i_private_lock); /* Should not fail, as we have preallocated the memory */ ret = attach_extent_buffer_folio(eb, folio, prealloc); @@ -4126,7 +4355,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, for (int i = 0; i < num_folios; i++) { struct folio *folio = eb->folios[i]; - ret = bio_add_folio(&bbio->bio, folio, folio_size(folio), 0); + ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0); ASSERT(ret); } } @@ -4146,7 +4375,7 @@ static bool report_eb_range(const struct extent_buffer *eb, unsigned long start, unsigned long len) { btrfs_warn(eb->fs_info, - "access to eb bytenr %llu len %lu out of range start %lu len %lu", + "access to eb bytenr %llu len %u out of range start %lu len %lu", eb->start, eb->len, start, len); WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); @@ -4175,7 +4404,7 @@ static inline int check_eb_range(const struct extent_buffer *eb, void read_extent_buffer(const struct extent_buffer *eb, void *dstv, unsigned long start, unsigned long len) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; size_t cur; size_t offset; char *dst = (char *)dstv; @@ -4215,7 +4444,7 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, void __user *dstv, unsigned long start, unsigned long len) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; size_t cur; size_t offset; char __user *dst = (char __user *)dstv; @@ -4255,7 +4484,7 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, unsigned long start, unsigned long len) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; size_t cur; size_t offset; char *kaddr; @@ -4326,7 +4555,7 @@ static void __write_extent_buffer(const struct extent_buffer *eb, const void *srcv, unsigned long start, unsigned long len, bool use_memmove) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; size_t cur; size_t offset; char *kaddr; @@ -4375,7 +4604,7 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, static void memset_extent_buffer(const struct extent_buffer *eb, int c, unsigned long start, unsigned long len) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; unsigned long cur = start; if (eb->addr) { @@ -4406,7 +4635,7 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, void copy_extent_buffer_full(const struct extent_buffer *dst, const struct extent_buffer *src) { - const int unit_size = folio_size(src->folios[0]); + const int unit_size = src->folio_size; unsigned long cur = 0; ASSERT(dst->len == src->len); @@ -4428,7 +4657,7 @@ void copy_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - const int unit_size = folio_size(dst->folios[0]); + const int unit_size = dst->folio_size; u64 dst_len = dst->len; size_t cur; size_t offset; @@ -4484,10 +4713,10 @@ static inline 
void eb_bitmap_offset(const struct extent_buffer *eb, * the bitmap item in the extent buffer + the offset of the byte in the * bitmap item. */ - offset = start + offset_in_folio(eb->folios[0], eb->start) + byte_offset; + offset = start + offset_in_eb_folio(eb, eb->start) + byte_offset; - *folio_index = offset >> folio_shift(eb->folios[0]); - *folio_offset = offset_in_folio(eb->folios[0], offset); + *folio_index = offset >> eb->folio_shift; + *folio_offset = offset_in_eb_folio(eb, offset); } /* @@ -4601,7 +4830,7 @@ void memcpy_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - const int unit_size = folio_size(dst->folios[0]); + const int unit_size = dst->folio_size; unsigned long cur_off = 0; if (check_eb_range(dst, dst_offset, len) || @@ -4725,7 +4954,7 @@ out: static int try_release_subpage_extent_buffer(struct page *page) { - struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct btrfs_fs_info *fs_info = page_to_fs_info(page); u64 cur = page_offset(page); const u64 end = page_offset(page) + PAGE_SIZE; int ret; @@ -4798,7 +5027,7 @@ int try_release_extent_buffer(struct page *page) struct folio *folio = page_folio(page); struct extent_buffer *eb; - if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) + if (page_to_fs_info(page)->nodesize < PAGE_SIZE) return try_release_subpage_extent_buffer(page); /* diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 46050500529b..e3530d427e1f 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -7,11 +7,32 @@ #include <linux/refcount.h> #include <linux/fiemap.h> #include <linux/btrfs_tree.h> +#include <linux/spinlock.h> +#include <linux/atomic.h> +#include <linux/rwsem.h> +#include <linux/list.h> +#include <linux/slab.h> #include "compression.h" +#include "messages.h" #include "ulist.h" #include "misc.h" +struct page; +struct file; +struct folio; +struct inode; +struct fiemap_extent_info; +struct readahead_control; +struct address_space; +struct writeback_control; +struct extent_io_tree; +struct extent_map_tree; +struct btrfs_block_group; +struct btrfs_fs_info; +struct btrfs_inode; +struct btrfs_root; struct btrfs_trans_handle; +struct btrfs_tree_parent_check; enum { EXTENT_BUFFER_UPTODATE, @@ -63,11 +84,6 @@ enum { #define BITMAP_LAST_BYTE_MASK(nbits) \ (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) -struct btrfs_root; -struct btrfs_inode; -struct btrfs_fs_info; -struct extent_io_tree; -struct btrfs_tree_parent_check; int __init extent_buffer_init_cachep(void); void __cold extent_buffer_free_cachep(void); @@ -75,7 +91,8 @@ void __cold extent_buffer_free_cachep(void); #define INLINE_EXTENT_BUFFER_PAGES (BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE) struct extent_buffer { u64 start; - unsigned long len; + u32 len; + u32 folio_size; unsigned long bflags; struct btrfs_fs_info *fs_info; @@ -90,6 +107,7 @@ struct extent_buffer { int read_mirror; /* >= 0 if eb belongs to a log tree, -1 otherwise */ s8 log_index; + u8 folio_shift; struct rcu_head rcu_head; struct rw_semaphore lock; @@ -113,6 +131,13 @@ struct btrfs_eb_write_context { struct btrfs_block_group *zoned_bg; }; +static inline unsigned long offset_in_eb_folio(const struct extent_buffer *eb, + u64 start) +{ + ASSERT(eb->folio_size); + return start & (eb->folio_size - 1); +} + /* * Get the correct offset inside the page of extent buffer. 
* @@ -151,13 +176,13 @@ static inline unsigned long get_eb_folio_index(const struct extent_buffer *eb, * the folio_shift would be large enough to always make us * return 0 as index. * 1.2) Several page sized folios - * The folio_shift() would be PAGE_SHIFT, giving us the correct + * The folio_shift would be PAGE_SHIFT, giving us the correct * index. * * 2) sectorsize < PAGE_SIZE and nodesize < PAGE_SIZE case * The folio would only be page sized, and always give us 0 as index. */ - return offset >> folio_shift(eb->folios[0]); + return offset >> eb->folio_shift; } /* @@ -205,8 +230,6 @@ static inline void extent_changeset_free(struct extent_changeset *changeset) kfree(changeset); } -struct extent_map_tree; - int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); @@ -221,6 +244,7 @@ int btree_write_cache_pages(struct address_space *mapping, void extent_readahead(struct readahead_control *rac); int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); +int set_folio_extent_mapped(struct folio *folio); int set_page_extent_mapped(struct page *page); void clear_page_extent_mapped(struct page *page); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index b61099bf97a8..347ca13d15a9 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -5,7 +5,6 @@ #include <linux/spinlock.h> #include "messages.h" #include "ctree.h" -#include "volumes.h" #include "extent_map.h" #include "compression.h" #include "btrfs_inode.h" @@ -16,8 +15,7 @@ static struct kmem_cache *extent_map_cache; int __init extent_map_init(void) { extent_map_cache = kmem_cache_create("btrfs_extent_map", - sizeof(struct extent_map), 0, - SLAB_MEM_SPREAD, NULL); + sizeof(struct extent_map), 0, 0, NULL); if (!extent_map_cache) return -ENOMEM; return 0; @@ -291,6 +289,10 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) * Called after an extent has been written to disk properly. Set the generation * to the generation that actually added the file item to the inode so we know * we need to sync this extent when we call fsync(). 
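A note on the eb->folio_size / eb->folio_shift caching introduced above: folio sizes are powers of two, so the in-folio offset is a mask and the folio index is a shift, which is exactly what offset_in_eb_folio() and get_eb_folio_index() now compute without dereferencing eb->folios[0]. A minimal userspace model of that arithmetic (the 16K folio size and the offsets are illustrative, not taken from the patch):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t folio_size = 16384;	/* assumed example folio size */
	const unsigned int folio_shift = __builtin_ctzll(folio_size);
	const uint64_t start = 3 * folio_size + 4096;	/* arbitrary offset */

	/* offset_in_eb_folio(): mask with (folio_size - 1) */
	uint64_t in_folio = start & (folio_size - 1);
	/* get_eb_folio_index(): shift by folio_shift */
	uint64_t index = start >> folio_shift;

	assert(in_folio == start % folio_size);
	assert(index == start / folio_size);
	printf("index=%llu in_folio=%llu\n",
	       (unsigned long long)index, (unsigned long long)in_folio);
	return 0;
}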
+ * + * Returns: 0 on success + * -ENOENT when the extent is not found in the tree + * -EUCLEAN if the found extent does not match the expected start */ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) { @@ -308,14 +310,18 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) "no extent map found for inode %llu (root %lld) when unpinning extent range [%llu, %llu), generation %llu", btrfs_ino(inode), btrfs_root_id(inode->root), start, len, gen); + ret = -ENOENT; goto out; } - if (WARN_ON(em->start != start)) + if (WARN_ON(em->start != start)) { btrfs_warn(fs_info, "found extent map for inode %llu (root %lld) with unexpected start offset %llu when unpinning extent range [%llu, %llu), generation %llu", btrfs_ino(inode), btrfs_root_id(inode->root), em->start, start, len, gen); + ret = -EUCLEAN; + goto out; + } em->generation = gen; em->flags &= ~EXTENT_FLAG_PINNED; @@ -531,7 +537,8 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, u64 end; u64 start_diff; - BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); + if (map_start < em->start || map_start >= extent_map_end(em)) + return -EINVAL; if (existing->start > map_start) { next = existing; @@ -626,9 +633,9 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, free_extent_map(em); *em_in = NULL; WARN_ONCE(ret, -"unexpected error %d: merge existing(start %llu len %llu) with em(start %llu len %llu)\n", - ret, existing->start, existing->len, - orig_start, orig_len); +"extent map merge error existing [%llu, %llu) with em [%llu, %llu) start %llu\n", + existing->start, extent_map_end(existing), + orig_start, orig_start + orig_len, start); } free_extent_map(existing); } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index e380fc08bbe4..c5a098c99cc6 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -3,10 +3,17 @@ #ifndef BTRFS_EXTENT_MAP_H #define BTRFS_EXTENT_MAP_H +#include <linux/compiler_types.h> +#include <linux/rwlock_types.h> #include <linux/rbtree.h> +#include <linux/list.h> #include <linux/refcount.h> +#include "misc.h" #include "compression.h" +struct btrfs_inode; +struct btrfs_fs_info; + #define EXTENT_MAP_LAST_BYTE ((u64)-4) #define EXTENT_MAP_HOLE ((u64)-3) #define EXTENT_MAP_INLINE ((u64)-2) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 81ac1d474bf1..e58fb5347e65 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -10,17 +10,14 @@ #include <linux/sched/mm.h> #include <crypto/hash.h> #include "messages.h" -#include "misc.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "bio.h" -#include "print-tree.h" #include "compression.h" #include "fs.h" #include "accessors.h" #include "file-item.h" -#include "super.h" #define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) * 2) / \ @@ -179,7 +176,6 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, sizeof(*item)); if (ret < 0) goto out; - BUG_ON(ret); /* Can't happen */ leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -1229,8 +1225,6 @@ insert: ins_size); if (ret < 0) goto out; - if (WARN_ON(ret != 0)) - goto out; leaf = path->nodes[0]; csum: item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 04bd2d34efb1..15c05cc0fce6 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -3,8 +3,20 @@ #ifndef
BTRFS_FILE_ITEM_H #define BTRFS_FILE_ITEM_H +#include <linux/list.h> +#include <uapi/linux/btrfs_tree.h> #include "accessors.h" +struct extent_map; +struct btrfs_file_extent_item; +struct btrfs_fs_info; +struct btrfs_path; +struct btrfs_bio; +struct btrfs_trans_handle; +struct btrfs_root; +struct btrfs_ordered_sum; +struct btrfs_inode; + #define BTRFS_FILE_EXTENT_INLINE_DATA_START \ (offsetof(struct btrfs_file_extent_item, disk_bytenr)) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 38dfcac47609..f9d76072398d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -22,10 +22,8 @@ #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "print-tree.h" #include "tree-log.h" #include "locking.h" -#include "volumes.h" #include "qgroup.h" #include "compression.h" #include "delalloc-space.h" @@ -1137,7 +1135,7 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); loff_t pos = iocb->ki_pos; int ret; loff_t oldsize; @@ -1185,7 +1183,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, struct file *file = iocb->ki_filp; loff_t pos; struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct page **pages = NULL; struct extent_changeset *data_reserved = NULL; u64 release_bytes = 0; @@ -1461,7 +1459,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); loff_t pos; ssize_t written = 0; ssize_t written_buffered; @@ -1787,7 +1785,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { struct dentry *dentry = file_dentry(file); struct inode *inode = d_inode(dentry); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; struct btrfs_log_ctx ctx; @@ -1912,6 +1910,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out_release_extents; } + btrfs_init_log_ctx_scratch_eb(&ctx); + /* * We use start here because we will need to wait on the IO to complete * in btrfs_sync_log, which could require joining a transaction (for @@ -1931,6 +1931,15 @@ trans->in_fsync = true; ret = btrfs_log_dentry_safe(trans, dentry, &ctx); + /* + * Scratch eb no longer needed, release it before syncing the log or + * committing the transaction, to avoid holding unnecessary memory + * during such long operations. + */ + if (ctx.scratch_eb) { + free_extent_buffer(ctx.scratch_eb); + ctx.scratch_eb = NULL; + } btrfs_release_log_ctx_extents(&ctx); if (ret < 0) { /* Fallthrough and commit/free transaction.
*/ @@ -2006,6 +2015,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = btrfs_commit_transaction(trans); out: + free_extent_buffer(ctx.scratch_eb); ASSERT(list_empty(&ctx.list)); ASSERT(list_empty(&ctx.conflict_inodes)); err = file_check_and_advance_wb_err(file); @@ -2176,7 +2186,7 @@ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len) struct extent_map *em; int ret = 0; - em = btrfs_get_extent(inode, NULL, 0, + em = btrfs_get_extent(inode, NULL, round_down(*start, fs_info->sectorsize), round_up(*len, fs_info->sectorsize)); if (IS_ERR(em)) @@ -2593,7 +2603,7 @@ out: static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_state *cached_state = NULL; struct btrfs_path *path; @@ -2835,7 +2845,7 @@ static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode, int ret; offset = round_down(offset, sectorsize); - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize); + em = btrfs_get_extent(inode, NULL, offset, sectorsize); if (IS_ERR(em)) return PTR_ERR(em); @@ -2866,7 +2876,7 @@ static int btrfs_zero_range(struct inode *inode, u64 bytes_to_reserve = 0; bool space_reserved = false; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start, + em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, alloc_end - alloc_start); if (IS_ERR(em)) { ret = PTR_ERR(em); @@ -2909,8 +2919,7 @@ static int btrfs_zero_range(struct inode *inode, if (BTRFS_BYTES_TO_BLKS(fs_info, offset) == BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) { - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start, - sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, sectorsize); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; @@ -3005,7 +3014,7 @@ reserve_space: } ret = btrfs_prealloc_file_range(inode, mode, alloc_start, alloc_end - alloc_start, - i_blocksize(inode), + fs_info->sectorsize, offset + len, &alloc_hint); unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); @@ -3049,7 +3058,7 @@ static long btrfs_fallocate(struct file *file, int mode, int ret; /* Do not allow fallocate in ZONED mode */ - if (btrfs_is_zoned(btrfs_sb(inode->i_sb))) + if (btrfs_is_zoned(inode_to_fs_info(inode))) return -EOPNOTSUPP; alloc_start = round_down(offset, blocksize); @@ -3126,7 +3135,7 @@ static long btrfs_fallocate(struct file *file, int mode, /* First, check if we exceed the qgroup limit */ while (cur_offset < alloc_end) { - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, + em = btrfs_get_extent(BTRFS_I(inode), NULL, cur_offset, alloc_end - cur_offset); if (IS_ERR(em)) { ret = PTR_ERR(em); @@ -3177,7 +3186,7 @@ static long btrfs_fallocate(struct file *file, int mode, if (!ret) { ret = btrfs_prealloc_file_range(inode, mode, range->start, - range->len, i_blocksize(inode), + range->len, blocksize, offset + len, &alloc_hint); /* * btrfs_prealloc_file_range() releases space even @@ -3754,7 +3763,7 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) if (fsverity_active(inode)) return 0; - if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos)) + if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos)) return 0; btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h index 
82b34fbb295f..77aaca208c7b 100644 --- a/fs/btrfs/file.h +++ b/fs/btrfs/file.h @@ -3,6 +3,21 @@ #ifndef BTRFS_FILE_H #define BTRFS_FILE_H +#include <linux/types.h> + +struct file; +struct extent_state; +struct kiocb; +struct iov_iter; +struct page; +struct btrfs_ioctl_encoded_io_args; +struct btrfs_drop_extents_args; +struct btrfs_inode; +struct btrfs_root; +struct btrfs_path; +struct btrfs_replace_extent_info; +struct btrfs_trans_handle; + extern const struct file_operations btrfs_file_operations; int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index d372c7ce0e6b..c8a05d5eb9cb 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -19,9 +19,7 @@ #include "transaction.h" #include "disk-io.h" #include "extent_io.h" -#include "volumes.h" #include "space-info.h" -#include "delalloc-space.h" #include "block-group.h" #include "discard.h" #include "subpage.h" @@ -399,7 +397,7 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, return -ENOMEM; io_ctl->num_pages = num_pages; - io_ctl->fs_info = btrfs_sb(inode->i_sb); + io_ctl->fs_info = inode_to_fs_info(inode); io_ctl->inode = inode; return 0; @@ -2621,7 +2619,7 @@ static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl, } } -int __btrfs_add_free_space(struct btrfs_block_group *block_group, +static int __btrfs_add_free_space(struct btrfs_block_group *block_group, u64 offset, u64 bytes, enum btrfs_trim_state trim_state) { @@ -4156,15 +4154,13 @@ out: int __init btrfs_free_space_init(void) { - btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", - sizeof(struct btrfs_free_space), 0, - SLAB_MEM_SPREAD, NULL); + btrfs_free_space_cachep = KMEM_CACHE(btrfs_free_space, 0); if (!btrfs_free_space_cachep) return -ENOMEM; btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", PAGE_SIZE, PAGE_SIZE, - SLAB_MEM_SPREAD, NULL); + 0, NULL); if (!btrfs_free_space_bitmap_cachep) { kmem_cache_destroy(btrfs_free_space_cachep); return -ENOMEM; diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 33b4da3271b1..83774bfd7b3b 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -6,6 +6,19 @@ #ifndef BTRFS_FREE_SPACE_CACHE_H #define BTRFS_FREE_SPACE_CACHE_H +#include <linux/rbtree.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/mutex.h> +#include "fs.h" + +struct inode; +struct page; +struct btrfs_fs_info; +struct btrfs_path; +struct btrfs_trans_handle; +struct btrfs_trim_block_group; + /* * This is the trim state of an extent or bitmap. 
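The cache-creation hunk above converts an open-coded kmem_cache_create() into KMEM_CACHE(). Assuming the classic definition from include/linux/slab.h, the two spellings differ only in that KMEM_CACHE() derives the cache name from the struct tag and passes the struct's natural alignment where the old call passed 0; a sketch:

/* KMEM_CACHE() as defined in include/linux/slab.h (abridged): */
#define KMEM_CACHE(__struct, __flags)					\
	kmem_cache_create(#__struct, sizeof(struct __struct),		\
			  __alignof__(struct __struct), (__flags), NULL)

/* So the converted call site */
btrfs_free_space_cachep = KMEM_CACHE(btrfs_free_space, 0);

/* expands to roughly: */
btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
				sizeof(struct btrfs_free_space),
				__alignof__(struct btrfs_free_space),
				0, NULL);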
* @@ -114,8 +127,6 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans, void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, struct btrfs_free_space_ctl *ctl); -int __btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, - u64 size, enum btrfs_trim_state trim_state); int btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size); int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 7b598b070700..90f2938bd743 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1176,12 +1176,16 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) BTRFS_FREE_SPACE_TREE_OBJECTID); if (IS_ERR(free_space_root)) { ret = PTR_ERR(free_space_root); - goto abort; + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + goto out_clear; } ret = btrfs_global_root_insert(free_space_root); if (ret) { btrfs_put_root(free_space_root); - goto abort; + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + goto out_clear; } node = rb_first_cached(&fs_info->block_group_cache_tree); @@ -1189,8 +1193,11 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) block_group = rb_entry(node, struct btrfs_block_group, cache_node); ret = populate_free_space_tree(trans, block_group); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + goto out_clear; + } node = rb_next(node); } @@ -1206,11 +1213,9 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); return ret; -abort: +out_clear: clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); return ret; } @@ -1273,12 +1278,18 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info) btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); ret = clear_free_space_tree(trans, free_space_root); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } ret = btrfs_del_root(trans, &free_space_root->root_key); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } btrfs_global_root_delete(free_space_root); @@ -1295,11 +1306,6 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info) btrfs_put_root(free_space_root); return btrfs_commit_transaction(trans); - -abort: - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - return ret; } int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) @@ -1322,8 +1328,11 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); ret = clear_free_space_tree(trans, free_space_root); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } node = rb_first_cached(&fs_info->block_group_cache_tree); while (node) { @@ -1332,8 +1341,11 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) block_group = rb_entry(node, struct btrfs_block_group, cache_node); ret = populate_free_space_tree(trans, block_group); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return 
ret; + } node = rb_next(node); } @@ -1344,10 +1356,6 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) ret = btrfs_commit_transaction(trans); clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); return ret; -abort: - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - return ret; } static int __add_block_group_free_space(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h index 6d5551d0ced8..e6c6d6f4f221 100644 --- a/fs/btrfs/free-space-tree.h +++ b/fs/btrfs/free-space-tree.h @@ -6,7 +6,13 @@ #ifndef BTRFS_FREE_SPACE_TREE_H #define BTRFS_FREE_SPACE_TREE_H +#include <linux/bits.h> + struct btrfs_caching_control; +struct btrfs_fs_info; +struct btrfs_path; +struct btrfs_block_group; +struct btrfs_trans_handle; /* * The default size for new free space bitmap items. The last bitmap in a block diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index f8bb73d6ab68..93f5c57ea4e3 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -4,13 +4,48 @@ #define BTRFS_FS_H #include <linux/blkdev.h> -#include <linux/fs.h> -#include <linux/btrfs_tree.h> #include <linux/sizes.h> +#include <linux/time64.h> +#include <linux/compiler.h> +#include <linux/math.h> +#include <linux/atomic.h> +#include <linux/percpu_counter.h> +#include <linux/completion.h> +#include <linux/lockdep.h> +#include <linux/spinlock.h> +#include <linux/mutex.h> +#include <linux/rwlock_types.h> +#include <linux/rwsem.h> +#include <linux/semaphore.h> +#include <linux/list.h> +#include <linux/radix-tree.h> +#include <linux/workqueue.h> +#include <linux/wait.h> +#include <linux/wait_bit.h> +#include <linux/sched.h> +#include <linux/rbtree.h> +#include <uapi/linux/btrfs.h> +#include <uapi/linux/btrfs_tree.h> #include "extent-io-tree.h" -#include "extent_map.h" #include "async-thread.h" #include "block-rsv.h" + +struct inode; +struct super_block; +struct kobject; +struct reloc_control; +struct crypto_shash; +struct ulist; +struct btrfs_device; +struct btrfs_block_group; +struct btrfs_root; +struct btrfs_fs_devices; +struct btrfs_transaction; +struct btrfs_delayed_root; +struct btrfs_balance_control; +struct btrfs_subpage_info; +struct btrfs_stripe_hash_table; +struct btrfs_space_info; #define BTRFS_MAX_EXTENT_SIZE SZ_128M @@ -732,10 +769,13 @@ struct btrfs_fs_info { /* Reclaim partially filled block groups in the background */ struct work_struct reclaim_bgs_work; + /* Protected by unused_bgs_lock. */ struct list_head reclaim_bgs; int bg_reclaim_threshold; + /* Protects the lists unused_bgs and reclaim_bgs. */ spinlock_t unused_bgs_lock; + /* Protected by unused_bgs_lock.
*/ struct list_head unused_bgs; struct mutex unused_bg_unpin_mutex; /* Protect block groups that are going to be deleted */ @@ -829,6 +869,17 @@ struct btrfs_fs_info { #endif }; +#define page_to_inode(_page) (BTRFS_I(_Generic((_page), \ + struct page *: (_page))->mapping->host)) +#define folio_to_inode(_folio) (BTRFS_I(_Generic((_folio), \ + struct folio *: (_folio))->mapping->host)) + +#define page_to_fs_info(_page) (page_to_inode(_page)->root->fs_info) +#define folio_to_fs_info(_folio) (folio_to_inode(_folio)->root->fs_info) + +#define inode_to_fs_info(_inode) (BTRFS_I(_Generic((_inode), \ + struct inode *: (_inode)))->root->fs_info) + static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info) { return READ_ONCE(fs_info->generation); @@ -922,6 +973,8 @@ void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation op); +int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args); + /* Compatibility and incompatibility defines */ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, const char *name); diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 7d734830e514..9c1394c0a6d7 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -9,7 +9,6 @@ #include "inode-item.h" #include "disk-io.h" #include "transaction.h" -#include "print-tree.h" #include "space-info.h" #include "accessors.h" #include "extent-tree.h" diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h index 4337bb26f419..c4aded82709b 100644 --- a/fs/btrfs/inode-item.h +++ b/fs/btrfs/inode-item.h @@ -6,14 +6,15 @@ #include <linux/types.h> #include <linux/crc32c.h> +struct fscrypt_str; +struct extent_buffer; struct btrfs_trans_handle; struct btrfs_root; struct btrfs_path; struct btrfs_key; struct btrfs_inode_extref; struct btrfs_inode; -struct extent_buffer; -struct fscrypt_str; +struct btrfs_truncate_control; /* * Return this if we need to call truncate_block for the last bit of the diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 809b11472a80..37701531eeb1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -39,14 +39,12 @@ #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "print-tree.h" #include "ordered-data.h" #include "xattr.h" #include "tree-log.h" #include "bio.h" #include "compression.h" #include "locking.h" -#include "free-space-cache.h" #include "props.h" #include "qgroup.h" #include "delalloc-space.h" @@ -740,7 +738,8 @@ static noinline int add_async_extent(struct async_chunk *cow, struct async_extent *async_extent; async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); - BUG_ON(!async_extent); /* -ENOMEM */ + if (!async_extent) + return -ENOMEM; async_extent->start = start; async_extent->ram_size = ram_size; async_extent->compressed_size = compressed_size; @@ -1027,8 +1026,9 @@ again: * The async work queues will take care of doing actual allocation on * disk for these compressed pages, and will submit the bios. 
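The page_to_inode()/folio_to_inode()/inode_to_fs_info() macros added to fs.h above use C11 _Generic() as a compile-time type guard: each selector has a single association, so passing any other pointer type is a build error instead of a silent mis-cast. A standalone model of the trick (the two structs here are local stand-ins, not the kernel's types):

#include <stdio.h>

struct page  { int id; };
struct folio { int id; };

/* Accepts exactly a struct page * / struct folio *; anything else
 * fails to compile because _Generic() has no matching association. */
#define page_id(_p)  (_Generic((_p), struct page  *: (_p))->id)
#define folio_id(_f) (_Generic((_f), struct folio *: (_f))->id)

int main(void)
{
	struct page p = { .id = 1 };
	struct folio f = { .id = 2 };

	printf("%d %d\n", page_id(&p), folio_id(&f));
	/* page_id(&f); -- would be a compile-time error */
	return 0;
}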
*/ - add_async_extent(async_chunk, start, total_in, total_compressed, pages, - nr_pages, compress_type); + ret = add_async_extent(async_chunk, start, total_in, total_compressed, pages, + nr_pages, compress_type); + BUG_ON(ret); if (start + total_in < end) { start += total_in; cond_resched(); @@ -1040,8 +1040,9 @@ mark_incompressible: if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress) inode->flags |= BTRFS_INODE_NOCOMPRESS; cleanup_and_bail_uncompressed: - add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, - BTRFS_COMPRESS_NONE); + ret = add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, + BTRFS_COMPRESS_NONE); + BUG_ON(ret); free_pages: if (pages) { for (i = 0; i < nr_pages; i++) { @@ -2302,6 +2303,8 @@ void btrfs_split_delalloc_extent(struct btrfs_inode *inode, struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 size; + lockdep_assert_held(&inode->io_tree.lock); + /* not delalloc, ignore it */ if (!(orig->state & EXTENT_DELALLOC)) return; @@ -2340,6 +2343,8 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state u64 new_size, old_size; u32 num_extents; + lockdep_assert_held(&inode->io_tree.lock); + /* not delalloc, ignore it */ if (!(other->state & EXTENT_DELALLOC)) return; @@ -2387,55 +2392,50 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state spin_unlock(&inode->lock); } -static void btrfs_add_delalloc_inodes(struct btrfs_root *root, - struct btrfs_inode *inode) +static void btrfs_add_delalloc_inode(struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; spin_lock(&root->delalloc_lock); - if (list_empty(&inode->delalloc_inodes)) { - list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes); - set_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags); - root->nr_delalloc_inodes++; - if (root->nr_delalloc_inodes == 1) { - spin_lock(&fs_info->delalloc_root_lock); - BUG_ON(!list_empty(&root->delalloc_root)); - list_add_tail(&root->delalloc_root, - &fs_info->delalloc_roots); - spin_unlock(&fs_info->delalloc_root_lock); - } + ASSERT(list_empty(&inode->delalloc_inodes)); + list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes); + root->nr_delalloc_inodes++; + if (root->nr_delalloc_inodes == 1) { + spin_lock(&fs_info->delalloc_root_lock); + ASSERT(list_empty(&root->delalloc_root)); + list_add_tail(&root->delalloc_root, &fs_info->delalloc_roots); + spin_unlock(&fs_info->delalloc_root_lock); } spin_unlock(&root->delalloc_lock); } -void __btrfs_del_delalloc_inode(struct btrfs_root *root, - struct btrfs_inode *inode) +void btrfs_del_delalloc_inode(struct btrfs_inode *inode) { + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; + lockdep_assert_held(&root->delalloc_lock); + + /* + * We may be called after the inode was already deleted from the list, + * namely in the transaction abort path btrfs_destroy_delalloc_inodes(), + * and then later through btrfs_clear_delalloc_extent() while the inode + * still has ->delalloc_bytes > 0. 
+ */ if (!list_empty(&inode->delalloc_inodes)) { list_del_init(&inode->delalloc_inodes); - clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &inode->runtime_flags); root->nr_delalloc_inodes--; if (!root->nr_delalloc_inodes) { ASSERT(list_empty(&root->delalloc_inodes)); spin_lock(&fs_info->delalloc_root_lock); - BUG_ON(list_empty(&root->delalloc_root)); + ASSERT(!list_empty(&root->delalloc_root)); list_del_init(&root->delalloc_root); spin_unlock(&fs_info->delalloc_root_lock); } } } -static void btrfs_del_delalloc_inode(struct btrfs_root *root, - struct btrfs_inode *inode) -{ - spin_lock(&root->delalloc_lock); - __btrfs_del_delalloc_inode(root, inode); - spin_unlock(&root->delalloc_lock); -} - /* * Properly track delayed allocation bytes in the inode and to maintain the * list of inodes that have pending delalloc work to be done. @@ -2445,6 +2445,8 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s { struct btrfs_fs_info *fs_info = inode->root->fs_info; + lockdep_assert_held(&inode->io_tree.lock); + if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC)) WARN_ON(1); /* @@ -2453,10 +2455,9 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s * bit, which is only set or cleared with irqs on */ if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { - struct btrfs_root *root = inode->root; u64 len = state->end + 1 - state->start; + u64 prev_delalloc_bytes; u32 num_extents = count_max_extents(fs_info, len); - bool do_list = !btrfs_is_free_space_inode(inode); spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, num_extents); @@ -2469,13 +2470,20 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s percpu_counter_add_batch(&fs_info->delalloc_bytes, len, fs_info->delalloc_batch); spin_lock(&inode->lock); + prev_delalloc_bytes = inode->delalloc_bytes; inode->delalloc_bytes += len; if (bits & EXTENT_DEFRAG) inode->defrag_bytes += len; - if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &inode->runtime_flags)) - btrfs_add_delalloc_inodes(root, inode); spin_unlock(&inode->lock); + + /* + * We don't need to be under the protection of the inode's lock, + * because we are called while holding the inode's io_tree lock + * and are therefore protected against concurrent calls of this + * function and btrfs_clear_delalloc_extent(). 
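The rewritten delalloc accounting above replaces the BTRFS_INODE_IN_DELALLOC_LIST runtime flag with edge-triggered list updates: an inode is added on the 0 to nonzero transition of delalloc_bytes and removed on the nonzero to 0 transition, and the io_tree lock serializes the transitions so each edge is observed exactly once. A single-threaded userspace model of that invariant (the lock itself is elided here):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static uint64_t delalloc_bytes;
static bool on_list;	/* stands in for list membership */

static void set_delalloc(uint64_t len)
{
	uint64_t prev = delalloc_bytes;

	delalloc_bytes += len;
	if (prev == 0)
		on_list = true;		/* btrfs_add_delalloc_inode() */
}

static void clear_delalloc(uint64_t len)
{
	delalloc_bytes -= len;
	if (delalloc_bytes == 0)
		on_list = false;	/* btrfs_del_delalloc_inode() */
}

int main(void)
{
	set_delalloc(4096);
	set_delalloc(8192);
	clear_delalloc(4096);
	assert(on_list);		/* still has delalloc */
	clear_delalloc(8192);
	assert(!on_list);		/* last byte cleared, removed */
	return 0;
}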
+ */ + if (!btrfs_is_free_space_inode(inode) && prev_delalloc_bytes == 0) + btrfs_add_delalloc_inode(inode); } if (!(state->state & EXTENT_DELALLOC_NEW) && @@ -2497,6 +2505,8 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(fs_info, len); + lockdep_assert_held(&inode->io_tree.lock); + if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) { spin_lock(&inode->lock); inode->defrag_bytes -= len; @@ -2510,7 +2520,7 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, */ if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = inode->root; - bool do_list = !btrfs_is_free_space_inode(inode); + u64 new_delalloc_bytes; spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, -num_extents); @@ -2530,7 +2540,8 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, return; if (!btrfs_is_data_reloc_root(root) && - do_list && !(state->state & EXTENT_NORESERVE) && + !btrfs_is_free_space_inode(inode) && + !(state->state & EXTENT_NORESERVE) && (bits & EXTENT_CLEAR_DATA_RESV)) btrfs_free_reserved_data_space_noquota(fs_info, len); @@ -2538,11 +2549,20 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, fs_info->delalloc_batch); spin_lock(&inode->lock); inode->delalloc_bytes -= len; - if (do_list && inode->delalloc_bytes == 0 && - test_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &inode->runtime_flags)) - btrfs_del_delalloc_inode(root, inode); + new_delalloc_bytes = inode->delalloc_bytes; spin_unlock(&inode->lock); + + /* + * We don't need to be under the protection of the inode's lock, + * because we are called while holding the inode's io_tree lock + * and are therefore protected against concurrent calls of this + * function and btrfs_set_delalloc_extent(). + */ + if (!btrfs_is_free_space_inode(inode) && new_delalloc_bytes == 0) { + spin_lock(&root->delalloc_lock); + btrfs_del_delalloc_inode(inode); + spin_unlock(&root->delalloc_lock); + } } if ((state->state & EXTENT_DELALLOC_NEW) && @@ -2632,7 +2652,7 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, u64 em_len; int ret = 0; - em = btrfs_get_extent(inode, NULL, 0, search_start, search_len); + em = btrfs_get_extent(inode, NULL, search_start, search_len); if (IS_ERR(em)) return PTR_ERR(em); @@ -2829,7 +2849,7 @@ out_page: int btrfs_writepage_cow_fixup(struct page *page) { struct inode *inode = page->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_writepage_fixup *fixup; /* This page has ordered extent covering it already */ @@ -3127,8 +3147,13 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) ordered_extent->disk_num_bytes); } } - unpin_extent_cache(inode, ordered_extent->file_offset, - ordered_extent->num_bytes, trans->transid); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out; + } + + ret = unpin_extent_cache(inode, ordered_extent->file_offset, + ordered_extent->num_bytes, trans->transid); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out; @@ -3184,8 +3209,23 @@ out: unwritten_start += logical_len; clear_extent_uptodate(io_tree, unwritten_start, end, NULL); - /* Drop extent maps for the part of the extent we didn't write. */ - btrfs_drop_extent_map_range(inode, unwritten_start, end, false); + /* + * Drop extent maps for the part of the extent we didn't write. 
+ * + * We have an exception here for the free_space_inode, this is + * because when we do btrfs_get_extent() on the free space inode + * we will search the commit root. If this is a new block group + * we won't find anything, and we will trip over the assert in + * writepage where we do ASSERT(em->block_start != + * EXTENT_MAP_HOLE). + * + * Theoretically we could also skip this for any NOCOW extent as + * we don't mess with the extent map tree in the NOCOW case, but + * for now simply skip this if we are the free space inode. + */ + if (!btrfs_is_free_space_inode(inode)) + btrfs_drop_extent_map_range(inode, unwritten_start, + end, false); /* * If the ordered extent had an IOERR or something else went @@ -3239,7 +3279,7 @@ out: int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) { - if (btrfs_is_zoned(btrfs_sb(ordered->inode->i_sb)) && + if (btrfs_is_zoned(inode_to_fs_info(ordered->inode)) && !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && list_empty(&ordered->bioc_list)) btrfs_finish_ordered_zoned(ordered); @@ -3724,7 +3764,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *in_path) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_path *path = in_path; struct extent_buffer *leaf; struct btrfs_inode_item *inode_item; @@ -4368,7 +4408,14 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) goto out; - BUG_ON(ret == 0); + if (ret == 0) { + /* + * Key with offset -1 found, there would have to exist a root + * with such id, but this is out of valid range. + */ + ret = -EUCLEAN; + goto out; + } ret = 0; if (path->slots[0] > 0) { @@ -4449,8 +4496,8 @@ again: int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) { - struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); struct btrfs_root *root = dir->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct inode *inode = d_inode(dentry); struct btrfs_root *dest = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; @@ -4458,6 +4505,8 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) u64 root_flags; int ret; + down_write(&fs_info->subvol_sem); + /* * Don't allow to delete a subvolume with send in progress. 
This is * inside the inode lock so the error handling that has to drop the bit @@ -4469,25 +4518,25 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) btrfs_warn(fs_info, "attempt to delete subvolume %llu during send", dest->root_key.objectid); - return -EPERM; + ret = -EPERM; + goto out_up_write; } if (atomic_read(&dest->nr_swapfiles)) { spin_unlock(&dest->root_item_lock); btrfs_warn(fs_info, "attempt to delete subvolume %llu with active swapfile", root->root_key.objectid); - return -EPERM; + ret = -EPERM; + goto out_up_write; } root_flags = btrfs_root_flags(&dest->root_item); btrfs_set_root_flags(&dest->root_item, root_flags | BTRFS_ROOT_SUBVOL_DEAD); spin_unlock(&dest->root_item_lock); - down_write(&fs_info->subvol_sem); - ret = may_destroy_subvol(dest); if (ret) - goto out_up_write; + goto out_undead; btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); /* @@ -4497,7 +4546,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) */ ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true); if (ret) - goto out_up_write; + goto out_undead; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { @@ -4563,15 +4612,17 @@ out_end_trans: inode->i_flags |= S_DEAD; out_release: btrfs_subvolume_release_metadata(root, &block_rsv); -out_up_write: - up_write(&fs_info->subvol_sem); +out_undead: if (ret) { spin_lock(&dest->root_item_lock); root_flags = btrfs_root_flags(&dest->root_item); btrfs_set_root_flags(&dest->root_item, root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); spin_unlock(&dest->root_item_lock); - } else { + } +out_up_write: + up_write(&fs_info->subvol_sem); + if (!ret) { d_invalidate(dentry); btrfs_prune_dentries(dest); ASSERT(dest->send_in_progress == 0); @@ -4676,7 +4727,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, u32 blocksize = fs_info->sectorsize; pgoff_t index = from >> PAGE_SHIFT; unsigned offset = from & (blocksize - 1); - struct page *page; + struct folio *folio; gfp_t mask = btrfs_alloc_write_mask(mapping); size_t write_bytes = blocksize; int ret = 0; @@ -4708,8 +4759,9 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, goto out; } again: - page = find_or_create_page(mapping, index, mask); - if (!page) { + folio = __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); + if (IS_ERR(folio)) { btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize, true); btrfs_delalloc_release_extents(inode, blocksize); @@ -4717,15 +4769,15 @@ again: goto out; } - if (!PageUptodate(page)) { - ret = btrfs_read_folio(NULL, page_folio(page)); - lock_page(page); - if (page->mapping != mapping) { - unlock_page(page); - put_page(page); + if (!folio_test_uptodate(folio)) { + ret = btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (folio->mapping != mapping) { + folio_unlock(folio); + folio_put(folio); goto again; } - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { ret = -EIO; goto out_unlock; } @@ -4737,19 +4789,19 @@ again: * folio private, but left the page in the mapping. Set the page mapped * here to make sure it's properly set for the subpage stuff. 
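Beyond the page to folio renaming, the truncate-block conversion above changes the error convention: find_or_create_page() returned NULL on failure, while __filemap_get_folio() returns an ERR_PTR(), which is why the caller now tests IS_ERR(folio). A userspace model of that convention (the kernel's real helpers live in include/linux/err.h):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_ERRNO	4095
#define ERR_PTR(err)	((void *)(intptr_t)(err))
#define PTR_ERR(ptr)	((long)(intptr_t)(ptr))
#define IS_ERR(ptr)	((uintptr_t)(ptr) >= (uintptr_t)-MAX_ERRNO)

/* Stand-in for a lookup that can fail, like __filemap_get_folio(). */
static void *folio_lookup(int fail)
{
	static int folio = 42;	/* placeholder object */

	return fail ? ERR_PTR(-ENOMEM) : (void *)&folio;
}

int main(void)
{
	void *folio = folio_lookup(1);

	if (IS_ERR(folio))
		printf("lookup failed: %ld\n", PTR_ERR(folio));

	folio = folio_lookup(0);
	if (!IS_ERR(folio))
		printf("got folio %d\n", *(int *)folio);
	return 0;
}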
*/ - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) goto out_unlock; - wait_on_page_writeback(page); + folio_wait_writeback(folio); lock_extent(io_tree, block_start, block_end, &cached_state); ordered = btrfs_lookup_ordered_extent(inode, block_start); if (ordered) { unlock_extent(io_tree, block_start, block_end, &cached_state); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; @@ -4770,15 +4822,16 @@ again: if (!len) len = blocksize - offset; if (front) - memzero_page(page, (block_start - page_offset(page)), - offset); + folio_zero_range(folio, block_start - folio_pos(folio), + offset); else - memzero_page(page, (block_start - page_offset(page)) + offset, - len); + folio_zero_range(folio, + (block_start - folio_pos(folio)) + offset, + len); } - btrfs_folio_clear_checked(fs_info, page_folio(page), block_start, + btrfs_folio_clear_checked(fs_info, folio, block_start, block_end + 1 - block_start); - btrfs_folio_set_dirty(fs_info, page_folio(page), block_start, + btrfs_folio_set_dirty(fs_info, folio, block_start, block_end + 1 - block_start); unlock_extent(io_tree, block_start, block_end, &cached_state); @@ -4795,8 +4848,8 @@ out_unlock: block_start, blocksize, true); } btrfs_delalloc_release_extents(inode, blocksize); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); out: if (only_release_metadata) btrfs_check_nocow_unlock(inode); @@ -4888,8 +4941,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) &cached_state); cur_offset = hole_start; while (1) { - em = btrfs_get_extent(inode, NULL, 0, cur_offset, - block_end - cur_offset); + em = btrfs_get_extent(inode, NULL, cur_offset, block_end - cur_offset); if (IS_ERR(em)) { err = PTR_ERR(em); em = NULL; @@ -5000,7 +5052,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_end_transaction(trans); } else { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); if (btrfs_is_zoned(fs_info)) { ret = btrfs_wait_ordered_range(inode, @@ -5203,7 +5255,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, void btrfs_evict_inode(struct inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info; struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv = NULL; @@ -5217,6 +5269,7 @@ void btrfs_evict_inode(struct inode *inode) return; } + fs_info = inode_to_fs_info(inode); evict_inode_truncate_pages(inode); if (inode->i_nlink && @@ -5514,7 +5567,6 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p) BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; BTRFS_I(inode)->location.offset = 0; BTRFS_I(inode)->root = btrfs_grab_root(args->root); - BUG_ON(args->root && !BTRFS_I(inode)->root); if (args->root && args->root == args->root->fs_info->tree_root && args->ino != BTRFS_BTREE_INODE_OBJECTID) @@ -5642,7 +5694,7 @@ static inline u8 btrfs_inode_type(struct inode *inode) struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *sub_root = root; @@ -6181,7 +6233,7 
@@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, struct inode *dir = args->dir; struct inode *inode = args->inode; const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name; - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_root *root; struct btrfs_inode_item *inode_item; struct btrfs_key *location; @@ -6503,7 +6555,7 @@ fail_dir_item: static int btrfs_create_common(struct inode *dir, struct dentry *dentry, struct inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_new_inode_args new_inode_args = { .dir = dir, @@ -6573,7 +6625,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct btrfs_trans_handle *trans = NULL; struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = d_inode(old_dentry); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct fscrypt_name fname; u64 index; int err; @@ -6737,7 +6789,6 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path * * @inode: file to search in * @page: page to read extent data into if the extent is inline - * @pg_offset: offset into @page to copy to * @start: file offset * @len: length of range starting at @start * @@ -6751,8 +6802,7 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path * Return: ERR_PTR on error, non-NULL extent_map on success. */ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 len) + struct page *page, u64 start, u64 len) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret = 0; @@ -6895,7 +6945,6 @@ next: * ensured by tree-checker and inline extent creation path. * Thus all members representing file offsets should be zero. 
*/ - ASSERT(pg_offset == 0); ASSERT(extent_start == 0); ASSERT(em->start == 0); @@ -7059,7 +7108,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, u64 *ram_bytes, bool nowait, bool strict) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct can_nocow_file_extent_args nocow_args = { 0 }; struct btrfs_path *path; int ret; @@ -7298,7 +7347,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, unsigned int iomap_flags) { const bool nowait = (iomap_flags & IOMAP_NOWAIT); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_map *em = *map; int type; u64 block_start, orig_start, orig_block_len, ram_bytes; @@ -7438,7 +7487,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, struct iomap *srcmap) { struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_map *em; struct extent_state *cached_state = NULL; struct btrfs_dio_data *dio_data = iter->private; @@ -7536,7 +7585,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, if (ret < 0) goto err; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); + em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto unlock_err; @@ -7816,6 +7865,7 @@ struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { + struct btrfs_inode *btrfs_inode = BTRFS_I(inode); int ret; ret = fiemap_prep(inode, fieinfo, start, &len, 0); @@ -7841,7 +7891,26 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return ret; } - return extent_fiemap(BTRFS_I(inode), fieinfo, start, len); + btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED); + + /* + * We did an initial flush to avoid holding the inode's lock while + * triggering writeback and waiting for the completion of IO and ordered + * extents. Now after we locked the inode we do it again, because it's + * possible a new write may have happened in between those two steps. 
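The comment above describes a flush, lock, flush-again ordering: the first btrfs_wait_ordered_range() runs unlocked so writeback does not stall other lock holders, and the second one, under the shared lock, catches writes that raced in between. A toy userspace model of why the second flush is needed (the racing write is simulated explicitly):

#include <stdbool.h>
#include <stdio.h>

static bool dirty;
static bool locked;

static void wait_ordered_range(void) { dirty = false; }	/* "flush" */
static void racing_write(void) { if (!locked) dirty = true; }

int main(void)
{
	wait_ordered_range();	/* initial flush, inode not yet locked */
	racing_write();		/* new write lands before we take the lock */
	locked = true;		/* btrfs_inode_lock(..., BTRFS_ILOCK_SHARED) */
	wait_ordered_range();	/* flush again under the lock */
	printf("dirty at fiemap time: %s\n", dirty ? "yes" : "no"); /* no */
	locked = false;		/* btrfs_inode_unlock() */
	return 0;
}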
+ */ + if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) { + ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX); + if (ret) { + btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED); + return ret; + } + } + + ret = extent_fiemap(btrfs_inode, fieinfo, start, len); + btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED); + + return ret; } static int btrfs_writepages(struct address_space *mapping, @@ -7864,7 +7933,7 @@ static void btrfs_readahead(struct readahead_control *rac) */ static void wait_subpage_spinlock(struct page *page) { - struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct btrfs_fs_info *fs_info = page_to_fs_info(page); struct folio *folio = page_folio(page); struct btrfs_subpage *subpage; @@ -7931,7 +8000,7 @@ static int btrfs_migrate_folio(struct address_space *mapping, static void btrfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) { - struct btrfs_inode *inode = BTRFS_I(folio->mapping->host); + struct btrfs_inode *inode = folio_to_inode(folio); struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_io_tree *tree = &inode->io_tree; struct extent_state *cached_state = NULL; @@ -8115,7 +8184,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) struct page *page = vmf->page; struct folio *folio = page_folio(page); struct inode *inode = file_inode(vmf->vma->vm_file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; @@ -8661,7 +8730,7 @@ int __init btrfs_init_cachep(void) { btrfs_inode_cachep = kmem_cache_create("btrfs_inode", sizeof(struct btrfs_inode), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT, + SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, init_once); if (!btrfs_inode_cachep) goto fail; @@ -8684,7 +8753,7 @@ static int btrfs_getattr(struct mnt_idmap *idmap, u64 delalloc_bytes; u64 inode_bytes; struct inode *inode = d_inode(path->dentry); - u32 blocksize = inode->i_sb->s_blocksize; + u32 blocksize = btrfs_sb(inode->i_sb)->sectorsize; u32 bi_flags = BTRFS_I(inode)->flags; u32 bi_ro_flags = BTRFS_I(inode)->ro_flags; @@ -8724,7 +8793,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, struct inode *new_dir, struct dentry *new_dentry) { - struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir); struct btrfs_trans_handle *trans; unsigned int trans_num_items; struct btrfs_root *root = BTRFS_I(old_dir)->root; @@ -8976,7 +9045,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { - struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir); struct btrfs_new_inode_args whiteout_args = { .dir = old_dir, .dentry = old_dentry, @@ -9418,7 +9487,7 @@ out: static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_path *path; @@ -9599,7 +9668,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, loff_t actual_len, u64 *alloc_hint, struct btrfs_trans_handle *trans) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = 
inode_to_fs_info(inode); struct extent_map *em; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; @@ -9751,7 +9820,7 @@ static int btrfs_permission(struct mnt_idmap *idmap, static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode; @@ -10125,7 +10194,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, cond_resched(); } - em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1); + em = btrfs_get_extent(inode, NULL, start, lockend - start + 1); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out_unlock_extent; @@ -10269,6 +10338,13 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE) return -EINVAL; + /* + * Compressed extents should always have checksums, so error out if we + * have a NOCOW file or inode was created while mounted with NODATASUM. + */ + if (inode->flags & BTRFS_INODE_NODATASUM) + return -EINVAL; + orig_count = iov_iter_count(from); /* The extent size must be sane. */ @@ -10698,7 +10774,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, struct btrfs_block_group *bg; u64 len = isize - start; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); + em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 41b479861b3c..294e31edec9d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -34,11 +34,9 @@ #include "export.h" #include "transaction.h" #include "btrfs_inode.h" -#include "print-tree.h" #include "volumes.h" #include "locking.h" #include "backref.h" -#include "rcu-string.h" #include "send.h" #include "dev-replace.h" #include "props.h" @@ -47,9 +45,7 @@ #include "tree-log.h" #include "compression.h" #include "space-info.h" -#include "delalloc-space.h" #include "block-group.h" -#include "subpage.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" @@ -231,6 +227,20 @@ static int check_fsflags_compatible(struct btrfs_fs_info *fs_info, return 0; } +int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args) +{ + if (memchr(vol_args->name, 0, sizeof(vol_args->name)) == NULL) + return -ENAMETOOLONG; + return 0; +} + +static int btrfs_check_ioctl_vol_args2_subvol_name(const struct btrfs_ioctl_vol_args_v2 *vol_args2) +{ + if (memchr(vol_args2->name, 0, sizeof(vol_args2->name)) == NULL) + return -ENAMETOOLONG; + return 0; +} + /* * Set flags/xflags from the internal inode flags. The remaining items of * fsxattr are zeroed. @@ -247,7 +257,7 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_inode *binode = BTRFS_I(inode); struct btrfs_root *root = binode->root; struct btrfs_trans_handle *trans; @@ -528,7 +538,7 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, * block group is in the logical address space, which can be any * sectorsize aligned bytenr in the range [0, U64_MAX]. 
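btrfs_check_ioctl_vol_args_path() above replaces the old pattern of force-terminating the user-supplied name (vol_args->name[BTRFS_PATH_NAME_MAX] = '\0') with an explicit memchr() scan that rejects unterminated input instead of silently truncating it. A userspace model of the check (the 8-byte buffer is an illustrative stand-in for the real array):

#include <errno.h>
#include <stdio.h>
#include <string.h>

#define NAME_BUF_LEN 8	/* stand-in for sizeof(vol_args->name) */

static int check_path(const char *name)
{
	/* No NUL anywhere in the buffer: reject instead of truncating. */
	if (memchr(name, 0, NAME_BUF_LEN) == NULL)
		return -ENAMETOOLONG;
	return 0;
}

int main(void)
{
	char ok[NAME_BUF_LEN] = "vol";			/* terminated */
	char bad[NAME_BUF_LEN] = { 'a', 'a', 'a', 'a',
				   'a', 'a', 'a', 'a' };	/* not terminated */

	printf("%d %d\n", check_path(ok), check_path(bad)); /* 0 and -ENAMETOOLONG */
	return 0;
}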
*/ - if (range.len < fs_info->sb->s_blocksize) + if (range.len < fs_info->sectorsize) return -EINVAL; range.minlen = max(range.minlen, minlen); @@ -584,7 +594,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, struct btrfs_qgroup_inherit *inherit) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_trans_handle *trans; struct btrfs_key key; struct btrfs_root_item *root_item; @@ -721,7 +731,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap, free_extent_buffer(leaf); leaf = NULL; - new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev); + new_root = btrfs_get_new_fs_root(fs_info, objectid, &anon_dev); if (IS_ERR(new_root)) { ret = PTR_ERR(new_root); btrfs_abort_transaction(trans, ret); @@ -776,7 +786,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct dentry *dentry, bool readonly, struct btrfs_qgroup_inherit *inherit) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct inode *inode; struct btrfs_pending_snapshot *pending_snapshot; unsigned int trans_num_items; @@ -790,6 +800,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, return -EOPNOTSUPP; } + if (btrfs_root_refs(&root->root_item) == 0) + return -ENOENT; + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) return -EINVAL; @@ -907,7 +920,9 @@ static int btrfs_may_delete(struct mnt_idmap *idmap, if (d_really_is_negative(victim)) return -ENOENT; - BUG_ON(d_inode(victim->d_parent) != dir); + /* The @victim is not inside @dir. */ + if (d_inode(victim->d_parent) != dir) + return -EINVAL; audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); @@ -959,7 +974,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, struct btrfs_qgroup_inherit *inherit) { struct inode *dir = d_inode(parent->dentry); - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct dentry *dentry; struct fscrypt_str name_str = FSTR_INIT((char *)name, namelen); int error; @@ -1094,7 +1109,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, { BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); u64 new_size; u64 old_size; u64 devid = 1; @@ -1125,7 +1140,10 @@ static noinline int btrfs_ioctl_resize(struct file *file, ret = PTR_ERR(vol_args); goto out_drop; } - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args_path(vol_args); + if (ret < 0) + goto out_free; + sizestr = vol_args->name; cancel = (strcmp("cancel", sizestr) == 0); ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_RESIZE, cancel); @@ -1325,12 +1343,15 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args_path(vol_args); + if (ret < 0) + goto out; ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file), vol_args->name, vol_args->fd, subvol, false, NULL); +out: kfree(vol_args); return ret; } @@ -1349,7 +1370,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) 
return PTR_ERR(vol_args); - vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args); + if (ret < 0) + goto free_args; if (vol_args->flags & ~BTRFS_SUBVOL_CREATE_ARGS_MASK) { ret = -EOPNOTSUPP; @@ -1359,7 +1382,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, if (vol_args->flags & BTRFS_SUBVOL_RDONLY) readonly = true; if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) { - u64 nums; + struct btrfs_fs_info *fs_info = inode_to_fs_info(file_inode(file)); if (vol_args->size < sizeof(*inherit) || vol_args->size > PAGE_SIZE) { @@ -1372,19 +1395,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, goto free_args; } - if (inherit->num_qgroups > PAGE_SIZE || - inherit->num_ref_copies > PAGE_SIZE || - inherit->num_excl_copies > PAGE_SIZE) { - ret = -EINVAL; - goto free_inherit; - } - - nums = inherit->num_qgroups + 2 * inherit->num_ref_copies + - 2 * inherit->num_excl_copies; - if (vol_args->size != struct_size(inherit, qgroups, nums)) { - ret = -EINVAL; + ret = btrfs_qgroup_check_inherit(fs_info, inherit, vol_args->size); + if (ret < 0) goto free_inherit; - } } ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file), @@ -1402,7 +1415,7 @@ free_args: static noinline int btrfs_ioctl_subvol_getflags(struct inode *inode, void __user *arg) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; u64 flags = 0; @@ -1425,7 +1438,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; u64 root_flags; @@ -1672,7 +1685,7 @@ static noinline int search_ioctl(struct inode *inode, u64 *buf_size, char __user *ubuf) { - struct btrfs_fs_info *info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *info = inode_to_fs_info(inode); struct btrfs_root *root; struct btrfs_key key; struct btrfs_path *path; @@ -2343,9 +2356,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, bool destroy_v2) { struct dentry *parent = file->f_path.dentry; - struct btrfs_fs_info *fs_info = btrfs_sb(parent->d_sb); struct dentry *dentry; struct inode *dir = d_inode(parent); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *dest = NULL; @@ -2379,7 +2392,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, * name, same as v1 currently does. 
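The open-coded num_qgroups/num_ref_copies arithmetic removed here moves into btrfs_qgroup_check_inherit(), which (as the qgroup.c hunk further down shows) validates the flexible-array payload with struct_size(). A userspace model of that size check, with the kernel's overflow-checked struct_size() approximated by capping the count first; the struct layout is a simplified stand-in:

#include <errno.h>
#include <stddef.h>
#include <stdint.h>

struct inherit_args {
	uint64_t flags;
	uint64_t num_qgroups;
	uint64_t qgroups[];	/* flexible array, like btrfs_qgroup_inherit's */
};

/*
 * struct_size(p, member, n) in the kernel is an overflow-checked
 * sizeof(*p) + n * sizeof(p->member[0]); approximated here after
 * capping n so the multiplication cannot overflow.
 */
static int check_inherit_size(const struct inherit_args *args, size_t size)
{
	if (args->num_qgroups > 4096)	/* cf. the PAGE_SIZE cap in the patch */
		return -EINVAL;
	if (size != sizeof(*args) + args->num_qgroups * sizeof(args->qgroups[0]))
		return -EINVAL;
	return 0;
}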
*/ if (!(vol_args2->flags & BTRFS_SUBVOL_SPEC_BY_ID)) { - vol_args2->name[BTRFS_SUBVOL_NAME_MAX] = 0; + err = btrfs_check_ioctl_vol_args2_subvol_name(vol_args2); + if (err < 0) + goto out; subvol_name = vol_args2->name; err = mnt_want_write_file(file); @@ -2463,7 +2478,10 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (IS_ERR(vol_args)) return PTR_ERR(vol_args); - vol_args->name[BTRFS_PATH_NAME_MAX] = 0; + err = btrfs_check_ioctl_vol_args_path(vol_args); + if (err < 0) + goto out; + subvol_name = vol_args->name; err = mnt_want_write_file(file); @@ -2608,6 +2626,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) ret = -EFAULT; goto out; } + if (range.flags & ~BTRFS_DEFRAG_RANGE_FLAGS_SUPP) { + ret = -EOPNOTSUPP; + goto out; + } /* compression requires us to start the IO */ if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { range.flags |= BTRFS_DEFRAG_RANGE_START_IO; @@ -2670,12 +2692,16 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) goto out; } - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args_path(vol_args); + if (ret < 0) + goto out_free; + ret = btrfs_init_new_device(fs_info, vol_args->name); if (!ret) btrfs_info(fs_info, "disk added %s", vol_args->name); +out_free: kfree(vol_args); out: if (restore_op) @@ -2689,9 +2715,9 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) { BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_ioctl_vol_args_v2 *vol_args; - struct bdev_handle *bdev_handle = NULL; + struct file *bdev_file = NULL; int ret; bool cancel = false; @@ -2707,7 +2733,10 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) goto out; } - vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args); + if (ret < 0) + goto out; + if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) { args.devid = vol_args->devid; } else if (!strcmp("cancel", vol_args->name)) { @@ -2728,7 +2757,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) goto err_drop; /* Exclusive operation is now claimed */ - ret = btrfs_rm_device(fs_info, &args, &bdev_handle); + ret = btrfs_rm_device(fs_info, &args, &bdev_file); btrfs_exclop_finish(fs_info); @@ -2742,8 +2771,8 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) } err_drop: mnt_drop_write_file(file); - if (bdev_handle) - bdev_release(bdev_handle); + if (bdev_file) + fput(bdev_file); out: btrfs_put_dev_args_from_path(&args); kfree(vol_args); @@ -2754,9 +2783,9 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) { BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_ioctl_vol_args *vol_args; - struct bdev_handle *bdev_handle = NULL; + struct file *bdev_file = NULL; int ret; bool cancel = false; @@ -2767,7 +2796,10 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) if (IS_ERR(vol_args)) return PTR_ERR(vol_args); - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args_path(vol_args); + if (ret < 0) + goto out_free; + if (!strcmp("cancel", vol_args->name)) { cancel = true; } else { @@ -2783,17 +2815,18 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) ret = 
exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, cancel); if (ret == 0) { - ret = btrfs_rm_device(fs_info, &args, &bdev_handle); + ret = btrfs_rm_device(fs_info, &args, &bdev_file); if (!ret) btrfs_info(fs_info, "disk deleted %s", vol_args->name); btrfs_exclop_finish(fs_info); } mnt_drop_write_file(file); - if (bdev_handle) - bdev_release(bdev_handle); + if (bdev_file) + fput(bdev_file); out: btrfs_put_dev_args_from_path(&args); +out_free: kfree(vol_args); return ret; } @@ -2897,7 +2930,7 @@ out: static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root *new_root; struct btrfs_dir_item *di; @@ -3171,7 +3204,7 @@ static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info, static long btrfs_ioctl_scrub(struct file *file, void __user *arg) { - struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(file_inode(file)); struct btrfs_ioctl_scrub_args *sa; int ret; @@ -3689,7 +3722,7 @@ out: static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_ioctl_quota_ctl_args *sa; int ret; @@ -3731,7 +3764,7 @@ drop_write: static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ioctl_qgroup_assign_args *sa; struct btrfs_trans_handle *trans; @@ -3808,6 +3841,11 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) goto out; } + if (sa->create && is_fstree(sa->qgroupid)) { + ret = -EINVAL; + goto out; + } + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -3882,7 +3920,7 @@ drop_write: static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_ioctl_quota_rescan_args *qsa; int ret; @@ -3946,7 +3984,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, struct btrfs_ioctl_received_subvol_args *sa) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root_item *root_item = &root->root_item; struct btrfs_trans_handle *trans; @@ -4134,7 +4172,7 @@ static int btrfs_ioctl_get_fslabel(struct btrfs_fs_info *fs_info, static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_super_block *super_block = fs_info->super_copy; struct btrfs_trans_handle *trans; @@ -4277,7 +4315,7 @@ check_feature_bits(fs_info, FEAT_##mask_base, change_mask, flags, \ static int btrfs_ioctl_set_features(struct file *file, void __user *arg) { struct inode 
*inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_super_block *super_block = fs_info->super_copy; struct btrfs_ioctl_feature_flags flags[2]; @@ -4568,7 +4606,7 @@ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; void __user *argp = (void __user *)arg; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index d51b9a2f2f6e..2c5dc25ec670 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -3,6 +3,15 @@ #ifndef BTRFS_IOCTL_H #define BTRFS_IOCTL_H +#include <linux/types.h> + +struct file; +struct dentry; +struct mnt_idmap; +struct fileattr; +struct btrfs_fs_info; +struct btrfs_ioctl_balance_args; + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 74d8e2003f58..99ccab86bb86 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -13,7 +13,6 @@ #include "ctree.h" #include "extent_io.h" #include "locking.h" -#include "accessors.h" /* * Lockdep class keys for extent_buffer->lock's in this root. For a given @@ -85,7 +84,7 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int { struct btrfs_lockdep_keyset *ks; - BUG_ON(level >= ARRAY_SIZE(ks->keys)); + ASSERT(level < ARRAY_SIZE(ks->keys)); /* Find the matching keyset, id 0 is the default entry */ for (ks = btrfs_lockdep_keysets; ks->id; ks++) diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 7d6ee1e609bf..9576f485a300 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -8,8 +8,14 @@ #include <linux/atomic.h> #include <linux/wait.h> +#include <linux/lockdep.h> #include <linux/percpu_counter.h> #include "extent_io.h" +#include "locking.h" + +struct extent_buffer; +struct btrfs_path; +struct btrfs_root; #define BTRFS_WRITE_LOCK 1 #define BTRFS_READ_LOCK 2 @@ -157,8 +163,6 @@ enum btrfs_lockdep_trans_states { static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES, "too many lock subclasses defined"); -struct btrfs_path; - void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest); void btrfs_tree_lock(struct extent_buffer *eb); void btrfs_tree_unlock(struct extent_buffer *eb); diff --git a/fs/btrfs/lru_cache.h b/fs/btrfs/lru_cache.h index 00328c856be6..e32906ab6faa 100644 --- a/fs/btrfs/lru_cache.h +++ b/fs/btrfs/lru_cache.h @@ -3,8 +3,10 @@ #ifndef BTRFS_LRU_CACHE_H #define BTRFS_LRU_CACHE_H +#include <linux/types.h> #include <linux/maple_tree.h> #include <linux/list.h> +#include "lru_cache.h" /* * A cache entry. 
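The header hunks in this range (ioctl.h, locking.h, and several more below) all apply the same hygiene rule: a header whose prototypes only pass pointers to a type forward-declares it instead of including the defining header, trimming the include graph and rebuild fanout. The shape these headers converge on, sketched with hypothetical example_* names:

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef EXAMPLE_H
#define EXAMPLE_H

#include <linux/types.h>	/* for u64 and friends */

/*
 * Forward declarations suffice: the prototypes below only take
 * pointers, so the full struct definitions are never needed here.
 */
struct btrfs_trans_handle;
struct btrfs_root;

int example_insert_item(struct btrfs_trans_handle *trans,
			struct btrfs_root *root, u64 offset);

#endif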
This is meant to be embedded in a structure of a user of @@ -50,11 +52,6 @@ struct btrfs_lru_cache { #define btrfs_lru_cache_for_each_entry_safe(cache, entry, tmp) \ list_for_each_entry_safe_reverse((entry), (tmp), &(cache)->lru_list, lru_list) -static inline unsigned int btrfs_lru_cache_size(const struct btrfs_lru_cache *cache) -{ - return cache->size; -} - static inline struct btrfs_lru_cache_entry *btrfs_lru_cache_lru_entry( struct btrfs_lru_cache *cache) { diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 1131d5a29d61..3e5d3b7028e8 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -214,7 +214,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); - const u32 sectorsize = btrfs_sb(mapping->host->i_sb)->sectorsize; + const u32 sectorsize = inode_to_fs_info(mapping->host)->sectorsize; struct page *page_in = NULL; char *sizes_ptr; const unsigned long max_nr_page = *out_pages; @@ -425,16 +425,16 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) } int lzo_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); + struct btrfs_fs_info *fs_info = page_to_fs_info(dest_page); + const u32 sectorsize = fs_info->sectorsize; size_t in_len; size_t out_len; size_t max_segment_len = WORKSPACE_BUF_LENGTH; int ret = 0; - char *kaddr; - unsigned long bytes; if (srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2) return -EUCLEAN; @@ -451,7 +451,7 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, } data_in += LZO_LEN; - out_len = PAGE_SIZE; + out_len = sectorsize; ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); if (ret != LZO_E_OK) { pr_warn("BTRFS: decompress failed!\n"); @@ -459,29 +459,13 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, goto out; } - if (out_len < start_byte) { + ASSERT(out_len <= sectorsize); + memcpy_to_page(dest_page, dest_pgoff, workspace->buf, out_len); + /* Early end, considered as an error. */ + if (unlikely(out_len < destlen)) { ret = -EIO; - goto out; + memzero_page(dest_page, dest_pgoff + out_len, destlen - out_len); } - - /* - * the caller is already checking against PAGE_SIZE, but lets - * move this check closer to the memcpy/memset - */ - destlen = min_t(unsigned long, destlen, PAGE_SIZE); - bytes = min_t(unsigned long, destlen, out_len - start_byte); - - kaddr = kmap_local_page(dest_page); - memcpy(kaddr, workspace->buf + start_byte, bytes); - - /* - * btrfs_getblock is doing a zero on the tail of the page too, - * but this will cover anything missing from the decompressed - * data. 
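The lzo_decompress() rework visible above copies exactly one sector's worth of output with memcpy_to_page() and zeroes any shortfall, so a compressed stream that ends early cannot leak stale page contents. A userspace model of the copy-and-zero tail step:

#include <errno.h>
#include <stddef.h>
#include <string.h>

/*
 * Model of the new tail handling: copy out_len decompressed bytes into
 * the destination, then zero the rest of the expected range and report
 * -EIO when the stream ended early (out_len < destlen).
 */
static int copy_decompressed_sector(unsigned char *dest, size_t dest_off,
				    const unsigned char *buf, size_t out_len,
				    size_t destlen)
{
	memcpy(dest + dest_off, buf, out_len);
	if (out_len < destlen) {
		memset(dest + dest_off + out_len, 0, destlen - out_len);
		return -EIO;
	}
	return 0;
}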
- */ - if (bytes < destlen) - memset(kaddr+bytes, 0, destlen-bytes); - kunmap_local(kaddr); out: return ret; } diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index cdada4865837..c96dd66fd0f7 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -3,8 +3,6 @@ #include "fs.h" #include "messages.h" #include "discard.h" -#include "transaction.h" -#include "space-info.h" #include "super.h" #ifdef CONFIG_PRINTK diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index 40f2d9f1a17a..dde4904aead9 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -3,6 +3,8 @@ #ifndef BTRFS_MISC_H #define BTRFS_MISC_H +#include <linux/types.h> +#include <linux/bitmap.h> #include <linux/sched.h> #include <linux/wait.h> #include <linux/math64.h> diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 59850dc17b22..b749ba45da2b 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -19,7 +19,6 @@ #include "qgroup.h" #include "subpage.h" #include "file.h" -#include "super.h" static struct kmem_cache *btrfs_ordered_extent_cache; @@ -1236,10 +1235,7 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( int __init ordered_data_init(void) { - btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", - sizeof(struct btrfs_ordered_extent), 0, - SLAB_MEM_SPREAD, - NULL); + btrfs_ordered_extent_cache = KMEM_CACHE(btrfs_ordered_extent, 0); if (!btrfs_ordered_extent_cache) return -ENOMEM; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 127ef8bf0ffd..34413fc5b4bd 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -6,6 +6,21 @@ #ifndef BTRFS_ORDERED_DATA_H #define BTRFS_ORDERED_DATA_H +#include <linux/types.h> +#include <linux/list.h> +#include <linux/refcount.h> +#include <linux/completion.h> +#include <linux/rbtree.h> +#include <linux/wait.h> +#include "async-thread.h" + +struct inode; +struct page; +struct extent_state; +struct btrfs_inode; +struct btrfs_root; +struct btrfs_fs_info; + struct btrfs_ordered_sum { /* * Logical start address and length for of the blocks covered by diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c index 7a1b021b5669..6195a2215b8f 100644 --- a/fs/btrfs/orphan.c +++ b/fs/btrfs/orphan.c @@ -4,7 +4,6 @@ */ #include "ctree.h" -#include "disk-io.h" #include "orphan.h" int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/orphan.h b/fs/btrfs/orphan.h index 3faab5cbb59a..aa54a88a60de 100644 --- a/fs/btrfs/orphan.h +++ b/fs/btrfs/orphan.h @@ -3,6 +3,11 @@ #ifndef BTRFS_ORPHAN_H #define BTRFS_ORPHAN_H +#include <linux/types.h> + +struct btrfs_trans_handle; +struct btrfs_root; + int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h index c42bc666d5ee..8504bf1702c7 100644 --- a/fs/btrfs/print-tree.h +++ b/fs/btrfs/print-tree.h @@ -9,6 +9,9 @@ /* Buffer size to contain tree name and possibly additional data (offset) */ #define BTRFS_ROOT_NAME_BUF_LEN 48 +struct extent_buffer; +struct btrfs_key; + void btrfs_print_leaf(const struct extent_buffer *l); void btrfs_print_tree(const struct extent_buffer *c, bool follow); const char *btrfs_root_name(const struct btrfs_key *key, char *buf); diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index f9bf591a0718..2a9b7b029eeb 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -4,6 +4,7 @@ */ #include <linux/hashtable.h> +#include <linux/xattr.h> #include "messages.h" #include 
"props.h" #include "btrfs_inode.h" @@ -302,7 +303,7 @@ static int prop_compression_validate(const struct btrfs_inode *inode, static int prop_compression_apply(struct inode *inode, const char *value, size_t len) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); int type; /* Reset to defaults */ diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h index 6e283196e38a..f60cd89feb29 100644 --- a/fs/btrfs/props.h +++ b/fs/btrfs/props.h @@ -6,7 +6,12 @@ #ifndef BTRFS_PROPS_H #define BTRFS_PROPS_H -#include "ctree.h" +#include <linux/compiler_types.h> + +struct inode; +struct btrfs_inode; +struct btrfs_path; +struct btrfs_trans_handle; int __init btrfs_props_init(void); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 63b426cc7798..5f90f0605b12 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1324,7 +1324,7 @@ static int flush_reservations(struct btrfs_fs_info *fs_info) trans = btrfs_join_transaction(fs_info->tree_root); if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_commit_transaction(trans); + ret = btrfs_commit_transaction(trans); return ret; } @@ -1736,6 +1736,15 @@ out: return ret; } +static bool qgroup_has_usage(struct btrfs_qgroup *qgroup) +{ + return (qgroup->rfer > 0 || qgroup->rfer_cmpr > 0 || + qgroup->excl > 0 || qgroup->excl_cmpr > 0 || + qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] > 0 || + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] > 0 || + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > 0); +} + int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -1755,6 +1764,11 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) goto out; } + if (is_fstree(qgroupid) && qgroup_has_usage(qgroup)) { + ret = -EBUSY; + goto out; + } + /* Check if there are no children of this qgroup */ if (!list_empty(&qgroup->members)) { ret = -EBUSY; @@ -2491,8 +2505,8 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *eb = root_eb; struct btrfs_path *path = NULL; - BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL); - BUG_ON(root_eb == NULL); + ASSERT(0 <= root_level && root_level < BTRFS_MAX_LEVEL); + ASSERT(root_eb != NULL); if (!btrfs_qgroup_full_accounting(fs_info)) return 0; @@ -2847,8 +2861,6 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, if (nr_old_roots == 0 && nr_new_roots == 0) goto out_free; - BUG_ON(!fs_info->quota_root); - trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr, num_bytes, nr_old_roots, nr_new_roots); @@ -2945,11 +2957,6 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) ctx.roots = NULL; } - /* Free the reserved data space */ - btrfs_qgroup_free_refroot(fs_info, - record->data_rsv_refroot, - record->data_rsv, - BTRFS_QGROUP_RSV_DATA); /* * Use BTRFS_SEQ_LAST as time_seq to do special search, * which doesn't lock tree or delayed_refs and search @@ -2973,6 +2980,11 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) record->old_roots = NULL; new_roots = NULL; } + /* Free the reserved data space */ + btrfs_qgroup_free_refroot(fs_info, + record->data_rsv_refroot, + record->data_rsv, + BTRFS_QGROUP_RSV_DATA); cleanup: ulist_free(record->old_roots); ulist_free(new_roots); @@ -3034,6 +3046,57 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans) return ret; } +int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_inherit *inherit, + size_t size) 
+{ + if (inherit->flags & ~BTRFS_QGROUP_INHERIT_FLAGS_SUPP) + return -EOPNOTSUPP; + if (size < sizeof(*inherit) || size > PAGE_SIZE) + return -EINVAL; + + /* + * In the past we allowed btrfs_qgroup_inherit to specify to copy + * rfer/excl numbers directly from other qgroups. This behavior has + * been disabled in userspace for a very long time, but here we should + * also disable it in kernel, as this behavior is known to mark qgroup + * inconsistent, and a rescan would wipe out the changes anyway. + * + * Reject any btrfs_qgroup_inherit with num_ref_copies or num_excl_copies. + */ + if (inherit->num_ref_copies > 0 || inherit->num_excl_copies > 0) + return -EINVAL; + + if (inherit->num_qgroups > PAGE_SIZE) + return -EINVAL; + + if (size != struct_size(inherit, qgroups, inherit->num_qgroups)) + return -EINVAL; + + /* + * Now check all the remaining qgroups, they should all: + * + * - Exist + * - Be higher level qgroups. + */ + for (int i = 0; i < inherit->num_qgroups; i++) { + struct btrfs_qgroup *qgroup; + u64 qgroupid = inherit->qgroups[i]; + + if (btrfs_qgroup_level(qgroupid) == 0) + return -EINVAL; + + spin_lock(&fs_info->qgroup_lock); + qgroup = find_qgroup_rb(fs_info, qgroupid); + if (!qgroup) { + spin_unlock(&fs_info->qgroup_lock); + return -ENOENT; + } + spin_unlock(&fs_info->qgroup_lock); + } + return 0; +} + static int qgroup_auto_inherit(struct btrfs_fs_info *fs_info, u64 inode_rootid, struct btrfs_qgroup_inherit **inherit) @@ -3076,6 +3139,62 @@ static int qgroup_auto_inherit(struct btrfs_fs_info *fs_info, } /* + * Check if we can skip rescan when inheriting qgroups. If @src has a single + * @parent, and that @parent is owning all its bytes exclusively, we can skip + * the full rescan, by just adding nodesize to the @parent's excl/rfer. + * + * Return <0 for fatal errors (like srcid/parentid has no qgroup). + * Return 0 if a quick inherit is done. + * Return >0 if a quick inherit is not possible, and a full rescan is needed. + */ +static int qgroup_snapshot_quick_inherit(struct btrfs_fs_info *fs_info, + u64 srcid, u64 parentid) +{ + struct btrfs_qgroup *src; + struct btrfs_qgroup *parent; + struct btrfs_qgroup_list *list; + int nr_parents = 0; + + src = find_qgroup_rb(fs_info, srcid); + if (!src) + return -ENOENT; + parent = find_qgroup_rb(fs_info, parentid); + if (!parent) + return -ENOENT; + + /* + * Source has no parent qgroup, but our new qgroup would have one. + * Qgroup numbers would become inconsistent. + */ + if (list_empty(&src->groups)) + return 1; + + list_for_each_entry(list, &src->groups, next_group) { + /* The parent is not the same, quick update is not possible. */ + if (list->group->qgroupid != parentid) + return 1; + nr_parents++; + /* + * More than one parent qgroup, we can't be sure about accounting + * consistency. + */ + if (nr_parents > 1) + return 1; + } + + /* + * The parent is not exclusively owning all its bytes. We're not sure + * if the source has any bytes not fully owned by the parent. + */ + if (parent->excl != parent->rfer) + return 1; + + parent->excl += fs_info->nodesize; + parent->rfer += fs_info->nodesize; + return 0; +} + +/* * Copy the accounting information between qgroups. This is necessary * when a snapshot or a subvolume is created. 
Throwing an error will * cause a transaction abort so we take extra care here to only error @@ -3243,6 +3362,13 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, qgroup_dirty(fs_info, dstgroup); qgroup_dirty(fs_info, srcgroup); + + /* + * If the source qgroup has parent but the new one doesn't, + * we need a full rescan. + */ + if (!inherit && !list_empty(&srcgroup->groups)) + need_rescan = true; } if (!inherit) @@ -3257,14 +3383,16 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, if (ret) goto unlock; } + if (srcid) { + /* Check if we can do a quick inherit. */ + ret = qgroup_snapshot_quick_inherit(fs_info, srcid, *i_qgroups); + if (ret < 0) + goto unlock; + if (ret > 0) + need_rescan = true; + ret = 0; + } ++i_qgroups; - - /* - * If we're doing a snapshot, and adding the snapshot to a new - * qgroup, the numbers are guaranteed to be incorrect. - */ - if (srcid) - need_rescan = true; } for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) { diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index be18c862e64e..706640be0ec2 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -6,12 +6,22 @@ #ifndef BTRFS_QGROUP_H #define BTRFS_QGROUP_H +#include <linux/types.h> #include <linux/spinlock.h> #include <linux/rbtree.h> #include <linux/kobject.h> -#include "ulist.h" -#include "delayed-ref.h" -#include "misc.h" +#include <linux/list.h> +#include <uapi/linux/btrfs_tree.h> + +struct extent_buffer; +struct extent_changeset; +struct btrfs_delayed_extent_op; +struct btrfs_fs_info; +struct btrfs_root; +struct btrfs_ioctl_quota_ctl_args; +struct btrfs_trans_handle; +struct btrfs_delayed_ref_root; +struct btrfs_inode; /* * Btrfs qgroup overview @@ -321,7 +331,6 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, struct btrfs_qgroup_limit *limit); int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); -struct btrfs_delayed_extent_op; int btrfs_qgroup_trace_extent_nolock( struct btrfs_fs_info *fs_info, @@ -341,6 +350,9 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, struct ulist *new_roots); int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans); int btrfs_run_qgroups(struct btrfs_trans_handle *trans); +int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_inherit *inherit, + size_t size); int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, u64 objectid, u64 inode_rootid, struct btrfs_qgroup_inherit *inherit); diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 9589362acfbf..6af6b4b9a32e 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -11,7 +11,6 @@ #include "disk-io.h" #include "raid-stripe-tree.h" #include "volumes.h" -#include "misc.h" #include "print-tree.h" int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length) diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h index cdb58b38fcb5..c9c258f84903 100644 --- a/fs/btrfs/raid-stripe-tree.h +++ b/fs/btrfs/raid-stripe-tree.h @@ -6,6 +6,10 @@ #ifndef BTRFS_RAID_STRIPE_TREE_H #define BTRFS_RAID_STRIPE_TREE_H +#include <linux/types.h> +#include <uapi/linux/btrfs_tree.h> +#include "fs.h" + #define BTRFS_RST_SUPP_BLOCK_GROUP_MASK (BTRFS_BLOCK_GROUP_DUP | \ BTRFS_BLOCK_GROUP_RAID1_MASK | \ BTRFS_BLOCK_GROUP_RAID0 | \ @@ -13,6 +17,7 @@ struct btrfs_io_context; struct btrfs_io_stripe; +struct btrfs_fs_info; struct 
btrfs_ordered_extent; struct btrfs_trans_handle; diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 792c8e17c31d..6f4a9cfeea44 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -14,7 +14,6 @@ #include <linux/raid/xor.h> #include <linux/mm.h> #include "messages.h" -#include "misc.h" #include "ctree.h" #include "disk-io.h" #include "volumes.h" @@ -918,6 +917,13 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, */ ASSERT(stripe_nsectors <= BITS_PER_LONG); + /* + * Real stripes must be between 2 (2 disks RAID5, aka RAID1) and 256 + * (limited by u8). + */ + ASSERT(real_stripes >= 2); + ASSERT(real_stripes <= U8_MAX); + rbio = kzalloc(sizeof(*rbio), GFP_NOFS); if (!rbio) return ERR_PTR(-ENOMEM); @@ -955,6 +961,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, ASSERT(btrfs_nr_parity_stripes(bioc->map_type)); rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type); + ASSERT(rbio->nr_data > 0); return rbio; } @@ -1181,6 +1188,26 @@ static inline void bio_list_put(struct bio_list *bio_list) bio_put(bio); } +static void assert_rbio(struct btrfs_raid_bio *rbio) +{ + if (!IS_ENABLED(CONFIG_BTRFS_DEBUG) || + !IS_ENABLED(CONFIG_BTRFS_ASSERT)) + return; + + /* + * At least two stripes (2 disks RAID5), and since real_stripes is U8, + * we won't go beyond 256 disks anyway. + */ + ASSERT(rbio->real_stripes >= 2); + ASSERT(rbio->nr_data > 0); + + /* + * This is another check to make sure nr data stripes is smaller + * than total stripes. + */ + ASSERT(rbio->nr_data < rbio->real_stripes); +} + /* Generate PQ for one vertical stripe. */ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) { @@ -1212,6 +1239,7 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; + assert_rbio(rbio); raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, pointers); } else { @@ -2473,6 +2501,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) } if (has_qstripe) { + assert_rbio(rbio); /* RAID6, call the library function to fill in our P/Q */ raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, pointers); diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 470213688872..0d7b4c2fb6ae 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -7,9 +7,18 @@ #ifndef BTRFS_RAID56_H #define BTRFS_RAID56_H +#include <linux/types.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/bio.h> +#include <linux/refcount.h> #include <linux/workqueue.h> #include "volumes.h" +struct page; +struct sector_ptr; +struct btrfs_fs_info; + enum btrfs_rbio_ops { BTRFS_RBIO_WRITE, BTRFS_RBIO_READ_REBUILD, diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h index 5c2b66d155ef..1c2d7cb1fe6f 100644 --- a/fs/btrfs/rcu-string.h +++ b/fs/btrfs/rcu-string.h @@ -6,6 +6,12 @@ #ifndef BTRFS_RCU_STRING_H #define BTRFS_RCU_STRING_H +#include <linux/types.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/rcupdate.h> +#include <linux/printk.h> + struct rcu_string { struct rcu_head rcu; char str[]; diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 6486f0d7e993..8c4fc98ca9ce 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -889,8 +889,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, out_unlock: spin_unlock(&fs_info->ref_verify_lock); out: - if (ret) + if (ret) { + btrfs_free_ref_cache(fs_info); btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); + } return ret; } @@ 
-1021,8 +1023,8 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) } } if (ret) { - btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); btrfs_free_ref_cache(fs_info); + btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); } btrfs_free_path(path); return ret; diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h index 855de37719b5..3511e1a5c96b 100644 --- a/fs/btrfs/ref-verify.h +++ b/fs/btrfs/ref-verify.h @@ -6,7 +6,16 @@ #ifndef BTRFS_REF_VERIFY_H #define BTRFS_REF_VERIFY_H +#include <linux/types.h> +#include <linux/rbtree_types.h> + +struct btrfs_fs_info; +struct btrfs_ref; + #ifdef CONFIG_BTRFS_FS_REF_VERIFY + +#include <linux/spinlock.h> + int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info); void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info); int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index ae90894dc7dc..08d0fb46ceec 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -174,7 +174,7 @@ static int clone_copy_inline_extent(struct inode *dst, char *inline_data, struct btrfs_trans_handle **trans_out) { - struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dst); struct btrfs_root *root = BTRFS_I(dst)->root; const u64 aligned_end = ALIGN(new_key->offset + datal, fs_info->sectorsize); @@ -337,7 +337,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, const u64 off, const u64 olen, const u64 olen_aligned, const u64 destoff, int no_time_update) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_path *path = NULL; struct extent_buffer *leaf; struct btrfs_trans_handle *trans; @@ -663,7 +663,7 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, struct inode *dst, u64 dst_loff) { struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info; - const u64 bs = fs_info->sb->s_blocksize; + const u64 bs = fs_info->sectorsize; int ret; /* @@ -726,11 +726,11 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, { struct inode *inode = file_inode(file); struct inode *src = file_inode(file_src); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); int ret; int wb_ret; u64 len = olen; - u64 bs = fs_info->sb->s_blocksize; + u64 bs = fs_info->sectorsize; /* * VFS's generic_remap_file_range_prep() protects us from cloning the @@ -796,7 +796,7 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); - u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize; + u64 bs = BTRFS_I(inode_out)->root->fs_info->sectorsize; u64 wb_len; int ret; diff --git a/fs/btrfs/reflink.h b/fs/btrfs/reflink.h index ecb309b4dad0..1e291f7d85c4 100644 --- a/fs/btrfs/reflink.h +++ b/fs/btrfs/reflink.h @@ -3,7 +3,9 @@ #ifndef BTRFS_REFLINK_H #define BTRFS_REFLINK_H -#include <linux/fs.h> +#include <linux/types.h> + +struct file; loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index abe594f77f99..f96f267fb4aa 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -523,7 +523,8 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree( if (handle_useless_nodes(rc, node)) node = NULL; out: - btrfs_backref_iter_free(iter); + 
btrfs_free_path(iter->path); + kfree(iter); btrfs_free_path(path); if (err) { btrfs_backref_error_cleanup(cache, node); @@ -2987,7 +2988,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, const struct file_extent_cluster *cluster, int *cluster_nr, unsigned long page_index) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); u64 offset = BTRFS_I(inode)->index_cnt; const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h index 5fb60f2deb53..788c86d8633a 100644 --- a/fs/btrfs/relocation.h +++ b/fs/btrfs/relocation.h @@ -3,6 +3,15 @@ #ifndef BTRFS_RELOCATION_H #define BTRFS_RELOCATION_H +#include <linux/types.h> + +struct extent_buffer; +struct btrfs_fs_info; +struct btrfs_root; +struct btrfs_trans_handle; +struct btrfs_ordered_extent; +struct btrfs_pending_snapshot; + int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start); int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 603ad1459368..4bb538a372ce 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -10,7 +10,6 @@ #include "messages.h" #include "transaction.h" #include "disk-io.h" -#include "print-tree.h" #include "qgroup.h" #include "space-info.h" #include "accessors.h" @@ -82,7 +81,14 @@ int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key, if (ret > 0) goto out; } else { - BUG_ON(ret == 0); /* Logical error */ + /* + * Key with offset -1 found, there would have to exist a root + * with such id, but this is out of the valid range. + */ + if (ret == 0) { + ret = -EUCLEAN; + goto out; + } if (path->slots[0] == 0) goto out; path->slots[0]--; @@ -323,8 +329,11 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, key, path, -1, 1); if (ret < 0) goto out; - - BUG_ON(ret != 0); + if (ret != 0) { + /* The root must exist but we did not find it by the key. 
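The root-tree hunks here replace BUG_ON() on "impossible" btrfs_search_slot() results with -EUCLEAN, the errno btrfs uses for on-disk corruption, so a malformed image degrades into an error instead of a crash. The shape of the conversion, distilled from the hunk above with the reasoning spelled out:

	ret = btrfs_search_slot(trans, root, key, path, -1, 1);
	if (ret < 0)
		goto out;
	if (ret != 0) {
		/*
		 * The root item must exist; not finding it by its exact key
		 * indicates on-disk corruption, not a programming error, so
		 * return -EUCLEAN instead of crashing via BUG_ON().
		 */
		ret = -EUCLEAN;
		goto out;
	}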
*/ + ret = -EUCLEAN; + goto out; + } ret = btrfs_del_item(trans, root, path); out: diff --git a/fs/btrfs/root-tree.h b/fs/btrfs/root-tree.h index 8b2c3859e464..6f929cf3bd49 100644 --- a/fs/btrfs/root-tree.h +++ b/fs/btrfs/root-tree.h @@ -3,7 +3,17 @@ #ifndef BTRFS_ROOT_TREE_H #define BTRFS_ROOT_TREE_H +#include <linux/types.h> + struct fscrypt_str; +struct extent_buffer; +struct btrfs_key; +struct btrfs_root; +struct btrfs_root_item; +struct btrfs_path; +struct btrfs_fs_info; +struct btrfs_block_rsv; +struct btrfs_trans_handle; int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a01807cbd4d4..c4bd0e60db59 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1098,12 +1098,22 @@ out: static void scrub_read_endio(struct btrfs_bio *bbio) { struct scrub_stripe *stripe = bbio->private; + struct bio_vec *bvec; + int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); + int num_sectors; + u32 bio_size = 0; + int i; + + ASSERT(sector_nr < stripe->nr_sectors); + bio_for_each_bvec_all(bvec, &bbio->bio, i) + bio_size += bvec->bv_len; + num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits; if (bbio->bio.bi_status) { - bitmap_set(&stripe->io_error_bitmap, 0, stripe->nr_sectors); - bitmap_set(&stripe->error_bitmap, 0, stripe->nr_sectors); + bitmap_set(&stripe->io_error_bitmap, sector_nr, num_sectors); + bitmap_set(&stripe->error_bitmap, sector_nr, num_sectors); } else { - bitmap_clear(&stripe->io_error_bitmap, 0, stripe->nr_sectors); + bitmap_clear(&stripe->io_error_bitmap, sector_nr, num_sectors); } bio_put(&bbio->bio); if (atomic_dec_and_test(&stripe->pending_io)) { @@ -1380,8 +1390,15 @@ static int find_first_extent_item(struct btrfs_root *extent_root, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; + if (ret == 0) { + /* + * Key with offset -1 found, there would have to exist an extent + * item with such offset, but this is out of the valid range. + */ + btrfs_release_path(path); + return -EUCLEAN; + } - ASSERT(ret > 0); /* * Here we intentionally pass 0 as @min_objectid, as there could be * an extent item starting before @search_start. @@ -1636,6 +1653,9 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct btrfs_bio *bbio = NULL; + unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start + + stripe->bg->length - stripe->logical) >> + fs_info->sectorsize_bits; u64 stripe_len = BTRFS_STRIPE_LEN; int mirror = stripe->mirror_num; int i; @@ -1646,6 +1666,10 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, struct page *page = scrub_stripe_get_page(stripe, i); unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i); + /* We're beyond the chunk boundary, no need to read anymore. */ + if (i >= nr_sectors) + break; + /* The current sector cannot be merged, submit the bio. 
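scrub_read_endio() above now marks only the sectors its bio actually carried, rather than the whole stripe, by summing the bvec lengths and shifting by sectorsize_bits. A standalone model of that arithmetic (4 KiB sectors would mean sectorsize_bits = 12):

#include <stdint.h>

/*
 * Sum the per-segment lengths of a bio and convert the total to a
 * sector count; only that many bits get set or cleared in the error
 * bitmaps, starting at the bio's first sector.
 */
static unsigned int bio_sector_count(const uint32_t *seg_lens, int nr_segs,
				     unsigned int sectorsize_bits)
{
	uint32_t bio_size = 0;

	for (int i = 0; i < nr_segs; i++)
		bio_size += seg_lens[i];
	return bio_size >> sectorsize_bits;
}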
*/ if (bbio && ((i > 0 && @@ -1701,6 +1725,9 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, { struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_bio *bbio; + unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start + + stripe->bg->length - stripe->logical) >> + fs_info->sectorsize_bits; int mirror = stripe->mirror_num; ASSERT(stripe->bg); @@ -1715,14 +1742,16 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info, scrub_read_endio, stripe); - /* Read the whole stripe. */ bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT; - for (int i = 0; i < BTRFS_STRIPE_LEN >> PAGE_SHIFT; i++) { + /* Read the whole range inside the chunk boundary. */ + for (unsigned int cur = 0; cur < nr_sectors; cur++) { + struct page *page = scrub_stripe_get_page(stripe, cur); + unsigned int pgoff = scrub_stripe_get_page_offset(stripe, cur); int ret; - ret = bio_add_page(&bbio->bio, stripe->pages[i], PAGE_SIZE, 0); + ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); /* We should have allocated enough bio vectors. */ - ASSERT(ret == PAGE_SIZE); + ASSERT(ret == fs_info->sectorsize); } atomic_inc(&stripe->pending_io); diff --git a/fs/btrfs/scrub.h b/fs/btrfs/scrub.h index 7639103ebf9d..f0df597b75c7 100644 --- a/fs/btrfs/scrub.h +++ b/fs/btrfs/scrub.h @@ -3,6 +3,12 @@ #ifndef BTRFS_SCRUB_H #define BTRFS_SCRUB_H +#include <linux/types.h> + +struct btrfs_fs_info; +struct btrfs_device; +struct btrfs_scrub_progress; + int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, int readonly, int is_dev_replace); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 4e36550618e5..50b4a76ac88e 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -25,7 +25,6 @@ #include "btrfs_inode.h" #include "transaction.h" #include "compression.h" -#include "xattr.h" #include "print-tree.h" #include "accessors.h" #include "dir-item.h" @@ -777,7 +776,12 @@ static int begin_cmd(struct send_ctx *sctx, int cmd) if (WARN_ON(!sctx->send_buf)) return -EINVAL; - BUG_ON(sctx->send_size); + if (unlikely(sctx->send_size != 0)) { + btrfs_err(sctx->send_root->fs_info, + "send: command header buffer not empty cmd %d offset %llu", + cmd, sctx->send_off); + return -EINVAL; + } sctx->send_size += sizeof(*hdr); hdr = (struct btrfs_cmd_header *)sctx->send_buf; @@ -1070,7 +1074,15 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, ret = PTR_ERR(start); goto out; } - BUG_ON(start < p->buf); + if (unlikely(start < p->buf)) { + btrfs_err(root->fs_info, + "send: path ref buffer underflow for key (%llu %u %llu)", + found_key->objectid, + found_key->type, + found_key->offset); + ret = -EINVAL; + goto out; + } } p->start = start; } else { @@ -1406,7 +1418,7 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, struct btrfs_lru_cache_entry *raw_entry; struct backref_cache_entry *entry; - if (btrfs_lru_cache_size(&sctx->backref_cache) == 0) + if (sctx->backref_cache.size == 0) return false; /* @@ -1504,7 +1516,7 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, * transaction handle or holding fs_info->commit_root_sem, so no need * to take any lock here. 
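Both scrub read paths above now clamp the stripe to the block group's end before building bios, so the last stripe of a chunk is never read past the chunk boundary. The clamp as a standalone function, with BTRFS_STRIPE_LEN's 64 KiB value inlined:

#include <stdint.h>

#define STRIPE_LEN	(64 * 1024)	/* BTRFS_STRIPE_LEN */

/*
 * Number of sectors of this stripe that actually lie inside the block
 * group: min(STRIPE_LEN, bg_end - stripe_logical) >> sectorsize_bits.
 */
static unsigned int stripe_sectors_in_bg(uint64_t bg_start, uint64_t bg_len,
					 uint64_t stripe_logical,
					 unsigned int sectorsize_bits)
{
	uint64_t remaining = bg_start + bg_len - stripe_logical;
	uint64_t len = remaining < STRIPE_LEN ? remaining : STRIPE_LEN;

	return (unsigned int)(len >> sectorsize_bits);
}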
*/ - if (btrfs_lru_cache_size(&sctx->backref_cache) == 1) + if (sctx->backref_cache.size == 1) sctx->backref_cache_last_reloc_trans = fs_info->last_reloc_trans; } @@ -2809,8 +2821,7 @@ static int cache_dir_utimes(struct send_ctx *sctx, u64 dir, u64 gen) static int trim_dir_utimes_cache(struct send_ctx *sctx) { - while (btrfs_lru_cache_size(&sctx->dir_utimes_cache) > - SEND_MAX_DIR_UTIMES_CACHE_SIZE) { + while (sctx->dir_utimes_cache.size > SEND_MAX_DIR_UTIMES_CACHE_SIZE) { struct btrfs_lru_cache_entry *lru; int ret; @@ -4182,7 +4193,13 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * This should never happen as the root dir always has the same ref * which is always '..' */ - BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID); + if (unlikely(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID)) { + btrfs_err(fs_info, + "send: unexpected inode %llu in process_recorded_refs()", + sctx->cur_ino); + ret = -EINVAL; + goto out; + } valid_path = fs_path_alloc(); if (!valid_path) { @@ -6140,7 +6157,7 @@ static int send_write_or_clone(struct send_ctx *sctx, int ret = 0; u64 offset = key->offset; u64 end; - u64 bs = sctx->send_root->fs_info->sb->s_blocksize; + u64 bs = sctx->send_root->fs_info->sectorsize; end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size); if (offset >= end) @@ -6458,21 +6475,18 @@ static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, if (sctx->cur_ino != key->objectid || !need_send_hole(sctx)) return 0; - if (sctx->cur_inode_last_extent == (u64)-1) { - ret = get_last_extent(sctx, key->offset - 1); - if (ret) - return ret; - } - - if (path->slots[0] == 0 && - sctx->cur_inode_last_extent < key->offset) { - /* - * We might have skipped entire leafs that contained only - * file extent items for our current inode. These leafs have - * a generation number smaller (older) than the one in the - * current leaf and the leaf our last extent came from, and - * are located between these 2 leafs. - */ + /* + * Get last extent's end offset (exclusive) if we haven't determined it + * yet (we're processing the first file extent item that is new), or if + * we're at the first slot of a leaf and the last extent's end is less + * than the current extent's offset, because we might have skipped + * entire leaves that contained only file extent items for our current + * inode. These leaves have a generation number smaller (older) than the + * one in the current leaf and the leaf our last extent came from, and + * are located between these 2 leaves. + */ + if ((sctx->cur_inode_last_extent == (u64)-1) || + (path->slots[0] == 0 && sctx->cur_inode_last_extent < key->offset)) { ret = get_last_extent(sctx, key->offset - 1); if (ret) return ret; @@ -6705,11 +6719,20 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) if (ret) goto out; } - if (sctx->cur_inode_last_extent < - sctx->cur_inode_size) { - ret = send_hole(sctx, sctx->cur_inode_size); - if (ret) + if (sctx->cur_inode_last_extent < sctx->cur_inode_size) { + ret = range_is_hole_in_parent(sctx, + sctx->cur_inode_last_extent, + sctx->cur_inode_size); + if (ret < 0) { goto out; + } else if (ret == 0) { + ret = send_hole(sctx, sctx->cur_inode_size); + if (ret < 0) + goto out; + } else { + /* Range is already a hole, skip. 
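The finish_inode_if_needed() change above consults range_is_hole_in_parent() before emitting a trailing hole, using the common kernel tri-state return shape: negative for failure, 0 for "the parent has data there, punch the hole", positive for "already a hole, skip". The dispatch, distilled from the hunk:

	ret = range_is_hole_in_parent(sctx, sctx->cur_inode_last_extent,
				      sctx->cur_inode_size);
	if (ret < 0)
		goto out;				/* lookup failed */
	else if (ret == 0)
		ret = send_hole(sctx, sctx->cur_inode_size);	/* punch it */
	else
		ret = 0;				/* already a hole: skip */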
*/ + ret = 0; + } } } if (need_truncate) { @@ -7420,8 +7443,8 @@ static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen u64 reada_done = 0; lockdep_assert_held_read(&parent->fs_info->commit_root_sem); + ASSERT(*level != 0); - BUG_ON(*level == 0); eb = btrfs_read_node_slot(parent, slot); if (IS_ERR(eb)) return PTR_ERR(eb); @@ -8111,7 +8134,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) } if (arg->flags & ~BTRFS_SEND_FLAG_MASK) { - ret = -EINVAL; + ret = -EOPNOTSUPP; goto out; } @@ -8205,8 +8228,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) goto out; } - sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), - arg->clone_sources_count + 1, + sctx->clone_roots = kvcalloc(arg->clone_sources_count + 1, + sizeof(*sctx->clone_roots), GFP_KERNEL); if (!sctx->clone_roots) { ret = -ENOMEM; diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 4f5509cb1803..dd1c9f02b011 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -8,6 +8,11 @@ #define BTRFS_SEND_H #include <linux/types.h> +#include <linux/sizes.h> +#include <linux/align.h> + +struct inode; +struct btrfs_ioctl_send_args; #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" /* Conditional support for the upcoming protocol version. */ @@ -25,9 +30,6 @@ #define BTRFS_SEND_BUF_SIZE_V1 SZ_64K #define BTRFS_SEND_BUF_SIZE_V2 ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE) -struct inode; -struct btrfs_ioctl_send_args; - enum btrfs_tlv_type { BTRFS_TLV_U8, BTRFS_TLV_U16, diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 571bb13587d5..d620323d08ea 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -9,7 +9,6 @@ #include "ordered-data.h" #include "transaction.h" #include "block-group.h" -#include "zoned.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" @@ -856,7 +855,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info) { - u64 global_rsv_size = fs_info->global_block_rsv.reserved; + const u64 global_rsv_size = btrfs_block_rsv_reserved(&fs_info->global_block_rsv); u64 ordered, delalloc; u64 thresh; u64 used; @@ -956,8 +955,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, ordered = percpu_counter_read_positive(&fs_info->ordered_bytes) >> 1; delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes); if (ordered >= delalloc) - used += fs_info->delayed_refs_rsv.reserved + - fs_info->delayed_block_rsv.reserved; + used += btrfs_block_rsv_reserved(&fs_info->delayed_refs_rsv) + + btrfs_block_rsv_reserved(&fs_info->delayed_block_rsv); else used += space_info->bytes_may_use - global_rsv_size; @@ -1173,7 +1172,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) enum btrfs_flush_state flush; u64 delalloc_size = 0; u64 to_reclaim, block_rsv_size; - u64 global_rsv_size = global_rsv->reserved; + const u64 global_rsv_size = btrfs_block_rsv_reserved(global_rsv); loops++; @@ -1185,9 +1184,9 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) * assume it's tied up in delalloc reservations. 
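The send.c allocation fix above swaps kvcalloc()'s first two arguments into their documented order: like calloc(3), the element count comes first and the element size second, so the allocator's built-in overflow check multiplies the values it expects. The same rule in plain calloc(), with a simplified stand-in for the clone-root element type:

#include <stddef.h>
#include <stdlib.h>

struct clone_root_entry {	/* stand-in, not the kernel's struct clone_root */
	unsigned long long root_id;
	long long offset;
};

static struct clone_root_entry *alloc_clone_roots(size_t clone_sources_count)
{
	/* count first, element size second, matching calloc(3)'s contract */
	return calloc(clone_sources_count + 1, sizeof(struct clone_root_entry));
}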
*/ block_rsv_size = global_rsv_size + - delayed_block_rsv->reserved + - delayed_refs_rsv->reserved + - trans_rsv->reserved; + btrfs_block_rsv_reserved(delayed_block_rsv) + + btrfs_block_rsv_reserved(delayed_refs_rsv) + + btrfs_block_rsv_reserved(trans_rsv); if (block_rsv_size < space_info->bytes_may_use) delalloc_size = space_info->bytes_may_use - block_rsv_size; @@ -1207,16 +1206,16 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) to_reclaim = delalloc_size; flush = FLUSH_DELALLOC; } else if (space_info->bytes_pinned > - (delayed_block_rsv->reserved + - delayed_refs_rsv->reserved)) { + (btrfs_block_rsv_reserved(delayed_block_rsv) + + btrfs_block_rsv_reserved(delayed_refs_rsv))) { to_reclaim = space_info->bytes_pinned; flush = COMMIT_TRANS; - } else if (delayed_block_rsv->reserved > - delayed_refs_rsv->reserved) { - to_reclaim = delayed_block_rsv->reserved; + } else if (btrfs_block_rsv_reserved(delayed_block_rsv) > + btrfs_block_rsv_reserved(delayed_refs_rsv)) { + to_reclaim = btrfs_block_rsv_reserved(delayed_block_rsv); flush = FLUSH_DELAYED_ITEMS_NR; } else { - to_reclaim = delayed_refs_rsv->reserved; + to_reclaim = btrfs_block_rsv_reserved(delayed_refs_rsv); flush = FLUSH_DELAYED_REFS_NR; } diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 92c595fed1b0..a733458fd13b 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -4,8 +4,17 @@ #define BTRFS_SPACE_INFO_H #include <trace/events/btrfs.h> +#include <linux/spinlock.h> +#include <linux/list.h> +#include <linux/kobject.h> +#include <linux/lockdep.h> +#include <linux/wait.h> +#include <linux/rwsem.h> #include "volumes.h" +struct btrfs_fs_info; +struct btrfs_block_group; + /* * Different levels for to flush space when doing space reservations. * diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 93511d54abf8..54736f6238e6 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -111,6 +111,9 @@ void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sector subpage_info->checked_offset = cur; cur += nr_bits; + subpage_info->locked_offset = cur; + cur += nr_bits; + subpage_info->total_nr_bits = cur; } @@ -237,28 +240,58 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, start + len <= folio_pos(folio) + PAGE_SIZE); } +#define subpage_calc_start_bit(fs_info, folio, name, start, len) \ +({ \ + unsigned int start_bit; \ + \ + btrfs_subpage_assert(fs_info, folio, start, len); \ + start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \ + start_bit += fs_info->subpage_info->name##_offset; \ + start_bit; \ +}) + void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage = folio_get_private(folio); + const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); const int nbits = len >> fs_info->sectorsize_bits; + unsigned long flags; + btrfs_subpage_assert(fs_info, folio, start, len); + spin_lock_irqsave(&subpage->lock, flags); + /* + * Even though it's just for reading the page, no one should have + * locked the subpage range. 
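The space-info hunks replace direct reads of rsv->reserved with a btrfs_block_rsv_reserved() helper. The helper's body is not part of this diff; the natural implementation reads the field under the reservation's lock so callers can't observe a torn or stale value. A pthread-based model of that assumed shape, not the kernel implementation:

#include <pthread.h>
#include <stdint.h>

struct block_rsv {
	pthread_mutex_t lock;	/* the kernel struct uses a spinlock */
	uint64_t reserved;
};

/*
 * Assumed accessor shape: take the lock, read, release. A sketch only;
 * the diff does not show the kernel's definition.
 */
static uint64_t block_rsv_reserved(struct block_rsv *rsv)
{
	uint64_t ret;

	pthread_mutex_lock(&rsv->lock);
	ret = rsv->reserved;
	pthread_mutex_unlock(&rsv->lock);
	return ret;
}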
+ */ + ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); + bitmap_set(subpage->bitmaps, start_bit, nbits); atomic_add(nbits, &subpage->readers); + spin_unlock_irqrestore(&subpage->lock, flags); } void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage = folio_get_private(folio); + const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); const int nbits = len >> fs_info->sectorsize_bits; + unsigned long flags; bool is_data; bool last; btrfs_subpage_assert(fs_info, folio, start, len); is_data = is_data_inode(folio->mapping->host); + + spin_lock_irqsave(&subpage->lock, flags); + + /* The range should have already been locked. */ + ASSERT(bitmap_test_range_all_set(subpage->bitmaps, start_bit, nbits)); ASSERT(atomic_read(&subpage->readers) >= nbits); + + bitmap_clear(subpage->bitmaps, start_bit, nbits); last = atomic_sub_and_test(nbits, &subpage->readers); /* @@ -270,6 +303,7 @@ void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, */ if (is_data && last) folio_unlock(folio); + spin_unlock_irqrestore(&subpage->lock, flags); } static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) @@ -290,28 +324,38 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) orig_start + orig_len) - *start; } -void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) +static void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage = folio_get_private(folio); + const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); const int nbits = (len >> fs_info->sectorsize_bits); + unsigned long flags; int ret; btrfs_subpage_assert(fs_info, folio, start, len); + spin_lock_irqsave(&subpage->lock, flags); ASSERT(atomic_read(&subpage->readers) == 0); + ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); + bitmap_set(subpage->bitmaps, start_bit, nbits); ret = atomic_add_return(nbits, &subpage->writers); ASSERT(ret == nbits); + spin_unlock_irqrestore(&subpage->lock, flags); } -bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) +static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage = folio_get_private(folio); + const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); const int nbits = (len >> fs_info->sectorsize_bits); + unsigned long flags; + bool last; btrfs_subpage_assert(fs_info, folio, start, len); + spin_lock_irqsave(&subpage->lock, flags); /* * We have call sites passing @lock_page into * extent_clear_unlock_delalloc() for compression path. @@ -319,11 +363,18 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, * This @locked_page is locked by plain lock_page(), thus its * subpage::writers is 0. Handle them in a special way. */ - if (atomic_read(&subpage->writers) == 0) + if (atomic_read(&subpage->writers) == 0) { + spin_unlock_irqrestore(&subpage->lock, flags); return true; + } ASSERT(atomic_read(&subpage->writers) >= nbits); - return atomic_sub_and_test(nbits, &subpage->writers); + /* The target range should have been locked. 
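The subpage reader and writer paths above now mirror the lock state in a per-sector "locked" bitmap, with every transition made under subpage->lock: assert the range is in the expected state, flip the bits, then update the atomic counter. A minimal single-word userspace model of that bookkeeping, assuming start_bit + nbits <= 64:

#include <assert.h>
#include <stdint.h>

/*
 * One bit per sector inside the page. A range must be all-zero before
 * locking and all-set before unlocking, as the patch's ASSERTs require.
 */
static void lock_range(uint64_t *bitmap, unsigned int start_bit,
		       unsigned int nbits)
{
	uint64_t mask;

	assert(start_bit + nbits <= 64);
	mask = (nbits >= 64 ? ~0ULL : ((1ULL << nbits) - 1)) << start_bit;
	assert((*bitmap & mask) == 0);	/* nobody holds these sectors */
	*bitmap |= mask;
}

static void unlock_range(uint64_t *bitmap, unsigned int start_bit,
			 unsigned int nbits)
{
	uint64_t mask;

	assert(start_bit + nbits <= 64);
	mask = (nbits >= 64 ? ~0ULL : ((1ULL << nbits) - 1)) << start_bit;
	assert((*bitmap & mask) == mask);	/* range was locked */
	*bitmap &= ~mask;
}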
*/ + ASSERT(bitmap_test_range_all_set(subpage->bitmaps, start_bit, nbits)); + bitmap_clear(subpage->bitmaps, start_bit, nbits); + last = atomic_sub_and_test(nbits, &subpage->writers); + spin_unlock_irqrestore(&subpage->lock, flags); + return last; } /* @@ -365,16 +416,6 @@ void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, folio_unlock(folio); } -#define subpage_calc_start_bit(fs_info, folio, name, start, len) \ -({ \ - unsigned int start_bit; \ - \ - btrfs_subpage_assert(fs_info, folio, start, len); \ - start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \ - start_bit += fs_info->subpage_info->name##_offset; \ - start_bit; \ -}) - #define subpage_test_bitmap_all_set(fs_info, subpage, name) \ bitmap_test_range_all_set(subpage->bitmaps, \ fs_info->subpage_info->name##_offset, \ @@ -475,7 +516,8 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - folio_start_writeback(folio); + if (!folio_test_writeback(folio)) + folio_start_writeback(folio); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -750,6 +792,7 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, GET_SUBPAGE_BITMAP(subpage, subpage_info, writeback, &writeback_bitmap); GET_SUBPAGE_BITMAP(subpage, subpage_info, ordered, &ordered_bitmap); GET_SUBPAGE_BITMAP(subpage, subpage_info, checked, &checked_bitmap); + GET_SUBPAGE_BITMAP(subpage, subpage_info, locked, &locked_bitmap); spin_unlock_irqrestore(&subpage->lock, flags); dump_page(folio_page(folio, 0), "btrfs subpage dump"); diff --git a/fs/btrfs/subpage.h index 793c2b314a58..b6dc013b0fdc 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -4,6 +4,11 @@ #define BTRFS_SUBPAGE_H #include <linux/spinlock.h> +#include <linux/atomic.h> + +struct address_space; +struct folio; +struct btrfs_fs_info; + /* * Extra info for subpage bitmap. @@ -28,7 +33,7 @@ struct btrfs_subpage_info { unsigned int total_nr_bits; /* - * *_start indicates where the bitmap starts, the length is always + * *_offset indicates where the bitmap starts, the length is always * @bitmap_size, which is calculated from PAGE_SIZE / sectorsize. */ unsigned int uptodate_offset; @@ -36,6 +41,16 @@ struct btrfs_subpage_info { unsigned int writeback_offset; unsigned int ordered_offset; unsigned int checked_offset; + + /* + * For locked bitmaps, normally it's the subpage representation of the + * folio Locked flag, but metadata is different: + * + * - Metadata doesn't really lock the folio + * It's just to prevent page::private from being cleared before the + * last end_page_read().
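The offsets in btrfs_subpage_info are handed out sequentially by btrfs_init_subpage_info(), as the subpage.c hunk further up shows for locked_offset. A sketch of that packing, assuming the six bitmaps present after this patch (uptodate, dirty, writeback, ordered, checked, locked; the dirty bitmap is not visible in the hunk but exists in the struct):

#include <stdio.h>

struct subpage_layout {
        unsigned int uptodate, dirty, writeback, ordered, checked, locked;
        unsigned int total_nr_bits;
};

/* Each named bitmap gets nr_bits consecutive bits; cur just accumulates. */
static void init_layout(struct subpage_layout *l, unsigned int page_size,
                        unsigned int sectorsize_bits)
{
        unsigned int nr_bits = page_size >> sectorsize_bits;
        unsigned int cur = 0;

        l->uptodate = cur;  cur += nr_bits;
        l->dirty = cur;     cur += nr_bits;
        l->writeback = cur; cur += nr_bits;
        l->ordered = cur;   cur += nr_bits;
        l->checked = cur;   cur += nr_bits;
        l->locked = cur;    cur += nr_bits;
        l->total_nr_bits = cur;
}

int main(void)
{
        struct subpage_layout l;

        init_layout(&l, 65536, 12);     /* 64K page, 4K sectors */
        printf("locked at bit %u, %u bits total\n", l.locked, l.total_nr_bits);
        return 0;
}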
+ */ + unsigned int locked_offset; }; /* @@ -93,10 +108,6 @@ void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); -void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); -bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 896acfda1789..7e44ccaf348f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -34,13 +34,11 @@ #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "print-tree.h" #include "props.h" #include "xattr.h" #include "bio.h" #include "export.h" #include "compression.h" -#include "rcu-string.h" #include "dev-replace.h" #include "free-space-cache.h" #include "backref.h" @@ -1457,6 +1455,14 @@ static int btrfs_reconfigure(struct fs_context *fc) btrfs_info_to_ctx(fs_info, &old_ctx); + /* + * This is our "bind mount" trick, we don't want to allow the user to do + * anything other than mount a different ro/rw and a different subvol, + * all of the mount options should be maintained. + */ + if (mount_reconfigure) + ctx->mount_opt = old_ctx.mount_opt; + sync_filesystem(sb); set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); @@ -1759,7 +1765,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = 0; buf->f_type = BTRFS_SUPER_MAGIC; - buf->f_bsize = dentry->d_sb->s_blocksize; + buf->f_bsize = fs_info->sectorsize; buf->f_namelen = BTRFS_NAME_LEN; /* We treat it as constant endianness (it doesn't matter _which_) @@ -2195,7 +2201,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, vol = memdup_user((void __user *)arg, sizeof(*vol)); if (IS_ERR(vol)) return PTR_ERR(vol); - vol->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args_path(vol); + if (ret < 0) + goto out; switch (cmd) { case BTRFS_IOC_SCAN_DEV: @@ -2237,6 +2245,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, break; } +out: kfree(vol); return ret; } diff --git a/fs/btrfs/super.h b/fs/btrfs/super.h index f18253ca280d..cbcab434b5ec 100644 --- a/fs/btrfs/super.h +++ b/fs/btrfs/super.h @@ -3,6 +3,13 @@ #ifndef BTRFS_SUPER_H #define BTRFS_SUPER_H +#include <linux/types.h> +#include <linux/fs.h> +#include "fs.h" + +struct super_block; +struct btrfs_fs_info; + bool btrfs_check_options(struct btrfs_fs_info *info, unsigned long *mount_opt, unsigned long flags); int btrfs_sync_fs(struct super_block *sb, int wait); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 84c05246ffd8..c6387a8ddb94 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -421,7 +421,7 @@ BTRFS_ATTR(static_feature, supported_sectorsizes, static ssize_t acl_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { - return sysfs_emit(buf, "%d\n", !!IS_ENABLED(CONFIG_BTRFS_FS_POSIX_ACL)); + return sysfs_emit(buf, "%d\n", IS_ENABLED(CONFIG_BTRFS_FS_POSIX_ACL)); } BTRFS_ATTR(static_feature, acl, acl_show); @@ -1228,11 +1228,12 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); + const enum btrfs_read_policy policy = 
READ_ONCE(fs_devices->read_policy); ssize_t ret = 0; int i; for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { - if (fs_devices->read_policy == i) + if (policy == i) ret += sysfs_emit_at(buf, ret, "%s[%s]", (ret == 0 ? "" : " "), btrfs_read_policy_name[i]); @@ -1256,8 +1257,8 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { if (sysfs_streq(buf, btrfs_read_policy_name[i])) { - if (i != fs_devices->read_policy) { - fs_devices->read_policy = i; + if (i != READ_ONCE(fs_devices->read_policy)) { + WRITE_ONCE(fs_devices->read_policy, i); btrfs_info(fs_devices->fs_info, "read policy set to '%s'", btrfs_read_policy_name[i]); @@ -1306,6 +1307,47 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj, BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show, btrfs_bg_reclaim_threshold_store); +#ifdef CONFIG_BTRFS_DEBUG +static ssize_t btrfs_offload_csum_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); + + switch (READ_ONCE(fs_devices->offload_csum_mode)) { + case BTRFS_OFFLOAD_CSUM_AUTO: + return sysfs_emit(buf, "auto\n"); + case BTRFS_OFFLOAD_CSUM_FORCE_ON: + return sysfs_emit(buf, "1\n"); + case BTRFS_OFFLOAD_CSUM_FORCE_OFF: + return sysfs_emit(buf, "0\n"); + default: + WARN_ON(1); + return -EINVAL; + } +} + +static ssize_t btrfs_offload_csum_store(struct kobject *kobj, + struct kobj_attribute *a, const char *buf, + size_t len) +{ + struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); + int ret; + bool val; + + ret = kstrtobool(buf, &val); + if (ret == 0) + WRITE_ONCE(fs_devices->offload_csum_mode, + val ? BTRFS_OFFLOAD_CSUM_FORCE_ON : BTRFS_OFFLOAD_CSUM_FORCE_OFF); + else if (ret == -EINVAL && sysfs_streq(buf, "auto")) + WRITE_ONCE(fs_devices->offload_csum_mode, BTRFS_OFFLOAD_CSUM_AUTO); + else + return -EINVAL; + + return len; +} +BTRFS_ATTR_RW(, offload_csum, btrfs_offload_csum_show, btrfs_offload_csum_store); +#endif + /* * Per-filesystem information and stats. 
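The offload_csum store handler above accepts either a boolean or the literal string "auto", using the -EINVAL return of kstrtobool() to fall through to the keyword check. A user-space sketch of that tri-state parse (the accepted spellings here are a subset of what kstrtobool() takes; the names are illustrative):

#include <stdio.h>
#include <string.h>

enum csum_mode { CSUM_AUTO, CSUM_FORCE_ON, CSUM_FORCE_OFF };

/* Try a boolean first, then fall back to the keyword "auto". */
static int parse_csum_mode(const char *buf, enum csum_mode *mode)
{
        if (!strcmp(buf, "1") || !strcmp(buf, "y") || !strcmp(buf, "on")) {
                *mode = CSUM_FORCE_ON;
                return 0;
        }
        if (!strcmp(buf, "0") || !strcmp(buf, "n") || !strcmp(buf, "off")) {
                *mode = CSUM_FORCE_OFF;
                return 0;
        }
        if (!strcmp(buf, "auto")) {
                *mode = CSUM_AUTO;
                return 0;
        }
        return -1;      /* -EINVAL in the sysfs handler */
}

int main(void)
{
        enum csum_mode mode;

        return parse_csum_mode("auto", &mode) == 0 && mode == CSUM_AUTO ? 0 : 1;
}

On CONFIG_BTRFS_DEBUG builds this should correspond to writing 0, 1, or auto to the per-filesystem offload_csum attribute under /sys/fs/btrfs/.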
* @@ -1325,6 +1367,9 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, bg_reclaim_threshold), BTRFS_ATTR_PTR(, commit_stats), BTRFS_ATTR_PTR(, temp_fsid), +#ifdef CONFIG_BTRFS_DEBUG + BTRFS_ATTR_PTR(, offload_csum), +#endif NULL, }; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 86c7eef12873..e6a284c59809 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -3,8 +3,17 @@ #ifndef BTRFS_SYSFS_H #define BTRFS_SYSFS_H +#include <linux/types.h> +#include <linux/compiler_types.h> #include <linux/kobject.h> +struct btrfs_fs_info; +struct btrfs_device; +struct btrfs_fs_devices; +struct btrfs_block_group; +struct btrfs_space_info; +struct btrfs_qgroup; + enum btrfs_feature_set { FEAT_COMPAT, FEAT_COMPAT_RO, diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 25b3349595e0..865d4af4b303 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -11,6 +11,7 @@ #include "btrfs-tests.h" #include "../ctree.h" #include "../extent_io.h" +#include "../disk-io.h" #include "../btrfs_inode.h" #define PROCESS_UNLOCK (1 << 0) @@ -105,9 +106,11 @@ static void dump_extent_io_tree(const struct extent_io_tree *tree) } } -static int test_find_delalloc(u32 sectorsize) +static int test_find_delalloc(u32 sectorsize, u32 nodesize) { - struct inode *inode; + struct btrfs_fs_info *fs_info; + struct btrfs_root *root = NULL; + struct inode *inode = NULL; struct extent_io_tree *tmp; struct page *page; struct page *locked_page = NULL; @@ -121,12 +124,27 @@ static int test_find_delalloc(u32 sectorsize) test_msg("running find delalloc tests"); + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + return -ENOMEM; + } + + root = btrfs_alloc_dummy_root(fs_info); + if (IS_ERR(root)) { + test_std_err(TEST_ALLOC_ROOT); + ret = PTR_ERR(root); + goto out; + } + inode = btrfs_new_test_inode(); if (!inode) { test_std_err(TEST_ALLOC_INODE); - return -ENOMEM; + ret = -ENOMEM; + goto out; } tmp = &BTRFS_I(inode)->io_tree; + BTRFS_I(inode)->root = root; /* * Passing NULL as we don't have fs_info but tracepoints are not used @@ -316,6 +334,8 @@ out: process_page_range(inode, 0, total_dirty - 1, PROCESS_UNLOCK | PROCESS_RELEASE); iput(inode); + btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); return ret; } @@ -794,7 +814,7 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) test_msg("running extent I/O tests"); - ret = test_find_delalloc(sectorsize); + ret = test_find_delalloc(sectorsize, nodesize); if (ret) goto out; diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 9957de9f7806..99da9d34b77a 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -258,7 +258,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) /* First with no extents */ BTRFS_I(inode)->root = root; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize); if (IS_ERR(em)) { em = NULL; test_err("got an error when we shouldn't have"); @@ -278,7 +278,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) */ setup_file_extents(root, sectorsize); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, (u64)-1); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, (u64)-1); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -316,7 +316,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = 
em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -339,7 +339,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Regular extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -367,7 +367,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* The next 3 are split extents */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -396,7 +396,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -418,7 +418,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -452,7 +452,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Prealloc extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -481,7 +481,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* The next 3 are a half written prealloc extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -511,7 +511,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -544,7 +544,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -579,7 +579,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Now for the compressed extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -613,7 +613,7 @@ static noinline int test_btrfs_get_extent(u32 
sectorsize, u32 nodesize) free_extent_map(em); /* Split compressed extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -648,7 +648,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -675,7 +675,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -710,7 +710,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* A hole between regular extents but no hole extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset + 6, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset + 6, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -737,7 +737,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, SZ_4M); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -770,7 +770,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -850,7 +850,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) insert_inode_item_key(root); insert_extent(root, sectorsize, sectorsize, sectorsize, 0, sectorsize, sectorsize, BTRFS_FILE_EXTENT_REG, 0, 1); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, 2 * sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 2 * sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -872,7 +872,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) } free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize, 2 * sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, sectorsize, 2 * sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 5b3333ceef04..46e8426adf4f 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -23,12 +23,10 @@ #include "qgroup.h" #include "block-group.h" #include "space-info.h" -#include "zoned.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" -#include "defrag.h" #include "dir-item.h" #include "uuid-tree.h" #include "ioctl.h" @@ -564,56 +562,22 @@ static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info, u64 num_bytes, u64 *delayed_refs_bytes) { - struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; struct btrfs_space_info *si = 
fs_info->trans_block_rsv.space_info; - u64 extra_delayed_refs_bytes = 0; - u64 bytes; + u64 bytes = num_bytes + *delayed_refs_bytes; int ret; /* - * If there's a gap between the size of the delayed refs reserve and - * its reserved space, than some tasks have added delayed refs or bumped - * its size otherwise (due to block group creation or removal, or block - * group item update). Also try to allocate that gap in order to prevent - * using (and possibly abusing) the global reserve when committing the - * transaction. - */ - if (flush == BTRFS_RESERVE_FLUSH_ALL && - !btrfs_block_rsv_full(delayed_refs_rsv)) { - spin_lock(&delayed_refs_rsv->lock); - if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) - extra_delayed_refs_bytes = delayed_refs_rsv->size - - delayed_refs_rsv->reserved; - spin_unlock(&delayed_refs_rsv->lock); - } - - bytes = num_bytes + *delayed_refs_bytes + extra_delayed_refs_bytes; - - /* * We want to reserve all the bytes we may need all at once, so we only * do 1 enospc flushing cycle per transaction start. */ ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush); - if (ret == 0) { - if (extra_delayed_refs_bytes > 0) - btrfs_migrate_to_delayed_refs_rsv(fs_info, - extra_delayed_refs_bytes); - return 0; - } - - if (extra_delayed_refs_bytes > 0) { - bytes -= extra_delayed_refs_bytes; - ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush); - if (ret == 0) - return 0; - } /* * If we are an emergency flush, which can steal from the global block * reserve, then attempt to not reserve space for the delayed refs, as * we will consume space for them from the global block reserve. */ - if (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) { + if (ret && flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) { bytes -= *delayed_refs_bytes; *delayed_refs_bytes = 0; ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush); @@ -1868,7 +1832,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } key.offset = (u64)-1; - pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev); + pending->snap = btrfs_get_new_fs_root(fs_info, objectid, &pending->anon_dev); if (IS_ERR(pending->snap)) { ret = PTR_ERR(pending->snap); pending->snap = NULL; @@ -1993,19 +1957,6 @@ static void update_super_roots(struct btrfs_fs_info *fs_info) super->uuid_tree_generation = root_item->generation; } -int btrfs_transaction_in_commit(struct btrfs_fs_info *info) -{ - struct btrfs_transaction *trans; - int ret = 0; - - spin_lock(&info->trans_lock); - trans = info->running_transaction; - if (trans) - ret = (trans->state >= TRANS_STATE_COMMIT_START); - spin_unlock(&info->trans_lock); - return ret; -} - int btrfs_transaction_blocked(struct btrfs_fs_info *info) { struct btrfs_transaction *trans; @@ -2720,9 +2671,7 @@ void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, int __init btrfs_transaction_init(void) { - btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", - sizeof(struct btrfs_trans_handle), 0, - SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); + btrfs_trans_handle_cachep = KMEM_CACHE(btrfs_trans_handle, SLAB_TEMPORARY); if (!btrfs_trans_handle_cachep) return -ENOMEM; return 0; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 2bf8bbdfd0b3..4e451ab173b1 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -6,12 +6,27 @@ #ifndef BTRFS_TRANSACTION_H #define BTRFS_TRANSACTION_H +#include <linux/atomic.h> #include <linux/refcount.h> +#include <linux/list.h> +#include <linux/time64.h> +#include <linux/mutex.h> 
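Stepping back to btrfs_reserve_trans_metadata() above: the rework collapses the old multi-step logic into a single reservation of num_bytes plus the delayed-refs bytes, with exactly one fallback, and only for the emergency (steal) flush mode, which retries without the delayed-refs portion and lets that part be consumed from the global block reserve. A self-contained sketch of that control flow, with an all-or-nothing stand-in for btrfs_reserve_metadata_bytes():

#include <errno.h>
#include <stdio.h>

typedef unsigned long long u64;

#define FLUSH_ALL       1
#define FLUSH_ALL_STEAL 2

static u64 available = 96 * 1024;       /* pretend free metadata space */

static int reserve_metadata_bytes(u64 bytes, int flush)
{
        (void)flush;
        if (bytes > available)
                return -ENOSPC;
        available -= bytes;
        return 0;
}

static int reserve_trans_metadata(int flush, u64 num_bytes, u64 *delayed_refs_bytes)
{
        u64 bytes = num_bytes + *delayed_refs_bytes;
        int ret;

        /* One ENOSPC flushing cycle for everything we may need. */
        ret = reserve_metadata_bytes(bytes, flush);

        /* Emergency flush only: drop the delayed-refs part, it will be
         * stolen from the global block reserve instead. */
        if (ret && flush == FLUSH_ALL_STEAL) {
                bytes -= *delayed_refs_bytes;
                *delayed_refs_bytes = 0;
                ret = reserve_metadata_bytes(bytes, flush);
        }
        return ret;
}

int main(void)
{
        u64 delayed = 64 * 1024;
        int ret = reserve_trans_metadata(FLUSH_ALL_STEAL, 64 * 1024, &delayed);

        /* 128K does not fit, the 64K retry does: prints ret = 0, delayed = 0. */
        printf("ret = %d, delayed_refs_bytes = %llu\n", ret, delayed);
        return 0;
}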
+#include <linux/wait.h> #include "btrfs_inode.h" #include "delayed-ref.h" -#include "ctree.h" +#include "extent-io-tree.h" +#include "block-rsv.h" +#include "messages.h" #include "misc.h" +struct dentry; +struct inode; +struct btrfs_pending_snapshot; +struct btrfs_fs_info; +struct btrfs_root_item; +struct btrfs_root; +struct btrfs_path; + /* Radix-tree tag for roots that are part of the transaction. */ #define BTRFS_ROOT_TRANS_TAG 0 @@ -262,7 +277,6 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages, int mark); int btrfs_wait_tree_log_extents(struct btrfs_root *root, int mark); int btrfs_transaction_blocked(struct btrfs_fs_info *info); -int btrfs_transaction_in_commit(struct btrfs_fs_info *info); void btrfs_put_transaction(struct btrfs_transaction *transaction); void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); diff --git a/fs/btrfs/tree-checker.c index 50fdc69fdddf..c8fbcae4e88e 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -21,7 +21,6 @@ #include "messages.h" #include "ctree.h" #include "tree-checker.h" -#include "disk-io.h" #include "compression.h" #include "volumes.h" #include "misc.h" @@ -30,7 +29,6 @@ #include "file-item.h" #include "inode-item.h" #include "dir-item.h" -#include "raid-stripe-tree.h" #include "extent-tree.h" /* @@ -67,6 +65,7 @@ static void generic_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d, %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", @@ -94,6 +93,7 @@ static void file_extent_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d ino=%llu file_offset=%llu, %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", @@ -154,6 +154,7 @@ static void dir_item_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", @@ -649,6 +650,7 @@ static void block_group_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d bg_start=%llu bg_len=%llu, %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", @@ -1005,6 +1007,7 @@ static void dev_item_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(eb->fs_info, "corrupt %s: root=%llu block=%llu slot=%d devid=%llu %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", @@ -1260,6 +1263,7 @@ static void extent_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(eb->fs_info, "corrupt %s: block=%llu slot=%d extent bytenr=%llu len=%llu %pV", btrfs_header_level(eb) == 0 ?
"leaf" : "node", @@ -1436,7 +1440,7 @@ static int check_extent_item(struct extent_buffer *leaf, if (unlikely(ptr + btrfs_extent_inline_ref_size(inline_type) > end)) { extent_err(leaf, slot, "inline ref item overflows extent item, ptr %lu iref size %u end %lu", - ptr, inline_type, end); + ptr, btrfs_extent_inline_ref_size(inline_type), end); return -EUCLEAN; } diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index 14b9fbe82da4..5c809b50b2d0 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -6,10 +6,12 @@ #ifndef BTRFS_TREE_CHECKER_H #define BTRFS_TREE_CHECKER_H +#include <linux/types.h> #include <uapi/linux/btrfs_tree.h> struct extent_buffer; struct btrfs_chunk; +struct btrfs_key; /* All the extra info needed to verify the parentness of a tree block. */ struct btrfs_tree_parent_check { diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 331fc7429952..472918a5bc73 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -13,13 +13,11 @@ #include "tree-log.h" #include "disk-io.h" #include "locking.h" -#include "print-tree.h" #include "backref.h" #include "compression.h" #include "qgroup.h" #include "block-group.h" #include "space-info.h" -#include "zoned.h" #include "inode-item.h" #include "fs.h" #include "accessors.h" @@ -2820,6 +2818,52 @@ static void wait_for_writer(struct btrfs_root *root) finish_wait(&root->log_writer_wait, &wait); } +void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct inode *inode) +{ + ctx->log_ret = 0; + ctx->log_transid = 0; + ctx->log_new_dentries = false; + ctx->logging_new_name = false; + ctx->logging_new_delayed_dentries = false; + ctx->logged_before = false; + ctx->inode = inode; + INIT_LIST_HEAD(&ctx->list); + INIT_LIST_HEAD(&ctx->ordered_extents); + INIT_LIST_HEAD(&ctx->conflict_inodes); + ctx->num_conflict_inodes = 0; + ctx->logging_conflict_inodes = false; + ctx->scratch_eb = NULL; +} + +void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx) +{ + struct btrfs_inode *inode = BTRFS_I(ctx->inode); + + if (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) && + !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags)) + return; + + /* + * Don't care about allocation failure. This is just for optimization, + * if we fail to allocate here, we will try again later if needed. + */ + ctx->scratch_eb = alloc_dummy_extent_buffer(inode->root->fs_info, 0); +} + +void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx) +{ + struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_extent *tmp; + + ASSERT(inode_is_locked(ctx->inode)); + + list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { + list_del_init(&ordered->log_list); + btrfs_put_ordered_extent(ordered); + } +} + + static inline void btrfs_remove_log_ctx(struct btrfs_root *root, struct btrfs_log_ctx *ctx) { @@ -3619,6 +3663,30 @@ out: return ret; } +static int clone_leaf(struct btrfs_path *path, struct btrfs_log_ctx *ctx) +{ + const int slot = path->slots[0]; + + if (ctx->scratch_eb) { + copy_extent_buffer_full(ctx->scratch_eb, path->nodes[0]); + } else { + ctx->scratch_eb = btrfs_clone_extent_buffer(path->nodes[0]); + if (!ctx->scratch_eb) + return -ENOMEM; + } + + btrfs_release_path(path); + path->nodes[0] = ctx->scratch_eb; + path->slots[0] = slot; + /* + * Add extra ref to scratch eb so that it is not freed when callers + * release the path, so we can reuse it later if needed. 
+ */ + atomic_inc(&ctx->scratch_eb->refs); + + return 0; +} + static int process_dir_items_leaf(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, @@ -3633,23 +3701,20 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, bool last_found = false; int batch_start = 0; int batch_size = 0; - int i; + int ret; /* * We need to clone the leaf, release the read lock on it, and use the * clone before modifying the log tree. See the comment at copy_items() * about why we need to do this. */ - src = btrfs_clone_extent_buffer(path->nodes[0]); - if (!src) - return -ENOMEM; + ret = clone_leaf(path, ctx); + if (ret < 0) + return ret; - i = path->slots[0]; - btrfs_release_path(path); - path->nodes[0] = src; - path->slots[0] = i; + src = path->nodes[0]; - for (; i < nritems; i++) { + for (int i = path->slots[0]; i < nritems; i++) { struct btrfs_dir_item *di; struct btrfs_key key; int ret; @@ -4259,17 +4324,16 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_path *dst_path, struct btrfs_path *src_path, int start_slot, int nr, int inode_only, - u64 logged_isize) + u64 logged_isize, struct btrfs_log_ctx *ctx) { struct btrfs_root *log = inode->root->log_root; struct btrfs_file_extent_item *extent; struct extent_buffer *src; - int ret = 0; + int ret; struct btrfs_key *ins_keys; u32 *ins_sizes; struct btrfs_item_batch batch; char *ins_data; - int i; int dst_index; const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM); const u64 i_size = i_size_read(&inode->vfs_inode); @@ -4302,14 +4366,11 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, * while the other is holding the delayed node's mutex and wants to * write lock the same subvolume leaf for flushing delayed items. 
*/ - src = btrfs_clone_extent_buffer(src_path->nodes[0]); - if (!src) - return -ENOMEM; + ret = clone_leaf(src_path, ctx); + if (ret < 0) + return ret; - i = src_path->slots[0]; - btrfs_release_path(src_path); - src_path->nodes[0] = src; - src_path->slots[0] = i; + src = src_path->nodes[0]; ins_data = kmalloc(nr * sizeof(struct btrfs_key) + nr * sizeof(u32), GFP_NOFS); @@ -4324,7 +4385,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, batch.nr = 0; dst_index = 0; - for (i = 0; i < nr; i++) { + for (int i = 0; i < nr; i++) { const int src_slot = start_slot + i; struct btrfs_root *csum_root; struct btrfs_ordered_sum *sums; @@ -4431,7 +4492,7 @@ add_to_batch: goto out; dst_index = 0; - for (i = 0; i < nr; i++) { + for (int i = 0; i < nr; i++) { const int src_slot = start_slot + i; const int dst_slot = dst_path->slots[0] + dst_index; struct btrfs_key key; @@ -4704,7 +4765,8 @@ static int log_one_extent(struct btrfs_trans_handle *trans, */ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, - struct btrfs_path *path) + struct btrfs_path *path, + struct btrfs_log_ctx *ctx) { struct btrfs_root *root = inode->root; struct btrfs_key key; @@ -4770,7 +4832,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, if (slot >= btrfs_header_nritems(leaf)) { if (ins_nr > 0) { ret = copy_items(trans, inode, dst_path, path, - start_slot, ins_nr, 1, 0); + start_slot, ins_nr, 1, 0, ctx); if (ret < 0) goto out; ins_nr = 0; @@ -4820,7 +4882,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, } if (ins_nr > 0) ret = copy_items(trans, inode, dst_path, path, - start_slot, ins_nr, 1, 0); + start_slot, ins_nr, 1, 0, ctx); out: btrfs_release_path(path); btrfs_free_path(dst_path); @@ -4899,7 +4961,7 @@ process: write_unlock(&tree->lock); if (!ret) - ret = btrfs_log_prealloc_extents(trans, inode, path); + ret = btrfs_log_prealloc_extents(trans, inode, path, ctx); if (ret) return ret; @@ -4980,7 +5042,8 @@ static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, - struct btrfs_path *dst_path) + struct btrfs_path *dst_path, + struct btrfs_log_ctx *ctx) { struct btrfs_root *root = inode->root; int ret; @@ -5009,7 +5072,7 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, if (slot >= nritems) { if (ins_nr > 0) { ret = copy_items(trans, inode, dst_path, path, - start_slot, ins_nr, 1, 0); + start_slot, ins_nr, 1, 0, ctx); if (ret < 0) return ret; ins_nr = 0; @@ -5035,7 +5098,7 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, } if (ins_nr > 0) { ret = copy_items(trans, inode, dst_path, path, - start_slot, ins_nr, 1, 0); + start_slot, ins_nr, 1, 0, ctx); if (ret < 0) return ret; } @@ -5847,7 +5910,7 @@ again: } ret = copy_items(trans, inode, dst_path, path, ins_start_slot, ins_nr, - inode_only, logged_isize); + inode_only, logged_isize, ctx); if (ret < 0) return ret; ins_nr = 0; @@ -5866,7 +5929,7 @@ again: goto next_slot; ret = copy_items(trans, inode, dst_path, path, ins_start_slot, - ins_nr, inode_only, logged_isize); + ins_nr, inode_only, logged_isize, ctx); if (ret < 0) return ret; ins_nr = 0; @@ -5883,7 +5946,7 @@ again: } ret = copy_items(trans, inode, dst_path, path, ins_start_slot, - ins_nr, inode_only, logged_isize); + ins_nr, inode_only, logged_isize, ctx); if (ret < 0) return ret; ins_nr = 1; @@ -5898,7 +5961,7 @@ next_slot: if 
(ins_nr) { ret = copy_items(trans, inode, dst_path, path, ins_start_slot, ins_nr, inode_only, - logged_isize); + logged_isize, ctx); if (ret < 0) return ret; ins_nr = 0; @@ -5923,7 +5986,7 @@ next_key: } if (ins_nr) { ret = copy_items(trans, inode, dst_path, path, ins_start_slot, - ins_nr, inode_only, logged_isize); + ins_nr, inode_only, logged_isize, ctx); if (ret) return ret; } @@ -5934,7 +5997,7 @@ next_key: * lock the same leaf with btrfs_log_prealloc_extents() below. */ btrfs_release_path(path); - ret = btrfs_log_prealloc_extents(trans, inode, dst_path); + ret = btrfs_log_prealloc_extents(trans, inode, dst_path, ctx); } return ret; @@ -6526,7 +6589,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, btrfs_release_path(path); btrfs_release_path(dst_path); - ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); + ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx); if (ret) goto out_unlock; xattrs_logged = true; @@ -6553,7 +6616,7 @@ log_extents: * BTRFS_INODE_COPY_EVERYTHING set. */ if (!xattrs_logged && inode->logged_trans < trans->transid) { - ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); + ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx); if (ret) goto out_unlock; btrfs_release_path(path); @@ -7502,6 +7565,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, btrfs_init_log_ctx(&ctx, &inode->vfs_inode); ctx.logging_new_name = true; + btrfs_init_log_ctx_scratch_eb(&ctx); /* * We don't care about the return value. If we fail to log the new name * then we know the next attempt to sync the log will fallback to a full @@ -7510,6 +7574,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * inconsistent state after a rename operation. */ btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); + free_extent_buffer(ctx.scratch_eb); ASSERT(list_empty(&ctx.conflict_inodes)); out: /* diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index a550a8a375cd..22e9cbc81577 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -6,10 +6,18 @@ #ifndef BTRFS_TREE_LOG_H #define BTRFS_TREE_LOG_H +#include <linux/list.h> +#include <linux/fs.h> #include "messages.h" #include "ctree.h" #include "transaction.h" +struct inode; +struct dentry; +struct btrfs_ordered_extent; +struct btrfs_root; +struct btrfs_trans_handle; + /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ #define BTRFS_NO_LOG_SYNC 256 @@ -36,37 +44,20 @@ struct btrfs_log_ctx { struct list_head conflict_inodes; int num_conflict_inodes; bool logging_conflict_inodes; + /* + * Used for fsyncs that need to copy items from the subvolume tree to + * the log tree (full sync flag set or copy everything flag set) to + * avoid allocating a temporary extent buffer while holding a lock on + * an extent buffer of the subvolume tree and under the log transaction. + * Also helps to avoid allocating and freeing a temporary extent buffer + * in case we need to process multiple leaves from the subvolume tree. 
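Call sites follow the pattern visible in the btrfs_log_new_name() hunk above: set up the context, opportunistically attach the scratch buffer, log, then drop the buffer. Condensed from that hunk (kernel code, shown only to make the buffer's lifetime explicit):

struct btrfs_log_ctx ctx;

btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
ctx.logging_new_name = true;
/* Best effort: on allocation failure scratch_eb stays NULL and
 * clone_leaf() falls back to btrfs_clone_extent_buffer(). */
btrfs_init_log_ctx_scratch_eb(&ctx);

btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);

free_extent_buffer(ctx.scratch_eb);     /* tolerates NULL */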
+ */ + struct extent_buffer *scratch_eb; }; -static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, - struct inode *inode) -{ - ctx->log_ret = 0; - ctx->log_transid = 0; - ctx->log_new_dentries = false; - ctx->logging_new_name = false; - ctx->logging_new_delayed_dentries = false; - ctx->logged_before = false; - ctx->inode = inode; - INIT_LIST_HEAD(&ctx->list); - INIT_LIST_HEAD(&ctx->ordered_extents); - INIT_LIST_HEAD(&ctx->conflict_inodes); - ctx->num_conflict_inodes = 0; - ctx->logging_conflict_inodes = false; -} - -static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx) -{ - struct btrfs_ordered_extent *ordered; - struct btrfs_ordered_extent *tmp; - - ASSERT(inode_is_locked(ctx->inode)); - - list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { - list_del_init(&ordered->log_list); - btrfs_put_ordered_extent(ordered); - } -} +void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct inode *inode); +void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx); +void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx); static inline void btrfs_set_log_full_commit(struct btrfs_trans_handle *trans) { diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 3df6153d5d5a..43b3accbed7a 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -44,7 +44,7 @@ struct tree_mod_elem { /* * Pull a new tree mod seq number for our operation. */ -static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) +static u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) { return atomic64_inc_return(&fs_info->tree_mod_seq); } @@ -170,8 +170,7 @@ static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info, * this until all tree mod log insertions are recorded in the rb tree and then * write unlock fs_info::tree_mod_log_lock. */ -static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb) +static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) { if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) return true; @@ -188,7 +187,7 @@ static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, } /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */ -static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, +static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, struct extent_buffer *eb) { if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) @@ -367,9 +366,9 @@ free_tms: return ret; } -static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, - struct tree_mod_elem **tm_list, - int nritems) +static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, + struct tree_mod_elem **tm_list, + int nritems) { int i, j; int ret; diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h index 94f10afeee97..ff00c8e8a393 100644 --- a/fs/btrfs/tree-mod-log.h +++ b/fs/btrfs/tree-mod-log.h @@ -3,7 +3,13 @@ #ifndef BTRFS_TREE_MOD_LOG_H #define BTRFS_TREE_MOD_LOG_H -#include "ctree.h" +#include <linux/list.h> + +struct extent_buffer; +struct btrfs_fs_info; +struct btrfs_path; +struct btrfs_root; +struct btrfs_seq_list; /* Represents a tree mod log user. 
*/ struct btrfs_seq_list { diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index b4ac2b0cd235..183863f4bfa4 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -7,7 +7,6 @@ #include <linux/slab.h> #include "messages.h" #include "ulist.h" -#include "ctree.h" /* * ulist is a generic data structure to hold a collection of unique u64 diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index b2cef187ea8e..8e200fe1a2dd 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -7,6 +7,7 @@ #ifndef BTRFS_ULIST_H #define BTRFS_ULIST_H +#include <linux/types.h> #include <linux/list.h> #include <linux/rbtree.h> diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 5be74f9e47eb..b0aff297d67d 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -9,7 +9,6 @@ #include "ctree.h" #include "transaction.h" #include "disk-io.h" -#include "print-tree.h" #include "fs.h" #include "accessors.h" #include "uuid-tree.h" @@ -114,7 +113,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, ret = btrfs_insert_empty_item(trans, uuid_root, path, &key, sizeof(subid_le)); - if (ret >= 0) { + if (ret == 0) { /* Add an item for the type for the first time */ eb = path->nodes[0]; slot = path->slots[0]; diff --git a/fs/btrfs/uuid-tree.h b/fs/btrfs/uuid-tree.h index 5350c87fe2ca..080ede0227ae 100644 --- a/fs/btrfs/uuid-tree.h +++ b/fs/btrfs/uuid-tree.h @@ -3,6 +3,11 @@ #ifndef BTRFS_UUID_TREE_H #define BTRFS_UUID_TREE_H +#include <linux/types.h> + +struct btrfs_trans_handle; +struct btrfs_fs_info; + int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, u64 subid); int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index 66e2270b0dae..4042dd6437ae 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -14,7 +14,6 @@ #include "ctree.h" #include "btrfs_inode.h" #include "transaction.h" -#include "disk-io.h" #include "locking.h" #include "fs.h" #include "accessors.h" diff --git a/fs/btrfs/verity.h b/fs/btrfs/verity.h index 91c10f7d0a46..d696659e43e4 100644 --- a/fs/btrfs/verity.h +++ b/fs/btrfs/verity.h @@ -3,8 +3,13 @@ #ifndef BTRFS_VERITY_H #define BTRFS_VERITY_H +struct inode; +struct btrfs_inode; + #ifdef CONFIG_FS_VERITY +#include <linux/fsverity.h> + extern const struct fsverity_operations btrfs_verityops; int btrfs_drop_verity_items(struct btrfs_inode *inode); @@ -12,6 +17,8 @@ int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size) #else +#include <linux/errno.h> + static inline int btrfs_drop_verity_items(struct btrfs_inode *inode) { return 0; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 4c32497311d2..a2d07fa3cfdf 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -14,10 +14,8 @@ #include <linux/namei.h> #include "misc.h" #include "ctree.h" -#include "extent_map.h" #include "disk-io.h" #include "transaction.h" -#include "print-tree.h" #include "volumes.h" #include "raid56.h" #include "rcu-string.h" @@ -468,39 +466,39 @@ static noinline struct btrfs_fs_devices *find_fsid( static int btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, - int flush, struct bdev_handle **bdev_handle, + int flush, struct file **bdev_file, struct btrfs_super_block **disk_super) { struct block_device *bdev; int ret; - *bdev_handle = bdev_open_by_path(device_path, flags, holder, NULL); + *bdev_file = bdev_file_open_by_path(device_path, flags, holder, NULL); - if (IS_ERR(*bdev_handle)) { - ret = PTR_ERR(*bdev_handle); + if 
(IS_ERR(*bdev_file)) { + ret = PTR_ERR(*bdev_file); goto error; } - bdev = (*bdev_handle)->bdev; + bdev = file_bdev(*bdev_file); if (flush) sync_blockdev(bdev); ret = set_blocksize(bdev, BTRFS_BDEV_BLOCKSIZE); if (ret) { - bdev_release(*bdev_handle); + fput(*bdev_file); goto error; } invalidate_bdev(bdev); *disk_super = btrfs_read_dev_super(bdev); if (IS_ERR(*disk_super)) { ret = PTR_ERR(*disk_super); - bdev_release(*bdev_handle); + fput(*bdev_file); goto error; } return 0; error: - *bdev_handle = NULL; + *bdev_file = NULL; return ret; } @@ -643,7 +641,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *device, blk_mode_t flags, void *holder) { - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct btrfs_super_block *disk_super; u64 devid; int ret; @@ -654,7 +652,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, return -EINVAL; ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, - &bdev_handle, &disk_super); + &bdev_file, &disk_super); if (ret) return ret; @@ -678,20 +676,20 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); fs_devices->seeding = true; } else { - if (bdev_read_only(bdev_handle->bdev)) + if (bdev_read_only(file_bdev(bdev_file))) clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); else set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); } - if (!bdev_nonrot(bdev_handle->bdev)) + if (!bdev_nonrot(file_bdev(bdev_file))) fs_devices->rotating = true; - if (bdev_max_discard_sectors(bdev_handle->bdev)) + if (bdev_max_discard_sectors(file_bdev(bdev_file))) fs_devices->discardable = true; - device->bdev_handle = bdev_handle; - device->bdev = bdev_handle->bdev; + device->bdev_file = bdev_file; + device->bdev = file_bdev(bdev_file); clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); fs_devices->open_devices++; @@ -706,7 +704,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, error_free_page: btrfs_release_disk_super(disk_super); - bdev_release(bdev_handle); + fput(bdev_file); return -EINVAL; } @@ -769,8 +767,9 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (same_fsid_diff_dev) { generate_random_uuid(fs_devices->fsid); fs_devices->temp_fsid = true; - pr_info("BTRFS: device %s using temp-fsid %pU\n", - path, fs_devices->fsid); + pr_info("BTRFS: device %s (%d:%d) using temp-fsid %pU\n", + path, MAJOR(path_devt), MINOR(path_devt), + fs_devices->fsid); } mutex_lock(&fs_devices->device_list_mutex); @@ -799,8 +798,9 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (fs_devices->opened) { btrfs_err(NULL, -"device %s belongs to fsid %pU, and the fs is already mounted, scanned by %s (%d)", - path, fs_devices->fsid, current->comm, +"device %s (%d:%d) belongs to fsid %pU, and the fs is already mounted, scanned by %s (%d)", + path, MAJOR(path_devt), MINOR(path_devt), + fs_devices->fsid, current->comm, task_pid_nr(current)); mutex_unlock(&fs_devices->device_list_mutex); return ERR_PTR(-EBUSY); @@ -826,13 +826,15 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (disk_super->label[0]) pr_info( - "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n", +"BTRFS: device label %s devid %llu transid %llu %s (%d:%d) scanned by %s (%d)\n", disk_super->label, devid, found_transid, path, + MAJOR(path_devt), MINOR(path_devt), current->comm, task_pid_nr(current)); else pr_info( - "BTRFS: device 
fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n", +"BTRFS: device fsid %pU devid %llu transid %llu %s (%d:%d) scanned by %s (%d)\n", disk_super->fsid, devid, found_transid, path, + MAJOR(path_devt), MINOR(path_devt), current->comm, task_pid_nr(current)); } else if (!device->name || strcmp(device->name->str, path)) { @@ -1015,10 +1017,10 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, if (device->devid == BTRFS_DEV_REPLACE_DEVID) continue; - if (device->bdev_handle) { - bdev_release(device->bdev_handle); + if (device->bdev_file) { + fput(device->bdev_file); device->bdev = NULL; - device->bdev_handle = NULL; + device->bdev_file = NULL; fs_devices->open_devices--; } if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { @@ -1063,7 +1065,7 @@ static void btrfs_close_bdev(struct btrfs_device *device) invalidate_bdev(device->bdev); } - bdev_release(device->bdev_handle); + fput(device->bdev_file); } static void btrfs_close_one_device(struct btrfs_device *device) @@ -1316,7 +1318,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, struct btrfs_super_block *disk_super; bool new_device_added = false; struct btrfs_device *device = NULL; - struct bdev_handle *bdev_handle; + struct file *bdev_file; u64 bytenr, bytenr_orig; int ret; @@ -1339,18 +1341,18 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, * values temporarily, as the device paths of the fsid are the only * required information for assembling the volume. */ - bdev_handle = bdev_open_by_path(path, flags, NULL, NULL); - if (IS_ERR(bdev_handle)) - return ERR_CAST(bdev_handle); + bdev_file = bdev_file_open_by_path(path, flags, NULL, NULL); + if (IS_ERR(bdev_file)) + return ERR_CAST(bdev_file); bytenr_orig = btrfs_sb_offset(0); - ret = btrfs_sb_log_location_bdev(bdev_handle->bdev, 0, READ, &bytenr); + ret = btrfs_sb_log_location_bdev(file_bdev(bdev_file), 0, READ, &bytenr); if (ret) { device = ERR_PTR(ret); goto error_bdev_put; } - disk_super = btrfs_read_disk_super(bdev_handle->bdev, bytenr, + disk_super = btrfs_read_disk_super(file_bdev(bdev_file), bytenr, bytenr_orig); if (IS_ERR(disk_super)) { device = ERR_CAST(disk_super); @@ -1368,7 +1370,8 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, else btrfs_free_stale_devices(devt, NULL); - pr_debug("BTRFS: skip registering single non-seed device %s\n", path); + pr_debug("BTRFS: skip registering single non-seed device %s (%d:%d)\n", + path, MAJOR(devt), MINOR(devt)); device = NULL; goto free_disk_super; } @@ -1381,7 +1384,7 @@ free_disk_super: btrfs_release_disk_super(disk_super); error_bdev_put: - bdev_release(bdev_handle); + fput(bdev_file); return device; } @@ -1403,7 +1406,7 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start, if (in_range(physical_start, *start, len) || in_range(*start, physical_start, - physical_end - physical_start)) { + physical_end + 1 - physical_start)) { *start = physical_end + 1; return true; } @@ -2032,11 +2035,10 @@ static void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info, copy_num, ret); } -void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, - struct block_device *bdev, - const char *device_path) +void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_device *device) { int copy_num; + struct block_device *bdev = device->bdev; if (!bdev) return; @@ -2052,12 +2054,12 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, btrfs_kobject_uevent(bdev, 
KOBJ_CHANGE); /* Update ctime/mtime for device path for libblkid */ - update_dev_time(device_path); + update_dev_time(device->name->str); } int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, - struct bdev_handle **bdev_handle) + struct file **bdev_file) { struct btrfs_trans_handle *trans; struct btrfs_device *device; @@ -2166,7 +2168,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, btrfs_assign_next_active_device(device, NULL); - if (device->bdev_handle) { + if (device->bdev_file) { cur_devices->open_devices--; /* remove sysfs entry */ btrfs_sysfs_remove_device(device); @@ -2182,20 +2184,19 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, * free the device. * * We cannot call btrfs_close_bdev() here because we're holding the sb - * write lock, and bdev_release() will pull in the ->open_mutex on - * the block device and it's dependencies. Instead just flush the - * device and let the caller do the final bdev_release. + * write lock, and fput() on the block device will pull in the + * ->open_mutex on the block device and its dependencies. Instead + * just flush the device and let the caller do the final fput(). */ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { - btrfs_scratch_superblocks(fs_info, device->bdev, - device->name->str); + btrfs_scratch_superblocks(fs_info, device); if (device->bdev) { sync_blockdev(device->bdev); invalidate_bdev(device->bdev); } } - *bdev_handle = device->bdev_handle; + *bdev_file = device->bdev_file; synchronize_rcu(); btrfs_free_device(device); @@ -2301,8 +2302,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) mutex_unlock(&fs_devices->device_list_mutex); - btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev, - tgtdev->name->str); + btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev); btrfs_close_bdev(tgtdev); synchronize_rcu(); @@ -2332,7 +2332,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, const char *path) { struct btrfs_super_block *disk_super; - struct bdev_handle *bdev_handle; + struct file *bdev_file; int ret; if (!path || !path[0]) @@ -2350,7 +2350,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, } ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0, - &bdev_handle, &disk_super); + &bdev_file, &disk_super); if (ret) { btrfs_put_dev_args_from_path(args); return ret; @@ -2363,7 +2363,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, else memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE); btrfs_release_disk_super(disk_super); - bdev_release(bdev_handle); + fput(bdev_file); return 0; } @@ -2583,7 +2583,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path struct btrfs_root *root = fs_info->dev_root; struct btrfs_trans_handle *trans; struct btrfs_device *device; - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct super_block *sb = fs_info->sb; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_fs_devices *seed_devices = NULL; @@ -2596,12 +2596,12 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (sb_rdonly(sb) && !fs_devices->seeding) return -EROFS; - bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE, + bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE, fs_info->bdev_holder, NULL); - if (IS_ERR(bdev_handle)) - return PTR_ERR(bdev_handle); + if (IS_ERR(bdev_file)) + return PTR_ERR(bdev_file); - if (!btrfs_check_device_zone_type(fs_info, bdev_handle->bdev)) { + if
(!btrfs_check_device_zone_type(fs_info, file_bdev(bdev_file))) { ret = -EINVAL; goto error; } @@ -2613,11 +2613,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path locked = true; } - sync_blockdev(bdev_handle->bdev); + sync_blockdev(file_bdev(bdev_file)); rcu_read_lock(); list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { - if (device->bdev == bdev_handle->bdev) { + if (device->bdev == file_bdev(bdev_file)) { ret = -EEXIST; rcu_read_unlock(); goto error; @@ -2633,8 +2633,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path } device->fs_info = fs_info; - device->bdev_handle = bdev_handle; - device->bdev = bdev_handle->bdev; + device->bdev_file = bdev_file; + device->bdev = file_bdev(bdev_file); ret = lookup_bdev(device_path, &device->devt); if (ret) goto error_free_device; @@ -2817,7 +2817,7 @@ error_free_zone: error_free_device: btrfs_free_device(device); error: - bdev_release(bdev_handle); + fput(bdev_file); if (locked) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); @@ -3087,7 +3087,6 @@ struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, map = btrfs_find_chunk_map(fs_info, logical, length); if (unlikely(!map)) { - read_unlock(&fs_info->mapping_tree_lock); btrfs_crit(fs_info, "unable to find chunk map for logical %llu length %llu", logical, length); @@ -3095,7 +3094,6 @@ struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, } if (unlikely(map->start > logical || map->start + map->chunk_len <= logical)) { - read_unlock(&fs_info->mapping_tree_lock); btrfs_crit(fs_info, "found a bad chunk map, wanted %llu-%llu, found %llu-%llu", logical, logical + length, map->start, @@ -3395,7 +3393,17 @@ again: mutex_unlock(&fs_info->reclaim_bgs_lock); goto error; } - BUG_ON(ret == 0); /* Corruption */ + if (ret == 0) { + /* + * On the first search we would find chunk tree with + * offset -1, which is not possible. On subsequent + * loops this would find an existing item on an invalid + * offset (one less than the previous one, wrong + * alignment and size). 
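The contains_pending_extent() change above deserves a second look: physical_end is an inclusive last byte, so the extent's size is physical_end + 1 - physical_start, and the old expression under-counted the overlap test by one byte. A tiny self-contained check, assuming in_range(x, base, size) means base <= x < base + size, as in the kernel:

#include <assert.h>
#include <stdbool.h>

typedef unsigned long long u64;

/* Same semantics as the kernel's in_range(): [base, base + size). */
static bool in_range(u64 x, u64 base, u64 size)
{
        return x >= base && x < base + size;
}

int main(void)
{
        u64 physical_start = 4096, physical_end = 8191; /* inclusive end */
        u64 start = physical_end;       /* hole starting at the last byte */

        /* The old length misses the overlap, the corrected one catches it. */
        assert(!in_range(start, physical_start, physical_end - physical_start));
        assert(in_range(start, physical_start, physical_end + 1 - physical_start));
        return 0;
}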
+ */ + ret = -EUCLEAN; + goto error; + } ret = btrfs_previous_item(chunk_root, path, key.objectid, key.type); @@ -3482,6 +3490,44 @@ static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, return 0; } +static void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, + const struct btrfs_disk_balance_args *disk) +{ + memset(cpu, 0, sizeof(*cpu)); + + cpu->profiles = le64_to_cpu(disk->profiles); + cpu->usage = le64_to_cpu(disk->usage); + cpu->devid = le64_to_cpu(disk->devid); + cpu->pstart = le64_to_cpu(disk->pstart); + cpu->pend = le64_to_cpu(disk->pend); + cpu->vstart = le64_to_cpu(disk->vstart); + cpu->vend = le64_to_cpu(disk->vend); + cpu->target = le64_to_cpu(disk->target); + cpu->flags = le64_to_cpu(disk->flags); + cpu->limit = le64_to_cpu(disk->limit); + cpu->stripes_min = le32_to_cpu(disk->stripes_min); + cpu->stripes_max = le32_to_cpu(disk->stripes_max); +} + +static void btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, + const struct btrfs_balance_args *cpu) +{ + memset(disk, 0, sizeof(*disk)); + + disk->profiles = cpu_to_le64(cpu->profiles); + disk->usage = cpu_to_le64(cpu->usage); + disk->devid = cpu_to_le64(cpu->devid); + disk->pstart = cpu_to_le64(cpu->pstart); + disk->pend = cpu_to_le64(cpu->pend); + disk->vstart = cpu_to_le64(cpu->vstart); + disk->vend = cpu_to_le64(cpu->vend); + disk->target = cpu_to_le64(cpu->target); + disk->flags = cpu_to_le64(cpu->flags); + disk->limit = cpu_to_le64(cpu->limit); + disk->stripes_min = cpu_to_le32(cpu->stripes_min); + disk->stripes_max = cpu_to_le32(cpu->stripes_max); +} + static int insert_balance_item(struct btrfs_fs_info *fs_info, struct btrfs_balance_control *bctl) { @@ -3626,7 +3672,7 @@ static void reset_balance_state(struct btrfs_fs_info *fs_info) struct btrfs_balance_control *bctl = fs_info->balance_ctl; int ret; - BUG_ON(!fs_info->balance_ctl); + ASSERT(fs_info->balance_ctl); spin_lock(&fs_info->balance_lock); fs_info->balance_ctl = NULL; @@ -5946,6 +5992,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map, int first, int dev_replace_is_ongoing) { + const enum btrfs_read_policy policy = READ_ONCE(fs_info->fs_devices->read_policy); int i; int num_stripes; int preferred_mirror; @@ -5960,13 +6007,12 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, else num_stripes = map->num_stripes; - switch (fs_info->fs_devices->read_policy) { + switch (policy) { default: /* Shouldn't happen, just warn and use pid instead of failing */ - btrfs_warn_rl(fs_info, - "unknown read_policy type %u, reset to pid", - fs_info->fs_devices->read_policy); - fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID; + btrfs_warn_rl(fs_info, "unknown read_policy type %u, reset to pid", + policy); + WRITE_ONCE(fs_info->fs_devices->read_policy, BTRFS_READ_POLICY_PID); fallthrough; case BTRFS_READ_POLICY_PID: preferred_mirror = first + (current->pid % num_stripes); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 53f87f398da7..93854609a4d5 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -6,13 +6,28 @@ #ifndef BTRFS_VOLUMES_H #define BTRFS_VOLUMES_H +#include <linux/blk_types.h> +#include <linux/sizes.h> +#include <linux/atomic.h> #include <linux/sort.h> -#include <linux/btrfs.h> -#include "async-thread.h" +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/log2.h> +#include <linux/kobject.h> +#include <linux/refcount.h> +#include <linux/completion.h> +#include <linux/rbtree.h> +#include <uapi/linux/btrfs.h> #include "messages.h" 
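The fs/btrfs/volumes.c hunks above swap struct bdev_handle for a plain struct file: devices are opened with bdev_file_open_by_path(), the block_device is derived on demand via file_bdev(), and the reference is dropped with an ordinary fput(). A minimal sketch of that open/use/close pattern, assuming the v6.9-era block-layer helpers; the function name and holder pointer are illustrative:

```c
#include <linux/blkdev.h>
#include <linux/err.h>
#include <linux/file.h>

/* Open a block device read-only, flush it, and drop the reference. */
static int example_probe_device(const char *path, void *holder)
{
	struct file *bdev_file;
	struct block_device *bdev;

	bdev_file = bdev_file_open_by_path(path, BLK_OPEN_READ, holder, NULL);
	if (IS_ERR(bdev_file))
		return PTR_ERR(bdev_file);

	/* No separate handle->bdev member anymore; derive it on demand. */
	bdev = file_bdev(bdev_file);
	sync_blockdev(bdev);

	fput(bdev_file);	/* replaces bdev_release() */
	return 0;
}
```

Holding the struct file pins the device just as the old handle did, which is why struct btrfs_device can carry a single ->bdev_file next to its cached ->bdev.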
-#include "tree-checker.h" #include "rcu-string.h" +struct block_device; +struct bdev_handle; +struct btrfs_fs_info; +struct btrfs_block_group; +struct btrfs_trans_handle; +struct btrfs_zoned_device_info; + #define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) extern struct mutex uuid_mutex; @@ -77,7 +92,7 @@ enum btrfs_raid_types { #define BTRFS_DEV_STATE_FLUSH_SENT (4) #define BTRFS_DEV_STATE_NO_READA (5) -struct btrfs_zoned_device_info; +struct btrfs_fs_devices; struct btrfs_device { struct list_head dev_list; /* device_list_mutex */ @@ -90,7 +105,7 @@ struct btrfs_device { u64 generation; - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct block_device *bdev; struct btrfs_zoned_device_info *zone_info; @@ -276,6 +291,25 @@ enum btrfs_read_policy { BTRFS_NR_READ_POLICY, }; +#ifdef CONFIG_BTRFS_DEBUG +/* + * Checksum mode - offload it to workqueues or do it synchronously in + * btrfs_submit_chunk(). + */ +enum btrfs_offload_csum_mode { + /* + * Choose offloading checksum or do it synchronously automatically. + * Do it synchronously if the checksum is fast, or offload to workqueues + * otherwise. + */ + BTRFS_OFFLOAD_CSUM_AUTO, + /* Always offload checksum to workqueues. */ + BTRFS_OFFLOAD_CSUM_FORCE_ON, + /* Never offload checksum to workqueues. */ + BTRFS_OFFLOAD_CSUM_FORCE_OFF, +}; +#endif + struct btrfs_fs_devices { u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ @@ -380,6 +414,11 @@ struct btrfs_fs_devices { /* Policy used to read the mirrored stripes. */ enum btrfs_read_policy read_policy; + +#ifdef CONFIG_BTRFS_DEBUG + /* Checksum mode - offload it or do it synchronously. */ + enum btrfs_offload_csum_mode offload_csum_mode; +#endif }; #define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \ @@ -557,8 +596,6 @@ static inline void btrfs_free_chunk_map(struct btrfs_chunk_map *map) } } -struct btrfs_balance_args; -struct btrfs_balance_progress; struct btrfs_balance_control { struct btrfs_balance_args data; struct btrfs_balance_args meta; @@ -661,7 +698,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args); int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, - struct bdev_handle **bdev_handle); + struct file **bdev_file); void __exit btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, @@ -780,9 +817,7 @@ void btrfs_commit_device_sizes(struct btrfs_transaction *trans); struct list_head * __attribute_const__ btrfs_get_fs_uuids(void); bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, struct btrfs_device *failing_dev); -void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, - struct block_device *bdev, - const char *device_path); +void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_device *device); enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags); int btrfs_bg_type_to_factor(u64 flags); diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 118118ca3e1d..b9376ea258ff 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -6,7 +6,11 @@ #ifndef BTRFS_XATTR_H #define BTRFS_XATTR_H -#include <linux/xattr.h> +struct dentry; +struct inode; +struct qstr; +struct xattr_handler; +struct btrfs_trans_handle; extern const struct xattr_handler * const btrfs_xattr_handlers[]; diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 36cf1f0e338e..e5b3f2003896 100644 --- a/fs/btrfs/zlib.c 
+++ b/fs/btrfs/zlib.c @@ -354,18 +354,13 @@ done: } int zlib_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0; int wbits = MAX_WBITS; - unsigned long bytes_left; - unsigned long total_out = 0; - unsigned long pg_offset = 0; - - destlen = min_t(unsigned long, destlen, PAGE_SIZE); - bytes_left = destlen; + unsigned long to_copy; workspace->strm.next_in = data_in; workspace->strm.avail_in = srclen; @@ -390,60 +385,30 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in, return -EIO; } - while (bytes_left > 0) { - unsigned long buf_start; - unsigned long buf_offset; - unsigned long bytes; - - ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH); - if (ret != Z_OK && ret != Z_STREAM_END) - break; - - buf_start = total_out; - total_out = workspace->strm.total_out; - - if (total_out == buf_start) { - ret = -EIO; - break; - } - - if (total_out <= start_byte) - goto next; - - if (total_out > start_byte && buf_start < start_byte) - buf_offset = start_byte - buf_start; - else - buf_offset = 0; - - bytes = min(PAGE_SIZE - pg_offset, - PAGE_SIZE - (buf_offset % PAGE_SIZE)); - bytes = min(bytes, bytes_left); + /* + * Everything (in/out buf) should be at most one sector, there should + * be no need to switch any input/output buffer. + */ + ret = zlib_inflate(&workspace->strm, Z_FINISH); + to_copy = min(workspace->strm.total_out, destlen); + if (ret != Z_STREAM_END) + goto out; - memcpy_to_page(dest_page, pg_offset, - workspace->buf + buf_offset, bytes); + memcpy_to_page(dest_page, dest_pgoff, workspace->buf, to_copy); - pg_offset += bytes; - bytes_left -= bytes; -next: - workspace->strm.next_out = workspace->buf; - workspace->strm.avail_out = workspace->buf_size; - } - - if (ret != Z_STREAM_END && bytes_left != 0) +out: + if (unlikely(to_copy != destlen)) { + pr_warn_ratelimited("BTRFS: inflate failed, decompressed=%lu expected=%zu\n", + to_copy, destlen); ret = -EIO; - else + } else { ret = 0; + } zlib_inflateEnd(&workspace->strm); - /* - * this should only happen if zlib returned fewer bytes than we - * expected. btrfs_get_block is responsible for zeroing from the - * end of the inline extent (destlen) to the end of the page - */ - if (pg_offset < destlen) { - memzero_page(dest_page, pg_offset, destlen - pg_offset); - } + if (unlikely(to_copy < destlen)) + memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy); return ret; } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 5bd76813b23f..5a3d5ec75c5a 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -12,10 +12,8 @@ #include "rcu-string.h" #include "disk-io.h" #include "block-group.h" -#include "transaction.h" #include "dev-replace.h" #include "space-info.h" -#include "super.h" #include "fs.h" #include "accessors.h" #include "bio.h" @@ -824,11 +822,14 @@ static int sb_log_location(struct block_device *bdev, struct blk_zone *zones, reset = &zones[1]; if (reset && reset->cond != BLK_ZONE_COND_EMPTY) { + unsigned int nofs_flags; + ASSERT(sb_zone_is_full(reset)); + nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, - reset->start, reset->len, - GFP_NOFS); + reset->start, reset->len); + memalloc_nofs_restore(nofs_flags); if (ret) return ret; @@ -974,11 +975,14 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror) * explicit ZONE_FINISH is not necessary. 
*/ if (zone->wp != zone->start + zone->capacity) { + unsigned int nofs_flags; int ret; + nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, zone->start, - zone->len, GFP_NOFS); + zone->len); + memalloc_nofs_restore(nofs_flags); if (ret) return ret; } @@ -996,11 +1000,13 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror) int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) { + unsigned int nofs_flags; sector_t zone_sectors; sector_t nr_sectors; u8 zone_sectors_shift; u32 sb_zone; u32 nr_zones; + int ret; zone_sectors = bdev_zone_sectors(bdev); zone_sectors_shift = ilog2(zone_sectors); @@ -1011,9 +1017,12 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) if (sb_zone + 1 >= nr_zones) return -ENOENT; - return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, - zone_start_sector(sb_zone, bdev), - zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS); + nofs_flags = memalloc_nofs_save(); + ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, + zone_start_sector(sb_zone, bdev), + zone_sectors * BTRFS_NR_SB_LOG_ZONES); + memalloc_nofs_restore(nofs_flags); + return ret; } /* @@ -1124,12 +1133,14 @@ static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos) int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, u64 length, u64 *bytes) { + unsigned int nofs_flags; int ret; *bytes = 0; + nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET, - physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT, - GFP_NOFS); + physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT); + memalloc_nofs_restore(nofs_flags); if (ret) return ret; @@ -1639,6 +1650,15 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } out: + /* Reject non SINGLE data profiles without RST */ + if ((map->type & BTRFS_BLOCK_GROUP_DATA) && + (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) && + !fs_info->stripe_root) { + btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", + btrfs_bg_type_to_raid_name(map->type)); + return -EINVAL; + } + if (cache->alloc_offset > cache->zone_capacity) { btrfs_err(fs_info, "zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu", @@ -1670,6 +1690,7 @@ out: } bitmap_free(active); kfree(zone_info); + btrfs_free_chunk_map(map); return ret; } @@ -2055,6 +2076,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) map = block_group->physical_map; + spin_lock(&fs_info->zone_active_bgs_lock); spin_lock(&block_group->lock); if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) { ret = true; @@ -2067,7 +2089,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) goto out_unlock; } - spin_lock(&fs_info->zone_active_bgs_lock); for (i = 0; i < map->num_stripes; i++) { struct btrfs_zoned_device_info *zinfo; int reserved = 0; @@ -2087,20 +2108,17 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) */ if (atomic_read(&zinfo->active_zones_left) <= reserved) { ret = false; - spin_unlock(&fs_info->zone_active_bgs_lock); goto out_unlock; } if (!btrfs_dev_set_active_zone(device, physical)) { /* Cannot activate the zone */ ret = false; - spin_unlock(&fs_info->zone_active_bgs_lock); goto out_unlock; } if (!is_data) zinfo->reserved_active_zones--; } - spin_unlock(&fs_info->zone_active_bgs_lock); /* Successfully activated all the zones */ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); @@ -2108,8 +2126,6 @@ bool btrfs_zone_activate(struct 
btrfs_block_group *block_group) /* For the active block group list */ btrfs_get_block_group(block_group); - - spin_lock(&fs_info->zone_active_bgs_lock); list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs); spin_unlock(&fs_info->zone_active_bgs_lock); @@ -2117,6 +2133,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) out_unlock: spin_unlock(&block_group->lock); + spin_unlock(&fs_info->zone_active_bgs_lock); return ret; } @@ -2238,14 +2255,16 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ struct btrfs_device *device = map->stripes[i].dev; const u64 physical = map->stripes[i].physical; struct btrfs_zoned_device_info *zinfo = device->zone_info; + unsigned int nofs_flags; if (zinfo->max_active_zones == 0) continue; + nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, physical >> SECTOR_SHIFT, - zinfo->zone_size >> SECTOR_SHIFT, - GFP_NOFS); + zinfo->zone_size >> SECTOR_SHIFT); + memalloc_nofs_restore(nofs_flags); if (ret) return ret; diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index f573bda496fb..77c4321e331f 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -4,12 +4,27 @@ #define BTRFS_ZONED_H #include <linux/types.h> +#include <linux/atomic.h> #include <linux/blkdev.h> +#include <linux/blkzoned.h> +#include <linux/errno.h> +#include <linux/spinlock.h> +#include <linux/mutex.h> #include "messages.h" #include "volumes.h" #include "disk-io.h" #include "block-group.h" #include "btrfs_inode.h" +#include "fs.h" + +struct block_device; +struct extent_buffer; +struct btrfs_bio; +struct btrfs_ordered_extent; +struct btrfs_fs_info; +struct btrfs_space_info; +struct btrfs_eb_write_context; +struct btrfs_fs_devices; #define BTRFS_DEFAULT_RECLAIM_THRESH (75) diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 0d66db8bc1d4..92b3744b819b 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -18,8 +18,9 @@ #include <linux/slab.h> #include <linux/zstd.h> #include "misc.h" +#include "fs.h" #include "compression.h" -#include "ctree.h" +#include "super.h" #define ZSTD_BTRFS_MAX_WINDOWLOG 17 #define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG) @@ -618,80 +619,48 @@ done: } int zstd_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); + struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb); + const u32 sectorsize = fs_info->sectorsize; zstd_dstream *stream; int ret = 0; - size_t ret2; - unsigned long total_out = 0; - unsigned long pg_offset = 0; + unsigned long to_copy = 0; stream = zstd_init_dstream( ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); if (!stream) { pr_warn("BTRFS: zstd_init_dstream failed\n"); - ret = -EIO; goto finish; } - destlen = min_t(size_t, destlen, PAGE_SIZE); - workspace->in_buf.src = data_in; workspace->in_buf.pos = 0; workspace->in_buf.size = srclen; workspace->out_buf.dst = workspace->buf; workspace->out_buf.pos = 0; - workspace->out_buf.size = PAGE_SIZE; - - ret2 = 1; - while (pg_offset < destlen - && workspace->in_buf.pos < workspace->in_buf.size) { - unsigned long buf_start; - unsigned long buf_offset; - unsigned long bytes; - - /* Check if the frame is over and we still need more input */ - if (ret2 == 0) { - pr_debug("BTRFS: zstd_decompress_stream ended early\n"); - ret = -EIO; - goto finish; - } - ret2 = 
zstd_decompress_stream(stream, &workspace->out_buf, - &workspace->in_buf); - if (zstd_is_error(ret2)) { - pr_debug("BTRFS: zstd_decompress_stream returned %d\n", - zstd_get_error_code(ret2)); - ret = -EIO; - goto finish; - } - - buf_start = total_out; - total_out += workspace->out_buf.pos; - workspace->out_buf.pos = 0; - - if (total_out <= start_byte) - continue; - - if (total_out > start_byte && buf_start < start_byte) - buf_offset = start_byte - buf_start; - else - buf_offset = 0; - - bytes = min_t(unsigned long, destlen - pg_offset, - workspace->out_buf.size - buf_offset); - - memcpy_to_page(dest_page, pg_offset, - workspace->out_buf.dst + buf_offset, bytes); - - pg_offset += bytes; + workspace->out_buf.size = sectorsize; + + /* + * Since both input and output buffers should not exceed one sector, + * one call should end the decompression. + */ + ret = zstd_decompress_stream(stream, &workspace->out_buf, &workspace->in_buf); + if (zstd_is_error(ret)) { + pr_warn_ratelimited("BTRFS: zstd_decompress_stream return %d\n", + zstd_get_error_code(ret)); + goto finish; } - ret = 0; + to_copy = workspace->out_buf.pos; + memcpy_to_page(dest_page, dest_pgoff, workspace->out_buf.dst, to_copy); finish: - if (pg_offset < destlen) { - memzero_page(dest_page, pg_offset, destlen - pg_offset); + /* Error or early end. */ + if (unlikely(to_copy < destlen)) { + ret = -EIO; + memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy); } return ret; } diff --git a/fs/buffer.c b/fs/buffer.c index d3bcf601d3e5..4f73d23c2c46 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -55,7 +55,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, - struct writeback_control *wbc); + enum rw_hint hint, struct writeback_control *wbc); #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) @@ -464,7 +464,7 @@ EXPORT_SYMBOL(mark_buffer_async_write); * a successful fsync(). For example, ext2 indirect blocks need to be * written back and waited upon before fsync() returns. * - * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(), + * The functions mark_buffer_dirty_inode(), fsync_inode_buffers(), * inode_has_buffers() and invalidate_inode_buffers() are provided for the * management of a list of dependent buffers at ->i_mapping->i_private_list. 
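A recurring pattern in the fs/btrfs/zoned.c hunks above: blkdev_zone_mgmt() no longer takes a gfp_t, so callers that must not recurse into the filesystem bracket the call with a scoped NOFS section instead of passing GFP_NOFS down. A sketch of that shape, assuming the four-argument blkdev_zone_mgmt(); the wrapper name is illustrative:

```c
#include <linux/blkdev.h>
#include <linux/sched/mm.h>

static int example_reset_zone(struct block_device *bdev, sector_t start,
			      sector_t nr_sects)
{
	unsigned int nofs_flags;
	int ret;

	/* All allocations in this scope implicitly behave as GFP_NOFS. */
	nofs_flags = memalloc_nofs_save();
	ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, start, nr_sects);
	memalloc_nofs_restore(nofs_flags);

	return ret;
}
```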
* @@ -1889,7 +1889,8 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio, do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc); + submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, + inode->i_write_hint, wbc); nr_underway++; } bh = next; @@ -1944,7 +1945,8 @@ recover: struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); - submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc); + submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, + inode->i_write_hint, wbc); nr_underway++; } bh = next; @@ -2756,6 +2758,7 @@ static void end_bio_bh_io_sync(struct bio *bio) } static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, + enum rw_hint write_hint, struct writeback_control *wbc) { const enum req_op op = opf & REQ_OP_MASK; @@ -2783,6 +2786,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_write_hint = write_hint; __bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); @@ -2802,7 +2806,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, void submit_bh(blk_opf_t opf, struct buffer_head *bh) { - submit_bh_wbc(opf, bh, NULL); + submit_bh_wbc(opf, bh, WRITE_LIFE_NOT_SET, NULL); } EXPORT_SYMBOL(submit_bh); @@ -3121,12 +3125,8 @@ void __init buffer_init(void) unsigned long nrpages; int ret; - bh_cachep = kmem_cache_create("buffer_head", - sizeof(struct buffer_head), 0, - (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| - SLAB_MEM_SPREAD), - NULL); - + bh_cachep = KMEM_CACHE(buffer_head, + SLAB_RECLAIM_ACCOUNT|SLAB_PANIC); /* * Limit the bh occupancy to 10% of ZONE_NORMAL */ diff --git a/fs/cachefiles/cache.c b/fs/cachefiles/cache.c index 7077f72e6f47..f449f7340aad 100644 --- a/fs/cachefiles/cache.c +++ b/fs/cachefiles/cache.c @@ -168,6 +168,8 @@ error_unsupported: dput(root); error_open_root: cachefiles_end_secure(cache, saved_cred); + put_cred(cache->cache_cred); + cache->cache_cred = NULL; error_getsec: fscache_relinquish_cache(cache_cookie); cache->cache = NULL; diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index 3f24905f4066..6465e2574230 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -816,6 +816,7 @@ static void cachefiles_daemon_unbind(struct cachefiles_cache *cache) cachefiles_put_directory(cache->graveyard); cachefiles_put_directory(cache->store); mntput(cache->mnt); + put_cred(cache->cache_cred); kfree(cache->rootdirname); kfree(cache->secctx); diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index 5fd74ec60bef..4ba42f1fa3b4 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -539,6 +539,9 @@ int cachefiles_ondemand_init_object(struct cachefiles_object *object) struct fscache_volume *volume = object->volume->vcookie; size_t volume_key_size, cookie_key_size, data_len; + if (!object->ondemand) + return 0; + /* * CacheFiles will firstly check the cache file under the root cache * directory. 
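The fs/buffer.c changes above thread an enum rw_hint through submit_bh_wbc() so that writeback I/O carries the inode's lifetime hint while plain submit_bh() stays neutral with WRITE_LIFE_NOT_SET. A sketch of the plumbing, assuming bio->bi_write_hint; the function is illustrative:

```c
#include <linux/bio.h>
#include <linux/fs.h>

static void example_submit_write(struct inode *inode, struct bio *bio,
				 bool for_writeback)
{
	/* Writeback propagates the inode hint; other paths stay neutral. */
	bio->bi_write_hint = for_writeback ? inode->i_write_hint
					   : WRITE_LIFE_NOT_SET;
	submit_bio(bio);
}
```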
If the coherency check failed, it will fallback to diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 9c02f328c966..7fb4aae97412 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1452,7 +1452,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, if (flushing & CEPH_CAP_XATTR_EXCL) { arg->old_xattr_buf = __ceph_build_xattrs_blob(ci); arg->xattr_version = ci->i_xattrs.version; - arg->xattr_buf = ci->i_xattrs.blob; + arg->xattr_buf = ceph_buffer_get(ci->i_xattrs.blob); } else { arg->xattr_buf = NULL; arg->old_xattr_buf = NULL; @@ -1553,6 +1553,7 @@ static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci) encode_cap_msg(msg, arg); ceph_con_send(&arg->session->s_con, msg); ceph_buffer_put(arg->old_xattr_buf); + ceph_buffer_put(arg->xattr_buf); if (arg->wake) wake_up_all(&ci->i_cap_wq); } @@ -2155,6 +2156,30 @@ retry: ceph_cap_string(cap->implemented), ceph_cap_string(revoking)); + /* completed revocation? going down and there are no caps? */ + if (revoking) { + if ((revoking & cap_used) == 0) { + doutc(cl, "completed revocation of %s\n", + ceph_cap_string(cap->implemented & ~cap->issued)); + goto ack; + } + + /* + * If the "i_wrbuffer_ref" was increased by mmap or generic + * cache write just before the ceph_check_caps() is called, + * the Fb capability revoking will fail this time. Then we + * must wait for the BDI's delayed work to flush the dirty + * pages and to release the "i_wrbuffer_ref", which will cost + * at most 5 seconds. That means the MDS needs to wait at + * most 5 seconds to finish the Fb capability's revocation. + * + * Let's queue a writeback for it. + */ + if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref && + (revoking & CEPH_CAP_FILE_BUFFER)) + queue_writeback = true; + } + if (cap == ci->i_auth_cap && (cap->issued & CEPH_CAP_FILE_WR)) { /* request larger max_size from MDS? */ @@ -2182,30 +2207,6 @@ retry: } } - /* completed revocation? going down and there are no caps? */ - if (revoking) { - if ((revoking & cap_used) == 0) { - doutc(cl, "completed revocation of %s\n", - ceph_cap_string(cap->implemented & ~cap->issued)); - goto ack; - } - - /* - * If the "i_wrbuffer_ref" was increased by mmap or generic - * cache write just before the ceph_check_caps() is called, - * the Fb capability revoking will fail this time. Then we - * must wait for the BDI's delayed work to flush the dirty - * pages and to release the "i_wrbuffer_ref", which will cost - * at most 5 seconds. That means the MDS needs to wait at - * most 5 seconds to finished the Fb capability's revocation. - * - * Let's queue a writeback for it. - */ - if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref && - (revoking & CEPH_CAP_FILE_BUFFER)) - queue_writeback = true; - } - /* want more caps from mds? */ if (want & ~cap->mds_wanted) { if (want & ~(cap->mds_wanted | cap->issued)) @@ -3215,7 +3216,6 @@ static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci, enum put_cap_refs_mode { PUT_CAP_REFS_SYNC = 0, - PUT_CAP_REFS_NO_CHECK, PUT_CAP_REFS_ASYNC, }; @@ -3331,11 +3331,6 @@ void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had) __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_ASYNC); } -void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had) -{ - __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_NO_CHECK); -} - /* * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap * context. Adjust per-snap dirty page accounting as appropriate.
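The __prep_cap()/__send_cap() hunks above close a use-after-free window: a pointer stashed in the cap message args now owns a reference, taken with ceph_buffer_get() when stored and dropped with ceph_buffer_put() after the message is sent. A generic sketch of that own-what-you-stash rule, assuming the blob is non-NULL at prep time (as in the xattr-flushing path); the struct and function names are illustrative:

```c
#include <linux/ceph/buffer.h>

struct example_msg_args {
	struct ceph_buffer *xattr_buf;	/* owned reference */
};

static void example_prep(struct example_msg_args *arg,
			 struct ceph_buffer *blob)
{
	/* Pin the blob for as long as the args point at it. */
	arg->xattr_buf = ceph_buffer_get(blob);
}

static void example_send_done(struct example_msg_args *arg)
{
	/* ceph_buffer_put() tolerates NULL, so error paths can share this. */
	ceph_buffer_put(arg->xattr_buf);
	arg->xattr_buf = NULL;
}
```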
@@ -4777,7 +4772,22 @@ int ceph_drop_caps_for_unlink(struct inode *inode) if (__ceph_caps_dirty(ci)) { struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; - __cap_delay_requeue_front(mdsc, ci); + + doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode, + ceph_vinop(inode)); + spin_lock(&mdsc->cap_unlink_delay_lock); + ci->i_ceph_flags |= CEPH_I_FLUSH; + if (!list_empty(&ci->i_cap_delay_list)) + list_del_init(&ci->i_cap_delay_list); + list_add_tail(&ci->i_cap_delay_list, + &mdsc->cap_unlink_delay_list); + spin_unlock(&mdsc->cap_unlink_delay_lock); + + /* + * Fire the work immediately, because the MDS may be + * waiting for caps release. + */ + ceph_queue_cap_unlink_work(mdsc); } } spin_unlock(&ci->i_ceph_lock); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 0c25d326afc4..7b2e77517f23 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -78,6 +78,8 @@ struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry, if (!inode) return ERR_PTR(-ENOMEM); + inode->i_blkbits = CEPH_FSCRYPT_BLOCK_SHIFT; + if (!S_ISLNK(*mode)) { err = ceph_pre_init_acls(dir, mode, as_ctx); if (err < 0) diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index e07ad29ff8b9..ebf4ac0055dd 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -33,7 +33,7 @@ void __init ceph_flock_init(void) static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src) { - struct inode *inode = file_inode(dst->fl_file); + struct inode *inode = file_inode(dst->c.flc_file); atomic_inc(&ceph_inode(inode)->i_filelock_ref); dst->fl_u.ceph.inode = igrab(inode); } @@ -110,17 +110,18 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, else length = fl->fl_end - fl->fl_start + 1; - owner = secure_addr(fl->fl_owner); + owner = secure_addr(fl->c.flc_owner); doutc(cl, "rule: %d, op: %d, owner: %llx, pid: %llu, " "start: %llu, length: %llu, wait: %d, type: %d\n", - (int)lock_type, (int)operation, owner, (u64)fl->fl_pid, - fl->fl_start, length, wait, fl->fl_type); + (int)lock_type, (int)operation, owner, + (u64) fl->c.flc_pid, + fl->fl_start, length, wait, fl->c.flc_type); req->r_args.filelock_change.rule = lock_type; req->r_args.filelock_change.type = cmd; req->r_args.filelock_change.owner = cpu_to_le64(owner); - req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid); + req->r_args.filelock_change.pid = cpu_to_le64((u64) fl->c.flc_pid); req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start); req->r_args.filelock_change.length = cpu_to_le64(length); req->r_args.filelock_change.wait = wait; @@ -130,13 +131,13 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, err = ceph_mdsc_wait_request(mdsc, req, wait ?
ceph_lock_wait_for_completion : NULL); if (!err && operation == CEPH_MDS_OP_GETFILELOCK) { - fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid); + fl->c.flc_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid); if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type) - fl->fl_type = F_RDLCK; + fl->c.flc_type = F_RDLCK; else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type) - fl->fl_type = F_WRLCK; + fl->c.flc_type = F_WRLCK; else - fl->fl_type = F_UNLCK; + fl->c.flc_type = F_UNLCK; fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start); length = le64_to_cpu(req->r_reply_info.filelock_reply->start) + @@ -150,8 +151,8 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, ceph_mdsc_put_request(req); doutc(cl, "rule: %d, op: %d, pid: %llu, start: %llu, " "length: %llu, wait: %d, type: %d, err code %d\n", - (int)lock_type, (int)operation, (u64)fl->fl_pid, - fl->fl_start, length, wait, fl->fl_type, err); + (int)lock_type, (int)operation, (u64) fl->c.flc_pid, + fl->fl_start, length, wait, fl->c.flc_type, err); return err; } @@ -227,10 +228,10 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc, static int try_unlock_file(struct file *file, struct file_lock *fl) { int err; - unsigned int orig_flags = fl->fl_flags; - fl->fl_flags |= FL_EXISTS; + unsigned int orig_flags = fl->c.flc_flags; + fl->c.flc_flags |= FL_EXISTS; err = locks_lock_file_wait(file, fl); - fl->fl_flags = orig_flags; + fl->c.flc_flags = orig_flags; if (err == -ENOENT) { if (!(orig_flags & FL_EXISTS)) err = 0; @@ -253,13 +254,13 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) u8 wait = 0; u8 lock_cmd; - if (!(fl->fl_flags & FL_POSIX)) + if (!(fl->c.flc_flags & FL_POSIX)) return -ENOLCK; if (ceph_inode_is_shutdown(inode)) return -ESTALE; - doutc(cl, "fl_owner: %p\n", fl->fl_owner); + doutc(cl, "fl_owner: %p\n", fl->c.flc_owner); /* set wait bit as appropriate, then make command as Ceph expects it*/ if (IS_GETLK(cmd)) @@ -273,19 +274,19 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) } spin_unlock(&ci->i_ceph_lock); if (err < 0) { - if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type) + if (op == CEPH_MDS_OP_SETFILELOCK && lock_is_unlock(fl)) posix_lock_file(file, fl, NULL); return err; } - if (F_RDLCK == fl->fl_type) + if (lock_is_read(fl)) lock_cmd = CEPH_LOCK_SHARED; - else if (F_WRLCK == fl->fl_type) + else if (lock_is_write(fl)) lock_cmd = CEPH_LOCK_EXCL; else lock_cmd = CEPH_LOCK_UNLOCK; - if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type) { + if (op == CEPH_MDS_OP_SETFILELOCK && lock_is_unlock(fl)) { err = try_unlock_file(file, fl); if (err <= 0) return err; @@ -293,7 +294,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl); if (!err) { - if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK != fl->fl_type) { + if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK != fl->c.flc_type) { doutc(cl, "locking locally\n"); err = posix_lock_file(file, fl, NULL); if (err) { @@ -319,13 +320,13 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) u8 wait = 0; u8 lock_cmd; - if (!(fl->fl_flags & FL_FLOCK)) + if (!(fl->c.flc_flags & FL_FLOCK)) return -ENOLCK; if (ceph_inode_is_shutdown(inode)) return -ESTALE; - doutc(cl, "fl_file: %p\n", fl->fl_file); + doutc(cl, "fl_file: %p\n", fl->c.flc_file); spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) { @@ -333,7 +334,7 @@ int 
ceph_flock(struct file *file, int cmd, struct file_lock *fl) } spin_unlock(&ci->i_ceph_lock); if (err < 0) { - if (F_UNLCK == fl->fl_type) + if (lock_is_unlock(fl)) locks_lock_file_wait(file, fl); return err; } @@ -341,14 +342,14 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) if (IS_SETLKW(cmd)) wait = 1; - if (F_RDLCK == fl->fl_type) + if (lock_is_read(fl)) lock_cmd = CEPH_LOCK_SHARED; - else if (F_WRLCK == fl->fl_type) + else if (lock_is_write(fl)) lock_cmd = CEPH_LOCK_EXCL; else lock_cmd = CEPH_LOCK_UNLOCK; - if (F_UNLCK == fl->fl_type) { + if (lock_is_unlock(fl)) { err = try_unlock_file(file, fl); if (err <= 0) return err; @@ -356,7 +357,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, inode, lock_cmd, wait, fl); - if (!err && F_UNLCK != fl->fl_type) { + if (!err && F_UNLCK != fl->c.flc_type) { err = locks_lock_file_wait(file, fl); if (err) { ceph_lock_message(CEPH_LOCK_FLOCK, @@ -385,9 +386,9 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) ctx = locks_inode_context(inode); if (ctx) { spin_lock(&ctx->flc_lock); - list_for_each_entry(lock, &ctx->flc_posix, fl_list) + for_each_file_lock(lock, &ctx->flc_posix) ++(*fcntl_count); - list_for_each_entry(lock, &ctx->flc_flock, fl_list) + for_each_file_lock(lock, &ctx->flc_flock) ++(*flock_count); spin_unlock(&ctx->flc_lock); } @@ -408,10 +409,10 @@ static int lock_to_ceph_filelock(struct inode *inode, cephlock->start = cpu_to_le64(lock->fl_start); cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); cephlock->client = cpu_to_le64(0); - cephlock->pid = cpu_to_le64((u64)lock->fl_pid); - cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner)); + cephlock->pid = cpu_to_le64((u64) lock->c.flc_pid); + cephlock->owner = cpu_to_le64(secure_addr(lock->c.flc_owner)); - switch (lock->fl_type) { + switch (lock->c.flc_type) { case F_RDLCK: cephlock->type = CEPH_LOCK_SHARED; break; @@ -422,7 +423,8 @@ static int lock_to_ceph_filelock(struct inode *inode, cephlock->type = CEPH_LOCK_UNLOCK; break; default: - doutc(cl, "Have unknown lock type %d\n", lock->fl_type); + doutc(cl, "Have unknown lock type %d\n", + lock->c.flc_type); err = -EINVAL; } @@ -453,7 +455,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode, return 0; spin_lock(&ctx->flc_lock); - list_for_each_entry(lock, &ctx->flc_posix, fl_list) { + for_each_file_lock(lock, &ctx->flc_posix) { ++seen_fcntl; if (seen_fcntl > num_fcntl_locks) { err = -ENOSPC; @@ -464,7 +466,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode, goto fail; ++l; } - list_for_each_entry(lock, &ctx->flc_flock, fl_list) { + for_each_file_lock(lock, &ctx->flc_flock) { ++seen_flock; if (seen_flock > num_flock_locks) { err = -ENOSPC; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 548d1de379f3..3ab9c268a8bb 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1089,7 +1089,7 @@ void ceph_mdsc_release_request(struct kref *kref) struct ceph_mds_request *req = container_of(kref, struct ceph_mds_request, r_kref); - ceph_mdsc_release_dir_caps_no_check(req); + ceph_mdsc_release_dir_caps_async(req); destroy_reply_info(&req->r_reply_info); if (req->r_request) ceph_msg_put(req->r_request); @@ -2484,6 +2484,50 @@ void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr) } } +void ceph_queue_cap_unlink_work(struct ceph_mds_client *mdsc) +{ + struct ceph_client *cl = mdsc->fsc->client; + if (mdsc->stopping) + return; + + if 
(queue_work(mdsc->fsc->cap_wq, &mdsc->cap_unlink_work)) { + doutc(cl, "caps unlink work queued\n"); + } else { + doutc(cl, "failed to queue caps unlink work\n"); + } +} + +static void ceph_cap_unlink_work(struct work_struct *work) +{ + struct ceph_mds_client *mdsc = + container_of(work, struct ceph_mds_client, cap_unlink_work); + struct ceph_client *cl = mdsc->fsc->client; + + doutc(cl, "begin\n"); + spin_lock(&mdsc->cap_unlink_delay_lock); + while (!list_empty(&mdsc->cap_unlink_delay_list)) { + struct ceph_inode_info *ci; + struct inode *inode; + + ci = list_first_entry(&mdsc->cap_unlink_delay_list, + struct ceph_inode_info, + i_cap_delay_list); + list_del_init(&ci->i_cap_delay_list); + + inode = igrab(&ci->netfs.inode); + if (inode) { + spin_unlock(&mdsc->cap_unlink_delay_lock); + doutc(cl, "on %p %llx.%llx\n", inode, + ceph_vinop(inode)); + ceph_check_caps(ci, CHECK_CAPS_FLUSH); + iput(inode); + spin_lock(&mdsc->cap_unlink_delay_lock); + } + } + spin_unlock(&mdsc->cap_unlink_delay_lock); + doutc(cl, "done\n"); +} + /* * requests */ @@ -4261,7 +4305,7 @@ void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req) } } -void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req) +void ceph_mdsc_release_dir_caps_async(struct ceph_mds_request *req) { struct ceph_client *cl = req->r_mdsc->fsc->client; int dcaps; @@ -4269,8 +4313,7 @@ void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req) dcaps = xchg(&req->r_dir_caps, 0); if (dcaps) { doutc(cl, "releasing r_dir_caps=%s\n", ceph_cap_string(dcaps)); - ceph_put_cap_refs_no_check_caps(ceph_inode(req->r_parent), - dcaps); + ceph_put_cap_refs_async(ceph_inode(req->r_parent), dcaps); } } @@ -4306,7 +4349,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, if (req->r_session->s_mds != session->s_mds) continue; - ceph_mdsc_release_dir_caps_no_check(req); + ceph_mdsc_release_dir_caps_async(req); __send_request(session, req, true); } @@ -5360,6 +5403,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) INIT_LIST_HEAD(&mdsc->cap_delay_list); INIT_LIST_HEAD(&mdsc->cap_wait_list); spin_lock_init(&mdsc->cap_delay_lock); + INIT_LIST_HEAD(&mdsc->cap_unlink_delay_list); + spin_lock_init(&mdsc->cap_unlink_delay_lock); INIT_LIST_HEAD(&mdsc->snap_flush_list); spin_lock_init(&mdsc->snap_flush_lock); mdsc->last_cap_flush_tid = 1; @@ -5368,6 +5413,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) spin_lock_init(&mdsc->cap_dirty_lock); init_waitqueue_head(&mdsc->cap_flushing_wq); INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work); + INIT_WORK(&mdsc->cap_unlink_work, ceph_cap_unlink_work); err = ceph_metric_init(&mdsc->metric); if (err) goto err_mdsmap; @@ -5641,6 +5687,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) ceph_cleanup_global_and_empty_realms(mdsc); cancel_work_sync(&mdsc->cap_reclaim_work); + cancel_work_sync(&mdsc->cap_unlink_work); cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ doutc(cl, "done\n"); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 2e6ddaa13d72..03f8ff00874f 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -462,6 +462,8 @@ struct ceph_mds_client { unsigned long last_renew_caps; /* last time we renewed our caps */ struct list_head cap_delay_list; /* caps with delayed release */ spinlock_t cap_delay_lock; /* protects cap_delay_list */ + struct list_head cap_unlink_delay_list; /* caps with delayed release for unlink */ + spinlock_t cap_unlink_delay_lock; /* protects cap_unlink_delay_list */ struct list_head snap_flush_list; 
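ceph_cap_unlink_work() above follows a common drain-loop shape: entries are popped under the spinlock, but the lock is dropped around the heavyweight ceph_check_caps() call while an igrab() reference keeps the inode alive. Distilled to the core loop (this reuses fs/ceph's internal types and is a sketch, not standalone code):

```c
/* Requires fs/ceph's "super.h" and "mds_client.h". */
static void example_drain(struct ceph_mds_client *mdsc)
{
	spin_lock(&mdsc->cap_unlink_delay_lock);
	while (!list_empty(&mdsc->cap_unlink_delay_list)) {
		struct ceph_inode_info *ci;
		struct inode *inode;

		ci = list_first_entry(&mdsc->cap_unlink_delay_list,
				      struct ceph_inode_info,
				      i_cap_delay_list);
		list_del_init(&ci->i_cap_delay_list);

		inode = igrab(&ci->netfs.inode);	/* may fail if the inode is dying */
		if (inode) {
			/* Drop the lock across the expensive call. */
			spin_unlock(&mdsc->cap_unlink_delay_lock);
			ceph_check_caps(ci, CHECK_CAPS_FLUSH);
			iput(inode);
			spin_lock(&mdsc->cap_unlink_delay_lock);
		}
	}
	spin_unlock(&mdsc->cap_unlink_delay_lock);
}
```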
/* cap_snaps ready to flush */ spinlock_t snap_flush_lock; @@ -475,6 +477,8 @@ struct ceph_mds_client { struct work_struct cap_reclaim_work; atomic_t cap_reclaim_pending; + struct work_struct cap_unlink_work; + /* * Cap reservations * @@ -552,7 +556,7 @@ extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, struct inode *dir, struct ceph_mds_request *req); extern void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req); -extern void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req); +extern void ceph_mdsc_release_dir_caps_async(struct ceph_mds_request *req); static inline void ceph_mdsc_get_request(struct ceph_mds_request *req) { kref_get(&req->r_kref); @@ -574,6 +578,7 @@ extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc); extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr); +extern void ceph_queue_cap_unlink_work(struct ceph_mds_client *mdsc); extern int ceph_iterate_session_caps(struct ceph_mds_session *session, int (*cb)(struct inode *, int mds, void *), void *arg); diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index fae97c25ce58..8109aba66e02 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -380,10 +380,11 @@ struct ceph_mdsmap *ceph_mdsmap_decode(struct ceph_mds_client *mdsc, void **p, ceph_decode_skip_8(p, end, bad_ext); /* required_client_features */ ceph_decode_skip_set(p, end, 64, bad_ext); + /* bal_rank_mask */ + ceph_decode_skip_string(p, end, bad_ext); + } + if (mdsmap_ev >= 18) { ceph_decode_64_safe(p, end, m->m_max_xattr_size, bad_ext); - } else { - /* This forces the usage of the (sync) SETXATTR Op */ - m->m_max_xattr_size = 0; } bad_ext: doutc(cl, "m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n", diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h index 89f1931f1ba6..1f2171dd01bf 100644 --- a/fs/ceph/mdsmap.h +++ b/fs/ceph/mdsmap.h @@ -27,7 +27,11 @@ struct ceph_mdsmap { u32 m_session_timeout; /* seconds */ u32 m_session_autoclose; /* seconds */ u64 m_max_file_size; - u64 m_max_xattr_size; /* maximum size for xattrs blob */ + /* + * maximum size for xattrs blob. + * Zeroed by default to force the usage of the (sync) SETXATTR Op. 
+ */ + u64 m_max_xattr_size; u32 m_max_mds; /* expected up:active mds number */ u32 m_num_active_mds; /* actual up:active mds number */ u32 possible_max_rank; /* possible max rank index */ diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 5ec102f6b1ac..885cb5d4e771 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -928,36 +928,36 @@ static int __init init_caches(void) ceph_inode_cachep = kmem_cache_create("ceph_inode_info", sizeof(struct ceph_inode_info), __alignof__(struct ceph_inode_info), - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| - SLAB_ACCOUNT, ceph_inode_init_once); + SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, + ceph_inode_init_once); if (!ceph_inode_cachep) return -ENOMEM; - ceph_cap_cachep = KMEM_CACHE(ceph_cap, SLAB_MEM_SPREAD); + ceph_cap_cachep = KMEM_CACHE(ceph_cap, 0); if (!ceph_cap_cachep) goto bad_cap; - ceph_cap_snap_cachep = KMEM_CACHE(ceph_cap_snap, SLAB_MEM_SPREAD); + ceph_cap_snap_cachep = KMEM_CACHE(ceph_cap_snap, 0); if (!ceph_cap_snap_cachep) goto bad_cap_snap; ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); + SLAB_RECLAIM_ACCOUNT); if (!ceph_cap_flush_cachep) goto bad_cap_flush; ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); + SLAB_RECLAIM_ACCOUNT); if (!ceph_dentry_cachep) goto bad_dentry; - ceph_file_cachep = KMEM_CACHE(ceph_file_info, SLAB_MEM_SPREAD); + ceph_file_cachep = KMEM_CACHE(ceph_file_info, 0); if (!ceph_file_cachep) goto bad_file; - ceph_dir_file_cachep = KMEM_CACHE(ceph_dir_file_info, SLAB_MEM_SPREAD); + ceph_dir_file_cachep = KMEM_CACHE(ceph_dir_file_info, 0); if (!ceph_dir_file_cachep) goto bad_dir_file; - ceph_mds_request_cachep = KMEM_CACHE(ceph_mds_request, SLAB_MEM_SPREAD); + ceph_mds_request_cachep = KMEM_CACHE(ceph_mds_request, 0); if (!ceph_mds_request_cachep) goto bad_mds_req; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index b06e2bc86221..b63b4cd9b5b6 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1255,8 +1255,6 @@ extern void ceph_take_cap_refs(struct ceph_inode_info *ci, int caps, extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); extern void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had); -extern void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, - int had); extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, struct ceph_snap_context *snapc); extern void __ceph_remove_capsnap(struct inode *inode, diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 0c7c2528791e..6898dc621011 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -24,6 +24,8 @@ #include <linux/pid_namespace.h> #include <linux/uaccess.h> #include <linux/fs.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/vmalloc.h> #include <linux/coda.h> @@ -70,8 +72,8 @@ int __init coda_init_inodecache(void) { coda_inode_cachep = kmem_cache_create("coda_inode_cache", sizeof(struct coda_inode_info), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| - SLAB_ACCOUNT, init_once); + SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, + init_once); if (coda_inode_cachep == NULL) return -ENOMEM; return 0; @@ -87,10 +89,10 @@ void coda_destroy_inodecache(void) kmem_cache_destroy(coda_inode_cachep); } -static int coda_remount(struct super_block *sb, int *flags, char *data) +static int coda_reconfigure(struct fs_context *fc) { - sync_filesystem(sb); - *flags |= SB_NOATIME; + sync_filesystem(fc->root->d_sb); + fc->sb_flags |= SB_NOATIME; return 0; } @@ -102,78 
+104,102 @@ static const struct super_operations coda_super_operations = .evict_inode = coda_evict_inode, .put_super = coda_put_super, .statfs = coda_statfs, - .remount_fs = coda_remount, }; -static int get_device_index(struct coda_mount_data *data) +struct coda_fs_context { + int idx; +}; + +enum { + Opt_fd, +}; + +static const struct fs_parameter_spec coda_param_specs[] = { + fsparam_fd ("fd", Opt_fd), + {} +}; + +static int coda_parse_fd(struct fs_context *fc, int fd) { + struct coda_fs_context *ctx = fc->fs_private; struct fd f; struct inode *inode; int idx; - if (data == NULL) { - pr_warn("%s: Bad mount data\n", __func__); - return -1; - } - - if (data->version != CODA_MOUNT_VERSION) { - pr_warn("%s: Bad mount version\n", __func__); - return -1; - } - - f = fdget(data->fd); + f = fdget(fd); if (!f.file) - goto Ebadf; + return -EBADF; inode = file_inode(f.file); if (!S_ISCHR(inode->i_mode) || imajor(inode) != CODA_PSDEV_MAJOR) { fdput(f); - goto Ebadf; + return invalf(fc, "coda: Not coda psdev"); } idx = iminor(inode); fdput(f); - if (idx < 0 || idx >= MAX_CODADEVS) { - pr_warn("%s: Bad minor number\n", __func__); - return -1; + if (idx < 0 || idx >= MAX_CODADEVS) + return invalf(fc, "coda: Bad minor number"); + ctx->idx = idx; + return 0; +} + +static int coda_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, coda_param_specs, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_fd: + return coda_parse_fd(fc, result.uint_32); } - return idx; -Ebadf: - pr_warn("%s: Bad file\n", __func__); - return -1; + return 0; +} + +/* + * Parse coda's binary mount data format. We ignore any errors and go with index + * 0 if we get one for backward compatibility.
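coda_parse_fd() above is the canonical shape for validating a device fd passed as a mount parameter: resolve it with fdget(), verify it refers to the expected character device, extract the minor, and balance every path with fdput(). A generic sketch with the major number and device count passed in as placeholders:

```c
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/fs_context.h>

/* Returns the validated minor number, or a negative errno. */
static int example_parse_dev_fd(struct fs_context *fc, int fd,
				unsigned int want_major, int max_devs)
{
	struct fd f = fdget(fd);
	struct inode *inode;
	int idx;

	if (!f.file)
		return -EBADF;
	inode = file_inode(f.file);
	if (!S_ISCHR(inode->i_mode) || imajor(inode) != want_major) {
		fdput(f);
		return invalf(fc, "example: not the expected char device");
	}
	idx = iminor(inode);
	fdput(f);
	if (idx >= max_devs)
		return invalf(fc, "example: bad minor number");
	return idx;
}
```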
+ */ +static int coda_parse_monolithic(struct fs_context *fc, void *_data) +{ + struct coda_mount_data *data = _data; + + if (!data) + return invalf(fc, "coda: Bad mount data"); + + if (data->version != CODA_MOUNT_VERSION) + return invalf(fc, "coda: Bad mount version"); + + coda_parse_fd(fc, data->fd); + return 0; } -static int coda_fill_super(struct super_block *sb, void *data, int silent) +static int coda_fill_super(struct super_block *sb, struct fs_context *fc) { + struct coda_fs_context *ctx = fc->fs_private; struct inode *root = NULL; struct venus_comm *vc; struct CodaFid fid; int error; - int idx; - - if (task_active_pid_ns(current) != &init_pid_ns) - return -EINVAL; - - idx = get_device_index((struct coda_mount_data *) data); - /* Ignore errors in data, for backward compatibility */ - if(idx == -1) - idx = 0; - - pr_info("%s: device index: %i\n", __func__, idx); + infof(fc, "coda: device index: %i\n", ctx->idx); - vc = &coda_comms[idx]; + vc = &coda_comms[ctx->idx]; mutex_lock(&vc->vc_mutex); if (!vc->vc_inuse) { - pr_warn("%s: No pseudo device\n", __func__); + errorf(fc, "coda: No pseudo device"); error = -EINVAL; goto unlock_out; } if (vc->vc_sb) { - pr_warn("%s: Device already mounted\n", __func__); + errorf(fc, "coda: Device already mounted"); error = -EBUSY; goto unlock_out; } @@ -313,18 +339,45 @@ static int coda_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -/* init_coda: used by filesystems.c to register coda */ +static int coda_get_tree(struct fs_context *fc) +{ + if (task_active_pid_ns(current) != &init_pid_ns) + return -EINVAL; -static struct dentry *coda_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) + return get_tree_nodev(fc, coda_fill_super); +} + +static void coda_free_fc(struct fs_context *fc) { - return mount_nodev(fs_type, flags, data, coda_fill_super); + kfree(fc->fs_private); +} + +static const struct fs_context_operations coda_context_ops = { + .free = coda_free_fc, + .parse_param = coda_parse_param, + .parse_monolithic = coda_parse_monolithic, + .get_tree = coda_get_tree, + .reconfigure = coda_reconfigure, +}; + +static int coda_init_fs_context(struct fs_context *fc) +{ + struct coda_fs_context *ctx; + + ctx = kzalloc(sizeof(struct coda_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + fc->fs_private = ctx; + fc->ops = &coda_context_ops; + return 0; } struct file_system_type coda_fs_type = { .owner = THIS_MODULE, .name = "coda", - .mount = coda_mount, + .init_fs_context = coda_init_fs_context, + .parameters = coda_param_specs, .kill_sb = kill_anon_super, .fs_flags = FS_BINARY_MOUNTDATA, }; diff --git a/fs/coredump.c b/fs/coredump.c index f258c17c1841..be6403b4b14b 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -872,6 +872,9 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page) loff_t pos; ssize_t n; + if (!page) + return 0; + if (cprm->to_skip) { if (!__dump_skip(cprm, cprm->to_skip)) return 0; @@ -884,7 +887,6 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page) pos = file->f_pos; bvec_set_page(&bvec, page, PAGE_SIZE, 0); iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE); - iov_iter_set_copy_mc(&iter); n = __kernel_write_iter(cprm->file, &iter, &pos); if (n != PAGE_SIZE) return 0; @@ -895,10 +897,44 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page) return 1; } +/* + * If we might get machine checks from kernel accesses during the + * core dump, let's get those errors early rather than during the + * IO. 
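The rest of the coda conversion above is boilerplate that every new-mount-API filesystem repeats: allocate private state in init_fs_context(), hang an fs_context_operations table off the context, and let get_tree_nodev() drive the fill_super callback. A minimal sketch with illustrative names:

```c
#include <linux/fs_context.h>
#include <linux/slab.h>

struct example_fs_ctx {
	int idx;
};

static int example_fill_super(struct super_block *sb, struct fs_context *fc)
{
	return -ENOSYS;	/* a real filesystem builds the root inode here */
}

static int example_get_tree(struct fs_context *fc)
{
	return get_tree_nodev(fc, example_fill_super);
}

static void example_free_fc(struct fs_context *fc)
{
	kfree(fc->fs_private);
}

static const struct fs_context_operations example_context_ops = {
	.free		= example_free_fc,
	.get_tree	= example_get_tree,
};

static int example_init_fs_context(struct fs_context *fc)
{
	struct example_fs_ctx *ctx;

	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;
	fc->fs_private = ctx;
	fc->ops = &example_context_ops;
	return 0;
}
```

The file_system_type then points .init_fs_context at this function instead of providing a legacy .mount method.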
This is not performance-critical enough to warrant having + * all the machine check logic in the iovec paths. + */ +#ifdef copy_mc_to_kernel + +#define dump_page_alloc() alloc_page(GFP_KERNEL) +#define dump_page_free(x) __free_page(x) +static struct page *dump_page_copy(struct page *src, struct page *dst) +{ + void *buf = kmap_local_page(src); + size_t left = copy_mc_to_kernel(page_address(dst), buf, PAGE_SIZE); + kunmap_local(buf); + return left ? NULL : dst; +} + +#else + +/* We just want to return non-NULL; it's never used. */ +#define dump_page_alloc() ERR_PTR(-EINVAL) +#define dump_page_free(x) ((void)(x)) +static inline struct page *dump_page_copy(struct page *src, struct page *dst) +{ + return src; +} +#endif + int dump_user_range(struct coredump_params *cprm, unsigned long start, unsigned long len) { unsigned long addr; + struct page *dump_page; + + dump_page = dump_page_alloc(); + if (!dump_page) + return 0; for (addr = start; addr < start + len; addr += PAGE_SIZE) { struct page *page; @@ -912,14 +948,17 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, */ page = get_dump_page(addr); if (page) { - int stop = !dump_emit_page(cprm, page); + int stop = !dump_emit_page(cprm, dump_page_copy(page, dump_page)); put_page(page); - if (stop) + if (stop) { + dump_page_free(dump_page); return 0; + } } else { dump_skip(cprm, PAGE_SIZE); } } + dump_page_free(dump_page); return 1; } #endif diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 60dbfa0f8805..39e75131fd5a 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -495,7 +495,7 @@ static void cramfs_kill_sb(struct super_block *sb) sb->s_mtd = NULL; } else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV) && sb->s_bdev) { sync_blockdev(sb->s_bdev); - bdev_release(sb->s_bdev_handle); + fput(sb->s_bdev_file); } kfree(sbi); } diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 7b3fc189593a..0ad52fbe51c9 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -74,13 +74,7 @@ struct fscrypt_nokey_name { static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) { - if (str->len == 1 && str->name[0] == '.') - return true; - - if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.') - return true; - - return false; + return is_dot_dotdot(str->name, str->len); } /** diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 1892356cf924..8371e4e1f596 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -222,16 +222,19 @@ struct fscrypt_inode_info { struct fscrypt_prepared_key ci_enc_key; /* True if ci_enc_key should be freed when this struct is freed */ - bool ci_owns_key; + u8 ci_owns_key : 1; #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT /* * True if this inode will use inline encryption (blk-crypto) instead of * the traditional filesystem-layer encryption. */ - bool ci_inlinecrypt; + u8 ci_inlinecrypt : 1; #endif + /* True if ci_dirhash_key is initialized */ + u8 ci_dirhash_key_initialized : 1; + /* * log2 of the data unit size (granularity of contents encryption) of * this file. This is computable from ci_policy and ci_inode but is @@ -242,6 +245,9 @@ struct fscrypt_inode_info { /* Cached value: log2 of number of data units per FS block */ u8 ci_data_units_per_block_bits; + /* Hashed inode number. Only set for IV_INO_LBLK_32 */ + u32 ci_hashed_ino; + /* * Encryption mode used for this inode. It corresponds to either the * contents or filenames encryption mode, depending on the inode type. 
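The coredump hunks above add a machine-check-safe bounce copy: when the architecture defines copy_mc_to_kernel(), each user page is first copied into a scratch page so a poisoned page faults here, early, instead of deep inside the write-out path; otherwise the page is dumped directly. The core of that pattern, with an illustrative function name:

```c
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/uaccess.h>

static struct page *example_bounce(struct page *src, struct page *scratch)
{
#ifdef copy_mc_to_kernel
	void *buf = kmap_local_page(src);
	size_t left = copy_mc_to_kernel(page_address(scratch), buf, PAGE_SIZE);

	kunmap_local(buf);
	/* NULL tells the caller to treat the page as unreadable. */
	return left ? NULL : scratch;
#else
	return src;	/* no MC handling on this arch: dump in place */
#endif
}
```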
@@ -276,16 +282,12 @@ struct fscrypt_inode_info { * the plaintext filenames -- currently just casefolded directories. */ siphash_key_t ci_dirhash_key; - bool ci_dirhash_key_initialized; /* The encryption policy used by this inode */ union fscrypt_policy ci_policy; /* This inode's nonce, copied from the fscrypt_context */ u8 ci_nonce[FSCRYPT_FILE_NONCE_SIZE]; - - /* Hashed inode number. Only set for IV_INO_LBLK_32 */ - u32 ci_hashed_ino; }; typedef enum { diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 52504dd478d3..104771c3d3f6 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -102,11 +102,8 @@ int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, if (err && err != -ENOENT) return err; - if (fname->is_nokey_name) { - spin_lock(&dentry->d_lock); - dentry->d_flags |= DCACHE_NOKEY_NAME; - spin_unlock(&dentry->d_lock); - } + fscrypt_prepare_dentry(dentry, fname->is_nokey_name); + return err; } EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup); @@ -131,12 +128,10 @@ EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup); int fscrypt_prepare_lookup_partial(struct inode *dir, struct dentry *dentry) { int err = fscrypt_get_encryption_info(dir, true); + bool is_nokey_name = (!err && !fscrypt_has_encryption_key(dir)); + + fscrypt_prepare_dentry(dentry, is_nokey_name); - if (!err && !fscrypt_has_encryption_key(dir)) { - spin_lock(&dentry->d_lock); - dentry->d_flags |= DCACHE_NOKEY_NAME; - spin_unlock(&dentry->d_lock); - } return err; } EXPORT_SYMBOL_GPL(fscrypt_prepare_lookup_partial); diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index 0edf0b58daa7..6681a71625f0 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -74,8 +74,12 @@ void fscrypt_put_master_key(struct fscrypt_master_key *mk) * that concurrent keyring lookups can no longer find it. */ WARN_ON_ONCE(refcount_read(&mk->mk_active_refs) != 0); - key_put(mk->mk_users); - mk->mk_users = NULL; + if (mk->mk_users) { + /* Clear the keyring so the quota gets released right away. */ + keyring_clear(mk->mk_users); + key_put(mk->mk_users); + mk->mk_users = NULL; + } call_rcu(&mk->mk_rcu_head, fscrypt_free_master_key); } diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index d71f7c799e79..b4fe01ea4bd4 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -23,7 +23,7 @@ struct fscrypt_mode fscrypt_modes[] = { .blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_256_XTS, }, [FSCRYPT_MODE_AES_256_CTS] = { - .friendly_name = "AES-256-CTS-CBC", + .friendly_name = "AES-256-CBC-CTS", .cipher_str = "cts(cbc(aes))", .keysize = 32, .security_strength = 32, @@ -38,7 +38,7 @@ struct fscrypt_mode fscrypt_modes[] = { .blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV, }, [FSCRYPT_MODE_AES_128_CTS] = { - .friendly_name = "AES-128-CTS-CBC", + .friendly_name = "AES-128-CBC-CTS", .cipher_str = "cts(cbc(aes))", .keysize = 16, .security_strength = 16, @@ -53,7 +53,7 @@ struct fscrypt_mode fscrypt_modes[] = { .blk_crypto_mode = BLK_ENCRYPTION_MODE_SM4_XTS, }, [FSCRYPT_MODE_SM4_CTS] = { - .friendly_name = "SM4-CTS-CBC", + .friendly_name = "SM4-CBC-CTS", .cipher_str = "cts(cbc(sm4))", .keysize = 16, .security_strength = 16, @@ -687,7 +687,7 @@ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported) /** * fscrypt_prepare_new_inode() - prepare to create a new inode in a directory * @dir: a possibly-encrypted directory - * @inode: the new inode. ->i_mode must be set already. + * @inode: the new inode. ->i_mode and ->i_blkbits must be set already. * ->i_ino doesn't need to be set yet. 
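The fs/crypto/hooks.c hunks above fold two open-coded sequences into one fscrypt_prepare_dentry() call. The helper's likely shape is a guarded flag update (dentry flags may only be modified under ->d_lock); this is a sketch under that assumption, with an illustrative name:

```c
#include <linux/dcache.h>

static inline void example_prepare_dentry(struct dentry *dentry,
					  bool is_nokey_name)
{
	if (is_nokey_name) {
		spin_lock(&dentry->d_lock);
		dentry->d_flags |= DCACHE_NOKEY_NAME;
		spin_unlock(&dentry->d_lock);
	}
}
```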
* @encrypt_ret: (output) set to %true if the new inode will be encrypted * @@ -717,6 +717,9 @@ int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode, if (IS_ERR(policy)) return PTR_ERR(policy); + if (WARN_ON_ONCE(inode->i_blkbits == 0)) + return -EINVAL; + if (WARN_ON_ONCE(inode->i_mode == 0)) return -EINVAL; diff --git a/fs/dcache.c b/fs/dcache.c index b813528fb147..71a8e943a0fa 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3061,7 +3061,10 @@ static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry) if (d_unhashed(dentry) || !dentry->d_inode) return D_WALK_SKIP; - dentry->d_lockref.count--; + if (!(dentry->d_flags & DCACHE_GENOCIDE)) { + dentry->d_flags |= DCACHE_GENOCIDE; + dentry->d_lockref.count--; + } } return D_WALK_CONTINUE; } @@ -3136,7 +3139,7 @@ static void __init dcache_init(void) * of the dcache. */ dentry_cache = KMEM_CACHE_USERCOPY(dentry, - SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT, + SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT, d_iname); /* Hash may have been set up in dcache_init_early */ diff --git a/fs/direct-io.c b/fs/direct-io.c index 60456263a338..62c97ff9e852 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -410,6 +410,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, bio->bi_end_io = dio_bio_end_io; if (dio->is_pinned) bio_set_flag(bio, BIO_PAGE_PINNED); + bio->bi_write_hint = file_inode(dio->iocb->ki_filp)->i_write_hint; + sdio->bio = bio; sdio->logical_offset_in_bio = sdio->cur_page_fs_offset; } diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index d814c5121367..9ca83ef70ed1 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -138,14 +138,14 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, } op->info.optype = DLM_PLOCK_OP_LOCK; - op->info.pid = fl->fl_pid; - op->info.ex = (fl->fl_type == F_WRLCK); - op->info.wait = !!(fl->fl_flags & FL_SLEEP); + op->info.pid = fl->c.flc_pid; + op->info.ex = lock_is_write(fl); + op->info.wait = !!(fl->c.flc_flags & FL_SLEEP); op->info.fsid = ls->ls_global_id; op->info.number = number; op->info.start = fl->fl_start; op->info.end = fl->fl_end; - op->info.owner = (__u64)(long)fl->fl_owner; + op->info.owner = (__u64)(long) fl->c.flc_owner; /* async handling */ if (fl->fl_lmops && fl->fl_lmops->lm_grant) { op_data = kzalloc(sizeof(*op_data), GFP_NOFS); @@ -258,7 +258,7 @@ static int dlm_plock_callback(struct plock_op *op) } /* got fs lock; bookkeep locally as well: */ - flc->fl_flags &= ~FL_SLEEP; + flc->c.flc_flags &= ~FL_SLEEP; if (posix_lock_file(file, flc, NULL)) { /* * This can only happen in the case of kmalloc() failure. 
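The fl_* to c.flc_* renames running through the fs/dlm/plock.c hunks above come from moving the fields shared by all lock types into an embedded struct file_lock_core, with small helpers such as lock_is_write() testing the embedded type. A reduced sketch of that shape, using simplified stand-in types rather than the kernel definitions:

	#include <stdio.h>

	#define F_RDLCK 0
	#define F_WRLCK 1

	struct file_lock_core {
		unsigned char flc_type;
		unsigned int flc_flags;
		int flc_pid;
		void *flc_owner;
	};

	struct file_lock {
		struct file_lock_core c;	/* generic part, shared with leases */
		long long fl_start, fl_end;	/* byte-range part, POSIX/flock only */
	};

	static int lock_is_write(const struct file_lock *fl)
	{
		return fl->c.flc_type == F_WRLCK;
	}

	int main(void)
	{
		struct file_lock fl = { .c.flc_type = F_WRLCK, .fl_end = 99 };

		printf("write lock? %d\n", lock_is_write(&fl));
		return 0;
	}

Callers that previously spelled out fl->fl_type == F_WRLCK, as dlm did, become one helper call against the core struct.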
@@ -291,7 +291,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, struct dlm_ls *ls; struct plock_op *op; int rv; - unsigned char fl_flags = fl->fl_flags; + unsigned char saved_flags = fl->c.flc_flags; ls = dlm_find_lockspace_local(lockspace); if (!ls) @@ -304,7 +304,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, } /* cause the vfs unlock to return ENOENT if lock is not found */ - fl->fl_flags |= FL_EXISTS; + fl->c.flc_flags |= FL_EXISTS; rv = locks_lock_file_wait(file, fl); if (rv == -ENOENT) { @@ -317,14 +317,14 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, } op->info.optype = DLM_PLOCK_OP_UNLOCK; - op->info.pid = fl->fl_pid; + op->info.pid = fl->c.flc_pid; op->info.fsid = ls->ls_global_id; op->info.number = number; op->info.start = fl->fl_start; op->info.end = fl->fl_end; - op->info.owner = (__u64)(long)fl->fl_owner; + op->info.owner = (__u64)(long) fl->c.flc_owner; - if (fl->fl_flags & FL_CLOSE) { + if (fl->c.flc_flags & FL_CLOSE) { op->info.flags |= DLM_PLOCK_FL_CLOSE; send_op(op); rv = 0; @@ -345,7 +345,7 @@ out_free: dlm_release_plock_op(op); out: dlm_put_lockspace(ls); - fl->fl_flags = fl_flags; + fl->c.flc_flags = saved_flags; return rv; } EXPORT_SYMBOL_GPL(dlm_posix_unlock); @@ -375,14 +375,14 @@ int dlm_posix_cancel(dlm_lockspace_t *lockspace, u64 number, struct file *file, return -EINVAL; memset(&info, 0, sizeof(info)); - info.pid = fl->fl_pid; - info.ex = (fl->fl_type == F_WRLCK); + info.pid = fl->c.flc_pid; + info.ex = lock_is_write(fl); info.fsid = ls->ls_global_id; dlm_put_lockspace(ls); info.number = number; info.start = fl->fl_start; info.end = fl->fl_end; - info.owner = (__u64)(long)fl->fl_owner; + info.owner = (__u64)(long) fl->c.flc_owner; rv = do_lock_cancel(&info); switch (rv) { @@ -437,13 +437,13 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file, } op->info.optype = DLM_PLOCK_OP_GET; - op->info.pid = fl->fl_pid; - op->info.ex = (fl->fl_type == F_WRLCK); + op->info.pid = fl->c.flc_pid; + op->info.ex = lock_is_write(fl); op->info.fsid = ls->ls_global_id; op->info.number = number; op->info.start = fl->fl_start; op->info.end = fl->fl_end; - op->info.owner = (__u64)(long)fl->fl_owner; + op->info.owner = (__u64)(long) fl->c.flc_owner; send_op(op); wait_event(recv_wq, (op->done != 0)); @@ -455,16 +455,16 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file, rv = op->info.rv; - fl->fl_type = F_UNLCK; + fl->c.flc_type = F_UNLCK; if (rv == -ENOENT) rv = 0; else if (rv > 0) { locks_init_lock(fl); - fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK; - fl->fl_flags = FL_POSIX; - fl->fl_pid = op->info.pid; + fl->c.flc_type = (op->info.ex) ? F_WRLCK : F_RDLCK; + fl->c.flc_flags = FL_POSIX; + fl->c.flc_pid = op->info.pid; if (op->info.nodeid != dlm_our_nodeid()) - fl->fl_pid = -fl->fl_pid; + fl->c.flc_pid = -fl->c.flc_pid; fl->fl_start = op->info.start; fl->fl_end = op->info.end; rv = 0; diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 03bd55069d86..2fe0f3af1a08 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1949,16 +1949,6 @@ out: return rc; } -static bool is_dot_dotdot(const char *name, size_t name_size) -{ - if (name_size == 1 && name[0] == '.') - return true; - else if (name_size == 2 && name[0] == '.' 
&& name[1] == '.') - return true; - - return false; -} - /** * ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext * @plaintext_name: The plaintext name diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h index 169252e6dc46..f7206158ee81 100644 --- a/fs/efivarfs/internal.h +++ b/fs/efivarfs/internal.h @@ -38,7 +38,7 @@ struct efivar_entry { int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *, struct list_head *), - void *data, bool duplicates, struct list_head *head); + void *data, struct list_head *head); int efivar_entry_add(struct efivar_entry *entry, struct list_head *head); void __efivar_entry_add(struct efivar_entry *entry, struct list_head *head); diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 6038dd39367a..bb14462f6d99 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -343,12 +343,7 @@ static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc) if (err) return err; - err = efivar_init(efivarfs_callback, (void *)sb, true, - &sfi->efivarfs_list); - if (err) - efivar_entry_iter(efivarfs_destroy, &sfi->efivarfs_list, NULL); - - return err; + return efivar_init(efivarfs_callback, sb, &sfi->efivarfs_list); } static int efivarfs_get_tree(struct fs_context *fc) diff --git a/fs/efivarfs/vars.c b/fs/efivarfs/vars.c index 114ff0fd4e55..4d722af1014f 100644 --- a/fs/efivarfs/vars.c +++ b/fs/efivarfs/vars.c @@ -361,7 +361,6 @@ static void dup_variable_bug(efi_char16_t *str16, efi_guid_t *vendor_guid, * efivar_init - build the initial list of EFI variables * @func: callback function to invoke for every variable * @data: function-specific data to pass to @func - * @duplicates: error if we encounter duplicates on @head? * @head: initialised head of variable list * * Get every EFI variable from the firmware and invoke @func. @func @@ -371,9 +370,9 @@ static void dup_variable_bug(efi_char16_t *str16, efi_guid_t *vendor_guid, */ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *, struct list_head *), - void *data, bool duplicates, struct list_head *head) + void *data, struct list_head *head) { - unsigned long variable_name_size = 1024; + unsigned long variable_name_size = 512; efi_char16_t *variable_name; efi_status_t status; efi_guid_t vendor_guid; @@ -390,12 +389,13 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *, goto free; /* - * Per EFI spec, the maximum storage allocated for both - * the variable name and variable data is 1024 bytes. + * A small set of old UEFI implementations reject sizes + * above a certain threshold, the lowest seen in the wild + * is 512. */ do { - variable_name_size = 1024; + variable_name_size = 512; status = efivar_get_next_variable(&variable_name_size, variable_name, @@ -413,8 +413,7 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *, * we'll ever see a different variable name, * and may end up looping here forever. 
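The fs/efivarfs/vars.c changes above cap the name buffer at 512 bytes because some old firmware rejects larger advertised sizes, and (as the hunk that follows adds) turn EFI_BUFFER_TOO_SMALL into a warning rather than another trip around the loop. A mocked-up version of the enumeration loop, where get_next() is a made-up stand-in for efivar_get_next_variable():

	#include <stdio.h>

	enum status { EFI_SUCCESS, EFI_NOT_FOUND, EFI_BUFFER_TOO_SMALL };

	/* Stand-in for efivar_get_next_variable(); yields two short names. */
	static enum status get_next(unsigned long *size, char *name)
	{
		static int i;
		const char *vars[] = { "Boot0000", "Timeout" };

		if (i >= 2)
			return EFI_NOT_FOUND;
		*size = snprintf(name, *size, "%s", vars[i++]) + 1;
		return EFI_SUCCESS;
	}

	int main(void)
	{
		char name[512];
		unsigned long size;

		for (;;) {
			size = 512;	/* reset the advertised capacity each round */
			switch (get_next(&size, name)) {
			case EFI_SUCCESS:
				printf("found %s (%lu bytes)\n", name, size);
				continue;
			case EFI_BUFFER_TOO_SMALL:
				fprintf(stderr, "name exceeds 512 bytes, stopping\n");
				return 1;
			case EFI_NOT_FOUND:
				return 0;	/* normal end of enumeration */
			}
		}
	}

Resetting the size before every call matters: firmware updates it to the length of the name just returned, so reusing the smaller value would truncate the next lookup.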
*/ - if (duplicates && - variable_is_present(variable_name, &vendor_guid, + if (variable_is_present(variable_name, &vendor_guid, head)) { dup_variable_bug(variable_name, &vendor_guid, variable_name_size); @@ -432,9 +431,13 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *, break; case EFI_NOT_FOUND: break; + case EFI_BUFFER_TOO_SMALL: + pr_warn("efivars: Variable name size exceeds maximum (%lu > 512)\n", + variable_name_size); + status = EFI_NOT_FOUND; + break; default: - printk(KERN_WARNING "efivars: get_next_variable: status=%lx\n", - status); + pr_warn("efivars: get_next_variable: status=%lx\n", status); status = EFI_NOT_FOUND; break; } diff --git a/fs/efs/super.c b/fs/efs/super.c index f17fdac76b2e..e4421c10caeb 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -14,19 +14,14 @@ #include <linux/buffer_head.h> #include <linux/vfs.h> #include <linux/blkdev.h> - +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include "efs.h" #include <linux/efs_vh.h> #include <linux/efs_fs_sb.h> static int efs_statfs(struct dentry *dentry, struct kstatfs *buf); -static int efs_fill_super(struct super_block *s, void *d, int silent); - -static struct dentry *efs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_bdev(fs_type, flags, dev_name, data, efs_fill_super); -} +static int efs_init_fs_context(struct fs_context *fc); static void efs_kill_sb(struct super_block *s) { @@ -35,15 +30,6 @@ static void efs_kill_sb(struct super_block *s) kfree(sbi); } -static struct file_system_type efs_fs_type = { - .owner = THIS_MODULE, - .name = "efs", - .mount = efs_mount, - .kill_sb = efs_kill_sb, - .fs_flags = FS_REQUIRES_DEV, -}; -MODULE_ALIAS_FS("efs"); - static struct pt_types sgi_pt_types[] = { {0x00, "SGI vh"}, {0x01, "SGI trkrepl"}, @@ -63,6 +49,27 @@ static struct pt_types sgi_pt_types[] = { {0, NULL} }; +enum { + Opt_explicit_open, +}; + +static const struct fs_parameter_spec efs_param_spec[] = { + fsparam_flag ("explicit-open", Opt_explicit_open), + {} +}; + +/* + * File system definition and registration. 
+ */ +static struct file_system_type efs_fs_type = { + .owner = THIS_MODULE, + .name = "efs", + .kill_sb = efs_kill_sb, + .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = efs_init_fs_context, + .parameters = efs_param_spec, +}; +MODULE_ALIAS_FS("efs"); static struct kmem_cache * efs_inode_cachep; @@ -91,8 +98,8 @@ static int __init init_inodecache(void) { efs_inode_cachep = kmem_cache_create("efs_inode_cache", sizeof(struct efs_inode_info), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| - SLAB_ACCOUNT, init_once); + SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, + init_once); if (efs_inode_cachep == NULL) return -ENOMEM; return 0; @@ -108,18 +115,10 @@ static void destroy_inodecache(void) kmem_cache_destroy(efs_inode_cachep); } -static int efs_remount(struct super_block *sb, int *flags, char *data) -{ - sync_filesystem(sb); - *flags |= SB_RDONLY; - return 0; -} - static const struct super_operations efs_superblock_operations = { .alloc_inode = efs_alloc_inode, .free_inode = efs_free_inode, .statfs = efs_statfs, - .remount_fs = efs_remount, }; static const struct export_operations efs_export_ops = { @@ -249,26 +248,26 @@ static int efs_validate_super(struct efs_sb_info *sb, struct efs_super *super) { return 0; } -static int efs_fill_super(struct super_block *s, void *d, int silent) +static int efs_fill_super(struct super_block *s, struct fs_context *fc) { struct efs_sb_info *sb; struct buffer_head *bh; struct inode *root; - sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL); + sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL); if (!sb) return -ENOMEM; s->s_fs_info = sb; s->s_time_min = 0; s->s_time_max = U32_MAX; - + s->s_magic = EFS_SUPER_MAGIC; if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) { pr_err("device does not support %d byte blocks\n", EFS_BLOCKSIZE); return -EINVAL; } - + /* read the vh (volume header) block */ bh = sb_bread(s, 0); @@ -294,7 +293,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) pr_err("cannot read superblock\n"); return -EIO; } - + if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) { #ifdef DEBUG pr_warn("invalid superblock at block %u\n", @@ -328,6 +327,61 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) return 0; } +static void efs_free_fc(struct fs_context *fc) +{ + kfree(fc->fs_private); +} + +static int efs_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, efs_fill_super); +} + +static int efs_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + int token; + struct fs_parse_result result; + + token = fs_parse(fc, efs_param_spec, param, &result); + if (token < 0) + return token; + return 0; +} + +static int efs_reconfigure(struct fs_context *fc) +{ + sync_filesystem(fc->root->d_sb); + + return 0; +} + +struct efs_context { + unsigned long s_mount_opts; +}; + +static const struct fs_context_operations efs_context_opts = { + .parse_param = efs_parse_param, + .get_tree = efs_get_tree, + .reconfigure = efs_reconfigure, + .free = efs_free_fc, +}; + +/* + * Set up the filesystem mount context. 
+ */ +static int efs_init_fs_context(struct fs_context *fc) +{ + struct efs_context *ctx; + + ctx = kzalloc(sizeof(struct efs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + fc->fs_private = ctx; + fc->ops = &efs_context_opts; + + return 0; +} + static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct efs_sb_info *sbi = SUPER_INFO(sb); diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 279933e007d2..333587ba6183 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -11,13 +11,12 @@ struct z_erofs_decompress_req { struct super_block *sb; struct page **in, **out; - unsigned short pageofs_in, pageofs_out; unsigned int inputsize, outputsize; - /* indicate the algorithm will be used for decompression */ - unsigned int alg; + unsigned int alg; /* the algorithm for decompression */ bool inplace_io, partial_decoding, fillgaps; + gfp_t gfp; /* allocation flags for extra temporary buffers */ }; struct z_erofs_decompressor { @@ -82,13 +81,6 @@ static inline bool z_erofs_put_shortlivedpage(struct page **pagepool, return true; } -#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping) -static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi, - struct page *page) -{ - return page->mapping == MNGD_MAPPING(sbi); -} - int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf, unsigned int padbufsize); extern const struct z_erofs_decompressor erofs_decompressors[]; diff --git a/fs/erofs/data.c b/fs/erofs/data.c index c98aeda8abb2..52524bd9698b 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -220,7 +220,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) up_read(&devs->rwsem); return 0; } - map->m_bdev = dif->bdev_handle ? dif->bdev_handle->bdev : NULL; + map->m_bdev = dif->bdev_file ? file_bdev(dif->bdev_file) : NULL; map->m_daxdev = dif->dax_dev; map->m_dax_part_off = dif->dax_part_off; map->m_fscache = dif->fscache; @@ -238,8 +238,8 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) if (map->m_pa >= startoff && map->m_pa < startoff + length) { map->m_pa -= startoff; - map->m_bdev = dif->bdev_handle ? - dif->bdev_handle->bdev : NULL; + map->m_bdev = dif->bdev_file ? 
+ file_bdev(dif->bdev_file) : NULL; map->m_daxdev = dif->dax_dev; map->m_dax_part_off = dif->dax_part_off; map->m_fscache = dif->fscache; @@ -447,5 +447,6 @@ const struct file_operations erofs_file_fops = { .llseek = generic_file_llseek, .read_iter = erofs_file_read_iter, .mmap = erofs_file_mmap, + .get_unmapped_area = thp_get_unmapped_area, .splice_read = filemap_splice_read, }; diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 072ef6a66823..2ec9b2bb628d 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -111,8 +111,9 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, victim = availables[--top]; get_page(victim); } else { - victim = erofs_allocpage(pagepool, - GFP_KERNEL | __GFP_NOFAIL); + victim = erofs_allocpage(pagepool, rq->gfp); + if (!victim) + return -ENOMEM; set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE); } rq->out[i] = victim; @@ -322,7 +323,8 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, unsigned int cur = 0, ni = 0, no, pi, po, insz, cnt; u8 *kin; - DBG_BUGON(rq->outputsize > rq->inputsize); + if (rq->outputsize > rq->inputsize) + return -EOPNOTSUPP; if (rq->alg == Z_EROFS_COMPRESSION_INTERLACED) { cur = bs - (rq->pageofs_out & (bs - 1)); pi = (rq->pageofs_in + rq->inputsize - cur) & ~PAGE_MASK; diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c index 4a64a9c91dd3..81e65c453ef0 100644 --- a/fs/erofs/decompressor_deflate.c +++ b/fs/erofs/decompressor_deflate.c @@ -95,7 +95,7 @@ int z_erofs_load_deflate_config(struct super_block *sb, } int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq, - struct page **pagepool) + struct page **pgpl) { const unsigned int nrpages_out = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; @@ -158,8 +158,12 @@ again: strm->z.avail_out = min_t(u32, outsz, PAGE_SIZE - pofs); outsz -= strm->z.avail_out; if (!rq->out[no]) { - rq->out[no] = erofs_allocpage(pagepool, - GFP_KERNEL | __GFP_NOFAIL); + rq->out[no] = erofs_allocpage(pgpl, rq->gfp); + if (!rq->out[no]) { + kout = NULL; + err = -ENOMEM; + break; + } set_page_private(rq->out[no], Z_EROFS_SHORTLIVED_PAGE); } @@ -208,11 +212,11 @@ again: if (rq->out[no] != rq->in[j]) continue; - - DBG_BUGON(erofs_page_is_managed(EROFS_SB(sb), - rq->in[j])); - tmppage = erofs_allocpage(pagepool, - GFP_KERNEL | __GFP_NOFAIL); + tmppage = erofs_allocpage(pgpl, rq->gfp); + if (!tmppage) { + err = -ENOMEM; + goto failed; + } set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE); copy_highpage(tmppage, rq->in[j]); rq->in[j] = tmppage; @@ -230,7 +234,7 @@ again: break; } } - +failed: if (zlib_inflateEnd(&strm->z) != Z_OK && !err) err = -EIO; if (kout) diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c index 2dd14f99c1dc..4b28dc130c9f 100644 --- a/fs/erofs/decompressor_lzma.c +++ b/fs/erofs/decompressor_lzma.c @@ -148,7 +148,7 @@ again: } int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, - struct page **pagepool) + struct page **pgpl) { const unsigned int nrpages_out = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; @@ -215,8 +215,11 @@ again: PAGE_SIZE - pageofs); outlen -= strm->buf.out_size; if (!rq->out[no] && rq->fillgaps) { /* deduped */ - rq->out[no] = erofs_allocpage(pagepool, - GFP_KERNEL | __GFP_NOFAIL); + rq->out[no] = erofs_allocpage(pgpl, rq->gfp); + if (!rq->out[no]) { + err = -ENOMEM; + break; + } set_page_private(rq->out[no], Z_EROFS_SHORTLIVED_PAGE); } @@ -255,11 +258,11 @@ again: if (rq->out[no] != rq->in[j]) 
continue; - - DBG_BUGON(erofs_page_is_managed(EROFS_SB(rq->sb), - rq->in[j])); - tmppage = erofs_allocpage(pagepool, - GFP_KERNEL | __GFP_NOFAIL); + tmppage = erofs_allocpage(pgpl, rq->gfp); + if (!tmppage) { + err = -ENOMEM; + goto failed; + } set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE); copy_highpage(tmppage, rq->in[j]); rq->in[j] = tmppage; @@ -277,6 +280,7 @@ again: break; } } +failed: if (no < nrpages_out && strm->buf.out) kunmap(rq->out[no]); if (ni < nrpages_in) diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index bc12030393b2..8aff1a724805 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -3,6 +3,7 @@ * Copyright (C) 2022, Alibaba Cloud * Copyright (C) 2022, Bytedance Inc. All rights reserved. */ +#include <linux/pseudo_fs.h> #include <linux/fscache.h> #include "internal.h" @@ -12,9 +13,27 @@ static LIST_HEAD(erofs_domain_list); static LIST_HEAD(erofs_domain_cookies_list); static struct vfsmount *erofs_pseudo_mnt; -struct erofs_fscache_request { - struct erofs_fscache_request *primary; - struct netfs_cache_resources cache_resources; +static int erofs_anon_init_fs_context(struct fs_context *fc) +{ + return init_pseudo(fc, EROFS_SUPER_MAGIC) ? 0 : -ENOMEM; +} + +static struct file_system_type erofs_anon_fs_type = { + .owner = THIS_MODULE, + .name = "pseudo_erofs", + .init_fs_context = erofs_anon_init_fs_context, + .kill_sb = kill_anon_super, +}; + +struct erofs_fscache_io { + struct netfs_cache_resources cres; + struct iov_iter iter; + netfs_io_terminated_t end_io; + void *private; + refcount_t ref; +}; + +struct erofs_fscache_rq { struct address_space *mapping; /* The mapping being accessed */ loff_t start; /* Start position */ size_t len; /* Length of the request */ @@ -23,44 +42,17 @@ struct erofs_fscache_request { refcount_t ref; }; -static struct erofs_fscache_request *erofs_fscache_req_alloc(struct address_space *mapping, - loff_t start, size_t len) +static bool erofs_fscache_io_put(struct erofs_fscache_io *io) { - struct erofs_fscache_request *req; - - req = kzalloc(sizeof(struct erofs_fscache_request), GFP_KERNEL); - if (!req) - return ERR_PTR(-ENOMEM); - - req->mapping = mapping; - req->start = start; - req->len = len; - refcount_set(&req->ref, 1); - - return req; + if (!refcount_dec_and_test(&io->ref)) + return false; + if (io->cres.ops) + io->cres.ops->end_operation(&io->cres); + kfree(io); + return true; } -static struct erofs_fscache_request *erofs_fscache_req_chain(struct erofs_fscache_request *primary, - size_t len) -{ - struct erofs_fscache_request *req; - - /* use primary request for the first submission */ - if (!primary->submitted) { - refcount_inc(&primary->ref); - return primary; - } - - req = erofs_fscache_req_alloc(primary->mapping, - primary->start + primary->submitted, len); - if (!IS_ERR(req)) { - req->primary = primary; - refcount_inc(&primary->ref); - } - return req; -} - -static void erofs_fscache_req_complete(struct erofs_fscache_request *req) +static void erofs_fscache_req_complete(struct erofs_fscache_rq *req) { struct folio *folio; bool failed = req->error; @@ -80,120 +72,196 @@ static void erofs_fscache_req_complete(struct erofs_fscache_request *req) rcu_read_unlock(); } -static void erofs_fscache_req_put(struct erofs_fscache_request *req) +static void erofs_fscache_req_put(struct erofs_fscache_rq *req) { - if (refcount_dec_and_test(&req->ref)) { - if (req->cache_resources.ops) - req->cache_resources.ops->end_operation(&req->cache_resources); - if (!req->primary) - erofs_fscache_req_complete(req); - else - 
erofs_fscache_req_put(req->primary); - kfree(req); - } + if (!refcount_dec_and_test(&req->ref)) + return; + erofs_fscache_req_complete(req); + kfree(req); } -static void erofs_fscache_subreq_complete(void *priv, +static struct erofs_fscache_rq *erofs_fscache_req_alloc(struct address_space *mapping, + loff_t start, size_t len) +{ + struct erofs_fscache_rq *req = kzalloc(sizeof(*req), GFP_KERNEL); + + if (!req) + return NULL; + req->mapping = mapping; + req->start = start; + req->len = len; + refcount_set(&req->ref, 1); + return req; +} + +static void erofs_fscache_req_io_put(struct erofs_fscache_io *io) +{ + struct erofs_fscache_rq *req = io->private; + + if (erofs_fscache_io_put(io)) + erofs_fscache_req_put(req); +} + +static void erofs_fscache_req_end_io(void *priv, ssize_t transferred_or_error, bool was_async) { - struct erofs_fscache_request *req = priv; + struct erofs_fscache_io *io = priv; + struct erofs_fscache_rq *req = io->private; - if (IS_ERR_VALUE(transferred_or_error)) { - if (req->primary) - req->primary->error = transferred_or_error; - else - req->error = transferred_or_error; - } - erofs_fscache_req_put(req); + if (IS_ERR_VALUE(transferred_or_error)) + req->error = transferred_or_error; + erofs_fscache_req_io_put(io); +} + +static struct erofs_fscache_io *erofs_fscache_req_io_alloc(struct erofs_fscache_rq *req) +{ + struct erofs_fscache_io *io = kzalloc(sizeof(*io), GFP_KERNEL); + + if (!io) + return NULL; + io->end_io = erofs_fscache_req_end_io; + io->private = req; + refcount_inc(&req->ref); + refcount_set(&io->ref, 1); + return io; } /* - * Read data from fscache (cookie, pstart, len), and fill the read data into - * page cache described by (req->mapping, lstart, len). @pstart describeis the - * start physical address in the cache file. + * Read data from fscache described by cookie at pstart physical address + * offset, and fill the read data into buffer described by io->iter. 
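The fscache rework in the hunks above replaces chained requests with one request object (erofs_fscache_rq) plus individually refcounted I/O descriptors (erofs_fscache_io): every io pins its request, and dropping the last io reference drops that pin, so the request completes its folios exactly once, after all submissions finish. The lifetime rule in miniature, using C11 atomics as a loose stand-in for the kernel's refcount_t:

	#include <stdio.h>
	#include <stdlib.h>
	#include <stdatomic.h>

	struct rq { atomic_int ref; };
	struct io { atomic_int ref; struct rq *owner; };

	static void rq_put(struct rq *rq)
	{
		if (atomic_fetch_sub(&rq->ref, 1) == 1) {
			puts("request complete: unlock folios");
			free(rq);
		}
	}

	static void io_put(struct io *io)
	{
		struct rq *rq = io->owner;

		if (atomic_fetch_sub(&io->ref, 1) == 1) {
			free(io);
			rq_put(rq);	/* each io holds one request reference */
		}
	}

	int main(void)
	{
		struct rq *rq = malloc(sizeof(*rq));
		struct io *io = malloc(sizeof(*io));

		atomic_init(&rq->ref, 1);	/* submitter's reference */
		atomic_init(&io->ref, 1);	/* submitter's reference */
		io->owner = rq;
		atomic_fetch_add(&rq->ref, 1);	/* the io pins the request */

		atomic_fetch_add(&io->ref, 1);	/* an in-flight read pins the io */
		io_put(io);			/* read completion path */
		io_put(io);			/* submitter drops the io */
		rq_put(rq);			/* submitter drops the request */
		return 0;
	}

"request complete" prints only on the final put, regardless of which side (completion or submitter) drops last.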
*/ -static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie, - struct erofs_fscache_request *req, loff_t pstart, size_t len) +static int erofs_fscache_read_io_async(struct fscache_cookie *cookie, + loff_t pstart, struct erofs_fscache_io *io) { enum netfs_io_source source; - struct super_block *sb = req->mapping->host->i_sb; - struct netfs_cache_resources *cres = &req->cache_resources; - struct iov_iter iter; - loff_t lstart = req->start + req->submitted; - size_t done = 0; + struct netfs_cache_resources *cres = &io->cres; + struct iov_iter *iter = &io->iter; int ret; - DBG_BUGON(len > req->len - req->submitted); - ret = fscache_begin_read_operation(cres, cookie); if (ret) return ret; - while (done < len) { - loff_t sstart = pstart + done; - size_t slen = len - done; + while (iov_iter_count(iter)) { + size_t orig_count = iov_iter_count(iter), len = orig_count; unsigned long flags = 1 << NETFS_SREQ_ONDEMAND; source = cres->ops->prepare_ondemand_read(cres, - sstart, &slen, LLONG_MAX, &flags, 0); - if (WARN_ON(slen == 0)) + pstart, &len, LLONG_MAX, &flags, 0); + if (WARN_ON(len == 0)) source = NETFS_INVALID_READ; if (source != NETFS_READ_FROM_CACHE) { - erofs_err(sb, "failed to fscache prepare_read (source %d)", source); + erofs_err(NULL, "prepare_read failed (source %d)", source); return -EIO; } - refcount_inc(&req->ref); - iov_iter_xarray(&iter, ITER_DEST, &req->mapping->i_pages, - lstart + done, slen); - - ret = fscache_read(cres, sstart, &iter, NETFS_READ_HOLE_FAIL, - erofs_fscache_subreq_complete, req); + iov_iter_truncate(iter, len); + refcount_inc(&io->ref); + ret = fscache_read(cres, pstart, iter, NETFS_READ_HOLE_FAIL, + io->end_io, io); if (ret == -EIOCBQUEUED) ret = 0; if (ret) { - erofs_err(sb, "failed to fscache_read (ret %d)", ret); + erofs_err(NULL, "fscache_read failed (ret %d)", ret); return ret; } + if (WARN_ON(iov_iter_count(iter))) + return -EIO; - done += slen; + iov_iter_reexpand(iter, orig_count - len); + pstart += len; } - DBG_BUGON(done != len); return 0; } -static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio) +struct erofs_fscache_bio { + struct erofs_fscache_io io; + struct bio bio; /* w/o bdev to share bio_add_page/endio() */ + struct bio_vec bvecs[BIO_MAX_VECS]; +}; + +static void erofs_fscache_bio_endio(void *priv, + ssize_t transferred_or_error, bool was_async) +{ + struct erofs_fscache_bio *io = priv; + + if (IS_ERR_VALUE(transferred_or_error)) + io->bio.bi_status = errno_to_blk_status(transferred_or_error); + io->bio.bi_end_io(&io->bio); + BUILD_BUG_ON(offsetof(struct erofs_fscache_bio, io) != 0); + erofs_fscache_io_put(&io->io); +} + +struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev) { + struct erofs_fscache_bio *io; + + io = kmalloc(sizeof(*io), GFP_KERNEL | __GFP_NOFAIL); + bio_init(&io->bio, NULL, io->bvecs, BIO_MAX_VECS, REQ_OP_READ); + io->io.private = mdev->m_fscache->cookie; + io->io.end_io = erofs_fscache_bio_endio; + refcount_set(&io->io.ref, 1); + return &io->bio; +} + +void erofs_fscache_submit_bio(struct bio *bio) +{ + struct erofs_fscache_bio *io = container_of(bio, + struct erofs_fscache_bio, bio); int ret; + + iov_iter_bvec(&io->io.iter, ITER_DEST, io->bvecs, bio->bi_vcnt, + bio->bi_iter.bi_size); + ret = erofs_fscache_read_io_async(io->io.private, + bio->bi_iter.bi_sector << 9, &io->io); + erofs_fscache_io_put(&io->io); + if (!ret) + return; + bio->bi_status = errno_to_blk_status(ret); + bio->bi_end_io(bio); +} + +static int erofs_fscache_meta_read_folio(struct file *data, struct folio 
*folio) +{ struct erofs_fscache *ctx = folio->mapping->host->i_private; - struct erofs_fscache_request *req; + int ret = -ENOMEM; + struct erofs_fscache_rq *req; + struct erofs_fscache_io *io; req = erofs_fscache_req_alloc(folio->mapping, folio_pos(folio), folio_size(folio)); - if (IS_ERR(req)) { + if (!req) { folio_unlock(folio); - return PTR_ERR(req); + return ret; } - ret = erofs_fscache_read_folios_async(ctx->cookie, req, - folio_pos(folio), folio_size(folio)); + io = erofs_fscache_req_io_alloc(req); + if (!io) { + req->error = ret; + goto out; + } + iov_iter_xarray(&io->iter, ITER_DEST, &folio->mapping->i_pages, + folio_pos(folio), folio_size(folio)); + + ret = erofs_fscache_read_io_async(ctx->cookie, folio_pos(folio), io); if (ret) req->error = ret; + erofs_fscache_req_io_put(io); +out: erofs_fscache_req_put(req); return ret; } -static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary) +static int erofs_fscache_data_read_slice(struct erofs_fscache_rq *req) { - struct address_space *mapping = primary->mapping; + struct address_space *mapping = req->mapping; struct inode *inode = mapping->host; struct super_block *sb = inode->i_sb; - struct erofs_fscache_request *req; + struct erofs_fscache_io *io; struct erofs_map_blocks map; struct erofs_map_dev mdev; - struct iov_iter iter; - loff_t pos = primary->start + primary->submitted; + loff_t pos = req->start + req->submitted; size_t count; int ret; @@ -204,6 +272,7 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary) if (map.m_flags & EROFS_MAP_META) { struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + struct iov_iter iter; erofs_blk_t blknr; size_t offset, size; void *src; @@ -224,15 +293,17 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary) } iov_iter_zero(PAGE_SIZE - size, &iter); erofs_put_metabuf(&buf); - primary->submitted += PAGE_SIZE; + req->submitted += PAGE_SIZE; return 0; } - count = primary->len - primary->submitted; + count = req->len - req->submitted; if (!(map.m_flags & EROFS_MAP_MAPPED)) { + struct iov_iter iter; + iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, count); iov_iter_zero(count, &iter); - primary->submitted += count; + req->submitted += count; return 0; } @@ -247,18 +318,19 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary) if (ret) return ret; - req = erofs_fscache_req_chain(primary, count); - if (IS_ERR(req)) - return PTR_ERR(req); + io = erofs_fscache_req_io_alloc(req); + if (!io) + return -ENOMEM; + iov_iter_xarray(&io->iter, ITER_DEST, &mapping->i_pages, pos, count); + ret = erofs_fscache_read_io_async(mdev.m_fscache->cookie, + mdev.m_pa + (pos - map.m_la), io); + erofs_fscache_req_io_put(io); - ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie, - req, mdev.m_pa + (pos - map.m_la), count); - erofs_fscache_req_put(req); - primary->submitted += count; + req->submitted += count; return ret; } -static int erofs_fscache_data_read(struct erofs_fscache_request *req) +static int erofs_fscache_data_read(struct erofs_fscache_rq *req) { int ret; @@ -267,20 +339,19 @@ static int erofs_fscache_data_read(struct erofs_fscache_request *req) if (ret) req->error = ret; } while (!ret && req->submitted < req->len); - return ret; } static int erofs_fscache_read_folio(struct file *file, struct folio *folio) { - struct erofs_fscache_request *req; + struct erofs_fscache_rq *req; int ret; req = erofs_fscache_req_alloc(folio->mapping, folio_pos(folio), folio_size(folio)); - if (IS_ERR(req)) { + if 
(!req) { folio_unlock(folio); - return PTR_ERR(req); + return -ENOMEM; } ret = erofs_fscache_data_read(req); @@ -290,14 +361,14 @@ static int erofs_fscache_read_folio(struct file *file, struct folio *folio) static void erofs_fscache_readahead(struct readahead_control *rac) { - struct erofs_fscache_request *req; + struct erofs_fscache_rq *req; if (!readahead_count(rac)) return; req = erofs_fscache_req_alloc(rac->mapping, readahead_pos(rac), readahead_length(rac)); - if (IS_ERR(req)) + if (!req) return; /* The request completion will drop refs on the folios. */ @@ -381,11 +452,12 @@ static int erofs_fscache_init_domain(struct super_block *sb) goto out; if (!erofs_pseudo_mnt) { - erofs_pseudo_mnt = kern_mount(&erofs_fs_type); - if (IS_ERR(erofs_pseudo_mnt)) { - err = PTR_ERR(erofs_pseudo_mnt); + struct vfsmount *mnt = kern_mount(&erofs_anon_fs_type); + if (IS_ERR(mnt)) { + err = PTR_ERR(mnt); goto out; } + erofs_pseudo_mnt = mnt; } domain->volume = sbi->volume; @@ -459,7 +531,7 @@ static struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb inode->i_size = OFFSET_MAX; inode->i_mapping->a_ops = &erofs_fscache_meta_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL); inode->i_blkbits = EROFS_SB(sb)->blkszbits; inode->i_private = ctx; diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 3d616dea55dc..0eb0e6f933c3 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -60,7 +60,7 @@ static void *erofs_read_inode(struct erofs_buf *buf, } else { const unsigned int gotten = sb->s_blocksize - *ofs; - copied = kmalloc(vi->inode_isize, GFP_NOFS); + copied = kmalloc(vi->inode_isize, GFP_KERNEL); if (!copied) { err = -ENOMEM; goto err_out; @@ -259,14 +259,12 @@ static int erofs_fill_inode(struct inode *inode) if (erofs_inode_is_data_compressed(vi->datalayout)) { #ifdef CONFIG_EROFS_FS_ZIP - if (!erofs_is_fscache_mode(inode->i_sb)) { - DO_ONCE_LITE_IF(inode->i_sb->s_blocksize != PAGE_SIZE, - erofs_info, inode->i_sb, - "EXPERIMENTAL EROFS subpage compressed block support in use. Use at your own risk!"); - inode->i_mapping->a_ops = &z_erofs_aops; - err = 0; - goto out_unlock; - } + DO_ONCE_LITE_IF(inode->i_blkbits != PAGE_SHIFT, + erofs_info, inode->i_sb, + "EXPERIMENTAL EROFS subpage compressed block support in use. 
Use at your own risk!"); + inode->i_mapping->a_ops = &z_erofs_aops; + err = 0; + goto out_unlock; #endif err = -EOPNOTSUPP; goto out_unlock; diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index b0409badb017..39c67119f43b 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -49,7 +49,7 @@ typedef u32 erofs_blk_t; struct erofs_device_info { char *path; struct erofs_fscache *fscache; - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct dax_device *dax_dev; u64 dax_part_off; @@ -385,7 +385,6 @@ struct erofs_map_dev { unsigned int m_deviceid; }; -extern struct file_system_type erofs_fs_type; extern const struct super_operations erofs_sops; extern const struct address_space_operations erofs_raw_access_aops; @@ -467,8 +466,8 @@ int __init erofs_init_shrinker(void); void erofs_exit_shrinker(void); int __init z_erofs_init_zip_subsystem(void); void z_erofs_exit_zip_subsystem(void); -int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, - struct erofs_workgroup *egrp); +int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi, + struct erofs_workgroup *egrp); int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, int flags); void *erofs_get_pcpubuf(unsigned int requiredpages); @@ -513,6 +512,8 @@ void erofs_fscache_unregister_fs(struct super_block *sb); struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, char *name, unsigned int flags); void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache); +struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev); +void erofs_fscache_submit_bio(struct bio *bio); #else static inline int erofs_fscache_register_fs(struct super_block *sb) { @@ -530,6 +531,8 @@ struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, static inline void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache) { } +static inline struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev) { return NULL; } +static inline void erofs_fscache_submit_bio(struct bio *bio) {} #endif #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index d4f631d39f0f..f0110a78acb2 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -130,24 +130,24 @@ static void *erofs_find_target_block(struct erofs_buf *target, /* string comparison without already matched prefix */ diff = erofs_dirnamecmp(name, &dname, &matched); - if (!diff) { - *_ndirents = 0; - goto out; - } else if (diff > 0) { - head = mid + 1; - startprfx = matched; - - if (!IS_ERR(candidate)) - erofs_put_metabuf(target); - *target = buf; - candidate = de; - *_ndirents = ndirents; - } else { + if (diff < 0) { erofs_put_metabuf(&buf); - back = mid - 1; endprfx = matched; + continue; + } + + if (!IS_ERR(candidate)) + erofs_put_metabuf(target); + *target = buf; + if (!diff) { + *_ndirents = 0; + return de; } + head = mid + 1; + startprfx = matched; + candidate = de; + *_ndirents = ndirents; continue; } out: /* free if the candidate is valid */ diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 5f60f163bd56..69308fd73e4a 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -177,7 +177,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_fscache *fscache; struct erofs_deviceslot *dis; - struct bdev_handle *bdev_handle; + struct file *bdev_file; void *ptr; ptr = erofs_read_metabuf(buf, sb, erofs_blknr(sb, *pos), EROFS_KMAP); @@ -201,12 +201,12 @@ static int 
erofs_init_device(struct erofs_buf *buf, struct super_block *sb, return PTR_ERR(fscache); dif->fscache = fscache; } else if (!sbi->devs->flatdev) { - bdev_handle = bdev_open_by_path(dif->path, BLK_OPEN_READ, + bdev_file = bdev_file_open_by_path(dif->path, BLK_OPEN_READ, sb->s_type, NULL); - if (IS_ERR(bdev_handle)) - return PTR_ERR(bdev_handle); - dif->bdev_handle = bdev_handle; - dif->dax_dev = fs_dax_get_by_bdev(bdev_handle->bdev, + if (IS_ERR(bdev_file)) + return PTR_ERR(bdev_file); + dif->bdev_file = bdev_file; + dif->dax_dev = fs_dax_get_by_bdev(file_bdev(bdev_file), &dif->dax_part_off, NULL, NULL); } @@ -579,13 +579,6 @@ static const struct export_operations erofs_export_ops = { .get_parent = erofs_get_parent, }; -static int erofs_fc_fill_pseudo_super(struct super_block *sb, struct fs_context *fc) -{ - static const struct tree_descr empty_descr = {""}; - - return simple_fill_super(sb, EROFS_SUPER_MAGIC, &empty_descr); -} - static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) { struct inode *inode; @@ -712,11 +705,6 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) return 0; } -static int erofs_fc_anon_get_tree(struct fs_context *fc) -{ - return get_tree_nodev(fc, erofs_fc_fill_pseudo_super); -} - static int erofs_fc_get_tree(struct fs_context *fc) { struct erofs_fs_context *ctx = fc->fs_private; @@ -754,8 +742,8 @@ static int erofs_release_device_info(int id, void *ptr, void *data) struct erofs_device_info *dif = ptr; fs_put_dax(dif->dax_dev, NULL); - if (dif->bdev_handle) - bdev_release(dif->bdev_handle); + if (dif->bdev_file) + fput(dif->bdev_file); erofs_fscache_unregister_cookie(dif->fscache); dif->fscache = NULL; kfree(dif->path); @@ -789,20 +777,10 @@ static const struct fs_context_operations erofs_context_ops = { .free = erofs_fc_free, }; -static const struct fs_context_operations erofs_anon_context_ops = { - .get_tree = erofs_fc_anon_get_tree, -}; - static int erofs_init_fs_context(struct fs_context *fc) { struct erofs_fs_context *ctx; - /* pseudo mount for anon inodes */ - if (fc->sb_flags & SB_KERNMOUNT) { - fc->ops = &erofs_anon_context_ops; - return 0; - } - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; @@ -824,12 +802,6 @@ static void erofs_kill_sb(struct super_block *sb) { struct erofs_sb_info *sbi; - /* pseudo mount for anon inodes */ - if (sb->s_flags & SB_KERNMOUNT) { - kill_anon_super(sb); - return; - } - if (erofs_is_fscache_mode(sb)) kill_anon_super(sb); else @@ -868,7 +840,7 @@ static void erofs_put_super(struct super_block *sb) erofs_fscache_unregister_fs(sb); } -struct file_system_type erofs_fs_type = { +static struct file_system_type erofs_fs_type = { .owner = THIS_MODULE, .name = "erofs", .init_fs_context = erofs_init_fs_context, @@ -885,7 +857,7 @@ static int __init erofs_module_init(void) erofs_inode_cachep = kmem_cache_create("erofs_inode", sizeof(struct erofs_inode), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT, + SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, erofs_inode_init_once); if (!erofs_inode_cachep) return -ENOMEM; diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index 5dea308764b4..518bdd69c823 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -81,7 +81,7 @@ struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb, repeat: xa_lock(&sbi->managed_pslots); pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index, - NULL, grp, GFP_NOFS); + NULL, grp, GFP_KERNEL); if (pre) { if (xa_is_err(pre)) { pre = ERR_PTR(xa_err(pre)); @@ -129,7 +129,7 @@ static bool 
erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, * the XArray. Otherwise some cached pages could be still attached to * the orphan old workgroup when the new one is available in the tree. */ - if (erofs_try_to_free_all_cached_pages(sbi, grp)) + if (erofs_try_to_free_all_cached_folios(sbi, grp)) goto out; /* diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 692c0c39be63..3216b920d369 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -19,7 +19,10 @@ typedef void *z_erofs_next_pcluster_t; struct z_erofs_bvec { - struct page *page; + union { + struct page *page; + struct folio *folio; + }; int offset; unsigned int end; }; @@ -82,6 +85,9 @@ struct z_erofs_pcluster { /* L: indicate several pageofs_outs or not */ bool multibases; + /* L: whether extra buffer allocations are best-effort */ + bool besteffort; + /* A: compressed bvecs (can be cached or inplaced pages) */ struct z_erofs_bvec compressed_bvecs[]; }; @@ -113,47 +119,46 @@ static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT; } +#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping) +static bool erofs_folio_is_managed(struct erofs_sb_info *sbi, struct folio *fo) +{ + return fo->mapping == MNGD_MAPPING(sbi); +} + /* - * bit 30: I/O error occurred on this page - * bit 0 - 29: remaining parts to complete this page + * bit 30: I/O error occurred on this folio + * bit 0 - 29: remaining parts to complete this folio */ -#define Z_EROFS_PAGE_EIO (1 << 30) +#define Z_EROFS_FOLIO_EIO (1 << 30) -static inline void z_erofs_onlinepage_init(struct page *page) +static void z_erofs_onlinefolio_init(struct folio *folio) { union { atomic_t o; - unsigned long v; + void *v; } u = { .o = ATOMIC_INIT(1) }; - set_page_private(page, u.v); - smp_wmb(); - SetPagePrivate(page); + folio->private = u.v; /* valid only if file-backed folio is locked */ } -static inline void z_erofs_onlinepage_split(struct page *page) +static void z_erofs_onlinefolio_split(struct folio *folio) { - atomic_inc((atomic_t *)&page->private); + atomic_inc((atomic_t *)&folio->private); } -static void z_erofs_onlinepage_endio(struct page *page, int err) +static void z_erofs_onlinefolio_end(struct folio *folio, int err) { int orig, v; - DBG_BUGON(!PagePrivate(page)); - do { - orig = atomic_read((atomic_t *)&page->private); - v = (orig - 1) | (err ? Z_EROFS_PAGE_EIO : 0); - } while (atomic_cmpxchg((atomic_t *)&page->private, orig, v) != orig); - - if (!(v & ~Z_EROFS_PAGE_EIO)) { - set_page_private(page, 0); - ClearPagePrivate(page); - if (!(v & Z_EROFS_PAGE_EIO)) - SetPageUptodate(page); - unlock_page(page); - } + orig = atomic_read((atomic_t *)&folio->private); + v = (orig - 1) | (err ? 
Z_EROFS_FOLIO_EIO : 0); + } while (atomic_cmpxchg((atomic_t *)&folio->private, orig, v) != orig); + + if (v & ~Z_EROFS_FOLIO_EIO) + return; + folio->private = 0; + folio_end_read(folio, !(v & Z_EROFS_FOLIO_EIO)); } #define Z_EROFS_ONSTACK_PAGES 32 @@ -230,7 +235,7 @@ static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, struct page *nextpage = *candidate_bvpage; if (!nextpage) { - nextpage = erofs_allocpage(pagepool, GFP_NOFS); + nextpage = erofs_allocpage(pagepool, GFP_KERNEL); if (!nextpage) return -ENOMEM; set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE); @@ -302,7 +307,7 @@ static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int size) if (nrpages > pcs->maxpages) continue; - pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS); + pcl = kmem_cache_zalloc(pcs->slab, GFP_KERNEL); if (!pcl) return ERR_PTR(-ENOMEM); pcl->pclustersize = size; @@ -563,25 +568,19 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; unsigned int i; - if (i_blocksize(fe->inode) != PAGE_SIZE) - return; - if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) + if (i_blocksize(fe->inode) != PAGE_SIZE || + fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) return; for (i = 0; i < pclusterpages; ++i) { struct page *page, *newpage; - void *t; /* mark pages just found for debugging */ - /* the compressed page was loaded before */ + /* Inaccurate check w/o locking to avoid unneeded lookups */ if (READ_ONCE(pcl->compressed_bvecs[i].page)) continue; page = find_get_page(mc, pcl->obj.index + i); - - if (page) { - t = (void *)((unsigned long)page | 1); - newpage = NULL; - } else { + if (!page) { /* I/O is needed, no possible to decompress directly */ standalone = false; if (!shouldalloc) @@ -595,11 +594,14 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) if (!newpage) continue; set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); - t = (void *)((unsigned long)newpage | 1); } - - if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t)) + spin_lock(&pcl->obj.lockref.lock); + if (!pcl->compressed_bvecs[i].page) { + pcl->compressed_bvecs[i].page = page ? page : newpage; + spin_unlock(&pcl->obj.lockref.lock); continue; + } + spin_unlock(&pcl->obj.lockref.lock); if (page) put_page(page); @@ -615,9 +617,9 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } -/* called by erofs_shrinker to get rid of all compressed_pages */ -int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, - struct erofs_workgroup *grp) +/* called by erofs_shrinker to get rid of all cached compressed bvecs */ +int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi, + struct erofs_workgroup *grp) { struct z_erofs_pcluster *const pcl = container_of(grp, struct z_erofs_pcluster, obj); @@ -625,27 +627,22 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, int i; DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); - /* - * refcount of workgroup is now freezed as 0, - * therefore no need to worry about available decompression users. 
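z_erofs_onlinefolio_init/_split/_end above keep a count of a folio's outstanding parts in folio->private and fold an error flag into bit 30 of the same word, so the final decrement both detects completion and knows whether any part failed. The CAS loop in isolation, as a runnable C11-atomics demo:

	#include <stdio.h>
	#include <stdatomic.h>

	#define EIO_BIT (1 << 30)	/* bit 30: error; bits 0-29: pending parts */

	static atomic_int state;

	static void part_end(int err)
	{
		int orig, v;

		do {
			orig = atomic_load(&state);
			v = (orig - 1) | (err ? EIO_BIT : 0);
		} while (!atomic_compare_exchange_weak(&state, &orig, v));

		if (v & ~EIO_BIT)
			return;		/* other parts still pending */
		printf("folio done, %s\n",
		       (v & EIO_BIT) ? "with I/O error" : "uptodate");
	}

	int main(void)
	{
		atomic_init(&state, 1);		/* init: one implicit part */
		atomic_fetch_add(&state, 1);	/* split: a second part */
		part_end(0);
		part_end(1);			/* last part reports the error */
		return 0;
	}

Once set, the error bit survives every later decrement, which is why the final finisher can mark the folio uptodate (or not) in one shot; in the patched code folio_end_read() performs that uptodate-and-unlock step as a single call.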
- */ + /* There is no actice user since the pcluster is now freezed */ for (i = 0; i < pclusterpages; ++i) { - struct page *page = pcl->compressed_bvecs[i].page; + struct folio *folio = pcl->compressed_bvecs[i].folio; - if (!page) + if (!folio) continue; - /* block other users from reclaiming or migrating the page */ - if (!trylock_page(page)) + /* Avoid reclaiming or migrating this folio */ + if (!folio_trylock(folio)) return -EBUSY; - if (!erofs_page_is_managed(sbi, page)) + if (!erofs_folio_is_managed(sbi, folio)) continue; - - /* barrier is implied in the following 'unlock_page' */ - WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); - detach_page_private(page); - unlock_page(page); + pcl->compressed_bvecs[i].folio = NULL; + folio_detach_private(folio); + folio_unlock(folio); } return 0; } @@ -662,20 +659,17 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp) ret = false; spin_lock(&pcl->obj.lockref.lock); - if (pcl->obj.lockref.count > 0) - goto out; - - DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); - for (i = 0; i < pclusterpages; ++i) { - if (pcl->compressed_bvecs[i].page == &folio->page) { - WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); - ret = true; - break; + if (pcl->obj.lockref.count <= 0) { + DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); + for (i = 0; i < pclusterpages; ++i) { + if (pcl->compressed_bvecs[i].folio == folio) { + pcl->compressed_bvecs[i].folio = NULL; + folio_detach_private(folio); + ret = true; + break; + } } } - if (ret) - folio_detach_private(folio); -out: spin_unlock(&pcl->obj.lockref.lock); return ret; } @@ -694,7 +688,7 @@ static void z_erofs_cache_invalidate_folio(struct folio *folio, DBG_BUGON(stop > folio_size(folio) || stop < length); if (offset == 0 && stop == folio_size(folio)) - while (!z_erofs_cache_release_folio(folio, GFP_NOFS)) + while (!z_erofs_cache_release_folio(folio, 0)) cond_resched(); } @@ -713,36 +707,30 @@ int erofs_init_managed_cache(struct super_block *sb) set_nlink(inode, 1); inode->i_size = OFFSET_MAX; inode->i_mapping->a_ops = &z_erofs_cache_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL); EROFS_SB(sb)->managed_cache = inode; return 0; } -static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, - struct z_erofs_bvec *bvec) -{ - struct z_erofs_pcluster *const pcl = fe->pcl; - - while (fe->icur > 0) { - if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page, - NULL, bvec->page)) { - pcl->compressed_bvecs[fe->icur] = *bvec; - return true; - } - } - return false; -} - /* callers must be with pcluster lock held */ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, struct z_erofs_bvec *bvec, bool exclusive) { + struct z_erofs_pcluster *pcl = fe->pcl; int ret; if (exclusive) { /* give priority for inplaceio to use file pages first */ - if (z_erofs_try_inplace_io(fe, bvec)) + spin_lock(&pcl->obj.lockref.lock); + while (fe->icur > 0) { + if (pcl->compressed_bvecs[--fe->icur].page) + continue; + pcl->compressed_bvecs[fe->icur] = *bvec; + spin_unlock(&pcl->obj.lockref.lock); return 0; + } + spin_unlock(&pcl->obj.lockref.lock); + /* otherwise, check if it can be used as a bvpage */ if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED && !fe->candidate_bvpage) @@ -963,20 +951,20 @@ static int z_erofs_read_fragment(struct super_block *sb, struct page *page, return 0; } -static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, - struct page *page) +static int z_erofs_scan_folio(struct z_erofs_decompress_frontend *fe, + struct 
folio *folio, bool ra) { struct inode *const inode = fe->inode; struct erofs_map_blocks *const map = &fe->map; - const loff_t offset = page_offset(page); - const unsigned int bs = i_blocksize(inode); + const loff_t offset = folio_pos(folio); + const unsigned int bs = i_blocksize(inode), fs = folio_size(folio); bool tight = true, exclusive; unsigned int cur, end, len, split; int err = 0; - z_erofs_onlinepage_init(page); + z_erofs_onlinefolio_init(folio); split = 0; - end = PAGE_SIZE; + end = fs; repeat: if (offset + end - 1 < map->m_la || offset + end - 1 >= map->m_la + map->m_llen) { @@ -993,7 +981,7 @@ repeat: ++split; if (!(map->m_flags & EROFS_MAP_MAPPED)) { - zero_user_segment(page, cur, end); + folio_zero_segment(folio, cur, end); tight = false; goto next_part; } @@ -1002,8 +990,8 @@ repeat: erofs_off_t fpos = offset + cur - map->m_la; len = min_t(unsigned int, map->m_llen - fpos, end - cur); - err = z_erofs_read_fragment(inode->i_sb, page, cur, cur + len, - EROFS_I(inode)->z_fragmentoff + fpos); + err = z_erofs_read_fragment(inode->i_sb, &folio->page, cur, + cur + len, EROFS_I(inode)->z_fragmentoff + fpos); if (err) goto out; tight = false; @@ -1014,28 +1002,29 @@ repeat: err = z_erofs_pcluster_begin(fe); if (err) goto out; + fe->pcl->besteffort |= !ra; } /* - * Ensure the current partial page belongs to this submit chain rather + * Ensure the current partial folio belongs to this submit chain rather * than other concurrent submit chains or the noio(bypass) chain since - * those chains are handled asynchronously thus the page cannot be used + * those chains are handled asynchronously thus the folio cannot be used * for inplace I/O or bvpage (should be processed in a strict order.) */ tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); - exclusive = (!cur && ((split <= 1) || (tight && bs == PAGE_SIZE))); + exclusive = (!cur && ((split <= 1) || (tight && bs == fs))); if (cur) tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED); err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) { - .page = page, + .page = &folio->page, .offset = offset - map->m_la, .end = end, }), exclusive); if (err) goto out; - z_erofs_onlinepage_split(page); + z_erofs_onlinefolio_split(folio); if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) fe->pcl->multibases = true; if (fe->pcl->length < offset + end - map->m_la) { @@ -1056,7 +1045,7 @@ next_part: goto repeat; out: - z_erofs_onlinepage_endio(page, err); + z_erofs_onlinefolio_end(folio, err); return err; } @@ -1159,7 +1148,7 @@ static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be, cur += len; } kunmap_local(dst); - z_erofs_onlinepage_endio(bvi->bvec.page, err); + z_erofs_onlinefolio_end(page_folio(bvi->bvec.page), err); list_del(p); kfree(bvi); } @@ -1210,7 +1199,7 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, be->compressed_pages[i] = page; if (z_erofs_is_inline_pcluster(pcl) || - erofs_page_is_managed(EROFS_SB(be->sb), page)) { + erofs_folio_is_managed(EROFS_SB(be->sb), page_folio(page))) { if (!PageUptodate(page)) err = -EIO; continue; @@ -1280,6 +1269,9 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, .inplace_io = overlapped, .partial_decoding = pcl->partial, .fillgaps = pcl->multibases, + .gfp = pcl->besteffort ? 
+ GFP_KERNEL | __GFP_NOFAIL : + GFP_NOWAIT | __GFP_NORETRY }, be->pagepool); /* must handle all compressed pages before actual file pages */ @@ -1292,7 +1284,8 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, /* consider shortlived pages added when decompressing */ page = be->compressed_pages[i]; - if (!page || erofs_page_is_managed(sbi, page)) + if (!page || + erofs_folio_is_managed(sbi, page_folio(page))) continue; (void)z_erofs_put_shortlivedpage(be->pagepool, page); WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); @@ -1313,7 +1306,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, /* recycle all individual short-lived pages */ if (z_erofs_put_shortlivedpage(be->pagepool, page)) continue; - z_erofs_onlinepage_endio(page, err); + z_erofs_onlinefolio_end(page_folio(page), err); } if (be->decompressed_pages != be->onstack_pages) @@ -1322,6 +1315,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, pcl->length = 0; pcl->partial = true; pcl->multibases = false; + pcl->besteffort = false; pcl->bvset.nextpage = NULL; pcl->vcnt = 0; @@ -1423,69 +1417,61 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec, { gfp_t gfp = mapping_gfp_mask(mc); bool tocache = false; - struct z_erofs_bvec *zbv = pcl->compressed_bvecs + nr; + struct z_erofs_bvec zbv; struct address_space *mapping; - struct page *page, *oldpage; - int justfound, bs = i_blocksize(f->inode); + struct page *page; + int bs = i_blocksize(f->inode); - /* Except for inplace pages, the entire page can be used for I/Os */ + /* Except for inplace folios, the entire folio can be used for I/Os */ bvec->bv_offset = 0; bvec->bv_len = PAGE_SIZE; repeat: - oldpage = READ_ONCE(zbv->page); - if (!oldpage) - goto out_allocpage; - - justfound = (unsigned long)oldpage & 1UL; - page = (struct page *)((unsigned long)oldpage & ~1UL); - bvec->bv_page = page; + spin_lock(&pcl->obj.lockref.lock); + zbv = pcl->compressed_bvecs[nr]; + spin_unlock(&pcl->obj.lockref.lock); + if (!zbv.folio) + goto out_allocfolio; - DBG_BUGON(z_erofs_is_shortlived_page(page)); + bvec->bv_page = &zbv.folio->page; + DBG_BUGON(z_erofs_is_shortlived_page(bvec->bv_page)); /* - * Handle preallocated cached pages. We tried to allocate such pages + * Handle preallocated cached folios. We tried to allocate such folios * without triggering direct reclaim. If allocation failed, inplace - * file-backed pages will be used instead. + * file-backed folios will be used instead. */ - if (page->private == Z_EROFS_PREALLOCATED_PAGE) { - set_page_private(page, 0); - WRITE_ONCE(zbv->page, page); + if (zbv.folio->private == (void *)Z_EROFS_PREALLOCATED_PAGE) { + zbv.folio->private = 0; tocache = true; goto out_tocache; } - mapping = READ_ONCE(page->mapping); + mapping = READ_ONCE(zbv.folio->mapping); /* - * File-backed pages for inplace I/Os are all locked steady, + * File-backed folios for inplace I/Os are all locked steady, * therefore it is impossible for `mapping` to be NULL. 
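The rq->gfp field threaded through the decompressors above lets readahead-only pclusters allocate their extra temporary buffers with GFP_NOWAIT | __GFP_NORETRY and simply fail with -ENOMEM under memory pressure, while any pcluster touched by an explicit read sets pcl->besteffort and keeps the old must-succeed GFP_KERNEL | __GFP_NOFAIL behavior. A loose user-space sketch of that policy split, where try_alloc() is a made-up stand-in for erofs_allocpage():

	#include <stdio.h>
	#include <stdlib.h>
	#include <errno.h>
	#include <stdbool.h>

	/* Stand-in for erofs_allocpage(): may fail only in nowait mode. */
	static void *try_alloc(size_t size, bool nowait)
	{
		if (nowait && rand() % 2)
			return NULL;	/* model allocation pressure */
		return malloc(size);
	}

	static int fill_output(void **slot, bool must_succeed)
	{
		void *p = try_alloc(4096, !must_succeed);

		if (!p)
			return -ENOMEM;	/* readahead: give up, redo on real read */
		*slot = p;
		return 0;
	}

	int main(void)
	{
		void *page = NULL;
		int err = fill_output(&page, false);	/* readahead path */

		if (err)
			err = fill_output(&page, true);	/* explicit read retries hard */
		printf("err=%d page=%p\n", err, page);
		free(page);
		return 0;
	}

Failing a speculative readahead is cheap, since the data will be fetched again when something actually reads it; only the demand path has to keep the NOFAIL guarantee.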
*/ if (mapping && mapping != mc) { - if (zbv->offset < 0) - bvec->bv_offset = round_up(-zbv->offset, bs); - bvec->bv_len = round_up(zbv->end, bs) - bvec->bv_offset; + if (zbv.offset < 0) + bvec->bv_offset = round_up(-zbv.offset, bs); + bvec->bv_len = round_up(zbv.end, bs) - bvec->bv_offset; return; } - lock_page(page); - /* only true if page reclaim goes wrong, should never happen */ - DBG_BUGON(justfound && PagePrivate(page)); - - /* the cached page is still in managed cache */ - if (page->mapping == mc) { - WRITE_ONCE(zbv->page, page); + folio_lock(zbv.folio); + if (zbv.folio->mapping == mc) { /* - * The cached page is still available but without a valid - * `->private` pcluster hint. Let's reconnect them. + * The cached folio is still in managed cache but without + * a valid `->private` pcluster hint. Let's reconnect them. */ - if (!PagePrivate(page)) { - DBG_BUGON(!justfound); - /* compressed_bvecs[] already takes a ref */ - attach_page_private(page, pcl); - put_page(page); + if (!folio_test_private(zbv.folio)) { + folio_attach_private(zbv.folio, pcl); + /* compressed_bvecs[] already takes a ref before */ + folio_put(zbv.folio); } /* no need to submit if it is already up-to-date */ - if (PageUptodate(page)) { - unlock_page(page); + if (folio_test_uptodate(zbv.folio)) { + folio_unlock(zbv.folio); bvec->bv_page = NULL; } return; @@ -1495,30 +1481,32 @@ repeat: * It has been truncated, so it's unsafe to reuse this one. Let's * allocate a new page for compressed data. */ - DBG_BUGON(page->mapping); - DBG_BUGON(!justfound); - + DBG_BUGON(zbv.folio->mapping); tocache = true; - unlock_page(page); - put_page(page); -out_allocpage: + folio_unlock(zbv.folio); + folio_put(zbv.folio); +out_allocfolio: page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL); - if (oldpage != cmpxchg(&zbv->page, oldpage, page)) { + spin_lock(&pcl->obj.lockref.lock); + if (pcl->compressed_bvecs[nr].folio) { erofs_pagepool_add(&f->pagepool, page); + spin_unlock(&pcl->obj.lockref.lock); cond_resched(); goto repeat; } + pcl->compressed_bvecs[nr].folio = zbv.folio = page_folio(page); + spin_unlock(&pcl->obj.lockref.lock); bvec->bv_page = page; out_tocache: if (!tocache || bs != PAGE_SIZE || - add_to_page_cache_lru(page, mc, pcl->obj.index + nr, gfp)) { - /* turn into a temporary shortlived page (1 ref) */ - set_page_private(page, Z_EROFS_SHORTLIVED_PAGE); + filemap_add_folio(mc, zbv.folio, pcl->obj.index + nr, gfp)) { + /* turn into a temporary shortlived folio (1 ref) */ + zbv.folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE; return; } - attach_page_private(page, pcl); + folio_attach_private(zbv.folio, pcl); /* drop a refcount added by allocpage (then 2 refs in total here) */ - put_page(page); + folio_put(zbv.folio); } static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb, @@ -1573,28 +1561,29 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, qtail[JQ_BYPASS] = &pcl->next; } -static void z_erofs_submissionqueue_endio(struct bio *bio) +static void z_erofs_endio(struct bio *bio) { struct z_erofs_decompressqueue *q = bio->bi_private; blk_status_t err = bio->bi_status; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + struct folio_iter fi; - bio_for_each_segment_all(bvec, bio, iter_all) { - struct page *page = bvec->bv_page; + bio_for_each_folio_all(fi, bio) { + struct folio *folio = fi.folio; - DBG_BUGON(PageUptodate(page)); - DBG_BUGON(z_erofs_page_is_invalidated(page)); - if (erofs_page_is_managed(EROFS_SB(q->sb), page)) { - if (!err) - SetPageUptodate(page); - 
unlock_page(page); - } + DBG_BUGON(folio_test_uptodate(folio)); + DBG_BUGON(z_erofs_page_is_invalidated(&folio->page)); + if (!erofs_folio_is_managed(EROFS_SB(q->sb), folio)) + continue; + + if (!err) + folio_mark_uptodate(folio); + folio_unlock(folio); } if (err) q->eio = true; z_erofs_decompress_kickoff(q, -1); - bio_put(bio); + if (bio->bi_bdev) + bio_put(bio); } static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, @@ -1608,7 +1597,6 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, z_erofs_next_pcluster_t owned_head = f->owned_head; /* bio is NULL initially, so no need to initialize last_{index,bdev} */ erofs_off_t last_pa; - struct block_device *last_bdev; unsigned int nr_bios = 0; struct bio *bio = NULL; unsigned long pflags; @@ -1655,9 +1643,13 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, continue; if (bio && (cur != last_pa || - last_bdev != mdev.m_bdev)) { -submit_bio_retry: - submit_bio(bio); + bio->bi_bdev != mdev.m_bdev)) { +io_retry: + if (!erofs_is_fscache_mode(sb)) + submit_bio(bio); + else + erofs_fscache_submit_bio(bio); + if (memstall) { psi_memstall_leave(&pflags); memstall = 0; @@ -1672,22 +1664,24 @@ submit_bio_retry: } if (!bio) { - bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS, - REQ_OP_READ, GFP_NOIO); - bio->bi_end_io = z_erofs_submissionqueue_endio; + bio = erofs_is_fscache_mode(sb) ? + erofs_fscache_bio_alloc(&mdev) : + bio_alloc(mdev.m_bdev, BIO_MAX_VECS, + REQ_OP_READ, GFP_NOIO); + bio->bi_end_io = z_erofs_endio; bio->bi_iter.bi_sector = cur >> 9; bio->bi_private = q[JQ_SUBMIT]; if (readahead) bio->bi_opf |= REQ_RAHEAD; ++nr_bios; - last_bdev = mdev.m_bdev; } if (cur + bvec.bv_len > end) bvec.bv_len = end - cur; + DBG_BUGON(bvec.bv_len < sb->s_blocksize); if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len, bvec.bv_offset)) - goto submit_bio_retry; + goto io_retry; last_pa = cur + bvec.bv_len; bypass = false; @@ -1700,7 +1694,10 @@ submit_bio_retry: } while (owned_head != Z_EROFS_PCLUSTER_TAIL); if (bio) { - submit_bio(bio); + if (!erofs_is_fscache_mode(sb)) + submit_bio(bio); + else + erofs_fscache_submit_bio(bio); if (memstall) psi_memstall_leave(&pflags); } @@ -1785,7 +1782,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, if (PageUptodate(page)) unlock_page(page); else - (void)z_erofs_do_read_page(f, page); + z_erofs_scan_folio(f, page_folio(page), !!rac); put_page(page); } @@ -1806,7 +1803,7 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT; z_erofs_pcluster_readmore(&f, NULL, true); - err = z_erofs_do_read_page(&f, &folio->page); + err = z_erofs_scan_folio(&f, folio, false); z_erofs_pcluster_readmore(&f, NULL, false); z_erofs_pcluster_end(&f); @@ -1847,7 +1844,7 @@ static void z_erofs_readahead(struct readahead_control *rac) folio = head; head = folio_get_private(folio); - err = z_erofs_do_read_page(&f, &folio->page); + err = z_erofs_scan_folio(&f, folio, true); if (err && err != -EINTR) erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu", folio->index, EROFS_I(inode)->nid); diff --git a/fs/eventfd.c b/fs/eventfd.c index ad8186d47ba7..9afdb722fa92 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -251,7 +251,7 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c ssize_t res; __u64 ucnt; - if (count < sizeof(ucnt)) + if (count != sizeof(ucnt)) return -EINVAL; if (copy_from_user(&ucnt, buf, sizeof(ucnt))) return -EFAULT; @@ -283,13 +283,18 @@ 
static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c static void eventfd_show_fdinfo(struct seq_file *m, struct file *f) { struct eventfd_ctx *ctx = f->private_data; + __u64 cnt; spin_lock_irq(&ctx->wqh.lock); - seq_printf(m, "eventfd-count: %16llx\n", - (unsigned long long)ctx->count); + cnt = ctx->count; spin_unlock_irq(&ctx->wqh.lock); - seq_printf(m, "eventfd-id: %d\n", ctx->id); - seq_printf(m, "eventfd-semaphore: %d\n", + + seq_printf(m, + "eventfd-count: %16llx\n" + "eventfd-id: %d\n" + "eventfd-semaphore: %d\n", + cnt, + ctx->id, !!(ctx->flags & EFD_SEMAPHORE)); } #endif @@ -383,6 +388,7 @@ static int do_eventfd(unsigned int count, int flags) /* Check the EFD_* constants for consistency. */ BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK); + BUILD_BUG_ON(EFD_SEMAPHORE != (1 << 0)); if (flags & ~EFD_FLAGS_SET) return -EINVAL; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 3534d36a1474..882b89edc52a 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -37,6 +37,7 @@ #include <linux/seq_file.h> #include <linux/compat.h> #include <linux/rculist.h> +#include <linux/capability.h> #include <net/busy_poll.h> /* @@ -206,7 +207,7 @@ struct eventpoll { */ struct epitem *ovflist; - /* wakeup_source used when ep_scan_ready_list is running */ + /* wakeup_source used when ep_send_events or __ep_eventpoll_poll is running */ struct wakeup_source *ws; /* The user that created the eventpoll descriptor */ @@ -227,6 +228,11 @@ struct eventpoll { #ifdef CONFIG_NET_RX_BUSY_POLL /* used to track busy poll napi_id */ unsigned int napi_id; + /* busy poll timeout */ + u32 busy_poll_usecs; + /* busy poll packet budget */ + u16 busy_poll_budget; + bool prefer_busy_poll; #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -387,11 +393,41 @@ static inline int ep_events_available(struct eventpoll *ep) } #ifdef CONFIG_NET_RX_BUSY_POLL +/** + * busy_loop_ep_timeout - check if busy poll has timed out. The timeout value + * from the epoll instance ep is preferred, but if it is not set fallback to + * the system-wide global via busy_loop_timeout. + * + * @start_time: The start time used to compute the remaining time until timeout. + * @ep: Pointer to the eventpoll context. + * + * Return: true if the timeout has expired, false otherwise. + */ +static bool busy_loop_ep_timeout(unsigned long start_time, + struct eventpoll *ep) +{ + unsigned long bp_usec = READ_ONCE(ep->busy_poll_usecs); + + if (bp_usec) { + unsigned long end_time = start_time + bp_usec; + unsigned long now = busy_loop_current_time(); + + return time_after(now, end_time); + } else { + return busy_loop_timeout(start_time); + } +} + +static bool ep_busy_loop_on(struct eventpoll *ep) +{ + return !!ep->busy_poll_usecs || net_busy_loop_on(); +} + static bool ep_busy_loop_end(void *p, unsigned long start_time) { struct eventpoll *ep = p; - return ep_events_available(ep) || busy_loop_timeout(start_time); + return ep_events_available(ep) || busy_loop_ep_timeout(start_time, ep); } /* @@ -403,10 +439,15 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time) static bool ep_busy_loop(struct eventpoll *ep, int nonblock) { unsigned int napi_id = READ_ONCE(ep->napi_id); + u16 budget = READ_ONCE(ep->busy_poll_budget); + bool prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll); - if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) { - napi_busy_loop(napi_id, nonblock ? 
NULL : ep_busy_loop_end, ep, false, - BUSY_POLL_BUDGET); + if (!budget) + budget = BUSY_POLL_BUDGET; + + if (napi_id >= MIN_NAPI_ID && ep_busy_loop_on(ep)) { + napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, + ep, prefer_busy_poll, budget); if (ep_events_available(ep)) return true; /* @@ -425,12 +466,12 @@ static bool ep_busy_loop(struct eventpoll *ep, int nonblock) */ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) { - struct eventpoll *ep; + struct eventpoll *ep = epi->ep; unsigned int napi_id; struct socket *sock; struct sock *sk; - if (!net_busy_loop_on()) + if (!ep_busy_loop_on(ep)) return; sock = sock_from_file(epi->ffd.file); @@ -442,7 +483,6 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) return; napi_id = READ_ONCE(sk->sk_napi_id); - ep = epi->ep; /* Non-NAPI IDs can be rejected * or @@ -455,6 +495,49 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) ep->napi_id = napi_id; } +static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct eventpoll *ep = file->private_data; + void __user *uarg = (void __user *)arg; + struct epoll_params epoll_params; + + switch (cmd) { + case EPIOCSPARAMS: + if (copy_from_user(&epoll_params, uarg, sizeof(epoll_params))) + return -EFAULT; + + /* pad byte must be zero */ + if (epoll_params.__pad) + return -EINVAL; + + if (epoll_params.busy_poll_usecs > S32_MAX) + return -EINVAL; + + if (epoll_params.prefer_busy_poll > 1) + return -EINVAL; + + if (epoll_params.busy_poll_budget > NAPI_POLL_WEIGHT && + !capable(CAP_NET_ADMIN)) + return -EPERM; + + WRITE_ONCE(ep->busy_poll_usecs, epoll_params.busy_poll_usecs); + WRITE_ONCE(ep->busy_poll_budget, epoll_params.busy_poll_budget); + WRITE_ONCE(ep->prefer_busy_poll, epoll_params.prefer_busy_poll); + return 0; + case EPIOCGPARAMS: + memset(&epoll_params, 0, sizeof(epoll_params)); + epoll_params.busy_poll_usecs = READ_ONCE(ep->busy_poll_usecs); + epoll_params.busy_poll_budget = READ_ONCE(ep->busy_poll_budget); + epoll_params.prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll); + if (copy_to_user(uarg, &epoll_params, sizeof(epoll_params))) + return -EFAULT; + return 0; + default: + return -ENOIOCTLCMD; + } +} + #else static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock) @@ -466,6 +549,12 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) { } +static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + return -EOPNOTSUPP; +} + #endif /* CONFIG_NET_RX_BUSY_POLL */ /* @@ -678,12 +767,6 @@ static void ep_done_scan(struct eventpoll *ep, write_unlock_irq(&ep->lock); } -static void epi_rcu_free(struct rcu_head *head) -{ - struct epitem *epi = container_of(head, struct epitem, rcu); - kmem_cache_free(epi_cache, epi); -} - static void ep_get(struct eventpoll *ep) { refcount_inc(&ep->refcount); @@ -767,7 +850,7 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make * use of the rbn field. 
*/ - call_rcu(&epi->rcu, epi_rcu_free); + kfree_rcu(epi, rcu); percpu_counter_dec(&ep->user->epoll_watches); return ep_refcount_dec_and_test(ep); @@ -825,6 +908,27 @@ static void ep_clear_and_put(struct eventpoll *ep) ep_free(ep); } +static long ep_eventpoll_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + int ret; + + if (!is_file_epoll(file)) + return -EINVAL; + + switch (cmd) { + case EPIOCSPARAMS: + case EPIOCGPARAMS: + ret = ep_eventpoll_bp_ioctl(file, cmd, arg); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + static int ep_eventpoll_release(struct inode *inode, struct file *file) { struct eventpoll *ep = file->private_data; @@ -931,6 +1035,8 @@ static const struct file_operations eventpoll_fops = { .release = ep_eventpoll_release, .poll = ep_eventpoll_poll, .llseek = noop_llseek, + .unlocked_ioctl = ep_eventpoll_ioctl, + .compat_ioctl = compat_ptr_ioctl, }; /* @@ -1153,7 +1259,7 @@ static inline bool chain_epi_lockless(struct epitem *epi) * This callback takes a read lock in order not to contend with concurrent * events from another file descriptor, thus all modifications to ->rdllist * or ->ovflist are lockless. Read lock is paired with the write lock from - * ep_scan_ready_list(), which stops all list modifications and guarantees + * ep_start/done_scan(), which stops all list modifications and guarantees * that lists state is seen correctly. * * Another thing worth to mention is that ep_poll_callback() can be called @@ -1751,7 +1857,7 @@ static int ep_send_events(struct eventpoll *ep, * availability. At this point, no one can insert * into ep->rdllist besides us. The epoll_ctl() * callers are locked out by - * ep_scan_ready_list() holding "mtx" and the + * ep_send_events() holding "mtx" and the * poll callback will queue them in ep->ovflist. */ list_add_tail(&epi->rdllink, &ep->rdllist); @@ -1904,7 +2010,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, __set_current_state(TASK_INTERRUPTIBLE); /* - * Do the final check under the lock. ep_scan_ready_list() + * Do the final check under the lock. ep_start/done_scan() * plays with two lists (->rdllist and ->ovflist) and there * is always a race when both lists are empty for short * period of time although events are pending, so lock is @@ -2058,6 +2164,11 @@ static int do_epoll_create(int flags) error = PTR_ERR(file); goto out_free_fd; } +#ifdef CONFIG_NET_RX_BUSY_POLL + ep->busy_poll_usecs = 0; + ep->busy_poll_budget = 0; + ep->prefer_busy_poll = false; +#endif ep->file = file; fd_install(fd, file); return fd; diff --git a/fs/exec.c b/fs/exec.c index 8cdd5b2dd09c..ff6f26671cfc 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -128,7 +128,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) struct filename *tmp = getname(library); int error = PTR_ERR(tmp); static const struct open_flags uselib_flags = { - .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, + .open_flag = O_LARGEFILE | O_RDONLY, .acc_mode = MAY_READ | MAY_EXEC, .intent = LOOKUP_OPEN, .lookup_flags = LOOKUP_FOLLOW, @@ -904,6 +904,10 @@ EXPORT_SYMBOL(transfer_args_to_stack); #endif /* CONFIG_MMU */ +/* + * On success, caller must call do_close_execat() on the returned + * struct file to close it. + */ static struct file *do_open_execat(int fd, struct filename *name, int flags) { struct file *file; @@ -948,6 +952,17 @@ exit: return ERR_PTR(err); } +/** + * open_exec - Open a path name for execution + * + * @name: path name to open with the intent of executing it. 
+ * + * Returns ERR_PTR on failure or allocated struct file on success. + * + * As this is a wrapper for the internal do_open_execat(), callers + * must call allow_write_access() before fput() on release. Also see + * do_close_execat(). + */ struct file *open_exec(const char *name) { struct filename *filename = getname_kernel(name); @@ -1143,7 +1158,6 @@ static int de_thread(struct task_struct *tsk) BUG_ON(leader->exit_state != EXIT_ZOMBIE); leader->exit_state = EXIT_DEAD; - /* * We are going to release_task()->ptrace_unlink() silently, * the tracer can sleep in do_wait(). EXIT_DEAD guarantees @@ -1409,6 +1423,9 @@ int begin_new_exec(struct linux_binprm * bprm) out_unlock: up_write(&me->signal->exec_update_lock); + if (!bprm->cred) + mutex_unlock(&me->signal->cred_guard_mutex); + out: return retval; } @@ -1484,6 +1501,15 @@ static int prepare_bprm_creds(struct linux_binprm *bprm) return -ENOMEM; } +/* Matches do_open_execat() */ +static void do_close_execat(struct file *file) +{ + if (!file) + return; + allow_write_access(file); + fput(file); +} + static void free_bprm(struct linux_binprm *bprm) { if (bprm->mm) { @@ -1495,10 +1521,7 @@ static void free_bprm(struct linux_binprm *bprm) mutex_unlock(¤t->signal->cred_guard_mutex); abort_creds(bprm->cred); } - if (bprm->file) { - allow_write_access(bprm->file); - fput(bprm->file); - } + do_close_execat(bprm->file); if (bprm->executable) fput(bprm->executable); /* If a binfmt changed the interp, free it. */ @@ -1520,8 +1543,7 @@ static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int fl bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); if (!bprm) { - allow_write_access(file); - fput(file); + do_close_execat(file); return ERR_PTR(-ENOMEM); } @@ -1610,6 +1632,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm) } rcu_read_unlock(); + /* "users" and "in_exec" locked for copy_fs() */ if (p->fs->users > n_fs) bprm->unsafe |= LSM_UNSAFE_SHARE; else @@ -1696,7 +1719,6 @@ static int prepare_binprm(struct linux_binprm *bprm) */ int remove_arg_zero(struct linux_binprm *bprm) { - int ret = 0; unsigned long offset; char *kaddr; struct page *page; @@ -1707,10 +1729,8 @@ int remove_arg_zero(struct linux_binprm *bprm) do { offset = bprm->p & ~PAGE_MASK; page = get_arg_page(bprm, bprm->p, 0); - if (!page) { - ret = -EFAULT; - goto out; - } + if (!page) + return -EFAULT; kaddr = kmap_local_page(page); for (; offset < PAGE_SIZE && kaddr[offset]; @@ -1723,10 +1743,8 @@ int remove_arg_zero(struct linux_binprm *bprm) bprm->p++; bprm->argc--; - ret = 0; -out: - return ret; + return 0; } EXPORT_SYMBOL(remove_arg_zero); @@ -1826,9 +1844,6 @@ static int exec_binprm(struct linux_binprm *bprm) return 0; } -/* - * sys_execve() executes a new program. 
- */ static int bprm_execve(struct linux_binprm *bprm) { int retval; diff --git a/fs/exfat/cache.c b/fs/exfat/cache.c index 5a2f119b7e8c..7cc200d89821 100644 --- a/fs/exfat/cache.c +++ b/fs/exfat/cache.c @@ -46,7 +46,7 @@ int exfat_cache_init(void) { exfat_cachep = kmem_cache_create("exfat_cache", sizeof(struct exfat_cache), - 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + 0, SLAB_RECLAIM_ACCOUNT, exfat_cache_init_once); if (!exfat_cachep) return -ENOMEM; diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 9474cd50da6d..361595433480 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -275,6 +275,7 @@ struct exfat_sb_info { spinlock_t inode_hash_lock; struct hlist_head inode_hashtable[EXFAT_HASH_SIZE]; + struct rcu_head rcu; }; #define EXFAT_CACHE_VALID 0 diff --git a/fs/exfat/file.c b/fs/exfat/file.c index d25a96a148af..cc00f1a7a1e1 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -35,13 +35,18 @@ static int exfat_cont_expand(struct inode *inode, loff_t size) if (new_num_clusters == num_clusters) goto out; - exfat_chain_set(&clu, ei->start_clu, num_clusters, ei->flags); - ret = exfat_find_last_cluster(sb, &clu, &last_clu); - if (ret) - return ret; + if (num_clusters) { + exfat_chain_set(&clu, ei->start_clu, num_clusters, ei->flags); + ret = exfat_find_last_cluster(sb, &clu, &last_clu); + if (ret) + return ret; + + clu.dir = last_clu + 1; + } else { + last_clu = EXFAT_EOF_CLUSTER; + clu.dir = EXFAT_EOF_CLUSTER; + } - clu.dir = (last_clu == EXFAT_EOF_CLUSTER) ? - EXFAT_EOF_CLUSTER : last_clu + 1; clu.size = 0; clu.flags = ei->flags; @@ -51,17 +56,19 @@ static int exfat_cont_expand(struct inode *inode, loff_t size) return ret; /* Append new clusters to chain */ - if (clu.flags != ei->flags) { - exfat_chain_cont_cluster(sb, ei->start_clu, num_clusters); - ei->flags = ALLOC_FAT_CHAIN; - } - if (clu.flags == ALLOC_FAT_CHAIN) - if (exfat_ent_set(sb, last_clu, clu.dir)) - goto free_clu; - - if (num_clusters == 0) + if (num_clusters) { + if (clu.flags != ei->flags) + if (exfat_chain_cont_cluster(sb, ei->start_clu, num_clusters)) + goto free_clu; + + if (clu.flags == ALLOC_FAT_CHAIN) + if (exfat_ent_set(sb, last_clu, clu.dir)) + goto free_clu; + } else ei->start_clu = clu.dir; + ei->flags = clu.flags; + out: inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); /* Expanded range not zeroed, do not update valid_size */ diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 522edcbb2ce4..0687f952956c 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -501,7 +501,7 @@ static ssize_t exfat_direct_IO(struct kiocb *iocb, struct iov_iter *iter) struct inode *inode = mapping->host; struct exfat_inode_info *ei = EXFAT_I(inode); loff_t pos = iocb->ki_pos; - loff_t size = iocb->ki_pos + iov_iter_count(iter); + loff_t size = pos + iov_iter_count(iter); int rw = iov_iter_rw(iter); ssize_t ret; @@ -525,11 +525,10 @@ static ssize_t exfat_direct_IO(struct kiocb *iocb, struct iov_iter *iter) */ ret = blockdev_direct_IO(iocb, inode, iter, exfat_get_block); if (ret < 0) { - if (rw == WRITE) + if (rw == WRITE && ret != -EIOCBQUEUED) exfat_write_failed(mapping, size); - if (ret != -EIOCBQUEUED) - return ret; + return ret; } else size = pos + ret; diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c index 705710f93e2d..afdf13c34ff5 100644 --- a/fs/exfat/nls.c +++ b/fs/exfat/nls.c @@ -655,7 +655,6 @@ static int exfat_load_upcase_table(struct super_block *sb, unsigned int sect_size = sb->s_blocksize; unsigned int i, index = 0; u32 chksum = 0; - int ret; unsigned char skip = false; unsigned short 
*upcase_table; @@ -673,8 +672,7 @@ static int exfat_load_upcase_table(struct super_block *sb, if (!bh) { exfat_err(sb, "failed to read sector(0x%llx)", (unsigned long long)sector); - ret = -EIO; - goto free_table; + return -EIO; } sector++; for (i = 0; i < sect_size && index <= 0xFFFF; i += 2) { @@ -701,15 +699,12 @@ static int exfat_load_upcase_table(struct super_block *sb, exfat_err(sb, "failed to load upcase table (idx : 0x%08x, chksum : 0x%08x, utbl_chksum : 0x%08x)", index, chksum, utbl_checksum); - ret = -EINVAL; -free_table: - exfat_free_upcase_table(sbi); - return ret; + return -EINVAL; } static int exfat_load_default_upcase_table(struct super_block *sb) { - int i, ret = -EIO; + int i; struct exfat_sb_info *sbi = EXFAT_SB(sb); unsigned char skip = false; unsigned short uni = 0, *upcase_table; @@ -740,8 +735,7 @@ static int exfat_load_default_upcase_table(struct super_block *sb) return 0; /* FATAL error: default upcase table has error */ - exfat_free_upcase_table(sbi); - return ret; + return -EIO; } int exfat_create_upcase_table(struct super_block *sb) diff --git a/fs/exfat/super.c b/fs/exfat/super.c index d9d4fa91010b..3d5ea2cfad66 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -39,9 +39,6 @@ static void exfat_put_super(struct super_block *sb) exfat_free_bitmap(sbi); brelse(sbi->boot_bh); mutex_unlock(&sbi->s_lock); - - unload_nls(sbi->nls_io); - exfat_free_upcase_table(sbi); } static int exfat_sync_fs(struct super_block *sb, int wait) @@ -600,7 +597,7 @@ static int __exfat_fill_super(struct super_block *sb) ret = exfat_load_bitmap(sb); if (ret) { exfat_err(sb, "failed to load alloc-bitmap"); - goto free_upcase_table; + goto free_bh; } ret = exfat_count_used_clusters(sb, &sbi->used_clusters); @@ -613,8 +610,6 @@ static int __exfat_fill_super(struct super_block *sb) free_alloc_bitmap: exfat_free_bitmap(sbi); -free_upcase_table: - exfat_free_upcase_table(sbi); free_bh: brelse(sbi->boot_bh); return ret; @@ -701,12 +696,10 @@ put_inode: sb->s_root = NULL; free_table: - exfat_free_upcase_table(sbi); exfat_free_bitmap(sbi); brelse(sbi->boot_bh); check_nls_io: - unload_nls(sbi->nls_io); return err; } @@ -771,13 +764,22 @@ static int exfat_init_fs_context(struct fs_context *fc) return 0; } +static void delayed_free(struct rcu_head *p) +{ + struct exfat_sb_info *sbi = container_of(p, struct exfat_sb_info, rcu); + + unload_nls(sbi->nls_io); + exfat_free_upcase_table(sbi); + exfat_free_sbi(sbi); +} + static void exfat_kill_sb(struct super_block *sb) { struct exfat_sb_info *sbi = sb->s_fs_info; kill_block_super(sb); if (sbi) - exfat_free_sbi(sbi); + call_rcu(&sbi->rcu, delayed_free); } static struct file_system_type exfat_fs_type = { @@ -811,7 +813,7 @@ static int __init init_exfat_fs(void) exfat_inode_cachep = kmem_cache_create("exfat_inode_cache", sizeof(struct exfat_inode_info), - 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + 0, SLAB_RECLAIM_ACCOUNT, exfat_inode_init_once); if (!exfat_inode_cachep) { err = -ENOMEM; diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 3ae0154c5680..07ea3d62b298 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -255,7 +255,7 @@ static bool filldir_one(struct dir_context *ctx, const char *name, int len, container_of(ctx, struct getdents_callback, ctx); buf->sequence++; - if (buf->ino == ino && len <= NAME_MAX) { + if (buf->ino == ino && len <= NAME_MAX && !is_dot_dotdot(name, len)) { memcpy(buf->name, name, len); buf->name[len] = '\0'; buf->found = 1; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 01f9addc8b1f..cabea887314d 
100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -213,8 +213,7 @@ static int __init init_inodecache(void) { ext2_inode_cachep = kmem_cache_create_usercopy("ext2_inode_cache", sizeof(struct ext2_inode_info), 0, - (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| - SLAB_ACCOUNT), + SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, offsetof(struct ext2_inode_info, i_data), sizeof_field(struct ext2_inode_info, i_data), init_once); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a5d784872303..3c0d7d143036 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -252,8 +252,10 @@ struct ext4_allocation_request { #define EXT4_MAP_MAPPED BIT(BH_Mapped) #define EXT4_MAP_UNWRITTEN BIT(BH_Unwritten) #define EXT4_MAP_BOUNDARY BIT(BH_Boundary) +#define EXT4_MAP_DELAYED BIT(BH_Delay) #define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ - EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY) + EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ + EXT4_MAP_DELAYED) struct ext4_map_blocks { ext4_fsblk_t m_pblk; @@ -1548,7 +1550,7 @@ struct ext4_sb_info { unsigned long s_commit_interval; u32 s_max_batch_time; u32 s_min_batch_time; - struct bdev_handle *s_journal_bdev_handle; + struct file *s_journal_bdev_file; #ifdef CONFIG_QUOTA /* Names of quota files with journalled quota */ char __rcu *s_qf_names[EXT4_MAXQUOTAS]; @@ -2912,10 +2914,10 @@ extern const struct seq_operations ext4_mb_seq_groups_ops; extern const struct seq_operations ext4_mb_seq_structs_summary_ops; extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset); extern int ext4_mb_init(struct super_block *); -extern int ext4_mb_release(struct super_block *); +extern void ext4_mb_release(struct super_block *); extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, struct ext4_allocation_request *, int *); -extern void ext4_discard_preallocations(struct inode *, unsigned int); +extern void ext4_discard_preallocations(struct inode *); extern int __init ext4_init_mballoc(void); extern void ext4_exit_mballoc(void); extern ext4_group_t ext4_mb_prefetch(struct super_block *sb, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 01299b55a567..7669d154c05e 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -100,7 +100,7 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped) * i_rwsem. So we can safely drop the i_data_sem here. */ BUG_ON(EXT4_JOURNAL(inode) == NULL); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); *dropped = 1; return 0; @@ -2229,7 +2229,7 @@ static int ext4_fill_es_cache_info(struct inode *inode, /* - * ext4_ext_determine_hole - determine hole around given block + * ext4_ext_find_hole - find hole around given block according to the given path * @inode: inode we lookup in * @path: path in extent tree to @lblk * @lblk: pointer to logical block around which we want to determine hole @@ -2241,9 +2241,9 @@ static int ext4_fill_es_cache_info(struct inode *inode, * The function returns the length of a hole starting at @lblk. We update @lblk * to the beginning of the hole if we managed to find it. 
*/ -static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode, - struct ext4_ext_path *path, - ext4_lblk_t *lblk) +static ext4_lblk_t ext4_ext_find_hole(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t *lblk) { int depth = ext_depth(inode); struct ext4_extent *ex; @@ -2271,30 +2271,6 @@ static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode, } /* - * ext4_ext_put_gap_in_cache: - * calculate boundaries of the gap that the requested block fits into - * and cache this gap - */ -static void -ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start, - ext4_lblk_t hole_len) -{ - struct extent_status es; - - ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start, - hole_start + hole_len - 1, &es); - if (es.es_len) { - /* There's delayed extent containing lblock? */ - if (es.es_lblk <= hole_start) - return; - hole_len = min(es.es_lblk - hole_start, hole_len); - } - ext_debug(inode, " -> %u:%u\n", hole_start, hole_len); - ext4_es_insert_extent(inode, hole_start, hole_len, ~0, - EXTENT_STATUS_HOLE); -} - -/* * ext4_ext_rm_idx: * removes index from the index block. */ @@ -4062,6 +4038,72 @@ static int get_implied_cluster_alloc(struct super_block *sb, return 0; } +/* + * Determine hole length around the given logical block, first try to + * locate and expand the hole from the given @path, and then adjust it + * if it's partially or completely converted to delayed extents, insert + * it into the extent cache tree if it's indeed a hole, finally return + * the length of the determined extent. + */ +static ext4_lblk_t ext4_ext_determine_insert_hole(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t lblk) +{ + ext4_lblk_t hole_start, len; + struct extent_status es; + + hole_start = lblk; + len = ext4_ext_find_hole(inode, path, &hole_start); +again: + ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start, + hole_start + len - 1, &es); + if (!es.es_len) + goto insert_hole; + + /* + * There's a delalloc extent in the hole, handle it if the delalloc + * extent is in front of, behind and straddle the queried range. + */ + if (lblk >= es.es_lblk + es.es_len) { + /* + * The delalloc extent is in front of the queried range, + * find again from the queried start block. + */ + len -= lblk - hole_start; + hole_start = lblk; + goto again; + } else if (in_range(lblk, es.es_lblk, es.es_len)) { + /* + * The delalloc extent containing lblk, it must have been + * added after ext4_map_blocks() checked the extent status + * tree so we are not holding i_rwsem and delalloc info is + * only stabilized by i_data_sem we are going to release + * soon. Don't modify the extent status tree and report + * extent as a hole, just adjust the length to the delalloc + * extent's after lblk. + */ + len = es.es_lblk + es.es_len - lblk; + return len; + } else { + /* + * The delalloc extent is partially or completely behind + * the queried range, update hole length until the + * beginning of the delalloc extent. 
+ */ + len = min(es.es_lblk - hole_start, len); + } + +insert_hole: + /* Put just found gap into cache to speed up subsequent requests */ + ext_debug(inode, " -> %u:%u\n", hole_start, len); + ext4_es_insert_extent(inode, hole_start, len, ~0, EXTENT_STATUS_HOLE); + + /* Update hole_len to reflect hole size after lblk */ + if (hole_start != lblk) + len -= lblk - hole_start; + + return len; +} /* * Block allocation/map/preallocation routine for extents based files @@ -4179,22 +4221,12 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * we couldn't try to create block if create flag is zero */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { - ext4_lblk_t hole_start, hole_len; + ext4_lblk_t len; - hole_start = map->m_lblk; - hole_len = ext4_ext_determine_hole(inode, path, &hole_start); - /* - * put just found gap into cache to speed up - * subsequent requests - */ - ext4_ext_put_gap_in_cache(inode, hole_start, hole_len); + len = ext4_ext_determine_insert_hole(inode, path, map->m_lblk); - /* Update hole_len to reflect hole size after map->m_lblk */ - if (hole_start != map->m_lblk) - hole_len -= map->m_lblk - hole_start; map->m_pblk = 0; - map->m_len = min_t(unsigned int, map->m_len, hole_len); - + map->m_len = min_t(unsigned int, map->m_len, len); goto out; } @@ -4313,7 +4345,7 @@ got_allocated_blocks: * not a good idea to call discard here directly, * but otherwise we'd need to call it every free(). */ - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE; ext4_free_blocks(handle, inode, NULL, newblock, @@ -5357,7 +5389,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, punch_start, EXT_MAX_BLOCKS - punch_start); ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1); @@ -5365,7 +5397,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); ret = ext4_ext_shift_extents(inode, handle, punch_stop, punch_stop - punch_start, SHIFT_LEFT); @@ -5497,7 +5529,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) goto out_stop; down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); path = ext4_find_extent(inode, offset_lblk, NULL, 0); if (IS_ERR(path)) { diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 6aa15dafc677..54d6ff22585c 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -174,7 +174,7 @@ static int ext4_release_file(struct inode *inode, struct file *filp) (atomic_read(&inode->i_writecount) == 1) && !EXT4_I(inode)->i_reserved_data_blocks) { down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); } if (is_dx(inode) && filp->private_data) diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c index 11e6f33677a2..df853c4d3a8c 100644 --- a/fs/ext4/fsmap.c +++ b/fs/ext4/fsmap.c @@ -576,9 +576,9 @@ static bool ext4_getfsmap_is_valid_device(struct super_block *sb, if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX || fm->fmr_device == new_encode_dev(sb->s_bdev->bd_dev)) return true; - if 
(EXT4_SB(sb)->s_journal_bdev_handle && + if (EXT4_SB(sb)->s_journal_bdev_file && fm->fmr_device == - new_encode_dev(EXT4_SB(sb)->s_journal_bdev_handle->bdev->bd_dev)) + new_encode_dev(file_bdev(EXT4_SB(sb)->s_journal_bdev_file)->bd_dev)) return true; return false; } @@ -648,9 +648,9 @@ int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head, memset(handlers, 0, sizeof(handlers)); handlers[0].gfd_dev = new_encode_dev(sb->s_bdev->bd_dev); handlers[0].gfd_fn = ext4_getfsmap_datadev; - if (EXT4_SB(sb)->s_journal_bdev_handle) { + if (EXT4_SB(sb)->s_journal_bdev_file) { handlers[1].gfd_dev = new_encode_dev( - EXT4_SB(sb)->s_journal_bdev_handle->bdev->bd_dev); + file_bdev(EXT4_SB(sb)->s_journal_bdev_file)->bd_dev); handlers[1].gfd_fn = ext4_getfsmap_logdev; } diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index a9f3716119d3..d8ca7f64f952 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -714,7 +714,7 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode, * i_rwsem. So we can safely drop the i_data_sem here. */ BUG_ON(EXT4_JOURNAL(inode) == NULL); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); *dropped = 1; return 0; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5af1b0b8680e..2ccf3b5e3a7c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -371,7 +371,7 @@ void ext4_da_update_reserve_space(struct inode *inode, */ if ((ei->i_reserved_data_blocks == 0) && !inode_is_open_for_write(inode)) - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); } static int __check_block_validity(struct inode *inode, const char *func, @@ -515,6 +515,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, map->m_len = retval; } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { map->m_pblk = 0; + map->m_flags |= ext4_es_is_delayed(&es) ? + EXT4_MAP_DELAYED : 0; retval = es.es_len - (map->m_lblk - es.es_lblk); if (retval > map->m_len) retval = map->m_len; @@ -1703,11 +1705,8 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) { - if (ext4_es_is_hole(&es)) { - retval = 0; - down_read(&EXT4_I(inode)->i_data_sem); + if (ext4_es_is_hole(&es)) goto add_delayed; - } /* * Delayed extent could be allocated by fallocate. @@ -1749,26 +1748,11 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, retval = ext4_ext_map_blocks(NULL, inode, map, 0); else retval = ext4_ind_map_blocks(NULL, inode, map, 0); - -add_delayed: - if (retval == 0) { - int ret; - - /* - * XXX: __block_prepare_write() unmaps passed block, - * is it OK? 
- */ - - ret = ext4_insert_delayed_block(inode, map->m_lblk); - if (ret != 0) { - retval = ret; - goto out_unlock; - } - - map_bh(bh, inode->i_sb, invalid_block); - set_buffer_new(bh); - set_buffer_delay(bh); - } else if (retval > 0) { + if (retval < 0) { + up_read(&EXT4_I(inode)->i_data_sem); + return retval; + } + if (retval > 0) { unsigned int status; if (unlikely(retval != map->m_len)) { @@ -1783,11 +1767,21 @@ add_delayed: EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status); + up_read(&EXT4_I(inode)->i_data_sem); + return retval; } + up_read(&EXT4_I(inode)->i_data_sem); -out_unlock: - up_read((&EXT4_I(inode)->i_data_sem)); +add_delayed: + down_write(&EXT4_I(inode)->i_data_sem); + retval = ext4_insert_delayed_block(inode, map->m_lblk); + up_write(&EXT4_I(inode)->i_data_sem); + if (retval) + return retval; + map_bh(bh, inode->i_sb, invalid_block); + set_buffer_new(bh); + set_buffer_delay(bh); return retval; } @@ -3268,6 +3262,9 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, iomap->addr = (u64) map->m_pblk << blkbits; if (flags & IOMAP_DAX) iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off; + } else if (map->m_flags & EXT4_MAP_DELAYED) { + iomap->type = IOMAP_DELALLOC; + iomap->addr = IOMAP_NULL_ADDR; } else { iomap->type = IOMAP_HOLE; iomap->addr = IOMAP_NULL_ADDR; @@ -3430,35 +3427,11 @@ const struct iomap_ops ext4_iomap_overwrite_ops = { .iomap_end = ext4_iomap_end, }; -static bool ext4_iomap_is_delalloc(struct inode *inode, - struct ext4_map_blocks *map) -{ - struct extent_status es; - ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1; - - ext4_es_find_extent_range(inode, &ext4_es_is_delayed, - map->m_lblk, end, &es); - - if (!es.es_len || es.es_lblk > end) - return false; - - if (es.es_lblk > map->m_lblk) { - map->m_len = es.es_lblk - map->m_lblk; - return false; - } - - offset = map->m_lblk - es.es_lblk; - map->m_len = es.es_len - offset; - - return true; -} - static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { int ret; - bool delalloc = false; struct ext4_map_blocks map; u8 blkbits = inode->i_blkbits; @@ -3499,13 +3472,8 @@ static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) return ret; - if (ret == 0) - delalloc = ext4_iomap_is_delalloc(inode, &map); - set_iomap: ext4_set_iomap(inode, iomap, &map, offset, length, flags); - if (delalloc && iomap->type == IOMAP_HOLE) - iomap->type = IOMAP_DELALLOC; return 0; } @@ -4015,12 +3983,12 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) /* If there are blocks to remove, do it */ if (stop_block > first_block) { + ext4_lblk_t hole_len = stop_block - first_block; down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); - ext4_es_remove_extent(inode, first_block, - stop_block - first_block); + ext4_es_remove_extent(inode, first_block, hole_len); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_remove_space(inode, first_block, @@ -4029,6 +3997,8 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) ret = ext4_ind_remove_space(handle, inode, first_block, stop_block); + ext4_es_insert_extent(inode, first_block, hole_len, ~0, + EXTENT_STATUS_HOLE); up_write(&EXT4_I(inode)->i_data_sem); } ext4_fc_track_range(handle, inode, first_block, stop_block); 
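[Editor's note] The fs/ext4/inode.c hunks above teach ext4_map_blocks() to report delayed-allocation ranges directly via the new EXT4_MAP_DELAYED flag, which ext4_set_iomap() translates into IOMAP_DELALLOC, replacing the old ext4_iomap_is_delalloc() extent-status lookup. One way to observe the reported type from userspace is FIEMAP, which (through the iomap reporting path) renders delalloc ranges as FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN. Below is a minimal, hedged sketch — the file name is arbitrary, error handling is minimal, and the extent may already show as allocated if writeback happens to run before the ioctl:

/* fiemap_delalloc.c: observe a delayed-allocation extent on ext4.
 * Hedged sketch; "testfile" and the 16-extent buffer are arbitrary choices. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(void)
{
	int fd = open("testfile", O_RDWR | O_CREAT | O_TRUNC, 0644);
	char buf[4096];
	struct fiemap *fm;
	unsigned int i;

	if (fd < 0) { perror("open"); return 1; }
	memset(buf, 'x', sizeof(buf));
	/* Buffered write: with delalloc, no block is allocated yet. */
	if (write(fd, buf, sizeof(buf)) != sizeof(buf)) { perror("write"); return 1; }

	fm = calloc(1, sizeof(*fm) + 16 * sizeof(struct fiemap_extent));
	if (!fm) return 1;
	fm->fm_length = FIEMAP_MAX_OFFSET;
	fm->fm_extent_count = 16;
	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) { perror("FS_IOC_FIEMAP"); return 1; }

	for (i = 0; i < fm->fm_mapped_extents; i++) {
		struct fiemap_extent *fe = &fm->fm_extents[i];

		printf("logical %llu len %llu flags 0x%x%s\n",
		       (unsigned long long)fe->fe_logical,
		       (unsigned long long)fe->fe_length, fe->fe_flags,
		       (fe->fe_flags & FIEMAP_EXTENT_DELALLOC) ? " (delalloc)" : "");
	}
	free(fm);
	close(fd);
	return 0;
}

Run right after the buffered write, this typically prints a delalloc extent; after an fsync(fd) the same range is reported as an ordinary allocated extent instead.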
@@ -4170,7 +4140,7 @@ int ext4_truncate(struct inode *inode) down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) err = ext4_ext_truncate(handle, inode); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index aa6be510eb8f..7160a71044c8 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -467,7 +467,7 @@ static long swap_inode_boot_loader(struct super_block *sb, ext4_reset_inode_seed(inode); ext4_reset_inode_seed(inode_bl); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); err = ext4_mark_inode_dirty(handle, inode); if (err < 0) { diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index f44f668e407f..e4f7cf9d89c4 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -564,14 +564,14 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += EXT4_C2B(EXT4_SB(sb), first + i); + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, inode ? inode->i_ino : 0, blocknr, "freeing block already freed " "(bit %u)", first + i); - ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, - EXT4_GROUP_INFO_BBITMAP_CORRUPT); } mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); } @@ -677,7 +677,7 @@ do { \ } \ } while (0) -static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, +static void __mb_check_buddy(struct ext4_buddy *e4b, char *file, const char *function, int line) { struct super_block *sb = e4b->bd_sb; @@ -696,7 +696,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, void *buddy2; if (e4b->bd_info->bb_check_counter++ % 10) - return 0; + return; while (order > 1) { buddy = mb_find_buddy(e4b, order, &max); @@ -758,7 +758,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, grp = ext4_get_group_info(sb, e4b->bd_group); if (!grp) - return NULL; + return; list_for_each(cur, &grp->bb_prealloc_list) { ext4_group_t groupnr; struct ext4_prealloc_space *pa; @@ -768,7 +768,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, for (i = 0; i < pa->pa_len; i++) MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); } - return 0; } #undef MB_CHECK_ASSERT #define mb_check_buddy(e4b) __mb_check_buddy(e4b, \ @@ -842,7 +841,7 @@ mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) struct ext4_sb_info *sbi = EXT4_SB(sb); int new_order; - if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0) + if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_fragments == 0) return; new_order = mb_avg_fragment_size_order(sb, @@ -871,7 +870,7 @@ mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) * cr level needs an update. */ static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context *ac, - enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) + enum criteria *new_cr, ext4_group_t *group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *iter; @@ -945,7 +944,7 @@ ext4_mb_find_good_group_avg_frag_lists(struct ext4_allocation_context *ac, int o * order. Updates *new_cr if cr level needs an update. 
*/ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *ac, - enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) + enum criteria *new_cr, ext4_group_t *group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *grp = NULL; @@ -990,7 +989,7 @@ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context * * much and fall to CR_GOAL_LEN_SLOW in that case. */ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context *ac, - enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) + enum criteria *new_cr, ext4_group_t *group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *grp = NULL; @@ -1125,11 +1124,11 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, } if (*new_cr == CR_POWER2_ALIGNED) { - ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group, ngroups); + ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group); } else if (*new_cr == CR_GOAL_LEN_FAST) { - ext4_mb_choose_next_group_goal_fast(ac, new_cr, group, ngroups); + ext4_mb_choose_next_group_goal_fast(ac, new_cr, group); } else if (*new_cr == CR_BEST_AVAIL_LEN) { - ext4_mb_choose_next_group_best_avail(ac, new_cr, group, ngroups); + ext4_mb_choose_next_group_best_avail(ac, new_cr, group); } else { /* * TODO: For CR=2, we can arrange groups in an rb tree sorted by @@ -1233,6 +1232,24 @@ void ext4_mb_generate_buddy(struct super_block *sb, atomic64_add(period, &sbi->s_mb_generation_time); } +static void mb_regenerate_buddy(struct ext4_buddy *e4b) +{ + int count; + int order = 1; + void *buddy; + + while ((buddy = mb_find_buddy(e4b, order++, &count))) + mb_set_bits(buddy, 0, count); + + e4b->bd_info->bb_fragments = 0; + memset(e4b->bd_info->bb_counters, 0, + sizeof(*e4b->bd_info->bb_counters) * + (e4b->bd_sb->s_blocksize_bits + 2)); + + ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy, + e4b->bd_bitmap, e4b->bd_group, e4b->bd_info); +} + /* The buddy information is attached the buddy cache inode * for convenience. The information regarding each group * is loaded via ext4_mb_load_buddy. The information involve @@ -1891,11 +1908,6 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, mb_check_buddy(e4b); mb_free_blocks_double(inode, e4b, first, count); - this_cpu_inc(discard_pa_seq); - e4b->bd_info->bb_free += count; - if (first < e4b->bd_info->bb_first_free) - e4b->bd_info->bb_first_free = first; - /* access memory sequentially: check left neighbour, * clear range and then check right neighbour */ @@ -1909,21 +1921,31 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t blocknr; + /* + * Fastcommit replay can free already freed blocks which + * corrupts allocation info. Regenerate it. + */ + if (sbi->s_mount_state & EXT4_FC_REPLAY) { + mb_regenerate_buddy(e4b); + goto check; + } + blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += EXT4_C2B(sbi, block); - if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { - ext4_grp_locked_error(sb, e4b->bd_group, - inode ? inode->i_ino : 0, - blocknr, - "freeing already freed block (bit %u); block bitmap corrupt.", - block); - ext4_mark_group_bitmap_corrupted( - sb, e4b->bd_group, + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); - } - goto done; + ext4_grp_locked_error(sb, e4b->bd_group, + inode ? 
inode->i_ino : 0, blocknr, + "freeing already freed block (bit %u); block bitmap corrupt.", + block); + return; } + this_cpu_inc(discard_pa_seq); + e4b->bd_info->bb_free += count; + if (first < e4b->bd_info->bb_first_free) + e4b->bd_info->bb_first_free = first; + /* let's maintain fragments counter */ if (left_is_free && right_is_free) e4b->bd_info->bb_fragments--; @@ -1948,9 +1970,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, if (first <= last) mb_buddy_mark_free(e4b, first >> 1, last >> 1); -done: mb_set_largest_free_order(sb, e4b->bd_info); mb_update_avg_fragment_size(sb, e4b->bd_info); +check: mb_check_buddy(e4b); } @@ -2276,6 +2298,9 @@ void ext4_mb_try_best_found(struct ext4_allocation_context *ac, return; ext4_lock_group(ac->ac_sb, group); + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) + goto out; + max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex); if (max > 0) { @@ -2283,6 +2308,7 @@ void ext4_mb_try_best_found(struct ext4_allocation_context *ac, ext4_mb_use_best_found(ac, e4b); } +out: ext4_unlock_group(ac->ac_sb, group); ext4_mb_unload_buddy(e4b); } @@ -2309,12 +2335,10 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, if (err) return err; - if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) { - ext4_mb_unload_buddy(e4b); - return 0; - } - ext4_lock_group(ac->ac_sb, group); + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) + goto out; + max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, ac->ac_g_ex.fe_len, &ex); ex.fe_logical = 0xDEADFA11; /* debug value */ @@ -2347,6 +2371,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } +out: ext4_unlock_group(ac->ac_sb, group); ext4_mb_unload_buddy(e4b); @@ -2380,12 +2405,12 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, k = mb_find_next_zero_bit(buddy, max, 0); if (k >= max) { + ext4_mark_group_bitmap_corrupted(ac->ac_sb, + e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0, "%d free clusters of order %d. But found 0", grp->bb_counters[i], i); - ext4_mark_group_bitmap_corrupted(ac->ac_sb, - e4b->bd_group, - EXT4_GROUP_INFO_BBITMAP_CORRUPT); break; } ac->ac_found++; @@ -2436,12 +2461,12 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, * free blocks even though group info says we * have free blocks */ + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, "%d free clusters as per " "group info. But bitmap says 0", free); - ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, - EXT4_GROUP_INFO_BBITMAP_CORRUPT); break; } @@ -2467,12 +2492,12 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, if (WARN_ON(ex.fe_len <= 0)) break; if (free < ex.fe_len) { + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, "%d free clusters as per " "group info. But got %d blocks", free, ex.fe_len); - ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, - EXT4_GROUP_INFO_BBITMAP_CORRUPT); /* * The number of free blocks differs. This mostly * indicate that the bitmap is corrupt. 
So exit @@ -3725,7 +3750,7 @@ static int ext4_mb_cleanup_pa(struct ext4_group_info *grp) return count; } -int ext4_mb_release(struct super_block *sb) +void ext4_mb_release(struct super_block *sb) { ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; @@ -3801,8 +3826,6 @@ int ext4_mb_release(struct super_block *sb) } free_percpu(sbi->s_locality_groups); - - return 0; } static inline int ext4_issue_discard(struct super_block *sb, @@ -5284,7 +5307,7 @@ static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac) * the caller MUST hold group/inode locks. * TODO: optimize the case when there are no in-core structures yet */ -static noinline_for_stack int +static noinline_for_stack void ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, struct ext4_prealloc_space *pa) { @@ -5334,11 +5357,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, */ } atomic_add(free, &sbi->s_mb_discarded); - - return 0; } -static noinline_for_stack int +static noinline_for_stack void ext4_mb_release_group_pa(struct ext4_buddy *e4b, struct ext4_prealloc_space *pa) { @@ -5352,13 +5373,11 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, if (unlikely(group != e4b->bd_group && pa->pa_len != 0)) { ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu", e4b->bd_group, group, pa->pa_pstart); - return 0; + return; } mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); - - return 0; } /* @@ -5479,7 +5498,7 @@ out_dbg: * * FIXME!! Make sure it is valid at all the call sites */ -void ext4_discard_preallocations(struct inode *inode, unsigned int needed) +void ext4_discard_preallocations(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct super_block *sb = inode->i_sb; @@ -5491,9 +5510,8 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed) struct rb_node *iter; int err; - if (!S_ISREG(inode->i_mode)) { + if (!S_ISREG(inode->i_mode)) return; - } if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) return; @@ -5501,15 +5519,12 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed) mb_debug(sb, "discard preallocation for inode %lu\n", inode->i_ino); trace_ext4_discard_preallocations(inode, - atomic_read(&ei->i_prealloc_active), needed); - - if (needed == 0) - needed = UINT_MAX; + atomic_read(&ei->i_prealloc_active)); repeat: /* first, collect all pa's in the inode */ write_lock(&ei->i_prealloc_lock); - for (iter = rb_first(&ei->i_prealloc_node); iter && needed; + for (iter = rb_first(&ei->i_prealloc_node); iter; iter = rb_next(iter)) { pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); @@ -5533,7 +5548,6 @@ repeat: spin_unlock(&pa->pa_lock); rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); list_add(&pa->u.pa_tmp_list, &list); - needed--; continue; } @@ -5943,7 +5957,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) /* * release all resource we used in allocation */ -static int ext4_mb_release_context(struct ext4_allocation_context *ac) +static void ext4_mb_release_context(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *pa = ac->ac_pa; @@ -5980,7 +5994,6 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) mutex_unlock(&ac->ac_lg->lg_mutex); ext4_mb_collect_stats(ac); - return 0; } 
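[Editor's note] Several mballoc hunks above (mb_free_blocks_double, ext4_mb_simple_scan_group, ext4_mb_complex_scan_group) move the ext4_mark_group_bitmap_corrupted() call ahead of ext4_grp_locked_error(), and ext4_mb_try_best_found()/ext4_mb_find_by_goal() now test EXT4_MB_GRP_BBITMAP_CORRUPT under the group lock. The sketch below distills that ordering in plain C11; every name in it (group_info, GRP_BBITMAP_CORRUPT, report_locked_error) is a hypothetical stand-in, not the kernel API:

/* Distilled "flag first, report second" illustration; not kernel code. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct group_info {
	atomic_uint state;	/* bit 0: block bitmap corrupt */
};
#define GRP_BBITMAP_CORRUPT	1u

/* Publish the corruption flag first so concurrent allocators skip the
 * group immediately, even while the (potentially slow) report runs. */
static void mark_group_corrupted(struct group_info *grp)
{
	atomic_fetch_or_explicit(&grp->state, GRP_BBITMAP_CORRUPT,
				 memory_order_release);
}

static bool group_is_corrupted(struct group_info *grp)
{
	return atomic_load_explicit(&grp->state, memory_order_acquire) &
	       GRP_BBITMAP_CORRUPT;
}

/* Stand-in for ext4_grp_locked_error(): logging, fsck hinting, possibly
 * aborting the journal -- none of it cheap. */
static void report_locked_error(const char *msg)
{
	fprintf(stderr, "group error: %s\n", msg);
}

static void on_double_free(struct group_info *grp)
{
	mark_group_corrupted(grp);	/* first: fence off the group */
	report_locked_error("freeing already freed block");	/* then: report */
}

int main(void)
{
	struct group_info grp;

	atomic_init(&grp.state, 0);
	on_double_free(&grp);
	printf("allocator skips group: %s\n",
	       group_is_corrupted(&grp) ? "yes" : "no");
	return 0;
}

The design point, as far as the hunks show it: the flag is the cheap, immediately visible signal, so publishing it before the error report narrows the window in which other scanners keep working against a corrupt group.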
static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) @@ -6761,6 +6774,9 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) bool set_trimmed = false; void *bitmap; + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) + return 0; + last = ext4_last_grp_cluster(sb, e4b->bd_group); bitmap = e4b->bd_bitmap; if (start == 0 && max >= last) diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index d7aeb5da7d86..56938532b4ce 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -192,7 +192,6 @@ struct ext4_allocation_context { */ ext4_grpblk_t ac_orig_goal_len; - __u32 ac_groups_considered; __u32 ac_flags; /* allocation hints */ __u16 ac_groups_scanned; __u16 ac_groups_linear_remaining; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 3aa57376d9c2..7cd4afa4de1d 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -618,6 +618,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, goto out; o_end = o_start + len; + *moved_len = 0; while (o_start < o_end) { struct ext4_extent *ex; ext4_lblk_t cur_blk, next_blk; @@ -672,7 +673,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, */ ext4_double_up_write_data_sem(orig_inode, donor_inode); /* Swap original branches with new branches */ - move_extent_per_page(o_filp, donor_inode, + *moved_len += move_extent_per_page(o_filp, donor_inode, orig_page_index, donor_page_index, offset_in_page, cur_len, unwritten, &ret); @@ -682,14 +683,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, o_start += cur_len; d_start += cur_len; } - *moved_len = o_start - orig_blk; - if (*moved_len > len) - *moved_len = len; out: if (*moved_len) { - ext4_discard_preallocations(orig_inode, 0); - ext4_discard_preallocations(donor_inode, 0); + ext4_discard_preallocations(orig_inode); + ext4_discard_preallocations(donor_inode); } ext4_free_ext_path(path); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 05b647e6bc19..5e4f65c14dfb 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1762,7 +1762,6 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir, struct buffer_head *bh; err = ext4_fname_prepare_lookup(dir, dentry, &fname); - generic_set_encrypted_ci_d_ops(dentry); if (err == -ENOENT) return NULL; if (err) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dcba0f85dfe2..59c72b6dd153 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1359,14 +1359,14 @@ static void ext4_put_super(struct super_block *sb) sync_blockdev(sb->s_bdev); invalidate_bdev(sb->s_bdev); - if (sbi->s_journal_bdev_handle) { + if (sbi->s_journal_bdev_file) { /* * Invalidate the journal device's buffers. We don't want them * floating about in memory - the physical journal device may * hotswapped, and it breaks the `ro-after' testing code. 
*/ - sync_blockdev(sbi->s_journal_bdev_handle->bdev); - invalidate_bdev(sbi->s_journal_bdev_handle->bdev); + sync_blockdev(file_bdev(sbi->s_journal_bdev_file)); + invalidate_bdev(file_bdev(sbi->s_journal_bdev_file)); } ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); @@ -1500,8 +1500,7 @@ static int __init init_inodecache(void) { ext4_inode_cachep = kmem_cache_create_usercopy("ext4_inode_cache", sizeof(struct ext4_inode_info), 0, - (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| - SLAB_ACCOUNT), + SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, offsetof(struct ext4_inode_info, i_data), sizeof_field(struct ext4_inode_info, i_data), init_once); @@ -1525,7 +1524,7 @@ void ext4_clear_inode(struct inode *inode) ext4_fc_del(inode); invalidate_inode_buffers(inode); clear_inode(inode); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); dquot_drop(inode); if (EXT4_I(inode)->jinode) { @@ -4233,7 +4232,7 @@ int ext4_calculate_overhead(struct super_block *sb) * Add the internal journal blocks whether the journal has been * loaded or not */ - if (sbi->s_journal && !sbi->s_journal_bdev_handle) + if (sbi->s_journal && !sbi->s_journal_bdev_file) overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len); else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) { /* j_inum for internal journal is non-zero */ @@ -5346,7 +5345,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) sb->s_qcop = &ext4_qctl_operations; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif - memcpy(&sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); + super_set_uuid(sb, es->s_uuid, sizeof(es->s_uuid)); INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); @@ -5484,6 +5483,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) goto failed_mount4; } + generic_set_sb_d_ops(sb); sb->s_root = d_make_root(root); if (!sb->s_root) { ext4_msg(sb, KERN_ERR, "get root dentry failed"); @@ -5670,9 +5670,9 @@ failed_mount: #endif fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); brelse(sbi->s_sbh); - if (sbi->s_journal_bdev_handle) { - invalidate_bdev(sbi->s_journal_bdev_handle->bdev); - bdev_release(sbi->s_journal_bdev_handle); + if (sbi->s_journal_bdev_file) { + invalidate_bdev(file_bdev(sbi->s_journal_bdev_file)); + fput(sbi->s_journal_bdev_file); } out_fail: invalidate_bdev(sb->s_bdev); @@ -5842,30 +5842,30 @@ static journal_t *ext4_open_inode_journal(struct super_block *sb, return journal; } -static struct bdev_handle *ext4_get_journal_blkdev(struct super_block *sb, +static struct file *ext4_get_journal_blkdev(struct super_block *sb, dev_t j_dev, ext4_fsblk_t *j_start, ext4_fsblk_t *j_len) { struct buffer_head *bh; struct block_device *bdev; - struct bdev_handle *bdev_handle; + struct file *bdev_file; int hblock, blocksize; ext4_fsblk_t sb_block; unsigned long offset; struct ext4_super_block *es; int errno; - bdev_handle = bdev_open_by_dev(j_dev, + bdev_file = bdev_file_open_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES, sb, &fs_holder_ops); - if (IS_ERR(bdev_handle)) { + if (IS_ERR(bdev_file)) { ext4_msg(sb, KERN_ERR, "failed to open journal device unknown-block(%u,%u) %ld", - MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_handle)); - return bdev_handle; + MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_file)); + return bdev_file; } - bdev = bdev_handle->bdev; + bdev = file_bdev(bdev_file); blocksize = sb->s_blocksize; hblock = 
bdev_logical_block_size(bdev); if (blocksize < hblock) { @@ -5912,12 +5912,12 @@ static struct bdev_handle *ext4_get_journal_blkdev(struct super_block *sb, *j_start = sb_block + 1; *j_len = ext4_blocks_count(es); brelse(bh); - return bdev_handle; + return bdev_file; out_bh: brelse(bh); out_bdev: - bdev_release(bdev_handle); + fput(bdev_file); return ERR_PTR(errno); } @@ -5927,14 +5927,14 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb, journal_t *journal; ext4_fsblk_t j_start; ext4_fsblk_t j_len; - struct bdev_handle *bdev_handle; + struct file *bdev_file; int errno = 0; - bdev_handle = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len); - if (IS_ERR(bdev_handle)) - return ERR_CAST(bdev_handle); + bdev_file = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len); + if (IS_ERR(bdev_file)) + return ERR_CAST(bdev_file); - journal = jbd2_journal_init_dev(bdev_handle->bdev, sb->s_bdev, j_start, + journal = jbd2_journal_init_dev(file_bdev(bdev_file), sb->s_bdev, j_start, j_len, sb->s_blocksize); if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "failed to create device journal"); @@ -5949,14 +5949,14 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb, goto out_journal; } journal->j_private = sb; - EXT4_SB(sb)->s_journal_bdev_handle = bdev_handle; + EXT4_SB(sb)->s_journal_bdev_file = bdev_file; ext4_init_journal_params(sb, journal); return journal; out_journal: jbd2_journal_destroy(journal); out_bdev: - bdev_release(bdev_handle); + fput(bdev_file); return ERR_PTR(errno); } @@ -7314,12 +7314,12 @@ static inline int ext3_feature_set_ok(struct super_block *sb) static void ext4_kill_sb(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); - struct bdev_handle *handle = sbi ? sbi->s_journal_bdev_handle : NULL; + struct file *bdev_file = sbi ? sbi->s_journal_bdev_file : NULL; kill_block_super(sb); - if (handle) - bdev_release(handle); + if (bdev_file) + fput(bdev_file); } static struct file_system_type ext4_fs_type = { diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 75bf1f88843c..645240cc0229 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -92,10 +92,12 @@ static const char *ext4_get_link(struct dentry *dentry, struct inode *inode, if (!dentry) { bh = ext4_getblk(NULL, inode, 0, EXT4_GET_BLOCKS_CACHED_NOWAIT); - if (IS_ERR(bh)) - return ERR_CAST(bh); - if (!bh || !ext4_buffer_uptodate(bh)) + if (IS_ERR(bh) || !bh) return ERR_PTR(-ECHILD); + if (!ext4_buffer_uptodate(bh)) { + brelse(bh); + return ERR_PTR(-ECHILD); + } } else { bh = ext4_bread(NULL, inode, 0, 0); if (IS_ERR(bh)) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 65294e3b0bef..4c77e8ce5c75 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -24,6 +24,7 @@ #include <linux/blkdev.h> #include <linux/quotaops.h> #include <linux/part_stat.h> +#include <linux/rw_hint.h> #include <crypto/hash.h> #include <linux/fscrypt.h> @@ -1239,7 +1240,7 @@ struct f2fs_bio_info { #define FDEV(i) (sbi->devs[i]) #define RDEV(i) (raw_super->devs[i]) struct f2fs_dev_info { - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct block_device *bdev; char path[MAX_PATH_LEN]; unsigned int total_segments; @@ -3364,17 +3365,6 @@ static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi) return is_set_ckpt_flags(sbi, CP_ERROR_FLAG); } -static inline bool is_dot_dotdot(const u8 *name, size_t len) -{ - if (len == 1 && name[0] == '.') - return true; - - if (len == 2 && name[0] == '.' 
&& name[1] == '.') - return true; - - return false; -} - static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index b3bb815fc6aa..f7f63a567d86 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -531,7 +531,6 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, } err = f2fs_prepare_lookup(dir, dentry, &fname); - generic_set_encrypted_ci_d_ops(dentry); if (err == -ENOENT) goto out_splice; if (err) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4c8836ded90f..e1065ba70207 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1971,9 +1971,15 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, } if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) { + unsigned int nofs_flags; + int ret; + trace_f2fs_issue_reset_zone(bdev, blkstart); - return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, - sector, nr_sects, GFP_NOFS); + nofs_flags = memalloc_nofs_save(); + ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, + sector, nr_sects); + memalloc_nofs_restore(nofs_flags); + return ret; } __queue_zone_reset_cmd(sbi, bdev, blkstart, lblkstart, blklen); @@ -4865,6 +4871,7 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi, block_t zone_block, valid_block_cnt; unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT; int ret; + unsigned int nofs_flags; if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ) return 0; @@ -4912,8 +4919,10 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi, "pointer: valid block[0x%x,0x%x] cond[0x%x]", zone_segno, valid_block_cnt, zone->cond); + nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(fdev->bdev, REQ_OP_ZONE_FINISH, - zone->start, zone->len, GFP_NOFS); + zone->start, zone->len); + memalloc_nofs_restore(nofs_flags); if (ret == -EOPNOTSUPP) { ret = blkdev_issue_zeroout(fdev->bdev, zone->wp, zone->len - (zone->wp - zone->start), diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d45ab0992ae5..b880b746f226 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1605,7 +1605,7 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) for (i = 0; i < sbi->s_ndevs; i++) { if (i > 0) - bdev_release(FDEV(i).bdev_handle); + fput(FDEV(i).bdev_file); #ifdef CONFIG_BLK_DEV_ZONED kvfree(FDEV(i).blkz_seq); #endif @@ -4247,7 +4247,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) for (i = 0; i < max_devices; i++) { if (i == 0) - FDEV(0).bdev_handle = sbi->sb->s_bdev_handle; + FDEV(0).bdev_file = sbi->sb->s_bdev_file; else if (!RDEV(i).path[0]) break; @@ -4267,14 +4267,14 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) FDEV(i).end_blk = FDEV(i).start_blk + (FDEV(i).total_segments << sbi->log_blocks_per_seg) - 1; - FDEV(i).bdev_handle = bdev_open_by_path( + FDEV(i).bdev_file = bdev_file_open_by_path( FDEV(i).path, mode, sbi->sb, NULL); } } - if (IS_ERR(FDEV(i).bdev_handle)) - return PTR_ERR(FDEV(i).bdev_handle); + if (IS_ERR(FDEV(i).bdev_file)) + return PTR_ERR(FDEV(i).bdev_file); - FDEV(i).bdev = FDEV(i).bdev_handle->bdev; + FDEV(i).bdev = file_bdev(FDEV(i).bdev_file); /* to release errored devices */ sbi->s_ndevs = i + 1; @@ -4496,7 +4496,7 @@ try_onemore: sb->s_time_gran = 1; sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? 
SB_POSIXACL : 0); - memcpy(&sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid)); + super_set_uuid(sb, (void *) raw_super->uuid, sizeof(raw_super->uuid)); sb->s_iflags |= SB_I_CGROUPWB; /* init f2fs-specific super block info */ @@ -4660,6 +4660,7 @@ try_onemore: goto free_node_inode; } + generic_set_sb_d_ops(sb); sb->s_root = d_make_root(root); /* allocate root dentry */ if (!sb->s_root) { err = -ENOMEM; diff --git a/fs/fat/cache.c b/fs/fat/cache.c index 738e427e2d21..2af424e200b3 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -47,7 +47,7 @@ int __init fat_cache_init(void) { fat_cache_cachep = kmem_cache_create("fat_cache", sizeof(struct fat_cache), - 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + 0, SLAB_RECLAIM_ACCOUNT, init_once); if (fat_cache_cachep == NULL) return -ENOMEM; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 1fac3dabf130..d9e6fbb6f246 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -787,7 +787,7 @@ static int __init fat_init_inodecache(void) fat_inode_cachep = kmem_cache_create("fat_inode_cache", sizeof(struct msdos_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), + SLAB_ACCOUNT), init_once); if (fat_inode_cachep == NULL) return -ENOMEM; @@ -1762,6 +1762,9 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, else /* fat 16 or 12 */ sbi->vol_id = bpb.fat16_vol_id; + __le32 vol_id_le = cpu_to_le32(sbi->vol_id); + super_set_uuid(sb, (void *) &vol_id_le, sizeof(vol_id_le)); + sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry); sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1; diff --git a/fs/fcntl.c b/fs/fcntl.c index c80a6acad742..54cc85d3338e 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -27,6 +27,7 @@ #include <linux/memfd.h> #include <linux/compat.h> #include <linux/mount.h> +#include <linux/rw_hint.h> #include <linux/poll.h> #include <asm/siginfo.h> @@ -268,8 +269,15 @@ static int f_getowner_uids(struct file *filp, unsigned long arg) } #endif -static bool rw_hint_valid(enum rw_hint hint) +static bool rw_hint_valid(u64 hint) { + BUILD_BUG_ON(WRITE_LIFE_NOT_SET != RWH_WRITE_LIFE_NOT_SET); + BUILD_BUG_ON(WRITE_LIFE_NONE != RWH_WRITE_LIFE_NONE); + BUILD_BUG_ON(WRITE_LIFE_SHORT != RWH_WRITE_LIFE_SHORT); + BUILD_BUG_ON(WRITE_LIFE_MEDIUM != RWH_WRITE_LIFE_MEDIUM); + BUILD_BUG_ON(WRITE_LIFE_LONG != RWH_WRITE_LIFE_LONG); + BUILD_BUG_ON(WRITE_LIFE_EXTREME != RWH_WRITE_LIFE_EXTREME); + switch (hint) { case RWH_WRITE_LIFE_NOT_SET: case RWH_WRITE_LIFE_NONE: @@ -283,34 +291,40 @@ static bool rw_hint_valid(enum rw_hint hint) } } -static long fcntl_rw_hint(struct file *file, unsigned int cmd, - unsigned long arg) +static long fcntl_get_rw_hint(struct file *file, unsigned int cmd, + unsigned long arg) { struct inode *inode = file_inode(file); u64 __user *argp = (u64 __user *)arg; - enum rw_hint hint; - u64 h; + u64 hint = READ_ONCE(inode->i_write_hint); - switch (cmd) { - case F_GET_RW_HINT: - h = inode->i_write_hint; - if (copy_to_user(argp, &h, sizeof(*argp))) - return -EFAULT; - return 0; - case F_SET_RW_HINT: - if (copy_from_user(&h, argp, sizeof(h))) - return -EFAULT; - hint = (enum rw_hint) h; - if (!rw_hint_valid(hint)) - return -EINVAL; + if (copy_to_user(argp, &hint, sizeof(*argp))) + return -EFAULT; + return 0; +} - inode_lock(inode); - inode->i_write_hint = hint; - inode_unlock(inode); - return 0; - default: +static long fcntl_set_rw_hint(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct inode *inode = file_inode(file); + u64 __user *argp = (u64 __user *)arg; + u64 hint; + + 
if (copy_from_user(&hint, argp, sizeof(hint))) + return -EFAULT; + if (!rw_hint_valid(hint)) return -EINVAL; - } + + WRITE_ONCE(inode->i_write_hint, hint); + + /* + * file->f_mapping->host may differ from inode. As an example, + * blkdev_open() modifies file->f_mapping. + */ + if (file->f_mapping->host != inode) + WRITE_ONCE(file->f_mapping->host->i_write_hint, hint); + + return 0; } static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, @@ -416,8 +430,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, err = memfd_fcntl(filp, cmd, argi); break; case F_GET_RW_HINT: + err = fcntl_get_rw_hint(filp, cmd, arg); + break; case F_SET_RW_HINT: - err = fcntl_rw_hint(filp, cmd, arg); + err = fcntl_set_rw_hint(filp, cmd, arg); break; default: break; @@ -846,12 +862,6 @@ int send_sigurg(struct fown_struct *fown) static DEFINE_SPINLOCK(fasync_lock); static struct kmem_cache *fasync_cache __ro_after_init; -static void fasync_free_rcu(struct rcu_head *head) -{ - kmem_cache_free(fasync_cache, - container_of(head, struct fasync_struct, fa_rcu)); -} - /* * Remove a fasync entry. If successfully removed, return * positive and clear the FASYNC flag. If no entry exists, @@ -877,7 +887,7 @@ int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) write_unlock_irq(&fa->fa_lock); *fp = fa->fa_next; - call_rcu(&fa->fa_rcu, fasync_free_rcu); + kfree_rcu(fa, fa_rcu); filp->f_flags &= ~FASYNC; result = 1; break; diff --git a/fs/fhandle.c b/fs/fhandle.c index 18b3ba8dc8ea..57a12614addf 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -36,7 +36,7 @@ static long do_sys_name_to_handle(const struct path *path, if (f_handle.handle_bytes > MAX_HANDLE_SZ) return -EINVAL; - handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes, + handle = kzalloc(sizeof(struct file_handle) + f_handle.handle_bytes, GFP_KERNEL); if (!handle) return -ENOMEM; diff --git a/fs/file_table.c b/fs/file_table.c index b991f90571b4..4f03beed4737 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -26,7 +26,6 @@ #include <linux/percpu_counter.h> #include <linux/percpu.h> #include <linux/task_work.h> -#include <linux/ima.h> #include <linux/swap.h> #include <linux/kmemleak.h> @@ -276,21 +275,15 @@ struct file *alloc_empty_backing_file(int flags, const struct cred *cred) } /** - * alloc_file - allocate and initialize a 'struct file' + * file_init_path - initialize a 'struct file' based on path * + * @file: the file to set up * @path: the (dentry, vfsmount) pair for the new file - * @flags: O_... flags with which the new file will be opened * @fop: the 'struct file_operations' for the new file */ -static struct file *alloc_file(const struct path *path, int flags, - const struct file_operations *fop) +static void file_init_path(struct file *file, const struct path *path, + const struct file_operations *fop) { - struct file *file; - - file = alloc_empty_file(flags, current_cred()); - if (IS_ERR(file)) - return file; - file->f_path = *path; file->f_inode = path->dentry->d_inode; file->f_mapping = path->dentry->d_inode->i_mapping; @@ -309,22 +302,51 @@ static struct file *alloc_file(const struct path *path, int flags, file->f_op = fop; if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_inc(path->dentry->d_inode); +} + +/** + * alloc_file - allocate and initialize a 'struct file' + * + * @path: the (dentry, vfsmount) pair for the new file + * @flags: O_... 
flags with which the new file will be opened + * @fop: the 'struct file_operations' for the new file + */ +static struct file *alloc_file(const struct path *path, int flags, + const struct file_operations *fop) +{ + struct file *file; + + file = alloc_empty_file(flags, current_cred()); + if (!IS_ERR(file)) + file_init_path(file, path, fop); return file; } -struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt, - const char *name, int flags, - const struct file_operations *fops) +static inline int alloc_path_pseudo(const char *name, struct inode *inode, + struct vfsmount *mnt, struct path *path) { struct qstr this = QSTR_INIT(name, strlen(name)); + + path->dentry = d_alloc_pseudo(mnt->mnt_sb, &this); + if (!path->dentry) + return -ENOMEM; + path->mnt = mntget(mnt); + d_instantiate(path->dentry, inode); + return 0; +} + +struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt, + const char *name, int flags, + const struct file_operations *fops) +{ + int ret; struct path path; struct file *file; - path.dentry = d_alloc_pseudo(mnt->mnt_sb, &this); - if (!path.dentry) - return ERR_PTR(-ENOMEM); - path.mnt = mntget(mnt); - d_instantiate(path.dentry, inode); + ret = alloc_path_pseudo(name, inode, mnt, &path); + if (ret) + return ERR_PTR(ret); + file = alloc_file(&path, flags, fops); if (IS_ERR(file)) { ihold(inode); @@ -334,6 +356,30 @@ struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt, } EXPORT_SYMBOL(alloc_file_pseudo); +struct file *alloc_file_pseudo_noaccount(struct inode *inode, + struct vfsmount *mnt, const char *name, + int flags, + const struct file_operations *fops) +{ + int ret; + struct path path; + struct file *file; + + ret = alloc_path_pseudo(name, inode, mnt, &path); + if (ret) + return ERR_PTR(ret); + + file = alloc_empty_file_noaccount(flags, current_cred()); + if (IS_ERR(file)) { + ihold(inode); + path_put(&path); + return file; + } + file_init_path(file, &path, fops); + return file; +} +EXPORT_SYMBOL_GPL(alloc_file_pseudo_noaccount); + struct file *alloc_file_clone(struct file *base, int flags, const struct file_operations *fops) { @@ -367,7 +413,7 @@ static void __fput(struct file *file) eventpoll_release(file); locks_remove_file(file); - ima_file_free(file); + security_file_release(file); if (unlikely(file->f_flags & FASYNC)) { if (file->f_op->fasync) file->f_op->fasync(-1, file, 0); diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index e6e2a2185e7c..42e03b6b1cc7 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -307,7 +307,7 @@ vxfs_init(void) vxfs_inode_cachep = kmem_cache_create_usercopy("vxfs_inode", sizeof(struct vxfs_inode_info), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + SLAB_RECLAIM_ACCOUNT, offsetof(struct vxfs_inode_info, vii_immed.vi_immed), sizeof_field(struct vxfs_inode_info, vii_immed.vi_immed), diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 3d84fcc471c6..e4f17c53ddfc 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -141,6 +141,31 @@ static void wb_wakeup(struct bdi_writeback *wb) spin_unlock_irq(&wb->work_lock); } +/* + * This function is used when the first inode for this wb is marked dirty. It + * wakes up the corresponding bdi thread which should then take care of the + * periodic background write-out of dirty inodes. Since the write-out would + * start only 'dirty_writeback_interval' centisecs from now anyway, we just + * set up a timer which wakes the bdi thread up later.
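The unit here is easy to misread: dirty_writeback_interval is expressed in centiseconds, which is why the function body that follows multiplies by 10 before handing the value to msecs_to_jiffies(). A userspace sketch of that arithmetic, assuming the usual vm.dirty_writeback_interval default of 500:

    #include <stdio.h>

    /* Mirrors msecs_to_jiffies(dirty_writeback_interval * 10):
     * centiseconds become milliseconds; the milliseconds-to-jiffies
     * step in the kernel then depends on CONFIG_HZ. */
    static unsigned long wakeup_delay_ms(unsigned int centisecs)
    {
            return (unsigned long)centisecs * 10;
    }

    int main(void)
    {
            /* the default of 500 centiseconds yields a 5000 ms delay */
            printf("delay: %lu ms\n", wakeup_delay_ms(500));
            return 0;
    }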
+ * + * Note, we wouldn't bother setting up the timer, but this function is on the + * fast-path (used by '__mark_inode_dirty()'), so we save a few context switches + * by delaying the wake-up. + * + * We have to be careful not to postpone flush work if it is scheduled for + * earlier. Thus we use queue_delayed_work(). + */ +static void wb_wakeup_delayed(struct bdi_writeback *wb) +{ + unsigned long timeout; + + timeout = msecs_to_jiffies(dirty_writeback_interval * 10); + spin_lock_irq(&wb->work_lock); + if (test_bit(WB_registered, &wb->state)) + queue_delayed_work(bdi_wq, &wb->dwork, timeout); + spin_unlock_irq(&wb->work_lock); +} + static void finish_writeback_work(struct bdi_writeback *wb, struct wb_writeback_work *work) { diff --git a/fs/fs_parser.c b/fs/fs_parser.c index edb3712dcfa5..a4d6ca0b8971 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -83,8 +83,8 @@ static const struct fs_parameter_spec *fs_lookup_key( } /* - * fs_parse - Parse a filesystem configuration parameter - * @fc: The filesystem context to log errors through. + * __fs_parse - Parse a filesystem configuration parameter + * @log: The filesystem context to log errors through. * @desc: The parameter description to use. * @param: The parameter. * @result: Where to place the result of the parse diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 91e89e68177e..b6cad106c37e 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -474,8 +474,7 @@ err: static void cuse_fc_release(struct fuse_conn *fc) { - struct cuse_conn *cc = fc_to_cc(fc); - kfree_rcu(cc, fc.rcu); + kfree(fc_to_cc(fc)); } /** diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 148a71b8b4d0..c007b0f0c3a7 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2509,14 +2509,14 @@ static int convert_fuse_file_lock(struct fuse_conn *fc, * translate it into the caller's pid namespace. */ rcu_read_lock(); - fl->fl_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns); + fl->c.flc_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns); rcu_read_unlock(); break; default: return -EIO; } - fl->fl_type = ffl->type; + fl->c.flc_type = ffl->type; return 0; } @@ -2530,10 +2530,10 @@ static void fuse_lk_fill(struct fuse_args *args, struct file *file, memset(inarg, 0, sizeof(*inarg)); inarg->fh = ff->fh; - inarg->owner = fuse_lock_owner_id(fc, fl->fl_owner); + inarg->owner = fuse_lock_owner_id(fc, fl->c.flc_owner); inarg->lk.start = fl->fl_start; inarg->lk.end = fl->fl_end; - inarg->lk.type = fl->fl_type; + inarg->lk.type = fl->c.flc_type; inarg->lk.pid = pid; if (flock) inarg->lk_flags |= FUSE_LK_FLOCK; @@ -2570,8 +2570,8 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_lk_in inarg; - int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK; - struct pid *pid = fl->fl_type != F_UNLCK ? task_tgid(current) : NULL; + int opcode = (fl->c.flc_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK; + struct pid *pid = fl->c.flc_type != F_UNLCK ?
task_tgid(current) : NULL; pid_t pid_nr = pid_nr_ns(pid, fm->fc->pid_ns); int err; @@ -2581,7 +2581,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) } /* Unlock on close is handled by the flush method */ - if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX) + if ((fl->c.flc_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX) return 0; fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 1df83eebda92..bcbe34488862 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -888,6 +888,7 @@ struct fuse_mount { /* Entry on fc->mounts */ struct list_head fc_entry; + struct rcu_head rcu; }; static inline struct fuse_mount *get_fuse_mount_super(struct super_block *sb) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 2a6d44f91729..516ea2979a90 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -930,6 +930,14 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, } EXPORT_SYMBOL_GPL(fuse_conn_init); +static void delayed_release(struct rcu_head *p) +{ + struct fuse_conn *fc = container_of(p, struct fuse_conn, rcu); + + put_user_ns(fc->user_ns); + fc->release(fc); +} + void fuse_conn_put(struct fuse_conn *fc) { if (refcount_dec_and_test(&fc->count)) { @@ -941,13 +949,12 @@ void fuse_conn_put(struct fuse_conn *fc) if (fiq->ops->release) fiq->ops->release(fiq); put_pid_ns(fc->pid_ns); - put_user_ns(fc->user_ns); bucket = rcu_dereference_protected(fc->curr_bucket, 1); if (bucket) { WARN_ON(atomic_read(&bucket->count) != 1); kfree(bucket); } - fc->release(fc); + call_rcu(&fc->rcu, delayed_release); } } EXPORT_SYMBOL_GPL(fuse_conn_put); @@ -1366,7 +1373,7 @@ EXPORT_SYMBOL_GPL(fuse_send_init); void fuse_free_conn(struct fuse_conn *fc) { WARN_ON(!list_empty(&fc->devices)); - kfree_rcu(fc, rcu); + kfree(fc); } EXPORT_SYMBOL_GPL(fuse_free_conn); @@ -1902,7 +1909,7 @@ static void fuse_sb_destroy(struct super_block *sb) void fuse_mount_destroy(struct fuse_mount *fm) { fuse_conn_put(fm->fc); - kfree(fm); + kfree_rcu(fm, rcu); } EXPORT_SYMBOL(fuse_mount_destroy); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index d9ccfd27e4f1..789af5c8fade 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -2465,7 +2465,7 @@ out: } static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode, - loff_t offset) + loff_t offset, unsigned int len) { int ret; diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index 177f1f41f225..2e215e8c3c88 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -32,25 +32,21 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags) { - struct dentry *parent = NULL; + struct dentry *parent; struct gfs2_sbd *sdp; struct gfs2_inode *dip; - struct inode *dinode, *inode; + struct inode *inode; struct gfs2_holder d_gh; struct gfs2_inode *ip = NULL; int error, valid = 0; int had_lock = 0; - if (flags & LOOKUP_RCU) { - dinode = d_inode_rcu(READ_ONCE(dentry->d_parent)); - if (!dinode) - return -ECHILD; - } else { - parent = dget_parent(dentry); - dinode = d_inode(parent); - } - sdp = GFS2_SB(dinode); - dip = GFS2_I(dinode); + if (flags & LOOKUP_RCU) + return -ECHILD; + + parent = dget_parent(dentry); + sdp = GFS2_SB(d_inode(parent)); + dip = GFS2_I(d_inode(parent)); inode = d_inode(dentry); if (inode) { @@ -66,8 +62,7 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags) had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL); if (!had_lock) { - error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, - flags & LOOKUP_RCU ? 
GL_NOBLOCK : 0, &d_gh); + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); if (error) goto out; } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 992ca4effb50..4c42ada60ae7 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1440,10 +1440,10 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); struct lm_lockstruct *ls = &sdp->sd_lockstruct; - if (!(fl->fl_flags & FL_POSIX)) + if (!(fl->c.flc_flags & FL_POSIX)) return -ENOLCK; if (gfs2_withdrawing_or_withdrawn(sdp)) { - if (fl->fl_type == F_UNLCK) + if (lock_is_unlock(fl)) locks_lock_file_wait(file, fl); return -EIO; } @@ -1451,7 +1451,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) return dlm_posix_cancel(ls->ls_dlm, ip->i_no_addr, file, fl); else if (IS_GETLK(cmd)) return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl); - else if (fl->fl_type == F_UNLCK) + else if (lock_is_unlock(fl)) return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl); else return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl); @@ -1483,7 +1483,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) int error = 0; int sleeptime; - state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; + state = lock_is_write(fl) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; flags = GL_EXACT | GL_NOPID; if (!IS_SETLKW(cmd)) flags |= LM_FLAG_TRY_1CB; @@ -1495,8 +1495,8 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) if (fl_gh->gh_state == state) goto out; locks_init_lock(&request); - request.fl_type = F_UNLCK; - request.fl_flags = FL_FLOCK; + request.c.flc_type = F_UNLCK; + request.c.flc_flags = FL_FLOCK; locks_lock_file_wait(file, &request); gfs2_glock_dq(fl_gh); gfs2_holder_reinit(state, flags, fl_gh); @@ -1557,10 +1557,10 @@ static void do_unflock(struct file *file, struct file_lock *fl) static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) { - if (!(fl->fl_flags & FL_FLOCK)) + if (!(fl->c.flc_flags & FL_FLOCK)) return -ENOLCK; - if (fl->fl_type == F_UNLCK) { + if (lock_is_unlock(fl)) { do_unflock(file, fl); return 0; } else { diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 6bfc9383b7b8..1b95db2c3aac 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1882,10 +1882,10 @@ int gfs2_permission(struct mnt_idmap *idmap, struct inode *inode, WARN_ON_ONCE(!may_not_block); return -ECHILD; } - if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { - int noblock = may_not_block ? 
GL_NOBLOCK : 0; - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, - LM_FLAG_ANY | noblock, &i_gh); + if (gfs2_glock_is_locked_by_me(gl) == NULL) { + if (may_not_block) + return -ECHILD; + error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (error) return error; } diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 79be0cdc730c..04cadc02e5a6 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -111,7 +111,6 @@ static int __init init_gfs2_fs(void) gfs2_inode_cachep = kmem_cache_create("gfs2_inode", sizeof(struct gfs2_inode), 0, SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD| SLAB_ACCOUNT, gfs2_init_inode_once); if (!gfs2_inode_cachep) diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 1281e60be639..572d58e86296 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -214,7 +214,7 @@ static void gfs2_sb_in(struct gfs2_sbd *sdp, const void *buf) memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN); memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN); - memcpy(&s->s_uuid, str->sb_uuid, 16); + super_set_uuid(s, str->sb_uuid, 16); } /** diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 7ededcb720c1..012a3d003fbe 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -190,6 +190,7 @@ struct hfsplus_sb_info { int work_queued; /* non-zero delayed work is queued */ struct delayed_work sync_work; /* FS sync delayed work */ spinlock_t work_lock; /* protects sync_work and work_queued */ + struct rcu_head rcu; }; #define HFSPLUS_SB_WRITEBACKUP 0 diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 1986b4f18a90..97920202790f 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -277,6 +277,14 @@ void hfsplus_mark_mdb_dirty(struct super_block *sb) spin_unlock(&sbi->work_lock); } +static void delayed_free(struct rcu_head *p) +{ + struct hfsplus_sb_info *sbi = container_of(p, struct hfsplus_sb_info, rcu); + + unload_nls(sbi->nls); + kfree(sbi); +} + static void hfsplus_put_super(struct super_block *sb) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); @@ -302,9 +310,7 @@ static void hfsplus_put_super(struct super_block *sb) hfs_btree_close(sbi->ext_tree); kfree(sbi->s_vhdr_buf); kfree(sbi->s_backup_vhdr_buf); - unload_nls(sbi->nls); - kfree(sb->s_fs_info); - sb->s_fs_info = NULL; + call_rcu(&sbi->rcu, delayed_free); } static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index b0cb70400996..ce9346099c72 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -30,7 +30,7 @@ struct hfsplus_wd { * @sector: block to read or write, for blocks of HFSPLUS_SECTOR_SIZE bytes * @buf: buffer for I/O * @data: output pointer for location of requested data - * @opf: request op flags + * @opf: I/O operation type and flags * * The unit of I/O is hfsplus_min_io_size(sb), which may be bigger than * HFSPLUS_SECTOR_SIZE, and @buf must be sized accordingly. 
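The hfsplus hunks above (and the fuse changes earlier in this diff) apply the same teardown pattern: embed a struct rcu_head in the per-superblock info and defer the final kfree() past an RCU grace period, so lockless RCU-walk lookups that still hold a pointer never race with the free. A condensed kernel-style sketch, with hypothetical foo_* names standing in for the hfsplus ones:

    #include <linux/fs.h>
    #include <linux/nls.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct foo_sb_info {
            struct nls_table *nls;
            struct rcu_head rcu;    /* new, as in hfsplus_sb_info */
    };

    static void foo_delayed_free(struct rcu_head *p)
    {
            struct foo_sb_info *sbi = container_of(p, struct foo_sb_info, rcu);

            unload_nls(sbi->nls);
            kfree(sbi);
    }

    static void foo_put_super(struct super_block *sb)
    {
            struct foo_sb_info *sbi = sb->s_fs_info;

            /* was: unload_nls() and kfree() inline; now deferred */
            call_rcu(&sbi->rcu, foo_delayed_free);
    }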
On reads diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 6b0ba3c1efba..314834a078e9 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -255,7 +255,7 @@ static int init_inodecache(void) hpfs_inode_cachep = kmem_cache_create("hpfs_inode_cache", sizeof(struct hpfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), + SLAB_ACCOUNT), init_once); if (hpfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index ea5b8e57d904..6502c7e776d1 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -100,6 +100,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) loff_t len, vma_len; int ret; struct hstate *h = hstate_file(file); + vm_flags_t vm_flags; /* * vma address alignment (but not the pgoff alignment) has @@ -141,10 +142,20 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); ret = -ENOMEM; + + vm_flags = vma->vm_flags; + /* + * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip + * reserving here. Note: only for SHM hugetlbfs file, the inode + * flag S_PRIVATE is set. + */ + if (inode->i_flags & S_PRIVATE) + vm_flags |= VM_NORESERVE; + if (!hugetlb_reserve_pages(inode, vma->vm_pgoff >> huge_page_order(h), len >> huge_page_shift(h), vma, - vma->vm_flags)) + vm_flags)) goto out; ret = 0; @@ -340,7 +351,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) } else { folio_unlock(folio); - if (!folio_test_has_hwpoisoned(folio)) + if (!folio_test_hwpoison(folio)) want = nr; else { /* @@ -922,7 +933,7 @@ static int hugetlbfs_setattr(struct mnt_idmap *idmap, unsigned int ia_valid = attr->ia_valid; struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); - error = setattr_prepare(&nop_mnt_idmap, dentry, attr); + error = setattr_prepare(idmap, dentry, attr); if (error) return error; @@ -939,7 +950,7 @@ static int hugetlbfs_setattr(struct mnt_idmap *idmap, hugetlb_vmtruncate(inode, newsize); } - setattr_copy(&nop_mnt_idmap, inode, attr); + setattr_copy(idmap, inode, attr); mark_inode_dirty(inode); return 0; } @@ -974,6 +985,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb, static struct lock_class_key hugetlbfs_i_mmap_rwsem_key; static struct inode *hugetlbfs_get_inode(struct super_block *sb, + struct mnt_idmap *idmap, struct inode *dir, umode_t mode, dev_t dev) { @@ -995,7 +1007,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); inode->i_ino = get_next_ino(); - inode_init_owner(&nop_mnt_idmap, inode, dir, mode); + inode_init_owner(idmap, inode, dir, mode); lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, &hugetlbfs_i_mmap_rwsem_key); inode->i_mapping->a_ops = &hugetlbfs_aops; @@ -1039,7 +1051,7 @@ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir, { struct inode *inode; - inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev); + inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode, dev); if (!inode) return -ENOSPC; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); @@ -1051,7 +1063,7 @@ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir, static int hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { - int retval = hugetlbfs_mknod(&nop_mnt_idmap, dir, dentry, + int retval = hugetlbfs_mknod(idmap, dir, dentry, mode | S_IFDIR, 0); if (!retval) inc_nlink(dir); @@ -1062,7 +1074,7 @@ static int hugetlbfs_create(struct 
mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - return hugetlbfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFREG, 0); + return hugetlbfs_mknod(idmap, dir, dentry, mode | S_IFREG, 0); } static int hugetlbfs_tmpfile(struct mnt_idmap *idmap, @@ -1071,7 +1083,7 @@ static int hugetlbfs_tmpfile(struct mnt_idmap *idmap, { struct inode *inode; - inode = hugetlbfs_get_inode(dir->i_sb, dir, mode | S_IFREG, 0); + inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode | S_IFREG, 0); if (!inode) return -ENOSPC; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); @@ -1083,10 +1095,11 @@ static int hugetlbfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { + const umode_t mode = S_IFLNK|S_IRWXUGO; struct inode *inode; int error = -ENOSPC; - inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0); + inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode, 0); if (inode) { int l = strlen(symname)+1; error = page_symlink(inode, symname, l); @@ -1354,6 +1367,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par { struct hugetlbfs_fs_context *ctx = fc->fs_private; struct fs_parse_result result; + struct hstate *h; char *rest; unsigned long ps; int opt; @@ -1398,11 +1412,12 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par case Opt_pagesize: ps = memparse(param->string, &rest); - ctx->hstate = size_to_hstate(ps); - if (!ctx->hstate) { + h = size_to_hstate(ps); + if (!h) { pr_err("Unsupported page size %lu MB\n", ps / SZ_1M); return -EINVAL; } + ctx->hstate = h; return 0; case Opt_min_size: @@ -1553,6 +1568,7 @@ static struct file_system_type hugetlbfs_fs_type = { .init_fs_context = hugetlbfs_init_fs_context, .parameters = hugetlb_fs_parameters, .kill_sb = kill_litter_super, + .fs_flags = FS_ALLOW_IDMAP, }; static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; @@ -1606,7 +1622,9 @@ struct file *hugetlb_file_setup(const char *name, size_t size, } file = ERR_PTR(-ENOSPC); - inode = hugetlbfs_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0); + /* hugetlbfs_vfsmount[] mounts do not use idmapped mounts. 
*/ + inode = hugetlbfs_get_inode(mnt->mnt_sb, &nop_mnt_idmap, NULL, + S_IFREG | S_IRWXUGO, 0); if (!inode) goto out; if (creat_flags == HUGETLB_SHMFS_INODE) diff --git a/fs/inode.c b/fs/inode.c index 91048c4c9c9e..d290f007b3d1 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -20,6 +20,7 @@ #include <linux/ratelimit.h> #include <linux/list_lru.h> #include <linux/iversion.h> +#include <linux/rw_hint.h> #include <trace/events/writeback.h> #include "internal.h" @@ -588,7 +589,8 @@ void dump_mapping(const struct address_space *mapping) } dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); - if (get_kernel_nofault(dentry, dentry_ptr)) { + if (get_kernel_nofault(dentry, dentry_ptr) || + !dentry.d_parent || !dentry.d_name.name) { pr_warn("aops:%ps ino:%lx invalid dentry:%px\n", a_ops, ino, dentry_ptr); return; @@ -2285,7 +2287,7 @@ void __init inode_init(void) sizeof(struct inode), 0, (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), + SLAB_ACCOUNT), init_once); /* Hash may have been set up in inode_init_early */ @@ -2509,7 +2511,7 @@ struct timespec64 inode_set_ctime_current(struct inode *inode) { struct timespec64 now = current_time(inode); - inode_set_ctime(inode, now.tv_sec, now.tv_nsec); + inode_set_ctime_to_ts(inode, now); return now; } EXPORT_SYMBOL(inode_set_ctime_current); diff --git a/fs/internal.h b/fs/internal.h index b67406435fc0..49c1fcfee4b3 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -183,6 +183,7 @@ extern struct open_how build_open_how(int flags, umode_t mode); extern int build_open_flags(const struct open_how *how, struct open_flags *op); struct file *file_close_fd_locked(struct files_struct *files, unsigned fd); +long do_ftruncate(struct file *file, loff_t length, int small); long do_sys_ftruncate(unsigned int fd, loff_t length, int small); int chmod_common(const struct path *path, umode_t mode); int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, @@ -310,3 +311,10 @@ ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *po struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns); struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap); void mnt_idmap_put(struct mnt_idmap *idmap); +struct stashed_operations { + void (*put_data)(void *data); + void (*init_inode)(struct inode *inode, void *data); +}; +int path_from_stashed(struct dentry **stashed, unsigned long ino, + struct vfsmount *mnt, void *data, struct path *path); +void stashed_dentry_prune(struct dentry *dentry); diff --git a/fs/ioctl.c b/fs/ioctl.c index 76cf22ac97d7..1d5abfdf0f22 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -763,6 +763,33 @@ static int ioctl_fssetxattr(struct file *file, void __user *argp) return err; } +static int ioctl_getfsuuid(struct file *file, void __user *argp) +{ + struct super_block *sb = file_inode(file)->i_sb; + struct fsuuid2 u = { .len = sb->s_uuid_len, }; + + if (!sb->s_uuid_len) + return -ENOIOCTLCMD; + + memcpy(&u.uuid[0], &sb->s_uuid, sb->s_uuid_len); + + return copy_to_user(argp, &u, sizeof(u)) ? -EFAULT : 0; +} + +static int ioctl_get_fs_sysfs_path(struct file *file, void __user *argp) +{ + struct super_block *sb = file_inode(file)->i_sb; + + if (!strlen(sb->s_sysfs_name)) + return -ENOIOCTLCMD; + + struct fs_sysfs_path u = {}; + + u.len = scnprintf(u.name, sizeof(u.name), "%s/%s", sb->s_type->name, sb->s_sysfs_name); + + return copy_to_user(argp, &u, sizeof(u)) ? -EFAULT : 0; +} + /* * do_vfs_ioctl() is not for drivers and not intended to be EXPORT_SYMBOL()'d. 
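FS_IOC_GETFSUUID gives userspace a filesystem-independent way to read the UUID that super_set_uuid() now records, rather than per-filesystem ioctls such as EXT4_IOC_GETFSUUID. A userspace sketch, assuming uapi headers recent enough to carry struct fsuuid2 and the new ioctl number:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>           /* struct fsuuid2, FS_IOC_GETFSUUID */

    int main(int argc, char **argv)
    {
            struct fsuuid2 u = { 0 };
            int fd = open(argc > 1 ? argv[1] : ".", O_RDONLY);

            if (fd < 0)
                    return 1;
            /* fails with ENOTTY if the fs never called super_set_uuid() */
            if (ioctl(fd, FS_IOC_GETFSUUID, &u) != 0) {
                    perror("FS_IOC_GETFSUUID");
                    return 1;
            }
            for (int i = 0; i < u.len; i++)
                    printf("%02x", u.uuid[i]);
            putchar('\n');
            return 0;
    }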
* It's just a simple helper for sys_ioctl and compat_sys_ioctl. @@ -845,6 +872,12 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd, case FS_IOC_FSSETXATTR: return ioctl_fssetxattr(filp, argp); + case FS_IOC_GETFSUUID: + return ioctl_getfsuuid(filp, argp); + + case FS_IOC_GETFSSYSFSPATH: + return ioctl_get_fs_sysfs_path(filp, argp); + default: if (S_ISREG(inode->i_mode)) return file_ioctl(filp, cmd, argp); diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 093c4515b22a..4e8e41c8b3c0 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Red Hat, Inc. - * Copyright (C) 2016-2019 Christoph Hellwig. + * Copyright (C) 2016-2023 Christoph Hellwig. */ #include <linux/module.h> #include <linux/compiler.h> @@ -95,6 +95,44 @@ static inline bool ifs_block_is_dirty(struct folio *folio, return test_bit(block + blks_per_folio, ifs->state); } +static unsigned ifs_find_dirty_range(struct folio *folio, + struct iomap_folio_state *ifs, u64 *range_start, u64 range_end) +{ + struct inode *inode = folio->mapping->host; + unsigned start_blk = + offset_in_folio(folio, *range_start) >> inode->i_blkbits; + unsigned end_blk = min_not_zero( + offset_in_folio(folio, range_end) >> inode->i_blkbits, + i_blocks_per_folio(inode, folio)); + unsigned nblks = 1; + + while (!ifs_block_is_dirty(folio, ifs, start_blk)) + if (++start_blk == end_blk) + return 0; + + while (start_blk + nblks < end_blk) { + if (!ifs_block_is_dirty(folio, ifs, start_blk + nblks)) + break; + nblks++; + } + + *range_start = folio_pos(folio) + (start_blk << inode->i_blkbits); + return nblks << inode->i_blkbits; +} + +static unsigned iomap_find_dirty_range(struct folio *folio, u64 *range_start, + u64 range_end) +{ + struct iomap_folio_state *ifs = folio->private; + + if (*range_start >= range_end) + return 0; + + if (ifs) + return ifs_find_dirty_range(folio, ifs, range_start, range_end); + return range_end - *range_start; +} + static void ifs_clear_range_dirty(struct folio *folio, struct iomap_folio_state *ifs, size_t off, size_t len) { @@ -1454,15 +1492,10 @@ out_unlock: EXPORT_SYMBOL_GPL(iomap_page_mkwrite); static void iomap_finish_folio_write(struct inode *inode, struct folio *folio, - size_t len, int error) + size_t len) { struct iomap_folio_state *ifs = folio->private; - if (error) { - folio_set_error(folio); - mapping_set_error(inode->i_mapping, error); - } - WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs); WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0); @@ -1479,40 +1512,29 @@ static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error) { struct inode *inode = ioend->io_inode; - struct bio *bio = &ioend->io_inline_bio; - struct bio *last = ioend->io_bio, *next; - u64 start = bio->bi_iter.bi_sector; - loff_t offset = ioend->io_offset; - bool quiet = bio_flagged(bio, BIO_QUIET); + struct bio *bio = &ioend->io_bio; + struct folio_iter fi; u32 folio_count = 0; - for (bio = &ioend->io_inline_bio; bio; bio = next) { - struct folio_iter fi; - - /* - * For the last bio, bi_private points to the ioend, so we - * need to explicitly end the iteration here. 
- */ - if (bio == last) - next = NULL; - else - next = bio->bi_private; - - /* walk all folios in bio, ending page IO on them */ - bio_for_each_folio_all(fi, bio) { - iomap_finish_folio_write(inode, fi.folio, fi.length, - error); - folio_count++; + if (error) { + mapping_set_error(inode->i_mapping, error); + if (!bio_flagged(bio, BIO_QUIET)) { + pr_err_ratelimited( +"%s: writeback error on inode %lu, offset %lld, sector %llu", + inode->i_sb->s_id, inode->i_ino, + ioend->io_offset, ioend->io_sector); } - bio_put(bio); } - /* The ioend has been freed by bio_put() */ - if (unlikely(error && !quiet)) { - printk_ratelimited(KERN_ERR -"%s: writeback error on inode %lu, offset %lld, sector %llu", - inode->i_sb->s_id, inode->i_ino, offset, start); + /* walk all folios in bio, ending page IO on them */ + bio_for_each_folio_all(fi, bio) { + if (error) + folio_set_error(fi.folio); + iomap_finish_folio_write(inode, fi.folio, fi.length); + folio_count++; } + + bio_put(bio); /* frees the ioend */ return folio_count; } @@ -1553,7 +1575,7 @@ EXPORT_SYMBOL_GPL(iomap_finish_ioends); static bool iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next) { - if (ioend->io_bio->bi_status != next->io_bio->bi_status) + if (ioend->io_bio.bi_status != next->io_bio.bi_status) return false; if ((ioend->io_flags & IOMAP_F_SHARED) ^ (next->io_flags & IOMAP_F_SHARED)) @@ -1618,47 +1640,46 @@ EXPORT_SYMBOL_GPL(iomap_sort_ioends); static void iomap_writepage_end_bio(struct bio *bio) { - struct iomap_ioend *ioend = bio->bi_private; - - iomap_finish_ioend(ioend, blk_status_to_errno(bio->bi_status)); + iomap_finish_ioend(iomap_ioend_from_bio(bio), + blk_status_to_errno(bio->bi_status)); } /* * Submit the final bio for an ioend. * * If @error is non-zero, it means that we have a situation where some part of - * the submission process has failed after we've marked pages for writeback - * and unlocked them. In this situation, we need to fail the bio instead of - * submitting it. This typically only happens on a filesystem shutdown. + * the submission process has failed after we've marked pages for writeback. + * We cannot cancel the ioend directly in that case, so call the bio end I/O handler + * with the error status here to run the normal I/O completion handler to clear + * the writeback bit and let the file system process the errors. */ -static int -iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend, - int error) +static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error) { - ioend->io_bio->bi_private = ioend; - ioend->io_bio->bi_end_io = iomap_writepage_end_bio; + if (!wpc->ioend) + return error; + /* + * Let the file systems prepare the I/O submission and hook in an I/O + * completion handler. This also needs to happen in case a + * failure happened so that the file system end I/O handler gets called + * to clean up. + */ if (wpc->ops->prepare_ioend) - error = wpc->ops->prepare_ioend(ioend, error); + error = wpc->ops->prepare_ioend(wpc->ioend, error); + if (error) { - /* - * If we're failing the IO now, just mark the ioend with an - * error and finish it. This will run IO completion immediately - * as there is only one reference to the ioend at this point in - * time.
- */ - ioend->io_bio->bi_status = errno_to_blk_status(error); - bio_endio(ioend->io_bio); - return error; + wpc->ioend->io_bio.bi_status = errno_to_blk_status(error); + bio_endio(&wpc->ioend->io_bio); + } else { + submit_bio(&wpc->ioend->io_bio); } - submit_bio(ioend->io_bio); - return 0; + wpc->ioend = NULL; + return error; } -static struct iomap_ioend * -iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, - loff_t offset, sector_t sector, struct writeback_control *wbc) +static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, + struct writeback_control *wbc, struct inode *inode, loff_t pos) { struct iomap_ioend *ioend; struct bio *bio; @@ -1666,63 +1687,42 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS, REQ_OP_WRITE | wbc_to_write_flags(wbc), GFP_NOFS, &iomap_ioend_bioset); - bio->bi_iter.bi_sector = sector; + bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos); + bio->bi_end_io = iomap_writepage_end_bio; wbc_init_bio(wbc, bio); + bio->bi_write_hint = inode->i_write_hint; - ioend = container_of(bio, struct iomap_ioend, io_inline_bio); + ioend = iomap_ioend_from_bio(bio); INIT_LIST_HEAD(&ioend->io_list); ioend->io_type = wpc->iomap.type; ioend->io_flags = wpc->iomap.flags; ioend->io_inode = inode; ioend->io_size = 0; - ioend->io_folios = 0; - ioend->io_offset = offset; - ioend->io_bio = bio; - ioend->io_sector = sector; - return ioend; -} - -/* - * Allocate a new bio, and chain the old bio to the new one. - * - * Note that we have to perform the chaining in this unintuitive order - * so that the bi_private linkage is set up in the right direction for the - * traversal in iomap_finish_ioend(). - */ -static struct bio * -iomap_chain_bio(struct bio *prev) -{ - struct bio *new; - - new = bio_alloc(prev->bi_bdev, BIO_MAX_VECS, prev->bi_opf, GFP_NOFS); - bio_clone_blkg_association(new, prev); - new->bi_iter.bi_sector = bio_end_sector(prev); + ioend->io_offset = pos; + ioend->io_sector = bio->bi_iter.bi_sector; - bio_chain(prev, new); - bio_get(prev); /* for iomap_finish_ioend */ - submit_bio(prev); - return new; + wpc->nr_folios = 0; + return ioend; } -static bool -iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset, - sector_t sector) +static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos) { if ((wpc->iomap.flags & IOMAP_F_SHARED) != (wpc->ioend->io_flags & IOMAP_F_SHARED)) return false; if (wpc->iomap.type != wpc->ioend->io_type) return false; - if (offset != wpc->ioend->io_offset + wpc->ioend->io_size) + if (pos != wpc->ioend->io_offset + wpc->ioend->io_size) return false; - if (sector != bio_end_sector(wpc->ioend->io_bio)) + if (iomap_sector(&wpc->iomap, pos) != + bio_end_sector(&wpc->ioend->io_bio)) return false; /* * Limit ioend bio chain lengths to minimise IO completion latency. This * also prevents long tight loops ending page writeback on all the * folios in the ioend. */ - if (wpc->ioend->io_folios >= IOEND_BATCH_SIZE) + if (wpc->nr_folios >= IOEND_BATCH_SIZE) return false; return true; } @@ -1730,255 +1730,238 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset, /* * Test to see if we have an existing ioend structure that we could append to * first; otherwise finish off the current ioend and start another. + * + * If a new ioend is created and cached, the old ioend is submitted to the block + * layer instantly. Batching optimisations are provided by higher level block + * plugging. 
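The bi_private juggling disappears because the ioend now embeds its bio as the final member (io_inline_bio becomes io_bio and the chaining goes away), so the completion path can recover the ioend from the bio with pointer arithmetic alone. A sketch of that round trip, assuming a layout like the one this series creates:

    #include <linux/bio.h>
    #include <linux/container_of.h>

    struct my_ioend {
            loff_t          io_offset;
            size_t          io_size;
            struct bio      io_bio; /* must be last: bio is variable size */
    };

    static struct my_ioend *my_ioend_from_bio(struct bio *bio)
    {
            /* no bi_private needed: subtract the member offset */
            return container_of(bio, struct my_ioend, io_bio);
    }

Allocation runs the other way around: the bio is handed out by a bioset whose front_pad covers the enclosing structure, which is how iomap_ioend_bioset is sized.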
+ * + * At the end of a writeback pass, there will be a cached ioend remaining on the + * writepage context that the caller will need to submit. */ -static void -iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio, - struct iomap_folio_state *ifs, struct iomap_writepage_ctx *wpc, - struct writeback_control *wbc, struct list_head *iolist) +static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, + struct writeback_control *wbc, struct folio *folio, + struct inode *inode, loff_t pos, unsigned len) { - sector_t sector = iomap_sector(&wpc->iomap, pos); - unsigned len = i_blocksize(inode); + struct iomap_folio_state *ifs = folio->private; size_t poff = offset_in_folio(folio, pos); + int error; - if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, sector)) { - if (wpc->ioend) - list_add(&wpc->ioend->io_list, iolist); - wpc->ioend = iomap_alloc_ioend(inode, wpc, pos, sector, wbc); + if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) { +new_ioend: + error = iomap_submit_ioend(wpc, 0); + if (error) + return error; + wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos); } - if (!bio_add_folio(wpc->ioend->io_bio, folio, len, poff)) { - wpc->ioend->io_bio = iomap_chain_bio(wpc->ioend->io_bio); - bio_add_folio_nofail(wpc->ioend->io_bio, folio, len, poff); - } + if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff)) + goto new_ioend; if (ifs) atomic_add(len, &ifs->write_bytes_pending); wpc->ioend->io_size += len; wbc_account_cgroup_owner(wbc, &folio->page, len); + return 0; } -/* - * We implement an immediate ioend submission policy here to avoid needing to - * chain multiple ioends and hence nest mempool allocations which can violate - * the forward progress guarantees we need to provide. The current ioend we're - * adding blocks to is cached in the writepage context, and if the new block - * doesn't append to the cached ioend, it will create a new ioend and cache that - * instead. - * - * If a new ioend is created and cached, the old ioend is returned and queued - * locally for submission once the entire page is processed or an error has been - * detected. While ioends are submitted immediately after they are completed, - * batching optimisations are provided by higher level block plugging. - * - * At the end of a writeback pass, there will be a cached ioend remaining on the - * writepage context that the caller will need to submit. - */ -static int -iomap_writepage_map(struct iomap_writepage_ctx *wpc, - struct writeback_control *wbc, struct inode *inode, - struct folio *folio, u64 end_pos) +static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc, + struct writeback_control *wbc, struct folio *folio, + struct inode *inode, u64 pos, unsigned dirty_len, + unsigned *count) { - struct iomap_folio_state *ifs = folio->private; - struct iomap_ioend *ioend, *next; - unsigned len = i_blocksize(inode); - unsigned nblocks = i_blocks_per_folio(inode, folio); - u64 pos = folio_pos(folio); - int error = 0, count = 0, i; - LIST_HEAD(submit_list); - - WARN_ON_ONCE(end_pos <= pos); - - if (!ifs && nblocks > 1) { - ifs = ifs_alloc(inode, folio, 0); - iomap_set_range_dirty(folio, 0, end_pos - pos); - } + int error; - WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0); - - /* - * Walk through the folio to find areas to write back. If we - * run off the end of the current map or find the current map - * invalid, grab a new one. 
- */ - for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) { - if (ifs && !ifs_block_is_dirty(folio, ifs, i)) - continue; + do { + unsigned map_len; - error = wpc->ops->map_blocks(wpc, inode, pos); + error = wpc->ops->map_blocks(wpc, inode, pos, dirty_len); if (error) break; - trace_iomap_writepage_map(inode, &wpc->iomap); - if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE)) - continue; - if (wpc->iomap.type == IOMAP_HOLE) - continue; - iomap_add_to_ioend(inode, pos, folio, ifs, wpc, wbc, - &submit_list); - count++; - } - if (count) - wpc->ioend->io_folios++; + trace_iomap_writepage_map(inode, pos, dirty_len, &wpc->iomap); - WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list)); - WARN_ON_ONCE(!folio_test_locked(folio)); - WARN_ON_ONCE(folio_test_writeback(folio)); - WARN_ON_ONCE(folio_test_dirty(folio)); + map_len = min_t(u64, dirty_len, + wpc->iomap.offset + wpc->iomap.length - pos); + WARN_ON_ONCE(!folio->private && map_len < dirty_len); + + switch (wpc->iomap.type) { + case IOMAP_INLINE: + WARN_ON_ONCE(1); + error = -EIO; + break; + case IOMAP_HOLE: + break; + default: + error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos, + map_len); + if (!error) + (*count)++; + break; + } + dirty_len -= map_len; + pos += map_len; + } while (dirty_len && !error); /* * We cannot cancel the ioend directly here on error. We may have * already set other pages under writeback and hence we have to run I/O * completion to mark the error state of the pages under writeback * appropriately. + * + * Just let the file system know what portion of the folio failed to + * map. */ - if (unlikely(error)) { - /* - * Let the filesystem know what portion of the current page - * failed to map. If the page hasn't been added to ioend, it - * won't be affected by I/O completion and we must unlock it - * now. - */ - if (wpc->ops->discard_folio) - wpc->ops->discard_folio(folio, pos); - if (!count) { - folio_unlock(folio); - goto done; - } - } - - /* - * We can have dirty bits set past end of file in page_mkwrite path - * while mapping the last partial folio. Hence it's better to clear - * all the dirty bits in the folio here. - */ - iomap_clear_range_dirty(folio, 0, folio_size(folio)); - folio_start_writeback(folio); - folio_unlock(folio); - - /* - * Preserve the original error if there was one; catch - * submission errors here and propagate into subsequent ioend - * submissions. - */ - list_for_each_entry_safe(ioend, next, &submit_list, io_list) { - int error2; - - list_del_init(&ioend->io_list); - error2 = iomap_submit_ioend(wpc, ioend, error); - if (error2 && !error) - error = error2; - } - - /* - * We can end up here with no error and nothing to write only if we race - * with a partial page truncate on a sub-page block sized filesystem. - */ - if (!count) - folio_end_writeback(folio); -done: - mapping_set_error(inode->i_mapping, error); + if (error && wpc->ops->discard_folio) + wpc->ops->discard_folio(folio, pos); return error; } /* - * Write out a dirty page. + * Check interaction of the folio with the file end. * - * For delalloc space on the page, we need to allocate space and flush it. - * For unwritten space on the page, we need to start the conversion to - * regular allocated space. + * If the folio is entirely beyond i_size, return false. If it straddles + * i_size, adjust end_pos and zero all data beyond i_size. 
 */ -static int iomap_do_writepage(struct folio *folio, - struct writeback_control *wbc, void *data) +static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode, + u64 *end_pos) { - struct iomap_writepage_ctx *wpc = data; - struct inode *inode = folio->mapping->host; - u64 end_pos, isize; - - trace_iomap_writepage(inode, folio_pos(folio), folio_size(folio)); + u64 isize = i_size_read(inode); - /* - * Refuse to write the folio out if we're called from reclaim context. - * - * This avoids stack overflows when called from deeply used stacks in - * random callers for direct reclaim or memcg reclaim. We explicitly - * allow reclaim from kswapd as the stack usage there is relatively low. - * - * This should never happen except in the case of a VM regression so - * warn about it. - */ - if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == - PF_MEMALLOC)) - goto redirty; - - /* - * Is this folio beyond the end of the file? - * - * The folio index is less than the end_index, adjust the end_pos - * to the highest offset that this folio should represent. - * ----------------------------------------------------- - * | file mapping | <EOF> | - * ----------------------------------------------------- - * | Page ... | Page N-2 | Page N-1 | Page N | | - * ^--------------------------------^----------|-------- - * | desired writeback range | see else | - * ---------------------------------^------------------| - */ - isize = i_size_read(inode); - end_pos = folio_pos(folio) + folio_size(folio); - if (end_pos > isize) { - /* - * Check whether the page to write out is beyond or straddles - * i_size or not. - * ------------------------------------------------------- - * | file mapping | <EOF> | - * ------------------------------------------------------- - * | Page ... | Page N-2 | Page N-1 | Page N | Beyond | - * ^--------------------------------^-----------|--------- - * | | Straddles | - * ---------------------------------^-----------|--------| - */ + if (*end_pos > isize) { size_t poff = offset_in_folio(folio, isize); pgoff_t end_index = isize >> PAGE_SHIFT; /* - * Skip the page if it's fully outside i_size, e.g. - * due to a truncate operation that's in progress. We've - * cleaned this page and truncate will finish things off for - * us. + * If the folio is entirely outside of i_size, skip it. + * + * This can happen due to a truncate operation that is in + * progress, and in that case truncate will finish it off once + * we've dropped the folio lock. * - * Note that the end_index is unsigned long. If the given - * offset is greater than 16TB on a 32-bit system then if we - * checked if the page is fully outside i_size with - * "if (page->index >= end_index + 1)", "end_index + 1" would - * overflow and evaluate to 0. Hence this page would be + * Note that the pgoff_t used for end_index is an unsigned long. + * If the given offset is greater than 16TB on a 32-bit system, + * then if we checked if the folio is fully outside i_size with + * "if (folio->index >= end_index + 1)", "end_index + 1" would + * overflow and evaluate to 0. Hence this folio would be * redirtied and written out repeatedly, which would result in * an infinite loop; the user program performing this operation * would hang. Instead, we can detect this situation by - * checking if the page is totally beyond i_size or if its + * checking if the folio is totally beyond i_size or if its * offset is just equal to the EOF.
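+ *
+ * As an illustrative calculation (not part of the change itself):
+ * with 4k pages and an i_size just below 16TB, end_index =
+ * (16TB - 1) >> PAGE_SHIFT = 0xffffffff, the largest value a 32-bit
+ * unsigned long can hold, so "end_index + 1" wraps to 0 and a
+ * "folio->index >= end_index + 1" check could never be true.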
 */ if (folio->index > end_index || (folio->index == end_index && poff == 0)) - goto unlock; + return false; /* - * The page straddles i_size. It must be zeroed out on each - * and every writepage invocation because it may be mmapped. - * "A file is mapped in multiples of the page size. For a file - * that is not a multiple of the page size, the remaining - * memory is zeroed when mapped, and writes to that region are - * not written out to the file." + * The folio straddles i_size. + * + * It must be zeroed out on each and every writepage invocation + * because it may be mmapped: + * + * A file is mapped in multiples of the page size. For a + * file that is not a multiple of the page size, the + * remaining memory is zeroed when mapped, and writes to that + * region are not written out to the file. + * + * Also adjust the writeback range to skip all blocks entirely + * beyond i_size. */ folio_zero_segment(folio, poff, folio_size(folio)); - end_pos = isize; + *end_pos = round_up(isize, i_blocksize(inode)); + } + + return true; +} + +static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, + struct writeback_control *wbc, struct folio *folio) +{ + struct iomap_folio_state *ifs = folio->private; + struct inode *inode = folio->mapping->host; + u64 pos = folio_pos(folio); + u64 end_pos = pos + folio_size(folio); + unsigned count = 0; + int error = 0; + u32 rlen; + + WARN_ON_ONCE(!folio_test_locked(folio)); + WARN_ON_ONCE(folio_test_dirty(folio)); + WARN_ON_ONCE(folio_test_writeback(folio)); + + trace_iomap_writepage(inode, pos, folio_size(folio)); + + if (!iomap_writepage_handle_eof(folio, inode, &end_pos)) { + folio_unlock(folio); + return 0; + } + WARN_ON_ONCE(end_pos <= pos); + + if (i_blocks_per_folio(inode, folio) > 1) { + if (!ifs) { + ifs = ifs_alloc(inode, folio, 0); + iomap_set_range_dirty(folio, 0, end_pos - pos); + } + + /* + * Keep the I/O completion handler from clearing the writeback + * bit until we have submitted all blocks by adding a bias to + * ifs->write_bytes_pending, which is dropped after submitting + * all blocks. + */ + WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0); + atomic_inc(&ifs->write_bytes_pending); } - return iomap_writepage_map(wpc, wbc, inode, folio, end_pos); + /* + * Set the writeback bit ASAP, as the I/O completion for the single + * block per folio case can happen as soon as we're submitting the bio. + */ + folio_start_writeback(folio); -redirty: - folio_redirty_for_writepage(wbc, folio); -unlock: + /* + * Walk through the folio to find dirty areas to write back. + */ + while ((rlen = iomap_find_dirty_range(folio, &pos, end_pos))) { + error = iomap_writepage_map_blocks(wpc, wbc, folio, inode, + pos, rlen, &count); + if (error) + break; + pos += rlen; + } + + if (count) + wpc->nr_folios++; + + /* + * We can have dirty bits set past the end of file in the page_mkwrite + * path while mapping the last partial folio. Hence it's better to + * clear all the dirty bits in the folio here. + */ + iomap_clear_range_dirty(folio, 0, folio_size(folio)); + + /* + * Usually the writeback bit is cleared by the I/O completion handler. + * But we may end up either not actually writing any blocks, or (when + * there are multiple blocks in a folio) all I/O might have finished + * already at this point. In that case we need to clear the writeback + * bit ourselves right after unlocking the folio.
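+ *
+ * A sketch of the accounting in the multi-block case (informal, for
+ * illustration): the bias taken above makes write_bytes_pending 1,
+ * each block added to an ioend adds its length, and each completed
+ * ioend subtracts what it wrote.  The atomic_dec_and_test() below thus
+ * only ends writeback if every submitted block has already completed.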
+ */ folio_unlock(folio); - return 0; + if (ifs) { + if (atomic_dec_and_test(&ifs->write_bytes_pending)) + folio_end_writeback(folio); + } else { + if (!count) + folio_end_writeback(folio); + } + mapping_set_error(inode->i_mapping, error); + return error; +} + +static int iomap_do_writepage(struct folio *folio, + struct writeback_control *wbc, void *data) +{ + return iomap_writepage_map(data, wbc, folio); } int @@ -1988,18 +1971,24 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc, { int ret; + /* + * Writeback from reclaim context should never happen except in the case + * of a VM regression so warn about it and refuse to write the data. + */ + if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC | PF_KSWAPD)) == + PF_MEMALLOC)) + return -EIO; + wpc->ops = ops; ret = write_cache_pages(mapping, wbc, iomap_do_writepage, wpc); - if (!wpc->ioend) - return ret; - return iomap_submit_ioend(wpc, wpc->ioend, ret); + return iomap_submit_ioend(wpc, ret); } EXPORT_SYMBOL_GPL(iomap_writepages); static int __init iomap_init(void) { return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE), - offsetof(struct iomap_ioend, io_inline_bio), + offsetof(struct iomap_ioend, io_bio), BIOSET_NEED_BVECS); } fs_initcall(iomap_init); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index bcd3f8cf5ea4..f3b43d223a46 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -380,6 +380,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits, GFP_KERNEL); bio->bi_iter.bi_sector = iomap_sector(iomap, pos); + bio->bi_write_hint = inode->i_write_hint; bio->bi_ioprio = dio->iocb->ki_ioprio; bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index c16fd55f5595..0a991c4ce87d 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -154,7 +154,48 @@ DEFINE_EVENT(iomap_class, name, \ TP_ARGS(inode, iomap)) DEFINE_IOMAP_EVENT(iomap_iter_dstmap); DEFINE_IOMAP_EVENT(iomap_iter_srcmap); -DEFINE_IOMAP_EVENT(iomap_writepage_map); + +TRACE_EVENT(iomap_writepage_map, + TP_PROTO(struct inode *inode, u64 pos, unsigned int dirty_len, + struct iomap *iomap), + TP_ARGS(inode, pos, dirty_len, iomap), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, ino) + __field(u64, pos) + __field(u64, dirty_len) + __field(u64, addr) + __field(loff_t, offset) + __field(u64, length) + __field(u16, type) + __field(u16, flags) + __field(dev_t, bdev) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pos = pos; + __entry->dirty_len = dirty_len; + __entry->addr = iomap->addr; + __entry->offset = iomap->offset; + __entry->length = iomap->length; + __entry->type = iomap->type; + __entry->flags = iomap->flags; + __entry->bdev = iomap->bdev ? 
iomap->bdev->bd_dev : 0; + ), + TP_printk("dev %d:%d ino 0x%llx bdev %d:%d pos 0x%llx dirty len 0x%llx " + "addr 0x%llx offset 0x%llx length 0x%llx type %s flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + MAJOR(__entry->bdev), MINOR(__entry->bdev), + __entry->pos, + __entry->dirty_len, + __entry->addr, + __entry->offset, + __entry->length, + __print_symbolic(__entry->type, IOMAP_TYPE_STRINGS), + __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS)) +); TRACE_EVENT(iomap_iter, TP_PROTO(struct iomap_iter *iter, const void *ops, @@ -165,6 +206,7 @@ TRACE_EVENT(iomap_iter, __field(u64, ino) __field(loff_t, pos) __field(u64, length) + __field(s64, processed) __field(unsigned int, flags) __field(const void *, ops) __field(unsigned long, caller) @@ -174,15 +216,17 @@ TRACE_EVENT(iomap_iter, __entry->ino = iter->inode->i_ino; __entry->pos = iter->pos; __entry->length = iomap_length(iter); + __entry->processed = iter->processed; __entry->flags = iter->flags; __entry->ops = ops; __entry->caller = caller; ), - TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx flags %s (0x%x) ops %ps caller %pS", + TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx processed %lld flags %s (0x%x) ops %ps caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->pos, __entry->length, + __entry->processed, __print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS), __entry->flags, __entry->ops, diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 3e4d53e26f94..25fca44149dd 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -93,7 +93,7 @@ static int __init init_inodecache(void) isofs_inode_cachep = kmem_cache_create("isofs_inode_cache", sizeof(struct iso_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), + SLAB_ACCOUNT), init_once); if (!isofs_inode_cachep) return -ENOMEM; diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index f99591a634b4..aede1be4dc0c 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -387,7 +387,7 @@ static int __init init_jffs2_fs(void) jffs2_inode_cachep = kmem_cache_create("jffs2_i", sizeof(struct jffs2_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), + SLAB_ACCOUNT), jffs2_i_init_once); if (!jffs2_inode_cachep) { pr_err("error: Failed to initialise inode cache\n"); diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 8eec84c651bf..cb3cda1390ad 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -2763,9 +2763,7 @@ static int dbBackSplit(dmtree_t *tp, int leafno, bool is_ctl) * leafno - the number of the leaf to be updated. * newval - the new value for the leaf. * - * RETURN VALUES: - * 0 - success - * -EIO - i/o error + * RETURN VALUES: none */ static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl) { @@ -2792,10 +2790,6 @@ static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl) * get the buddy size (number of words covered) of * the new value. */ - - if ((newval - tp->dmt_budmin) > BUDMIN) - return -EIO; - budsz = BUDSIZE(newval, tp->dmt_budmin); /* try to join. 
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index cb6d1fda66a7..73389c68e251 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1058,7 +1058,7 @@ void jfs_syncpt(struct jfs_log *log, int hard_sync) int lmLogOpen(struct super_block *sb) { int rc; - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct jfs_log *log; struct jfs_sb_info *sbi = JFS_SBI(sb); @@ -1070,7 +1070,7 @@ int lmLogOpen(struct super_block *sb) mutex_lock(&jfs_log_mutex); list_for_each_entry(log, &jfs_external_logs, journal_list) { - if (log->bdev_handle->bdev->bd_dev == sbi->logdev) { + if (file_bdev(log->bdev_file)->bd_dev == sbi->logdev) { if (!uuid_equal(&log->uuid, &sbi->loguuid)) { jfs_warn("wrong uuid on JFS journal"); mutex_unlock(&jfs_log_mutex); @@ -1100,14 +1100,14 @@ int lmLogOpen(struct super_block *sb) * file systems to log may have n-to-1 relationship; */ - bdev_handle = bdev_open_by_dev(sbi->logdev, + bdev_file = bdev_file_open_by_dev(sbi->logdev, BLK_OPEN_READ | BLK_OPEN_WRITE, log, NULL); - if (IS_ERR(bdev_handle)) { - rc = PTR_ERR(bdev_handle); + if (IS_ERR(bdev_file)) { + rc = PTR_ERR(bdev_file); goto free; } - log->bdev_handle = bdev_handle; + log->bdev_file = bdev_file; uuid_copy(&log->uuid, &sbi->loguuid); /* @@ -1141,7 +1141,7 @@ journal_found: lbmLogShutdown(log); close: /* close external log device */ - bdev_release(bdev_handle); + fput(bdev_file); free: /* free log descriptor */ mutex_unlock(&jfs_log_mutex); @@ -1162,7 +1162,7 @@ static int open_inline_log(struct super_block *sb) init_waitqueue_head(&log->syncwait); set_bit(log_INLINELOG, &log->flag); - log->bdev_handle = sb->s_bdev_handle; + log->bdev_file = sb->s_bdev_file; log->base = addressPXD(&JFS_SBI(sb)->logpxd); log->size = lengthPXD(&JFS_SBI(sb)->logpxd) >> (L2LOGPSIZE - sb->s_blocksize_bits); @@ -1436,7 +1436,7 @@ int lmLogClose(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); struct jfs_log *log = sbi->log; - struct bdev_handle *bdev_handle; + struct file *bdev_file; int rc = 0; jfs_info("lmLogClose: log:0x%p", log); @@ -1482,10 +1482,10 @@ int lmLogClose(struct super_block *sb) * external log as separate logical volume */ list_del(&log->journal_list); - bdev_handle = log->bdev_handle; + bdev_file = log->bdev_file; rc = lmLogShutdown(log); - bdev_release(bdev_handle); + fput(bdev_file); kfree(log); @@ -1972,7 +1972,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bp->l_flag |= lbmREAD; - bio = bio_alloc(log->bdev_handle->bdev, 1, REQ_OP_READ, GFP_NOFS); + bio = bio_alloc(file_bdev(log->bdev_file), 1, REQ_OP_READ, GFP_NOFS); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); __bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); @@ -2115,7 +2115,7 @@ static void lbmStartIO(struct lbuf * bp) jfs_info("lbmStartIO"); if (!log->no_integrity) - bdev = log->bdev_handle->bdev; + bdev = file_bdev(log->bdev_file); bio = bio_alloc(bdev, 1, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS); diff --git a/fs/jfs/jfs_logmgr.h b/fs/jfs/jfs_logmgr.h index 84aa2d253907..8b8994e48cd0 100644 --- a/fs/jfs/jfs_logmgr.h +++ b/fs/jfs/jfs_logmgr.h @@ -356,7 +356,7 @@ struct jfs_log { * before writing syncpt. 
*/ struct list_head journal_list; /* Global list */ - struct bdev_handle *bdev_handle; /* 4: log lv pointer */ + struct file *bdev_file; /* 4: log lv pointer */ int serial; /* 4: log mount serial number */ s64 base; /* @8: log extent address (inline log ) */ diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c index 9b5c6a20b30c..98f9a432c336 100644 --- a/fs/jfs/jfs_mount.c +++ b/fs/jfs/jfs_mount.c @@ -431,7 +431,7 @@ int updateSuper(struct super_block *sb, uint state) if (state == FM_MOUNT) { /* record log's dev_t and mount serial number */ j_sb->s_logdev = cpu_to_le32( - new_encode_dev(sbi->log->bdev_handle->bdev->bd_dev)); + new_encode_dev(file_bdev(sbi->log->bdev_file)->bd_dev)); j_sb->s_logserial = cpu_to_le32(sbi->log->serial); } else if (state == FM_CLEAN) { /* diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 8d8e556bd610..73f09a762b79 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -932,7 +932,7 @@ static int __init init_jfs_fs(void) jfs_inode_cachep = kmem_cache_create_usercopy("jfs_ip", sizeof(struct jfs_inode_info), - 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT, + 0, SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, offsetof(struct jfs_inode_info, i_inline_all), sizeof_field(struct jfs_inode_info, i_inline_all), init_once); diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 0c93cad0f0ac..e29f4edf9572 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -358,7 +358,9 @@ int kernfs_get_tree(struct fs_context *fc) } sb->s_flags |= SB_ACTIVE; - uuid_gen(&sb->s_uuid); + uuid_t uuid; + uuid_gen(&uuid); + super_set_uuid(sb, uuid.b, sizeof(uuid)); down_write(&root->kernfs_supers_rwsem); list_add(&info->node, &info->root->supers); diff --git a/fs/libfs.c b/fs/libfs.c index eec6031b0155..0d14ae808fcf 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -23,6 +23,7 @@ #include <linux/fsnotify.h> #include <linux/unicode.h> #include <linux/fscrypt.h> +#include <linux/pidfs.h> #include <linux/uaccess.h> @@ -240,17 +241,22 @@ const struct inode_operations simple_dir_inode_operations = { }; EXPORT_SYMBOL(simple_dir_inode_operations); -static void offset_set(struct dentry *dentry, u32 offset) +/* 0 is '.', 1 is '..', so always start with offset 2 or more */ +enum { + DIR_OFFSET_MIN = 2, +}; + +static void offset_set(struct dentry *dentry, long offset) { - dentry->d_fsdata = (void *)((uintptr_t)(offset)); + dentry->d_fsdata = (void *)offset; } -static u32 dentry2offset(struct dentry *dentry) +static long dentry2offset(struct dentry *dentry) { - return (u32)((uintptr_t)(dentry->d_fsdata)); + return (long)dentry->d_fsdata; } -static struct lock_class_key simple_offset_xa_lock; +static struct lock_class_key simple_offset_lock_class; /** * simple_offset_init - initialize an offset_ctx @@ -259,11 +265,9 @@ static struct lock_class_key simple_offset_xa_lock; */ void simple_offset_init(struct offset_ctx *octx) { - xa_init_flags(&octx->xa, XA_FLAGS_ALLOC1); - lockdep_set_class(&octx->xa.xa_lock, &simple_offset_xa_lock); - - /* 0 is '.', 1 is '..', so always start with offset 2 */ - octx->next_offset = 2; + mt_init_flags(&octx->mt, MT_FLAGS_ALLOC_RANGE); + lockdep_set_class(&octx->mt.ma_lock, &simple_offset_lock_class); + octx->next_offset = DIR_OFFSET_MIN; } /** @@ -271,20 +275,19 @@ void simple_offset_init(struct offset_ctx *octx) * @octx: directory offset ctx to be updated * @dentry: new dentry being added * - * Returns zero on success. @so_ctx and the dentry offset are updated. + * Returns zero on success. @octx and the dentry's offset are updated. * Otherwise, a negative errno value is returned. 
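+ *
+ * A note on the cyclic allocation relied on here: next_offset starts
+ * at DIR_OFFSET_MIN (2), successive entries get 2, 3, 4, ..., and once
+ * LONG_MAX is reached mtree_alloc_cyclic() wraps and searches for a
+ * free offset from DIR_OFFSET_MIN again, so an offset is never handed
+ * out twice while its entry is still in the tree.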
*/ int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry) { - static const struct xa_limit limit = XA_LIMIT(2, U32_MAX); - u32 offset; + unsigned long offset; int ret; if (dentry2offset(dentry) != 0) return -EBUSY; - ret = xa_alloc_cyclic(&octx->xa, &offset, dentry, limit, - &octx->next_offset, GFP_KERNEL); + ret = mtree_alloc_cyclic(&octx->mt, &offset, dentry, DIR_OFFSET_MIN, + LONG_MAX, &octx->next_offset, GFP_KERNEL); if (ret < 0) return ret; @@ -300,17 +303,49 @@ int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry) */ void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry) { - u32 offset; + long offset; offset = dentry2offset(dentry); if (offset == 0) return; - xa_erase(&octx->xa, offset); + mtree_erase(&octx->mt, offset); offset_set(dentry, 0); } /** + * simple_offset_empty - Check if a dentry can be unlinked + * @dentry: dentry to be tested + * + * Returns 0 if @dentry is a non-empty directory; otherwise returns 1. + */ +int simple_offset_empty(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + struct offset_ctx *octx; + struct dentry *child; + unsigned long index; + int ret = 1; + + if (!inode || !S_ISDIR(inode->i_mode)) + return ret; + + index = DIR_OFFSET_MIN; + octx = inode->i_op->get_offset_ctx(inode); + mt_for_each(&octx->mt, child, index, LONG_MAX) { + spin_lock(&child->d_lock); + if (simple_positive(child)) { + spin_unlock(&child->d_lock); + ret = 0; + break; + } + spin_unlock(&child->d_lock); + } + + return ret; +} + +/** * simple_offset_rename_exchange - exchange rename with directory offsets * @old_dir: parent of dentry being moved * @old_dentry: dentry being moved @@ -327,8 +362,8 @@ int simple_offset_rename_exchange(struct inode *old_dir, { struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir); struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir); - u32 old_index = dentry2offset(old_dentry); - u32 new_index = dentry2offset(new_dentry); + long old_index = dentry2offset(old_dentry); + long new_index = dentry2offset(new_dentry); int ret; simple_offset_remove(old_ctx, old_dentry); @@ -354,9 +389,9 @@ int simple_offset_rename_exchange(struct inode *old_dir, out_restore: offset_set(old_dentry, old_index); - xa_store(&old_ctx->xa, old_index, old_dentry, GFP_KERNEL); + mtree_store(&old_ctx->mt, old_index, old_dentry, GFP_KERNEL); offset_set(new_dentry, new_index); - xa_store(&new_ctx->xa, new_index, new_dentry, GFP_KERNEL); + mtree_store(&new_ctx->mt, new_index, new_dentry, GFP_KERNEL); return ret; } @@ -369,7 +404,7 @@ out_restore: */ void simple_offset_destroy(struct offset_ctx *octx) { - xa_destroy(&octx->xa); + mtree_destroy(&octx->mt); } /** @@ -399,15 +434,16 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) /* In this case, ->private_data is protected by f_pos_lock */ file->private_data = NULL; - return vfs_setpos(file, offset, U32_MAX); + return vfs_setpos(file, offset, LONG_MAX); } -static struct dentry *offset_find_next(struct xa_state *xas) +static struct dentry *offset_find_next(struct offset_ctx *octx, loff_t offset) { + MA_STATE(mas, &octx->mt, offset, offset); struct dentry *child, *found = NULL; rcu_read_lock(); - child = xas_next_entry(xas, U32_MAX); + child = mas_find(&mas, LONG_MAX); if (!child) goto out; spin_lock(&child->d_lock); @@ -421,8 +457,8 @@ out: static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry) { - u32 offset = dentry2offset(dentry); struct inode *inode = d_inode(dentry); + long offset = 
dentry2offset(dentry); return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, offset, inode->i_ino, fs_umode_to_dtype(inode->i_mode)); @@ -430,12 +466,11 @@ static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry) static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) { - struct offset_ctx *so_ctx = inode->i_op->get_offset_ctx(inode); - XA_STATE(xas, &so_ctx->xa, ctx->pos); + struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode); struct dentry *dentry; while (true) { - dentry = offset_find_next(&xas); + dentry = offset_find_next(octx, ctx->pos); if (!dentry) return ERR_PTR(-ENOENT); @@ -444,8 +479,8 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) break; } + ctx->pos = dentry2offset(dentry) + 1; dput(dentry); - ctx->pos = xas.xa_index + 1; } return NULL; } @@ -481,7 +516,7 @@ static int offset_readdir(struct file *file, struct dir_context *ctx) return 0; /* In this case, ->private_data is protected by f_pos_lock */ - if (ctx->pos == 2) + if (ctx->pos == DIR_OFFSET_MIN) file->private_data = NULL; else if (file->private_data == ERR_PTR(-ENOENT)) return 0; @@ -1580,7 +1615,7 @@ EXPORT_SYMBOL(alloc_anon_inode); * All arguments are ignored and it just returns -EINVAL. */ int -simple_nosetlease(struct file *filp, int arg, struct file_lock **flp, +simple_nosetlease(struct file *filp, int arg, struct file_lease **flp, void **priv) { return -EINVAL; @@ -1704,16 +1739,28 @@ bool is_empty_dir_inode(struct inode *inode) static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { - const struct dentry *parent = READ_ONCE(dentry->d_parent); - const struct inode *dir = READ_ONCE(parent->d_inode); - const struct super_block *sb = dentry->d_sb; - const struct unicode_map *um = sb->s_encoding; - struct qstr qstr = QSTR_INIT(str, len); + const struct dentry *parent; + const struct inode *dir; char strbuf[DNAME_INLINE_LEN]; - int ret; + struct qstr qstr; + + /* + * Attempt a case-sensitive match first. It is cheaper and + * should cover most lookups, including all the sane + * applications that expect a case-sensitive filesystem. + * + * This comparison is safe under RCU because the caller + * guarantees the consistency between str and len. See + * __d_lookup_rcu_op_compare() for details. + */ + if (len == name->len && !memcmp(str, name->name, len)) + return 0; + parent = READ_ONCE(dentry->d_parent); + dir = READ_ONCE(parent->d_inode); if (!dir || !IS_CASEFOLDED(dir)) - goto fallback; + return 1; + /* * If the dentry name is stored in-line, then it may be concurrently * modified by a rename. 
If this happens, the VFS will eventually retry @@ -1724,20 +1771,14 @@ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, if (len <= DNAME_INLINE_LEN - 1) { memcpy(strbuf, str, len); strbuf[len] = 0; - qstr.name = strbuf; + str = strbuf; /* prevent compiler from optimizing out the temporary buffer */ barrier(); } - ret = utf8_strncasecmp(um, name, &qstr); - if (ret >= 0) - return ret; + qstr.len = len; + qstr.name = str; - if (sb_has_strict_encoding(sb)) - return -EINVAL; -fallback: - if (len != name->len) - return 1; - return !!memcmp(str, name->name, len); + return utf8_strncasecmp(dentry->d_sb->s_encoding, name, &qstr); } /** @@ -1752,7 +1793,7 @@ static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) const struct inode *dir = READ_ONCE(dentry->d_inode); struct super_block *sb = dentry->d_sb; const struct unicode_map *um = sb->s_encoding; - int ret = 0; + int ret; if (!dir || !IS_CASEFOLDED(dir)) return 0; @@ -1766,73 +1807,45 @@ static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) static const struct dentry_operations generic_ci_dentry_ops = { .d_hash = generic_ci_d_hash, .d_compare = generic_ci_d_compare, -}; -#endif - #ifdef CONFIG_FS_ENCRYPTION -static const struct dentry_operations generic_encrypted_dentry_ops = { .d_revalidate = fscrypt_d_revalidate, +#endif }; #endif -#if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE) -static const struct dentry_operations generic_encrypted_ci_dentry_ops = { - .d_hash = generic_ci_d_hash, - .d_compare = generic_ci_d_compare, +#ifdef CONFIG_FS_ENCRYPTION +static const struct dentry_operations generic_encrypted_dentry_ops = { .d_revalidate = fscrypt_d_revalidate, }; #endif /** - * generic_set_encrypted_ci_d_ops - helper for setting d_ops for given dentry - * @dentry: dentry to set ops on - * - * Casefolded directories need d_hash and d_compare set, so that the dentries - * contained in them are handled case-insensitively. Note that these operations - * are needed on the parent directory rather than on the dentries in it, and - * while the casefolding flag can be toggled on and off on an empty directory, - * dentry_operations can't be changed later. As a result, if the filesystem has - * casefolding support enabled at all, we have to give all dentries the - * casefolding operations even if their inode doesn't have the casefolding flag - * currently (and thus the casefolding ops would be no-ops for now). - * - * Encryption works differently in that the only dentry operation it needs is - * d_revalidate, which it only needs on dentries that have the no-key name flag. - * The no-key flag can't be set "later", so we don't have to worry about that. + * generic_set_sb_d_ops - helper for choosing the set of + * filesystem-wide dentry operations for the enabled features + * @sb: superblock to be configured * - * Finally, to maximize compatibility with overlayfs (which isn't compatible - * with certain dentry operations) and to avoid taking an unnecessary - * performance hit, we use custom dentry_operations for each possible - * combination rather than always installing all operations. + * Filesystems supporting casefolding and/or fscrypt can call this + * helper at mount-time to configure sb->s_d_op to the best set of dentry + * operations required for the enabled features. The helper must be + * called after these have been configured, but before the root dentry + * is created.
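+ *
+ * A typical call sequence in a filesystem's fill_super (an
+ * illustrative sketch, not taken from this patch):
+ *
+ *	sb->s_encoding = ...;		// casefolding configured
+ *	sb->s_cop = ...;		// fscrypt configured
+ *	generic_set_sb_d_ops(sb);
+ *	sb->s_root = d_make_root(root_inode);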
 */ -void generic_set_encrypted_ci_d_ops(struct dentry *dentry) +void generic_set_sb_d_ops(struct super_block *sb) { -#ifdef CONFIG_FS_ENCRYPTION - bool needs_encrypt_ops = dentry->d_flags & DCACHE_NOKEY_NAME; -#endif #if IS_ENABLED(CONFIG_UNICODE) - bool needs_ci_ops = dentry->d_sb->s_encoding; -#endif -#if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE) - if (needs_encrypt_ops && needs_ci_ops) { - d_set_d_op(dentry, &generic_encrypted_ci_dentry_ops); + if (sb->s_encoding) { + sb->s_d_op = &generic_ci_dentry_ops; return; } #endif #ifdef CONFIG_FS_ENCRYPTION - if (needs_encrypt_ops) { - d_set_d_op(dentry, &generic_encrypted_dentry_ops); - return; - } -#endif -#if IS_ENABLED(CONFIG_UNICODE) - if (needs_ci_ops) { - d_set_d_op(dentry, &generic_ci_dentry_ops); + if (sb->s_cop) { + sb->s_d_op = &generic_encrypted_dentry_ops; return; } #endif } -EXPORT_SYMBOL(generic_set_encrypted_ci_d_ops); +EXPORT_SYMBOL(generic_set_sb_d_ops); /** * inode_maybe_inc_iversion - increments i_version @@ -1973,3 +1986,144 @@ struct timespec64 simple_inode_init_ts(struct inode *inode) return ts; } EXPORT_SYMBOL(simple_inode_init_ts); + +static inline struct dentry *get_stashed_dentry(struct dentry *stashed) +{ + struct dentry *dentry; + + guard(rcu)(); + dentry = READ_ONCE(stashed); + if (!dentry) + return NULL; + if (!lockref_get_not_dead(&dentry->d_lockref)) + return NULL; + return dentry; +} + +static struct dentry *prepare_anon_dentry(struct dentry **stashed, + unsigned long ino, + struct super_block *sb, + void *data) +{ + struct dentry *dentry; + struct inode *inode; + const struct stashed_operations *sops = sb->s_fs_info; + + dentry = d_alloc_anon(sb); + if (!dentry) + return ERR_PTR(-ENOMEM); + + inode = new_inode_pseudo(sb); + if (!inode) { + dput(dentry); + return ERR_PTR(-ENOMEM); + } + + inode->i_ino = ino; + inode->i_flags |= S_IMMUTABLE; + inode->i_mode = S_IFREG; + simple_inode_init_ts(inode); + sops->init_inode(inode, data); + + /* Notice when this is changed. */ + WARN_ON_ONCE(!S_ISREG(inode->i_mode)); + WARN_ON_ONCE(!IS_IMMUTABLE(inode)); + + /* Store address of location where dentry's supposed to be stashed. */ + dentry->d_fsdata = stashed; + + /* @data is now owned by the fs */ + d_instantiate(dentry, inode); + return dentry; +} + +static struct dentry *stash_dentry(struct dentry **stashed, + struct dentry *dentry) +{ + guard(rcu)(); + for (;;) { + struct dentry *old; + + /* Assume any old dentry was cleared out. */ + old = cmpxchg(stashed, NULL, dentry); + if (likely(!old)) + return dentry; + + /* Check if somebody else installed a reusable dentry. */ + if (lockref_get_not_dead(&old->d_lockref)) + return old; + + /* There's an old dead dentry there, try to take it over. */ + if (likely(try_cmpxchg(stashed, &old, dentry))) + return dentry; + } +} + +/** + * path_from_stashed - create path from stashed or new dentry + * @stashed: where to retrieve or stash dentry + * @ino: inode number to use + * @mnt: mnt of the filesystem to use + * @data: data to store in inode->i_private + * @path: path to create + * + * The function tries to retrieve a stashed dentry from @stashed. If the dentry + * is still valid then it will be reused. If the dentry can't be reused, the + * function will allocate a new dentry and inode. It will then check again + * whether it can reuse an existing dentry in case one has been added in the + * meantime, or update @stashed with the newly added dentry. + * + * Special-purpose helper for nsfs and pidfs.
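+ *
+ * A rough usage sketch (hypothetical caller and field names, for
+ * illustration only):
+ *
+ *	struct path path;
+ *	int err;
+ *
+ *	err = path_from_stashed(&ns->stashed, ns->inum, mnt, ns, &path);
+ *	if (err)
+ *		return err;
+ *	// path.dentry and path.mnt now hold references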
+ * + * Return: On success zero and on failure a negative error is returned. + */ +int path_from_stashed(struct dentry **stashed, unsigned long ino, + struct vfsmount *mnt, void *data, struct path *path) +{ + struct dentry *dentry; + const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info; + + /* See if dentry can be reused. */ + path->dentry = get_stashed_dentry(*stashed); + if (path->dentry) { + sops->put_data(data); + goto out_path; + } + + /* Allocate a new dentry. */ + dentry = prepare_anon_dentry(stashed, ino, mnt->mnt_sb, data); + if (IS_ERR(dentry)) { + sops->put_data(data); + return PTR_ERR(dentry); + } + + /* Added a new dentry. @data is now owned by the filesystem. */ + path->dentry = stash_dentry(stashed, dentry); + if (path->dentry != dentry) + dput(dentry); + +out_path: + WARN_ON_ONCE(path->dentry->d_fsdata != stashed); + WARN_ON_ONCE(d_inode(path->dentry)->i_private != data); + path->mnt = mntget(mnt); + return 0; +} + +void stashed_dentry_prune(struct dentry *dentry) +{ + struct dentry **stashed = dentry->d_fsdata; + struct inode *inode = d_inode(dentry); + + if (WARN_ON_ONCE(!stashed)) + return; + + if (!inode) + return; + + /* + * Only replace our own @dentry as someone else might've + * already cleared out @dentry and stashed their own + * dentry in there. + */ + cmpxchg(stashed, dentry, NULL); +} diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c index 8161667c976f..527458db4525 100644 --- a/fs/lockd/clnt4xdr.c +++ b/fs/lockd/clnt4xdr.c @@ -243,7 +243,7 @@ static void encode_nlm4_holder(struct xdr_stream *xdr, u64 l_offset, l_len; __be32 *p; - encode_bool(xdr, lock->fl.fl_type == F_RDLCK); + encode_bool(xdr, lock->fl.c.flc_type == F_RDLCK); encode_int32(xdr, lock->svid); encode_netobj(xdr, lock->oh.data, lock->oh.len); @@ -270,7 +270,7 @@ static int decode_nlm4_holder(struct xdr_stream *xdr, struct nlm_res *result) goto out_overflow; exclusive = be32_to_cpup(p++); lock->svid = be32_to_cpup(p); - fl->fl_pid = (pid_t)lock->svid; + fl->c.flc_pid = (pid_t)lock->svid; error = decode_netobj(xdr, &lock->oh); if (unlikely(error)) @@ -280,8 +280,8 @@ static int decode_nlm4_holder(struct xdr_stream *xdr, struct nlm_res *result) if (unlikely(p == NULL)) goto out_overflow; - fl->fl_flags = FL_POSIX; - fl->fl_type = exclusive != 0 ? F_WRLCK : F_RDLCK; + fl->c.flc_flags = FL_POSIX; + fl->c.flc_type = exclusive != 0 ? 
F_WRLCK : F_RDLCK; p = xdr_decode_hyper(p, &l_offset); xdr_decode_hyper(p, &l_len); nlm4svc_set_file_lock_range(fl, l_offset, l_len); @@ -357,7 +357,7 @@ static void nlm4_xdr_enc_testargs(struct rpc_rqst *req, const struct nlm_lock *lock = &args->lock; encode_cookie(xdr, &args->cookie); - encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_bool(xdr, lock->fl.c.flc_type == F_WRLCK); encode_nlm4_lock(xdr, lock); } @@ -380,7 +380,7 @@ static void nlm4_xdr_enc_lockargs(struct rpc_rqst *req, encode_cookie(xdr, &args->cookie); encode_bool(xdr, args->block); - encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_bool(xdr, lock->fl.c.flc_type == F_WRLCK); encode_nlm4_lock(xdr, lock); encode_bool(xdr, args->reclaim); encode_int32(xdr, args->state); @@ -403,7 +403,7 @@ static void nlm4_xdr_enc_cancargs(struct rpc_rqst *req, encode_cookie(xdr, &args->cookie); encode_bool(xdr, args->block); - encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_bool(xdr, lock->fl.c.flc_type == F_WRLCK); encode_nlm4_lock(xdr, lock); } diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 5d85715be763..a7e0519ec024 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -185,7 +185,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) continue; if (!rpc_cmp_addr(nlm_addr(block->b_host), addr)) continue; - if (nfs_compare_fh(NFS_FH(file_inode(fl_blocked->fl_file)), fh) != 0) + if (nfs_compare_fh(NFS_FH(file_inode(fl_blocked->c.flc_file)), fh) != 0) continue; /* Alright, we found a lock. Set the return status * and wake up the caller diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index fba6c7fa7474..cebcc283b7ce 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -133,7 +133,8 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl) char *nodename = req->a_host->h_rpcclnt->cl_nodename; nlmclnt_next_cookie(&argp->cookie); - memcpy(&lock->fh, NFS_FH(file_inode(fl->fl_file)), sizeof(struct nfs_fh)); + memcpy(&lock->fh, NFS_FH(file_inode(fl->c.flc_file)), + sizeof(struct nfs_fh)); lock->caller = nodename; lock->oh.data = req->a_owner; lock->oh.len = snprintf(req->a_owner, sizeof(req->a_owner), "%u@%s", @@ -142,7 +143,7 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl) lock->svid = fl->fl_u.nfs_fl.owner->pid; lock->fl.fl_start = fl->fl_start; lock->fl.fl_end = fl->fl_end; - lock->fl.fl_type = fl->fl_type; + lock->fl.c.flc_type = fl->c.flc_type; } static void nlmclnt_release_lockargs(struct nlm_rqst *req) @@ -182,7 +183,7 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl, void *dat call->a_callback_data = data; if (IS_SETLK(cmd) || IS_SETLKW(cmd)) { - if (fl->fl_type != F_UNLCK) { + if (fl->c.flc_type != F_UNLCK) { call->a_args.block = IS_SETLKW(cmd) ? 
1 : 0; status = nlmclnt_lock(call, fl); } else @@ -432,13 +433,14 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl) { int status; - status = nlmclnt_call(nfs_file_cred(fl->fl_file), req, NLMPROC_TEST); + status = nlmclnt_call(nfs_file_cred(fl->c.flc_file), req, + NLMPROC_TEST); if (status < 0) goto out; switch (req->a_res.status) { case nlm_granted: - fl->fl_type = F_UNLCK; + fl->c.flc_type = F_UNLCK; break; case nlm_lck_denied: /* @@ -446,8 +448,8 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl) */ fl->fl_start = req->a_res.lock.fl.fl_start; fl->fl_end = req->a_res.lock.fl.fl_end; - fl->fl_type = req->a_res.lock.fl.fl_type; - fl->fl_pid = -req->a_res.lock.fl.fl_pid; + fl->c.flc_type = req->a_res.lock.fl.c.flc_type; + fl->c.flc_pid = -req->a_res.lock.fl.c.flc_pid; break; default: status = nlm_stat_to_errno(req->a_res.status); @@ -485,14 +487,15 @@ static const struct file_lock_operations nlmclnt_lock_ops = { static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host) { fl->fl_u.nfs_fl.state = 0; - fl->fl_u.nfs_fl.owner = nlmclnt_find_lockowner(host, fl->fl_owner); + fl->fl_u.nfs_fl.owner = nlmclnt_find_lockowner(host, + fl->c.flc_owner); INIT_LIST_HEAD(&fl->fl_u.nfs_fl.list); fl->fl_ops = &nlmclnt_lock_ops; } static int do_vfs_lock(struct file_lock *fl) { - return locks_lock_file_wait(fl->fl_file, fl); + return locks_lock_file_wait(fl->c.flc_file, fl); } /* @@ -518,12 +521,12 @@ static int do_vfs_lock(struct file_lock *fl) static int nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl) { - const struct cred *cred = nfs_file_cred(fl->fl_file); + const struct cred *cred = nfs_file_cred(fl->c.flc_file); struct nlm_host *host = req->a_host; struct nlm_res *resp = &req->a_res; struct nlm_wait block; - unsigned char fl_flags = fl->fl_flags; - unsigned char fl_type; + unsigned char flags = fl->c.flc_flags; + unsigned char type; __be32 b_status; int status = -ENOLCK; @@ -531,9 +534,9 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl) goto out; req->a_args.state = nsm_local_state; - fl->fl_flags |= FL_ACCESS; + fl->c.flc_flags |= FL_ACCESS; status = do_vfs_lock(fl); - fl->fl_flags = fl_flags; + fl->c.flc_flags = flags; if (status < 0) goto out; @@ -591,11 +594,11 @@ again: goto again; } /* Ensure the resulting lock will get added to granted list */ - fl->fl_flags |= FL_SLEEP; + fl->c.flc_flags |= FL_SLEEP; if (do_vfs_lock(fl) < 0) printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __func__); up_read(&host->h_rwsem); - fl->fl_flags = fl_flags; + fl->c.flc_flags = flags; status = 0; } if (status < 0) @@ -605,7 +608,7 @@ again: * cases NLM_LCK_DENIED is returned for a permanent error. So * turn it into an ENOLCK. 
*/ - if (resp->status == nlm_lck_denied && (fl_flags & FL_SLEEP)) + if (resp->status == nlm_lck_denied && (flags & FL_SLEEP)) status = -ENOLCK; else status = nlm_stat_to_errno(resp->status); @@ -622,13 +625,13 @@ out_unlock: req->a_host->h_addrlen, req->a_res.status); dprintk("lockd: lock attempt ended in fatal error.\n" " Attempting to unlock.\n"); - fl_type = fl->fl_type; - fl->fl_type = F_UNLCK; + type = fl->c.flc_type; + fl->c.flc_type = F_UNLCK; down_read(&host->h_rwsem); do_vfs_lock(fl); up_read(&host->h_rwsem); - fl->fl_type = fl_type; - fl->fl_flags = fl_flags; + fl->c.flc_type = type; + fl->c.flc_flags = flags; nlmclnt_async_call(cred, req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops); return status; } @@ -651,12 +654,14 @@ nlmclnt_reclaim(struct nlm_host *host, struct file_lock *fl, nlmclnt_setlockargs(req, fl); req->a_args.reclaim = 1; - status = nlmclnt_call(nfs_file_cred(fl->fl_file), req, NLMPROC_LOCK); + status = nlmclnt_call(nfs_file_cred(fl->c.flc_file), req, + NLMPROC_LOCK); if (status >= 0 && req->a_res.status == nlm_granted) return 0; printk(KERN_WARNING "lockd: failed to reclaim lock for pid %d " - "(errno %d, status %d)\n", fl->fl_pid, + "(errno %d, status %d)\n", + fl->c.flc_pid, status, ntohl(req->a_res.status)); /* @@ -683,26 +688,26 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) struct nlm_host *host = req->a_host; struct nlm_res *resp = &req->a_res; int status; - unsigned char fl_flags = fl->fl_flags; + unsigned char flags = fl->c.flc_flags; /* * Note: the server is supposed to either grant us the unlock * request, or to deny it with NLM_LCK_DENIED_GRACE_PERIOD. In either * case, we want to unlock. */ - fl->fl_flags |= FL_EXISTS; + fl->c.flc_flags |= FL_EXISTS; down_read(&host->h_rwsem); status = do_vfs_lock(fl); up_read(&host->h_rwsem); - fl->fl_flags = fl_flags; + fl->c.flc_flags = flags; if (status == -ENOENT) { status = 0; goto out; } refcount_inc(&req->a_count); - status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req, - NLMPROC_UNLOCK, &nlmclnt_unlock_ops); + status = nlmclnt_async_call(nfs_file_cred(fl->c.flc_file), req, + NLMPROC_UNLOCK, &nlmclnt_unlock_ops); if (status < 0) goto out; @@ -795,8 +800,8 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl req->a_args.block = block; refcount_inc(&req->a_count); - status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req, - NLMPROC_CANCEL, &nlmclnt_cancel_ops); + status = nlmclnt_async_call(nfs_file_cred(fl->c.flc_file), req, + NLMPROC_CANCEL, &nlmclnt_cancel_ops); if (status == 0 && req->a_res.status == nlm_lck_denied) status = -ENOLCK; nlmclnt_release_call(req); diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c index 4df62f635529..a3e97278b997 100644 --- a/fs/lockd/clntxdr.c +++ b/fs/lockd/clntxdr.c @@ -238,7 +238,7 @@ static void encode_nlm_holder(struct xdr_stream *xdr, u32 l_offset, l_len; __be32 *p; - encode_bool(xdr, lock->fl.fl_type == F_RDLCK); + encode_bool(xdr, lock->fl.c.flc_type == F_RDLCK); encode_int32(xdr, lock->svid); encode_netobj(xdr, lock->oh.data, lock->oh.len); @@ -265,7 +265,7 @@ static int decode_nlm_holder(struct xdr_stream *xdr, struct nlm_res *result) goto out_overflow; exclusive = be32_to_cpup(p++); lock->svid = be32_to_cpup(p); - fl->fl_pid = (pid_t)lock->svid; + fl->c.flc_pid = (pid_t)lock->svid; error = decode_netobj(xdr, &lock->oh); if (unlikely(error)) @@ -275,8 +275,8 @@ static int decode_nlm_holder(struct xdr_stream *xdr, struct nlm_res *result) if (unlikely(p == NULL)) goto out_overflow; - fl->fl_flags = FL_POSIX; - 
fl->fl_type = exclusive != 0 ? F_WRLCK : F_RDLCK; + fl->c.flc_flags = FL_POSIX; + fl->c.flc_type = exclusive != 0 ? F_WRLCK : F_RDLCK; l_offset = be32_to_cpup(p++); l_len = be32_to_cpup(p); end = l_offset + l_len - 1; @@ -357,7 +357,7 @@ static void nlm_xdr_enc_testargs(struct rpc_rqst *req, const struct nlm_lock *lock = &args->lock; encode_cookie(xdr, &args->cookie); - encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_bool(xdr, lock->fl.c.flc_type == F_WRLCK); encode_nlm_lock(xdr, lock); } @@ -380,7 +380,7 @@ static void nlm_xdr_enc_lockargs(struct rpc_rqst *req, encode_cookie(xdr, &args->cookie); encode_bool(xdr, args->block); - encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_bool(xdr, lock->fl.c.flc_type == F_WRLCK); encode_nlm_lock(xdr, lock); encode_bool(xdr, args->reclaim); encode_int32(xdr, args->state); @@ -403,7 +403,7 @@ static void nlm_xdr_enc_cancargs(struct rpc_rqst *req, encode_cookie(xdr, &args->cookie); encode_bool(xdr, args->block); - encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_bool(xdr, lock->fl.c.flc_type == F_WRLCK); encode_nlm_lock(xdr, lock); } diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index ce5862482097..ab8042a5b895 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -710,8 +710,6 @@ static const struct svc_version *nlmsvc_version[] = { #endif }; -static struct svc_stat nlmsvc_stats; - #define NLM_NRVERS ARRAY_SIZE(nlmsvc_version) static struct svc_program nlmsvc_program = { .pg_prog = NLM_PROGRAM, /* program number */ @@ -719,7 +717,6 @@ static struct svc_program nlmsvc_program = { .pg_vers = nlmsvc_version, /* version table */ .pg_name = "lockd", /* service name */ .pg_class = "nfsd", /* share authentication with nfsd */ - .pg_stats = &nlmsvc_stats, /* stats table */ .pg_authenticate = &lockd_authenticate, /* export authentication */ .pg_init_request = svc_generic_init_request, .pg_rpcbind_set = svc_generic_rpcbind_set, diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index b72023a6b4c1..8a72c418cdcc 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -52,16 +52,16 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, *filp = file; /* Set up the missing parts of the file_lock structure */ - lock->fl.fl_flags = FL_POSIX; - lock->fl.fl_file = file->f_file[mode]; - lock->fl.fl_pid = current->tgid; + lock->fl.c.flc_flags = FL_POSIX; + lock->fl.c.flc_file = file->f_file[mode]; + lock->fl.c.flc_pid = current->tgid; lock->fl.fl_start = (loff_t)lock->lock_start; lock->fl.fl_end = lock->lock_len ? (loff_t)(lock->lock_start + lock->lock_len - 1) : OFFSET_MAX; lock->fl.fl_lmops = &nlmsvc_lock_operations; nlmsvc_locks_init_private(&lock->fl, host, (pid_t)lock->svid); - if (!lock->fl.fl_owner) { + if (!lock->fl.c.flc_owner) { /* lockowner allocation has failed */ nlmsvc_release_host(host); return nlm_lck_denied_nolocks; @@ -106,7 +106,7 @@ __nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp) if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) return resp->status == nlm_drop_reply ? 
rpc_drop_reply :rpc_success; - test_owner = argp->lock.fl.fl_owner; + test_owner = argp->lock.fl.c.flc_owner; /* Now check for conflicting locks */ resp->status = nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie); if (resp->status == nlm_drop_reply) diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 2dc10900ad1c..1f2149db10f2 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -150,16 +150,17 @@ nlmsvc_lookup_block(struct nlm_file *file, struct nlm_lock *lock) struct file_lock *fl; dprintk("lockd: nlmsvc_lookup_block f=%p pd=%d %Ld-%Ld ty=%d\n", - file, lock->fl.fl_pid, + file, lock->fl.c.flc_pid, (long long)lock->fl.fl_start, - (long long)lock->fl.fl_end, lock->fl.fl_type); + (long long)lock->fl.fl_end, + lock->fl.c.flc_type); spin_lock(&nlm_blocked_lock); list_for_each_entry(block, &nlm_blocked, b_list) { fl = &block->b_call->a_args.lock.fl; dprintk("lockd: check f=%p pd=%d %Ld-%Ld ty=%d cookie=%s\n", - block->b_file, fl->fl_pid, + block->b_file, fl->c.flc_pid, (long long)fl->fl_start, - (long long)fl->fl_end, fl->fl_type, + (long long)fl->fl_end, fl->c.flc_type, nlmdbg_cookie2a(&block->b_call->a_args.cookie)); if (block->b_file == file && nlm_compare_locks(fl, &lock->fl)) { kref_get(&block->b_count); @@ -244,7 +245,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host, goto failed_free; /* Set notifier function for VFS, and init args */ - call->a_args.lock.fl.fl_flags |= FL_SLEEP; + call->a_args.lock.fl.c.flc_flags |= FL_SLEEP; call->a_args.lock.fl.fl_lmops = &nlmsvc_lock_operations; nlmclnt_next_cookie(&call->a_args.cookie); @@ -402,14 +403,14 @@ static struct nlm_lockowner *nlmsvc_find_lockowner(struct nlm_host *host, pid_t void nlmsvc_release_lockowner(struct nlm_lock *lock) { - if (lock->fl.fl_owner) - nlmsvc_put_lockowner(lock->fl.fl_owner); + if (lock->fl.c.flc_owner) + nlmsvc_put_lockowner(lock->fl.c.flc_owner); } void nlmsvc_locks_init_private(struct file_lock *fl, struct nlm_host *host, pid_t pid) { - fl->fl_owner = nlmsvc_find_lockowner(host, pid); + fl->c.flc_owner = nlmsvc_find_lockowner(host, pid); } /* @@ -425,7 +426,7 @@ static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock) /* set default data area */ call->a_args.lock.oh.data = call->a_owner; - call->a_args.lock.svid = ((struct nlm_lockowner *)lock->fl.fl_owner)->pid; + call->a_args.lock.svid = ((struct nlm_lockowner *) lock->fl.c.flc_owner)->pid; if (lock->oh.len > NLMCLNT_OHSIZE) { void *data = kmalloc(lock->oh.len, GFP_KERNEL); @@ -489,7 +490,8 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, dprintk("lockd: nlmsvc_lock(%s/%ld, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n", inode->i_sb->s_id, inode->i_ino, - lock->fl.fl_type, lock->fl.fl_pid, + lock->fl.c.flc_type, + lock->fl.c.flc_pid, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end, wait); @@ -512,7 +514,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; lock = &block->b_call->a_args.lock; } else - lock->fl.fl_flags &= ~FL_SLEEP; + lock->fl.c.flc_flags &= ~FL_SLEEP; if (block->b_flags & B_QUEUED) { dprintk("lockd: nlmsvc_lock deferred block %p flags %d\n", @@ -560,10 +562,10 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, spin_unlock(&nlm_blocked_lock); if (!wait) - lock->fl.fl_flags &= ~FL_SLEEP; + lock->fl.c.flc_flags &= ~FL_SLEEP; mode = lock_to_openmode(&lock->fl); error = vfs_lock_file(file->f_file[mode], F_SETLK, &lock->fl, NULL); - lock->fl.fl_flags &= ~FL_SLEEP; + lock->fl.c.flc_flags &= ~FL_SLEEP; dprintk("lockd: vfs_lock_file 
returned %d\n", error); switch (error) { @@ -616,7 +618,7 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, dprintk("lockd: nlmsvc_testlock(%s/%ld, ty=%d, %Ld-%Ld)\n", nlmsvc_file_inode(file)->i_sb->s_id, nlmsvc_file_inode(file)->i_ino, - lock->fl.fl_type, + lock->fl.c.flc_type, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); @@ -636,19 +638,19 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; } - if (lock->fl.fl_type == F_UNLCK) { + if (lock->fl.c.flc_type == F_UNLCK) { ret = nlm_granted; goto out; } dprintk("lockd: conflicting lock(ty=%d, %Ld-%Ld)\n", - lock->fl.fl_type, (long long)lock->fl.fl_start, + lock->fl.c.flc_type, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); conflock->caller = "somehost"; /* FIXME */ conflock->len = strlen(conflock->caller); conflock->oh.len = 0; /* don't return OH info */ - conflock->svid = lock->fl.fl_pid; - conflock->fl.fl_type = lock->fl.fl_type; + conflock->svid = lock->fl.c.flc_pid; + conflock->fl.c.flc_type = lock->fl.c.flc_type; conflock->fl.fl_start = lock->fl.fl_start; conflock->fl.fl_end = lock->fl.fl_end; locks_release_private(&lock->fl); @@ -673,21 +675,21 @@ nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock) dprintk("lockd: nlmsvc_unlock(%s/%ld, pi=%d, %Ld-%Ld)\n", nlmsvc_file_inode(file)->i_sb->s_id, nlmsvc_file_inode(file)->i_ino, - lock->fl.fl_pid, + lock->fl.c.flc_pid, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); /* First, cancel any lock that might be there */ nlmsvc_cancel_blocked(net, file, lock); - lock->fl.fl_type = F_UNLCK; - lock->fl.fl_file = file->f_file[O_RDONLY]; - if (lock->fl.fl_file) - error = vfs_lock_file(lock->fl.fl_file, F_SETLK, + lock->fl.c.flc_type = F_UNLCK; + lock->fl.c.flc_file = file->f_file[O_RDONLY]; + if (lock->fl.c.flc_file) + error = vfs_lock_file(lock->fl.c.flc_file, F_SETLK, &lock->fl, NULL); - lock->fl.fl_file = file->f_file[O_WRONLY]; - if (lock->fl.fl_file) - error |= vfs_lock_file(lock->fl.fl_file, F_SETLK, + lock->fl.c.flc_file = file->f_file[O_WRONLY]; + if (lock->fl.c.flc_file) + error |= vfs_lock_file(lock->fl.c.flc_file, F_SETLK, &lock->fl, NULL); return (error < 0)? 
nlm_lck_denied_nolocks : nlm_granted; @@ -710,7 +712,7 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l dprintk("lockd: nlmsvc_cancel(%s/%ld, pi=%d, %Ld-%Ld)\n", nlmsvc_file_inode(file)->i_sb->s_id, nlmsvc_file_inode(file)->i_ino, - lock->fl.fl_pid, + lock->fl.c.flc_pid, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); @@ -863,12 +865,12 @@ nlmsvc_grant_blocked(struct nlm_block *block) /* vfs_lock_file() can mangle fl_start and fl_end, but we need * them unchanged for the GRANT_MSG */ - lock->fl.fl_flags |= FL_SLEEP; + lock->fl.c.flc_flags |= FL_SLEEP; fl_start = lock->fl.fl_start; fl_end = lock->fl.fl_end; mode = lock_to_openmode(&lock->fl); error = vfs_lock_file(file->f_file[mode], F_SETLK, &lock->fl, NULL); - lock->fl.fl_flags &= ~FL_SLEEP; + lock->fl.c.flc_flags &= ~FL_SLEEP; lock->fl.fl_start = fl_start; lock->fl.fl_end = fl_end; @@ -993,8 +995,8 @@ nlmsvc_grant_reply(struct nlm_cookie *cookie, __be32 status) /* Client doesn't want it, just unlock it */ nlmsvc_unlink_block(block); fl = &block->b_call->a_args.lock.fl; - fl->fl_type = F_UNLCK; - error = vfs_lock_file(fl->fl_file, F_SETLK, fl, NULL); + fl->c.flc_type = F_UNLCK; + error = vfs_lock_file(fl->c.flc_file, F_SETLK, fl, NULL); if (error) pr_warn("lockd: unable to unlock lock rejected by client!\n"); break; diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 32784f508c81..a03220e66ce0 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -77,12 +77,12 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, /* Set up the missing parts of the file_lock structure */ mode = lock_to_openmode(&lock->fl); - lock->fl.fl_flags = FL_POSIX; - lock->fl.fl_file = file->f_file[mode]; - lock->fl.fl_pid = current->tgid; + lock->fl.c.flc_flags = FL_POSIX; + lock->fl.c.flc_file = file->f_file[mode]; + lock->fl.c.flc_pid = current->tgid; lock->fl.fl_lmops = &nlmsvc_lock_operations; nlmsvc_locks_init_private(&lock->fl, host, (pid_t)lock->svid); - if (!lock->fl.fl_owner) { + if (!lock->fl.c.flc_owner) { /* lockowner allocation has failed */ nlmsvc_release_host(host); return nlm_lck_denied_nolocks; @@ -127,7 +127,7 @@ __nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp) if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; - test_owner = argp->lock.fl.fl_owner; + test_owner = argp->lock.fl.c.flc_owner; /* Now check for conflicting locks */ resp->status = cast_status(nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie)); diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index e3b6229e7ae5..9103896164f6 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -73,7 +73,7 @@ static inline unsigned int file_hash(struct nfs_fh *f) int lock_to_openmode(struct file_lock *lock) { - return (lock->fl_type == F_WRLCK) ? O_WRONLY : O_RDONLY; + return lock_is_write(lock) ? 
O_WRONLY : O_RDONLY; } /* @@ -181,18 +181,18 @@ static int nlm_unlock_files(struct nlm_file *file, const struct file_lock *fl) struct file_lock lock; locks_init_lock(&lock); - lock.fl_type = F_UNLCK; + lock.c.flc_type = F_UNLCK; lock.fl_start = 0; lock.fl_end = OFFSET_MAX; - lock.fl_owner = fl->fl_owner; - lock.fl_pid = fl->fl_pid; - lock.fl_flags = FL_POSIX; + lock.c.flc_owner = fl->c.flc_owner; + lock.c.flc_pid = fl->c.flc_pid; + lock.c.flc_flags = FL_POSIX; - lock.fl_file = file->f_file[O_RDONLY]; - if (lock.fl_file && vfs_lock_file(lock.fl_file, F_SETLK, &lock, NULL)) + lock.c.flc_file = file->f_file[O_RDONLY]; + if (lock.c.flc_file && vfs_lock_file(lock.c.flc_file, F_SETLK, &lock, NULL)) goto out_err; - lock.fl_file = file->f_file[O_WRONLY]; - if (lock.fl_file && vfs_lock_file(lock.fl_file, F_SETLK, &lock, NULL)) + lock.c.flc_file = file->f_file[O_WRONLY]; + if (lock.c.flc_file && vfs_lock_file(lock.c.flc_file, F_SETLK, &lock, NULL)) goto out_err; return 0; out_err: @@ -218,14 +218,14 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file, again: file->f_locks = 0; spin_lock(&flctx->flc_lock); - list_for_each_entry(fl, &flctx->flc_posix, fl_list) { + for_each_file_lock(fl, &flctx->flc_posix) { if (fl->fl_lmops != &nlmsvc_lock_operations) continue; /* update current lock count */ file->f_locks++; - lockhost = ((struct nlm_lockowner *)fl->fl_owner)->host; + lockhost = ((struct nlm_lockowner *) fl->c.flc_owner)->host; if (match(lockhost, host)) { spin_unlock(&flctx->flc_lock); @@ -272,7 +272,7 @@ nlm_file_inuse(struct nlm_file *file) if (flctx && !list_empty_careful(&flctx->flc_posix)) { spin_lock(&flctx->flc_lock); - list_for_each_entry(fl, &flctx->flc_posix, fl_list) { + for_each_file_lock(fl, &flctx->flc_posix) { if (fl->fl_lmops == &nlmsvc_lock_operations) { spin_unlock(&flctx->flc_lock); return 1; diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index 2fb5748dae0c..adfcce2bf11b 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -88,8 +88,8 @@ svcxdr_decode_lock(struct xdr_stream *xdr, struct nlm_lock *lock) return false; locks_init_lock(fl); - fl->fl_flags = FL_POSIX; - fl->fl_type = F_RDLCK; + fl->c.flc_flags = FL_POSIX; + fl->c.flc_type = F_RDLCK; end = start + len - 1; fl->fl_start = s32_to_loff_t(start); if (len == 0 || end < 0) @@ -107,7 +107,7 @@ svcxdr_encode_holder(struct xdr_stream *xdr, const struct nlm_lock *lock) s32 start, len; /* exclusive */ - if (xdr_stream_encode_bool(xdr, fl->fl_type != F_RDLCK) < 0) + if (xdr_stream_encode_bool(xdr, fl->c.flc_type != F_RDLCK) < 0) return false; if (xdr_stream_encode_u32(xdr, lock->svid) < 0) return false; @@ -164,7 +164,7 @@ nlmsvc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) if (!svcxdr_decode_lock(xdr, &argp->lock)) return false; if (exclusive) - argp->lock.fl.fl_type = F_WRLCK; + argp->lock.fl.c.flc_type = F_WRLCK; return true; } @@ -184,7 +184,7 @@ nlmsvc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) if (!svcxdr_decode_lock(xdr, &argp->lock)) return false; if (exclusive) - argp->lock.fl.fl_type = F_WRLCK; + argp->lock.fl.c.flc_type = F_WRLCK; if (xdr_stream_decode_bool(xdr, &argp->reclaim) < 0) return false; if (xdr_stream_decode_u32(xdr, &argp->state) < 0) @@ -209,7 +209,7 @@ nlmsvc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) if (!svcxdr_decode_lock(xdr, &argp->lock)) return false; if (exclusive) - argp->lock.fl.fl_type = F_WRLCK; + argp->lock.fl.c.flc_type = F_WRLCK; return true; } @@ -223,7 +223,7 @@ nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, struct 
xdr_stream *xdr) return false; if (!svcxdr_decode_lock(xdr, &argp->lock)) return false; - argp->lock.fl.fl_type = F_UNLCK; + argp->lock.fl.c.flc_type = F_UNLCK; return true; } diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index 5fcbf30cd275..3d28b9c3ed15 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c @@ -89,8 +89,8 @@ svcxdr_decode_lock(struct xdr_stream *xdr, struct nlm_lock *lock) return false; locks_init_lock(fl); - fl->fl_flags = FL_POSIX; - fl->fl_type = F_RDLCK; + fl->c.flc_flags = FL_POSIX; + fl->c.flc_type = F_RDLCK; nlm4svc_set_file_lock_range(fl, lock->lock_start, lock->lock_len); return true; } @@ -102,7 +102,7 @@ svcxdr_encode_holder(struct xdr_stream *xdr, const struct nlm_lock *lock) s64 start, len; /* exclusive */ - if (xdr_stream_encode_bool(xdr, fl->fl_type != F_RDLCK) < 0) + if (xdr_stream_encode_bool(xdr, fl->c.flc_type != F_RDLCK) < 0) return false; if (xdr_stream_encode_u32(xdr, lock->svid) < 0) return false; @@ -159,7 +159,7 @@ nlm4svc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) if (!svcxdr_decode_lock(xdr, &argp->lock)) return false; if (exclusive) - argp->lock.fl.fl_type = F_WRLCK; + argp->lock.fl.c.flc_type = F_WRLCK; return true; } @@ -179,7 +179,7 @@ nlm4svc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) if (!svcxdr_decode_lock(xdr, &argp->lock)) return false; if (exclusive) - argp->lock.fl.fl_type = F_WRLCK; + argp->lock.fl.c.flc_type = F_WRLCK; if (xdr_stream_decode_bool(xdr, &argp->reclaim) < 0) return false; if (xdr_stream_decode_u32(xdr, &argp->state) < 0) @@ -204,7 +204,7 @@ nlm4svc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) if (!svcxdr_decode_lock(xdr, &argp->lock)) return false; if (exclusive) - argp->lock.fl.fl_type = F_WRLCK; + argp->lock.fl.c.flc_type = F_WRLCK; return true; } @@ -218,7 +218,7 @@ nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) return false; if (!svcxdr_decode_lock(xdr, &argp->lock)) return false; - argp->lock.fl.fl_type = F_UNLCK; + argp->lock.fl.c.flc_type = F_UNLCK; return true; } diff --git a/fs/locks.c b/fs/locks.c index cc7c117ee192..90c8746874de 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -48,7 +48,6 @@ * children. 
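*
* (A minimal sketch of the conversion this patch applies throughout, using
* only names the patch itself introduces: callers that previously wrote the
* flat file_lock fields
*
*	fl->fl_flags = FL_POSIX;
*	fl->fl_type = F_RDLCK;
*	fl->fl_pid = current->tgid;
*
* now write the same values through the embedded file_lock_core
*
*	fl->c.flc_flags = FL_POSIX;
*	fl->c.flc_type = F_RDLCK;
*	fl->c.flc_pid = current->tgid;
*
* while the byte-range fields fl_start and fl_end stay on struct file_lock.)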
* */ - #include <linux/capability.h> #include <linux/file.h> #include <linux/fdtable.h> @@ -70,24 +69,28 @@ #include <linux/uaccess.h> -#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) -#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) -#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT)) -#define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK) -#define IS_REMOTELCK(fl) (fl->fl_pid <= 0) +static struct file_lock *file_lock(struct file_lock_core *flc) +{ + return container_of(flc, struct file_lock, c); +} + +static struct file_lease *file_lease(struct file_lock_core *flc) +{ + return container_of(flc, struct file_lease, c); +} -static bool lease_breaking(struct file_lock *fl) +static bool lease_breaking(struct file_lease *fl) { - return fl->fl_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING); + return fl->c.flc_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING); } -static int target_leasetype(struct file_lock *fl) +static int target_leasetype(struct file_lease *fl) { - if (fl->fl_flags & FL_UNLOCK_PENDING) + if (fl->c.flc_flags & FL_UNLOCK_PENDING) return F_UNLCK; - if (fl->fl_flags & FL_DOWNGRADE_PENDING) + if (fl->c.flc_flags & FL_DOWNGRADE_PENDING) return F_RDLCK; - return fl->fl_type; + return fl->c.flc_type; } static int leases_enable = 1; @@ -168,6 +171,7 @@ static DEFINE_SPINLOCK(blocked_lock_lock); static struct kmem_cache *flctx_cache __ro_after_init; static struct kmem_cache *filelock_cache __ro_after_init; +static struct kmem_cache *filelease_cache __ro_after_init; static struct file_lock_context * locks_get_lock_context(struct inode *inode, int type) @@ -204,11 +208,12 @@ out: static void locks_dump_ctx_list(struct list_head *list, char *list_type) { - struct file_lock *fl; + struct file_lock_core *flc; - list_for_each_entry(fl, list, fl_list) { - pr_warn("%s: fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n", list_type, fl->fl_owner, fl->fl_flags, fl->fl_type, fl->fl_pid); - } + list_for_each_entry(flc, list, flc_list) + pr_warn("%s: fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n", + list_type, flc->flc_owner, flc->flc_flags, + flc->flc_type, flc->flc_pid); } static void @@ -229,19 +234,19 @@ locks_check_ctx_lists(struct inode *inode) } static void -locks_check_ctx_file_list(struct file *filp, struct list_head *list, - char *list_type) +locks_check_ctx_file_list(struct file *filp, struct list_head *list, char *list_type) { - struct file_lock *fl; + struct file_lock_core *flc; struct inode *inode = file_inode(filp); - list_for_each_entry(fl, list, fl_list) - if (fl->fl_file == filp) + list_for_each_entry(flc, list, flc_list) + if (flc->flc_file == filp) pr_warn("Leaked %s lock on dev=0x%x:0x%x ino=0x%lx " " fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n", list_type, MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), inode->i_ino, - fl->fl_owner, fl->fl_flags, fl->fl_type, fl->fl_pid); + flc->flc_owner, flc->flc_flags, + flc->flc_type, flc->flc_pid); } void @@ -255,13 +260,13 @@ locks_free_lock_context(struct inode *inode) } } -static void locks_init_lock_heads(struct file_lock *fl) +static void locks_init_lock_heads(struct file_lock_core *flc) { - INIT_HLIST_NODE(&fl->fl_link); - INIT_LIST_HEAD(&fl->fl_list); - INIT_LIST_HEAD(&fl->fl_blocked_requests); - INIT_LIST_HEAD(&fl->fl_blocked_member); - init_waitqueue_head(&fl->fl_wait); + INIT_HLIST_NODE(&flc->flc_link); + INIT_LIST_HEAD(&flc->flc_list); + INIT_LIST_HEAD(&flc->flc_blocked_requests); + INIT_LIST_HEAD(&flc->flc_blocked_member); + init_waitqueue_head(&flc->flc_wait); } /* Allocate an empty lock 
structure. */ @@ -270,19 +275,33 @@ struct file_lock *locks_alloc_lock(void) struct file_lock *fl = kmem_cache_zalloc(filelock_cache, GFP_KERNEL); if (fl) - locks_init_lock_heads(fl); + locks_init_lock_heads(&fl->c); return fl; } EXPORT_SYMBOL_GPL(locks_alloc_lock); +/* Allocate an empty lock structure. */ +struct file_lease *locks_alloc_lease(void) +{ + struct file_lease *fl = kmem_cache_zalloc(filelease_cache, GFP_KERNEL); + + if (fl) + locks_init_lock_heads(&fl->c); + + return fl; +} +EXPORT_SYMBOL_GPL(locks_alloc_lease); + void locks_release_private(struct file_lock *fl) { - BUG_ON(waitqueue_active(&fl->fl_wait)); - BUG_ON(!list_empty(&fl->fl_list)); - BUG_ON(!list_empty(&fl->fl_blocked_requests)); - BUG_ON(!list_empty(&fl->fl_blocked_member)); - BUG_ON(!hlist_unhashed(&fl->fl_link)); + struct file_lock_core *flc = &fl->c; + + BUG_ON(waitqueue_active(&flc->flc_wait)); + BUG_ON(!list_empty(&flc->flc_list)); + BUG_ON(!list_empty(&flc->flc_blocked_requests)); + BUG_ON(!list_empty(&flc->flc_blocked_member)); + BUG_ON(!hlist_unhashed(&flc->flc_link)); if (fl->fl_ops) { if (fl->fl_ops->fl_release_private) @@ -292,8 +311,8 @@ void locks_release_private(struct file_lock *fl) if (fl->fl_lmops) { if (fl->fl_lmops->lm_put_owner) { - fl->fl_lmops->lm_put_owner(fl->fl_owner); - fl->fl_owner = NULL; + fl->fl_lmops->lm_put_owner(flc->flc_owner); + flc->flc_owner = NULL; } fl->fl_lmops = NULL; } @@ -309,16 +328,15 @@ EXPORT_SYMBOL_GPL(locks_release_private); * %true: @owner has at least one blocker * %false: @owner has no blockers */ -bool locks_owner_has_blockers(struct file_lock_context *flctx, - fl_owner_t owner) +bool locks_owner_has_blockers(struct file_lock_context *flctx, fl_owner_t owner) { - struct file_lock *fl; + struct file_lock_core *flc; spin_lock(&flctx->flc_lock); - list_for_each_entry(fl, &flctx->flc_posix, fl_list) { - if (fl->fl_owner != owner) + list_for_each_entry(flc, &flctx->flc_posix, flc_list) { + if (flc->flc_owner != owner) continue; - if (!list_empty(&fl->fl_blocked_requests)) { + if (!list_empty(&flc->flc_blocked_requests)) { spin_unlock(&flctx->flc_lock); return true; } @@ -336,35 +354,52 @@ void locks_free_lock(struct file_lock *fl) } EXPORT_SYMBOL(locks_free_lock); +/* Free a lease which is not in use. */ +void locks_free_lease(struct file_lease *fl) +{ + kmem_cache_free(filelease_cache, fl); +} +EXPORT_SYMBOL(locks_free_lease); + static void locks_dispose_list(struct list_head *dispose) { - struct file_lock *fl; + struct file_lock_core *flc; while (!list_empty(dispose)) { - fl = list_first_entry(dispose, struct file_lock, fl_list); - list_del_init(&fl->fl_list); - locks_free_lock(fl); + flc = list_first_entry(dispose, struct file_lock_core, flc_list); + list_del_init(&flc->flc_list); + if (flc->flc_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT)) + locks_free_lease(file_lease(flc)); + else + locks_free_lock(file_lock(flc)); } } void locks_init_lock(struct file_lock *fl) { memset(fl, 0, sizeof(struct file_lock)); - locks_init_lock_heads(fl); + locks_init_lock_heads(&fl->c); } EXPORT_SYMBOL(locks_init_lock); +void locks_init_lease(struct file_lease *fl) +{ + memset(fl, 0, sizeof(*fl)); + locks_init_lock_heads(&fl->c); +} +EXPORT_SYMBOL(locks_init_lease); + /* * Initialize a new lock from an existing file_lock structure. 
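*
* In sketch form, the main consumer of this copy is lock testing: a probe
* lock handed to posix_test_lock() comes back describing any conflict
* (only functions defined in this file are used here):
*
*	fl->c.flc_type = F_RDLCK;	-- the probe
*	posix_test_lock(filp, fl);
*	if (fl->c.flc_type != F_UNLCK)
*		-- fl is now a conflock copy: owner, pid, flags and range
*		-- of the conflicting lock, with c.flc_file left NULL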
*/ void locks_copy_conflock(struct file_lock *new, struct file_lock *fl) { - new->fl_owner = fl->fl_owner; - new->fl_pid = fl->fl_pid; - new->fl_file = NULL; - new->fl_flags = fl->fl_flags; - new->fl_type = fl->fl_type; + new->c.flc_owner = fl->c.flc_owner; + new->c.flc_pid = fl->c.flc_pid; + new->c.flc_file = NULL; + new->c.flc_flags = fl->c.flc_flags; + new->c.flc_type = fl->c.flc_type; new->fl_start = fl->fl_start; new->fl_end = fl->fl_end; new->fl_lmops = fl->fl_lmops; @@ -372,7 +407,7 @@ void locks_copy_conflock(struct file_lock *new, struct file_lock *fl) if (fl->fl_lmops) { if (fl->fl_lmops->lm_get_owner) - fl->fl_lmops->lm_get_owner(fl->fl_owner); + fl->fl_lmops->lm_get_owner(fl->c.flc_owner); } } EXPORT_SYMBOL(locks_copy_conflock); @@ -384,7 +419,7 @@ void locks_copy_lock(struct file_lock *new, struct file_lock *fl) locks_copy_conflock(new, fl); - new->fl_file = fl->fl_file; + new->c.flc_file = fl->c.flc_file; new->fl_ops = fl->fl_ops; if (fl->fl_ops) { @@ -400,15 +435,17 @@ static void locks_move_blocks(struct file_lock *new, struct file_lock *fl) /* * As ctx->flc_lock is held, new requests cannot be added to - * ->fl_blocked_requests, so we don't need a lock to check if it + * ->flc_blocked_requests, so we don't need a lock to check if it * is empty. */ - if (list_empty(&fl->fl_blocked_requests)) + if (list_empty(&fl->c.flc_blocked_requests)) return; spin_lock(&blocked_lock_lock); - list_splice_init(&fl->fl_blocked_requests, &new->fl_blocked_requests); - list_for_each_entry(f, &new->fl_blocked_requests, fl_blocked_member) - f->fl_blocker = new; + list_splice_init(&fl->c.flc_blocked_requests, + &new->c.flc_blocked_requests); + list_for_each_entry(f, &new->c.flc_blocked_requests, + c.flc_blocked_member) + f->c.flc_blocker = &new->c; spin_unlock(&blocked_lock_lock); } @@ -429,21 +466,21 @@ static void flock_make_lock(struct file *filp, struct file_lock *fl, int type) { locks_init_lock(fl); - fl->fl_file = filp; - fl->fl_owner = filp; - fl->fl_pid = current->tgid; - fl->fl_flags = FL_FLOCK; - fl->fl_type = type; + fl->c.flc_file = filp; + fl->c.flc_owner = filp; + fl->c.flc_pid = current->tgid; + fl->c.flc_flags = FL_FLOCK; + fl->c.flc_type = type; fl->fl_end = OFFSET_MAX; } -static int assign_type(struct file_lock *fl, int type) +static int assign_type(struct file_lock_core *flc, int type) { switch (type) { case F_RDLCK: case F_WRLCK: case F_UNLCK: - fl->fl_type = type; + flc->flc_type = type; break; default: return -EINVAL; @@ -488,14 +525,14 @@ static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl, } else fl->fl_end = OFFSET_MAX; - fl->fl_owner = current->files; - fl->fl_pid = current->tgid; - fl->fl_file = filp; - fl->fl_flags = FL_POSIX; + fl->c.flc_owner = current->files; + fl->c.flc_pid = current->tgid; + fl->c.flc_file = filp; + fl->c.flc_flags = FL_POSIX; fl->fl_ops = NULL; fl->fl_lmops = NULL; - return assign_type(fl, l->l_type); + return assign_type(&fl->c, l->l_type); } /* Verify a "struct flock" and copy it to a "struct file_lock" as a POSIX @@ -516,16 +553,16 @@ static int flock_to_posix_lock(struct file *filp, struct file_lock *fl, /* default lease lock manager operations */ static bool -lease_break_callback(struct file_lock *fl) +lease_break_callback(struct file_lease *fl) { kill_fasync(&fl->fl_fasync, SIGIO, POLL_MSG); return false; } static void -lease_setup(struct file_lock *fl, void **priv) +lease_setup(struct file_lease *fl, void **priv) { - struct file *filp = fl->fl_file; + struct file *filp = fl->c.flc_file; struct fasync_struct *fa = 
*priv; /* @@ -539,7 +576,7 @@ lease_setup(struct file_lock *fl, void **priv) __f_setown(filp, task_pid(current), PIDTYPE_TGID, 0); } -static const struct lock_manager_operations lease_manager_ops = { +static const struct lease_manager_operations lease_manager_ops = { .lm_break = lease_break_callback, .lm_change = lease_modify, .lm_setup = lease_setup, @@ -548,27 +585,24 @@ static const struct lock_manager_operations lease_manager_ops = { /* * Initialize a lease, use the default lock manager operations */ -static int lease_init(struct file *filp, int type, struct file_lock *fl) +static int lease_init(struct file *filp, int type, struct file_lease *fl) { - if (assign_type(fl, type) != 0) + if (assign_type(&fl->c, type) != 0) return -EINVAL; - fl->fl_owner = filp; - fl->fl_pid = current->tgid; + fl->c.flc_owner = filp; + fl->c.flc_pid = current->tgid; - fl->fl_file = filp; - fl->fl_flags = FL_LEASE; - fl->fl_start = 0; - fl->fl_end = OFFSET_MAX; - fl->fl_ops = NULL; + fl->c.flc_file = filp; + fl->c.flc_flags = FL_LEASE; fl->fl_lmops = &lease_manager_ops; return 0; } /* Allocate a file_lock initialised to this type of lease */ -static struct file_lock *lease_alloc(struct file *filp, int type) +static struct file_lease *lease_alloc(struct file *filp, int type) { - struct file_lock *fl = locks_alloc_lock(); + struct file_lease *fl = locks_alloc_lease(); int error = -ENOMEM; if (fl == NULL) @@ -576,7 +610,7 @@ static struct file_lock *lease_alloc(struct file *filp, int type) error = lease_init(filp, type, fl); if (error) { - locks_free_lock(fl); + locks_free_lease(fl); return ERR_PTR(error); } return fl; @@ -593,26 +627,26 @@ static inline int locks_overlap(struct file_lock *fl1, struct file_lock *fl2) /* * Check whether two locks have the same owner. */ -static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2) +static int posix_same_owner(struct file_lock_core *fl1, struct file_lock_core *fl2) { - return fl1->fl_owner == fl2->fl_owner; + return fl1->flc_owner == fl2->flc_owner; } /* Must be called with the flc_lock held! */ -static void locks_insert_global_locks(struct file_lock *fl) +static void locks_insert_global_locks(struct file_lock_core *flc) { struct file_lock_list_struct *fll = this_cpu_ptr(&file_lock_list); percpu_rwsem_assert_held(&file_rwsem); spin_lock(&fll->lock); - fl->fl_link_cpu = smp_processor_id(); - hlist_add_head(&fl->fl_link, &fll->hlist); + flc->flc_link_cpu = smp_processor_id(); + hlist_add_head(&flc->flc_link, &fll->hlist); spin_unlock(&fll->lock); } /* Must be called with the flc_lock held! */ -static void locks_delete_global_locks(struct file_lock *fl) +static void locks_delete_global_locks(struct file_lock_core *flc) { struct file_lock_list_struct *fll; @@ -623,33 +657,33 @@ static void locks_delete_global_locks(struct file_lock *fl) * is done while holding the flc_lock, and new insertions into the list * also require that it be held. 
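*
* The per-cpu discipline, sketched with the field names used here:
* insertion under flc_lock records the current CPU, so that deletion can
* find the matching per-cpu list without any global serialisation:
*
*	flc->flc_link_cpu = smp_processor_id();		-- at insert time
*	fll = per_cpu_ptr(&file_lock_list, flc->flc_link_cpu);	-- at delete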
*/ - if (hlist_unhashed(&fl->fl_link)) + if (hlist_unhashed(&flc->flc_link)) return; - fll = per_cpu_ptr(&file_lock_list, fl->fl_link_cpu); + fll = per_cpu_ptr(&file_lock_list, flc->flc_link_cpu); spin_lock(&fll->lock); - hlist_del_init(&fl->fl_link); + hlist_del_init(&flc->flc_link); spin_unlock(&fll->lock); } static unsigned long -posix_owner_key(struct file_lock *fl) +posix_owner_key(struct file_lock_core *flc) { - return (unsigned long)fl->fl_owner; + return (unsigned long) flc->flc_owner; } -static void locks_insert_global_blocked(struct file_lock *waiter) +static void locks_insert_global_blocked(struct file_lock_core *waiter) { lockdep_assert_held(&blocked_lock_lock); - hash_add(blocked_hash, &waiter->fl_link, posix_owner_key(waiter)); + hash_add(blocked_hash, &waiter->flc_link, posix_owner_key(waiter)); } -static void locks_delete_global_blocked(struct file_lock *waiter) +static void locks_delete_global_blocked(struct file_lock_core *waiter) { lockdep_assert_held(&blocked_lock_lock); - hash_del(&waiter->fl_link); + hash_del(&waiter->flc_link); } /* Remove waiter from blocker's block list. @@ -657,41 +691,39 @@ static void locks_delete_global_blocked(struct file_lock *waiter) * * Must be called with blocked_lock_lock held. */ -static void __locks_delete_block(struct file_lock *waiter) +static void __locks_unlink_block(struct file_lock_core *waiter) { locks_delete_global_blocked(waiter); - list_del_init(&waiter->fl_blocked_member); + list_del_init(&waiter->flc_blocked_member); } -static void __locks_wake_up_blocks(struct file_lock *blocker) +static void __locks_wake_up_blocks(struct file_lock_core *blocker) { - while (!list_empty(&blocker->fl_blocked_requests)) { - struct file_lock *waiter; + while (!list_empty(&blocker->flc_blocked_requests)) { + struct file_lock_core *waiter; + struct file_lock *fl; - waiter = list_first_entry(&blocker->fl_blocked_requests, - struct file_lock, fl_blocked_member); - __locks_delete_block(waiter); - if (waiter->fl_lmops && waiter->fl_lmops->lm_notify) - waiter->fl_lmops->lm_notify(waiter); + waiter = list_first_entry(&blocker->flc_blocked_requests, + struct file_lock_core, flc_blocked_member); + + fl = file_lock(waiter); + __locks_unlink_block(waiter); + if ((waiter->flc_flags & (FL_POSIX | FL_FLOCK)) && + fl->fl_lmops && fl->fl_lmops->lm_notify) + fl->fl_lmops->lm_notify(fl); else - wake_up(&waiter->fl_wait); + locks_wake_up(fl); /* - * The setting of fl_blocker to NULL marks the "done" + * The setting of flc_blocker to NULL marks the "done" * point in deleting a block. Paired with acquire at the top * of locks_delete_block(). */ - smp_store_release(&waiter->fl_blocker, NULL); + smp_store_release(&waiter->flc_blocker, NULL); } } -/** - * locks_delete_block - stop waiting for a file lock - * @waiter: the lock which was waiting - * - * lockd/nfsd need to disconnect the lock while working on it. - */ -int locks_delete_block(struct file_lock *waiter) +static int __locks_delete_block(struct file_lock_core *waiter) { int status = -ENOENT; @@ -716,24 +748,35 @@ int locks_delete_block(struct file_lock *waiter) * no new locks can be inserted into its fl_blocked_requests list, and * can avoid doing anything further if the list is empty. 
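*
* The ordering contract, in sketch form (names as in this patch): the
* waker publishes completion with a release store,
*
*	smp_store_release(&waiter->flc_blocker, NULL);
*
* which pairs with the acquire load below, so a waiter that observes a
* NULL flc_blocker also observes the list updates that preceded it.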
*/ - if (!smp_load_acquire(&waiter->fl_blocker) && - list_empty(&waiter->fl_blocked_requests)) + if (!smp_load_acquire(&waiter->flc_blocker) && + list_empty(&waiter->flc_blocked_requests)) return status; spin_lock(&blocked_lock_lock); - if (waiter->fl_blocker) + if (waiter->flc_blocker) status = 0; __locks_wake_up_blocks(waiter); - __locks_delete_block(waiter); + __locks_unlink_block(waiter); /* * The setting of fl_blocker to NULL marks the "done" point in deleting * a block. Paired with acquire at the top of this function. */ - smp_store_release(&waiter->fl_blocker, NULL); + smp_store_release(&waiter->flc_blocker, NULL); spin_unlock(&blocked_lock_lock); return status; } + +/** + * locks_delete_block - stop waiting for a file lock + * @waiter: the lock which was waiting + * + * lockd/nfsd need to disconnect the lock while working on it. + */ +int locks_delete_block(struct file_lock *waiter) +{ + return __locks_delete_block(&waiter->c); +} EXPORT_SYMBOL(locks_delete_block); /* Insert waiter into blocker's block list. @@ -751,26 +794,28 @@ EXPORT_SYMBOL(locks_delete_block); * waiters, and add beneath any waiter that blocks the new waiter. * Thus wakeups don't happen until needed. */ -static void __locks_insert_block(struct file_lock *blocker, - struct file_lock *waiter, - bool conflict(struct file_lock *, - struct file_lock *)) +static void __locks_insert_block(struct file_lock_core *blocker, + struct file_lock_core *waiter, + bool conflict(struct file_lock_core *, + struct file_lock_core *)) { - struct file_lock *fl; - BUG_ON(!list_empty(&waiter->fl_blocked_member)); + struct file_lock_core *flc; + BUG_ON(!list_empty(&waiter->flc_blocked_member)); new_blocker: - list_for_each_entry(fl, &blocker->fl_blocked_requests, fl_blocked_member) - if (conflict(fl, waiter)) { - blocker = fl; + list_for_each_entry(flc, &blocker->flc_blocked_requests, flc_blocked_member) + if (conflict(flc, waiter)) { + blocker = flc; goto new_blocker; } - waiter->fl_blocker = blocker; - list_add_tail(&waiter->fl_blocked_member, &blocker->fl_blocked_requests); - if (IS_POSIX(blocker) && !IS_OFDLCK(blocker)) + waiter->flc_blocker = blocker; + list_add_tail(&waiter->flc_blocked_member, + &blocker->flc_blocked_requests); + + if ((blocker->flc_flags & (FL_POSIX|FL_OFDLCK)) == FL_POSIX) locks_insert_global_blocked(waiter); - /* The requests in waiter->fl_blocked are known to conflict with + /* The requests in waiter->flc_blocked are known to conflict with * waiter, but might not conflict with blocker, or the requests * and lock which block it. So they all need to be woken. */ @@ -778,10 +823,10 @@ new_blocker: } /* Must be called with flc_lock held. */ -static void locks_insert_block(struct file_lock *blocker, - struct file_lock *waiter, - bool conflict(struct file_lock *, - struct file_lock *)) +static void locks_insert_block(struct file_lock_core *blocker, + struct file_lock_core *waiter, + bool conflict(struct file_lock_core *, + struct file_lock_core *)) { spin_lock(&blocked_lock_lock); __locks_insert_block(blocker, waiter, conflict); @@ -793,7 +838,7 @@ static void locks_insert_block(struct file_lock *blocker, * * Must be called with the inode->flc_lock held! */ -static void locks_wake_up_blocks(struct file_lock *blocker) +static void locks_wake_up_blocks(struct file_lock_core *blocker) { /* * Avoid taking global lock if list is empty. 
This is safe since new @@ -802,7 +847,7 @@ static void locks_wake_up_blocks(struct file_lock *blocker) * fl_blocked_requests list does not require the flc_lock, so we must * recheck list_empty() after acquiring the blocked_lock_lock. */ - if (list_empty(&blocker->fl_blocked_requests)) + if (list_empty(&blocker->flc_blocked_requests)) return; spin_lock(&blocked_lock_lock); @@ -811,39 +856,39 @@ static void locks_wake_up_blocks(struct file_lock *blocker) } static void -locks_insert_lock_ctx(struct file_lock *fl, struct list_head *before) +locks_insert_lock_ctx(struct file_lock_core *fl, struct list_head *before) { - list_add_tail(&fl->fl_list, before); + list_add_tail(&fl->flc_list, before); locks_insert_global_locks(fl); } static void -locks_unlink_lock_ctx(struct file_lock *fl) +locks_unlink_lock_ctx(struct file_lock_core *fl) { locks_delete_global_locks(fl); - list_del_init(&fl->fl_list); + list_del_init(&fl->flc_list); locks_wake_up_blocks(fl); } static void -locks_delete_lock_ctx(struct file_lock *fl, struct list_head *dispose) +locks_delete_lock_ctx(struct file_lock_core *fl, struct list_head *dispose) { locks_unlink_lock_ctx(fl); if (dispose) - list_add(&fl->fl_list, dispose); + list_add(&fl->flc_list, dispose); else - locks_free_lock(fl); + locks_free_lock(file_lock(fl)); } /* Determine if lock sys_fl blocks lock caller_fl. Common functionality * checks for shared/exclusive status of overlapping locks. */ -static bool locks_conflict(struct file_lock *caller_fl, - struct file_lock *sys_fl) +static bool locks_conflict(struct file_lock_core *caller_flc, + struct file_lock_core *sys_flc) { - if (sys_fl->fl_type == F_WRLCK) + if (sys_flc->flc_type == F_WRLCK) return true; - if (caller_fl->fl_type == F_WRLCK) + if (caller_flc->flc_type == F_WRLCK) return true; return false; } @@ -851,20 +896,23 @@ static bool locks_conflict(struct file_lock *caller_fl, /* Determine if lock sys_fl blocks lock caller_fl. POSIX specific * checking before calling the locks_conflict(). */ -static bool posix_locks_conflict(struct file_lock *caller_fl, - struct file_lock *sys_fl) +static bool posix_locks_conflict(struct file_lock_core *caller_flc, + struct file_lock_core *sys_flc) { + struct file_lock *caller_fl = file_lock(caller_flc); + struct file_lock *sys_fl = file_lock(sys_flc); + /* POSIX locks owned by the same process do not conflict with * each other. */ - if (posix_same_owner(caller_fl, sys_fl)) + if (posix_same_owner(caller_flc, sys_flc)) return false; /* Check whether they overlap */ if (!locks_overlap(caller_fl, sys_fl)) return false; - return locks_conflict(caller_fl, sys_fl); + return locks_conflict(caller_flc, sys_flc); } /* Determine if lock sys_fl blocks lock caller_fl. Used on xx_GETLK @@ -873,28 +921,31 @@ static bool posix_locks_conflict(struct file_lock *caller_fl, static bool posix_test_locks_conflict(struct file_lock *caller_fl, struct file_lock *sys_fl) { + struct file_lock_core *caller = &caller_fl->c; + struct file_lock_core *sys = &sys_fl->c; + /* F_UNLCK checks any locks on the same fd. */ - if (caller_fl->fl_type == F_UNLCK) { - if (!posix_same_owner(caller_fl, sys_fl)) + if (lock_is_unlock(caller_fl)) { + if (!posix_same_owner(caller, sys)) return false; return locks_overlap(caller_fl, sys_fl); } - return posix_locks_conflict(caller_fl, sys_fl); + return posix_locks_conflict(caller, sys); } /* Determine if lock sys_fl blocks lock caller_fl. FLOCK specific * checking before calling the locks_conflict(). 
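*
* The shared/exclusive core test that the POSIX and FLOCK paths both
* delegate to reduces to (an equivalent sketch of locks_conflict() above):
*
*	return sys_flc->flc_type == F_WRLCK ||
*	       caller_flc->flc_type == F_WRLCK;
*
* so two readers never conflict, and a writer conflicts with everything.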
*/ -static bool flock_locks_conflict(struct file_lock *caller_fl, - struct file_lock *sys_fl) +static bool flock_locks_conflict(struct file_lock_core *caller_flc, + struct file_lock_core *sys_flc) { /* FLOCK locks referring to the same filp do not conflict with * each other. */ - if (caller_fl->fl_file == sys_fl->fl_file) + if (caller_flc->flc_file == sys_flc->flc_file) return false; - return locks_conflict(caller_fl, sys_fl); + return locks_conflict(caller_flc, sys_flc); } void @@ -908,13 +959,13 @@ posix_test_lock(struct file *filp, struct file_lock *fl) ctx = locks_inode_context(inode); if (!ctx || list_empty_careful(&ctx->flc_posix)) { - fl->fl_type = F_UNLCK; + fl->c.flc_type = F_UNLCK; return; } retry: spin_lock(&ctx->flc_lock); - list_for_each_entry(cfl, &ctx->flc_posix, fl_list) { + list_for_each_entry(cfl, &ctx->flc_posix, c.flc_list) { if (!posix_test_locks_conflict(fl, cfl)) continue; if (cfl->fl_lmops && cfl->fl_lmops->lm_lock_expirable @@ -930,7 +981,7 @@ retry: locks_copy_conflock(fl, cfl); goto out; } - fl->fl_type = F_UNLCK; + fl->c.flc_type = F_UNLCK; out: spin_unlock(&ctx->flc_lock); return; @@ -972,25 +1023,27 @@ EXPORT_SYMBOL(posix_test_lock); #define MAX_DEADLK_ITERATIONS 10 -/* Find a lock that the owner of the given block_fl is blocking on. */ -static struct file_lock *what_owner_is_waiting_for(struct file_lock *block_fl) +/* Find a lock that the owner of the given @blocker is blocking on. */ +static struct file_lock_core *what_owner_is_waiting_for(struct file_lock_core *blocker) { - struct file_lock *fl; + struct file_lock_core *flc; - hash_for_each_possible(blocked_hash, fl, fl_link, posix_owner_key(block_fl)) { - if (posix_same_owner(fl, block_fl)) { - while (fl->fl_blocker) - fl = fl->fl_blocker; - return fl; + hash_for_each_possible(blocked_hash, flc, flc_link, posix_owner_key(blocker)) { + if (posix_same_owner(flc, blocker)) { + while (flc->flc_blocker) + flc = flc->flc_blocker; + return flc; } } return NULL; } /* Must be called with the blocked_lock_lock held! */ -static int posix_locks_deadlock(struct file_lock *caller_fl, - struct file_lock *block_fl) +static bool posix_locks_deadlock(struct file_lock *caller_fl, + struct file_lock *block_fl) { + struct file_lock_core *caller = &caller_fl->c; + struct file_lock_core *blocker = &block_fl->c; int i = 0; lockdep_assert_held(&blocked_lock_lock); @@ -999,16 +1052,16 @@ static int posix_locks_deadlock(struct file_lock *caller_fl, * This deadlock detector can't reasonably detect deadlocks with * FL_OFDLCK locks, since they aren't owned by a process, per-se. */ - if (IS_OFDLCK(caller_fl)) - return 0; + if (caller->flc_flags & FL_OFDLCK) + return false; - while ((block_fl = what_owner_is_waiting_for(block_fl))) { + while ((blocker = what_owner_is_waiting_for(blocker))) { if (i++ > MAX_DEADLK_ITERATIONS) - return 0; - if (posix_same_owner(caller_fl, block_fl)) - return 1; + return false; + if (posix_same_owner(caller, blocker)) + return true; } - return 0; + return false; } /* Try to create a FLOCK lock on filp. We always insert new FLOCK locks @@ -1027,14 +1080,14 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request) bool found = false; LIST_HEAD(dispose); - ctx = locks_get_lock_context(inode, request->fl_type); + ctx = locks_get_lock_context(inode, request->c.flc_type); if (!ctx) { - if (request->fl_type != F_UNLCK) + if (request->c.flc_type != F_UNLCK) return -ENOMEM; - return (request->fl_flags & FL_EXISTS) ? -ENOENT : 0; + return (request->c.flc_flags & FL_EXISTS) ? 
-ENOENT : 0; } - if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) { + if (!(request->c.flc_flags & FL_ACCESS) && (request->c.flc_type != F_UNLCK)) { new_fl = locks_alloc_lock(); if (!new_fl) return -ENOMEM; @@ -1042,41 +1095,41 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request) percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); - if (request->fl_flags & FL_ACCESS) + if (request->c.flc_flags & FL_ACCESS) goto find_conflict; - list_for_each_entry(fl, &ctx->flc_flock, fl_list) { - if (request->fl_file != fl->fl_file) + list_for_each_entry(fl, &ctx->flc_flock, c.flc_list) { + if (request->c.flc_file != fl->c.flc_file) continue; - if (request->fl_type == fl->fl_type) + if (request->c.flc_type == fl->c.flc_type) goto out; found = true; - locks_delete_lock_ctx(fl, &dispose); + locks_delete_lock_ctx(&fl->c, &dispose); break; } - if (request->fl_type == F_UNLCK) { - if ((request->fl_flags & FL_EXISTS) && !found) + if (lock_is_unlock(request)) { + if ((request->c.flc_flags & FL_EXISTS) && !found) error = -ENOENT; goto out; } find_conflict: - list_for_each_entry(fl, &ctx->flc_flock, fl_list) { - if (!flock_locks_conflict(request, fl)) + list_for_each_entry(fl, &ctx->flc_flock, c.flc_list) { + if (!flock_locks_conflict(&request->c, &fl->c)) continue; error = -EAGAIN; - if (!(request->fl_flags & FL_SLEEP)) + if (!(request->c.flc_flags & FL_SLEEP)) goto out; error = FILE_LOCK_DEFERRED; - locks_insert_block(fl, request, flock_locks_conflict); + locks_insert_block(&fl->c, &request->c, flock_locks_conflict); goto out; } - if (request->fl_flags & FL_ACCESS) + if (request->c.flc_flags & FL_ACCESS) goto out; locks_copy_lock(new_fl, request); locks_move_blocks(new_fl, request); - locks_insert_lock_ctx(new_fl, &ctx->flc_flock); + locks_insert_lock_ctx(&new_fl->c, &ctx->flc_flock); new_fl = NULL; error = 0; @@ -1105,9 +1158,9 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, void *owner; void (*func)(void); - ctx = locks_get_lock_context(inode, request->fl_type); + ctx = locks_get_lock_context(inode, request->c.flc_type); if (!ctx) - return (request->fl_type == F_UNLCK) ? 0 : -ENOMEM; + return lock_is_unlock(request) ? 0 : -ENOMEM; /* * We may need two file_lock structures for this operation, @@ -1115,8 +1168,8 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, * * In some cases we can be sure, that no new locks will be needed */ - if (!(request->fl_flags & FL_ACCESS) && - (request->fl_type != F_UNLCK || + if (!(request->c.flc_flags & FL_ACCESS) && + (request->c.flc_type != F_UNLCK || request->fl_start != 0 || request->fl_end != OFFSET_MAX)) { new_fl = locks_alloc_lock(); new_fl2 = locks_alloc_lock(); @@ -1130,9 +1183,9 @@ retry: * there are any, either return error or put the request on the * blocker's list of waiters and the global blocked_hash. 
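*
* A deferred request is retried by its owner once woken; the canonical
* wait loop, as posix_lock_inode_wait() later in this file does (sketch):
*
*	for (;;) {
*		error = posix_lock_inode(inode, fl, NULL);
*		if (error != FILE_LOCK_DEFERRED)
*			break;
*		error = wait_event_interruptible(fl->c.flc_wait,
*				list_empty(&fl->c.flc_blocked_member));
*		if (error)
*			break;
*	}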
*/ - if (request->fl_type != F_UNLCK) { - list_for_each_entry(fl, &ctx->flc_posix, fl_list) { - if (!posix_locks_conflict(request, fl)) + if (request->c.flc_type != F_UNLCK) { + list_for_each_entry(fl, &ctx->flc_posix, c.flc_list) { + if (!posix_locks_conflict(&request->c, &fl->c)) continue; if (fl->fl_lmops && fl->fl_lmops->lm_lock_expirable && (*fl->fl_lmops->lm_lock_expirable)(fl)) { @@ -1148,7 +1201,7 @@ retry: if (conflock) locks_copy_conflock(conflock, fl); error = -EAGAIN; - if (!(request->fl_flags & FL_SLEEP)) + if (!(request->c.flc_flags & FL_SLEEP)) goto out; /* * Deadlock detection and insertion into the blocked @@ -1160,10 +1213,10 @@ retry: * Ensure that we don't find any locks blocked on this * request during deadlock detection. */ - __locks_wake_up_blocks(request); + __locks_wake_up_blocks(&request->c); if (likely(!posix_locks_deadlock(request, fl))) { error = FILE_LOCK_DEFERRED; - __locks_insert_block(fl, request, + __locks_insert_block(&fl->c, &request->c, posix_locks_conflict); } spin_unlock(&blocked_lock_lock); @@ -1173,22 +1226,22 @@ retry: /* If we're just looking for a conflict, we're done. */ error = 0; - if (request->fl_flags & FL_ACCESS) + if (request->c.flc_flags & FL_ACCESS) goto out; /* Find the first old lock with the same owner as the new lock */ - list_for_each_entry(fl, &ctx->flc_posix, fl_list) { - if (posix_same_owner(request, fl)) + list_for_each_entry(fl, &ctx->flc_posix, c.flc_list) { + if (posix_same_owner(&request->c, &fl->c)) break; } /* Process locks with this owner. */ - list_for_each_entry_safe_from(fl, tmp, &ctx->flc_posix, fl_list) { - if (!posix_same_owner(request, fl)) + list_for_each_entry_safe_from(fl, tmp, &ctx->flc_posix, c.flc_list) { + if (!posix_same_owner(&request->c, &fl->c)) break; /* Detect adjacent or overlapping regions (if same lock type) */ - if (request->fl_type == fl->fl_type) { + if (request->c.flc_type == fl->c.flc_type) { /* In all comparisons of start vs end, use * "start - 1" rather than "end + 1". If end * is OFFSET_MAX, end + 1 will become negative. @@ -1215,7 +1268,7 @@ retry: else request->fl_end = fl->fl_end; if (added) { - locks_delete_lock_ctx(fl, &dispose); + locks_delete_lock_ctx(&fl->c, &dispose); continue; } request = fl; @@ -1228,7 +1281,7 @@ retry: continue; if (fl->fl_start > request->fl_end) break; - if (request->fl_type == F_UNLCK) + if (lock_is_unlock(request)) added = true; if (fl->fl_start < request->fl_start) left = fl; @@ -1244,7 +1297,7 @@ retry: * one (This may happen several times). 
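*
* The adjacency tests earlier in this loop compare "start - 1" rather than
* "end + 1" because fl_end may be OFFSET_MAX. Worked example (sketch):
* [0, 99] and [100, OFFSET_MAX] of the same type coalesce, since the
* test 100 - 1 == 99 is safe while OFFSET_MAX + 1 would overflow loff_t.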
*/ if (added) { - locks_delete_lock_ctx(fl, &dispose); + locks_delete_lock_ctx(&fl->c, &dispose); continue; } /* @@ -1261,8 +1314,9 @@ retry: locks_move_blocks(new_fl, request); request = new_fl; new_fl = NULL; - locks_insert_lock_ctx(request, &fl->fl_list); - locks_delete_lock_ctx(fl, &dispose); + locks_insert_lock_ctx(&request->c, + &fl->c.flc_list); + locks_delete_lock_ctx(&fl->c, &dispose); added = true; } } @@ -1279,8 +1333,8 @@ retry: error = 0; if (!added) { - if (request->fl_type == F_UNLCK) { - if (request->fl_flags & FL_EXISTS) + if (lock_is_unlock(request)) { + if (request->c.flc_flags & FL_EXISTS) error = -ENOENT; goto out; } @@ -1291,7 +1345,7 @@ retry: } locks_copy_lock(new_fl, request); locks_move_blocks(new_fl, request); - locks_insert_lock_ctx(new_fl, &fl->fl_list); + locks_insert_lock_ctx(&new_fl->c, &fl->c.flc_list); fl = new_fl; new_fl = NULL; } @@ -1303,14 +1357,14 @@ retry: left = new_fl2; new_fl2 = NULL; locks_copy_lock(left, right); - locks_insert_lock_ctx(left, &fl->fl_list); + locks_insert_lock_ctx(&left->c, &fl->c.flc_list); } right->fl_start = request->fl_end + 1; - locks_wake_up_blocks(right); + locks_wake_up_blocks(&right->c); } if (left) { left->fl_end = request->fl_start - 1; - locks_wake_up_blocks(left); + locks_wake_up_blocks(&left->c); } out: spin_unlock(&ctx->flc_lock); @@ -1364,8 +1418,8 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl) error = posix_lock_inode(inode, fl, NULL); if (error != FILE_LOCK_DEFERRED) break; - error = wait_event_interruptible(fl->fl_wait, - list_empty(&fl->fl_blocked_member)); + error = wait_event_interruptible(fl->c.flc_wait, + list_empty(&fl->c.flc_blocked_member)); if (error) break; } @@ -1373,37 +1427,37 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl) return error; } -static void lease_clear_pending(struct file_lock *fl, int arg) +static void lease_clear_pending(struct file_lease *fl, int arg) { switch (arg) { case F_UNLCK: - fl->fl_flags &= ~FL_UNLOCK_PENDING; + fl->c.flc_flags &= ~FL_UNLOCK_PENDING; fallthrough; case F_RDLCK: - fl->fl_flags &= ~FL_DOWNGRADE_PENDING; + fl->c.flc_flags &= ~FL_DOWNGRADE_PENDING; } } /* We already had a lease on this file; just change its type */ -int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose) +int lease_modify(struct file_lease *fl, int arg, struct list_head *dispose) { - int error = assign_type(fl, arg); + int error = assign_type(&fl->c, arg); if (error) return error; lease_clear_pending(fl, arg); - locks_wake_up_blocks(fl); + locks_wake_up_blocks(&fl->c); if (arg == F_UNLCK) { - struct file *filp = fl->fl_file; + struct file *filp = fl->c.flc_file; f_delown(filp); filp->f_owner.signum = 0; - fasync_helper(0, fl->fl_file, 0, &fl->fl_fasync); + fasync_helper(0, fl->c.flc_file, 0, &fl->fl_fasync); if (fl->fl_fasync != NULL) { printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync); fl->fl_fasync = NULL; } - locks_delete_lock_ctx(fl, dispose); + locks_delete_lock_ctx(&fl->c, dispose); } return 0; } @@ -1420,11 +1474,11 @@ static bool past_time(unsigned long then) static void time_out_leases(struct inode *inode, struct list_head *dispose) { struct file_lock_context *ctx = inode->i_flctx; - struct file_lock *fl, *tmp; + struct file_lease *fl, *tmp; lockdep_assert_held(&ctx->flc_lock); - list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) { + list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, c.flc_list) { trace_time_out_leases(inode, fl); if (past_time(fl->fl_downgrade_time)) 
lease_modify(fl, F_RDLCK, dispose); @@ -1433,38 +1487,40 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) } } -static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker) +static bool leases_conflict(struct file_lock_core *lc, struct file_lock_core *bc) { bool rc; + struct file_lease *lease = file_lease(lc); + struct file_lease *breaker = file_lease(bc); if (lease->fl_lmops->lm_breaker_owns_lease && lease->fl_lmops->lm_breaker_owns_lease(lease)) return false; - if ((breaker->fl_flags & FL_LAYOUT) != (lease->fl_flags & FL_LAYOUT)) { + if ((bc->flc_flags & FL_LAYOUT) != (lc->flc_flags & FL_LAYOUT)) { rc = false; goto trace; } - if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE)) { + if ((bc->flc_flags & FL_DELEG) && (lc->flc_flags & FL_LEASE)) { rc = false; goto trace; } - rc = locks_conflict(breaker, lease); + rc = locks_conflict(bc, lc); trace: trace_leases_conflict(rc, lease, breaker); return rc; } static bool -any_leases_conflict(struct inode *inode, struct file_lock *breaker) +any_leases_conflict(struct inode *inode, struct file_lease *breaker) { struct file_lock_context *ctx = inode->i_flctx; - struct file_lock *fl; + struct file_lock_core *flc; lockdep_assert_held(&ctx->flc_lock); - list_for_each_entry(fl, &ctx->flc_lease, fl_list) { - if (leases_conflict(fl, breaker)) + list_for_each_entry(flc, &ctx->flc_lease, flc_list) { + if (leases_conflict(flc, &breaker->c)) return true; } return false; @@ -1487,7 +1543,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) { int error = 0; struct file_lock_context *ctx; - struct file_lock *new_fl, *fl, *tmp; + struct file_lease *new_fl, *fl, *tmp; unsigned long break_time; int want_write = (mode & O_ACCMODE) != O_RDONLY; LIST_HEAD(dispose); @@ -1495,7 +1551,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) new_fl = lease_alloc(NULL, want_write ? 
F_WRLCK : F_RDLCK); if (IS_ERR(new_fl)) return PTR_ERR(new_fl); - new_fl->fl_flags = type; + new_fl->c.flc_flags = type; /* typically we will check that ctx is non-NULL before calling */ ctx = locks_inode_context(inode); @@ -1519,22 +1575,22 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) break_time++; /* so that 0 means no break time */ } - list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) { - if (!leases_conflict(fl, new_fl)) + list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, c.flc_list) { + if (!leases_conflict(&fl->c, &new_fl->c)) continue; if (want_write) { - if (fl->fl_flags & FL_UNLOCK_PENDING) + if (fl->c.flc_flags & FL_UNLOCK_PENDING) continue; - fl->fl_flags |= FL_UNLOCK_PENDING; + fl->c.flc_flags |= FL_UNLOCK_PENDING; fl->fl_break_time = break_time; } else { if (lease_breaking(fl)) continue; - fl->fl_flags |= FL_DOWNGRADE_PENDING; + fl->c.flc_flags |= FL_DOWNGRADE_PENDING; fl->fl_downgrade_time = break_time; } if (fl->fl_lmops->lm_break(fl)) - locks_delete_lock_ctx(fl, &dispose); + locks_delete_lock_ctx(&fl->c, &dispose); } if (list_empty(&ctx->flc_lease)) @@ -1547,26 +1603,26 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) } restart: - fl = list_first_entry(&ctx->flc_lease, struct file_lock, fl_list); + fl = list_first_entry(&ctx->flc_lease, struct file_lease, c.flc_list); break_time = fl->fl_break_time; if (break_time != 0) break_time -= jiffies; if (break_time == 0) break_time++; - locks_insert_block(fl, new_fl, leases_conflict); + locks_insert_block(&fl->c, &new_fl->c, leases_conflict); trace_break_lease_block(inode, new_fl); spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); locks_dispose_list(&dispose); - error = wait_event_interruptible_timeout(new_fl->fl_wait, - list_empty(&new_fl->fl_blocked_member), - break_time); + error = wait_event_interruptible_timeout(new_fl->c.flc_wait, + list_empty(&new_fl->c.flc_blocked_member), + break_time); percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); trace_break_lease_unblock(inode, new_fl); - locks_delete_block(new_fl); + __locks_delete_block(&new_fl->c); if (error >= 0) { /* * Wait for the next conflicting lease that has not been @@ -1583,7 +1639,7 @@ out: percpu_up_read(&file_rwsem); locks_dispose_list(&dispose); free_lock: - locks_free_lock(new_fl); + locks_free_lease(new_fl); return error; } EXPORT_SYMBOL(__break_lease); @@ -1601,14 +1657,14 @@ void lease_get_mtime(struct inode *inode, struct timespec64 *time) { bool has_lease = false; struct file_lock_context *ctx; - struct file_lock *fl; + struct file_lock_core *flc; ctx = locks_inode_context(inode); if (ctx && !list_empty_careful(&ctx->flc_lease)) { spin_lock(&ctx->flc_lock); - fl = list_first_entry_or_null(&ctx->flc_lease, - struct file_lock, fl_list); - if (fl && (fl->fl_type == F_WRLCK)) + flc = list_first_entry_or_null(&ctx->flc_lease, + struct file_lock_core, flc_list); + if (flc && flc->flc_type == F_WRLCK) has_lease = true; spin_unlock(&ctx->flc_lock); } @@ -1643,7 +1699,7 @@ EXPORT_SYMBOL(lease_get_mtime); */ int fcntl_getlease(struct file *filp) { - struct file_lock *fl; + struct file_lease *fl; struct inode *inode = file_inode(filp); struct file_lock_context *ctx; int type = F_UNLCK; @@ -1654,8 +1710,8 @@ int fcntl_getlease(struct file *filp) percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); - list_for_each_entry(fl, &ctx->flc_lease, fl_list) { - if (fl->fl_file != filp) + list_for_each_entry(fl, &ctx->flc_lease, c.flc_list) { + if 
(fl->c.flc_file != filp) continue; type = target_leasetype(fl); break; @@ -1715,12 +1771,12 @@ check_conflicting_open(struct file *filp, const int arg, int flags) } static int -generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **priv) +generic_add_lease(struct file *filp, int arg, struct file_lease **flp, void **priv) { - struct file_lock *fl, *my_fl = NULL, *lease; + struct file_lease *fl, *my_fl = NULL, *lease; struct inode *inode = file_inode(filp); struct file_lock_context *ctx; - bool is_deleg = (*flp)->fl_flags & FL_DELEG; + bool is_deleg = (*flp)->c.flc_flags & FL_DELEG; int error; LIST_HEAD(dispose); @@ -1746,7 +1802,7 @@ generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **pri percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); - error = check_conflicting_open(filp, arg, lease->fl_flags); + error = check_conflicting_open(filp, arg, lease->c.flc_flags); if (error) goto out; @@ -1759,9 +1815,9 @@ generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **pri * except for this filp. */ error = -EAGAIN; - list_for_each_entry(fl, &ctx->flc_lease, fl_list) { - if (fl->fl_file == filp && - fl->fl_owner == lease->fl_owner) { + list_for_each_entry(fl, &ctx->flc_lease, c.flc_list) { + if (fl->c.flc_file == filp && + fl->c.flc_owner == lease->c.flc_owner) { my_fl = fl; continue; } @@ -1776,7 +1832,7 @@ generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **pri * Modifying our existing lease is OK, but no getting a * new lease if someone else is opening for write: */ - if (fl->fl_flags & FL_UNLOCK_PENDING) + if (fl->c.flc_flags & FL_UNLOCK_PENDING) goto out; } @@ -1792,7 +1848,7 @@ generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **pri if (!leases_enable) goto out; - locks_insert_lock_ctx(lease, &ctx->flc_lease); + locks_insert_lock_ctx(&lease->c, &ctx->flc_lease); /* * The check in break_lease() is lockless. It's possible for another * open to race in after we did the earlier check for a conflicting @@ -1803,9 +1859,9 @@ generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **pri * precedes these checks. */ smp_mb(); - error = check_conflicting_open(filp, arg, lease->fl_flags); + error = check_conflicting_open(filp, arg, lease->c.flc_flags); if (error) { - locks_unlink_lock_ctx(lease); + locks_unlink_lock_ctx(&lease->c); goto out; } @@ -1826,7 +1882,7 @@ out: static int generic_delete_lease(struct file *filp, void *owner) { int error = -EAGAIN; - struct file_lock *fl, *victim = NULL; + struct file_lease *fl, *victim = NULL; struct inode *inode = file_inode(filp); struct file_lock_context *ctx; LIST_HEAD(dispose); @@ -1839,9 +1895,9 @@ static int generic_delete_lease(struct file *filp, void *owner) percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); - list_for_each_entry(fl, &ctx->flc_lease, fl_list) { - if (fl->fl_file == filp && - fl->fl_owner == owner) { + list_for_each_entry(fl, &ctx->flc_lease, c.flc_list) { + if (fl->c.flc_file == filp && + fl->c.flc_owner == owner) { victim = fl; break; } @@ -1866,21 +1922,9 @@ static int generic_delete_lease(struct file *filp, void *owner) * The (input) flp->fl_lmops->lm_break function is required * by break_lease(). 
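*
* In sketch form, a caller supplies operations modelled on the
* lease_manager_ops defined above (my_break is a hypothetical callback
* name, not an in-tree symbol):
*
*	static const struct lease_manager_operations my_lease_ops = {
*		.lm_break = my_break,	-- required by break_lease()
*		.lm_change = lease_modify,
*		.lm_setup = lease_setup,
*	};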
*/ -int generic_setlease(struct file *filp, int arg, struct file_lock **flp, +int generic_setlease(struct file *filp, int arg, struct file_lease **flp, void **priv) { - struct inode *inode = file_inode(filp); - vfsuid_t vfsuid = i_uid_into_vfsuid(file_mnt_idmap(filp), inode); - int error; - - if ((!vfsuid_eq_kuid(vfsuid, current_fsuid())) && !capable(CAP_LEASE)) - return -EACCES; - if (!S_ISREG(inode->i_mode)) - return -EINVAL; - error = security_file_lock(filp, arg); - if (error) - return error; - switch (arg) { case F_UNLCK: return generic_delete_lease(filp, *priv); @@ -1913,7 +1957,7 @@ lease_notifier_chain_init(void) } static inline void -setlease_notifier(int arg, struct file_lock *lease) +setlease_notifier(int arg, struct file_lease *lease) { if (arg != F_UNLCK) srcu_notifier_call_chain(&lease_notifier_chain, arg, lease); @@ -1931,6 +1975,19 @@ void lease_unregister_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(lease_unregister_notifier); + +int +kernel_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv) +{ + if (lease) + setlease_notifier(arg, *lease); + if (filp->f_op->setlease) + return filp->f_op->setlease(filp, arg, lease, priv); + else + return generic_setlease(filp, arg, lease, priv); +} +EXPORT_SYMBOL_GPL(kernel_setlease); + /** * vfs_setlease - sets a lease on an open file * @filp: file pointer @@ -1949,20 +2006,26 @@ EXPORT_SYMBOL_GPL(lease_unregister_notifier); * may be NULL if the lm_setup operation doesn't require it. */ int -vfs_setlease(struct file *filp, int arg, struct file_lock **lease, void **priv) +vfs_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv) { - if (lease) - setlease_notifier(arg, *lease); - if (filp->f_op->setlease) - return filp->f_op->setlease(filp, arg, lease, priv); - else - return generic_setlease(filp, arg, lease, priv); + struct inode *inode = file_inode(filp); + vfsuid_t vfsuid = i_uid_into_vfsuid(file_mnt_idmap(filp), inode); + int error; + + if ((!vfsuid_eq_kuid(vfsuid, current_fsuid())) && !capable(CAP_LEASE)) + return -EACCES; + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + error = security_file_lock(filp, arg); + if (error) + return error; + return kernel_setlease(filp, arg, lease, priv); } EXPORT_SYMBOL_GPL(vfs_setlease); static int do_fcntl_add_lease(unsigned int fd, struct file *filp, int arg) { - struct file_lock *fl; + struct file_lease *fl; struct fasync_struct *new; int error; @@ -1972,14 +2035,14 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, int arg) new = fasync_alloc(); if (!new) { - locks_free_lock(fl); + locks_free_lease(fl); return -ENOMEM; } new->fa_fd = fd; error = vfs_setlease(filp, arg, &fl, (void **)&new); if (fl) - locks_free_lock(fl); + locks_free_lease(fl); if (new) fasync_free(new); return error; @@ -2017,8 +2080,8 @@ static int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl) error = flock_lock_inode(inode, fl); if (error != FILE_LOCK_DEFERRED) break; - error = wait_event_interruptible(fl->fl_wait, - list_empty(&fl->fl_blocked_member)); + error = wait_event_interruptible(fl->c.flc_wait, + list_empty(&fl->c.flc_blocked_member)); if (error) break; } @@ -2036,7 +2099,7 @@ static int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl) int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl) { int res = 0; - switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) { + switch (fl->c.flc_flags & (FL_POSIX|FL_FLOCK)) { case FL_POSIX: res = posix_lock_inode_wait(inode, fl); break; @@ -2098,13 +2161,13 @@ 
SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) flock_make_lock(f.file, &fl, type); - error = security_file_lock(f.file, fl.fl_type); + error = security_file_lock(f.file, fl.c.flc_type); if (error) goto out_putf; can_sleep = !(cmd & LOCK_NB); if (can_sleep) - fl.fl_flags |= FL_SLEEP; + fl.c.flc_flags |= FL_SLEEP; if (f.file->f_op->flock) error = f.file->f_op->flock(f.file, @@ -2130,7 +2193,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) */ int vfs_test_lock(struct file *filp, struct file_lock *fl) { - WARN_ON_ONCE(filp != fl->fl_file); + WARN_ON_ONCE(filp != fl->c.flc_file); if (filp->f_op->lock) return filp->f_op->lock(filp, F_GETLK, fl); posix_test_lock(filp, fl); @@ -2145,25 +2208,28 @@ EXPORT_SYMBOL_GPL(vfs_test_lock); * * Used to translate a fl_pid into a namespace virtual pid number */ -static pid_t locks_translate_pid(struct file_lock *fl, struct pid_namespace *ns) +static pid_t locks_translate_pid(struct file_lock_core *fl, struct pid_namespace *ns) { pid_t vnr; struct pid *pid; - if (IS_OFDLCK(fl)) + if (fl->flc_flags & FL_OFDLCK) return -1; - if (IS_REMOTELCK(fl)) - return fl->fl_pid; + + /* Remote locks report a negative pid value */ + if (fl->flc_pid <= 0) + return fl->flc_pid; + /* * If the flock owner process is dead and its pid has been already * freed, the translation below won't work, but we still want to show * flock owner pid number in init pidns. */ if (ns == &init_pid_ns) - return (pid_t)fl->fl_pid; + return (pid_t) fl->flc_pid; rcu_read_lock(); - pid = find_pid_ns(fl->fl_pid, &init_pid_ns); + pid = find_pid_ns(fl->flc_pid, &init_pid_ns); vnr = pid_nr_ns(pid, ns); rcu_read_unlock(); return vnr; @@ -2171,7 +2237,7 @@ static pid_t locks_translate_pid(struct file_lock *fl, struct pid_namespace *ns) static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl) { - flock->l_pid = locks_translate_pid(fl, task_active_pid_ns(current)); + flock->l_pid = locks_translate_pid(&fl->c, task_active_pid_ns(current)); #if BITS_PER_LONG == 32 /* * Make sure we can represent the posix lock via @@ -2186,19 +2252,19 @@ static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl) flock->l_len = fl->fl_end == OFFSET_MAX ? 0 : fl->fl_end - fl->fl_start + 1; flock->l_whence = 0; - flock->l_type = fl->fl_type; + flock->l_type = fl->c.flc_type; return 0; } #if BITS_PER_LONG == 32 static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl) { - flock->l_pid = locks_translate_pid(fl, task_active_pid_ns(current)); + flock->l_pid = locks_translate_pid(&fl->c, task_active_pid_ns(current)); flock->l_start = fl->fl_start; flock->l_len = fl->fl_end == OFFSET_MAX ? 
0 : fl->fl_end - fl->fl_start + 1; flock->l_whence = 0; - flock->l_type = fl->fl_type; + flock->l_type = fl->c.flc_type; } #endif @@ -2227,16 +2293,16 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock) if (flock->l_pid != 0) goto out; - fl->fl_flags |= FL_OFDLCK; - fl->fl_owner = filp; + fl->c.flc_flags |= FL_OFDLCK; + fl->c.flc_owner = filp; } error = vfs_test_lock(filp, fl); if (error) goto out; - flock->l_type = fl->fl_type; - if (fl->fl_type != F_UNLCK) { + flock->l_type = fl->c.flc_type; + if (fl->c.flc_type != F_UNLCK) { error = posix_lock_to_flock(flock, fl); if (error) goto out; @@ -2283,7 +2349,7 @@ out: */ int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf) { - WARN_ON_ONCE(filp != fl->fl_file); + WARN_ON_ONCE(filp != fl->c.flc_file); if (filp->f_op->lock) return filp->f_op->lock(filp, cmd, fl); else @@ -2296,7 +2362,7 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd, { int error; - error = security_file_lock(filp, fl->fl_type); + error = security_file_lock(filp, fl->c.flc_type); if (error) return error; @@ -2304,8 +2370,8 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd, error = vfs_lock_file(filp, cmd, fl, NULL); if (error != FILE_LOCK_DEFERRED) break; - error = wait_event_interruptible(fl->fl_wait, - list_empty(&fl->fl_blocked_member)); + error = wait_event_interruptible(fl->c.flc_wait, + list_empty(&fl->c.flc_blocked_member)); if (error) break; } @@ -2318,13 +2384,13 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd, static int check_fmode_for_setlk(struct file_lock *fl) { - switch (fl->fl_type) { + switch (fl->c.flc_type) { case F_RDLCK: - if (!(fl->fl_file->f_mode & FMODE_READ)) + if (!(fl->c.flc_file->f_mode & FMODE_READ)) return -EBADF; break; case F_WRLCK: - if (!(fl->fl_file->f_mode & FMODE_WRITE)) + if (!(fl->c.flc_file->f_mode & FMODE_WRITE)) return -EBADF; } return 0; @@ -2363,8 +2429,8 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, goto out; cmd = F_SETLK; - file_lock->fl_flags |= FL_OFDLCK; - file_lock->fl_owner = filp; + file_lock->c.flc_flags |= FL_OFDLCK; + file_lock->c.flc_owner = filp; break; case F_OFD_SETLKW: error = -EINVAL; @@ -2372,11 +2438,11 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, goto out; cmd = F_SETLKW; - file_lock->fl_flags |= FL_OFDLCK; - file_lock->fl_owner = filp; + file_lock->c.flc_flags |= FL_OFDLCK; + file_lock->c.flc_owner = filp; fallthrough; case F_SETLKW: - file_lock->fl_flags |= FL_SLEEP; + file_lock->c.flc_flags |= FL_SLEEP; } error = do_lock_file_wait(filp, cmd, file_lock); @@ -2386,8 +2452,8 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, * lock that was just acquired. There is no need to do that when we're * unlocking though, or for OFD locks. 
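*
* The recovery step, sketched with this patch's names: re-look up the
* descriptor after locking, and if it no longer refers to this file,
* undo the lock and fail with -EBADF:
*
*	f = files_lookup_fd_locked(files, fd);
*	if (f != filp) {
*		file_lock->c.flc_type = F_UNLCK;
*		do_lock_file_wait(filp, cmd, file_lock);
*	}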
*/ - if (!error && file_lock->fl_type != F_UNLCK && - !(file_lock->fl_flags & FL_OFDLCK)) { + if (!error && file_lock->c.flc_type != F_UNLCK && + !(file_lock->c.flc_flags & FL_OFDLCK)) { struct files_struct *files = current->files; /* * We need that spin_lock here - it prevents reordering between @@ -2398,7 +2464,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, f = files_lookup_fd_locked(files, fd); spin_unlock(&files->file_lock); if (f != filp) { - file_lock->fl_type = F_UNLCK; + file_lock->c.flc_type = F_UNLCK; error = do_lock_file_wait(filp, cmd, file_lock); WARN_ON_ONCE(error); error = -EBADF; @@ -2437,16 +2503,16 @@ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 *flock) if (flock->l_pid != 0) goto out; - fl->fl_flags |= FL_OFDLCK; - fl->fl_owner = filp; + fl->c.flc_flags |= FL_OFDLCK; + fl->c.flc_owner = filp; } error = vfs_test_lock(filp, fl); if (error) goto out; - flock->l_type = fl->fl_type; - if (fl->fl_type != F_UNLCK) + flock->l_type = fl->c.flc_type; + if (fl->c.flc_type != F_UNLCK) posix_lock_to_flock64(flock, fl); out: @@ -2486,8 +2552,8 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, goto out; cmd = F_SETLK64; - file_lock->fl_flags |= FL_OFDLCK; - file_lock->fl_owner = filp; + file_lock->c.flc_flags |= FL_OFDLCK; + file_lock->c.flc_owner = filp; break; case F_OFD_SETLKW: error = -EINVAL; @@ -2495,11 +2561,11 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, goto out; cmd = F_SETLKW64; - file_lock->fl_flags |= FL_OFDLCK; - file_lock->fl_owner = filp; + file_lock->c.flc_flags |= FL_OFDLCK; + file_lock->c.flc_owner = filp; fallthrough; case F_SETLKW64: - file_lock->fl_flags |= FL_SLEEP; + file_lock->c.flc_flags |= FL_SLEEP; } error = do_lock_file_wait(filp, cmd, file_lock); @@ -2509,8 +2575,8 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, * lock that was just acquired. There is no need to do that when we're * unlocking though, or for OFD locks. 
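*
* OFD locks are exempt because their owner is the open file description
* itself (c.flc_owner = filp, set above), not a process. From userspace
* such a lock is requested as (illustrative snippet; l_pid must be zero):
*
*	struct flock fl = { .l_type = F_WRLCK, .l_whence = SEEK_SET };
*	fcntl(fd, F_OFD_SETLK, &fl);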
*/ - if (!error && file_lock->fl_type != F_UNLCK && - !(file_lock->fl_flags & FL_OFDLCK)) { + if (!error && file_lock->c.flc_type != F_UNLCK && + !(file_lock->c.flc_flags & FL_OFDLCK)) { struct files_struct *files = current->files; /* * We need that spin_lock here - it prevents reordering between @@ -2521,7 +2587,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, f = files_lookup_fd_locked(files, fd); spin_unlock(&files->file_lock); if (f != filp) { - file_lock->fl_type = F_UNLCK; + file_lock->c.flc_type = F_UNLCK; error = do_lock_file_wait(filp, cmd, file_lock); WARN_ON_ONCE(error); error = -EBADF; @@ -2555,13 +2621,13 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner) return; locks_init_lock(&lock); - lock.fl_type = F_UNLCK; - lock.fl_flags = FL_POSIX | FL_CLOSE; + lock.c.flc_type = F_UNLCK; + lock.c.flc_flags = FL_POSIX | FL_CLOSE; lock.fl_start = 0; lock.fl_end = OFFSET_MAX; - lock.fl_owner = owner; - lock.fl_pid = current->tgid; - lock.fl_file = filp; + lock.c.flc_owner = owner; + lock.c.flc_pid = current->tgid; + lock.c.flc_file = filp; lock.fl_ops = NULL; lock.fl_lmops = NULL; @@ -2584,7 +2650,7 @@ locks_remove_flock(struct file *filp, struct file_lock_context *flctx) return; flock_make_lock(filp, &fl, F_UNLCK); - fl.fl_flags |= FL_CLOSE; + fl.c.flc_flags |= FL_CLOSE; if (filp->f_op->flock) filp->f_op->flock(filp, F_SETLKW, &fl); @@ -2599,7 +2665,7 @@ locks_remove_flock(struct file *filp, struct file_lock_context *flctx) static void locks_remove_lease(struct file *filp, struct file_lock_context *ctx) { - struct file_lock *fl, *tmp; + struct file_lease *fl, *tmp; LIST_HEAD(dispose); if (list_empty(&ctx->flc_lease)) @@ -2607,8 +2673,8 @@ locks_remove_lease(struct file *filp, struct file_lock_context *ctx) percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); - list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) - if (filp == fl->fl_file) + list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, c.flc_list) + if (filp == fl->c.flc_file) lease_modify(fl, F_UNLCK, &dispose); spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); @@ -2652,7 +2718,7 @@ void locks_remove_file(struct file *filp) */ int vfs_cancel_lock(struct file *filp, struct file_lock *fl) { - WARN_ON_ONCE(filp != fl->fl_file); + WARN_ON_ONCE(filp != fl->c.flc_file); if (filp->f_op->lock) return filp->f_op->lock(filp, F_CANCELLK, fl); return 0; @@ -2691,69 +2757,73 @@ struct locks_iterator { loff_t li_pos; }; -static void lock_get_status(struct seq_file *f, struct file_lock *fl, +static void lock_get_status(struct seq_file *f, struct file_lock_core *flc, loff_t id, char *pfx, int repeat) { struct inode *inode = NULL; - unsigned int fl_pid; + unsigned int pid; struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb); - int type; + int type = flc->flc_type; + struct file_lock *fl = file_lock(flc); + + pid = locks_translate_pid(flc, proc_pidns); - fl_pid = locks_translate_pid(fl, proc_pidns); /* * If lock owner is dead (and pid is freed) or not visible in current * pidns, zero is shown as a pid value. Check lock info from * init_pid_ns to get saved lock pid value. 
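
For orientation, the records this function emits form the /proc/locks table userspace reads; a few representative lines in that format follow, with all pids, device numbers, and inode numbers invented purely for illustration:

	1: POSIX  ADVISORY  WRITE 1234 08:02:131090 0 EOF
	2: FLOCK  ADVISORY  WRITE 5678 00:2f:4711 0 EOF
	3: LEASE  ACTIVE    READ  9012 08:02:262148 0 EOF
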
*/ - - if (fl->fl_file != NULL) - inode = file_inode(fl->fl_file); + if (flc->flc_file != NULL) + inode = file_inode(flc->flc_file); seq_printf(f, "%lld: ", id); if (repeat) seq_printf(f, "%*s", repeat - 1 + (int)strlen(pfx), pfx); - if (IS_POSIX(fl)) { - if (fl->fl_flags & FL_ACCESS) + if (flc->flc_flags & FL_POSIX) { + if (flc->flc_flags & FL_ACCESS) seq_puts(f, "ACCESS"); - else if (IS_OFDLCK(fl)) + else if (flc->flc_flags & FL_OFDLCK) seq_puts(f, "OFDLCK"); else seq_puts(f, "POSIX "); seq_printf(f, " %s ", (inode == NULL) ? "*NOINODE*" : "ADVISORY "); - } else if (IS_FLOCK(fl)) { + } else if (flc->flc_flags & FL_FLOCK) { seq_puts(f, "FLOCK ADVISORY "); - } else if (IS_LEASE(fl)) { - if (fl->fl_flags & FL_DELEG) + } else if (flc->flc_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT)) { + struct file_lease *lease = file_lease(flc); + + type = target_leasetype(lease); + + if (flc->flc_flags & FL_DELEG) seq_puts(f, "DELEG "); else seq_puts(f, "LEASE "); - if (lease_breaking(fl)) + if (lease_breaking(lease)) seq_puts(f, "BREAKING "); - else if (fl->fl_file) + else if (flc->flc_file) seq_puts(f, "ACTIVE "); else seq_puts(f, "BREAKER "); } else { seq_puts(f, "UNKNOWN UNKNOWN "); } - type = IS_LEASE(fl) ? target_leasetype(fl) : fl->fl_type; seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" : (type == F_RDLCK) ? "READ" : "UNLCK"); if (inode) { /* userspace relies on this representation of dev_t */ - seq_printf(f, "%d %02x:%02x:%lu ", fl_pid, + seq_printf(f, "%d %02x:%02x:%lu ", pid, MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), inode->i_ino); } else { - seq_printf(f, "%d <none>:0 ", fl_pid); + seq_printf(f, "%d <none>:0 ", pid); } - if (IS_POSIX(fl)) { + if (flc->flc_flags & FL_POSIX) { if (fl->fl_end == OFFSET_MAX) seq_printf(f, "%Ld EOF\n", fl->fl_start); else @@ -2763,17 +2833,18 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, } } -static struct file_lock *get_next_blocked_member(struct file_lock *node) +static struct file_lock_core *get_next_blocked_member(struct file_lock_core *node) { - struct file_lock *tmp; + struct file_lock_core *tmp; /* NULL node or root node */ - if (node == NULL || node->fl_blocker == NULL) + if (node == NULL || node->flc_blocker == NULL) return NULL; /* Next member in the linked list could be itself */ - tmp = list_next_entry(node, fl_blocked_member); - if (list_entry_is_head(tmp, &node->fl_blocker->fl_blocked_requests, fl_blocked_member) + tmp = list_next_entry(node, flc_blocked_member); + if (list_entry_is_head(tmp, &node->flc_blocker->flc_blocked_requests, + flc_blocked_member) || tmp == node) { return NULL; } @@ -2784,18 +2855,18 @@ static struct file_lock *get_next_blocked_member(struct file_lock *node) static int locks_show(struct seq_file *f, void *v) { struct locks_iterator *iter = f->private; - struct file_lock *cur, *tmp; + struct file_lock_core *cur, *tmp; struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb); int level = 0; - cur = hlist_entry(v, struct file_lock, fl_link); + cur = hlist_entry(v, struct file_lock_core, flc_link); if (locks_translate_pid(cur, proc_pidns) == 0) return 0; - /* View this crossed linked list as a binary tree, the first member of fl_blocked_requests - * is the left child of current node, the next silibing in fl_blocked_member is the - * right child, we can alse get the parent of current node from fl_blocker, so this + /* View this crossed linked list as a binary tree, the first member of flc_blocked_requests + * is the left child of current node, the next sibling in flc_blocked_member 
is the + * right child, we can also get the parent of current node from flc_blocker, so this * question becomes traversal of a binary tree */ while (cur != NULL) { @@ -2804,17 +2875,18 @@ static int locks_show(struct seq_file *f, void *v) else lock_get_status(f, cur, iter->li_pos, "", level); - if (!list_empty(&cur->fl_blocked_requests)) { + if (!list_empty(&cur->flc_blocked_requests)) { /* Turn left */ - cur = list_first_entry_or_null(&cur->fl_blocked_requests, - struct file_lock, fl_blocked_member); + cur = list_first_entry_or_null(&cur->flc_blocked_requests, + struct file_lock_core, + flc_blocked_member); level++; } else { /* Turn right */ tmp = get_next_blocked_member(cur); /* Fall back to parent node */ - while (tmp == NULL && cur->fl_blocker != NULL) { - cur = cur->fl_blocker; + while (tmp == NULL && cur->flc_blocker != NULL) { + cur = cur->flc_blocker; level--; tmp = get_next_blocked_member(cur); } @@ -2829,14 +2901,13 @@ static void __show_fd_locks(struct seq_file *f, struct list_head *head, int *id, struct file *filp, struct files_struct *files) { - struct file_lock *fl; + struct file_lock_core *fl; - list_for_each_entry(fl, head, fl_list) { + list_for_each_entry(fl, head, flc_list) { - if (filp != fl->fl_file) + if (filp != fl->flc_file) continue; - if (fl->fl_owner != files && - fl->fl_owner != filp) + if (fl->flc_owner != files && fl->flc_owner != filp) continue; (*id)++; @@ -2915,6 +2986,9 @@ static int __init filelock_init(void) filelock_cache = kmem_cache_create("file_lock_cache", sizeof(struct file_lock), 0, SLAB_PANIC, NULL); + filelease_cache = kmem_cache_create("file_lease_cache", + sizeof(struct file_lease), 0, SLAB_PANIC, NULL); + for_each_possible_cpu(i) { struct file_lock_list_struct *fll = per_cpu_ptr(&file_lock_list, i); diff --git a/fs/mbcache.c b/fs/mbcache.c index 82aa7a35db26..e60a840999aa 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -426,9 +426,7 @@ EXPORT_SYMBOL(mb_cache_destroy); static int __init mbcache_init(void) { - mb_entry_cache = kmem_cache_create("mbcache", - sizeof(struct mb_cache_entry), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); + mb_entry_cache = KMEM_CACHE(mb_cache_entry, SLAB_RECLAIM_ACCOUNT); if (!mb_entry_cache) return -ENOMEM; return 0; diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 73f37f298087..7cbd2b9f4d11 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -87,7 +87,7 @@ static int __init init_inodecache(void) minix_inode_cachep = kmem_cache_create("minix_inode_cache", sizeof(struct minix_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), + SLAB_ACCOUNT), init_once); if (minix_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c index 64c5205e2b5e..3c60f1eaca61 100644 --- a/fs/mnt_idmapping.c +++ b/fs/mnt_idmapping.c @@ -214,7 +214,7 @@ static int copy_mnt_idmap(struct uid_gid_map *map_from, * anything at all. 
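
Downstream of this check, the user-visible entry point is mount_setattr(2): attaching an idmapping sourced from a user namespace whose ID maps are still empty now fails with EINVAL instead of silently attaching a mapping that maps nothing. A hedged sketch of that call path, via raw syscall since glibc provides no wrapper; both descriptors are hypothetical, e.g. from open_tree(2) with OPEN_TREE_CLONE and from /proc/<pid>/ns/user:

	#define _GNU_SOURCE
	#include <fcntl.h>		/* AT_EMPTY_PATH */
	#include <linux/mount.h>	/* struct mount_attr, MOUNT_ATTR_IDMAP */
	#include <sys/syscall.h>
	#include <unistd.h>

	/* tree_fd: a detached mount; userns_fd: a user namespace whose
	 * uid_map/gid_map must have been written before this call. */
	static int attach_idmap(int tree_fd, int userns_fd)
	{
		struct mount_attr attr = {
			.attr_set  = MOUNT_ATTR_IDMAP,
			.userns_fd = userns_fd,
		};

		/* With the change above, an empty ID mapping in userns_fd
		 * makes this fail with EINVAL rather than succeed as a no-op. */
		return syscall(SYS_mount_setattr, tree_fd, "", AT_EMPTY_PATH,
			       &attr, sizeof(attr));
	}
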
*/ if (nr_extents == 0) - return 0; + return -EINVAL; /* * Here we know that nr_extents is greater than zero which means diff --git a/fs/mpage.c b/fs/mpage.c index 738882e0766d..fa8b99a199fa 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -605,6 +605,7 @@ alloc_new: GFP_NOFS); bio->bi_iter.bi_sector = first_block << (blkbits - 9); wbc_init_bio(wbc, bio); + bio->bi_write_hint = inode->i_write_hint; } /* diff --git a/fs/namei.c b/fs/namei.c index 4e0de939fea1..ceb9ddf8dfdd 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -17,8 +17,8 @@ #include <linux/init.h> #include <linux/export.h> -#include <linux/kernel.h> #include <linux/slab.h> +#include <linux/wordpart.h> #include <linux/fs.h> #include <linux/filelock.h> #include <linux/namei.h> @@ -27,7 +27,6 @@ #include <linux/fsnotify.h> #include <linux/personality.h> #include <linux/security.h> -#include <linux/ima.h> #include <linux/syscalls.h> #include <linux/mount.h> #include <linux/audit.h> @@ -1717,7 +1716,11 @@ static inline int may_lookup(struct mnt_idmap *idmap, { if (nd->flags & LOOKUP_RCU) { int err = inode_permission(idmap, nd->inode, MAY_EXEC|MAY_NOT_BLOCK); - if (err != -ECHILD || !try_to_unlazy(nd)) + if (!err) // success, keep going + return 0; + if (!try_to_unlazy(nd)) + return -ECHILD; // redo it all non-lazy + if (err != -ECHILD) // hard error return err; } return inode_permission(idmap, nd->inode, MAY_EXEC); @@ -2676,10 +2679,8 @@ static int lookup_one_common(struct mnt_idmap *idmap, if (!len) return -EACCES; - if (unlikely(name[0] == '.')) { - if (len < 2 || (len == 2 && name[1] == '.')) - return -EACCES; - } + if (is_dot_dotdot(name, len)) + return -EACCES; while (len--) { unsigned int c = *(const unsigned char *)name++; @@ -3640,7 +3641,7 @@ static int do_open(struct nameidata *nd, if (!error && !(file->f_mode & FMODE_OPENED)) error = vfs_open(&nd->path, file); if (!error) - error = ima_file_check(file, op->acc_mode); + error = security_file_post_open(file, op->acc_mode); if (!error && do_truncate) error = handle_truncate(idmap, file); if (unlikely(error > 0)) { @@ -3703,7 +3704,7 @@ static int vfs_tmpfile(struct mnt_idmap *idmap, inode->i_state |= I_LINKABLE; spin_unlock(&inode->i_lock); } - ima_post_create_tmpfile(idmap, inode); + security_inode_post_create_tmpfile(idmap, inode); return 0; } @@ -4049,8 +4050,6 @@ retry: case 0: case S_IFREG: error = vfs_create(idmap, path.dentry->d_inode, dentry, mode, true); - if (!error) - ima_post_path_mknod(idmap, dentry); break; case S_IFCHR: case S_IFBLK: error = vfs_mknod(idmap, path.dentry->d_inode, @@ -4061,6 +4060,11 @@ retry: dentry, mode, 0); break; } + + if (error) + goto out2; + + security_path_post_mknod(idmap, dentry); out2: done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { diff --git a/fs/namespace.c b/fs/namespace.c index 437f60e96d40..5a51315c6678 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4472,10 +4472,15 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) /* * If this is an attached mount make sure it's located in the callers * mount namespace. If it's not don't let the caller interact with it. - * If this is a detached mount make sure it has an anonymous mount - * namespace attached to it, i.e. we've created it via OPEN_TREE_CLONE. + * + * If this mount doesn't have a parent it's most often simply a + * detached mount with an anonymous mount namespace. IOW, something + * that's simply not attached yet. But there are apparently also users + * that do change mount properties on the rootfs itself. 
That obviously + * neither has a parent nor is it a detached mount so we cannot + * unconditionally check for detached mounts. */ - if (!(mnt_has_parent(mnt) ? check_mnt(mnt) : is_anon_ns(mnt->mnt_ns))) + if ((mnt_has_parent(mnt) || !is_anon_ns(mnt->mnt_ns)) && !check_mnt(mnt)) goto out; /* diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index a59e7b2edaac..3298c29b5548 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -101,7 +101,7 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) } if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { - if (folio_index(folio) == rreq->no_unlock_folio && + if (folio->index == rreq->no_unlock_folio && test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) _debug("no unlock"); else @@ -246,13 +246,13 @@ EXPORT_SYMBOL(netfs_readahead); */ int netfs_read_folio(struct file *file, struct folio *folio) { - struct address_space *mapping = folio_file_mapping(folio); + struct address_space *mapping = folio->mapping; struct netfs_io_request *rreq; struct netfs_inode *ctx = netfs_inode(mapping->host); struct folio *sink = NULL; int ret; - _enter("%lx", folio_index(folio)); + _enter("%lx", folio->index); rreq = netfs_alloc_request(mapping, file, folio_file_pos(folio), folio_size(folio), @@ -460,7 +460,7 @@ retry: ret = PTR_ERR(rreq); goto error; } - rreq->no_unlock_folio = folio_index(folio); + rreq->no_unlock_folio = folio->index; __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags); ret = netfs_begin_cache_read(rreq, ctx); @@ -518,7 +518,7 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio, size_t offset, size_t len) { struct netfs_io_request *rreq; - struct address_space *mapping = folio_file_mapping(folio); + struct address_space *mapping = folio->mapping; struct netfs_inode *ctx = netfs_inode(mapping->host); unsigned long long start = folio_pos(folio); size_t flen = folio_size(folio); @@ -535,7 +535,7 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio, goto error; } - rreq->no_unlock_folio = folio_index(folio); + rreq->no_unlock_folio = folio->index; __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags); ret = netfs_begin_cache_read(rreq, ctx); if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index 93dc76f34e39..9a0d32e4b422 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -221,10 +221,11 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, if (unlikely(fault_in_iov_iter_readable(iter, part) == part)) break; - ret = -ENOMEM; folio = netfs_grab_folio_for_write(mapping, pos, part); - if (!folio) + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); break; + } flen = folio_size(folio); offset = pos & (flen - 1); @@ -343,7 +344,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, break; default: WARN(true, "Unexpected modify type %u ix=%lx\n", - howto, folio_index(folio)); + howto, folio->index); ret = -EIO; goto error_folio_unlock; } @@ -476,6 +477,9 @@ ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode)); + if (!iov_iter_count(from)) + return 0; + if ((iocb->ki_flags & IOCB_DIRECT) || test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags)) return netfs_unbuffered_write_iter(iocb, from); @@ -648,7 +652,7 @@ static void netfs_pages_written_back(struct netfs_io_request *wreq) xas_for_each(&xas, folio, last) { WARN(!folio_test_writeback(folio), "bad 
%zx @%llx page %lx %lx\n", - wreq->len, wreq->start, folio_index(folio), last); + wreq->len, wreq->start, folio->index, last); if ((finfo = netfs_folio_info(folio))) { /* Streaming writes cannot be redirtied whilst under @@ -795,7 +799,7 @@ static void netfs_extend_writeback(struct address_space *mapping, continue; if (xa_is_value(folio)) break; - if (folio_index(folio) != index) { + if (folio->index != index) { xas_reset(xas); break; } @@ -901,7 +905,7 @@ static ssize_t netfs_write_back_from_locked_folio(struct address_space *mapping, long count = wbc->nr_to_write; int ret; - _enter(",%lx,%llx-%llx,%u", folio_index(folio), start, end, caching); + _enter(",%lx,%llx-%llx,%u", folio->index, start, end, caching); wreq = netfs_alloc_request(mapping, NULL, start, folio_size(folio), NETFS_WRITEBACK); @@ -1047,7 +1051,7 @@ search_again: start = folio_pos(folio); /* May regress with THPs */ - _debug("wback %lx", folio_index(folio)); + _debug("wback %lx", folio->index); /* At this point we hold neither the i_pages lock nor the page lock: * the page may be truncated or invalidated (changing page->mapping to diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index 60a40d293c87..bee047e20f5d 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -139,6 +139,9 @@ ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from) _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode)); + if (!iov_iter_count(from)) + return 0; + trace_netfs_write_iter(iocb, from); netfs_stat(&netfs_n_rh_dio_write); @@ -146,7 +149,7 @@ ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret < 0) return ret; ret = generic_write_checks(iocb, from); - if (ret < 0) + if (ret <= 0) goto out; ret = file_remove_privs(file); if (ret < 0) diff --git a/fs/netfs/fscache_cache.c b/fs/netfs/fscache_cache.c index d645f8b302a2..9397ed39b0b4 100644 --- a/fs/netfs/fscache_cache.c +++ b/fs/netfs/fscache_cache.c @@ -179,13 +179,14 @@ EXPORT_SYMBOL(fscache_acquire_cache); void fscache_put_cache(struct fscache_cache *cache, enum fscache_cache_trace where) { - unsigned int debug_id = cache->debug_id; + unsigned int debug_id; bool zero; int ref; if (IS_ERR_OR_NULL(cache)) return; + debug_id = cache->debug_id; zero = __refcount_dec_and_test(&cache->ref, &ref); trace_fscache_cache(debug_id, ref - 1, where); diff --git a/fs/netfs/io.c b/fs/netfs/io.c index 4309edf33862..4261ad6c55b6 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -124,7 +124,7 @@ static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq, /* We might have multiple writes from the same huge * folio, but we mustn't unlock a folio more than once. 
*/ - if (have_unlocked && folio_index(folio) <= unlocked) + if (have_unlocked && folio->index <= unlocked) continue; unlocked = folio_next_index(folio) - 1; trace_netfs_folio(folio, netfs_folio_trace_end_copy); @@ -748,6 +748,8 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync) if (!rreq->submitted) { netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit); + if (rreq->origin == NETFS_DIO_READ) + inode_dio_end(rreq->inode); ret = 0; goto out; } diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 0e3af37fc924..90051ced8e2a 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -180,7 +180,7 @@ void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) struct netfs_folio *finfo = NULL; size_t flen = folio_size(folio); - _enter("{%lx},%zx,%zx", folio_index(folio), offset, length); + _enter("{%lx},%zx,%zx", folio->index, offset, length); folio_wait_fscache(folio); diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index b4294a8aa2d4..f1eeb4914199 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -108,7 +108,7 @@ struct pnfs_block_dev { struct pnfs_block_dev *children; u64 chunk_size; - struct bdev_handle *bdev_handle; + struct file *bdev_file; u64 disk_offset; u64 pr_key; diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index c97ebc42ec0f..93ef7f864980 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -25,17 +25,17 @@ bl_free_device(struct pnfs_block_dev *dev) } else { if (dev->pr_registered) { const struct pr_ops *ops = - dev->bdev_handle->bdev->bd_disk->fops->pr_ops; + file_bdev(dev->bdev_file)->bd_disk->fops->pr_ops; int error; - error = ops->pr_register(dev->bdev_handle->bdev, + error = ops->pr_register(file_bdev(dev->bdev_file), dev->pr_key, 0, false); if (error) pr_err("failed to unregister PR key.\n"); } - if (dev->bdev_handle) - bdev_release(dev->bdev_handle); + if (dev->bdev_file) + fput(dev->bdev_file); } } @@ -169,7 +169,7 @@ static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset, map->start = dev->start; map->len = dev->len; map->disk_offset = dev->disk_offset; - map->bdev = dev->bdev_handle->bdev; + map->bdev = file_bdev(dev->bdev_file); return true; } @@ -236,26 +236,26 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d, struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) { struct pnfs_block_volume *v = &volumes[idx]; - struct bdev_handle *bdev_handle; + struct file *bdev_file; dev_t dev; dev = bl_resolve_deviceid(server, v, gfp_mask); if (!dev) return -EIO; - bdev_handle = bdev_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, + bdev_file = bdev_file_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, NULL, NULL); - if (IS_ERR(bdev_handle)) { + if (IS_ERR(bdev_file)) { printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n", - MAJOR(dev), MINOR(dev), PTR_ERR(bdev_handle)); - return PTR_ERR(bdev_handle); + MAJOR(dev), MINOR(dev), PTR_ERR(bdev_file)); + return PTR_ERR(bdev_file); } - d->bdev_handle = bdev_handle; - d->len = bdev_nr_bytes(bdev_handle->bdev); + d->bdev_file = bdev_file; + d->len = bdev_nr_bytes(file_bdev(bdev_file)); d->map = bl_map_simple; printk(KERN_INFO "pNFS: using block device %s\n", - bdev_handle->bdev->bd_disk->disk_name); + file_bdev(bdev_file)->bd_disk->disk_name); return 0; } @@ -300,10 +300,10 @@ bl_validate_designator(struct pnfs_block_volume *v) } } -static struct bdev_handle * +static struct file * bl_open_path(struct pnfs_block_volume *v, const char *prefix) { - 
struct bdev_handle *bdev_handle; + struct file *bdev_file; const char *devname; devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/%s%*phN", @@ -311,15 +311,15 @@ bl_open_path(struct pnfs_block_volume *v, const char *prefix) if (!devname) return ERR_PTR(-ENOMEM); - bdev_handle = bdev_open_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE, + bdev_file = bdev_file_open_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE, NULL, NULL); - if (IS_ERR(bdev_handle)) { + if (IS_ERR(bdev_file)) { pr_warn("pNFS: failed to open device %s (%ld)\n", - devname, PTR_ERR(bdev_handle)); + devname, PTR_ERR(bdev_file)); } kfree(devname); - return bdev_handle; + return bdev_file; } static int @@ -327,7 +327,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) { struct pnfs_block_volume *v = &volumes[idx]; - struct bdev_handle *bdev_handle; + struct file *bdev_file; const struct pr_ops *ops; int error; @@ -340,14 +340,14 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, * On other distributions like Debian, the default SCSI by-id path will * point to the dm-multipath device if one exists. */ - bdev_handle = bl_open_path(v, "dm-uuid-mpath-0x"); - if (IS_ERR(bdev_handle)) - bdev_handle = bl_open_path(v, "wwn-0x"); - if (IS_ERR(bdev_handle)) - return PTR_ERR(bdev_handle); - d->bdev_handle = bdev_handle; - - d->len = bdev_nr_bytes(d->bdev_handle->bdev); + bdev_file = bl_open_path(v, "dm-uuid-mpath-0x"); + if (IS_ERR(bdev_file)) + bdev_file = bl_open_path(v, "wwn-0x"); + if (IS_ERR(bdev_file)) + return PTR_ERR(bdev_file); + d->bdev_file = bdev_file; + + d->len = bdev_nr_bytes(file_bdev(d->bdev_file)); d->map = bl_map_simple; d->pr_key = v->scsi.pr_key; @@ -355,20 +355,20 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, return -ENODEV; pr_info("pNFS: using block device %s (reservation key 0x%llx)\n", - d->bdev_handle->bdev->bd_disk->disk_name, d->pr_key); + file_bdev(d->bdev_file)->bd_disk->disk_name, d->pr_key); - ops = d->bdev_handle->bdev->bd_disk->fops->pr_ops; + ops = file_bdev(d->bdev_file)->bd_disk->fops->pr_ops; if (!ops) { pr_err("pNFS: block device %s does not support reservations.", - d->bdev_handle->bdev->bd_disk->disk_name); + file_bdev(d->bdev_file)->bd_disk->disk_name); error = -EINVAL; goto out_blkdev_put; } - error = ops->pr_register(d->bdev_handle->bdev, 0, d->pr_key, true); + error = ops->pr_register(file_bdev(d->bdev_file), 0, d->pr_key, true); if (error) { pr_err("pNFS: failed to register key for block device %s.", - d->bdev_handle->bdev->bd_disk->disk_name); + file_bdev(d->bdev_file)->bd_disk->disk_name); goto out_blkdev_put; } @@ -376,7 +376,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, return 0; out_blkdev_put: - bdev_release(d->bdev_handle); + fput(d->bdev_file); return error; } diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 760d27dd7225..8adfcd4c8c1a 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -356,15 +356,12 @@ static const struct svc_version *nfs4_callback_version[] = { [4] = &nfs4_callback_version4, }; -static struct svc_stat nfs4_callback_stats; - static struct svc_program nfs4_callback_program = { .pg_prog = NFS4_CALLBACK, /* RPC service number */ .pg_nvers = ARRAY_SIZE(nfs4_callback_version), /* Number of entries */ .pg_vers = nfs4_callback_version, /* version table */ .pg_name = "NFSv4 callback", /* service name */ .pg_class = "nfs", /* authentication class */ - .pg_stats = &nfs4_callback_stats, .pg_authenticate = 
nfs_callback_authenticate, .pg_init_request = svc_generic_init_request, .pg_rpcbind_set = svc_generic_rpcbind_set, diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 44eca51b2808..fbdc9ca80f71 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -246,7 +246,7 @@ void nfs_free_client(struct nfs_client *clp) put_nfs_version(clp->cl_nfs_mod); kfree(clp->cl_hostname); kfree(clp->cl_acceptor); - kfree(clp); + kfree_rcu(clp, rcu); } EXPORT_SYMBOL_GPL(nfs_free_client); @@ -1006,6 +1006,14 @@ struct nfs_server *nfs_alloc_server(void) } EXPORT_SYMBOL_GPL(nfs_alloc_server); +static void delayed_free(struct rcu_head *p) +{ + struct nfs_server *server = container_of(p, struct nfs_server, rcu); + + nfs_free_iostats(server->io_stats); + kfree(server); +} + /* * Free up a server record */ @@ -1031,10 +1039,9 @@ void nfs_free_server(struct nfs_server *server) ida_destroy(&server->lockowner_id); ida_destroy(&server->openowner_id); - nfs_free_iostats(server->io_stats); put_cred(server->cred); - kfree(server); nfs_release_automount_timer(); + call_rcu(&server->rcu, delayed_free); } EXPORT_SYMBOL_GPL(nfs_free_server); diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index fa1a14def45c..d4a42ce0c7e3 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -156,8 +156,8 @@ static int nfs_delegation_claim_locks(struct nfs4_state *state, const nfs4_state list = &flctx->flc_posix; spin_lock(&flctx->flc_lock); restart: - list_for_each_entry(fl, list, fl_list) { - if (nfs_file_open_context(fl->fl_file)->state != state) + for_each_file_lock(fl, list) { + if (nfs_file_open_context(fl->c.flc_file)->state != state) continue; spin_unlock(&flctx->flc_lock); status = nfs4_lock_delegation_recall(fl, state, stateid); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index c8ecbe999059..ac505671efbd 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1431,9 +1431,9 @@ static bool nfs_verifier_is_delegated(struct dentry *dentry) static void nfs_set_verifier_locked(struct dentry *dentry, unsigned long verf) { struct inode *inode = d_inode(dentry); - struct inode *dir = d_inode(dentry->d_parent); + struct inode *dir = d_inode_rcu(dentry->d_parent); - if (!nfs_verify_change_attribute(dir, verf)) + if (!dir || !nfs_verify_change_attribute(dir, verf)) return; if (inode && NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) nfs_set_verifier_delegated(&verf); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index c03926a1cc73..7af5d270de28 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -1037,8 +1037,7 @@ int __init nfs_init_directcache(void) { nfs_direct_cachep = kmem_cache_create("nfs_direct_cache", sizeof(struct nfs_direct_req), - 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + 0, SLAB_RECLAIM_ACCOUNT, NULL); if (nfs_direct_cachep == NULL) return -ENOMEM; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 8577ccf621f5..407c6e15afe2 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -720,15 +720,15 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) { struct inode *inode = filp->f_mapping->host; int status = 0; - unsigned int saved_type = fl->fl_type; + unsigned int saved_type = fl->c.flc_type; /* Try local locking first */ posix_test_lock(filp, fl); - if (fl->fl_type != F_UNLCK) { + if (fl->c.flc_type != F_UNLCK) { /* found a conflict */ goto out; } - fl->fl_type = saved_type; + fl->c.flc_type = saved_type; if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) goto out_noconflict; @@ -740,7 +740,7 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) out: return status; 
out_noconflict: - fl->fl_type = F_UNLCK; + fl->c.flc_type = F_UNLCK; goto out; } @@ -765,7 +765,7 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) * If we're signalled while cleaning up locks on process exit, we * still need to complete the unlock. */ - if (status < 0 && !(fl->fl_flags & FL_CLOSE)) + if (status < 0 && !(fl->c.flc_flags & FL_CLOSE)) return status; } @@ -832,12 +832,12 @@ int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) int is_local = 0; dprintk("NFS: lock(%pD2, t=%x, fl=%x, r=%lld:%lld)\n", - filp, fl->fl_type, fl->fl_flags, + filp, fl->c.flc_type, fl->c.flc_flags, (long long)fl->fl_start, (long long)fl->fl_end); nfs_inc_stats(inode, NFSIOS_VFSLOCK); - if (fl->fl_flags & FL_RECLAIM) + if (fl->c.flc_flags & FL_RECLAIM) return -ENOGRACE; if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL) @@ -851,7 +851,7 @@ int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) if (IS_GETLK(cmd)) ret = do_getlk(filp, cmd, fl, is_local); - else if (fl->fl_type == F_UNLCK) + else if (lock_is_unlock(fl)) ret = do_unlk(filp, cmd, fl, is_local); else ret = do_setlk(filp, cmd, fl, is_local); @@ -869,16 +869,16 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) int is_local = 0; dprintk("NFS: flock(%pD2, t=%x, fl=%x)\n", - filp, fl->fl_type, fl->fl_flags); + filp, fl->c.flc_type, fl->c.flc_flags); - if (!(fl->fl_flags & FL_FLOCK)) + if (!(fl->c.flc_flags & FL_FLOCK)) return -ENOLCK; if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK) is_local = 1; /* We're simulating flock() locks using posix locks on the server */ - if (fl->fl_type == F_UNLCK) + if (lock_is_unlock(fl)) return do_unlk(filp, cmd, fl, is_local); return do_setlk(filp, cmd, fl, is_local); } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index ebb8d60e1152..93ea49a7eb61 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -2372,7 +2372,7 @@ static int __init nfs_init_inodecache(void) nfs_inode_cachep = kmem_cache_create("nfs_inode_cache", sizeof(struct nfs_inode), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), + SLAB_ACCOUNT), init_once); if (nfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 2de66e4e8280..cbbe3f0193b8 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -963,7 +963,7 @@ nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl) struct nfs_open_context *ctx = nfs_file_open_context(filp); int status; - if (fl->fl_flags & FL_CLOSE) { + if (fl->c.flc_flags & FL_CLOSE) { l_ctx = nfs_get_lock_context(ctx); if (IS_ERR(l_ctx)) l_ctx = NULL; diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c index 49aaf28a6950..b6e3d8f77b91 100644 --- a/fs/nfs/nfs42xattr.c +++ b/fs/nfs/nfs42xattr.c @@ -1017,7 +1017,7 @@ int __init nfs4_xattr_cache_init(void) nfs4_xattr_cache_cachep = kmem_cache_create("nfs4_xattr_cache_cache", sizeof(struct nfs4_xattr_cache), 0, - (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), + (SLAB_RECLAIM_ACCOUNT), nfs4_xattr_cache_init_once); if (nfs4_xattr_cache_cachep == NULL) return -ENOMEM; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 581698f1b7b2..6ff41ceb9f1c 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -330,7 +330,7 @@ extern int update_open_stateid(struct nfs4_state *state, const nfs4_stateid *deleg_stateid, fmode_t fmode); extern int nfs4_proc_setlease(struct file *file, int arg, - struct file_lock **lease, void **priv); + struct file_lease **lease, void **priv); extern int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo); extern void 
nfs4_update_changeattr(struct inode *dir, diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index e238abc78a13..1cd9652f3c28 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -439,7 +439,7 @@ void nfs42_ssc_unregister_ops(void) } #endif /* CONFIG_NFS_V4_2 */ -static int nfs4_setlease(struct file *file, int arg, struct file_lock **lease, +static int nfs4_setlease(struct file *file, int arg, struct file_lease **lease, void **priv) { return nfs4_proc_setlease(file, arg, lease, priv); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 23819a756508..815996cb27fc 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6800,7 +6800,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); switch (status) { case 0: - request->fl_type = F_UNLCK; + request->c.flc_type = F_UNLCK; break; case -NFS4ERR_DENIED: status = 0; @@ -7018,8 +7018,8 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, /* Ensure this is an unlock - when canceling a lock, the * canceled lock is passed in, and it won't be an unlock. */ - fl->fl_type = F_UNLCK; - if (fl->fl_flags & FL_CLOSE) + fl->c.flc_type = F_UNLCK; + if (fl->c.flc_flags & FL_CLOSE) set_bit(NFS_CONTEXT_UNLOCK, &ctx->flags); data = nfs4_alloc_unlockdata(fl, ctx, lsp, seqid); @@ -7045,11 +7045,11 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * struct rpc_task *task; struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t); int status = 0; - unsigned char fl_flags = request->fl_flags; + unsigned char saved_flags = request->c.flc_flags; status = nfs4_set_lock_state(state, request); /* Unlock _before_ we do the RPC call */ - request->fl_flags |= FL_EXISTS; + request->c.flc_flags |= FL_EXISTS; /* Exclude nfs_delegation_claim_locks() */ mutex_lock(&sp->so_delegreturn_mutex); /* Exclude nfs4_reclaim_open_stateid() - note nesting! 
*/ @@ -7073,14 +7073,16 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * status = -ENOMEM; if (IS_ERR(seqid)) goto out; - task = nfs4_do_unlck(request, nfs_file_open_context(request->fl_file), lsp, seqid); + task = nfs4_do_unlck(request, + nfs_file_open_context(request->c.flc_file), + lsp, seqid); status = PTR_ERR(task); if (IS_ERR(task)) goto out; status = rpc_wait_for_completion_task(task); rpc_put_task(task); out: - request->fl_flags = fl_flags; + request->c.flc_flags = saved_flags; trace_nfs4_unlock(request, state, F_SETLK, status); return status; } @@ -7191,7 +7193,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) renew_lease(NFS_SERVER(d_inode(data->ctx->dentry)), data->timestamp); if (data->arg.new_lock && !data->cancelled) { - data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS); + data->fl.c.flc_flags &= ~(FL_SLEEP | FL_ACCESS); if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0) goto out_restart; } @@ -7292,7 +7294,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f if (nfs_server_capable(state->inode, NFS_CAP_MOVEABLE)) task_setup_data.flags |= RPC_TASK_MOVEABLE; - data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file), + data = nfs4_alloc_lockdata(fl, + nfs_file_open_context(fl->c.flc_file), fl->fl_u.nfs4_fl.owner, GFP_KERNEL); if (data == NULL) return -ENOMEM; @@ -7398,10 +7401,10 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock { struct nfs_inode *nfsi = NFS_I(state->inode); struct nfs4_state_owner *sp = state->owner; - unsigned char fl_flags = request->fl_flags; + unsigned char flags = request->c.flc_flags; int status; - request->fl_flags |= FL_ACCESS; + request->c.flc_flags |= FL_ACCESS; status = locks_lock_inode_wait(state->inode, request); if (status < 0) goto out; @@ -7410,7 +7413,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock if (test_bit(NFS_DELEGATED_STATE, &state->flags)) { /* Yes: cache locks! */ /* ...but avoid races with delegation recall... */ - request->fl_flags = fl_flags & ~FL_SLEEP; + request->c.flc_flags = flags & ~FL_SLEEP; status = locks_lock_inode_wait(state->inode, request); up_read(&nfsi->rwsem); mutex_unlock(&sp->so_delegreturn_mutex); @@ -7420,7 +7423,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock mutex_unlock(&sp->so_delegreturn_mutex); status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW); out: - request->fl_flags = fl_flags; + request->c.flc_flags = flags; return status; } @@ -7562,7 +7565,7 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) if (!(IS_SETLK(cmd) || IS_SETLKW(cmd))) return -EINVAL; - if (request->fl_type == F_UNLCK) { + if (lock_is_unlock(request)) { if (state != NULL) return nfs4_proc_unlck(state, cmd, request); return 0; @@ -7571,7 +7574,7 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) if (state == NULL) return -ENOLCK; - if ((request->fl_flags & FL_POSIX) && + if ((request->c.flc_flags & FL_POSIX) && !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags)) return -ENOLCK; @@ -7579,7 +7582,7 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) * Don't rely on the VFS having checked the file open mode, * since it won't do this for flock() locks. 
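
The hazard the comment describes is easy to reproduce from userspace: flock(2) grants an exclusive lock even on a descriptor opened O_RDONLY, which is why the switch below re-checks f_mode by hand. A minimal sketch; any readable file works and the path here is arbitrary:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/file.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/etc/hostname", O_RDONLY);

		/* flock() ignores the open mode entirely... */
		if (fd < 0 || flock(fd, LOCK_EX | LOCK_NB) == -1) {
			perror("flock");
			return 1;
		}
		/* ...whereas fcntl(fd, F_SETLK, ...) with F_WRLCK on this same
		 * read-only descriptor would fail with EBADF, the check the
		 * NFSv4 code reproduces above. */
		close(fd);
		return 0;
	}
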
*/ - switch (request->fl_type) { + switch (request->c.flc_type) { case F_RDLCK: if (!(filp->f_mode & FMODE_READ)) return -EBADF; @@ -7601,7 +7604,7 @@ static int nfs4_delete_lease(struct file *file, void **priv) return generic_setlease(file, F_UNLCK, NULL, priv); } -static int nfs4_add_lease(struct file *file, int arg, struct file_lock **lease, +static int nfs4_add_lease(struct file *file, int arg, struct file_lease **lease, void **priv) { struct inode *inode = file_inode(file); @@ -7619,7 +7622,7 @@ static int nfs4_add_lease(struct file *file, int arg, struct file_lock **lease, return -EAGAIN; } -int nfs4_proc_setlease(struct file *file, int arg, struct file_lock **lease, +int nfs4_proc_setlease(struct file *file, int arg, struct file_lease **lease, void **priv) { switch (arg) { diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 9a5d911a7edc..8cfabdbda336 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -847,15 +847,15 @@ void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode) */ static struct nfs4_lock_state * __nfs4_find_lock_state(struct nfs4_state *state, - fl_owner_t fl_owner, fl_owner_t fl_owner2) + fl_owner_t owner, fl_owner_t owner2) { struct nfs4_lock_state *pos, *ret = NULL; list_for_each_entry(pos, &state->lock_states, ls_locks) { - if (pos->ls_owner == fl_owner) { + if (pos->ls_owner == owner) { ret = pos; break; } - if (pos->ls_owner == fl_owner2) + if (pos->ls_owner == owner2) ret = pos; } if (ret) @@ -868,7 +868,7 @@ __nfs4_find_lock_state(struct nfs4_state *state, * exists, return an uninitialized one. * */ -static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) +static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t owner) { struct nfs4_lock_state *lsp; struct nfs_server *server = state->owner->so_server; @@ -879,7 +879,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f nfs4_init_seqid_counter(&lsp->ls_seqid); refcount_set(&lsp->ls_count, 1); lsp->ls_state = state; - lsp->ls_owner = fl_owner; + lsp->ls_owner = owner; lsp->ls_seqid.owner_id = ida_alloc(&server->lockowner_id, GFP_KERNEL_ACCOUNT); if (lsp->ls_seqid.owner_id < 0) goto out_free; @@ -980,7 +980,7 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl) if (fl->fl_ops != NULL) return 0; - lsp = nfs4_get_lock_state(state, fl->fl_owner); + lsp = nfs4_get_lock_state(state, fl->c.flc_owner); if (lsp == NULL) return -ENOMEM; fl->fl_u.nfs4_fl.owner = lsp; @@ -993,7 +993,7 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst, const struct nfs_lock_context *l_ctx) { struct nfs4_lock_state *lsp; - fl_owner_t fl_owner, fl_flock_owner; + fl_owner_t owner, fl_flock_owner; int ret = -ENOENT; if (l_ctx == NULL) @@ -1002,11 +1002,11 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst, if (test_bit(LK_STATE_IN_USE, &state->flags) == 0) goto out; - fl_owner = l_ctx->lockowner; + owner = l_ctx->lockowner; fl_flock_owner = l_ctx->open_context->flock_owner; spin_lock(&state->state_lock); - lsp = __nfs4_find_lock_state(state, fl_owner, fl_flock_owner); + lsp = __nfs4_find_lock_state(state, owner, fl_flock_owner); if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags)) ret = -EIO; else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) { @@ -1529,8 +1529,8 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_ down_write(&nfsi->rwsem); spin_lock(&flctx->flc_lock); restart: - list_for_each_entry(fl, list, fl_list) { - if 
(nfs_file_open_context(fl->fl_file)->state != state) + for_each_file_lock(fl, list) { + if (nfs_file_open_context(fl->c.flc_file)->state != state) continue; spin_unlock(&flctx->flc_lock); status = ops->recover_lock(state, fl); diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index d27919d7241d..fd7cb15b08b2 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -699,7 +699,7 @@ DECLARE_EVENT_CLASS(nfs4_lock_event, __entry->error = error < 0 ? -error : 0; __entry->cmd = cmd; - __entry->type = request->fl_type; + __entry->type = request->c.flc_type; __entry->start = request->fl_start; __entry->end = request->fl_end; __entry->dev = inode->i_sb->s_dev; @@ -771,7 +771,7 @@ TRACE_EVENT(nfs4_set_lock, __entry->error = error < 0 ? -error : 0; __entry->cmd = cmd; - __entry->type = request->fl_type; + __entry->type = request->c.flc_type; __entry->start = request->fl_start; __entry->end = request->fl_end; __entry->dev = inode->i_sb->s_dev; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 69406e60f391..1416099dfcd1 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1305,7 +1305,7 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct static inline int nfs4_lock_type(struct file_lock *fl, int block) { - if (fl->fl_type == F_RDLCK) + if (lock_is_read(fl)) return block ? NFS4_READW_LT : NFS4_READ_LT; return block ? NFS4_WRITEW_LT : NFS4_WRITE_LT; } @@ -5052,10 +5052,10 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) fl->fl_end = fl->fl_start + (loff_t)length - 1; if (length == ~(uint64_t)0) fl->fl_end = OFFSET_MAX; - fl->fl_type = F_WRLCK; + fl->c.flc_type = F_WRLCK; if (type & 1) - fl->fl_type = F_RDLCK; - fl->fl_pid = 0; + fl->c.flc_type = F_RDLCK; + fl->c.flc_pid = 0; } p = xdr_decode_hyper(p, &clientid); /* read 8 bytes */ namelen = be32_to_cpup(p); /* read 4 bytes */ /* have read all 32 bytes now */ diff --git a/fs/nfs/write.c b/fs/nfs/write.c index bb79d3a886ae..84bb85264572 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1301,7 +1301,7 @@ static bool is_whole_file_wrlock(struct file_lock *fl) { return fl->fl_start == 0 && fl->fl_end == OFFSET_MAX && - fl->fl_type == F_WRLCK; + lock_is_write(fl); } /* If we know the page is up to date, and we're not using byte range locks (or @@ -1335,13 +1335,13 @@ static int nfs_can_extend_write(struct file *file, struct folio *folio, spin_lock(&flctx->flc_lock); if (!list_empty(&flctx->flc_posix)) { fl = list_first_entry(&flctx->flc_posix, struct file_lock, - fl_list); + c.flc_list); if (is_whole_file_wrlock(fl)) ret = 1; } else if (!list_empty(&flctx->flc_flock)) { fl = list_first_entry(&flctx->flc_flock, struct file_lock, - fl_list); - if (fl->fl_type == F_WRLCK) + c.flc_list); + if (lock_is_write(fl)) ret = 1; } spin_unlock(&flctx->flc_lock); diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 46fd74d91ea9..3c040c81c77d 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -328,10 +328,10 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, } static void -nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls) +nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) { struct nfs4_client *clp = ls->ls_stid.sc_client; - struct block_device *bdev = ls->ls_file->nf_file->f_path.mnt->mnt_sb->s_bdev; + struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev; bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, nfsd4_scsi_pr_key(clp), 0, true); diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index 
4cbe0434cbb8..66a05fefae98 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -80,8 +80,6 @@ enum { int nfsd_drc_slab_create(void); void nfsd_drc_slab_free(void); -int nfsd_net_reply_cache_init(struct nfsd_net *nn); -void nfsd_net_reply_cache_destroy(struct nfsd_net *nn); int nfsd_reply_cache_init(struct nfsd_net *); void nfsd_reply_cache_shutdown(struct nfsd_net *); int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 9cb7f0c33df5..ddd3e0d9cfa6 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -61,13 +61,10 @@ static DEFINE_PER_CPU(unsigned long, nfsd_file_total_age); static DEFINE_PER_CPU(unsigned long, nfsd_file_evictions); struct nfsd_fcache_disposal { - struct work_struct work; spinlock_t lock; struct list_head freeme; }; -static struct workqueue_struct *nfsd_filecache_wq __read_mostly; - static struct kmem_cache *nfsd_file_slab; static struct kmem_cache *nfsd_file_mark_slab; static struct list_lru nfsd_file_lru; @@ -283,7 +280,7 @@ nfsd_file_free(struct nfsd_file *nf) nfsd_file_mark_put(nf->nf_mark); if (nf->nf_file) { nfsd_file_check_write_error(nf); - filp_close(nf->nf_file, NULL); + nfsd_filp_close(nf->nf_file); } /* @@ -421,7 +418,37 @@ nfsd_file_dispose_list_delayed(struct list_head *dispose) spin_lock(&l->lock); list_move_tail(&nf->nf_lru, &l->freeme); spin_unlock(&l->lock); - queue_work(nfsd_filecache_wq, &l->work); + svc_wake_up(nn->nfsd_serv); + } +} + +/** + * nfsd_file_net_dispose - deal with nfsd_files waiting to be disposed. + * @nn: nfsd_net in which to find files to be disposed. + * + * When files held open for nfsv3 are removed from the filecache, whether + * due to memory pressure or garbage collection, they are queued to + * a per-net-ns queue. This function completes the disposal, either + * directly or by waking another nfsd thread to help with the work. + */ +void nfsd_file_net_dispose(struct nfsd_net *nn) +{ + struct nfsd_fcache_disposal *l = nn->fcache_disposal; + + if (!list_empty(&l->freeme)) { + LIST_HEAD(dispose); + int i; + + spin_lock(&l->lock); + for (i = 0; i < 8 && !list_empty(&l->freeme); i++) + list_move(l->freeme.next, &dispose); + spin_unlock(&l->lock); + if (!list_empty(&l->freeme)) + /* Wake up another thread to share the work + * *before* doing any actual disposing. + */ + svc_wake_up(nn->nfsd_serv); + nfsd_file_dispose_list(&dispose); } } @@ -631,28 +658,6 @@ nfsd_file_close_inode_sync(struct inode *inode) list_del_init(&nf->nf_lru); nfsd_file_free(nf); } - flush_delayed_fput(); -} - -/** - * nfsd_file_delayed_close - close unused nfsd_files - * @work: dummy - * - * Scrape the freeme list for this nfsd_net, and then dispose of them - * all. 
- */ -static void -nfsd_file_delayed_close(struct work_struct *work) -{ - LIST_HEAD(head); - struct nfsd_fcache_disposal *l = container_of(work, - struct nfsd_fcache_disposal, work); - - spin_lock(&l->lock); - list_splice_init(&l->freeme, &head); - spin_unlock(&l->lock); - - nfsd_file_dispose_list(&head); } static int @@ -662,8 +667,8 @@ nfsd_file_lease_notifier_call(struct notifier_block *nb, unsigned long arg, struct file_lock *fl = data; /* Only close files for F_SETLEASE leases */ - if (fl->fl_flags & FL_LEASE) - nfsd_file_close_inode(file_inode(fl->fl_file)); + if (fl->c.flc_flags & FL_LEASE) + nfsd_file_close_inode(file_inode(fl->c.flc_file)); return 0; } @@ -717,25 +722,18 @@ nfsd_file_cache_init(void) return ret; ret = -ENOMEM; - nfsd_filecache_wq = alloc_workqueue("nfsd_filecache", WQ_UNBOUND, 0); - if (!nfsd_filecache_wq) - goto out; - - nfsd_file_slab = kmem_cache_create("nfsd_file", - sizeof(struct nfsd_file), 0, 0, NULL); + nfsd_file_slab = KMEM_CACHE(nfsd_file, 0); if (!nfsd_file_slab) { pr_err("nfsd: unable to create nfsd_file_slab\n"); goto out_err; } - nfsd_file_mark_slab = kmem_cache_create("nfsd_file_mark", - sizeof(struct nfsd_file_mark), 0, 0, NULL); + nfsd_file_mark_slab = KMEM_CACHE(nfsd_file_mark, 0); if (!nfsd_file_mark_slab) { pr_err("nfsd: unable to create nfsd_file_mark_slab\n"); goto out_err; } - ret = list_lru_init(&nfsd_file_lru); if (ret) { pr_err("nfsd: failed to init nfsd_file_lru: %d\n", ret); @@ -785,8 +783,6 @@ out_err: nfsd_file_slab = NULL; kmem_cache_destroy(nfsd_file_mark_slab); nfsd_file_mark_slab = NULL; - destroy_workqueue(nfsd_filecache_wq); - nfsd_filecache_wq = NULL; rhltable_destroy(&nfsd_file_rhltable); goto out; } @@ -832,7 +828,6 @@ nfsd_alloc_fcache_disposal(void) l = kmalloc(sizeof(*l), GFP_KERNEL); if (!l) return NULL; - INIT_WORK(&l->work, nfsd_file_delayed_close); spin_lock_init(&l->lock); INIT_LIST_HEAD(&l->freeme); return l; @@ -841,7 +836,6 @@ nfsd_alloc_fcache_disposal(void) static void nfsd_free_fcache_disposal(struct nfsd_fcache_disposal *l) { - cancel_work_sync(&l->work); nfsd_file_dispose_list(&l->freeme); kfree(l); } @@ -910,8 +904,6 @@ nfsd_file_cache_shutdown(void) fsnotify_wait_marks_destroyed(); kmem_cache_destroy(nfsd_file_mark_slab); nfsd_file_mark_slab = NULL; - destroy_workqueue(nfsd_filecache_wq); - nfsd_filecache_wq = NULL; rhltable_destroy(&nfsd_file_rhltable); for_each_possible_cpu(i) { diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index e54165a3224f..c61884def906 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -56,6 +56,7 @@ void nfsd_file_cache_shutdown_net(struct net *net); void nfsd_file_put(struct nfsd_file *nf); struct nfsd_file *nfsd_file_get(struct nfsd_file *nf); void nfsd_file_close_inode_sync(struct inode *inode); +void nfsd_file_net_dispose(struct nfsd_net *nn); bool nfsd_file_is_cached(struct inode *inode); __be32 nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **nfp); diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 74b4360779a1..d4be519b5734 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -11,8 +11,10 @@ #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/filelock.h> +#include <linux/nfs4.h> #include <linux/percpu_counter.h> #include <linux/siphash.h> +#include <linux/sunrpc/stats.h> /* Hash tables for nfs4_clientid state */ #define CLIENT_HASH_BITS 4 @@ -26,10 +28,22 @@ struct nfsd4_client_tracking_ops; enum { /* cache misses due only to checksum comparison failures */ - 
NFSD_NET_PAYLOAD_MISSES, + NFSD_STATS_PAYLOAD_MISSES, /* amount of memory (in bytes) currently consumed by the DRC */ - NFSD_NET_DRC_MEM_USAGE, - NFSD_NET_COUNTERS_NUM + NFSD_STATS_DRC_MEM_USAGE, + NFSD_STATS_RC_HITS, /* repcache hits */ + NFSD_STATS_RC_MISSES, /* repcache misses */ + NFSD_STATS_RC_NOCACHE, /* uncached reqs */ + NFSD_STATS_FH_STALE, /* FH stale error */ + NFSD_STATS_IO_READ, /* bytes returned to read requests */ + NFSD_STATS_IO_WRITE, /* bytes passed in write requests */ +#ifdef CONFIG_NFSD_V4 + NFSD_STATS_FIRST_NFS4_OP, /* count of individual nfsv4 operations */ + NFSD_STATS_LAST_NFS4_OP = NFSD_STATS_FIRST_NFS4_OP + LAST_NFS4_OP, +#define NFSD_STATS_NFS4_OP(op) (NFSD_STATS_FIRST_NFS4_OP + (op)) + NFSD_STATS_WDELEG_GETATTR, /* count of getattr conflict with wdeleg */ +#endif + NFSD_STATS_COUNTERS_NUM }; /* @@ -164,7 +178,10 @@ struct nfsd_net { atomic_t num_drc_entries; /* Per-netns stats counters */ - struct percpu_counter counter[NFSD_NET_COUNTERS_NUM]; + struct percpu_counter counter[NFSD_STATS_COUNTERS_NUM]; + + /* sunrpc svc stats */ + struct svc_stat nfsd_svcstats; /* longest hash chain seen */ unsigned int longest_chain; @@ -192,6 +209,10 @@ struct nfsd_net { atomic_t nfsd_courtesy_clients; struct shrinker *nfsd_client_shrinker; struct work_struct nfsd_shrinker_work; + + /* last time an admin-revoke happened for NFSv4.0 */ + time64_t nfs40_last_revoke; + }; /* Simple check to find out if a given net was properly initialized */ diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index b78eceebd945..dfcc957e460d 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -71,13 +71,15 @@ nfsd3_proc_setattr(struct svc_rqst *rqstp) struct nfsd_attrs attrs = { .na_iattr = &argp->attrs, }; + const struct timespec64 *guardtime = NULL; dprintk("nfsd: SETATTR(3) %s\n", SVCFH_fmt(&argp->fh)); fh_copy(&resp->fh, &argp->fh); - resp->status = nfsd_setattr(rqstp, &resp->fh, &attrs, - argp->check_guard, argp->guardtime); + if (argp->check_guard) + guardtime = &argp->guardtime; + resp->status = nfsd_setattr(rqstp, &resp->fh, &attrs, guardtime); return rpc_success; } diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index f32128955ec8..a7a07470c1f8 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -295,17 +295,14 @@ svcxdr_decode_sattr3(struct svc_rqst *rqstp, struct xdr_stream *xdr, static bool svcxdr_decode_sattrguard3(struct xdr_stream *xdr, struct nfsd3_sattrargs *args) { - __be32 *p; u32 check; if (xdr_stream_decode_bool(xdr, &check) < 0) return false; if (check) { - p = xdr_inline_decode(xdr, XDR_UNIT * 2); - if (!p) + if (!svcxdr_decode_nfstime3(xdr, &args->guardtime)) return false; args->check_guard = 1; - args->guardtime = be32_to_cpup(p); } else args->check_guard = 0; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 926c29879c6a..87c9547989f6 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -45,7 +45,7 @@ #define NFSDDBG_FACILITY NFSDDBG_PROC -static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason); +static void nfsd4_mark_cb_fault(struct nfs4_client *clp); #define NFSPROC4_CB_NULL 0 #define NFSPROC4_CB_COMPOUND 1 @@ -85,7 +85,21 @@ static void encode_uint32(struct xdr_stream *xdr, u32 n) static void encode_bitmap4(struct xdr_stream *xdr, const __u32 *bitmap, size_t len) { - WARN_ON_ONCE(xdr_stream_encode_uint32_array(xdr, bitmap, len) < 0); + xdr_stream_encode_uint32_array(xdr, bitmap, len); +} + +static int decode_cb_fattr4(struct xdr_stream *xdr, uint32_t *bitmap, + struct nfs4_cb_fattr *fattr) +{ + 
fattr->ncf_cb_change = 0; + fattr->ncf_cb_fsize = 0; + if (bitmap[0] & FATTR4_WORD0_CHANGE) + if (xdr_stream_decode_u64(xdr, &fattr->ncf_cb_change) < 0) + return -NFSERR_BAD_XDR; + if (bitmap[0] & FATTR4_WORD0_SIZE) + if (xdr_stream_decode_u64(xdr, &fattr->ncf_cb_fsize) < 0) + return -NFSERR_BAD_XDR; + return 0; } static void encode_nfs_cb_opnum4(struct xdr_stream *xdr, enum nfs_cb_opnum4 op) @@ -334,6 +348,30 @@ encode_cb_recallany4args(struct xdr_stream *xdr, } /* + * CB_GETATTR4args + * struct CB_GETATTR4args { + * nfs_fh4 fh; + * bitmap4 attr_request; + * }; + * + * The size and change attributes are the only ones + * guaranteed to be serviced by the client. + */ +static void +encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr, + struct nfs4_cb_fattr *fattr) +{ + struct nfs4_delegation *dp = + container_of(fattr, struct nfs4_delegation, dl_cb_fattr); + struct knfsd_fh *fh = &dp->dl_stid.sc_file->fi_fhandle; + + encode_nfs_cb_opnum4(xdr, OP_CB_GETATTR); + encode_nfs_fh4(xdr, fh); + encode_bitmap4(xdr, fattr->ncf_cb_bmap, ARRAY_SIZE(fattr->ncf_cb_bmap)); + hdr->nops++; +} + +/* * CB_SEQUENCE4args * * struct CB_SEQUENCE4args { @@ -469,6 +507,26 @@ static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr, } /* + * 20.1. Operation 3: CB_GETATTR - Get Attributes + */ +static void nfs4_xdr_enc_cb_getattr(struct rpc_rqst *req, + struct xdr_stream *xdr, const void *data) +{ + const struct nfsd4_callback *cb = data; + struct nfs4_cb_fattr *ncf = + container_of(cb, struct nfs4_cb_fattr, ncf_getattr); + struct nfs4_cb_compound_hdr hdr = { + .ident = cb->cb_clp->cl_cb_ident, + .minorversion = cb->cb_clp->cl_minorversion, + }; + + encode_cb_compound4args(xdr, &hdr); + encode_cb_sequence4args(xdr, cb, &hdr); + encode_cb_getattr4args(xdr, &hdr, ncf); + encode_cb_nops(&hdr); +} + +/* * 20.2. Operation 4: CB_RECALL - Recall a Delegation */ static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr, @@ -524,6 +582,42 @@ static int nfs4_xdr_dec_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr, } /* + * 20.1. Operation 3: CB_GETATTR - Get Attributes + */ +static int nfs4_xdr_dec_cb_getattr(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfsd4_callback *cb = data; + struct nfs4_cb_compound_hdr hdr; + int status; + u32 bitmap[3] = {0}; + u32 attrlen; + struct nfs4_cb_fattr *ncf = + container_of(cb, struct nfs4_cb_fattr, ncf_getattr); + + status = decode_cb_compound4res(xdr, &hdr); + if (unlikely(status)) + return status; + + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + + status = decode_cb_op_status(xdr, OP_CB_GETATTR, &cb->cb_status); + if (status) + return status; + if (xdr_stream_decode_uint32_array(xdr, bitmap, 3) < 0) + return -NFSERR_BAD_XDR; + if (xdr_stream_decode_u32(xdr, &attrlen) < 0) + return -NFSERR_BAD_XDR; + if (attrlen > (sizeof(ncf->ncf_cb_change) + sizeof(ncf->ncf_cb_fsize))) + return -NFSERR_BAD_XDR; + status = decode_cb_fattr4(xdr, bitmap, ncf); + return status; +} + +/* + * 20.2. 
Operation 4: CB_RECALL - Recall a Delegation */ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, @@ -674,7 +768,7 @@ static void nfs4_xdr_enc_cb_notify_lock(struct rpc_rqst *req, const struct nfsd4_callback *cb = data; const struct nfsd4_blocked_lock *nbl = container_of(cb, struct nfsd4_blocked_lock, nbl_cb); - struct nfs4_lockowner *lo = (struct nfs4_lockowner *)nbl->nbl_lock.fl_owner; + struct nfs4_lockowner *lo = (struct nfs4_lockowner *)nbl->nbl_lock.c.flc_owner; struct nfs4_cb_compound_hdr hdr = { .ident = 0, .minorversion = cb->cb_clp->cl_minorversion, @@ -831,6 +925,7 @@ static const struct rpc_procinfo nfs4_cb_procedures[] = { PROC(CB_NOTIFY_LOCK, COMPOUND, cb_notify_lock, cb_notify_lock), PROC(CB_OFFLOAD, COMPOUND, cb_offload, cb_offload), PROC(CB_RECALL_ANY, COMPOUND, cb_recall_any, cb_recall_any), + PROC(CB_GETATTR, COMPOUND, cb_getattr, cb_getattr), }; static unsigned int nfs4_cb_counts[ARRAY_SIZE(nfs4_cb_procedures)]; @@ -887,7 +982,16 @@ static struct workqueue_struct *callback_wq; static bool nfsd4_queue_cb(struct nfsd4_callback *cb) { - return queue_work(callback_wq, &cb->cb_work); + trace_nfsd_cb_queue(cb->cb_clp, cb); + return queue_delayed_work(callback_wq, &cb->cb_work, 0); +} + +static void nfsd4_queue_cb_delayed(struct nfsd4_callback *cb, + unsigned long msecs) +{ + trace_nfsd_cb_queue(cb->cb_clp, cb); + queue_delayed_work(callback_wq, &cb->cb_work, + msecs_to_jiffies(msecs)); } static void nfsd41_cb_inflight_begin(struct nfs4_client *clp) @@ -999,18 +1103,18 @@ static void nfsd4_mark_cb_state(struct nfs4_client *clp, int newstate) { if (clp->cl_cb_state != newstate) { clp->cl_cb_state = newstate; - trace_nfsd_cb_state(clp); + trace_nfsd_cb_new_state(clp); } } -static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason) +static void nfsd4_mark_cb_down(struct nfs4_client *clp) { if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags)) return; nfsd4_mark_cb_state(clp, NFSD4_CB_DOWN); } -static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason) +static void nfsd4_mark_cb_fault(struct nfs4_client *clp) { if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags)) return; @@ -1022,7 +1126,7 @@ static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null); if (task->tk_status) - nfsd4_mark_cb_down(clp, task->tk_status); + nfsd4_mark_cb_down(clp); else nfsd4_mark_cb_state(clp, NFSD4_CB_UP); } @@ -1106,6 +1210,7 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb) { struct nfs4_client *clp = cb->cb_clp; + trace_nfsd_cb_destroy(clp, cb); nfsd41_cb_release_slot(cb); if (cb->cb_ops && cb->cb_ops->release) cb->cb_ops->release(cb); @@ -1158,6 +1263,8 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback if (!cb->cb_holds_slot) goto need_restart; + /* This is the operation status code for CB_SEQUENCE */ + trace_nfsd_cb_seq_status(task, cb); switch (cb->cb_seq_status) { case 0: /* @@ -1171,13 +1278,23 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback break; case -ESERVERFAULT: ++session->se_cb_seq_nr; - fallthrough; + nfsd4_mark_cb_fault(cb->cb_clp); + ret = false; + break; case 1: + /* + * cb_seq_status remains 1 if an RPC Reply was never + * received. NFSD can't know if the client processed + * the CB_SEQUENCE operation. Ask the client to send a + * DESTROY_SESSION to recover. 
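+ * A new session starts with a fresh slot table on both sides, so the + * lost-reply ambiguity cannot carry over into later callbacks.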
+ */ + fallthrough; case -NFS4ERR_BADSESSION: - nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status); + nfsd4_mark_cb_fault(cb->cb_clp); ret = false; - break; + goto need_restart; case -NFS4ERR_DELAY: + cb->cb_seq_status = 1; if (!rpc_restart_call(task)) goto out; @@ -1192,14 +1309,11 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback } break; default: - nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status); - dprintk("%s: unprocessed error %d\n", __func__, - cb->cb_seq_status); + nfsd4_mark_cb_fault(cb->cb_clp); } - nfsd41_cb_release_slot(cb); - dprintk("%s: freed slot, new seqid=%d\n", __func__, - clp->cl_cb_session->se_cb_seq_nr); + + trace_nfsd_cb_free_slot(task, cb); if (RPC_SIGNALLED(task)) goto need_restart; @@ -1211,6 +1325,7 @@ retry_nowait: goto out; need_restart: if (!test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) { + trace_nfsd_cb_restart(clp, cb); task->tk_status = 0; cb->cb_need_restart = true; } @@ -1240,7 +1355,7 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) case -EIO: case -ETIMEDOUT: case -EACCES: - nfsd4_mark_cb_down(clp, task->tk_status); + nfsd4_mark_cb_down(clp); } break; default: @@ -1295,12 +1410,13 @@ void nfsd4_shutdown_callback(struct nfs4_client *clp) nfsd41_cb_inflight_wait_complete(clp); } -/* requires cl_lock: */ static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp) { struct nfsd4_session *s; struct nfsd4_conn *c; + lockdep_assert_held(&clp->cl_lock); + list_for_each_entry(s, &clp->cl_sessions, se_perclnt) { list_for_each_entry(c, &s->se_conns, cn_persession) { if (c->cn_flags & NFS4_CDFC4_BACK) @@ -1324,11 +1440,14 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) struct nfsd4_conn *c; int err; + trace_nfsd_cb_bc_update(clp, cb); + /* * This is either an update, or the client dying; in either case, * kill the old client: */ if (clp->cl_cb_client) { + trace_nfsd_cb_bc_shutdown(clp, cb); rpc_shutdown_client(clp->cl_cb_client); clp->cl_cb_client = NULL; put_cred(clp->cl_cb_cred); @@ -1340,13 +1459,15 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) } if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) return; + spin_lock(&clp->cl_lock); /* * Only serialized callback code is allowed to clear these * flags; main nfsd code can only set them: */ - BUG_ON(!(clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)); + WARN_ON(!(clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)); clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags); + memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn)); c = __nfsd4_find_backchannel(clp); if (c) { @@ -1358,7 +1479,7 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) err = setup_callback_client(clp, &conn, ses); if (err) { - nfsd4_mark_cb_down(clp, err); + nfsd4_mark_cb_down(clp); if (c) svc_xprt_put(c->cn_xprt); return; @@ -1369,25 +1490,28 @@ static void nfsd4_run_cb_work(struct work_struct *work) { struct nfsd4_callback *cb = - container_of(work, struct nfsd4_callback, cb_work); + container_of(work, struct nfsd4_callback, cb_work.work); struct nfs4_client *clp = cb->cb_clp; struct rpc_clnt *clnt; int flags; - if (cb->cb_need_restart) { - cb->cb_need_restart = false; - } else { - if (cb->cb_ops && cb->cb_ops->prepare) - cb->cb_ops->prepare(cb); - } + trace_nfsd_cb_start(clp); if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK) nfsd4_process_cb_update(cb); clnt = clp->cl_cb_client; if (!clnt) { - /* Callback channel broken, or client killed; give up: */ - nfsd41_destroy_cb(cb); + if (test_bit(NFSD4_CLIENT_CB_KILL, 
&clp->cl_flags)) + nfsd41_destroy_cb(cb); + else { + /* + * XXX: Ideally, we could wait for the client to + * reconnect, but I haven't figured out how + * to do that yet. + */ + nfsd4_queue_cb_delayed(cb, 25); + } return; } @@ -1400,6 +1524,12 @@ nfsd4_run_cb_work(struct work_struct *work) return; } + if (cb->cb_need_restart) { + cb->cb_need_restart = false; + } else { + if (cb->cb_ops && cb->cb_ops->prepare) + cb->cb_ops->prepare(cb); + } cb->cb_msg.rpc_cred = clp->cl_cb_cred; flags = clp->cl_minorversion ? RPC_TASK_NOCONNECT : RPC_TASK_SOFTCONN; rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | flags, @@ -1414,8 +1544,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, cb->cb_msg.rpc_argp = cb; cb->cb_msg.rpc_resp = cb; cb->cb_ops = ops; - INIT_WORK(&cb->cb_work, nfsd4_run_cb_work); - cb->cb_seq_status = 1; + INIT_DELAYED_WORK(&cb->cb_work, nfsd4_run_cb_work); cb->cb_status = 0; cb->cb_need_restart = false; cb->cb_holds_slot = false; diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 5e8096bc5eaa..4f3072b5979a 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -25,7 +25,7 @@ static struct kmem_cache *nfs4_layout_cache; static struct kmem_cache *nfs4_layout_stateid_cache; static const struct nfsd4_callback_ops nfsd4_cb_layout_ops; -static const struct lock_manager_operations nfsd4_layouts_lm_ops; +static const struct lease_manager_operations nfsd4_layouts_lm_ops; const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = { #ifdef CONFIG_NFSD_FLEXFILELAYOUT @@ -152,6 +152,23 @@ void nfsd4_setup_layout_type(struct svc_export *exp) #endif } +void nfsd4_close_layout(struct nfs4_layout_stateid *ls) +{ + struct nfsd_file *fl; + + spin_lock(&ls->ls_stid.sc_file->fi_lock); + fl = ls->ls_file; + ls->ls_file = NULL; + spin_unlock(&ls->ls_stid.sc_file->fi_lock); + + if (fl) { + if (!nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls) + kernel_setlease(fl->nf_file, F_UNLCK, NULL, + (void **)&ls); + nfsd_file_put(fl); + } +} + static void nfsd4_free_layout_stateid(struct nfs4_stid *stid) { @@ -169,9 +186,7 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid) list_del_init(&ls->ls_perfile); spin_unlock(&fp->fi_lock); - if (!nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls) - vfs_setlease(ls->ls_file->nf_file, F_UNLCK, NULL, (void **)&ls); - nfsd_file_put(ls->ls_file); + nfsd4_close_layout(ls); if (ls->ls_recalled) atomic_dec(&ls->ls_stid.sc_file->fi_lo_recalls); @@ -182,27 +197,26 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid) static int nfsd4_layout_setlease(struct nfs4_layout_stateid *ls) { - struct file_lock *fl; + struct file_lease *fl; int status; if (nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls) return 0; - fl = locks_alloc_lock(); + fl = locks_alloc_lease(); if (!fl) return -ENOMEM; - locks_init_lock(fl); + locks_init_lease(fl); fl->fl_lmops = &nfsd4_layouts_lm_ops; - fl->fl_flags = FL_LAYOUT; - fl->fl_type = F_RDLCK; - fl->fl_end = OFFSET_MAX; - fl->fl_owner = ls; - fl->fl_pid = current->tgid; - fl->fl_file = ls->ls_file->nf_file; - - status = vfs_setlease(fl->fl_file, fl->fl_type, &fl, NULL); + fl->c.flc_flags = FL_LAYOUT; + fl->c.flc_type = F_RDLCK; + fl->c.flc_owner = ls; + fl->c.flc_pid = current->tgid; + fl->c.flc_file = ls->ls_file->nf_file; + + status = kernel_setlease(fl->c.flc_file, fl->c.flc_type, &fl, NULL); if (status) { - locks_free_lock(fl); + locks_free_lease(fl); return status; } BUG_ON(fl != NULL); @@ -236,7 +250,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, 
nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops, NFSPROC4_CLNT_CB_LAYOUT); - if (parent->sc_type == NFS4_DELEG_STID) + if (parent->sc_type == SC_TYPE_DELEG) ls->ls_file = nfsd_file_get(fp->fi_deleg_file); else ls->ls_file = find_any_file(fp); @@ -250,7 +264,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, } spin_lock(&clp->cl_lock); - stp->sc_type = NFS4_LAYOUT_STID; + stp->sc_type = SC_TYPE_LAYOUT; list_add(&ls->ls_perclnt, &clp->cl_lo_states); spin_unlock(&clp->cl_lock); @@ -269,13 +283,13 @@ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, { struct nfs4_layout_stateid *ls; struct nfs4_stid *stid; - unsigned char typemask = NFS4_LAYOUT_STID; + unsigned short typemask = SC_TYPE_LAYOUT; __be32 status; if (create) - typemask |= (NFS4_OPEN_STID | NFS4_LOCK_STID | NFS4_DELEG_STID); + typemask |= (SC_TYPE_OPEN | SC_TYPE_LOCK | SC_TYPE_DELEG); - status = nfsd4_lookup_stateid(cstate, stateid, typemask, &stid, + status = nfsd4_lookup_stateid(cstate, stateid, typemask, 0, &stid, net_generic(SVC_NET(rqstp), nfsd_net_id)); if (status) goto out; @@ -286,7 +300,7 @@ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, goto out_put_stid; } - if (stid->sc_type != NFS4_LAYOUT_STID) { + if (stid->sc_type != SC_TYPE_LAYOUT) { ls = nfsd4_alloc_layout_stateid(cstate, stid, layout_type); nfs4_put_stid(stid); @@ -518,7 +532,7 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp, lrp->lrs_present = true; } else { trace_nfsd_layoutstate_unhash(&ls->ls_stid.sc_stateid); - nfs4_unhash_stid(&ls->ls_stid); + ls->ls_stid.sc_status |= SC_STATUS_CLOSED; lrp->lrs_present = false; } spin_unlock(&ls->ls_lock); @@ -605,7 +619,7 @@ nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp) } static void -nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls) +nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls, struct nfsd_file *file) { struct nfs4_client *clp = ls->ls_stid.sc_client; char addr_str[INET6_ADDRSTRLEN]; @@ -627,7 +641,7 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls) argv[0] = (char *)nfsd_recall_failed; argv[1] = addr_str; - argv[2] = ls->ls_file->nf_file->f_path.mnt->mnt_sb->s_id; + argv[2] = file->nf_file->f_path.mnt->mnt_sb->s_id; argv[3] = NULL; error = call_usermodehelper(nfsd_recall_failed, argv, envp, @@ -657,6 +671,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) struct nfsd_net *nn; ktime_t now, cutoff; const struct nfsd4_layout_ops *ops; + struct nfsd_file *fl; trace_nfsd_cb_layout_done(&ls->ls_stid.sc_stateid, task); switch (task->tk_status) { @@ -688,12 +703,17 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) * Unknown error or non-responding client, we'll need to fence. 
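* Fencing means cutting the unresponsive client off from the storage * device out of band: ops->fence_client when the layout driver provides * one, otherwise the nfsd_recall_failed usermode helper invoked below.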
*/ trace_nfsd_layout_recall_fail(&ls->ls_stid.sc_stateid); - - ops = nfsd4_layout_ops[ls->ls_layout_type]; - if (ops->fence_client) - ops->fence_client(ls); - else - nfsd4_cb_layout_fail(ls); + rcu_read_lock(); + fl = nfsd_file_get(ls->ls_file); + rcu_read_unlock(); + if (fl) { + ops = nfsd4_layout_ops[ls->ls_layout_type]; + if (ops->fence_client) + ops->fence_client(ls, fl); + else + nfsd4_cb_layout_fail(ls, fl); + nfsd_file_put(fl); + } return 1; case -NFS4ERR_NOMATCHING_LAYOUT: trace_nfsd_layout_recall_done(&ls->ls_stid.sc_stateid); @@ -723,7 +743,7 @@ static const struct nfsd4_callback_ops nfsd4_cb_layout_ops = { }; static bool -nfsd4_layout_lm_break(struct file_lock *fl) +nfsd4_layout_lm_break(struct file_lease *fl) { /* * We don't want the locks code to timeout the lease for us; @@ -731,19 +751,19 @@ nfsd4_layout_lm_break(struct file_lock *fl) * in time: */ fl->fl_break_time = 0; - nfsd4_recall_file_layout(fl->fl_owner); + nfsd4_recall_file_layout(fl->c.flc_owner); return false; } static int -nfsd4_layout_lm_change(struct file_lock *onlist, int arg, +nfsd4_layout_lm_change(struct file_lease *onlist, int arg, struct list_head *dispose) { BUG_ON(!(arg & F_UNLCK)); return lease_modify(onlist, arg, dispose); } -static const struct lock_manager_operations nfsd4_layouts_lm_ops = { +static const struct lease_manager_operations nfsd4_layouts_lm_ops = { .lm_break = nfsd4_layout_lm_break, .lm_change = nfsd4_layout_lm_change, }; @@ -756,13 +776,11 @@ nfsd4_init_pnfs(void) for (i = 0; i < DEVID_HASH_SIZE; i++) INIT_LIST_HEAD(&nfsd_devid_hash[i]); - nfs4_layout_cache = kmem_cache_create("nfs4_layout", - sizeof(struct nfs4_layout), 0, 0, NULL); + nfs4_layout_cache = KMEM_CACHE(nfs4_layout, 0); if (!nfs4_layout_cache) return -ENOMEM; - nfs4_layout_stateid_cache = kmem_cache_create("nfs4_layout_stateid", - sizeof(struct nfs4_layout_stateid), 0, 0, NULL); + nfs4_layout_stateid_cache = KMEM_CACHE(nfs4_layout_stateid, 0); if (!nfs4_layout_stateid_cache) { kmem_cache_destroy(nfs4_layout_cache); return -ENOMEM; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 14712fa08f76..2927b1263f08 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1143,6 +1143,7 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, }; struct inode *inode; __be32 status = nfs_ok; + bool save_no_wcc; int err; if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { @@ -1168,8 +1169,10 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto out; - status = nfsd_setattr(rqstp, &cstate->current_fh, &attrs, - 0, (time64_t)0); + save_no_wcc = cstate->current_fh.fh_no_wcc; + cstate->current_fh.fh_no_wcc = true; + status = nfsd_setattr(rqstp, &cstate->current_fh, &attrs, NULL); + cstate->current_fh.fh_no_wcc = save_no_wcc; if (!status) status = nfserrno(attrs.na_labelerr); if (!status) @@ -2490,10 +2493,10 @@ nfsd4_proc_null(struct svc_rqst *rqstp) return rpc_success; } -static inline void nfsd4_increment_op_stats(u32 opnum) +static inline void nfsd4_increment_op_stats(struct nfsd_net *nn, u32 opnum) { if (opnum >= FIRST_NFS4_OP && opnum <= LAST_NFS4_OP) - percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_NFS4_OP(opnum)]); + percpu_counter_inc(&nn->counter[NFSD_STATS_NFS4_OP(opnum)]); } static const struct nfsd4_operation nfsd4_ops[]; @@ -2768,7 +2771,7 @@ encode_op: status, nfsd4_op_name(op->opnum)); nfsd4_cstate_clear_replay(cstate); - nfsd4_increment_op_stats(op->opnum); + nfsd4_increment_op_stats(nn, op->opnum); } fh_put(current_fh); diff --git a/fs/nfsd/nfs4state.c 
b/fs/nfsd/nfs4state.c index 2fa54cfd4882..1a93c7fcf76c 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -87,6 +87,7 @@ static void nfs4_free_ol_stateid(struct nfs4_stid *stid); void nfsd4_end_grace(struct nfsd_net *nn); static void _free_cpntf_state_locked(struct nfsd_net *nn, struct nfs4_cpntf_state *cps); static void nfsd4_file_hash_remove(struct nfs4_file *fi); +static void deleg_reaper(struct nfsd_net *nn); /* Locking: */ @@ -127,6 +128,7 @@ static void free_session(struct nfsd4_session *); static const struct nfsd4_callback_ops nfsd4_cb_recall_ops; static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops; +static const struct nfsd4_callback_ops nfsd4_cb_getattr_ops; static struct workqueue_struct *laundry_wq; @@ -318,6 +320,7 @@ free_nbl(struct kref *kref) struct nfsd4_blocked_lock *nbl; nbl = container_of(kref, struct nfsd4_blocked_lock, nbl_kref); + locks_release_private(&nbl->nbl_lock); kfree(nbl); } @@ -325,7 +328,6 @@ static void free_blocked_lock(struct nfsd4_blocked_lock *nbl) { locks_delete_block(&nbl->nbl_lock); - locks_release_private(&nbl->nbl_lock); kref_put(&nbl->nbl_kref, free_nbl); } @@ -1189,6 +1191,10 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp, dp->dl_recalled = false; nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client, &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL); + nfsd4_init_cb(&dp->dl_cb_fattr.ncf_getattr, dp->dl_stid.sc_client, + &nfsd4_cb_getattr_ops, NFSPROC4_CLNT_CB_GETATTR); + dp->dl_cb_fattr.ncf_file_modified = false; + dp->dl_cb_fattr.ncf_cb_bmap[0] = FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE; get_nfs4_file(fp); dp->dl_stid.sc_file = fp; return dp; @@ -1210,6 +1216,8 @@ nfs4_put_stid(struct nfs4_stid *s) return; } idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id); + if (s->sc_status & SC_STATUS_ADMIN_REVOKED) + atomic_dec(&s->sc_client->cl_admin_revoked); nfs4_free_cpntf_statelist(clp->net, s); spin_unlock(&clp->cl_lock); s->sc_free(s); @@ -1249,7 +1257,7 @@ static void nfs4_unlock_deleg_lease(struct nfs4_delegation *dp) WARN_ON_ONCE(!fp->fi_delegees); - vfs_setlease(nf->nf_file, F_UNLCK, NULL, (void **)&dp); + kernel_setlease(nf->nf_file, F_UNLCK, NULL, (void **)&dp); put_deleg_file(fp); } @@ -1260,11 +1268,6 @@ static void destroy_unhashed_deleg(struct nfs4_delegation *dp) nfs4_put_stid(&dp->dl_stid); } -void nfs4_unhash_stid(struct nfs4_stid *s) -{ - s->sc_type = 0; -} - /** * nfs4_delegation_exists - Discover if this delegation already exists * @clp: a pointer to the nfs4_client we're granting a delegation to @@ -1312,11 +1315,12 @@ hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp) lockdep_assert_held(&state_lock); lockdep_assert_held(&fp->fi_lock); + lockdep_assert_held(&clp->cl_lock); if (nfs4_delegation_exists(clp, fp)) return -EAGAIN; refcount_inc(&dp->dl_stid.sc_count); - dp->dl_stid.sc_type = NFS4_DELEG_STID; + dp->dl_stid.sc_type = SC_TYPE_DELEG; list_add(&dp->dl_perfile, &fp->fi_delegations); list_add(&dp->dl_perclnt, &clp->cl_delegations); return 0; @@ -1328,7 +1332,7 @@ static bool delegation_hashed(struct nfs4_delegation *dp) } static bool -unhash_delegation_locked(struct nfs4_delegation *dp) +unhash_delegation_locked(struct nfs4_delegation *dp, unsigned short statusmask) { struct nfs4_file *fp = dp->dl_stid.sc_file; @@ -1337,7 +1341,13 @@ unhash_delegation_locked(struct nfs4_delegation *dp) if (!delegation_hashed(dp)) return false; - dp->dl_stid.sc_type = NFS4_CLOSED_DELEG_STID; + if (statusmask == SC_STATUS_REVOKED && + dp->dl_stid.sc_client->cl_minorversion == 0) 
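+ /* v4.0 clients cannot be told of a revoked stateid; treat it as closed */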
+ statusmask = SC_STATUS_CLOSED; + dp->dl_stid.sc_status |= statusmask; + if (statusmask & SC_STATUS_ADMIN_REVOKED) + atomic_inc(&dp->dl_stid.sc_client->cl_admin_revoked); + /* Ensure that deleg break won't try to requeue it */ ++dp->dl_time; spin_lock(&fp->fi_lock); @@ -1353,7 +1363,7 @@ static void destroy_delegation(struct nfs4_delegation *dp) bool unhashed; spin_lock(&state_lock); - unhashed = unhash_delegation_locked(dp); + unhashed = unhash_delegation_locked(dp, SC_STATUS_CLOSED); spin_unlock(&state_lock); if (unhashed) destroy_unhashed_deleg(dp); @@ -1367,9 +1377,9 @@ static void revoke_delegation(struct nfs4_delegation *dp) trace_nfsd_stid_revoke(&dp->dl_stid); - if (clp->cl_minorversion) { + if (dp->dl_stid.sc_status & + (SC_STATUS_REVOKED | SC_STATUS_ADMIN_REVOKED)) { spin_lock(&clp->cl_lock); - dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID; refcount_inc(&dp->dl_stid.sc_count); list_add(&dp->dl_recall_lru, &clp->cl_revoked); spin_unlock(&clp->cl_lock); @@ -1377,8 +1387,8 @@ static void revoke_delegation(struct nfs4_delegation *dp) destroy_unhashed_deleg(dp); } -/* - * SETCLIENTID state +/* + * SETCLIENTID state */ static unsigned int clientid_hashval(u32 id) @@ -1531,6 +1541,8 @@ static void put_ol_stateid_locked(struct nfs4_ol_stateid *stp, } idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id); + if (s->sc_status & SC_STATUS_ADMIN_REVOKED) + atomic_dec(&s->sc_client->cl_admin_revoked); list_add(&stp->st_locks, reaplist); } @@ -1541,7 +1553,7 @@ static bool unhash_lock_stateid(struct nfs4_ol_stateid *stp) if (!unhash_ol_stateid(stp)) return false; list_del_init(&stp->st_locks); - nfs4_unhash_stid(&stp->st_stid); + stp->st_stid.sc_status |= SC_STATUS_CLOSED; return true; } @@ -1599,7 +1611,7 @@ static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp, while (!list_empty(&open_stp->st_locks)) { stp = list_entry(open_stp->st_locks.next, struct nfs4_ol_stateid, st_locks); - WARN_ON(!unhash_lock_stateid(stp)); + unhash_lock_stateid(stp); put_ol_stateid_locked(stp, reaplist); } } @@ -1620,6 +1632,7 @@ static void release_open_stateid(struct nfs4_ol_stateid *stp) LIST_HEAD(reaplist); spin_lock(&stp->st_stid.sc_client->cl_lock); + stp->st_stid.sc_status |= SC_STATUS_CLOSED; if (unhash_open_stateid(stp, &reaplist)) put_ol_stateid_locked(stp, &reaplist); spin_unlock(&stp->st_stid.sc_client->cl_lock); @@ -1675,6 +1688,136 @@ static void release_openowner(struct nfs4_openowner *oo) nfs4_put_stateowner(&oo->oo_owner); } +static struct nfs4_stid *find_one_sb_stid(struct nfs4_client *clp, + struct super_block *sb, + unsigned int sc_types) +{ + unsigned long id, tmp; + struct nfs4_stid *stid; + + spin_lock(&clp->cl_lock); + idr_for_each_entry_ul(&clp->cl_stateids, stid, tmp, id) + if ((stid->sc_type & sc_types) && + stid->sc_status == 0 && + stid->sc_file->fi_inode->i_sb == sb) { + refcount_inc(&stid->sc_count); + break; + } + spin_unlock(&clp->cl_lock); + return stid; +} + +/** + * nfsd4_revoke_states - revoke all nfsv4 states associated with given filesystem + * @net: used to identify instance of nfsd (there is one per net namespace) + * @sb: super_block used to identify target filesystem + * + * All nfs4 states (open, lock, delegation, layout) held by the server instance + * and associated with a file on the given filesystem will be revoked resulting + * in any files being closed and so all references from nfsd to the filesystem + * being released. Thus nfsd will no longer prevent the filesystem from being + * unmounted. 
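+ * + * The expected caller (updated elsewhere in this diff) is the nfsdfs + * "unlock_filesystem" write handler, which resolves an admin-supplied + * path and passes that filesystem's super_block here.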
+ * + * The clients which own the states will subsequently be notified + * that the states have been "admin-revoked". + */ +void nfsd4_revoke_states(struct net *net, struct super_block *sb) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + unsigned int idhashval; + unsigned int sc_types; + + sc_types = SC_TYPE_OPEN | SC_TYPE_LOCK | SC_TYPE_DELEG | SC_TYPE_LAYOUT; + + spin_lock(&nn->client_lock); + for (idhashval = 0; idhashval < CLIENT_HASH_SIZE; idhashval++) { + struct list_head *head = &nn->conf_id_hashtbl[idhashval]; + struct nfs4_client *clp; + retry: + list_for_each_entry(clp, head, cl_idhash) { + struct nfs4_stid *stid = find_one_sb_stid(clp, sb, + sc_types); + if (stid) { + struct nfs4_ol_stateid *stp; + struct nfs4_delegation *dp; + struct nfs4_layout_stateid *ls; + + spin_unlock(&nn->client_lock); + switch (stid->sc_type) { + case SC_TYPE_OPEN: + stp = openlockstateid(stid); + mutex_lock_nested(&stp->st_mutex, + OPEN_STATEID_MUTEX); + + spin_lock(&clp->cl_lock); + if (stid->sc_status == 0) { + stid->sc_status |= + SC_STATUS_ADMIN_REVOKED; + atomic_inc(&clp->cl_admin_revoked); + spin_unlock(&clp->cl_lock); + release_all_access(stp); + } else + spin_unlock(&clp->cl_lock); + mutex_unlock(&stp->st_mutex); + break; + case SC_TYPE_LOCK: + stp = openlockstateid(stid); + mutex_lock_nested(&stp->st_mutex, + LOCK_STATEID_MUTEX); + spin_lock(&clp->cl_lock); + if (stid->sc_status == 0) { + struct nfs4_lockowner *lo = + lockowner(stp->st_stateowner); + struct nfsd_file *nf; + + stid->sc_status |= + SC_STATUS_ADMIN_REVOKED; + atomic_inc(&clp->cl_admin_revoked); + spin_unlock(&clp->cl_lock); + nf = find_any_file(stp->st_stid.sc_file); + if (nf) { + get_file(nf->nf_file); + filp_close(nf->nf_file, + (fl_owner_t)lo); + nfsd_file_put(nf); + } + release_all_access(stp); + } else + spin_unlock(&clp->cl_lock); + mutex_unlock(&stp->st_mutex); + break; + case SC_TYPE_DELEG: + dp = delegstateid(stid); + spin_lock(&state_lock); + if (!unhash_delegation_locked( + dp, SC_STATUS_ADMIN_REVOKED)) + dp = NULL; + spin_unlock(&state_lock); + if (dp) + revoke_delegation(dp); + break; + case SC_TYPE_LAYOUT: + ls = layoutstateid(stid); + nfsd4_close_layout(ls); + break; + } + nfs4_put_stid(stid); + spin_lock(&nn->client_lock); + if (clp->cl_minorversion == 0) + /* Allow cleanup after a lease period. + * store_release ensures cleanup will + * see any newly revoked states if it + * sees the time updated. 
+ */ + nn->nfs40_last_revoke = + ktime_get_boottime_seconds(); + goto retry; + } + } + } + spin_unlock(&nn->client_lock); +} + static inline int hash_sessionid(struct nfs4_sessionid *sessionid) { @@ -2228,7 +2371,7 @@ __destroy_client(struct nfs4_client *clp) spin_lock(&state_lock); while (!list_empty(&clp->cl_delegations)) { dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt); - WARN_ON(!unhash_delegation_locked(dp)); + unhash_delegation_locked(dp, SC_STATUS_CLOSED); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); @@ -2460,14 +2603,16 @@ find_stateid_locked(struct nfs4_client *cl, stateid_t *t) } static struct nfs4_stid * -find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask) +find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, + unsigned short typemask, unsigned short ok_states) { struct nfs4_stid *s; spin_lock(&cl->cl_lock); s = find_stateid_locked(cl, t); if (s != NULL) { - if (typemask & s->sc_type) + if ((s->sc_status & ~ok_states) == 0 && + (typemask & s->sc_type)) refcount_inc(&s->sc_count); else s = NULL; @@ -2487,9 +2632,9 @@ static struct nfs4_client *get_nfsdfs_clp(struct inode *inode) static void seq_quote_mem(struct seq_file *m, char *data, int len) { - seq_printf(m, "\""); + seq_puts(m, "\""); seq_escape_mem(m, data, len, ESCAPE_HEX | ESCAPE_NAP | ESCAPE_APPEND, "\"\\"); - seq_printf(m, "\""); + seq_puts(m, "\""); } static const char *cb_state2str(int state) @@ -2530,20 +2675,22 @@ static int client_info_show(struct seq_file *m, void *v) seq_puts(m, "status: unconfirmed\n"); seq_printf(m, "seconds from last renew: %lld\n", ktime_get_boottime_seconds() - clp->cl_time); - seq_printf(m, "name: "); + seq_puts(m, "name: "); seq_quote_mem(m, clp->cl_name.data, clp->cl_name.len); seq_printf(m, "\nminor version: %d\n", clp->cl_minorversion); if (clp->cl_nii_domain.data) { - seq_printf(m, "Implementation domain: "); + seq_puts(m, "Implementation domain: "); seq_quote_mem(m, clp->cl_nii_domain.data, clp->cl_nii_domain.len); - seq_printf(m, "\nImplementation name: "); + seq_puts(m, "\nImplementation name: "); seq_quote_mem(m, clp->cl_nii_name.data, clp->cl_nii_name.len); seq_printf(m, "\nImplementation time: [%lld, %ld]\n", clp->cl_nii_time.tv_sec, clp->cl_nii_time.tv_nsec); } seq_printf(m, "callback state: %s\n", cb_state2str(clp->cl_cb_state)); seq_printf(m, "callback address: %pISpc\n", &clp->cl_cb_conn.cb_addr); + seq_printf(m, "admin-revoked states: %d\n", + atomic_read(&clp->cl_admin_revoked)); drop_client(clp); return 0; @@ -2602,7 +2749,7 @@ static void nfs4_show_superblock(struct seq_file *s, struct nfsd_file *f) static void nfs4_show_owner(struct seq_file *s, struct nfs4_stateowner *oo) { - seq_printf(s, "owner: "); + seq_puts(s, "owner: "); seq_quote_mem(s, oo->so_owner.data, oo->so_owner.len); } @@ -2620,20 +2767,13 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) struct nfs4_stateowner *oo; unsigned int access, deny; - if (st->sc_type != NFS4_OPEN_STID && st->sc_type != NFS4_LOCK_STID) - return 0; /* XXX: or SEQ_SKIP? 
*/ ols = openlockstateid(st); oo = ols->st_stateowner; nf = st->sc_file; - spin_lock(&nf->fi_lock); - file = find_any_file_locked(nf); - if (!file) - goto out; - - seq_printf(s, "- "); + seq_puts(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); - seq_printf(s, ": { type: open, "); + seq_puts(s, ": { type: open, "); access = bmap_to_share_mode(ols->st_access_bmap); deny = bmap_to_share_mode(ols->st_deny_bmap); @@ -2645,14 +2785,19 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) deny & NFS4_SHARE_ACCESS_READ ? "r" : "-", deny & NFS4_SHARE_ACCESS_WRITE ? "w" : "-"); - nfs4_show_superblock(s, file); - seq_printf(s, ", "); - nfs4_show_fname(s, file); - seq_printf(s, ", "); - nfs4_show_owner(s, oo); - seq_printf(s, " }\n"); -out: + spin_lock(&nf->fi_lock); + file = find_any_file_locked(nf); + if (file) { + nfs4_show_superblock(s, file); + seq_puts(s, ", "); + nfs4_show_fname(s, file); + seq_puts(s, ", "); + } spin_unlock(&nf->fi_lock); + nfs4_show_owner(s, oo); + if (st->sc_status & SC_STATUS_ADMIN_REVOKED) + seq_puts(s, ", admin-revoked"); + seq_puts(s, " }\n"); return 0; } @@ -2666,30 +2811,31 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st) ols = openlockstateid(st); oo = ols->st_stateowner; nf = st->sc_file; - spin_lock(&nf->fi_lock); - file = find_any_file_locked(nf); - if (!file) - goto out; - seq_printf(s, "- "); + seq_puts(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); - seq_printf(s, ": { type: lock, "); + seq_puts(s, ": { type: lock, "); - /* - * Note: a lock stateid isn't really the same thing as a lock, - * it's the locking state held by one owner on a file, and there - * may be multiple (or no) lock ranges associated with it. - * (Same for the matter is true of open stateids.) - */ + spin_lock(&nf->fi_lock); + file = find_any_file_locked(nf); + if (file) { + /* + * Note: a lock stateid isn't really the same thing as a lock, + * it's the locking state held by one owner on a file, and there + * may be multiple (or no) lock ranges associated with it. + * (Same for the matter is true of open stateids.) + */ - nfs4_show_superblock(s, file); - /* XXX: open stateid? */ - seq_printf(s, ", "); - nfs4_show_fname(s, file); - seq_printf(s, ", "); + nfs4_show_superblock(s, file); + /* XXX: open stateid? */ + seq_puts(s, ", "); + nfs4_show_fname(s, file); + seq_puts(s, ", "); + } nfs4_show_owner(s, oo); - seq_printf(s, " }\n"); -out: + if (st->sc_status & SC_STATUS_ADMIN_REVOKED) + seq_puts(s, ", admin-revoked"); + seq_puts(s, " }\n"); spin_unlock(&nf->fi_lock); return 0; } @@ -2702,27 +2848,28 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st) ds = delegstateid(st); nf = st->sc_file; - spin_lock(&nf->fi_lock); - file = nf->fi_deleg_file; - if (!file) - goto out; - seq_printf(s, "- "); + seq_puts(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); - seq_printf(s, ": { type: deleg, "); + seq_puts(s, ": { type: deleg, "); - /* Kinda dead code as long as we only support read delegs: */ - seq_printf(s, "access: %s, ", - ds->dl_type == NFS4_OPEN_DELEGATE_READ ? "r" : "w"); + seq_printf(s, "access: %s", + ds->dl_type == NFS4_OPEN_DELEGATE_READ ? "r" : "w"); /* XXX: lease time, whether it's being recalled. 
*/ - nfs4_show_superblock(s, file); - seq_printf(s, ", "); - nfs4_show_fname(s, file); - seq_printf(s, " }\n"); -out: + spin_lock(&nf->fi_lock); + file = nf->fi_deleg_file; + if (file) { + seq_puts(s, ", "); + nfs4_show_superblock(s, file); + seq_puts(s, ", "); + nfs4_show_fname(s, file); + } spin_unlock(&nf->fi_lock); + if (st->sc_status & SC_STATUS_ADMIN_REVOKED) + seq_puts(s, ", admin-revoked"); + seq_puts(s, " }\n"); return 0; } @@ -2732,18 +2879,25 @@ static int nfs4_show_layout(struct seq_file *s, struct nfs4_stid *st) struct nfsd_file *file; ls = container_of(st, struct nfs4_layout_stateid, ls_stid); - file = ls->ls_file; - seq_printf(s, "- "); + seq_puts(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); - seq_printf(s, ": { type: layout, "); + seq_puts(s, ": { type: layout"); /* XXX: What else would be useful? */ - nfs4_show_superblock(s, file); - seq_printf(s, ", "); - nfs4_show_fname(s, file); - seq_printf(s, " }\n"); + spin_lock(&ls->ls_stid.sc_file->fi_lock); + file = ls->ls_file; + if (file) { + seq_puts(s, ", "); + nfs4_show_superblock(s, file); + seq_puts(s, ", "); + nfs4_show_fname(s, file); + } + spin_unlock(&ls->ls_stid.sc_file->fi_lock); + if (st->sc_status & SC_STATUS_ADMIN_REVOKED) + seq_puts(s, ", admin-revoked"); + seq_puts(s, " }\n"); return 0; } @@ -2753,13 +2907,13 @@ static int states_show(struct seq_file *s, void *v) struct nfs4_stid *st = v; switch (st->sc_type) { - case NFS4_OPEN_STID: + case SC_TYPE_OPEN: return nfs4_show_open(s, st); - case NFS4_LOCK_STID: + case SC_TYPE_LOCK: return nfs4_show_lock(s, st); - case NFS4_DELEG_STID: + case SC_TYPE_DELEG: return nfs4_show_deleg(s, st); - case NFS4_LAYOUT_STID: + case SC_TYPE_LAYOUT: return nfs4_show_layout(s, st); default: return 0; /* XXX: or SEQ_SKIP? */ @@ -2896,11 +3050,59 @@ nfsd4_cb_recall_any_release(struct nfsd4_callback *cb) spin_unlock(&nn->client_lock); } +static int +nfsd4_cb_getattr_done(struct nfsd4_callback *cb, struct rpc_task *task) +{ + struct nfs4_cb_fattr *ncf = + container_of(cb, struct nfs4_cb_fattr, ncf_getattr); + + ncf->ncf_cb_status = task->tk_status; + switch (task->tk_status) { + case -NFS4ERR_DELAY: + rpc_delay(task, 2 * HZ); + return 0; + default: + return 1; + } +} + +static void +nfsd4_cb_getattr_release(struct nfsd4_callback *cb) +{ + struct nfs4_cb_fattr *ncf = + container_of(cb, struct nfs4_cb_fattr, ncf_getattr); + struct nfs4_delegation *dp = + container_of(ncf, struct nfs4_delegation, dl_cb_fattr); + + nfs4_put_stid(&dp->dl_stid); + clear_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags); + wake_up_bit(&ncf->ncf_cb_flags, CB_GETATTR_BUSY); +} + static const struct nfsd4_callback_ops nfsd4_cb_recall_any_ops = { .done = nfsd4_cb_recall_any_done, .release = nfsd4_cb_recall_any_release, }; +static const struct nfsd4_callback_ops nfsd4_cb_getattr_ops = { + .done = nfsd4_cb_getattr_done, + .release = nfsd4_cb_getattr_release, +}; + +static void nfs4_cb_getattr(struct nfs4_cb_fattr *ncf) +{ + struct nfs4_delegation *dp = + container_of(ncf, struct nfs4_delegation, dl_cb_fattr); + + if (test_and_set_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags)) + return; + /* set to proper status when nfsd4_cb_getattr_done runs */ + ncf->ncf_cb_status = NFS4ERR_IO; + + refcount_inc(&dp->dl_stid.sc_count); + nfsd4_run_cb(&ncf->ncf_getattr); +} + static struct nfs4_client *create_client(struct xdr_netobj name, struct svc_rqst *rqstp, nfs4_verifier *verf) { @@ -3414,6 +3616,9 @@ out_new: new->cl_spo_must_allow.u.words[0] = exid->spo_must_allow[0]; new->cl_spo_must_allow.u.words[1] = exid->spo_must_allow[1]; + /* 
Contrived initial CREATE_SESSION response */ + new->cl_cs_slot.sl_status = nfserr_seq_misordered; + add_to_unconfirmed(new); swap(new, conf); out_copy: @@ -3584,10 +3789,10 @@ nfsd4_create_session(struct svc_rqst *rqstp, struct nfsd4_create_session *cr_ses = &u->create_session; struct sockaddr *sa = svc_addr(rqstp); struct nfs4_client *conf, *unconf; + struct nfsd4_clid_slot *cs_slot; struct nfs4_client *old = NULL; struct nfsd4_session *new; struct nfsd4_conn *conn; - struct nfsd4_clid_slot *cs_slot = NULL; __be32 status = 0; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); @@ -3611,53 +3816,60 @@ nfsd4_create_session(struct svc_rqst *rqstp, goto out_free_session; spin_lock(&nn->client_lock); + + /* RFC 8881 Section 18.36.4 Phase 1: Client record look-up. */ unconf = find_unconfirmed_client(&cr_ses->clientid, true, nn); conf = find_confirmed_client(&cr_ses->clientid, true, nn); - WARN_ON_ONCE(conf && unconf); + if (!conf && !unconf) { + status = nfserr_stale_clientid; + goto out_free_conn; + } - if (conf) { - status = nfserr_wrong_cred; - if (!nfsd4_mach_creds_match(conf, rqstp)) - goto out_free_conn; + /* RFC 8881 Section 18.36.4 Phase 2: Sequence ID processing. */ + if (conf) cs_slot = &conf->cl_cs_slot; - status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); - if (status) { - if (status == nfserr_replay_cache) - status = nfsd4_replay_create_session(cr_ses, cs_slot); + else + cs_slot = &unconf->cl_cs_slot; + status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); + if (status) { + if (status == nfserr_replay_cache) { + status = nfsd4_replay_create_session(cr_ses, cs_slot); goto out_free_conn; } - } else if (unconf) { + goto out_cache_error; + } + cs_slot->sl_seqid++; + cr_ses->seqid = cs_slot->sl_seqid; + + /* RFC 8881 Section 18.36.4 Phase 3: Client ID confirmation. */ + if (conf) { + status = nfserr_wrong_cred; + if (!nfsd4_mach_creds_match(conf, rqstp)) + goto out_cache_error; + } else { status = nfserr_clid_inuse; if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) { trace_nfsd_clid_cred_mismatch(unconf, rqstp); - goto out_free_conn; + goto out_cache_error; } status = nfserr_wrong_cred; if (!nfsd4_mach_creds_match(unconf, rqstp)) - goto out_free_conn; - cs_slot = &unconf->cl_cs_slot; - status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); - if (status) { - /* an unconfirmed replay returns misordered */ - status = nfserr_seq_misordered; - goto out_free_conn; - } + goto out_cache_error; old = find_confirmed_client_by_name(&unconf->cl_name, nn); if (old) { status = mark_client_expired_locked(old); if (status) { old = NULL; - goto out_free_conn; + goto out_cache_error; } trace_nfsd_clid_replaced(&old->cl_clientid); } move_to_confirmed(unconf); conf = unconf; - } else { - status = nfserr_stale_clientid; - goto out_free_conn; } + + /* RFC 8881 Section 18.36.4 Phase 4: Session creation. 
*/ status = nfs_ok; /* Persistent sessions are not supported */ cr_ses->flags &= ~SESSION4_PERSIST; @@ -3669,8 +3881,6 @@ nfsd4_create_session(struct svc_rqst *rqstp, memcpy(cr_ses->sessionid.data, new->se_sessionid.data, NFS4_MAX_SESSIONID_LEN); - cs_slot->sl_seqid++; - cr_ses->seqid = cs_slot->sl_seqid; /* cache solo and embedded create sessions under the client_lock */ nfsd4_cache_create_session(cr_ses, cs_slot, status); @@ -3683,6 +3893,9 @@ nfsd4_create_session(struct svc_rqst *rqstp, if (old) expire_client(old); return status; + +out_cache_error: + nfsd4_cache_create_session(cr_ses, cs_slot, status); out_free_conn: spin_unlock(&nn->client_lock); free_conn(conn); @@ -4058,6 +4271,9 @@ out: } if (!list_empty(&clp->cl_revoked)) seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED; + if (atomic_read(&clp->cl_admin_revoked)) + seq->status_flags |= SEQ4_STATUS_ADMIN_STATE_REVOKED; + trace_nfsd_seq4_status(rqstp, seq); out_no_session: if (conn) free_conn(conn); @@ -4352,32 +4568,25 @@ nfsd4_free_slabs(void) int nfsd4_init_slabs(void) { - client_slab = kmem_cache_create("nfsd4_clients", - sizeof(struct nfs4_client), 0, 0, NULL); + client_slab = KMEM_CACHE(nfs4_client, 0); if (client_slab == NULL) goto out; - openowner_slab = kmem_cache_create("nfsd4_openowners", - sizeof(struct nfs4_openowner), 0, 0, NULL); + openowner_slab = KMEM_CACHE(nfs4_openowner, 0); if (openowner_slab == NULL) goto out_free_client_slab; - lockowner_slab = kmem_cache_create("nfsd4_lockowners", - sizeof(struct nfs4_lockowner), 0, 0, NULL); + lockowner_slab = KMEM_CACHE(nfs4_lockowner, 0); if (lockowner_slab == NULL) goto out_free_openowner_slab; - file_slab = kmem_cache_create("nfsd4_files", - sizeof(struct nfs4_file), 0, 0, NULL); + file_slab = KMEM_CACHE(nfs4_file, 0); if (file_slab == NULL) goto out_free_lockowner_slab; - stateid_slab = kmem_cache_create("nfsd4_stateids", - sizeof(struct nfs4_ol_stateid), 0, 0, NULL); + stateid_slab = KMEM_CACHE(nfs4_ol_stateid, 0); if (stateid_slab == NULL) goto out_free_file_slab; - deleg_slab = kmem_cache_create("nfsd4_delegations", - sizeof(struct nfs4_delegation), 0, 0, NULL); + deleg_slab = KMEM_CACHE(nfs4_delegation, 0); if (deleg_slab == NULL) goto out_free_stateid_slab; - odstate_slab = kmem_cache_create("nfsd4_odstate", - sizeof(struct nfs4_clnt_odstate), 0, 0, NULL); + odstate_slab = KMEM_CACHE(nfs4_clnt_odstate, 0); if (odstate_slab == NULL) goto out_free_deleg_slab; return 0; @@ -4531,7 +4740,8 @@ nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) continue; if (local->st_stateowner != &oo->oo_owner) continue; - if (local->st_stid.sc_type == NFS4_OPEN_STID) { + if (local->st_stid.sc_type == SC_TYPE_OPEN && + !local->st_stid.sc_status) { ret = local; refcount_inc(&ret->st_stid.sc_count); break; @@ -4540,22 +4750,75 @@ nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) return ret; } -static __be32 -nfsd4_verify_open_stid(struct nfs4_stid *s) +static void nfsd4_drop_revoked_stid(struct nfs4_stid *s) + __releases(&s->sc_client->cl_lock) { - __be32 ret = nfs_ok; + struct nfs4_client *cl = s->sc_client; + LIST_HEAD(reaplist); + struct nfs4_ol_stateid *stp; + struct nfs4_delegation *dp; + bool unhashed; switch (s->sc_type) { - default: + case SC_TYPE_OPEN: + stp = openlockstateid(s); + if (unhash_open_stateid(stp, &reaplist)) + put_ol_stateid_locked(stp, &reaplist); + spin_unlock(&cl->cl_lock); + free_ol_stateid_reaplist(&reaplist); break; - case 0: - case NFS4_CLOSED_STID: - case NFS4_CLOSED_DELEG_STID: - ret = nfserr_bad_stateid; 
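+ /* a lock stateid may already be unhashed; put the reference only if + * this call did the unhashing + */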
+ case SC_TYPE_LOCK: + stp = openlockstateid(s); + unhashed = unhash_lock_stateid(stp); + spin_unlock(&cl->cl_lock); + if (unhashed) + nfs4_put_stid(s); break; - case NFS4_REVOKED_DELEG_STID: - ret = nfserr_deleg_revoked; + case SC_TYPE_DELEG: + dp = delegstateid(s); + list_del_init(&dp->dl_recall_lru); + spin_unlock(&cl->cl_lock); + nfs4_put_stid(s); + break; + default: + spin_unlock(&cl->cl_lock); + } +} + +static void nfsd40_drop_revoked_stid(struct nfs4_client *cl, + stateid_t *stid) +{ + /* NFSv4.0 has no way for the client to tell the server + * that it can forget an admin-revoked stateid. + * So we keep it around until the first time that the + * client uses it, and drop it the first time + * nfserr_admin_revoked is returned. + * For v4.1 and later we wait until explicitly told + * to free the stateid. + */ + if (cl->cl_minorversion == 0) { + struct nfs4_stid *st; + + spin_lock(&cl->cl_lock); + st = find_stateid_locked(cl, stid); + if (st) + nfsd4_drop_revoked_stid(st); + else + spin_unlock(&cl->cl_lock); } +} + +static __be32 +nfsd4_verify_open_stid(struct nfs4_stid *s) +{ + __be32 ret = nfs_ok; + + if (s->sc_status & SC_STATUS_ADMIN_REVOKED) + ret = nfserr_admin_revoked; + else if (s->sc_status & SC_STATUS_REVOKED) + ret = nfserr_deleg_revoked; + else if (s->sc_status & SC_STATUS_CLOSED) + ret = nfserr_bad_stateid; return ret; } @@ -4567,6 +4830,10 @@ nfsd4_lock_ol_stateid(struct nfs4_ol_stateid *stp) mutex_lock_nested(&stp->st_mutex, LOCK_STATEID_MUTEX); ret = nfsd4_verify_open_stid(&stp->st_stid); + if (ret == nfserr_admin_revoked) + nfsd40_drop_revoked_stid(stp->st_stid.sc_client, + &stp->st_stid.sc_stateid); + if (ret != nfs_ok) mutex_unlock(&stp->st_mutex); return ret; @@ -4641,7 +4908,7 @@ retry: open->op_stp = NULL; refcount_inc(&stp->st_stid.sc_count); - stp->st_stid.sc_type = NFS4_OPEN_STID; + stp->st_stid.sc_type = SC_TYPE_OPEN; INIT_LIST_HEAD(&stp->st_locks); stp->st_stateowner = nfs4_get_stateowner(&oo->oo_owner); get_nfs4_file(fp); @@ -4868,9 +5135,9 @@ static int nfsd4_cb_recall_done(struct nfsd4_callback *cb, trace_nfsd_cb_recall_done(&dp->dl_stid.sc_stateid, task); - if (dp->dl_stid.sc_type == NFS4_CLOSED_DELEG_STID || - dp->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID) - return 1; + if (dp->dl_stid.sc_status) + /* CLOSED or REVOKED */ + return 1; switch (task->tk_status) { case 0: @@ -4922,9 +5189,9 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp) /* Called from break_lease() with flc_lock held. */ static bool -nfsd_break_deleg_cb(struct file_lock *fl) +nfsd_break_deleg_cb(struct file_lease *fl) { - struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner; + struct nfs4_delegation *dp = (struct nfs4_delegation *) fl->c.flc_owner; struct nfs4_file *fp = dp->dl_stid.sc_file; struct nfs4_client *clp = dp->dl_stid.sc_client; struct nfsd_net *nn; @@ -4945,10 +5212,8 @@ nfsd_break_deleg_cb(struct file_lock *fl) */ fl->fl_break_time = 0; - spin_lock(&fp->fi_lock); fp->fi_had_conflict = true; nfsd_break_one_deleg(dp); - spin_unlock(&fp->fi_lock); return false; } @@ -4960,9 +5225,9 @@ nfsd_break_deleg_cb(struct file_lock *fl) * %true: Lease conflict was resolved * %false: Lease conflict was not resolved. 
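* * The conflict is treated as resolved only when the breaker is an nfsd * thread acting for the same client that holds the delegation, in which * case no recall is needed.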
*/ -static bool nfsd_breaker_owns_lease(struct file_lock *fl) +static bool nfsd_breaker_owns_lease(struct file_lease *fl) { - struct nfs4_delegation *dl = fl->fl_owner; + struct nfs4_delegation *dl = fl->c.flc_owner; struct svc_rqst *rqst; struct nfs4_client *clp; @@ -4977,10 +5242,10 @@ static bool nfsd_breaker_owns_lease(struct file_lock *fl) } static int -nfsd_change_deleg_cb(struct file_lock *onlist, int arg, +nfsd_change_deleg_cb(struct file_lease *onlist, int arg, struct list_head *dispose) { - struct nfs4_delegation *dp = (struct nfs4_delegation *)onlist->fl_owner; + struct nfs4_delegation *dp = (struct nfs4_delegation *) onlist->c.flc_owner; struct nfs4_client *clp = dp->dl_stid.sc_client; if (arg & F_UNLCK) { @@ -4991,7 +5256,7 @@ nfsd_change_deleg_cb(struct file_lock *onlist, int arg, return -EAGAIN; } -static const struct lock_manager_operations nfsd_lease_mng_ops = { +static const struct lease_manager_operations nfsd_lease_mng_ops = { .lm_breaker_owns_lease = nfsd_breaker_owns_lease, .lm_break = nfsd_break_deleg_cb, .lm_change = nfsd_change_deleg_cb, @@ -5115,12 +5380,12 @@ static int share_access_to_flags(u32 share_access) return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE; } -static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, stateid_t *s) +static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, + stateid_t *s) { struct nfs4_stid *ret; - ret = find_stateid_by_type(cl, s, - NFS4_DELEG_STID|NFS4_REVOKED_DELEG_STID); + ret = find_stateid_by_type(cl, s, SC_TYPE_DELEG, SC_STATUS_REVOKED); if (!ret) return NULL; return delegstateid(ret); @@ -5143,10 +5408,15 @@ nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open, deleg = find_deleg_stateid(cl, &open->op_delegate_stateid); if (deleg == NULL) goto out; - if (deleg->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID) { + if (deleg->dl_stid.sc_status & SC_STATUS_ADMIN_REVOKED) { + nfs4_put_stid(&deleg->dl_stid); + status = nfserr_admin_revoked; + goto out; + } + if (deleg->dl_stid.sc_status & SC_STATUS_REVOKED) { nfs4_put_stid(&deleg->dl_stid); - if (cl->cl_minorversion) - status = nfserr_deleg_revoked; + nfsd40_drop_revoked_stid(cl, &open->op_delegate_stateid); + status = nfserr_deleg_revoked; goto out; } flags = share_access_to_flags(open->op_share_access); @@ -5191,7 +5461,7 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, return 0; if (!(open->op_share_access & NFS4_SHARE_ACCESS_WRITE)) return nfserr_inval; - return nfsd_setattr(rqstp, fh, &attrs, 0, (time64_t)0); + return nfsd_setattr(rqstp, fh, &attrs, NULL); } static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, @@ -5331,21 +5601,20 @@ static bool nfsd4_cb_channel_good(struct nfs4_client *clp) return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN; } -static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, +static struct file_lease *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int flag) { - struct file_lock *fl; + struct file_lease *fl; - fl = locks_alloc_lock(); + fl = locks_alloc_lease(); if (!fl) return NULL; fl->fl_lmops = &nfsd_lease_mng_ops; - fl->fl_flags = FL_DELEG; - fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK; - fl->fl_end = OFFSET_MAX; - fl->fl_owner = (fl_owner_t)dp; - fl->fl_pid = current->tgid; - fl->fl_file = dp->dl_stid.sc_file->fi_deleg_file->nf_file; + fl->c.flc_flags = FL_DELEG; + fl->c.flc_type = flag == NFS4_OPEN_DELEGATE_READ? 
F_RDLCK: F_WRLCK; + fl->c.flc_owner = (fl_owner_t)dp; + fl->c.flc_pid = current->tgid; + fl->c.flc_file = dp->dl_stid.sc_file->fi_deleg_file->nf_file; return fl; } @@ -5463,7 +5732,7 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, struct nfs4_clnt_odstate *odstate = stp->st_clnt_odstate; struct nfs4_delegation *dp; struct nfsd_file *nf = NULL; - struct file_lock *fl; + struct file_lease *fl; u32 dl_type; /* @@ -5533,9 +5802,10 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, if (!fl) goto out_clnt_odstate; - status = vfs_setlease(fp->fi_deleg_file->nf_file, fl->fl_type, &fl, NULL); + status = kernel_setlease(fp->fi_deleg_file->nf_file, + fl->c.flc_type, &fl, NULL); if (fl) - locks_free_lock(fl); + locks_free_lease(fl); if (status) goto out_clnt_odstate; @@ -5557,13 +5827,16 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, if (status) goto out_unlock; + status = -EAGAIN; + if (fp->fi_had_conflict) + goto out_unlock; + spin_lock(&state_lock); + spin_lock(&clp->cl_lock); spin_lock(&fp->fi_lock); - if (fp->fi_had_conflict) - status = -EAGAIN; - else - status = hash_delegation_locked(dp, fp); + status = hash_delegation_locked(dp, fp); spin_unlock(&fp->fi_lock); + spin_unlock(&clp->cl_lock); spin_unlock(&state_lock); if (status) @@ -5571,7 +5844,7 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, return dp; out_unlock: - vfs_setlease(fp->fi_deleg_file->nf_file, F_UNLCK, NULL, (void **)&dp); + kernel_setlease(fp->fi_deleg_file->nf_file, F_UNLCK, NULL, (void **)&dp); out_clnt_odstate: put_clnt_odstate(dp->dl_clnt_odstate); nfs4_put_stid(&dp->dl_stid); @@ -5635,6 +5908,8 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, struct svc_fh *parent = NULL; int cb_up; int status = 0; + struct kstat stat; + struct path path; cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client); open->op_recall = false; @@ -5672,6 +5947,18 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) { open->op_delegate_type = NFS4_OPEN_DELEGATE_WRITE; trace_nfsd_deleg_write(&dp->dl_stid.sc_stateid); + path.mnt = currentfh->fh_export->ex_path.mnt; + path.dentry = currentfh->fh_dentry; + if (vfs_getattr(&path, &stat, + (STATX_SIZE | STATX_CTIME | STATX_CHANGE_COOKIE), + AT_STATX_SYNC_AS_STAT)) { + nfs4_put_stid(&dp->dl_stid); + destroy_delegation(dp); + goto out_no_deleg; + } + dp->dl_cb_fattr.ncf_cur_fsize = stat.size; + dp->dl_cb_fattr.ncf_initial_cinfo = + nfsd4_change_attribute(&stat, d_inode(currentfh->fh_dentry)); } else { open->op_delegate_type = NFS4_OPEN_DELEGATE_READ; trace_nfsd_deleg_read(&dp->dl_stid.sc_stateid); @@ -5775,7 +6062,6 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf } else { status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open, true); if (status) { - stp->st_stid.sc_type = NFS4_CLOSED_STID; release_open_stateid(stp); mutex_unlock(&stp->st_mutex); goto out; @@ -6129,6 +6415,43 @@ nfs4_process_client_reaplist(struct list_head *reaplist) } } +static void nfs40_clean_admin_revoked(struct nfsd_net *nn, + struct laundry_time *lt) +{ + struct nfs4_client *clp; + + spin_lock(&nn->client_lock); + if (nn->nfs40_last_revoke == 0 || + nn->nfs40_last_revoke > lt->cutoff) { + spin_unlock(&nn->client_lock); + return; + } + nn->nfs40_last_revoke = 0; + +retry: + list_for_each_entry(clp, &nn->client_lru, cl_lru) { + unsigned long id, tmp; + struct nfs4_stid *stid; + + 
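/* nothing to reap for clients with no admin-revoked state */ +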
if (atomic_read(&clp->cl_admin_revoked) == 0) + continue; + + spin_lock(&clp->cl_lock); + idr_for_each_entry_ul(&clp->cl_stateids, stid, tmp, id) + if (stid->sc_status & SC_STATUS_ADMIN_REVOKED) { + refcount_inc(&stid->sc_count); + spin_unlock(&nn->client_lock); + /* this function drops ->cl_lock */ + nfsd4_drop_revoked_stid(stid); + nfs4_put_stid(stid); + spin_lock(&nn->client_lock); + goto retry; + } + spin_unlock(&clp->cl_lock); + } + spin_unlock(&nn->client_lock); +} + static time64_t nfs4_laundromat(struct nfsd_net *nn) { @@ -6162,12 +6485,14 @@ nfs4_laundromat(struct nfsd_net *nn) nfs4_get_client_reaplist(nn, &reaplist, &lt); nfs4_process_client_reaplist(&reaplist); + nfs40_clean_admin_revoked(nn, &lt); + spin_lock(&state_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); if (!state_expired(&lt, dp->dl_time)) break; - WARN_ON(!unhash_delegation_locked(dp)); + unhash_delegation_locked(dp, SC_STATUS_REVOKED); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); @@ -6226,6 +6551,8 @@ nfs4_laundromat(struct nfsd_net *nn) /* service the server-to-server copy delayed unmount list */ nfsd4_ssc_expire_umount(nn); #endif + if (atomic_long_read(&num_delegations) >= max_delegations) + deleg_reaper(nn); out: return max_t(time64_t, lt.new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT); } @@ -6287,6 +6614,8 @@ deleg_reaper(struct nfsd_net *nn) list_del_init(&clp->cl_ra_cblist); clp->cl_ra->ra_keep = 0; - clp->cl_ra->ra_bmval[0] = BIT(RCA4_TYPE_MASK_RDATA_DLG); + clp->cl_ra->ra_bmval[0] = BIT(RCA4_TYPE_MASK_RDATA_DLG) | + BIT(RCA4_TYPE_MASK_WDATA_DLG); trace_nfsd_cb_recall_any(clp->cl_ra); nfsd4_run_cb(&clp->cl_ra->ra_cb); } @@ -6380,6 +6709,9 @@ static __be32 nfsd4_stid_check_stateid_generation(stateid_t *in, struct nfs4_sti if (ret == nfs_ok) ret = check_stateid_generation(in, &s->sc_stateid, has_session); spin_unlock(&s->sc_lock); + if (ret == nfserr_admin_revoked) + nfsd40_drop_revoked_stid(s->sc_client, + &s->sc_stateid); return ret; } @@ -6406,32 +6738,33 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) status = nfsd4_stid_check_stateid_generation(stateid, s, 1); if (status) goto out_unlock; + status = nfsd4_verify_open_stid(s); + if (status) + goto out_unlock; + switch (s->sc_type) { - case NFS4_DELEG_STID: + case SC_TYPE_DELEG: status = nfs_ok; break; - case NFS4_REVOKED_DELEG_STID: - status = nfserr_deleg_revoked; - break; - case NFS4_OPEN_STID: - case NFS4_LOCK_STID: + case SC_TYPE_OPEN: + case SC_TYPE_LOCK: status = nfsd4_check_openowner_confirmed(openlockstateid(s)); break; default: printk("unknown stateid type %x\n", s->sc_type); - fallthrough; - case NFS4_CLOSED_STID: - case NFS4_CLOSED_DELEG_STID: status = nfserr_bad_stateid; } out_unlock: spin_unlock(&cl->cl_lock); + if (status == nfserr_admin_revoked) + nfsd40_drop_revoked_stid(cl, stateid); return status; } __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, - stateid_t *stateid, unsigned char typemask, + stateid_t *stateid, + unsigned short typemask, unsigned short statusmask, struct nfs4_stid **s, struct nfsd_net *nn) { __be32 status; @@ -6442,10 +6775,15 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, /* * only return revoked delegations if explicitly asked. * otherwise we report revoked or bad_stateid status. 
*/ - if (typemask & NFS4_REVOKED_DELEG_STID) + if (statusmask & SC_STATUS_REVOKED) return_revoked = true; - else if (typemask & NFS4_DELEG_STID) - typemask |= NFS4_REVOKED_DELEG_STID; + if (typemask & SC_TYPE_DELEG) + /* Always allow REVOKED for DELEG so we can + * return the appropriate error. + */ + statusmask |= SC_STATUS_REVOKED; + + statusmask |= SC_STATUS_ADMIN_REVOKED; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || CLOSE_STATEID(stateid)) @@ -6458,14 +6796,17 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, } if (status) return status; - stid = find_stateid_by_type(cstate->clp, stateid, typemask); + stid = find_stateid_by_type(cstate->clp, stateid, typemask, statusmask); if (!stid) return nfserr_bad_stateid; - if ((stid->sc_type == NFS4_REVOKED_DELEG_STID) && !return_revoked) { + if ((stid->sc_status & SC_STATUS_REVOKED) && !return_revoked) { nfs4_put_stid(stid); - if (cstate->minorversion) - return nfserr_deleg_revoked; - return nfserr_bad_stateid; + return nfserr_deleg_revoked; + } + if (stid->sc_status & SC_STATUS_ADMIN_REVOKED) { + nfsd40_drop_revoked_stid(cstate->clp, stateid); + nfs4_put_stid(stid); + return nfserr_admin_revoked; + } *s = stid; return nfs_ok; } @@ -6476,17 +6817,17 @@ nfs4_find_file(struct nfs4_stid *s, int flags) { struct nfsd_file *ret = NULL; - if (!s) + if (!s || s->sc_status) return NULL; switch (s->sc_type) { - case NFS4_DELEG_STID: + case SC_TYPE_DELEG: spin_lock(&s->sc_file->fi_lock); ret = nfsd_file_get(s->sc_file->fi_deleg_file); spin_unlock(&s->sc_file->fi_lock); break; - case NFS4_OPEN_STID: - case NFS4_LOCK_STID: + case SC_TYPE_OPEN: + case SC_TYPE_LOCK: if (flags & RD_STATE) ret = find_readable_file(s->sc_file); else @@ -6599,7 +6940,8 @@ static __be32 find_cpntf_state(struct nfsd_net *nn, stateid_t *st, goto out; *stid = find_stateid_by_type(found, &cps->cp_p_stateid, - NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID); + SC_TYPE_DELEG|SC_TYPE_OPEN|SC_TYPE_LOCK, + 0); if (*stid) status = nfs_ok; else @@ -6656,8 +6998,8 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, } status = nfsd4_lookup_stateid(cstate, stateid, - NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, - &s, nn); + SC_TYPE_DELEG|SC_TYPE_OPEN|SC_TYPE_LOCK, + 0, &s, nn); if (status == nfserr_bad_stateid) status = find_cpntf_state(nn, stateid, &s); if (status) @@ -6668,16 +7010,13 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, goto out; switch (s->sc_type) { - case NFS4_DELEG_STID: + case SC_TYPE_DELEG: status = nfs4_check_delegmode(delegstateid(s), flags); break; - case NFS4_OPEN_STID: - case NFS4_LOCK_STID: + case SC_TYPE_OPEN: + case SC_TYPE_LOCK: status = nfs4_check_olstateid(openlockstateid(s), flags); break; - default: - status = nfserr_bad_stateid; - break; } if (status) goto out; @@ -6756,34 +7095,39 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, spin_lock(&cl->cl_lock); s = find_stateid_locked(cl, stateid); - if (!s) + if (!s || s->sc_status & SC_STATUS_CLOSED) goto out_unlock; + if (s->sc_status & SC_STATUS_ADMIN_REVOKED) { + nfsd4_drop_revoked_stid(s); + ret = nfs_ok; + goto out; + } spin_lock(&s->sc_lock); switch (s->sc_type) { - case NFS4_DELEG_STID: + case SC_TYPE_DELEG: + if (s->sc_status & SC_STATUS_REVOKED) { + spin_unlock(&s->sc_lock); + dp = delegstateid(s); + list_del_init(&dp->dl_recall_lru); + spin_unlock(&cl->cl_lock); + nfs4_put_stid(s); + ret = nfs_ok; + goto out; + } ret = nfserr_locks_held; break; - case NFS4_OPEN_STID: + case SC_TYPE_OPEN: ret = check_stateid_generation(stateid, &s->sc_stateid, 1); if 
(ret) break; ret = nfserr_locks_held; break; - case NFS4_LOCK_STID: + case SC_TYPE_LOCK: spin_unlock(&s->sc_lock); refcount_inc(&s->sc_count); spin_unlock(&cl->cl_lock); ret = nfsd4_free_lock_stateid(stateid, s); goto out; - case NFS4_REVOKED_DELEG_STID: - spin_unlock(&s->sc_lock); - dp = delegstateid(s); - list_del_init(&dp->dl_recall_lru); - spin_unlock(&cl->cl_lock); - nfs4_put_stid(s); - ret = nfs_ok; - goto out; - /* Default falls through and returns nfserr_bad_stateid */ } spin_unlock(&s->sc_lock); out_unlock: @@ -6825,6 +7169,7 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ * @seqid: seqid (provided by client) * @stateid: stateid (provided by client) * @typemask: mask of allowable types for this operation + * @statusmask: mask of allowed states: 0 or SC_STATUS_CLOSED * @stpp: return pointer for the stateid found * @nn: net namespace for request * @@ -6834,7 +7179,8 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ */ static __be32 nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, - stateid_t *stateid, char typemask, + stateid_t *stateid, + unsigned short typemask, unsigned short statusmask, struct nfs4_ol_stateid **stpp, struct nfsd_net *nn) { @@ -6845,7 +7191,8 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, trace_nfsd_preprocess(seqid, stateid); *stpp = NULL; - status = nfsd4_lookup_stateid(cstate, stateid, typemask, &s, nn); + status = nfsd4_lookup_stateid(cstate, stateid, + typemask, statusmask, &s, nn); if (status) return status; stp = openlockstateid(s); @@ -6867,7 +7214,7 @@ static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cs struct nfs4_ol_stateid *stp; status = nfs4_preprocess_seqid_op(cstate, seqid, stateid, - NFS4_OPEN_STID, &stp, nn); + SC_TYPE_OPEN, 0, &stp, nn); if (status) return status; oo = openowner(stp->st_stateowner); @@ -6898,8 +7245,8 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status; status = nfs4_preprocess_seqid_op(cstate, - oc->oc_seqid, &oc->oc_req_stateid, - NFS4_OPEN_STID, &stp, nn); + oc->oc_seqid, &oc->oc_req_stateid, + SC_TYPE_OPEN, 0, &stp, nn); if (status) goto out; oo = openowner(stp->st_stateowner); @@ -7029,18 +7376,20 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); - dprintk("NFSD: nfsd4_close on file %pd\n", + dprintk("NFSD: nfsd4_close on file %pd\n", cstate->current_fh.fh_dentry); status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid, - &close->cl_stateid, - NFS4_OPEN_STID|NFS4_CLOSED_STID, - &stp, nn); + &close->cl_stateid, + SC_TYPE_OPEN, SC_STATUS_CLOSED, + &stp, nn); nfsd4_bump_seqid(cstate, status); if (status) - goto out; + goto out; - stp->st_stid.sc_type = NFS4_CLOSED_STID; + spin_lock(&stp->st_stid.sc_client->cl_lock); + stp->st_stid.sc_status |= SC_STATUS_CLOSED; + spin_unlock(&stp->st_stid.sc_client->cl_lock); /* * Technically we don't _really_ have to increment or copy it, since @@ -7082,7 +7431,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) return status; - status = nfsd4_lookup_stateid(cstate, stateid, NFS4_DELEG_STID, &s, nn); + status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG, 0, &s, nn); if (status) goto out; dp = delegstateid(s); @@ -7149,7 +7498,7 @@ nfsd4_lm_put_owner(fl_owner_t owner) static bool
nfsd4_lm_lock_expirable(struct file_lock *cfl) { - struct nfs4_lockowner *lo = (struct nfs4_lockowner *)cfl->fl_owner; + struct nfs4_lockowner *lo = (struct nfs4_lockowner *) cfl->c.flc_owner; struct nfs4_client *clp = lo->lo_owner.so_client; struct nfsd_net *nn; @@ -7171,7 +7520,7 @@ nfsd4_lm_expire_lock(void) static void nfsd4_lm_notify(struct file_lock *fl) { - struct nfs4_lockowner *lo = (struct nfs4_lockowner *)fl->fl_owner; + struct nfs4_lockowner *lo = (struct nfs4_lockowner *) fl->c.flc_owner; struct net *net = lo->lo_owner.so_client->net; struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct nfsd4_blocked_lock *nbl = container_of(fl, @@ -7208,7 +7557,7 @@ nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) struct nfs4_lockowner *lo; if (fl->fl_lmops == &nfsd_posix_mng_ops) { - lo = (struct nfs4_lockowner *) fl->fl_owner; + lo = (struct nfs4_lockowner *) fl->c.flc_owner; xdr_netobj_dup(&deny->ld_owner, &lo->lo_owner.so_owner, GFP_KERNEL); if (!deny->ld_owner.data) @@ -7227,7 +7576,7 @@ nevermind: if (fl->fl_end != NFS4_MAX_UINT64) deny->ld_length = fl->fl_end - fl->fl_start + 1; deny->ld_type = NFS4_READ_LT; - if (fl->fl_type != F_RDLCK) + if (fl->c.flc_type != F_RDLCK) deny->ld_type = NFS4_WRITE_LT; } @@ -7349,7 +7698,7 @@ retry: if (retstp) goto out_found; refcount_inc(&stp->st_stid.sc_count); - stp->st_stid.sc_type = NFS4_LOCK_STID; + stp->st_stid.sc_type = SC_TYPE_LOCK; stp->st_stateowner = nfs4_get_stateowner(&lo->lo_owner); get_nfs4_file(fp); stp->st_stid.sc_file = fp; @@ -7493,8 +7842,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, int lkflg; int err; bool new = false; - unsigned char fl_type; - unsigned int fl_flags = FL_POSIX; + unsigned char type; + unsigned int flags = FL_POSIX; struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -7536,9 +7885,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &lock_stp, &new); } else { status = nfs4_preprocess_seqid_op(cstate, - lock->lk_old_lock_seqid, - &lock->lk_old_lock_stateid, - NFS4_LOCK_STID, &lock_stp, nn); + lock->lk_old_lock_seqid, + &lock->lk_old_lock_stateid, + SC_TYPE_LOCK, 0, &lock_stp, + nn); } if (status) goto out; @@ -7557,14 +7907,14 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; if (lock->lk_reclaim) - fl_flags |= FL_RECLAIM; + flags |= FL_RECLAIM; fp = lock_stp->st_stid.sc_file; switch (lock->lk_type) { case NFS4_READW_LT: if (nfsd4_has_session(cstate) || exportfs_lock_op_is_async(sb->s_export_op)) - fl_flags |= FL_SLEEP; + flags |= FL_SLEEP; fallthrough; case NFS4_READ_LT: spin_lock(&fp->fi_lock); @@ -7572,12 +7922,12 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (nf) get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ); spin_unlock(&fp->fi_lock); - fl_type = F_RDLCK; + type = F_RDLCK; break; case NFS4_WRITEW_LT: if (nfsd4_has_session(cstate) || exportfs_lock_op_is_async(sb->s_export_op)) - fl_flags |= FL_SLEEP; + flags |= FL_SLEEP; fallthrough; case NFS4_WRITE_LT: spin_lock(&fp->fi_lock); @@ -7585,7 +7935,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (nf) get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE); spin_unlock(&fp->fi_lock); - fl_type = F_WRLCK; + type = F_WRLCK; break; default: status = nfserr_inval; @@ -7605,7 +7955,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * on those filesystems: */ if (!exportfs_lock_op_is_async(sb->s_export_op)) - fl_flags &= 
~FL_SLEEP; + flags &= ~FL_SLEEP; nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn); if (!nbl) { @@ -7615,11 +7965,11 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } file_lock = &nbl->nbl_lock; - file_lock->fl_type = fl_type; - file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner)); - file_lock->fl_pid = current->tgid; - file_lock->fl_file = nf->nf_file; - file_lock->fl_flags = fl_flags; + file_lock->c.flc_type = type; + file_lock->c.flc_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner)); + file_lock->c.flc_pid = current->tgid; + file_lock->c.flc_file = nf->nf_file; + file_lock->c.flc_flags = flags; file_lock->fl_lmops = &nfsd_posix_mng_ops; file_lock->fl_start = lock->lk_offset; file_lock->fl_end = last_byte_offset(lock->lk_offset, lock->lk_length); @@ -7632,7 +7982,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } - if (fl_flags & FL_SLEEP) { + if (flags & FL_SLEEP) { nbl->nbl_time = ktime_get_boottime_seconds(); spin_lock(&nn->blocked_locks_lock); list_add_tail(&nbl->nbl_list, &lock_sop->lo_blocked); @@ -7669,7 +8019,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, out: if (nbl) { /* dequeue it if we queued it before */ - if (fl_flags & FL_SLEEP) { + if (flags & FL_SLEEP) { spin_lock(&nn->blocked_locks_lock); if (!list_empty(&nbl->nbl_list) && !list_empty(&nbl->nbl_lru)) { @@ -7737,9 +8087,9 @@ static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct err = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); if (err) goto out; - lock->fl_file = nf->nf_file; + lock->c.flc_file = nf->nf_file; err = nfserrno(vfs_test_lock(nf->nf_file, lock)); - lock->fl_file = NULL; + lock->c.flc_file = NULL; out: inode_unlock(inode); nfsd_file_put(nf); @@ -7784,11 +8134,11 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, switch (lockt->lt_type) { case NFS4_READ_LT: case NFS4_READW_LT: - file_lock->fl_type = F_RDLCK; + file_lock->c.flc_type = F_RDLCK; break; case NFS4_WRITE_LT: case NFS4_WRITEW_LT: - file_lock->fl_type = F_WRLCK; + file_lock->c.flc_type = F_WRLCK; break; default: dprintk("NFSD: nfs4_lockt: bad lock type!\n"); @@ -7798,9 +8148,9 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, lo = find_lockowner_str(cstate->clp, &lockt->lt_owner); if (lo) - file_lock->fl_owner = (fl_owner_t)lo; - file_lock->fl_pid = current->tgid; - file_lock->fl_flags = FL_POSIX; + file_lock->c.flc_owner = (fl_owner_t)lo; + file_lock->c.flc_pid = current->tgid; + file_lock->c.flc_flags = FL_POSIX; file_lock->fl_start = lockt->lt_offset; file_lock->fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length); @@ -7811,7 +8161,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto out; - if (file_lock->fl_type != F_UNLCK) { + if (file_lock->c.flc_type != F_UNLCK) { status = nfserr_denied; nfs4_set_lock_denied(file_lock, &lockt->lt_denied); } @@ -7851,8 +8201,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_inval; status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid, - &locku->lu_stateid, NFS4_LOCK_STID, - &stp, nn); + &locku->lu_stateid, SC_TYPE_LOCK, 0, + &stp, nn); if (status) goto out; nf = find_any_file(stp->st_stid.sc_file); @@ -7867,11 +8217,11 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto put_file; } - file_lock->fl_type = F_UNLCK; - file_lock->fl_owner = 
(fl_owner_t)lockowner(nfs4_get_stateowner(stp->st_stateowner)); - file_lock->fl_pid = current->tgid; - file_lock->fl_file = nf->nf_file; - file_lock->fl_flags = FL_POSIX; + file_lock->c.flc_type = F_UNLCK; + file_lock->c.flc_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(stp->st_stateowner)); + file_lock->c.flc_pid = current->tgid; + file_lock->c.flc_file = nf->nf_file; + file_lock->c.flc_flags = FL_POSIX; file_lock->fl_lmops = &nfsd_posix_mng_ops; file_lock->fl_start = locku->lu_offset; @@ -7911,14 +8261,16 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) { struct file_lock *fl; int status = false; - struct nfsd_file *nf = find_any_file(fp); + struct nfsd_file *nf; struct inode *inode; struct file_lock_context *flctx; + spin_lock(&fp->fi_lock); + nf = find_any_file_locked(fp); if (!nf) { /* Any valid lock stateid should have some sort of access */ WARN_ON_ONCE(1); - return status; + goto out; } inode = file_inode(nf->nf_file); @@ -7926,15 +8278,16 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) if (flctx && !list_empty_careful(&flctx->flc_posix)) { spin_lock(&flctx->flc_lock); - list_for_each_entry(fl, &flctx->flc_posix, fl_list) { - if (fl->fl_owner == (fl_owner_t)lowner) { + for_each_file_lock(fl, &flctx->flc_posix) { + if (fl->c.flc_owner == (fl_owner_t)lowner) { status = true; break; } } spin_unlock(&flctx->flc_lock); } - nfsd_file_put(nf); +out: + spin_unlock(&fp->fi_lock); return status; } @@ -7944,10 +8297,8 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) * @cstate: NFSv4 COMPOUND state * @u: RELEASE_LOCKOWNER arguments * - * The lockowner's so_count is bumped when a lock record is added - * or when copying a conflicting lock. The latter case is brief, - * but can lead to fleeting false positives when looking for - * locks-in-use. + * Check if there are any locks still held and, if not, free the lockowner + * and any lock state that is owned.
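The recurring mechanical change in these locking hunks is the move of the generic lock fields (fl_type, fl_owner, fl_pid, fl_file, fl_flags) into the embedded struct file_lock_core, accessed as fl->c.flc_*, together with the new for_each_file_lock() iterator. A minimal sketch of the resulting access pattern, modelled on the check_for_locks() hunk above; owner_holds_posix_lock() is a hypothetical helper for illustration only (not part of this series) and assumes the post-conversion <linux/filelock.h> helpers:

static bool owner_holds_posix_lock(struct inode *inode, fl_owner_t owner)
{
	struct file_lock_context *flctx = locks_inode_context(inode);
	struct file_lock *fl;
	bool held = false;

	if (!flctx || list_empty_careful(&flctx->flc_posix))
		return false;

	spin_lock(&flctx->flc_lock);
	for_each_file_lock(fl, &flctx->flc_posix) {
		/* c.flc_owner replaces the old fl_owner field */
		if (fl->c.flc_owner == owner) {
			held = true;
			break;
		}
	}
	spin_unlock(&flctx->flc_lock);
	return held;
}

This per-owner walk is what RELEASE_LOCKOWNER now performs for each of the lockowner's stateids, replacing the earlier so_count heuristic described in the removed comment lines.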
* * Return values: * %nfs_ok: lockowner released or not found @@ -7983,17 +8334,20 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, spin_unlock(&clp->cl_lock); return nfs_ok; } - if (atomic_read(&lo->lo_owner.so_count) != 2) { - spin_unlock(&clp->cl_lock); - nfs4_put_stateowner(&lo->lo_owner); - return nfserr_locks_held; + + list_for_each_entry(stp, &lo->lo_owner.so_stateids, st_perstateowner) { + if (check_for_locks(stp->st_stid.sc_file, lo)) { + spin_unlock(&clp->cl_lock); + nfs4_put_stateowner(&lo->lo_owner); + return nfserr_locks_held; + } } unhash_lockowner_locked(lo); while (!list_empty(&lo->lo_owner.so_stateids)) { stp = list_first_entry(&lo->lo_owner.so_stateids, struct nfs4_ol_stateid, st_perstateowner); - WARN_ON(!unhash_lock_stateid(stp)); + unhash_lock_stateid(stp); put_ol_stateid_locked(stp, &reaplist); } spin_unlock(&clp->cl_lock); @@ -8286,7 +8640,7 @@ nfs4_state_shutdown_net(struct net *net) spin_lock(&state_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); - WARN_ON(!unhash_delegation_locked(dp)); + unhash_delegation_locked(dp, SC_STATUS_CLOSED); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); @@ -8428,6 +8782,8 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, * nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict * @rqstp: RPC transaction context * @inode: file to be checked for a conflict + * @modified: return true if file was modified + * @size: new size of file if modified is true * * This function is called when there is a conflict between a write * delegation and a change/size GETATTR from another client. The server @@ -8436,27 +8792,30 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, * delegation before replying to the GETATTR. See RFC 8881 section * 18.7.4. * - * The current implementation does not support CB_GETATTR yet. However - * this can avoid recalling the delegation could be added in follow up - * work. - * * Returns 0 if there is no conflict; otherwise an nfs_stat * code is returned. 
*/ __be32 -nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode) +nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode, + bool *modified, u64 *size) { __be32 status; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct file_lock_context *ctx; - struct file_lock *fl; + struct file_lease *fl; struct nfs4_delegation *dp; + struct iattr attrs; + struct nfs4_cb_fattr *ncf; + *modified = false; ctx = locks_inode_context(inode); if (!ctx) return 0; spin_lock(&ctx->flc_lock); - list_for_each_entry(fl, &ctx->flc_lease, fl_list) { - if (fl->fl_flags == FL_LAYOUT) + for_each_file_lock(fl, &ctx->flc_lease) { + unsigned char type = fl->c.flc_type; + + if (fl->c.flc_flags == FL_LAYOUT) continue; if (fl->fl_lmops != &nfsd_lease_mng_ops) { /* @@ -8464,23 +8823,49 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode) * we are done; there isn't any write delegation * on this inode */ - if (fl->fl_type == F_RDLCK) + if (type == F_RDLCK) break; goto break_lease; } - if (fl->fl_type == F_WRLCK) { - dp = fl->fl_owner; + if (type == F_WRLCK) { + dp = fl->c.flc_owner; if (dp->dl_recall.cb_clp == *(rqstp->rq_lease_breaker)) { spin_unlock(&ctx->flc_lock); return 0; } break_lease: + nfsd_stats_wdeleg_getattr_inc(nn); + dp = fl->c.flc_owner; + ncf = &dp->dl_cb_fattr; + nfs4_cb_getattr(&dp->dl_cb_fattr); spin_unlock(&ctx->flc_lock); - nfsd_stats_wdeleg_getattr_inc(); - status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); - if (status != nfserr_jukebox || - !nfsd_wait_for_delegreturn(rqstp, inode)) - return status; + wait_on_bit_timeout(&ncf->ncf_cb_flags, CB_GETATTR_BUSY, + TASK_INTERRUPTIBLE, NFSD_CB_GETATTR_TIMEOUT); + if (ncf->ncf_cb_status) { + /* Recall delegation only if client didn't respond */ + status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); + if (status != nfserr_jukebox || + !nfsd_wait_for_delegreturn(rqstp, inode)) + return status; + } + if (!ncf->ncf_file_modified && + (ncf->ncf_initial_cinfo != ncf->ncf_cb_change || + ncf->ncf_cur_fsize != ncf->ncf_cb_fsize)) + ncf->ncf_file_modified = true; + if (ncf->ncf_file_modified) { + /* + * Per section 10.4.3 of RFC 8881, the server would + * not update the file's metadata with the client's + * modified size + */ + attrs.ia_mtime = attrs.ia_ctime = current_time(inode); + attrs.ia_valid = ATTR_MTIME | ATTR_CTIME; + setattr_copy(&nop_mnt_idmap, inode, &attrs); + mark_inode_dirty(inode); + ncf->ncf_cur_fsize = ncf->ncf_cb_fsize; + *size = ncf->ncf_cur_fsize; + *modified = true; + } return 0; } break; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index c719c475a068..fac938f563ad 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3507,6 +3507,8 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, unsigned long mask[2]; } u; unsigned long bit; + bool file_modified = false; + u64 size = 0; WARN_ON_ONCE(bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1); WARN_ON_ONCE(!nfsd_attrs_supported(minorversion, bmval)); @@ -3533,7 +3535,8 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, } args.size = 0; if (u.attrmask[0] & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) { - status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry)); + status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry), + &file_modified, &size); if (status) goto out; } @@ -3543,7 +3546,10 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, AT_STATX_SYNC_AS_STAT); if (err) goto out_nfserr; - args.size = args.stat.size; + if (file_modified) 
+ args.size = size; + else + args.size = args.stat.size; if (!(args.stat.result_mask & STATX_BTIME)) /* underlying FS does not offer btime so we can't share it */ @@ -5386,16 +5392,11 @@ nfsd4_listxattr_validate_cookie(struct nfsd4_listxattrs *listxattrs, /* * If the cookie is larger than the maximum number we can fit - * in either the buffer we just got back from vfs_listxattr, or, - * XDR-encoded, in the return buffer, it's invalid. + * in the buffer we just got back from vfs_listxattr, it's invalid. */ if (cookie > (listxattrs->lsxa_len) / (XATTR_USER_PREFIX_LEN + 2)) return nfserr_badcookie; - if (cookie > (listxattrs->lsxa_maxcount / - (XDR_QUADLEN(XATTR_USER_PREFIX_LEN + 2) + 4))) - return nfserr_badcookie; - *offsetp = (u32)cookie; return 0; } @@ -5412,6 +5413,7 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr, u64 cookie; char *sp; __be32 status, tmp; + __be64 wire_cookie; __be32 *p; u32 nuser; @@ -5427,7 +5429,7 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr, */ cookie_offset = xdr->buf->len; count_offset = cookie_offset + 8; - p = xdr_reserve_space(xdr, 12); + p = xdr_reserve_space(xdr, XDR_UNIT * 3); if (!p) { status = nfserr_resource; goto out; @@ -5438,7 +5440,8 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr, sp = listxattrs->lsxa_buf; nuser = 0; - xdrleft = listxattrs->lsxa_maxcount; + /* Bytes left is maxcount - 8 (cookie) - 4 (array count) */ + xdrleft = listxattrs->lsxa_maxcount - XDR_UNIT * 3; while (left > 0 && xdrleft > 0) { slen = strlen(sp); @@ -5451,7 +5454,8 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr, slen -= XATTR_USER_PREFIX_LEN; xdrlen = 4 + ((slen + 3) & ~3); - if (xdrlen > xdrleft) { + /* Check if both entry and eof can fit in the XDR buffer */ + if (xdrlen + XDR_UNIT > xdrleft) { if (count == 0) { /* * Can't even fit the first attribute name. @@ -5503,7 +5507,8 @@ wreof: cookie = offset + count; - write_bytes_to_xdr_buf(xdr->buf, cookie_offset, &cookie, 8); + wire_cookie = cpu_to_be64(cookie); + write_bytes_to_xdr_buf(xdr->buf, cookie_offset, &wire_cookie, 8); tmp = cpu_to_be32(count); write_bytes_to_xdr_buf(xdr->buf, count_offset, &tmp, 4); out: @@ -5727,27 +5732,24 @@ release: rqstp->rq_next_page = xdr->page_ptr + 1; } -/* - * Encode the reply stored in the stateowner reply cache - * - * XDR note: do not encode rp->rp_buflen: the buffer contains the - * previously sent already encoded operation. +/** + * nfsd4_encode_replay - encode a result stored in the stateowner reply cache + * @xdr: send buffer's XDR stream + * @op: operation being replayed + * + * @op->replay->rp_buf contains the previously-sent already-encoded result. 
*/ -void -nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op) +void nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op) { - __be32 *p; struct nfs4_replay *rp = op->replay; - p = xdr_reserve_space(xdr, 8 + rp->rp_buflen); - if (!p) { - WARN_ON_ONCE(1); - return; - } - *p++ = cpu_to_be32(op->opnum); - *p++ = rp->rp_status; /* already xdr'ed */ + trace_nfsd_stateowner_replay(op->opnum, rp); - p = xdr_encode_opaque_fixed(p, rp->rp_buf, rp->rp_buflen); + if (xdr_stream_encode_u32(xdr, op->opnum) != XDR_UNIT) + return; + if (xdr_stream_encode_be32(xdr, rp->rp_status) != XDR_UNIT) + return; + xdr_stream_encode_opaque_fixed(xdr, rp->rp_buf, rp->rp_buflen); } void nfsd4_release_compoundargs(struct svc_rqst *rqstp) diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 5c1a4a0aa605..ba9d326b3de6 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -166,8 +166,7 @@ nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct nfsd_cacherep *rp, int nfsd_drc_slab_create(void) { - drc_slab = kmem_cache_create("nfsd_drc", - sizeof(struct nfsd_cacherep), 0, 0, NULL); + drc_slab = KMEM_CACHE(nfsd_cacherep, 0); return drc_slab ? 0: -ENOMEM; } @@ -176,27 +175,6 @@ void nfsd_drc_slab_free(void) kmem_cache_destroy(drc_slab); } -/** - * nfsd_net_reply_cache_init - per net namespace reply cache set-up - * @nn: nfsd_net being initialized - * - * Returns zero on succes; otherwise a negative errno is returned. - */ -int nfsd_net_reply_cache_init(struct nfsd_net *nn) -{ - return nfsd_percpu_counters_init(nn->counter, NFSD_NET_COUNTERS_NUM); -} - -/** - * nfsd_net_reply_cache_destroy - per net namespace reply cache tear-down - * @nn: nfsd_net being freed - * - */ -void nfsd_net_reply_cache_destroy(struct nfsd_net *nn) -{ - nfsd_percpu_counters_destroy(nn->counter, NFSD_NET_COUNTERS_NUM); -} - int nfsd_reply_cache_init(struct nfsd_net *nn) { unsigned int hashsize; @@ -501,7 +479,7 @@ out: int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, unsigned int len, struct nfsd_cacherep **cacherep) { - struct nfsd_net *nn; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct nfsd_cacherep *rp, *found; __wsum csum; struct nfsd_drc_bucket *b; @@ -510,7 +488,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, int rtn = RC_DOIT; if (type == RC_NOCACHE) { - nfsd_stats_rc_nocache_inc(); + nfsd_stats_rc_nocache_inc(nn); goto out; } @@ -520,7 +498,6 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, * Since the common case is a cache miss followed by an insert, * preallocate an entry. */ - nn = net_generic(SVC_NET(rqstp), nfsd_net_id); rp = nfsd_cacherep_alloc(rqstp, csum, nn); if (!rp) goto out; @@ -537,7 +514,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, nfsd_cacherep_dispose(&dispose); - nfsd_stats_rc_misses_inc(); + nfsd_stats_rc_misses_inc(nn); atomic_inc(&nn->num_drc_entries); nfsd_stats_drc_mem_usage_add(nn, sizeof(*rp)); goto out; @@ -545,7 +522,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, found_entry: /* We found a matching entry which is either in progress or done. 
*/ nfsd_reply_cache_free_locked(NULL, rp, nn); - nfsd_stats_rc_hits_inc(); + nfsd_stats_rc_hits_inc(nn); rtn = RC_DROPIT; rp = found; @@ -687,15 +664,15 @@ int nfsd_reply_cache_stats_show(struct seq_file *m, void *v) atomic_read(&nn->num_drc_entries)); seq_printf(m, "hash buckets: %u\n", 1 << nn->maskbits); seq_printf(m, "mem usage: %lld\n", - percpu_counter_sum_positive(&nn->counter[NFSD_NET_DRC_MEM_USAGE])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_DRC_MEM_USAGE])); seq_printf(m, "cache hits: %lld\n", - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_HITS])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_HITS])); seq_printf(m, "cache misses: %lld\n", - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_MISSES])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_MISSES])); seq_printf(m, "not cached: %lld\n", - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_NOCACHE])); seq_printf(m, "payload misses: %lld\n", - percpu_counter_sum_positive(&nn->counter[NFSD_NET_PAYLOAD_MISSES])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_PAYLOAD_MISSES])); seq_printf(m, "longest chain len: %u\n", nn->longest_chain); seq_printf(m, "cachesize at longest: %u\n", nn->longest_chain_cachesize); return 0; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index f206ca32e7f5..ecd18bffeebc 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -281,6 +281,7 @@ static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size) * 3. Is that directory the root of an exported file system? */ error = nlmsvc_unlock_all_by_sb(path.dentry->d_sb); + nfsd4_revoke_states(netns(file), path.dentry->d_sb); path_put(&path); return error; @@ -1671,14 +1672,17 @@ static __net_init int nfsd_net_init(struct net *net) retval = nfsd_idmap_init(net); if (retval) goto out_idmap_error; - retval = nfsd_net_reply_cache_init(nn); + retval = nfsd_stat_counters_init(nn); if (retval) goto out_repcache_error; + memset(&nn->nfsd_svcstats, 0, sizeof(nn->nfsd_svcstats)); + nn->nfsd_svcstats.program = &nfsd_program; nn->nfsd_versions = NULL; nn->nfsd4_minorversions = NULL; nfsd4_init_leases_net(nn); get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key)); seqlock_init(&nn->writeverf_lock); + nfsd_proc_stat_init(net); return 0; @@ -1699,7 +1703,8 @@ static __net_exit void nfsd_net_exit(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); - nfsd_net_reply_cache_destroy(nn); + nfsd_proc_stat_shutdown(net); + nfsd_stat_counters_destroy(nn); nfsd_idmap_shutdown(net); nfsd_export_shutdown(net); nfsd_netns_free_versions(nn); @@ -1722,12 +1727,9 @@ static int __init init_nfsd(void) retval = nfsd4_init_pnfs(); if (retval) goto out_free_slabs; - retval = nfsd_stat_init(); /* Statistics */ - if (retval) - goto out_free_pnfs; retval = nfsd_drc_slab_create(); if (retval) - goto out_free_stat; + goto out_free_pnfs; nfsd_lockd_init(); /* lockd->nfsd callbacks */ retval = create_proc_exports_entry(); if (retval) @@ -1761,8 +1763,6 @@ out_free_exports: out_free_lockd: nfsd_lockd_shutdown(); nfsd_drc_slab_free(); -out_free_stat: - nfsd_stat_shutdown(); out_free_pnfs: nfsd4_exit_pnfs(); out_free_slabs: @@ -1780,7 +1780,6 @@ static void __exit exit_nfsd(void) nfsd_drc_slab_free(); remove_proc_entry("fs/nfs/exports", NULL); remove_proc_entry("fs/nfs", NULL); - nfsd_stat_shutdown(); nfsd_lockd_shutdown(); nfsd4_free_slabs(); nfsd4_exit_pnfs(); diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 
304e9728b929..16c5a05f340e 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -86,6 +86,7 @@ extern struct mutex nfsd_mutex; extern spinlock_t nfsd_drc_lock; extern unsigned long nfsd_drc_max_mem; extern unsigned long nfsd_drc_mem_used; +extern atomic_t nfsd_th_cnt; /* number of available threads */ extern const struct seq_operations nfs_exports_op; @@ -274,6 +275,7 @@ void nfsd_lockd_shutdown(void); #define nfserr_no_grace cpu_to_be32(NFSERR_NO_GRACE) #define nfserr_reclaim_bad cpu_to_be32(NFSERR_RECLAIM_BAD) #define nfserr_badname cpu_to_be32(NFSERR_BADNAME) +#define nfserr_admin_revoked cpu_to_be32(NFS4ERR_ADMIN_REVOKED) #define nfserr_cb_path_down cpu_to_be32(NFSERR_CB_PATH_DOWN) #define nfserr_locked cpu_to_be32(NFSERR_LOCKED) #define nfserr_wrongsec cpu_to_be32(NFSERR_WRONGSEC) @@ -365,6 +367,7 @@ void nfsd_lockd_shutdown(void); #define NFSD_CLIENT_MAX_TRIM_PER_RUN 128 #define NFS4_CLIENTS_PER_GB 1024 #define NFSD_DELEGRETURN_TIMEOUT (HZ / 34) /* 30ms */ +#define NFSD_CB_GETATTR_TIMEOUT NFSD_DELEGRETURN_TIMEOUT /* * The following attributes are currently not supported by the NFSv4 server: diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index dbfa0ac13564..40fecf7b224f 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -327,6 +327,7 @@ out: __be32 fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) { + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct svc_export *exp = NULL; struct dentry *dentry; __be32 error; @@ -395,7 +396,7 @@ skip_pseudoflavor_check: out: trace_nfsd_fh_verify_err(rqstp, fhp, type, access, error); if (error == nfserr_stale) - nfsd_stats_fh_stale_inc(exp); + nfsd_stats_fh_stale_inc(nn, exp); return error; } diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index a7315928a760..36370b957b63 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -103,7 +103,7 @@ nfsd_proc_setattr(struct svc_rqst *rqstp) } } - resp->status = nfsd_setattr(rqstp, fhp, &attrs, 0, (time64_t)0); + resp->status = nfsd_setattr(rqstp, fhp, &attrs, NULL); if (resp->status != nfs_ok) goto out; @@ -390,8 +390,8 @@ nfsd_proc_create(struct svc_rqst *rqstp) */ attr->ia_valid &= ATTR_SIZE; if (attr->ia_valid) - resp->status = nfsd_setattr(rqstp, newfhp, &attrs, 0, - (time64_t)0); + resp->status = nfsd_setattr(rqstp, newfhp, &attrs, + NULL); } out_unlock: diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index a667802e08e7..c0d17b92b249 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -34,6 +34,7 @@ #define NFSDDBG_FACILITY NFSDDBG_SVC +atomic_t nfsd_th_cnt = ATOMIC_INIT(0); extern struct svc_program nfsd_program; static int nfsd(void *vrqstp); #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) @@ -80,7 +81,6 @@ unsigned long nfsd_drc_max_mem; unsigned long nfsd_drc_mem_used; #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) -static struct svc_stat nfsd_acl_svcstats; static const struct svc_version *nfsd_acl_version[] = { # if defined(CONFIG_NFSD_V2_ACL) [2] = &nfsd_acl_version2, @@ -99,15 +99,11 @@ static struct svc_program nfsd_acl_program = { .pg_vers = nfsd_acl_version, .pg_name = "nfsacl", .pg_class = "nfsd", - .pg_stats = &nfsd_acl_svcstats, .pg_authenticate = &svc_set_client, .pg_init_request = nfsd_acl_init_request, .pg_rpcbind_set = nfsd_acl_rpcbind_set, }; -static struct svc_stat nfsd_acl_svcstats = { - .program = &nfsd_acl_program, -}; #endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ static const struct svc_version *nfsd_version[] = { @@ -132,7 +128,6 @@ struct svc_program 
nfsd_program = { .pg_vers = nfsd_version, /* version table */ .pg_name = "nfsd", /* program name */ .pg_class = "nfsd", /* authentication class */ - .pg_stats = &nfsd_svcstats, /* version table */ .pg_authenticate = &svc_set_client, /* export authentication */ .pg_init_request = nfsd_init_request, .pg_rpcbind_set = nfsd_rpcbind_set, @@ -666,7 +661,8 @@ int nfsd_create_serv(struct net *net) if (nfsd_max_blksize == 0) nfsd_max_blksize = nfsd_get_default_max_blksize(); nfsd_reset_versions(nn); - serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, nfsd); + serv = svc_create_pooled(&nfsd_program, &nn->nfsd_svcstats, + nfsd_max_blksize, nfsd); if (serv == NULL) return -ENOMEM; @@ -929,7 +925,7 @@ nfsd(void *vrqstp) current->fs->umask = 0; - atomic_inc(&nfsdstats.th_cnt); + atomic_inc(&nfsd_th_cnt); set_freezable(); @@ -941,9 +937,11 @@ nfsd(void *vrqstp) rqstp->rq_server->sv_maxconn = nn->max_connections; svc_recv(rqstp); + + nfsd_file_net_dispose(nn); } - atomic_dec(&nfsdstats.th_cnt); + atomic_dec(&nfsd_th_cnt); out: /* Release the thread */ diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h index de1e0dfed06a..925817f66917 100644 --- a/fs/nfsd/pnfs.h +++ b/fs/nfsd/pnfs.h @@ -37,7 +37,8 @@ struct nfsd4_layout_ops { __be32 (*proc_layoutcommit)(struct inode *inode, struct nfsd4_layoutcommit *lcp); - void (*fence_client)(struct nfs4_layout_stateid *ls); + void (*fence_client)(struct nfs4_layout_stateid *ls, + struct nfsd_file *file); }; extern const struct nfsd4_layout_ops *nfsd4_layout_ops[]; @@ -72,11 +73,13 @@ void nfsd4_setup_layout_type(struct svc_export *exp); void nfsd4_return_all_client_layouts(struct nfs4_client *); void nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp); +void nfsd4_close_layout(struct nfs4_layout_stateid *ls); int nfsd4_init_pnfs(void); void nfsd4_exit_pnfs(void); #else struct nfs4_client; struct nfs4_file; +struct nfs4_layout_stateid; static inline void nfsd4_setup_layout_type(struct svc_export *exp) { @@ -89,6 +92,9 @@ static inline void nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp) { } +static inline void nfsd4_close_layout(struct nfs4_layout_stateid *ls) +{ +} static inline void nfsd4_exit_pnfs(void) { } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 41bdc913fa71..01c6f3445646 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -68,7 +68,7 @@ struct nfsd4_callback { struct nfs4_client *cb_clp; struct rpc_message cb_msg; const struct nfsd4_callback_ops *cb_ops; - struct work_struct cb_work; + struct delayed_work cb_work; int cb_seq_status; int cb_status; bool cb_need_restart; @@ -88,17 +88,34 @@ struct nfsd4_callback_ops { */ struct nfs4_stid { refcount_t sc_count; -#define NFS4_OPEN_STID 1 -#define NFS4_LOCK_STID 2 -#define NFS4_DELEG_STID 4 -/* For an open stateid kept around *only* to process close replays: */ -#define NFS4_CLOSED_STID 8 + + /* A new stateid is added to the cl_stateids idr early before it + * is fully initialised. Its sc_type is then zero. After + * initialisation the sc_type is set under cl_lock, and then + * never changes. + */ +#define SC_TYPE_OPEN BIT(0) +#define SC_TYPE_LOCK BIT(1) +#define SC_TYPE_DELEG BIT(2) +#define SC_TYPE_LAYOUT BIT(3) + unsigned short sc_type; + +/* state_lock protects sc_status for delegation stateids. * ->cl_lock protects sc_status for open and lock stateids. * ->st_mutex also protects sc_status for open stateids. * ->ls_lock protects sc_status for layout stateids. */ /* * For an open stateid kept around *only* to process close replays.
+ * For deleg stateid, kept in idr until last reference is dropped. + */ +#define SC_STATUS_CLOSED BIT(0) /* For a deleg stateid kept around only to process free_stateid's: */ -#define NFS4_REVOKED_DELEG_STID 16 -#define NFS4_CLOSED_DELEG_STID 32 -#define NFS4_LAYOUT_STID 64 +#define SC_STATUS_REVOKED BIT(1) +#define SC_STATUS_ADMIN_REVOKED BIT(2) + unsigned short sc_status; + struct list_head sc_cp_list; - unsigned char sc_type; stateid_t sc_stateid; spinlock_t sc_lock; struct nfs4_client *sc_client; @@ -117,6 +134,24 @@ struct nfs4_cpntf_state { time64_t cpntf_time; /* last time stateid used */ }; +struct nfs4_cb_fattr { + struct nfsd4_callback ncf_getattr; + u32 ncf_cb_status; + u32 ncf_cb_bmap[1]; + + /* from CB_GETATTR reply */ + u64 ncf_cb_change; + u64 ncf_cb_fsize; + + unsigned long ncf_cb_flags; + bool ncf_file_modified; + u64 ncf_initial_cinfo; + u64 ncf_cur_fsize; +}; + +/* bits for ncf_cb_flags */ +#define CB_GETATTR_BUSY 0 + /* * Represents a delegation stateid. The nfs4_client holds references to these * and they are put when it is being destroyed or when the delegation is @@ -150,6 +185,9 @@ struct nfs4_delegation { int dl_retries; struct nfsd4_callback dl_recall; bool dl_recalled; + + /* for CB_GETATTR */ + struct nfs4_cb_fattr dl_cb_fattr; }; #define cb_to_delegation(cb) \ @@ -317,8 +355,9 @@ enum { * 0. If they are not renewed within a lease period, they become eligible for * destruction by the laundromat. * - * These objects can also be destroyed prematurely by the fault injection code, - * or if the client sends certain forms of SETCLIENTID or EXCHANGE_ID updates. + * These objects can also be destroyed if the client sends certain forms of + * SETCLIENTID or EXCHANGE_ID operations. + * * Care is taken *not* to do this however when the objects have an elevated * refcount. * @@ -326,7 +365,7 @@ enum { * * o Each nfs4_clients is also hashed by name (the opaque quantity initially * sent by the client to identify itself). 
- * + * * o cl_perclient list is used to ensure no dangling stateowner references * when we expire the nfs4_client */ @@ -351,6 +390,7 @@ struct nfs4_client { clientid_t cl_clientid; /* generated by server */ nfs4_verifier cl_confirm; /* generated by server */ u32 cl_minorversion; + atomic_t cl_admin_revoked; /* count of admin-revoked states */ /* NFSv4.1 client implementation id: */ struct xdr_netobj cl_nii_domain; struct xdr_netobj cl_nii_name; @@ -640,6 +680,7 @@ enum nfsd4_cb_op { NFSPROC4_CLNT_CB_SEQUENCE, NFSPROC4_CLNT_CB_NOTIFY_LOCK, NFSPROC4_CLNT_CB_RECALL_ANY, + NFSPROC4_CLNT_CB_GETATTR, }; /* Returns true iff a is later than b: */ @@ -672,15 +713,15 @@ extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, stateid_t *stateid, int flags, struct nfsd_file **filp, struct nfs4_stid **cstid); __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, - stateid_t *stateid, unsigned char typemask, - struct nfs4_stid **s, struct nfsd_net *nn); + stateid_t *stateid, unsigned short typemask, + unsigned short statusmask, + struct nfs4_stid **s, struct nfsd_net *nn); struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab, void (*sc_free)(struct nfs4_stid *)); int nfs4_init_copy_state(struct nfsd_net *nn, struct nfsd4_copy *copy); void nfs4_free_copy_state(struct nfsd4_copy *copy); struct nfs4_cpntf_state *nfs4_alloc_init_cpntf_state(struct nfsd_net *nn, struct nfs4_stid *p_stid); -void nfs4_unhash_stid(struct nfs4_stid *s); void nfs4_put_stid(struct nfs4_stid *s); void nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid); void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *); @@ -714,6 +755,14 @@ static inline void get_nfs4_file(struct nfs4_file *fi) } struct nfsd_file *find_any_file(struct nfs4_file *f); +#ifdef CONFIG_NFSD_V4 +void nfsd4_revoke_states(struct net *net, struct super_block *sb); +#else +static inline void nfsd4_revoke_states(struct net *net, struct super_block *sb) +{ +} +#endif + /* grace period management */ void nfsd4_end_grace(struct nfsd_net *nn); @@ -732,5 +781,5 @@ static inline bool try_to_expire_client(struct nfs4_client *clp) } extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, - struct inode *inode); + struct inode *inode, bool *file_modified, u64 *size); #endif /* NFSD4_STATE_H */ diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 12d79f5d4eb1..be52fb1e928e 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -27,25 +27,22 @@ #include "nfsd.h" -struct nfsd_stats nfsdstats; -struct svc_stat nfsd_svcstats = { - .program = &nfsd_program, -}; - static int nfsd_show(struct seq_file *seq, void *v) { + struct net *net = pde_data(file_inode(seq->file)); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); int i; seq_printf(seq, "rc %lld %lld %lld\nfh %lld 0 0 0 0\nio %lld %lld\n", - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_HITS]), - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_MISSES]), - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE]), - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_FH_STALE]), - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_IO_READ]), - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_IO_WRITE])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_HITS]), + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_MISSES]), + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_NOCACHE]), + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_FH_STALE]), + 
percpu_counter_sum_positive(&nn->counter[NFSD_STATS_IO_READ]), + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_IO_WRITE])); /* thread usage: */ - seq_printf(seq, "th %u 0", atomic_read(&nfsdstats.th_cnt)); + seq_printf(seq, "th %u 0", atomic_read(&nfsd_th_cnt)); /* deprecated thread usage histogram stats */ for (i = 0; i < 10; i++) @@ -55,7 +52,7 @@ static int nfsd_show(struct seq_file *seq, void *v) seq_puts(seq, "\nra 0 0 0 0 0 0 0 0 0 0 0 0\n"); /* show my rpc info */ - svc_seq_show(seq, &nfsd_svcstats); + svc_seq_show(seq, &nn->nfsd_svcstats); #ifdef CONFIG_NFSD_V4 /* Show count for individual nfsv4 operations */ @@ -63,10 +60,10 @@ static int nfsd_show(struct seq_file *seq, void *v) seq_printf(seq, "proc4ops %u", LAST_NFS4_OP + 1); for (i = 0; i <= LAST_NFS4_OP; i++) { seq_printf(seq, " %lld", - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_NFS4_OP(i)])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_NFS4_OP(i)])); } seq_printf(seq, "\nwdeleg_getattr %lld", - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_WDELEG_GETATTR])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_WDELEG_GETATTR])); seq_putc(seq, '\n'); #endif @@ -108,31 +105,24 @@ void nfsd_percpu_counters_destroy(struct percpu_counter counters[], int num) percpu_counter_destroy(&counters[i]); } -static int nfsd_stat_counters_init(void) +int nfsd_stat_counters_init(struct nfsd_net *nn) { - return nfsd_percpu_counters_init(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM); + return nfsd_percpu_counters_init(nn->counter, NFSD_STATS_COUNTERS_NUM); } -static void nfsd_stat_counters_destroy(void) +void nfsd_stat_counters_destroy(struct nfsd_net *nn) { - nfsd_percpu_counters_destroy(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM); + nfsd_percpu_counters_destroy(nn->counter, NFSD_STATS_COUNTERS_NUM); } -int nfsd_stat_init(void) +void nfsd_proc_stat_init(struct net *net) { - int err; - - err = nfsd_stat_counters_init(); - if (err) - return err; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); - svc_proc_register(&init_net, &nfsd_svcstats, &nfsd_proc_ops); - - return 0; + svc_proc_register(net, &nn->nfsd_svcstats, &nfsd_proc_ops); } -void nfsd_stat_shutdown(void) +void nfsd_proc_stat_shutdown(struct net *net) { - nfsd_stat_counters_destroy(); - svc_proc_unregister(&init_net, "nfsd"); + svc_proc_unregister(net, "nfsd"); } diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h index 14f50c660b61..d2753e975dfd 100644 --- a/fs/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -10,94 +10,72 @@ #include <uapi/linux/nfsd/stats.h> #include <linux/percpu_counter.h> - -enum { - NFSD_STATS_RC_HITS, /* repcache hits */ - NFSD_STATS_RC_MISSES, /* repcache misses */ - NFSD_STATS_RC_NOCACHE, /* uncached reqs */ - NFSD_STATS_FH_STALE, /* FH stale error */ - NFSD_STATS_IO_READ, /* bytes returned to read requests */ - NFSD_STATS_IO_WRITE, /* bytes passed in write requests */ -#ifdef CONFIG_NFSD_V4 - NFSD_STATS_FIRST_NFS4_OP, /* count of individual nfsv4 operations */ - NFSD_STATS_LAST_NFS4_OP = NFSD_STATS_FIRST_NFS4_OP + LAST_NFS4_OP, -#define NFSD_STATS_NFS4_OP(op) (NFSD_STATS_FIRST_NFS4_OP + (op)) - NFSD_STATS_WDELEG_GETATTR, /* count of getattr conflict with wdeleg */ -#endif - NFSD_STATS_COUNTERS_NUM -}; - -struct nfsd_stats { - struct percpu_counter counter[NFSD_STATS_COUNTERS_NUM]; - - atomic_t th_cnt; /* number of available threads */ -}; - -extern struct nfsd_stats nfsdstats; - -extern struct svc_stat nfsd_svcstats; - int nfsd_percpu_counters_init(struct percpu_counter *counters, int num); void 
nfsd_percpu_counters_reset(struct percpu_counter *counters, int num); void nfsd_percpu_counters_destroy(struct percpu_counter *counters, int num); -int nfsd_stat_init(void); -void nfsd_stat_shutdown(void); +int nfsd_stat_counters_init(struct nfsd_net *nn); +void nfsd_stat_counters_destroy(struct nfsd_net *nn); +void nfsd_proc_stat_init(struct net *net); +void nfsd_proc_stat_shutdown(struct net *net); -static inline void nfsd_stats_rc_hits_inc(void) +static inline void nfsd_stats_rc_hits_inc(struct nfsd_net *nn) { - percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_HITS]); + percpu_counter_inc(&nn->counter[NFSD_STATS_RC_HITS]); } -static inline void nfsd_stats_rc_misses_inc(void) +static inline void nfsd_stats_rc_misses_inc(struct nfsd_net *nn) { - percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_MISSES]); + percpu_counter_inc(&nn->counter[NFSD_STATS_RC_MISSES]); } -static inline void nfsd_stats_rc_nocache_inc(void) +static inline void nfsd_stats_rc_nocache_inc(struct nfsd_net *nn) { - percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE]); + percpu_counter_inc(&nn->counter[NFSD_STATS_RC_NOCACHE]); } -static inline void nfsd_stats_fh_stale_inc(struct svc_export *exp) +static inline void nfsd_stats_fh_stale_inc(struct nfsd_net *nn, + struct svc_export *exp) { - percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_FH_STALE]); + percpu_counter_inc(&nn->counter[NFSD_STATS_FH_STALE]); if (exp && exp->ex_stats) percpu_counter_inc(&exp->ex_stats->counter[EXP_STATS_FH_STALE]); } -static inline void nfsd_stats_io_read_add(struct svc_export *exp, s64 amount) +static inline void nfsd_stats_io_read_add(struct nfsd_net *nn, + struct svc_export *exp, s64 amount) { - percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_READ], amount); + percpu_counter_add(&nn->counter[NFSD_STATS_IO_READ], amount); if (exp && exp->ex_stats) percpu_counter_add(&exp->ex_stats->counter[EXP_STATS_IO_READ], amount); } -static inline void nfsd_stats_io_write_add(struct svc_export *exp, s64 amount) +static inline void nfsd_stats_io_write_add(struct nfsd_net *nn, + struct svc_export *exp, s64 amount) { - percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_WRITE], amount); + percpu_counter_add(&nn->counter[NFSD_STATS_IO_WRITE], amount); if (exp && exp->ex_stats) percpu_counter_add(&exp->ex_stats->counter[EXP_STATS_IO_WRITE], amount); } static inline void nfsd_stats_payload_misses_inc(struct nfsd_net *nn) { - percpu_counter_inc(&nn->counter[NFSD_NET_PAYLOAD_MISSES]); + percpu_counter_inc(&nn->counter[NFSD_STATS_PAYLOAD_MISSES]); } static inline void nfsd_stats_drc_mem_usage_add(struct nfsd_net *nn, s64 amount) { - percpu_counter_add(&nn->counter[NFSD_NET_DRC_MEM_USAGE], amount); + percpu_counter_add(&nn->counter[NFSD_STATS_DRC_MEM_USAGE], amount); } static inline void nfsd_stats_drc_mem_usage_sub(struct nfsd_net *nn, s64 amount) { - percpu_counter_sub(&nn->counter[NFSD_NET_DRC_MEM_USAGE], amount); + percpu_counter_sub(&nn->counter[NFSD_STATS_DRC_MEM_USAGE], amount); } #ifdef CONFIG_NFSD_V4 -static inline void nfsd_stats_wdeleg_getattr_inc(void) +static inline void nfsd_stats_wdeleg_getattr_inc(struct nfsd_net *nn) { - percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_WDELEG_GETATTR]); + percpu_counter_inc(&nn->counter[NFSD_STATS_WDELEG_GETATTR]); } #endif #endif /* _NFSD_STATS_H */ diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index d1e8cf079b0f..e545e92c4408 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -9,8 +9,10 @@ #define _NFSD_TRACE_H #include <linux/tracepoint.h> +#include <linux/sunrpc/clnt.h> #include 
<linux/sunrpc/xprt.h> #include <trace/misc/nfs.h> +#include <trace/misc/sunrpc.h> #include "export.h" #include "nfsfh.h" @@ -641,23 +643,18 @@ DEFINE_EVENT(nfsd_stateseqid_class, nfsd_##name, \ DEFINE_STATESEQID_EVENT(preprocess); DEFINE_STATESEQID_EVENT(open_confirm); -TRACE_DEFINE_ENUM(NFS4_OPEN_STID); -TRACE_DEFINE_ENUM(NFS4_LOCK_STID); -TRACE_DEFINE_ENUM(NFS4_DELEG_STID); -TRACE_DEFINE_ENUM(NFS4_CLOSED_STID); -TRACE_DEFINE_ENUM(NFS4_REVOKED_DELEG_STID); -TRACE_DEFINE_ENUM(NFS4_CLOSED_DELEG_STID); -TRACE_DEFINE_ENUM(NFS4_LAYOUT_STID); - #define show_stid_type(x) \ __print_flags(x, "|", \ - { NFS4_OPEN_STID, "OPEN" }, \ - { NFS4_LOCK_STID, "LOCK" }, \ - { NFS4_DELEG_STID, "DELEG" }, \ - { NFS4_CLOSED_STID, "CLOSED" }, \ - { NFS4_REVOKED_DELEG_STID, "REVOKED" }, \ - { NFS4_CLOSED_DELEG_STID, "CLOSED_DELEG" }, \ - { NFS4_LAYOUT_STID, "LAYOUT" }) + { SC_TYPE_OPEN, "OPEN" }, \ + { SC_TYPE_LOCK, "LOCK" }, \ + { SC_TYPE_DELEG, "DELEG" }, \ + { SC_TYPE_LAYOUT, "LAYOUT" }) + +#define show_stid_status(x) \ + __print_flags(x, "|", \ + { SC_STATUS_CLOSED, "CLOSED" }, \ + { SC_STATUS_REVOKED, "REVOKED" }, \ + { SC_STATUS_ADMIN_REVOKED, "ADMIN_REVOKED" }) DECLARE_EVENT_CLASS(nfsd_stid_class, TP_PROTO( @@ -666,6 +663,7 @@ DECLARE_EVENT_CLASS(nfsd_stid_class, TP_ARGS(stid), TP_STRUCT__entry( __field(unsigned long, sc_type) + __field(unsigned long, sc_status) __field(int, sc_count) __field(u32, cl_boot) __field(u32, cl_id) @@ -676,16 +674,18 @@ DECLARE_EVENT_CLASS(nfsd_stid_class, const stateid_t *stp = &stid->sc_stateid; __entry->sc_type = stid->sc_type; + __entry->sc_status = stid->sc_status; __entry->sc_count = refcount_read(&stid->sc_count); __entry->cl_boot = stp->si_opaque.so_clid.cl_boot; __entry->cl_id = stp->si_opaque.so_clid.cl_id; __entry->si_id = stp->si_opaque.so_id; __entry->si_generation = stp->si_generation; ), - TP_printk("client %08x:%08x stateid %08x:%08x ref=%d type=%s", + TP_printk("client %08x:%08x stateid %08x:%08x ref=%d type=%s state=%s", __entry->cl_boot, __entry->cl_id, __entry->si_id, __entry->si_generation, - __entry->sc_count, show_stid_type(__entry->sc_type) + __entry->sc_count, show_stid_type(__entry->sc_type), + show_stid_status(__entry->sc_status) ) ); @@ -696,6 +696,59 @@ DEFINE_EVENT(nfsd_stid_class, nfsd_stid_##name, \ DEFINE_STID_EVENT(revoke); +TRACE_EVENT(nfsd_stateowner_replay, + TP_PROTO( + u32 opnum, + const struct nfs4_replay *rp + ), + TP_ARGS(opnum, rp), + TP_STRUCT__entry( + __field(unsigned long, status) + __field(u32, opnum) + ), + TP_fast_assign( + __entry->status = be32_to_cpu(rp->rp_status); + __entry->opnum = opnum; + ), + TP_printk("opnum=%u status=%lu", + __entry->opnum, __entry->status) +); + +TRACE_EVENT_CONDITION(nfsd_seq4_status, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct nfsd4_sequence *sequence + ), + TP_ARGS(rqstp, sequence), + TP_CONDITION(sequence->status_flags), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(u32, xid) + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, seqno) + __field(u32, reserved) + __field(unsigned long, status_flags) + ), + TP_fast_assign( + const struct nfsd4_sessionid *sid = + (struct nfsd4_sessionid *)&sequence->sessionid; + + __entry->netns_ino = SVC_NET(rqstp)->ns.inum; + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->cl_boot = sid->clientid.cl_boot; + __entry->cl_id = sid->clientid.cl_id; + __entry->seqno = sid->sequence; + __entry->reserved = sid->reserved; + __entry->status_flags = sequence->status_flags; + ), + TP_printk("xid=0x%08x 
sessionid=%08x:%08x:%08x:%08x status_flags=%s", + __entry->xid, __entry->cl_boot, __entry->cl_id, + __entry->seqno, __entry->reserved, + show_nfs4_seq4_status(__entry->status_flags) + ) +); + DECLARE_EVENT_CLASS(nfsd_clientid_class, TP_PROTO(const clientid_t *clid), TP_ARGS(clid), @@ -1334,7 +1387,8 @@ DEFINE_EVENT(nfsd_cb_class, nfsd_cb_##name, \ TP_PROTO(const struct nfs4_client *clp), \ TP_ARGS(clp)) -DEFINE_NFSD_CB_EVENT(state); +DEFINE_NFSD_CB_EVENT(start); +DEFINE_NFSD_CB_EVENT(new_state); DEFINE_NFSD_CB_EVENT(probe); DEFINE_NFSD_CB_EVENT(lost); DEFINE_NFSD_CB_EVENT(shutdown); @@ -1405,6 +1459,128 @@ TRACE_EVENT(nfsd_cb_setup_err, __entry->error) ); +DECLARE_EVENT_CLASS(nfsd_cb_lifetime_class, + TP_PROTO( + const struct nfs4_client *clp, + const struct nfsd4_callback *cb + ), + TP_ARGS(clp, cb), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __field(const void *, cb) + __field(bool, need_restart) + __sockaddr(addr, clp->cl_cb_conn.cb_addrlen) + ), + TP_fast_assign( + __entry->cl_boot = clp->cl_clientid.cl_boot; + __entry->cl_id = clp->cl_clientid.cl_id; + __entry->cb = cb; + __entry->need_restart = cb->cb_need_restart; + __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr, + clp->cl_cb_conn.cb_addrlen) + ), + TP_printk("addr=%pISpc client %08x:%08x cb=%p%s", + __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id, + __entry->cb, __entry->need_restart ? + " (need restart)" : " (first try)" + ) +); + +#define DEFINE_NFSD_CB_LIFETIME_EVENT(name) \ +DEFINE_EVENT(nfsd_cb_lifetime_class, nfsd_cb_##name, \ + TP_PROTO( \ + const struct nfs4_client *clp, \ + const struct nfsd4_callback *cb \ + ), \ + TP_ARGS(clp, cb)) + +DEFINE_NFSD_CB_LIFETIME_EVENT(queue); +DEFINE_NFSD_CB_LIFETIME_EVENT(destroy); +DEFINE_NFSD_CB_LIFETIME_EVENT(restart); +DEFINE_NFSD_CB_LIFETIME_EVENT(bc_update); +DEFINE_NFSD_CB_LIFETIME_EVENT(bc_shutdown); + +TRACE_EVENT(nfsd_cb_seq_status, + TP_PROTO( + const struct rpc_task *task, + const struct nfsd4_callback *cb + ), + TP_ARGS(task, cb), + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, seqno) + __field(u32, reserved) + __field(int, tk_status) + __field(int, seq_status) + ), + TP_fast_assign( + const struct nfs4_client *clp = cb->cb_clp; + const struct nfsd4_session *session = clp->cl_cb_session; + const struct nfsd4_sessionid *sid = + (struct nfsd4_sessionid *)&session->se_sessionid; + + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client ? 
+ task->tk_client->cl_clid : -1; + __entry->cl_boot = sid->clientid.cl_boot; + __entry->cl_id = sid->clientid.cl_id; + __entry->seqno = sid->sequence; + __entry->reserved = sid->reserved; + __entry->tk_status = task->tk_status; + __entry->seq_status = cb->cb_seq_status; + ), + TP_printk(SUNRPC_TRACE_TASK_SPECIFIER + " sessionid=%08x:%08x:%08x:%08x tk_status=%d seq_status=%d\n", + __entry->task_id, __entry->client_id, + __entry->cl_boot, __entry->cl_id, + __entry->seqno, __entry->reserved, + __entry->tk_status, __entry->seq_status + ) +); + +TRACE_EVENT(nfsd_cb_free_slot, + TP_PROTO( + const struct rpc_task *task, + const struct nfsd4_callback *cb + ), + TP_ARGS(task, cb), + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, seqno) + __field(u32, reserved) + __field(u32, slot_seqno) + ), + TP_fast_assign( + const struct nfs4_client *clp = cb->cb_clp; + const struct nfsd4_session *session = clp->cl_cb_session; + const struct nfsd4_sessionid *sid = + (struct nfsd4_sessionid *)&session->se_sessionid; + + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client ? + task->tk_client->cl_clid : -1; + __entry->cl_boot = sid->clientid.cl_boot; + __entry->cl_id = sid->clientid.cl_id; + __entry->seqno = sid->sequence; + __entry->reserved = sid->reserved; + __entry->slot_seqno = session->se_cb_seq_nr; + ), + TP_printk(SUNRPC_TRACE_TASK_SPECIFIER + " sessionid=%08x:%08x:%08x:%08x new slot seqno=%u\n", + __entry->task_id, __entry->client_id, + __entry->cl_boot, __entry->cl_id, + __entry->seqno, __entry->reserved, + __entry->slot_seqno + ) +); + TRACE_EVENT_CONDITION(nfsd_cb_recall, TP_PROTO( const struct nfs4_stid *stid diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index b7c7a9273ea0..6a9464262fae 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -25,7 +25,6 @@ #include <linux/posix_acl_xattr.h> #include <linux/xattr.h> #include <linux/jhash.h> -#include <linux/ima.h> #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/uaccess.h> @@ -476,7 +475,6 @@ static int __nfsd_setattr(struct dentry *dentry, struct iattr *iap) * @rqstp: controlling RPC transaction * @fhp: filehandle of target * @attr: attributes to set - * @check_guard: set to 1 if guardtime is a valid timestamp * @guardtime: do not act if ctime.tv_sec does not match this timestamp * * This call may adjust the contents of @attr (in particular, this @@ -488,8 +486,7 @@ static int __nfsd_setattr(struct dentry *dentry, struct iattr *iap) */ __be32 nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, - struct nfsd_attrs *attr, - int check_guard, time64_t guardtime) + struct nfsd_attrs *attr, const struct timespec64 *guardtime) { struct dentry *dentry; struct inode *inode; @@ -497,7 +494,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, int accmode = NFSD_MAY_SATTR; umode_t ftype = 0; __be32 err; - int host_err; + int host_err = 0; bool get_write_count; bool size_change = (iap->ia_valid & ATTR_SIZE); int retries; @@ -538,9 +535,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, nfsd_sanitize_attrs(inode, iap); - if (check_guard && guardtime != inode_get_ctime_sec(inode)) - return nfserr_notsync; - /* * The size case is special, it changes the file in addition to the * attributes, and file systems don't expect it to be mixed with @@ -555,6 +549,19 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, } inode_lock(inode); + err = fh_fill_pre_attrs(fhp); + if (err) + goto out_unlock; + + if (guardtime) 
{ + struct timespec64 ctime = inode_get_ctime(inode); + if ((u32)guardtime->tv_sec != (u32)ctime.tv_sec || + guardtime->tv_nsec != ctime.tv_nsec) { + err = nfserr_notsync; + goto out_fill_attrs; + } + } + for (retries = 1;;) { struct iattr attrs; @@ -582,13 +589,23 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, attr->na_aclerr = set_posix_acl(&nop_mnt_idmap, dentry, ACL_TYPE_DEFAULT, attr->na_dpacl); +out_fill_attrs: + /* + * RFC 1813 Section 3.3.2 does not mandate that an NFS server + * return wcc_data for SETATTR. Some client implementations + * depend on receiving wcc_data, however, to sort out partial + * updates (e.g., the client requested that size and mode be + * modified, but the server changed only the file mode). + */ + fh_fill_post_attrs(fhp); +out_unlock: inode_unlock(inode); if (size_change) put_write_access(inode); out: if (!host_err) host_err = commit_metadata(fhp); - return nfserrno(host_err); + return err != 0 ? err : nfserrno(host_err); } #if defined(CONFIG_NFSD_V4) @@ -877,7 +894,7 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, goto out; } - host_err = ima_file_check(file, may_flags); + host_err = security_file_post_open(file, may_flags); if (host_err) { fput(file); goto out; } @@ -1002,7 +1019,9 @@ static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned long *count, u32 *eof, ssize_t host_err) { if (host_err >= 0) { - nfsd_stats_io_read_add(fhp->fh_export, host_err); + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + nfsd_stats_io_read_add(nn, fhp->fh_export, host_err); *eof = nfsd_eof_on_read(file, offset, host_err, *count); *count = host_err; fsnotify_access(file); @@ -1185,7 +1204,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, goto out_nfserr; } *cnt = host_err; - nfsd_stats_io_write_add(exp, *cnt); + nfsd_stats_io_write_add(nn, exp, *cnt); fsnotify_modify(file); host_err = filemap_check_wb_err(file->f_mapping, since); if (host_err < 0) @@ -1404,7 +1423,7 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, * if the attributes have not changed. */ if (iap->ia_valid) - status = nfsd_setattr(rqstp, resfhp, attrs, 0, (time64_t)0); + status = nfsd_setattr(rqstp, resfhp, attrs, NULL); else status = nfserrno(commit_metadata(resfhp)); @@ -1906,10 +1925,10 @@ out_unlock: fh_drop_write(ffhp); /* - * If the target dentry has cached open files, then we need to try to - * close them prior to doing the rename. Flushing delayed fput - * shouldn't be done with locks held however, so we delay it until this - * point and then reattempt the whole shebang. + * If the target dentry has cached open files, then we need to + * try to close them prior to doing the rename. Final fput + * shouldn't be done with locks held, however, so we delay it + * until this point and then reattempt the whole shebang. */ if (close_cached) { close_cached = false; @@ -2177,11 +2196,43 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, if (err == nfserr_eof || err == nfserr_toosmall) err = nfs_ok; /* can still be found in ->err */ out_close: - fput(file); + nfsd_filp_close(file); out: return err; }
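An aside on the guard check added in the nfsd_setattr() hunk above: NFSv3's nfstime3 (RFC 1813) carries seconds as a 32-bit value on the wire, which is why the new code compares only the low 32 bits of the inode's 64-bit ctime seconds while matching nanoseconds exactly. Below is a minimal userspace sketch of that comparison; struct ts64 and guard_ctime_matches() are hypothetical stand-ins for the kernel types, not part of the patch.

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical stand-in for the kernel's struct timespec64. */
struct ts64 {
	int64_t tv_sec;
	long tv_nsec;
};

/* The guard matches only if the truncated seconds and the exact
 * nanoseconds both agree, mirroring the check in nfsd_setattr(). */
static bool guard_ctime_matches(const struct ts64 *guard,
				const struct ts64 *ctime)
{
	return (uint32_t)guard->tv_sec == (uint32_t)ctime->tv_sec &&
	       guard->tv_nsec == ctime->tv_nsec;
}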
+/** + * nfsd_filp_close - close a file synchronously + * @fp: the file to close + * + * nfsd_filp_close() is similar in behaviour to filp_close(). + * The difference is that if this is the final close on the + * file, then finalisation happens immediately, rather than + * being handed over to a work queue, as is the case for + * filp_close(). + * When a user-space process closes a file (even when using + * filp_close()) the finalisation happens before returning to + * userspace, so it is effectively synchronous. When a kernel thread + * uses filp_close(), on the other hand, the handling is completely + * asynchronous. This means that any cost imposed by that finalisation + * is not imposed on the nfsd thread, and nfsd could potentially + * close files more quickly than the work queue finalises the close, + * which would lead to unbounded growth in the queue. + * + * In some contexts it is not safe to synchronously wait for + * close finalisation (see the comment for __fput_sync()), but nfsd + * does not match those contexts. In particular it does not, at the + * time that this function is called, hold any locks, and no finalisation + * of any file, socket, or device driver would have any cause to wait + * for nfsd to make progress. + */ +void nfsd_filp_close(struct file *fp) +{ + get_file(fp); + filp_close(fp, NULL); + __fput_sync(fp); +} +
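The new helper combines three existing VFS calls. The annotated sketch below spells out the reference-counting reasoning, assuming, as the kernel-doc above states, that the caller holds no locks; sync_close_sketch() is a hypothetical name, while get_file(), filp_close() and __fput_sync() are the real entry points being combined.

#include <linux/file.h>
#include <linux/fs.h>

static void sync_close_sketch(struct file *fp)
{
	/* Take an extra reference so filp_close() cannot drop the
	 * last one and leave finalisation to asynchronous machinery. */
	get_file(fp);

	/* Do the normal close work (release locks, call ->flush());
	 * its internal fput() only drops our extra reference. */
	filp_close(fp, NULL);

	/* Drop the final reference here so finalisation runs
	 * synchronously on this thread before we return. */
	__fput_sync(fp);
}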
/* * Get file system stats * N.B. After this call fhp needs an fh_put diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 702fbc4483bf..c60fdb6200fd 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -69,7 +69,7 @@ __be32 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *, const char *, unsigned int, struct svc_export **, struct dentry **); __be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *, - struct nfsd_attrs *, int, time64_t); + struct nfsd_attrs *, const struct timespec64 *); int nfsd_mountpoint(struct dentry *, struct svc_export *); #ifdef CONFIG_NFSD_V4 __be32 nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *, @@ -148,6 +148,8 @@ __be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *, __be32 nfsd_permission(struct svc_rqst *, struct svc_export *, struct dentry *, int); +void nfsd_filp_close(struct file *fp); + static inline int fh_want_write(struct svc_fh *fh) { int ret; diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index 03fe4e21306c..522067b7fd75 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -14,7 +14,7 @@ struct nfsd3_sattrargs { struct svc_fh fh; struct iattr attrs; int check_guard; - time64_t guardtime; + struct timespec64 guardtime; }; struct nfsd3_diropargs { diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h index 0d39af1b00a0..e8b00309c449 100644 --- a/fs/nfsd/xdr4cb.h +++ b/fs/nfsd/xdr4cb.h @@ -54,3 +54,21 @@ #define NFS4_dec_cb_recall_any_sz (cb_compound_dec_hdr_sz + \ cb_sequence_dec_sz + \ op_dec_sz) + +/* + * 1: CB_GETATTR opcode (32-bit) + * N: file_handle + * 1: number of entries in attribute array (32-bit) + * 1: entry 0 in attribute array (32-bit) + */ +#define NFS4_enc_cb_getattr_sz (cb_compound_enc_hdr_sz + \ + cb_sequence_enc_sz + \ + 1 + enc_nfs4_fh_sz + 1 + 1) +/* + * 4: fattr_bitmap_maxsz + * 1: attribute array len + * 2: change attr (64-bit) + * 2: size (64-bit) + */ +#define NFS4_dec_cb_getattr_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + 4 + 1 + 2 + 2 + op_dec_sz) diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index bec33b89a075..0e3fc5ba33c7 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -107,7 +107,13 @@ static vm_fault_t nilfs_page_mkwrite(struct vm_fault *vmf) nilfs_transaction_commit(inode->i_sb); mapped: - folio_wait_stable(folio); + /* + * Since checksumming that covers the data blocks is performed to + * determine the validity of the log to be written and used for + * recovery, it is necessary to wait for writeback to finish here, + * regardless of the stable write requirement of the backing device. + */ + folio_wait_writeback(folio); out: sb_end_pagefault(inode->i_sb); return vmf_fs_error(ret); diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index 0955b657938f..a9b8d77c8c1d 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -472,9 +472,10 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs, static int nilfs_recovery_copy_block(struct the_nilfs *nilfs, struct nilfs_recovery_block *rb, - struct page *page) + loff_t pos, struct page *page) { struct buffer_head *bh_org; + size_t from = pos & ~PAGE_MASK; void *kaddr; bh_org = __bread(nilfs->ns_bdev, rb->blocknr, nilfs->ns_blocksize); @@ -482,7 +483,7 @@ static int nilfs_recovery_copy_block(struct the_nilfs *nilfs, return -EIO; kaddr = kmap_atomic(page); - memcpy(kaddr + bh_offset(bh_org), bh_org->b_data, bh_org->b_size); + memcpy(kaddr + from, bh_org->b_data, bh_org->b_size); kunmap_atomic(kaddr); brelse(bh_org); return 0; @@ -521,7 +522,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, goto failed_inode; } - err = nilfs_recovery_copy_block(nilfs, rb, page); + err = nilfs_recovery_copy_block(nilfs, rb, pos, page); if (unlikely(err)) goto failed_page; diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 2590a0860eab..2bfb08052d39 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1703,7 +1703,6 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci) list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { - set_buffer_async_write(bh); if (bh == segbuf->sb_super_root) { if (bh->b_folio != bd_folio) { folio_lock(bd_folio); @@ -1714,6 +1713,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci) } break; } + set_buffer_async_write(bh); if (bh->b_folio != fs_folio) { nilfs_begin_folio_io(fs_folio); fs_folio = bh->b_folio; @@ -1800,7 +1800,6 @@ static void nilfs_abort_logs(struct list_head *logs, int err) list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { - clear_buffer_async_write(bh); if (bh == segbuf->sb_super_root) { clear_buffer_uptodate(bh); if (bh->b_folio != bd_folio) { @@ -1809,6 +1808,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err) } break; } + clear_buffer_async_write(bh); if (bh->b_folio != fs_folio) { nilfs_end_folio_io(fs_folio, err); fs_folio = bh->b_folio; @@ -1896,8 +1896,9 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) BIT(BH_Delay) | BIT(BH_NILFS_Volatile) | BIT(BH_NILFS_Redirected)); - set_mask_bits(&bh->b_state, clear_bits, set_bits); if (bh == segbuf->sb_super_root) { + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); if (bh->b_folio != bd_folio) { folio_end_writeback(bd_folio); bd_folio = bh->b_folio; @@ -1905,6 +1906,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) update_sr = true; break; } + set_mask_bits(&bh->b_state, clear_bits, set_bits); if (bh->b_folio != fs_folio) { nilfs_end_folio_io(fs_folio, 0); fs_folio = bh->b_folio; diff --git a/fs/nsfs.c b/fs/nsfs.c index 34e1e3e36733..7aaafb5cb9fc 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -27,26 +27,17 @@ static const struct file_operations ns_file_operations = { static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) { struct inode *inode = d_inode(dentry); - const struct proc_ns_operations *ns_ops = dentry->d_fsdata; + struct ns_common *ns = inode->i_private; + const struct proc_ns_operations *ns_ops = ns->ops; return dynamic_dname(buffer, buflen, "%s:[%lu]", ns_ops->name, inode->i_ino); } -static void ns_prune_dentry(struct dentry *dentry) -{
- struct inode *inode = d_inode(dentry); - if (inode) { - struct ns_common *ns = inode->i_private; - atomic_long_set(&ns->stashed, 0); - } -} - -const struct dentry_operations ns_dentry_operations = -{ - .d_prune = ns_prune_dentry, +const struct dentry_operations ns_dentry_operations = { .d_delete = always_delete_dentry, .d_dname = ns_dname, + .d_prune = stashed_dentry_prune, }; static void nsfs_evict(struct inode *inode) @@ -56,67 +47,16 @@ static void nsfs_evict(struct inode *inode) ns->ops->put(ns); } -static int __ns_get_path(struct path *path, struct ns_common *ns) -{ - struct vfsmount *mnt = nsfs_mnt; - struct dentry *dentry; - struct inode *inode; - unsigned long d; - - rcu_read_lock(); - d = atomic_long_read(&ns->stashed); - if (!d) - goto slow; - dentry = (struct dentry *)d; - if (!lockref_get_not_dead(&dentry->d_lockref)) - goto slow; - rcu_read_unlock(); - ns->ops->put(ns); -got_it: - path->mnt = mntget(mnt); - path->dentry = dentry; - return 0; -slow: - rcu_read_unlock(); - inode = new_inode_pseudo(mnt->mnt_sb); - if (!inode) { - ns->ops->put(ns); - return -ENOMEM; - } - inode->i_ino = ns->inum; - simple_inode_init_ts(inode); - inode->i_flags |= S_IMMUTABLE; - inode->i_mode = S_IFREG | S_IRUGO; - inode->i_fop = &ns_file_operations; - inode->i_private = ns; - - dentry = d_make_root(inode); /* not the normal use, but... */ - if (!dentry) - return -ENOMEM; - dentry->d_fsdata = (void *)ns->ops; - d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry); - if (d) { - d_delete(dentry); /* make sure ->d_prune() does nothing */ - dput(dentry); - cpu_relax(); - return -EAGAIN; - } - goto got_it; -} - int ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb, void *private_data) { - int ret; + struct ns_common *ns; - do { - struct ns_common *ns = ns_get_cb(private_data); - if (!ns) - return -ENOENT; - ret = __ns_get_path(path, ns); - } while (ret == -EAGAIN); + ns = ns_get_cb(private_data); + if (!ns) + return -ENOENT; - return ret; + return path_from_stashed(&ns->stashed, ns->inum, nsfs_mnt, ns, path); } struct ns_get_path_task_args { @@ -146,6 +86,7 @@ int open_related_ns(struct ns_common *ns, struct ns_common *(*get_ns)(struct ns_common *ns)) { struct path path = {}; + struct ns_common *relative; struct file *f; int err; int fd; @@ -154,19 +95,15 @@ int open_related_ns(struct ns_common *ns, if (fd < 0) return fd; - do { - struct ns_common *relative; - - relative = get_ns(ns); - if (IS_ERR(relative)) { - put_unused_fd(fd); - return PTR_ERR(relative); - } - - err = __ns_get_path(&path, relative); - } while (err == -EAGAIN); + relative = get_ns(ns); + if (IS_ERR(relative)) { + put_unused_fd(fd); + return PTR_ERR(relative); + } - if (err) { + err = path_from_stashed(&relative->stashed, relative->inum, nsfs_mnt, + relative, &path); + if (err < 0) { put_unused_fd(fd); return err; } @@ -249,7 +186,8 @@ bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino) static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry) { struct inode *inode = d_inode(dentry); - const struct proc_ns_operations *ns_ops = dentry->d_fsdata; + const struct ns_common *ns = inode->i_private; + const struct proc_ns_operations *ns_ops = ns->ops; seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino); return 0; @@ -261,6 +199,24 @@ static const struct super_operations nsfs_ops = { .show_path = nsfs_show_path, }; +static void nsfs_init_inode(struct inode *inode, void *data) +{ + inode->i_private = data; + inode->i_mode |= S_IRUGO; + inode->i_fop = &ns_file_operations; +} + +static 
void nsfs_put_data(void *data) +{ + struct ns_common *ns = data; + ns->ops->put(ns); +} + +static const struct stashed_operations nsfs_stashed_ops = { + .init_inode = nsfs_init_inode, + .put_data = nsfs_put_data, +}; + static int nsfs_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC); @@ -268,6 +224,7 @@ static int nsfs_init_fs_context(struct fs_context *fc) return -ENOMEM; ctx->ops = &nsfs_ops; ctx->dops = &ns_dentry_operations; + fc->s_fs_info = (void *)&nsfs_stashed_ops; return 0; } diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig deleted file mode 100644 index 7b2509741735..000000000000 --- a/fs/ntfs/Kconfig +++ /dev/null @@ -1,81 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -config NTFS_FS - tristate "NTFS file system support" - select BUFFER_HEAD - select NLS - help - NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003. - - Saying Y or M here enables read support. There is partial, but - safe, write support available. For write support you must also - say Y to "NTFS write support" below. - - There are also a number of user-space tools available, called - ntfsprogs. These include ntfsundelete and ntfsresize, that work - without NTFS support enabled in the kernel. - - This is a rewrite from scratch of Linux NTFS support and replaced - the old NTFS code starting with Linux 2.5.11. A backport to - the Linux 2.4 kernel series is separately available as a patch - from the project web site. - - For more information see <file:Documentation/filesystems/ntfs.rst> - and <http://www.linux-ntfs.org/>. - - To compile this file system support as a module, choose M here: the - module will be called ntfs. - - If you are not using Windows NT, 2000, XP or 2003 in addition to - Linux on your computer it is safe to say N. - -config NTFS_DEBUG - bool "NTFS debugging support" - depends on NTFS_FS - help - If you are experiencing any problems with the NTFS file system, say - Y here. This will result in additional consistency checks to be - performed by the driver as well as additional debugging messages to - be written to the system log. Note that debugging messages are - disabled by default. To enable them, supply the option debug_msgs=1 - at the kernel command line when booting the kernel or as an option - to insmod when loading the ntfs module. Once the driver is active, - you can enable debugging messages by doing (as root): - echo 1 > /proc/sys/fs/ntfs-debug - Replacing the "1" with "0" would disable debug messages. - - If you leave debugging messages disabled, this results in little - overhead, but enabling debug messages results in very significant - slowdown of the system. - - When reporting bugs, please try to have available a full dump of - debugging messages while the misbehaviour was occurring. - -config NTFS_RW - bool "NTFS write support" - depends on NTFS_FS - depends on PAGE_SIZE_LESS_THAN_64KB - help - This enables the partial, but safe, write support in the NTFS driver. - - The only supported operation is overwriting existing files, without - changing the file length. No file or directory creation, deletion or - renaming is possible. Note only non-resident files can be written to - so you may find that some very small files (<500 bytes or so) cannot - be written to. - - While we cannot guarantee that it will not damage any data, we have - so far not received a single report where the driver would have - damaged someones data so we assume it is perfectly safe to use. 
- - Note: While write support is safe in this version (a rewrite from - scratch of the NTFS support), it should be noted that the old NTFS - write support, included in Linux 2.5.10 and before (since 1997), - is not safe. - - This is currently useful with TopologiLinux. TopologiLinux is run - on top of any DOS/Microsoft Windows system without partitioning your - hard disk. Unlike other Linux distributions TopologiLinux does not - need its own partition. For more information see - <http://topologi-linux.sourceforge.net/> - - It is perfectly safe to say N here. diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile deleted file mode 100644 index 3e736572ed00..000000000000 --- a/fs/ntfs/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# Rules for making the NTFS driver. - -obj-$(CONFIG_NTFS_FS) += ntfs.o - -ntfs-y := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \ - index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \ - unistr.o upcase.o - -ntfs-$(CONFIG_NTFS_RW) += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o - -ccflags-y := -DNTFS_VERSION=\"2.1.32\" -ccflags-$(CONFIG_NTFS_DEBUG) += -DDEBUG -ccflags-$(CONFIG_NTFS_RW) += -DNTFS_RW - diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c deleted file mode 100644 index 2d01517a2d59..000000000000 --- a/fs/ntfs/aops.c +++ /dev/null @@ -1,1744 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * aops.c - NTFS kernel address space operations and page cache handling. - * - * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc. - * Copyright (c) 2002 Richard Russon - */ - -#include <linux/errno.h> -#include <linux/fs.h> -#include <linux/gfp.h> -#include <linux/mm.h> -#include <linux/pagemap.h> -#include <linux/swap.h> -#include <linux/buffer_head.h> -#include <linux/writeback.h> -#include <linux/bit_spinlock.h> -#include <linux/bio.h> - -#include "aops.h" -#include "attrib.h" -#include "debug.h" -#include "inode.h" -#include "mft.h" -#include "runlist.h" -#include "types.h" -#include "ntfs.h" - -/** - * ntfs_end_buffer_async_read - async io completion for reading attributes - * @bh: buffer head on which io is completed - * @uptodate: whether @bh is now uptodate or not - * - * Asynchronous I/O completion handler for reading pages belonging to the - * attribute address space of an inode. The inodes can either be files or - * directories or they can be fake inodes describing some attribute. - * - * If NInoMstProtected(), perform the post read mst fixups when all IO on the - * page has been completed and mark the page uptodate or set the error bit on - * the page. To determine the size of the records that need fixing up, we - * cheat a little bit by setting the index_block_size in ntfs_inode to the ntfs - * record size, and index_block_size_bits, to the log(base 2) of the ntfs - * record size. - */ -static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) -{ - unsigned long flags; - struct buffer_head *first, *tmp; - struct page *page; - struct inode *vi; - ntfs_inode *ni; - int page_uptodate = 1; - - page = bh->b_page; - vi = page->mapping->host; - ni = NTFS_I(vi); - - if (likely(uptodate)) { - loff_t i_size; - s64 file_ofs, init_size; - - set_buffer_uptodate(bh); - - file_ofs = ((s64)page->index << PAGE_SHIFT) + - bh_offset(bh); - read_lock_irqsave(&ni->size_lock, flags); - init_size = ni->initialized_size; - i_size = i_size_read(vi); - read_unlock_irqrestore(&ni->size_lock, flags); - if (unlikely(init_size > i_size)) { - /* Race with shrinking truncate. 
*/ - init_size = i_size; - } - /* Check for the current buffer head overflowing. */ - if (unlikely(file_ofs + bh->b_size > init_size)) { - int ofs; - void *kaddr; - - ofs = 0; - if (file_ofs < init_size) - ofs = init_size - file_ofs; - kaddr = kmap_atomic(page); - memset(kaddr + bh_offset(bh) + ofs, 0, - bh->b_size - ofs); - flush_dcache_page(page); - kunmap_atomic(kaddr); - } - } else { - clear_buffer_uptodate(bh); - SetPageError(page); - ntfs_error(ni->vol->sb, "Buffer I/O error, logical block " - "0x%llx.", (unsigned long long)bh->b_blocknr); - } - first = page_buffers(page); - spin_lock_irqsave(&first->b_uptodate_lock, flags); - clear_buffer_async_read(bh); - unlock_buffer(bh); - tmp = bh; - do { - if (!buffer_uptodate(tmp)) - page_uptodate = 0; - if (buffer_async_read(tmp)) { - if (likely(buffer_locked(tmp))) - goto still_busy; - /* Async buffers must be locked. */ - BUG(); - } - tmp = tmp->b_this_page; - } while (tmp != bh); - spin_unlock_irqrestore(&first->b_uptodate_lock, flags); - /* - * If none of the buffers had errors then we can set the page uptodate, - * but we first have to perform the post read mst fixups, if the - * attribute is mst protected, i.e. if NInoMstProteced(ni) is true. - * Note we ignore fixup errors as those are detected when - * map_mft_record() is called which gives us per record granularity - * rather than per page granularity. - */ - if (!NInoMstProtected(ni)) { - if (likely(page_uptodate && !PageError(page))) - SetPageUptodate(page); - } else { - u8 *kaddr; - unsigned int i, recs; - u32 rec_size; - - rec_size = ni->itype.index.block_size; - recs = PAGE_SIZE / rec_size; - /* Should have been verified before we got here... */ - BUG_ON(!recs); - kaddr = kmap_atomic(page); - for (i = 0; i < recs; i++) - post_read_mst_fixup((NTFS_RECORD*)(kaddr + - i * rec_size), rec_size); - kunmap_atomic(kaddr); - flush_dcache_page(page); - if (likely(page_uptodate && !PageError(page))) - SetPageUptodate(page); - } - unlock_page(page); - return; -still_busy: - spin_unlock_irqrestore(&first->b_uptodate_lock, flags); - return; -} - -/** - * ntfs_read_block - fill a @folio of an address space with data - * @folio: page cache folio to fill with data - * - * We read each buffer asynchronously and when all buffers are read in, our io - * completion handler ntfs_end_buffer_read_async(), if required, automatically - * applies the mst fixups to the folio before finally marking it uptodate and - * unlocking it. - * - * We only enforce allocated_size limit because i_size is checked for in - * generic_file_read(). - * - * Return 0 on success and -errno on error. - * - * Contains an adapted version of fs/buffer.c::block_read_full_folio(). - */ -static int ntfs_read_block(struct folio *folio) -{ - loff_t i_size; - VCN vcn; - LCN lcn; - s64 init_size; - struct inode *vi; - ntfs_inode *ni; - ntfs_volume *vol; - runlist_element *rl; - struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; - sector_t iblock, lblock, zblock; - unsigned long flags; - unsigned int blocksize, vcn_ofs; - int i, nr; - unsigned char blocksize_bits; - - vi = folio->mapping->host; - ni = NTFS_I(vi); - vol = ni->vol; - - /* $MFT/$DATA must have its complete runlist in memory at all times. */ - BUG_ON(!ni->runlist.rl && !ni->mft_no && !NInoAttr(ni)); - - blocksize = vol->sb->s_blocksize; - blocksize_bits = vol->sb->s_blocksize_bits; - - head = folio_buffers(folio); - if (!head) - head = create_empty_buffers(folio, blocksize, 0); - bh = head; - - /* - * We may be racing with truncate. 
To avoid some of the problems we - * now take a snapshot of the various sizes and use those for the whole - * of the function. In case of an extending truncate it just means we - * may leave some buffers unmapped which are now allocated. This is - * not a problem since these buffers will just get mapped when a write - * occurs. In case of a shrinking truncate, we will detect this later - * on due to the runlist being incomplete and if the folio is being - * fully truncated, truncate will throw it away as soon as we unlock - * it so no need to worry what we do with it. - */ - iblock = (s64)folio->index << (PAGE_SHIFT - blocksize_bits); - read_lock_irqsave(&ni->size_lock, flags); - lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits; - init_size = ni->initialized_size; - i_size = i_size_read(vi); - read_unlock_irqrestore(&ni->size_lock, flags); - if (unlikely(init_size > i_size)) { - /* Race with shrinking truncate. */ - init_size = i_size; - } - zblock = (init_size + blocksize - 1) >> blocksize_bits; - - /* Loop through all the buffers in the folio. */ - rl = NULL; - nr = i = 0; - do { - int err = 0; - - if (unlikely(buffer_uptodate(bh))) - continue; - if (unlikely(buffer_mapped(bh))) { - arr[nr++] = bh; - continue; - } - bh->b_bdev = vol->sb->s_bdev; - /* Is the block within the allowed limits? */ - if (iblock < lblock) { - bool is_retry = false; - - /* Convert iblock into corresponding vcn and offset. */ - vcn = (VCN)iblock << blocksize_bits >> - vol->cluster_size_bits; - vcn_ofs = ((VCN)iblock << blocksize_bits) & - vol->cluster_size_mask; - if (!rl) { -lock_retry_remap: - down_read(&ni->runlist.lock); - rl = ni->runlist.rl; - } - if (likely(rl != NULL)) { - /* Seek to element containing target vcn. */ - while (rl->length && rl[1].vcn <= vcn) - rl++; - lcn = ntfs_rl_vcn_to_lcn(rl, vcn); - } else - lcn = LCN_RL_NOT_MAPPED; - /* Successful remap. */ - if (lcn >= 0) { - /* Setup buffer head to correct block. */ - bh->b_blocknr = ((lcn << vol->cluster_size_bits) - + vcn_ofs) >> blocksize_bits; - set_buffer_mapped(bh); - /* Only read initialized data blocks. */ - if (iblock < zblock) { - arr[nr++] = bh; - continue; - } - /* Fully non-initialized data block, zero it. */ - goto handle_zblock; - } - /* It is a hole, need to zero it. */ - if (lcn == LCN_HOLE) - goto handle_hole; - /* If first try and runlist unmapped, map and retry. */ - if (!is_retry && lcn == LCN_RL_NOT_MAPPED) { - is_retry = true; - /* - * Attempt to map runlist, dropping lock for - * the duration. - */ - up_read(&ni->runlist.lock); - err = ntfs_map_runlist(ni, vcn); - if (likely(!err)) - goto lock_retry_remap; - rl = NULL; - } else if (!rl) - up_read(&ni->runlist.lock); - /* - * If buffer is outside the runlist, treat it as a - * hole. This can happen due to concurrent truncate - * for example. - */ - if (err == -ENOENT || lcn == LCN_ENOENT) { - err = 0; - goto handle_hole; - } - /* Hard error, zero out region. */ - if (!err) - err = -EIO; - bh->b_blocknr = -1; - folio_set_error(folio); - ntfs_error(vol->sb, "Failed to read from inode 0x%lx, " - "attribute type 0x%x, vcn 0x%llx, " - "offset 0x%x because its location on " - "disk could not be determined%s " - "(error code %i).", ni->mft_no, - ni->type, (unsigned long long)vcn, - vcn_ofs, is_retry ? " even after " - "retrying" : "", err); - } - /* - * Either iblock was outside lblock limits or - * ntfs_rl_vcn_to_lcn() returned error. Just zero that portion - * of the folio and set the buffer uptodate. 
- */ -handle_hole: - bh->b_blocknr = -1UL; - clear_buffer_mapped(bh); -handle_zblock: - folio_zero_range(folio, i * blocksize, blocksize); - if (likely(!err)) - set_buffer_uptodate(bh); - } while (i++, iblock++, (bh = bh->b_this_page) != head); - - /* Release the lock if we took it. */ - if (rl) - up_read(&ni->runlist.lock); - - /* Check we have at least one buffer ready for i/o. */ - if (nr) { - struct buffer_head *tbh; - - /* Lock the buffers. */ - for (i = 0; i < nr; i++) { - tbh = arr[i]; - lock_buffer(tbh); - tbh->b_end_io = ntfs_end_buffer_async_read; - set_buffer_async_read(tbh); - } - /* Finally, start i/o on the buffers. */ - for (i = 0; i < nr; i++) { - tbh = arr[i]; - if (likely(!buffer_uptodate(tbh))) - submit_bh(REQ_OP_READ, tbh); - else - ntfs_end_buffer_async_read(tbh, 1); - } - return 0; - } - /* No i/o was scheduled on any of the buffers. */ - if (likely(!folio_test_error(folio))) - folio_mark_uptodate(folio); - else /* Signal synchronous i/o error. */ - nr = -EIO; - folio_unlock(folio); - return nr; -} - -/** - * ntfs_read_folio - fill a @folio of a @file with data from the device - * @file: open file to which the folio @folio belongs or NULL - * @folio: page cache folio to fill with data - * - * For non-resident attributes, ntfs_read_folio() fills the @folio of the open - * file @file by calling the ntfs version of the generic block_read_full_folio() - * function, ntfs_read_block(), which in turn creates and reads in the buffers - * associated with the folio asynchronously. - * - * For resident attributes, OTOH, ntfs_read_folio() fills @folio by copying the - * data from the mft record (which at this stage is most likely in memory) and - * fills the remainder with zeroes. Thus, in this case, I/O is synchronous, as - * even if the mft record is not cached at this point in time, we need to wait - * for it to be read in before we can do the copy. - * - * Return 0 on success and -errno on error. - */ -static int ntfs_read_folio(struct file *file, struct folio *folio) -{ - struct page *page = &folio->page; - loff_t i_size; - struct inode *vi; - ntfs_inode *ni, *base_ni; - u8 *addr; - ntfs_attr_search_ctx *ctx; - MFT_RECORD *mrec; - unsigned long flags; - u32 attr_len; - int err = 0; - -retry_readpage: - BUG_ON(!PageLocked(page)); - vi = page->mapping->host; - i_size = i_size_read(vi); - /* Is the page fully outside i_size? (truncate in progress) */ - if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >> - PAGE_SHIFT)) { - zero_user(page, 0, PAGE_SIZE); - ntfs_debug("Read outside i_size - truncated?"); - goto done; - } - /* - * This can potentially happen because we clear PageUptodate() during - * ntfs_writepage() of MstProtected() attributes. - */ - if (PageUptodate(page)) { - unlock_page(page); - return 0; - } - ni = NTFS_I(vi); - /* - * Only $DATA attributes can be encrypted and only unnamed $DATA - * attributes can be compressed. Index root can have the flags set but - * this means to create compressed/encrypted files, not that the - * attribute is compressed/encrypted. Note we need to check for - * AT_INDEX_ALLOCATION since this is the type of both directory and - * index inodes. - */ - if (ni->type != AT_INDEX_ALLOCATION) { - /* If attribute is encrypted, deny access, just like NT4. */ - if (NInoEncrypted(ni)) { - BUG_ON(ni->type != AT_DATA); - err = -EACCES; - goto err_out; - } - /* Compressed data streams are handled in compress.c. 
*/ - if (NInoNonResident(ni) && NInoCompressed(ni)) { - BUG_ON(ni->type != AT_DATA); - BUG_ON(ni->name_len); - return ntfs_read_compressed_block(page); - } - } - /* NInoNonResident() == NInoIndexAllocPresent() */ - if (NInoNonResident(ni)) { - /* Normal, non-resident data stream. */ - return ntfs_read_block(folio); - } - /* - * Attribute is resident, implying it is not compressed or encrypted. - * This also means the attribute is smaller than an mft record and - * hence smaller than a page, so can simply zero out any pages with - * index above 0. Note the attribute can actually be marked compressed - * but if it is resident the actual data is not compressed so we are - * ok to ignore the compressed flag here. - */ - if (unlikely(page->index > 0)) { - zero_user(page, 0, PAGE_SIZE); - goto done; - } - if (!NInoAttr(ni)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - /* Map, pin, and lock the mft record. */ - mrec = map_mft_record(base_ni); - if (IS_ERR(mrec)) { - err = PTR_ERR(mrec); - goto err_out; - } - /* - * If a parallel write made the attribute non-resident, drop the mft - * record and retry the read_folio. - */ - if (unlikely(NInoNonResident(ni))) { - unmap_mft_record(base_ni); - goto retry_readpage; - } - ctx = ntfs_attr_get_search_ctx(base_ni, mrec); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto unm_err_out; - } - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) - goto put_unm_err_out; - attr_len = le32_to_cpu(ctx->attr->data.resident.value_length); - read_lock_irqsave(&ni->size_lock, flags); - if (unlikely(attr_len > ni->initialized_size)) - attr_len = ni->initialized_size; - i_size = i_size_read(vi); - read_unlock_irqrestore(&ni->size_lock, flags); - if (unlikely(attr_len > i_size)) { - /* Race with shrinking truncate. */ - attr_len = i_size; - } - addr = kmap_atomic(page); - /* Copy the data to the page. */ - memcpy(addr, (u8*)ctx->attr + - le16_to_cpu(ctx->attr->data.resident.value_offset), - attr_len); - /* Zero the remainder of the page. */ - memset(addr + attr_len, 0, PAGE_SIZE - attr_len); - flush_dcache_page(page); - kunmap_atomic(addr); -put_unm_err_out: - ntfs_attr_put_search_ctx(ctx); -unm_err_out: - unmap_mft_record(base_ni); -done: - SetPageUptodate(page); -err_out: - unlock_page(page); - return err; -} - -#ifdef NTFS_RW - -/** - * ntfs_write_block - write a @folio to the backing store - * @folio: page cache folio to write out - * @wbc: writeback control structure - * - * This function is for writing folios belonging to non-resident, non-mst - * protected attributes to their backing store. - * - * For a folio with buffers, map and write the dirty buffers asynchronously - * under folio writeback. For a folio without buffers, create buffers for the - * folio, then proceed as above. - * - * If a folio doesn't have buffers the folio dirty state is definitive. If - * a folio does have buffers, the folio dirty state is just a hint, - * and the buffer dirty state is definitive. (A hint which has rules: - * dirty buffers against a clean folio is illegal. Other combinations are - * legal and need to be handled. In particular a dirty folio containing - * clean buffers for example.) - * - * Return 0 on success and -errno on error. - * - * Based on ntfs_read_block() and __block_write_full_folio(). 
- */ -static int ntfs_write_block(struct folio *folio, struct writeback_control *wbc) -{ - VCN vcn; - LCN lcn; - s64 initialized_size; - loff_t i_size; - sector_t block, dblock, iblock; - struct inode *vi; - ntfs_inode *ni; - ntfs_volume *vol; - runlist_element *rl; - struct buffer_head *bh, *head; - unsigned long flags; - unsigned int blocksize, vcn_ofs; - int err; - bool need_end_writeback; - unsigned char blocksize_bits; - - vi = folio->mapping->host; - ni = NTFS_I(vi); - vol = ni->vol; - - ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index " - "0x%lx.", ni->mft_no, ni->type, folio->index); - - BUG_ON(!NInoNonResident(ni)); - BUG_ON(NInoMstProtected(ni)); - blocksize = vol->sb->s_blocksize; - blocksize_bits = vol->sb->s_blocksize_bits; - head = folio_buffers(folio); - if (!head) { - BUG_ON(!folio_test_uptodate(folio)); - head = create_empty_buffers(folio, blocksize, - (1 << BH_Uptodate) | (1 << BH_Dirty)); - } - bh = head; - - /* NOTE: Different naming scheme to ntfs_read_block()! */ - - /* The first block in the folio. */ - block = (s64)folio->index << (PAGE_SHIFT - blocksize_bits); - - read_lock_irqsave(&ni->size_lock, flags); - i_size = i_size_read(vi); - initialized_size = ni->initialized_size; - read_unlock_irqrestore(&ni->size_lock, flags); - - /* The first out of bounds block for the data size. */ - dblock = (i_size + blocksize - 1) >> blocksize_bits; - - /* The last (fully or partially) initialized block. */ - iblock = initialized_size >> blocksize_bits; - - /* - * Be very careful. We have no exclusion from block_dirty_folio - * here, and the (potentially unmapped) buffers may become dirty at - * any time. If a buffer becomes dirty here after we've inspected it - * then we just miss that fact, and the folio stays dirty. - * - * Buffers outside i_size may be dirtied by block_dirty_folio; - * handle that here by just cleaning them. - */ - - /* - * Loop through all the buffers in the folio, mapping all the dirty - * buffers to disk addresses and handling any aliases from the - * underlying block device's mapping. - */ - rl = NULL; - err = 0; - do { - bool is_retry = false; - - if (unlikely(block >= dblock)) { - /* - * Mapped buffers outside i_size will occur, because - * this folio can be outside i_size when there is a - * truncate in progress. The contents of such buffers - * were zeroed by ntfs_writepage(). - * - * FIXME: What about the small race window where - * ntfs_writepage() has not done any clearing because - * the folio was within i_size but before we get here, - * vmtruncate() modifies i_size? - */ - clear_buffer_dirty(bh); - set_buffer_uptodate(bh); - continue; - } - - /* Clean buffers are not written out, so no need to map them. */ - if (!buffer_dirty(bh)) - continue; - - /* Make sure we have enough initialized size. */ - if (unlikely((block >= iblock) && - (initialized_size < i_size))) { - /* - * If this folio is fully outside initialized - * size, zero out all folios between the current - * initialized size and the current folio. Just - * use ntfs_read_folio() to do the zeroing - * transparently. - */ - if (block > iblock) { - // TODO: - // For each folio do: - // - read_cache_folio() - // Again for each folio do: - // - wait_on_folio_locked() - // - Check (folio_test_uptodate(folio) && - // !folio_test_error(folio)) - // Update initialized size in the attribute and - // in the inode. - // Again, for each folio do: - // block_dirty_folio(); - // folio_put() - // We don't need to wait on the writes. - // Update iblock. 
- } - /* - * The current folio straddles initialized size. Zero - * all non-uptodate buffers and set them uptodate (and - * dirty?). Note, there aren't any non-uptodate buffers - * if the folio is uptodate. - * FIXME: For an uptodate folio, the buffers may need to - * be written out because they were not initialized on - * disk before. - */ - if (!folio_test_uptodate(folio)) { - // TODO: - // Zero any non-uptodate buffers up to i_size. - // Set them uptodate and dirty. - } - // TODO: - // Update initialized size in the attribute and in the - // inode (up to i_size). - // Update iblock. - // FIXME: This is inefficient. Try to batch the two - // size changes to happen in one go. - ntfs_error(vol->sb, "Writing beyond initialized size " - "is not supported yet. Sorry."); - err = -EOPNOTSUPP; - break; - // Do NOT set_buffer_new() BUT DO clear buffer range - // outside write request range. - // set_buffer_uptodate() on complete buffers as well as - // set_buffer_dirty(). - } - - /* No need to map buffers that are already mapped. */ - if (buffer_mapped(bh)) - continue; - - /* Unmapped, dirty buffer. Need to map it. */ - bh->b_bdev = vol->sb->s_bdev; - - /* Convert block into corresponding vcn and offset. */ - vcn = (VCN)block << blocksize_bits; - vcn_ofs = vcn & vol->cluster_size_mask; - vcn >>= vol->cluster_size_bits; - if (!rl) { -lock_retry_remap: - down_read(&ni->runlist.lock); - rl = ni->runlist.rl; - } - if (likely(rl != NULL)) { - /* Seek to element containing target vcn. */ - while (rl->length && rl[1].vcn <= vcn) - rl++; - lcn = ntfs_rl_vcn_to_lcn(rl, vcn); - } else - lcn = LCN_RL_NOT_MAPPED; - /* Successful remap. */ - if (lcn >= 0) { - /* Setup buffer head to point to correct block. */ - bh->b_blocknr = ((lcn << vol->cluster_size_bits) + - vcn_ofs) >> blocksize_bits; - set_buffer_mapped(bh); - continue; - } - /* It is a hole, need to instantiate it. */ - if (lcn == LCN_HOLE) { - u8 *kaddr; - unsigned long *bpos, *bend; - - /* Check if the buffer is zero. */ - kaddr = kmap_local_folio(folio, bh_offset(bh)); - bpos = (unsigned long *)kaddr; - bend = (unsigned long *)(kaddr + blocksize); - do { - if (unlikely(*bpos)) - break; - } while (likely(++bpos < bend)); - kunmap_local(kaddr); - if (bpos == bend) { - /* - * Buffer is zero and sparse, no need to write - * it. - */ - bh->b_blocknr = -1; - clear_buffer_dirty(bh); - continue; - } - // TODO: Instantiate the hole. - // clear_buffer_new(bh); - // clean_bdev_bh_alias(bh); - ntfs_error(vol->sb, "Writing into sparse regions is " - "not supported yet. Sorry."); - err = -EOPNOTSUPP; - break; - } - /* If first try and runlist unmapped, map and retry. */ - if (!is_retry && lcn == LCN_RL_NOT_MAPPED) { - is_retry = true; - /* - * Attempt to map runlist, dropping lock for - * the duration. - */ - up_read(&ni->runlist.lock); - err = ntfs_map_runlist(ni, vcn); - if (likely(!err)) - goto lock_retry_remap; - rl = NULL; - } else if (!rl) - up_read(&ni->runlist.lock); - /* - * If buffer is outside the runlist, truncate has cut it out - * of the runlist. Just clean and clear the buffer and set it - * uptodate so it can get discarded by the VM. - */ - if (err == -ENOENT || lcn == LCN_ENOENT) { - bh->b_blocknr = -1; - clear_buffer_dirty(bh); - folio_zero_range(folio, bh_offset(bh), blocksize); - set_buffer_uptodate(bh); - err = 0; - continue; - } - /* Failed to map the buffer, even after retrying. 
*/ - if (!err) - err = -EIO; - bh->b_blocknr = -1; - ntfs_error(vol->sb, "Failed to write to inode 0x%lx, " - "attribute type 0x%x, vcn 0x%llx, offset 0x%x " - "because its location on disk could not be " - "determined%s (error code %i).", ni->mft_no, - ni->type, (unsigned long long)vcn, - vcn_ofs, is_retry ? " even after " - "retrying" : "", err); - break; - } while (block++, (bh = bh->b_this_page) != head); - - /* Release the lock if we took it. */ - if (rl) - up_read(&ni->runlist.lock); - - /* For the error case, need to reset bh to the beginning. */ - bh = head; - - /* Just an optimization, so ->read_folio() is not called later. */ - if (unlikely(!folio_test_uptodate(folio))) { - int uptodate = 1; - do { - if (!buffer_uptodate(bh)) { - uptodate = 0; - bh = head; - break; - } - } while ((bh = bh->b_this_page) != head); - if (uptodate) - folio_mark_uptodate(folio); - } - - /* Setup all mapped, dirty buffers for async write i/o. */ - do { - if (buffer_mapped(bh) && buffer_dirty(bh)) { - lock_buffer(bh); - if (test_clear_buffer_dirty(bh)) { - BUG_ON(!buffer_uptodate(bh)); - mark_buffer_async_write(bh); - } else - unlock_buffer(bh); - } else if (unlikely(err)) { - /* - * For the error case. The buffer may have been set - * dirty during attachment to a dirty folio. - */ - if (err != -ENOMEM) - clear_buffer_dirty(bh); - } - } while ((bh = bh->b_this_page) != head); - - if (unlikely(err)) { - // TODO: Remove the -EOPNOTSUPP check later on... - if (unlikely(err == -EOPNOTSUPP)) - err = 0; - else if (err == -ENOMEM) { - ntfs_warning(vol->sb, "Error allocating memory. " - "Redirtying folio so we try again " - "later."); - /* - * Put the folio back on mapping->dirty_pages, but - * leave its buffer's dirty state as-is. - */ - folio_redirty_for_writepage(wbc, folio); - err = 0; - } else - folio_set_error(folio); - } - - BUG_ON(folio_test_writeback(folio)); - folio_start_writeback(folio); /* Keeps try_to_free_buffers() away. */ - - /* Submit the prepared buffers for i/o. */ - need_end_writeback = true; - do { - struct buffer_head *next = bh->b_this_page; - if (buffer_async_write(bh)) { - submit_bh(REQ_OP_WRITE, bh); - need_end_writeback = false; - } - bh = next; - } while (bh != head); - folio_unlock(folio); - - /* If no i/o was started, need to end writeback here. */ - if (unlikely(need_end_writeback)) - folio_end_writeback(folio); - - ntfs_debug("Done."); - return err; -} - -/** - * ntfs_write_mst_block - write a @page to the backing store - * @page: page cache page to write out - * @wbc: writeback control structure - * - * This function is for writing pages belonging to non-resident, mst protected - * attributes to their backing store. The only supported attributes are index - * allocation and $MFT/$DATA. Both directory inodes and index inodes are - * supported for the index allocation case. - * - * The page must remain locked for the duration of the write because we apply - * the mst fixups, write, and then undo the fixups, so if we were to unlock the - * page before undoing the fixups, any other user of the page will see the - * page contents as corrupt. - * - * We clear the page uptodate flag for the duration of the function to ensure - * exclusion for the $MFT/$DATA case against someone mapping an mft record we - * are about to apply the mst fixups to. - * - * Return 0 on success and -errno on error. - * - * Based on ntfs_write_block(), ntfs_mft_writepage(), and - * write_mft_record_nolock(). 
- */ -static int ntfs_write_mst_block(struct page *page, - struct writeback_control *wbc) -{ - sector_t block, dblock, rec_block; - struct inode *vi = page->mapping->host; - ntfs_inode *ni = NTFS_I(vi); - ntfs_volume *vol = ni->vol; - u8 *kaddr; - unsigned int rec_size = ni->itype.index.block_size; - ntfs_inode *locked_nis[PAGE_SIZE / NTFS_BLOCK_SIZE]; - struct buffer_head *bh, *head, *tbh, *rec_start_bh; - struct buffer_head *bhs[MAX_BUF_PER_PAGE]; - runlist_element *rl; - int i, nr_locked_nis, nr_recs, nr_bhs, max_bhs, bhs_per_rec, err, err2; - unsigned bh_size, rec_size_bits; - bool sync, is_mft, page_is_dirty, rec_is_dirty; - unsigned char bh_size_bits; - - if (WARN_ON(rec_size < NTFS_BLOCK_SIZE)) - return -EINVAL; - - ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index " - "0x%lx.", vi->i_ino, ni->type, page->index); - BUG_ON(!NInoNonResident(ni)); - BUG_ON(!NInoMstProtected(ni)); - is_mft = (S_ISREG(vi->i_mode) && !vi->i_ino); - /* - * NOTE: ntfs_write_mst_block() would be called for $MFTMirr if a page - * in its page cache were to be marked dirty. However this should - * never happen with the current driver and considering we do not - * handle this case here we do want to BUG(), at least for now. - */ - BUG_ON(!(is_mft || S_ISDIR(vi->i_mode) || - (NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION))); - bh_size = vol->sb->s_blocksize; - bh_size_bits = vol->sb->s_blocksize_bits; - max_bhs = PAGE_SIZE / bh_size; - BUG_ON(!max_bhs); - BUG_ON(max_bhs > MAX_BUF_PER_PAGE); - - /* Were we called for sync purposes? */ - sync = (wbc->sync_mode == WB_SYNC_ALL); - - /* Make sure we have mapped buffers. */ - bh = head = page_buffers(page); - BUG_ON(!bh); - - rec_size_bits = ni->itype.index.block_size_bits; - BUG_ON(!(PAGE_SIZE >> rec_size_bits)); - bhs_per_rec = rec_size >> bh_size_bits; - BUG_ON(!bhs_per_rec); - - /* The first block in the page. */ - rec_block = block = (sector_t)page->index << - (PAGE_SHIFT - bh_size_bits); - - /* The first out of bounds block for the data size. */ - dblock = (i_size_read(vi) + bh_size - 1) >> bh_size_bits; - - rl = NULL; - err = err2 = nr_bhs = nr_recs = nr_locked_nis = 0; - page_is_dirty = rec_is_dirty = false; - rec_start_bh = NULL; - do { - bool is_retry = false; - - if (likely(block < rec_block)) { - if (unlikely(block >= dblock)) { - clear_buffer_dirty(bh); - set_buffer_uptodate(bh); - continue; - } - /* - * This block is not the first one in the record. We - * ignore the buffer's dirty state because we could - * have raced with a parallel mark_ntfs_record_dirty(). - */ - if (!rec_is_dirty) - continue; - if (unlikely(err2)) { - if (err2 != -ENOMEM) - clear_buffer_dirty(bh); - continue; - } - } else /* if (block == rec_block) */ { - BUG_ON(block > rec_block); - /* This block is the first one in the record. */ - rec_block += bhs_per_rec; - err2 = 0; - if (unlikely(block >= dblock)) { - clear_buffer_dirty(bh); - continue; - } - if (!buffer_dirty(bh)) { - /* Clean records are not written out. */ - rec_is_dirty = false; - continue; - } - rec_is_dirty = true; - rec_start_bh = bh; - } - /* Need to map the buffer if it is not mapped already. */ - if (unlikely(!buffer_mapped(bh))) { - VCN vcn; - LCN lcn; - unsigned int vcn_ofs; - - bh->b_bdev = vol->sb->s_bdev; - /* Obtain the vcn and offset of the current block. 
*/ - vcn = (VCN)block << bh_size_bits; - vcn_ofs = vcn & vol->cluster_size_mask; - vcn >>= vol->cluster_size_bits; - if (!rl) { -lock_retry_remap: - down_read(&ni->runlist.lock); - rl = ni->runlist.rl; - } - if (likely(rl != NULL)) { - /* Seek to element containing target vcn. */ - while (rl->length && rl[1].vcn <= vcn) - rl++; - lcn = ntfs_rl_vcn_to_lcn(rl, vcn); - } else - lcn = LCN_RL_NOT_MAPPED; - /* Successful remap. */ - if (likely(lcn >= 0)) { - /* Setup buffer head to correct block. */ - bh->b_blocknr = ((lcn << - vol->cluster_size_bits) + - vcn_ofs) >> bh_size_bits; - set_buffer_mapped(bh); - } else { - /* - * Remap failed. Retry to map the runlist once - * unless we are working on $MFT which always - * has the whole of its runlist in memory. - */ - if (!is_mft && !is_retry && - lcn == LCN_RL_NOT_MAPPED) { - is_retry = true; - /* - * Attempt to map runlist, dropping - * lock for the duration. - */ - up_read(&ni->runlist.lock); - err2 = ntfs_map_runlist(ni, vcn); - if (likely(!err2)) - goto lock_retry_remap; - if (err2 == -ENOMEM) - page_is_dirty = true; - lcn = err2; - } else { - err2 = -EIO; - if (!rl) - up_read(&ni->runlist.lock); - } - /* Hard error. Abort writing this record. */ - if (!err || err == -ENOMEM) - err = err2; - bh->b_blocknr = -1; - ntfs_error(vol->sb, "Cannot write ntfs record " - "0x%llx (inode 0x%lx, " - "attribute type 0x%x) because " - "its location on disk could " - "not be determined (error " - "code %lli).", - (long long)block << - bh_size_bits >> - vol->mft_record_size_bits, - ni->mft_no, ni->type, - (long long)lcn); - /* - * If this is not the first buffer, remove the - * buffers in this record from the list of - * buffers to write and clear their dirty bit - * if not error -ENOMEM. - */ - if (rec_start_bh != bh) { - while (bhs[--nr_bhs] != rec_start_bh) - ; - if (err2 != -ENOMEM) { - do { - clear_buffer_dirty( - rec_start_bh); - } while ((rec_start_bh = - rec_start_bh-> - b_this_page) != - bh); - } - } - continue; - } - } - BUG_ON(!buffer_uptodate(bh)); - BUG_ON(nr_bhs >= max_bhs); - bhs[nr_bhs++] = bh; - } while (block++, (bh = bh->b_this_page) != head); - if (unlikely(rl)) - up_read(&ni->runlist.lock); - /* If there were no dirty buffers, we are done. */ - if (!nr_bhs) - goto done; - /* Map the page so we can access its contents. */ - kaddr = kmap(page); - /* Clear the page uptodate flag whilst the mst fixups are applied. */ - BUG_ON(!PageUptodate(page)); - ClearPageUptodate(page); - for (i = 0; i < nr_bhs; i++) { - unsigned int ofs; - - /* Skip buffers which are not at the beginning of records. */ - if (i % bhs_per_rec) - continue; - tbh = bhs[i]; - ofs = bh_offset(tbh); - if (is_mft) { - ntfs_inode *tni; - unsigned long mft_no; - - /* Get the mft record number. */ - mft_no = (((s64)page->index << PAGE_SHIFT) + ofs) - >> rec_size_bits; - /* Check whether to write this mft record. */ - tni = NULL; - if (!ntfs_may_write_mft_record(vol, mft_no, - (MFT_RECORD*)(kaddr + ofs), &tni)) { - /* - * The record should not be written. This - * means we need to redirty the page before - * returning. - */ - page_is_dirty = true; - /* - * Remove the buffers in this mft record from - * the list of buffers to write. - */ - do { - bhs[i] = NULL; - } while (++i % bhs_per_rec); - continue; - } - /* - * The record should be written. If a locked ntfs - * inode was returned, add it to the array of locked - * ntfs inodes. - */ - if (tni) - locked_nis[nr_locked_nis++] = tni; - } - /* Apply the mst protection fixups. 
*/ - err2 = pre_write_mst_fixup((NTFS_RECORD*)(kaddr + ofs), - rec_size); - if (unlikely(err2)) { - if (!err || err == -ENOMEM) - err = -EIO; - ntfs_error(vol->sb, "Failed to apply mst fixups " - "(inode 0x%lx, attribute type 0x%x, " - "page index 0x%lx, page offset 0x%x)!" - " Unmount and run chkdsk.", vi->i_ino, - ni->type, page->index, ofs); - /* - * Mark all the buffers in this record clean as we do - * not want to write corrupt data to disk. - */ - do { - clear_buffer_dirty(bhs[i]); - bhs[i] = NULL; - } while (++i % bhs_per_rec); - continue; - } - nr_recs++; - } - /* If no records are to be written out, we are done. */ - if (!nr_recs) - goto unm_done; - flush_dcache_page(page); - /* Lock buffers and start synchronous write i/o on them. */ - for (i = 0; i < nr_bhs; i++) { - tbh = bhs[i]; - if (!tbh) - continue; - if (!trylock_buffer(tbh)) - BUG(); - /* The buffer dirty state is now irrelevant, just clean it. */ - clear_buffer_dirty(tbh); - BUG_ON(!buffer_uptodate(tbh)); - BUG_ON(!buffer_mapped(tbh)); - get_bh(tbh); - tbh->b_end_io = end_buffer_write_sync; - submit_bh(REQ_OP_WRITE, tbh); - } - /* Synchronize the mft mirror now if not @sync. */ - if (is_mft && !sync) - goto do_mirror; -do_wait: - /* Wait on i/o completion of buffers. */ - for (i = 0; i < nr_bhs; i++) { - tbh = bhs[i]; - if (!tbh) - continue; - wait_on_buffer(tbh); - if (unlikely(!buffer_uptodate(tbh))) { - ntfs_error(vol->sb, "I/O error while writing ntfs " - "record buffer (inode 0x%lx, " - "attribute type 0x%x, page index " - "0x%lx, page offset 0x%lx)! Unmount " - "and run chkdsk.", vi->i_ino, ni->type, - page->index, bh_offset(tbh)); - if (!err || err == -ENOMEM) - err = -EIO; - /* - * Set the buffer uptodate so the page and buffer - * states do not become out of sync. - */ - set_buffer_uptodate(tbh); - } - } - /* If @sync, now synchronize the mft mirror. */ - if (is_mft && sync) { -do_mirror: - for (i = 0; i < nr_bhs; i++) { - unsigned long mft_no; - unsigned int ofs; - - /* - * Skip buffers which are not at the beginning of - * records. - */ - if (i % bhs_per_rec) - continue; - tbh = bhs[i]; - /* Skip removed buffers (and hence records). */ - if (!tbh) - continue; - ofs = bh_offset(tbh); - /* Get the mft record number. */ - mft_no = (((s64)page->index << PAGE_SHIFT) + ofs) - >> rec_size_bits; - if (mft_no < vol->mftmirr_size) - ntfs_sync_mft_mirror(vol, mft_no, - (MFT_RECORD*)(kaddr + ofs), - sync); - } - if (!sync) - goto do_wait; - } - /* Remove the mst protection fixups again. */ - for (i = 0; i < nr_bhs; i++) { - if (!(i % bhs_per_rec)) { - tbh = bhs[i]; - if (!tbh) - continue; - post_write_mst_fixup((NTFS_RECORD*)(kaddr + - bh_offset(tbh))); - } - } - flush_dcache_page(page); -unm_done: - /* Unlock any locked inodes. */ - while (nr_locked_nis-- > 0) { - ntfs_inode *tni, *base_tni; - - tni = locked_nis[nr_locked_nis]; - /* Get the base inode. */ - mutex_lock(&tni->extent_lock); - if (tni->nr_extents >= 0) - base_tni = tni; - else { - base_tni = tni->ext.base_ntfs_ino; - BUG_ON(!base_tni); - } - mutex_unlock(&tni->extent_lock); - ntfs_debug("Unlocking %s inode 0x%lx.", - tni == base_tni ? "base" : "extent", - tni->mft_no); - mutex_unlock(&tni->mrec_lock); - atomic_dec(&tni->count); - iput(VFS_I(base_tni)); - } - SetPageUptodate(page); - kunmap(page); -done: - if (unlikely(err && err != -ENOMEM)) { - /* - * Set page error if there is only one ntfs record in the page. - * Otherwise we would loose per-record granularity. 
- */ - if (ni->itype.index.block_size == PAGE_SIZE) - SetPageError(page); - NVolSetErrors(vol); - } - if (page_is_dirty) { - ntfs_debug("Page still contains one or more dirty ntfs " - "records. Redirtying the page starting at " - "record 0x%lx.", page->index << - (PAGE_SHIFT - rec_size_bits)); - redirty_page_for_writepage(wbc, page); - unlock_page(page); - } else { - /* - * Keep the VM happy. This must be done otherwise the - * radix-tree tag PAGECACHE_TAG_DIRTY remains set even though - * the page is clean. - */ - BUG_ON(PageWriteback(page)); - set_page_writeback(page); - unlock_page(page); - end_page_writeback(page); - } - if (likely(!err)) - ntfs_debug("Done."); - return err; -} - -/** - * ntfs_writepage - write a @page to the backing store - * @page: page cache page to write out - * @wbc: writeback control structure - * - * This is called from the VM when it wants to have a dirty ntfs page cache - * page cleaned. The VM has already locked the page and marked it clean. - * - * For non-resident attributes, ntfs_writepage() writes the @page by calling - * the ntfs version of the generic block_write_full_folio() function, - * ntfs_write_block(), which in turn if necessary creates and writes the - * buffers associated with the page asynchronously. - * - * For resident attributes, OTOH, ntfs_writepage() writes the @page by copying - * the data to the mft record (which at this stage is most likely in memory). - * The mft record is then marked dirty and written out asynchronously via the - * vfs inode dirty code path for the inode the mft record belongs to or via the - * vm page dirty code path for the page the mft record is in. - * - * Based on ntfs_read_folio() and fs/buffer.c::block_write_full_folio(). - * - * Return 0 on success and -errno on error. - */ -static int ntfs_writepage(struct page *page, struct writeback_control *wbc) -{ - struct folio *folio = page_folio(page); - loff_t i_size; - struct inode *vi = folio->mapping->host; - ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi); - char *addr; - ntfs_attr_search_ctx *ctx = NULL; - MFT_RECORD *m = NULL; - u32 attr_len; - int err; - -retry_writepage: - BUG_ON(!folio_test_locked(folio)); - i_size = i_size_read(vi); - /* Is the folio fully outside i_size? (truncate in progress) */ - if (unlikely(folio->index >= (i_size + PAGE_SIZE - 1) >> - PAGE_SHIFT)) { - /* - * The folio may have dirty, unmapped buffers. Make them - * freeable here, so the page does not leak. - */ - block_invalidate_folio(folio, 0, folio_size(folio)); - folio_unlock(folio); - ntfs_debug("Write outside i_size - truncated?"); - return 0; - } - /* - * Only $DATA attributes can be encrypted and only unnamed $DATA - * attributes can be compressed. Index root can have the flags set but - * this means to create compressed/encrypted files, not that the - * attribute is compressed/encrypted. Note we need to check for - * AT_INDEX_ALLOCATION since this is the type of both directory and - * index inodes. - */ - if (ni->type != AT_INDEX_ALLOCATION) { - /* If file is encrypted, deny access, just like NT4. */ - if (NInoEncrypted(ni)) { - folio_unlock(folio); - BUG_ON(ni->type != AT_DATA); - ntfs_debug("Denying write access to encrypted file."); - return -EACCES; - } - /* Compressed data streams are handled in compress.c. 
*/ - if (NInoNonResident(ni) && NInoCompressed(ni)) { - BUG_ON(ni->type != AT_DATA); - BUG_ON(ni->name_len); - // TODO: Implement and replace this with - // return ntfs_write_compressed_block(page); - folio_unlock(folio); - ntfs_error(vi->i_sb, "Writing to compressed files is " - "not supported yet. Sorry."); - return -EOPNOTSUPP; - } - // TODO: Implement and remove this check. - if (NInoNonResident(ni) && NInoSparse(ni)) { - folio_unlock(folio); - ntfs_error(vi->i_sb, "Writing to sparse files is not " - "supported yet. Sorry."); - return -EOPNOTSUPP; - } - } - /* NInoNonResident() == NInoIndexAllocPresent() */ - if (NInoNonResident(ni)) { - /* We have to zero every time due to mmap-at-end-of-file. */ - if (folio->index >= (i_size >> PAGE_SHIFT)) { - /* The folio straddles i_size. */ - unsigned int ofs = i_size & (folio_size(folio) - 1); - folio_zero_segment(folio, ofs, folio_size(folio)); - } - /* Handle mst protected attributes. */ - if (NInoMstProtected(ni)) - return ntfs_write_mst_block(page, wbc); - /* Normal, non-resident data stream. */ - return ntfs_write_block(folio, wbc); - } - /* - * Attribute is resident, implying it is not compressed, encrypted, or - * mst protected. This also means the attribute is smaller than an mft - * record and hence smaller than a folio, so can simply return error on - * any folios with index above 0. Note the attribute can actually be - * marked compressed but if it is resident the actual data is not - * compressed so we are ok to ignore the compressed flag here. - */ - BUG_ON(folio_buffers(folio)); - BUG_ON(!folio_test_uptodate(folio)); - if (unlikely(folio->index > 0)) { - ntfs_error(vi->i_sb, "BUG()! folio->index (0x%lx) > 0. " - "Aborting write.", folio->index); - BUG_ON(folio_test_writeback(folio)); - folio_start_writeback(folio); - folio_unlock(folio); - folio_end_writeback(folio); - return -EIO; - } - if (!NInoAttr(ni)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - /* Map, pin, and lock the mft record. */ - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - ctx = NULL; - goto err_out; - } - /* - * If a parallel write made the attribute non-resident, drop the mft - * record and retry the writepage. - */ - if (unlikely(NInoNonResident(ni))) { - unmap_mft_record(base_ni); - goto retry_writepage; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) - goto err_out; - /* - * Keep the VM happy. This must be done otherwise - * PAGECACHE_TAG_DIRTY remains set even though the folio is clean. - */ - BUG_ON(folio_test_writeback(folio)); - folio_start_writeback(folio); - folio_unlock(folio); - attr_len = le32_to_cpu(ctx->attr->data.resident.value_length); - i_size = i_size_read(vi); - if (unlikely(attr_len > i_size)) { - /* Race with shrinking truncate or a failed truncate. */ - attr_len = i_size; - /* - * If the truncate failed, fix it up now. If a concurrent - * truncate, we do its job, so it does not have to do anything. - */ - err = ntfs_resident_attr_value_resize(ctx->mrec, ctx->attr, - attr_len); - /* Shrinking cannot fail. */ - BUG_ON(err); - } - addr = kmap_local_folio(folio, 0); - /* Copy the data from the folio to the mft record. */ - memcpy((u8*)ctx->attr + - le16_to_cpu(ctx->attr->data.resident.value_offset), - addr, attr_len); - /* Zero out of bounds area in the page cache folio. 
*/ - memset(addr + attr_len, 0, folio_size(folio) - attr_len); - kunmap_local(addr); - flush_dcache_folio(folio); - flush_dcache_mft_record_page(ctx->ntfs_ino); - /* We are done with the folio. */ - folio_end_writeback(folio); - /* Finally, mark the mft record dirty, so it gets written back. */ - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - return 0; -err_out: - if (err == -ENOMEM) { - ntfs_warning(vi->i_sb, "Error allocating memory. Redirtying " - "page so we try again later."); - /* - * Put the folio back on mapping->dirty_pages, but leave its - * buffers' dirty state as-is. - */ - folio_redirty_for_writepage(wbc, folio); - err = 0; - } else { - ntfs_error(vi->i_sb, "Resident attribute write failed with " - "error %i.", err); - folio_set_error(folio); - NVolSetErrors(ni->vol); - } - folio_unlock(folio); - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(base_ni); - return err; -} - -#endif /* NTFS_RW */ - -/** - * ntfs_bmap - map logical file block to physical device block - * @mapping: address space mapping to which the block to be mapped belongs - * @block: logical block to map to its physical device block - * - * For regular, non-resident files (i.e. not compressed and not encrypted), map - * the logical @block belonging to the file described by the address space - * mapping @mapping to its physical device block. - * - * The size of the block is equal to the @s_blocksize field of the super block - * of the mounted file system which is guaranteed to be smaller than or equal - * to the cluster size thus the block is guaranteed to fit entirely inside the - * cluster which means we do not need to care how many contiguous bytes are - * available after the beginning of the block. - * - * Return the physical device block if the mapping succeeded or 0 if the block - * is sparse or there was an error. - * - * Note: This is a problem if someone tries to run bmap() on $Boot system file - * as that really is in block zero but there is nothing we can do. bmap() is - * just broken in that respect (just like it cannot distinguish sparse from - * not available or error). - */ -static sector_t ntfs_bmap(struct address_space *mapping, sector_t block) -{ - s64 ofs, size; - loff_t i_size; - LCN lcn; - unsigned long blocksize, flags; - ntfs_inode *ni = NTFS_I(mapping->host); - ntfs_volume *vol = ni->vol; - unsigned delta; - unsigned char blocksize_bits, cluster_size_shift; - - ntfs_debug("Entering for mft_no 0x%lx, logical block 0x%llx.", - ni->mft_no, (unsigned long long)block); - if (ni->type != AT_DATA || !NInoNonResident(ni) || NInoEncrypted(ni)) { - ntfs_error(vol->sb, "BMAP does not make sense for %s " - "attributes, returning 0.", - (ni->type != AT_DATA) ? "non-data" : - (!NInoNonResident(ni) ? "resident" : - "encrypted")); - return 0; - } - /* None of these can happen. */ - BUG_ON(NInoCompressed(ni)); - BUG_ON(NInoMstProtected(ni)); - blocksize = vol->sb->s_blocksize; - blocksize_bits = vol->sb->s_blocksize_bits; - ofs = (s64)block << blocksize_bits; - read_lock_irqsave(&ni->size_lock, flags); - size = ni->initialized_size; - i_size = i_size_read(VFS_I(ni)); - read_unlock_irqrestore(&ni->size_lock, flags); - /* - * If the offset is outside the initialized size or the block straddles - * the initialized size then pretend it is a hole unless the - * initialized size equals the file size. 
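For illustration, the cluster-to-block arithmetic performed below can be worked through with made-up geometry (4096 byte clusters, cluster_size_bits = 12; 512 byte blocks, blocksize_bits = 9); every value here is hypothetical:

	sector_t block = 10;		/* Logical block to map. */
	s64 ofs = (s64)block << 9;	/* 5120: byte offset into the attribute. */
	/* vcn = ofs >> 12 = 1; assume the runlist maps vcn 1 to lcn 100. */
	unsigned delta = ofs & 4095;	/* 1024: offset of the block in its cluster. */
	sector_t dev_block = ((100LL << 12) + delta) >> 9;	/* (409600 + 1024) / 512 = 802. */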
- */ - if (unlikely(ofs >= size || (ofs + blocksize > size && size < i_size))) - goto hole; - cluster_size_shift = vol->cluster_size_bits; - down_read(&ni->runlist.lock); - lcn = ntfs_attr_vcn_to_lcn_nolock(ni, ofs >> cluster_size_shift, false); - up_read(&ni->runlist.lock); - if (unlikely(lcn < LCN_HOLE)) { - /* - * Step down to an integer to avoid gcc doing a long long - * comparison in the switch when we know @lcn is between - * LCN_HOLE and LCN_EIO (i.e. -1 to -5). - * - * Otherwise older gcc (at least on some architectures) will - * try to use __cmpdi2() which is of course not available in - * the kernel. - */ - switch ((int)lcn) { - case LCN_ENOENT: - /* - * If the offset is out of bounds then pretend it is a - * hole. - */ - goto hole; - case LCN_ENOMEM: - ntfs_error(vol->sb, "Not enough memory to complete " - "mapping for inode 0x%lx. " - "Returning 0.", ni->mft_no); - break; - default: - ntfs_error(vol->sb, "Failed to complete mapping for " - "inode 0x%lx. Run chkdsk. " - "Returning 0.", ni->mft_no); - break; - } - return 0; - } - if (lcn < 0) { - /* It is a hole. */ -hole: - ntfs_debug("Done (returning hole)."); - return 0; - } - /* - * The block is really allocated and fulfils all our criteria. - * Convert the cluster to units of block size and return the result. - */ - delta = ofs & vol->cluster_size_mask; - if (unlikely(sizeof(block) < sizeof(lcn))) { - block = lcn = ((lcn << cluster_size_shift) + delta) >> - blocksize_bits; - /* If the block number was truncated return 0. */ - if (unlikely(block != lcn)) { - ntfs_error(vol->sb, "Physical block 0x%llx is too " - "large to be returned, returning 0.", - (long long)lcn); - return 0; - } - } else - block = ((lcn << cluster_size_shift) + delta) >> - blocksize_bits; - ntfs_debug("Done (returning block 0x%llx).", (unsigned long long)lcn); - return block; -} - -/* - * ntfs_normal_aops - address space operations for normal inodes and attributes - * - * Note these are not used for compressed or mst protected inodes and - * attributes. - */ -const struct address_space_operations ntfs_normal_aops = { - .read_folio = ntfs_read_folio, -#ifdef NTFS_RW - .writepage = ntfs_writepage, - .dirty_folio = block_dirty_folio, -#endif /* NTFS_RW */ - .bmap = ntfs_bmap, - .migrate_folio = buffer_migrate_folio, - .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_folio = generic_error_remove_folio, -}; - -/* - * ntfs_compressed_aops - address space operations for compressed inodes - */ -const struct address_space_operations ntfs_compressed_aops = { - .read_folio = ntfs_read_folio, -#ifdef NTFS_RW - .writepage = ntfs_writepage, - .dirty_folio = block_dirty_folio, -#endif /* NTFS_RW */ - .migrate_folio = buffer_migrate_folio, - .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_folio = generic_error_remove_folio, -}; - -/* - * ntfs_mst_aops - general address space operations for mst protected inodes - * and attributes - */ -const struct address_space_operations ntfs_mst_aops = { - .read_folio = ntfs_read_folio, /* Fill page with data. */ -#ifdef NTFS_RW - .writepage = ntfs_writepage, /* Write dirty page to disk.
*/ - .dirty_folio = filemap_dirty_folio, -#endif /* NTFS_RW */ - .migrate_folio = buffer_migrate_folio, - .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_folio = generic_error_remove_folio, -}; - -#ifdef NTFS_RW - -/** - * mark_ntfs_record_dirty - mark an ntfs record dirty - * @page: page containing the ntfs record to mark dirty - * @ofs: byte offset within @page at which the ntfs record begins - * - * Set the buffers and the page in which the ntfs record is located dirty. - * - * The latter also marks the vfs inode the ntfs record belongs to dirty - * (I_DIRTY_PAGES only). - * - * If the page does not have buffers, we create them and set them uptodate. - * The page may not be locked which is why we need to handle the buffers under - * the mapping->i_private_lock. Once the buffers are marked dirty we no longer - * need the lock since try_to_free_buffers() does not free dirty buffers. - */ -void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { - struct address_space *mapping = page->mapping; - ntfs_inode *ni = NTFS_I(mapping->host); - struct buffer_head *bh, *head, *buffers_to_free = NULL; - unsigned int end, bh_size, bh_ofs; - - BUG_ON(!PageUptodate(page)); - end = ofs + ni->itype.index.block_size; - bh_size = VFS_I(ni)->i_sb->s_blocksize; - spin_lock(&mapping->i_private_lock); - if (unlikely(!page_has_buffers(page))) { - spin_unlock(&mapping->i_private_lock); - bh = head = alloc_page_buffers(page, bh_size, true); - spin_lock(&mapping->i_private_lock); - if (likely(!page_has_buffers(page))) { - struct buffer_head *tail; - - do { - set_buffer_uptodate(bh); - tail = bh; - bh = bh->b_this_page; - } while (bh); - tail->b_this_page = head; - attach_page_private(page, head); - } else - buffers_to_free = bh; - } - bh = head = page_buffers(page); - BUG_ON(!bh); - do { - bh_ofs = bh_offset(bh); - if (bh_ofs + bh_size <= ofs) - continue; - if (unlikely(bh_ofs >= end)) - break; - set_buffer_dirty(bh); - } while ((bh = bh->b_this_page) != head); - spin_unlock(&mapping->i_private_lock); - filemap_dirty_folio(mapping, page_folio(page)); - if (unlikely(buffers_to_free)) { - do { - bh = buffers_to_free->b_this_page; - free_buffer_head(buffers_to_free); - buffers_to_free = bh; - } while (buffers_to_free); - } -} - -#endif /* NTFS_RW */ diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h deleted file mode 100644 index 8d0958a149cb..000000000000 --- a/fs/ntfs/aops.h +++ /dev/null @@ -1,88 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * aops.h - Defines for NTFS kernel address space operations and page cache - * handling. Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2004 Anton Altaparmakov - * Copyright (c) 2002 Richard Russon - */ - -#ifndef _LINUX_NTFS_AOPS_H -#define _LINUX_NTFS_AOPS_H - -#include <linux/mm.h> -#include <linux/highmem.h> -#include <linux/pagemap.h> -#include <linux/fs.h> - -#include "inode.h" - -/** - * ntfs_unmap_page - release a page that was mapped using ntfs_map_page() - * @page: the page to release - * - * Unpin, unmap and release a page that was obtained from ntfs_map_page(). 
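As a usage sketch of the ntfs_map_page()/ntfs_unmap_page() pair documented here and just below (the helper name, @buf, and @len are hypothetical):

	/* Illustrative only: copy the first @len bytes of page @idx into @buf. */
	static int example_read_page(struct address_space *mapping,
			unsigned long idx, u8 *buf, size_t len)
	{
		struct page *page = ntfs_map_page(mapping, idx);

		if (IS_ERR(page))
			return PTR_ERR(page);
		/* page_address() is valid because ntfs_map_page() kmap()ed it. */
		memcpy(buf, page_address(page), len);
		ntfs_unmap_page(page);	/* kunmap() and drop the pinning reference. */
		return 0;
	}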
- */ -static inline void ntfs_unmap_page(struct page *page) -{ - kunmap(page); - put_page(page); -} - -/** - * ntfs_map_page - map a page into accessible memory, reading it if necessary - * @mapping: address space for which to obtain the page - * @index: index into the page cache for @mapping of the page to map - * - * Read a page from the page cache of the address space @mapping at position - * @index, where @index is in units of PAGE_SIZE, and not in bytes. - * - * If the page is not in memory it is loaded from disk first using the - * read_folio method defined in the address space operations of @mapping - * and the page is added to the page cache of @mapping in the process. - * - * If the page belongs to an mst protected attribute and it is marked as such - * in its ntfs inode (NInoMstProtected()) the mst fixups are applied but no - * error checking is performed. This means the caller has to verify whether - * the ntfs record(s) contained in the page are valid or not using one of the - * ntfs_is_XXXX_record{,p}() macros, where XXXX is the record type you are - * expecting to see. (For details of the macros, see fs/ntfs/layout.h.) - * - * If the page is in high memory it is mapped into memory directly addressable - * by the kernel. - * - * Finally the page count is incremented, thus pinning the page into place. - * - * The above means that page_address(page) can be used on all pages obtained - * with ntfs_map_page() to get the kernel virtual address of the page. - * - * When finished with the page, the caller has to call ntfs_unmap_page() to - * unpin, unmap and release the page. - * - * Note this does not grant exclusive access. If such is desired, the caller - * must provide it independently of the ntfs_{un}map_page() calls by using - * a {rw_}semaphore or other means of serialization. A spin lock cannot be - * used as ntfs_map_page() can block. - * - * The unlocked and uptodate page is returned on success or an encoded error - * on failure. Caller has to test for error using the IS_ERR() macro on the - * return value. If that evaluates to 'true', the negative error code can be - * obtained using PTR_ERR() on the return value of ntfs_map_page(). - */ -static inline struct page *ntfs_map_page(struct address_space *mapping, - unsigned long index) -{ - struct page *page = read_mapping_page(mapping, index, NULL); - - if (!IS_ERR(page)) - kmap(page); - return page; -} - -#ifdef NTFS_RW - -extern void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs); - -#endif /* NTFS_RW */ - -#endif /* _LINUX_NTFS_AOPS_H */ diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c deleted file mode 100644 index f79408f9127a..000000000000 --- a/fs/ntfs/attrib.c +++ /dev/null @@ -1,2624 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * attrib.c - NTFS attribute operations. Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
- * Copyright (c) 2002 Richard Russon - */ - -#include <linux/buffer_head.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/swap.h> -#include <linux/writeback.h> - -#include "attrib.h" -#include "debug.h" -#include "layout.h" -#include "lcnalloc.h" -#include "malloc.h" -#include "mft.h" -#include "ntfs.h" -#include "types.h" - -/** - * ntfs_map_runlist_nolock - map (a part of) a runlist of an ntfs inode - * @ni: ntfs inode for which to map (part of) a runlist - * @vcn: map runlist part containing this vcn - * @ctx: active attribute search context if present or NULL if not - * - * Map the part of a runlist containing the @vcn of the ntfs inode @ni. - * - * If @ctx is specified, it is an active search context of @ni and its base mft - * record. This is needed when ntfs_map_runlist_nolock() encounters unmapped - * runlist fragments and allows their mapping. If you do not have the mft - * record mapped, you can specify @ctx as NULL and ntfs_map_runlist_nolock() - * will perform the necessary mapping and unmapping. - * - * Note, ntfs_map_runlist_nolock() saves the state of @ctx on entry and - * restores it before returning. Thus, @ctx will be left pointing to the same - * attribute on return as on entry. However, the actual pointers in @ctx may - * point to different memory locations on return, so you must remember to reset - * any cached pointers from the @ctx, i.e. after the call to - * ntfs_map_runlist_nolock(), you will probably want to do: - * m = ctx->mrec; - * a = ctx->attr; - * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that - * you cache ctx->mrec in a variable @m of type MFT_RECORD *. - * - * Return 0 on success and -errno on error. There is one special error code - * which is not an error as such. This is -ENOENT. It means that @vcn is out - * of bounds of the runlist. - * - * Note the runlist can be NULL after this function returns if @vcn is zero and - * the attribute has zero allocated size, i.e. there simply is no runlist. - * - * WARNING: If @ctx is supplied, regardless of whether success or failure is - * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx - * is no longer valid, i.e. you need to either call - * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it. - * In that case PTR_ERR(@ctx->mrec) will give you the error code for - * why the mapping of the old inode failed. - * - * Locking: - The runlist described by @ni must be locked for writing on entry - * and is locked on return. Note the runlist will be modified. - * - If @ctx is NULL, the base mft record of @ni must not be mapped on - * entry and it will be left unmapped on return. - * - If @ctx is not NULL, the base mft record must be mapped on entry - * and it will be left mapped on return. 
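A minimal sketch of the calling convention spelled out above, assuming a caller that holds the runlist lock for writing and supplies an active @ctx (with @m and @a being the cached pointers the comment describes):

	err = ntfs_map_runlist_nolock(ni, vcn, ctx);
	if (IS_ERR(ctx->mrec)) {
		/* Per the WARNING: @ctx is no longer valid, reset or release it. */
		err = PTR_ERR(ctx->mrec);
		ntfs_attr_put_search_ctx(ctx);
		return err;
	}
	if (err && err != -ENOENT)
		return err;	/* -ENOENT only means @vcn is out of bounds. */
	/* The pointers inside @ctx may have moved; refresh cached copies. */
	m = ctx->mrec;
	a = ctx->attr;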
- */ -int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx) -{ - VCN end_vcn; - unsigned long flags; - ntfs_inode *base_ni; - MFT_RECORD *m; - ATTR_RECORD *a; - runlist_element *rl; - struct page *put_this_page = NULL; - int err = 0; - bool ctx_is_temporary, ctx_needs_reset; - ntfs_attr_search_ctx old_ctx = { NULL, }; - - ntfs_debug("Mapping runlist part containing vcn 0x%llx.", - (unsigned long long)vcn); - if (!NInoAttr(ni)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - if (!ctx) { - ctx_is_temporary = ctx_needs_reset = true; - m = map_mft_record(base_ni); - if (IS_ERR(m)) - return PTR_ERR(m); - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - } else { - VCN allocated_size_vcn; - - BUG_ON(IS_ERR(ctx->mrec)); - a = ctx->attr; - BUG_ON(!a->non_resident); - ctx_is_temporary = false; - end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); - read_lock_irqsave(&ni->size_lock, flags); - allocated_size_vcn = ni->allocated_size >> - ni->vol->cluster_size_bits; - read_unlock_irqrestore(&ni->size_lock, flags); - if (!a->data.non_resident.lowest_vcn && end_vcn <= 0) - end_vcn = allocated_size_vcn - 1; - /* - * If we already have the attribute extent containing @vcn in - * @ctx, no need to look it up again. We slightly cheat in - * that if vcn exceeds the allocated size, we will refuse to - * map the runlist below, so there is definitely no need to get - * the right attribute extent. - */ - if (vcn >= allocated_size_vcn || (a->type == ni->type && - a->name_length == ni->name_len && - !memcmp((u8*)a + le16_to_cpu(a->name_offset), - ni->name, ni->name_len) && - sle64_to_cpu(a->data.non_resident.lowest_vcn) - <= vcn && end_vcn >= vcn)) - ctx_needs_reset = false; - else { - /* Save the old search context. */ - old_ctx = *ctx; - /* - * If the currently mapped (extent) inode is not the - * base inode we will unmap it when we reinitialize the - * search context which means we need to get a - * reference to the page containing the mapped mft - * record so we do not accidentally drop changes to the - * mft record when it has not been marked dirty yet. - */ - if (old_ctx.base_ntfs_ino && old_ctx.ntfs_ino != - old_ctx.base_ntfs_ino) { - put_this_page = old_ctx.ntfs_ino->page; - get_page(put_this_page); - } - /* - * Reinitialize the search context so we can lookup the - * needed attribute extent. - */ - ntfs_attr_reinit_search_ctx(ctx); - ctx_needs_reset = true; - } - } - if (ctx_needs_reset) { - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, vcn, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - err = -EIO; - goto err_out; - } - BUG_ON(!ctx->attr->non_resident); - } - a = ctx->attr; - /* - * Only decompress the mapping pairs if @vcn is inside it. Otherwise - * we get into problems when we try to map an out of bounds vcn because - * we then try to map the already mapped runlist fragment and - * ntfs_mapping_pairs_decompress() fails. 
- */ - end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn) + 1; - if (unlikely(vcn && vcn >= end_vcn)) { - err = -ENOENT; - goto err_out; - } - rl = ntfs_mapping_pairs_decompress(ni->vol, a, ni->runlist.rl); - if (IS_ERR(rl)) - err = PTR_ERR(rl); - else - ni->runlist.rl = rl; -err_out: - if (ctx_is_temporary) { - if (likely(ctx)) - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - } else if (ctx_needs_reset) { - /* - * If there is no attribute list, restoring the search context - * is accomplished simply by copying the saved context back over - * the caller supplied context. If there is an attribute list, - * things are more complicated as we need to deal with mapping - * of mft records and resulting potential changes in pointers. - */ - if (NInoAttrList(base_ni)) { - /* - * If the currently mapped (extent) inode is not the - * one we had before, we need to unmap it and map the - * old one. - */ - if (ctx->ntfs_ino != old_ctx.ntfs_ino) { - /* - * If the currently mapped inode is not the - * base inode, unmap it. - */ - if (ctx->base_ntfs_ino && ctx->ntfs_ino != - ctx->base_ntfs_ino) { - unmap_extent_mft_record(ctx->ntfs_ino); - ctx->mrec = ctx->base_mrec; - BUG_ON(!ctx->mrec); - } - /* - * If the old mapped inode is not the base - * inode, map it. - */ - if (old_ctx.base_ntfs_ino && - old_ctx.ntfs_ino != - old_ctx.base_ntfs_ino) { -retry_map: - ctx->mrec = map_mft_record( - old_ctx.ntfs_ino); - /* - * Something bad has happened. If out - * of memory retry till it succeeds. - * Any other errors are fatal and we - * return the error code in ctx->mrec. - * Let the caller deal with it... We - * just need to fudge things so the - * caller can reinit and/or put the - * search context safely. - */ - if (IS_ERR(ctx->mrec)) { - if (PTR_ERR(ctx->mrec) == - -ENOMEM) { - schedule(); - goto retry_map; - } else - old_ctx.ntfs_ino = - old_ctx. - base_ntfs_ino; - } - } - } - /* Update the changed pointers in the saved context. */ - if (ctx->mrec != old_ctx.mrec) { - if (!IS_ERR(ctx->mrec)) - old_ctx.attr = (ATTR_RECORD*)( - (u8*)ctx->mrec + - ((u8*)old_ctx.attr - - (u8*)old_ctx.mrec)); - old_ctx.mrec = ctx->mrec; - } - } - /* Restore the search context to the saved one. */ - *ctx = old_ctx; - /* - * We drop the reference on the page we took earlier. In the - * case that IS_ERR(ctx->mrec) is true this means we might lose - * some changes to the mft record that had been made between - * the last time it was marked dirty/written out and now. This - * at this stage is not a problem as the mapping error is fatal - * enough that the mft record cannot be written out anyway and - * the caller is very likely to shutdown the whole inode - * immediately and mark the volume dirty for chkdsk to pick up - * the pieces anyway. - */ - if (put_this_page) - put_page(put_this_page); - } - return err; -} - -/** - * ntfs_map_runlist - map (a part of) a runlist of an ntfs inode - * @ni: ntfs inode for which to map (part of) a runlist - * @vcn: map runlist part containing this vcn - * - * Map the part of a runlist containing the @vcn of the ntfs inode @ni. - * - * Return 0 on success and -errno on error. There is one special error code - * which is not an error as such. This is -ENOENT. It means that @vcn is out - * of bounds of the runlist. - * - * Locking: - The runlist must be unlocked on entry and is unlocked on return. - * - This function takes the runlist lock for writing and may modify - * the runlist. 
- */ -int ntfs_map_runlist(ntfs_inode *ni, VCN vcn) -{ - int err = 0; - - down_write(&ni->runlist.lock); - /* Make sure someone else didn't do the work while we were sleeping. */ - if (likely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) <= - LCN_RL_NOT_MAPPED)) - err = ntfs_map_runlist_nolock(ni, vcn, NULL); - up_write(&ni->runlist.lock); - return err; -} - -/** - * ntfs_attr_vcn_to_lcn_nolock - convert a vcn into a lcn given an ntfs inode - * @ni: ntfs inode of the attribute whose runlist to search - * @vcn: vcn to convert - * @write_locked: true if the runlist is locked for writing - * - * Find the virtual cluster number @vcn in the runlist of the ntfs attribute - * described by the ntfs inode @ni and return the corresponding logical cluster - * number (lcn). - * - * If the @vcn is not mapped yet, the attempt is made to map the attribute - * extent containing the @vcn and the vcn to lcn conversion is retried. - * - * If @write_locked is true the caller has locked the runlist for writing and - * if false for reading. - * - * Since lcns must be >= 0, we use negative return codes with special meaning: - * - * Return code Meaning / Description - * ========================================== - * LCN_HOLE Hole / not allocated on disk. - * LCN_ENOENT There is no such vcn in the runlist, i.e. @vcn is out of bounds. - * LCN_ENOMEM Not enough memory to map runlist. - * LCN_EIO Critical error (runlist/file is corrupt, i/o error, etc). - * - * Locking: - The runlist must be locked on entry and is left locked on return. - * - If @write_locked is 'false', i.e. the runlist is locked for reading, - * the lock may be dropped inside the function so you cannot rely on - * the runlist still being the same when this function returns. - */ -LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn, - const bool write_locked) -{ - LCN lcn; - unsigned long flags; - bool is_retry = false; - - BUG_ON(!ni); - ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, %s_locked.", - ni->mft_no, (unsigned long long)vcn, - write_locked ? "write" : "read"); - BUG_ON(!NInoNonResident(ni)); - BUG_ON(vcn < 0); - if (!ni->runlist.rl) { - read_lock_irqsave(&ni->size_lock, flags); - if (!ni->allocated_size) { - read_unlock_irqrestore(&ni->size_lock, flags); - return LCN_ENOENT; - } - read_unlock_irqrestore(&ni->size_lock, flags); - } -retry_remap: - /* Convert vcn to lcn. If that fails map the runlist and retry once. 
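For illustration, a caller of ntfs_attr_vcn_to_lcn_nolock() translates the special return codes from the table above roughly like this (a sketch; compare the switch in ntfs_bmap() earlier in this file):

	lcn = ntfs_attr_vcn_to_lcn_nolock(ni, vcn, false);
	if (lcn < 0) {
		switch ((int)lcn) {	/* Cast avoids a 64-bit switch, as in ntfs_bmap(). */
		case LCN_HOLE:
			err = 0;	/* Sparse: zeroes on read, allocate on write. */
			break;
		case LCN_ENOENT:
			err = -ENOENT;	/* @vcn is beyond the end of the runlist. */
			break;
		case LCN_ENOMEM:
			err = -ENOMEM;
			break;
		default:
			err = -EIO;	/* LCN_EIO: corrupt runlist/file or i/o error. */
			break;
		}
	}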
*/ - lcn = ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn); - if (likely(lcn >= LCN_HOLE)) { - ntfs_debug("Done, lcn 0x%llx.", (long long)lcn); - return lcn; - } - if (lcn != LCN_RL_NOT_MAPPED) { - if (lcn != LCN_ENOENT) - lcn = LCN_EIO; - } else if (!is_retry) { - int err; - - if (!write_locked) { - up_read(&ni->runlist.lock); - down_write(&ni->runlist.lock); - if (unlikely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) != - LCN_RL_NOT_MAPPED)) { - up_write(&ni->runlist.lock); - down_read(&ni->runlist.lock); - goto retry_remap; - } - } - err = ntfs_map_runlist_nolock(ni, vcn, NULL); - if (!write_locked) { - up_write(&ni->runlist.lock); - down_read(&ni->runlist.lock); - } - if (likely(!err)) { - is_retry = true; - goto retry_remap; - } - if (err == -ENOENT) - lcn = LCN_ENOENT; - else if (err == -ENOMEM) - lcn = LCN_ENOMEM; - else - lcn = LCN_EIO; - } - if (lcn != LCN_ENOENT) - ntfs_error(ni->vol->sb, "Failed with error code %lli.", - (long long)lcn); - return lcn; -} - -/** - * ntfs_attr_find_vcn_nolock - find a vcn in the runlist of an ntfs inode - * @ni: ntfs inode describing the runlist to search - * @vcn: vcn to find - * @ctx: active attribute search context if present or NULL if not - * - * Find the virtual cluster number @vcn in the runlist described by the ntfs - * inode @ni and return the address of the runlist element containing the @vcn. - * - * If the @vcn is not mapped yet, the attempt is made to map the attribute - * extent containing the @vcn and the vcn to lcn conversion is retried. - * - * If @ctx is specified, it is an active search context of @ni and its base mft - * record. This is needed when ntfs_attr_find_vcn_nolock() encounters unmapped - * runlist fragments and allows their mapping. If you do not have the mft - * record mapped, you can specify @ctx as NULL and ntfs_attr_find_vcn_nolock() - * will perform the necessary mapping and unmapping. - * - * Note, ntfs_attr_find_vcn_nolock() saves the state of @ctx on entry and - * restores it before returning. Thus, @ctx will be left pointing to the same - * attribute on return as on entry. However, the actual pointers in @ctx may - * point to different memory locations on return, so you must remember to reset - * any cached pointers from the @ctx, i.e. after the call to - * ntfs_attr_find_vcn_nolock(), you will probably want to do: - * m = ctx->mrec; - * a = ctx->attr; - * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that - * you cache ctx->mrec in a variable @m of type MFT_RECORD *. - * Note you need to distinguish between the lcn of the returned runlist element - * being >= 0 and LCN_HOLE. In the latter case you have to return zeroes on - * read and allocate clusters on write. - * - * Return the runlist element containing the @vcn on success and - * ERR_PTR(-errno) on error. You need to test the return value with IS_ERR() - * to decide if the return is success or failure and PTR_ERR() to get to the - * error code if IS_ERR() is true. - * - * The possible error return codes are: - * -ENOENT - No such vcn in the runlist, i.e. @vcn is out of bounds. - * -ENOMEM - Not enough memory to map runlist. - * -EIO - Critical error (runlist/file is corrupt, i/o error, etc). - * - * WARNING: If @ctx is supplied, regardless of whether success or failure is - * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx - * is no longer valid, i.e. you need to either call - * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
- * In that case PTR_ERR(@ctx->mrec) will give you the error code for - * why the mapping of the old inode failed. - * - * Locking: - The runlist described by @ni must be locked for writing on entry - * and is locked on return. Note the runlist may be modified when - * needed runlist fragments need to be mapped. - * - If @ctx is NULL, the base mft record of @ni must not be mapped on - * entry and it will be left unmapped on return. - * - If @ctx is not NULL, the base mft record must be mapped on entry - * and it will be left mapped on return. - */ -runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn, - ntfs_attr_search_ctx *ctx) -{ - unsigned long flags; - runlist_element *rl; - int err = 0; - bool is_retry = false; - - BUG_ON(!ni); - ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, with%s ctx.", - ni->mft_no, (unsigned long long)vcn, ctx ? "" : "out"); - BUG_ON(!NInoNonResident(ni)); - BUG_ON(vcn < 0); - if (!ni->runlist.rl) { - read_lock_irqsave(&ni->size_lock, flags); - if (!ni->allocated_size) { - read_unlock_irqrestore(&ni->size_lock, flags); - return ERR_PTR(-ENOENT); - } - read_unlock_irqrestore(&ni->size_lock, flags); - } -retry_remap: - rl = ni->runlist.rl; - if (likely(rl && vcn >= rl[0].vcn)) { - while (likely(rl->length)) { - if (unlikely(vcn < rl[1].vcn)) { - if (likely(rl->lcn >= LCN_HOLE)) { - ntfs_debug("Done."); - return rl; - } - break; - } - rl++; - } - if (likely(rl->lcn != LCN_RL_NOT_MAPPED)) { - if (likely(rl->lcn == LCN_ENOENT)) - err = -ENOENT; - else - err = -EIO; - } - } - if (!err && !is_retry) { - /* - * If the search context is invalid we cannot map the unmapped - * region. - */ - if (IS_ERR(ctx->mrec)) - err = PTR_ERR(ctx->mrec); - else { - /* - * The @vcn is in an unmapped region, map the runlist - * and retry. - */ - err = ntfs_map_runlist_nolock(ni, vcn, ctx); - if (likely(!err)) { - is_retry = true; - goto retry_remap; - } - } - if (err == -EINVAL) - err = -EIO; - } else if (!err) - err = -EIO; - if (err != -ENOENT) - ntfs_error(ni->vol->sb, "Failed with error code %i.", err); - return ERR_PTR(err); -} - -/** - * ntfs_attr_find - find (next) attribute in mft record - * @type: attribute type to find - * @name: attribute name to find (optional, i.e. NULL means don't care) - * @name_len: attribute name length (only needed if @name present) - * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present) - * @val: attribute value to find (optional, resident attributes only) - * @val_len: attribute value length - * @ctx: search context with mft record and attribute to search from - * - * You should not need to call this function directly. Use ntfs_attr_lookup() - * instead. - * - * ntfs_attr_find() takes a search context @ctx as parameter and searches the - * mft record specified by @ctx->mrec, beginning at @ctx->attr, for an - * attribute of @type, optionally @name and @val. - * - * If the attribute is found, ntfs_attr_find() returns 0 and @ctx->attr will - * point to the found attribute. - * - * If the attribute is not found, ntfs_attr_find() returns -ENOENT and - * @ctx->attr will point to the attribute before which the attribute being - * searched for would need to be inserted if such an action were to be desired. - * - * On actual error, ntfs_attr_find() returns -EIO. In this case @ctx->attr is - * undefined and in particular do not rely on it not changing. - * - * If @ctx->is_first is 'true', the search begins with @ctx->attr itself. If it - * is 'false', the search begins after @ctx->attr. 
- * - * If @ic is IGNORE_CASE, the @name comparison is not case sensitive and - * @ctx->ntfs_ino must be set to the ntfs inode to which the mft record - * @ctx->mrec belongs. This is so we can get at the ntfs volume and hence at - * the upcase table. If @ic is CASE_SENSITIVE, the comparison is case - * sensitive. When @name is present, @name_len is the @name length in Unicode - * characters. - * - * If @name is not present (NULL), we assume that the unnamed attribute is - * being searched for. - * - * Finally, the resident attribute value @val is looked for, if present. If - * @val is not present (NULL), @val_len is ignored. - * - * ntfs_attr_find() only searches the specified mft record and it ignores the - * presence of an attribute list attribute (unless it is the one being searched - * for, obviously). If you need to take attribute lists into consideration, - * use ntfs_attr_lookup() instead (see below). This also means that you cannot - * use ntfs_attr_find() to search for extent records of non-resident - * attributes, as extents with lowest_vcn != 0 are usually described by the - * attribute list attribute only. - Note that it is possible that the first - * extent is only in the attribute list while the last extent is in the base - * mft record, so do not rely on being able to find the first extent in the - * base mft record. - * - * Warning: Never use @val when looking for attribute types which can be - * non-resident as this most likely will result in a crash! - */ -static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name, - const u32 name_len, const IGNORE_CASE_BOOL ic, - const u8 *val, const u32 val_len, ntfs_attr_search_ctx *ctx) -{ - ATTR_RECORD *a; - ntfs_volume *vol = ctx->ntfs_ino->vol; - ntfschar *upcase = vol->upcase; - u32 upcase_len = vol->upcase_len; - - /* - * Iterate over attributes in mft record starting at @ctx->attr, or the - * attribute following that, if @ctx->is_first is 'true'. - */ - if (ctx->is_first) { - a = ctx->attr; - ctx->is_first = false; - } else - a = (ATTR_RECORD*)((u8*)ctx->attr + - le32_to_cpu(ctx->attr->length)); - for (;; a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) { - u8 *mrec_end = (u8 *)ctx->mrec + - le32_to_cpu(ctx->mrec->bytes_allocated); - u8 *name_end; - - /* check whether ATTR_RECORD wraps */ - if ((u8 *)a < (u8 *)ctx->mrec) - break; - - /* check whether Attribute Record Header is within bounds */ - if ((u8 *)a > mrec_end || - (u8 *)a + sizeof(ATTR_RECORD) > mrec_end) - break; - - /* check whether ATTR_RECORD's name is within bounds */ - name_end = (u8 *)a + le16_to_cpu(a->name_offset) + - a->name_length * sizeof(ntfschar); - if (name_end > mrec_end) - break; - - ctx->attr = a; - if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) || - a->type == AT_END)) - return -ENOENT; - if (unlikely(!a->length)) - break; - - /* check whether ATTR_RECORD's length wraps */ - if ((u8 *)a + le32_to_cpu(a->length) < (u8 *)a) - break; - /* check whether ATTR_RECORD's length is within bounds */ - if ((u8 *)a + le32_to_cpu(a->length) > mrec_end) - break; - - if (a->type != type) - continue; - /* - * If @name is present, compare the two names. If @name is - * missing, assume we want an unnamed attribute. - */ - if (!name) { - /* The search failed if the found attribute is named.
*/ - if (a->name_length) - return -ENOENT; - } else if (!ntfs_are_names_equal(name, name_len, - (ntfschar*)((u8*)a + le16_to_cpu(a->name_offset)), - a->name_length, ic, upcase, upcase_len)) { - register int rc; - - rc = ntfs_collate_names(name, name_len, - (ntfschar*)((u8*)a + - le16_to_cpu(a->name_offset)), - a->name_length, 1, IGNORE_CASE, - upcase, upcase_len); - /* - * If @name collates before a->name, there is no - * matching attribute. - */ - if (rc == -1) - return -ENOENT; - /* If the strings are not equal, continue search. */ - if (rc) - continue; - rc = ntfs_collate_names(name, name_len, - (ntfschar*)((u8*)a + - le16_to_cpu(a->name_offset)), - a->name_length, 1, CASE_SENSITIVE, - upcase, upcase_len); - if (rc == -1) - return -ENOENT; - if (rc) - continue; - } - /* - * The names match or @name not present and attribute is - * unnamed. If no @val specified, we have found the attribute - * and are done. - */ - if (!val) - return 0; - /* @val is present; compare values. */ - else { - register int rc; - - rc = memcmp(val, (u8*)a + le16_to_cpu( - a->data.resident.value_offset), - min_t(u32, val_len, le32_to_cpu( - a->data.resident.value_length))); - /* - * If @val collates before the current attribute's - * value, there is no matching attribute. - */ - if (!rc) { - register u32 avl; - - avl = le32_to_cpu( - a->data.resident.value_length); - if (val_len == avl) - return 0; - if (val_len < avl) - return -ENOENT; - } else if (rc < 0) - return -ENOENT; - } - } - ntfs_error(vol->sb, "Inode is corrupt. Run chkdsk."); - NVolSetErrors(vol); - return -EIO; -} - -/** - * load_attribute_list - load an attribute list into memory - * @vol: ntfs volume from which to read - * @runlist: runlist of the attribute list - * @al_start: destination buffer - * @size: size of the destination buffer in bytes - * @initialized_size: initialized size of the attribute list - * - * Walk the runlist @runlist and load all clusters from it copying them into - * the linear buffer @al. The maximum number of bytes copied to @al is @size - * bytes. Note, @size does not need to be a multiple of the cluster size. If - * @initialized_size is less than @size, the region in @al between - * @initialized_size and @size will be zeroed and not read from disk. - * - * Return 0 on success or -errno on error. - */ -int load_attribute_list(ntfs_volume *vol, runlist *runlist, u8 *al_start, - const s64 size, const s64 initialized_size) -{ - LCN lcn; - u8 *al = al_start; - u8 *al_end = al + initialized_size; - runlist_element *rl; - struct buffer_head *bh; - struct super_block *sb; - unsigned long block_size; - unsigned long block, max_block; - int err = 0; - unsigned char block_size_bits; - - ntfs_debug("Entering."); - if (!vol || !runlist || !al || size <= 0 || initialized_size < 0 || - initialized_size > size) - return -EINVAL; - if (!initialized_size) { - memset(al, 0, size); - return 0; - } - sb = vol->sb; - block_size = sb->s_blocksize; - block_size_bits = sb->s_blocksize_bits; - down_read(&runlist->lock); - rl = runlist->rl; - if (!rl) { - ntfs_error(sb, "Cannot read attribute list since runlist is " - "missing."); - goto err_out; - } - /* Read all clusters specified by the runlist one run at a time. */ - while (rl->length) { - lcn = ntfs_rl_vcn_to_lcn(rl, rl->vcn); - ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.", - (unsigned long long)rl->vcn, - (unsigned long long)lcn); - /* The attribute list cannot be sparse. */ - if (lcn < 0) { - ntfs_error(sb, "ntfs_rl_vcn_to_lcn() failed. 
Cannot " - "read attribute list."); - goto err_out; - } - block = lcn << vol->cluster_size_bits >> block_size_bits; - /* Read the run from device in chunks of block_size bytes. */ - max_block = block + (rl->length << vol->cluster_size_bits >> - block_size_bits); - ntfs_debug("max_block = 0x%lx.", max_block); - do { - ntfs_debug("Reading block = 0x%lx.", block); - bh = sb_bread(sb, block); - if (!bh) { - ntfs_error(sb, "sb_bread() failed. Cannot " - "read attribute list."); - goto err_out; - } - if (al + block_size >= al_end) - goto do_final; - memcpy(al, bh->b_data, block_size); - brelse(bh); - al += block_size; - } while (++block < max_block); - rl++; - } - if (initialized_size < size) { -initialize: - memset(al_start + initialized_size, 0, size - initialized_size); - } -done: - up_read(&runlist->lock); - return err; -do_final: - if (al < al_end) { - /* - * Partial block. - * - * Note: The attribute list can be smaller than its allocation - * by multiple clusters. This has been encountered by at least - * two people running Windows XP, thus we cannot do any - * truncation sanity checking here. (AIA) - */ - memcpy(al, bh->b_data, al_end - al); - brelse(bh); - if (initialized_size < size) - goto initialize; - goto done; - } - brelse(bh); - /* Real overflow! */ - ntfs_error(sb, "Attribute list buffer overflow. Read attribute list " - "is truncated."); -err_out: - err = -EIO; - goto done; -} - -/** - * ntfs_external_attr_find - find an attribute in the attribute list of an inode - * @type: attribute type to find - * @name: attribute name to find (optional, i.e. NULL means don't care) - * @name_len: attribute name length (only needed if @name present) - * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present) - * @lowest_vcn: lowest vcn to find (optional, non-resident attributes only) - * @val: attribute value to find (optional, resident attributes only) - * @val_len: attribute value length - * @ctx: search context with mft record and attribute to search from - * - * You should not need to call this function directly. Use ntfs_attr_lookup() - * instead. - * - * Find an attribute by searching the attribute list for the corresponding - * attribute list entry. Having found the entry, map the mft record if the - * attribute is in a different mft record/inode, ntfs_attr_find() the attribute - * in there and return it. - * - * On first search @ctx->ntfs_ino must be the base mft record and @ctx must - * have been obtained from a call to ntfs_attr_get_search_ctx(). On subsequent - * calls @ctx->ntfs_ino can be any extent inode, too (@ctx->base_ntfs_ino is - * then the base inode). - * - * After finishing with the attribute/mft record you need to call - * ntfs_attr_put_search_ctx() to cleanup the search context (unmapping any - * mapped inodes, etc). - * - * If the attribute is found, ntfs_external_attr_find() returns 0 and - * @ctx->attr will point to the found attribute. @ctx->mrec will point to the - * mft record in which @ctx->attr is located and @ctx->al_entry will point to - * the attribute list entry for the attribute. - * - * If the attribute is not found, ntfs_external_attr_find() returns -ENOENT and - * @ctx->attr will point to the attribute in the base mft record before which - * the attribute being searched for would need to be inserted if such an action - * were to be desired. 
@ctx->mrec will point to the mft record in which - * @ctx->attr is located and @ctx->al_entry will point to the attribute list - * entry of the attribute before which the attribute being searched for would - * need to be inserted if such an action were to be desired. - * - * Thus to insert the not found attribute, one wants to add the attribute to - * @ctx->mrec (the base mft record) and if there is not enough space, the - * attribute should be placed in a newly allocated extent mft record. The - * attribute list entry for the inserted attribute should be inserted in the - * attribute list attribute at @ctx->al_entry. - * - * On actual error, ntfs_external_attr_find() returns -EIO. In this case - * @ctx->attr is undefined and in particular do not rely on it not changing. - */ -static int ntfs_external_attr_find(const ATTR_TYPE type, - const ntfschar *name, const u32 name_len, - const IGNORE_CASE_BOOL ic, const VCN lowest_vcn, - const u8 *val, const u32 val_len, ntfs_attr_search_ctx *ctx) -{ - ntfs_inode *base_ni, *ni; - ntfs_volume *vol; - ATTR_LIST_ENTRY *al_entry, *next_al_entry; - u8 *al_start, *al_end; - ATTR_RECORD *a; - ntfschar *al_name; - u32 al_name_len; - int err = 0; - static const char *es = " Unmount and run chkdsk."; - - ni = ctx->ntfs_ino; - base_ni = ctx->base_ntfs_ino; - ntfs_debug("Entering for inode 0x%lx, type 0x%x.", ni->mft_no, type); - if (!base_ni) { - /* First call happens with the base mft record. */ - base_ni = ctx->base_ntfs_ino = ctx->ntfs_ino; - ctx->base_mrec = ctx->mrec; - } - if (ni == base_ni) - ctx->base_attr = ctx->attr; - if (type == AT_END) - goto not_found; - vol = base_ni->vol; - al_start = base_ni->attr_list; - al_end = al_start + base_ni->attr_list_size; - if (!ctx->al_entry) - ctx->al_entry = (ATTR_LIST_ENTRY*)al_start; - /* - * Iterate over entries in attribute list starting at @ctx->al_entry, - * or the entry following that, if @ctx->is_first is 'true'. - */ - if (ctx->is_first) { - al_entry = ctx->al_entry; - ctx->is_first = false; - } else - al_entry = (ATTR_LIST_ENTRY*)((u8*)ctx->al_entry + - le16_to_cpu(ctx->al_entry->length)); - for (;; al_entry = next_al_entry) { - /* Out of bounds check. */ - if ((u8*)al_entry < base_ni->attr_list || - (u8*)al_entry > al_end) - break; /* Inode is corrupt. */ - ctx->al_entry = al_entry; - /* Catch the end of the attribute list. */ - if ((u8*)al_entry == al_end) - goto not_found; - if (!al_entry->length) - break; - if ((u8*)al_entry + 6 > al_end || (u8*)al_entry + - le16_to_cpu(al_entry->length) > al_end) - break; - next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry + - le16_to_cpu(al_entry->length)); - if (le32_to_cpu(al_entry->type) > le32_to_cpu(type)) - goto not_found; - if (type != al_entry->type) - continue; - /* - * If @name is present, compare the two names. If @name is - * missing, assume we want an unnamed attribute. - */ - al_name_len = al_entry->name_length; - al_name = (ntfschar*)((u8*)al_entry + al_entry->name_offset); - if (!name) { - if (al_name_len) - goto not_found; - } else if (!ntfs_are_names_equal(al_name, al_name_len, name, - name_len, ic, vol->upcase, vol->upcase_len)) { - register int rc; - - rc = ntfs_collate_names(name, name_len, al_name, - al_name_len, 1, IGNORE_CASE, - vol->upcase, vol->upcase_len); - /* - * If @name collates before al_name, there is no - * matching attribute. - */ - if (rc == -1) - goto not_found; - /* If the strings are not equal, continue search. 
*/ - if (rc) - continue; - /* - * FIXME: Reverse engineering showed 0, IGNORE_CASE but - * that is inconsistent with ntfs_attr_find(). The - * subsequent rc checks were also different. Perhaps I - * made a mistake in one of the two. Need to recheck - * which is correct or at least see what is going on... - * (AIA) - */ - rc = ntfs_collate_names(name, name_len, al_name, - al_name_len, 1, CASE_SENSITIVE, - vol->upcase, vol->upcase_len); - if (rc == -1) - goto not_found; - if (rc) - continue; - } - /* - * The names match or @name not present and attribute is - * unnamed. Now check @lowest_vcn. Continue search if the - * next attribute list entry still fits @lowest_vcn. Otherwise - * we have reached the right one or the search has failed. - */ - if (lowest_vcn && (u8*)next_al_entry >= al_start && - (u8*)next_al_entry + 6 < al_end && - (u8*)next_al_entry + le16_to_cpu( - next_al_entry->length) <= al_end && - sle64_to_cpu(next_al_entry->lowest_vcn) <= - lowest_vcn && - next_al_entry->type == al_entry->type && - next_al_entry->name_length == al_name_len && - ntfs_are_names_equal((ntfschar*)((u8*) - next_al_entry + - next_al_entry->name_offset), - next_al_entry->name_length, - al_name, al_name_len, CASE_SENSITIVE, - vol->upcase, vol->upcase_len)) - continue; - if (MREF_LE(al_entry->mft_reference) == ni->mft_no) { - if (MSEQNO_LE(al_entry->mft_reference) != ni->seq_no) { - ntfs_error(vol->sb, "Found stale mft " - "reference in attribute list " - "of base inode 0x%lx.%s", - base_ni->mft_no, es); - err = -EIO; - break; - } - } else { /* Mft references do not match. */ - /* If there is a mapped record unmap it first. */ - if (ni != base_ni) - unmap_extent_mft_record(ni); - /* Do we want the base record back? */ - if (MREF_LE(al_entry->mft_reference) == - base_ni->mft_no) { - ni = ctx->ntfs_ino = base_ni; - ctx->mrec = ctx->base_mrec; - } else { - /* We want an extent record. */ - ctx->mrec = map_extent_mft_record(base_ni, - le64_to_cpu( - al_entry->mft_reference), &ni); - if (IS_ERR(ctx->mrec)) { - ntfs_error(vol->sb, "Failed to map " - "extent mft record " - "0x%lx of base inode " - "0x%lx.%s", - MREF_LE(al_entry-> - mft_reference), - base_ni->mft_no, es); - err = PTR_ERR(ctx->mrec); - if (err == -ENOENT) - err = -EIO; - /* Cause @ctx to be sanitized below. */ - ni = NULL; - break; - } - ctx->ntfs_ino = ni; - } - ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec + - le16_to_cpu(ctx->mrec->attrs_offset)); - } - /* - * ctx->vfs_ino, ctx->mrec, and ctx->attr now point to the - * mft record containing the attribute represented by the - * current al_entry. - */ - /* - * We could call into ntfs_attr_find() to find the right - * attribute in this mft record but this would be less - * efficient and not quite accurate as ntfs_attr_find() ignores - * the attribute instance numbers for example which become - * important when one plays with attribute lists. Also, - * because a proper match has been found in the attribute list - * entry above, the comparison can now be optimized. So it is - * worth re-implementing a simplified ntfs_attr_find() here. - */ - a = ctx->attr; - /* - * Use a manual loop so we can still use break and continue - * with the same meanings as above. 
- */ -do_next_attr_loop: - if ((u8*)a < (u8*)ctx->mrec || (u8*)a > (u8*)ctx->mrec + - le32_to_cpu(ctx->mrec->bytes_allocated)) - break; - if (a->type == AT_END) - break; - if (!a->length) - break; - if (al_entry->instance != a->instance) - goto do_next_attr; - /* - * If the type and/or the name are mismatched between the - * attribute list entry and the attribute record, there is - * corruption so we break and return error EIO. - */ - if (al_entry->type != a->type) - break; - if (!ntfs_are_names_equal((ntfschar*)((u8*)a + - le16_to_cpu(a->name_offset)), a->name_length, - al_name, al_name_len, CASE_SENSITIVE, - vol->upcase, vol->upcase_len)) - break; - ctx->attr = a; - /* - * If no @val specified or @val specified and it matches, we - * have found it! - */ - if (!val || (!a->non_resident && le32_to_cpu( - a->data.resident.value_length) == val_len && - !memcmp((u8*)a + - le16_to_cpu(a->data.resident.value_offset), - val, val_len))) { - ntfs_debug("Done, found."); - return 0; - } -do_next_attr: - /* Proceed to the next attribute in the current mft record. */ - a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length)); - goto do_next_attr_loop; - } - if (!err) { - ntfs_error(vol->sb, "Base inode 0x%lx contains corrupt " - "attribute list attribute.%s", base_ni->mft_no, - es); - err = -EIO; - } - if (ni != base_ni) { - if (ni) - unmap_extent_mft_record(ni); - ctx->ntfs_ino = base_ni; - ctx->mrec = ctx->base_mrec; - ctx->attr = ctx->base_attr; - } - if (err != -ENOMEM) - NVolSetErrors(vol); - return err; -not_found: - /* - * If we were looking for AT_END, we reset the search context @ctx and - * use ntfs_attr_find() to seek to the end of the base mft record. - */ - if (type == AT_END) { - ntfs_attr_reinit_search_ctx(ctx); - return ntfs_attr_find(AT_END, name, name_len, ic, val, val_len, - ctx); - } - /* - * The attribute was not found. Before we return, we want to ensure - * @ctx->mrec and @ctx->attr indicate the position at which the - * attribute should be inserted in the base mft record. Since we also - * want to preserve @ctx->al_entry we cannot reinitialize the search - * context using ntfs_attr_reinit_search_ctx() as this would set - * @ctx->al_entry to NULL. Thus we do the necessary bits manually (see - * ntfs_attr_init_search_ctx() below). Note, we _only_ preserve - * @ctx->al_entry as the remaining fields (base_*) are identical to - * their non base_ counterparts and we cannot set @ctx->base_attr - * correctly yet as we do not know what @ctx->attr will be set to by - * the call to ntfs_attr_find() below. - */ - if (ni != base_ni) - unmap_extent_mft_record(ni); - ctx->mrec = ctx->base_mrec; - ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec + - le16_to_cpu(ctx->mrec->attrs_offset)); - ctx->is_first = true; - ctx->ntfs_ino = base_ni; - ctx->base_ntfs_ino = NULL; - ctx->base_mrec = NULL; - ctx->base_attr = NULL; - /* - * In case there are multiple matches in the base mft record, need to - * keep enumerating until we get an attribute not found response (or - * another error), otherwise we would keep returning the same attribute - * over and over again and all programs using us for enumeration would - * lock up in a tight loop. - */ - do { - err = ntfs_attr_find(type, name, name_len, ic, val, val_len, - ctx); - } while (!err); - ntfs_debug("Done, not found."); - return err; -} - -/** - * ntfs_attr_lookup - find an attribute in an ntfs inode - * @type: attribute type to find - * @name: attribute name to find (optional, i.e. 
NULL means don't care) - * @name_len: attribute name length (only needed if @name present) - * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present) - * @lowest_vcn: lowest vcn to find (optional, non-resident attributes only) - * @val: attribute value to find (optional, resident attributes only) - * @val_len: attribute value length - * @ctx: search context with mft record and attribute to search from - * - * Find an attribute in an ntfs inode. On first search @ctx->ntfs_ino must - * be the base mft record and @ctx must have been obtained from a call to - * ntfs_attr_get_search_ctx(). - * - * This function transparently handles attribute lists and @ctx is used to - * continue searches where they were left off at. - * - * After finishing with the attribute/mft record you need to call - * ntfs_attr_put_search_ctx() to cleanup the search context (unmapping any - * mapped inodes, etc). - * - * Return 0 if the search was successful and -errno if not. - * - * When 0, @ctx->attr is the found attribute and it is in mft record - * @ctx->mrec. If an attribute list attribute is present, @ctx->al_entry is - * the attribute list entry of the found attribute. - * - * When -ENOENT, @ctx->attr is the attribute which collates just after the - * attribute being searched for, i.e. if one wants to add the attribute to the - * mft record this is the correct place to insert it into. If an attribute - * list attribute is present, @ctx->al_entry is the attribute list entry which - * collates just after the attribute list entry of the attribute being searched - * for, i.e. if one wants to add the attribute to the mft record this is the - * correct place to insert its attribute list entry into. - * - * When -errno != -ENOENT, an error occurred during the lookup. @ctx->attr is - * then undefined and in particular you should not rely on it not changing. - */ -int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name, - const u32 name_len, const IGNORE_CASE_BOOL ic, - const VCN lowest_vcn, const u8 *val, const u32 val_len, - ntfs_attr_search_ctx *ctx) -{ - ntfs_inode *base_ni; - - ntfs_debug("Entering."); - BUG_ON(IS_ERR(ctx->mrec)); - if (ctx->base_ntfs_ino) - base_ni = ctx->base_ntfs_ino; - else - base_ni = ctx->ntfs_ino; - /* Sanity check, just for debugging really. */ - BUG_ON(!base_ni); - if (!NInoAttrList(base_ni) || type == AT_ATTRIBUTE_LIST) - return ntfs_attr_find(type, name, name_len, ic, val, val_len, - ctx); - return ntfs_external_attr_find(type, name, name_len, ic, lowest_vcn, - val, val_len, ctx); -} - -/** - * ntfs_attr_init_search_ctx - initialize an attribute search context - * @ctx: attribute search context to initialize - * @ni: ntfs inode with which to initialize the search context - * @mrec: mft record with which to initialize the search context - * - * Initialize the attribute search context @ctx with @ni and @mrec. - */ -static inline void ntfs_attr_init_search_ctx(ntfs_attr_search_ctx *ctx, - ntfs_inode *ni, MFT_RECORD *mrec) -{ - *ctx = (ntfs_attr_search_ctx) { - .mrec = mrec, - /* Sanity checks are performed elsewhere. */ - .attr = (ATTR_RECORD*)((u8*)mrec + - le16_to_cpu(mrec->attrs_offset)), - .is_first = true, - .ntfs_ino = ni, - }; -} - -/** - * ntfs_attr_reinit_search_ctx - reinitialize an attribute search context - * @ctx: attribute search context to reinitialize - * - * Reinitialize the attribute search context @ctx, unmapping an associated - * extent mft record if present, and initialize the search context again. 
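Tying the search context functions together, a hypothetical lookup of an attribute in a base inode follows the lifecycle documented above (process_attribute() is a placeholder; compare the resident write path in ntfs_writepage() earlier):

	MFT_RECORD *m = map_mft_record(base_ni);
	ntfs_attr_search_ctx *ctx;
	int err;

	if (IS_ERR(m))
		return PTR_ERR(m);
	ctx = ntfs_attr_get_search_ctx(base_ni, m);
	if (unlikely(!ctx)) {
		unmap_mft_record(base_ni);
		return -ENOMEM;
	}
	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
			CASE_SENSITIVE, 0, NULL, 0, ctx);
	if (!err)
		process_attribute(ctx->attr);	/* Attribute lives in ctx->mrec. */
	ntfs_attr_put_search_ctx(ctx);
	unmap_mft_record(base_ni);
	return err;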
- * - * This is used when a search for a new attribute is being started to reset - * the search context to the beginning. - */ -void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx) -{ - if (likely(!ctx->base_ntfs_ino)) { - /* No attribute list. */ - ctx->is_first = true; - /* Sanity checks are performed elsewhere. */ - ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec + - le16_to_cpu(ctx->mrec->attrs_offset)); - /* - * This needs resetting due to ntfs_external_attr_find() which - * can leave it set despite having zeroed ctx->base_ntfs_ino. - */ - ctx->al_entry = NULL; - return; - } /* Attribute list. */ - if (ctx->ntfs_ino != ctx->base_ntfs_ino) - unmap_extent_mft_record(ctx->ntfs_ino); - ntfs_attr_init_search_ctx(ctx, ctx->base_ntfs_ino, ctx->base_mrec); - return; -} - -/** - * ntfs_attr_get_search_ctx - allocate/initialize a new attribute search context - * @ni: ntfs inode with which to initialize the search context - * @mrec: mft record with which to initialize the search context - * - * Allocate a new attribute search context, initialize it with @ni and @mrec, - * and return it. Return NULL if allocation failed. - */ -ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni, MFT_RECORD *mrec) -{ - ntfs_attr_search_ctx *ctx; - - ctx = kmem_cache_alloc(ntfs_attr_ctx_cache, GFP_NOFS); - if (ctx) - ntfs_attr_init_search_ctx(ctx, ni, mrec); - return ctx; -} - -/** - * ntfs_attr_put_search_ctx - release an attribute search context - * @ctx: attribute search context to free - * - * Release the attribute search context @ctx, unmapping an associated extent - * mft record if present. - */ -void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx) -{ - if (ctx->base_ntfs_ino && ctx->ntfs_ino != ctx->base_ntfs_ino) - unmap_extent_mft_record(ctx->ntfs_ino); - kmem_cache_free(ntfs_attr_ctx_cache, ctx); - return; -} - -#ifdef NTFS_RW - -/** - * ntfs_attr_find_in_attrdef - find an attribute in the $AttrDef system file - * @vol: ntfs volume to which the attribute belongs - * @type: attribute type which to find - * - * Search for the attribute definition record corresponding to the attribute - * @type in the $AttrDef system file. - * - * Return the attribute type definition record if found and NULL if not found. - */ -static ATTR_DEF *ntfs_attr_find_in_attrdef(const ntfs_volume *vol, - const ATTR_TYPE type) -{ - ATTR_DEF *ad; - - BUG_ON(!vol->attrdef); - BUG_ON(!type); - for (ad = vol->attrdef; (u8*)ad - (u8*)vol->attrdef < - vol->attrdef_size && ad->type; ++ad) { - /* We have not found it yet, carry on searching. */ - if (likely(le32_to_cpu(ad->type) < le32_to_cpu(type))) - continue; - /* We found the attribute; return it. */ - if (likely(ad->type == type)) - return ad; - /* We have gone too far already. No point in continuing. */ - break; - } - /* Attribute not found. */ - ntfs_debug("Attribute type 0x%x not found in $AttrDef.", - le32_to_cpu(type)); - return NULL; -} - -/** - * ntfs_attr_size_bounds_check - check a size of an attribute type for validity - * @vol: ntfs volume to which the attribute belongs - * @type: attribute type which to check - * @size: size which to check - * - * Check whether the @size in bytes is valid for an attribute of @type on the - * ntfs volume @vol. This information is obtained from $AttrDef system file. - * - * Return 0 if valid, -ERANGE if not valid, or -ENOENT if the attribute is not - * listed in $AttrDef. 
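 *
 * An illustrative caller sketch (not from the original code; it
 * mirrors the error translation done by ntfs_attr_extend_allocation()
 * below):
 *
 *	err = ntfs_attr_size_bounds_check(vol, ni->type, new_size);
 *	if (err == -ERANGE)
 *		err = -EFBIG;	/* larger than $AttrDef permits */
 *	else if (err == -ENOENT)
 *		err = -EIO;	/* type not in $AttrDef: corruption */
 *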
- */
-int ntfs_attr_size_bounds_check(const ntfs_volume *vol, const ATTR_TYPE type,
-		const s64 size)
-{
-	ATTR_DEF *ad;
-
-	BUG_ON(size < 0);
-	/*
-	 * $ATTRIBUTE_LIST has a maximum size of 256kiB, but this is not
-	 * listed in $AttrDef.
-	 */
-	if (unlikely(type == AT_ATTRIBUTE_LIST && size > 256 * 1024))
-		return -ERANGE;
-	/* Get the $AttrDef entry for the attribute @type. */
-	ad = ntfs_attr_find_in_attrdef(vol, type);
-	if (unlikely(!ad))
-		return -ENOENT;
-	/* Do the bounds check. */
-	if (((sle64_to_cpu(ad->min_size) > 0) &&
-			size < sle64_to_cpu(ad->min_size)) ||
-			((sle64_to_cpu(ad->max_size) > 0) && size >
-			sle64_to_cpu(ad->max_size)))
-		return -ERANGE;
-	return 0;
-}
-
-/**
- * ntfs_attr_can_be_non_resident - check if an attribute can be non-resident
- * @vol:	ntfs volume to which the attribute belongs
- * @type:	attribute type which to check
- *
- * Check whether the attribute of @type on the ntfs volume @vol is allowed to
- * be non-resident. This information is obtained from $AttrDef system file.
- *
- * Return 0 if the attribute is allowed to be non-resident, -EPERM if not, and
- * -ENOENT if the attribute is not listed in $AttrDef.
- */
-int ntfs_attr_can_be_non_resident(const ntfs_volume *vol, const ATTR_TYPE type)
-{
-	ATTR_DEF *ad;
-
-	/* Find the attribute definition record in $AttrDef. */
-	ad = ntfs_attr_find_in_attrdef(vol, type);
-	if (unlikely(!ad))
-		return -ENOENT;
-	/* Check the flags and return the result. */
-	if (ad->flags & ATTR_DEF_RESIDENT)
-		return -EPERM;
-	return 0;
-}
-
-/**
- * ntfs_attr_can_be_resident - check if an attribute can be resident
- * @vol:	ntfs volume to which the attribute belongs
- * @type:	attribute type which to check
- *
- * Check whether the attribute of @type on the ntfs volume @vol is allowed to
- * be resident. This information is derived from our ntfs knowledge and may
- * not be completely accurate, especially when user defined attributes are
- * present. Basically we allow everything to be resident except for index
- * allocation and $EA attributes.
- *
- * Return 0 if the attribute is allowed to be resident and -EPERM if not.
- *
- * Warning: In the system file $MFT the attribute $Bitmap must be non-resident
- *	    otherwise windows will not boot (blue screen of death)! We cannot
- *	    check for this here as we do not know which inode's $Bitmap is
- *	    being asked about so the caller needs to special case this.
- */
-int ntfs_attr_can_be_resident(const ntfs_volume *vol, const ATTR_TYPE type)
-{
-	if (type == AT_INDEX_ALLOCATION)
-		return -EPERM;
-	return 0;
-}
-
-/**
- * ntfs_attr_record_resize - resize an attribute record
- * @m:		mft record containing attribute record
- * @a:		attribute record to resize
- * @new_size:	new size in bytes to which to resize the attribute record @a
- *
- * Resize the attribute record @a, i.e. the resident part of the attribute, in
- * the mft record @m to @new_size bytes.
- *
- * Return 0 on success and -errno on error. The following error codes are
- * defined:
- *	-ENOSPC	- Not enough space in the mft record @m to perform the resize.
- *
- * Note: On error, no modifications have been performed whatsoever.
- *
- * Warning: If you make a record smaller without having copied all the data
- *	    you are interested in, the data may be overwritten.
- */
-int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size)
-{
-	ntfs_debug("Entering for new_size %u.", new_size);
-	/* Align to 8 bytes if it is not already done. */
-	if (new_size & 7)
-		new_size = (new_size + 7) & ~7;
-	/* If the actual attribute length has changed, move things around. */
-	if (new_size != le32_to_cpu(a->length)) {
-		u32 new_muse = le32_to_cpu(m->bytes_in_use) -
-				le32_to_cpu(a->length) + new_size;
-		/* Not enough space in this mft record. */
-		if (new_muse > le32_to_cpu(m->bytes_allocated))
-			return -ENOSPC;
-		/* Move attributes following @a to their new location. */
-		memmove((u8*)a + new_size, (u8*)a + le32_to_cpu(a->length),
-				le32_to_cpu(m->bytes_in_use) - ((u8*)a -
-				(u8*)m) - le32_to_cpu(a->length));
-		/* Adjust @m to reflect the change in used space. */
-		m->bytes_in_use = cpu_to_le32(new_muse);
-		/* Adjust @a to reflect the new size. */
-		if (new_size >= offsetof(ATTR_REC, length) + sizeof(a->length))
-			a->length = cpu_to_le32(new_size);
-	}
-	return 0;
-}
-
-/**
- * ntfs_resident_attr_value_resize - resize the value of a resident attribute
- * @m:		mft record containing attribute record
- * @a:		attribute record whose value to resize
- * @new_size:	new size in bytes to which to resize the attribute value of @a
- *
- * Resize the value of the attribute @a in the mft record @m to @new_size
- * bytes. If the value is made bigger, the newly allocated space is cleared.
- *
- * Return 0 on success and -errno on error. The following error codes are
- * defined:
- *	-ENOSPC	- Not enough space in the mft record @m to perform the resize.
- *
- * Note: On error, no modifications have been performed whatsoever.
- *
- * Warning: If you make a record smaller without having copied all the data
- *	    you are interested in, the data may be overwritten.
- */
-int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a,
-		const u32 new_size)
-{
-	u32 old_size;
-
-	/* Resize the resident part of the attribute record. */
-	if (ntfs_attr_record_resize(m, a,
-			le16_to_cpu(a->data.resident.value_offset) + new_size))
-		return -ENOSPC;
-	/*
-	 * The resize succeeded! If we made the attribute value bigger, clear
-	 * the area between the old size and @new_size.
-	 */
-	old_size = le32_to_cpu(a->data.resident.value_length);
-	if (new_size > old_size)
-		memset((u8*)a + le16_to_cpu(a->data.resident.value_offset) +
-				old_size, 0, new_size - old_size);
-	/* Finally update the length of the attribute value. */
-	a->data.resident.value_length = cpu_to_le32(new_size);
-	return 0;
-}
-
-/**
- * ntfs_attr_make_non_resident - convert a resident to a non-resident attribute
- * @ni:		ntfs inode describing the attribute to convert
- * @data_size:	size of the resident data to copy to the non-resident attribute
- *
- * Convert the resident ntfs attribute described by the ntfs inode @ni to a
- * non-resident one.
- *
- * @data_size must be equal to the attribute value size. This is needed since
- * we need to know the size before we can map the mft record and our callers
- * always know it. The reason we cannot simply read the size from the vfs
- * inode i_size is that this is not necessarily uptodate. This happens when
- * ntfs_attr_make_non_resident() is called in the ->truncate call path(s).
- *
- * Return 0 on success and -errno on error. The following error return codes
- * are defined:
- *	-EPERM	- The attribute is not allowed to be non-resident.
- *	-ENOMEM	- Not enough memory.
- *	-ENOSPC	- Not enough disk space.
- *	-EINVAL	- Attribute not defined on the volume.
- *	-EIO	- I/O error or other error.
- * Note that -ENOSPC is also returned in the case that there is not enough
- * space in the mft record to do the conversion.
This can happen when the mft - * record is already very full. The caller is responsible for trying to make - * space in the mft record and trying again. FIXME: Do we need a separate - * error return code for this kind of -ENOSPC or is it always worth trying - * again in case the attribute may then fit in a resident state so no need to - * make it non-resident at all? Ho-hum... (AIA) - * - * NOTE to self: No changes in the attribute list are required to move from - * a resident to a non-resident attribute. - * - * Locking: - The caller must hold i_mutex on the inode. - */ -int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size) -{ - s64 new_size; - struct inode *vi = VFS_I(ni); - ntfs_volume *vol = ni->vol; - ntfs_inode *base_ni; - MFT_RECORD *m; - ATTR_RECORD *a; - ntfs_attr_search_ctx *ctx; - struct page *page; - runlist_element *rl; - u8 *kaddr; - unsigned long flags; - int mp_size, mp_ofs, name_ofs, arec_size, err, err2; - u32 attr_size; - u8 old_res_attr_flags; - - /* Check that the attribute is allowed to be non-resident. */ - err = ntfs_attr_can_be_non_resident(vol, ni->type); - if (unlikely(err)) { - if (err == -EPERM) - ntfs_debug("Attribute is not allowed to be " - "non-resident."); - else - ntfs_debug("Attribute not defined on the NTFS " - "volume!"); - return err; - } - /* - * FIXME: Compressed and encrypted attributes are not supported when - * writing and we should never have gotten here for them. - */ - BUG_ON(NInoCompressed(ni)); - BUG_ON(NInoEncrypted(ni)); - /* - * The size needs to be aligned to a cluster boundary for allocation - * purposes. - */ - new_size = (data_size + vol->cluster_size - 1) & - ~(vol->cluster_size - 1); - if (new_size > 0) { - /* - * Will need the page later and since the page lock nests - * outside all ntfs locks, we need to get the page now. - */ - page = find_or_create_page(vi->i_mapping, 0, - mapping_gfp_mask(vi->i_mapping)); - if (unlikely(!page)) - return -ENOMEM; - /* Start by allocating clusters to hold the attribute value. */ - rl = ntfs_cluster_alloc(vol, 0, new_size >> - vol->cluster_size_bits, -1, DATA_ZONE, true); - if (IS_ERR(rl)) { - err = PTR_ERR(rl); - ntfs_debug("Failed to allocate cluster%s, error code " - "%i.", (new_size >> - vol->cluster_size_bits) > 1 ? "s" : "", - err); - goto page_err_out; - } - } else { - rl = NULL; - page = NULL; - } - /* Determine the size of the mapping pairs array. */ - mp_size = ntfs_get_size_for_mapping_pairs(vol, rl, 0, -1); - if (unlikely(mp_size < 0)) { - err = mp_size; - ntfs_debug("Failed to get size for mapping pairs array, error " - "code %i.", err); - goto rl_err_out; - } - down_write(&ni->runlist.lock); - if (!NInoAttr(ni)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - ctx = NULL; - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - err = -EIO; - goto err_out; - } - m = ctx->mrec; - a = ctx->attr; - BUG_ON(NInoNonResident(ni)); - BUG_ON(a->non_resident); - /* - * Calculate new offsets for the name and the mapping pairs array. 
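	 *
	 * Note: the "(x + 7) & ~7" expressions below round x up to the
	 * next multiple of 8. Worked example (not in the original
	 * comment): for x = 66, 66 + 7 = 73 and 73 & ~7 = 72, the next
	 * 8-byte boundary.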
- */ - if (NInoSparse(ni) || NInoCompressed(ni)) - name_ofs = (offsetof(ATTR_REC, - data.non_resident.compressed_size) + - sizeof(a->data.non_resident.compressed_size) + - 7) & ~7; - else - name_ofs = (offsetof(ATTR_REC, - data.non_resident.compressed_size) + 7) & ~7; - mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7; - /* - * Determine the size of the resident part of the now non-resident - * attribute record. - */ - arec_size = (mp_ofs + mp_size + 7) & ~7; - /* - * If the page is not uptodate bring it uptodate by copying from the - * attribute value. - */ - attr_size = le32_to_cpu(a->data.resident.value_length); - BUG_ON(attr_size != data_size); - if (page && !PageUptodate(page)) { - kaddr = kmap_atomic(page); - memcpy(kaddr, (u8*)a + - le16_to_cpu(a->data.resident.value_offset), - attr_size); - memset(kaddr + attr_size, 0, PAGE_SIZE - attr_size); - kunmap_atomic(kaddr); - flush_dcache_page(page); - SetPageUptodate(page); - } - /* Backup the attribute flag. */ - old_res_attr_flags = a->data.resident.flags; - /* Resize the resident part of the attribute record. */ - err = ntfs_attr_record_resize(m, a, arec_size); - if (unlikely(err)) - goto err_out; - /* - * Convert the resident part of the attribute record to describe a - * non-resident attribute. - */ - a->non_resident = 1; - /* Move the attribute name if it exists and update the offset. */ - if (a->name_length) - memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset), - a->name_length * sizeof(ntfschar)); - a->name_offset = cpu_to_le16(name_ofs); - /* Setup the fields specific to non-resident attributes. */ - a->data.non_resident.lowest_vcn = 0; - a->data.non_resident.highest_vcn = cpu_to_sle64((new_size - 1) >> - vol->cluster_size_bits); - a->data.non_resident.mapping_pairs_offset = cpu_to_le16(mp_ofs); - memset(&a->data.non_resident.reserved, 0, - sizeof(a->data.non_resident.reserved)); - a->data.non_resident.allocated_size = cpu_to_sle64(new_size); - a->data.non_resident.data_size = - a->data.non_resident.initialized_size = - cpu_to_sle64(attr_size); - if (NInoSparse(ni) || NInoCompressed(ni)) { - a->data.non_resident.compression_unit = 0; - if (NInoCompressed(ni) || vol->major_ver < 3) - a->data.non_resident.compression_unit = 4; - a->data.non_resident.compressed_size = - a->data.non_resident.allocated_size; - } else - a->data.non_resident.compression_unit = 0; - /* Generate the mapping pairs array into the attribute record. */ - err = ntfs_mapping_pairs_build(vol, (u8*)a + mp_ofs, - arec_size - mp_ofs, rl, 0, -1, NULL); - if (unlikely(err)) { - ntfs_debug("Failed to build mapping pairs, error code %i.", - err); - goto undo_err_out; - } - /* Setup the in-memory attribute structure to be non-resident. */ - ni->runlist.rl = rl; - write_lock_irqsave(&ni->size_lock, flags); - ni->allocated_size = new_size; - if (NInoSparse(ni) || NInoCompressed(ni)) { - ni->itype.compressed.size = ni->allocated_size; - if (a->data.non_resident.compression_unit) { - ni->itype.compressed.block_size = 1U << (a->data. 
- non_resident.compression_unit + - vol->cluster_size_bits); - ni->itype.compressed.block_size_bits = - ffs(ni->itype.compressed.block_size) - - 1; - ni->itype.compressed.block_clusters = 1U << - a->data.non_resident.compression_unit; - } else { - ni->itype.compressed.block_size = 0; - ni->itype.compressed.block_size_bits = 0; - ni->itype.compressed.block_clusters = 0; - } - vi->i_blocks = ni->itype.compressed.size >> 9; - } else - vi->i_blocks = ni->allocated_size >> 9; - write_unlock_irqrestore(&ni->size_lock, flags); - /* - * This needs to be last since the address space operations ->read_folio - * and ->writepage can run concurrently with us as they are not - * serialized on i_mutex. Note, we are not allowed to fail once we flip - * this switch, which is another reason to do this last. - */ - NInoSetNonResident(ni); - /* Mark the mft record dirty, so it gets written back. */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - up_write(&ni->runlist.lock); - if (page) { - set_page_dirty(page); - unlock_page(page); - put_page(page); - } - ntfs_debug("Done."); - return 0; -undo_err_out: - /* Convert the attribute back into a resident attribute. */ - a->non_resident = 0; - /* Move the attribute name if it exists and update the offset. */ - name_ofs = (offsetof(ATTR_RECORD, data.resident.reserved) + - sizeof(a->data.resident.reserved) + 7) & ~7; - if (a->name_length) - memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset), - a->name_length * sizeof(ntfschar)); - mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7; - a->name_offset = cpu_to_le16(name_ofs); - arec_size = (mp_ofs + attr_size + 7) & ~7; - /* Resize the resident part of the attribute record. */ - err2 = ntfs_attr_record_resize(m, a, arec_size); - if (unlikely(err2)) { - /* - * This cannot happen (well if memory corruption is at work it - * could happen in theory), but deal with it as well as we can. - * If the old size is too small, truncate the attribute, - * otherwise simply give it a larger allocated size. - * FIXME: Should check whether chkdsk complains when the - * allocated size is much bigger than the resident value size. - */ - arec_size = le32_to_cpu(a->length); - if ((mp_ofs + attr_size) > arec_size) { - err2 = attr_size; - attr_size = arec_size - mp_ofs; - ntfs_error(vol->sb, "Failed to undo partial resident " - "to non-resident attribute " - "conversion. Truncating inode 0x%lx, " - "attribute type 0x%x from %i bytes to " - "%i bytes to maintain metadata " - "consistency. THIS MEANS YOU ARE " - "LOSING %i BYTES DATA FROM THIS %s.", - vi->i_ino, - (unsigned)le32_to_cpu(ni->type), - err2, attr_size, err2 - attr_size, - ((ni->type == AT_DATA) && - !ni->name_len) ? "FILE": "ATTRIBUTE"); - write_lock_irqsave(&ni->size_lock, flags); - ni->initialized_size = attr_size; - i_size_write(vi, attr_size); - write_unlock_irqrestore(&ni->size_lock, flags); - } - } - /* Setup the fields specific to resident attributes. */ - a->data.resident.value_length = cpu_to_le32(attr_size); - a->data.resident.value_offset = cpu_to_le16(mp_ofs); - a->data.resident.flags = old_res_attr_flags; - memset(&a->data.resident.reserved, 0, - sizeof(a->data.resident.reserved)); - /* Copy the data from the page back to the attribute value. */ - if (page) { - kaddr = kmap_atomic(page); - memcpy((u8*)a + mp_ofs, kaddr, attr_size); - kunmap_atomic(kaddr); - } - /* Setup the allocated size in the ntfs inode in case it changed. 
*/
-	write_lock_irqsave(&ni->size_lock, flags);
-	ni->allocated_size = arec_size - mp_ofs;
-	write_unlock_irqrestore(&ni->size_lock, flags);
-	/* Mark the mft record dirty, so it gets written back. */
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	mark_mft_record_dirty(ctx->ntfs_ino);
-err_out:
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (m)
-		unmap_mft_record(base_ni);
-	ni->runlist.rl = NULL;
-	up_write(&ni->runlist.lock);
-rl_err_out:
-	if (rl) {
-		if (ntfs_cluster_free_from_rl(vol, rl) < 0) {
-			ntfs_error(vol->sb, "Failed to release allocated "
-					"cluster(s) in error code path. Run "
-					"chkdsk to recover the lost "
-					"cluster(s).");
-			NVolSetErrors(vol);
-		}
-		ntfs_free(rl);
-page_err_out:
-		unlock_page(page);
-		put_page(page);
-	}
-	if (err == -EINVAL)
-		err = -EIO;
-	return err;
-}
-
-/**
- * ntfs_attr_extend_allocation - extend the allocated space of an attribute
- * @ni:			ntfs inode of the attribute whose allocation to extend
- * @new_alloc_size:	new size in bytes to which to extend the allocation
- * @new_data_size:	new size in bytes to which to extend the data
- * @data_start:		beginning of region which is required to be non-sparse
- *
- * Extend the allocated space of an attribute described by the ntfs inode @ni
- * to @new_alloc_size bytes. If @data_start is -1, the whole extension may be
- * implemented as a hole in the file (as long as both the volume and the ntfs
- * inode @ni have sparse support enabled). If @data_start is >= 0, then the
- * region between the old allocated size and @data_start - 1 may be made
- * sparse but the region between @data_start and @new_alloc_size must be
- * backed by actual clusters.
- *
- * If @new_data_size is -1, it is ignored. If it is >= 0, then the data size
- * of the attribute is extended to @new_data_size. Note that the i_size of
- * the vfs inode is not updated. Only the data size in the base attribute
- * record is updated. The caller has to update i_size separately if this is
- * required. WARNING: It is a BUG() for @new_data_size to be smaller than the
- * old data size as well as for @new_data_size to be greater than
- * @new_alloc_size.
- *
- * For resident attributes this involves resizing the attribute record and if
- * necessary moving it and/or other attributes into extent mft records and/or
- * converting the attribute to a non-resident attribute which in turn involves
- * extending the allocation of a non-resident attribute as described below.
- *
- * For non-resident attributes this involves allocating clusters in the data
- * zone on the volume (except for regions that are being made sparse) and
- * extending the run list to describe the allocated clusters as well as
- * updating the mapping pairs array of the attribute. This in turn involves
- * resizing the attribute record and if necessary moving it and/or other
- * attributes into extent mft records and/or splitting the attribute record
- * into multiple extent attribute records.
- *
- * Also, the attribute list attribute is updated if present and in some of the
- * above cases (the ones where extent mft records/attributes come into play),
- * an attribute list attribute is created if not already present.
- *
- * Return the new allocated size on success and -errno on error. In the case
- * that an error is encountered but a partial extension at least up to
- * @data_start (if present) is possible, the allocation is partially extended
- * and this is returned. This means the caller must check the returned size
- * to determine if the extension was partial. 
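 *
 * A caller sketch (illustrative only, not from the original code; @pos
 * stands for a hypothetical write offset):
 *
 *	ll = ntfs_attr_extend_allocation(ni, new_alloc_size, -1, pos);
 *	if (ll < 0)
 *		return ll;	/* hard error, nothing usable */
 *	if (ll < new_alloc_size)
 *		... partial success: only bytes below ll are allocated ...
 *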
If @data_start is -1 then partial - * allocations are not performed. - * - * WARNING: Do not call ntfs_attr_extend_allocation() for $MFT/$DATA. - * - * Locking: This function takes the runlist lock of @ni for writing as well as - * locking the mft record of the base ntfs inode. These locks are maintained - * throughout execution of the function. These locks are required so that the - * attribute can be resized safely and so that it can for example be converted - * from resident to non-resident safely. - * - * TODO: At present attribute list attribute handling is not implemented. - * - * TODO: At present it is not safe to call this function for anything other - * than the $DATA attribute(s) of an uncompressed and unencrypted file. - */ -s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size, - const s64 new_data_size, const s64 data_start) -{ - VCN vcn; - s64 ll, allocated_size, start = data_start; - struct inode *vi = VFS_I(ni); - ntfs_volume *vol = ni->vol; - ntfs_inode *base_ni; - MFT_RECORD *m; - ATTR_RECORD *a; - ntfs_attr_search_ctx *ctx; - runlist_element *rl, *rl2; - unsigned long flags; - int err, mp_size; - u32 attr_len = 0; /* Silence stupid gcc warning. */ - bool mp_rebuilt; - -#ifdef DEBUG - read_lock_irqsave(&ni->size_lock, flags); - allocated_size = ni->allocated_size; - read_unlock_irqrestore(&ni->size_lock, flags); - ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " - "old_allocated_size 0x%llx, " - "new_allocated_size 0x%llx, new_data_size 0x%llx, " - "data_start 0x%llx.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), - (unsigned long long)allocated_size, - (unsigned long long)new_alloc_size, - (unsigned long long)new_data_size, - (unsigned long long)start); -#endif -retry_extend: - /* - * For non-resident attributes, @start and @new_size need to be aligned - * to cluster boundaries for allocation purposes. - */ - if (NInoNonResident(ni)) { - if (start > 0) - start &= ~(s64)vol->cluster_size_mask; - new_alloc_size = (new_alloc_size + vol->cluster_size - 1) & - ~(s64)vol->cluster_size_mask; - } - BUG_ON(new_data_size >= 0 && new_data_size > new_alloc_size); - /* Check if new size is allowed in $AttrDef. */ - err = ntfs_attr_size_bounds_check(vol, ni->type, new_alloc_size); - if (unlikely(err)) { - /* Only emit errors when the write will fail completely. */ - read_lock_irqsave(&ni->size_lock, flags); - allocated_size = ni->allocated_size; - read_unlock_irqrestore(&ni->size_lock, flags); - if (start < 0 || start >= allocated_size) { - if (err == -ERANGE) { - ntfs_error(vol->sb, "Cannot extend allocation " - "of inode 0x%lx, attribute " - "type 0x%x, because the new " - "allocation would exceed the " - "maximum allowed size for " - "this attribute type.", - vi->i_ino, (unsigned) - le32_to_cpu(ni->type)); - } else { - ntfs_error(vol->sb, "Cannot extend allocation " - "of inode 0x%lx, attribute " - "type 0x%x, because this " - "attribute type is not " - "defined on the NTFS volume. " - "Possible corruption! You " - "should run chkdsk!", - vi->i_ino, (unsigned) - le32_to_cpu(ni->type)); - } - } - /* Translate error code to be POSIX conformant for write(2). */ - if (err == -ERANGE) - err = -EFBIG; - else - err = -EIO; - return err; - } - if (!NInoAttr(ni)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - /* - * We will be modifying both the runlist (if non-resident) and the mft - * record so lock them both down. 
- */ - down_write(&ni->runlist.lock); - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - ctx = NULL; - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - read_lock_irqsave(&ni->size_lock, flags); - allocated_size = ni->allocated_size; - read_unlock_irqrestore(&ni->size_lock, flags); - /* - * If non-resident, seek to the last extent. If resident, there is - * only one extent, so seek to that. - */ - vcn = NInoNonResident(ni) ? allocated_size >> vol->cluster_size_bits : - 0; - /* - * Abort if someone did the work whilst we waited for the locks. If we - * just converted the attribute from resident to non-resident it is - * likely that exactly this has happened already. We cannot quite - * abort if we need to update the data size. - */ - if (unlikely(new_alloc_size <= allocated_size)) { - ntfs_debug("Allocated size already exceeds requested size."); - new_alloc_size = allocated_size; - if (new_data_size < 0) - goto done; - /* - * We want the first attribute extent so that we can update the - * data size. - */ - vcn = 0; - } - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, vcn, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - err = -EIO; - goto err_out; - } - m = ctx->mrec; - a = ctx->attr; - /* Use goto to reduce indentation. */ - if (a->non_resident) - goto do_non_resident_extend; - BUG_ON(NInoNonResident(ni)); - /* The total length of the attribute value. */ - attr_len = le32_to_cpu(a->data.resident.value_length); - /* - * Extend the attribute record to be able to store the new attribute - * size. ntfs_attr_record_resize() will not do anything if the size is - * not changing. - */ - if (new_alloc_size < vol->mft_record_size && - !ntfs_attr_record_resize(m, a, - le16_to_cpu(a->data.resident.value_offset) + - new_alloc_size)) { - /* The resize succeeded! */ - write_lock_irqsave(&ni->size_lock, flags); - ni->allocated_size = le32_to_cpu(a->length) - - le16_to_cpu(a->data.resident.value_offset); - write_unlock_irqrestore(&ni->size_lock, flags); - if (new_data_size >= 0) { - BUG_ON(new_data_size < attr_len); - a->data.resident.value_length = - cpu_to_le32((u32)new_data_size); - } - goto flush_done; - } - /* - * We have to drop all the locks so we can call - * ntfs_attr_make_non_resident(). This could be optimised by try- - * locking the first page cache page and only if that fails dropping - * the locks, locking the page, and redoing all the locking and - * lookups. While this would be a huge optimisation, it is not worth - * it as this is definitely a slow code path. - */ - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - up_write(&ni->runlist.lock); - /* - * Not enough space in the mft record, try to make the attribute - * non-resident and if successful restart the extension process. - */ - err = ntfs_attr_make_non_resident(ni, attr_len); - if (likely(!err)) - goto retry_extend; - /* - * Could not make non-resident. If this is due to this not being - * permitted for this attribute type or there not being enough space, - * try to make other attributes non-resident. Otherwise fail. - */ - if (unlikely(err != -EPERM && err != -ENOSPC)) { - /* Only emit errors when the write will fail completely. 
*/
-		read_lock_irqsave(&ni->size_lock, flags);
-		allocated_size = ni->allocated_size;
-		read_unlock_irqrestore(&ni->size_lock, flags);
-		if (start < 0 || start >= allocated_size)
-			ntfs_error(vol->sb, "Cannot extend allocation of "
-					"inode 0x%lx, attribute type 0x%x, "
-					"because the conversion from resident "
-					"to non-resident attribute failed "
-					"with error code %i.", vi->i_ino,
-					(unsigned)le32_to_cpu(ni->type), err);
-		if (err != -ENOMEM)
-			err = -EIO;
-		goto conv_err_out;
-	}
-	/* TODO: Not implemented from here, abort. */
-	read_lock_irqsave(&ni->size_lock, flags);
-	allocated_size = ni->allocated_size;
-	read_unlock_irqrestore(&ni->size_lock, flags);
-	if (start < 0 || start >= allocated_size) {
-		if (err == -ENOSPC)
-			ntfs_error(vol->sb, "Not enough space in the mft "
-					"record/on disk for the non-resident "
-					"attribute value. This case is not "
-					"implemented yet.");
-		else /* if (err == -EPERM) */
-			ntfs_error(vol->sb, "This attribute type may not be "
-					"non-resident. This case is not "
-					"implemented yet.");
-	}
-	err = -EOPNOTSUPP;
-	goto conv_err_out;
-#if 0
-	// TODO: Attempt to make other attributes non-resident.
-	if (!err)
-		goto do_resident_extend;
-	/*
-	 * Both the attribute list attribute and the standard information
-	 * attribute must remain in the base inode. Thus, if this is one of
-	 * these attributes, we have to try to move other attributes out into
-	 * extent mft records instead.
-	 */
-	if (ni->type == AT_ATTRIBUTE_LIST ||
-			ni->type == AT_STANDARD_INFORMATION) {
-		// TODO: Attempt to move other attributes into extent mft
-		// records.
-		err = -EOPNOTSUPP;
-		if (!err)
-			goto do_resident_extend;
-		goto err_out;
-	}
-	// TODO: Attempt to move this attribute to an extent mft record, but
-	// only if it is not already the only attribute in an mft record in
-	// which case there would be nothing to gain.
-	err = -EOPNOTSUPP;
-	if (!err)
-		goto do_resident_extend;
-	/* There is nothing we can do to make enough space. )-: */
-	goto err_out;
-#endif
-do_non_resident_extend:
-	BUG_ON(!NInoNonResident(ni));
-	if (new_alloc_size == allocated_size) {
-		BUG_ON(vcn);
-		goto alloc_done;
-	}
-	/*
-	 * If the data starts after the end of the old allocation, this is a
-	 * $DATA attribute and sparse attributes are enabled on the volume and
-	 * for this inode, then create a sparse region between the old
-	 * allocated size and the start of the data. Otherwise simply proceed
-	 * with filling the whole space between the old allocated size and the
-	 * new allocated size with clusters.
-	 */
-	if ((start >= 0 && start <= allocated_size) || ni->type != AT_DATA ||
-			!NVolSparseEnabled(vol) || NInoSparseDisabled(ni))
-		goto skip_sparse;
-	// TODO: This is not implemented yet. We just fill in with real
-	// clusters for now...
-	ntfs_debug("Inserting holes is not implemented yet. Falling back to "
-			"allocating real clusters instead.");
-skip_sparse:
-	rl = ni->runlist.rl;
-	if (likely(rl)) {
-		/* Seek to the end of the runlist. */
-		while (rl->length)
-			rl++;
-	}
-	/* If this attribute extent is not mapped, map it now. */
-	if (unlikely(!rl || rl->lcn == LCN_RL_NOT_MAPPED ||
-			(rl->lcn == LCN_ENOENT && rl > ni->runlist.rl &&
-			(rl-1)->lcn == LCN_RL_NOT_MAPPED))) {
-		if (!rl && !allocated_size)
-			goto first_alloc;
-		rl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl);
-		if (IS_ERR(rl)) {
-			err = PTR_ERR(rl);
-			if (start < 0 || start >= allocated_size)
-				ntfs_error(vol->sb, "Cannot extend allocation "
-						"of inode 0x%lx, attribute "
-						"type 0x%x, because the "
-						"mapping of a runlist "
-						"fragment failed with error "
-						"code %i.", vi->i_ino,
-						(unsigned)le32_to_cpu(ni->type),
-						err);
-			if (err != -ENOMEM)
-				err = -EIO;
-			goto err_out;
-		}
-		ni->runlist.rl = rl;
-		/* Seek to the end of the runlist. */
-		while (rl->length)
-			rl++;
-	}
-	/*
-	 * We now know the runlist of the last extent is mapped and @rl is at
-	 * the end of the runlist. We want to begin allocating clusters
-	 * starting at the last allocated cluster to reduce fragmentation. If
-	 * there are no valid LCNs in the attribute we let the cluster
-	 * allocator choose the starting cluster.
-	 */
-	/* If the last LCN is a hole or similar seek back to last real LCN. */
-	while (rl->lcn < 0 && rl > ni->runlist.rl)
-		rl--;
-first_alloc:
-	// FIXME: Need to implement partial allocations so at least part of the
-	// write can be performed when start >= 0. (Needed for POSIX write(2)
-	// conformance.)
-	rl2 = ntfs_cluster_alloc(vol, allocated_size >> vol->cluster_size_bits,
-			(new_alloc_size - allocated_size) >>
-			vol->cluster_size_bits, (rl && (rl->lcn >= 0)) ?
-			rl->lcn + rl->length : -1, DATA_ZONE, true);
-	if (IS_ERR(rl2)) {
-		err = PTR_ERR(rl2);
-		if (start < 0 || start >= allocated_size)
-			ntfs_error(vol->sb, "Cannot extend allocation of "
-					"inode 0x%lx, attribute type 0x%x, "
-					"because the allocation of clusters "
-					"failed with error code %i.", vi->i_ino,
-					(unsigned)le32_to_cpu(ni->type), err);
-		if (err != -ENOMEM && err != -ENOSPC)
-			err = -EIO;
-		goto err_out;
-	}
-	rl = ntfs_runlists_merge(ni->runlist.rl, rl2);
-	if (IS_ERR(rl)) {
-		err = PTR_ERR(rl);
-		if (start < 0 || start >= allocated_size)
-			ntfs_error(vol->sb, "Cannot extend allocation of "
-					"inode 0x%lx, attribute type 0x%x, "
-					"because the runlist merge failed "
-					"with error code %i.", vi->i_ino,
-					(unsigned)le32_to_cpu(ni->type), err);
-		if (err != -ENOMEM)
-			err = -EIO;
-		if (ntfs_cluster_free_from_rl(vol, rl2)) {
-			ntfs_error(vol->sb, "Failed to release allocated "
-					"cluster(s) in error code path. Run "
-					"chkdsk to recover the lost "
-					"cluster(s).");
-			NVolSetErrors(vol);
-		}
-		ntfs_free(rl2);
-		goto err_out;
-	}
-	ni->runlist.rl = rl;
-	ntfs_debug("Allocated 0x%llx clusters.", (long long)(new_alloc_size -
-			allocated_size) >> vol->cluster_size_bits);
-	/* Find the runlist element with which the attribute extent starts. */
-	ll = sle64_to_cpu(a->data.non_resident.lowest_vcn);
-	rl2 = ntfs_rl_find_vcn_nolock(rl, ll);
-	BUG_ON(!rl2);
-	BUG_ON(!rl2->length);
-	BUG_ON(rl2->lcn < LCN_HOLE);
-	mp_rebuilt = false;
-	/* Get the size for the new mapping pairs array for this extent. */
-	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1);
-	if (unlikely(mp_size <= 0)) {
-		err = mp_size;
-		if (start < 0 || start >= allocated_size)
-			ntfs_error(vol->sb, "Cannot extend allocation of "
-					"inode 0x%lx, attribute type 0x%x, "
-					"because determining the size for the "
-					"mapping pairs failed with error code "
-					"%i.", vi->i_ino,
-					(unsigned)le32_to_cpu(ni->type), err);
-		err = -EIO;
-		goto undo_alloc;
-	}
-	/* Extend the attribute record to fit the bigger mapping pairs array. */
-	attr_len = le32_to_cpu(a->length);
-	err = ntfs_attr_record_resize(m, a, mp_size +
-			le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
-	if (unlikely(err)) {
-		BUG_ON(err != -ENOSPC);
-		// TODO: Deal with this by moving this extent to a new mft
-		// record or by starting a new extent in a new mft record,
-		// possibly by extending this extent partially and filling it
-		// and creating a new extent for the remainder, or by making
-		// other attributes non-resident and/or by moving other
-		// attributes out of this mft record.
-		if (start < 0 || start >= allocated_size)
-			ntfs_error(vol->sb, "Not enough space in the mft "
-					"record for the extended attribute "
-					"record. This case is not "
-					"implemented yet.");
-		err = -EOPNOTSUPP;
-		goto undo_alloc;
-	}
-	mp_rebuilt = true;
-	/* Generate the mapping pairs array directly into the attr record. */
-	err = ntfs_mapping_pairs_build(vol, (u8*)a +
-			le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
-			mp_size, rl2, ll, -1, NULL);
-	if (unlikely(err)) {
-		if (start < 0 || start >= allocated_size)
-			ntfs_error(vol->sb, "Cannot extend allocation of "
-					"inode 0x%lx, attribute type 0x%x, "
-					"because building the mapping pairs "
-					"failed with error code %i.", vi->i_ino,
-					(unsigned)le32_to_cpu(ni->type), err);
-		err = -EIO;
-		goto undo_alloc;
-	}
-	/* Update the highest_vcn. */
-	a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >>
-			vol->cluster_size_bits) - 1);
-	/*
-	 * We now have extended the allocated size of the attribute. Reflect
-	 * this in the ntfs_inode structure and the attribute record.
-	 */
-	if (a->data.non_resident.lowest_vcn) {
-		/*
-		 * We are not in the first attribute extent, switch to it, but
-		 * first ensure the changes will make it to disk later.
-		 */
-		flush_dcache_mft_record_page(ctx->ntfs_ino);
-		mark_mft_record_dirty(ctx->ntfs_ino);
-		ntfs_attr_reinit_search_ctx(ctx);
-		err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-				CASE_SENSITIVE, 0, NULL, 0, ctx);
-		if (unlikely(err))
-			goto restore_undo_alloc;
-		/* @m is not used any more so no need to set it. */
-		a = ctx->attr;
-	}
-	write_lock_irqsave(&ni->size_lock, flags);
-	ni->allocated_size = new_alloc_size;
-	a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size);
-	/*
-	 * FIXME: This would fail if @ni is a directory, $MFT, or an index,
-	 * since those can have sparse/compressed set. For example, a
-	 * directory can be set compressed even though it is not compressed
-	 * itself and in that case the bit means that files are to be created
-	 * compressed in the directory... At present this is ok as this code
-	 * is only called for regular files, and only for their $DATA
-	 * attribute(s).
-	 * FIXME: The calculation is wrong if we created a hole above. For
-	 * now it does not matter as we never create holes.
-	 */
-	if (NInoSparse(ni) || NInoCompressed(ni)) {
-		ni->itype.compressed.size += new_alloc_size - allocated_size;
-		a->data.non_resident.compressed_size =
-				cpu_to_sle64(ni->itype.compressed.size);
-		vi->i_blocks = ni->itype.compressed.size >> 9;
-	} else
-		vi->i_blocks = new_alloc_size >> 9;
-	write_unlock_irqrestore(&ni->size_lock, flags);
-alloc_done:
-	if (new_data_size >= 0) {
-		BUG_ON(new_data_size <
-				sle64_to_cpu(a->data.non_resident.data_size));
-		a->data.non_resident.data_size = cpu_to_sle64(new_data_size);
-	}
-flush_done:
-	/* Ensure the changes make it to disk. 
*/ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); -done: - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - up_write(&ni->runlist.lock); - ntfs_debug("Done, new_allocated_size 0x%llx.", - (unsigned long long)new_alloc_size); - return new_alloc_size; -restore_undo_alloc: - if (start < 0 || start >= allocated_size) - ntfs_error(vol->sb, "Cannot complete extension of allocation " - "of inode 0x%lx, attribute type 0x%x, because " - "lookup of first attribute extent failed with " - "error code %i.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), err); - if (err == -ENOENT) - err = -EIO; - ntfs_attr_reinit_search_ctx(ctx); - if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, CASE_SENSITIVE, - allocated_size >> vol->cluster_size_bits, NULL, 0, - ctx)) { - ntfs_error(vol->sb, "Failed to find last attribute extent of " - "attribute in error code path. Run chkdsk to " - "recover."); - write_lock_irqsave(&ni->size_lock, flags); - ni->allocated_size = new_alloc_size; - /* - * FIXME: This would fail if @ni is a directory... See above. - * FIXME: The calculation is wrong if we created a hole above. - * For now it does not matter as we never create holes. - */ - if (NInoSparse(ni) || NInoCompressed(ni)) { - ni->itype.compressed.size += new_alloc_size - - allocated_size; - vi->i_blocks = ni->itype.compressed.size >> 9; - } else - vi->i_blocks = new_alloc_size >> 9; - write_unlock_irqrestore(&ni->size_lock, flags); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - up_write(&ni->runlist.lock); - /* - * The only thing that is now wrong is the allocated size of the - * base attribute extent which chkdsk should be able to fix. - */ - NVolSetErrors(vol); - return err; - } - ctx->attr->data.non_resident.highest_vcn = cpu_to_sle64( - (allocated_size >> vol->cluster_size_bits) - 1); -undo_alloc: - ll = allocated_size >> vol->cluster_size_bits; - if (ntfs_cluster_free(ni, ll, -1, ctx) < 0) { - ntfs_error(vol->sb, "Failed to release allocated cluster(s) " - "in error code path. Run chkdsk to recover " - "the lost cluster(s)."); - NVolSetErrors(vol); - } - m = ctx->mrec; - a = ctx->attr; - /* - * If the runlist truncation fails and/or the search context is no - * longer valid, we cannot resize the attribute record or build the - * mapping pairs array thus we mark the inode bad so that no access to - * the freed clusters can happen. - */ - if (ntfs_rl_truncate_nolock(vol, &ni->runlist, ll) || IS_ERR(m)) { - ntfs_error(vol->sb, "Failed to %s in error code path. Run " - "chkdsk to recover.", IS_ERR(m) ? - "restore attribute search context" : - "truncate attribute runlist"); - NVolSetErrors(vol); - } else if (mp_rebuilt) { - if (ntfs_attr_record_resize(m, a, attr_len)) { - ntfs_error(vol->sb, "Failed to restore attribute " - "record in error code path. Run " - "chkdsk to recover."); - NVolSetErrors(vol); - } else /* if (success) */ { - if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( - a->data.non_resident. - mapping_pairs_offset), attr_len - - le16_to_cpu(a->data.non_resident. - mapping_pairs_offset), rl2, ll, -1, - NULL)) { - ntfs_error(vol->sb, "Failed to restore " - "mapping pairs array in error " - "code path. Run chkdsk to " - "recover."); - NVolSetErrors(vol); - } - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - } - } -err_out: - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(base_ni); - up_write(&ni->runlist.lock); -conv_err_out: - ntfs_debug("Failed. 
Returning error code %i.", err); - return err; -} - -/** - * ntfs_attr_set - fill (a part of) an attribute with a byte - * @ni: ntfs inode describing the attribute to fill - * @ofs: offset inside the attribute at which to start to fill - * @cnt: number of bytes to fill - * @val: the unsigned 8-bit value with which to fill the attribute - * - * Fill @cnt bytes of the attribute described by the ntfs inode @ni starting at - * byte offset @ofs inside the attribute with the constant byte @val. - * - * This function is effectively like memset() applied to an ntfs attribute. - * Note this function actually only operates on the page cache pages belonging - * to the ntfs attribute and it marks them dirty after doing the memset(). - * Thus it relies on the vm dirty page write code paths to cause the modified - * pages to be written to the mft record/disk. - * - * Return 0 on success and -errno on error. An error code of -ESPIPE means - * that @ofs + @cnt were outside the end of the attribute and no write was - * performed. - */ -int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val) -{ - ntfs_volume *vol = ni->vol; - struct address_space *mapping; - struct page *page; - u8 *kaddr; - pgoff_t idx, end; - unsigned start_ofs, end_ofs, size; - - ntfs_debug("Entering for ofs 0x%llx, cnt 0x%llx, val 0x%hx.", - (long long)ofs, (long long)cnt, val); - BUG_ON(ofs < 0); - BUG_ON(cnt < 0); - if (!cnt) - goto done; - /* - * FIXME: Compressed and encrypted attributes are not supported when - * writing and we should never have gotten here for them. - */ - BUG_ON(NInoCompressed(ni)); - BUG_ON(NInoEncrypted(ni)); - mapping = VFS_I(ni)->i_mapping; - /* Work out the starting index and page offset. */ - idx = ofs >> PAGE_SHIFT; - start_ofs = ofs & ~PAGE_MASK; - /* Work out the ending index and page offset. */ - end = ofs + cnt; - end_ofs = end & ~PAGE_MASK; - /* If the end is outside the inode size return -ESPIPE. */ - if (unlikely(end > i_size_read(VFS_I(ni)))) { - ntfs_error(vol->sb, "Request exceeds end of attribute."); - return -ESPIPE; - } - end >>= PAGE_SHIFT; - /* If there is a first partial page, need to do it the slow way. */ - if (start_ofs) { - page = read_mapping_page(mapping, idx, NULL); - if (IS_ERR(page)) { - ntfs_error(vol->sb, "Failed to read first partial " - "page (error, index 0x%lx).", idx); - return PTR_ERR(page); - } - /* - * If the last page is the same as the first page, need to - * limit the write to the end offset. - */ - size = PAGE_SIZE; - if (idx == end) - size = end_ofs; - kaddr = kmap_atomic(page); - memset(kaddr + start_ofs, val, size - start_ofs); - flush_dcache_page(page); - kunmap_atomic(kaddr); - set_page_dirty(page); - put_page(page); - balance_dirty_pages_ratelimited(mapping); - cond_resched(); - if (idx == end) - goto done; - idx++; - } - /* Do the whole pages the fast way. */ - for (; idx < end; idx++) { - /* Find or create the current page. (The page is locked.) */ - page = grab_cache_page(mapping, idx); - if (unlikely(!page)) { - ntfs_error(vol->sb, "Insufficient memory to grab " - "page (index 0x%lx).", idx); - return -ENOMEM; - } - kaddr = kmap_atomic(page); - memset(kaddr, val, PAGE_SIZE); - flush_dcache_page(page); - kunmap_atomic(kaddr); - /* - * If the page has buffers, mark them uptodate since buffer - * state and not page state is definitive in 2.6 kernels. 
- */ - if (page_has_buffers(page)) { - struct buffer_head *bh, *head; - - bh = head = page_buffers(page); - do { - set_buffer_uptodate(bh); - } while ((bh = bh->b_this_page) != head); - } - /* Now that buffers are uptodate, set the page uptodate, too. */ - SetPageUptodate(page); - /* - * Set the page and all its buffers dirty and mark the inode - * dirty, too. The VM will write the page later on. - */ - set_page_dirty(page); - /* Finally unlock and release the page. */ - unlock_page(page); - put_page(page); - balance_dirty_pages_ratelimited(mapping); - cond_resched(); - } - /* If there is a last partial page, need to do it the slow way. */ - if (end_ofs) { - page = read_mapping_page(mapping, idx, NULL); - if (IS_ERR(page)) { - ntfs_error(vol->sb, "Failed to read last partial page " - "(error, index 0x%lx).", idx); - return PTR_ERR(page); - } - kaddr = kmap_atomic(page); - memset(kaddr, val, end_ofs); - flush_dcache_page(page); - kunmap_atomic(kaddr); - set_page_dirty(page); - put_page(page); - balance_dirty_pages_ratelimited(mapping); - cond_resched(); - } -done: - ntfs_debug("Done."); - return 0; -} - -#endif /* NTFS_RW */ diff --git a/fs/ntfs/attrib.h b/fs/ntfs/attrib.h deleted file mode 100644 index fe0890d3d072..000000000000 --- a/fs/ntfs/attrib.h +++ /dev/null @@ -1,102 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * attrib.h - Defines for attribute handling in NTFS Linux kernel driver. - * Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2005 Anton Altaparmakov - * Copyright (c) 2002 Richard Russon - */ - -#ifndef _LINUX_NTFS_ATTRIB_H -#define _LINUX_NTFS_ATTRIB_H - -#include "endian.h" -#include "types.h" -#include "layout.h" -#include "inode.h" -#include "runlist.h" -#include "volume.h" - -/** - * ntfs_attr_search_ctx - used in attribute search functions - * @mrec: buffer containing mft record to search - * @attr: attribute record in @mrec where to begin/continue search - * @is_first: if true ntfs_attr_lookup() begins search with @attr, else after - * - * Structure must be initialized to zero before the first call to one of the - * attribute search functions. Initialize @mrec to point to the mft record to - * search, and @attr to point to the first attribute within @mrec (not necessary - * if calling the _first() functions), and set @is_first to 'true' (not necessary - * if calling the _first() functions). - * - * If @is_first is 'true', the search begins with @attr. If @is_first is 'false', - * the search begins after @attr. This is so that, after the first call to one - * of the search attribute functions, we can call the function again, without - * any modification of the search context, to automagically get the next - * matching attribute. 
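 *
 * For example, enumerating all attributes of one type might look like
 * this (sketch only, not from the original code; error handling
 * elided):
 *
 *	ctx = ntfs_attr_get_search_ctx(base_ni, m);
 *	while (!(err = ntfs_attr_lookup(AT_DATA, NULL, 0, CASE_SENSITIVE,
 *			0, NULL, 0, ctx)))
 *		... process ctx->attr ...
 *	if (err != -ENOENT)
 *		... real error rather than end of enumeration ...
 *	ntfs_attr_put_search_ctx(ctx);
 *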
- */ -typedef struct { - MFT_RECORD *mrec; - ATTR_RECORD *attr; - bool is_first; - ntfs_inode *ntfs_ino; - ATTR_LIST_ENTRY *al_entry; - ntfs_inode *base_ntfs_ino; - MFT_RECORD *base_mrec; - ATTR_RECORD *base_attr; -} ntfs_attr_search_ctx; - -extern int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, - ntfs_attr_search_ctx *ctx); -extern int ntfs_map_runlist(ntfs_inode *ni, VCN vcn); - -extern LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn, - const bool write_locked); - -extern runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, - const VCN vcn, ntfs_attr_search_ctx *ctx); - -int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name, - const u32 name_len, const IGNORE_CASE_BOOL ic, - const VCN lowest_vcn, const u8 *val, const u32 val_len, - ntfs_attr_search_ctx *ctx); - -extern int load_attribute_list(ntfs_volume *vol, runlist *rl, u8 *al_start, - const s64 size, const s64 initialized_size); - -static inline s64 ntfs_attr_size(const ATTR_RECORD *a) -{ - if (!a->non_resident) - return (s64)le32_to_cpu(a->data.resident.value_length); - return sle64_to_cpu(a->data.non_resident.data_size); -} - -extern void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx); -extern ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni, - MFT_RECORD *mrec); -extern void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx); - -#ifdef NTFS_RW - -extern int ntfs_attr_size_bounds_check(const ntfs_volume *vol, - const ATTR_TYPE type, const s64 size); -extern int ntfs_attr_can_be_non_resident(const ntfs_volume *vol, - const ATTR_TYPE type); -extern int ntfs_attr_can_be_resident(const ntfs_volume *vol, - const ATTR_TYPE type); - -extern int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size); -extern int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a, - const u32 new_size); - -extern int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size); - -extern s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size, - const s64 new_data_size, const s64 data_start); - -extern int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, - const u8 val); - -#endif /* NTFS_RW */ - -#endif /* _LINUX_NTFS_ATTRIB_H */ diff --git a/fs/ntfs/bitmap.c b/fs/ntfs/bitmap.c deleted file mode 100644 index 0675b2400873..000000000000 --- a/fs/ntfs/bitmap.c +++ /dev/null @@ -1,179 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * bitmap.c - NTFS kernel bitmap handling. Part of the Linux-NTFS project. - * - * Copyright (c) 2004-2005 Anton Altaparmakov - */ - -#ifdef NTFS_RW - -#include <linux/pagemap.h> - -#include "bitmap.h" -#include "debug.h" -#include "aops.h" -#include "ntfs.h" - -/** - * __ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value - * @vi: vfs inode describing the bitmap - * @start_bit: first bit to set - * @count: number of bits to set - * @value: value to set the bits to (i.e. 0 or 1) - * @is_rollback: if 'true' this is a rollback operation - * - * Set @count bits starting at bit @start_bit in the bitmap described by the - * vfs inode @vi to @value, where @value is either 0 or 1. - * - * @is_rollback should always be 'false', it is for internal use to rollback - * errors. You probably want to use ntfs_bitmap_set_bits_in_run() instead. - * - * Return 0 on success and -errno on error. 
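 *
 * Worked example of the page/byte/bit arithmetic used below (assuming
 * 4kiB pages, i.e. PAGE_SHIFT = 12, so one page holds 4096 * 8 = 32768
 * bits): for @start_bit = 40000, the first page index is
 * 40000 >> 15 = 1, the byte within that page is
 * (40000 >> 3) & 4095 = 904, and the bit within that byte is
 * 40000 & 7 = 0.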
- */ -int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit, - const s64 count, const u8 value, const bool is_rollback) -{ - s64 cnt = count; - pgoff_t index, end_index; - struct address_space *mapping; - struct page *page; - u8 *kaddr; - int pos, len; - u8 bit; - - BUG_ON(!vi); - ntfs_debug("Entering for i_ino 0x%lx, start_bit 0x%llx, count 0x%llx, " - "value %u.%s", vi->i_ino, (unsigned long long)start_bit, - (unsigned long long)cnt, (unsigned int)value, - is_rollback ? " (rollback)" : ""); - BUG_ON(start_bit < 0); - BUG_ON(cnt < 0); - BUG_ON(value > 1); - /* - * Calculate the indices for the pages containing the first and last - * bits, i.e. @start_bit and @start_bit + @cnt - 1, respectively. - */ - index = start_bit >> (3 + PAGE_SHIFT); - end_index = (start_bit + cnt - 1) >> (3 + PAGE_SHIFT); - - /* Get the page containing the first bit (@start_bit). */ - mapping = vi->i_mapping; - page = ntfs_map_page(mapping, index); - if (IS_ERR(page)) { - if (!is_rollback) - ntfs_error(vi->i_sb, "Failed to map first page (error " - "%li), aborting.", PTR_ERR(page)); - return PTR_ERR(page); - } - kaddr = page_address(page); - - /* Set @pos to the position of the byte containing @start_bit. */ - pos = (start_bit >> 3) & ~PAGE_MASK; - - /* Calculate the position of @start_bit in the first byte. */ - bit = start_bit & 7; - - /* If the first byte is partial, modify the appropriate bits in it. */ - if (bit) { - u8 *byte = kaddr + pos; - while ((bit & 7) && cnt) { - cnt--; - if (value) - *byte |= 1 << bit++; - else - *byte &= ~(1 << bit++); - } - /* If we are done, unmap the page and return success. */ - if (!cnt) - goto done; - - /* Update @pos to the new position. */ - pos++; - } - /* - * Depending on @value, modify all remaining whole bytes in the page up - * to @cnt. - */ - len = min_t(s64, cnt >> 3, PAGE_SIZE - pos); - memset(kaddr + pos, value ? 0xff : 0, len); - cnt -= len << 3; - - /* Update @len to point to the first not-done byte in the page. */ - if (cnt < 8) - len += pos; - - /* If we are not in the last page, deal with all subsequent pages. */ - while (index < end_index) { - BUG_ON(cnt <= 0); - - /* Update @index and get the next page. */ - flush_dcache_page(page); - set_page_dirty(page); - ntfs_unmap_page(page); - page = ntfs_map_page(mapping, ++index); - if (IS_ERR(page)) - goto rollback; - kaddr = page_address(page); - /* - * Depending on @value, modify all remaining whole bytes in the - * page up to @cnt. - */ - len = min_t(s64, cnt >> 3, PAGE_SIZE); - memset(kaddr, value ? 0xff : 0, len); - cnt -= len << 3; - } - /* - * The currently mapped page is the last one. If the last byte is - * partial, modify the appropriate bits in it. Note, @len is the - * position of the last byte inside the page. - */ - if (cnt) { - u8 *byte; - - BUG_ON(cnt > 7); - - bit = cnt; - byte = kaddr + len; - while (bit--) { - if (value) - *byte |= 1 << bit; - else - *byte &= ~(1 << bit); - } - } -done: - /* We are done. Unmap the page and return success. */ - flush_dcache_page(page); - set_page_dirty(page); - ntfs_unmap_page(page); - ntfs_debug("Done."); - return 0; -rollback: - /* - * Current state: - * - no pages are mapped - * - @count - @cnt is the number of bits that have been modified - */ - if (is_rollback) - return PTR_ERR(page); - if (count != cnt) - pos = __ntfs_bitmap_set_bits_in_run(vi, start_bit, count - cnt, - value ? 0 : 1, true); - else - pos = 0; - if (!pos) { - /* Rollback was successful. 
*/ - ntfs_error(vi->i_sb, "Failed to map subsequent page (error " - "%li), aborting.", PTR_ERR(page)); - } else { - /* Rollback failed. */ - ntfs_error(vi->i_sb, "Failed to map subsequent page (error " - "%li) and rollback failed (error %i). " - "Aborting and leaving inconsistent metadata. " - "Unmount and run chkdsk.", PTR_ERR(page), pos); - NVolSetErrors(NTFS_SB(vi->i_sb)); - } - return PTR_ERR(page); -} - -#endif /* NTFS_RW */ diff --git a/fs/ntfs/bitmap.h b/fs/ntfs/bitmap.h deleted file mode 100644 index 9dd2224ca9c4..000000000000 --- a/fs/ntfs/bitmap.h +++ /dev/null @@ -1,104 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * bitmap.h - Defines for NTFS kernel bitmap handling. Part of the Linux-NTFS - * project. - * - * Copyright (c) 2004 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_BITMAP_H -#define _LINUX_NTFS_BITMAP_H - -#ifdef NTFS_RW - -#include <linux/fs.h> - -#include "types.h" - -extern int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit, - const s64 count, const u8 value, const bool is_rollback); - -/** - * ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value - * @vi: vfs inode describing the bitmap - * @start_bit: first bit to set - * @count: number of bits to set - * @value: value to set the bits to (i.e. 0 or 1) - * - * Set @count bits starting at bit @start_bit in the bitmap described by the - * vfs inode @vi to @value, where @value is either 0 or 1. - * - * Return 0 on success and -errno on error. - */ -static inline int ntfs_bitmap_set_bits_in_run(struct inode *vi, - const s64 start_bit, const s64 count, const u8 value) -{ - return __ntfs_bitmap_set_bits_in_run(vi, start_bit, count, value, - false); -} - -/** - * ntfs_bitmap_set_run - set a run of bits in a bitmap - * @vi: vfs inode describing the bitmap - * @start_bit: first bit to set - * @count: number of bits to set - * - * Set @count bits starting at bit @start_bit in the bitmap described by the - * vfs inode @vi. - * - * Return 0 on success and -errno on error. - */ -static inline int ntfs_bitmap_set_run(struct inode *vi, const s64 start_bit, - const s64 count) -{ - return ntfs_bitmap_set_bits_in_run(vi, start_bit, count, 1); -} - -/** - * ntfs_bitmap_clear_run - clear a run of bits in a bitmap - * @vi: vfs inode describing the bitmap - * @start_bit: first bit to clear - * @count: number of bits to clear - * - * Clear @count bits starting at bit @start_bit in the bitmap described by the - * vfs inode @vi. - * - * Return 0 on success and -errno on error. - */ -static inline int ntfs_bitmap_clear_run(struct inode *vi, const s64 start_bit, - const s64 count) -{ - return ntfs_bitmap_set_bits_in_run(vi, start_bit, count, 0); -} - -/** - * ntfs_bitmap_set_bit - set a bit in a bitmap - * @vi: vfs inode describing the bitmap - * @bit: bit to set - * - * Set bit @bit in the bitmap described by the vfs inode @vi. - * - * Return 0 on success and -errno on error. - */ -static inline int ntfs_bitmap_set_bit(struct inode *vi, const s64 bit) -{ - return ntfs_bitmap_set_run(vi, bit, 1); -} - -/** - * ntfs_bitmap_clear_bit - clear a bit in a bitmap - * @vi: vfs inode describing the bitmap - * @bit: bit to clear - * - * Clear bit @bit in the bitmap described by the vfs inode @vi. - * - * Return 0 on success and -errno on error. 
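The rollback convention used by the worker in bitmap.c above reduces to: if a run of @count bits fails partway, re-run the operation over the prefix that did succeed, with the value inverted. A self-contained sketch on a flat buffer; flaky_set() is a hypothetical stand-in for the page-by-page walk, failing once @budget bits have been applied (simulating a failed page mapping):

#include <stdint.h>

static int flaky_set(uint8_t *bm, uint64_t start, uint64_t count,
		int value, uint64_t budget, uint64_t *applied)
{
	uint64_t i;

	for (i = 0; i < count; i++) {
		if (i == budget) {
			*applied = i;	/* partial modification */
			return -1;
		}
		if (value)
			bm[(start + i) >> 3] |= (uint8_t)(1u << ((start + i) & 7));
		else
			bm[(start + i) >> 3] &= (uint8_t)~(1u << ((start + i) & 7));
	}
	*applied = count;
	return 0;
}

static int set_bits_rollback(uint8_t *bm, uint64_t start, uint64_t count,
		int value, uint64_t budget)
{
	uint64_t applied = 0;
	int err = flaky_set(bm, start, count, value, budget, &applied);

	if (err && applied) {
		/* Undo the prefix that succeeded. A second failure here
		 * leaves the bitmap inconsistent, which is the point where
		 * the driver above marks the volume dirty and asks the user
		 * to run chkdsk. */
		uint64_t undone;

		(void)flaky_set(bm, start, applied, !value, UINT64_MAX,
				&undone);
	}
	return err;
}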
- */
-static inline int ntfs_bitmap_clear_bit(struct inode *vi, const s64 bit)
-{
-	return ntfs_bitmap_clear_run(vi, bit, 1);
-}
-
-#endif /* NTFS_RW */
-
-#endif /* defined _LINUX_NTFS_BITMAP_H */
diff --git a/fs/ntfs/collate.c b/fs/ntfs/collate.c
deleted file mode 100644
index 3ab6ec96abfe..000000000000
--- a/fs/ntfs/collate.c
+++ /dev/null
@@ -1,110 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * collate.c - NTFS kernel collation handling. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2004 Anton Altaparmakov
- */
-
-#include "collate.h"
-#include "debug.h"
-#include "ntfs.h"
-
-static int ntfs_collate_binary(ntfs_volume *vol,
-		const void *data1, const int data1_len,
-		const void *data2, const int data2_len)
-{
-	int rc;
-
-	ntfs_debug("Entering.");
-	rc = memcmp(data1, data2, min(data1_len, data2_len));
-	if (!rc && (data1_len != data2_len)) {
-		if (data1_len < data2_len)
-			rc = -1;
-		else
-			rc = 1;
-	}
-	ntfs_debug("Done, returning %i", rc);
-	return rc;
-}
-
-static int ntfs_collate_ntofs_ulong(ntfs_volume *vol,
-		const void *data1, const int data1_len,
-		const void *data2, const int data2_len)
-{
-	int rc;
-	u32 d1, d2;
-
-	ntfs_debug("Entering.");
-	// FIXME: We don't really want to bug here.
-	BUG_ON(data1_len != data2_len);
-	BUG_ON(data1_len != 4);
-	d1 = le32_to_cpup(data1);
-	d2 = le32_to_cpup(data2);
-	if (d1 < d2)
-		rc = -1;
-	else {
-		if (d1 == d2)
-			rc = 0;
-		else
-			rc = 1;
-	}
-	ntfs_debug("Done, returning %i", rc);
-	return rc;
-}
-
-typedef int (*ntfs_collate_func_t)(ntfs_volume *, const void *, const int,
-		const void *, const int);
-
-static ntfs_collate_func_t ntfs_do_collate0x0[3] = {
-	ntfs_collate_binary,
-	NULL/*ntfs_collate_file_name*/,
-	NULL/*ntfs_collate_unicode_string*/,
-};
-
-static ntfs_collate_func_t ntfs_do_collate0x1[4] = {
-	ntfs_collate_ntofs_ulong,
-	NULL/*ntfs_collate_ntofs_sid*/,
-	NULL/*ntfs_collate_ntofs_security_hash*/,
-	NULL/*ntfs_collate_ntofs_ulongs*/,
-};
-
-/**
- * ntfs_collate - collate two data items using a specified collation rule
- * @vol: ntfs volume to which the data items belong
- * @cr: collation rule to use when comparing the items
- * @data1: first data item to collate
- * @data1_len: length in bytes of @data1
- * @data2: second data item to collate
- * @data2_len: length in bytes of @data2
- *
- * Collate the two data items @data1 and @data2 using the collation rule @cr
- * and return -1, 0, or 1 if @data1 is found, respectively, to collate before,
- * to match, or to collate after @data2.
- *
- * For speed we use the collation rule @cr as an index into two tables of
- * function pointers to call the appropriate collation function.
- */
-int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr,
-		const void *data1, const int data1_len,
-		const void *data2, const int data2_len) {
-	int i;
-
-	ntfs_debug("Entering.");
-	/*
-	 * FIXME: At the moment we only support COLLATION_BINARY and
-	 * COLLATION_NTOFS_ULONG, so we BUG() for everything else for now.
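The two tables above implement a small rule-indexed dispatch. A standalone sketch of the same pattern that returns an error instead of calling BUG() on an unsupported rule; the 0x00-0x02 and 0x10-0x13 ranges mirror the ones checked in collate.h below, and all names are illustrative:

#include <errno.h>
#include <string.h>

typedef int (*collate_fn)(const void *d1, int l1, const void *d2, int l2);

static int collate_binary(const void *d1, int l1, const void *d2, int l2)
{
	int rc = memcmp(d1, d2, l1 < l2 ? l1 : l2);

	if (!rc && l1 != l2)
		rc = l1 < l2 ? -1 : 1;
	return rc;
}

static collate_fn collate0x0[3] = { collate_binary, NULL, NULL };
static collate_fn collate0x1[4] = { NULL, NULL, NULL, NULL };

/* Dispatch on the rule number; -EOPNOTSUPP replaces the BUG() calls.
 * Note the error value is negative and so would have to be told apart
 * from a "collates before" result in real code; the driver sidesteps
 * this by checking rule support up front (see collate.h below). */
static int do_collate(unsigned int rule, const void *d1, int l1,
		const void *d2, int l2)
{
	collate_fn fn = NULL;

	if (rule <= 0x02)
		fn = collate0x0[rule];
	else if (rule >= 0x10 && rule <= 0x13)
		fn = collate0x1[rule - 0x10];
	if (!fn)
		return -EOPNOTSUPP;
	return fn(d1, l1, d2, l2);
}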
- */ - BUG_ON(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG); - i = le32_to_cpu(cr); - BUG_ON(i < 0); - if (i <= 0x02) - return ntfs_do_collate0x0[i](vol, data1, data1_len, - data2, data2_len); - BUG_ON(i < 0x10); - i -= 0x10; - if (likely(i <= 3)) - return ntfs_do_collate0x1[i](vol, data1, data1_len, - data2, data2_len); - BUG(); - return 0; -} diff --git a/fs/ntfs/collate.h b/fs/ntfs/collate.h deleted file mode 100644 index f2255619b4f4..000000000000 --- a/fs/ntfs/collate.h +++ /dev/null @@ -1,36 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * collate.h - Defines for NTFS kernel collation handling. Part of the - * Linux-NTFS project. - * - * Copyright (c) 2004 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_COLLATE_H -#define _LINUX_NTFS_COLLATE_H - -#include "types.h" -#include "volume.h" - -static inline bool ntfs_is_collation_rule_supported(COLLATION_RULE cr) { - int i; - - /* - * FIXME: At the moment we only support COLLATION_BINARY and - * COLLATION_NTOFS_ULONG, so we return false for everything else for - * now. - */ - if (unlikely(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG)) - return false; - i = le32_to_cpu(cr); - if (likely(((i >= 0) && (i <= 0x02)) || - ((i >= 0x10) && (i <= 0x13)))) - return true; - return false; -} - -extern int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr, - const void *data1, const int data1_len, - const void *data2, const int data2_len); - -#endif /* _LINUX_NTFS_COLLATE_H */ diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c deleted file mode 100644 index 761aaa0195d6..000000000000 --- a/fs/ntfs/compress.c +++ /dev/null @@ -1,950 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * compress.c - NTFS kernel compressed attributes handling. - * Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2004 Anton Altaparmakov - * Copyright (c) 2002 Richard Russon - */ - -#include <linux/fs.h> -#include <linux/buffer_head.h> -#include <linux/blkdev.h> -#include <linux/vmalloc.h> -#include <linux/slab.h> - -#include "attrib.h" -#include "inode.h" -#include "debug.h" -#include "ntfs.h" - -/** - * ntfs_compression_constants - enum of constants used in the compression code - */ -typedef enum { - /* Token types and access mask. */ - NTFS_SYMBOL_TOKEN = 0, - NTFS_PHRASE_TOKEN = 1, - NTFS_TOKEN_MASK = 1, - - /* Compression sub-block constants. */ - NTFS_SB_SIZE_MASK = 0x0fff, - NTFS_SB_SIZE = 0x1000, - NTFS_SB_IS_COMPRESSED = 0x8000, - - /* - * The maximum compression block size is by definition 16 * the cluster - * size, with the maximum supported cluster size being 4kiB. Thus the - * maximum compression buffer size is 64kiB, so we use this when - * initializing the compression buffer. - */ - NTFS_MAX_CB_SIZE = 64 * 1024, -} ntfs_compression_constants; - -/* - * ntfs_compression_buffer - one buffer for the decompression engine - */ -static u8 *ntfs_compression_buffer; - -/* - * ntfs_cb_lock - spinlock which protects ntfs_compression_buffer - */ -static DEFINE_SPINLOCK(ntfs_cb_lock); - -/** - * allocate_compression_buffers - allocate the decompression buffers - * - * Caller has to hold the ntfs_lock mutex. - * - * Return 0 on success or -ENOMEM if the allocations failed. - */ -int allocate_compression_buffers(void) -{ - BUG_ON(ntfs_compression_buffer); - - ntfs_compression_buffer = vmalloc(NTFS_MAX_CB_SIZE); - if (!ntfs_compression_buffer) - return -ENOMEM; - return 0; -} - -/** - * free_compression_buffers - free the decompression buffers - * - * Caller has to hold the ntfs_lock mutex. 
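The single-shared-buffer scheme above, sketched in userspace: one 64 KiB scratch buffer serves all decompression, serialised by a lock. The kernel code uses a spinlock and forbids sleeping while it is held; a mutex stands in here, and all names are illustrative:

#include <pthread.h>
#include <stdlib.h>

#define MAX_CB_SIZE (64 * 1024)	/* largest possible compression block */

static unsigned char *scratch_buf;
static pthread_mutex_t scratch_lock = PTHREAD_MUTEX_INITIALIZER;

int scratch_init(void)		/* cf. allocate_compression_buffers() */
{
	scratch_buf = malloc(MAX_CB_SIZE);
	return scratch_buf ? 0 : -1;
}

void scratch_exit(void)		/* cf. free_compression_buffers() */
{
	free(scratch_buf);
	scratch_buf = NULL;
}

unsigned char *scratch_acquire(void)	/* cf. spin_lock(&ntfs_cb_lock) */
{
	pthread_mutex_lock(&scratch_lock);
	return scratch_buf;
}

void scratch_release(void)
{
	pthread_mutex_unlock(&scratch_lock);
}

Usage follows the driver's order: scratch_init() once at startup (the driver does this at module load under the ntfs_lock mutex), then scratch_acquire()/scratch_release() around each block's decompression.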
- */
-void free_compression_buffers(void)
-{
-	BUG_ON(!ntfs_compression_buffer);
-	vfree(ntfs_compression_buffer);
-	ntfs_compression_buffer = NULL;
-}
-
-/**
- * zero_partial_compressed_page - zero out of bounds compressed page region
- */
-static void zero_partial_compressed_page(struct page *page,
-		const s64 initialized_size)
-{
-	u8 *kp = page_address(page);
-	unsigned int kp_ofs;
-
-	ntfs_debug("Zeroing page region outside initialized size.");
-	if (((s64)page->index << PAGE_SHIFT) >= initialized_size) {
-		clear_page(kp);
-		return;
-	}
-	kp_ofs = initialized_size & ~PAGE_MASK;
-	memset(kp + kp_ofs, 0, PAGE_SIZE - kp_ofs);
-	return;
-}
-
-/**
- * handle_bounds_compressed_page - test for & handle out of bounds compressed page
- */
-static inline void handle_bounds_compressed_page(struct page *page,
-		const loff_t i_size, const s64 initialized_size)
-{
-	if ((page->index >= (initialized_size >> PAGE_SHIFT)) &&
-			(initialized_size < i_size))
-		zero_partial_compressed_page(page, initialized_size);
-	return;
-}
-
-/**
- * ntfs_decompress - decompress a compression block into an array of pages
- * @dest_pages: destination array of pages
- * @completed_pages: scratch space to track completed pages
- * @dest_index: current index into @dest_pages (IN/OUT)
- * @dest_ofs: current offset within @dest_pages[@dest_index] (IN/OUT)
- * @dest_max_index: maximum index into @dest_pages (IN)
- * @dest_max_ofs: maximum offset within @dest_pages[@dest_max_index] (IN)
- * @xpage: the target page (-1 if none) (IN)
- * @xpage_done: set to 1 if xpage was completed successfully (IN/OUT)
- * @cb_start: compression block to decompress (IN)
- * @cb_size: size of compression block @cb_start in bytes (IN)
- * @i_size: file size when we started the read (IN)
- * @initialized_size: initialized file size when we started the read (IN)
- *
- * The caller must have disabled preemption. ntfs_decompress() reenables it when
- * the critical section is finished.
- *
- * This decompresses the compression block @cb_start into the array of
- * destination pages @dest_pages starting at index @dest_index into @dest_pages
- * and at offset @dest_ofs into the page @dest_pages[@dest_index].
- *
- * When the page @dest_pages[@xpage] is completed, @xpage_done is set to 1.
- * If xpage is -1 or @xpage has not been completed, @xpage_done is not modified.
- *
- * @cb_start is a pointer to the compression block which needs decompressing
- * and @cb_size is the size of @cb_start in bytes (8-64kiB).
- *
- * Return 0 if success or -EOVERFLOW on error in the compressed stream.
- * @xpage_done indicates whether the target page (@dest_pages[@xpage]) was
- * completed during the decompression of the compression block (@cb_start).
- *
- * Warning: This function *REQUIRES* PAGE_SIZE >= 4096 or it will blow up
- * unpredictably! You have been warned!
- *
- * Note to hackers: This function may not sleep until it has finished accessing
- * the compression block @cb_start as it is a per-CPU buffer.
- */
-static int ntfs_decompress(struct page *dest_pages[], int completed_pages[],
-		int *dest_index, int *dest_ofs, const int dest_max_index,
-		const int dest_max_ofs, const int xpage, char *xpage_done,
-		u8 *const cb_start, const u32 cb_size, const loff_t i_size,
-		const s64 initialized_size)
-{
-	/*
-	 * Pointers into the compressed data, i.e. the compression block (cb),
-	 * and the therein contained sub-blocks (sb).
-	 */
-	u8 *cb_end = cb_start + cb_size; /* End of cb. */
-	u8 *cb = cb_start;	/* Current position in cb.
*/ - u8 *cb_sb_start; /* Beginning of the current sb in the cb. */ - u8 *cb_sb_end; /* End of current sb / beginning of next sb. */ - - /* Variables for uncompressed data / destination. */ - struct page *dp; /* Current destination page being worked on. */ - u8 *dp_addr; /* Current pointer into dp. */ - u8 *dp_sb_start; /* Start of current sub-block in dp. */ - u8 *dp_sb_end; /* End of current sb in dp (dp_sb_start + - NTFS_SB_SIZE). */ - u16 do_sb_start; /* @dest_ofs when starting this sub-block. */ - u16 do_sb_end; /* @dest_ofs of end of this sb (do_sb_start + - NTFS_SB_SIZE). */ - - /* Variables for tag and token parsing. */ - u8 tag; /* Current tag. */ - int token; /* Loop counter for the eight tokens in tag. */ - int nr_completed_pages = 0; - - /* Default error code. */ - int err = -EOVERFLOW; - - ntfs_debug("Entering, cb_size = 0x%x.", cb_size); -do_next_sb: - ntfs_debug("Beginning sub-block at offset = 0x%zx in the cb.", - cb - cb_start); - /* - * Have we reached the end of the compression block or the end of the - * decompressed data? The latter can happen for example if the current - * position in the compression block is one byte before its end so the - * first two checks do not detect it. - */ - if (cb == cb_end || !le16_to_cpup((le16*)cb) || - (*dest_index == dest_max_index && - *dest_ofs == dest_max_ofs)) { - int i; - - ntfs_debug("Completed. Returning success (0)."); - err = 0; -return_error: - /* We can sleep from now on, so we drop lock. */ - spin_unlock(&ntfs_cb_lock); - /* Second stage: finalize completed pages. */ - if (nr_completed_pages > 0) { - for (i = 0; i < nr_completed_pages; i++) { - int di = completed_pages[i]; - - dp = dest_pages[di]; - /* - * If we are outside the initialized size, zero - * the out of bounds page range. - */ - handle_bounds_compressed_page(dp, i_size, - initialized_size); - flush_dcache_page(dp); - kunmap(dp); - SetPageUptodate(dp); - unlock_page(dp); - if (di == xpage) - *xpage_done = 1; - else - put_page(dp); - dest_pages[di] = NULL; - } - } - return err; - } - - /* Setup offsets for the current sub-block destination. */ - do_sb_start = *dest_ofs; - do_sb_end = do_sb_start + NTFS_SB_SIZE; - - /* Check that we are still within allowed boundaries. */ - if (*dest_index == dest_max_index && do_sb_end > dest_max_ofs) - goto return_overflow; - - /* Does the minimum size of a compressed sb overflow valid range? */ - if (cb + 6 > cb_end) - goto return_overflow; - - /* Setup the current sub-block source pointers and validate range. */ - cb_sb_start = cb; - cb_sb_end = cb_sb_start + (le16_to_cpup((le16*)cb) & NTFS_SB_SIZE_MASK) - + 3; - if (cb_sb_end > cb_end) - goto return_overflow; - - /* Get the current destination page. */ - dp = dest_pages[*dest_index]; - if (!dp) { - /* No page present. Skip decompression of this sub-block. */ - cb = cb_sb_end; - - /* Advance destination position to next sub-block. */ - *dest_ofs = (*dest_ofs + NTFS_SB_SIZE) & ~PAGE_MASK; - if (!*dest_ofs && (++*dest_index > dest_max_index)) - goto return_overflow; - goto do_next_sb; - } - - /* We have a valid destination page. Setup the destination pointers. */ - dp_addr = (u8*)page_address(dp) + do_sb_start; - - /* Now, we are ready to process the current sub-block (sb). */ - if (!(le16_to_cpup((le16*)cb) & NTFS_SB_IS_COMPRESSED)) { - ntfs_debug("Found uncompressed sub-block."); - /* This sb is not compressed, just copy it into destination. */ - - /* Advance source position to first data byte. */ - cb += 2; - - /* An uncompressed sb must be full size. 
*/ - if (cb_sb_end - cb != NTFS_SB_SIZE) - goto return_overflow; - - /* Copy the block and advance the source position. */ - memcpy(dp_addr, cb, NTFS_SB_SIZE); - cb += NTFS_SB_SIZE; - - /* Advance destination position to next sub-block. */ - *dest_ofs += NTFS_SB_SIZE; - if (!(*dest_ofs &= ~PAGE_MASK)) { -finalize_page: - /* - * First stage: add current page index to array of - * completed pages. - */ - completed_pages[nr_completed_pages++] = *dest_index; - if (++*dest_index > dest_max_index) - goto return_overflow; - } - goto do_next_sb; - } - ntfs_debug("Found compressed sub-block."); - /* This sb is compressed, decompress it into destination. */ - - /* Setup destination pointers. */ - dp_sb_start = dp_addr; - dp_sb_end = dp_sb_start + NTFS_SB_SIZE; - - /* Forward to the first tag in the sub-block. */ - cb += 2; -do_next_tag: - if (cb == cb_sb_end) { - /* Check if the decompressed sub-block was not full-length. */ - if (dp_addr < dp_sb_end) { - int nr_bytes = do_sb_end - *dest_ofs; - - ntfs_debug("Filling incomplete sub-block with " - "zeroes."); - /* Zero remainder and update destination position. */ - memset(dp_addr, 0, nr_bytes); - *dest_ofs += nr_bytes; - } - /* We have finished the current sub-block. */ - if (!(*dest_ofs &= ~PAGE_MASK)) - goto finalize_page; - goto do_next_sb; - } - - /* Check we are still in range. */ - if (cb > cb_sb_end || dp_addr > dp_sb_end) - goto return_overflow; - - /* Get the next tag and advance to first token. */ - tag = *cb++; - - /* Parse the eight tokens described by the tag. */ - for (token = 0; token < 8; token++, tag >>= 1) { - u16 lg, pt, length, max_non_overlap; - register u16 i; - u8 *dp_back_addr; - - /* Check if we are done / still in range. */ - if (cb >= cb_sb_end || dp_addr > dp_sb_end) - break; - - /* Determine token type and parse appropriately.*/ - if ((tag & NTFS_TOKEN_MASK) == NTFS_SYMBOL_TOKEN) { - /* - * We have a symbol token, copy the symbol across, and - * advance the source and destination positions. - */ - *dp_addr++ = *cb++; - ++*dest_ofs; - - /* Continue with the next token. */ - continue; - } - - /* - * We have a phrase token. Make sure it is not the first tag in - * the sb as this is illegal and would confuse the code below. - */ - if (dp_addr == dp_sb_start) - goto return_overflow; - - /* - * Determine the number of bytes to go back (p) and the number - * of bytes to copy (l). We use an optimized algorithm in which - * we first calculate log2(current destination position in sb), - * which allows determination of l and p in O(1) rather than - * O(n). We just need an arch-optimized log2() function now. - */ - lg = 0; - for (i = *dest_ofs - do_sb_start - 1; i >= 0x10; i >>= 1) - lg++; - - /* Get the phrase token into i. */ - pt = le16_to_cpup((le16*)cb); - - /* - * Calculate starting position of the byte sequence in - * the destination using the fact that p = (pt >> (12 - lg)) + 1 - * and make sure we don't go too far back. - */ - dp_back_addr = dp_addr - (pt >> (12 - lg)) - 1; - if (dp_back_addr < dp_sb_start) - goto return_overflow; - - /* Now calculate the length of the byte sequence. */ - length = (pt & (0xfff >> lg)) + 3; - - /* Advance destination position and verify it is in range. */ - *dest_ofs += length; - if (*dest_ofs > do_sb_end) - goto return_overflow; - - /* The number of non-overlapping bytes. */ - max_non_overlap = dp_addr - dp_back_addr; - - if (length <= max_non_overlap) { - /* The byte sequence doesn't overlap, just copy it. */ - memcpy(dp_addr, dp_back_addr, length); - - /* Advance destination pointer. 
*/ - dp_addr += length; - } else { - /* - * The byte sequence does overlap, copy non-overlapping - * part and then do a slow byte by byte copy for the - * overlapping part. Also, advance the destination - * pointer. - */ - memcpy(dp_addr, dp_back_addr, max_non_overlap); - dp_addr += max_non_overlap; - dp_back_addr += max_non_overlap; - length -= max_non_overlap; - while (length--) - *dp_addr++ = *dp_back_addr++; - } - - /* Advance source position and continue with the next token. */ - cb += 2; - } - - /* No tokens left in the current tag. Continue with the next tag. */ - goto do_next_tag; - -return_overflow: - ntfs_error(NULL, "Failed. Returning -EOVERFLOW."); - goto return_error; -} - -/** - * ntfs_read_compressed_block - read a compressed block into the page cache - * @page: locked page in the compression block(s) we need to read - * - * When we are called the page has already been verified to be locked and the - * attribute is known to be non-resident, not encrypted, but compressed. - * - * 1. Determine which compression block(s) @page is in. - * 2. Get hold of all pages corresponding to this/these compression block(s). - * 3. Read the (first) compression block. - * 4. Decompress it into the corresponding pages. - * 5. Throw the compressed data away and proceed to 3. for the next compression - * block or return success if no more compression blocks left. - * - * Warning: We have to be careful what we do about existing pages. They might - * have been written to so that we would lose data if we were to just overwrite - * them with the out-of-date uncompressed data. - * - * FIXME: For PAGE_SIZE > cb_size we are not doing the Right Thing(TM) at - * the end of the file I think. We need to detect this case and zero the out - * of bounds remainder of the page in question and mark it as handled. At the - * moment we would just return -EIO on such a page. This bug will only become - * apparent if pages are above 8kiB and the NTFS volume only uses 512 byte - * clusters so is probably not going to be seen by anyone. Still this should - * be fixed. (AIA) - * - * FIXME: Again for PAGE_SIZE > cb_size we are screwing up both in - * handling sparse and compressed cbs. (AIA) - * - * FIXME: At the moment we don't do any zeroing out in the case that - * initialized_size is less than data_size. This should be safe because of the - * nature of the compression algorithm used. Just in case we check and output - * an error message in read inode if the two sizes are not equal for a - * compressed file. (AIA) - */ -int ntfs_read_compressed_block(struct page *page) -{ - loff_t i_size; - s64 initialized_size; - struct address_space *mapping = page->mapping; - ntfs_inode *ni = NTFS_I(mapping->host); - ntfs_volume *vol = ni->vol; - struct super_block *sb = vol->sb; - runlist_element *rl; - unsigned long flags, block_size = sb->s_blocksize; - unsigned char block_size_bits = sb->s_blocksize_bits; - u8 *cb, *cb_pos, *cb_end; - struct buffer_head **bhs; - unsigned long offset, index = page->index; - u32 cb_size = ni->itype.compressed.block_size; - u64 cb_size_mask = cb_size - 1UL; - VCN vcn; - LCN lcn; - /* The first wanted vcn (minimum alignment is PAGE_SIZE). */ - VCN start_vcn = (((s64)index << PAGE_SHIFT) & ~cb_size_mask) >> - vol->cluster_size_bits; - /* - * The first vcn after the last wanted vcn (minimum alignment is again - * PAGE_SIZE. 
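The sub-block format that ntfs_decompress() above just walked can be restated compactly: a little-endian header word whose low 12 bits encode the sub-block length and whose top bit flags compression, then tag bytes each describing eight tokens, where a token is either a literal byte or a 16-bit phrase token whose back-distance/length split depends on how far into the sub-block the output has progressed. A self-contained sketch mirroring that logic, not the driver's code (bounds errors return -1 where the driver returns -EOVERFLOW):

#include <stdint.h>
#include <string.h>

#define SB_SIZE       0x1000	/* decompressed sub-block size */
#define SB_SIZE_MASK  0x0fff
#define SB_COMPRESSED 0x8000

/* Decode one sub-block from src into dst (SB_SIZE bytes). Returns the
 * number of source bytes consumed, or -1 on a malformed stream. A zero
 * header word terminates a whole compression block; the caller checks
 * for it before calling this helper, as the driver does. */
static int decode_sub_block(const uint8_t *src, size_t src_len, uint8_t *dst)
{
	uint16_t hdr;
	size_t sb_len, s = 2, d = 0;

	if (src_len < 2)
		return -1;
	hdr = (uint16_t)(src[0] | (src[1] << 8));	/* little endian */
	sb_len = (size_t)(hdr & SB_SIZE_MASK) + 3;	/* total sb bytes */
	if (!hdr || sb_len > src_len)
		return -1;

	if (!(hdr & SB_COMPRESSED)) {
		/* Stored sub-block: must carry exactly SB_SIZE data bytes. */
		if (sb_len - 2 != SB_SIZE)
			return -1;
		memcpy(dst, src + 2, SB_SIZE);
		return (int)sb_len;
	}

	while (s < sb_len) {
		uint8_t tag = src[s++];	/* eight tokens per tag byte */
		int t;

		for (t = 0; t < 8 && s < sb_len; t++, tag >>= 1) {
			uint16_t pt, i, lg = 0, back, len;

			if (!(tag & 1)) {
				/* Symbol token: one literal byte. */
				if (d >= SB_SIZE)
					return -1;
				dst[d++] = src[s++];
				continue;
			}
			/* Phrase token: split point computed from the
			 * current output position, as in the lg loop
			 * of ntfs_decompress() above. */
			if (s + 1 >= sb_len || d == 0)
				return -1;
			for (i = (uint16_t)(d - 1); i >= 0x10; i >>= 1)
				lg++;
			pt = (uint16_t)(src[s] | (src[s + 1] << 8));
			s += 2;
			back = (uint16_t)((pt >> (12 - lg)) + 1);
			len = (uint16_t)((pt & (0xfff >> lg)) + 3);
			if (back > d || d + len > SB_SIZE)
				return -1;
			while (len--) {	/* may overlap: copy bytewise */
				dst[d] = dst[d - back];
				d++;
			}
		}
	}
	memset(dst + d, 0, SB_SIZE - d);	/* short sb: zero the tail */
	return (int)sb_len;
}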
- */ - VCN end_vcn = ((((s64)(index + 1UL) << PAGE_SHIFT) + cb_size - 1) - & ~cb_size_mask) >> vol->cluster_size_bits; - /* Number of compression blocks (cbs) in the wanted vcn range. */ - unsigned int nr_cbs = (end_vcn - start_vcn) << vol->cluster_size_bits - >> ni->itype.compressed.block_size_bits; - /* - * Number of pages required to store the uncompressed data from all - * compression blocks (cbs) overlapping @page. Due to alignment - * guarantees of start_vcn and end_vcn, no need to round up here. - */ - unsigned int nr_pages = (end_vcn - start_vcn) << - vol->cluster_size_bits >> PAGE_SHIFT; - unsigned int xpage, max_page, cur_page, cur_ofs, i; - unsigned int cb_clusters, cb_max_ofs; - int block, max_block, cb_max_page, bhs_size, nr_bhs, err = 0; - struct page **pages; - int *completed_pages; - unsigned char xpage_done = 0; - - ntfs_debug("Entering, page->index = 0x%lx, cb_size = 0x%x, nr_pages = " - "%i.", index, cb_size, nr_pages); - /* - * Bad things happen if we get here for anything that is not an - * unnamed $DATA attribute. - */ - BUG_ON(ni->type != AT_DATA); - BUG_ON(ni->name_len); - - pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); - completed_pages = kmalloc_array(nr_pages + 1, sizeof(int), GFP_NOFS); - - /* Allocate memory to store the buffer heads we need. */ - bhs_size = cb_size / block_size * sizeof(struct buffer_head *); - bhs = kmalloc(bhs_size, GFP_NOFS); - - if (unlikely(!pages || !bhs || !completed_pages)) { - kfree(bhs); - kfree(pages); - kfree(completed_pages); - unlock_page(page); - ntfs_error(vol->sb, "Failed to allocate internal buffers."); - return -ENOMEM; - } - - /* - * We have already been given one page, this is the one we must do. - * Once again, the alignment guarantees keep it simple. - */ - offset = start_vcn << vol->cluster_size_bits >> PAGE_SHIFT; - xpage = index - offset; - pages[xpage] = page; - /* - * The remaining pages need to be allocated and inserted into the page - * cache, alignment guarantees keep all the below much simpler. (-8 - */ - read_lock_irqsave(&ni->size_lock, flags); - i_size = i_size_read(VFS_I(ni)); - initialized_size = ni->initialized_size; - read_unlock_irqrestore(&ni->size_lock, flags); - max_page = ((i_size + PAGE_SIZE - 1) >> PAGE_SHIFT) - - offset; - /* Is the page fully outside i_size? (truncate in progress) */ - if (xpage >= max_page) { - kfree(bhs); - kfree(pages); - kfree(completed_pages); - zero_user(page, 0, PAGE_SIZE); - ntfs_debug("Compressed read outside i_size - truncated?"); - SetPageUptodate(page); - unlock_page(page); - return 0; - } - if (nr_pages < max_page) - max_page = nr_pages; - for (i = 0; i < max_page; i++, offset++) { - if (i != xpage) - pages[i] = grab_cache_page_nowait(mapping, offset); - page = pages[i]; - if (page) { - /* - * We only (re)read the page if it isn't already read - * in and/or dirty or we would be losing data or at - * least wasting our time. - */ - if (!PageDirty(page) && (!PageUptodate(page) || - PageError(page))) { - ClearPageError(page); - kmap(page); - continue; - } - unlock_page(page); - put_page(page); - pages[i] = NULL; - } - } - - /* - * We have the runlist, and all the destination pages we need to fill. - * Now read the first compression block. - */ - cur_page = 0; - cur_ofs = 0; - cb_clusters = ni->itype.compressed.block_clusters; -do_next_cb: - nr_cbs--; - nr_bhs = 0; - - /* Read all cb buffer heads one cluster at a time. 
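The alignment arithmetic above, in isolation: the wanted page is rounded outward to whole compression blocks, so the derived counts need no further rounding. A sketch with all sizes assumed to be powers of two and all names illustrative:

#include <stdint.h>

struct cb_geometry {
	int64_t start_vcn;	/* first vcn of the first overlapping cb */
	int64_t end_vcn;	/* first vcn after the last overlapping cb */
	unsigned int nr_cbs;	/* compression blocks to read */
	unsigned int nr_pages;	/* pages covering the decompressed range */
};

static struct cb_geometry compute_cb_geometry(uint64_t page_index,
		uint32_t page_size, uint32_t cluster_size, uint32_t cb_size)
{
	struct cb_geometry g;
	uint64_t cb_mask = cb_size - 1ULL;
	uint64_t byte_start = (page_index * page_size) & ~cb_mask;
	uint64_t byte_end = ((page_index + 1) * page_size + cb_size - 1)
			& ~cb_mask;

	g.start_vcn = (int64_t)(byte_start / cluster_size);
	g.end_vcn = (int64_t)(byte_end / cluster_size);
	g.nr_cbs = (unsigned int)((byte_end - byte_start) / cb_size);
	g.nr_pages = (unsigned int)((byte_end - byte_start) / page_size);
	return g;
}

For the common case of 4 KiB pages, 4 KiB clusters and 64 KiB compression blocks, any page index yields exactly one cb spanning 16 clusters and 16 pages, which is why the code above can hand out page slots without any rounding.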
*/ - rl = NULL; - for (vcn = start_vcn, start_vcn += cb_clusters; vcn < start_vcn; - vcn++) { - bool is_retry = false; - - if (!rl) { -lock_retry_remap: - down_read(&ni->runlist.lock); - rl = ni->runlist.rl; - } - if (likely(rl != NULL)) { - /* Seek to element containing target vcn. */ - while (rl->length && rl[1].vcn <= vcn) - rl++; - lcn = ntfs_rl_vcn_to_lcn(rl, vcn); - } else - lcn = LCN_RL_NOT_MAPPED; - ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.", - (unsigned long long)vcn, - (unsigned long long)lcn); - if (lcn < 0) { - /* - * When we reach the first sparse cluster we have - * finished with the cb. - */ - if (lcn == LCN_HOLE) - break; - if (is_retry || lcn != LCN_RL_NOT_MAPPED) - goto rl_err; - is_retry = true; - /* - * Attempt to map runlist, dropping lock for the - * duration. - */ - up_read(&ni->runlist.lock); - if (!ntfs_map_runlist(ni, vcn)) - goto lock_retry_remap; - goto map_rl_err; - } - block = lcn << vol->cluster_size_bits >> block_size_bits; - /* Read the lcn from device in chunks of block_size bytes. */ - max_block = block + (vol->cluster_size >> block_size_bits); - do { - ntfs_debug("block = 0x%x.", block); - if (unlikely(!(bhs[nr_bhs] = sb_getblk(sb, block)))) - goto getblk_err; - nr_bhs++; - } while (++block < max_block); - } - - /* Release the lock if we took it. */ - if (rl) - up_read(&ni->runlist.lock); - - /* Setup and initiate io on all buffer heads. */ - for (i = 0; i < nr_bhs; i++) { - struct buffer_head *tbh = bhs[i]; - - if (!trylock_buffer(tbh)) - continue; - if (unlikely(buffer_uptodate(tbh))) { - unlock_buffer(tbh); - continue; - } - get_bh(tbh); - tbh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, tbh); - } - - /* Wait for io completion on all buffer heads. */ - for (i = 0; i < nr_bhs; i++) { - struct buffer_head *tbh = bhs[i]; - - if (buffer_uptodate(tbh)) - continue; - wait_on_buffer(tbh); - /* - * We need an optimization barrier here, otherwise we start - * hitting the below fixup code when accessing a loopback - * mounted ntfs partition. This indicates either there is a - * race condition in the loop driver or, more likely, gcc - * overoptimises the code without the barrier and it doesn't - * do the Right Thing(TM). - */ - barrier(); - if (unlikely(!buffer_uptodate(tbh))) { - ntfs_warning(vol->sb, "Buffer is unlocked but not " - "uptodate! Unplugging the disk queue " - "and rescheduling."); - get_bh(tbh); - io_schedule(); - put_bh(tbh); - if (unlikely(!buffer_uptodate(tbh))) - goto read_err; - ntfs_warning(vol->sb, "Buffer is now uptodate. Good."); - } - } - - /* - * Get the compression buffer. We must not sleep any more - * until we are finished with it. - */ - spin_lock(&ntfs_cb_lock); - cb = ntfs_compression_buffer; - - BUG_ON(!cb); - - cb_pos = cb; - cb_end = cb + cb_size; - - /* Copy the buffer heads into the contiguous buffer. */ - for (i = 0; i < nr_bhs; i++) { - memcpy(cb_pos, bhs[i]->b_data, block_size); - cb_pos += block_size; - } - - /* Just a precaution. */ - if (cb_pos + 2 <= cb + cb_size) - *(u16*)cb_pos = 0; - - /* Reset cb_pos back to the beginning. */ - cb_pos = cb; - - /* We now have both source (if present) and destination. */ - ntfs_debug("Successfully read the compression block."); - - /* The last page and maximum offset within it for the current cb. */ - cb_max_page = (cur_page << PAGE_SHIFT) + cur_ofs + cb_size; - cb_max_ofs = cb_max_page & ~PAGE_MASK; - cb_max_page >>= PAGE_SHIFT; - - /* Catch end of file inside a compression block. 
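The three-way dispatch that follows classifies each cb by where the runlist walk above stopped: a hole at the very first cluster means a sparse cb (all zeroes), no hole at all means the data was kept raw (a cb that does not compress occupies all of its clusters and is stored uncompressed), and a short run means genuinely compressed data. As a hypothetical helper naming that logic:

#include <stdint.h>

enum cb_kind { CB_SPARSE, CB_STORED, CB_COMPRESSED };

static enum cb_kind classify_cb(int64_t stopped_at_vcn,
		int64_t cb_start_vcn, int64_t cb_clusters)
{
	if (stopped_at_vcn == cb_start_vcn)
		return CB_SPARSE;	/* hole up front: cb reads as zeroes */
	if (stopped_at_vcn >= cb_start_vcn + cb_clusters)
		return CB_STORED;	/* no hole at all: data kept raw */
	return CB_COMPRESSED;		/* short run: decompress what was read */
}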
*/ - if (cb_max_page > max_page) - cb_max_page = max_page; - - if (vcn == start_vcn - cb_clusters) { - /* Sparse cb, zero out page range overlapping the cb. */ - ntfs_debug("Found sparse compression block."); - /* We can sleep from now on, so we drop lock. */ - spin_unlock(&ntfs_cb_lock); - if (cb_max_ofs) - cb_max_page--; - for (; cur_page < cb_max_page; cur_page++) { - page = pages[cur_page]; - if (page) { - if (likely(!cur_ofs)) - clear_page(page_address(page)); - else - memset(page_address(page) + cur_ofs, 0, - PAGE_SIZE - - cur_ofs); - flush_dcache_page(page); - kunmap(page); - SetPageUptodate(page); - unlock_page(page); - if (cur_page == xpage) - xpage_done = 1; - else - put_page(page); - pages[cur_page] = NULL; - } - cb_pos += PAGE_SIZE - cur_ofs; - cur_ofs = 0; - if (cb_pos >= cb_end) - break; - } - /* If we have a partial final page, deal with it now. */ - if (cb_max_ofs && cb_pos < cb_end) { - page = pages[cur_page]; - if (page) - memset(page_address(page) + cur_ofs, 0, - cb_max_ofs - cur_ofs); - /* - * No need to update cb_pos at this stage: - * cb_pos += cb_max_ofs - cur_ofs; - */ - cur_ofs = cb_max_ofs; - } - } else if (vcn == start_vcn) { - /* We can't sleep so we need two stages. */ - unsigned int cur2_page = cur_page; - unsigned int cur_ofs2 = cur_ofs; - u8 *cb_pos2 = cb_pos; - - ntfs_debug("Found uncompressed compression block."); - /* Uncompressed cb, copy it to the destination pages. */ - /* - * TODO: As a big optimization, we could detect this case - * before we read all the pages and use block_read_full_folio() - * on all full pages instead (we still have to treat partial - * pages especially but at least we are getting rid of the - * synchronous io for the majority of pages. - * Or if we choose not to do the read-ahead/-behind stuff, we - * could just return block_read_full_folio(pages[xpage]) as long - * as PAGE_SIZE <= cb_size. - */ - if (cb_max_ofs) - cb_max_page--; - /* First stage: copy data into destination pages. */ - for (; cur_page < cb_max_page; cur_page++) { - page = pages[cur_page]; - if (page) - memcpy(page_address(page) + cur_ofs, cb_pos, - PAGE_SIZE - cur_ofs); - cb_pos += PAGE_SIZE - cur_ofs; - cur_ofs = 0; - if (cb_pos >= cb_end) - break; - } - /* If we have a partial final page, deal with it now. */ - if (cb_max_ofs && cb_pos < cb_end) { - page = pages[cur_page]; - if (page) - memcpy(page_address(page) + cur_ofs, cb_pos, - cb_max_ofs - cur_ofs); - cb_pos += cb_max_ofs - cur_ofs; - cur_ofs = cb_max_ofs; - } - /* We can sleep from now on, so drop lock. */ - spin_unlock(&ntfs_cb_lock); - /* Second stage: finalize pages. */ - for (; cur2_page < cb_max_page; cur2_page++) { - page = pages[cur2_page]; - if (page) { - /* - * If we are outside the initialized size, zero - * the out of bounds page range. - */ - handle_bounds_compressed_page(page, i_size, - initialized_size); - flush_dcache_page(page); - kunmap(page); - SetPageUptodate(page); - unlock_page(page); - if (cur2_page == xpage) - xpage_done = 1; - else - put_page(page); - pages[cur2_page] = NULL; - } - cb_pos2 += PAGE_SIZE - cur_ofs2; - cur_ofs2 = 0; - if (cb_pos2 >= cb_end) - break; - } - } else { - /* Compressed cb, decompress it into the destination page(s). 
*/ - unsigned int prev_cur_page = cur_page; - - ntfs_debug("Found compressed compression block."); - err = ntfs_decompress(pages, completed_pages, &cur_page, - &cur_ofs, cb_max_page, cb_max_ofs, xpage, - &xpage_done, cb_pos, cb_size - (cb_pos - cb), - i_size, initialized_size); - /* - * We can sleep from now on, lock already dropped by - * ntfs_decompress(). - */ - if (err) { - ntfs_error(vol->sb, "ntfs_decompress() failed in inode " - "0x%lx with error code %i. Skipping " - "this compression block.", - ni->mft_no, -err); - /* Release the unfinished pages. */ - for (; prev_cur_page < cur_page; prev_cur_page++) { - page = pages[prev_cur_page]; - if (page) { - flush_dcache_page(page); - kunmap(page); - unlock_page(page); - if (prev_cur_page != xpage) - put_page(page); - pages[prev_cur_page] = NULL; - } - } - } - } - - /* Release the buffer heads. */ - for (i = 0; i < nr_bhs; i++) - brelse(bhs[i]); - - /* Do we have more work to do? */ - if (nr_cbs) - goto do_next_cb; - - /* We no longer need the list of buffer heads. */ - kfree(bhs); - - /* Clean up if we have any pages left. Should never happen. */ - for (cur_page = 0; cur_page < max_page; cur_page++) { - page = pages[cur_page]; - if (page) { - ntfs_error(vol->sb, "Still have pages left! " - "Terminating them with extreme " - "prejudice. Inode 0x%lx, page index " - "0x%lx.", ni->mft_no, page->index); - flush_dcache_page(page); - kunmap(page); - unlock_page(page); - if (cur_page != xpage) - put_page(page); - pages[cur_page] = NULL; - } - } - - /* We no longer need the list of pages. */ - kfree(pages); - kfree(completed_pages); - - /* If we have completed the requested page, we return success. */ - if (likely(xpage_done)) - return 0; - - ntfs_debug("Failed. Returning error code %s.", err == -EOVERFLOW ? - "EOVERFLOW" : (!err ? "EIO" : "unknown error")); - return err < 0 ? err : -EIO; - -read_err: - ntfs_error(vol->sb, "IO error while reading compressed data."); - /* Release the buffer heads. */ - for (i = 0; i < nr_bhs; i++) - brelse(bhs[i]); - goto err_out; - -map_rl_err: - ntfs_error(vol->sb, "ntfs_map_runlist() failed. Cannot read " - "compression block."); - goto err_out; - -rl_err: - up_read(&ni->runlist.lock); - ntfs_error(vol->sb, "ntfs_rl_vcn_to_lcn() failed. Cannot read " - "compression block."); - goto err_out; - -getblk_err: - up_read(&ni->runlist.lock); - ntfs_error(vol->sb, "getblk() failed. Cannot read compression block."); - -err_out: - kfree(bhs); - for (i = cur_page; i < max_page; i++) { - page = pages[i]; - if (page) { - flush_dcache_page(page); - kunmap(page); - unlock_page(page); - if (i != xpage) - put_page(page); - } - } - kfree(pages); - kfree(completed_pages); - return -EIO; -} diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c deleted file mode 100644 index a3c1c5656f8f..000000000000 --- a/fs/ntfs/debug.c +++ /dev/null @@ -1,159 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * debug.c - NTFS kernel debug support. Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2004 Anton Altaparmakov - */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include "debug.h" - -/** - * __ntfs_warning - output a warning to the syslog - * @function: name of function outputting the warning - * @sb: super block of mounted ntfs filesystem - * @fmt: warning string containing format specifications - * @...: a variable number of arguments specified in @fmt - * - * Outputs a warning to the syslog for the mounted ntfs filesystem described - * by @sb. - * - * @fmt and the corresponding @... 
is printf style format string containing - * the warning string and the corresponding format arguments, respectively. - * - * @function is the name of the function from which __ntfs_warning is being - * called. - * - * Note, you should be using debug.h::ntfs_warning(@sb, @fmt, @...) instead - * as this provides the @function parameter automatically. - */ -void __ntfs_warning(const char *function, const struct super_block *sb, - const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - int flen = 0; - -#ifndef DEBUG - if (!printk_ratelimit()) - return; -#endif - if (function) - flen = strlen(function); - va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; - if (sb) - pr_warn("(device %s): %s(): %pV\n", - sb->s_id, flen ? function : "", &vaf); - else - pr_warn("%s(): %pV\n", flen ? function : "", &vaf); - va_end(args); -} - -/** - * __ntfs_error - output an error to the syslog - * @function: name of function outputting the error - * @sb: super block of mounted ntfs filesystem - * @fmt: error string containing format specifications - * @...: a variable number of arguments specified in @fmt - * - * Outputs an error to the syslog for the mounted ntfs filesystem described - * by @sb. - * - * @fmt and the corresponding @... is printf style format string containing - * the error string and the corresponding format arguments, respectively. - * - * @function is the name of the function from which __ntfs_error is being - * called. - * - * Note, you should be using debug.h::ntfs_error(@sb, @fmt, @...) instead - * as this provides the @function parameter automatically. - */ -void __ntfs_error(const char *function, const struct super_block *sb, - const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - int flen = 0; - -#ifndef DEBUG - if (!printk_ratelimit()) - return; -#endif - if (function) - flen = strlen(function); - va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; - if (sb) - pr_err("(device %s): %s(): %pV\n", - sb->s_id, flen ? function : "", &vaf); - else - pr_err("%s(): %pV\n", flen ? function : "", &vaf); - va_end(args); -} - -#ifdef DEBUG - -/* If 1, output debug messages, and if 0, don't. */ -int debug_msgs = 0; - -void __ntfs_debug(const char *file, int line, const char *function, - const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - int flen = 0; - - if (!debug_msgs) - return; - if (function) - flen = strlen(function); - va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; - pr_debug("(%s, %d): %s(): %pV", file, line, flen ? function : "", &vaf); - va_end(args); -} - -/* Dump a runlist. Caller has to provide synchronisation for @rl. */ -void ntfs_debug_dump_runlist(const runlist_element *rl) -{ - int i; - const char *lcn_str[5] = { "LCN_HOLE ", "LCN_RL_NOT_MAPPED", - "LCN_ENOENT ", "LCN_unknown " }; - - if (!debug_msgs) - return; - pr_debug("Dumping runlist (values in hex):\n"); - if (!rl) { - pr_debug("Run list not present.\n"); - return; - } - pr_debug("VCN LCN Run length\n"); - for (i = 0; ; i++) { - LCN lcn = (rl + i)->lcn; - - if (lcn < (LCN)0) { - int index = -lcn - 1; - - if (index > -LCN_ENOENT - 1) - index = 3; - pr_debug("%-16Lx %s %-16Lx%s\n", - (long long)(rl + i)->vcn, lcn_str[index], - (long long)(rl + i)->length, - (rl + i)->length ? "" : - " (runlist end)"); - } else - pr_debug("%-16Lx %-16Lx %-16Lx%s\n", - (long long)(rl + i)->vcn, - (long long)(rl + i)->lcn, - (long long)(rl + i)->length, - (rl + i)->length ? 
"" : - " (runlist end)"); - if (!(rl + i)->length) - break; - } -} - -#endif diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h deleted file mode 100644 index 6fdef388f129..000000000000 --- a/fs/ntfs/debug.h +++ /dev/null @@ -1,57 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * debug.h - NTFS kernel debug support. Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2004 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_DEBUG_H -#define _LINUX_NTFS_DEBUG_H - -#include <linux/fs.h> - -#include "runlist.h" - -#ifdef DEBUG - -extern int debug_msgs; - -extern __printf(4, 5) -void __ntfs_debug(const char *file, int line, const char *function, - const char *format, ...); -/** - * ntfs_debug - write a debug level message to syslog - * @f: a printf format string containing the message - * @...: the variables to substitute into @f - * - * ntfs_debug() writes a DEBUG level message to the syslog but only if the - * driver was compiled with -DDEBUG. Otherwise, the call turns into a NOP. - */ -#define ntfs_debug(f, a...) \ - __ntfs_debug(__FILE__, __LINE__, __func__, f, ##a) - -extern void ntfs_debug_dump_runlist(const runlist_element *rl); - -#else /* !DEBUG */ - -#define ntfs_debug(fmt, ...) \ -do { \ - if (0) \ - no_printk(fmt, ##__VA_ARGS__); \ -} while (0) - -#define ntfs_debug_dump_runlist(rl) do {} while (0) - -#endif /* !DEBUG */ - -extern __printf(3, 4) -void __ntfs_warning(const char *function, const struct super_block *sb, - const char *fmt, ...); -#define ntfs_warning(sb, f, a...) __ntfs_warning(__func__, sb, f, ##a) - -extern __printf(3, 4) -void __ntfs_error(const char *function, const struct super_block *sb, - const char *fmt, ...); -#define ntfs_error(sb, f, a...) __ntfs_error(__func__, sb, f, ##a) - -#endif /* _LINUX_NTFS_DEBUG_H */ diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c deleted file mode 100644 index 629723a8d712..000000000000 --- a/fs/ntfs/dir.c +++ /dev/null @@ -1,1540 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * dir.c - NTFS kernel directory operations. Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2007 Anton Altaparmakov - * Copyright (c) 2002 Richard Russon - */ - -#include <linux/buffer_head.h> -#include <linux/slab.h> -#include <linux/blkdev.h> - -#include "dir.h" -#include "aops.h" -#include "attrib.h" -#include "mft.h" -#include "debug.h" -#include "ntfs.h" - -/* - * The little endian Unicode string $I30 as a global constant. - */ -ntfschar I30[5] = { cpu_to_le16('$'), cpu_to_le16('I'), - cpu_to_le16('3'), cpu_to_le16('0'), 0 }; - -/** - * ntfs_lookup_inode_by_name - find an inode in a directory given its name - * @dir_ni: ntfs inode of the directory in which to search for the name - * @uname: Unicode name for which to search in the directory - * @uname_len: length of the name @uname in Unicode characters - * @res: return the found file name if necessary (see below) - * - * Look for an inode with name @uname in the directory with inode @dir_ni. - * ntfs_lookup_inode_by_name() walks the contents of the directory looking for - * the Unicode name. If the name is found in the directory, the corresponding - * inode number (>= 0) is returned as a mft reference in cpu format, i.e. it - * is a 64-bit number containing the sequence number. - * - * On error, a negative value is returned corresponding to the error code. In - * particular if the inode is not found -ENOENT is returned. 
Note that you - * can't just check the return value for being negative, you have to check the - * inode number for being negative which you can extract using MREC(return - * value). - * - * Note, @uname_len does not include the (optional) terminating NULL character. - * - * Note, we look for a case sensitive match first but we also look for a case - * insensitive match at the same time. If we find a case insensitive match, we - * save that for the case that we don't find an exact match, where we return - * the case insensitive match and setup @res (which we allocate!) with the mft - * reference, the file name type, length and with a copy of the little endian - * Unicode file name itself. If we match a file name which is in the DOS name - * space, we only return the mft reference and file name type in @res. - * ntfs_lookup() then uses this to find the long file name in the inode itself. - * This is to avoid polluting the dcache with short file names. We want them to - * work but we don't care for how quickly one can access them. This also fixes - * the dcache aliasing issues. - * - * Locking: - Caller must hold i_mutex on the directory. - * - Each page cache page in the index allocation mapping must be - * locked whilst being accessed otherwise we may find a corrupt - * page due to it being under ->writepage at the moment which - * applies the mst protection fixups before writing out and then - * removes them again after the write is complete after which it - * unlocks the page. - */ -MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, - const int uname_len, ntfs_name **res) -{ - ntfs_volume *vol = dir_ni->vol; - struct super_block *sb = vol->sb; - MFT_RECORD *m; - INDEX_ROOT *ir; - INDEX_ENTRY *ie; - INDEX_ALLOCATION *ia; - u8 *index_end; - u64 mref; - ntfs_attr_search_ctx *ctx; - int err, rc; - VCN vcn, old_vcn; - struct address_space *ia_mapping; - struct page *page; - u8 *kaddr; - ntfs_name *name = NULL; - - BUG_ON(!S_ISDIR(VFS_I(dir_ni)->i_mode)); - BUG_ON(NInoAttr(dir_ni)); - /* Get hold of the mft record for the directory. */ - m = map_mft_record(dir_ni); - if (IS_ERR(m)) { - ntfs_error(sb, "map_mft_record() failed with error code %ld.", - -PTR_ERR(m)); - return ERR_MREF(PTR_ERR(m)); - } - ctx = ntfs_attr_get_search_ctx(dir_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - /* Find the index root attribute in the mft record. */ - err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, - 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) { - ntfs_error(sb, "Index root attribute missing in " - "directory inode 0x%lx.", - dir_ni->mft_no); - err = -EIO; - } - goto err_out; - } - /* Get to the index root value (it's been verified in read_inode). */ - ir = (INDEX_ROOT*)((u8*)ctx->attr + - le16_to_cpu(ctx->attr->data.resident.value_offset)); - index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); - /* The first index entry. */ - ie = (INDEX_ENTRY*)((u8*)&ir->index + - le32_to_cpu(ir->index.entries_offset)); - /* - * Loop until we exceed valid memory (corruption case) or until we - * reach the last entry. - */ - for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { - /* Bounds checks. */ - if ((u8*)ie < (u8*)ctx->mrec || (u8*)ie + - sizeof(INDEX_ENTRY_HEADER) > index_end || - (u8*)ie + le16_to_cpu(ie->key_length) > - index_end) - goto dir_err_out; - /* - * The last entry cannot contain a name. It can however contain - * a pointer to a child node in the B+tree so we just break out. 
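Stripped of the NTFS specifics, the walk that starts here has a simple shape: entries in a node are sorted by the collation rule; an exact match ends the search, and the first entry that collates after the key (or the terminator entry) points at the child node holding everything smaller. A sketch with illustrative types, using a memcmp-style compare() in place of the Unicode collation:

struct node;

struct entry {
	int is_end;			/* terminator: no key of its own */
	const struct node *child;	/* child node, or NULL if none */
	const char *key;
	unsigned long value;		/* stands in for the mft reference */
};

struct node {
	const struct entry *entries;
	int nr_entries;			/* including the terminator entry */
};

static long index_lookup(const struct node *n, const char *key,
		int (*compare)(const char *, const char *))
{
	while (n) {
		const struct node *next = NULL;
		int i;

		for (i = 0; i < n->nr_entries; i++) {
			const struct entry *e = &n->entries[i];

			if (!e->is_end) {
				int rc = compare(key, e->key);

				if (rc == 0)
					return (long)e->value;	/* match */
				if (rc > 0)
					continue;	/* key sorts after e */
			}
			/* key sorts before e, or e is the terminator:
			 * anything smaller lives in e's child, if any. */
			next = e->child;
			break;
		}
		n = next;
	}
	return -1;	/* not found: -ENOENT in the driver above */
}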
- */ - if (ie->flags & INDEX_ENTRY_END) - break; - /* - * We perform a case sensitive comparison and if that matches - * we are done and return the mft reference of the inode (i.e. - * the inode number together with the sequence number for - * consistency checking). We convert it to cpu format before - * returning. - */ - if (ntfs_are_names_equal(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, - CASE_SENSITIVE, vol->upcase, vol->upcase_len)) { -found_it: - /* - * We have a perfect match, so we don't need to care - * about having matched imperfectly before, so we can - * free name and set *res to NULL. - * However, if the perfect match is a short file name, - * we need to signal this through *res, so that - * ntfs_lookup() can fix dcache aliasing issues. - * As an optimization we just reuse an existing - * allocation of *res. - */ - if (ie->key.file_name.file_name_type == FILE_NAME_DOS) { - if (!name) { - name = kmalloc(sizeof(ntfs_name), - GFP_NOFS); - if (!name) { - err = -ENOMEM; - goto err_out; - } - } - name->mref = le64_to_cpu( - ie->data.dir.indexed_file); - name->type = FILE_NAME_DOS; - name->len = 0; - *res = name; - } else { - kfree(name); - *res = NULL; - } - mref = le64_to_cpu(ie->data.dir.indexed_file); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(dir_ni); - return mref; - } - /* - * For a case insensitive mount, we also perform a case - * insensitive comparison (provided the file name is not in the - * POSIX namespace). If the comparison matches, and the name is - * in the WIN32 namespace, we cache the filename in *res so - * that the caller, ntfs_lookup(), can work on it. If the - * comparison matches, and the name is in the DOS namespace, we - * only cache the mft reference and the file name type (we set - * the name length to zero for simplicity). - */ - if (!NVolCaseSensitive(vol) && - ie->key.file_name.file_name_type && - ntfs_are_names_equal(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, - IGNORE_CASE, vol->upcase, vol->upcase_len)) { - int name_size = sizeof(ntfs_name); - u8 type = ie->key.file_name.file_name_type; - u8 len = ie->key.file_name.file_name_length; - - /* Only one case insensitive matching name allowed. */ - if (name) { - ntfs_error(sb, "Found already allocated name " - "in phase 1. Please run chkdsk " - "and if that doesn't find any " - "errors please report you saw " - "this message to " - "linux-ntfs-dev@lists." - "sourceforge.net."); - goto dir_err_out; - } - - if (type != FILE_NAME_DOS) - name_size += len * sizeof(ntfschar); - name = kmalloc(name_size, GFP_NOFS); - if (!name) { - err = -ENOMEM; - goto err_out; - } - name->mref = le64_to_cpu(ie->data.dir.indexed_file); - name->type = type; - if (type != FILE_NAME_DOS) { - name->len = len; - memcpy(name->name, ie->key.file_name.file_name, - len * sizeof(ntfschar)); - } else - name->len = 0; - *res = name; - } - /* - * Not a perfect match, need to do full blown collation so we - * know which way in the B+tree we have to go. - */ - rc = ntfs_collate_names(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, 1, - IGNORE_CASE, vol->upcase, vol->upcase_len); - /* - * If uname collates before the name of the current entry, there - * is definitely no such name in this index but we might need to - * descend into the B+tree so we just break out of the loop. - */ - if (rc == -1) - break; - /* The names are not equal, continue the search. 
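The comparison discipline in this loop, shown with plain ASCII stand-ins for the ntfschar/upcase-table machinery: collate case-insensitively to navigate the tree, and only when that ties break the tie case-sensitively, since that is the order the index is sorted in. A sketch, not the driver's code (strcasecmp() is POSIX):

#include <string.h>
#include <strings.h>

/* Return -1, 0 or 1, like ntfs_collate_names() above. */
static int collate_two_pass(const char *a, const char *b)
{
	int rc = strcasecmp(a, b);	/* IGNORE_CASE pass */

	if (rc)
		return rc < 0 ? -1 : 1;
	rc = strcmp(a, b);		/* CASE_SENSITIVE tie-break */
	return rc < 0 ? -1 : rc ? 1 : 0;
}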
*/ - if (rc) - continue; - /* - * Names match with case insensitive comparison, now try the - * case sensitive comparison, which is required for proper - * collation. - */ - rc = ntfs_collate_names(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, 1, - CASE_SENSITIVE, vol->upcase, vol->upcase_len); - if (rc == -1) - break; - if (rc) - continue; - /* - * Perfect match, this will never happen as the - * ntfs_are_names_equal() call will have gotten a match but we - * still treat it correctly. - */ - goto found_it; - } - /* - * We have finished with this index without success. Check for the - * presence of a child node and if not present return -ENOENT, unless - * we have got a matching name cached in name in which case return the - * mft reference associated with it. - */ - if (!(ie->flags & INDEX_ENTRY_NODE)) { - if (name) { - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(dir_ni); - return name->mref; - } - ntfs_debug("Entry not found."); - err = -ENOENT; - goto err_out; - } /* Child node present, descend into it. */ - /* Consistency check: Verify that an index allocation exists. */ - if (!NInoIndexAllocPresent(dir_ni)) { - ntfs_error(sb, "No index allocation attribute but index entry " - "requires one. Directory inode 0x%lx is " - "corrupt or driver bug.", dir_ni->mft_no); - goto err_out; - } - /* Get the starting vcn of the index_block holding the child node. */ - vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8)); - ia_mapping = VFS_I(dir_ni)->i_mapping; - /* - * We are done with the index root and the mft record. Release them, - * otherwise we deadlock with ntfs_map_page(). - */ - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(dir_ni); - m = NULL; - ctx = NULL; -descend_into_child_node: - /* - * Convert vcn to index into the index allocation attribute in units - * of PAGE_SIZE and map the page cache page, reading it from - * disk if necessary. - */ - page = ntfs_map_page(ia_mapping, vcn << - dir_ni->itype.index.vcn_size_bits >> PAGE_SHIFT); - if (IS_ERR(page)) { - ntfs_error(sb, "Failed to map directory index page, error %ld.", - -PTR_ERR(page)); - err = PTR_ERR(page); - goto err_out; - } - lock_page(page); - kaddr = (u8*)page_address(page); -fast_descend_into_child_node: - /* Get to the index allocation block. */ - ia = (INDEX_ALLOCATION*)(kaddr + ((vcn << - dir_ni->itype.index.vcn_size_bits) & ~PAGE_MASK)); - /* Bounds checks. */ - if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) { - ntfs_error(sb, "Out of bounds check failed. Corrupt directory " - "inode 0x%lx or driver bug.", dir_ni->mft_no); - goto unm_err_out; - } - /* Catch multi sector transfer fixup errors. */ - if (unlikely(!ntfs_is_indx_record(ia->magic))) { - ntfs_error(sb, "Directory index record with vcn 0x%llx is " - "corrupt. Corrupt inode 0x%lx. Run chkdsk.", - (unsigned long long)vcn, dir_ni->mft_no); - goto unm_err_out; - } - if (sle64_to_cpu(ia->index_block_vcn) != vcn) { - ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " - "different from expected VCN (0x%llx). " - "Directory inode 0x%lx is corrupt or driver " - "bug.", (unsigned long long) - sle64_to_cpu(ia->index_block_vcn), - (unsigned long long)vcn, dir_ni->mft_no); - goto unm_err_out; - } - if (le32_to_cpu(ia->index.allocated_size) + 0x18 != - dir_ni->itype.index.block_size) { - ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " - "0x%lx has a size (%u) differing from the " - "directory specified size (%u). 
Directory " - "inode is corrupt or driver bug.", - (unsigned long long)vcn, dir_ni->mft_no, - le32_to_cpu(ia->index.allocated_size) + 0x18, - dir_ni->itype.index.block_size); - goto unm_err_out; - } - index_end = (u8*)ia + dir_ni->itype.index.block_size; - if (index_end > kaddr + PAGE_SIZE) { - ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " - "0x%lx crosses page boundary. Impossible! " - "Cannot access! This is probably a bug in the " - "driver.", (unsigned long long)vcn, - dir_ni->mft_no); - goto unm_err_out; - } - index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); - if (index_end > (u8*)ia + dir_ni->itype.index.block_size) { - ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory " - "inode 0x%lx exceeds maximum size.", - (unsigned long long)vcn, dir_ni->mft_no); - goto unm_err_out; - } - /* The first index entry. */ - ie = (INDEX_ENTRY*)((u8*)&ia->index + - le32_to_cpu(ia->index.entries_offset)); - /* - * Iterate similar to above big loop but applied to index buffer, thus - * loop until we exceed valid memory (corruption case) or until we - * reach the last entry. - */ - for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { - /* Bounds check. */ - if ((u8*)ie < (u8*)ia || (u8*)ie + - sizeof(INDEX_ENTRY_HEADER) > index_end || - (u8*)ie + le16_to_cpu(ie->key_length) > - index_end) { - ntfs_error(sb, "Index entry out of bounds in " - "directory inode 0x%lx.", - dir_ni->mft_no); - goto unm_err_out; - } - /* - * The last entry cannot contain a name. It can however contain - * a pointer to a child node in the B+tree so we just break out. - */ - if (ie->flags & INDEX_ENTRY_END) - break; - /* - * We perform a case sensitive comparison and if that matches - * we are done and return the mft reference of the inode (i.e. - * the inode number together with the sequence number for - * consistency checking). We convert it to cpu format before - * returning. - */ - if (ntfs_are_names_equal(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, - CASE_SENSITIVE, vol->upcase, vol->upcase_len)) { -found_it2: - /* - * We have a perfect match, so we don't need to care - * about having matched imperfectly before, so we can - * free name and set *res to NULL. - * However, if the perfect match is a short file name, - * we need to signal this through *res, so that - * ntfs_lookup() can fix dcache aliasing issues. - * As an optimization we just reuse an existing - * allocation of *res. - */ - if (ie->key.file_name.file_name_type == FILE_NAME_DOS) { - if (!name) { - name = kmalloc(sizeof(ntfs_name), - GFP_NOFS); - if (!name) { - err = -ENOMEM; - goto unm_err_out; - } - } - name->mref = le64_to_cpu( - ie->data.dir.indexed_file); - name->type = FILE_NAME_DOS; - name->len = 0; - *res = name; - } else { - kfree(name); - *res = NULL; - } - mref = le64_to_cpu(ie->data.dir.indexed_file); - unlock_page(page); - ntfs_unmap_page(page); - return mref; - } - /* - * For a case insensitive mount, we also perform a case - * insensitive comparison (provided the file name is not in the - * POSIX namespace). If the comparison matches, and the name is - * in the WIN32 namespace, we cache the filename in *res so - * that the caller, ntfs_lookup(), can work on it. If the - * comparison matches, and the name is in the DOS namespace, we - * only cache the mft reference and the file name type (we set - * the name length to zero for simplicity). 
- */ - if (!NVolCaseSensitive(vol) && - ie->key.file_name.file_name_type && - ntfs_are_names_equal(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, - IGNORE_CASE, vol->upcase, vol->upcase_len)) { - int name_size = sizeof(ntfs_name); - u8 type = ie->key.file_name.file_name_type; - u8 len = ie->key.file_name.file_name_length; - - /* Only one case insensitive matching name allowed. */ - if (name) { - ntfs_error(sb, "Found already allocated name " - "in phase 2. Please run chkdsk " - "and if that doesn't find any " - "errors please report you saw " - "this message to " - "linux-ntfs-dev@lists." - "sourceforge.net."); - unlock_page(page); - ntfs_unmap_page(page); - goto dir_err_out; - } - - if (type != FILE_NAME_DOS) - name_size += len * sizeof(ntfschar); - name = kmalloc(name_size, GFP_NOFS); - if (!name) { - err = -ENOMEM; - goto unm_err_out; - } - name->mref = le64_to_cpu(ie->data.dir.indexed_file); - name->type = type; - if (type != FILE_NAME_DOS) { - name->len = len; - memcpy(name->name, ie->key.file_name.file_name, - len * sizeof(ntfschar)); - } else - name->len = 0; - *res = name; - } - /* - * Not a perfect match, need to do full blown collation so we - * know which way in the B+tree we have to go. - */ - rc = ntfs_collate_names(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, 1, - IGNORE_CASE, vol->upcase, vol->upcase_len); - /* - * If uname collates before the name of the current entry, there - * is definitely no such name in this index but we might need to - * descend into the B+tree so we just break out of the loop. - */ - if (rc == -1) - break; - /* The names are not equal, continue the search. */ - if (rc) - continue; - /* - * Names match with case insensitive comparison, now try the - * case sensitive comparison, which is required for proper - * collation. - */ - rc = ntfs_collate_names(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, 1, - CASE_SENSITIVE, vol->upcase, vol->upcase_len); - if (rc == -1) - break; - if (rc) - continue; - /* - * Perfect match, this will never happen as the - * ntfs_are_names_equal() call will have gotten a match but we - * still treat it correctly. - */ - goto found_it2; - } - /* - * We have finished with this index buffer without success. Check for - * the presence of a child node. - */ - if (ie->flags & INDEX_ENTRY_NODE) { - if ((ia->index.flags & NODE_MASK) == LEAF_NODE) { - ntfs_error(sb, "Index entry with child node found in " - "a leaf node in directory inode 0x%lx.", - dir_ni->mft_no); - goto unm_err_out; - } - /* Child node present, descend into it. */ - old_vcn = vcn; - vcn = sle64_to_cpup((sle64*)((u8*)ie + - le16_to_cpu(ie->length) - 8)); - if (vcn >= 0) { - /* If vcn is in the same page cache page as old_vcn we - * recycle the mapped page. */ - if (old_vcn << vol->cluster_size_bits >> - PAGE_SHIFT == vcn << - vol->cluster_size_bits >> - PAGE_SHIFT) - goto fast_descend_into_child_node; - unlock_page(page); - ntfs_unmap_page(page); - goto descend_into_child_node; - } - ntfs_error(sb, "Negative child node vcn in directory inode " - "0x%lx.", dir_ni->mft_no); - goto unm_err_out; - } - /* - * No child node present, return -ENOENT, unless we have got a matching - * name cached in name in which case return the mft reference - * associated with it. 
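
The collation logic above is the classic B+tree descent decision: a three-way comparison either matches exactly, keeps scanning the current node, or stops so the walk can descend into the child that precedes the current entry. Modeled with plain strings in place of ntfs_collate_names() (illustrative only):

#include <string.h>

struct entry { const char *name; long mref; };

/* Returns mref on an exact match; -1 marks the point where the real code
 * would descend into a child node (or report ENOENT in a leaf). */
static long node_lookup(const struct entry *e, int n, const char *uname)
{
	for (int i = 0; i < n; i++) {
		int rc = strcmp(uname, e[i].name);

		if (rc > 0)
			continue;   /* uname collates after: keep scanning */
		if (rc < 0)
			return -1;  /* collates before: descend here */
		return e[i].mref;   /* perfect match */
	}
	return -1;                  /* descend via the end entry's child */
}
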
- */ - if (name) { - unlock_page(page); - ntfs_unmap_page(page); - return name->mref; - } - ntfs_debug("Entry not found."); - err = -ENOENT; -unm_err_out: - unlock_page(page); - ntfs_unmap_page(page); -err_out: - if (!err) - err = -EIO; - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(dir_ni); - if (name) { - kfree(name); - *res = NULL; - } - return ERR_MREF(err); -dir_err_out: - ntfs_error(sb, "Corrupt directory. Aborting lookup."); - goto err_out; -} - -#if 0 - -// TODO: (AIA) -// The algorithm embedded in this code will be required for the time when we -// want to support adding of entries to directories, where we require correct -// collation of file names in order not to cause corruption of the filesystem. - -/** - * ntfs_lookup_inode_by_name - find an inode in a directory given its name - * @dir_ni: ntfs inode of the directory in which to search for the name - * @uname: Unicode name for which to search in the directory - * @uname_len: length of the name @uname in Unicode characters - * - * Look for an inode with name @uname in the directory with inode @dir_ni. - * ntfs_lookup_inode_by_name() walks the contents of the directory looking for - * the Unicode name. If the name is found in the directory, the corresponding - * inode number (>= 0) is returned as a mft reference in cpu format, i.e. it - * is a 64-bit number containing the sequence number. - * - * On error, a negative value is returned corresponding to the error code. In - * particular if the inode is not found -ENOENT is returned. Note that you - * can't just check the return value for being negative, you have to check the - * inode number for being negative which you can extract using MREC(return - * value). - * - * Note, @uname_len does not include the (optional) terminating NULL character. - */ -u64 ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, - const int uname_len) -{ - ntfs_volume *vol = dir_ni->vol; - struct super_block *sb = vol->sb; - MFT_RECORD *m; - INDEX_ROOT *ir; - INDEX_ENTRY *ie; - INDEX_ALLOCATION *ia; - u8 *index_end; - u64 mref; - ntfs_attr_search_ctx *ctx; - int err, rc; - IGNORE_CASE_BOOL ic; - VCN vcn, old_vcn; - struct address_space *ia_mapping; - struct page *page; - u8 *kaddr; - - /* Get hold of the mft record for the directory. */ - m = map_mft_record(dir_ni); - if (IS_ERR(m)) { - ntfs_error(sb, "map_mft_record() failed with error code %ld.", - -PTR_ERR(m)); - return ERR_MREF(PTR_ERR(m)); - } - ctx = ntfs_attr_get_search_ctx(dir_ni, m); - if (!ctx) { - err = -ENOMEM; - goto err_out; - } - /* Find the index root attribute in the mft record. */ - err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, - 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) { - ntfs_error(sb, "Index root attribute missing in " - "directory inode 0x%lx.", - dir_ni->mft_no); - err = -EIO; - } - goto err_out; - } - /* Get to the index root value (it's been verified in read_inode). */ - ir = (INDEX_ROOT*)((u8*)ctx->attr + - le16_to_cpu(ctx->attr->data.resident.value_offset)); - index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); - /* The first index entry. */ - ie = (INDEX_ENTRY*)((u8*)&ir->index + - le32_to_cpu(ir->index.entries_offset)); - /* - * Loop until we exceed valid memory (corruption case) or until we - * reach the last entry. - */ - for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { - /* Bounds checks. 
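
The return convention spelled out in the comment above packs either a valid mft reference or a negative errno into a single u64, so callers must test the encoded error rather than doing a plain sign check on the 64-bit value. A hedged sketch of such an encoding (modeled on the driver's ERR_MREF-style macros; the exact bit chosen for the test here is illustrative):

#include <stdint.h>

/* Sign-extending a negative errno sets the high bits, including bit 47,
 * which a well-formed reference (48-bit record number plus 16-bit
 * sequence number, with bounded record numbers) never has set. */
#define DEMO_ERR_MREF(err)    ((uint64_t)(int64_t)(err))
#define DEMO_IS_ERR_MREF(ref) (((ref) & 0x0000800000000000ULL) != 0)
#define DEMO_MREF_ERR(ref)    ((int)(int64_t)(ref))
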
*/ - if ((u8*)ie < (u8*)ctx->mrec || (u8*)ie + - sizeof(INDEX_ENTRY_HEADER) > index_end || - (u8*)ie + le16_to_cpu(ie->key_length) > - index_end) - goto dir_err_out; - /* - * The last entry cannot contain a name. It can however contain - * a pointer to a child node in the B+tree so we just break out. - */ - if (ie->flags & INDEX_ENTRY_END) - break; - /* - * If the current entry has a name type of POSIX, the name is - * case sensitive and not otherwise. This has the effect of us - * not being able to access any POSIX file names which collate - * after the non-POSIX one when they only differ in case, but - * anyone doing screwy stuff like that deserves to burn in - * hell... Doing that kind of stuff on NT4 actually causes - * corruption on the partition even when using SP6a and Linux - * is not involved at all. - */ - ic = ie->key.file_name.file_name_type ? IGNORE_CASE : - CASE_SENSITIVE; - /* - * If the names match perfectly, we are done and return the - * mft reference of the inode (i.e. the inode number together - * with the sequence number for consistency checking. We - * convert it to cpu format before returning. - */ - if (ntfs_are_names_equal(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, ic, - vol->upcase, vol->upcase_len)) { -found_it: - mref = le64_to_cpu(ie->data.dir.indexed_file); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(dir_ni); - return mref; - } - /* - * Not a perfect match, need to do full blown collation so we - * know which way in the B+tree we have to go. - */ - rc = ntfs_collate_names(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, 1, - IGNORE_CASE, vol->upcase, vol->upcase_len); - /* - * If uname collates before the name of the current entry, there - * is definitely no such name in this index but we might need to - * descend into the B+tree so we just break out of the loop. - */ - if (rc == -1) - break; - /* The names are not equal, continue the search. */ - if (rc) - continue; - /* - * Names match with case insensitive comparison, now try the - * case sensitive comparison, which is required for proper - * collation. - */ - rc = ntfs_collate_names(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, 1, - CASE_SENSITIVE, vol->upcase, vol->upcase_len); - if (rc == -1) - break; - if (rc) - continue; - /* - * Perfect match, this will never happen as the - * ntfs_are_names_equal() call will have gotten a match but we - * still treat it correctly. - */ - goto found_it; - } - /* - * We have finished with this index without success. Check for the - * presence of a child node. - */ - if (!(ie->flags & INDEX_ENTRY_NODE)) { - /* No child node, return -ENOENT. */ - err = -ENOENT; - goto err_out; - } /* Child node present, descend into it. */ - /* Consistency check: Verify that an index allocation exists. */ - if (!NInoIndexAllocPresent(dir_ni)) { - ntfs_error(sb, "No index allocation attribute but index entry " - "requires one. Directory inode 0x%lx is " - "corrupt or driver bug.", dir_ni->mft_no); - goto err_out; - } - /* Get the starting vcn of the index_block holding the child node. */ - vcn = sle64_to_cpup((u8*)ie + le16_to_cpu(ie->length) - 8); - ia_mapping = VFS_I(dir_ni)->i_mapping; - /* - * We are done with the index root and the mft record. Release them, - * otherwise we deadlock with ntfs_map_page(). 
- */ - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(dir_ni); - m = NULL; - ctx = NULL; -descend_into_child_node: - /* - * Convert vcn to index into the index allocation attribute in units - * of PAGE_SIZE and map the page cache page, reading it from - * disk if necessary. - */ - page = ntfs_map_page(ia_mapping, vcn << - dir_ni->itype.index.vcn_size_bits >> PAGE_SHIFT); - if (IS_ERR(page)) { - ntfs_error(sb, "Failed to map directory index page, error %ld.", - -PTR_ERR(page)); - err = PTR_ERR(page); - goto err_out; - } - lock_page(page); - kaddr = (u8*)page_address(page); -fast_descend_into_child_node: - /* Get to the index allocation block. */ - ia = (INDEX_ALLOCATION*)(kaddr + ((vcn << - dir_ni->itype.index.vcn_size_bits) & ~PAGE_MASK)); - /* Bounds checks. */ - if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) { - ntfs_error(sb, "Out of bounds check failed. Corrupt directory " - "inode 0x%lx or driver bug.", dir_ni->mft_no); - goto unm_err_out; - } - /* Catch multi sector transfer fixup errors. */ - if (unlikely(!ntfs_is_indx_record(ia->magic))) { - ntfs_error(sb, "Directory index record with vcn 0x%llx is " - "corrupt. Corrupt inode 0x%lx. Run chkdsk.", - (unsigned long long)vcn, dir_ni->mft_no); - goto unm_err_out; - } - if (sle64_to_cpu(ia->index_block_vcn) != vcn) { - ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " - "different from expected VCN (0x%llx). " - "Directory inode 0x%lx is corrupt or driver " - "bug.", (unsigned long long) - sle64_to_cpu(ia->index_block_vcn), - (unsigned long long)vcn, dir_ni->mft_no); - goto unm_err_out; - } - if (le32_to_cpu(ia->index.allocated_size) + 0x18 != - dir_ni->itype.index.block_size) { - ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " - "0x%lx has a size (%u) differing from the " - "directory specified size (%u). Directory " - "inode is corrupt or driver bug.", - (unsigned long long)vcn, dir_ni->mft_no, - le32_to_cpu(ia->index.allocated_size) + 0x18, - dir_ni->itype.index.block_size); - goto unm_err_out; - } - index_end = (u8*)ia + dir_ni->itype.index.block_size; - if (index_end > kaddr + PAGE_SIZE) { - ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " - "0x%lx crosses page boundary. Impossible! " - "Cannot access! This is probably a bug in the " - "driver.", (unsigned long long)vcn, - dir_ni->mft_no); - goto unm_err_out; - } - index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); - if (index_end > (u8*)ia + dir_ni->itype.index.block_size) { - ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory " - "inode 0x%lx exceeds maximum size.", - (unsigned long long)vcn, dir_ni->mft_no); - goto unm_err_out; - } - /* The first index entry. */ - ie = (INDEX_ENTRY*)((u8*)&ia->index + - le32_to_cpu(ia->index.entries_offset)); - /* - * Iterate similar to above big loop but applied to index buffer, thus - * loop until we exceed valid memory (corruption case) or until we - * reach the last entry. - */ - for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { - /* Bounds check. */ - if ((u8*)ie < (u8*)ia || (u8*)ie + - sizeof(INDEX_ENTRY_HEADER) > index_end || - (u8*)ie + le16_to_cpu(ie->key_length) > - index_end) { - ntfs_error(sb, "Index entry out of bounds in " - "directory inode 0x%lx.", - dir_ni->mft_no); - goto unm_err_out; - } - /* - * The last entry cannot contain a name. It can however contain - * a pointer to a child node in the B+tree so we just break out. 
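
The ntfs_is_indx_record() test above is a cheap sanity gate: every index allocation block begins with the four-byte ASCII magic "INDX", so a block that fails the test is either corrupt on disk or did not survive its multi sector transfer fixups on read. A user-space equivalent of the check (illustrative):

#include <string.h>

/* Analogue of ntfs_is_indx_record(): reject any block that does not
 * start with the "INDX" magic before trusting anything inside it. */
static int is_indx_record(const void *block)
{
	return memcmp(block, "INDX", 4) == 0;
}
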
- */ - if (ie->flags & INDEX_ENTRY_END) - break; - /* - * If the current entry has a name type of POSIX, the name is - * case sensitive and not otherwise. This has the effect of us - * not being able to access any POSIX file names which collate - * after the non-POSIX one when they only differ in case, but - * anyone doing screwy stuff like that deserves to burn in - * hell... Doing that kind of stuff on NT4 actually causes - * corruption on the partition even when using SP6a and Linux - * is not involved at all. - */ - ic = ie->key.file_name.file_name_type ? IGNORE_CASE : - CASE_SENSITIVE; - /* - * If the names match perfectly, we are done and return the - * mft reference of the inode (i.e. the inode number together - * with the sequence number for consistency checking. We - * convert it to cpu format before returning. - */ - if (ntfs_are_names_equal(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, ic, - vol->upcase, vol->upcase_len)) { -found_it2: - mref = le64_to_cpu(ie->data.dir.indexed_file); - unlock_page(page); - ntfs_unmap_page(page); - return mref; - } - /* - * Not a perfect match, need to do full blown collation so we - * know which way in the B+tree we have to go. - */ - rc = ntfs_collate_names(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, 1, - IGNORE_CASE, vol->upcase, vol->upcase_len); - /* - * If uname collates before the name of the current entry, there - * is definitely no such name in this index but we might need to - * descend into the B+tree so we just break out of the loop. - */ - if (rc == -1) - break; - /* The names are not equal, continue the search. */ - if (rc) - continue; - /* - * Names match with case insensitive comparison, now try the - * case sensitive comparison, which is required for proper - * collation. - */ - rc = ntfs_collate_names(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, 1, - CASE_SENSITIVE, vol->upcase, vol->upcase_len); - if (rc == -1) - break; - if (rc) - continue; - /* - * Perfect match, this will never happen as the - * ntfs_are_names_equal() call will have gotten a match but we - * still treat it correctly. - */ - goto found_it2; - } - /* - * We have finished with this index buffer without success. Check for - * the presence of a child node. - */ - if (ie->flags & INDEX_ENTRY_NODE) { - if ((ia->index.flags & NODE_MASK) == LEAF_NODE) { - ntfs_error(sb, "Index entry with child node found in " - "a leaf node in directory inode 0x%lx.", - dir_ni->mft_no); - goto unm_err_out; - } - /* Child node present, descend into it. */ - old_vcn = vcn; - vcn = sle64_to_cpup((u8*)ie + le16_to_cpu(ie->length) - 8); - if (vcn >= 0) { - /* If vcn is in the same page cache page as old_vcn we - * recycle the mapped page. */ - if (old_vcn << vol->cluster_size_bits >> - PAGE_SHIFT == vcn << - vol->cluster_size_bits >> - PAGE_SHIFT) - goto fast_descend_into_child_node; - unlock_page(page); - ntfs_unmap_page(page); - goto descend_into_child_node; - } - ntfs_error(sb, "Negative child node vcn in directory inode " - "0x%lx.", dir_ni->mft_no); - goto unm_err_out; - } - /* No child node, return -ENOENT. */ - ntfs_debug("Entry not found."); - err = -ENOENT; -unm_err_out: - unlock_page(page); - ntfs_unmap_page(page); -err_out: - if (!err) - err = -EIO; - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(dir_ni); - return ERR_MREF(err); -dir_err_out: - ntfs_error(sb, "Corrupt directory. 
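
The fast descend path above avoids remapping when two VCNs land in the same page cache page; the test simply compares the byte offsets of both VCNs shifted down to page granularity. The arithmetic in isolation (the page shift is illustrative):

#include <stdint.h>

#define DEMO_PAGE_SHIFT 12   /* 4 KiB pages, for illustration */

static inline unsigned long vcn_to_page_index(int64_t vcn,
					      unsigned cluster_size_bits)
{
	/* byte offset of the VCN, then which page that byte lives in */
	return (unsigned long)((vcn << cluster_size_bits) >> DEMO_PAGE_SHIFT);
}

/* The mapped page is reused iff
 * vcn_to_page_index(old_vcn, bits) == vcn_to_page_index(vcn, bits). */
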
Aborting lookup."); - goto err_out; -} - -#endif - -/** - * ntfs_filldir - ntfs specific filldir method - * @vol: current ntfs volume - * @ndir: ntfs inode of current directory - * @ia_page: page in which the index allocation buffer @ie is in resides - * @ie: current index entry - * @name: buffer to use for the converted name - * @actor: what to feed the entries to - * - * Convert the Unicode @name to the loaded NLS and pass it to the @filldir - * callback. - * - * If @ia_page is not NULL it is the locked page containing the index - * allocation block containing the index entry @ie. - * - * Note, we drop (and then reacquire) the page lock on @ia_page across the - * @filldir() call otherwise we would deadlock with NFSd when it calls ->lookup - * since ntfs_lookup() will lock the same page. As an optimization, we do not - * retake the lock if we are returning a non-zero value as ntfs_readdir() - * would need to drop the lock immediately anyway. - */ -static inline int ntfs_filldir(ntfs_volume *vol, - ntfs_inode *ndir, struct page *ia_page, INDEX_ENTRY *ie, - u8 *name, struct dir_context *actor) -{ - unsigned long mref; - int name_len; - unsigned dt_type; - FILE_NAME_TYPE_FLAGS name_type; - - name_type = ie->key.file_name.file_name_type; - if (name_type == FILE_NAME_DOS) { - ntfs_debug("Skipping DOS name space entry."); - return 0; - } - if (MREF_LE(ie->data.dir.indexed_file) == FILE_root) { - ntfs_debug("Skipping root directory self reference entry."); - return 0; - } - if (MREF_LE(ie->data.dir.indexed_file) < FILE_first_user && - !NVolShowSystemFiles(vol)) { - ntfs_debug("Skipping system file."); - return 0; - } - name_len = ntfs_ucstonls(vol, (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, &name, - NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1); - if (name_len <= 0) { - ntfs_warning(vol->sb, "Skipping unrepresentable inode 0x%llx.", - (long long)MREF_LE(ie->data.dir.indexed_file)); - return 0; - } - if (ie->key.file_name.file_attributes & - FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT) - dt_type = DT_DIR; - else - dt_type = DT_REG; - mref = MREF_LE(ie->data.dir.indexed_file); - /* - * Drop the page lock otherwise we deadlock with NFS when it calls - * ->lookup since ntfs_lookup() will lock the same page. - */ - if (ia_page) - unlock_page(ia_page); - ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode " - "0x%lx, DT_%s.", name, name_len, actor->pos, mref, - dt_type == DT_DIR ? "DIR" : "REG"); - if (!dir_emit(actor, name, name_len, mref, dt_type)) - return 1; - /* Relock the page but not if we are aborting ->readdir. */ - if (ia_page) - lock_page(ia_page); - return 0; -} - -/* - * We use the same basic approach as the old NTFS driver, i.e. we parse the - * index root entries and then the index allocation entries that are marked - * as in use in the index bitmap. - * - * While this will return the names in random order this doesn't matter for - * ->readdir but OTOH results in a faster ->readdir. - * - * VFS calls ->readdir without BKL but with i_mutex held. This protects the VFS - * parts (e.g. ->f_pos and ->i_size, and it also protects against directory - * modifications). - * - * Locking: - Caller must hold i_mutex on the directory. 
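
ntfs_filldir() above funnels each decoded name into dir_emit(), and the resume semantics matter: when the emitter reports a full buffer, the walk must stop without losing the entry it could not deliver, so the next ->iterate_shared() call picks up exactly there. A runnable user-space model of that contract (names and types are stand-ins, not the kernel API):

#include <stdio.h>
#include <stdbool.h>

struct dir_ctx { long pos; int room; };

/* Returns false when the caller's buffer is full, like dir_emit(). */
static bool emit(struct dir_ctx *ctx, const char *name)
{
	if (ctx->room == 0)
		return false;
	ctx->room--;
	printf("%ld: %s\n", ctx->pos, name);
	return true;
}

static void readdir_model(struct dir_ctx *ctx, const char **names, int n)
{
	for (long i = ctx->pos; i < n; i++) {
		if (!emit(ctx, names[i]))
			return;       /* resume at the unemitted entry */
		ctx->pos = i + 1;     /* advance only after a success */
	}
}
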
- * - Each page cache page in the index allocation mapping must be - * locked whilst being accessed otherwise we may find a corrupt - * page due to it being under ->writepage at the moment which - * applies the mst protection fixups before writing out and then - * removes them again after the write is complete after which it - * unlocks the page. - */ -static int ntfs_readdir(struct file *file, struct dir_context *actor) -{ - s64 ia_pos, ia_start, prev_ia_pos, bmp_pos; - loff_t i_size; - struct inode *bmp_vi, *vdir = file_inode(file); - struct super_block *sb = vdir->i_sb; - ntfs_inode *ndir = NTFS_I(vdir); - ntfs_volume *vol = NTFS_SB(sb); - MFT_RECORD *m; - INDEX_ROOT *ir = NULL; - INDEX_ENTRY *ie; - INDEX_ALLOCATION *ia; - u8 *name = NULL; - int rc, err, ir_pos, cur_bmp_pos; - struct address_space *ia_mapping, *bmp_mapping; - struct page *bmp_page = NULL, *ia_page = NULL; - u8 *kaddr, *bmp, *index_end; - ntfs_attr_search_ctx *ctx; - - ntfs_debug("Entering for inode 0x%lx, fpos 0x%llx.", - vdir->i_ino, actor->pos); - rc = err = 0; - /* Are we at end of dir yet? */ - i_size = i_size_read(vdir); - if (actor->pos >= i_size + vol->mft_record_size) - return 0; - /* Emulate . and .. for all directories. */ - if (!dir_emit_dots(file, actor)) - return 0; - m = NULL; - ctx = NULL; - /* - * Allocate a buffer to store the current name being processed - * converted to format determined by current NLS. - */ - name = kmalloc(NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1, GFP_NOFS); - if (unlikely(!name)) { - err = -ENOMEM; - goto err_out; - } - /* Are we jumping straight into the index allocation attribute? */ - if (actor->pos >= vol->mft_record_size) - goto skip_index_root; - /* Get hold of the mft record for the directory. */ - m = map_mft_record(ndir); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(ndir, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - /* Get the offset into the index root attribute. */ - ir_pos = (s64)actor->pos; - /* Find the index root attribute in the mft record. */ - err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, - 0, ctx); - if (unlikely(err)) { - ntfs_error(sb, "Index root attribute missing in directory " - "inode 0x%lx.", vdir->i_ino); - goto err_out; - } - /* - * Copy the index root attribute value to a buffer so that we can put - * the search context and unmap the mft record before calling the - * filldir() callback. We need to do this because of NFSd which calls - * ->lookup() from its filldir callback() and this causes NTFS to - * deadlock as ntfs_lookup() maps the mft record of the directory and - * we have got it mapped here already. The only solution is for us to - * unmap the mft record here so that a call to ntfs_lookup() is able to - * map the mft record without deadlocking. - */ - rc = le32_to_cpu(ctx->attr->data.resident.value_length); - ir = kmalloc(rc, GFP_NOFS); - if (unlikely(!ir)) { - err = -ENOMEM; - goto err_out; - } - /* Copy the index root value (it has been verified in read_inode). */ - memcpy(ir, (u8*)ctx->attr + - le16_to_cpu(ctx->attr->data.resident.value_offset), rc); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(ndir); - ctx = NULL; - m = NULL; - index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); - /* The first index entry. 
*/ - ie = (INDEX_ENTRY*)((u8*)&ir->index + - le32_to_cpu(ir->index.entries_offset)); - /* - * Loop until we exceed valid memory (corruption case) or until we - * reach the last entry or until filldir tells us it has had enough - * or signals an error (both covered by the rc test). - */ - for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { - ntfs_debug("In index root, offset 0x%zx.", (u8*)ie - (u8*)ir); - /* Bounds checks. */ - if (unlikely((u8*)ie < (u8*)ir || (u8*)ie + - sizeof(INDEX_ENTRY_HEADER) > index_end || - (u8*)ie + le16_to_cpu(ie->key_length) > - index_end)) - goto err_out; - /* The last entry cannot contain a name. */ - if (ie->flags & INDEX_ENTRY_END) - break; - /* Skip index root entry if continuing previous readdir. */ - if (ir_pos > (u8*)ie - (u8*)ir) - continue; - /* Advance the position even if going to skip the entry. */ - actor->pos = (u8*)ie - (u8*)ir; - /* Submit the name to the filldir callback. */ - rc = ntfs_filldir(vol, ndir, NULL, ie, name, actor); - if (rc) { - kfree(ir); - goto abort; - } - } - /* We are done with the index root and can free the buffer. */ - kfree(ir); - ir = NULL; - /* If there is no index allocation attribute we are finished. */ - if (!NInoIndexAllocPresent(ndir)) - goto EOD; - /* Advance fpos to the beginning of the index allocation. */ - actor->pos = vol->mft_record_size; -skip_index_root: - kaddr = NULL; - prev_ia_pos = -1LL; - /* Get the offset into the index allocation attribute. */ - ia_pos = (s64)actor->pos - vol->mft_record_size; - ia_mapping = vdir->i_mapping; - ntfs_debug("Inode 0x%lx, getting index bitmap.", vdir->i_ino); - bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4); - if (IS_ERR(bmp_vi)) { - ntfs_error(sb, "Failed to get bitmap attribute."); - err = PTR_ERR(bmp_vi); - goto err_out; - } - bmp_mapping = bmp_vi->i_mapping; - /* Get the starting bitmap bit position and sanity check it. */ - bmp_pos = ia_pos >> ndir->itype.index.block_size_bits; - if (unlikely(bmp_pos >> 3 >= i_size_read(bmp_vi))) { - ntfs_error(sb, "Current index allocation position exceeds " - "index bitmap size."); - goto iput_err_out; - } - /* Get the starting bit position in the current bitmap page. */ - cur_bmp_pos = bmp_pos & ((PAGE_SIZE * 8) - 1); - bmp_pos &= ~(u64)((PAGE_SIZE * 8) - 1); -get_next_bmp_page: - ntfs_debug("Reading bitmap with page index 0x%llx, bit ofs 0x%llx", - (unsigned long long)bmp_pos >> (3 + PAGE_SHIFT), - (unsigned long long)bmp_pos & - (unsigned long long)((PAGE_SIZE * 8) - 1)); - bmp_page = ntfs_map_page(bmp_mapping, - bmp_pos >> (3 + PAGE_SHIFT)); - if (IS_ERR(bmp_page)) { - ntfs_error(sb, "Reading index bitmap failed."); - err = PTR_ERR(bmp_page); - bmp_page = NULL; - goto iput_err_out; - } - bmp = (u8*)page_address(bmp_page); - /* Find next index block in use. */ - while (!(bmp[cur_bmp_pos >> 3] & (1 << (cur_bmp_pos & 7)))) { -find_next_index_buffer: - cur_bmp_pos++; - /* - * If we have reached the end of the bitmap page, get the next - * page, and put away the old one. - */ - if (unlikely((cur_bmp_pos >> 3) >= PAGE_SIZE)) { - ntfs_unmap_page(bmp_page); - bmp_pos += PAGE_SIZE * 8; - cur_bmp_pos = 0; - goto get_next_bmp_page; - } - /* If we have reached the end of the bitmap, we are done. */ - if (unlikely(((bmp_pos + cur_bmp_pos) >> 3) >= i_size)) - goto unm_EOD; - ia_pos = (bmp_pos + cur_bmp_pos) << - ndir->itype.index.block_size_bits; - } - ntfs_debug("Handling index buffer 0x%llx.", - (unsigned long long)bmp_pos + cur_bmp_pos); - /* If the current index buffer is in the same page we reuse the page. 
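
The while loop above skips index blocks whose bits are clear in the $BITMAP attribute, walking the bitmap one page at a time. The bit test reduced to its core (an illustrative helper mirroring bmp[pos >> 3] & (1 << (pos & 7)) from the loop):

#include <stdint.h>

/* Find the next set bit at or after 'start'; -1 if none below 'nbits'. */
static long next_set_bit(const uint8_t *bmp, long start, long nbits)
{
	for (long b = start; b < nbits; b++)
		if (bmp[b >> 3] & (1u << (b & 7)))
			return b;    /* index block b is in use */
	return -1;                   /* end of the directory's bitmap */
}
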
*/ - if ((prev_ia_pos & (s64)PAGE_MASK) != - (ia_pos & (s64)PAGE_MASK)) { - prev_ia_pos = ia_pos; - if (likely(ia_page != NULL)) { - unlock_page(ia_page); - ntfs_unmap_page(ia_page); - } - /* - * Map the page cache page containing the current ia_pos, - * reading it from disk if necessary. - */ - ia_page = ntfs_map_page(ia_mapping, ia_pos >> PAGE_SHIFT); - if (IS_ERR(ia_page)) { - ntfs_error(sb, "Reading index allocation data failed."); - err = PTR_ERR(ia_page); - ia_page = NULL; - goto err_out; - } - lock_page(ia_page); - kaddr = (u8*)page_address(ia_page); - } - /* Get the current index buffer. */ - ia = (INDEX_ALLOCATION*)(kaddr + (ia_pos & ~PAGE_MASK & - ~(s64)(ndir->itype.index.block_size - 1))); - /* Bounds checks. */ - if (unlikely((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE)) { - ntfs_error(sb, "Out of bounds check failed. Corrupt directory " - "inode 0x%lx or driver bug.", vdir->i_ino); - goto err_out; - } - /* Catch multi sector transfer fixup errors. */ - if (unlikely(!ntfs_is_indx_record(ia->magic))) { - ntfs_error(sb, "Directory index record with vcn 0x%llx is " - "corrupt. Corrupt inode 0x%lx. Run chkdsk.", - (unsigned long long)ia_pos >> - ndir->itype.index.vcn_size_bits, vdir->i_ino); - goto err_out; - } - if (unlikely(sle64_to_cpu(ia->index_block_vcn) != (ia_pos & - ~(s64)(ndir->itype.index.block_size - 1)) >> - ndir->itype.index.vcn_size_bits)) { - ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " - "different from expected VCN (0x%llx). " - "Directory inode 0x%lx is corrupt or driver " - "bug. ", (unsigned long long) - sle64_to_cpu(ia->index_block_vcn), - (unsigned long long)ia_pos >> - ndir->itype.index.vcn_size_bits, vdir->i_ino); - goto err_out; - } - if (unlikely(le32_to_cpu(ia->index.allocated_size) + 0x18 != - ndir->itype.index.block_size)) { - ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " - "0x%lx has a size (%u) differing from the " - "directory specified size (%u). Directory " - "inode is corrupt or driver bug.", - (unsigned long long)ia_pos >> - ndir->itype.index.vcn_size_bits, vdir->i_ino, - le32_to_cpu(ia->index.allocated_size) + 0x18, - ndir->itype.index.block_size); - goto err_out; - } - index_end = (u8*)ia + ndir->itype.index.block_size; - if (unlikely(index_end > kaddr + PAGE_SIZE)) { - ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " - "0x%lx crosses page boundary. Impossible! " - "Cannot access! This is probably a bug in the " - "driver.", (unsigned long long)ia_pos >> - ndir->itype.index.vcn_size_bits, vdir->i_ino); - goto err_out; - } - ia_start = ia_pos & ~(s64)(ndir->itype.index.block_size - 1); - index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); - if (unlikely(index_end > (u8*)ia + ndir->itype.index.block_size)) { - ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory " - "inode 0x%lx exceeds maximum size.", - (unsigned long long)ia_pos >> - ndir->itype.index.vcn_size_bits, vdir->i_ino); - goto err_out; - } - /* The first index entry in this index buffer. */ - ie = (INDEX_ENTRY*)((u8*)&ia->index + - le32_to_cpu(ia->index.entries_offset)); - /* - * Loop until we exceed valid memory (corruption case) or until we - * reach the last entry or until filldir tells us it has had enough - * or signals an error (both covered by the rc test). - */ - for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { - ntfs_debug("In index allocation, offset 0x%llx.", - (unsigned long long)ia_start + - (unsigned long long)((u8*)ie - (u8*)ia)); - /* Bounds checks. 
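
Locating the index block inside its page, as above, is two mask operations: keep only the offset within the page, then align down to the start of the index block (both sizes are powers of two, and this path assumes block_size <= PAGE_SIZE, which the later cross-page check enforces). Isolated, with illustrative constants:

#include <stdint.h>

#define DEMO_PAGE_SIZE 4096UL
#define DEMO_PAGE_MASK (~(DEMO_PAGE_SIZE - 1))

static inline unsigned long ia_offset_in_page(int64_t ia_pos,
					      uint32_t block_size)
{
	/* offset within the page, aligned down to the block start */
	return (unsigned long)
		(ia_pos & ~DEMO_PAGE_MASK & ~(int64_t)(block_size - 1));
}
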
*/ - if (unlikely((u8*)ie < (u8*)ia || (u8*)ie + - sizeof(INDEX_ENTRY_HEADER) > index_end || - (u8*)ie + le16_to_cpu(ie->key_length) > - index_end)) - goto err_out; - /* The last entry cannot contain a name. */ - if (ie->flags & INDEX_ENTRY_END) - break; - /* Skip index block entry if continuing previous readdir. */ - if (ia_pos - ia_start > (u8*)ie - (u8*)ia) - continue; - /* Advance the position even if going to skip the entry. */ - actor->pos = (u8*)ie - (u8*)ia + - (sle64_to_cpu(ia->index_block_vcn) << - ndir->itype.index.vcn_size_bits) + - vol->mft_record_size; - /* - * Submit the name to the @filldir callback. Note, - * ntfs_filldir() drops the lock on @ia_page but it retakes it - * before returning, unless a non-zero value is returned in - * which case the page is left unlocked. - */ - rc = ntfs_filldir(vol, ndir, ia_page, ie, name, actor); - if (rc) { - /* @ia_page is already unlocked in this case. */ - ntfs_unmap_page(ia_page); - ntfs_unmap_page(bmp_page); - iput(bmp_vi); - goto abort; - } - } - goto find_next_index_buffer; -unm_EOD: - if (ia_page) { - unlock_page(ia_page); - ntfs_unmap_page(ia_page); - } - ntfs_unmap_page(bmp_page); - iput(bmp_vi); -EOD: - /* We are finished, set fpos to EOD. */ - actor->pos = i_size + vol->mft_record_size; -abort: - kfree(name); - return 0; -err_out: - if (bmp_page) { - ntfs_unmap_page(bmp_page); -iput_err_out: - iput(bmp_vi); - } - if (ia_page) { - unlock_page(ia_page); - ntfs_unmap_page(ia_page); - } - kfree(ir); - kfree(name); - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(ndir); - if (!err) - err = -EIO; - ntfs_debug("Failed. Returning error code %i.", -err); - return err; -} - -/** - * ntfs_dir_open - called when an inode is about to be opened - * @vi: inode to be opened - * @filp: file structure describing the inode - * - * Limit directory size to the page cache limit on architectures where unsigned - * long is 32-bits. This is the most we can do for now without overflowing the - * page cache page index. Doing it this way means we don't run into problems - * because of existing too large directories. It would be better to allow the - * user to read the accessible part of the directory but I doubt very much - * anyone is going to hit this check on a 32-bit architecture, so there is no - * point in adding the extra complexity required to support this. - * - * On 64-bit architectures, the check is hopefully optimized away by the - * compiler. - */ -static int ntfs_dir_open(struct inode *vi, struct file *filp) -{ - if (sizeof(unsigned long) < 8) { - if (i_size_read(vi) > MAX_LFS_FILESIZE) - return -EFBIG; - } - return 0; -} - -#ifdef NTFS_RW - -/** - * ntfs_dir_fsync - sync a directory to disk - * @filp: directory to be synced - * @start: offset in bytes of the beginning of data range to sync - * @end: offset in bytes of the end of data range (inclusive) - * @datasync: if non-zero only flush user data and not metadata - * - * Data integrity sync of a directory to disk. Used for fsync, fdatasync, and - * msync system calls. This function is based on file.c::ntfs_file_fsync(). - * - * Write the mft record and all associated extent mft records as well as the - * $INDEX_ALLOCATION and $BITMAP attributes and then sync the block device. - * - * If @datasync is true, we do not wait on the inode(s) to be written out - * but we always wait on the page cache pages to be written out. - * - * Note: In the past @filp could be NULL so we ignore it as we don't need it - * anyway. 
- * - * Locking: Caller must hold i_mutex on the inode. - * - * TODO: We should probably also write all attribute/index inodes associated - * with this inode but since we have no simple way of getting to them we ignore - * this problem for now. We do write the $BITMAP attribute if it is present - * which is the important one for a directory so things are not too bad. - */ -static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end, - int datasync) -{ - struct inode *bmp_vi, *vi = filp->f_mapping->host; - int err, ret; - ntfs_attr na; - - ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); - - err = file_write_and_wait_range(filp, start, end); - if (err) - return err; - inode_lock(vi); - - BUG_ON(!S_ISDIR(vi->i_mode)); - /* If the bitmap attribute inode is in memory sync it, too. */ - na.mft_no = vi->i_ino; - na.type = AT_BITMAP; - na.name = I30; - na.name_len = 4; - bmp_vi = ilookup5(vi->i_sb, vi->i_ino, ntfs_test_inode, &na); - if (bmp_vi) { - write_inode_now(bmp_vi, !datasync); - iput(bmp_vi); - } - ret = __ntfs_write_inode(vi, 1); - write_inode_now(vi, !datasync); - err = sync_blockdev(vi->i_sb->s_bdev); - if (unlikely(err && !ret)) - ret = err; - if (likely(!ret)) - ntfs_debug("Done."); - else - ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " - "%u.", datasync ? "data" : "", vi->i_ino, -ret); - inode_unlock(vi); - return ret; -} - -#endif /* NTFS_RW */ - -WRAP_DIR_ITER(ntfs_readdir) // FIXME! -const struct file_operations ntfs_dir_ops = { - .llseek = generic_file_llseek, /* Seek inside directory. */ - .read = generic_read_dir, /* Return -EISDIR. */ - .iterate_shared = shared_ntfs_readdir, /* Read directory contents. */ -#ifdef NTFS_RW - .fsync = ntfs_dir_fsync, /* Sync a directory to disk. */ -#endif /* NTFS_RW */ - /*.ioctl = ,*/ /* Perform function on the - mounted filesystem. */ - .open = ntfs_dir_open, /* Open directory. */ -}; diff --git a/fs/ntfs/dir.h b/fs/ntfs/dir.h deleted file mode 100644 index 0e326753df40..000000000000 --- a/fs/ntfs/dir.h +++ /dev/null @@ -1,34 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * dir.h - Defines for directory handling in NTFS Linux kernel driver. Part of - * the Linux-NTFS project. - * - * Copyright (c) 2002-2004 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_DIR_H -#define _LINUX_NTFS_DIR_H - -#include "layout.h" -#include "inode.h" -#include "types.h" - -/* - * ntfs_name is used to return the file name to the caller of - * ntfs_lookup_inode_by_name() in order for the caller (namei.c::ntfs_lookup()) - * to be able to deal with dcache aliasing issues. - */ -typedef struct { - MFT_REF mref; - FILE_NAME_TYPE_FLAGS type; - u8 len; - ntfschar name[0]; -} __attribute__ ((__packed__)) ntfs_name; - -/* The little endian Unicode string $I30 as a global constant. */ -extern ntfschar I30[5]; - -extern MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, - const ntfschar *uname, const int uname_len, ntfs_name **res); - -#endif /* _LINUX_NTFS_FS_DIR_H */ diff --git a/fs/ntfs/endian.h b/fs/ntfs/endian.h deleted file mode 100644 index f30c139bf9ae..000000000000 --- a/fs/ntfs/endian.h +++ /dev/null @@ -1,79 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * endian.h - Defines for endianness handling in NTFS Linux kernel driver. - * Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2004 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_ENDIAN_H -#define _LINUX_NTFS_ENDIAN_H - -#include <asm/byteorder.h> -#include "types.h" - -/* - * Signed endianness conversion functions. 
- */ - -static inline s16 sle16_to_cpu(sle16 x) -{ - return le16_to_cpu((__force le16)x); -} - -static inline s32 sle32_to_cpu(sle32 x) -{ - return le32_to_cpu((__force le32)x); -} - -static inline s64 sle64_to_cpu(sle64 x) -{ - return le64_to_cpu((__force le64)x); -} - -static inline s16 sle16_to_cpup(sle16 *x) -{ - return le16_to_cpu(*(__force le16*)x); -} - -static inline s32 sle32_to_cpup(sle32 *x) -{ - return le32_to_cpu(*(__force le32*)x); -} - -static inline s64 sle64_to_cpup(sle64 *x) -{ - return le64_to_cpu(*(__force le64*)x); -} - -static inline sle16 cpu_to_sle16(s16 x) -{ - return (__force sle16)cpu_to_le16(x); -} - -static inline sle32 cpu_to_sle32(s32 x) -{ - return (__force sle32)cpu_to_le32(x); -} - -static inline sle64 cpu_to_sle64(s64 x) -{ - return (__force sle64)cpu_to_le64(x); -} - -static inline sle16 cpu_to_sle16p(s16 *x) -{ - return (__force sle16)cpu_to_le16(*x); -} - -static inline sle32 cpu_to_sle32p(s32 *x) -{ - return (__force sle32)cpu_to_le32(*x); -} - -static inline sle64 cpu_to_sle64p(s64 *x) -{ - return (__force sle64)cpu_to_le64(*x); -} - -#endif /* _LINUX_NTFS_ENDIAN_H */ diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c deleted file mode 100644 index 297c0b9db621..000000000000 --- a/fs/ntfs/file.c +++ /dev/null @@ -1,1997 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * file.c - NTFS kernel file operations. Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc. - */ - -#include <linux/blkdev.h> -#include <linux/backing-dev.h> -#include <linux/buffer_head.h> -#include <linux/gfp.h> -#include <linux/pagemap.h> -#include <linux/pagevec.h> -#include <linux/sched/signal.h> -#include <linux/swap.h> -#include <linux/uio.h> -#include <linux/writeback.h> - -#include <asm/page.h> -#include <linux/uaccess.h> - -#include "attrib.h" -#include "bitmap.h" -#include "inode.h" -#include "debug.h" -#include "lcnalloc.h" -#include "malloc.h" -#include "mft.h" -#include "ntfs.h" - -/** - * ntfs_file_open - called when an inode is about to be opened - * @vi: inode to be opened - * @filp: file structure describing the inode - * - * Limit file size to the page cache limit on architectures where unsigned long - * is 32-bits. This is the most we can do for now without overflowing the page - * cache page index. Doing it this way means we don't run into problems because - * of existing too large files. It would be better to allow the user to read - * the beginning of the file but I doubt very much anyone is going to hit this - * check on a 32-bit architecture, so there is no point in adding the extra - * complexity required to support this. - * - * On 64-bit architectures, the check is hopefully optimized away by the - * compiler. - * - * After the check passes, just call generic_file_open() to do its work. - */ -static int ntfs_file_open(struct inode *vi, struct file *filp) -{ - if (sizeof(unsigned long) < 8) { - if (i_size_read(vi) > MAX_LFS_FILESIZE) - return -EOVERFLOW; - } - return generic_file_open(vi, filp); -} - -#ifdef NTFS_RW - -/** - * ntfs_attr_extend_initialized - extend the initialized size of an attribute - * @ni: ntfs inode of the attribute to extend - * @new_init_size: requested new initialized size in bytes - * - * Extend the initialized size of an attribute described by the ntfs inode @ni - * to @new_init_size bytes. 
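
The sle16/sle32/sle64 wrappers above exist because the generic le*_to_cpu helpers are typed for unsigned values; routing signed on-disk fields through a dedicated type keeps sparse's __bitwise checking intact, while the __force casts confine the reinterpretation to this one header. A user-space analogue of the 64-bit pair (assumes glibc's <endian.h>; the kernel type would carry a __bitwise tag that plain C cannot express):

#include <endian.h>
#include <stdint.h>

typedef int64_t demo_sle64;   /* __bitwise-tagged in the real driver */

static inline int64_t demo_sle64_to_cpu(demo_sle64 x)
{
	/* reinterpret as unsigned, byte-swap, reinterpret back */
	return (int64_t)le64toh((uint64_t)x);
}

static inline demo_sle64 demo_cpu_to_sle64(int64_t x)
{
	return (demo_sle64)htole64((uint64_t)x);
}
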
This involves zeroing any non-sparse space between - * the old initialized size and @new_init_size both in the page cache and on - * disk (if relevant complete pages are already uptodate in the page cache then - * these are simply marked dirty). - * - * As a side-effect, the file size (vfs inode->i_size) may be incremented as, - * in the resident attribute case, it is tied to the initialized size and, in - * the non-resident attribute case, it may not fall below the initialized size. - * - * Note that if the attribute is resident, we do not need to touch the page - * cache at all. This is because if the page cache page is not uptodate we - * bring it uptodate later, when doing the write to the mft record since we - * then already have the page mapped. And if the page is uptodate, the - * non-initialized region will already have been zeroed when the page was - * brought uptodate and the region may in fact already have been overwritten - * with new data via mmap() based writes, so we cannot just zero it. And since - * POSIX specifies that the behaviour of resizing a file whilst it is mmap()ped - * is unspecified, we choose not to do zeroing and thus we do not need to touch - * the page at all. For a more detailed explanation see ntfs_truncate() in - * fs/ntfs/inode.c. - * - * Return 0 on success and -errno on error. In the case that an error is - * encountered it is possible that the initialized size will already have been - * incremented some way towards @new_init_size but it is guaranteed that if - * this is the case, the necessary zeroing will also have happened and that all - * metadata is self-consistent. - * - * Locking: i_mutex on the vfs inode corrseponsind to the ntfs inode @ni must be - * held by the caller. - */ -static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size) -{ - s64 old_init_size; - loff_t old_i_size; - pgoff_t index, end_index; - unsigned long flags; - struct inode *vi = VFS_I(ni); - ntfs_inode *base_ni; - MFT_RECORD *m = NULL; - ATTR_RECORD *a; - ntfs_attr_search_ctx *ctx = NULL; - struct address_space *mapping; - struct page *page = NULL; - u8 *kattr; - int err; - u32 attr_len; - - read_lock_irqsave(&ni->size_lock, flags); - old_init_size = ni->initialized_size; - old_i_size = i_size_read(vi); - BUG_ON(new_init_size > ni->allocated_size); - read_unlock_irqrestore(&ni->size_lock, flags); - ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " - "old_initialized_size 0x%llx, " - "new_initialized_size 0x%llx, i_size 0x%llx.", - vi->i_ino, (unsigned)le32_to_cpu(ni->type), - (unsigned long long)old_init_size, - (unsigned long long)new_init_size, old_i_size); - if (!NInoAttr(ni)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - /* Use goto to reduce indentation and we need the label below anyway. */ - if (NInoNonResident(ni)) - goto do_non_resident_extend; - BUG_ON(old_init_size != old_i_size); - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - err = -EIO; - goto err_out; - } - m = ctx->mrec; - a = ctx->attr; - BUG_ON(a->non_resident); - /* The total length of the attribute value. 
*/ - attr_len = le32_to_cpu(a->data.resident.value_length); - BUG_ON(old_i_size != (loff_t)attr_len); - /* - * Do the zeroing in the mft record and update the attribute size in - * the mft record. - */ - kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset); - memset(kattr + attr_len, 0, new_init_size - attr_len); - a->data.resident.value_length = cpu_to_le32((u32)new_init_size); - /* Finally, update the sizes in the vfs and ntfs inodes. */ - write_lock_irqsave(&ni->size_lock, flags); - i_size_write(vi, new_init_size); - ni->initialized_size = new_init_size; - write_unlock_irqrestore(&ni->size_lock, flags); - goto done; -do_non_resident_extend: - /* - * If the new initialized size @new_init_size exceeds the current file - * size (vfs inode->i_size), we need to extend the file size to the - * new initialized size. - */ - if (new_init_size > old_i_size) { - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - err = -EIO; - goto err_out; - } - m = ctx->mrec; - a = ctx->attr; - BUG_ON(!a->non_resident); - BUG_ON(old_i_size != (loff_t) - sle64_to_cpu(a->data.non_resident.data_size)); - a->data.non_resident.data_size = cpu_to_sle64(new_init_size); - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - /* Update the file size in the vfs inode. */ - i_size_write(vi, new_init_size); - ntfs_attr_put_search_ctx(ctx); - ctx = NULL; - unmap_mft_record(base_ni); - m = NULL; - } - mapping = vi->i_mapping; - index = old_init_size >> PAGE_SHIFT; - end_index = (new_init_size + PAGE_SIZE - 1) >> PAGE_SHIFT; - do { - /* - * Read the page. If the page is not present, this will zero - * the uninitialized regions for us. - */ - page = read_mapping_page(mapping, index, NULL); - if (IS_ERR(page)) { - err = PTR_ERR(page); - goto init_err_out; - } - /* - * Update the initialized size in the ntfs inode. This is - * enough to make ntfs_writepage() work. - */ - write_lock_irqsave(&ni->size_lock, flags); - ni->initialized_size = (s64)(index + 1) << PAGE_SHIFT; - if (ni->initialized_size > new_init_size) - ni->initialized_size = new_init_size; - write_unlock_irqrestore(&ni->size_lock, flags); - /* Set the page dirty so it gets written out. */ - set_page_dirty(page); - put_page(page); - /* - * Play nice with the vm and the rest of the system. This is - * very much needed as we can potentially be modifying the - * initialised size from a very small value to a really huge - * value, e.g. - * f = open(somefile, O_TRUNC); - * truncate(f, 10GiB); - * seek(f, 10GiB); - * write(f, 1); - * And this would mean we would be marking dirty hundreds of - * thousands of pages or as in the above example more than - * two and a half million pages! - * - * TODO: For sparse pages could optimize this workload by using - * the FsMisc / MiscFs page bit as a "PageIsSparse" bit. This - * would be set in read_folio for sparse pages and here we would - * not need to mark dirty any pages which have this bit set. - * The only caveat is that we have to clear the bit everywhere - * where we allocate any clusters that lie in the page or that - * contain the page. 
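
The open/truncate/seek/write sequence sketched in the comment above is easy to reproduce from user space: one byte written far past EOF forces the initialized size, and therefore the zeroing loop, to cover the entire gap. A runnable trigger (file name and offset are arbitrary):

#define _FILE_OFFSET_BITS 64   /* large-offset pwrite on 32-bit hosts */
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("somefile", O_CREAT | O_TRUNC | O_WRONLY, 0644);

	if (fd < 0)
		return 1;
	/* One byte at 10 GiB: everything before it must become
	 * "initialized", i.e. guaranteed to read back as zeroes. */
	if (pwrite(fd, "x", 1, 10LL << 30) != 1)
		return 1;
	return close(fd) != 0;
}
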
- * - * TODO: An even greater optimization would be for us to only - * call read_folio() on pages which are not in sparse regions as - * determined from the runlist. This would greatly reduce the - * number of pages we read and make dirty in the case of sparse - * files. - */ - balance_dirty_pages_ratelimited(mapping); - cond_resched(); - } while (++index < end_index); - read_lock_irqsave(&ni->size_lock, flags); - BUG_ON(ni->initialized_size != new_init_size); - read_unlock_irqrestore(&ni->size_lock, flags); - /* Now bring in sync the initialized_size in the mft record. */ - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - goto init_err_out; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto init_err_out; - } - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - err = -EIO; - goto init_err_out; - } - m = ctx->mrec; - a = ctx->attr; - BUG_ON(!a->non_resident); - a->data.non_resident.initialized_size = cpu_to_sle64(new_init_size); -done: - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(base_ni); - ntfs_debug("Done, initialized_size 0x%llx, i_size 0x%llx.", - (unsigned long long)new_init_size, i_size_read(vi)); - return 0; -init_err_out: - write_lock_irqsave(&ni->size_lock, flags); - ni->initialized_size = old_init_size; - write_unlock_irqrestore(&ni->size_lock, flags); -err_out: - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(base_ni); - ntfs_debug("Failed. Returning error code %i.", err); - return err; -} - -static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb, - struct iov_iter *from) -{ - loff_t pos; - s64 end, ll; - ssize_t err; - unsigned long flags; - struct file *file = iocb->ki_filp; - struct inode *vi = file_inode(file); - ntfs_inode *ni = NTFS_I(vi); - ntfs_volume *vol = ni->vol; - - ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos " - "0x%llx, count 0x%zx.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), - (unsigned long long)iocb->ki_pos, - iov_iter_count(from)); - err = generic_write_checks(iocb, from); - if (unlikely(err <= 0)) - goto out; - /* - * All checks have passed. Before we start doing any writing we want - * to abort any totally illegal writes. - */ - BUG_ON(NInoMstProtected(ni)); - BUG_ON(ni->type != AT_DATA); - /* If file is encrypted, deny access, just like NT4. */ - if (NInoEncrypted(ni)) { - /* Only $DATA attributes can be encrypted. */ - /* - * Reminder for later: Encrypted files are _always_ - * non-resident so that the content can always be encrypted. - */ - ntfs_debug("Denying write access to encrypted file."); - err = -EACCES; - goto out; - } - if (NInoCompressed(ni)) { - /* Only unnamed $DATA attribute can be compressed. */ - BUG_ON(ni->name_len); - /* - * Reminder for later: If resident, the data is not actually - * compressed. Only on the switch to non-resident does - * compression kick in. This is in contrast to encrypted files - * (see above). - */ - ntfs_error(vi->i_sb, "Writing to compressed files is not " - "implemented yet. Sorry."); - err = -EOPNOTSUPP; - goto out; - } - err = file_remove_privs(file); - if (unlikely(err)) - goto out; - /* - * Our ->update_time method always succeeds thus file_update_time() - * cannot fail either so there is no need to check the return code. 
- */ - file_update_time(file); - pos = iocb->ki_pos; - /* The first byte after the last cluster being written to. */ - end = (pos + iov_iter_count(from) + vol->cluster_size_mask) & - ~(u64)vol->cluster_size_mask; - /* - * If the write goes beyond the allocated size, extend the allocation - * to cover the whole of the write, rounded up to the nearest cluster. - */ - read_lock_irqsave(&ni->size_lock, flags); - ll = ni->allocated_size; - read_unlock_irqrestore(&ni->size_lock, flags); - if (end > ll) { - /* - * Extend the allocation without changing the data size. - * - * Note we ensure the allocation is big enough to at least - * write some data but we do not require the allocation to be - * complete, i.e. it may be partial. - */ - ll = ntfs_attr_extend_allocation(ni, end, -1, pos); - if (likely(ll >= 0)) { - BUG_ON(pos >= ll); - /* If the extension was partial truncate the write. */ - if (end > ll) { - ntfs_debug("Truncating write to inode 0x%lx, " - "attribute type 0x%x, because " - "the allocation was only " - "partially extended.", - vi->i_ino, (unsigned) - le32_to_cpu(ni->type)); - iov_iter_truncate(from, ll - pos); - } - } else { - err = ll; - read_lock_irqsave(&ni->size_lock, flags); - ll = ni->allocated_size; - read_unlock_irqrestore(&ni->size_lock, flags); - /* Perform a partial write if possible or fail. */ - if (pos < ll) { - ntfs_debug("Truncating write to inode 0x%lx " - "attribute type 0x%x, because " - "extending the allocation " - "failed (error %d).", - vi->i_ino, (unsigned) - le32_to_cpu(ni->type), - (int)-err); - iov_iter_truncate(from, ll - pos); - } else { - if (err != -ENOSPC) - ntfs_error(vi->i_sb, "Cannot perform " - "write to inode " - "0x%lx, attribute " - "type 0x%x, because " - "extending the " - "allocation failed " - "(error %ld).", - vi->i_ino, (unsigned) - le32_to_cpu(ni->type), - (long)-err); - else - ntfs_debug("Cannot perform write to " - "inode 0x%lx, " - "attribute type 0x%x, " - "because there is not " - "space left.", - vi->i_ino, (unsigned) - le32_to_cpu(ni->type)); - goto out; - } - } - } - /* - * If the write starts beyond the initialized size, extend it up to the - * beginning of the write and initialize all non-sparse space between - * the old initialized size and the new one. This automatically also - * increments the vfs inode->i_size to keep it above or equal to the - * initialized_size. - */ - read_lock_irqsave(&ni->size_lock, flags); - ll = ni->initialized_size; - read_unlock_irqrestore(&ni->size_lock, flags); - if (pos > ll) { - /* - * Wait for ongoing direct i/o to complete before proceeding. - * New direct i/o cannot start as we hold i_mutex. - */ - inode_dio_wait(vi); - err = ntfs_attr_extend_initialized(ni, pos); - if (unlikely(err < 0)) - ntfs_error(vi->i_sb, "Cannot perform write to inode " - "0x%lx, attribute type 0x%x, because " - "extending the initialized size " - "failed (error %d).", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), - (int)-err); - } -out: - return err; -} - -/** - * __ntfs_grab_cache_pages - obtain a number of locked pages - * @mapping: address space mapping from which to obtain page cache pages - * @index: starting index in @mapping at which to begin obtaining pages - * @nr_pages: number of page cache pages to obtain - * @pages: array of pages in which to return the obtained page cache pages - * @cached_page: allocated but as yet unused page - * - * Obtain @nr_pages locked page cache pages from the mapping @mapping and - * starting at index @index. 
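
The end-of-write rounding above relies on the cluster size being a power of two: adding the mask and then clearing it rounds up to the next cluster boundary. The same computation in isolation (a minimal sketch):

#include <stdint.h>

/* Round 'end' up to a multiple of cluster_size (a power of two);
 * e.g. round_up_to_cluster(5000, 4096) == 8192. */
static inline int64_t round_up_to_cluster(int64_t end, uint32_t cluster_size)
{
	int64_t mask = (int64_t)cluster_size - 1;

	return (end + mask) & ~mask;
}
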
- * - * If a page is newly created, add it to lru list - * - * Note, the page locks are obtained in ascending page index order. - */ -static inline int __ntfs_grab_cache_pages(struct address_space *mapping, - pgoff_t index, const unsigned nr_pages, struct page **pages, - struct page **cached_page) -{ - int err, nr; - - BUG_ON(!nr_pages); - err = nr = 0; - do { - pages[nr] = find_get_page_flags(mapping, index, FGP_LOCK | - FGP_ACCESSED); - if (!pages[nr]) { - if (!*cached_page) { - *cached_page = page_cache_alloc(mapping); - if (unlikely(!*cached_page)) { - err = -ENOMEM; - goto err_out; - } - } - err = add_to_page_cache_lru(*cached_page, mapping, - index, - mapping_gfp_constraint(mapping, GFP_KERNEL)); - if (unlikely(err)) { - if (err == -EEXIST) - continue; - goto err_out; - } - pages[nr] = *cached_page; - *cached_page = NULL; - } - index++; - nr++; - } while (nr < nr_pages); -out: - return err; -err_out: - while (nr > 0) { - unlock_page(pages[--nr]); - put_page(pages[nr]); - } - goto out; -} - -static inline void ntfs_submit_bh_for_read(struct buffer_head *bh) -{ - lock_buffer(bh); - get_bh(bh); - bh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, bh); -} - -/** - * ntfs_prepare_pages_for_non_resident_write - prepare pages for receiving data - * @pages: array of destination pages - * @nr_pages: number of pages in @pages - * @pos: byte position in file at which the write begins - * @bytes: number of bytes to be written - * - * This is called for non-resident attributes from ntfs_file_buffered_write() - * with i_mutex held on the inode (@pages[0]->mapping->host). There are - * @nr_pages pages in @pages which are locked but not kmap()ped. The source - * data has not yet been copied into the @pages. - * - * Need to fill any holes with actual clusters, allocate buffers if necessary, - * ensure all the buffers are mapped, and bring uptodate any buffers that are - * only partially being written to. - * - * If @nr_pages is greater than one, we are guaranteed that the cluster size is - * greater than PAGE_SIZE, that all pages in @pages are entirely inside - * the same cluster and that they are the entirety of that cluster, and that - * the cluster is sparse, i.e. we need to allocate a cluster to fill the hole. - * - * i_size is not to be modified yet. - * - * Return 0 on success or -errno on error. 
- */ -static int ntfs_prepare_pages_for_non_resident_write(struct page **pages, - unsigned nr_pages, s64 pos, size_t bytes) -{ - VCN vcn, highest_vcn = 0, cpos, cend, bh_cpos, bh_cend; - LCN lcn; - s64 bh_pos, vcn_len, end, initialized_size; - sector_t lcn_block; - struct folio *folio; - struct inode *vi; - ntfs_inode *ni, *base_ni = NULL; - ntfs_volume *vol; - runlist_element *rl, *rl2; - struct buffer_head *bh, *head, *wait[2], **wait_bh = wait; - ntfs_attr_search_ctx *ctx = NULL; - MFT_RECORD *m = NULL; - ATTR_RECORD *a = NULL; - unsigned long flags; - u32 attr_rec_len = 0; - unsigned blocksize, u; - int err, mp_size; - bool rl_write_locked, was_hole, is_retry; - unsigned char blocksize_bits; - struct { - u8 runlist_merged:1; - u8 mft_attr_mapped:1; - u8 mp_rebuilt:1; - u8 attr_switched:1; - } status = { 0, 0, 0, 0 }; - - BUG_ON(!nr_pages); - BUG_ON(!pages); - BUG_ON(!*pages); - vi = pages[0]->mapping->host; - ni = NTFS_I(vi); - vol = ni->vol; - ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page " - "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.", - vi->i_ino, ni->type, pages[0]->index, nr_pages, - (long long)pos, bytes); - blocksize = vol->sb->s_blocksize; - blocksize_bits = vol->sb->s_blocksize_bits; - rl_write_locked = false; - rl = NULL; - err = 0; - vcn = lcn = -1; - vcn_len = 0; - lcn_block = -1; - was_hole = false; - cpos = pos >> vol->cluster_size_bits; - end = pos + bytes; - cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits; - /* - * Loop over each buffer in each folio. Use goto to - * reduce indentation. - */ - u = 0; -do_next_folio: - folio = page_folio(pages[u]); - bh_pos = folio_pos(folio); - head = folio_buffers(folio); - if (!head) - /* - * create_empty_buffers() will create uptodate/dirty - * buffers if the folio is uptodate/dirty. - */ - head = create_empty_buffers(folio, blocksize, 0); - bh = head; - do { - VCN cdelta; - s64 bh_end; - unsigned bh_cofs; - - /* Clear buffer_new on all buffers to reinitialise state. */ - if (buffer_new(bh)) - clear_buffer_new(bh); - bh_end = bh_pos + blocksize; - bh_cpos = bh_pos >> vol->cluster_size_bits; - bh_cofs = bh_pos & vol->cluster_size_mask; - if (buffer_mapped(bh)) { - /* - * The buffer is already mapped. If it is uptodate, - * ignore it. - */ - if (buffer_uptodate(bh)) - continue; - /* - * The buffer is not uptodate. If the folio is uptodate - * set the buffer uptodate and otherwise ignore it. - */ - if (folio_test_uptodate(folio)) { - set_buffer_uptodate(bh); - continue; - } - /* - * Neither the folio nor the buffer are uptodate. If - * the buffer is only partially being written to, we - * need to read it in before the write, i.e. now. - */ - if ((bh_pos < pos && bh_end > pos) || - (bh_pos < end && bh_end > end)) { - /* - * If the buffer is fully or partially within - * the initialized size, do an actual read. - * Otherwise, simply zero the buffer. - */ - read_lock_irqsave(&ni->size_lock, flags); - initialized_size = ni->initialized_size; - read_unlock_irqrestore(&ni->size_lock, flags); - if (bh_pos < initialized_size) { - ntfs_submit_bh_for_read(bh); - *wait_bh++ = bh; - } else { - folio_zero_range(folio, bh_offset(bh), - blocksize); - set_buffer_uptodate(bh); - } - } - continue; - } - /* Unmapped buffer. Need to map it. */ - bh->b_bdev = vol->sb->s_bdev; - /* - * If the current buffer is in the same clusters as the map - * cache, there is no need to check the runlist again. 
The - * map cache is made up of @vcn, which is the first cached file - * cluster, @vcn_len which is the number of cached file - * clusters, @lcn is the device cluster corresponding to @vcn, - * and @lcn_block is the block number corresponding to @lcn. - */ - cdelta = bh_cpos - vcn; - if (likely(!cdelta || (cdelta > 0 && cdelta < vcn_len))) { -map_buffer_cached: - BUG_ON(lcn < 0); - bh->b_blocknr = lcn_block + - (cdelta << (vol->cluster_size_bits - - blocksize_bits)) + - (bh_cofs >> blocksize_bits); - set_buffer_mapped(bh); - /* - * If the folio is uptodate so is the buffer. If the - * buffer is fully outside the write, we ignore it if - * it was already allocated and we mark it dirty so it - * gets written out if we allocated it. On the other - * hand, if we allocated the buffer but we are not - * marking it dirty we set buffer_new so we can do - * error recovery. - */ - if (folio_test_uptodate(folio)) { - if (!buffer_uptodate(bh)) - set_buffer_uptodate(bh); - if (unlikely(was_hole)) { - /* We allocated the buffer. */ - clean_bdev_bh_alias(bh); - if (bh_end <= pos || bh_pos >= end) - mark_buffer_dirty(bh); - else - set_buffer_new(bh); - } - continue; - } - /* Page is _not_ uptodate. */ - if (likely(!was_hole)) { - /* - * Buffer was already allocated. If it is not - * uptodate and is only partially being written - * to, we need to read it in before the write, - * i.e. now. - */ - if (!buffer_uptodate(bh) && bh_pos < end && - bh_end > pos && - (bh_pos < pos || - bh_end > end)) { - /* - * If the buffer is fully or partially - * within the initialized size, do an - * actual read. Otherwise, simply zero - * the buffer. - */ - read_lock_irqsave(&ni->size_lock, - flags); - initialized_size = ni->initialized_size; - read_unlock_irqrestore(&ni->size_lock, - flags); - if (bh_pos < initialized_size) { - ntfs_submit_bh_for_read(bh); - *wait_bh++ = bh; - } else { - folio_zero_range(folio, - bh_offset(bh), - blocksize); - set_buffer_uptodate(bh); - } - } - continue; - } - /* We allocated the buffer. */ - clean_bdev_bh_alias(bh); - /* - * If the buffer is fully outside the write, zero it, - * set it uptodate, and mark it dirty so it gets - * written out. If it is partially being written to, - * zero region surrounding the write but leave it to - * commit write to do anything else. Finally, if the - * buffer is fully being overwritten, do nothing. - */ - if (bh_end <= pos || bh_pos >= end) { - if (!buffer_uptodate(bh)) { - folio_zero_range(folio, bh_offset(bh), - blocksize); - set_buffer_uptodate(bh); - } - mark_buffer_dirty(bh); - continue; - } - set_buffer_new(bh); - if (!buffer_uptodate(bh) && - (bh_pos < pos || bh_end > end)) { - u8 *kaddr; - unsigned pofs; - - kaddr = kmap_local_folio(folio, 0); - if (bh_pos < pos) { - pofs = bh_pos & ~PAGE_MASK; - memset(kaddr + pofs, 0, pos - bh_pos); - } - if (bh_end > end) { - pofs = end & ~PAGE_MASK; - memset(kaddr + pofs, 0, bh_end - end); - } - kunmap_local(kaddr); - flush_dcache_folio(folio); - } - continue; - } - /* - * Slow path: this is the first buffer in the cluster. If it - * is outside allocated size and is not uptodate, zero it and - * set it uptodate. 
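The buffer-mapping arithmetic in the map-cache hit path above reduces to shifts: a buffer's disk block is the first block of the run's LCN, plus the cluster delta and the buffer's offset within its cluster, both converted to block units. A self-contained illustration with made-up values (4 KiB clusters, 512-byte blocks, cdelta == 0 as in the simplest hit):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned cluster_size_bits = 12;	/* 4 KiB clusters */
	unsigned blocksize_bits = 9;		/* 512-byte blocks */
	int64_t lcn = 100;			/* cluster 100 on disk */
	uint64_t bh_cofs = 1536;		/* buffer offset inside the cluster */

	/* lcn_block: first 512-byte block of the cluster, as cached above. */
	uint64_t lcn_block = (uint64_t)lcn << (cluster_size_bits - blocksize_bits);
	/* b_blocknr as computed in the map_buffer_cached path (cdelta == 0). */
	uint64_t b_blocknr = lcn_block + (bh_cofs >> blocksize_bits);

	printf("block %llu\n", (unsigned long long)b_blocknr);	/* prints 803 */
	return 0;
}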
- */ - read_lock_irqsave(&ni->size_lock, flags); - initialized_size = ni->allocated_size; - read_unlock_irqrestore(&ni->size_lock, flags); - if (bh_pos > initialized_size) { - if (folio_test_uptodate(folio)) { - if (!buffer_uptodate(bh)) - set_buffer_uptodate(bh); - } else if (!buffer_uptodate(bh)) { - folio_zero_range(folio, bh_offset(bh), - blocksize); - set_buffer_uptodate(bh); - } - continue; - } - is_retry = false; - if (!rl) { - down_read(&ni->runlist.lock); -retry_remap: - rl = ni->runlist.rl; - } - if (likely(rl != NULL)) { - /* Seek to element containing target cluster. */ - while (rl->length && rl[1].vcn <= bh_cpos) - rl++; - lcn = ntfs_rl_vcn_to_lcn(rl, bh_cpos); - if (likely(lcn >= 0)) { - /* - * Successful remap, setup the map cache and - * use that to deal with the buffer. - */ - was_hole = false; - vcn = bh_cpos; - vcn_len = rl[1].vcn - vcn; - lcn_block = lcn << (vol->cluster_size_bits - - blocksize_bits); - cdelta = 0; - /* - * If the number of remaining clusters touched - * by the write is smaller or equal to the - * number of cached clusters, unlock the - * runlist as the map cache will be used from - * now on. - */ - if (likely(vcn + vcn_len >= cend)) { - if (rl_write_locked) { - up_write(&ni->runlist.lock); - rl_write_locked = false; - } else - up_read(&ni->runlist.lock); - rl = NULL; - } - goto map_buffer_cached; - } - } else - lcn = LCN_RL_NOT_MAPPED; - /* - * If it is not a hole and not out of bounds, the runlist is - * probably unmapped so try to map it now. - */ - if (unlikely(lcn != LCN_HOLE && lcn != LCN_ENOENT)) { - if (likely(!is_retry && lcn == LCN_RL_NOT_MAPPED)) { - /* Attempt to map runlist. */ - if (!rl_write_locked) { - /* - * We need the runlist locked for - * writing, so if it is locked for - * reading relock it now and retry in - * case it changed whilst we dropped - * the lock. - */ - up_read(&ni->runlist.lock); - down_write(&ni->runlist.lock); - rl_write_locked = true; - goto retry_remap; - } - err = ntfs_map_runlist_nolock(ni, bh_cpos, - NULL); - if (likely(!err)) { - is_retry = true; - goto retry_remap; - } - /* - * If @vcn is out of bounds, pretend @lcn is - * LCN_ENOENT. As long as the buffer is out - * of bounds this will work fine. - */ - if (err == -ENOENT) { - lcn = LCN_ENOENT; - err = 0; - goto rl_not_mapped_enoent; - } - } else - err = -EIO; - /* Failed to map the buffer, even after retrying. */ - bh->b_blocknr = -1; - ntfs_error(vol->sb, "Failed to write to inode 0x%lx, " - "attribute type 0x%x, vcn 0x%llx, " - "vcn offset 0x%x, because its " - "location on disk could not be " - "determined%s (error code %i).", - ni->mft_no, ni->type, - (unsigned long long)bh_cpos, - (unsigned)bh_pos & - vol->cluster_size_mask, - is_retry ? " even after retrying" : "", - err); - break; - } -rl_not_mapped_enoent: - /* - * The buffer is in a hole or out of bounds. We need to fill - * the hole, unless the buffer is in a cluster which is not - * touched by the write, in which case we just leave the buffer - * unmapped. This can only happen when the cluster size is - * less than the page cache size. - */ - if (unlikely(vol->cluster_size < PAGE_SIZE)) { - bh_cend = (bh_end + vol->cluster_size - 1) >> - vol->cluster_size_bits; - if ((bh_cend <= cpos || bh_cpos >= cend)) { - bh->b_blocknr = -1; - /* - * If the buffer is uptodate we skip it. If it - * is not but the folio is uptodate, we can set - * the buffer uptodate. If the folio is not - * uptodate, we can clear the buffer and set it - * uptodate. 
Whether this is worthwhile is - * debatable and this could be removed. - */ - if (folio_test_uptodate(folio)) { - if (!buffer_uptodate(bh)) - set_buffer_uptodate(bh); - } else if (!buffer_uptodate(bh)) { - folio_zero_range(folio, bh_offset(bh), - blocksize); - set_buffer_uptodate(bh); - } - continue; - } - } - /* - * Out of bounds buffer is invalid if it was not really out of - * bounds. - */ - BUG_ON(lcn != LCN_HOLE); - /* - * We need the runlist locked for writing, so if it is locked - * for reading relock it now and retry in case it changed - * whilst we dropped the lock. - */ - BUG_ON(!rl); - if (!rl_write_locked) { - up_read(&ni->runlist.lock); - down_write(&ni->runlist.lock); - rl_write_locked = true; - goto retry_remap; - } - /* Find the previous last allocated cluster. */ - BUG_ON(rl->lcn != LCN_HOLE); - lcn = -1; - rl2 = rl; - while (--rl2 >= ni->runlist.rl) { - if (rl2->lcn >= 0) { - lcn = rl2->lcn + rl2->length; - break; - } - } - rl2 = ntfs_cluster_alloc(vol, bh_cpos, 1, lcn, DATA_ZONE, - false); - if (IS_ERR(rl2)) { - err = PTR_ERR(rl2); - ntfs_debug("Failed to allocate cluster, error code %i.", - err); - break; - } - lcn = rl2->lcn; - rl = ntfs_runlists_merge(ni->runlist.rl, rl2); - if (IS_ERR(rl)) { - err = PTR_ERR(rl); - if (err != -ENOMEM) - err = -EIO; - if (ntfs_cluster_free_from_rl(vol, rl2)) { - ntfs_error(vol->sb, "Failed to release " - "allocated cluster in error " - "code path. Run chkdsk to " - "recover the lost cluster."); - NVolSetErrors(vol); - } - ntfs_free(rl2); - break; - } - ni->runlist.rl = rl; - status.runlist_merged = 1; - ntfs_debug("Allocated cluster, lcn 0x%llx.", - (unsigned long long)lcn); - /* Map and lock the mft record and get the attribute record. */ - if (!NInoAttr(ni)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - break; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - unmap_mft_record(base_ni); - break; - } - status.mft_attr_mapped = 1; - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, bh_cpos, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - err = -EIO; - break; - } - m = ctx->mrec; - a = ctx->attr; - /* - * Find the runlist element with which the attribute extent - * starts. Note, we cannot use the _attr_ version because we - * have mapped the mft record. That is ok because we know the - * runlist fragment must be mapped already to have ever gotten - * here, so we can just use the _rl_ version. - */ - vcn = sle64_to_cpu(a->data.non_resident.lowest_vcn); - rl2 = ntfs_rl_find_vcn_nolock(rl, vcn); - BUG_ON(!rl2); - BUG_ON(!rl2->length); - BUG_ON(rl2->lcn < LCN_HOLE); - highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); - /* - * If @highest_vcn is zero, calculate the real highest_vcn - * (which can really be zero). - */ - if (!highest_vcn) - highest_vcn = (sle64_to_cpu( - a->data.non_resident.allocated_size) >> - vol->cluster_size_bits) - 1; - /* - * Determine the size of the mapping pairs array for the new - * extent, i.e. the old extent with the hole filled. - */ - mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, vcn, - highest_vcn); - if (unlikely(mp_size <= 0)) { - if (!(err = mp_size)) - err = -EIO; - ntfs_debug("Failed to get size for mapping pairs " - "array, error code %i.", err); - break; - } - /* - * Resize the attribute record to fit the new mapping pairs - * array. 
- */ - attr_rec_len = le32_to_cpu(a->length); - err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu( - a->data.non_resident.mapping_pairs_offset)); - if (unlikely(err)) { - BUG_ON(err != -ENOSPC); - // TODO: Deal with this by using the current attribute - // and fill it with as much of the mapping pairs - // array as possible. Then loop over each attribute - // extent rewriting the mapping pairs arrays as we go - // along and if when we reach the end we have not - // enough space, try to resize the last attribute - // extent and if even that fails, add a new attribute - // extent. - // We could also try to resize at each step in the hope - // that we will not need to rewrite every single extent. - // Note, we may need to decompress some extents to fill - // the runlist as we are walking the extents... - ntfs_error(vol->sb, "Not enough space in the mft " - "record for the extended attribute " - "record. This case is not " - "implemented yet."); - err = -EOPNOTSUPP; - break ; - } - status.mp_rebuilt = 1; - /* - * Generate the mapping pairs array directly into the attribute - * record. - */ - err = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( - a->data.non_resident.mapping_pairs_offset), - mp_size, rl2, vcn, highest_vcn, NULL); - if (unlikely(err)) { - ntfs_error(vol->sb, "Cannot fill hole in inode 0x%lx, " - "attribute type 0x%x, because building " - "the mapping pairs failed with error " - "code %i.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), err); - err = -EIO; - break; - } - /* Update the highest_vcn but only if it was not set. */ - if (unlikely(!a->data.non_resident.highest_vcn)) - a->data.non_resident.highest_vcn = - cpu_to_sle64(highest_vcn); - /* - * If the attribute is sparse/compressed, update the compressed - * size in the ntfs_inode structure and the attribute record. - */ - if (likely(NInoSparse(ni) || NInoCompressed(ni))) { - /* - * If we are not in the first attribute extent, switch - * to it, but first ensure the changes will make it to - * disk later. - */ - if (a->data.non_resident.lowest_vcn) { - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_reinit_search_ctx(ctx); - err = ntfs_attr_lookup(ni->type, ni->name, - ni->name_len, CASE_SENSITIVE, - 0, NULL, 0, ctx); - if (unlikely(err)) { - status.attr_switched = 1; - break; - } - /* @m is not used any more so do not set it. */ - a = ctx->attr; - } - write_lock_irqsave(&ni->size_lock, flags); - ni->itype.compressed.size += vol->cluster_size; - a->data.non_resident.compressed_size = - cpu_to_sle64(ni->itype.compressed.size); - write_unlock_irqrestore(&ni->size_lock, flags); - } - /* Ensure the changes make it to disk. */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - /* Successfully filled the hole. */ - status.runlist_merged = 0; - status.mft_attr_mapped = 0; - status.mp_rebuilt = 0; - /* Setup the map cache and use that to deal with the buffer. */ - was_hole = true; - vcn = bh_cpos; - vcn_len = 1; - lcn_block = lcn << (vol->cluster_size_bits - blocksize_bits); - cdelta = 0; - /* - * If the number of remaining clusters in the @pages is smaller - * or equal to the number of cached clusters, unlock the - * runlist as the map cache will be used from now on. 
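The mapping pairs array being sized and rebuilt here is NTFS's run-length encoding of the runlist: each run starts with a header byte whose low nibble gives the byte width of the run-length field and whose high nibble gives the byte width of the signed LCN-delta field, followed by those little-endian fields; a zero header byte terminates the array (and a zero-width delta denotes a sparse run, not handled below). A standalone decoder sketch for one hypothetical run; this is my illustration, not driver code:

#include <stdint.h>
#include <stdio.h>

static int64_t get_le(const uint8_t *p, int n, int sign)
{
	int64_t v = 0;
	for (int i = 0; i < n; i++)
		v |= (int64_t)p[i] << (8 * i);
	if (sign && n && (p[n - 1] & 0x80))	/* sign-extend the LCN delta */
		v -= (int64_t)1 << (8 * n);
	return v;
}

int main(void)
{
	/* One run: 0x20 clusters at LCN delta +0x543, then the terminator. */
	const uint8_t mp[] = { 0x21, 0x20, 0x43, 0x05, 0x00 };
	const uint8_t *p = mp;
	int64_t lcn = 0;

	while (*p) {
		int lsz = *p & 0xf, osz = *p >> 4;
		int64_t len = get_le(p + 1, lsz, 0);

		lcn += get_le(p + 1 + lsz, osz, 1);
		printf("run: %lld clusters at LCN %lld\n",
		       (long long)len, (long long)lcn);	/* 32 clusters at LCN 1347 */
		p += 1 + lsz + osz;
	}
	return 0;
}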
- */ - if (likely(vcn + vcn_len >= cend)) { - up_write(&ni->runlist.lock); - rl_write_locked = false; - rl = NULL; - } - goto map_buffer_cached; - } while (bh_pos += blocksize, (bh = bh->b_this_page) != head); - /* If there are no errors, do the next page. */ - if (likely(!err && ++u < nr_pages)) - goto do_next_folio; - /* If there are no errors, release the runlist lock if we took it. */ - if (likely(!err)) { - if (unlikely(rl_write_locked)) { - up_write(&ni->runlist.lock); - rl_write_locked = false; - } else if (unlikely(rl)) - up_read(&ni->runlist.lock); - rl = NULL; - } - /* If we issued read requests, let them complete. */ - read_lock_irqsave(&ni->size_lock, flags); - initialized_size = ni->initialized_size; - read_unlock_irqrestore(&ni->size_lock, flags); - while (wait_bh > wait) { - bh = *--wait_bh; - wait_on_buffer(bh); - if (likely(buffer_uptodate(bh))) { - folio = bh->b_folio; - bh_pos = folio_pos(folio) + bh_offset(bh); - /* - * If the buffer overflows the initialized size, need - * to zero the overflowing region. - */ - if (unlikely(bh_pos + blocksize > initialized_size)) { - int ofs = 0; - - if (likely(bh_pos < initialized_size)) - ofs = initialized_size - bh_pos; - folio_zero_segment(folio, bh_offset(bh) + ofs, - blocksize); - } - } else /* if (unlikely(!buffer_uptodate(bh))) */ - err = -EIO; - } - if (likely(!err)) { - /* Clear buffer_new on all buffers. */ - u = 0; - do { - bh = head = page_buffers(pages[u]); - do { - if (buffer_new(bh)) - clear_buffer_new(bh); - } while ((bh = bh->b_this_page) != head); - } while (++u < nr_pages); - ntfs_debug("Done."); - return err; - } - if (status.attr_switched) { - /* Get back to the attribute extent we modified. */ - ntfs_attr_reinit_search_ctx(ctx); - if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, bh_cpos, NULL, 0, ctx)) { - ntfs_error(vol->sb, "Failed to find required " - "attribute extent of attribute in " - "error code path. Run chkdsk to " - "recover."); - write_lock_irqsave(&ni->size_lock, flags); - ni->itype.compressed.size += vol->cluster_size; - write_unlock_irqrestore(&ni->size_lock, flags); - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - /* - * The only thing that is now wrong is the compressed - * size of the base attribute extent which chkdsk - * should be able to fix. - */ - NVolSetErrors(vol); - } else { - m = ctx->mrec; - a = ctx->attr; - status.attr_switched = 0; - } - } - /* - * If the runlist has been modified, need to restore it by punching a - * hole into it and we then need to deallocate the on-disk cluster as - * well. Note, we only modify the runlist if we are able to generate a - * new mapping pairs array, i.e. only when the mapped attribute extent - * is not switched. - */ - if (status.runlist_merged && !status.attr_switched) { - BUG_ON(!rl_write_locked); - /* Make the file cluster we allocated sparse in the runlist. */ - if (ntfs_rl_punch_nolock(vol, &ni->runlist, bh_cpos, 1)) { - ntfs_error(vol->sb, "Failed to punch hole into " - "attribute runlist in error code " - "path. Run chkdsk to recover the " - "lost cluster."); - NVolSetErrors(vol); - } else /* if (success) */ { - status.runlist_merged = 0; - /* - * Deallocate the on-disk cluster we allocated but only - * if we succeeded in punching its vcn out of the - * runlist. - */ - down_write(&vol->lcnbmp_lock); - if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) { - ntfs_error(vol->sb, "Failed to release " - "allocated cluster in error " - "code path. 
Run chkdsk to " - "recover the lost cluster."); - NVolSetErrors(vol); - } - up_write(&vol->lcnbmp_lock); - } - } - /* - * Resize the attribute record to its old size and rebuild the mapping - * pairs array. Note, we only can do this if the runlist has been - * restored to its old state which also implies that the mapped - * attribute extent is not switched. - */ - if (status.mp_rebuilt && !status.runlist_merged) { - if (ntfs_attr_record_resize(m, a, attr_rec_len)) { - ntfs_error(vol->sb, "Failed to restore attribute " - "record in error code path. Run " - "chkdsk to recover."); - NVolSetErrors(vol); - } else /* if (success) */ { - if (ntfs_mapping_pairs_build(vol, (u8*)a + - le16_to_cpu(a->data.non_resident. - mapping_pairs_offset), attr_rec_len - - le16_to_cpu(a->data.non_resident. - mapping_pairs_offset), ni->runlist.rl, - vcn, highest_vcn, NULL)) { - ntfs_error(vol->sb, "Failed to restore " - "mapping pairs array in error " - "code path. Run chkdsk to " - "recover."); - NVolSetErrors(vol); - } - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - } - } - /* Release the mft record and the attribute. */ - if (status.mft_attr_mapped) { - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - } - /* Release the runlist lock. */ - if (rl_write_locked) - up_write(&ni->runlist.lock); - else if (rl) - up_read(&ni->runlist.lock); - /* - * Zero out any newly allocated blocks to avoid exposing stale data. - * If BH_New is set, we know that the block was newly allocated above - * and that it has not been fully zeroed and marked dirty yet. - */ - nr_pages = u; - u = 0; - end = bh_cpos << vol->cluster_size_bits; - do { - folio = page_folio(pages[u]); - bh = head = folio_buffers(folio); - do { - if (u == nr_pages && - folio_pos(folio) + bh_offset(bh) >= end) - break; - if (!buffer_new(bh)) - continue; - clear_buffer_new(bh); - if (!buffer_uptodate(bh)) { - if (folio_test_uptodate(folio)) - set_buffer_uptodate(bh); - else { - folio_zero_range(folio, bh_offset(bh), - blocksize); - set_buffer_uptodate(bh); - } - } - mark_buffer_dirty(bh); - } while ((bh = bh->b_this_page) != head); - } while (++u <= nr_pages); - ntfs_error(vol->sb, "Failed. Returning error code %i.", err); - return err; -} - -static inline void ntfs_flush_dcache_pages(struct page **pages, - unsigned nr_pages) -{ - BUG_ON(!nr_pages); - /* - * Warning: Do not do the decrement at the same time as the call to - * flush_dcache_page() because it is a NULL macro on i386 and hence the - * decrement never happens so the loop never terminates. - */ - do { - --nr_pages; - flush_dcache_page(pages[nr_pages]); - } while (nr_pages > 0); -} - -/** - * ntfs_commit_pages_after_non_resident_write - commit the received data - * @pages: array of destination pages - * @nr_pages: number of pages in @pages - * @pos: byte position in file at which the write begins - * @bytes: number of bytes to be written - * - * See description of ntfs_commit_pages_after_write(), below. 
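The warning in ntfs_flush_dcache_pages() above is a classic macro pitfall: on architectures where flush_dcache_page() is an empty macro that never mentions its parameter, an expression passed as the argument is discarded textually, side effects included. A small standalone demonstration with a stand-in stub macro (flush_dcache_page_stub is hypothetical):

#include <stdio.h>

/* An empty macro that never evaluates its argument. */
#define flush_dcache_page_stub(page) do { } while (0)

int main(void)
{
	int pages[3] = { 0, 1, 2 };
	unsigned int nr_pages = 3;

	(void)pages;	/* the stub discards its argument, so pages is otherwise unused */

	/* WRONG: the decrement lives inside the unused macro argument, so it
	 * vanishes at preprocessing time and the loop never terminates:
	 *
	 *	do {
	 *		flush_dcache_page_stub(pages[--nr_pages]);
	 *	} while (nr_pages > 0);
	 */

	/* RIGHT: keep the decrement as its own statement, as the driver does. */
	do {
		--nr_pages;
		flush_dcache_page_stub(pages[nr_pages]);
	} while (nr_pages > 0);

	printf("nr_pages = %u\n", nr_pages);	/* prints 0 */
	return 0;
}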
- */ -static inline int ntfs_commit_pages_after_non_resident_write( - struct page **pages, const unsigned nr_pages, - s64 pos, size_t bytes) -{ - s64 end, initialized_size; - struct inode *vi; - ntfs_inode *ni, *base_ni; - struct buffer_head *bh, *head; - ntfs_attr_search_ctx *ctx; - MFT_RECORD *m; - ATTR_RECORD *a; - unsigned long flags; - unsigned blocksize, u; - int err; - - vi = pages[0]->mapping->host; - ni = NTFS_I(vi); - blocksize = vi->i_sb->s_blocksize; - end = pos + bytes; - u = 0; - do { - s64 bh_pos; - struct page *page; - bool partial; - - page = pages[u]; - bh_pos = (s64)page->index << PAGE_SHIFT; - bh = head = page_buffers(page); - partial = false; - do { - s64 bh_end; - - bh_end = bh_pos + blocksize; - if (bh_end <= pos || bh_pos >= end) { - if (!buffer_uptodate(bh)) - partial = true; - } else { - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); - } - } while (bh_pos += blocksize, (bh = bh->b_this_page) != head); - /* - * If all buffers are now uptodate but the page is not, set the - * page uptodate. - */ - if (!partial && !PageUptodate(page)) - SetPageUptodate(page); - } while (++u < nr_pages); - /* - * Finally, if we do not need to update initialized_size or i_size we - * are finished. - */ - read_lock_irqsave(&ni->size_lock, flags); - initialized_size = ni->initialized_size; - read_unlock_irqrestore(&ni->size_lock, flags); - if (end <= initialized_size) { - ntfs_debug("Done."); - return 0; - } - /* - * Update initialized_size/i_size as appropriate, both in the inode and - * the mft record. - */ - if (!NInoAttr(ni)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - /* Map, pin, and lock the mft record. */ - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - ctx = NULL; - goto err_out; - } - BUG_ON(!NInoNonResident(ni)); - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - err = -EIO; - goto err_out; - } - a = ctx->attr; - BUG_ON(!a->non_resident); - write_lock_irqsave(&ni->size_lock, flags); - BUG_ON(end > ni->allocated_size); - ni->initialized_size = end; - a->data.non_resident.initialized_size = cpu_to_sle64(end); - if (end > i_size_read(vi)) { - i_size_write(vi, end); - a->data.non_resident.data_size = - a->data.non_resident.initialized_size; - } - write_unlock_irqrestore(&ni->size_lock, flags); - /* Mark the mft record dirty, so it gets written back. */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - ntfs_debug("Done."); - return 0; -err_out: - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(base_ni); - ntfs_error(vi->i_sb, "Failed to update initialized_size/i_size (error " - "code %i).", err); - if (err != -ENOMEM) - NVolSetErrors(ni->vol); - return err; -} - -/** - * ntfs_commit_pages_after_write - commit the received data - * @pages: array of destination pages - * @nr_pages: number of pages in @pages - * @pos: byte position in file at which the write begins - * @bytes: number of bytes to be written - * - * This is called from ntfs_file_buffered_write() with i_mutex held on the inode - * (@pages[0]->mapping->host). There are @nr_pages pages in @pages which are - * locked but not kmap()ped. The source data has already been copied into the - * @page. 
ntfs_prepare_pages_for_non_resident_write() has been called before - * the data was copied (for non-resident attributes only) and it returned - * success. - * - * Need to set uptodate and mark dirty all buffers within the boundary of the - * write. If all buffers in a page are uptodate we set the page uptodate, too. - * - * Setting the buffers dirty ensures that they get written out later when - * ntfs_writepage() is invoked by the VM. - * - * Finally, we need to update i_size and initialized_size as appropriate both - * in the inode and the mft record. - * - * This is modelled after fs/buffer.c::generic_commit_write(), which marks - * buffers uptodate and dirty, sets the page uptodate if all buffers in the - * page are uptodate, and updates i_size if the end of io is beyond i_size. In - * that case, it also marks the inode dirty. - * - * If things have gone as outlined in - * ntfs_prepare_pages_for_non_resident_write(), we do not need to do any page - * content modifications here for non-resident attributes. For resident - * attributes we need to do the uptodate bringing here which we combine with - * the copying into the mft record which means we save one atomic kmap. - * - * Return 0 on success or -errno on error. - */ -static int ntfs_commit_pages_after_write(struct page **pages, - const unsigned nr_pages, s64 pos, size_t bytes) -{ - s64 end, initialized_size; - loff_t i_size; - struct inode *vi; - ntfs_inode *ni, *base_ni; - struct page *page; - ntfs_attr_search_ctx *ctx; - MFT_RECORD *m; - ATTR_RECORD *a; - char *kattr, *kaddr; - unsigned long flags; - u32 attr_len; - int err; - - BUG_ON(!nr_pages); - BUG_ON(!pages); - page = pages[0]; - BUG_ON(!page); - vi = page->mapping->host; - ni = NTFS_I(vi); - ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page " - "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.", - vi->i_ino, ni->type, page->index, nr_pages, - (long long)pos, bytes); - if (NInoNonResident(ni)) - return ntfs_commit_pages_after_non_resident_write(pages, - nr_pages, pos, bytes); - BUG_ON(nr_pages > 1); - /* - * Attribute is resident, implying it is not compressed, encrypted, or - * sparse. - */ - if (!NInoAttr(ni)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - BUG_ON(NInoNonResident(ni)); - /* Map, pin, and lock the mft record. */ - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - ctx = NULL; - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - err = -EIO; - goto err_out; - } - a = ctx->attr; - BUG_ON(a->non_resident); - /* The total length of the attribute value. */ - attr_len = le32_to_cpu(a->data.resident.value_length); - i_size = i_size_read(vi); - BUG_ON(attr_len != i_size); - BUG_ON(pos > attr_len); - end = pos + bytes; - BUG_ON(end > le32_to_cpu(a->length) - - le16_to_cpu(a->data.resident.value_offset)); - kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset); - kaddr = kmap_atomic(page); - /* Copy the received data from the page to the mft record. */ - memcpy(kattr + pos, kaddr + pos, bytes); - /* Update the attribute length if necessary. */ - if (end > attr_len) { - attr_len = end; - a->data.resident.value_length = cpu_to_le32(attr_len); - } - /* - * If the page is not uptodate, bring the out of bounds area(s) - * uptodate by copying data from the mft record to the page. 
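Because a resident attribute's value lives entirely inside its MFT record, the commit path here boils down to a memcpy from the page into the record plus a possible value_length bump. A self-contained toy model of just that step; offsets, sizes and contents are invented for illustration and are not the on-disk layout:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	uint8_t mft_record[1024] = { 0 };
	uint16_t value_offset = 0x100;	/* where the value starts in the record */
	uint32_t value_length = 16;	/* current attribute value length */
	uint8_t page[4096] = "....new file data";	/* page cache contents */

	size_t pos = 4, bytes = 13;	/* write 13 bytes at file offset 4 */
	uint8_t *kattr = mft_record + value_offset;

	memcpy(kattr, "0123old old old!", 16);	/* pre-existing value */
	memcpy(kattr + pos, page + pos, bytes);	/* the commit memcpy */
	if (pos + bytes > value_length)
		value_length = pos + bytes;	/* the end > attr_len case above */

	printf("value_length = %u, value = \"%.17s\"\n",
	       (unsigned)value_length, (char *)kattr);	/* "0123new file data" */
	return 0;
}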
- */ - if (!PageUptodate(page)) { - if (pos > 0) - memcpy(kaddr, kattr, pos); - if (end < attr_len) - memcpy(kaddr + end, kattr + end, attr_len - end); - /* Zero the region outside the end of the attribute value. */ - memset(kaddr + attr_len, 0, PAGE_SIZE - attr_len); - flush_dcache_page(page); - SetPageUptodate(page); - } - kunmap_atomic(kaddr); - /* Update initialized_size/i_size if necessary. */ - read_lock_irqsave(&ni->size_lock, flags); - initialized_size = ni->initialized_size; - BUG_ON(end > ni->allocated_size); - read_unlock_irqrestore(&ni->size_lock, flags); - BUG_ON(initialized_size != i_size); - if (end > initialized_size) { - write_lock_irqsave(&ni->size_lock, flags); - ni->initialized_size = end; - i_size_write(vi, end); - write_unlock_irqrestore(&ni->size_lock, flags); - } - /* Mark the mft record dirty, so it gets written back. */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - ntfs_debug("Done."); - return 0; -err_out: - if (err == -ENOMEM) { - ntfs_warning(vi->i_sb, "Error allocating memory required to " - "commit the write."); - if (PageUptodate(page)) { - ntfs_warning(vi->i_sb, "Page is uptodate, setting " - "dirty so the write will be retried " - "later on by the VM."); - /* - * Put the page on mapping->dirty_pages, but leave its - * buffers' dirty state as-is. - */ - __set_page_dirty_nobuffers(page); - err = 0; - } else - ntfs_error(vi->i_sb, "Page is not uptodate. Written " - "data has been lost."); - } else { - ntfs_error(vi->i_sb, "Resident attribute commit write failed " - "with error %i.", err); - NVolSetErrors(ni->vol); - } - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(base_ni); - return err; -} - -/* - * Copy as much as we can into the pages and return the number of bytes which - * were successfully copied. If a fault is encountered then clear the pages - * out to (ofs + bytes) and return the number of bytes which were copied. - */ -static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages, - unsigned ofs, struct iov_iter *i, size_t bytes) -{ - struct page **last_page = pages + nr_pages; - size_t total = 0; - unsigned len, copied; - - do { - len = PAGE_SIZE - ofs; - if (len > bytes) - len = bytes; - copied = copy_page_from_iter_atomic(*pages, ofs, len, i); - total += copied; - bytes -= copied; - if (!bytes) - break; - if (copied < len) - goto err; - ofs = 0; - } while (++pages < last_page); -out: - return total; -err: - /* Zero the rest of the target like __copy_from_user(). 
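The zeroing rule implemented just below matters for data integrity: whatever part of the target range a faulting copy did not reach must be zeroed, or stale page cache contents could later be written to disk as file data. A standalone toy of that contract (values invented):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char page[16];

	memset(page, 'X', sizeof(page));	/* stale page contents */

	size_t ofs = 2, bytes = 12, copied = 5;	/* fault after 5 of 12 bytes */
	memcpy(page + ofs, "hello", copied);
	memset(page + ofs + copied, 0, bytes - copied);	/* zero out to ofs+bytes */

	for (size_t i = 0; i < sizeof(page); i++)
		putchar(page[i] ? page[i] : '.');	/* zeros shown as dots */
	putchar('\n');	/* prints XXhello.......XX */
	return 0;
}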
*/ - len = PAGE_SIZE - copied; - do { - if (len > bytes) - len = bytes; - zero_user(*pages, copied, len); - bytes -= len; - copied = 0; - len = PAGE_SIZE; - } while (++pages < last_page); - goto out; -} - -/** - * ntfs_perform_write - perform buffered write to a file - * @file: file to write to - * @i: iov_iter with data to write - * @pos: byte offset in file at which to begin writing to - */ -static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i, - loff_t pos) -{ - struct address_space *mapping = file->f_mapping; - struct inode *vi = mapping->host; - ntfs_inode *ni = NTFS_I(vi); - ntfs_volume *vol = ni->vol; - struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER]; - struct page *cached_page = NULL; - VCN last_vcn; - LCN lcn; - size_t bytes; - ssize_t status, written = 0; - unsigned nr_pages; - - ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos " - "0x%llx, count 0x%lx.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), - (unsigned long long)pos, - (unsigned long)iov_iter_count(i)); - /* - * If a previous ntfs_truncate() failed, repeat it and abort if it - * fails again. - */ - if (unlikely(NInoTruncateFailed(ni))) { - int err; - - inode_dio_wait(vi); - err = ntfs_truncate(vi); - if (err || NInoTruncateFailed(ni)) { - if (!err) - err = -EIO; - ntfs_error(vol->sb, "Cannot perform write to inode " - "0x%lx, attribute type 0x%x, because " - "ntfs_truncate() failed (error code " - "%i).", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), err); - return err; - } - } - /* - * Determine the number of pages per cluster for non-resident - * attributes. - */ - nr_pages = 1; - if (vol->cluster_size > PAGE_SIZE && NInoNonResident(ni)) - nr_pages = vol->cluster_size >> PAGE_SHIFT; - last_vcn = -1; - do { - VCN vcn; - pgoff_t start_idx; - unsigned ofs, do_pages, u; - size_t copied; - - start_idx = pos >> PAGE_SHIFT; - ofs = pos & ~PAGE_MASK; - bytes = PAGE_SIZE - ofs; - do_pages = 1; - if (nr_pages > 1) { - vcn = pos >> vol->cluster_size_bits; - if (vcn != last_vcn) { - last_vcn = vcn; - /* - * Get the lcn of the vcn the write is in. If - * it is a hole, need to lock down all pages in - * the cluster. - */ - down_read(&ni->runlist.lock); - lcn = ntfs_attr_vcn_to_lcn_nolock(ni, pos >> - vol->cluster_size_bits, false); - up_read(&ni->runlist.lock); - if (unlikely(lcn < LCN_HOLE)) { - if (lcn == LCN_ENOMEM) - status = -ENOMEM; - else { - status = -EIO; - ntfs_error(vol->sb, "Cannot " - "perform write to " - "inode 0x%lx, " - "attribute type 0x%x, " - "because the attribute " - "is corrupt.", - vi->i_ino, (unsigned) - le32_to_cpu(ni->type)); - } - break; - } - if (lcn == LCN_HOLE) { - start_idx = (pos & ~(s64) - vol->cluster_size_mask) - >> PAGE_SHIFT; - bytes = vol->cluster_size - (pos & - vol->cluster_size_mask); - do_pages = nr_pages; - } - } - } - if (bytes > iov_iter_count(i)) - bytes = iov_iter_count(i); -again: - /* - * Bring in the user page(s) that we will copy from _first_. - * Otherwise there is a nasty deadlock on copying from the same - * page(s) as we are writing to, without it/them being marked - * up-to-date. Note, at present there is nothing to stop the - * pages being swapped out between us bringing them into memory - * and doing the actual copying. - */ - if (unlikely(fault_in_iov_iter_readable(i, bytes))) { - status = -EFAULT; - break; - } - /* Get and lock @do_pages starting at index @start_idx. 
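A short copy from the iterator is not an error here; as the loop below shows, the caller reverts the iterator and retries with a smaller chunk after faulting the user pages back in. A standalone toy of that retry contract, where flaky_copy stands in for copy_page_from_iter_atomic() (both the helper and the fault simulation are hypothetical):

#include <stdio.h>
#include <string.h>

/* Simulates a copy that stops short once, like a user-page fault mid-copy. */
static size_t flaky_copy(char *dst, const char *src, size_t len, int *faults)
{
	if (*faults) {
		(*faults)--;
		len /= 2;	/* pretend we faulted halfway through */
	}
	memcpy(dst, src, len);
	return len;
}

int main(void)
{
	const char src[] = "0123456789abcdef";
	char dst[17] = { 0 };
	size_t pos = 0, want = 16;
	int faults = 1;

	while (pos < want) {
		size_t bytes = want - pos;
		size_t copied = flaky_copy(dst + pos, src + pos, bytes, &faults);

		pos += copied;	/* a short copy just means: retry the rest */
	}
	printf("%s\n", dst);	/* prints 0123456789abcdef */
	return 0;
}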
*/ - status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages, - pages, &cached_page); - if (unlikely(status)) - break; - /* - * For non-resident attributes, we need to fill any holes with - * actual clusters and ensure all buffers are mapped. We also - * need to bring uptodate any buffers that are only partially - * being written to. - */ - if (NInoNonResident(ni)) { - status = ntfs_prepare_pages_for_non_resident_write( - pages, do_pages, pos, bytes); - if (unlikely(status)) { - do { - unlock_page(pages[--do_pages]); - put_page(pages[do_pages]); - } while (do_pages); - break; - } - } - u = (pos >> PAGE_SHIFT) - pages[0]->index; - copied = ntfs_copy_from_user_iter(pages + u, do_pages - u, ofs, - i, bytes); - ntfs_flush_dcache_pages(pages + u, do_pages - u); - status = 0; - if (likely(copied == bytes)) { - status = ntfs_commit_pages_after_write(pages, do_pages, - pos, bytes); - } - do { - unlock_page(pages[--do_pages]); - put_page(pages[do_pages]); - } while (do_pages); - if (unlikely(status < 0)) { - iov_iter_revert(i, copied); - break; - } - cond_resched(); - if (unlikely(copied < bytes)) { - iov_iter_revert(i, copied); - if (copied) - bytes = copied; - else if (bytes > PAGE_SIZE - ofs) - bytes = PAGE_SIZE - ofs; - goto again; - } - pos += copied; - written += copied; - balance_dirty_pages_ratelimited(mapping); - if (fatal_signal_pending(current)) { - status = -EINTR; - break; - } - } while (iov_iter_count(i)); - if (cached_page) - put_page(cached_page); - ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", - written ? "written" : "status", (unsigned long)written, - (long)status); - return written ? written : status; -} - -/** - * ntfs_file_write_iter - simple wrapper for ntfs_perform_write() - * @iocb: IO state structure - * @from: iov_iter with data to write - * - * Basically the same as generic_file_write_iter() except that it ends up - * calling ntfs_perform_write() instead of generic_perform_write() and that - * O_DIRECT is not implemented. - */ -static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) -{ - struct file *file = iocb->ki_filp; - struct inode *vi = file_inode(file); - ssize_t written = 0; - ssize_t err; - - inode_lock(vi); - /* We can write back this queue in page reclaim. */ - err = ntfs_prepare_file_for_write(iocb, from); - if (iov_iter_count(from) && !err) - written = ntfs_perform_write(file, from, iocb->ki_pos); - inode_unlock(vi); - iocb->ki_pos += written; - if (likely(written > 0)) - written = generic_write_sync(iocb, written); - return written ? written : err; -} - -/** - * ntfs_file_fsync - sync a file to disk - * @filp: file to be synced - * @datasync: if non-zero only flush user data and not metadata - * - * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync - * system calls. This function is inspired by fs/buffer.c::file_fsync(). - * - * If @datasync is false, write the mft record and all associated extent mft - * records as well as the $DATA attribute and then sync the block device. - * - * If @datasync is true and the attribute is non-resident, we skip the writing - * of the mft record and all associated extent mft records (this might still - * happen due to the write_inode_now() call). - * - * Also, if @datasync is true, we do not wait on the inode to be written out - * but we always wait on the page cache pages to be written out. - * - * Locking: Caller must hold i_mutex on the inode.
- * - * TODO: We should probably also write all attribute/index inodes associated - * with this inode but since we have no simple way of getting to them we ignore - * this problem for now. - */ -static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end, - int datasync) -{ - struct inode *vi = filp->f_mapping->host; - int err, ret = 0; - - ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); - - err = file_write_and_wait_range(filp, start, end); - if (err) - return err; - inode_lock(vi); - - BUG_ON(S_ISDIR(vi->i_mode)); - if (!datasync || !NInoNonResident(NTFS_I(vi))) - ret = __ntfs_write_inode(vi, 1); - write_inode_now(vi, !datasync); - /* - * NOTE: If we were to use mapping->private_list (see ext2 and - * fs/buffer.c) for dirty blocks then we could optimize the below to be - * sync_mapping_buffers(vi->i_mapping). - */ - err = sync_blockdev(vi->i_sb->s_bdev); - if (unlikely(err && !ret)) - ret = err; - if (likely(!ret)) - ntfs_debug("Done."); - else - ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " - "%u.", datasync ? "data" : "", vi->i_ino, -ret); - inode_unlock(vi); - return ret; -} - -#endif /* NTFS_RW */ - -const struct file_operations ntfs_file_ops = { - .llseek = generic_file_llseek, - .read_iter = generic_file_read_iter, -#ifdef NTFS_RW - .write_iter = ntfs_file_write_iter, - .fsync = ntfs_file_fsync, -#endif /* NTFS_RW */ - .mmap = generic_file_mmap, - .open = ntfs_file_open, - .splice_read = filemap_splice_read, -}; - -const struct inode_operations ntfs_file_inode_ops = { -#ifdef NTFS_RW - .setattr = ntfs_setattr, -#endif /* NTFS_RW */ -}; - -const struct file_operations ntfs_empty_file_ops = {}; - -const struct inode_operations ntfs_empty_inode_ops = {}; diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c deleted file mode 100644 index d46c2c03a032..000000000000 --- a/fs/ntfs/index.c +++ /dev/null @@ -1,440 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * index.c - NTFS kernel index handling. Part of the Linux-NTFS project. - * - * Copyright (c) 2004-2005 Anton Altaparmakov - */ - -#include <linux/slab.h> - -#include "aops.h" -#include "collate.h" -#include "debug.h" -#include "index.h" -#include "ntfs.h" - -/** - * ntfs_index_ctx_get - allocate and initialize a new index context - * @idx_ni: ntfs index inode with which to initialize the context - * - * Allocate a new index context, initialize it with @idx_ni and return it. - * Return NULL if allocation failed. - * - * Locking: Caller must hold i_mutex on the index inode. - */ -ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni) -{ - ntfs_index_context *ictx; - - ictx = kmem_cache_alloc(ntfs_index_ctx_cache, GFP_NOFS); - if (ictx) - *ictx = (ntfs_index_context){ .idx_ni = idx_ni }; - return ictx; -} - -/** - * ntfs_index_ctx_put - release an index context - * @ictx: index context to free - * - * Release the index context @ictx, releasing all associated resources. - * - * Locking: Caller must hold i_mutex on the index inode. 
- */ -void ntfs_index_ctx_put(ntfs_index_context *ictx) -{ - if (ictx->entry) { - if (ictx->is_in_root) { - if (ictx->actx) - ntfs_attr_put_search_ctx(ictx->actx); - if (ictx->base_ni) - unmap_mft_record(ictx->base_ni); - } else { - struct page *page = ictx->page; - if (page) { - BUG_ON(!PageLocked(page)); - unlock_page(page); - ntfs_unmap_page(page); - } - } - } - kmem_cache_free(ntfs_index_ctx_cache, ictx); - return; -} - -/** - * ntfs_index_lookup - find a key in an index and return its index entry - * @key: [IN] key for which to search in the index - * @key_len: [IN] length of @key in bytes - * @ictx: [IN/OUT] context describing the index and the returned entry - * - * Before calling ntfs_index_lookup(), @ictx must have been obtained from a - * call to ntfs_index_ctx_get(). - * - * Look for the @key in the index specified by the index lookup context @ictx. - * ntfs_index_lookup() walks the contents of the index looking for the @key. - * - * If the @key is found in the index, 0 is returned and @ictx is set up to - * describe the index entry containing the matching @key. @ictx->entry is the - * index entry and @ictx->data and @ictx->data_len are the index entry data and - * its length in bytes, respectively. - * - * If the @key is not found in the index, -ENOENT is returned and @ictx is - * set up to describe the index entry whose key collates immediately after the - * search @key, i.e. this is the position in the index at which an index entry - * with a key of @key would need to be inserted. - * - * If an error occurs, return the negative error code and @ictx is left - * untouched. - * - * When finished with the entry and its data, call ntfs_index_ctx_put() to free - * the context and other associated resources. - * - * If the index entry was modified, call ntfs_index_entry_flush_dcache_page() - * immediately after the modification and either ntfs_index_entry_mark_dirty() - * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to - * ensure that the changes are written to disk. - * - * Locking: - Caller must hold i_mutex on the index inode. - * - Each page cache page in the index allocation mapping must be - * locked whilst being accessed, otherwise we may find a corrupt - * page due to it being under ->writepage at the moment which - * applies the mst protection fixups before writing out and then - * removes them again after the write is complete after which it - * unlocks the page. - */ -int ntfs_index_lookup(const void *key, const int key_len, - ntfs_index_context *ictx) -{ - VCN vcn, old_vcn; - ntfs_inode *idx_ni = ictx->idx_ni; - ntfs_volume *vol = idx_ni->vol; - struct super_block *sb = vol->sb; - ntfs_inode *base_ni = idx_ni->ext.base_ntfs_ino; - MFT_RECORD *m; - INDEX_ROOT *ir; - INDEX_ENTRY *ie; - INDEX_ALLOCATION *ia; - u8 *index_end, *kaddr; - ntfs_attr_search_ctx *actx; - struct address_space *ia_mapping; - struct page *page; - int rc, err = 0; - - ntfs_debug("Entering."); - BUG_ON(!NInoAttr(idx_ni)); - BUG_ON(idx_ni->type != AT_INDEX_ALLOCATION); - BUG_ON(idx_ni->nr_extents != -1); - BUG_ON(!base_ni); - BUG_ON(!key); - BUG_ON(key_len <= 0); - if (!ntfs_is_collation_rule_supported( - idx_ni->itype.index.collation_rule)) { - ntfs_error(sb, "Index uses unsupported collation rule 0x%x. " - "Aborting lookup.", le32_to_cpu( - idx_ni->itype.index.collation_rule)); - return -EOPNOTSUPP; - } - /* Get hold of the mft record for the index inode.
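Putting the kernel-doc above together, a caller's lifecycle looks like the following sketch. It uses the driver's own types and functions, so it is illustrative rather than buildable on its own, and lookup_example is a hypothetical name:

/* Sketch of the get/lookup/put lifecycle described above; error paths trimmed. */
static int lookup_example(ntfs_inode *idx_ni, const void *key, int key_len)
{
	ntfs_index_context *ictx = ntfs_index_ctx_get(idx_ni);
	int err;

	if (!ictx)
		return -ENOMEM;
	err = ntfs_index_lookup(key, key_len, ictx);
	if (!err) {
		/* Found: ictx->entry, ictx->data and ictx->data_len are valid. */
	} else if (err == -ENOENT) {
		/* Not found: ictx describes the would-be insertion position. */
	}
	ntfs_index_ctx_put(ictx);	/* releases page / mft record / actx */
	return err;
}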
*/ - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - ntfs_error(sb, "map_mft_record() failed with error code %ld.", - -PTR_ERR(m)); - return PTR_ERR(m); - } - actx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!actx)) { - err = -ENOMEM; - goto err_out; - } - /* Find the index root attribute in the mft record. */ - err = ntfs_attr_lookup(AT_INDEX_ROOT, idx_ni->name, idx_ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, actx); - if (unlikely(err)) { - if (err == -ENOENT) { - ntfs_error(sb, "Index root attribute missing in inode " - "0x%lx.", idx_ni->mft_no); - err = -EIO; - } - goto err_out; - } - /* Get to the index root value (it has been verified in read_inode). */ - ir = (INDEX_ROOT*)((u8*)actx->attr + - le16_to_cpu(actx->attr->data.resident.value_offset)); - index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); - /* The first index entry. */ - ie = (INDEX_ENTRY*)((u8*)&ir->index + - le32_to_cpu(ir->index.entries_offset)); - /* - * Loop until we exceed valid memory (corruption case) or until we - * reach the last entry. - */ - for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { - /* Bounds checks. */ - if ((u8*)ie < (u8*)actx->mrec || (u8*)ie + - sizeof(INDEX_ENTRY_HEADER) > index_end || - (u8*)ie + le16_to_cpu(ie->length) > index_end) - goto idx_err_out; - /* - * The last entry cannot contain a key. It can, however, contain - * a pointer to a child node in the B+tree so we just break out. - */ - if (ie->flags & INDEX_ENTRY_END) - break; - /* Further bounds checks. */ - if ((u32)sizeof(INDEX_ENTRY_HEADER) + - le16_to_cpu(ie->key_length) > - le16_to_cpu(ie->data.vi.data_offset) || - (u32)le16_to_cpu(ie->data.vi.data_offset) + - le16_to_cpu(ie->data.vi.data_length) > - le16_to_cpu(ie->length)) - goto idx_err_out; - /* If the keys match perfectly, we set up @ictx and return 0. */ - if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key, - &ie->key, key_len)) { -ir_done: - ictx->is_in_root = true; - ictx->ir = ir; - ictx->actx = actx; - ictx->base_ni = base_ni; - ictx->ia = NULL; - ictx->page = NULL; -done: - ictx->entry = ie; - ictx->data = (u8*)ie + - le16_to_cpu(ie->data.vi.data_offset); - ictx->data_len = le16_to_cpu(ie->data.vi.data_length); - ntfs_debug("Done."); - return err; - } - /* - * Not a perfect match, need to do full-blown collation so we - * know which way in the B+tree we have to go. - */ - rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key, - key_len, &ie->key, le16_to_cpu(ie->key_length)); - /* - * If @key collates before the key of the current entry, there - * is definitely no such key in this index but we might need to - * descend into the B+tree so we just break out of the loop. - */ - if (rc == -1) - break; - /* - * A match should never happen as the memcmp() call should have - * caught it, but we still treat it correctly. - */ - if (!rc) - goto ir_done; - /* The keys are not equal, continue the search. */ - } - /* - * We have finished with this index without success. Check for the - * presence of a child node and if not present set up @ictx and return - * -ENOENT. - */ - if (!(ie->flags & INDEX_ENTRY_NODE)) { - ntfs_debug("Entry not found."); - err = -ENOENT; - goto ir_done; - } /* Child node present, descend into it. */ - /* Consistency check: Verify that an index allocation exists. */ - if (!NInoIndexAllocPresent(idx_ni)) { - ntfs_error(sb, "No index allocation attribute but index entry " - "requires one.
Inode 0x%lx is corrupt or " - "driver bug.", idx_ni->mft_no); - goto err_out; - } - /* Get the starting vcn of the index_block holding the child node. */ - vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8)); - ia_mapping = VFS_I(idx_ni)->i_mapping; - /* - * We are done with the index root and the mft record. Release them, - * otherwise we deadlock with ntfs_map_page(). - */ - ntfs_attr_put_search_ctx(actx); - unmap_mft_record(base_ni); - m = NULL; - actx = NULL; -descend_into_child_node: - /* - * Convert vcn to index into the index allocation attribute in units - * of PAGE_SIZE and map the page cache page, reading it from - * disk if necessary. - */ - page = ntfs_map_page(ia_mapping, vcn << - idx_ni->itype.index.vcn_size_bits >> PAGE_SHIFT); - if (IS_ERR(page)) { - ntfs_error(sb, "Failed to map index page, error %ld.", - -PTR_ERR(page)); - err = PTR_ERR(page); - goto err_out; - } - lock_page(page); - kaddr = (u8*)page_address(page); -fast_descend_into_child_node: - /* Get to the index allocation block. */ - ia = (INDEX_ALLOCATION*)(kaddr + ((vcn << - idx_ni->itype.index.vcn_size_bits) & ~PAGE_MASK)); - /* Bounds checks. */ - if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) { - ntfs_error(sb, "Out of bounds check failed. Corrupt inode " - "0x%lx or driver bug.", idx_ni->mft_no); - goto unm_err_out; - } - /* Catch multi-sector transfer fixup errors. */ - if (unlikely(!ntfs_is_indx_record(ia->magic))) { - ntfs_error(sb, "Index record with vcn 0x%llx is corrupt. " - "Corrupt inode 0x%lx. Run chkdsk.", - (long long)vcn, idx_ni->mft_no); - goto unm_err_out; - } - if (sle64_to_cpu(ia->index_block_vcn) != vcn) { - ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " - "different from expected VCN (0x%llx). Inode " - "0x%lx is corrupt or driver bug.", - (unsigned long long) - sle64_to_cpu(ia->index_block_vcn), - (unsigned long long)vcn, idx_ni->mft_no); - goto unm_err_out; - } - if (le32_to_cpu(ia->index.allocated_size) + 0x18 != - idx_ni->itype.index.block_size) { - ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx has " - "a size (%u) differing from the index " - "specified size (%u). Inode is corrupt or " - "driver bug.", (unsigned long long)vcn, - idx_ni->mft_no, - le32_to_cpu(ia->index.allocated_size) + 0x18, - idx_ni->itype.index.block_size); - goto unm_err_out; - } - index_end = (u8*)ia + idx_ni->itype.index.block_size; - if (index_end > kaddr + PAGE_SIZE) { - ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx " - "crosses page boundary. Impossible! Cannot " - "access! This is probably a bug in the " - "driver.", (unsigned long long)vcn, - idx_ni->mft_no); - goto unm_err_out; - } - index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); - if (index_end > (u8*)ia + idx_ni->itype.index.block_size) { - ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of inode " - "0x%lx exceeds maximum size.", - (unsigned long long)vcn, idx_ni->mft_no); - goto unm_err_out; - } - /* The first index entry. */ - ie = (INDEX_ENTRY*)((u8*)&ia->index + - le32_to_cpu(ia->index.entries_offset)); - /* - * Iterate as in the big loop above but applied to the index buffer, - * thus loop until we exceed valid memory (corruption case) or until we - * reach the last entry. - */ - for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { - /* Bounds checks.
*/ - if ((u8*)ie < (u8*)ia || (u8*)ie + - sizeof(INDEX_ENTRY_HEADER) > index_end || - (u8*)ie + le16_to_cpu(ie->length) > index_end) { - ntfs_error(sb, "Index entry out of bounds in inode " - "0x%lx.", idx_ni->mft_no); - goto unm_err_out; - } - /* - * The last entry cannot contain a key. It can, however, contain - * a pointer to a child node in the B+tree so we just break out. - */ - if (ie->flags & INDEX_ENTRY_END) - break; - /* Further bounds checks. */ - if ((u32)sizeof(INDEX_ENTRY_HEADER) + - le16_to_cpu(ie->key_length) > - le16_to_cpu(ie->data.vi.data_offset) || - (u32)le16_to_cpu(ie->data.vi.data_offset) + - le16_to_cpu(ie->data.vi.data_length) > - le16_to_cpu(ie->length)) { - ntfs_error(sb, "Index entry out of bounds in inode " - "0x%lx.", idx_ni->mft_no); - goto unm_err_out; - } - /* If the keys match perfectly, we set up @ictx and return 0. */ - if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key, - &ie->key, key_len)) { -ia_done: - ictx->is_in_root = false; - ictx->actx = NULL; - ictx->base_ni = NULL; - ictx->ia = ia; - ictx->page = page; - goto done; - } - /* - * Not a perfect match, need to do full-blown collation so we - * know which way in the B+tree we have to go. - */ - rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key, - key_len, &ie->key, le16_to_cpu(ie->key_length)); - /* - * If @key collates before the key of the current entry, there - * is definitely no such key in this index but we might need to - * descend into the B+tree so we just break out of the loop. - */ - if (rc == -1) - break; - /* - * A match should never happen as the memcmp() call should have - * caught it, but we still treat it correctly. - */ - if (!rc) - goto ia_done; - /* The keys are not equal, continue the search. */ - } - /* - * We have finished with this index buffer without success. Check for - * the presence of a child node and if not present return -ENOENT. - */ - if (!(ie->flags & INDEX_ENTRY_NODE)) { - ntfs_debug("Entry not found."); - err = -ENOENT; - goto ia_done; - } - if ((ia->index.flags & NODE_MASK) == LEAF_NODE) { - ntfs_error(sb, "Index entry with child node found in a leaf " - "node in inode 0x%lx.", idx_ni->mft_no); - goto unm_err_out; - } - /* Child node present, descend into it. */ - old_vcn = vcn; - vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8)); - if (vcn >= 0) { - /* - * If vcn is in the same page cache page as old_vcn, we recycle - * the mapped page. - */ - if (old_vcn << vol->cluster_size_bits >> - PAGE_SHIFT == vcn << - vol->cluster_size_bits >> - PAGE_SHIFT) - goto fast_descend_into_child_node; - unlock_page(page); - ntfs_unmap_page(page); - goto descend_into_child_node; - } - ntfs_error(sb, "Negative child node vcn in inode 0x%lx.", - idx_ni->mft_no); -unm_err_out: - unlock_page(page); - ntfs_unmap_page(page); -err_out: - if (!err) - err = -EIO; - if (actx) - ntfs_attr_put_search_ctx(actx); - if (m) - unmap_mft_record(base_ni); - return err; -idx_err_out: - ntfs_error(sb, "Corrupt index. Aborting lookup."); - goto err_out; -} diff --git a/fs/ntfs/index.h b/fs/ntfs/index.h deleted file mode 100644 index bb3c3ae55138..000000000000 --- a/fs/ntfs/index.h +++ /dev/null @@ -1,134 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * index.h - Defines for NTFS kernel index handling. Part of the Linux-NTFS - * project.
- * - * Copyright (c) 2004 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_INDEX_H -#define _LINUX_NTFS_INDEX_H - -#include <linux/fs.h> - -#include "types.h" -#include "layout.h" -#include "inode.h" -#include "attrib.h" -#include "mft.h" -#include "aops.h" - -/** - * ntfs_index_context - context for the index lookup functions - * @idx_ni: index inode containing the @entry described by this context - * @entry: index entry (points into @ir or @ia) - * @data: index entry data (points into @entry) - * @data_len: length in bytes of @data - * @is_in_root: 'true' if @entry is in @ir and 'false' if it is in @ia - * @ir: index root if @is_in_root and NULL otherwise - * @actx: attribute search context if @is_in_root and NULL otherwise - * @base_ni: base inode if @is_in_root and NULL otherwise - * @ia: index block if @is_in_root is 'false' and NULL otherwise - * @page: page if @is_in_root is 'false' and NULL otherwise - * - * @idx_ni is the index inode this context belongs to. - * - * @entry is the index entry described by this context. @data and @data_len - * are the index entry data and its length in bytes, respectively. @data - * simply points into @entry. This is probably what the user is interested in. - * - * If @is_in_root is 'true', @entry is in the index root attribute @ir described - * by the attribute search context @actx and the base inode @base_ni. @ia and - * @page are NULL in this case. - * - * If @is_in_root is 'false', @entry is in the index allocation attribute and @ia - * and @page point to the index allocation block and the mapped, locked page it - * is in, respectively. @ir, @actx and @base_ni are NULL in this case. - * - * To obtain a context call ntfs_index_ctx_get(). - * - * We use this context to allow ntfs_index_lookup() to return the found index - * @entry and its @data without having to allocate a buffer and copy the @entry - * and/or its @data into it. - * - * When finished with the @entry and its @data, call ntfs_index_ctx_put() to - * free the context and other associated resources. - * - * If the index entry was modified, call ntfs_index_entry_flush_dcache_page() - * immediately after the modification and either ntfs_index_entry_mark_dirty() - * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to - * ensure that the changes are written to disk. - */ -typedef struct { - ntfs_inode *idx_ni; - INDEX_ENTRY *entry; - void *data; - u16 data_len; - bool is_in_root; - INDEX_ROOT *ir; - ntfs_attr_search_ctx *actx; - ntfs_inode *base_ni; - INDEX_ALLOCATION *ia; - struct page *page; -} ntfs_index_context; - -extern ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni); -extern void ntfs_index_ctx_put(ntfs_index_context *ictx); - -extern int ntfs_index_lookup(const void *key, const int key_len, - ntfs_index_context *ictx); - -#ifdef NTFS_RW - -/** - * ntfs_index_entry_flush_dcache_page - flush_dcache_page() for index entries - * @ictx: ntfs index context describing the index entry - * - * Call flush_dcache_page() for the page in which an index entry resides. - * - * This must be called every time an index entry is modified, just after the - * modification. - * - * If the index entry is in the index root attribute, simply flush the page - * containing the mft record containing the index root attribute. - * - * If the index entry is in an index block belonging to the index allocation - * attribute, simply flush the page cache page containing the index block.
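The modification protocol the context documentation above prescribes, as a sketch built on the helpers defined just below. It uses the driver's types, so it is not compilable standalone, and update_entry_data, new_data and new_len are hypothetical names:

/* Sketch of the modify/flush/dirty/put sequence described above. */
static void update_entry_data(ntfs_index_context *ictx,
		const void *new_data, u16 new_len)
{
	BUG_ON(new_len > ictx->data_len);	/* no room to grow in place */
	memcpy(ictx->data, new_data, new_len);	/* modify the entry data */
	ntfs_index_entry_flush_dcache_page(ictx);
	ntfs_index_entry_mark_dirty(ictx);	/* ensure it reaches disk */
	/* ...then ntfs_index_ctx_put(ictx) once finished with the entry. */
}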
- */ -static inline void ntfs_index_entry_flush_dcache_page(ntfs_index_context *ictx) -{ - if (ictx->is_in_root) - flush_dcache_mft_record_page(ictx->actx->ntfs_ino); - else - flush_dcache_page(ictx->page); -} - -/** - * ntfs_index_entry_mark_dirty - mark an index entry dirty - * @ictx: ntfs index context describing the index entry - * - * Mark the index entry described by the index entry context @ictx dirty. - * - * If the index entry is in the index root attribute, simply mark the mft - * record containing the index root attribute dirty. This ensures the mft - * record, and hence the index root attribute, will be written out to disk - * later. - * - * If the index entry is in an index block belonging to the index allocation - * attribute, mark the buffers belonging to the index record as well as the - * page cache page the index block is in dirty. This automatically marks the - * VFS inode of the ntfs index inode to which the index entry belongs dirty, - * too (I_DIRTY_PAGES) and this in turn ensures the page buffers, and hence the - * dirty index block, will be written out to disk later. - */ -static inline void ntfs_index_entry_mark_dirty(ntfs_index_context *ictx) -{ - if (ictx->is_in_root) - mark_mft_record_dirty(ictx->actx->ntfs_ino); - else - mark_ntfs_record_dirty(ictx->page, - (u8*)ictx->ia - (u8*)page_address(ictx->page)); -} - -#endif /* NTFS_RW */ - -#endif /* _LINUX_NTFS_INDEX_H */ diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c deleted file mode 100644 index aba1e22db4e9..000000000000 --- a/fs/ntfs/inode.c +++ /dev/null @@ -1,3102 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * inode.c - NTFS kernel inode handling. - * - * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc. - */ - -#include <linux/buffer_head.h> -#include <linux/fs.h> -#include <linux/mm.h> -#include <linux/mount.h> -#include <linux/mutex.h> -#include <linux/pagemap.h> -#include <linux/quotaops.h> -#include <linux/slab.h> -#include <linux/log2.h> - -#include "aops.h" -#include "attrib.h" -#include "bitmap.h" -#include "dir.h" -#include "debug.h" -#include "inode.h" -#include "lcnalloc.h" -#include "malloc.h" -#include "mft.h" -#include "time.h" -#include "ntfs.h" - -/** - * ntfs_test_inode - compare two (possibly fake) inodes for equality - * @vi: vfs inode which to test - * @data: data which is being tested with - * - * Compare the ntfs attribute embedded in the ntfs specific part of the vfs - * inode @vi for equality with the ntfs attribute @data. - * - * If searching for the normal file/directory inode, set @na->type to AT_UNUSED. - * @na->name and @na->name_len are then ignored. - * - * Return 1 if the attributes match and 0 if not. - * - * NOTE: This function runs with the inode_hash_lock spin lock held so it is not - * allowed to sleep. - */ -int ntfs_test_inode(struct inode *vi, void *data) -{ - ntfs_attr *na = (ntfs_attr *)data; - ntfs_inode *ni; - - if (vi->i_ino != na->mft_no) - return 0; - ni = NTFS_I(vi); - /* If !NInoAttr(ni), @vi is a normal file or directory inode. */ - if (likely(!NInoAttr(ni))) { - /* If not looking for a normal inode this is a mismatch. */ - if (unlikely(na->type != AT_UNUSED)) - return 0; - } else { - /* A fake inode describing an attribute. */ - if (ni->type != na->type) - return 0; - if (ni->name_len != na->name_len) - return 0; - if (na->name_len && memcmp(ni->name, na->name, - na->name_len * sizeof(ntfschar))) - return 0; - } - /* Match! 
*/ - return 1; -} - -/** - * ntfs_init_locked_inode - initialize an inode - * @vi: vfs inode to initialize - * @data: data which to initialize @vi to - * - * Initialize the vfs inode @vi with the values from the ntfs attribute @data in - * order to enable ntfs_test_inode() to do its work. - * - * If initializing the normal file/directory inode, set @na->type to AT_UNUSED. - * In that case, @na->name and @na->name_len should be set to NULL and 0, - * respectively. Although that is not strictly necessary as - * ntfs_read_locked_inode() will fill them in later. - * - * Return 0 on success and -errno on error. - * - * NOTE: This function runs with the inode->i_lock spin lock held so it is not - * allowed to sleep. (Hence the GFP_ATOMIC allocation.) - */ -static int ntfs_init_locked_inode(struct inode *vi, void *data) -{ - ntfs_attr *na = (ntfs_attr *)data; - ntfs_inode *ni = NTFS_I(vi); - - vi->i_ino = na->mft_no; - - ni->type = na->type; - if (na->type == AT_INDEX_ALLOCATION) - NInoSetMstProtected(ni); - - ni->name = na->name; - ni->name_len = na->name_len; - - /* If initializing a normal inode, we are done. */ - if (likely(na->type == AT_UNUSED)) { - BUG_ON(na->name); - BUG_ON(na->name_len); - return 0; - } - - /* It is a fake inode. */ - NInoSetAttr(ni); - - /* - * We have I30 global constant as an optimization as it is the name - * in >99.9% of named attributes! The other <0.1% incur a GFP_ATOMIC - * allocation but that is ok. And most attributes are unnamed anyway, - * thus the fraction of named attributes with name != I30 is actually - * absolutely tiny. - */ - if (na->name_len && na->name != I30) { - unsigned int i; - - BUG_ON(!na->name); - i = na->name_len * sizeof(ntfschar); - ni->name = kmalloc(i + sizeof(ntfschar), GFP_ATOMIC); - if (!ni->name) - return -ENOMEM; - memcpy(ni->name, na->name, i); - ni->name[na->name_len] = 0; - } - return 0; -} - -static int ntfs_read_locked_inode(struct inode *vi); -static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi); -static int ntfs_read_locked_index_inode(struct inode *base_vi, - struct inode *vi); - -/** - * ntfs_iget - obtain a struct inode corresponding to a specific normal inode - * @sb: super block of mounted volume - * @mft_no: mft record number / inode number to obtain - * - * Obtain the struct inode corresponding to a specific normal inode (i.e. a - * file or directory). - * - * If the inode is in the cache, it is just returned with an increased - * reference count. Otherwise, a new struct inode is allocated and initialized, - * and finally ntfs_read_locked_inode() is called to read in the inode and - * fill in the remainder of the inode structure. - * - * Return the struct inode on success. Check the return value with IS_ERR() and - * if true, the function failed and the error code is obtained from PTR_ERR(). - */ -struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no) -{ - struct inode *vi; - int err; - ntfs_attr na; - - na.mft_no = mft_no; - na.type = AT_UNUSED; - na.name = NULL; - na.name_len = 0; - - vi = iget5_locked(sb, mft_no, ntfs_test_inode, - ntfs_init_locked_inode, &na); - if (unlikely(!vi)) - return ERR_PTR(-ENOMEM); - - err = 0; - - /* If this is a freshly allocated inode, need to read it now. */ - if (vi->i_state & I_NEW) { - err = ntfs_read_locked_inode(vi); - unlock_new_inode(vi); - } - /* - * There is no point in keeping bad inodes around if the failure was - * due to ENOMEM. We want to be able to retry again later. 
- */ - if (unlikely(err == -ENOMEM)) { - iput(vi); - vi = ERR_PTR(err); - } - return vi; -} - -/** - * ntfs_attr_iget - obtain a struct inode corresponding to an attribute - * @base_vi: vfs base inode containing the attribute - * @type: attribute type - * @name: Unicode name of the attribute (NULL if unnamed) - * @name_len: length of @name in Unicode characters (0 if unnamed) - * - * Obtain the (fake) struct inode corresponding to the attribute specified by - * @type, @name, and @name_len, which is present in the base mft record - * specified by the vfs inode @base_vi. - * - * If the attribute inode is in the cache, it is just returned with an - * increased reference count. Otherwise, a new struct inode is allocated and - * initialized, and finally ntfs_read_locked_attr_inode() is called to read the - * attribute and fill in the inode structure. - * - * Note, for index allocation attributes, you need to use ntfs_index_iget() - * instead of ntfs_attr_iget() as working with indices is a lot more complex. - * - * Return the struct inode of the attribute inode on success. Check the return - * value with IS_ERR() and if true, the function failed and the error code is - * obtained from PTR_ERR(). - */ -struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type, - ntfschar *name, u32 name_len) -{ - struct inode *vi; - int err; - ntfs_attr na; - - /* Make sure no one calls ntfs_attr_iget() for indices. */ - BUG_ON(type == AT_INDEX_ALLOCATION); - - na.mft_no = base_vi->i_ino; - na.type = type; - na.name = name; - na.name_len = name_len; - - vi = iget5_locked(base_vi->i_sb, na.mft_no, ntfs_test_inode, - ntfs_init_locked_inode, &na); - if (unlikely(!vi)) - return ERR_PTR(-ENOMEM); - - err = 0; - - /* If this is a freshly allocated inode, need to read it now. */ - if (vi->i_state & I_NEW) { - err = ntfs_read_locked_attr_inode(base_vi, vi); - unlock_new_inode(vi); - } - /* - * There is no point in keeping bad attribute inodes around. This also - * simplifies things in that we never need to check for bad attribute - * inodes elsewhere. - */ - if (unlikely(err)) { - iput(vi); - vi = ERR_PTR(err); - } - return vi; -} - -/** - * ntfs_index_iget - obtain a struct inode corresponding to an index - * @base_vi: vfs base inode containing the index related attributes - * @name: Unicode name of the index - * @name_len: length of @name in Unicode characters - * - * Obtain the (fake) struct inode corresponding to the index specified by @name - * and @name_len, which is present in the base mft record specified by the vfs - * inode @base_vi. - * - * If the index inode is in the cache, it is just returned with an increased - * reference count. Otherwise, a new struct inode is allocated and - * initialized, and finally ntfs_read_locked_index_inode() is called to read - * the index related attributes and fill in the inode structure. - * - * Return the struct inode of the index inode on success. Check the return - * value with IS_ERR() and if true, the function failed and the error code is - * obtained from PTR_ERR(). - */ -struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name, - u32 name_len) -{ - struct inode *vi; - int err; - ntfs_attr na; - - na.mft_no = base_vi->i_ino; - na.type = AT_INDEX_ALLOCATION; - na.name = name; - na.name_len = name_len; - - vi = iget5_locked(base_vi->i_sb, na.mft_no, ntfs_test_inode, - ntfs_init_locked_inode, &na); - if (unlikely(!vi)) - return ERR_PTR(-ENOMEM); - - err = 0; - - /* If this is a freshly allocated inode, need to read it now. 
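The three iget variants here share one error convention; a kernel-style sketch of the caller side, with an invented function name (illustrative, not from the driver):

static int example_open_by_mft_no(struct super_block *sb, unsigned long mft_no)
{
	struct inode *vi = ntfs_iget(sb, mft_no);

	if (IS_ERR(vi))
		return PTR_ERR(vi);	/* e.g. -ENOMEM or a read error */
	/* ... use vi ... */
	iput(vi);
	return 0;
}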
*/ - if (vi->i_state & I_NEW) { - err = ntfs_read_locked_index_inode(base_vi, vi); - unlock_new_inode(vi); - } - /* - * There is no point in keeping bad index inodes around. This also - * simplifies things in that we never need to check for bad index - * inodes elsewhere. - */ - if (unlikely(err)) { - iput(vi); - vi = ERR_PTR(err); - } - return vi; -} - -struct inode *ntfs_alloc_big_inode(struct super_block *sb) -{ - ntfs_inode *ni; - - ntfs_debug("Entering."); - ni = alloc_inode_sb(sb, ntfs_big_inode_cache, GFP_NOFS); - if (likely(ni != NULL)) { - ni->state = 0; - return VFS_I(ni); - } - ntfs_error(sb, "Allocation of NTFS big inode structure failed."); - return NULL; -} - -void ntfs_free_big_inode(struct inode *inode) -{ - kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode)); -} - -static inline ntfs_inode *ntfs_alloc_extent_inode(void) -{ - ntfs_inode *ni; - - ntfs_debug("Entering."); - ni = kmem_cache_alloc(ntfs_inode_cache, GFP_NOFS); - if (likely(ni != NULL)) { - ni->state = 0; - return ni; - } - ntfs_error(NULL, "Allocation of NTFS inode structure failed."); - return NULL; -} - -static void ntfs_destroy_extent_inode(ntfs_inode *ni) -{ - ntfs_debug("Entering."); - BUG_ON(ni->page); - if (!atomic_dec_and_test(&ni->count)) - BUG(); - kmem_cache_free(ntfs_inode_cache, ni); -} - -/* - * The attribute runlist lock has separate locking rules from the - * normal runlist lock, so split the two lock-classes: - */ -static struct lock_class_key attr_list_rl_lock_class; - -/** - * __ntfs_init_inode - initialize ntfs specific part of an inode - * @sb: super block of mounted volume - * @ni: freshly allocated ntfs inode which to initialize - * - * Initialize an ntfs inode to defaults. - * - * NOTE: ni->mft_no, ni->state, ni->type, ni->name, and ni->name_len are left - * untouched. Make sure to initialize them elsewhere. - * - * Return zero on success and -ENOMEM on error. - */ -void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni) -{ - ntfs_debug("Entering."); - rwlock_init(&ni->size_lock); - ni->initialized_size = ni->allocated_size = 0; - ni->seq_no = 0; - atomic_set(&ni->count, 1); - ni->vol = NTFS_SB(sb); - ntfs_init_runlist(&ni->runlist); - mutex_init(&ni->mrec_lock); - ni->page = NULL; - ni->page_ofs = 0; - ni->attr_list_size = 0; - ni->attr_list = NULL; - ntfs_init_runlist(&ni->attr_list_rl); - lockdep_set_class(&ni->attr_list_rl.lock, - &attr_list_rl_lock_class); - ni->itype.index.block_size = 0; - ni->itype.index.vcn_size = 0; - ni->itype.index.collation_rule = 0; - ni->itype.index.block_size_bits = 0; - ni->itype.index.vcn_size_bits = 0; - mutex_init(&ni->extent_lock); - ni->nr_extents = 0; - ni->ext.base_ntfs_ino = NULL; -} - -/* - * Extent inodes get MFT-mapped in a nested way, while the base inode - * is still mapped. 
Teach this nesting to the lock validator by creating - * a separate class for nested inode's mrec_lock's: - */ -static struct lock_class_key extent_inode_mrec_lock_key; - -inline ntfs_inode *ntfs_new_extent_inode(struct super_block *sb, - unsigned long mft_no) -{ - ntfs_inode *ni = ntfs_alloc_extent_inode(); - - ntfs_debug("Entering."); - if (likely(ni != NULL)) { - __ntfs_init_inode(sb, ni); - lockdep_set_class(&ni->mrec_lock, &extent_inode_mrec_lock_key); - ni->mft_no = mft_no; - ni->type = AT_UNUSED; - ni->name = NULL; - ni->name_len = 0; - } - return ni; -} - -/** - * ntfs_is_extended_system_file - check if a file is in the $Extend directory - * @ctx: initialized attribute search context - * - * Search all file name attributes in the inode described by the attribute - * search context @ctx and check if any of the names are in the $Extend system - * directory. - * - * Return values: - * 1: file is in $Extend directory - * 0: file is not in $Extend directory - * -errno: failed to determine if the file is in the $Extend directory - */ -static int ntfs_is_extended_system_file(ntfs_attr_search_ctx *ctx) -{ - int nr_links, err; - - /* Restart search. */ - ntfs_attr_reinit_search_ctx(ctx); - - /* Get number of hard links. */ - nr_links = le16_to_cpu(ctx->mrec->link_count); - - /* Loop through all hard links. */ - while (!(err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, 0, 0, NULL, 0, - ctx))) { - FILE_NAME_ATTR *file_name_attr; - ATTR_RECORD *attr = ctx->attr; - u8 *p, *p2; - - nr_links--; - /* - * Maximum sanity checking as we are called on an inode that - * we suspect might be corrupt. - */ - p = (u8*)attr + le32_to_cpu(attr->length); - if (p < (u8*)ctx->mrec || (u8*)p > (u8*)ctx->mrec + - le32_to_cpu(ctx->mrec->bytes_in_use)) { -err_corrupt_attr: - ntfs_error(ctx->ntfs_ino->vol->sb, "Corrupt file name " - "attribute. You should run chkdsk."); - return -EIO; - } - if (attr->non_resident) { - ntfs_error(ctx->ntfs_ino->vol->sb, "Non-resident file " - "name. You should run chkdsk."); - return -EIO; - } - if (attr->flags) { - ntfs_error(ctx->ntfs_ino->vol->sb, "File name with " - "invalid flags. You should run " - "chkdsk."); - return -EIO; - } - if (!(attr->data.resident.flags & RESIDENT_ATTR_IS_INDEXED)) { - ntfs_error(ctx->ntfs_ino->vol->sb, "Unindexed file " - "name. You should run chkdsk."); - return -EIO; - } - file_name_attr = (FILE_NAME_ATTR*)((u8*)attr + - le16_to_cpu(attr->data.resident.value_offset)); - p2 = (u8 *)file_name_attr + le32_to_cpu(attr->data.resident.value_length); - if (p2 < (u8*)attr || p2 > p) - goto err_corrupt_attr; - /* This attribute is ok, but is it in the $Extend directory? */ - if (MREF_LE(file_name_attr->parent_directory) == FILE_Extend) - return 1; /* YES, it's an extended system file. */ - } - if (unlikely(err != -ENOENT)) - return err; - if (unlikely(nr_links)) { - ntfs_error(ctx->ntfs_ino->vol->sb, "Inode hard link count " - "doesn't match number of name attributes. You " - "should run chkdsk."); - return -EIO; - } - return 0; /* NO, it is not an extended system file. */ -} - -/** - * ntfs_read_locked_inode - read an inode from its device - * @vi: inode to read - * - * ntfs_read_locked_inode() is called from ntfs_iget() to read the inode - * described by @vi into memory from the device. - * - * The only fields in @vi that we need to/can look at when the function is - * called are i_sb, pointing to the mounted device's super block, and i_ino, - * the number of the inode to load. 
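The FILE_Extend parent check above decodes an mft reference with MREF_LE(); as background, NTFS packs a 48-bit mft record number and a 16-bit sequence number into one 64-bit reference. A hedged sketch of that unpacking, with invented helper names rather than the driver's exact macros:

static inline unsigned long example_mref(u64 mref)
{
	return (unsigned long)(mref & 0x0000ffffffffffffULL); /* low 48 bits */
}

static inline u16 example_mseqno(u64 mref)
{
	return (u16)(mref >> 48); /* high 16 bits: sequence number */
}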
- * - * ntfs_read_locked_inode() maps, pins and locks the mft record number i_ino - * for reading and sets up the necessary @vi fields as well as initializing - * the ntfs inode. - * - * Q: What locks are held when the function is called? - * A: i_state has I_NEW set, hence the inode is locked, also - * i_count is set to 1, so it is not going to go away - * i_flags is set to 0 and we have no business touching it. Only an ioctl() - * is allowed to write to them. We should of course be honouring them but - * we need to do that using the IS_* macros defined in include/linux/fs.h. - * In any case ntfs_read_locked_inode() has nothing to do with i_flags. - * - * Return 0 on success and -errno on error. In the error case, the inode will - * have had make_bad_inode() executed on it. - */ -static int ntfs_read_locked_inode(struct inode *vi) -{ - ntfs_volume *vol = NTFS_SB(vi->i_sb); - ntfs_inode *ni; - struct inode *bvi; - MFT_RECORD *m; - ATTR_RECORD *a; - STANDARD_INFORMATION *si; - ntfs_attr_search_ctx *ctx; - int err = 0; - - ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino); - - /* Setup the generic vfs inode parts now. */ - vi->i_uid = vol->uid; - vi->i_gid = vol->gid; - vi->i_mode = 0; - - /* - * Initialize the ntfs specific part of @vi special casing - * FILE_MFT which we need to do at mount time. - */ - if (vi->i_ino != FILE_MFT) - ntfs_init_big_inode(vi); - ni = NTFS_I(vi); - - m = map_mft_record(ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(ni, m); - if (!ctx) { - err = -ENOMEM; - goto unm_err_out; - } - - if (!(m->flags & MFT_RECORD_IN_USE)) { - ntfs_error(vi->i_sb, "Inode is not in use!"); - goto unm_err_out; - } - if (m->base_mft_record) { - ntfs_error(vi->i_sb, "Inode is an extent inode!"); - goto unm_err_out; - } - - /* Transfer information from mft record into vfs and ntfs inodes. */ - vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number); - - /* - * FIXME: Keep in mind that link_count is two for files which have both - * a long file name and a short file name as separate entries, so if - * we are hiding short file names this will be too high. Either we need - * to account for the short file names by subtracting them or we need - * to make sure we delete files even though i_nlink is not zero which - * might be tricky due to vfs interactions. Need to think about this - * some more when implementing the unlink command. - */ - set_nlink(vi, le16_to_cpu(m->link_count)); - /* - * FIXME: Reparse points can have the directory bit set even though - * they would be S_IFLNK. Need to deal with this further below when we - * implement reparse points / symbolic links but it will do for now. - * Also if not a directory, it could be something else, rather than - * a regular file. But again, will do for now. - */ - /* Everyone gets all permissions. */ - vi->i_mode |= S_IRWXUGO; - /* If read-only, no one gets write permissions. */ - if (IS_RDONLY(vi)) - vi->i_mode &= ~S_IWUGO; - if (m->flags & MFT_RECORD_IS_DIRECTORY) { - vi->i_mode |= S_IFDIR; - /* - * Apply the directory permissions mask set in the mount - * options. - */ - vi->i_mode &= ~vol->dmask; - /* Things break without this kludge! */ - if (vi->i_nlink > 1) - set_nlink(vi, 1); - } else { - vi->i_mode |= S_IFREG; - /* Apply the file permissions mask set in the mount options. */ - vi->i_mode &= ~vol->fmask; - } - /* - * Find the standard information attribute in the mft record. 
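The permission setup in the code above is a umask-style derivation; a condensed restatement of the same steps (is_dir stands in for the MFT_RECORD_IS_DIRECTORY test):

umode_t mode = S_IRWXUGO;			/* everyone gets everything */
if (IS_RDONLY(vi))
	mode &= ~S_IWUGO;			/* read-only mount drops write bits */
if (is_dir)
	mode = (mode | S_IFDIR) & ~vol->dmask;	/* apply directory mask */
else
	mode = (mode | S_IFREG) & ~vol->fmask;	/* apply file mask */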
At this - * stage we haven't setup the attribute list stuff yet, so this could - * in fact fail if the standard information is in an extent record, but - * I don't think this actually ever happens. - */ - err = ntfs_attr_lookup(AT_STANDARD_INFORMATION, NULL, 0, 0, 0, NULL, 0, - ctx); - if (unlikely(err)) { - if (err == -ENOENT) { - /* - * TODO: We should be performing a hot fix here (if the - * recover mount option is set) by creating a new - * attribute. - */ - ntfs_error(vi->i_sb, "$STANDARD_INFORMATION attribute " - "is missing."); - } - goto unm_err_out; - } - a = ctx->attr; - /* Get the standard information attribute value. */ - if ((u8 *)a + le16_to_cpu(a->data.resident.value_offset) - + le32_to_cpu(a->data.resident.value_length) > - (u8 *)ctx->mrec + vol->mft_record_size) { - ntfs_error(vi->i_sb, "Corrupt standard information attribute in inode."); - goto unm_err_out; - } - si = (STANDARD_INFORMATION*)((u8*)a + - le16_to_cpu(a->data.resident.value_offset)); - - /* Transfer information from the standard information into vi. */ - /* - * Note: The i_?times do not quite map perfectly onto the NTFS times, - * but they are close enough, and in the end it doesn't really matter - * that much... - */ - /* - * mtime is the last change of the data within the file. Not changed - * when only metadata is changed, e.g. a rename doesn't affect mtime. - */ - inode_set_mtime_to_ts(vi, ntfs2utc(si->last_data_change_time)); - /* - * ctime is the last change of the metadata of the file. This obviously - * always changes, when mtime is changed. ctime can be changed on its - * own, mtime is then not changed, e.g. when a file is renamed. - */ - inode_set_ctime_to_ts(vi, ntfs2utc(si->last_mft_change_time)); - /* - * Last access to the data within the file. Not changed during a rename - * for example but changed whenever the file is written to. - */ - inode_set_atime_to_ts(vi, ntfs2utc(si->last_access_time)); - - /* Find the attribute list attribute if present. */ - ntfs_attr_reinit_search_ctx(ctx); - err = ntfs_attr_lookup(AT_ATTRIBUTE_LIST, NULL, 0, 0, 0, NULL, 0, ctx); - if (err) { - if (unlikely(err != -ENOENT)) { - ntfs_error(vi->i_sb, "Failed to lookup attribute list " - "attribute."); - goto unm_err_out; - } - } else /* if (!err) */ { - if (vi->i_ino == FILE_MFT) - goto skip_attr_list_load; - ntfs_debug("Attribute list found in inode 0x%lx.", vi->i_ino); - NInoSetAttrList(ni); - a = ctx->attr; - if (a->flags & ATTR_COMPRESSION_MASK) { - ntfs_error(vi->i_sb, "Attribute list attribute is " - "compressed."); - goto unm_err_out; - } - if (a->flags & ATTR_IS_ENCRYPTED || - a->flags & ATTR_IS_SPARSE) { - if (a->non_resident) { - ntfs_error(vi->i_sb, "Non-resident attribute " - "list attribute is encrypted/" - "sparse."); - goto unm_err_out; - } - ntfs_warning(vi->i_sb, "Resident attribute list " - "attribute in inode 0x%lx is marked " - "encrypted/sparse which is not true. " - "However, Windows allows this and " - "chkdsk does not detect or correct it " - "so we will just ignore the invalid " - "flags and pretend they are not set.", - vi->i_ino); - } - /* Now allocate memory for the attribute list. 
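The ntfs2utc() calls above convert NTFS timestamps, which are little-endian counts of 100ns ticks since 1601-01-01, into Unix time. A kernel-style sketch of such a conversion, assuming the usual 11644473600 second epoch offset (illustrative, not claimed to be the driver's exact code):

/* 1601-01-01 to 1970-01-01, measured in 100ns ticks. */
#define EXAMPLE_NTFS_TIME_OFFSET ((s64)(369 * 365 + 89) * 24 * 3600 * 10000000)

static struct timespec64 example_ntfs2utc(s64 ntfs_time)
{
	struct timespec64 ts;
	u64 t = (u64)(ntfs_time - EXAMPLE_NTFS_TIME_OFFSET);

	ts.tv_nsec = do_div(t, 10000000) * 100;	/* remainder in 100ns ticks */
	ts.tv_sec = t;				/* quotient in whole seconds */
	return ts;
}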
*/ - ni->attr_list_size = (u32)ntfs_attr_size(a); - ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size); - if (!ni->attr_list) { - ntfs_error(vi->i_sb, "Not enough memory to allocate " - "buffer for attribute list."); - err = -ENOMEM; - goto unm_err_out; - } - if (a->non_resident) { - NInoSetAttrListNonResident(ni); - if (a->data.non_resident.lowest_vcn) { - ntfs_error(vi->i_sb, "Attribute list has non " - "zero lowest_vcn."); - goto unm_err_out; - } - /* - * Setup the runlist. No need for locking as we have - * exclusive access to the inode at this time. - */ - ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol, - a, NULL); - if (IS_ERR(ni->attr_list_rl.rl)) { - err = PTR_ERR(ni->attr_list_rl.rl); - ni->attr_list_rl.rl = NULL; - ntfs_error(vi->i_sb, "Mapping pairs " - "decompression failed."); - goto unm_err_out; - } - /* Now load the attribute list. */ - if ((err = load_attribute_list(vol, &ni->attr_list_rl, - ni->attr_list, ni->attr_list_size, - sle64_to_cpu(a->data.non_resident. - initialized_size)))) { - ntfs_error(vi->i_sb, "Failed to load " - "attribute list attribute."); - goto unm_err_out; - } - } else /* if (!a->non_resident) */ { - if ((u8*)a + le16_to_cpu(a->data.resident.value_offset) - + le32_to_cpu( - a->data.resident.value_length) > - (u8*)ctx->mrec + vol->mft_record_size) { - ntfs_error(vi->i_sb, "Corrupt attribute list " - "in inode."); - goto unm_err_out; - } - /* Now copy the attribute list. */ - memcpy(ni->attr_list, (u8*)a + le16_to_cpu( - a->data.resident.value_offset), - le32_to_cpu( - a->data.resident.value_length)); - } - } -skip_attr_list_load: - /* - * If an attribute list is present we now have the attribute list value - * in ntfs_ino->attr_list and it is ntfs_ino->attr_list_size bytes. - */ - if (S_ISDIR(vi->i_mode)) { - loff_t bvi_size; - ntfs_inode *bni; - INDEX_ROOT *ir; - u8 *ir_end, *index_end; - - /* It is a directory, find index root attribute. */ - ntfs_attr_reinit_search_ctx(ctx); - err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, - 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) { - // FIXME: File is corrupt! Hot-fix with empty - // index root attribute if recovery option is - // set. - ntfs_error(vi->i_sb, "$INDEX_ROOT attribute " - "is missing."); - } - goto unm_err_out; - } - a = ctx->attr; - /* Set up the state. */ - if (unlikely(a->non_resident)) { - ntfs_error(vol->sb, "$INDEX_ROOT attribute is not " - "resident."); - goto unm_err_out; - } - /* Ensure the attribute name is placed before the value. */ - if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= - le16_to_cpu(a->data.resident.value_offset)))) { - ntfs_error(vol->sb, "$INDEX_ROOT attribute name is " - "placed after the attribute value."); - goto unm_err_out; - } - /* - * Compressed/encrypted index root just means that the newly - * created files in that directory should be created compressed/ - * encrypted. However index root cannot be both compressed and - * encrypted. 
- */ - if (a->flags & ATTR_COMPRESSION_MASK) - NInoSetCompressed(ni); - if (a->flags & ATTR_IS_ENCRYPTED) { - if (a->flags & ATTR_COMPRESSION_MASK) { - ntfs_error(vi->i_sb, "Found encrypted and " - "compressed attribute."); - goto unm_err_out; - } - NInoSetEncrypted(ni); - } - if (a->flags & ATTR_IS_SPARSE) - NInoSetSparse(ni); - ir = (INDEX_ROOT*)((u8*)a + - le16_to_cpu(a->data.resident.value_offset)); - ir_end = (u8*)ir + le32_to_cpu(a->data.resident.value_length); - if (ir_end > (u8*)ctx->mrec + vol->mft_record_size) { - ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is " - "corrupt."); - goto unm_err_out; - } - index_end = (u8*)&ir->index + - le32_to_cpu(ir->index.index_length); - if (index_end > ir_end) { - ntfs_error(vi->i_sb, "Directory index is corrupt."); - goto unm_err_out; - } - if (ir->type != AT_FILE_NAME) { - ntfs_error(vi->i_sb, "Indexed attribute is not " - "$FILE_NAME."); - goto unm_err_out; - } - if (ir->collation_rule != COLLATION_FILE_NAME) { - ntfs_error(vi->i_sb, "Index collation rule is not " - "COLLATION_FILE_NAME."); - goto unm_err_out; - } - ni->itype.index.collation_rule = ir->collation_rule; - ni->itype.index.block_size = le32_to_cpu(ir->index_block_size); - if (ni->itype.index.block_size & - (ni->itype.index.block_size - 1)) { - ntfs_error(vi->i_sb, "Index block size (%u) is not a " - "power of two.", - ni->itype.index.block_size); - goto unm_err_out; - } - if (ni->itype.index.block_size > PAGE_SIZE) { - ntfs_error(vi->i_sb, "Index block size (%u) > " - "PAGE_SIZE (%ld) is not " - "supported. Sorry.", - ni->itype.index.block_size, - PAGE_SIZE); - err = -EOPNOTSUPP; - goto unm_err_out; - } - if (ni->itype.index.block_size < NTFS_BLOCK_SIZE) { - ntfs_error(vi->i_sb, "Index block size (%u) < " - "NTFS_BLOCK_SIZE (%i) is not " - "supported. Sorry.", - ni->itype.index.block_size, - NTFS_BLOCK_SIZE); - err = -EOPNOTSUPP; - goto unm_err_out; - } - ni->itype.index.block_size_bits = - ffs(ni->itype.index.block_size) - 1; - /* Determine the size of a vcn in the directory index. */ - if (vol->cluster_size <= ni->itype.index.block_size) { - ni->itype.index.vcn_size = vol->cluster_size; - ni->itype.index.vcn_size_bits = vol->cluster_size_bits; - } else { - ni->itype.index.vcn_size = vol->sector_size; - ni->itype.index.vcn_size_bits = vol->sector_size_bits; - } - - /* Setup the index allocation attribute, even if not present. */ - NInoSetMstProtected(ni); - ni->type = AT_INDEX_ALLOCATION; - ni->name = I30; - ni->name_len = 4; - - if (!(ir->index.flags & LARGE_INDEX)) { - /* No index allocation. */ - vi->i_size = ni->initialized_size = - ni->allocated_size = 0; - /* We are done with the mft record, so we release it. */ - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(ni); - m = NULL; - ctx = NULL; - goto skip_large_dir_stuff; - } /* LARGE_INDEX: Index allocation present. Setup state. */ - NInoSetIndexAllocPresent(ni); - /* Find index allocation attribute. */ - ntfs_attr_reinit_search_ctx(ctx); - err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, I30, 4, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - ntfs_error(vi->i_sb, "$INDEX_ALLOCATION " - "attribute is not present but " - "$INDEX_ROOT indicated it is."); - else - ntfs_error(vi->i_sb, "Failed to lookup " - "$INDEX_ALLOCATION " - "attribute."); - goto unm_err_out; - } - a = ctx->attr; - if (!a->non_resident) { - ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " - "is resident."); - goto unm_err_out; - } - /* - * Ensure the attribute name is placed before the mapping pairs - * array. 
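The index block validation above boils down to three geometry rules; a compact, hypothetical helper that mirrors them:

static int example_check_index_block_size(u32 blk, u32 cluster_size,
		u32 sector_size, u32 *vcn_size)
{
	if (blk & (blk - 1))
		return -EIO;		/* must be a power of two */
	if (blk > PAGE_SIZE || blk < NTFS_BLOCK_SIZE)
		return -EOPNOTSUPP;	/* must fit a page and be >= 512 bytes */
	/* A vcn covers a cluster when clusters fit in a block, else a sector. */
	*vcn_size = (cluster_size <= blk) ? cluster_size : sector_size;
	return 0;
}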
- */ - if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= - le16_to_cpu( - a->data.non_resident.mapping_pairs_offset)))) { - ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name " - "is placed after the mapping pairs " - "array."); - goto unm_err_out; - } - if (a->flags & ATTR_IS_ENCRYPTED) { - ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " - "is encrypted."); - goto unm_err_out; - } - if (a->flags & ATTR_IS_SPARSE) { - ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " - "is sparse."); - goto unm_err_out; - } - if (a->flags & ATTR_COMPRESSION_MASK) { - ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " - "is compressed."); - goto unm_err_out; - } - if (a->data.non_resident.lowest_vcn) { - ntfs_error(vi->i_sb, "First extent of " - "$INDEX_ALLOCATION attribute has non " - "zero lowest_vcn."); - goto unm_err_out; - } - vi->i_size = sle64_to_cpu(a->data.non_resident.data_size); - ni->initialized_size = sle64_to_cpu( - a->data.non_resident.initialized_size); - ni->allocated_size = sle64_to_cpu( - a->data.non_resident.allocated_size); - /* - * We are done with the mft record, so we release it. Otherwise - * we would deadlock in ntfs_attr_iget(). - */ - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(ni); - m = NULL; - ctx = NULL; - /* Get the index bitmap attribute inode. */ - bvi = ntfs_attr_iget(vi, AT_BITMAP, I30, 4); - if (IS_ERR(bvi)) { - ntfs_error(vi->i_sb, "Failed to get bitmap attribute."); - err = PTR_ERR(bvi); - goto unm_err_out; - } - bni = NTFS_I(bvi); - if (NInoCompressed(bni) || NInoEncrypted(bni) || - NInoSparse(bni)) { - ntfs_error(vi->i_sb, "$BITMAP attribute is compressed " - "and/or encrypted and/or sparse."); - goto iput_unm_err_out; - } - /* Consistency check bitmap size vs. index allocation size. */ - bvi_size = i_size_read(bvi); - if ((bvi_size << 3) < (vi->i_size >> - ni->itype.index.block_size_bits)) { - ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) " - "for index allocation (0x%llx).", - bvi_size << 3, vi->i_size); - goto iput_unm_err_out; - } - /* No longer need the bitmap attribute inode. */ - iput(bvi); -skip_large_dir_stuff: - /* Setup the operations for this inode. */ - vi->i_op = &ntfs_dir_inode_ops; - vi->i_fop = &ntfs_dir_ops; - vi->i_mapping->a_ops = &ntfs_mst_aops; - } else { - /* It is a file. */ - ntfs_attr_reinit_search_ctx(ctx); - - /* Setup the data attribute, even if not present. */ - ni->type = AT_DATA; - ni->name = NULL; - ni->name_len = 0; - - /* Find first extent of the unnamed data attribute. */ - err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, 0, NULL, 0, ctx); - if (unlikely(err)) { - vi->i_size = ni->initialized_size = - ni->allocated_size = 0; - if (err != -ENOENT) { - ntfs_error(vi->i_sb, "Failed to lookup $DATA " - "attribute."); - goto unm_err_out; - } - /* - * FILE_Secure does not have an unnamed $DATA - * attribute, so we special case it here. - */ - if (vi->i_ino == FILE_Secure) - goto no_data_attr_special_case; - /* - * Most if not all the system files in the $Extend - * system directory do not have unnamed data - * attributes so we need to check if the parent - * directory of the file is FILE_Extend and if it is - * ignore this error. To do this we need to get the - * name of this inode from the mft record as the name - * contains the back reference to the parent directory. - */ - if (ntfs_is_extended_system_file(ctx) > 0) - goto no_data_attr_special_case; - // FIXME: File is corrupt! Hot-fix with empty data - // attribute if recovery option is set. 
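The bitmap consistency test above is one bit per index block; restated as a hypothetical helper, with a worked case: a 64 KiB index allocation with 4 KiB index blocks has 16 blocks, so the bitmap must be at least 16 bits (2 bytes) long.

static bool example_bitmap_big_enough(loff_t bitmap_bytes, loff_t alloc_bytes,
		u8 block_size_bits)
{
	/* One bitmap bit covers one index block. */
	return (bitmap_bytes << 3) >= (alloc_bytes >> block_size_bits);
}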
- ntfs_error(vi->i_sb, "$DATA attribute is missing."); - goto unm_err_out; - } - a = ctx->attr; - /* Setup the state. */ - if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) { - if (a->flags & ATTR_COMPRESSION_MASK) { - NInoSetCompressed(ni); - if (vol->cluster_size > 4096) { - ntfs_error(vi->i_sb, "Found " - "compressed data but " - "compression is " - "disabled due to " - "cluster size (%i) > " - "4kiB.", - vol->cluster_size); - goto unm_err_out; - } - if ((a->flags & ATTR_COMPRESSION_MASK) - != ATTR_IS_COMPRESSED) { - ntfs_error(vi->i_sb, "Found unknown " - "compression method " - "or corrupt file."); - goto unm_err_out; - } - } - if (a->flags & ATTR_IS_SPARSE) - NInoSetSparse(ni); - } - if (a->flags & ATTR_IS_ENCRYPTED) { - if (NInoCompressed(ni)) { - ntfs_error(vi->i_sb, "Found encrypted and " - "compressed data."); - goto unm_err_out; - } - NInoSetEncrypted(ni); - } - if (a->non_resident) { - NInoSetNonResident(ni); - if (NInoCompressed(ni) || NInoSparse(ni)) { - if (NInoCompressed(ni) && a->data.non_resident. - compression_unit != 4) { - ntfs_error(vi->i_sb, "Found " - "non-standard " - "compression unit (%u " - "instead of 4). " - "Cannot handle this.", - a->data.non_resident. - compression_unit); - err = -EOPNOTSUPP; - goto unm_err_out; - } - if (a->data.non_resident.compression_unit) { - ni->itype.compressed.block_size = 1U << - (a->data.non_resident. - compression_unit + - vol->cluster_size_bits); - ni->itype.compressed.block_size_bits = - ffs(ni->itype. - compressed. - block_size) - 1; - ni->itype.compressed.block_clusters = - 1U << a->data. - non_resident. - compression_unit; - } else { - ni->itype.compressed.block_size = 0; - ni->itype.compressed.block_size_bits = - 0; - ni->itype.compressed.block_clusters = - 0; - } - ni->itype.compressed.size = sle64_to_cpu( - a->data.non_resident. - compressed_size); - } - if (a->data.non_resident.lowest_vcn) { - ntfs_error(vi->i_sb, "First extent of $DATA " - "attribute has non zero " - "lowest_vcn."); - goto unm_err_out; - } - vi->i_size = sle64_to_cpu( - a->data.non_resident.data_size); - ni->initialized_size = sle64_to_cpu( - a->data.non_resident.initialized_size); - ni->allocated_size = sle64_to_cpu( - a->data.non_resident.allocated_size); - } else { /* Resident attribute. */ - vi->i_size = ni->initialized_size = le32_to_cpu( - a->data.resident.value_length); - ni->allocated_size = le32_to_cpu(a->length) - - le16_to_cpu( - a->data.resident.value_offset); - if (vi->i_size > ni->allocated_size) { - ntfs_error(vi->i_sb, "Resident data attribute " - "is corrupt (size exceeds " - "allocation)."); - goto unm_err_out; - } - } -no_data_attr_special_case: - /* We are done with the mft record, so we release it. */ - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(ni); - m = NULL; - ctx = NULL; - /* Setup the operations for this inode. */ - vi->i_op = &ntfs_file_inode_ops; - vi->i_fop = &ntfs_file_ops; - vi->i_mapping->a_ops = &ntfs_normal_aops; - if (NInoMstProtected(ni)) - vi->i_mapping->a_ops = &ntfs_mst_aops; - else if (NInoCompressed(ni)) - vi->i_mapping->a_ops = &ntfs_compressed_aops; - } - /* - * The number of 512-byte blocks used on disk (for stat). This is in so - * far inaccurate as it doesn't account for any named streams or other - * special non-resident attributes, but that is how Windows works, too, - * so we are at least consistent with Windows, if not entirely - * consistent with the Linux Way. 
Doing it the Linux Way would cause a - * significant slowdown as it would involve iterating over all - * attributes in the mft record and adding the allocated/compressed - * sizes of all non-resident attributes present to give us the Linux - * correct size that should go into i_blocks (after division by 512). - */ - if (S_ISREG(vi->i_mode) && (NInoCompressed(ni) || NInoSparse(ni))) - vi->i_blocks = ni->itype.compressed.size >> 9; - else - vi->i_blocks = ni->allocated_size >> 9; - ntfs_debug("Done."); - return 0; -iput_unm_err_out: - iput(bvi); -unm_err_out: - if (!err) - err = -EIO; - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(ni); -err_out: - ntfs_error(vol->sb, "Failed with error code %i. Marking corrupt " - "inode 0x%lx as bad. Run chkdsk.", err, vi->i_ino); - make_bad_inode(vi); - if (err != -EOPNOTSUPP && err != -ENOMEM) - NVolSetErrors(vol); - return err; -} - -/** - * ntfs_read_locked_attr_inode - read an attribute inode from its base inode - * @base_vi: base inode - * @vi: attribute inode to read - * - * ntfs_read_locked_attr_inode() is called from ntfs_attr_iget() to read the - * attribute inode described by @vi into memory from the base mft record - * described by @base_ni. - * - * ntfs_read_locked_attr_inode() maps, pins and locks the base inode for - * reading and looks up the attribute described by @vi before setting up the - * necessary fields in @vi as well as initializing the ntfs inode. - * - * Q: What locks are held when the function is called? - * A: i_state has I_NEW set, hence the inode is locked, also - * i_count is set to 1, so it is not going to go away - * - * Return 0 on success and -errno on error. In the error case, the inode will - * have had make_bad_inode() executed on it. - * - * Note this cannot be called for AT_INDEX_ALLOCATION. - */ -static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi) -{ - ntfs_volume *vol = NTFS_SB(vi->i_sb); - ntfs_inode *ni, *base_ni; - MFT_RECORD *m; - ATTR_RECORD *a; - ntfs_attr_search_ctx *ctx; - int err = 0; - - ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino); - - ntfs_init_big_inode(vi); - - ni = NTFS_I(vi); - base_ni = NTFS_I(base_vi); - - /* Just mirror the values from the base inode. */ - vi->i_uid = base_vi->i_uid; - vi->i_gid = base_vi->i_gid; - set_nlink(vi, base_vi->i_nlink); - inode_set_mtime_to_ts(vi, inode_get_mtime(base_vi)); - inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi)); - inode_set_atime_to_ts(vi, inode_get_atime(base_vi)); - vi->i_generation = ni->seq_no = base_ni->seq_no; - - /* Set inode type to zero but preserve permissions. */ - vi->i_mode = base_vi->i_mode & ~S_IFMT; - - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (!ctx) { - err = -ENOMEM; - goto unm_err_out; - } - /* Find the attribute. */ - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) - goto unm_err_out; - a = ctx->attr; - if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) { - if (a->flags & ATTR_COMPRESSION_MASK) { - NInoSetCompressed(ni); - if ((ni->type != AT_DATA) || (ni->type == AT_DATA && - ni->name_len)) { - ntfs_error(vi->i_sb, "Found compressed " - "non-data or named data " - "attribute. Please report " - "you saw this message to " - "linux-ntfs-dev@lists." 
- "sourceforge.net"); - goto unm_err_out; - } - if (vol->cluster_size > 4096) { - ntfs_error(vi->i_sb, "Found compressed " - "attribute but compression is " - "disabled due to cluster size " - "(%i) > 4kiB.", - vol->cluster_size); - goto unm_err_out; - } - if ((a->flags & ATTR_COMPRESSION_MASK) != - ATTR_IS_COMPRESSED) { - ntfs_error(vi->i_sb, "Found unknown " - "compression method."); - goto unm_err_out; - } - } - /* - * The compressed/sparse flag set in an index root just means - * to compress all files. - */ - if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) { - ntfs_error(vi->i_sb, "Found mst protected attribute " - "but the attribute is %s. Please " - "report you saw this message to " - "linux-ntfs-dev@lists.sourceforge.net", - NInoCompressed(ni) ? "compressed" : - "sparse"); - goto unm_err_out; - } - if (a->flags & ATTR_IS_SPARSE) - NInoSetSparse(ni); - } - if (a->flags & ATTR_IS_ENCRYPTED) { - if (NInoCompressed(ni)) { - ntfs_error(vi->i_sb, "Found encrypted and compressed " - "data."); - goto unm_err_out; - } - /* - * The encryption flag set in an index root just means to - * encrypt all files. - */ - if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) { - ntfs_error(vi->i_sb, "Found mst protected attribute " - "but the attribute is encrypted. " - "Please report you saw this message " - "to linux-ntfs-dev@lists.sourceforge." - "net"); - goto unm_err_out; - } - if (ni->type != AT_DATA) { - ntfs_error(vi->i_sb, "Found encrypted non-data " - "attribute."); - goto unm_err_out; - } - NInoSetEncrypted(ni); - } - if (!a->non_resident) { - /* Ensure the attribute name is placed before the value. */ - if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= - le16_to_cpu(a->data.resident.value_offset)))) { - ntfs_error(vol->sb, "Attribute name is placed after " - "the attribute value."); - goto unm_err_out; - } - if (NInoMstProtected(ni)) { - ntfs_error(vi->i_sb, "Found mst protected attribute " - "but the attribute is resident. " - "Please report you saw this message to " - "linux-ntfs-dev@lists.sourceforge.net"); - goto unm_err_out; - } - vi->i_size = ni->initialized_size = le32_to_cpu( - a->data.resident.value_length); - ni->allocated_size = le32_to_cpu(a->length) - - le16_to_cpu(a->data.resident.value_offset); - if (vi->i_size > ni->allocated_size) { - ntfs_error(vi->i_sb, "Resident attribute is corrupt " - "(size exceeds allocation)."); - goto unm_err_out; - } - } else { - NInoSetNonResident(ni); - /* - * Ensure the attribute name is placed before the mapping pairs - * array. - */ - if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= - le16_to_cpu( - a->data.non_resident.mapping_pairs_offset)))) { - ntfs_error(vol->sb, "Attribute name is placed after " - "the mapping pairs array."); - goto unm_err_out; - } - if (NInoCompressed(ni) || NInoSparse(ni)) { - if (NInoCompressed(ni) && a->data.non_resident. - compression_unit != 4) { - ntfs_error(vi->i_sb, "Found non-standard " - "compression unit (%u instead " - "of 4). Cannot handle this.", - a->data.non_resident. - compression_unit); - err = -EOPNOTSUPP; - goto unm_err_out; - } - if (a->data.non_resident.compression_unit) { - ni->itype.compressed.block_size = 1U << - (a->data.non_resident. - compression_unit + - vol->cluster_size_bits); - ni->itype.compressed.block_size_bits = - ffs(ni->itype.compressed. - block_size) - 1; - ni->itype.compressed.block_clusters = 1U << - a->data.non_resident. 
- compression_unit; - } else { - ni->itype.compressed.block_size = 0; - ni->itype.compressed.block_size_bits = 0; - ni->itype.compressed.block_clusters = 0; - } - ni->itype.compressed.size = sle64_to_cpu( - a->data.non_resident.compressed_size); - } - if (a->data.non_resident.lowest_vcn) { - ntfs_error(vi->i_sb, "First extent of attribute has " - "non-zero lowest_vcn."); - goto unm_err_out; - } - vi->i_size = sle64_to_cpu(a->data.non_resident.data_size); - ni->initialized_size = sle64_to_cpu( - a->data.non_resident.initialized_size); - ni->allocated_size = sle64_to_cpu( - a->data.non_resident.allocated_size); - } - vi->i_mapping->a_ops = &ntfs_normal_aops; - if (NInoMstProtected(ni)) - vi->i_mapping->a_ops = &ntfs_mst_aops; - else if (NInoCompressed(ni)) - vi->i_mapping->a_ops = &ntfs_compressed_aops; - if ((NInoCompressed(ni) || NInoSparse(ni)) && ni->type != AT_INDEX_ROOT) - vi->i_blocks = ni->itype.compressed.size >> 9; - else - vi->i_blocks = ni->allocated_size >> 9; - /* - * Make sure the base inode does not go away and attach it to the - * attribute inode. - */ - igrab(base_vi); - ni->ext.base_ntfs_ino = base_ni; - ni->nr_extents = -1; - - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - - ntfs_debug("Done."); - return 0; - -unm_err_out: - if (!err) - err = -EIO; - if (ctx) - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); -err_out: - ntfs_error(vol->sb, "Failed with error code %i while reading attribute " - "inode (mft_no 0x%lx, type 0x%x, name_len %i). " - "Marking corrupt inode and base inode 0x%lx as bad. " - "Run chkdsk.", err, vi->i_ino, ni->type, ni->name_len, - base_vi->i_ino); - make_bad_inode(vi); - if (err != -ENOMEM) - NVolSetErrors(vol); - return err; -} - -/** - * ntfs_read_locked_index_inode - read an index inode from its base inode - * @base_vi: base inode - * @vi: index inode to read - * - * ntfs_read_locked_index_inode() is called from ntfs_index_iget() to read the - * index inode described by @vi into memory from the base mft record described - * by @base_ni. - * - * ntfs_read_locked_index_inode() maps, pins and locks the base inode for - * reading and looks up the attributes relating to the index described by @vi - * before setting up the necessary fields in @vi as well as initializing the - * ntfs inode. - * - * Note, index inodes are essentially attribute inodes (NInoAttr() is true) - * with the attribute type set to AT_INDEX_ALLOCATION. Apart from that, they - * are set up like directory inodes since directories are a special case of - * indices so they need to be treated in much the same way. Most importantly, - * for small indices the index allocation attribute might not actually exist. - * However, the index root attribute always exists but this does not need to - * have an inode associated with it and this is why we define a new inode type - * index. Also, like for directories, we need to have an attribute inode for - * the bitmap attribute corresponding to the index allocation attribute and we - * can store this in the appropriate field of the inode, just like we do for - * normal directory inodes. - * - * Q: What locks are held when the function is called? - * A: i_state has I_NEW set, hence the inode is locked, also - * i_count is set to 1, so it is not going to go away - * - * Return 0 on success and -errno on error. In the error case, the inode will - * have had make_bad_inode() executed on it.
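A small note on the i_blocks assignments above: i_blocks is counted in 512-byte units (hence the >> 9), and compressed or sparse data attributes report the compressed size instead of the allocated size. For example:

/* allocated_size of 1 MiB reports 2048 blocks of 512 bytes each: */
vi->i_blocks = (1024 * 1024) >> 9;	/* == 2048 */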
- */ -static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi) -{ - loff_t bvi_size; - ntfs_volume *vol = NTFS_SB(vi->i_sb); - ntfs_inode *ni, *base_ni, *bni; - struct inode *bvi; - MFT_RECORD *m; - ATTR_RECORD *a; - ntfs_attr_search_ctx *ctx; - INDEX_ROOT *ir; - u8 *ir_end, *index_end; - int err = 0; - - ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino); - ntfs_init_big_inode(vi); - ni = NTFS_I(vi); - base_ni = NTFS_I(base_vi); - /* Just mirror the values from the base inode. */ - vi->i_uid = base_vi->i_uid; - vi->i_gid = base_vi->i_gid; - set_nlink(vi, base_vi->i_nlink); - inode_set_mtime_to_ts(vi, inode_get_mtime(base_vi)); - inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi)); - inode_set_atime_to_ts(vi, inode_get_atime(base_vi)); - vi->i_generation = ni->seq_no = base_ni->seq_no; - /* Set inode type to zero but preserve permissions. */ - vi->i_mode = base_vi->i_mode & ~S_IFMT; - /* Map the mft record for the base inode. */ - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (!ctx) { - err = -ENOMEM; - goto unm_err_out; - } - /* Find the index root attribute. */ - err = ntfs_attr_lookup(AT_INDEX_ROOT, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is " - "missing."); - goto unm_err_out; - } - a = ctx->attr; - /* Set up the state. */ - if (unlikely(a->non_resident)) { - ntfs_error(vol->sb, "$INDEX_ROOT attribute is not resident."); - goto unm_err_out; - } - /* Ensure the attribute name is placed before the value. */ - if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= - le16_to_cpu(a->data.resident.value_offset)))) { - ntfs_error(vol->sb, "$INDEX_ROOT attribute name is placed " - "after the attribute value."); - goto unm_err_out; - } - /* - * Compressed/encrypted/sparse index root is not allowed, except for - * directories of course but those are not dealt with here. - */ - if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_ENCRYPTED | - ATTR_IS_SPARSE)) { - ntfs_error(vi->i_sb, "Found compressed/encrypted/sparse index " - "root attribute."); - goto unm_err_out; - } - ir = (INDEX_ROOT*)((u8*)a + le16_to_cpu(a->data.resident.value_offset)); - ir_end = (u8*)ir + le32_to_cpu(a->data.resident.value_length); - if (ir_end > (u8*)ctx->mrec + vol->mft_record_size) { - ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is corrupt."); - goto unm_err_out; - } - index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); - if (index_end > ir_end) { - ntfs_error(vi->i_sb, "Index is corrupt."); - goto unm_err_out; - } - if (ir->type) { - ntfs_error(vi->i_sb, "Index type is not 0 (type is 0x%x).", - le32_to_cpu(ir->type)); - goto unm_err_out; - } - ni->itype.index.collation_rule = ir->collation_rule; - ntfs_debug("Index collation rule is 0x%x.", - le32_to_cpu(ir->collation_rule)); - ni->itype.index.block_size = le32_to_cpu(ir->index_block_size); - if (!is_power_of_2(ni->itype.index.block_size)) { - ntfs_error(vi->i_sb, "Index block size (%u) is not a power of " - "two.", ni->itype.index.block_size); - goto unm_err_out; - } - if (ni->itype.index.block_size > PAGE_SIZE) { - ntfs_error(vi->i_sb, "Index block size (%u) > PAGE_SIZE " - "(%ld) is not supported. 
Sorry.", - ni->itype.index.block_size, PAGE_SIZE); - err = -EOPNOTSUPP; - goto unm_err_out; - } - if (ni->itype.index.block_size < NTFS_BLOCK_SIZE) { - ntfs_error(vi->i_sb, "Index block size (%u) < NTFS_BLOCK_SIZE " - "(%i) is not supported. Sorry.", - ni->itype.index.block_size, NTFS_BLOCK_SIZE); - err = -EOPNOTSUPP; - goto unm_err_out; - } - ni->itype.index.block_size_bits = ffs(ni->itype.index.block_size) - 1; - /* Determine the size of a vcn in the index. */ - if (vol->cluster_size <= ni->itype.index.block_size) { - ni->itype.index.vcn_size = vol->cluster_size; - ni->itype.index.vcn_size_bits = vol->cluster_size_bits; - } else { - ni->itype.index.vcn_size = vol->sector_size; - ni->itype.index.vcn_size_bits = vol->sector_size_bits; - } - /* Check for presence of index allocation attribute. */ - if (!(ir->index.flags & LARGE_INDEX)) { - /* No index allocation. */ - vi->i_size = ni->initialized_size = ni->allocated_size = 0; - /* We are done with the mft record, so we release it. */ - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - m = NULL; - ctx = NULL; - goto skip_large_index_stuff; - } /* LARGE_INDEX: Index allocation present. Setup state. */ - NInoSetIndexAllocPresent(ni); - /* Find index allocation attribute. */ - ntfs_attr_reinit_search_ctx(ctx); - err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " - "not present but $INDEX_ROOT " - "indicated it is."); - else - ntfs_error(vi->i_sb, "Failed to lookup " - "$INDEX_ALLOCATION attribute."); - goto unm_err_out; - } - a = ctx->attr; - if (!a->non_resident) { - ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " - "resident."); - goto unm_err_out; - } - /* - * Ensure the attribute name is placed before the mapping pairs array. - */ - if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= - le16_to_cpu( - a->data.non_resident.mapping_pairs_offset)))) { - ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name is " - "placed after the mapping pairs array."); - goto unm_err_out; - } - if (a->flags & ATTR_IS_ENCRYPTED) { - ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " - "encrypted."); - goto unm_err_out; - } - if (a->flags & ATTR_IS_SPARSE) { - ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is sparse."); - goto unm_err_out; - } - if (a->flags & ATTR_COMPRESSION_MASK) { - ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " - "compressed."); - goto unm_err_out; - } - if (a->data.non_resident.lowest_vcn) { - ntfs_error(vi->i_sb, "First extent of $INDEX_ALLOCATION " - "attribute has non zero lowest_vcn."); - goto unm_err_out; - } - vi->i_size = sle64_to_cpu(a->data.non_resident.data_size); - ni->initialized_size = sle64_to_cpu( - a->data.non_resident.initialized_size); - ni->allocated_size = sle64_to_cpu(a->data.non_resident.allocated_size); - /* - * We are done with the mft record, so we release it. Otherwise - * we would deadlock in ntfs_attr_iget(). - */ - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - m = NULL; - ctx = NULL; - /* Get the index bitmap attribute inode. 
*/ - bvi = ntfs_attr_iget(base_vi, AT_BITMAP, ni->name, ni->name_len); - if (IS_ERR(bvi)) { - ntfs_error(vi->i_sb, "Failed to get bitmap attribute."); - err = PTR_ERR(bvi); - goto unm_err_out; - } - bni = NTFS_I(bvi); - if (NInoCompressed(bni) || NInoEncrypted(bni) || - NInoSparse(bni)) { - ntfs_error(vi->i_sb, "$BITMAP attribute is compressed and/or " - "encrypted and/or sparse."); - goto iput_unm_err_out; - } - /* Consistency check bitmap size vs. index allocation size. */ - bvi_size = i_size_read(bvi); - if ((bvi_size << 3) < (vi->i_size >> ni->itype.index.block_size_bits)) { - ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) for " - "index allocation (0x%llx).", bvi_size << 3, - vi->i_size); - goto iput_unm_err_out; - } - iput(bvi); -skip_large_index_stuff: - /* Setup the operations for this index inode. */ - vi->i_mapping->a_ops = &ntfs_mst_aops; - vi->i_blocks = ni->allocated_size >> 9; - /* - * Make sure the base inode doesn't go away and attach it to the - * index inode. - */ - igrab(base_vi); - ni->ext.base_ntfs_ino = base_ni; - ni->nr_extents = -1; - - ntfs_debug("Done."); - return 0; -iput_unm_err_out: - iput(bvi); -unm_err_out: - if (!err) - err = -EIO; - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(base_ni); -err_out: - ntfs_error(vi->i_sb, "Failed with error code %i while reading index " - "inode (mft_no 0x%lx, name_len %i).", err, vi->i_ino, - ni->name_len); - make_bad_inode(vi); - if (err != -EOPNOTSUPP && err != -ENOMEM) - NVolSetErrors(vol); - return err; -} - -/* - * The MFT inode has special locking, so teach the lock validator - * about this by splitting off the locking rules of the MFT from - * the locking rules of other inodes. The MFT inode can never be - * accessed from the VFS side (or even internally), only by the - * map_mft functions. - */ -static struct lock_class_key mft_ni_runlist_lock_key, mft_ni_mrec_lock_key; - -/** - * ntfs_read_inode_mount - special read_inode for mount time use only - * @vi: inode to read - * - * Read inode FILE_MFT at mount time, only called with super_block lock - * held from within the read_super() code path. - * - * This function exists because when it is called the page cache for $MFT/$DATA - * is not initialized and hence we cannot get at the contents of mft records - * by calling map_mft_record*(). - * - * Further, it needs to cope with the circular references problem, i.e. it - * cannot load any attributes other than $ATTRIBUTE_LIST until $DATA is loaded, - * because we do not know where the other extent mft records are yet and again, - * because we cannot call map_mft_record*() yet. Obviously this applies only - * when an attribute list is actually present in $MFT inode. - * - * We solve these problems by starting with the $DATA attribute before anything - * else and iterating using ntfs_attr_lookup($DATA) over all extents. As each - * extent is found, we ntfs_mapping_pairs_decompress() including the implied - * ntfs_runlists_merge(). Each step of the iteration necessarily provides - * sufficient information for the next step to complete. - * - * This should work, but there are two possible pitfalls (see inline comments - * below); only time will tell if they are real pits or just smoke...
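The mount-time path below applies post_read_mst_fixup() to the raw $MFT record; as background, a hedged sketch of multi-sector transfer (MST) verification, assuming the standard NTFS record header layout (update sequence array offset at byte 4, count at byte 6) and an invented function name:

static int example_post_read_mst_fixup(u8 *rec, u32 size)
{
	u16 usa_ofs = le16_to_cpu(*(le16 *)(rec + 4));
	u16 usa_count = le16_to_cpu(*(le16 *)(rec + 6));
	le16 *usa = (le16 *)(rec + usa_ofs);
	u32 pos = NTFS_BLOCK_SIZE - 2;	/* last 2 bytes of each 512-byte chunk */
	u16 i;

	for (i = 1; i < usa_count && pos < size; i++, pos += NTFS_BLOCK_SIZE) {
		if (*(le16 *)(rec + pos) != usa[0])
			return -EIO;		/* torn multi-sector write */
		*(le16 *)(rec + pos) = usa[i];	/* restore the saved bytes */
	}
	return 0;
}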
- */ -int ntfs_read_inode_mount(struct inode *vi) -{ - VCN next_vcn, last_vcn, highest_vcn; - s64 block; - struct super_block *sb = vi->i_sb; - ntfs_volume *vol = NTFS_SB(sb); - struct buffer_head *bh; - ntfs_inode *ni; - MFT_RECORD *m = NULL; - ATTR_RECORD *a; - ntfs_attr_search_ctx *ctx; - unsigned int i, nr_blocks; - int err; - - ntfs_debug("Entering."); - - /* Initialize the ntfs specific part of @vi. */ - ntfs_init_big_inode(vi); - - ni = NTFS_I(vi); - - /* Setup the data attribute. It is special as it is mst protected. */ - NInoSetNonResident(ni); - NInoSetMstProtected(ni); - NInoSetSparseDisabled(ni); - ni->type = AT_DATA; - ni->name = NULL; - ni->name_len = 0; - /* - * This sets up our little cheat allowing us to reuse the async read io - * completion handler for directories. - */ - ni->itype.index.block_size = vol->mft_record_size; - ni->itype.index.block_size_bits = vol->mft_record_size_bits; - - /* Very important! Needed to be able to call map_mft_record*(). */ - vol->mft_ino = vi; - - /* Allocate enough memory to read the first mft record. */ - if (vol->mft_record_size > 64 * 1024) { - ntfs_error(sb, "Unsupported mft record size %i (max 64kiB).", - vol->mft_record_size); - goto err_out; - } - i = vol->mft_record_size; - if (i < sb->s_blocksize) - i = sb->s_blocksize; - m = (MFT_RECORD*)ntfs_malloc_nofs(i); - if (!m) { - ntfs_error(sb, "Failed to allocate buffer for $MFT record 0."); - goto err_out; - } - - /* Determine the first block of the $MFT/$DATA attribute. */ - block = vol->mft_lcn << vol->cluster_size_bits >> - sb->s_blocksize_bits; - nr_blocks = vol->mft_record_size >> sb->s_blocksize_bits; - if (!nr_blocks) - nr_blocks = 1; - - /* Load $MFT/$DATA's first mft record. */ - for (i = 0; i < nr_blocks; i++) { - bh = sb_bread(sb, block++); - if (!bh) { - ntfs_error(sb, "Device read failed."); - goto err_out; - } - memcpy((char*)m + (i << sb->s_blocksize_bits), bh->b_data, - sb->s_blocksize); - brelse(bh); - } - - if (le32_to_cpu(m->bytes_allocated) != vol->mft_record_size) { - ntfs_error(sb, "Incorrect mft record size %u in superblock, should be %u.", - le32_to_cpu(m->bytes_allocated), vol->mft_record_size); - goto err_out; - } - - /* Apply the mst fixups. */ - if (post_read_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size)) { - /* FIXME: Try to use the $MFTMirr now. */ - ntfs_error(sb, "MST fixup failed. $MFT is corrupt."); - goto err_out; - } - - /* Sanity check offset to the first attribute */ - if (le16_to_cpu(m->attrs_offset) >= le32_to_cpu(m->bytes_allocated)) { - ntfs_error(sb, "Incorrect mft offset to the first attribute %u in superblock.", - le16_to_cpu(m->attrs_offset)); - goto err_out; - } - - /* Need this to sanity check attribute list references to $MFT. */ - vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number); - - /* Provides read_folio() for map_mft_record(). */ - vi->i_mapping->a_ops = &ntfs_mst_aops; - - ctx = ntfs_attr_get_search_ctx(ni, m); - if (!ctx) { - err = -ENOMEM; - goto err_out; - } - - /* Find the attribute list attribute if present. */ - err = ntfs_attr_lookup(AT_ATTRIBUTE_LIST, NULL, 0, 0, 0, NULL, 0, ctx); - if (err) { - if (unlikely(err != -ENOENT)) { - ntfs_error(sb, "Failed to lookup attribute list " - "attribute. You should run chkdsk."); - goto put_err_out; - } - } else /* if (!err) */ { - ATTR_LIST_ENTRY *al_entry, *next_al_entry; - u8 *al_end; - static const char *es = " Not allowed. $MFT is corrupt. 
" - "You should run chkdsk."; - - ntfs_debug("Attribute list attribute found in $MFT."); - NInoSetAttrList(ni); - a = ctx->attr; - if (a->flags & ATTR_COMPRESSION_MASK) { - ntfs_error(sb, "Attribute list attribute is " - "compressed.%s", es); - goto put_err_out; - } - if (a->flags & ATTR_IS_ENCRYPTED || - a->flags & ATTR_IS_SPARSE) { - if (a->non_resident) { - ntfs_error(sb, "Non-resident attribute list " - "attribute is encrypted/" - "sparse.%s", es); - goto put_err_out; - } - ntfs_warning(sb, "Resident attribute list attribute " - "in $MFT system file is marked " - "encrypted/sparse which is not true. " - "However, Windows allows this and " - "chkdsk does not detect or correct it " - "so we will just ignore the invalid " - "flags and pretend they are not set."); - } - /* Now allocate memory for the attribute list. */ - ni->attr_list_size = (u32)ntfs_attr_size(a); - if (!ni->attr_list_size) { - ntfs_error(sb, "Attr_list_size is zero"); - goto put_err_out; - } - ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size); - if (!ni->attr_list) { - ntfs_error(sb, "Not enough memory to allocate buffer " - "for attribute list."); - goto put_err_out; - } - if (a->non_resident) { - NInoSetAttrListNonResident(ni); - if (a->data.non_resident.lowest_vcn) { - ntfs_error(sb, "Attribute list has non zero " - "lowest_vcn. $MFT is corrupt. " - "You should run chkdsk."); - goto put_err_out; - } - /* Setup the runlist. */ - ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol, - a, NULL); - if (IS_ERR(ni->attr_list_rl.rl)) { - err = PTR_ERR(ni->attr_list_rl.rl); - ni->attr_list_rl.rl = NULL; - ntfs_error(sb, "Mapping pairs decompression " - "failed with error code %i.", - -err); - goto put_err_out; - } - /* Now load the attribute list. */ - if ((err = load_attribute_list(vol, &ni->attr_list_rl, - ni->attr_list, ni->attr_list_size, - sle64_to_cpu(a->data. - non_resident.initialized_size)))) { - ntfs_error(sb, "Failed to load attribute list " - "attribute with error code %i.", - -err); - goto put_err_out; - } - } else /* if (!ctx.attr->non_resident) */ { - if ((u8*)a + le16_to_cpu( - a->data.resident.value_offset) + - le32_to_cpu( - a->data.resident.value_length) > - (u8*)ctx->mrec + vol->mft_record_size) { - ntfs_error(sb, "Corrupt attribute list " - "attribute."); - goto put_err_out; - } - /* Now copy the attribute list. */ - memcpy(ni->attr_list, (u8*)a + le16_to_cpu( - a->data.resident.value_offset), - le32_to_cpu( - a->data.resident.value_length)); - } - /* The attribute list is now setup in memory. */ - /* - * FIXME: I don't know if this case is actually possible. - * According to logic it is not possible but I have seen too - * many weird things in MS software to rely on logic... Thus we - * perform a manual search and make sure the first $MFT/$DATA - * extent is in the base inode. If it is not we abort with an - * error and if we ever see a report of this error we will need - * to do some magic in order to have the necessary mft record - * loaded and in the right place in the page cache. But - * hopefully logic will prevail and this never happens... - */ - al_entry = (ATTR_LIST_ENTRY*)ni->attr_list; - al_end = (u8*)al_entry + ni->attr_list_size; - for (;; al_entry = next_al_entry) { - /* Out of bounds check. */ - if ((u8*)al_entry < ni->attr_list || - (u8*)al_entry > al_end) - goto em_put_err_out; - /* Catch the end of the attribute list. 
*/ - if ((u8*)al_entry == al_end) - goto em_put_err_out; - if (!al_entry->length) - goto em_put_err_out; - if ((u8*)al_entry + 6 > al_end || (u8*)al_entry + - le16_to_cpu(al_entry->length) > al_end) - goto em_put_err_out; - next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry + - le16_to_cpu(al_entry->length)); - if (le32_to_cpu(al_entry->type) > le32_to_cpu(AT_DATA)) - goto em_put_err_out; - if (AT_DATA != al_entry->type) - continue; - /* We want an unnamed attribute. */ - if (al_entry->name_length) - goto em_put_err_out; - /* Want the first entry, i.e. lowest_vcn == 0. */ - if (al_entry->lowest_vcn) - goto em_put_err_out; - /* First entry has to be in the base mft record. */ - if (MREF_LE(al_entry->mft_reference) != vi->i_ino) { - /* MFT references do not match, logic fails. */ - ntfs_error(sb, "BUG: The first $DATA extent " - "of $MFT is not in the base " - "mft record. Please report " - "you saw this message to " - "linux-ntfs-dev@lists." - "sourceforge.net"); - goto put_err_out; - } else { - /* Sequence numbers must match. */ - if (MSEQNO_LE(al_entry->mft_reference) != - ni->seq_no) - goto em_put_err_out; - /* Got it. All is ok. We can stop now. */ - break; - } - } - } - - ntfs_attr_reinit_search_ctx(ctx); - - /* Now load all attribute extents. */ - a = NULL; - next_vcn = last_vcn = highest_vcn = 0; - while (!(err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, next_vcn, NULL, 0, - ctx))) { - runlist_element *nrl; - - /* Cache the current attribute. */ - a = ctx->attr; - /* $MFT must be non-resident. */ - if (!a->non_resident) { - ntfs_error(sb, "$MFT must be non-resident but a " - "resident extent was found. $MFT is " - "corrupt. Run chkdsk."); - goto put_err_out; - } - /* $MFT must be uncompressed and unencrypted. */ - if (a->flags & ATTR_COMPRESSION_MASK || - a->flags & ATTR_IS_ENCRYPTED || - a->flags & ATTR_IS_SPARSE) { - ntfs_error(sb, "$MFT must be uncompressed, " - "non-sparse, and unencrypted but a " - "compressed/sparse/encrypted extent " - "was found. $MFT is corrupt. Run " - "chkdsk."); - goto put_err_out; - } - /* - * Decompress the mapping pairs array of this extent and merge - * the result into the existing runlist. No need for locking - * as we have exclusive access to the inode at this time and we - * are a mount in progress task, too. - */ - nrl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl); - if (IS_ERR(nrl)) { - ntfs_error(sb, "ntfs_mapping_pairs_decompress() " - "failed with error code %ld. $MFT is " - "corrupt.", PTR_ERR(nrl)); - goto put_err_out; - } - ni->runlist.rl = nrl; - - /* Are we in the first extent? */ - if (!next_vcn) { - if (a->data.non_resident.lowest_vcn) { - ntfs_error(sb, "First extent of $DATA " - "attribute has non zero " - "lowest_vcn. $MFT is corrupt. " - "You should run chkdsk."); - goto put_err_out; - } - /* Get the last vcn in the $DATA attribute. */ - last_vcn = sle64_to_cpu( - a->data.non_resident.allocated_size) - >> vol->cluster_size_bits; - /* Fill in the inode size. */ - vi->i_size = sle64_to_cpu( - a->data.non_resident.data_size); - ni->initialized_size = sle64_to_cpu( - a->data.non_resident.initialized_size); - ni->allocated_size = sle64_to_cpu( - a->data.non_resident.allocated_size); - /* - * Verify the number of mft records does not exceed - * 2^32 - 1. - */ - if ((vi->i_size >> vol->mft_record_size_bits) >= - (1ULL << 32)) { - ntfs_error(sb, "$MFT is too big! 
Aborting.");
- goto put_err_out;
- }
- /*
- * We have got the first extent of the runlist for
- * $MFT which means it is now relatively safe to call
- * the normal ntfs_read_inode() function.
- * Complete reading the inode; this will actually
- * re-read the mft record for $MFT, this time entering
- * it into the page cache with which we complete the
- * kick start of the volume. It should be safe to do
- * this now as the first extent of $MFT/$DATA is
- * already known and we would hope that we don't need
- * further extents in order to find the other
- * attributes belonging to $MFT. Only time will tell if
- * this is really the case. If not we will have to play
- * magic at this point, possibly duplicating a lot of
- * ntfs_read_inode(). We will need to
- * ensure we do enough of its work to be able to call
- * ntfs_read_inode() on extents of $MFT/$DATA. But let's
- * hope this never happens...
- */
- ntfs_read_locked_inode(vi);
- if (is_bad_inode(vi)) {
- ntfs_error(sb, "ntfs_read_inode() of $MFT "
- "failed. BUG or corrupt $MFT. "
- "Run chkdsk and if no errors "
- "are found, please report you "
- "saw this message to "
- "linux-ntfs-dev@lists."
- "sourceforge.net");
- ntfs_attr_put_search_ctx(ctx);
- /* Revert to the safe super operations. */
- ntfs_free(m);
- return -1;
- }
- /*
- * Re-initialize some specifics about $MFT's inode as
- * ntfs_read_inode() will have set up the default ones.
- */
- /* Set uid and gid to root. */
- vi->i_uid = GLOBAL_ROOT_UID;
- vi->i_gid = GLOBAL_ROOT_GID;
- /* Regular file. No access for anyone. */
- vi->i_mode = S_IFREG;
- /* No VFS initiated operations allowed for $MFT. */
- vi->i_op = &ntfs_empty_inode_ops;
- vi->i_fop = &ntfs_empty_file_ops;
- }
-
- /* Get the lowest vcn for the next extent. */
- highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
- next_vcn = highest_vcn + 1;
-
- /* Only one extent or error, which we catch below. */
- if (next_vcn <= 0)
- break;
-
- /* Avoid endless loops due to corruption. */
- if (next_vcn < sle64_to_cpu(
- a->data.non_resident.lowest_vcn)) {
- ntfs_error(sb, "$MFT has corrupt attribute list "
- "attribute. Run chkdsk.");
- goto put_err_out;
- }
- }
- if (err != -ENOENT) {
- ntfs_error(sb, "Failed to lookup $MFT/$DATA attribute extent. "
- "$MFT is corrupt. Run chkdsk.");
- goto put_err_out;
- }
- if (!a) {
- ntfs_error(sb, "$MFT/$DATA attribute not found. $MFT is "
- "corrupt. Run chkdsk.");
- goto put_err_out;
- }
- if (highest_vcn && highest_vcn != last_vcn - 1) {
- ntfs_error(sb, "Failed to load the complete runlist for "
- "$MFT/$DATA. Driver bug or corrupt $MFT. "
- "Run chkdsk.");
- ntfs_debug("highest_vcn = 0x%llx, last_vcn - 1 = 0x%llx",
- (unsigned long long)highest_vcn,
- (unsigned long long)last_vcn - 1);
- goto put_err_out;
- }
- ntfs_attr_put_search_ctx(ctx);
- ntfs_debug("Done.");
- ntfs_free(m);
-
- /*
- * Split the locking rules of the MFT inode from the
- * locking rules of other inodes:
- */
- lockdep_set_class(&ni->runlist.lock, &mft_ni_runlist_lock_key);
- lockdep_set_class(&ni->mrec_lock, &mft_ni_mrec_lock_key);
-
- return 0;
-
-em_put_err_out:
- ntfs_error(sb, "Couldn't find first extent of $DATA attribute in "
- "attribute list. $MFT is corrupt. Run chkdsk.");
-put_err_out:
- ntfs_attr_put_search_ctx(ctx);
-err_out:
- ntfs_error(sb, "Failed. Marking inode as bad.");
- make_bad_inode(vi);
- ntfs_free(m);
- return -1;
-}
-
-static void __ntfs_clear_inode(ntfs_inode *ni)
-{
- /* Free all allocated memory. 
*/
- down_write(&ni->runlist.lock);
- if (ni->runlist.rl) {
- ntfs_free(ni->runlist.rl);
- ni->runlist.rl = NULL;
- }
- up_write(&ni->runlist.lock);
-
- if (ni->attr_list) {
- ntfs_free(ni->attr_list);
- ni->attr_list = NULL;
- }
-
- down_write(&ni->attr_list_rl.lock);
- if (ni->attr_list_rl.rl) {
- ntfs_free(ni->attr_list_rl.rl);
- ni->attr_list_rl.rl = NULL;
- }
- up_write(&ni->attr_list_rl.lock);
-
- if (ni->name_len && ni->name != I30) {
- /* Catch bugs... */
- BUG_ON(!ni->name);
- kfree(ni->name);
- }
-}
-
-void ntfs_clear_extent_inode(ntfs_inode *ni)
-{
- ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
-
- BUG_ON(NInoAttr(ni));
- BUG_ON(ni->nr_extents != -1);
-
-#ifdef NTFS_RW
- if (NInoDirty(ni)) {
- if (!is_bad_inode(VFS_I(ni->ext.base_ntfs_ino)))
- ntfs_error(ni->vol->sb, "Clearing dirty extent inode! "
- "Losing data! This is a BUG!!!");
- // FIXME: Do something!!!
- }
-#endif /* NTFS_RW */
-
- __ntfs_clear_inode(ni);
-
- /* Bye, bye... */
- ntfs_destroy_extent_inode(ni);
-}
-
-/**
- * ntfs_evict_big_inode - clean up the ntfs specific part of an inode
- * @vi: vfs inode pending annihilation
- *
- * When the VFS is going to remove an inode from memory, ntfs_evict_big_inode()
- * is called, which deallocates all memory belonging to the NTFS specific part
- * of the inode and returns.
- *
- * If the MFT record is dirty, we commit it before doing anything else.
- */
-void ntfs_evict_big_inode(struct inode *vi)
-{
- ntfs_inode *ni = NTFS_I(vi);
-
- truncate_inode_pages_final(&vi->i_data);
- clear_inode(vi);
-
-#ifdef NTFS_RW
- if (NInoDirty(ni)) {
- bool was_bad = (is_bad_inode(vi));
-
- /* Committing the inode also commits all extent inodes. */
- ntfs_commit_inode(vi);
-
- if (!was_bad && (is_bad_inode(vi) || NInoDirty(ni))) {
- ntfs_error(vi->i_sb, "Failed to commit dirty inode "
- "0x%lx. Losing data!", vi->i_ino);
- // FIXME: Do something!!!
- }
- }
-#endif /* NTFS_RW */
-
- /* No need to lock at this stage as no one else has a reference. */
- if (ni->nr_extents > 0) {
- int i;
-
- for (i = 0; i < ni->nr_extents; i++)
- ntfs_clear_extent_inode(ni->ext.extent_ntfs_inos[i]);
- kfree(ni->ext.extent_ntfs_inos);
- }
-
- __ntfs_clear_inode(ni);
-
- if (NInoAttr(ni)) {
- /* Release the base inode if we are holding it. */
- if (ni->nr_extents == -1) {
- iput(VFS_I(ni->ext.base_ntfs_ino));
- ni->nr_extents = 0;
- ni->ext.base_ntfs_ino = NULL;
- }
- }
- BUG_ON(ni->page);
- if (!atomic_dec_and_test(&ni->count))
- BUG();
- return;
-}
-
-/**
- * ntfs_show_options - show mount options in /proc/mounts
- * @sf: seq_file in which to write our mount options
- * @root: root of the mounted tree whose mount options to display
- *
- * Called by the VFS once for each mounted ntfs volume when someone reads
- * /proc/mounts in order to display the NTFS specific mount options of each
- * mount. The mount options of the filesystem specified by @root are written to
- * the seq file @sf and success is returned. 
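
For illustration, an ntfs mount might then appear in /proc/mounts roughly as follows (a hypothetical device and hypothetical option values, assuming uid=1000, gid=1000 and umask=022 were given at mount time; the leading flags come from the VFS, the rest from the seq_printf() calls below):

	/dev/sda2 /mnt/windows ntfs ro,relatime,uid=1000,gid=1000,umask=022,nls=utf8,errors=continue,mft_zone_multiplier=1 0 0
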
- */ -int ntfs_show_options(struct seq_file *sf, struct dentry *root) -{ - ntfs_volume *vol = NTFS_SB(root->d_sb); - int i; - - seq_printf(sf, ",uid=%i", from_kuid_munged(&init_user_ns, vol->uid)); - seq_printf(sf, ",gid=%i", from_kgid_munged(&init_user_ns, vol->gid)); - if (vol->fmask == vol->dmask) - seq_printf(sf, ",umask=0%o", vol->fmask); - else { - seq_printf(sf, ",fmask=0%o", vol->fmask); - seq_printf(sf, ",dmask=0%o", vol->dmask); - } - seq_printf(sf, ",nls=%s", vol->nls_map->charset); - if (NVolCaseSensitive(vol)) - seq_printf(sf, ",case_sensitive"); - if (NVolShowSystemFiles(vol)) - seq_printf(sf, ",show_sys_files"); - if (!NVolSparseEnabled(vol)) - seq_printf(sf, ",disable_sparse"); - for (i = 0; on_errors_arr[i].val; i++) { - if (on_errors_arr[i].val & vol->on_errors) - seq_printf(sf, ",errors=%s", on_errors_arr[i].str); - } - seq_printf(sf, ",mft_zone_multiplier=%i", vol->mft_zone_multiplier); - return 0; -} - -#ifdef NTFS_RW - -static const char *es = " Leaving inconsistent metadata. Unmount and run " - "chkdsk."; - -/** - * ntfs_truncate - called when the i_size of an ntfs inode is changed - * @vi: inode for which the i_size was changed - * - * We only support i_size changes for normal files at present, i.e. not - * compressed and not encrypted. This is enforced in ntfs_setattr(), see - * below. - * - * The kernel guarantees that @vi is a regular file (S_ISREG() is true) and - * that the change is allowed. - * - * This implies for us that @vi is a file inode rather than a directory, index, - * or attribute inode as well as that @vi is a base inode. - * - * Returns 0 on success or -errno on error. - * - * Called with ->i_mutex held. - */ -int ntfs_truncate(struct inode *vi) -{ - s64 new_size, old_size, nr_freed, new_alloc_size, old_alloc_size; - VCN highest_vcn; - unsigned long flags; - ntfs_inode *base_ni, *ni = NTFS_I(vi); - ntfs_volume *vol = ni->vol; - ntfs_attr_search_ctx *ctx; - MFT_RECORD *m; - ATTR_RECORD *a; - const char *te = " Leaving file length out of sync with i_size."; - int err, mp_size, size_change, alloc_change; - - ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); - BUG_ON(NInoAttr(ni)); - BUG_ON(S_ISDIR(vi->i_mode)); - BUG_ON(NInoMstProtected(ni)); - BUG_ON(ni->nr_extents < 0); -retry_truncate: - /* - * Lock the runlist for writing and map the mft record to ensure it is - * safe to mess with the attribute runlist and sizes. - */ - down_write(&ni->runlist.lock); - if (!NInoAttr(ni)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - ntfs_error(vi->i_sb, "Failed to map mft record for inode 0x%lx " - "(error code %d).%s", vi->i_ino, err, te); - ctx = NULL; - m = NULL; - goto old_bad_out; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - ntfs_error(vi->i_sb, "Failed to allocate a search context for " - "inode 0x%lx (not enough memory).%s", - vi->i_ino, te); - err = -ENOMEM; - goto old_bad_out; - } - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) { - ntfs_error(vi->i_sb, "Open attribute is missing from " - "mft record. Inode 0x%lx is corrupt. " - "Run chkdsk.%s", vi->i_ino, te); - err = -EIO; - } else - ntfs_error(vi->i_sb, "Failed to lookup attribute in " - "inode 0x%lx (error code %d).%s", - vi->i_ino, err, te); - goto old_bad_out; - } - m = ctx->mrec; - a = ctx->attr; - /* - * The i_size of the vfs inode is the new size for the attribute value. 
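
The allocated-size rounding applied a few lines below works in whole clusters for non-resident attributes and in 8-byte units for resident ones. A worked example with illustrative numbers, assuming a 4096-byte cluster:

	/* Non-resident: round the new size up to a whole cluster. */
	new_alloc_size = (4099 + 4096 - 1) & ~(s64)4095;	/* == 8192 */
	/* Resident: round the new size up to an 8-byte boundary. */
	new_alloc_size = (4099 + 7) & ~7;			/* == 4104 */
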
- */ - new_size = i_size_read(vi); - /* The current size of the attribute value is the old size. */ - old_size = ntfs_attr_size(a); - /* Calculate the new allocated size. */ - if (NInoNonResident(ni)) - new_alloc_size = (new_size + vol->cluster_size - 1) & - ~(s64)vol->cluster_size_mask; - else - new_alloc_size = (new_size + 7) & ~7; - /* The current allocated size is the old allocated size. */ - read_lock_irqsave(&ni->size_lock, flags); - old_alloc_size = ni->allocated_size; - read_unlock_irqrestore(&ni->size_lock, flags); - /* - * The change in the file size. This will be 0 if no change, >0 if the - * size is growing, and <0 if the size is shrinking. - */ - size_change = -1; - if (new_size - old_size >= 0) { - size_change = 1; - if (new_size == old_size) - size_change = 0; - } - /* As above for the allocated size. */ - alloc_change = -1; - if (new_alloc_size - old_alloc_size >= 0) { - alloc_change = 1; - if (new_alloc_size == old_alloc_size) - alloc_change = 0; - } - /* - * If neither the size nor the allocation are being changed there is - * nothing to do. - */ - if (!size_change && !alloc_change) - goto unm_done; - /* If the size is changing, check if new size is allowed in $AttrDef. */ - if (size_change) { - err = ntfs_attr_size_bounds_check(vol, ni->type, new_size); - if (unlikely(err)) { - if (err == -ERANGE) { - ntfs_error(vol->sb, "Truncate would cause the " - "inode 0x%lx to %simum size " - "for its attribute type " - "(0x%x). Aborting truncate.", - vi->i_ino, - new_size > old_size ? "exceed " - "the max" : "go under the min", - le32_to_cpu(ni->type)); - err = -EFBIG; - } else { - ntfs_error(vol->sb, "Inode 0x%lx has unknown " - "attribute type 0x%x. " - "Aborting truncate.", - vi->i_ino, - le32_to_cpu(ni->type)); - err = -EIO; - } - /* Reset the vfs inode size to the old size. */ - i_size_write(vi, old_size); - goto err_out; - } - } - if (NInoCompressed(ni) || NInoEncrypted(ni)) { - ntfs_warning(vi->i_sb, "Changes in inode size are not " - "supported yet for %s files, ignoring.", - NInoCompressed(ni) ? "compressed" : - "encrypted"); - err = -EOPNOTSUPP; - goto bad_out; - } - if (a->non_resident) - goto do_non_resident_truncate; - BUG_ON(NInoNonResident(ni)); - /* Resize the attribute record to best fit the new attribute size. */ - if (new_size < vol->mft_record_size && - !ntfs_resident_attr_value_resize(m, a, new_size)) { - /* The resize succeeded! */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - write_lock_irqsave(&ni->size_lock, flags); - /* Update the sizes in the ntfs inode and all is done. */ - ni->allocated_size = le32_to_cpu(a->length) - - le16_to_cpu(a->data.resident.value_offset); - /* - * Note ntfs_resident_attr_value_resize() has already done any - * necessary data clearing in the attribute record. When the - * file is being shrunk vmtruncate() will already have cleared - * the top part of the last partial page, i.e. since this is - * the resident case this is the page with index 0. However, - * when the file is being expanded, the page cache page data - * between the old data_size, i.e. old_size, and the new_size - * has not been zeroed. Fortunately, we do not need to zero it - * either since on one hand it will either already be zero due - * to both read_folio and writepage clearing partial page data - * beyond i_size in which case there is nothing to do or in the - * case of the file being mmap()ped at the same time, POSIX - * specifies that the behaviour is unspecified thus we do not - * have to do anything. 
This means that in our implementation - * in the rare case that the file is mmap()ped and a write - * occurred into the mmap()ped region just beyond the file size - * and writepage has not yet been called to write out the page - * (which would clear the area beyond the file size) and we now - * extend the file size to incorporate this dirty region - * outside the file size, a write of the page would result in - * this data being written to disk instead of being cleared. - * Given both POSIX and the Linux mmap(2) man page specify that - * this corner case is undefined, we choose to leave it like - * that as this is much simpler for us as we cannot lock the - * relevant page now since we are holding too many ntfs locks - * which would result in a lock reversal deadlock. - */ - ni->initialized_size = new_size; - write_unlock_irqrestore(&ni->size_lock, flags); - goto unm_done; - } - /* If the above resize failed, this must be an attribute extension. */ - BUG_ON(size_change < 0); - /* - * We have to drop all the locks so we can call - * ntfs_attr_make_non_resident(). This could be optimised by try- - * locking the first page cache page and only if that fails dropping - * the locks, locking the page, and redoing all the locking and - * lookups. While this would be a huge optimisation, it is not worth - * it as this is definitely a slow code path as it only ever can happen - * once for any given file. - */ - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - up_write(&ni->runlist.lock); - /* - * Not enough space in the mft record, try to make the attribute - * non-resident and if successful restart the truncation process. - */ - err = ntfs_attr_make_non_resident(ni, old_size); - if (likely(!err)) - goto retry_truncate; - /* - * Could not make non-resident. If this is due to this not being - * permitted for this attribute type or there not being enough space, - * try to make other attributes non-resident. Otherwise fail. - */ - if (unlikely(err != -EPERM && err != -ENOSPC)) { - ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, attribute " - "type 0x%x, because the conversion from " - "resident to non-resident attribute failed " - "with error code %i.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), err); - if (err != -ENOMEM) - err = -EIO; - goto conv_err_out; - } - /* TODO: Not implemented from here, abort. */ - if (err == -ENOSPC) - ntfs_error(vol->sb, "Not enough space in the mft record/on " - "disk for the non-resident attribute value. " - "This case is not implemented yet."); - else /* if (err == -EPERM) */ - ntfs_error(vol->sb, "This attribute type may not be " - "non-resident. This case is not implemented " - "yet."); - err = -EOPNOTSUPP; - goto conv_err_out; -#if 0 - // TODO: Attempt to make other attributes non-resident. - if (!err) - goto do_resident_extend; - /* - * Both the attribute list attribute and the standard information - * attribute must remain in the base inode. Thus, if this is one of - * these attributes, we have to try to move other attributes out into - * extent mft records instead. - */ - if (ni->type == AT_ATTRIBUTE_LIST || - ni->type == AT_STANDARD_INFORMATION) { - // TODO: Attempt to move other attributes into extent mft - // records. - err = -EOPNOTSUPP; - if (!err) - goto do_resident_extend; - goto err_out; - } - // TODO: Attempt to move this attribute to an extent mft record, but - // only if it is not already the only attribute in an mft record in - // which case there would be nothing to gain. 
- err = -EOPNOTSUPP; - if (!err) - goto do_resident_extend; - /* There is nothing we can do to make enough space. )-: */ - goto err_out; -#endif -do_non_resident_truncate: - BUG_ON(!NInoNonResident(ni)); - if (alloc_change < 0) { - highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); - if (highest_vcn > 0 && - old_alloc_size >> vol->cluster_size_bits > - highest_vcn + 1) { - /* - * This attribute has multiple extents. Not yet - * supported. - */ - ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, " - "attribute type 0x%x, because the " - "attribute is highly fragmented (it " - "consists of multiple extents) and " - "this case is not implemented yet.", - vi->i_ino, - (unsigned)le32_to_cpu(ni->type)); - err = -EOPNOTSUPP; - goto bad_out; - } - } - /* - * If the size is shrinking, need to reduce the initialized_size and - * the data_size before reducing the allocation. - */ - if (size_change < 0) { - /* - * Make the valid size smaller (i_size is already up-to-date). - */ - write_lock_irqsave(&ni->size_lock, flags); - if (new_size < ni->initialized_size) { - ni->initialized_size = new_size; - a->data.non_resident.initialized_size = - cpu_to_sle64(new_size); - } - a->data.non_resident.data_size = cpu_to_sle64(new_size); - write_unlock_irqrestore(&ni->size_lock, flags); - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - /* If the allocated size is not changing, we are done. */ - if (!alloc_change) - goto unm_done; - /* - * If the size is shrinking it makes no sense for the - * allocation to be growing. - */ - BUG_ON(alloc_change > 0); - } else /* if (size_change >= 0) */ { - /* - * The file size is growing or staying the same but the - * allocation can be shrinking, growing or staying the same. - */ - if (alloc_change > 0) { - /* - * We need to extend the allocation and possibly update - * the data size. If we are updating the data size, - * since we are not touching the initialized_size we do - * not need to worry about the actual data on disk. - * And as far as the page cache is concerned, there - * will be no pages beyond the old data size and any - * partial region in the last page between the old and - * new data size (or the end of the page if the new - * data size is outside the page) does not need to be - * modified as explained above for the resident - * attribute truncate case. To do this, we simply drop - * the locks we hold and leave all the work to our - * friendly helper ntfs_attr_extend_allocation(). - */ - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - up_write(&ni->runlist.lock); - err = ntfs_attr_extend_allocation(ni, new_size, - size_change > 0 ? new_size : -1, -1); - /* - * ntfs_attr_extend_allocation() will have done error - * output already. - */ - goto done; - } - if (!alloc_change) - goto alloc_done; - } - /* alloc_change < 0 */ - /* Free the clusters. */ - nr_freed = ntfs_cluster_free(ni, new_alloc_size >> - vol->cluster_size_bits, -1, ctx); - m = ctx->mrec; - a = ctx->attr; - if (unlikely(nr_freed < 0)) { - ntfs_error(vol->sb, "Failed to release cluster(s) (error code " - "%lli). Unmount and run chkdsk to recover " - "the lost cluster(s).", (long long)nr_freed); - NVolSetErrors(vol); - nr_freed = 0; - } - /* Truncate the runlist. 
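
Conceptually, the runlist truncation done just below only shortens or drops the tail runs and moves the terminator element; for example (illustrative values):

	/* Before (10 clusters):                 After truncating to 6:          */
	/*   { vcn 0,  lcn 100,       len 4 }     { vcn 0, lcn 100,       len 4 } */
	/*   { vcn 4,  lcn 200,       len 6 }     { vcn 4, lcn 200,       len 2 } */
	/*   { vcn 10, lcn LCN_ENOENT, len 0 }    { vcn 6, lcn LCN_ENOENT, len 0 } */
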
*/ - err = ntfs_rl_truncate_nolock(vol, &ni->runlist, - new_alloc_size >> vol->cluster_size_bits); - /* - * If the runlist truncation failed and/or the search context is no - * longer valid, we cannot resize the attribute record or build the - * mapping pairs array thus we mark the inode bad so that no access to - * the freed clusters can happen. - */ - if (unlikely(err || IS_ERR(m))) { - ntfs_error(vol->sb, "Failed to %s (error code %li).%s", - IS_ERR(m) ? - "restore attribute search context" : - "truncate attribute runlist", - IS_ERR(m) ? PTR_ERR(m) : err, es); - err = -EIO; - goto bad_out; - } - /* Get the size for the shrunk mapping pairs array for the runlist. */ - mp_size = ntfs_get_size_for_mapping_pairs(vol, ni->runlist.rl, 0, -1); - if (unlikely(mp_size <= 0)) { - ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, " - "attribute type 0x%x, because determining the " - "size for the mapping pairs failed with error " - "code %i.%s", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), mp_size, es); - err = -EIO; - goto bad_out; - } - /* - * Shrink the attribute record for the new mapping pairs array. Note, - * this cannot fail since we are making the attribute smaller thus by - * definition there is enough space to do so. - */ - err = ntfs_attr_record_resize(m, a, mp_size + - le16_to_cpu(a->data.non_resident.mapping_pairs_offset)); - BUG_ON(err); - /* - * Generate the mapping pairs array directly into the attribute record. - */ - err = ntfs_mapping_pairs_build(vol, (u8*)a + - le16_to_cpu(a->data.non_resident.mapping_pairs_offset), - mp_size, ni->runlist.rl, 0, -1, NULL); - if (unlikely(err)) { - ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, " - "attribute type 0x%x, because building the " - "mapping pairs failed with error code %i.%s", - vi->i_ino, (unsigned)le32_to_cpu(ni->type), - err, es); - err = -EIO; - goto bad_out; - } - /* Update the allocated/compressed size as well as the highest vcn. */ - a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >> - vol->cluster_size_bits) - 1); - write_lock_irqsave(&ni->size_lock, flags); - ni->allocated_size = new_alloc_size; - a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size); - if (NInoSparse(ni) || NInoCompressed(ni)) { - if (nr_freed) { - ni->itype.compressed.size -= nr_freed << - vol->cluster_size_bits; - BUG_ON(ni->itype.compressed.size < 0); - a->data.non_resident.compressed_size = cpu_to_sle64( - ni->itype.compressed.size); - vi->i_blocks = ni->itype.compressed.size >> 9; - } - } else - vi->i_blocks = new_alloc_size >> 9; - write_unlock_irqrestore(&ni->size_lock, flags); - /* - * We have shrunk the allocation. If this is a shrinking truncate we - * have already dealt with the initialized_size and the data_size above - * and we are done. If the truncate is only changing the allocation - * and not the data_size, we are also done. If this is an extending - * truncate, need to extend the data_size now which is ensured by the - * fact that @size_change is positive. - */ -alloc_done: - /* - * If the size is growing, need to update it now. If it is shrinking, - * we have already updated it above (before the allocation change). - */ - if (size_change > 0) - a->data.non_resident.data_size = cpu_to_sle64(new_size); - /* Ensure the modified mft record is written out. 
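
The mapping pairs array rebuilt above is NTFS's compressed on-disk encoding of a runlist: each pair starts with a header byte whose low nibble is the byte count of the run length field that follows and whose high nibble is the byte count of the signed, delta-encoded lcn field after that; a zero byte terminates the array. A commonly cited example encoding (illustrative values):

	/* 21 18 34 56 00
	 *  |  |  \___/  \__ terminator
	 *  |  |    \__ lcn delta 0x5634 (little endian, relative to the
	 *  |  |        previous run's lcn)
	 *  |  \__ run length 0x18 = 24 clusters
	 *  \__ header 0x21: 1 length byte, 2 lcn bytes
	 */
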
*/ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); -unm_done: - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - up_write(&ni->runlist.lock); -done: - /* Update the mtime and ctime on the base inode. */ - /* normally ->truncate shouldn't update ctime or mtime, - * but ntfs did before so it got a copy & paste version - * of file_update_time. one day someone should fix this - * for real. - */ - if (!IS_NOCMTIME(VFS_I(base_ni)) && !IS_RDONLY(VFS_I(base_ni))) { - struct timespec64 now = current_time(VFS_I(base_ni)); - struct timespec64 ctime = inode_get_ctime(VFS_I(base_ni)); - struct timespec64 mtime = inode_get_mtime(VFS_I(base_ni)); - int sync_it = 0; - - if (!timespec64_equal(&mtime, &now) || - !timespec64_equal(&ctime, &now)) - sync_it = 1; - inode_set_ctime_to_ts(VFS_I(base_ni), now); - inode_set_mtime_to_ts(VFS_I(base_ni), now); - - if (sync_it) - mark_inode_dirty_sync(VFS_I(base_ni)); - } - - if (likely(!err)) { - NInoClearTruncateFailed(ni); - ntfs_debug("Done."); - } - return err; -old_bad_out: - old_size = -1; -bad_out: - if (err != -ENOMEM && err != -EOPNOTSUPP) - NVolSetErrors(vol); - if (err != -EOPNOTSUPP) - NInoSetTruncateFailed(ni); - else if (old_size >= 0) - i_size_write(vi, old_size); -err_out: - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(base_ni); - up_write(&ni->runlist.lock); -out: - ntfs_debug("Failed. Returning error code %i.", err); - return err; -conv_err_out: - if (err != -ENOMEM && err != -EOPNOTSUPP) - NVolSetErrors(vol); - if (err != -EOPNOTSUPP) - NInoSetTruncateFailed(ni); - else - i_size_write(vi, old_size); - goto out; -} - -/** - * ntfs_truncate_vfs - wrapper for ntfs_truncate() that has no return value - * @vi: inode for which the i_size was changed - * - * Wrapper for ntfs_truncate() that has no return value. - * - * See ntfs_truncate() description above for details. - */ -#ifdef NTFS_RW -void ntfs_truncate_vfs(struct inode *vi) { - ntfs_truncate(vi); -} -#endif - -/** - * ntfs_setattr - called from notify_change() when an attribute is being changed - * @idmap: idmap of the mount the inode was found from - * @dentry: dentry whose attributes to change - * @attr: structure describing the attributes and the changes - * - * We have to trap VFS attempts to truncate the file described by @dentry as - * soon as possible, because we do not implement changes in i_size yet. So we - * abort all i_size changes here. - * - * We also abort all changes of user, group, and mode as we do not implement - * the NTFS ACLs yet. - * - * Called with ->i_mutex held. - */ -int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, - struct iattr *attr) -{ - struct inode *vi = d_inode(dentry); - int err; - unsigned int ia_valid = attr->ia_valid; - - err = setattr_prepare(&nop_mnt_idmap, dentry, attr); - if (err) - goto out; - /* We do not support NTFS ACLs yet. */ - if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE)) { - ntfs_warning(vi->i_sb, "Changes in user/group/mode are not " - "supported yet, ignoring."); - err = -EOPNOTSUPP; - goto out; - } - if (ia_valid & ATTR_SIZE) { - if (attr->ia_size != i_size_read(vi)) { - ntfs_inode *ni = NTFS_I(vi); - /* - * FIXME: For now we do not support resizing of - * compressed or encrypted files yet. - */ - if (NInoCompressed(ni) || NInoEncrypted(ni)) { - ntfs_warning(vi->i_sb, "Changes in inode size " - "are not supported yet for " - "%s files, ignoring.", - NInoCompressed(ni) ? 
- "compressed" : "encrypted");
- err = -EOPNOTSUPP;
- } else {
- truncate_setsize(vi, attr->ia_size);
- ntfs_truncate_vfs(vi);
- }
- if (err || ia_valid == ATTR_SIZE)
- goto out;
- } else {
- /*
- * We skipped the truncate but must still update
- * timestamps.
- */
- ia_valid |= ATTR_MTIME | ATTR_CTIME;
- }
- }
- if (ia_valid & ATTR_ATIME)
- inode_set_atime_to_ts(vi, attr->ia_atime);
- if (ia_valid & ATTR_MTIME)
- inode_set_mtime_to_ts(vi, attr->ia_mtime);
- if (ia_valid & ATTR_CTIME)
- inode_set_ctime_to_ts(vi, attr->ia_ctime);
- mark_inode_dirty(vi);
-out:
- return err;
-}
-
-/**
- * __ntfs_write_inode - write out a dirty inode
- * @vi: inode to write out
- * @sync: if true, write out synchronously
- *
- * Write out a dirty inode to disk including any extent inodes if present.
- *
- * If @sync is true, commit the inode to disk and wait for io completion. This
- * is done using write_mft_record().
- *
- * If @sync is false, just schedule the write to happen but do not wait for i/o
- * completion. In 2.6 kernels, scheduling usually happens just by virtue of
- * marking the page (and in this case mft record) dirty but we do not implement
- * this yet as write_mft_record() largely ignores the @sync parameter and
- * always performs synchronous writes.
- *
- * Return 0 on success and -errno on error.
- */
-int __ntfs_write_inode(struct inode *vi, int sync)
-{
- sle64 nt;
- ntfs_inode *ni = NTFS_I(vi);
- ntfs_attr_search_ctx *ctx;
- MFT_RECORD *m;
- STANDARD_INFORMATION *si;
- int err = 0;
- bool modified = false;
-
- ntfs_debug("Entering for %sinode 0x%lx.", NInoAttr(ni) ? "attr " : "",
- vi->i_ino);
- /*
- * Dirty attribute inodes are written via their real inodes so just
- * clean them here. Access time updates are taken care of when the
- * real inode is written.
- */
- if (NInoAttr(ni)) {
- NInoClearDirty(ni);
- ntfs_debug("Done.");
- return 0;
- }
- /* Map, pin, and lock the mft record belonging to the inode. */
- m = map_mft_record(ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- goto err_out;
- }
- /* Update the access times in the standard information attribute. */
- ctx = ntfs_attr_get_search_ctx(ni, m);
- if (unlikely(!ctx)) {
- err = -ENOMEM;
- goto unm_err_out;
- }
- err = ntfs_attr_lookup(AT_STANDARD_INFORMATION, NULL, 0,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err)) {
- ntfs_attr_put_search_ctx(ctx);
- goto unm_err_out;
- }
- si = (STANDARD_INFORMATION*)((u8*)ctx->attr +
- le16_to_cpu(ctx->attr->data.resident.value_offset));
- /* Update the access times if they have changed. 
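
The utc2ntfs() conversions just below translate the kernel's timespec64 (seconds and nanoseconds since 1970) into NTFS time, which counts 100-nanosecond intervals since 1601-01-01. A minimal sketch of that conversion (the driver's real helper lives in time.h; the names here are illustrative):

	/* Seconds between 1601-01-01 and 1970-01-01. */
	#define NTFS_TIME_OFFSET_SEC 11644473600LL

	static inline s64 unix_to_ntfs_time(struct timespec64 ts)
	{
		/* 10,000,000 hundred-nanosecond intervals per second. */
		return (ts.tv_sec + NTFS_TIME_OFFSET_SEC) * 10000000LL +
				ts.tv_nsec / 100;
	}
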
*/ - nt = utc2ntfs(inode_get_mtime(vi)); - if (si->last_data_change_time != nt) { - ntfs_debug("Updating mtime for inode 0x%lx: old = 0x%llx, " - "new = 0x%llx", vi->i_ino, (long long) - sle64_to_cpu(si->last_data_change_time), - (long long)sle64_to_cpu(nt)); - si->last_data_change_time = nt; - modified = true; - } - nt = utc2ntfs(inode_get_ctime(vi)); - if (si->last_mft_change_time != nt) { - ntfs_debug("Updating ctime for inode 0x%lx: old = 0x%llx, " - "new = 0x%llx", vi->i_ino, (long long) - sle64_to_cpu(si->last_mft_change_time), - (long long)sle64_to_cpu(nt)); - si->last_mft_change_time = nt; - modified = true; - } - nt = utc2ntfs(inode_get_atime(vi)); - if (si->last_access_time != nt) { - ntfs_debug("Updating atime for inode 0x%lx: old = 0x%llx, " - "new = 0x%llx", vi->i_ino, - (long long)sle64_to_cpu(si->last_access_time), - (long long)sle64_to_cpu(nt)); - si->last_access_time = nt; - modified = true; - } - /* - * If we just modified the standard information attribute we need to - * mark the mft record it is in dirty. We do this manually so that - * mark_inode_dirty() is not called which would redirty the inode and - * hence result in an infinite loop of trying to write the inode. - * There is no need to mark the base inode nor the base mft record - * dirty, since we are going to write this mft record below in any case - * and the base mft record may actually not have been modified so it - * might not need to be written out. - * NOTE: It is not a problem when the inode for $MFT itself is being - * written out as mark_ntfs_record_dirty() will only set I_DIRTY_PAGES - * on the $MFT inode and hence __ntfs_write_inode() will not be - * re-invoked because of it which in turn is ok since the dirtied mft - * record will be cleaned and written out to disk below, i.e. before - * this function returns. - */ - if (modified) { - flush_dcache_mft_record_page(ctx->ntfs_ino); - if (!NInoTestSetDirty(ctx->ntfs_ino)) - mark_ntfs_record_dirty(ctx->ntfs_ino->page, - ctx->ntfs_ino->page_ofs); - } - ntfs_attr_put_search_ctx(ctx); - /* Now the access times are updated, write the base mft record. */ - if (NInoDirty(ni)) - err = write_mft_record(ni, m, sync); - /* Write all attached extent mft records. */ - mutex_lock(&ni->extent_lock); - if (ni->nr_extents > 0) { - ntfs_inode **extent_nis = ni->ext.extent_ntfs_inos; - int i; - - ntfs_debug("Writing %i extent inodes.", ni->nr_extents); - for (i = 0; i < ni->nr_extents; i++) { - ntfs_inode *tni = extent_nis[i]; - - if (NInoDirty(tni)) { - MFT_RECORD *tm = map_mft_record(tni); - int ret; - - if (IS_ERR(tm)) { - if (!err || err == -ENOMEM) - err = PTR_ERR(tm); - continue; - } - ret = write_mft_record(tni, tm, sync); - unmap_mft_record(tni); - if (unlikely(ret)) { - if (!err || err == -ENOMEM) - err = ret; - } - } - } - } - mutex_unlock(&ni->extent_lock); - unmap_mft_record(ni); - if (unlikely(err)) - goto err_out; - ntfs_debug("Done."); - return 0; -unm_err_out: - unmap_mft_record(ni); -err_out: - if (err == -ENOMEM) { - ntfs_warning(vi->i_sb, "Not enough memory to write inode. 
" - "Marking the inode dirty again, so the VFS " - "retries later."); - mark_inode_dirty(vi); - } else { - ntfs_error(vi->i_sb, "Failed (error %i): Run chkdsk.", -err); - NVolSetErrors(ni->vol); - } - return err; -} - -#endif /* NTFS_RW */ diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h deleted file mode 100644 index 147ef4ddb691..000000000000 --- a/fs/ntfs/inode.h +++ /dev/null @@ -1,310 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * inode.h - Defines for inode structures NTFS Linux kernel driver. Part of - * the Linux-NTFS project. - * - * Copyright (c) 2001-2007 Anton Altaparmakov - * Copyright (c) 2002 Richard Russon - */ - -#ifndef _LINUX_NTFS_INODE_H -#define _LINUX_NTFS_INODE_H - -#include <linux/atomic.h> - -#include <linux/fs.h> -#include <linux/list.h> -#include <linux/mm.h> -#include <linux/mutex.h> -#include <linux/seq_file.h> - -#include "layout.h" -#include "volume.h" -#include "types.h" -#include "runlist.h" -#include "debug.h" - -typedef struct _ntfs_inode ntfs_inode; - -/* - * The NTFS in-memory inode structure. It is just used as an extension to the - * fields already provided in the VFS inode. - */ -struct _ntfs_inode { - rwlock_t size_lock; /* Lock serializing access to inode sizes. */ - s64 initialized_size; /* Copy from the attribute record. */ - s64 allocated_size; /* Copy from the attribute record. */ - unsigned long state; /* NTFS specific flags describing this inode. - See ntfs_inode_state_bits below. */ - unsigned long mft_no; /* Number of the mft record / inode. */ - u16 seq_no; /* Sequence number of the mft record. */ - atomic_t count; /* Inode reference count for book keeping. */ - ntfs_volume *vol; /* Pointer to the ntfs volume of this inode. */ - /* - * If NInoAttr() is true, the below fields describe the attribute which - * this fake inode belongs to. The actual inode of this attribute is - * pointed to by base_ntfs_ino and nr_extents is always set to -1 (see - * below). For real inodes, we also set the type (AT_DATA for files and - * AT_INDEX_ALLOCATION for directories), with the name = NULL and - * name_len = 0 for files and name = I30 (global constant) and - * name_len = 4 for directories. - */ - ATTR_TYPE type; /* Attribute type of this fake inode. */ - ntfschar *name; /* Attribute name of this fake inode. */ - u32 name_len; /* Attribute name length of this fake inode. */ - runlist runlist; /* If state has the NI_NonResident bit set, - the runlist of the unnamed data attribute - (if a file) or of the index allocation - attribute (directory) or of the attribute - described by the fake inode (if NInoAttr()). - If runlist.rl is NULL, the runlist has not - been read in yet or has been unmapped. If - NI_NonResident is clear, the attribute is - resident (file and fake inode) or there is - no $I30 index allocation attribute - (small directory). In the latter case - runlist.rl is always NULL.*/ - /* - * The following fields are only valid for real inodes and extent - * inodes. - */ - struct mutex mrec_lock; /* Lock for serializing access to the - mft record belonging to this inode. */ - struct page *page; /* The page containing the mft record of the - inode. This should only be touched by the - (un)map_mft_record*() functions. */ - int page_ofs; /* Offset into the page at which the mft record - begins. This should only be touched by the - (un)map_mft_record*() functions. */ - /* - * Attribute list support (only for use by the attribute lookup - * functions). Setup during read_inode for all inodes with attribute - * lists. 
Only valid if NI_AttrList is set in state, and attr_list_rl is
- * further only valid if NI_AttrListNonResident is set.
- */
- u32 attr_list_size; /* Length of attribute list value in bytes. */
- u8 *attr_list; /* Attribute list value itself. */
- runlist attr_list_rl; /* Run list for the attribute list value. */
- union {
- struct { /* It is a directory, $MFT, or an index inode. */
- u32 block_size; /* Size of an index block. */
- u32 vcn_size; /* Size of a vcn in this
- index. */
- COLLATION_RULE collation_rule; /* The collation rule
- for the index. */
- u8 block_size_bits; /* Log2 of the above. */
- u8 vcn_size_bits; /* Log2 of the above. */
- } index;
- struct { /* It is a compressed/sparse file/attribute inode. */
- s64 size; /* Copy of compressed_size from
- $DATA. */
- u32 block_size; /* Size of a compression block
- (cb). */
- u8 block_size_bits; /* Log2 of the size of a cb. */
- u8 block_clusters; /* Number of clusters per cb. */
- } compressed;
- } itype;
- struct mutex extent_lock; /* Lock for accessing/modifying the
- below fields. */
- s32 nr_extents; /* For a base mft record, the number of attached extent
- inodes (0 if none), for extent records and for fake
- inodes describing an attribute this is -1. */
- union { /* This union is only used if nr_extents != 0. */
- ntfs_inode **extent_ntfs_inos; /* For nr_extents > 0, array of
- the ntfs inodes of the extent
- mft records belonging to
- this base inode which have
- been loaded. */
- ntfs_inode *base_ntfs_ino; /* For nr_extents == -1, the
- ntfs inode of the base mft
- record. For fake inodes, the
- real (base) inode to which
- the attribute belongs. */
- } ext;
-};
-
-/*
- * Defined bits for the state field in the ntfs_inode structure.
- * (f) = files only, (d) = directories only, (a) = attributes/fake inodes only
- */
-typedef enum {
- NI_Dirty, /* 1: Mft record needs to be written to disk. */
- NI_AttrList, /* 1: Mft record contains an attribute list. */
- NI_AttrListNonResident, /* 1: Attribute list is non-resident. Implies
- NI_AttrList is set. */
-
- NI_Attr, /* 1: Fake inode for attribute i/o.
- 0: Real inode or extent inode. */
-
- NI_MstProtected, /* 1: Attribute is protected by MST fixups.
- 0: Attribute is not protected by fixups. */
- NI_NonResident, /* 1: Unnamed data attr is non-resident (f).
- 1: Attribute is non-resident (a). */
- NI_IndexAllocPresent = NI_NonResident, /* 1: $I30 index alloc attr is
- present (d). */
- NI_Compressed, /* 1: Unnamed data attr is compressed (f).
- 1: Create compressed files by default (d).
- 1: Attribute is compressed (a). */
- NI_Encrypted, /* 1: Unnamed data attr is encrypted (f).
- 1: Create encrypted files by default (d).
- 1: Attribute is encrypted (a). */
- NI_Sparse, /* 1: Unnamed data attr is sparse (f).
- 1: Create sparse files by default (d).
- 1: Attribute is sparse (a). */
- NI_SparseDisabled, /* 1: May not create sparse regions. */
- NI_TruncateFailed, /* 1: Last ntfs_truncate() call failed. */
-} ntfs_inode_state_bits;
-
-/*
- * NOTE: We should be adding dirty mft records to a list somewhere and they
- * should be independent of the (ntfs/vfs) inode structure so that an inode can
- * be removed but the record can be left dirty for syncing later.
- */
-
-/*
- * Macro tricks to expand the NInoFoo(), NInoSetFoo(), and NInoClearFoo()
- * functions. 
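
For example, NINO_FNS(Dirty) (the macro defined just below) expands to three trivial inline accessors for bit NI_Dirty of ni->state:

	static inline int NInoDirty(ntfs_inode *ni)
	{
		return test_bit(NI_Dirty, &(ni)->state);
	}
	static inline void NInoSetDirty(ntfs_inode *ni)
	{
		set_bit(NI_Dirty, &(ni)->state);
	}
	static inline void NInoClearDirty(ntfs_inode *ni)
	{
		clear_bit(NI_Dirty, &(ni)->state);
	}
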
- */ -#define NINO_FNS(flag) \ -static inline int NIno##flag(ntfs_inode *ni) \ -{ \ - return test_bit(NI_##flag, &(ni)->state); \ -} \ -static inline void NInoSet##flag(ntfs_inode *ni) \ -{ \ - set_bit(NI_##flag, &(ni)->state); \ -} \ -static inline void NInoClear##flag(ntfs_inode *ni) \ -{ \ - clear_bit(NI_##flag, &(ni)->state); \ -} - -/* - * As above for NInoTestSetFoo() and NInoTestClearFoo(). - */ -#define TAS_NINO_FNS(flag) \ -static inline int NInoTestSet##flag(ntfs_inode *ni) \ -{ \ - return test_and_set_bit(NI_##flag, &(ni)->state); \ -} \ -static inline int NInoTestClear##flag(ntfs_inode *ni) \ -{ \ - return test_and_clear_bit(NI_##flag, &(ni)->state); \ -} - -/* Emit the ntfs inode bitops functions. */ -NINO_FNS(Dirty) -TAS_NINO_FNS(Dirty) -NINO_FNS(AttrList) -NINO_FNS(AttrListNonResident) -NINO_FNS(Attr) -NINO_FNS(MstProtected) -NINO_FNS(NonResident) -NINO_FNS(IndexAllocPresent) -NINO_FNS(Compressed) -NINO_FNS(Encrypted) -NINO_FNS(Sparse) -NINO_FNS(SparseDisabled) -NINO_FNS(TruncateFailed) - -/* - * The full structure containing a ntfs_inode and a vfs struct inode. Used for - * all real and fake inodes but not for extent inodes which lack the vfs struct - * inode. - */ -typedef struct { - ntfs_inode ntfs_inode; - struct inode vfs_inode; /* The vfs inode structure. */ -} big_ntfs_inode; - -/** - * NTFS_I - return the ntfs inode given a vfs inode - * @inode: VFS inode - * - * NTFS_I() returns the ntfs inode associated with the VFS @inode. - */ -static inline ntfs_inode *NTFS_I(struct inode *inode) -{ - return (ntfs_inode *)container_of(inode, big_ntfs_inode, vfs_inode); -} - -static inline struct inode *VFS_I(ntfs_inode *ni) -{ - return &((big_ntfs_inode *)ni)->vfs_inode; -} - -/** - * ntfs_attr - ntfs in memory attribute structure - * @mft_no: mft record number of the base mft record of this attribute - * @name: Unicode name of the attribute (NULL if unnamed) - * @name_len: length of @name in Unicode characters (0 if unnamed) - * @type: attribute type (see layout.h) - * - * This structure exists only to provide a small structure for the - * ntfs_{attr_}iget()/ntfs_test_inode()/ntfs_init_locked_inode() mechanism. - * - * NOTE: Elements are ordered by size to make the structure as compact as - * possible on all architectures. 
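
On an LP64 target the ordering below indeed leaves no padding holes (a layout sketch; offsets assume 8-byte longs and pointers):

	/* unsigned long mft_no;    offset  0, size 8
	 * ntfschar     *name;      offset  8, size 8
	 * u32           name_len;  offset 16, size 4
	 * ATTR_TYPE     type;      offset 20, size 4 (le32)
	 * total: 24 bytes, no internal padding
	 */
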
- */ -typedef struct { - unsigned long mft_no; - ntfschar *name; - u32 name_len; - ATTR_TYPE type; -} ntfs_attr; - -extern int ntfs_test_inode(struct inode *vi, void *data); - -extern struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no); -extern struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type, - ntfschar *name, u32 name_len); -extern struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name, - u32 name_len); - -extern struct inode *ntfs_alloc_big_inode(struct super_block *sb); -extern void ntfs_free_big_inode(struct inode *inode); -extern void ntfs_evict_big_inode(struct inode *vi); - -extern void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni); - -static inline void ntfs_init_big_inode(struct inode *vi) -{ - ntfs_inode *ni = NTFS_I(vi); - - ntfs_debug("Entering."); - __ntfs_init_inode(vi->i_sb, ni); - ni->mft_no = vi->i_ino; -} - -extern ntfs_inode *ntfs_new_extent_inode(struct super_block *sb, - unsigned long mft_no); -extern void ntfs_clear_extent_inode(ntfs_inode *ni); - -extern int ntfs_read_inode_mount(struct inode *vi); - -extern int ntfs_show_options(struct seq_file *sf, struct dentry *root); - -#ifdef NTFS_RW - -extern int ntfs_truncate(struct inode *vi); -extern void ntfs_truncate_vfs(struct inode *vi); - -extern int ntfs_setattr(struct mnt_idmap *idmap, - struct dentry *dentry, struct iattr *attr); - -extern int __ntfs_write_inode(struct inode *vi, int sync); - -static inline void ntfs_commit_inode(struct inode *vi) -{ - if (!is_bad_inode(vi)) - __ntfs_write_inode(vi, 1); - return; -} - -#else - -static inline void ntfs_truncate_vfs(struct inode *vi) {} - -#endif /* NTFS_RW */ - -#endif /* _LINUX_NTFS_INODE_H */ diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h deleted file mode 100644 index 5d4bf7a3259f..000000000000 --- a/fs/ntfs/layout.h +++ /dev/null @@ -1,2421 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * layout.h - All NTFS associated on-disk structures. Part of the Linux-NTFS - * project. - * - * Copyright (c) 2001-2005 Anton Altaparmakov - * Copyright (c) 2002 Richard Russon - */ - -#ifndef _LINUX_NTFS_LAYOUT_H -#define _LINUX_NTFS_LAYOUT_H - -#include <linux/types.h> -#include <linux/bitops.h> -#include <linux/list.h> -#include <asm/byteorder.h> - -#include "types.h" - -/* The NTFS oem_id "NTFS " */ -#define magicNTFS cpu_to_le64(0x202020205346544eULL) - -/* - * Location of bootsector on partition: - * The standard NTFS_BOOT_SECTOR is on sector 0 of the partition. - * On NT4 and above there is one backup copy of the boot sector to - * be found on the last sector of the partition (not normally accessible - * from within Windows as the bootsector contained number of sectors - * value is one less than the actual value!). - * On versions of NT 3.51 and earlier, the backup copy was located at - * number of sectors/2 (integer divide), i.e. in the middle of the volume. - */ - -/* - * BIOS parameter block (bpb) structure. - */ -typedef struct { - le16 bytes_per_sector; /* Size of a sector in bytes. */ - u8 sectors_per_cluster; /* Size of a cluster in sectors. */ - le16 reserved_sectors; /* zero */ - u8 fats; /* zero */ - le16 root_entries; /* zero */ - le16 sectors; /* zero */ - u8 media_type; /* 0xf8 = hard disk */ - le16 sectors_per_fat; /* zero */ - le16 sectors_per_track; /* irrelevant */ - le16 heads; /* irrelevant */ - le32 hidden_sectors; /* zero */ - le32 large_sectors; /* zero */ -} __attribute__ ((__packed__)) BIOS_PARAMETER_BLOCK; - -/* - * NTFS boot sector structure. 
- */ -typedef struct { - u8 jump[3]; /* Irrelevant (jump to boot up code).*/ - le64 oem_id; /* Magic "NTFS ". */ - BIOS_PARAMETER_BLOCK bpb; /* See BIOS_PARAMETER_BLOCK. */ - u8 unused[4]; /* zero, NTFS diskedit.exe states that - this is actually: - __u8 physical_drive; // 0x80 - __u8 current_head; // zero - __u8 extended_boot_signature; - // 0x80 - __u8 unused; // zero - */ -/*0x28*/sle64 number_of_sectors; /* Number of sectors in volume. Gives - maximum volume size of 2^63 sectors. - Assuming standard sector size of 512 - bytes, the maximum byte size is - approx. 4.7x10^21 bytes. (-; */ - sle64 mft_lcn; /* Cluster location of mft data. */ - sle64 mftmirr_lcn; /* Cluster location of copy of mft. */ - s8 clusters_per_mft_record; /* Mft record size in clusters. */ - u8 reserved0[3]; /* zero */ - s8 clusters_per_index_record; /* Index block size in clusters. */ - u8 reserved1[3]; /* zero */ - le64 volume_serial_number; /* Irrelevant (serial number). */ - le32 checksum; /* Boot sector checksum. */ -/*0x54*/u8 bootstrap[426]; /* Irrelevant (boot up code). */ - le16 end_of_sector_marker; /* End of bootsector magic. Always is - 0xaa55 in little endian. */ -/* sizeof() = 512 (0x200) bytes */ -} __attribute__ ((__packed__)) NTFS_BOOT_SECTOR; - -/* - * Magic identifiers present at the beginning of all ntfs record containing - * records (like mft records for example). - */ -enum { - /* Found in $MFT/$DATA. */ - magic_FILE = cpu_to_le32(0x454c4946), /* Mft entry. */ - magic_INDX = cpu_to_le32(0x58444e49), /* Index buffer. */ - magic_HOLE = cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */ - - /* Found in $LogFile/$DATA. */ - magic_RSTR = cpu_to_le32(0x52545352), /* Restart page. */ - magic_RCRD = cpu_to_le32(0x44524352), /* Log record page. */ - - /* Found in $LogFile/$DATA. (May be found in $MFT/$DATA, also?) */ - magic_CHKD = cpu_to_le32(0x444b4843), /* Modified by chkdsk. */ - - /* Found in all ntfs record containing records. */ - magic_BAAD = cpu_to_le32(0x44414142), /* Failed multi sector - transfer was detected. */ - /* - * Found in $LogFile/$DATA when a page is full of 0xff bytes and is - * thus not initialized. Page must be initialized before using it. - */ - magic_empty = cpu_to_le32(0xffffffff) /* Record is empty. */ -}; - -typedef le32 NTFS_RECORD_TYPE; - -/* - * Generic magic comparison macros. Finally found a use for the ## preprocessor - * operator! (-8 - */ - -static inline bool __ntfs_is_magic(le32 x, NTFS_RECORD_TYPE r) -{ - return (x == r); -} -#define ntfs_is_magic(x, m) __ntfs_is_magic(x, magic_##m) - -static inline bool __ntfs_is_magicp(le32 *p, NTFS_RECORD_TYPE r) -{ - return (*p == r); -} -#define ntfs_is_magicp(p, m) __ntfs_is_magicp(p, magic_##m) - -/* - * Specialised magic comparison macros for the NTFS_RECORD_TYPEs defined above. 
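
Usage of the wrappers defined just below is then as terse as this hypothetical caller (record_is_usable() is illustrative only, not part of the driver):

	static bool record_is_usable(MFT_RECORD *m)
	{
		/* "FILE" is the normal case; "BAAD" marks a failed multi
		   sector transfer detected on a previous write. */
		if (ntfs_is_mft_recordp(&m->magic))
			return true;
		if (ntfs_is_baad_recordp(&m->magic))
			pr_warn("ntfs: BAAD mft record\n");
		return false;
	}
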
- */ -#define ntfs_is_file_record(x) ( ntfs_is_magic (x, FILE) ) -#define ntfs_is_file_recordp(p) ( ntfs_is_magicp(p, FILE) ) -#define ntfs_is_mft_record(x) ( ntfs_is_file_record (x) ) -#define ntfs_is_mft_recordp(p) ( ntfs_is_file_recordp(p) ) -#define ntfs_is_indx_record(x) ( ntfs_is_magic (x, INDX) ) -#define ntfs_is_indx_recordp(p) ( ntfs_is_magicp(p, INDX) ) -#define ntfs_is_hole_record(x) ( ntfs_is_magic (x, HOLE) ) -#define ntfs_is_hole_recordp(p) ( ntfs_is_magicp(p, HOLE) ) - -#define ntfs_is_rstr_record(x) ( ntfs_is_magic (x, RSTR) ) -#define ntfs_is_rstr_recordp(p) ( ntfs_is_magicp(p, RSTR) ) -#define ntfs_is_rcrd_record(x) ( ntfs_is_magic (x, RCRD) ) -#define ntfs_is_rcrd_recordp(p) ( ntfs_is_magicp(p, RCRD) ) - -#define ntfs_is_chkd_record(x) ( ntfs_is_magic (x, CHKD) ) -#define ntfs_is_chkd_recordp(p) ( ntfs_is_magicp(p, CHKD) ) - -#define ntfs_is_baad_record(x) ( ntfs_is_magic (x, BAAD) ) -#define ntfs_is_baad_recordp(p) ( ntfs_is_magicp(p, BAAD) ) - -#define ntfs_is_empty_record(x) ( ntfs_is_magic (x, empty) ) -#define ntfs_is_empty_recordp(p) ( ntfs_is_magicp(p, empty) ) - -/* - * The Update Sequence Array (usa) is an array of the le16 values which belong - * to the end of each sector protected by the update sequence record in which - * this array is contained. Note that the first entry is the Update Sequence - * Number (usn), a cyclic counter of how many times the protected record has - * been written to disk. The values 0 and -1 (ie. 0xffff) are not used. All - * last le16's of each sector have to be equal to the usn (during reading) or - * are set to it (during writing). If they are not, an incomplete multi sector - * transfer has occurred when the data was written. - * The maximum size for the update sequence array is fixed to: - * maximum size = usa_ofs + (usa_count * 2) = 510 bytes - * The 510 bytes comes from the fact that the last le16 in the array has to - * (obviously) finish before the last le16 of the first 512-byte sector. - * This formula can be used as a consistency check in that usa_ofs + - * (usa_count * 2) has to be less than or equal to 510. - */ -typedef struct { - NTFS_RECORD_TYPE magic; /* A four-byte magic identifying the record - type and/or status. */ - le16 usa_ofs; /* Offset to the Update Sequence Array (usa) - from the start of the ntfs record. */ - le16 usa_count; /* Number of le16 sized entries in the usa - including the Update Sequence Number (usn), - thus the number of fixups is the usa_count - minus 1. */ -} __attribute__ ((__packed__)) NTFS_RECORD; - -/* - * System files mft record numbers. All these files are always marked as used - * in the bitmap attribute of the mft; presumably in order to avoid accidental - * allocation for random other mft records. Also, the sequence number for each - * of the system files is always equal to their mft record number and it is - * never modified. - */ -typedef enum { - FILE_MFT = 0, /* Master file table (mft). Data attribute - contains the entries and bitmap attribute - records which ones are in use (bit==1). */ - FILE_MFTMirr = 1, /* Mft mirror: copy of first four mft records - in data attribute. If cluster size > 4kiB, - copy of first N mft records, with - N = cluster_size / mft_record_size. */ - FILE_LogFile = 2, /* Journalling log in data attribute. */ - FILE_Volume = 3, /* Volume name attribute and volume information - attribute (flags and ntfs version). Windows - refers to this file as volume DASD (Direct - Access Storage Device). 
*/ - FILE_AttrDef = 4, /* Array of attribute definitions in data - attribute. */ - FILE_root = 5, /* Root directory. */ - FILE_Bitmap = 6, /* Allocation bitmap of all clusters (lcns) in - data attribute. */ - FILE_Boot = 7, /* Boot sector (always at cluster 0) in data - attribute. */ - FILE_BadClus = 8, /* Contains all bad clusters in the non-resident - data attribute. */ - FILE_Secure = 9, /* Shared security descriptors in data attribute - and two indexes into the descriptors. - Appeared in Windows 2000. Before that, this - file was named $Quota but was unused. */ - FILE_UpCase = 10, /* Uppercase equivalents of all 65536 Unicode - characters in data attribute. */ - FILE_Extend = 11, /* Directory containing other system files (eg. - $ObjId, $Quota, $Reparse and $UsnJrnl). This - is new to NTFS3.0. */ - FILE_reserved12 = 12, /* Reserved for future use (records 12-15). */ - FILE_reserved13 = 13, - FILE_reserved14 = 14, - FILE_reserved15 = 15, - FILE_first_user = 16, /* First user file, used as test limit for - whether to allow opening a file or not. */ -} NTFS_SYSTEM_FILES; - -/* - * These are the so far known MFT_RECORD_* flags (16-bit) which contain - * information about the mft record in which they are present. - */ -enum { - MFT_RECORD_IN_USE = cpu_to_le16(0x0001), - MFT_RECORD_IS_DIRECTORY = cpu_to_le16(0x0002), -} __attribute__ ((__packed__)); - -typedef le16 MFT_RECORD_FLAGS; - -/* - * mft references (aka file references or file record segment references) are - * used whenever a structure needs to refer to a record in the mft. - * - * A reference consists of a 48-bit index into the mft and a 16-bit sequence - * number used to detect stale references. - * - * For error reporting purposes we treat the 48-bit index as a signed quantity. - * - * The sequence number is a circular counter (skipping 0) describing how many - * times the referenced mft record has been (re)used. This has to match the - * sequence number of the mft record being referenced, otherwise the reference - * is considered stale and removed (FIXME: only ntfsck or the driver itself?). - * - * If the sequence number is zero it is assumed that no sequence number - * consistency checking should be performed. - * - * FIXME: Since inodes are 32-bit as of now, the driver needs to always check - * for high_part being 0 and if not either BUG(), cause a panic() or handle - * the situation in some other way. This shouldn't be a problem as a volume has - * to become HUGE in order to need more than 32-bits worth of mft records. - * Assuming the standard mft record size of 1kb only the records (never mind - * the non-resident attributes, etc.) would require 4Tb of space on their own - * for the first 32 bits worth of records. This is only if some strange person - * doesn't decide to foul play and make the mft sparse which would be a really - * horrible thing to do as it would trash our current driver implementation. )-: - * Do I hear screams "we want 64-bit inodes!" ?!? (-; - * - * FIXME: The mft zone is defined as the first 12% of the volume. This space is - * reserved so that the mft can grow contiguously and hence doesn't become - * fragmented. Volume free space includes the empty part of the mft zone and - * when the volume's free 88% are used up, the mft zone is shrunk by a factor - * of 2, thus making more space available for more files/data. This process is - * repeated every time there is no more free space except for the mft zone until - * there really is no more free space. 
- */ - -/* - * Typedef the MFT_REF as a 64-bit value for easier handling. - * Also define two unpacking macros to get to the reference (MREF) and - * sequence number (MSEQNO) respectively. - * The _LE versions are to be applied on little endian MFT_REFs. - * Note: The _LE versions will return a CPU endian formatted value! - */ -#define MFT_REF_MASK_CPU 0x0000ffffffffffffULL -#define MFT_REF_MASK_LE cpu_to_le64(MFT_REF_MASK_CPU) - -typedef u64 MFT_REF; -typedef le64 leMFT_REF; - -#define MK_MREF(m, s) ((MFT_REF)(((MFT_REF)(s) << 48) | \ - ((MFT_REF)(m) & MFT_REF_MASK_CPU))) -#define MK_LE_MREF(m, s) cpu_to_le64(MK_MREF(m, s)) - -#define MREF(x) ((unsigned long)((x) & MFT_REF_MASK_CPU)) -#define MSEQNO(x) ((u16)(((x) >> 48) & 0xffff)) -#define MREF_LE(x) ((unsigned long)(le64_to_cpu(x) & MFT_REF_MASK_CPU)) -#define MSEQNO_LE(x) ((u16)((le64_to_cpu(x) >> 48) & 0xffff)) - -#define IS_ERR_MREF(x) (((x) & 0x0000800000000000ULL) ? true : false) -#define ERR_MREF(x) ((u64)((s64)(x))) -#define MREF_ERR(x) ((int)((s64)(x))) - -/* - * The mft record header present at the beginning of every record in the mft. - * This is followed by a sequence of variable length attribute records which - * is terminated by an attribute of type AT_END which is a truncated attribute - * in that it only consists of the attribute type code AT_END and none of the - * other members of the attribute structure are present. - */ -typedef struct { -/*Ofs*/ -/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ - NTFS_RECORD_TYPE magic; /* Usually the magic is "FILE". */ - le16 usa_ofs; /* See NTFS_RECORD definition above. */ - le16 usa_count; /* See NTFS_RECORD definition above. */ - -/* 8*/ le64 lsn; /* $LogFile sequence number for this record. - Changed every time the record is modified. */ -/* 16*/ le16 sequence_number; /* Number of times this mft record has been - reused. (See description for MFT_REF - above.) NOTE: The increment (skipping zero) - is done when the file is deleted. NOTE: If - this is zero it is left zero. */ -/* 18*/ le16 link_count; /* Number of hard links, i.e. the number of - directory entries referencing this record. - NOTE: Only used in mft base records. - NOTE: When deleting a directory entry we - check the link_count and if it is 1 we - delete the file. Otherwise we delete the - FILE_NAME_ATTR being referenced by the - directory entry from the mft record and - decrement the link_count. - FIXME: Careful with Win32 + DOS names! */ -/* 20*/ le16 attrs_offset; /* Byte offset to the first attribute in this - mft record from the start of the mft record. - NOTE: Must be aligned to 8-byte boundary. */ -/* 22*/ MFT_RECORD_FLAGS flags; /* Bit array of MFT_RECORD_FLAGS. When a file - is deleted, the MFT_RECORD_IN_USE flag is - set to zero. */ -/* 24*/ le32 bytes_in_use; /* Number of bytes used in this mft record. - NOTE: Must be aligned to 8-byte boundary. */ -/* 28*/ le32 bytes_allocated; /* Number of bytes allocated for this mft - record. This should be equal to the mft - record size. */ -/* 32*/ leMFT_REF base_mft_record;/* This is zero for base mft records. 
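/*
 * Illustrative sketch (not part of the original header): packing and
 * unpacking an mft reference as the MK_MREF()/MREF()/MSEQNO() macros above
 * do, restated with standard types so the snippet is self-contained in
 * user space.
 */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_MREF_MASK 0x0000ffffffffffffULL	/* MFT_REF_MASK_CPU */

int main(void)
{
	/* MK_MREF(5, 3): mft record number 5, sequence number 3. */
	uint64_t ref = ((uint64_t)3 << 48) | (5ULL & SKETCH_MREF_MASK);

	printf("index %llu, seqno %u\n",
	       (unsigned long long)(ref & SKETCH_MREF_MASK),	/* MREF()   -> 5 */
	       (unsigned)((ref >> 48) & 0xffff));		/* MSEQNO() -> 3 */
	return 0;
}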
- When it is not zero it is a mft reference - pointing to the base mft record to which - this record belongs (this is then used to - locate the attribute list attribute present - in the base record which describes this - extension record and hence might need - modification when the extension record - itself is modified, also locating the - attribute list also means finding the other - potential extents, belonging to the non-base - mft record). */ -/* 40*/ le16 next_attr_instance;/* The instance number that will be assigned to - the next attribute added to this mft record. - NOTE: Incremented each time after it is used. - NOTE: Every time the mft record is reused - this number is set to zero. NOTE: The first - instance number is always 0. */ -/* The below fields are specific to NTFS 3.1+ (Windows XP and above): */ -/* 42*/ le16 reserved; /* Reserved/alignment. */ -/* 44*/ le32 mft_record_number; /* Number of this mft record. */ -/* sizeof() = 48 bytes */ -/* - * When (re)using the mft record, we place the update sequence array at this - * offset, i.e. before we start with the attributes. This also makes sense, - * otherwise we could run into problems with the update sequence array - * containing in itself the last two bytes of a sector which would mean that - * multi sector transfer protection wouldn't work. As you can't protect data - * by overwriting it since you then can't get it back... - * When reading we obviously use the data from the ntfs record header. - */ -} __attribute__ ((__packed__)) MFT_RECORD; - -/* This is the version without the NTFS 3.1+ specific fields. */ -typedef struct { -/*Ofs*/ -/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ - NTFS_RECORD_TYPE magic; /* Usually the magic is "FILE". */ - le16 usa_ofs; /* See NTFS_RECORD definition above. */ - le16 usa_count; /* See NTFS_RECORD definition above. */ - -/* 8*/ le64 lsn; /* $LogFile sequence number for this record. - Changed every time the record is modified. */ -/* 16*/ le16 sequence_number; /* Number of times this mft record has been - reused. (See description for MFT_REF - above.) NOTE: The increment (skipping zero) - is done when the file is deleted. NOTE: If - this is zero it is left zero. */ -/* 18*/ le16 link_count; /* Number of hard links, i.e. the number of - directory entries referencing this record. - NOTE: Only used in mft base records. - NOTE: When deleting a directory entry we - check the link_count and if it is 1 we - delete the file. Otherwise we delete the - FILE_NAME_ATTR being referenced by the - directory entry from the mft record and - decrement the link_count. - FIXME: Careful with Win32 + DOS names! */ -/* 20*/ le16 attrs_offset; /* Byte offset to the first attribute in this - mft record from the start of the mft record. - NOTE: Must be aligned to 8-byte boundary. */ -/* 22*/ MFT_RECORD_FLAGS flags; /* Bit array of MFT_RECORD_FLAGS. When a file - is deleted, the MFT_RECORD_IN_USE flag is - set to zero. */ -/* 24*/ le32 bytes_in_use; /* Number of bytes used in this mft record. - NOTE: Must be aligned to 8-byte boundary. */ -/* 28*/ le32 bytes_allocated; /* Number of bytes allocated for this mft - record. This should be equal to the mft - record size. */ -/* 32*/ leMFT_REF base_mft_record;/* This is zero for base mft records. 
- When it is not zero it is a mft reference - pointing to the base mft record to which - this record belongs (this is then used to - locate the attribute list attribute present - in the base record which describes this - extension record and hence might need - modification when the extension record - itself is modified, also locating the - attribute list also means finding the other - potential extents, belonging to the non-base - mft record). */ -/* 40*/ le16 next_attr_instance;/* The instance number that will be assigned to - the next attribute added to this mft record. - NOTE: Incremented each time after it is used. - NOTE: Every time the mft record is reused - this number is set to zero. NOTE: The first - instance number is always 0. */ -/* sizeof() = 42 bytes */ -/* - * When (re)using the mft record, we place the update sequence array at this - * offset, i.e. before we start with the attributes. This also makes sense, - * otherwise we could run into problems with the update sequence array - * containing in itself the last two bytes of a sector which would mean that - * multi sector transfer protection wouldn't work. As you can't protect data - * by overwriting it since you then can't get it back... - * When reading we obviously use the data from the ntfs record header. - */ -} __attribute__ ((__packed__)) MFT_RECORD_OLD; - -/* - * System defined attributes (32-bit). Each attribute type has a corresponding - * attribute name (Unicode string of maximum 64 character length) as described - * by the attribute definitions present in the data attribute of the $AttrDef - * system file. On NTFS 3.0 volumes the names are just as the types are named - * in the below defines exchanging AT_ for the dollar sign ($). If that is not - * a revealing choice of symbol I do not know what is... (-; - */ -enum { - AT_UNUSED = cpu_to_le32( 0), - AT_STANDARD_INFORMATION = cpu_to_le32( 0x10), - AT_ATTRIBUTE_LIST = cpu_to_le32( 0x20), - AT_FILE_NAME = cpu_to_le32( 0x30), - AT_OBJECT_ID = cpu_to_le32( 0x40), - AT_SECURITY_DESCRIPTOR = cpu_to_le32( 0x50), - AT_VOLUME_NAME = cpu_to_le32( 0x60), - AT_VOLUME_INFORMATION = cpu_to_le32( 0x70), - AT_DATA = cpu_to_le32( 0x80), - AT_INDEX_ROOT = cpu_to_le32( 0x90), - AT_INDEX_ALLOCATION = cpu_to_le32( 0xa0), - AT_BITMAP = cpu_to_le32( 0xb0), - AT_REPARSE_POINT = cpu_to_le32( 0xc0), - AT_EA_INFORMATION = cpu_to_le32( 0xd0), - AT_EA = cpu_to_le32( 0xe0), - AT_PROPERTY_SET = cpu_to_le32( 0xf0), - AT_LOGGED_UTILITY_STREAM = cpu_to_le32( 0x100), - AT_FIRST_USER_DEFINED_ATTRIBUTE = cpu_to_le32( 0x1000), - AT_END = cpu_to_le32(0xffffffff) -}; - -typedef le32 ATTR_TYPE; - -/* - * The collation rules for sorting views/indexes/etc (32-bit). - * - * COLLATION_BINARY - Collate by binary compare where the first byte is most - * significant. - * COLLATION_UNICODE_STRING - Collate Unicode strings by comparing their binary - * Unicode values, except that when a character can be uppercased, the - * upper case value collates before the lower case one. - * COLLATION_FILE_NAME - Collate file names as Unicode strings. The collation - * is done very much like COLLATION_UNICODE_STRING. In fact I have no idea - * what the difference is. Perhaps the difference is that file names - * would treat some special characters in an odd way (see - * unistr.c::ntfs_collate_names() and unistr.c::legal_ansi_char_array[] - * for what I mean but COLLATION_UNICODE_STRING would not give any special - * treatment to any characters at all, but this is speculation. 
- * COLLATION_NTOFS_ULONG - Sorting is done according to ascending le32 key - * values. E.g. used for $SII index in FILE_Secure, which sorts by - * security_id (le32). - * COLLATION_NTOFS_SID - Sorting is done according to ascending SID values. - * E.g. used for $O index in FILE_Extend/$Quota. - * COLLATION_NTOFS_SECURITY_HASH - Sorting is done first by ascending hash - * values and second by ascending security_id values. E.g. used for $SDH - * index in FILE_Secure. - * COLLATION_NTOFS_ULONGS - Sorting is done according to a sequence of ascending - * le32 key values. E.g. used for $O index in FILE_Extend/$ObjId, which - * sorts by object_id (16-byte), by splitting up the object_id in four - * le32 values and using them as individual keys. E.g. take the following - * two security_ids, stored as follows on disk: - * 1st: a1 61 65 b7 65 7b d4 11 9e 3d 00 e0 81 10 42 59 - * 2nd: 38 14 37 d2 d2 f3 d4 11 a5 21 c8 6b 79 b1 97 45 - * To compare them, they are split into four le32 values each, like so: - * 1st: 0xb76561a1 0x11d47b65 0xe0003d9e 0x59421081 - * 2nd: 0xd2371438 0x11d4f3d2 0x6bc821a5 0x4597b179 - * Now, it is apparent why the 2nd object_id collates after the 1st: the - * first le32 value of the 1st object_id is less than the first le32 of - * the 2nd object_id. If the first le32 values of both object_ids were - * equal then the second le32 values would be compared, etc. - */ -enum { - COLLATION_BINARY = cpu_to_le32(0x00), - COLLATION_FILE_NAME = cpu_to_le32(0x01), - COLLATION_UNICODE_STRING = cpu_to_le32(0x02), - COLLATION_NTOFS_ULONG = cpu_to_le32(0x10), - COLLATION_NTOFS_SID = cpu_to_le32(0x11), - COLLATION_NTOFS_SECURITY_HASH = cpu_to_le32(0x12), - COLLATION_NTOFS_ULONGS = cpu_to_le32(0x13), -}; - -typedef le32 COLLATION_RULE; - -/* - * The flags (32-bit) describing attribute properties in the attribute - * definition structure. FIXME: This information is based on Regis's - * information and, according to him, it is not certain and probably - * incomplete. The INDEXABLE flag is fairly certainly correct as only the file - * name attribute has this flag set and this is the only attribute indexed in - * NT4. - */ -enum { - ATTR_DEF_INDEXABLE = cpu_to_le32(0x02), /* Attribute can be - indexed. */ - ATTR_DEF_MULTIPLE = cpu_to_le32(0x04), /* Attribute type - can be present multiple times in the - mft records of an inode. */ - ATTR_DEF_NOT_ZERO = cpu_to_le32(0x08), /* Attribute value - must contain at least one non-zero - byte. */ - ATTR_DEF_INDEXED_UNIQUE = cpu_to_le32(0x10), /* Attribute must be - indexed and the attribute value must be - unique for the attribute type in all of - the mft records of an inode. */ - ATTR_DEF_NAMED_UNIQUE = cpu_to_le32(0x20), /* Attribute must be - named and the name must be unique for - the attribute type in all of the mft - records of an inode. */ - ATTR_DEF_RESIDENT = cpu_to_le32(0x40), /* Attribute must be - resident. */ - ATTR_DEF_ALWAYS_LOG = cpu_to_le32(0x80), /* Always log - modifications to this attribute, - regardless of whether it is resident or - non-resident. Without this, only log - modifications if the attribute is - resident. */ -}; - -typedef le32 ATTR_DEF_FLAGS; - -/* - * The data attribute of FILE_AttrDef contains a sequence of attribute - * definitions for the NTFS volume. With this, it is supposed to be safe for an - * older NTFS driver to mount a volume containing a newer NTFS version without - * damaging it (that's the theory. In practice it's: not damaging it too much). - * Entries are sorted by attribute type. 
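/*
 * Illustrative sketch (not part of the original header): the
 * COLLATION_NTOFS_ULONGS comparison described above, treating a 16-byte
 * object_id as four little-endian 32-bit keys compared in order. Applied
 * to the two on-disk values in the example it returns a negative result,
 * i.e. the 1st object_id collates before the 2nd.
 */
#include <stdint.h>

static int ntofs_ulongs_cmp(const uint8_t a[16], const uint8_t b[16])
{
	int i;

	for (i = 0; i < 4; i++) {
		/* Assemble each le32 key byte by byte (endian-neutral). */
		uint32_t ka = (uint32_t)a[4 * i] | (uint32_t)a[4 * i + 1] << 8 |
			      (uint32_t)a[4 * i + 2] << 16 |
			      (uint32_t)a[4 * i + 3] << 24;
		uint32_t kb = (uint32_t)b[4 * i] | (uint32_t)b[4 * i + 1] << 8 |
			      (uint32_t)b[4 * i + 2] << 16 |
			      (uint32_t)b[4 * i + 3] << 24;

		if (ka != kb)
			return ka < kb ? -1 : 1;
	}
	return 0;
}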
The flags describe whether the - * attribute can be resident/non-resident and possibly other things, but the - * actual bits are unknown. - */ -typedef struct { -/*hex ofs*/ -/* 0*/ ntfschar name[0x40]; /* Unicode name of the attribute. Zero - terminated. */ -/* 80*/ ATTR_TYPE type; /* Type of the attribute. */ -/* 84*/ le32 display_rule; /* Default display rule. - FIXME: What does it mean? (AIA) */ -/* 88*/ COLLATION_RULE collation_rule; /* Default collation rule. */ -/* 8c*/ ATTR_DEF_FLAGS flags; /* Flags describing the attribute. */ -/* 90*/ sle64 min_size; /* Optional minimum attribute size. */ -/* 98*/ sle64 max_size; /* Maximum size of attribute. */ -/* sizeof() = 0xa0 or 160 bytes */ -} __attribute__ ((__packed__)) ATTR_DEF; - -/* - * Attribute flags (16-bit). - */ -enum { - ATTR_IS_COMPRESSED = cpu_to_le16(0x0001), - ATTR_COMPRESSION_MASK = cpu_to_le16(0x00ff), /* Compression method - mask. Also, first - illegal value. */ - ATTR_IS_ENCRYPTED = cpu_to_le16(0x4000), - ATTR_IS_SPARSE = cpu_to_le16(0x8000), -} __attribute__ ((__packed__)); - -typedef le16 ATTR_FLAGS; - -/* - * Attribute compression. - * - * Only the data attribute is ever compressed in the current ntfs driver in - * Windows. Further, compression is only applied when the data attribute is - * non-resident. Finally, to use compression, the maximum allowed cluster size - * on a volume is 4kib. - * - * The compression method is based on independently compressing blocks of X - * clusters, where X is determined from the compression_unit value found in the - * non-resident attribute record header (more precisely: X = 2^compression_unit - * clusters). On Windows NT/2k, X always is 16 clusters (compression_unit = 4). - * - * There are three different cases of how a compression block of X clusters - * can be stored: - * - * 1) The data in the block is all zero (a sparse block): - * This is stored as a sparse block in the runlist, i.e. the runlist - * entry has length = X and lcn = -1. The mapping pairs array actually - * uses a delta_lcn value length of 0, i.e. delta_lcn is not present at - * all, which is then interpreted by the driver as lcn = -1. - * NOTE: Even uncompressed files can be sparse on NTFS 3.0 volumes, then - * the same principles apply as above, except that the length is not - * restricted to being any particular value. - * - * 2) The data in the block is not compressed: - * This happens when compression doesn't reduce the size of the block - * in clusters. I.e. if compression has a small effect so that the - * compressed data still occupies X clusters, then the uncompressed data - * is stored in the block. - * This case is recognised by the fact that the runlist entry has - * length = X and lcn >= 0. The mapping pairs array stores this as - * normal with a run length of X and some specific delta_lcn, i.e. - * delta_lcn has to be present. - * - * 3) The data in the block is compressed: - * The common case. This case is recognised by the fact that the run - * list entry has length L < X and lcn >= 0. The mapping pairs array - * stores this as normal with a run length of X and some specific - * delta_lcn, i.e. delta_lcn has to be present. This runlist entry is - * immediately followed by a sparse entry with length = X - L and - * lcn = -1. The latter entry is to make up the vcn counting to the - * full compression block size X. - * - * In fact, life is more complicated because adjacent entries of the same type - * can be coalesced. 
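/*
 * Illustrative sketch (not part of the original header): classifying one
 * compression block of X clusters according to the three cases above,
 * given its first run (len/lcn, with lcn == -1 denoting a sparse run) and
 * the number of sparse clusters immediately following it. The coalescing
 * complications discussed in the continuation of this comment are
 * deliberately ignored here.
 */
enum cb_state { CB_SPARSE, CB_UNCOMPRESSED, CB_COMPRESSED, CB_UNKNOWN };

static enum cb_state classify_cb(long long len, long long lcn,
				 long long sparse_after, long long X)
{
	if (lcn == -1 && len >= X)
		return CB_SPARSE;	/* Case 1: all-zero block. */
	if (lcn >= 0 && len >= X)
		return CB_UNCOMPRESSED;	/* Case 2: compression did not help. */
	if (lcn >= 0 && len < X && sparse_after >= X - len)
		return CB_COMPRESSED;	/* Case 3: L data + (X - L) sparse. */
	return CB_UNKNOWN;		/* Need to look at further runs. */
}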
This means that one has to keep track of the number of - * clusters handled and work on a basis of X clusters at a time being one - * block. An example: if length L > X this means that this particular runlist - * entry contains a block of length X and part of one or more blocks of length - * L - X. Another example: if length L < X, this does not necessarily mean that - * the block is compressed as it might be that the lcn changes inside the block - * and hence the following runlist entry describes the continuation of the - * potentially compressed block. The block would be compressed if the - * following runlist entry describes at least X - L sparse clusters, thus - * making up the compression block length as described in point 3 above. (Of - * course, there can be several runlist entries with small lengths so that the - * sparse entry does not follow the first data containing entry with - * length < X.) - * - * NOTE: At the end of the compressed attribute value, there most likely is not - * just the right amount of data to make up a compression block, thus this data - * is not even attempted to be compressed. It is just stored as is, unless - * the number of clusters it occupies is reduced when compressed in which case - * it is stored as a compressed compression block, complete with sparse - * clusters at the end. - */ - -/* - * Flags of resident attributes (8-bit). - */ -enum { - RESIDENT_ATTR_IS_INDEXED = 0x01, /* Attribute is referenced in an index - (has implications for deleting and - modifying the attribute). */ -} __attribute__ ((__packed__)); - -typedef u8 RESIDENT_ATTR_FLAGS; - -/* - * Attribute record header. Always aligned to 8-byte boundary. - */ -typedef struct { -/*Ofs*/ -/* 0*/ ATTR_TYPE type; /* The (32-bit) type of the attribute. */ -/* 4*/ le32 length; /* Byte size of the resident part of the - attribute (aligned to 8-byte boundary). - Used to get to the next attribute. */ -/* 8*/ u8 non_resident; /* If 0, attribute is resident. - If 1, attribute is non-resident. */ -/* 9*/ u8 name_length; /* Unicode character size of name of attribute. - 0 if unnamed. */ -/* 10*/ le16 name_offset; /* If name_length != 0, the byte offset to the - beginning of the name from the attribute - record. Note that the name is stored as a - Unicode string. When creating, place offset - just at the end of the record header. Then, - follow with attribute value or mapping pairs - array, resident and non-resident attributes - respectively, aligning to an 8-byte - boundary. */ -/* 12*/ ATTR_FLAGS flags; /* Flags describing the attribute. */ -/* 14*/ le16 instance; /* The instance of this attribute record. This - number is unique within this mft record (see - MFT_RECORD/next_attribute_instance notes in - mft.h for more details). */ -/* 16*/ union { - /* Resident attributes. */ - struct { -/* 16 */ le32 value_length;/* Byte size of attribute value. */ -/* 20 */ le16 value_offset;/* Byte offset of the attribute - value from the start of the - attribute record. When creating, - align to 8-byte boundary if we - have a name present as this might - not have a length of a multiple - of 8-bytes. */ -/* 22 */ RESIDENT_ATTR_FLAGS flags; /* See above. */ -/* 23 */ s8 reserved; /* Reserved/alignment to 8-byte - boundary. */ - } __attribute__ ((__packed__)) resident; - /* Non-resident attributes. */ - struct { -/* 16*/ leVCN lowest_vcn;/* Lowest valid virtual cluster number - for this portion of the attribute value or - 0 if this is the only extent (usually the - case). 
- Only when an attribute list is used - does lowest_vcn != 0 ever occur. */ -/* 24*/ leVCN highest_vcn;/* Highest valid vcn of this extent of - the attribute value. - Usually there is only one - portion, so this usually equals the attribute - value size in clusters minus 1. Can be -1 for - zero length files. Can be 0 for "single extent" - attributes. */ -/* 32*/ le16 mapping_pairs_offset; /* Byte offset from the - beginning of the structure to the mapping pairs - array which contains the mappings between the - vcns and the logical cluster numbers (lcns). - When creating, place this at the end of this - record header aligned to 8-byte boundary. */ -/* 34*/ u8 compression_unit; /* The compression unit expressed - as the log to the base 2 of the number of - clusters in a compression unit. 0 means not - compressed. (This effectively limits the - compression unit size to be a power of two - clusters.) WinNT4 only uses a value of 4. - Sparse files have this set to 0 on XPSP2. */ -/* 35*/ u8 reserved[5]; /* Align to 8-byte boundary. */ -/* The sizes below are only used when lowest_vcn is zero, as otherwise it would - be difficult to keep them up-to-date.*/ -/* 40*/ sle64 allocated_size; /* Byte size of disk space - allocated to hold the attribute value. Always - is a multiple of the cluster size. When a file - is compressed, this field is a multiple of the - compression block size (2^compression_unit) and - it represents the logically allocated space - rather than the actual on disk usage. For this - use the compressed_size (see below). */ -/* 48*/ sle64 data_size; /* Byte size of the attribute - value. Can be larger than allocated_size if - attribute value is compressed or sparse. */ -/* 56*/ sle64 initialized_size; /* Byte size of initialized - portion of the attribute value. Usually equals - data_size. */ -/* sizeof(uncompressed attr) = 64*/ -/* 64*/ sle64 compressed_size; /* Byte size of the attribute - value after compression. Only present when - compressed or sparse. Always is a multiple of - the cluster size. Represents the actual amount - of disk space being used on the disk. */ -/* sizeof(compressed attr) = 72*/ - } __attribute__ ((__packed__)) non_resident; - } __attribute__ ((__packed__)) data; -} __attribute__ ((__packed__)) ATTR_RECORD; - -typedef ATTR_RECORD ATTR_REC; - -/* - * File attribute flags (32-bit) appearing in the file_attributes fields of the - * STANDARD_INFORMATION attribute of MFT_RECORDs and the FILENAME_ATTR - * attributes of MFT_RECORDs and directory index entries. - * - * All of the below flags appear in the directory index entries but only some - * appear in the STANDARD_INFORMATION attribute whilst only some others appear - * in the FILENAME_ATTR attribute of MFT_RECORDs. Unless otherwise stated the - * flags appear in all of the above. - */ -enum { - FILE_ATTR_READONLY = cpu_to_le32(0x00000001), - FILE_ATTR_HIDDEN = cpu_to_le32(0x00000002), - FILE_ATTR_SYSTEM = cpu_to_le32(0x00000004), - /* Old DOS volid. Unused in NT. = cpu_to_le32(0x00000008), */ - - FILE_ATTR_DIRECTORY = cpu_to_le32(0x00000010), - /* Note, FILE_ATTR_DIRECTORY is not considered valid in NT. It is - reserved for the DOS SUBDIRECTORY flag. 
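/*
 * Illustrative sketch (not part of the original header): walking the
 * attribute records of an in-memory, already fixed-up and sanity-checked
 * mft record via attrs_offset, type and length, stopping at AT_END.
 * Little-endian host assumed; the offsets follow the MFT_RECORD and
 * ATTR_RECORD layouts above.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void walk_attrs(const uint8_t *m, uint32_t rec_size)
{
	uint16_t attrs_offset;
	uint32_t ofs, type, length;

	memcpy(&attrs_offset, m + 20, 2);	/* MFT_RECORD.attrs_offset */
	for (ofs = attrs_offset; ofs + 8 <= rec_size; ofs += length) {
		memcpy(&type, m + ofs, 4);	/* ATTR_RECORD.type */
		memcpy(&length, m + ofs + 4, 4);/* ATTR_RECORD.length */
		if (type == 0xffffffffu)	/* AT_END terminates the list. */
			break;
		if (!length || length % 8)	/* Records are 8-byte aligned. */
			break;
		printf("attribute type 0x%x, %u bytes\n",
		       (unsigned)type, (unsigned)length);
	}
}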
*/ - FILE_ATTR_ARCHIVE = cpu_to_le32(0x00000020), - FILE_ATTR_DEVICE = cpu_to_le32(0x00000040), - FILE_ATTR_NORMAL = cpu_to_le32(0x00000080), - - FILE_ATTR_TEMPORARY = cpu_to_le32(0x00000100), - FILE_ATTR_SPARSE_FILE = cpu_to_le32(0x00000200), - FILE_ATTR_REPARSE_POINT = cpu_to_le32(0x00000400), - FILE_ATTR_COMPRESSED = cpu_to_le32(0x00000800), - - FILE_ATTR_OFFLINE = cpu_to_le32(0x00001000), - FILE_ATTR_NOT_CONTENT_INDEXED = cpu_to_le32(0x00002000), - FILE_ATTR_ENCRYPTED = cpu_to_le32(0x00004000), - - FILE_ATTR_VALID_FLAGS = cpu_to_le32(0x00007fb7), - /* Note, FILE_ATTR_VALID_FLAGS masks out the old DOS VolId and the - FILE_ATTR_DEVICE and preserves everything else. This mask is used - to obtain all flags that are valid for reading. */ - FILE_ATTR_VALID_SET_FLAGS = cpu_to_le32(0x000031a7), - /* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the - F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT, - F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask - is used to obtain all flags that are valid for setting. */ - /* - * The flag FILE_ATTR_DUP_FILENAME_INDEX_PRESENT is present in all - * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION - * attribute of an mft record. - */ - FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT = cpu_to_le32(0x10000000), - /* Note, this is a copy of the corresponding bit from the mft record, - telling us whether this is a directory or not, i.e. whether it has - an index root attribute or not. */ - FILE_ATTR_DUP_VIEW_INDEX_PRESENT = cpu_to_le32(0x20000000), - /* Note, this is a copy of the corresponding bit from the mft record, - telling us whether this file has a view index present (eg. object id - index, quota index, one of the security indexes or the encrypting - filesystem related indexes). */ -}; - -typedef le32 FILE_ATTR_FLAGS; - -/* - * NOTE on times in NTFS: All times are in MS standard time format, i.e. they - * are the number of 100-nanosecond intervals since 1st January 1601, 00:00:00 - * universal coordinated time (UTC). (In Linux time starts 1st January 1970, - * 00:00:00 UTC and is stored as the number of 1-second intervals since then.) - */ - -/* - * Attribute: Standard information (0x10). - * - * NOTE: Always resident. - * NOTE: Present in all base file records on a volume. - * NOTE: There is conflicting information about the meaning of each of the time - * fields but the meaning as defined below has been verified to be - * correct by practical experimentation on Windows NT4 SP6a and is hence - * assumed to be the one and only correct interpretation. - */ -typedef struct { -/*Ofs*/ -/* 0*/ sle64 creation_time; /* Time file was created. Updated when - a filename is changed(?). */ -/* 8*/ sle64 last_data_change_time; /* Time the data attribute was last - modified. */ -/* 16*/ sle64 last_mft_change_time; /* Time this mft record was last - modified. */ -/* 24*/ sle64 last_access_time; /* Approximate time when the file was - last accessed (obviously this is not - updated on read-only volumes). In - Windows this is only updated when - accessed if some time delta has - passed since the last update. Also, - last access time updates can be - disabled altogether for speed. */ -/* 32*/ FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */ -/* 36*/ union { - /* NTFS 1.2 */ - struct { - /* 36*/ u8 reserved12[12]; /* Reserved/alignment to 8-byte - boundary. 
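/*
 * Illustrative sketch (not part of the original header): converting
 * between the MS time format in the NOTE above and Unix time. The
 * constant 11644473600 is the number of seconds between 1601-01-01 and
 * 1970-01-01, both 00:00:00 UTC; sub-second precision is dropped.
 */
#include <stdint.h>
#include <time.h>

#define EPOCH_DELTA_SEC 11644473600LL	/* 1601-01-01 .. 1970-01-01. */

static time_t ntfs_time_to_unix(int64_t t)	/* 100ns units since 1601. */
{
	return (time_t)(t / 10000000 - EPOCH_DELTA_SEC);
}

static int64_t unix_time_to_ntfs(time_t t)
{
	return ((int64_t)t + EPOCH_DELTA_SEC) * 10000000;
}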
*/ - } __attribute__ ((__packed__)) v1; - /* sizeof() = 48 bytes */ - /* NTFS 3.x */ - struct { -/* - * If a volume has been upgraded from a previous NTFS version, then these - * fields are present only if the file has been accessed since the upgrade. - * Recognize the difference by comparing the length of the resident attribute - * value. If it is 48, then the following fields are missing. If it is 72 then - * the fields are present. Maybe just check like this: - * if (resident.ValueLength < sizeof(STANDARD_INFORMATION)) { - * Assume NTFS 1.2- format. - * If (volume version is 3.x) - * Upgrade attribute to NTFS 3.x format. - * else - * Use NTFS 1.2- format for access. - * } else - * Use NTFS 3.x format for access. - * The only problem is that it might be legal to set the length of the value to - * arbitrarily large values thus spoiling this check. - But chkdsk probably - * views that as a corruption, assuming that it behaves like this for all - * attributes. - */ - /* 36*/ le32 maximum_versions; /* Maximum allowed versions for - file. Zero if version numbering is disabled. */ - /* 40*/ le32 version_number; /* This file's version (if any). - Set to zero if maximum_versions is zero. */ - /* 44*/ le32 class_id; /* Class id from bidirectional - class id index (?). */ - /* 48*/ le32 owner_id; /* Owner_id of the user owning - the file. Translate via $Q index in FILE_Extend - /$Quota to the quota control entry for the user - owning the file. Zero if quotas are disabled. */ - /* 52*/ le32 security_id; /* Security_id for the file. - Translate via $SII index and $SDS data stream - in FILE_Secure to the security descriptor. */ - /* 56*/ le64 quota_charged; /* Byte size of the charge to - the quota for all streams of the file. Note: Is - zero if quotas are disabled. */ - /* 64*/ leUSN usn; /* Last update sequence number - of the file. This is a direct index into the - transaction log file ($UsnJrnl). It is zero if - the usn journal is disabled or this file has - not been subject to logging yet. See usnjrnl.h - for details. */ - } __attribute__ ((__packed__)) v3; - /* sizeof() = 72 bytes (NTFS 3.x) */ - } __attribute__ ((__packed__)) ver; -} __attribute__ ((__packed__)) STANDARD_INFORMATION; - -/* - * Attribute: Attribute list (0x20). - * - * - Can be either resident or non-resident. - * - Value consists of a sequence of variable length, 8-byte aligned, - * ATTR_LIST_ENTRY records. - * - The list is not terminated by anything at all! The only way to know when - * the end is reached is to keep track of the current offset and compare it to - * the attribute value size. - * - The attribute list attribute contains one entry for each attribute of - * the file in which the list is located, except for the list attribute - * itself. The list is sorted: first by attribute type, second by attribute - * name (if present), third by instance number. The extents of one - * non-resident attribute (if present) immediately follow after the initial - * extent. They are ordered by lowest_vcn and have their instance set to zero. - * It is not allowed to have two attributes with all sorting keys equal. - * - Further restrictions: - * - If not resident, the vcn to lcn mapping array has to fit inside the - * base mft record. - * - The attribute list attribute value has a maximum size of 256kb. This - * is imposed by the Windows cache manager.
- * - Attribute lists are only used when the attributes of an mft record do not - * fit inside the mft record despite all attributes (that can be made - * non-resident) having been made non-resident. This can happen e.g. when: - * - File has a large number of hard links (lots of file name - * attributes present). - * - The mapping pairs array of some non-resident attribute becomes so - * large due to fragmentation that it overflows the mft record. - * - The security descriptor is very complex (not applicable to - * NTFS 3.0 volumes). - * - There are many named streams. - */ -typedef struct { -/*Ofs*/ -/* 0*/ ATTR_TYPE type; /* Type of referenced attribute. */ -/* 4*/ le16 length; /* Byte size of this entry (8-byte aligned). */ -/* 6*/ u8 name_length; /* Size in Unicode chars of the name of the - attribute or 0 if unnamed. */ -/* 7*/ u8 name_offset; /* Byte offset to beginning of attribute name - (always set this to where the name would - start even if unnamed). */ -/* 8*/ leVCN lowest_vcn; /* Lowest virtual cluster number of this portion - of the attribute value. This is usually 0. It - is non-zero for the case where one attribute - does not fit into one mft record and thus - several mft records are allocated to hold - this attribute. In the latter case, each mft - record holds one extent of the attribute and - there is one attribute list entry for each - extent. NOTE: This is DEFINITELY a signed - value! The windows driver uses cmp, followed - by jg when comparing this, thus it treats it - as signed. */ -/* 16*/ leMFT_REF mft_reference;/* The reference of the mft record holding - the ATTR_RECORD for this portion of the - attribute value. */ -/* 24*/ le16 instance; /* If lowest_vcn = 0, the instance of the - attribute being referenced; otherwise 0. */ -/* 26*/ ntfschar name[0]; /* Use when creating only. When reading use - name_offset to determine the location of the - name. */ -/* sizeof() = 26 + (attribute_name_length * 2) bytes */ -} __attribute__ ((__packed__)) ATTR_LIST_ENTRY; - -/* - * The maximum allowed length for a file name. - */ -#define MAXIMUM_FILE_NAME_LENGTH 255 - -/* - * Possible namespaces for filenames in ntfs (8-bit). - */ -enum { - FILE_NAME_POSIX = 0x00, - /* This is the largest namespace. It is case sensitive and allows all - Unicode characters except for: '\0' and '/'. Beware that in - WinNT/2k/2003 by default files which eg have the same name except - for their case will not be distinguished by the standard utilities - and thus a "del filename" will delete both "filename" and "fileName" - without warning. However if for example Services For Unix (SFU) are - installed and the case sensitive option was enabled at installation - time, then you can create/access/delete such files. - Note that even SFU places restrictions on the filenames beyond the - '\0' and '/' and in particular the following set of characters is - not allowed: '"', '/', '<', '>', '\'. All other characters, - including the ones not allowed in the WIN32 namespace, are allowed. - Tested with SFU 3.5 (this is now free) running on Windows XP. */ - FILE_NAME_WIN32 = 0x01, - /* The standard WinNT/2k NTFS long filenames. Case insensitive. All - Unicode chars except: '\0', '"', '*', '/', ':', '<', '>', '?', '\', - and '|'. Further, names cannot end with a '.' or a space. */ - FILE_NAME_DOS = 0x02, - /* The standard DOS filenames (8.3 format). Uppercase only. All 8-bit - characters greater than space, except: '"', '*', '+', ',', '/', ':', ';', - '<', '=', '>', '?', and '\'.
*/ - FILE_NAME_WIN32_AND_DOS = 0x03, - /* 3 means that both the Win32 and the DOS filenames are identical and - hence have been saved in this single filename record. */ -} __attribute__ ((__packed__)); - -typedef u8 FILE_NAME_TYPE_FLAGS; - -/* - * Attribute: Filename (0x30). - * - * NOTE: Always resident. - * NOTE: All fields, except the parent_directory, are only updated when the - * filename is changed. Until then, they just become out of sync with - * reality and the more up to date values are present in the standard - * information attribute. - * NOTE: There is conflicting information about the meaning of each of the time - * fields but the meaning as defined below has been verified to be - * correct by practical experimentation on Windows NT4 SP6a and is hence - * assumed to be the one and only correct interpretation. - */ -typedef struct { -/*hex ofs*/ -/* 0*/ leMFT_REF parent_directory; /* Directory this filename is - referenced from. */ -/* 8*/ sle64 creation_time; /* Time file was created. */ -/* 10*/ sle64 last_data_change_time; /* Time the data attribute was last - modified. */ -/* 18*/ sle64 last_mft_change_time; /* Time this mft record was last - modified. */ -/* 20*/ sle64 last_access_time; /* Time this mft record was last - accessed. */ -/* 28*/ sle64 allocated_size; /* Byte size of on-disk allocated space - for the unnamed data attribute. So - for normal $DATA, this is the - allocated_size from the unnamed - $DATA attribute and for compressed - and/or sparse $DATA, this is the - compressed_size from the unnamed - $DATA attribute. For a directory or - other inode without an unnamed $DATA - attribute, this is always 0. NOTE: - This is a multiple of the cluster - size. */ -/* 30*/ sle64 data_size; /* Byte size of actual data in unnamed - data attribute. For a directory or - other inode without an unnamed $DATA - attribute, this is always 0. */ -/* 38*/ FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */ -/* 3c*/ union { - /* 3c*/ struct { - /* 3c*/ le16 packed_ea_size; /* Size of the buffer needed to - pack the extended attributes - (EAs), if such are present.*/ - /* 3e*/ le16 reserved; /* Reserved for alignment. */ - } __attribute__ ((__packed__)) ea; - /* 3c*/ struct { - /* 3c*/ le32 reparse_point_tag; /* Type of reparse point, - present only in reparse - points and only if there are - no EAs. */ - } __attribute__ ((__packed__)) rp; - } __attribute__ ((__packed__)) type; -/* 40*/ u8 file_name_length; /* Length of file name in - (Unicode) characters. */ -/* 41*/ FILE_NAME_TYPE_FLAGS file_name_type; /* Namespace of the file name.*/ -/* 42*/ ntfschar file_name[0]; /* File name in Unicode. */ -} __attribute__ ((__packed__)) FILE_NAME_ATTR; - -/* - * GUID structures store globally unique identifiers (GUID). A GUID is a - * 128-bit value consisting of one group of eight hexadecimal digits, followed - * by three groups of four hexadecimal digits each, followed by one group of - * twelve hexadecimal digits. GUIDs are Microsoft's implementation of the - * distributed computing environment (DCE) universally unique identifier (UUID). - * Example of a GUID: - * 1F010768-5A73-BC91-0010A52216A7 - */ -typedef struct { - le32 data1; /* The first eight hexadecimal digits of the GUID. */ - le16 data2; /* The first group of four hexadecimal digits. */ - le16 data3; /* The second group of four hexadecimal digits. */ - u8 data4[8]; /* The first two bytes are the third group of four - hexadecimal digits. The remaining six bytes are the - final 12 hexadecimal digits. 
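/*
 * Illustrative sketch (not part of the original header): printing the
 * GUID structure above in the conventional 8-4-4-4-12 textual form.
 * Little-endian host assumed so the le32/le16 members can be used
 * directly; data4 is taken byte by byte as described.
 */
#include <stdint.h>
#include <stdio.h>

struct sketch_guid {
	uint32_t data1;
	uint16_t data2;
	uint16_t data3;
	uint8_t data4[8];
};

static void print_guid(const struct sketch_guid *g)
{
	printf("%08X-%04X-%04X-%02X%02X-%02X%02X%02X%02X%02X%02X\n",
	       (unsigned)g->data1, g->data2, g->data3,
	       g->data4[0], g->data4[1], g->data4[2], g->data4[3],
	       g->data4[4], g->data4[5], g->data4[6], g->data4[7]);
}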
*/ -} __attribute__ ((__packed__)) GUID; - -/* - * FILE_Extend/$ObjId contains an index named $O. This index contains all - * object_ids present on the volume as the index keys and the corresponding - * mft_record numbers as the index entry data parts. The data part (defined - * below) also contains three other object_ids: - * birth_volume_id - object_id of FILE_Volume on which the file was first - * created. Optional (i.e. can be zero). - * birth_object_id - object_id of file when it was first created. Usually - * equals the object_id. Optional (i.e. can be zero). - * domain_id - Reserved (always zero). - */ -typedef struct { - leMFT_REF mft_reference;/* Mft record containing the object_id in - the index entry key. */ - union { - struct { - GUID birth_volume_id; - GUID birth_object_id; - GUID domain_id; - } __attribute__ ((__packed__)) origin; - u8 extended_info[48]; - } __attribute__ ((__packed__)) opt; -} __attribute__ ((__packed__)) OBJ_ID_INDEX_DATA; - -/* - * Attribute: Object id (NTFS 3.0+) (0x40). - * - * NOTE: Always resident. - */ -typedef struct { - GUID object_id; /* Unique id assigned to the - file.*/ - /* The following fields are optional. The attribute value size is 16 - bytes, i.e. sizeof(GUID), if these are not present at all. Note, - the entries can be present but one or more (or all) can be zero - meaning that that particular value(s) is(are) not defined. */ - union { - struct { - GUID birth_volume_id; /* Unique id of volume on which - the file was first created.*/ - GUID birth_object_id; /* Unique id of file when it was - first created. */ - GUID domain_id; /* Reserved, zero. */ - } __attribute__ ((__packed__)) origin; - u8 extended_info[48]; - } __attribute__ ((__packed__)) opt; -} __attribute__ ((__packed__)) OBJECT_ID_ATTR; - -/* - * The pre-defined IDENTIFIER_AUTHORITIES used as SID_IDENTIFIER_AUTHORITY in - * the SID structure (see below). - */ -//typedef enum { /* SID string prefix. */ -// SECURITY_NULL_SID_AUTHORITY = {0, 0, 0, 0, 0, 0}, /* S-1-0 */ -// SECURITY_WORLD_SID_AUTHORITY = {0, 0, 0, 0, 0, 1}, /* S-1-1 */ -// SECURITY_LOCAL_SID_AUTHORITY = {0, 0, 0, 0, 0, 2}, /* S-1-2 */ -// SECURITY_CREATOR_SID_AUTHORITY = {0, 0, 0, 0, 0, 3}, /* S-1-3 */ -// SECURITY_NON_UNIQUE_AUTHORITY = {0, 0, 0, 0, 0, 4}, /* S-1-4 */ -// SECURITY_NT_SID_AUTHORITY = {0, 0, 0, 0, 0, 5}, /* S-1-5 */ -//} IDENTIFIER_AUTHORITIES; - -/* - * These relative identifiers (RIDs) are used with the above identifier - * authorities to make up universal well-known SIDs. - * - * Note: The relative identifier (RID) refers to the portion of a SID, which - * identifies a user or group in relation to the authority that issued the SID. - * For example, the universal well-known SID Creator Owner ID (S-1-3-0) is - * made up of the identifier authority SECURITY_CREATOR_SID_AUTHORITY (3) and - * the relative identifier SECURITY_CREATOR_OWNER_RID (0). - */ -typedef enum { /* Identifier authority. 
*/ - SECURITY_NULL_RID = 0, /* S-1-0 */ - SECURITY_WORLD_RID = 0, /* S-1-1 */ - SECURITY_LOCAL_RID = 0, /* S-1-2 */ - - SECURITY_CREATOR_OWNER_RID = 0, /* S-1-3 */ - SECURITY_CREATOR_GROUP_RID = 1, /* S-1-3 */ - - SECURITY_CREATOR_OWNER_SERVER_RID = 2, /* S-1-3 */ - SECURITY_CREATOR_GROUP_SERVER_RID = 3, /* S-1-3 */ - - SECURITY_DIALUP_RID = 1, - SECURITY_NETWORK_RID = 2, - SECURITY_BATCH_RID = 3, - SECURITY_INTERACTIVE_RID = 4, - SECURITY_SERVICE_RID = 6, - SECURITY_ANONYMOUS_LOGON_RID = 7, - SECURITY_PROXY_RID = 8, - SECURITY_ENTERPRISE_CONTROLLERS_RID=9, - SECURITY_SERVER_LOGON_RID = 9, - SECURITY_PRINCIPAL_SELF_RID = 0xa, - SECURITY_AUTHENTICATED_USER_RID = 0xb, - SECURITY_RESTRICTED_CODE_RID = 0xc, - SECURITY_TERMINAL_SERVER_RID = 0xd, - - SECURITY_LOGON_IDS_RID = 5, - SECURITY_LOGON_IDS_RID_COUNT = 3, - - SECURITY_LOCAL_SYSTEM_RID = 0x12, - - SECURITY_NT_NON_UNIQUE = 0x15, - - SECURITY_BUILTIN_DOMAIN_RID = 0x20, - - /* - * Well-known domain relative sub-authority values (RIDs). - */ - - /* Users. */ - DOMAIN_USER_RID_ADMIN = 0x1f4, - DOMAIN_USER_RID_GUEST = 0x1f5, - DOMAIN_USER_RID_KRBTGT = 0x1f6, - - /* Groups. */ - DOMAIN_GROUP_RID_ADMINS = 0x200, - DOMAIN_GROUP_RID_USERS = 0x201, - DOMAIN_GROUP_RID_GUESTS = 0x202, - DOMAIN_GROUP_RID_COMPUTERS = 0x203, - DOMAIN_GROUP_RID_CONTROLLERS = 0x204, - DOMAIN_GROUP_RID_CERT_ADMINS = 0x205, - DOMAIN_GROUP_RID_SCHEMA_ADMINS = 0x206, - DOMAIN_GROUP_RID_ENTERPRISE_ADMINS= 0x207, - DOMAIN_GROUP_RID_POLICY_ADMINS = 0x208, - - /* Aliases. */ - DOMAIN_ALIAS_RID_ADMINS = 0x220, - DOMAIN_ALIAS_RID_USERS = 0x221, - DOMAIN_ALIAS_RID_GUESTS = 0x222, - DOMAIN_ALIAS_RID_POWER_USERS = 0x223, - - DOMAIN_ALIAS_RID_ACCOUNT_OPS = 0x224, - DOMAIN_ALIAS_RID_SYSTEM_OPS = 0x225, - DOMAIN_ALIAS_RID_PRINT_OPS = 0x226, - DOMAIN_ALIAS_RID_BACKUP_OPS = 0x227, - - DOMAIN_ALIAS_RID_REPLICATOR = 0x228, - DOMAIN_ALIAS_RID_RAS_SERVERS = 0x229, - DOMAIN_ALIAS_RID_PREW2KCOMPACCESS = 0x22a, -} RELATIVE_IDENTIFIERS; - -/* - * The universal well-known SIDs: - * - * NULL_SID S-1-0-0 - * WORLD_SID S-1-1-0 - * LOCAL_SID S-1-2-0 - * CREATOR_OWNER_SID S-1-3-0 - * CREATOR_GROUP_SID S-1-3-1 - * CREATOR_OWNER_SERVER_SID S-1-3-2 - * CREATOR_GROUP_SERVER_SID S-1-3-3 - * - * (Non-unique IDs) S-1-4 - * - * NT well-known SIDs: - * - * NT_AUTHORITY_SID S-1-5 - * DIALUP_SID S-1-5-1 - * - * NETWORK_SID S-1-5-2 - * BATCH_SID S-1-5-3 - * INTERACTIVE_SID S-1-5-4 - * SERVICE_SID S-1-5-6 - * ANONYMOUS_LOGON_SID S-1-5-7 (aka null logon session) - * PROXY_SID S-1-5-8 - * SERVER_LOGON_SID S-1-5-9 (aka domain controller account) - * SELF_SID S-1-5-10 (self RID) - * AUTHENTICATED_USER_SID S-1-5-11 - * RESTRICTED_CODE_SID S-1-5-12 (running restricted code) - * TERMINAL_SERVER_SID S-1-5-13 (running on terminal server) - * - * (Logon IDs) S-1-5-5-X-Y - * - * (NT non-unique IDs) S-1-5-0x15-... - * - * (Built-in domain) S-1-5-0x20 - */ - -/* - * The SID_IDENTIFIER_AUTHORITY is a 48-bit value used in the SID structure. - * - * NOTE: This is stored as a big endian number, hence the high_part comes - * before the low_part. - */ -typedef union { - struct { - u16 high_part; /* High 16-bits. */ - u32 low_part; /* Low 32-bits. */ - } __attribute__ ((__packed__)) parts; - u8 value[6]; /* Value as individual bytes. */ -} __attribute__ ((__packed__)) SID_IDENTIFIER_AUTHORITY; - -/* - * The SID structure is a variable-length structure used to uniquely identify - * users or groups. SID stands for security identifier. - * - * The standard textual representation of the SID is of the form: - * S-R-I-S-S...
- * Where: - * - The first "S" is the literal character 'S' identifying the following - * digits as a SID. - * - R is the revision level of the SID expressed as a sequence of digits - * either in decimal or hexadecimal (if the latter, prefixed by "0x"). - * - I is the 48-bit identifier_authority, expressed as digits as R above. - * - S... is one or more sub_authority values, expressed as digits as above. - * - * Example SID: the domain-relative SID of the local Administrators group on - * Windows NT/2k: - * S-1-5-32-544 - * This translates to a SID with: - * revision = 1, - * sub_authority_count = 2, - * identifier_authority = {0,0,0,0,0,5}, // SECURITY_NT_AUTHORITY - * sub_authority[0] = 32, // SECURITY_BUILTIN_DOMAIN_RID - * sub_authority[1] = 544 // DOMAIN_ALIAS_RID_ADMINS - */ -typedef struct { - u8 revision; - u8 sub_authority_count; - SID_IDENTIFIER_AUTHORITY identifier_authority; - le32 sub_authority[1]; /* At least one sub_authority. */ -} __attribute__ ((__packed__)) SID; - -/* - * Current constants for SIDs. - */ -typedef enum { - SID_REVISION = 1, /* Current revision level. */ - SID_MAX_SUB_AUTHORITIES = 15, /* Maximum number of those. */ - SID_RECOMMENDED_SUB_AUTHORITIES = 1, /* Will change to around 6 in - a future revision. */ -} SID_CONSTANTS; - -/* - * The predefined ACE types (8-bit, see below). - */ -enum { - ACCESS_MIN_MS_ACE_TYPE = 0, - ACCESS_ALLOWED_ACE_TYPE = 0, - ACCESS_DENIED_ACE_TYPE = 1, - SYSTEM_AUDIT_ACE_TYPE = 2, - SYSTEM_ALARM_ACE_TYPE = 3, /* Not implemented as of Win2k. */ - ACCESS_MAX_MS_V2_ACE_TYPE = 3, - - ACCESS_ALLOWED_COMPOUND_ACE_TYPE= 4, - ACCESS_MAX_MS_V3_ACE_TYPE = 4, - - /* The following are Win2k only. */ - ACCESS_MIN_MS_OBJECT_ACE_TYPE = 5, - ACCESS_ALLOWED_OBJECT_ACE_TYPE = 5, - ACCESS_DENIED_OBJECT_ACE_TYPE = 6, - SYSTEM_AUDIT_OBJECT_ACE_TYPE = 7, - SYSTEM_ALARM_OBJECT_ACE_TYPE = 8, - ACCESS_MAX_MS_OBJECT_ACE_TYPE = 8, - - ACCESS_MAX_MS_V4_ACE_TYPE = 8, - - /* This one is for WinNT/2k. */ - ACCESS_MAX_MS_ACE_TYPE = 8, -} __attribute__ ((__packed__)); - -typedef u8 ACE_TYPES; - -/* - * The ACE flags (8-bit) for audit and inheritance (see below). - * - * SUCCESSFUL_ACCESS_ACE_FLAG is only used with system audit and alarm ACE - * types to indicate that a message is generated (in Windows!) for successful - * accesses. - * - * FAILED_ACCESS_ACE_FLAG is only used with system audit and alarm ACE types - * to indicate that a message is generated (in Windows!) for failed accesses. - */ -enum { - /* The inheritance flags. */ - OBJECT_INHERIT_ACE = 0x01, - CONTAINER_INHERIT_ACE = 0x02, - NO_PROPAGATE_INHERIT_ACE = 0x04, - INHERIT_ONLY_ACE = 0x08, - INHERITED_ACE = 0x10, /* Win2k only. */ - VALID_INHERIT_FLAGS = 0x1f, - - /* The audit flags. */ - SUCCESSFUL_ACCESS_ACE_FLAG = 0x40, - FAILED_ACCESS_ACE_FLAG = 0x80, -} __attribute__ ((__packed__)); - -typedef u8 ACE_FLAGS; - -/* - * An ACE is an access-control entry in an access-control list (ACL). - * An ACE defines access to an object for a specific user or group or defines - * the types of access that generate system-administration messages or alarms - * for a specific user or group. The user or group is identified by a security - * identifier (SID). - * - * Each ACE starts with an ACE_HEADER structure (aligned on 4-byte boundary), - * which specifies the type and size of the ACE. The format of the subsequent - * data depends on the ACE type. - */ -typedef struct { -/*Ofs*/ -/* 0*/ ACE_TYPES type; /* Type of the ACE. */ -/* 1*/ ACE_FLAGS flags; /* Flags describing the ACE.
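/*
 * Illustrative sketch (not part of the original header): rendering a raw
 * SID in the S-R-I-S-S... form described above. The identifier_authority
 * is stored big endian per the NOTE; sub_authority values are le32 and
 * assembled byte by byte, so the code is host-endian neutral. For
 * S-1-5-32-544 it prints exactly that string.
 */
#include <stdint.h>
#include <stdio.h>

static void print_sid(const uint8_t *sid)
{
	uint64_t auth = 0;
	unsigned int i, count = sid[1];		/* sub_authority_count */

	for (i = 0; i < 6; i++)			/* 48-bit big-endian value. */
		auth = auth << 8 | sid[2 + i];
	printf("S-%u-%llu", sid[0], (unsigned long long)auth);
	for (i = 0; i < count; i++) {
		const uint8_t *s = sid + 8 + 4 * i;
		uint32_t sub = (uint32_t)s[0] | (uint32_t)s[1] << 8 |
			       (uint32_t)s[2] << 16 | (uint32_t)s[3] << 24;

		printf("-%u", (unsigned)sub);
	}
	putchar('\n');
}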
*/ -/* 2*/ le16 size; /* Size in bytes of the ACE. */ -} __attribute__ ((__packed__)) ACE_HEADER; - -/* - * The access mask (32-bit). Defines the access rights. - * - * The specific rights (bits 0 to 15). These depend on the type of the object - * being secured by the ACE. - */ -enum { - /* Specific rights for files and directories are as follows: */ - - /* Right to read data from the file. (FILE) */ - FILE_READ_DATA = cpu_to_le32(0x00000001), - /* Right to list contents of a directory. (DIRECTORY) */ - FILE_LIST_DIRECTORY = cpu_to_le32(0x00000001), - - /* Right to write data to the file. (FILE) */ - FILE_WRITE_DATA = cpu_to_le32(0x00000002), - /* Right to create a file in the directory. (DIRECTORY) */ - FILE_ADD_FILE = cpu_to_le32(0x00000002), - - /* Right to append data to the file. (FILE) */ - FILE_APPEND_DATA = cpu_to_le32(0x00000004), - /* Right to create a subdirectory. (DIRECTORY) */ - FILE_ADD_SUBDIRECTORY = cpu_to_le32(0x00000004), - - /* Right to read extended attributes. (FILE/DIRECTORY) */ - FILE_READ_EA = cpu_to_le32(0x00000008), - - /* Right to write extended attributes. (FILE/DIRECTORY) */ - FILE_WRITE_EA = cpu_to_le32(0x00000010), - - /* Right to execute a file. (FILE) */ - FILE_EXECUTE = cpu_to_le32(0x00000020), - /* Right to traverse the directory. (DIRECTORY) */ - FILE_TRAVERSE = cpu_to_le32(0x00000020), - - /* - * Right to delete a directory and all the files it contains (its - * children), even if the files are read-only. (DIRECTORY) - */ - FILE_DELETE_CHILD = cpu_to_le32(0x00000040), - - /* Right to read file attributes. (FILE/DIRECTORY) */ - FILE_READ_ATTRIBUTES = cpu_to_le32(0x00000080), - - /* Right to change file attributes. (FILE/DIRECTORY) */ - FILE_WRITE_ATTRIBUTES = cpu_to_le32(0x00000100), - - /* - * The standard rights (bits 16 to 23). These are independent of the - * type of object being secured. - */ - - /* Right to delete the object. */ - DELETE = cpu_to_le32(0x00010000), - - /* - * Right to read the information in the object's security descriptor, - * not including the information in the SACL, i.e. right to read the - * security descriptor and owner. - */ - READ_CONTROL = cpu_to_le32(0x00020000), - - /* Right to modify the DACL in the object's security descriptor. */ - WRITE_DAC = cpu_to_le32(0x00040000), - - /* Right to change the owner in the object's security descriptor. */ - WRITE_OWNER = cpu_to_le32(0x00080000), - - /* - * Right to use the object for synchronization. Enables a process to - * wait until the object is in the signalled state. Some object types - * do not support this access right. - */ - SYNCHRONIZE = cpu_to_le32(0x00100000), - - /* - * The following STANDARD_RIGHTS_* are combinations of the above for - * convenience and are defined by the Win32 API. - */ - - /* These are currently defined to READ_CONTROL. */ - STANDARD_RIGHTS_READ = cpu_to_le32(0x00020000), - STANDARD_RIGHTS_WRITE = cpu_to_le32(0x00020000), - STANDARD_RIGHTS_EXECUTE = cpu_to_le32(0x00020000), - - /* Combines DELETE, READ_CONTROL, WRITE_DAC, and WRITE_OWNER access. */ - STANDARD_RIGHTS_REQUIRED = cpu_to_le32(0x000f0000), - - /* - * Combines DELETE, READ_CONTROL, WRITE_DAC, WRITE_OWNER, and - * SYNCHRONIZE access. - */ - STANDARD_RIGHTS_ALL = cpu_to_le32(0x001f0000), - - /* - * The access system ACL and maximum allowed access types (bits 24 to - * 25, bits 26 to 27 are reserved). - */ - ACCESS_SYSTEM_SECURITY = cpu_to_le32(0x01000000), - MAXIMUM_ALLOWED = cpu_to_le32(0x02000000), - - /* - * The generic rights (bits 28 to 31). 
These map onto the standard and - * specific rights. - */ - - /* Read, write, and execute access. */ - GENERIC_ALL = cpu_to_le32(0x10000000), - - /* Execute access. */ - GENERIC_EXECUTE = cpu_to_le32(0x20000000), - - /* - * Write access. For files, this maps onto: - * FILE_APPEND_DATA | FILE_WRITE_ATTRIBUTES | FILE_WRITE_DATA | - * FILE_WRITE_EA | STANDARD_RIGHTS_WRITE | SYNCHRONIZE - * For directories, the mapping has the same numerical value. See - * above for the descriptions of the rights granted. - */ - GENERIC_WRITE = cpu_to_le32(0x40000000), - - /* - * Read access. For files, this maps onto: - * FILE_READ_ATTRIBUTES | FILE_READ_DATA | FILE_READ_EA | - * STANDARD_RIGHTS_READ | SYNCHRONIZE - * For directories, the mapping has the same numerical value. See - * above for the descriptions of the rights granted. - */ - GENERIC_READ = cpu_to_le32(0x80000000), -}; - -typedef le32 ACCESS_MASK; - -/* - * The generic mapping array. Used to denote the mapping of each generic - * access right to a specific access mask. - * - * FIXME: What exactly is this and what is it for? (AIA) - */ -typedef struct { - ACCESS_MASK generic_read; - ACCESS_MASK generic_write; - ACCESS_MASK generic_execute; - ACCESS_MASK generic_all; -} __attribute__ ((__packed__)) GENERIC_MAPPING; - -/* - * The predefined ACE type structures are as defined below. - */ - -/* - * ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE, SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE - */ -typedef struct { -/* 0 ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */ - ACE_TYPES type; /* Type of the ACE. */ - ACE_FLAGS flags; /* Flags describing the ACE. */ - le16 size; /* Size in bytes of the ACE. */ -/* 4*/ ACCESS_MASK mask; /* Access mask associated with the ACE. */ - -/* 8*/ SID sid; /* The SID associated with the ACE. */ -} __attribute__ ((__packed__)) ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE, - SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE; - -/* - * The object ACE flags (32-bit). - */ -enum { - ACE_OBJECT_TYPE_PRESENT = cpu_to_le32(1), - ACE_INHERITED_OBJECT_TYPE_PRESENT = cpu_to_le32(2), -}; - -typedef le32 OBJECT_ACE_FLAGS; - -typedef struct { -/* 0 ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */ - ACE_TYPES type; /* Type of the ACE. */ - ACE_FLAGS flags; /* Flags describing the ACE. */ - le16 size; /* Size in bytes of the ACE. */ -/* 4*/ ACCESS_MASK mask; /* Access mask associated with the ACE. */ - -/* 8*/ OBJECT_ACE_FLAGS object_flags; /* Flags describing the object ACE. */ -/* 12*/ GUID object_type; -/* 28*/ GUID inherited_object_type; - -/* 44*/ SID sid; /* The SID associated with the ACE. */ -} __attribute__ ((__packed__)) ACCESS_ALLOWED_OBJECT_ACE, - ACCESS_DENIED_OBJECT_ACE, - SYSTEM_AUDIT_OBJECT_ACE, - SYSTEM_ALARM_OBJECT_ACE; - -/* - * An ACL is an access-control list. - * An ACL starts with an ACL header structure, which specifies the size of - * the ACL and the number of ACEs it contains. The ACL header is followed by - * zero or more access control entries (ACEs). The ACL as well as each ACE - * are aligned on 4-byte boundaries. - */ -typedef struct { - u8 revision; /* Revision of this ACL. */ - u8 alignment1; - le16 size; /* Allocated space in bytes for ACL. Includes this - header, the ACEs and the remaining free space. */ - le16 ace_count; /* Number of ACEs in the ACL. */ - le16 alignment2; -/* sizeof() = 8 bytes */ -} __attribute__ ((__packed__)) ACL; - -/* - * Current constants for ACLs. - */ -typedef enum { - /* Current revision. */ - ACL_REVISION = 2, - ACL_REVISION_DS = 4, - - /* History of revisions.
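/*
 * Illustrative sketch (not part of the original header): expanding the
 * generic bits of an access mask into specific file rights using the
 * GENERIC_READ/GENERIC_WRITE mappings quoted in the comments above. The
 * constants are restated here as CPU-endian values; on disk an
 * ACCESS_MASK is le32.
 */
#include <stdint.h>

#define SK_FILE_READ_DATA        0x00000001u
#define SK_FILE_WRITE_DATA       0x00000002u
#define SK_FILE_APPEND_DATA      0x00000004u
#define SK_FILE_READ_EA          0x00000008u
#define SK_FILE_WRITE_EA         0x00000010u
#define SK_FILE_READ_ATTRIBUTES  0x00000080u
#define SK_FILE_WRITE_ATTRIBUTES 0x00000100u
#define SK_STANDARD_RIGHTS_READ  0x00020000u
#define SK_STANDARD_RIGHTS_WRITE 0x00020000u
#define SK_SYNCHRONIZE           0x00100000u
#define SK_GENERIC_WRITE         0x40000000u
#define SK_GENERIC_READ          0x80000000u

static uint32_t map_generic_file_rights(uint32_t mask)
{
	if (mask & SK_GENERIC_READ)		/* Per the GENERIC_READ comment. */
		mask = (mask & ~SK_GENERIC_READ) | SK_FILE_READ_ATTRIBUTES |
		       SK_FILE_READ_DATA | SK_FILE_READ_EA |
		       SK_STANDARD_RIGHTS_READ | SK_SYNCHRONIZE;
	if (mask & SK_GENERIC_WRITE)		/* Per the GENERIC_WRITE comment. */
		mask = (mask & ~SK_GENERIC_WRITE) | SK_FILE_APPEND_DATA |
		       SK_FILE_WRITE_ATTRIBUTES | SK_FILE_WRITE_DATA |
		       SK_FILE_WRITE_EA | SK_STANDARD_RIGHTS_WRITE |
		       SK_SYNCHRONIZE;
	return mask;
}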
*/ - ACL_REVISION1 = 1, - MIN_ACL_REVISION = 2, - ACL_REVISION2 = 2, - ACL_REVISION3 = 3, - ACL_REVISION4 = 4, - MAX_ACL_REVISION = 4, -} ACL_CONSTANTS; - -/* - * The security descriptor control flags (16-bit). - * - * SE_OWNER_DEFAULTED - This boolean flag, when set, indicates that the SID - * pointed to by the Owner field was provided by a defaulting mechanism - * rather than explicitly provided by the original provider of the - * security descriptor. This may affect the treatment of the SID with - * respect to inheritance of an owner. - * - * SE_GROUP_DEFAULTED - This boolean flag, when set, indicates that the SID in - * the Group field was provided by a defaulting mechanism rather than - * explicitly provided by the original provider of the security - * descriptor. This may affect the treatment of the SID with respect to - * inheritance of a primary group. - * - * SE_DACL_PRESENT - This boolean flag, when set, indicates that the security - * descriptor contains a discretionary ACL. If this flag is set and the - * Dacl field of the SECURITY_DESCRIPTOR is null, then a null ACL is - * explicitly being specified. - * - * SE_DACL_DEFAULTED - This boolean flag, when set, indicates that the ACL - * pointed to by the Dacl field was provided by a defaulting mechanism - * rather than explicitly provided by the original provider of the - * security descriptor. This may affect the treatment of the ACL with - * respect to inheritance of an ACL. This flag is ignored if the - * DaclPresent flag is not set. - * - * SE_SACL_PRESENT - This boolean flag, when set, indicates that the security - * descriptor contains a system ACL pointed to by the Sacl field. If this - * flag is set and the Sacl field of the SECURITY_DESCRIPTOR is null, then - * an empty (but present) ACL is being specified. - * - * SE_SACL_DEFAULTED - This boolean flag, when set, indicates that the ACL - * pointed to by the Sacl field was provided by a defaulting mechanism - * rather than explicitly provided by the original provider of the - * security descriptor. This may affect the treatment of the ACL with - * respect to inheritance of an ACL. This flag is ignored if the - * SaclPresent flag is not set. - * - * SE_SELF_RELATIVE - This boolean flag, when set, indicates that the security - * descriptor is in self-relative form. In this form, all fields of the - * security descriptor are contiguous in memory and all pointer fields are - * expressed as offsets from the beginning of the security descriptor. - */ -enum { - SE_OWNER_DEFAULTED = cpu_to_le16(0x0001), - SE_GROUP_DEFAULTED = cpu_to_le16(0x0002), - SE_DACL_PRESENT = cpu_to_le16(0x0004), - SE_DACL_DEFAULTED = cpu_to_le16(0x0008), - - SE_SACL_PRESENT = cpu_to_le16(0x0010), - SE_SACL_DEFAULTED = cpu_to_le16(0x0020), - - SE_DACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0100), - SE_SACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0200), - SE_DACL_AUTO_INHERITED = cpu_to_le16(0x0400), - SE_SACL_AUTO_INHERITED = cpu_to_le16(0x0800), - - SE_DACL_PROTECTED = cpu_to_le16(0x1000), - SE_SACL_PROTECTED = cpu_to_le16(0x2000), - SE_RM_CONTROL_VALID = cpu_to_le16(0x4000), - SE_SELF_RELATIVE = cpu_to_le16(0x8000) -} __attribute__ ((__packed__)); - -typedef le16 SECURITY_DESCRIPTOR_CONTROL; - -/* - * Self-relative security descriptor. Contains the owner and group SIDs as well - * as the sacl and dacl ACLs inside the security descriptor itself. - */ -typedef struct { - u8 revision; /* Revision level of the security descriptor. 
*/ - u8 alignment; - SECURITY_DESCRIPTOR_CONTROL control; /* Flags qualifying the type of - the descriptor as well as the following fields. */ - le32 owner; /* Byte offset to a SID representing an object's - owner. If this is NULL, no owner SID is present in - the descriptor. */ - le32 group; /* Byte offset to a SID representing an object's - primary group. If this is NULL, no primary group - SID is present in the descriptor. */ - le32 sacl; /* Byte offset to a system ACL. Only valid if - SE_SACL_PRESENT is set in the control field. If - SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL - is specified. */ - le32 dacl; /* Byte offset to a discretionary ACL. Only valid if - SE_DACL_PRESENT is set in the control field. If - SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL - (unconditionally granting access) is specified. */ -/* sizeof() = 0x14 bytes */ -} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_RELATIVE; - -/* - * Absolute security descriptor. Does not contain the owner and group SIDs, nor - * the sacl and dacl ACLs inside the security descriptor. Instead, it contains - * pointers to these structures in memory. Obviously, absolute security - * descriptors are only useful for in-memory representations of security - * descriptors. On disk, a self-relative security descriptor is used. - */ -typedef struct { - u8 revision; /* Revision level of the security descriptor. */ - u8 alignment; - SECURITY_DESCRIPTOR_CONTROL control; /* Flags qualifying the type of - the descriptor as well as the following fields. */ - SID *owner; /* Points to a SID representing an object's owner. If - this is NULL, no owner SID is present in the - descriptor. */ - SID *group; /* Points to a SID representing an object's primary - group. If this is NULL, no primary group SID is - present in the descriptor. */ - ACL *sacl; /* Points to a system ACL. Only valid if - SE_SACL_PRESENT is set in the control field. If - SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL - is specified. */ - ACL *dacl; /* Points to a discretionary ACL. Only valid if - SE_DACL_PRESENT is set in the control field. If - SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL - (unconditionally granting access) is specified. */ -} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR; - -/* - * Current constants for security descriptors. - */ -typedef enum { - /* Current revision. */ - SECURITY_DESCRIPTOR_REVISION = 1, - SECURITY_DESCRIPTOR_REVISION1 = 1, - - /* The absolute and relative security descriptors have the same size - as long as pointers are 32-bit, as they are on the ia32 - architecture. */ - SECURITY_DESCRIPTOR_MIN_LENGTH = sizeof(SECURITY_DESCRIPTOR), -} SECURITY_DESCRIPTOR_CONSTANTS; - -/* - * Attribute: Security descriptor (0x50). A standard self-relative security - * descriptor. - * - * NOTE: Can be resident or non-resident. - * NOTE: Not used in NTFS 3.0+, as security descriptors are stored centrally - * in FILE_Secure and the correct descriptor is found using the security_id - * from the standard information attribute. - */ -typedef SECURITY_DESCRIPTOR_RELATIVE SECURITY_DESCRIPTOR_ATTR; - -/* - * On NTFS 3.0+, all security descriptors are stored in FILE_Secure. Only one - * referenced instance of each unique security descriptor is stored. - * - * FILE_Secure contains no unnamed data attribute, i.e. it has zero length. It - * does, however, contain two indexes ($SDH and $SII) as well as a named data - * stream ($SDS).
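As an aside, the offset fields of the self-relative form make lookups trivial; the following is a minimal illustrative sketch (not part of the original header, and the helper name is invented) of resolving the owner SID, assuming only the SECURITY_DESCRIPTOR_RELATIVE and SID typedefs above and the kernel's le32_to_cpu():

static inline const SID *sd_rel_owner(const SECURITY_DESCRIPTOR_RELATIVE *sd)
{
	/* All offsets are relative to the start of the descriptor itself. */
	u32 ofs = le32_to_cpu(sd->owner);

	/* A zero offset means no owner SID is present in the descriptor. */
	return ofs ? (const SID *)((const u8 *)sd + ofs) : NULL;
}

The same pattern applies to the group, sacl and dacl offsets, subject to the SE_SACL_PRESENT/SE_DACL_PRESENT control bits described above.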
- * - * Every unique security descriptor is assigned a unique security identifier - * (security_id, not to be confused with a SID). The security_id is unique for - * the NTFS volume and is used as an index into the $SII index, which maps - * security_ids to the security descriptor's storage location within the $SDS - * data attribute. The $SII index is sorted by ascending security_id. - * - * A simple hash is computed from each security descriptor. This hash is used - * as an index into the $SDH index, which maps security descriptor hashes to - * the security descriptor's storage location within the $SDS data attribute. - * The $SDH index is sorted by security descriptor hash and is stored in a B+ - * tree. When searching $SDH (with the intent of determining whether or not a - * new security descriptor is already present in the $SDS data stream), if a - * matching hash is found, but the security descriptors do not match, the - * search in the $SDH index is continued, searching for the next matching hash. - * - * When a precise match is found, the security_id corresponding to the security - * descriptor in the $SDS attribute is read from the found $SDH index entry and - * is stored in the $STANDARD_INFORMATION attribute of the file/directory to - * which the security descriptor is being applied. The $STANDARD_INFORMATION - * attribute is present in all base mft records (i.e. in all files and - * directories). - * - * If a match is not found, the security descriptor is assigned a new unique - * security_id and is added to the $SDS data attribute. Then, entries - * referencing this security descriptor in the $SDS data attribute are - * added to the $SDH and $SII indexes. - * - * Note: Entries are never deleted from FILE_Secure, even if nothing - * references an entry any more. - */ - -/* - * This header precedes each security descriptor in the $SDS data stream. - * This is also the index entry data part of both the $SII and $SDH indexes. - */ -typedef struct { - le32 hash; /* Hash of the security descriptor. */ - le32 security_id; /* The security_id assigned to the descriptor. */ - le64 offset; /* Byte offset of this entry in the $SDS stream. */ - le32 length; /* Size in bytes of this entry in $SDS stream. */ -} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_HEADER; - -/* - * The $SDS data stream contains the security descriptors, aligned on 16-byte - * boundaries, sorted by security_id in a B+ tree. Security descriptors cannot - * cross 256KiB boundaries (this restriction is imposed by the Windows cache - * manager). Each security descriptor is contained in a SDS_ENTRY structure. - * Also, each security descriptor is stored twice in the $SDS stream with a - * fixed offset of 0x40000 bytes (256KiB, the Windows cache manager's max size) - * between them; i.e. if a SDS_ENTRY specifies an offset of 0x51d0, then the - * first copy of the security descriptor will be at offset 0x51d0 in the - * $SDS data stream and the second copy will be at offset 0x451d0. - */ -typedef struct { -/*Ofs*/ -/* 0 SECURITY_DESCRIPTOR_HEADER; -- Unfolded here as gcc doesn't like - unnamed structs. */ - le32 hash; /* Hash of the security descriptor. */ - le32 security_id; /* The security_id assigned to the descriptor. */ - le64 offset; /* Byte offset of this entry in the $SDS stream. */ - le32 length; /* Size in bytes of this entry in $SDS stream. */ -/* 20*/ SECURITY_DESCRIPTOR_RELATIVE sid; /* The self-relative security - descriptor.
*/ -} __attribute__ ((__packed__)) SDS_ENTRY; - -/* - * The index entry key used in the $SII index. The collation type is - * COLLATION_NTOFS_ULONG. - */ -typedef struct { - le32 security_id; /* The security_id assigned to the descriptor. */ -} __attribute__ ((__packed__)) SII_INDEX_KEY; - -/* - * The index entry key used in the $SDH index. The keys are sorted first by - * hash and then by security_id. The collation rule is - * COLLATION_NTOFS_SECURITY_HASH. - */ -typedef struct { - le32 hash; /* Hash of the security descriptor. */ - le32 security_id; /* The security_id assigned to the descriptor. */ -} __attribute__ ((__packed__)) SDH_INDEX_KEY; - -/* - * Attribute: Volume name (0x60). - * - * NOTE: Always resident. - * NOTE: Present only in FILE_Volume. - */ -typedef struct { - ntfschar name[0]; /* The name of the volume in Unicode. */ -} __attribute__ ((__packed__)) VOLUME_NAME; - -/* - * Possible flags for the volume (16-bit). - */ -enum { - VOLUME_IS_DIRTY = cpu_to_le16(0x0001), - VOLUME_RESIZE_LOG_FILE = cpu_to_le16(0x0002), - VOLUME_UPGRADE_ON_MOUNT = cpu_to_le16(0x0004), - VOLUME_MOUNTED_ON_NT4 = cpu_to_le16(0x0008), - - VOLUME_DELETE_USN_UNDERWAY = cpu_to_le16(0x0010), - VOLUME_REPAIR_OBJECT_ID = cpu_to_le16(0x0020), - - VOLUME_CHKDSK_UNDERWAY = cpu_to_le16(0x4000), - VOLUME_MODIFIED_BY_CHKDSK = cpu_to_le16(0x8000), - - VOLUME_FLAGS_MASK = cpu_to_le16(0xc03f), - - /* To make our life easier when checking if we must mount read-only. */ - VOLUME_MUST_MOUNT_RO_MASK = cpu_to_le16(0xc027), -} __attribute__ ((__packed__)); - -typedef le16 VOLUME_FLAGS; - -/* - * Attribute: Volume information (0x70). - * - * NOTE: Always resident. - * NOTE: Present only in FILE_Volume. - * NOTE: Windows 2000 uses NTFS 3.0 while Windows NT4 service pack 6a uses - * NTFS 1.2. I haven't personally seen other values yet. - */ -typedef struct { - le64 reserved; /* Not used (yet?). */ - u8 major_ver; /* Major version of the ntfs format. */ - u8 minor_ver; /* Minor version of the ntfs format. */ - VOLUME_FLAGS flags; /* Bit array of VOLUME_* flags. */ -} __attribute__ ((__packed__)) VOLUME_INFORMATION; - -/* - * Attribute: Data attribute (0x80). - * - * NOTE: Can be resident or non-resident. - * - * Data contents of a file (i.e. the unnamed stream) or of a named stream. - */ -typedef struct { - u8 data[0]; /* The file's data contents. */ -} __attribute__ ((__packed__)) DATA_ATTR; - -/* - * Index header flags (8-bit). - */ -enum { - /* - * When index header is in an index root attribute: - */ - SMALL_INDEX = 0, /* The index is small enough to fit inside the index - root attribute and there is no index allocation - attribute present. */ - LARGE_INDEX = 1, /* The index is too large to fit in the index root - attribute and/or an index allocation attribute is - present. */ - /* - * When index header is in an index block, i.e. is part of index - * allocation attribute: - */ - LEAF_NODE = 0, /* This is a leaf node, i.e. there are no more nodes - branching off it. */ - INDEX_NODE = 1, /* This node indexes other nodes, i.e. it is not a leaf - node. */ - NODE_MASK = 1, /* Mask for accessing the *_NODE bits. */ -} __attribute__ ((__packed__)); - -typedef u8 INDEX_HEADER_FLAGS; - -/* - * This is the header for indexes, describing the INDEX_ENTRY records, which - * follow the INDEX_HEADER. Together the index header and the index entries - * make up a complete index. 
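To make the layout concrete, here is a minimal illustrative sketch (not part of the original header; the function is invented) of iterating over the entries of one index node, using the INDEX_HEADER, INDEX_ENTRY_HEADER, INDEX_ENTRY and INDEX_ENTRY_END definitions that follow; remember that entries_offset and index_length are counted from the start of the index header:

static void walk_index_node(const INDEX_HEADER *ih)
{
	const u8 *base = (const u8 *)ih;
	const u8 *end = base + le32_to_cpu(ih->index_length);
	const INDEX_ENTRY *ie;

	ie = (const INDEX_ENTRY *)(base + le32_to_cpu(ih->entries_offset));
	while ((const u8 *)ie + sizeof(INDEX_ENTRY_HEADER) <= end) {
		/* The last entry of a node carries no key. */
		if (ie->flags & INDEX_ENTRY_END)
			break;
		/* ... look at ie->key (e.g. ie->key.file_name) here ... */
		ie = (const INDEX_ENTRY *)((const u8 *)ie +
				le16_to_cpu(ie->length));
	}
}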
- * - * IMPORTANT NOTE: The offset, length and size structure members are counted - * relative to the start of the index header structure and not relative to the - * start of the index root or index allocation structures themselves. - */ -typedef struct { - le32 entries_offset; /* Byte offset to first INDEX_ENTRY - aligned to 8-byte boundary. */ - le32 index_length; /* Data size of the index in bytes, - i.e. bytes used from allocated - size, aligned to 8-byte boundary. */ - le32 allocated_size; /* Byte size of this index (block), - multiple of 8 bytes. */ - /* NOTE: For the index root attribute, the above two numbers (i.e. - index_length and allocated_size) are always equal, as the attribute - is resident and it is resized as needed. In the case of the index - allocation attribute the attribute is not resident and hence the - allocated_size is a fixed value and must equal the index_block_size - specified by the INDEX_ROOT attribute corresponding to the - INDEX_ALLOCATION attribute this INDEX_BLOCK belongs to. */ - INDEX_HEADER_FLAGS flags; /* Bit field of INDEX_HEADER_FLAGS. */ - u8 reserved[3]; /* Reserved/align to 8-byte boundary. */ -} __attribute__ ((__packed__)) INDEX_HEADER; - -/* - * Attribute: Index root (0x90). - * - * NOTE: Always resident. - * - * This is followed by a sequence of index entries (INDEX_ENTRY structures) - * as described by the index header. - * - * When a directory is small enough to fit inside the index root then this - * is the only attribute describing the directory. When the directory is too - * large to fit in the index root, on the other hand, two additional attributes - * are present: an index allocation attribute, containing sub-nodes of the B+ - * directory tree (see below), and a bitmap attribute, describing which virtual - * cluster numbers (vcns) in the index allocation attribute are in use by an - * index block. - * - * NOTE: The root directory (FILE_root) contains an entry for itself. Other - * directories do not contain entries for themselves, though. - */ -typedef struct { - ATTR_TYPE type; /* Type of the indexed attribute. Is - $FILE_NAME for directories, zero - for view indexes. No other values - allowed. */ - COLLATION_RULE collation_rule; /* Collation rule used to sort the - index entries. If type is $FILE_NAME, - this must be COLLATION_FILE_NAME. */ - le32 index_block_size; /* Size of each index block in bytes (in - the index allocation attribute). */ - u8 clusters_per_index_block; /* Size of each index block in clusters - (in the index allocation attribute), - when an index block is at least as - large as a cluster; otherwise this - will be the log of the size (like how - the encoding of the mft record size - and the index record size found in - the boot sector work). Has to be a - power of 2. */ - u8 reserved[3]; /* Reserved/align to 8-byte boundary. */ - INDEX_HEADER index; /* Index header describing the - following index entries. */ -} __attribute__ ((__packed__)) INDEX_ROOT; - -/* - * Attribute: Index allocation (0xa0). - * - * NOTE: Always non-resident (doesn't make sense to be resident anyway!). - * - * This is an array of index blocks. Each index block starts with an - * INDEX_BLOCK structure containing an index header, followed by a sequence of - * index entries (INDEX_ENTRY structures), as described by the INDEX_HEADER. - */ -typedef struct { -/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ - NTFS_RECORD_TYPE magic; /* Magic is "INDX". */ - le16 usa_ofs; /* See NTFS_RECORD definition. */ - le16 usa_count; /* See NTFS_RECORD definition.
*/ - -/* 8*/ sle64 lsn; /* $LogFile sequence number of the last - modification of this index block. */ -/* 16*/ leVCN index_block_vcn; /* Virtual cluster number of the index block. - If the cluster_size on the volume is <= the - index_block_size of the directory, - index_block_vcn counts in units of clusters, - and in units of sectors otherwise. */ -/* 24*/ INDEX_HEADER index; /* Describes the following index entries. */ -/* sizeof()= 40 (0x28) bytes */ -/- * When creating the index block, we place the update sequence array at this - * offset, i.e. before we start with the index entries. This also makes sense, - * as otherwise the update sequence array could contain within itself the last - * two bytes of a sector, which would mean that multi-sector transfer - * protection wouldn't work, as you can't protect data by overwriting it - * since you then can't get it back... - * When reading, use the data from the ntfs record header. - */ -} __attribute__ ((__packed__)) INDEX_BLOCK; - -typedef INDEX_BLOCK INDEX_ALLOCATION; - -/* - * The system file FILE_Extend/$Reparse contains an index named $R listing - * all reparse points on the volume. The index entry keys are as defined - * below. Note that there is no index data associated with the index entries. - * - * The index entries are sorted by the index key file_id. The collation rule is - * COLLATION_NTOFS_ULONGS. FIXME: Verify whether the reparse_tag is not the - * primary key / is not a key at all. (AIA) - */ -typedef struct { - le32 reparse_tag; /* Reparse point type (inc. flags). */ - leMFT_REF file_id; /* Mft record of the file containing the - reparse point attribute. */ -} __attribute__ ((__packed__)) REPARSE_INDEX_KEY; - -/* - * Quota flags (32-bit). - * - * The user quota flags. Names explain meaning. - */ -enum { - QUOTA_FLAG_DEFAULT_LIMITS = cpu_to_le32(0x00000001), - QUOTA_FLAG_LIMIT_REACHED = cpu_to_le32(0x00000002), - QUOTA_FLAG_ID_DELETED = cpu_to_le32(0x00000004), - - QUOTA_FLAG_USER_MASK = cpu_to_le32(0x00000007), - /* This is a bit mask for the user quota flags. */ - - /* - * These flags are only present in the quota defaults index entry, i.e. - * in the entry where owner_id = QUOTA_DEFAULTS_ID. - */ - QUOTA_FLAG_TRACKING_ENABLED = cpu_to_le32(0x00000010), - QUOTA_FLAG_ENFORCEMENT_ENABLED = cpu_to_le32(0x00000020), - QUOTA_FLAG_TRACKING_REQUESTED = cpu_to_le32(0x00000040), - QUOTA_FLAG_LOG_THRESHOLD = cpu_to_le32(0x00000080), - - QUOTA_FLAG_LOG_LIMIT = cpu_to_le32(0x00000100), - QUOTA_FLAG_OUT_OF_DATE = cpu_to_le32(0x00000200), - QUOTA_FLAG_CORRUPT = cpu_to_le32(0x00000400), - QUOTA_FLAG_PENDING_DELETES = cpu_to_le32(0x00000800), -}; - -typedef le32 QUOTA_FLAGS; - -/* - * The system file FILE_Extend/$Quota contains two indexes $O and $Q. Quotas - * are on a per-volume and per-user basis. - * - * The $Q index contains one entry for each existing user_id on the volume. The - * index key is the user_id of the user/group owning this quota control entry, - * i.e. the key is the owner_id. The user_id of the owner of a file, i.e. the - * owner_id, is found in the standard information attribute. The collation rule - * for $Q is COLLATION_NTOFS_ULONG. - * - * The $O index contains one entry for each user/group who has been assigned - * a quota on that volume. The index key holds the SID of the user_id the - * entry belongs to, i.e. the owner_id. The collation rule for $O is - * COLLATION_NTOFS_SID. - * - * The $O index entry data is the user_id of the user corresponding to the SID.
- * This user_id is used as an index into $Q to find the quota control entry - * associated with the SID. - * - * The $Q index entry data is the quota control entry and is defined below. - */ -typedef struct { - le32 version; /* Currently equals 2. */ - QUOTA_FLAGS flags; /* Flags describing this quota entry. */ - le64 bytes_used; /* How many bytes of the quota are in use. */ - sle64 change_time; /* Last time this quota entry was changed. */ - sle64 threshold; /* Soft quota (-1 if not limited). */ - sle64 limit; /* Hard quota (-1 if not limited). */ - sle64 exceeded_time; /* How long the soft quota has been exceeded. */ - SID sid; /* The SID of the user/object associated with - this quota entry. Equals zero for the quota - defaults entry (and in fact on a WinXP - volume, it is not present at all). */ -} __attribute__ ((__packed__)) QUOTA_CONTROL_ENTRY; - -/* - * Predefined owner_id values (32-bit). - */ -enum { - QUOTA_INVALID_ID = cpu_to_le32(0x00000000), - QUOTA_DEFAULTS_ID = cpu_to_le32(0x00000001), - QUOTA_FIRST_USER_ID = cpu_to_le32(0x00000100), -}; - -/* - * Current constants for quota control entries. - */ -typedef enum { - /* Current version. */ - QUOTA_VERSION = 2, -} QUOTA_CONTROL_ENTRY_CONSTANTS; - -/* - * Index entry flags (16-bit). - */ -enum { - INDEX_ENTRY_NODE = cpu_to_le16(1), /* This entry contains a - sub-node, i.e. a reference to an index block in the form - of a virtual cluster number (see below). */ - INDEX_ENTRY_END = cpu_to_le16(2), /* This signifies the last - entry in an index block. The index entry does not - represent a file but it can point to a sub-node. */ - - INDEX_ENTRY_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force - enum bit width to 16-bit. */ -} __attribute__ ((__packed__)); - -typedef le16 INDEX_ENTRY_FLAGS; - -/* - * This is the index entry header (see below). - */ -typedef struct { -/* 0*/ union { - struct { /* Only valid when INDEX_ENTRY_END is not set. */ - leMFT_REF indexed_file; /* The mft reference of the file - described by this index - entry. Used for directory - indexes. */ - } __attribute__ ((__packed__)) dir; - struct { /* Used for views/indexes to find the entry's data. */ - le16 data_offset; /* Data byte offset from this - INDEX_ENTRY. Follows the - index key. */ - le16 data_length; /* Data length in bytes. */ - le32 reservedV; /* Reserved (zero). */ - } __attribute__ ((__packed__)) vi; - } __attribute__ ((__packed__)) data; -/* 8*/ le16 length; /* Byte size of this index entry, a multiple of - 8 bytes. */ -/* 10*/ le16 key_length; /* Byte size of the key value, which is in the - index entry. It follows the reserved field. Not - a multiple of 8 bytes. */ -/* 12*/ INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */ -/* 14*/ le16 reserved; /* Reserved/align to 8-byte boundary. */ -/* sizeof() = 16 bytes */ -} __attribute__ ((__packed__)) INDEX_ENTRY_HEADER; - -/* - * This is an index entry. A sequence of such entries follows each INDEX_HEADER - * structure. Together they make up a complete index. The index follows either - * an index root attribute or an index allocation attribute. - * - * NOTE: Before NTFS 3.0 only filename attributes were indexed. - */ -typedef struct { -/*Ofs*/ -/* 0 INDEX_ENTRY_HEADER; -- Unfolded here as gcc dislikes unnamed structs. */ - union { - struct { /* Only valid when INDEX_ENTRY_END is not set. */ - leMFT_REF indexed_file; /* The mft reference of the file - described by this index - entry. Used for directory - indexes.
*/ - } __attribute__ ((__packed__)) dir; - struct { /* Used for views/indexes to find the entry's data. */ - le16 data_offset; /* Data byte offset from this - INDEX_ENTRY. Follows the - index key. */ - le16 data_length; /* Data length in bytes. */ - le32 reservedV; /* Reserved (zero). */ - } __attribute__ ((__packed__)) vi; - } __attribute__ ((__packed__)) data; - le16 length; /* Byte size of this index entry, a multiple of - 8 bytes. */ - le16 key_length; /* Byte size of the key value, which is in the - index entry. It follows the reserved field. Not - a multiple of 8 bytes. */ - INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */ - le16 reserved; /* Reserved/align to 8-byte boundary. */ - -/* 16*/ union { /* The key of the indexed attribute. NOTE: Only present - if INDEX_ENTRY_END bit in flags is not set. NOTE: On - NTFS versions before 3.0 the only valid key is the - FILE_NAME_ATTR. On NTFS 3.0+ the following - additional index keys are defined: */ - FILE_NAME_ATTR file_name;/* $I30 index in directories. */ - SII_INDEX_KEY sii; /* $SII index in $Secure. */ - SDH_INDEX_KEY sdh; /* $SDH index in $Secure. */ - GUID object_id; /* $O index in FILE_Extend/$ObjId: The - object_id of the mft record found in - the data part of the index. */ - REPARSE_INDEX_KEY reparse; /* $R index in - FILE_Extend/$Reparse. */ - SID sid; /* $O index in FILE_Extend/$Quota: - SID of the owner of the user_id. */ - le32 owner_id; /* $Q index in FILE_Extend/$Quota: - user_id of the owner of the quota - control entry in the data part of - the index. */ - } __attribute__ ((__packed__)) key; - /* The (optional) index data is inserted here when creating. */ - // leVCN vcn; /* If INDEX_ENTRY_NODE bit in flags is set, the last - // eight bytes of this index entry contain the virtual - // cluster number of the index block that holds the - // entries immediately preceding the current entry (the - // vcn references the corresponding cluster in the data - // of the non-resident index allocation attribute). If - // the key_length is zero, then the vcn immediately - // follows the INDEX_ENTRY_HEADER. Regardless of - // key_length, the address of the 8-byte boundary - // aligned vcn of INDEX_ENTRY{_HEADER} *ie is given by - // (char*)ie + le16_to_cpu(ie->length) - sizeof(VCN), - // where sizeof(VCN) can be hardcoded as 8 if wanted. */ -} __attribute__ ((__packed__)) INDEX_ENTRY; - -/* - * Attribute: Bitmap (0xb0). - * - * Contains an array of bits (aka a bitfield). - * - * When used in conjunction with the index allocation attribute, each bit - * corresponds to one index block within the index allocation attribute. Thus - * the number of bits in the bitmap * index block size / cluster size is the - * number of clusters in the index allocation attribute. - */ -typedef struct { - u8 bitmap[0]; /* Array of bits. */ -} __attribute__ ((__packed__)) BITMAP_ATTR; - -/* - * The reparse point tag defines the type of the reparse point. It also - * includes several flags, which further describe the reparse point. - * - * The reparse point tag is an unsigned 32-bit value divided into three parts: - * - * 1. The least significant 16 bits (i.e. bits 0 to 15) specify the type of - * the reparse point. - * 2. The 13 bits after this (i.e. bits 16 to 28) are reserved for future use. - * 3. The most significant three bits are flags describing the reparse point. - * They are defined as follows: - * bit 29: Name surrogate bit. If set, the filename is an alias for - * another object in the system. - * bit 30: High-latency bit.
If set, accessing the first byte of data will - * be slow. (E.g. the data is stored on a tape drive.) - * bit 31: Microsoft bit. If set, the tag is owned by Microsoft. User - * defined tags have to use zero here. - * - * These are the predefined reparse point tags: - */ -enum { - IO_REPARSE_TAG_IS_ALIAS = cpu_to_le32(0x20000000), - IO_REPARSE_TAG_IS_HIGH_LATENCY = cpu_to_le32(0x40000000), - IO_REPARSE_TAG_IS_MICROSOFT = cpu_to_le32(0x80000000), - - IO_REPARSE_TAG_RESERVED_ZERO = cpu_to_le32(0x00000000), - IO_REPARSE_TAG_RESERVED_ONE = cpu_to_le32(0x00000001), - IO_REPARSE_TAG_RESERVED_RANGE = cpu_to_le32(0x00000001), - - IO_REPARSE_TAG_NSS = cpu_to_le32(0x68000005), - IO_REPARSE_TAG_NSS_RECOVER = cpu_to_le32(0x68000006), - IO_REPARSE_TAG_SIS = cpu_to_le32(0x68000007), - IO_REPARSE_TAG_DFS = cpu_to_le32(0x68000008), - - IO_REPARSE_TAG_MOUNT_POINT = cpu_to_le32(0x88000003), - - IO_REPARSE_TAG_HSM = cpu_to_le32(0xa8000004), - - IO_REPARSE_TAG_SYMBOLIC_LINK = cpu_to_le32(0xe8000000), - - IO_REPARSE_TAG_VALID_VALUES = cpu_to_le32(0xe000ffff), -}; - -/* - * Attribute: Reparse point (0xc0). - * - * NOTE: Can be resident or non-resident. - */ -typedef struct { - le32 reparse_tag; /* Reparse point type (inc. flags). */ - le16 reparse_data_length; /* Byte size of reparse data. */ - le16 reserved; /* Align to 8-byte boundary. */ - u8 reparse_data[0]; /* Meaning depends on reparse_tag. */ -} __attribute__ ((__packed__)) REPARSE_POINT; - -/* - * Attribute: Extended attribute (EA) information (0xd0). - * - * NOTE: Always resident. (Is this true???) - */ -typedef struct { - le16 ea_length; /* Byte size of the packed extended - attributes. */ - le16 need_ea_count; /* The number of extended attributes which have - the NEED_EA bit set. */ - le32 ea_query_length; /* Byte size of the buffer required to query - the extended attributes when calling - ZwQueryEaFile() in Windows NT/2k. I.e. the - byte size of the unpacked extended - attributes. */ -} __attribute__ ((__packed__)) EA_INFORMATION; - -/* - * Extended attribute flags (8-bit). - */ -enum { - NEED_EA = 0x80 /* If set, the file to which the EA belongs - cannot be interpreted without understanding - the associated extended attributes. */ -} __attribute__ ((__packed__)); - -typedef u8 EA_FLAGS; - -/* - * Attribute: Extended attribute (EA) (0xe0). - * - * NOTE: Can be resident or non-resident. - * - * Like the attribute list and the index buffer list, the EA attribute value is - * a sequence of EA_ATTR variable-length records. - */ -typedef struct { - le32 next_entry_offset; /* Offset to the next EA_ATTR. */ - EA_FLAGS flags; /* Flags describing the EA. */ - u8 ea_name_length; /* Length of the name of the EA in bytes - excluding the '\0' byte terminator. */ - le16 ea_value_length; /* Byte size of the EA's value. */ - u8 ea_name[0]; /* Name of the EA. Note this is ASCII, not - Unicode, and it is zero-terminated. */ - u8 ea_value[0]; /* The value of the EA. Immediately follows - the name. */ -} __attribute__ ((__packed__)) EA_ATTR; - -/* - * Attribute: Property set (0xf0). - * - * Intended to support Native Structure Storage (NSS) - a feature removed from - * NTFS 3.0 during beta testing. - */ -typedef struct { - /* Irrelevant as feature unused. */ -} __attribute__ ((__packed__)) PROPERTY_SET; - -/* - * Attribute: Logged utility stream (0x100). - * - * NOTE: Can be resident or non-resident. - * - * Operations on this attribute are logged to the journal ($LogFile) like - * normal metadata changes. - * - * Used by the Encrypting File System (EFS).
All encrypted files have this - * attribute with the name $EFS. - */ -typedef struct { - /* Can be anything the creator chooses. */ - /* EFS uses it as follows: */ - // FIXME: Type this info, verifying it along the way. (AIA) -} __attribute__ ((__packed__)) LOGGED_UTILITY_STREAM, EFS_ATTR; - -#endif /* _LINUX_NTFS_LAYOUT_H */ diff --git a/fs/ntfs/lcnalloc.c b/fs/ntfs/lcnalloc.c deleted file mode 100644 index eda9972e6159..000000000000 --- a/fs/ntfs/lcnalloc.c +++ /dev/null @@ -1,1000 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * lcnalloc.c - Cluster (de)allocation code. Part of the Linux-NTFS project. - * - * Copyright (c) 2004-2005 Anton Altaparmakov - */ - -#ifdef NTFS_RW - -#include <linux/pagemap.h> - -#include "lcnalloc.h" -#include "debug.h" -#include "bitmap.h" -#include "inode.h" -#include "volume.h" -#include "attrib.h" -#include "malloc.h" -#include "aops.h" -#include "ntfs.h" - -/** - * ntfs_cluster_free_from_rl_nolock - free clusters from runlist - * @vol: mounted ntfs volume on which to free the clusters - * @rl: runlist describing the clusters to free - * - * Free all the clusters described by the runlist @rl on the volume @vol. In - * the case of an error being returned, at least some of the clusters were not - * freed. - * - * Return 0 on success and -errno on error. - * - * Locking: - The volume lcn bitmap must be locked for writing on entry and is - * left locked on return. - */ -int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol, - const runlist_element *rl) -{ - struct inode *lcnbmp_vi = vol->lcnbmp_ino; - int ret = 0; - - ntfs_debug("Entering."); - if (!rl) - return 0; - for (; rl->length; rl++) { - int err; - - if (rl->lcn < 0) - continue; - err = ntfs_bitmap_clear_run(lcnbmp_vi, rl->lcn, rl->length); - if (unlikely(err && (!ret || ret == -ENOMEM) && ret != err)) - ret = err; - } - ntfs_debug("Done."); - return ret; -} - -/** - * ntfs_cluster_alloc - allocate clusters on an ntfs volume - * @vol: mounted ntfs volume on which to allocate the clusters - * @start_vcn: vcn to use for the first allocated cluster - * @count: number of clusters to allocate - * @start_lcn: starting lcn at which to allocate the clusters (or -1 if none) - * @zone: zone from which to allocate the clusters - * @is_extension: if 'true', this is an attribute extension - * - * Allocate @count clusters preferably starting at cluster @start_lcn or at the - * current allocator position if @start_lcn is -1, on the mounted ntfs volume - * @vol. @zone is either DATA_ZONE for allocation of normal clusters or - * MFT_ZONE for allocation of clusters for the master file table, i.e. the - * $MFT/$DATA attribute. - * - * @start_vcn specifies the vcn of the first allocated cluster. This makes - * merging the resulting runlist with the old runlist easier. - * - * If @is_extension is 'true', the caller is allocating clusters to extend an - * attribute and if it is 'false', the caller is allocating clusters to fill a - * hole in an attribute. Practically the difference is that if @is_extension - * is 'true' the returned runlist will be terminated with LCN_ENOENT and if - * @is_extension is 'false' the runlist will be terminated with - * LCN_RL_NOT_MAPPED. - * - * You need to check the return value with IS_ERR(). If this is false, the - * function was successful and the return value is a runlist describing the - * allocated cluster(s). If IS_ERR() is true, the function failed and - * PTR_ERR() gives you the error code. 
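A hypothetical caller (the function below is invented for illustration and is not part of the original file) would therefore follow the usual IS_ERR()/PTR_ERR() pattern and release the returned runlist with ntfs_free() once it has been merged:

static int example_extend_by(ntfs_volume *vol, VCN start_vcn, s64 nr)
{
	runlist_element *rl;

	/* -1: no preferred lcn; DATA_ZONE: ordinary file data; extension. */
	rl = ntfs_cluster_alloc(vol, start_vcn, nr, (LCN)-1, DATA_ZONE, true);
	if (IS_ERR(rl))
		return PTR_ERR(rl);	/* E.g. -ENOMEM or -ENOSPC. */
	/* ... merge @rl into the attribute's runlist here ... */
	ntfs_free(rl);
	return 0;
}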
- * - * Notes on the allocation algorithm - * ================================= - * - * There are two data zones. First is the area between the end of the mft zone - * and the end of the volume, and second is the area between the start of the - * volume and the start of the mft zone. On unmodified/standard NTFS 1.x - * volumes, the second data zone does not exist due to the mft zone being - * expanded to cover the start of the volume in order to reserve space for the - * mft bitmap attribute. - * - * This is not the prettiest function but the complexity stems from the need of - * implementing the mft vs data zoned approach and from the fact that we have - * access to the lcn bitmap in portions of up to 8192 bytes at a time, so we - * need to cope with crossing over boundaries of two buffers. Further, the - * fact that the allocator allows for caller supplied hints as to the location - * of where allocation should begin and the fact that the allocator keeps track - * of where in the data zones the next natural allocation should occur, - * contribute to the complexity of the function. But it should all be - * worthwhile, because this allocator should: 1) be a full implementation of - * the MFT zone approach used by Windows NT, 2) cause reduction in - * fragmentation, and 3) be speedy in allocations (the code is not optimized - * for speed, but the algorithm is, so further speed improvements are probably - * possible). - * - * FIXME: We should be monitoring cluster allocation and increment the MFT zone - * size dynamically but this is something for the future. We will just cause - * heavier fragmentation by not doing it and I am not even sure Windows would - * grow the MFT zone dynamically, so it might even be correct not to do this. - * The overhead in doing dynamic MFT zone expansion would be very large and - * unlikely worth the effort. (AIA) - * - * TODO: I have added in double the required zone position pointer wrap around - * logic which can be optimized to having only one of the two logic sets. - * However, having the double logic will work fine, but if we have only one of - * the sets and we get it wrong somewhere, then we get into trouble, so - * removing the duplicate logic requires _very_ careful consideration of _all_ - * possible code paths. So at least for now, I am leaving the double logic - - * better safe than sorry... (AIA) - * - * Locking: - The volume lcn bitmap must be unlocked on entry and is unlocked - * on return. - * - This function takes the volume lcn bitmap lock for writing and - * modifies the bitmap contents. - */ -runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn, - const s64 count, const LCN start_lcn, - const NTFS_CLUSTER_ALLOCATION_ZONES zone, - const bool is_extension) -{ - LCN zone_start, zone_end, bmp_pos, bmp_initial_pos, last_read_pos, lcn; - LCN prev_lcn = 0, prev_run_len = 0, mft_zone_size; - s64 clusters; - loff_t i_size; - struct inode *lcnbmp_vi; - runlist_element *rl = NULL; - struct address_space *mapping; - struct page *page = NULL; - u8 *buf, *byte; - int err = 0, rlpos, rlsize, buf_size; - u8 pass, done_zones, search_zone, need_writeback = 0, bit; - - ntfs_debug("Entering for start_vcn 0x%llx, count 0x%llx, start_lcn " - "0x%llx, zone %s_ZONE.", (unsigned long long)start_vcn, - (unsigned long long)count, - (unsigned long long)start_lcn, - zone == MFT_ZONE ? 
"MFT" : "DATA"); - BUG_ON(!vol); - lcnbmp_vi = vol->lcnbmp_ino; - BUG_ON(!lcnbmp_vi); - BUG_ON(start_vcn < 0); - BUG_ON(count < 0); - BUG_ON(start_lcn < -1); - BUG_ON(zone < FIRST_ZONE); - BUG_ON(zone > LAST_ZONE); - - /* Return NULL if @count is zero. */ - if (!count) - return NULL; - /* Take the lcnbmp lock for writing. */ - down_write(&vol->lcnbmp_lock); - /* - * If no specific @start_lcn was requested, use the current data zone - * position, otherwise use the requested @start_lcn but make sure it - * lies outside the mft zone. Also set done_zones to 0 (no zones done) - * and pass depending on whether we are starting inside a zone (1) or - * at the beginning of a zone (2). If requesting from the MFT_ZONE, - * we either start at the current position within the mft zone or at - * the specified position. If the latter is out of bounds then we start - * at the beginning of the MFT_ZONE. - */ - done_zones = 0; - pass = 1; - /* - * zone_start and zone_end are the current search range. search_zone - * is 1 for mft zone, 2 for data zone 1 (end of mft zone till end of - * volume) and 4 for data zone 2 (start of volume till start of mft - * zone). - */ - zone_start = start_lcn; - if (zone_start < 0) { - if (zone == DATA_ZONE) - zone_start = vol->data1_zone_pos; - else - zone_start = vol->mft_zone_pos; - if (!zone_start) { - /* - * Zone starts at beginning of volume which means a - * single pass is sufficient. - */ - pass = 2; - } - } else if (zone == DATA_ZONE && zone_start >= vol->mft_zone_start && - zone_start < vol->mft_zone_end) { - zone_start = vol->mft_zone_end; - /* - * Starting at beginning of data1_zone which means a single - * pass in this zone is sufficient. - */ - pass = 2; - } else if (zone == MFT_ZONE && (zone_start < vol->mft_zone_start || - zone_start >= vol->mft_zone_end)) { - zone_start = vol->mft_lcn; - if (!vol->mft_zone_end) - zone_start = 0; - /* - * Starting at beginning of volume which means a single pass - * is sufficient. - */ - pass = 2; - } - if (zone == MFT_ZONE) { - zone_end = vol->mft_zone_end; - search_zone = 1; - } else /* if (zone == DATA_ZONE) */ { - /* Skip searching the mft zone. */ - done_zones |= 1; - if (zone_start >= vol->mft_zone_end) { - zone_end = vol->nr_clusters; - search_zone = 2; - } else { - zone_end = vol->mft_zone_start; - search_zone = 4; - } - } - /* - * bmp_pos is the current bit position inside the bitmap. We use - * bmp_initial_pos to determine whether or not to do a zone switch. - */ - bmp_pos = bmp_initial_pos = zone_start; - - /* Loop until all clusters are allocated, i.e. clusters == 0. */ - clusters = count; - rlpos = rlsize = 0; - mapping = lcnbmp_vi->i_mapping; - i_size = i_size_read(lcnbmp_vi); - while (1) { - ntfs_debug("Start of outer while loop: done_zones 0x%x, " - "search_zone %i, pass %i, zone_start 0x%llx, " - "zone_end 0x%llx, bmp_initial_pos 0x%llx, " - "bmp_pos 0x%llx, rlpos %i, rlsize %i.", - done_zones, search_zone, pass, - (unsigned long long)zone_start, - (unsigned long long)zone_end, - (unsigned long long)bmp_initial_pos, - (unsigned long long)bmp_pos, rlpos, rlsize); - /* Loop until we run out of free clusters. */ - last_read_pos = bmp_pos >> 3; - ntfs_debug("last_read_pos 0x%llx.", - (unsigned long long)last_read_pos); - if (last_read_pos > i_size) { - ntfs_debug("End of attribute reached. 
" - "Skipping to zone_pass_done."); - goto zone_pass_done; - } - if (likely(page)) { - if (need_writeback) { - ntfs_debug("Marking page dirty."); - flush_dcache_page(page); - set_page_dirty(page); - need_writeback = 0; - } - ntfs_unmap_page(page); - } - page = ntfs_map_page(mapping, last_read_pos >> - PAGE_SHIFT); - if (IS_ERR(page)) { - err = PTR_ERR(page); - ntfs_error(vol->sb, "Failed to map page."); - goto out; - } - buf_size = last_read_pos & ~PAGE_MASK; - buf = page_address(page) + buf_size; - buf_size = PAGE_SIZE - buf_size; - if (unlikely(last_read_pos + buf_size > i_size)) - buf_size = i_size - last_read_pos; - buf_size <<= 3; - lcn = bmp_pos & 7; - bmp_pos &= ~(LCN)7; - ntfs_debug("Before inner while loop: buf_size %i, lcn 0x%llx, " - "bmp_pos 0x%llx, need_writeback %i.", buf_size, - (unsigned long long)lcn, - (unsigned long long)bmp_pos, need_writeback); - while (lcn < buf_size && lcn + bmp_pos < zone_end) { - byte = buf + (lcn >> 3); - ntfs_debug("In inner while loop: buf_size %i, " - "lcn 0x%llx, bmp_pos 0x%llx, " - "need_writeback %i, byte ofs 0x%x, " - "*byte 0x%x.", buf_size, - (unsigned long long)lcn, - (unsigned long long)bmp_pos, - need_writeback, - (unsigned int)(lcn >> 3), - (unsigned int)*byte); - /* Skip full bytes. */ - if (*byte == 0xff) { - lcn = (lcn + 8) & ~(LCN)7; - ntfs_debug("Continuing while loop 1."); - continue; - } - bit = 1 << (lcn & 7); - ntfs_debug("bit 0x%x.", bit); - /* If the bit is already set, go onto the next one. */ - if (*byte & bit) { - lcn++; - ntfs_debug("Continuing while loop 2."); - continue; - } - /* - * Allocate more memory if needed, including space for - * the terminator element. - * ntfs_malloc_nofs() operates on whole pages only. - */ - if ((rlpos + 2) * sizeof(*rl) > rlsize) { - runlist_element *rl2; - - ntfs_debug("Reallocating memory."); - if (!rl) - ntfs_debug("First free bit is at LCN " - "0x%llx.", - (unsigned long long) - (lcn + bmp_pos)); - rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE); - if (unlikely(!rl2)) { - err = -ENOMEM; - ntfs_error(vol->sb, "Failed to " - "allocate memory."); - goto out; - } - memcpy(rl2, rl, rlsize); - ntfs_free(rl); - rl = rl2; - rlsize += PAGE_SIZE; - ntfs_debug("Reallocated memory, rlsize 0x%x.", - rlsize); - } - /* Allocate the bitmap bit. */ - *byte |= bit; - /* We need to write this bitmap page to disk. */ - need_writeback = 1; - ntfs_debug("*byte 0x%x, need_writeback is set.", - (unsigned int)*byte); - /* - * Coalesce with previous run if adjacent LCNs. - * Otherwise, append a new run. 
- */ - ntfs_debug("Adding run (lcn 0x%llx, len 0x%llx), " - "prev_lcn 0x%llx, lcn 0x%llx, " - "bmp_pos 0x%llx, prev_run_len 0x%llx, " - "rlpos %i.", - (unsigned long long)(lcn + bmp_pos), - 1ULL, (unsigned long long)prev_lcn, - (unsigned long long)lcn, - (unsigned long long)bmp_pos, - (unsigned long long)prev_run_len, - rlpos); - if (prev_lcn == lcn + bmp_pos - prev_run_len && rlpos) { - ntfs_debug("Coalescing to run (lcn 0x%llx, " - "len 0x%llx).", - (unsigned long long) - rl[rlpos - 1].lcn, - (unsigned long long) - rl[rlpos - 1].length); - rl[rlpos - 1].length = ++prev_run_len; - ntfs_debug("Run now (lcn 0x%llx, len 0x%llx), " - "prev_run_len 0x%llx.", - (unsigned long long) - rl[rlpos - 1].lcn, - (unsigned long long) - rl[rlpos - 1].length, - (unsigned long long) - prev_run_len); - } else { - if (likely(rlpos)) { - ntfs_debug("Adding new run, (previous " - "run lcn 0x%llx, " - "len 0x%llx).", - (unsigned long long) - rl[rlpos - 1].lcn, - (unsigned long long) - rl[rlpos - 1].length); - rl[rlpos].vcn = rl[rlpos - 1].vcn + - prev_run_len; - } else { - ntfs_debug("Adding new run, is first " - "run."); - rl[rlpos].vcn = start_vcn; - } - rl[rlpos].lcn = prev_lcn = lcn + bmp_pos; - rl[rlpos].length = prev_run_len = 1; - rlpos++; - } - /* Done? */ - if (!--clusters) { - LCN tc; - /* - * Update the current zone position. Positions - * of already scanned zones have been updated - * during the respective zone switches. - */ - tc = lcn + bmp_pos + 1; - ntfs_debug("Done. Updating current zone " - "position, tc 0x%llx, " - "search_zone %i.", - (unsigned long long)tc, - search_zone); - switch (search_zone) { - case 1: - ntfs_debug("Before checks, " - "vol->mft_zone_pos " - "0x%llx.", - (unsigned long long) - vol->mft_zone_pos); - if (tc >= vol->mft_zone_end) { - vol->mft_zone_pos = - vol->mft_lcn; - if (!vol->mft_zone_end) - vol->mft_zone_pos = 0; - } else if ((bmp_initial_pos >= - vol->mft_zone_pos || - tc > vol->mft_zone_pos) - && tc >= vol->mft_lcn) - vol->mft_zone_pos = tc; - ntfs_debug("After checks, " - "vol->mft_zone_pos " - "0x%llx.", - (unsigned long long) - vol->mft_zone_pos); - break; - case 2: - ntfs_debug("Before checks, " - "vol->data1_zone_pos " - "0x%llx.", - (unsigned long long) - vol->data1_zone_pos); - if (tc >= vol->nr_clusters) - vol->data1_zone_pos = - vol->mft_zone_end; - else if ((bmp_initial_pos >= - vol->data1_zone_pos || - tc > vol->data1_zone_pos) - && tc >= vol->mft_zone_end) - vol->data1_zone_pos = tc; - ntfs_debug("After checks, " - "vol->data1_zone_pos " - "0x%llx.", - (unsigned long long) - vol->data1_zone_pos); - break; - case 4: - ntfs_debug("Before checks, " - "vol->data2_zone_pos " - "0x%llx.", - (unsigned long long) - vol->data2_zone_pos); - if (tc >= vol->mft_zone_start) - vol->data2_zone_pos = 0; - else if (bmp_initial_pos >= - vol->data2_zone_pos || - tc > vol->data2_zone_pos) - vol->data2_zone_pos = tc; - ntfs_debug("After checks, " - "vol->data2_zone_pos " - "0x%llx.", - (unsigned long long) - vol->data2_zone_pos); - break; - default: - BUG(); - } - ntfs_debug("Finished. 
Going to out."); - goto out; - } - lcn++; - } - bmp_pos += buf_size; - ntfs_debug("After inner while loop: buf_size 0x%x, lcn " - "0x%llx, bmp_pos 0x%llx, need_writeback %i.", - buf_size, (unsigned long long)lcn, - (unsigned long long)bmp_pos, need_writeback); - if (bmp_pos < zone_end) { - ntfs_debug("Continuing outer while loop, " - "bmp_pos 0x%llx, zone_end 0x%llx.", - (unsigned long long)bmp_pos, - (unsigned long long)zone_end); - continue; - } -zone_pass_done: /* Finished with the current zone pass. */ - ntfs_debug("At zone_pass_done, pass %i.", pass); - if (pass == 1) { - /* - * Now do pass 2, scanning the first part of the zone - * we omitted in pass 1. - */ - pass = 2; - zone_end = zone_start; - switch (search_zone) { - case 1: /* mft_zone */ - zone_start = vol->mft_zone_start; - break; - case 2: /* data1_zone */ - zone_start = vol->mft_zone_end; - break; - case 4: /* data2_zone */ - zone_start = 0; - break; - default: - BUG(); - } - /* Sanity check. */ - if (zone_end < zone_start) - zone_end = zone_start; - bmp_pos = zone_start; - ntfs_debug("Continuing outer while loop, pass 2, " - "zone_start 0x%llx, zone_end 0x%llx, " - "bmp_pos 0x%llx.", - (unsigned long long)zone_start, - (unsigned long long)zone_end, - (unsigned long long)bmp_pos); - continue; - } /* pass == 2 */ -done_zones_check: - ntfs_debug("At done_zones_check, search_zone %i, done_zones " - "before 0x%x, done_zones after 0x%x.", - search_zone, done_zones, - done_zones | search_zone); - done_zones |= search_zone; - if (done_zones < 7) { - ntfs_debug("Switching zone."); - /* Now switch to the next zone we haven't done yet. */ - pass = 1; - switch (search_zone) { - case 1: - ntfs_debug("Switching from mft zone to data1 " - "zone."); - /* Update mft zone position. */ - if (rlpos) { - LCN tc; - - ntfs_debug("Before checks, " - "vol->mft_zone_pos " - "0x%llx.", - (unsigned long long) - vol->mft_zone_pos); - tc = rl[rlpos - 1].lcn + - rl[rlpos - 1].length; - if (tc >= vol->mft_zone_end) { - vol->mft_zone_pos = - vol->mft_lcn; - if (!vol->mft_zone_end) - vol->mft_zone_pos = 0; - } else if ((bmp_initial_pos >= - vol->mft_zone_pos || - tc > vol->mft_zone_pos) - && tc >= vol->mft_lcn) - vol->mft_zone_pos = tc; - ntfs_debug("After checks, " - "vol->mft_zone_pos " - "0x%llx.", - (unsigned long long) - vol->mft_zone_pos); - } - /* Switch from mft zone to data1 zone. */ -switch_to_data1_zone: search_zone = 2; - zone_start = bmp_initial_pos = - vol->data1_zone_pos; - zone_end = vol->nr_clusters; - if (zone_start == vol->mft_zone_end) - pass = 2; - if (zone_start >= zone_end) { - vol->data1_zone_pos = zone_start = - vol->mft_zone_end; - pass = 2; - } - break; - case 2: - ntfs_debug("Switching from data1 zone to " - "data2 zone."); - /* Update data1 zone position. */ - if (rlpos) { - LCN tc; - - ntfs_debug("Before checks, " - "vol->data1_zone_pos " - "0x%llx.", - (unsigned long long) - vol->data1_zone_pos); - tc = rl[rlpos - 1].lcn + - rl[rlpos - 1].length; - if (tc >= vol->nr_clusters) - vol->data1_zone_pos = - vol->mft_zone_end; - else if ((bmp_initial_pos >= - vol->data1_zone_pos || - tc > vol->data1_zone_pos) - && tc >= vol->mft_zone_end) - vol->data1_zone_pos = tc; - ntfs_debug("After checks, " - "vol->data1_zone_pos " - "0x%llx.", - (unsigned long long) - vol->data1_zone_pos); - } - /* Switch from data1 zone to data2 zone. 
*/ - search_zone = 4; - zone_start = bmp_initial_pos = - vol->data2_zone_pos; - zone_end = vol->mft_zone_start; - if (!zone_start) - pass = 2; - if (zone_start >= zone_end) { - vol->data2_zone_pos = zone_start = - bmp_initial_pos = 0; - pass = 2; - } - break; - case 4: - ntfs_debug("Switching from data2 zone to " - "data1 zone."); - /* Update data2 zone position. */ - if (rlpos) { - LCN tc; - - ntfs_debug("Before checks, " - "vol->data2_zone_pos " - "0x%llx.", - (unsigned long long) - vol->data2_zone_pos); - tc = rl[rlpos - 1].lcn + - rl[rlpos - 1].length; - if (tc >= vol->mft_zone_start) - vol->data2_zone_pos = 0; - else if (bmp_initial_pos >= - vol->data2_zone_pos || - tc > vol->data2_zone_pos) - vol->data2_zone_pos = tc; - ntfs_debug("After checks, " - "vol->data2_zone_pos " - "0x%llx.", - (unsigned long long) - vol->data2_zone_pos); - } - /* Switch from data2 zone to data1 zone. */ - goto switch_to_data1_zone; - default: - BUG(); - } - ntfs_debug("After zone switch, search_zone %i, " - "pass %i, bmp_initial_pos 0x%llx, " - "zone_start 0x%llx, zone_end 0x%llx.", - search_zone, pass, - (unsigned long long)bmp_initial_pos, - (unsigned long long)zone_start, - (unsigned long long)zone_end); - bmp_pos = zone_start; - if (zone_start == zone_end) { - ntfs_debug("Empty zone, going to " - "done_zones_check."); - /* Empty zone. Don't bother searching it. */ - goto done_zones_check; - } - ntfs_debug("Continuing outer while loop."); - continue; - } /* done_zones == 7 */ - ntfs_debug("All zones are finished."); - /* - * All zones are finished! If DATA_ZONE, shrink mft zone. If - * MFT_ZONE, we have really run out of space. - */ - mft_zone_size = vol->mft_zone_end - vol->mft_zone_start; - ntfs_debug("vol->mft_zone_start 0x%llx, vol->mft_zone_end " - "0x%llx, mft_zone_size 0x%llx.", - (unsigned long long)vol->mft_zone_start, - (unsigned long long)vol->mft_zone_end, - (unsigned long long)mft_zone_size); - if (zone == MFT_ZONE || mft_zone_size <= 0) { - ntfs_debug("No free clusters left, going to out."); - /* Really no more space left on device. */ - err = -ENOSPC; - goto out; - } /* zone == DATA_ZONE && mft_zone_size > 0 */ - ntfs_debug("Shrinking mft zone."); - zone_end = vol->mft_zone_end; - mft_zone_size >>= 1; - if (mft_zone_size > 0) - vol->mft_zone_end = vol->mft_zone_start + mft_zone_size; - else /* mft zone and data2 zone no longer exist. */ - vol->data2_zone_pos = vol->mft_zone_start = - vol->mft_zone_end = 0; - if (vol->mft_zone_pos >= vol->mft_zone_end) { - vol->mft_zone_pos = vol->mft_lcn; - if (!vol->mft_zone_end) - vol->mft_zone_pos = 0; - } - bmp_pos = zone_start = bmp_initial_pos = - vol->data1_zone_pos = vol->mft_zone_end; - search_zone = 2; - pass = 2; - done_zones &= ~2; - ntfs_debug("After shrinking mft zone, mft_zone_size 0x%llx, " - "vol->mft_zone_start 0x%llx, " - "vol->mft_zone_end 0x%llx, " - "vol->mft_zone_pos 0x%llx, search_zone 2, " - "pass 2, dones_zones 0x%x, zone_start 0x%llx, " - "zone_end 0x%llx, vol->data1_zone_pos 0x%llx, " - "continuing outer while loop.", - (unsigned long long)mft_zone_size, - (unsigned long long)vol->mft_zone_start, - (unsigned long long)vol->mft_zone_end, - (unsigned long long)vol->mft_zone_pos, - done_zones, (unsigned long long)zone_start, - (unsigned long long)zone_end, - (unsigned long long)vol->data1_zone_pos); - } - ntfs_debug("After outer while loop."); -out: - ntfs_debug("At out."); - /* Add runlist terminator element. */ - if (likely(rl)) { - rl[rlpos].vcn = rl[rlpos - 1].vcn + rl[rlpos - 1].length; - rl[rlpos].lcn = is_extension ? 
LCN_ENOENT : LCN_RL_NOT_MAPPED; - rl[rlpos].length = 0; - } - if (likely(page && !IS_ERR(page))) { - if (need_writeback) { - ntfs_debug("Marking page dirty."); - flush_dcache_page(page); - set_page_dirty(page); - need_writeback = 0; - } - ntfs_unmap_page(page); - } - if (likely(!err)) { - up_write(&vol->lcnbmp_lock); - ntfs_debug("Done."); - return rl; - } - ntfs_error(vol->sb, "Failed to allocate clusters, aborting " - "(error %i).", err); - if (rl) { - int err2; - - if (err == -ENOSPC) - ntfs_debug("Not enough space to complete allocation, " - "err -ENOSPC, first free lcn 0x%llx, " - "could allocate up to 0x%llx " - "clusters.", - (unsigned long long)rl[0].lcn, - (unsigned long long)(count - clusters)); - /* Deallocate all allocated clusters. */ - ntfs_debug("Attempting rollback..."); - err2 = ntfs_cluster_free_from_rl_nolock(vol, rl); - if (err2) { - ntfs_error(vol->sb, "Failed to rollback (error %i). " - "Leaving inconsistent metadata! " - "Unmount and run chkdsk.", err2); - NVolSetErrors(vol); - } - /* Free the runlist. */ - ntfs_free(rl); - } else if (err == -ENOSPC) - ntfs_debug("No space left at all, err = -ENOSPC, first free " - "lcn = 0x%llx.", - (long long)vol->data1_zone_pos); - up_write(&vol->lcnbmp_lock); - return ERR_PTR(err); -} - -/** - * __ntfs_cluster_free - free clusters on an ntfs volume - * @ni: ntfs inode whose runlist describes the clusters to free - * @start_vcn: vcn in the runlist of @ni at which to start freeing clusters - * @count: number of clusters to free or -1 for all clusters - * @ctx: active attribute search context if present or NULL if not - * @is_rollback: true if this is a rollback operation - * - * Free @count clusters starting at the cluster @start_vcn in the runlist - * described by the vfs inode @ni. - * - * If @count is -1, all clusters from @start_vcn to the end of the runlist are - * deallocated. Thus, to completely free all clusters in a runlist, use - * @start_vcn = 0 and @count = -1. - * - * If @ctx is specified, it is an active search context of @ni and its base mft - * record. This is needed when __ntfs_cluster_free() encounters unmapped - * runlist fragments and allows their mapping. If you do not have the mft - * record mapped, you can specify @ctx as NULL and __ntfs_cluster_free() will - * perform the necessary mapping and unmapping. - * - * Note, __ntfs_cluster_free() saves the state of @ctx on entry and restores it - * before returning. Thus, @ctx will be left pointing to the same attribute on - * return as on entry. However, the actual pointers in @ctx may point to - * different memory locations on return, so you must remember to reset any - * cached pointers from the @ctx, i.e. after the call to __ntfs_cluster_free(), - * you will probably want to do: - * m = ctx->mrec; - * a = ctx->attr; - * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that - * you cache ctx->mrec in a variable @m of type MFT_RECORD *. - * - * @is_rollback should always be 'false', it is for internal use to rollback - * errors. You probably want to use ntfs_cluster_free() instead. - * - * Note, __ntfs_cluster_free() does not modify the runlist, so you have to - * remove from the runlist or mark sparse the freed runs later. - * - * Return the number of deallocated clusters (not counting sparse ones) on - * success and -errno on error. - * - * WARNING: If @ctx is supplied, regardless of whether success or failure is - * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx - * is no longer valid, i.e. 
you need to either call - * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it. - * In that case PTR_ERR(@ctx->mrec) will give you the error code for - * why the mapping of the old inode failed. - * - * Locking: - The runlist described by @ni must be locked for writing on entry - * and is locked on return. Note the runlist may be modified when - * needed runlist fragments need to be mapped. - * - The volume lcn bitmap must be unlocked on entry and is unlocked - * on return. - * - This function takes the volume lcn bitmap lock for writing and - * modifies the bitmap contents. - * - If @ctx is NULL, the base mft record of @ni must not be mapped on - * entry and it will be left unmapped on return. - * - If @ctx is not NULL, the base mft record must be mapped on entry - * and it will be left mapped on return. - */ -s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count, - ntfs_attr_search_ctx *ctx, const bool is_rollback) -{ - s64 delta, to_free, total_freed, real_freed; - ntfs_volume *vol; - struct inode *lcnbmp_vi; - runlist_element *rl; - int err; - - BUG_ON(!ni); - ntfs_debug("Entering for i_ino 0x%lx, start_vcn 0x%llx, count " - "0x%llx.%s", ni->mft_no, (unsigned long long)start_vcn, - (unsigned long long)count, - is_rollback ? " (rollback)" : ""); - vol = ni->vol; - lcnbmp_vi = vol->lcnbmp_ino; - BUG_ON(!lcnbmp_vi); - BUG_ON(start_vcn < 0); - BUG_ON(count < -1); - /* - * Lock the lcn bitmap for writing but only if not rolling back. We - * must hold the lock all the way including through rollback otherwise - * rollback is not possible because once we have cleared a bit and - * dropped the lock, anyone could have set the bit again, thus - * allocating the cluster for another use. - */ - if (likely(!is_rollback)) - down_write(&vol->lcnbmp_lock); - - total_freed = real_freed = 0; - - rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, ctx); - if (IS_ERR(rl)) { - if (!is_rollback) - ntfs_error(vol->sb, "Failed to find first runlist " - "element (error %li), aborting.", - PTR_ERR(rl)); - err = PTR_ERR(rl); - goto err_out; - } - if (unlikely(rl->lcn < LCN_HOLE)) { - if (!is_rollback) - ntfs_error(vol->sb, "First runlist element has " - "invalid lcn, aborting."); - err = -EIO; - goto err_out; - } - /* Find the starting cluster inside the run that needs freeing. */ - delta = start_vcn - rl->vcn; - - /* The number of clusters in this run that need freeing. */ - to_free = rl->length - delta; - if (count >= 0 && to_free > count) - to_free = count; - - if (likely(rl->lcn >= 0)) { - /* Do the actual freeing of the clusters in this run. */ - err = ntfs_bitmap_set_bits_in_run(lcnbmp_vi, rl->lcn + delta, - to_free, likely(!is_rollback) ? 0 : 1); - if (unlikely(err)) { - if (!is_rollback) - ntfs_error(vol->sb, "Failed to clear first run " - "(error %i), aborting.", err); - goto err_out; - } - /* We have freed @to_free real clusters. */ - real_freed = to_free; - }; - /* Go to the next run and adjust the number of clusters left to free. */ - ++rl; - if (count >= 0) - count -= to_free; - - /* Keep track of the total "freed" clusters, including sparse ones. */ - total_freed = to_free; - /* - * Loop over the remaining runs, using @count as a capping value, and - * free them. - */ - for (; rl->length && count != 0; ++rl) { - if (unlikely(rl->lcn < LCN_HOLE)) { - VCN vcn; - - /* Attempt to map runlist. 
*/ - vcn = rl->vcn; - rl = ntfs_attr_find_vcn_nolock(ni, vcn, ctx); - if (IS_ERR(rl)) { - err = PTR_ERR(rl); - if (!is_rollback) - ntfs_error(vol->sb, "Failed to map " - "runlist fragment or " - "failed to find " - "subsequent runlist " - "element."); - goto err_out; - } - if (unlikely(rl->lcn < LCN_HOLE)) { - if (!is_rollback) - ntfs_error(vol->sb, "Runlist element " - "has invalid lcn " - "(0x%llx).", - (unsigned long long) - rl->lcn); - err = -EIO; - goto err_out; - } - } - /* The number of clusters in this run that need freeing. */ - to_free = rl->length; - if (count >= 0 && to_free > count) - to_free = count; - - if (likely(rl->lcn >= 0)) { - /* Do the actual freeing of the clusters in the run. */ - err = ntfs_bitmap_set_bits_in_run(lcnbmp_vi, rl->lcn, - to_free, likely(!is_rollback) ? 0 : 1); - if (unlikely(err)) { - if (!is_rollback) - ntfs_error(vol->sb, "Failed to clear " - "subsequent run."); - goto err_out; - } - /* We have freed @to_free real clusters. */ - real_freed += to_free; - } - /* Adjust the number of clusters left to free. */ - if (count >= 0) - count -= to_free; - - /* Update the total done clusters. */ - total_freed += to_free; - } - if (likely(!is_rollback)) - up_write(&vol->lcnbmp_lock); - - BUG_ON(count > 0); - - /* We are done. Return the number of actually freed clusters. */ - ntfs_debug("Done."); - return real_freed; -err_out: - if (is_rollback) - return err; - /* If no real clusters were freed, no need to rollback. */ - if (!real_freed) { - up_write(&vol->lcnbmp_lock); - return err; - } - /* - * Attempt to rollback and if that succeeds just return the error code. - * If rollback fails, set the volume errors flag, emit an error - * message, and return the error code. - */ - delta = __ntfs_cluster_free(ni, start_vcn, total_freed, ctx, true); - if (delta < 0) { - ntfs_error(vol->sb, "Failed to rollback (error %i). Leaving " - "inconsistent metadata! Unmount and run " - "chkdsk.", (int)delta); - NVolSetErrors(vol); - } - up_write(&vol->lcnbmp_lock); - ntfs_error(vol->sb, "Aborting (error %i).", err); - return err; -} - -#endif /* NTFS_RW */ diff --git a/fs/ntfs/lcnalloc.h b/fs/ntfs/lcnalloc.h deleted file mode 100644 index 1589a6d8434b..000000000000 --- a/fs/ntfs/lcnalloc.h +++ /dev/null @@ -1,131 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * lcnalloc.h - Exports for NTFS kernel cluster (de)allocation. Part of the - * Linux-NTFS project. - * - * Copyright (c) 2004-2005 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_LCNALLOC_H -#define _LINUX_NTFS_LCNALLOC_H - -#ifdef NTFS_RW - -#include <linux/fs.h> - -#include "attrib.h" -#include "types.h" -#include "inode.h" -#include "runlist.h" -#include "volume.h" - -typedef enum { - FIRST_ZONE = 0, /* For sanity checking. */ - MFT_ZONE = 0, /* Allocate from $MFT zone. */ - DATA_ZONE = 1, /* Allocate from $DATA zone. */ - LAST_ZONE = 1, /* For sanity checking. 
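The freeing in __ntfs_cluster_free() above happens one run at a time, by flipping a run of bits in the volume lcn bitmap via ntfs_bitmap_set_bits_in_run() (value 0 frees the clusters, value 1 re-allocates them during rollback). A toy user-space model of that bit-run operation (illustrative only, not the driver's implementation):

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Toy lcn bitmap: set @count bits starting at @start to @value, the
 * per-run operation __ntfs_cluster_free() performs (0 = free the
 * clusters, 1 = re-allocate them again when rolling back). */
static void set_bits_in_run(uint8_t *bmp, int64_t start, int64_t count,
                int value)
{
        for (int64_t bit = start; bit < start + count; bit++) {
                if (value)
                        bmp[bit >> 3] |= 1 << (bit & 7);
                else
                        bmp[bit >> 3] &= ~(1 << (bit & 7));
        }
}

int main(void)
{
        uint8_t bmp[8];

        memset(bmp, 0xff, sizeof(bmp)); /* all clusters allocated */
        set_bits_in_run(bmp, 10, 5, 0); /* free lcn 10..14 */
        assert(bmp[1] == 0x83 && bmp[2] == 0xff);
        return 0;
}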
*/ -} NTFS_CLUSTER_ALLOCATION_ZONES; - -extern runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, - const VCN start_vcn, const s64 count, const LCN start_lcn, - const NTFS_CLUSTER_ALLOCATION_ZONES zone, - const bool is_extension); - -extern s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, - s64 count, ntfs_attr_search_ctx *ctx, const bool is_rollback); - -/** - * ntfs_cluster_free - free clusters on an ntfs volume - * @ni: ntfs inode whose runlist describes the clusters to free - * @start_vcn: vcn in the runlist of @ni at which to start freeing clusters - * @count: number of clusters to free or -1 for all clusters - * @ctx: active attribute search context if present or NULL if not - * - * Free @count clusters starting at the cluster @start_vcn in the runlist - * described by the ntfs inode @ni. - * - * If @count is -1, all clusters from @start_vcn to the end of the runlist are - * deallocated. Thus, to completely free all clusters in a runlist, use - * @start_vcn = 0 and @count = -1. - * - * If @ctx is specified, it is an active search context of @ni and its base mft - * record. This is needed when ntfs_cluster_free() encounters unmapped runlist - * fragments and allows their mapping. If you do not have the mft record - * mapped, you can specify @ctx as NULL and ntfs_cluster_free() will perform - * the necessary mapping and unmapping. - * - * Note, ntfs_cluster_free() saves the state of @ctx on entry and restores it - * before returning. Thus, @ctx will be left pointing to the same attribute on - * return as on entry. However, the actual pointers in @ctx may point to - * different memory locations on return, so you must remember to reset any - * cached pointers from the @ctx, i.e. after the call to ntfs_cluster_free(), - * you will probably want to do: - * m = ctx->mrec; - * a = ctx->attr; - * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that - * you cache ctx->mrec in a variable @m of type MFT_RECORD *. - * - * Note, ntfs_cluster_free() does not modify the runlist, so you have to remove - * from the runlist or mark sparse the freed runs later. - * - * Return the number of deallocated clusters (not counting sparse ones) on - * success and -errno on error. - * - * WARNING: If @ctx is supplied, regardless of whether success or failure is - * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx - * is no longer valid, i.e. you need to either call - * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it. - * In that case PTR_ERR(@ctx->mrec) will give you the error code for - * why the mapping of the old inode failed. - * - * Locking: - The runlist described by @ni must be locked for writing on entry - * and is locked on return. Note the runlist may be modified when - * needed runlist fragments need to be mapped. - * - The volume lcn bitmap must be unlocked on entry and is unlocked - * on return. - * - This function takes the volume lcn bitmap lock for writing and - * modifies the bitmap contents. - * - If @ctx is NULL, the base mft record of @ni must not be mapped on - * entry and it will be left unmapped on return. - * - If @ctx is not NULL, the base mft record must be mapped on entry - * and it will be left mapped on return. 
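Because ntfs_cluster_free() can remap the mft record backing @ctx, any pointers cached from it must be re-fetched afterwards, exactly as the comment above spells out. A hypothetical caller sketch (kernel context assumed; example_free_tail is not a real driver function, only an illustration of the documented calling convention):

/* Hypothetical caller: free everything from @start_vcn on and refresh
 * the pointers cached from @ctx, per the rules documented above. */
static int example_free_tail(ntfs_inode *ni, const VCN start_vcn,
                ntfs_attr_search_ctx *ctx)
{
        MFT_RECORD *m;
        ATTR_RECORD *a;
        s64 nr;

        nr = ntfs_cluster_free(ni, start_vcn, -1, ctx);
        if (IS_ERR(ctx->mrec)) /* @ctx was invalidated by remapping. */
                return PTR_ERR(ctx->mrec);
        if (nr < 0)
                return (int)nr;
        /* The record may have moved; re-fetch the cached pointers. */
        m = ctx->mrec;
        a = ctx->attr;
        /* ... now mark the freed runs sparse in the runlist via @m/@a ... */
        return 0;
}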
- */ -static inline s64 ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, - s64 count, ntfs_attr_search_ctx *ctx) -{ - return __ntfs_cluster_free(ni, start_vcn, count, ctx, false); -} - -extern int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol, - const runlist_element *rl); - -/** - * ntfs_cluster_free_from_rl - free clusters from runlist - * @vol: mounted ntfs volume on which to free the clusters - * @rl: runlist describing the clusters to free - * - * Free all the clusters described by the runlist @rl on the volume @vol. In - * the case of an error being returned, at least some of the clusters were not - * freed. - * - * Return 0 on success and -errno on error. - * - * Locking: - This function takes the volume lcn bitmap lock for writing and - * modifies the bitmap contents. - * - The caller must have locked the runlist @rl for reading or - * writing. - */ -static inline int ntfs_cluster_free_from_rl(ntfs_volume *vol, - const runlist_element *rl) -{ - int ret; - - down_write(&vol->lcnbmp_lock); - ret = ntfs_cluster_free_from_rl_nolock(vol, rl); - up_write(&vol->lcnbmp_lock); - return ret; -} - -#endif /* NTFS_RW */ - -#endif /* defined _LINUX_NTFS_LCNALLOC_H */ diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c deleted file mode 100644 index 6ce60ffc6ac0..000000000000 --- a/fs/ntfs/logfile.c +++ /dev/null @@ -1,849 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * logfile.c - NTFS kernel journal handling. Part of the Linux-NTFS project. - * - * Copyright (c) 2002-2007 Anton Altaparmakov - */ - -#ifdef NTFS_RW - -#include <linux/types.h> -#include <linux/fs.h> -#include <linux/highmem.h> -#include <linux/buffer_head.h> -#include <linux/bitops.h> -#include <linux/log2.h> -#include <linux/bio.h> - -#include "attrib.h" -#include "aops.h" -#include "debug.h" -#include "logfile.h" -#include "malloc.h" -#include "volume.h" -#include "ntfs.h" - -/** - * ntfs_check_restart_page_header - check the page header for consistency - * @vi: $LogFile inode to which the restart page header belongs - * @rp: restart page header to check - * @pos: position in @vi at which the restart page header resides - * - * Check the restart page header @rp for consistency and return 'true' if it is - * consistent and 'false' otherwise. - * - * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not - * require the full restart page. - */ -static bool ntfs_check_restart_page_header(struct inode *vi, - RESTART_PAGE_HEADER *rp, s64 pos) -{ - u32 logfile_system_page_size, logfile_log_page_size; - u16 ra_ofs, usa_count, usa_ofs, usa_end = 0; - bool have_usa = true; - - ntfs_debug("Entering."); - /* - * If the system or log page sizes are smaller than the ntfs block size - * or either is not a power of 2 we cannot handle this log file. - */ - logfile_system_page_size = le32_to_cpu(rp->system_page_size); - logfile_log_page_size = le32_to_cpu(rp->log_page_size); - if (logfile_system_page_size < NTFS_BLOCK_SIZE || - logfile_log_page_size < NTFS_BLOCK_SIZE || - logfile_system_page_size & - (logfile_system_page_size - 1) || - !is_power_of_2(logfile_log_page_size)) { - ntfs_error(vi->i_sb, "$LogFile uses unsupported page size."); - return false; - } - /* - * We must be either at !pos (1st restart page) or at pos = system page - * size (2nd restart page). - */ - if (pos && pos != logfile_system_page_size) { - ntfs_error(vi->i_sb, "Found restart area in incorrect " - "position in $LogFile."); - return false; - } - /* We only know how to handle version 1.1. 
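The page-size validation in ntfs_check_restart_page_header() above boils down to the usual single-bit test; a self-contained sketch of the same check (user-space demo, names hypothetical):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* The test applied to system_page_size/log_page_size above: non-zero
 * and a power of two, i.e. exactly one bit set. */
static bool is_pow2(uint32_t x)
{
        return x && !(x & (x - 1));
}

int main(void)
{
        assert(is_pow2(4096));          /* the default log page size */
        assert(!is_pow2(0));
        assert(!is_pow2(4096 + 512));   /* rejected as unsupported */
        return 0;
}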
*/ - if (sle16_to_cpu(rp->major_ver) != 1 || - sle16_to_cpu(rp->minor_ver) != 1) { - ntfs_error(vi->i_sb, "$LogFile version %i.%i is not " - "supported. (This driver supports version " - "1.1 only.)", (int)sle16_to_cpu(rp->major_ver), - (int)sle16_to_cpu(rp->minor_ver)); - return false; - } - /* - * If chkdsk has been run the restart page may not be protected by an - * update sequence array. - */ - if (ntfs_is_chkd_record(rp->magic) && !le16_to_cpu(rp->usa_count)) { - have_usa = false; - goto skip_usa_checks; - } - /* Verify the size of the update sequence array. */ - usa_count = 1 + (logfile_system_page_size >> NTFS_BLOCK_SIZE_BITS); - if (usa_count != le16_to_cpu(rp->usa_count)) { - ntfs_error(vi->i_sb, "$LogFile restart page specifies " - "inconsistent update sequence array count."); - return false; - } - /* Verify the position of the update sequence array. */ - usa_ofs = le16_to_cpu(rp->usa_ofs); - usa_end = usa_ofs + usa_count * sizeof(u16); - if (usa_ofs < sizeof(RESTART_PAGE_HEADER) || - usa_end > NTFS_BLOCK_SIZE - sizeof(u16)) { - ntfs_error(vi->i_sb, "$LogFile restart page specifies " - "inconsistent update sequence array offset."); - return false; - } -skip_usa_checks: - /* - * Verify the position of the restart area. It must be: - * - aligned to 8-byte boundary, - * - after the update sequence array, and - * - within the system page size. - */ - ra_ofs = le16_to_cpu(rp->restart_area_offset); - if (ra_ofs & 7 || (have_usa ? ra_ofs < usa_end : - ra_ofs < sizeof(RESTART_PAGE_HEADER)) || - ra_ofs > logfile_system_page_size) { - ntfs_error(vi->i_sb, "$LogFile restart page specifies " - "inconsistent restart area offset."); - return false; - } - /* - * Only restart pages modified by chkdsk are allowed to have chkdsk_lsn - * set. - */ - if (!ntfs_is_chkd_record(rp->magic) && sle64_to_cpu(rp->chkdsk_lsn)) { - ntfs_error(vi->i_sb, "$LogFile restart page is not modified " - "by chkdsk but a chkdsk LSN is specified."); - return false; - } - ntfs_debug("Done."); - return true; -} - -/** - * ntfs_check_restart_area - check the restart area for consistency - * @vi: $LogFile inode to which the restart page belongs - * @rp: restart page whose restart area to check - * - * Check the restart area of the restart page @rp for consistency and return - * 'true' if it is consistent and 'false' otherwise. - * - * This function assumes that the restart page header has already been - * consistency checked. - * - * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not - * require the full restart page. - */ -static bool ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp) -{ - u64 file_size; - RESTART_AREA *ra; - u16 ra_ofs, ra_len, ca_ofs; - u8 fs_bits; - - ntfs_debug("Entering."); - ra_ofs = le16_to_cpu(rp->restart_area_offset); - ra = (RESTART_AREA*)((u8*)rp + ra_ofs); - /* - * Everything before ra->file_size must be before the first word - * protected by an update sequence number. This ensures that it is - * safe to access ra->client_array_offset. - */ - if (ra_ofs + offsetof(RESTART_AREA, file_size) > - NTFS_BLOCK_SIZE - sizeof(u16)) { - ntfs_error(vi->i_sb, "$LogFile restart area specifies " - "inconsistent file offset."); - return false; - } - /* - * Now that we can access ra->client_array_offset, make sure everything - * up to the log client array is before the first word protected by an - * update sequence number. This ensures we can access all of the - * restart area elements safely. Also, the client array offset must be - * aligned to an 8-byte boundary. 
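The 8-byte alignment requirement is checked by rounding up and comparing, the same idiom the code below applies to the client array offset; a minimal stand-alone version (illustrative only, values taken from the layout comments in logfile.h):

#include <assert.h>
#include <stdint.h>

/* An offset is 8-byte aligned iff rounding it up to 8 is a no-op. */
static int aligned8(uint16_t ofs)
{
        return ((ofs + 7) & ~7) == ofs;
}

int main(void)
{
        assert(aligned8(0x30));         /* Win2k client array offset */
        assert(aligned8(0x40));         /* WinXP client array offset */
        assert(!aligned8(0x31));        /* misaligned, would be rejected */
        return 0;
}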
- */ - ca_ofs = le16_to_cpu(ra->client_array_offset); - if (((ca_ofs + 7) & ~7) != ca_ofs || - ra_ofs + ca_ofs > NTFS_BLOCK_SIZE - sizeof(u16)) { - ntfs_error(vi->i_sb, "$LogFile restart area specifies " - "inconsistent client array offset."); - return false; - } - /* - * The restart area must end within the system page size both when - * calculated manually and as specified by ra->restart_area_length. - * Also, the calculated length must not exceed the specified length. - */ - ra_len = ca_ofs + le16_to_cpu(ra->log_clients) * - sizeof(LOG_CLIENT_RECORD); - if (ra_ofs + ra_len > le32_to_cpu(rp->system_page_size) || - ra_ofs + le16_to_cpu(ra->restart_area_length) > - le32_to_cpu(rp->system_page_size) || - ra_len > le16_to_cpu(ra->restart_area_length)) { - ntfs_error(vi->i_sb, "$LogFile restart area is out of bounds " - "of the system page size specified by the " - "restart page header and/or the specified " - "restart area length is inconsistent."); - return false; - } - /* - * The ra->client_free_list and ra->client_in_use_list must be either - * LOGFILE_NO_CLIENT or less than ra->log_clients or they are - * overflowing the client array. - */ - if ((ra->client_free_list != LOGFILE_NO_CLIENT && - le16_to_cpu(ra->client_free_list) >= - le16_to_cpu(ra->log_clients)) || - (ra->client_in_use_list != LOGFILE_NO_CLIENT && - le16_to_cpu(ra->client_in_use_list) >= - le16_to_cpu(ra->log_clients))) { - ntfs_error(vi->i_sb, "$LogFile restart area specifies " - "overflowing client free and/or in use lists."); - return false; - } - /* - * Check ra->seq_number_bits against ra->file_size for consistency. - * We cannot just use ffs() because the file size is not a power of 2. - */ - file_size = (u64)sle64_to_cpu(ra->file_size); - fs_bits = 0; - while (file_size) { - file_size >>= 1; - fs_bits++; - } - if (le32_to_cpu(ra->seq_number_bits) != 67 - fs_bits) { - ntfs_error(vi->i_sb, "$LogFile restart area specifies " - "inconsistent sequence number bits."); - return false; - } - /* The log record header length must be a multiple of 8. */ - if (((le16_to_cpu(ra->log_record_header_length) + 7) & ~7) != - le16_to_cpu(ra->log_record_header_length)) { - ntfs_error(vi->i_sb, "$LogFile restart area specifies " - "inconsistent log record header length."); - return false; - } - /* Dito for the log page data offset. */ - if (((le16_to_cpu(ra->log_page_data_offset) + 7) & ~7) != - le16_to_cpu(ra->log_page_data_offset)) { - ntfs_error(vi->i_sb, "$LogFile restart area specifies " - "inconsistent log page data offset."); - return false; - } - ntfs_debug("Done."); - return true; -} - -/** - * ntfs_check_log_client_array - check the log client array for consistency - * @vi: $LogFile inode to which the restart page belongs - * @rp: restart page whose log client array to check - * - * Check the log client array of the restart page @rp for consistency and - * return 'true' if it is consistent and 'false' otherwise. - * - * This function assumes that the restart page header and the restart area have - * already been consistency checked. - * - * Unlike ntfs_check_restart_page_header() and ntfs_check_restart_area(), this - * function needs @rp->system_page_size bytes in @rp, i.e. it requires the full - * restart page and the page must be multi sector transfer deprotected. 
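The seq_number_bits consistency check in ntfs_check_restart_area() above recomputes the field from the file size by counting bits, since the size need not be a power of two; a stand-alone version of that computation (illustrative sketch):

#include <assert.h>
#include <stdint.h>

/* Recompute ra->seq_number_bits as ntfs_check_restart_area() does:
 * count the bits needed to store file_size, then subtract from 67. */
static uint32_t expected_seq_number_bits(uint64_t file_size)
{
        uint8_t fs_bits = 0;

        while (file_size) {
                file_size >>= 1;
                fs_bits++;
        }
        return 67 - fs_bits;
}

int main(void)
{
        /* A 64 MiB $LogFile needs 27 bits, leaving 40 bits of sequence. */
        assert(expected_seq_number_bits(64ULL << 20) == 40);
        return 0;
}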
- */ -static bool ntfs_check_log_client_array(struct inode *vi, - RESTART_PAGE_HEADER *rp) -{ - RESTART_AREA *ra; - LOG_CLIENT_RECORD *ca, *cr; - u16 nr_clients, idx; - bool in_free_list, idx_is_first; - - ntfs_debug("Entering."); - ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset)); - ca = (LOG_CLIENT_RECORD*)((u8*)ra + - le16_to_cpu(ra->client_array_offset)); - /* - * Check the ra->client_free_list first and then check the - * ra->client_in_use_list. Check each of the log client records in - * each of the lists and check that the array does not overflow the - * ra->log_clients value. Also keep track of the number of records - * visited as there cannot be more than ra->log_clients records and - * that way we detect eventual loops within a list. - */ - nr_clients = le16_to_cpu(ra->log_clients); - idx = le16_to_cpu(ra->client_free_list); - in_free_list = true; -check_list: - for (idx_is_first = true; idx != LOGFILE_NO_CLIENT_CPU; nr_clients--, - idx = le16_to_cpu(cr->next_client)) { - if (!nr_clients || idx >= le16_to_cpu(ra->log_clients)) - goto err_out; - /* Set @cr to the current log client record. */ - cr = ca + idx; - /* The first log client record must not have a prev_client. */ - if (idx_is_first) { - if (cr->prev_client != LOGFILE_NO_CLIENT) - goto err_out; - idx_is_first = false; - } - } - /* Switch to and check the in use list if we just did the free list. */ - if (in_free_list) { - in_free_list = false; - idx = le16_to_cpu(ra->client_in_use_list); - goto check_list; - } - ntfs_debug("Done."); - return true; -err_out: - ntfs_error(vi->i_sb, "$LogFile log client array is corrupt."); - return false; -} - -/** - * ntfs_check_and_load_restart_page - check the restart page for consistency - * @vi: $LogFile inode to which the restart page belongs - * @rp: restart page to check - * @pos: position in @vi at which the restart page resides - * @wrp: [OUT] copy of the multi sector transfer deprotected restart page - * @lsn: [OUT] set to the current logfile lsn on success - * - * Check the restart page @rp for consistency and return 0 if it is consistent - * and -errno otherwise. The restart page may have been modified by chkdsk in - * which case its magic is CHKD instead of RSTR. - * - * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not - * require the full restart page. - * - * If @wrp is not NULL, on success, *@wrp will point to a buffer containing a - * copy of the complete multi sector transfer deprotected page. On failure, - * *@wrp is undefined. - * - * Similarly, if @lsn is not NULL, on success *@lsn will be set to the current - * logfile lsn according to this restart page. On failure, *@lsn is undefined. - * - * The following error codes are defined: - * -EINVAL - The restart page is inconsistent. - * -ENOMEM - Not enough memory to load the restart page. - * -EIO - Failed to read from $LogFile. - */ -static int ntfs_check_and_load_restart_page(struct inode *vi, - RESTART_PAGE_HEADER *rp, s64 pos, RESTART_PAGE_HEADER **wrp, - LSN *lsn) -{ - RESTART_AREA *ra; - RESTART_PAGE_HEADER *trp; - int size, err; - - ntfs_debug("Entering."); - /* Check the restart page header for consistency. */ - if (!ntfs_check_restart_page_header(vi, rp, pos)) { - /* Error output already done inside the function. */ - return -EINVAL; - } - /* Check the restart area for consistency. */ - if (!ntfs_check_restart_area(vi, rp)) { - /* Error output already done inside the function. */ - return -EINVAL; - }
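The list walk in ntfs_check_log_client_array() above caps the number of visited records at ra->log_clients so that a looped or overflowing chain is detected instead of iterated forever. A toy user-space model of that guard (hypothetical code, simplified to next-pointers only):

#include <assert.h>
#include <stdint.h>

#define NO_CLIENT 0xffff        /* stands in for LOGFILE_NO_CLIENT_CPU */

struct client { uint16_t next; };

/* Follow the index-linked chain, spending one unit of budget per
 * visit; a cycle or an out-of-range index fails the check. */
static int chain_is_sane(const struct client *ca, uint16_t nr_clients,
                uint16_t idx)
{
        uint16_t budget = nr_clients;

        for (; idx != NO_CLIENT; budget--, idx = ca[idx].next)
                if (!budget || idx >= nr_clients)
                        return 0;
        return 1;
}

int main(void)
{
        struct client ok[2] = { { .next = 1 }, { .next = NO_CLIENT } };
        struct client loop[2] = { { .next = 1 }, { .next = 0 } };

        assert(chain_is_sane(ok, 2, 0));
        assert(!chain_is_sane(loop, 2, 0));     /* cycle exhausts budget */
        return 0;
}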
- ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset)); - /* - * Allocate a buffer to store the whole restart page so we can multi - * sector transfer deprotect it. - */ - trp = ntfs_malloc_nofs(le32_to_cpu(rp->system_page_size)); - if (!trp) { - ntfs_error(vi->i_sb, "Failed to allocate memory for $LogFile " - "restart page buffer."); - return -ENOMEM; - } - /* - * Read the whole of the restart page into the buffer. If it fits - * completely inside @rp, just copy it from there. Otherwise map all - * the required pages and copy the data from them. - */ - size = PAGE_SIZE - (pos & ~PAGE_MASK); - if (size >= le32_to_cpu(rp->system_page_size)) { - memcpy(trp, rp, le32_to_cpu(rp->system_page_size)); - } else { - pgoff_t idx; - struct page *page; - int have_read, to_read; - - /* First copy what we already have in @rp. */ - memcpy(trp, rp, size); - /* Copy the remaining data one page at a time. */ - have_read = size; - to_read = le32_to_cpu(rp->system_page_size) - size; - idx = (pos + size) >> PAGE_SHIFT; - BUG_ON((pos + size) & ~PAGE_MASK); - do { - page = ntfs_map_page(vi->i_mapping, idx); - if (IS_ERR(page)) { - ntfs_error(vi->i_sb, "Error mapping $LogFile " - "page (index %lu).", idx); - err = PTR_ERR(page); - if (err != -EIO && err != -ENOMEM) - err = -EIO; - goto err_out; - } - size = min_t(int, to_read, PAGE_SIZE); - memcpy((u8*)trp + have_read, page_address(page), size); - ntfs_unmap_page(page); - have_read += size; - to_read -= size; - idx++; - } while (to_read > 0); - } - /* - * Perform the multi sector transfer deprotection on the buffer if the - * restart page is protected. - */ - if ((!ntfs_is_chkd_record(trp->magic) || le16_to_cpu(trp->usa_count)) - && post_read_mst_fixup((NTFS_RECORD*)trp, - le32_to_cpu(rp->system_page_size))) { - /* - * A multi sector transfer error was detected. We only need to - * abort if the restart page contents exceed the multi sector - * transfer fixup of the first sector. - */ - if (le16_to_cpu(rp->restart_area_offset) + - le16_to_cpu(ra->restart_area_length) > - NTFS_BLOCK_SIZE - sizeof(u16)) { - ntfs_error(vi->i_sb, "Multi sector transfer error " - "detected in $LogFile restart page."); - err = -EINVAL; - goto err_out; - } - } - /* - * If the restart page is modified by chkdsk or there are no active - * logfile clients, the logfile is consistent. Otherwise, we need to - * check the log client records for consistency, too. - */ - err = 0; - if (ntfs_is_rstr_record(rp->magic) && - ra->client_in_use_list != LOGFILE_NO_CLIENT) { - if (!ntfs_check_log_client_array(vi, trp)) { - err = -EINVAL; - goto err_out; - } - } - if (lsn) { - if (ntfs_is_rstr_record(rp->magic)) - *lsn = sle64_to_cpu(ra->current_lsn); - else /* if (ntfs_is_chkd_record(rp->magic)) */ - *lsn = sle64_to_cpu(rp->chkdsk_lsn); - } - ntfs_debug("Done."); - if (wrp) - *wrp = trp; - else { -err_out: - ntfs_free(trp); - } - return err; -} - -/** - * ntfs_check_logfile - check the journal for consistency - * @log_vi: struct inode of loaded journal $LogFile to check - * @rp: [OUT] on success this is a copy of the current restart page - * - * Check the $LogFile journal for consistency and return 'true' if it is - * consistent and 'false' if not. On success, the current restart page is - * returned in *@rp. Caller must call ntfs_free(*@rp) when finished with it. - * - * At present we only check the two restart pages and ignore the log record - * pages.
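Multi sector transfer protection, which post_read_mst_fixup() undoes above, swaps the last two bytes of every 512-byte block of a record with slots of an update sequence array, so a torn write is detectable as a mismatched tail. A toy host-side model of the protect/deprotect pair (simplified: in the real format the array lives inside the record itself):

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define SEC 512 /* NTFS_BLOCK_SIZE */

/* Save each sector's last u16 into the array and stamp the usn. */
static void pre_write_fixup(uint8_t *rec, uint16_t *usa, int sectors)
{
        for (int i = 0; i < sectors; i++) {
                uint8_t *tail = rec + (i + 1) * SEC - 2;
                memcpy(&usa[1 + i], tail, 2);   /* save the real bytes */
                memcpy(tail, &usa[0], 2);       /* stamp the usn */
        }
}

/* Verify every tail still carries the usn, then restore the data. */
static int post_read_fixup(uint8_t *rec, const uint16_t *usa, int sectors)
{
        for (int i = 0; i < sectors; i++) {
                uint8_t *tail = rec + (i + 1) * SEC - 2;
                uint16_t got;

                memcpy(&got, tail, 2);
                if (got != usa[0])
                        return -1;      /* torn multi sector transfer */
                memcpy(tail, &usa[1 + i], 2);   /* restore real bytes */
        }
        return 0;
}

int main(void)
{
        uint8_t rec[2 * SEC];
        uint16_t usa[3] = { 0x1234 };   /* usn, then one slot per sector */

        memset(rec, 0xab, sizeof(rec));
        pre_write_fixup(rec, usa, 2);
        assert(post_read_fixup(rec, usa, 2) == 0);
        assert(rec[SEC - 2] == 0xab && rec[2 * SEC - 1] == 0xab);
        return 0;
}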
- * - * Note that the MstProtected flag is not set on the $LogFile inode and hence - * when reading pages they are not deprotected. This is because we do not know - * if the $LogFile was created on a system with a different page size to ours - * yet and mst deprotection would fail if our page size is smaller. - */ -bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp) -{ - s64 size, pos; - LSN rstr1_lsn, rstr2_lsn; - ntfs_volume *vol = NTFS_SB(log_vi->i_sb); - struct address_space *mapping = log_vi->i_mapping; - struct page *page = NULL; - u8 *kaddr = NULL; - RESTART_PAGE_HEADER *rstr1_ph = NULL; - RESTART_PAGE_HEADER *rstr2_ph = NULL; - int log_page_size, err; - bool logfile_is_empty = true; - u8 log_page_bits; - - ntfs_debug("Entering."); - /* An empty $LogFile must have been clean before it got emptied. */ - if (NVolLogFileEmpty(vol)) - goto is_empty; - size = i_size_read(log_vi); - /* Make sure the file doesn't exceed the maximum allowed size. */ - if (size > MaxLogFileSize) - size = MaxLogFileSize; - /* - * Truncate size to a multiple of the page cache size or the default - * log page size if the page cache size is between the default log page - * size and twice that. - */ - if (PAGE_SIZE >= DefaultLogPageSize && PAGE_SIZE <= - DefaultLogPageSize * 2) - log_page_size = DefaultLogPageSize; - else - log_page_size = PAGE_SIZE; - /* - * Use ntfs_ffs() instead of ffs() to enable the compiler to - * optimize log_page_size and log_page_bits into constants. - */ - log_page_bits = ntfs_ffs(log_page_size) - 1; - size &= ~(s64)(log_page_size - 1); - /* - * Ensure the log file is big enough to store at least the two restart - * pages and the minimum number of log record pages. - */ - if (size < log_page_size * 2 || (size - log_page_size * 2) >> - log_page_bits < MinLogRecordPages) { - ntfs_error(vol->sb, "$LogFile is too small."); - return false; - } - /* - * Read through the file looking for a restart page. Since the restart - * page header is at the beginning of a page we only need to search at - * what could be the beginning of a page (for each page size) rather - * than scanning the whole file byte by byte. If all potential places - * contain empty and uninitialized records, the log file can be assumed - * to be empty. - */ - for (pos = 0; pos < size; pos <<= 1) { - pgoff_t idx = pos >> PAGE_SHIFT; - if (!page || page->index != idx) { - if (page) - ntfs_unmap_page(page); - page = ntfs_map_page(mapping, idx); - if (IS_ERR(page)) { - ntfs_error(vol->sb, "Error mapping $LogFile " - "page (index %lu).", idx); - goto err_out; - } - } - kaddr = (u8*)page_address(page) + (pos & ~PAGE_MASK); - /* - * A non-empty block means the logfile is not empty while an - * empty block after a non-empty block has been encountered - * means we are done. - */ - if (!ntfs_is_empty_recordp((le32*)kaddr)) - logfile_is_empty = false; - else if (!logfile_is_empty) - break; - /* - * A log record page means there cannot be a restart page after - * this so no need to continue searching. - */ - if (ntfs_is_rcrd_recordp((le32*)kaddr)) - break; - /* If not a (modified by chkdsk) restart page, continue. */ - if (!ntfs_is_rstr_recordp((le32*)kaddr) && - !ntfs_is_chkd_recordp((le32*)kaddr)) { - if (!pos) - pos = NTFS_BLOCK_SIZE >> 1; - continue; - } - /* - * Check the (modified by chkdsk) restart page for consistency - * and get a copy of the complete multi sector transfer - * deprotected restart page.
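The scan loop above only probes offsets where a restart page could start: offset 0, then every power of two from NTFS_BLOCK_SIZE upwards. A small stand-alone illustration of the probe sequence (demo only, size capped for the example):

#include <stdio.h>

#define NTFS_BLOCK_SIZE 512

int main(void)
{
        long long size = 64 * 1024;     /* demo cap */

        /* Same progression as the loop above: 0, 512, 1024, 2048, ... */
        for (long long pos = 0; pos < size; pos <<= 1) {
                printf("probe restart page at 0x%llx\n", pos);
                if (!pos)
                        pos = NTFS_BLOCK_SIZE >> 1;     /* doubles to 512 */
        }
        return 0;
}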
- */ - err = ntfs_check_and_load_restart_page(log_vi, - (RESTART_PAGE_HEADER*)kaddr, pos, - !rstr1_ph ? &rstr1_ph : &rstr2_ph, - !rstr1_ph ? &rstr1_lsn : &rstr2_lsn); - if (!err) { - /* - * If we have now found the first (modified by chkdsk) - * restart page, continue looking for the second one. - */ - if (!pos) { - pos = NTFS_BLOCK_SIZE >> 1; - continue; - } - /* - * We have now found the second (modified by chkdsk) - * restart page, so we can stop looking. - */ - break; - } - /* - * Error output already done inside the function. Note, we do - * not abort if the restart page was invalid as we might still - * find a valid one further in the file. - */ - if (err != -EINVAL) { - ntfs_unmap_page(page); - goto err_out; - } - /* Continue looking. */ - if (!pos) - pos = NTFS_BLOCK_SIZE >> 1; - } - if (page) - ntfs_unmap_page(page); - if (logfile_is_empty) { - NVolSetLogFileEmpty(vol); -is_empty: - ntfs_debug("Done. ($LogFile is empty.)"); - return true; - } - if (!rstr1_ph) { - BUG_ON(rstr2_ph); - ntfs_error(vol->sb, "Did not find any restart pages in " - "$LogFile and it was not empty."); - return false; - } - /* If both restart pages were found, use the more recent one. */ - if (rstr2_ph) { - /* - * If the second restart area is more recent, switch to it. - * Otherwise just throw it away. - */ - if (rstr2_lsn > rstr1_lsn) { - ntfs_debug("Using second restart page as it is more " - "recent."); - ntfs_free(rstr1_ph); - rstr1_ph = rstr2_ph; - /* rstr1_lsn = rstr2_lsn; */ - } else { - ntfs_debug("Using first restart page as it is more " - "recent."); - ntfs_free(rstr2_ph); - } - rstr2_ph = NULL; - } - /* All consistency checks passed. */ - if (rp) - *rp = rstr1_ph; - else - ntfs_free(rstr1_ph); - ntfs_debug("Done."); - return true; -err_out: - if (rstr1_ph) - ntfs_free(rstr1_ph); - return false; -} - -/** - * ntfs_is_logfile_clean - check in the journal if the volume is clean - * @log_vi: struct inode of loaded journal $LogFile to check - * @rp: copy of the current restart page - * - * Analyze the $LogFile journal and return 'true' if it indicates the volume was - * shutdown cleanly and 'false' if not. - * - * At present we only look at the two restart pages and ignore the log record - * pages. This is a little bit crude in that there will be a very small number - * of cases where we think that a volume is dirty when in fact it is clean. - * This should only affect volumes that have not been shutdown cleanly but did - * not have any pending, non-check-pointed i/o, i.e. they were completely idle - * at least for the five seconds preceding the unclean shutdown. - * - * This function assumes that the $LogFile journal has already been consistency - * checked by a call to ntfs_check_logfile() and in particular if the $LogFile - * is empty this function requires that NVolLogFileEmpty() is true otherwise an - * empty volume will be reported as dirty. - */ -bool ntfs_is_logfile_clean(struct inode *log_vi, const RESTART_PAGE_HEADER *rp) -{ - ntfs_volume *vol = NTFS_SB(log_vi->i_sb); - RESTART_AREA *ra; - - ntfs_debug("Entering."); - /* An empty $LogFile must have been clean before it got emptied. */ - if (NVolLogFileEmpty(vol)) { - ntfs_debug("Done. ($LogFile is empty.)"); - return true; - } - BUG_ON(!rp); - if (!ntfs_is_rstr_record(rp->magic) && - !ntfs_is_chkd_record(rp->magic)) { - ntfs_error(vol->sb, "Restart page buffer is invalid. 
This is " - "probably a bug in that the $LogFile should " - "have been consistency checked before calling " - "this function."); - return false; - } - ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset)); - /* - * If the $LogFile has active clients, i.e. it is open, and we do not - * have the RESTART_VOLUME_IS_CLEAN bit set in the restart area flags, - * we assume there was an unclean shutdown. - */ - if (ra->client_in_use_list != LOGFILE_NO_CLIENT && - !(ra->flags & RESTART_VOLUME_IS_CLEAN)) { - ntfs_debug("Done. $LogFile indicates a dirty shutdown."); - return false; - } - /* $LogFile indicates a clean shutdown. */ - ntfs_debug("Done. $LogFile indicates a clean shutdown."); - return true; -} - -/** - * ntfs_empty_logfile - empty the contents of the $LogFile journal - * @log_vi: struct inode of loaded journal $LogFile to empty - * - * Empty the contents of the $LogFile journal @log_vi and return 'true' on - * success and 'false' on error. - * - * This function assumes that the $LogFile journal has already been consistency - * checked by a call to ntfs_check_logfile() and that ntfs_is_logfile_clean() - * has been used to ensure that the $LogFile is clean. - */ -bool ntfs_empty_logfile(struct inode *log_vi) -{ - VCN vcn, end_vcn; - ntfs_inode *log_ni = NTFS_I(log_vi); - ntfs_volume *vol = log_ni->vol; - struct super_block *sb = vol->sb; - runlist_element *rl; - unsigned long flags; - unsigned block_size, block_size_bits; - int err; - bool should_wait = true; - - ntfs_debug("Entering."); - if (NVolLogFileEmpty(vol)) { - ntfs_debug("Done."); - return true; - } - /* - * We cannot use ntfs_attr_set() because we may be still in the middle - * of a mount operation. Thus we do the emptying by hand by first - * zapping the page cache pages for the $LogFile/$DATA attribute and - * then emptying each of the buffers in each of the clusters specified - * by the runlist by hand. - */ - block_size = sb->s_blocksize; - block_size_bits = sb->s_blocksize_bits; - vcn = 0; - read_lock_irqsave(&log_ni->size_lock, flags); - end_vcn = (log_ni->initialized_size + vol->cluster_size_mask) >> - vol->cluster_size_bits; - read_unlock_irqrestore(&log_ni->size_lock, flags); - truncate_inode_pages(log_vi->i_mapping, 0); - down_write(&log_ni->runlist.lock); - rl = log_ni->runlist.rl; - if (unlikely(!rl || vcn < rl->vcn || !rl->length)) { -map_vcn: - err = ntfs_map_runlist_nolock(log_ni, vcn, NULL); - if (err) { - ntfs_error(sb, "Failed to map runlist fragment (error " - "%d).", -err); - goto err; - } - rl = log_ni->runlist.rl; - BUG_ON(!rl || vcn < rl->vcn || !rl->length); - } - /* Seek to the runlist element containing @vcn. */ - while (rl->length && vcn >= rl[1].vcn) - rl++; - do { - LCN lcn; - sector_t block, end_block; - s64 len; - - /* - * If this run is not mapped map it now and start again as the - * runlist will have been updated. - */ - lcn = rl->lcn; - if (unlikely(lcn == LCN_RL_NOT_MAPPED)) { - vcn = rl->vcn; - goto map_vcn; - } - /* If this run is not valid abort with an error. */ - if (unlikely(!rl->length || lcn < LCN_HOLE)) - goto rl_err; - /* Skip holes. */ - if (lcn == LCN_HOLE) - continue; - block = lcn << vol->cluster_size_bits >> block_size_bits; - len = rl->length; - if (rl[1].vcn > end_vcn) - len = end_vcn - rl->vcn; - end_block = (lcn + len) << vol->cluster_size_bits >> - block_size_bits; - /* Iterate over the blocks in the run and empty them. */ - do { - struct buffer_head *bh; - - /* Obtain the buffer, possibly not uptodate. 
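The dirty/clean decision in ntfs_is_logfile_clean() above reduces to a single predicate over two restart area fields; a host-endian toy version (constants per the layout comments in logfile.h below; the driver works on little-endian on-disk values):

#include <assert.h>
#include <stdint.h>

#define NO_CLIENT       0xffff  /* LOGFILE_NO_CLIENT */
#define VOLUME_IS_CLEAN 0x0002  /* RESTART_VOLUME_IS_CLEAN */

/* Clean iff the logfile is closed (Win2k style) or the clean bit is
 * set even though it is open (WinXP and later). */
static int logfile_clean(uint16_t client_in_use_list, uint16_t flags)
{
        return client_in_use_list == NO_CLIENT ||
                        (flags & VOLUME_IS_CLEAN);
}

int main(void)
{
        assert(logfile_clean(NO_CLIENT, 0));            /* closed: clean */
        assert(logfile_clean(0, VOLUME_IS_CLEAN));      /* open + bit: clean */
        assert(!logfile_clean(0, 0));                   /* open, no bit: dirty */
        return 0;
}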
*/ - bh = sb_getblk(sb, block); - BUG_ON(!bh); - /* Setup buffer i/o submission. */ - lock_buffer(bh); - bh->b_end_io = end_buffer_write_sync; - get_bh(bh); - /* Set the entire contents of the buffer to 0xff. */ - memset(bh->b_data, -1, block_size); - if (!buffer_uptodate(bh)) - set_buffer_uptodate(bh); - if (buffer_dirty(bh)) - clear_buffer_dirty(bh); - /* - * Submit the buffer and wait for i/o to complete but - * only for the first buffer so we do not miss really - * serious i/o errors. Once the first buffer has - * completed ignore errors afterwards as we can assume - * that if one buffer worked all of them will work. - */ - submit_bh(REQ_OP_WRITE, bh); - if (should_wait) { - should_wait = false; - wait_on_buffer(bh); - if (unlikely(!buffer_uptodate(bh))) - goto io_err; - } - brelse(bh); - } while (++block < end_block); - } while ((++rl)->vcn < end_vcn); - up_write(&log_ni->runlist.lock); - /* - * Zap the pages again just in case any got instantiated whilst we were - * emptying the blocks by hand. FIXME: We may not have completed - * writing to all the buffer heads yet so this may happen too early. - * We really should use a kernel thread to do the emptying - * asynchronously and then we can also set the volume dirty and output - * an error message if emptying should fail. - */ - truncate_inode_pages(log_vi->i_mapping, 0); - /* Set the flag so we do not have to do it again on remount. */ - NVolSetLogFileEmpty(vol); - ntfs_debug("Done."); - return true; -io_err: - ntfs_error(sb, "Failed to write buffer. Unmount and run chkdsk."); - goto dirty_err; -rl_err: - ntfs_error(sb, "Runlist is corrupt. Unmount and run chkdsk."); -dirty_err: - NVolSetErrors(vol); - err = -EIO; -err: - up_write(&log_ni->runlist.lock); - ntfs_error(sb, "Failed to fill $LogFile with 0xff bytes (error %d).", - -err); - return false; -} - -#endif /* NTFS_RW */ diff --git a/fs/ntfs/logfile.h b/fs/ntfs/logfile.h deleted file mode 100644 index 429d4909cc72..000000000000 --- a/fs/ntfs/logfile.h +++ /dev/null @@ -1,295 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * logfile.h - Defines for NTFS kernel journal ($LogFile) handling. Part of - * the Linux-NTFS project. - * - * Copyright (c) 2000-2005 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_LOGFILE_H -#define _LINUX_NTFS_LOGFILE_H - -#ifdef NTFS_RW - -#include <linux/fs.h> - -#include "types.h" -#include "endian.h" -#include "layout.h" - -/* - * Journal ($LogFile) organization: - * - * Two restart areas present in the first two pages (restart pages, one restart - * area in each page). When the volume is dismounted they should be identical, - * except for the update sequence array which usually has a different update - * sequence number. - * - * These are followed by log records organized in pages headed by a log record - * header going up to log file size. Not all pages contain log records when a - * volume is first formatted, but as the volume ages, all records will be used. - * When the log file fills up, the records at the beginning are purged (by - * modifying the oldest_lsn to a higher value presumably) and writing begins - * at the beginning of the file. Effectively, the log file is viewed as a - * circular entity. - * - * NOTE: Windows NT, 2000, and XP all use log file version 1.1 but they accept - * versions <= 1.x, including 0.-1. (Yes, that is a minus one in there!) We - * probably only want to support 1.1 as this seems to be the current version - * and we don't know how that differs from the older versions. 
The only - * exception is if the journal is clean as marked by the two restart pages - * then it doesn't matter whether we are on an earlier version. We can just - * reinitialize the logfile and start again with version 1.1. - */ - -/* Some $LogFile related constants. */ -#define MaxLogFileSize 0x100000000ULL -#define DefaultLogPageSize 4096 -#define MinLogRecordPages 48 - -/* - * Log file restart page header (begins the restart area). - */ -typedef struct { -/*Ofs*/ -/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ -/* 0*/ NTFS_RECORD_TYPE magic; /* The magic is "RSTR". */ -/* 4*/ le16 usa_ofs; /* See NTFS_RECORD definition in layout.h. - When creating, set this to be immediately - after this header structure (without any - alignment). */ -/* 6*/ le16 usa_count; /* See NTFS_RECORD definition in layout.h. */ - -/* 8*/ leLSN chkdsk_lsn; /* The last log file sequence number found by - chkdsk. Only used when the magic is changed - to "CHKD". Otherwise this is zero. */ -/* 16*/ le32 system_page_size; /* Byte size of system pages when the log file - was created, has to be >= 512 and a power of - 2. Use this to calculate the required size - of the usa (usa_count) and add it to usa_ofs. - Then verify that the result is less than the - value of the restart_area_offset. */ -/* 20*/ le32 log_page_size; /* Byte size of log file pages, has to be >= - 512 and a power of 2. The default is 4096 - and is used when the system page size is - between 4096 and 8192. Otherwise this is - set to the system page size instead. */ -/* 24*/ le16 restart_area_offset;/* Byte offset from the start of this header to - the RESTART_AREA. Value has to be aligned - to 8-byte boundary. When creating, set this - to be after the usa. */ -/* 26*/ sle16 minor_ver; /* Log file minor version. Only check if major - version is 1. */ -/* 28*/ sle16 major_ver; /* Log file major version. We only support - version 1.1. */ -/* sizeof() = 30 (0x1e) bytes */ -} __attribute__ ((__packed__)) RESTART_PAGE_HEADER; - -/* - * Constant for the log client indices meaning that there are no client records - * in this particular client array. Also inside the client records themselves, - * this means that there are no client records preceding or following this one. - */ -#define LOGFILE_NO_CLIENT cpu_to_le16(0xffff) -#define LOGFILE_NO_CLIENT_CPU 0xffff - -/* - * These are the so far known RESTART_AREA_* flags (16-bit) which contain - * information about the log file in which they are present. - */ -enum { - RESTART_VOLUME_IS_CLEAN = cpu_to_le16(0x0002), - RESTART_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */ -} __attribute__ ((__packed__)); - -typedef le16 RESTART_AREA_FLAGS; - -/* - * Log file restart area record. The offset of this record is found by adding - * the offset of the RESTART_PAGE_HEADER to the restart_area_offset value found - * in it. See notes at restart_area_offset above. - */ -typedef struct { -/*Ofs*/ -/* 0*/ leLSN current_lsn; /* The current, i.e. last LSN inside the log - when the restart area was last written. - This happens often but what is the interval? - Is it just fixed time or is it every time a - check point is written or somethine else? - On create set to 0. */ -/* 8*/ le16 log_clients; /* Number of log client records in the array of - log client records which follows this - restart area. Must be 1. */ -/* 10*/ le16 client_free_list; /* The index of the first free log client record - in the array of log client records. 
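The offsets annotated in RESTART_PAGE_HEADER above can be checked mechanically; a host-endian mirror with a compile-time size assertion (sketch only: on disk every field is little endian):

#include <stdint.h>

/* Mirror of the RESTART_PAGE_HEADER layout above; must be 30 bytes. */
struct restart_page_header {
        uint32_t magic;                 /* "RSTR" or "CHKD" */
        uint16_t usa_ofs;
        uint16_t usa_count;
        uint64_t chkdsk_lsn;
        uint32_t system_page_size;
        uint32_t log_page_size;
        uint16_t restart_area_offset;
        int16_t  minor_ver;
        int16_t  major_ver;
} __attribute__((packed));

_Static_assert(sizeof(struct restart_page_header) == 30,
                "layout must match the on-disk record");

int main(void) { return 0; }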
- LOGFILE_NO_CLIENT means that there are no - free log client records in the array. - If != LOGFILE_NO_CLIENT, check that - log_clients > client_free_list. On Win2k - and presumably earlier, on a clean volume - this is != LOGFILE_NO_CLIENT, and it should - be 0, i.e. the first (and only) client - record is free and thus the logfile is - closed and hence clean. A dirty volume - would have left the logfile open and hence - this would be LOGFILE_NO_CLIENT. On WinXP - and presumably later, the logfile is always - open, even on clean shutdown so this should - always be LOGFILE_NO_CLIENT. */ -/* 12*/ le16 client_in_use_list;/* The index of the first in-use log client - record in the array of log client records. - LOGFILE_NO_CLIENT means that there are no - in-use log client records in the array. If - != LOGFILE_NO_CLIENT check that log_clients - > client_in_use_list. On Win2k and - presumably earlier, on a clean volume this - is LOGFILE_NO_CLIENT, i.e. there are no - client records in use and thus the logfile - is closed and hence clean. A dirty volume - would have left the logfile open and hence - this would be != LOGFILE_NO_CLIENT, and it - should be 0, i.e. the first (and only) - client record is in use. On WinXP and - presumably later, the logfile is always - open, even on clean shutdown so this should - always be 0. */ -/* 14*/ RESTART_AREA_FLAGS flags;/* Flags modifying LFS behaviour. On Win2k - and presumably earlier this is always 0. On - WinXP and presumably later, if the logfile - was shutdown cleanly, the second bit, - RESTART_VOLUME_IS_CLEAN, is set. This bit - is cleared when the volume is mounted by - WinXP and set when the volume is dismounted, - thus if the logfile is dirty, this bit is - clear. Thus we don't need to check the - Windows version to determine if the logfile - is clean. Instead if the logfile is closed, - we know it must be clean. If it is open and - this bit is set, we also know it must be - clean. If on the other hand the logfile is - open and this bit is clear, we can be almost - certain that the logfile is dirty. */ -/* 16*/ le32 seq_number_bits; /* How many bits to use for the sequence - number. This is calculated as 67 - the - number of bits required to store the logfile - size in bytes and this can be used in with - the specified file_size as a consistency - check. */ -/* 20*/ le16 restart_area_length;/* Length of the restart area including the - client array. Following checks required if - version matches. Otherwise, skip them. - restart_area_offset + restart_area_length - has to be <= system_page_size. Also, - restart_area_length has to be >= - client_array_offset + (log_clients * - sizeof(log client record)). */ -/* 22*/ le16 client_array_offset;/* Offset from the start of this record to - the first log client record if versions are - matched. When creating, set this to be - after this restart area structure, aligned - to 8-bytes boundary. If the versions do not - match, this is ignored and the offset is - assumed to be (sizeof(RESTART_AREA) + 7) & - ~7, i.e. rounded up to first 8-byte - boundary. Either way, client_array_offset - has to be aligned to an 8-byte boundary. - Also, restart_area_offset + - client_array_offset has to be <= 510. - Finally, client_array_offset + (log_clients - * sizeof(log client record)) has to be <= - system_page_size. On Win2k and presumably - earlier, this is 0x30, i.e. immediately - following this record. On WinXP and - presumably later, this is 0x40, i.e. 
there - are 16 extra bytes between this record and - the client array. This probably means that - the RESTART_AREA record is actually bigger - in WinXP and later. */ -/* 24*/ sle64 file_size; /* Usable byte size of the log file. If the - restart_area_offset + the offset of the - file_size are > 510 then corruption has - occurred. This is the very first check when - starting with the restart_area as if it - fails it means that some of the above values - will be corrupted by the multi sector - transfer protection. The file_size has to - be rounded down to be a multiple of the - log_page_size in the RESTART_PAGE_HEADER and - then it has to be at least big enough to - store the two restart pages and 48 (0x30) - log record pages. */ -/* 32*/ le32 last_lsn_data_length;/* Length of data of last LSN, not including - the log record header. On create set to - 0. */ -/* 36*/ le16 log_record_header_length;/* Byte size of the log record header. - If the version matches then check that the - value of log_record_header_length is a - multiple of 8, i.e. - (log_record_header_length + 7) & ~7 == - log_record_header_length. When creating set - it to sizeof(LOG_RECORD_HEADER), aligned to - 8 bytes. */ -/* 38*/ le16 log_page_data_offset;/* Offset to the start of data in a log record - page. Must be a multiple of 8. On create - set it to immediately after the update - sequence array of the log record page. */ -/* 40*/ le32 restart_log_open_count;/* A counter that gets incremented every - time the logfile is restarted which happens - at mount time when the logfile is opened. - When creating set to a random value. Win2k - sets it to the low 32 bits of the current - system time in NTFS format (see time.h). */ -/* 44*/ le32 reserved; /* Reserved/alignment to 8-byte boundary. */ -/* sizeof() = 48 (0x30) bytes */ -} __attribute__ ((__packed__)) RESTART_AREA; - -/* - * Log client record. The offset of this record is found by adding the offset - * of the RESTART_AREA to the client_array_offset value found in it. - */ -typedef struct { -/*Ofs*/ -/* 0*/ leLSN oldest_lsn; /* Oldest LSN needed by this client. On create - set to 0. */ -/* 8*/ leLSN client_restart_lsn;/* LSN at which this client needs to restart - the volume, i.e. the current position within - the log file. At present, if clean this - should = current_lsn in restart area but it - probably also = current_lsn when dirty most - of the time. At create set to 0. */ -/* 16*/ le16 prev_client; /* The offset to the previous log client record - in the array of log client records. - LOGFILE_NO_CLIENT means there is no previous - client record, i.e. this is the first one. - This is always LOGFILE_NO_CLIENT. */ -/* 18*/ le16 next_client; /* The offset to the next log client record in - the array of log client records. - LOGFILE_NO_CLIENT means there are no next - client records, i.e. this is the last one. - This is always LOGFILE_NO_CLIENT. */ -/* 20*/ le16 seq_number; /* On Win2k and presumably earlier, this is set - to zero every time the logfile is restarted - and it is incremented when the logfile is - closed at dismount time. Thus it is 0 when - dirty and 1 when clean. On WinXP and - presumably later, this is always 0. */ -/* 22*/ u8 reserved[6]; /* Reserved/alignment. */ -/* 28*/ le32 client_name_length;/* Length of client name in bytes. Should - always be 8. */ -/* 32*/ ntfschar client_name[64];/* Name of the client in Unicode. Should - always be "NTFS" with the remaining bytes - set to 0. 
*/ -/* sizeof() = 160 (0xa0) bytes */ -} __attribute__ ((__packed__)) LOG_CLIENT_RECORD; - -extern bool ntfs_check_logfile(struct inode *log_vi, - RESTART_PAGE_HEADER **rp); - -extern bool ntfs_is_logfile_clean(struct inode *log_vi, - const RESTART_PAGE_HEADER *rp); - -extern bool ntfs_empty_logfile(struct inode *log_vi); - -#endif /* NTFS_RW */ - -#endif /* _LINUX_NTFS_LOGFILE_H */ diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h deleted file mode 100644 index 7068425735f1..000000000000 --- a/fs/ntfs/malloc.h +++ /dev/null @@ -1,77 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * malloc.h - NTFS kernel memory handling. Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2005 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_MALLOC_H -#define _LINUX_NTFS_MALLOC_H - -#include <linux/vmalloc.h> -#include <linux/slab.h> -#include <linux/highmem.h> - -/** - * __ntfs_malloc - allocate memory in multiples of pages - * @size: number of bytes to allocate - * @gfp_mask: extra flags for the allocator - * - * Internal function. You probably want ntfs_malloc_nofs()... - * - * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and - * returns a pointer to the allocated memory. - * - * If there was insufficient memory to complete the request, return NULL. - * Depending on @gfp_mask the allocation may be guaranteed to succeed. - */ -static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask) -{ - if (likely(size <= PAGE_SIZE)) { - BUG_ON(!size); - /* kmalloc() has per-CPU caches so is faster for now. */ - return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM); - /* return (void *)__get_free_page(gfp_mask); */ - } - if (likely((size >> PAGE_SHIFT) < totalram_pages())) - return __vmalloc(size, gfp_mask); - return NULL; -} - -/** - * ntfs_malloc_nofs - allocate memory in multiples of pages - * @size: number of bytes to allocate - * - * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and - * returns a pointer to the allocated memory. - * - * If there was insufficient memory to complete the request, return NULL. - */ -static inline void *ntfs_malloc_nofs(unsigned long size) -{ - return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM); -} - -/** - * ntfs_malloc_nofs_nofail - allocate memory in multiples of pages - * @size: number of bytes to allocate - * - * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and - * returns a pointer to the allocated memory. - * - * This function guarantees that the allocation will succeed. It will sleep - * for as long as it takes to complete the allocation. - * - * If there was insufficient memory to complete the request, return NULL. - */ -static inline void *ntfs_malloc_nofs_nofail(unsigned long size) -{ - return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_NOFAIL); -} - -static inline void ntfs_free(void *addr) -{ - kvfree(addr); -} - -#endif /* _LINUX_NTFS_MALLOC_H */ diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c deleted file mode 100644 index 6fd1dc4b08c8..000000000000 --- a/fs/ntfs/mft.c +++ /dev/null @@ -1,2907 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc. 
- * Copyright (c) 2002 Richard Russon - */ - -#include <linux/buffer_head.h> -#include <linux/slab.h> -#include <linux/swap.h> -#include <linux/bio.h> - -#include "attrib.h" -#include "aops.h" -#include "bitmap.h" -#include "debug.h" -#include "dir.h" -#include "lcnalloc.h" -#include "malloc.h" -#include "mft.h" -#include "ntfs.h" - -#define MAX_BHS (PAGE_SIZE / NTFS_BLOCK_SIZE) - -/** - * map_mft_record_page - map the page in which a specific mft record resides - * @ni: ntfs inode whose mft record page to map - * - * This maps the page in which the mft record of the ntfs inode @ni is situated - * and returns a pointer to the mft record within the mapped page. - * - * Return value needs to be checked with IS_ERR() and if that is true PTR_ERR() - * contains the negative error code returned. - */ -static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni) -{ - loff_t i_size; - ntfs_volume *vol = ni->vol; - struct inode *mft_vi = vol->mft_ino; - struct page *page; - unsigned long index, end_index; - unsigned ofs; - - BUG_ON(ni->page); - /* - * The index into the page cache and the offset within the page cache - * page of the wanted mft record. FIXME: We need to check for - * overflowing the unsigned long, but I don't think we would ever get - * here if the volume was that big... - */ - index = (u64)ni->mft_no << vol->mft_record_size_bits >> - PAGE_SHIFT; - ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_MASK; - - i_size = i_size_read(mft_vi); - /* The maximum valid index into the page cache for $MFT's data. */ - end_index = i_size >> PAGE_SHIFT; - - /* If the wanted index is out of bounds the mft record doesn't exist. */ - if (unlikely(index >= end_index)) { - if (index > end_index || (i_size & ~PAGE_MASK) < ofs + - vol->mft_record_size) { - page = ERR_PTR(-ENOENT); - ntfs_error(vol->sb, "Attempt to read mft record 0x%lx, " - "which is beyond the end of the mft. " - "This is probably a bug in the ntfs " - "driver.", ni->mft_no); - goto err_out; - } - } - /* Read, map, and pin the page. */ - page = ntfs_map_page(mft_vi->i_mapping, index); - if (!IS_ERR(page)) { - /* Catch multi sector transfer fixup errors. */ - if (likely(ntfs_is_mft_recordp((le32*)(page_address(page) + - ofs)))) { - ni->page = page; - ni->page_ofs = ofs; - return page_address(page) + ofs; - } - ntfs_error(vol->sb, "Mft record 0x%lx is corrupt. " - "Run chkdsk.", ni->mft_no); - ntfs_unmap_page(page); - page = ERR_PTR(-EIO); - NVolSetErrors(vol); - } -err_out: - ni->page = NULL; - ni->page_ofs = 0; - return (void*)page; -} - -/** - * map_mft_record - map, pin and lock an mft record - * @ni: ntfs inode whose MFT record to map - * - * First, take the mrec_lock mutex. We might now be sleeping, while waiting - * for the mutex if it was already locked by someone else. - * - * The page of the record is mapped using map_mft_record_page() before being - * returned to the caller. - * - * This in turn uses ntfs_map_page() to get the page containing the wanted mft - * record (it in turn calls read_cache_page() which reads it in from disk if - * necessary, increments the use count on the page so that it cannot disappear - * under us and returns a reference to the page cache page). - * - * If read_cache_page() invokes ntfs_readpage() to load the page from disk, it - * sets PG_locked and clears PG_uptodate on the page. 
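The index/offset arithmetic in map_mft_record_page() above splits a record number into a page cache index and an offset within that page; a stand-alone check of the math (assuming the common 1 KiB mft records and 4 KiB pages):

#include <assert.h>
#include <stdint.h>

#define PAGE_SHIFT 12                   /* 4 KiB pages assumed */

/* The split map_mft_record_page() computes for record @mft_no, with
 * @bits = mft_record_size_bits (10 for the usual 1 KiB records). */
static void locate_mft_record(unsigned long mft_no, unsigned bits,
                unsigned long *index, unsigned *ofs)
{
        *index = (uint64_t)mft_no << bits >> PAGE_SHIFT;
        *ofs = (mft_no << bits) & ((1UL << PAGE_SHIFT) - 1);
}

int main(void)
{
        unsigned long index;
        unsigned ofs;

        locate_mft_record(5, 10, &index, &ofs); /* the root dir is record 5 */
        assert(index == 1 && ofs == 1024);      /* 2nd page, 2nd record */
        return 0;
}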
Once I/O has completed - * and the post-read mst fixups on each mft record in the page have been - * performed, the page gets PG_uptodate set and PG_locked cleared (this is done - * in our asynchronous I/O completion handler end_buffer_read_mft_async()). - * ntfs_map_page() waits for PG_locked to become clear and checks if - * PG_uptodate is set and returns an error code if not. This provides - * sufficient protection against races when reading/using the page. - * - * However there is the write mapping to think about. Doing the above described - * checking here will be fine, because when initiating the write we will set - * PG_locked and clear PG_uptodate making sure nobody is touching the page - * contents. Doing the locking this way means that the commit to disk code in - * the page cache code paths is automatically sufficiently locked with us as - * we will not touch a page that has been locked or is not uptodate. The only - * locking problem then is them locking the page while we are accessing it. - * - * So that code will end up having to own the mrec_lock of all mft - * records/inodes present in the page before I/O can proceed. In that case we - * wouldn't need to bother with PG_locked and PG_uptodate as nobody will be - * accessing anything without owning the mrec_lock mutex. But we do need to - * use them because of the read_cache_page() invocation and the code becomes so - * much simpler this way that it is well worth it. - * - * The mft record is now ours and we return a pointer to it. You need to check - * the returned pointer with IS_ERR() and if that is true, PTR_ERR() will return - * the error code. - * - * NOTE: Caller is responsible for setting the mft record dirty before calling - * unmap_mft_record(). This is obviously only necessary if the caller really - * modified the mft record... - * Q: Do we want to recycle one of the VFS inode state bits instead? - * A: No, the inode ones mean we want to change the mft record, not we want to - * write it out. - */ -MFT_RECORD *map_mft_record(ntfs_inode *ni) -{ - MFT_RECORD *m; - - ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no); - - /* Make sure the ntfs inode doesn't go away. */ - atomic_inc(&ni->count); - - /* Serialize access to this mft record. */ - mutex_lock(&ni->mrec_lock); - - m = map_mft_record_page(ni); - if (!IS_ERR(m)) - return m; - - mutex_unlock(&ni->mrec_lock); - atomic_dec(&ni->count); - ntfs_error(ni->vol->sb, "Failed with error code %lu.", -PTR_ERR(m)); - return m; -} - -/** - * unmap_mft_record_page - unmap the page in which a specific mft record resides - * @ni: ntfs inode whose mft record page to unmap - * - * This unmaps the page in which the mft record of the ntfs inode @ni is - * situated and returns. This is a NOOP if highmem is not configured. - * - * The unmap happens via ntfs_unmap_page() which in turn decrements the use - * count on the page thus releasing it from the pinned state. - * - * We do not actually unmap the page from memory of course, as that will be - * done by the page cache code itself when memory pressure increases or - * whatever. - */ -static inline void unmap_mft_record_page(ntfs_inode *ni) -{ - BUG_ON(!ni->page); - - // TODO: If dirty, blah... - ntfs_unmap_page(ni->page); - ni->page = NULL; - ni->page_ofs = 0; - return; -} - -/** - * unmap_mft_record - release a mapped mft record - * @ni: ntfs inode whose MFT record to unmap - * - * We release the page mapping and the mrec_lock mutex which unmaps the mft - * record and releases it for others to get hold of. 
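[Editor's note: taken together, map_mft_record() and unmap_mft_record() give the usual bracket pattern for record updates. A minimal sketch of a caller, using the functions and types defined in this driver; the modification step is a placeholder, not original code.]

/* Sketch of the documented calling convention; not from the original file. */
static int example_update_record(ntfs_inode *ni)
{
        MFT_RECORD *m = map_mft_record(ni);     /* takes ni->mrec_lock */

        if (IS_ERR(m))
                return PTR_ERR(m);
        /* ... modify the mft record @m here ... */
        mark_mft_record_dirty(ni);      /* must precede the unmap, see above */
        unmap_mft_record(ni);           /* drops the lock and the page pin   */
        return 0;
}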
We also release the ntfs - * inode by decrementing the ntfs inode reference count. - * - * NOTE: If caller has modified the mft record, it is imperative to set the mft - * record dirty BEFORE calling unmap_mft_record(). - */ -void unmap_mft_record(ntfs_inode *ni) -{ - struct page *page = ni->page; - - BUG_ON(!page); - - ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no); - - unmap_mft_record_page(ni); - mutex_unlock(&ni->mrec_lock); - atomic_dec(&ni->count); - /* - * If pure ntfs_inode, i.e. no vfs inode attached, we leave it to - * ntfs_clear_extent_inode() in the extent inode case, and to the - * caller in the non-extent, yet pure ntfs inode case, to do the actual - * tear down of all structures and freeing of all allocated memory. - */ - return; -} - -/** - * map_extent_mft_record - load an extent inode and attach it to its base - * @base_ni: base ntfs inode - * @mref: mft reference of the extent inode to load - * @ntfs_ino: on successful return, pointer to the ntfs_inode structure - * - * Load the extent mft record @mref and attach it to its base inode @base_ni. - * Return the mapped extent mft record if IS_ERR(result) is false. Otherwise - * PTR_ERR(result) gives the negative error code. - * - * On successful return, @ntfs_ino contains a pointer to the ntfs_inode - * structure of the mapped extent inode. - */ -MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref, - ntfs_inode **ntfs_ino) -{ - MFT_RECORD *m; - ntfs_inode *ni = NULL; - ntfs_inode **extent_nis = NULL; - int i; - unsigned long mft_no = MREF(mref); - u16 seq_no = MSEQNO(mref); - bool destroy_ni = false; - - ntfs_debug("Mapping extent mft record 0x%lx (base mft record 0x%lx).", - mft_no, base_ni->mft_no); - /* Make sure the base ntfs inode doesn't go away. */ - atomic_inc(&base_ni->count); - /* - * Check if this extent inode has already been added to the base inode, - * in which case just return it. If not found, add it to the base - * inode before returning it. - */ - mutex_lock(&base_ni->extent_lock); - if (base_ni->nr_extents > 0) { - extent_nis = base_ni->ext.extent_ntfs_inos; - for (i = 0; i < base_ni->nr_extents; i++) { - if (mft_no != extent_nis[i]->mft_no) - continue; - ni = extent_nis[i]; - /* Make sure the ntfs inode doesn't go away. */ - atomic_inc(&ni->count); - break; - } - } - if (likely(ni != NULL)) { - mutex_unlock(&base_ni->extent_lock); - atomic_dec(&base_ni->count); - /* We found the record; just have to map and return it. */ - m = map_mft_record(ni); - /* map_mft_record() has incremented this on success. */ - atomic_dec(&ni->count); - if (!IS_ERR(m)) { - /* Verify the sequence number. */ - if (likely(le16_to_cpu(m->sequence_number) == seq_no)) { - ntfs_debug("Done 1."); - *ntfs_ino = ni; - return m; - } - unmap_mft_record(ni); - ntfs_error(base_ni->vol->sb, "Found stale extent mft " - "reference! Corrupt filesystem. " - "Run chkdsk."); - return ERR_PTR(-EIO); - } -map_err_out: - ntfs_error(base_ni->vol->sb, "Failed to map extent " - "mft record, error code %ld.", -PTR_ERR(m)); - return m; - } - /* Record wasn't there. Get a new ntfs inode and initialize it. */ - ni = ntfs_new_extent_inode(base_ni->vol->sb, mft_no); - if (unlikely(!ni)) { - mutex_unlock(&base_ni->extent_lock); - atomic_dec(&base_ni->count); - return ERR_PTR(-ENOMEM); - } - ni->vol = base_ni->vol; - ni->seq_no = seq_no; - ni->nr_extents = -1; - ni->ext.base_ntfs_ino = base_ni; - /* Now map the record. 
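[Editor's note: for the MREF()/MSEQNO() split at the top of map_extent_mft_record(): an NTFS file reference is a 64-bit value packing the mft record number into the low 48 bits and the sequence number into the high 16 bits. A freestanding sketch of that decoding follows; the names are mine, not the driver's macros.]

#include <stdint.h>

#define REF_RECORD_MASK 0x0000ffffffffffffULL   /* low 48 bits */

static inline uint64_t ref_mft_no(uint64_t mref)        /* cf. MREF(mref)   */
{
        return mref & REF_RECORD_MASK;
}

static inline uint16_t ref_seq_no(uint64_t mref)        /* cf. MSEQNO(mref) */
{
        return (uint16_t)(mref >> 48);
}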
*/ - m = map_mft_record(ni); - if (IS_ERR(m)) { - mutex_unlock(&base_ni->extent_lock); - atomic_dec(&base_ni->count); - ntfs_clear_extent_inode(ni); - goto map_err_out; - } - /* Verify the sequence number if it is present. */ - if (seq_no && (le16_to_cpu(m->sequence_number) != seq_no)) { - ntfs_error(base_ni->vol->sb, "Found stale extent mft " - "reference! Corrupt filesystem. Run chkdsk."); - destroy_ni = true; - m = ERR_PTR(-EIO); - goto unm_err_out; - } - /* Attach extent inode to base inode, reallocating memory if needed. */ - if (!(base_ni->nr_extents & 3)) { - ntfs_inode **tmp; - int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode *); - - tmp = kmalloc(new_size, GFP_NOFS); - if (unlikely(!tmp)) { - ntfs_error(base_ni->vol->sb, "Failed to allocate " - "internal buffer."); - destroy_ni = true; - m = ERR_PTR(-ENOMEM); - goto unm_err_out; - } - if (base_ni->nr_extents) { - BUG_ON(!base_ni->ext.extent_ntfs_inos); - memcpy(tmp, base_ni->ext.extent_ntfs_inos, new_size - - 4 * sizeof(ntfs_inode *)); - kfree(base_ni->ext.extent_ntfs_inos); - } - base_ni->ext.extent_ntfs_inos = tmp; - } - base_ni->ext.extent_ntfs_inos[base_ni->nr_extents++] = ni; - mutex_unlock(&base_ni->extent_lock); - atomic_dec(&base_ni->count); - ntfs_debug("Done 2."); - *ntfs_ino = ni; - return m; -unm_err_out: - unmap_mft_record(ni); - mutex_unlock(&base_ni->extent_lock); - atomic_dec(&base_ni->count); - /* - * If the extent inode was not attached to the base inode we need to - * release it or we will leak memory. - */ - if (destroy_ni) - ntfs_clear_extent_inode(ni); - return m; -} - -#ifdef NTFS_RW - -/** - * __mark_mft_record_dirty - set the mft record and the page containing it dirty - * @ni: ntfs inode describing the mapped mft record - * - * Internal function. Users should call mark_mft_record_dirty() instead. - * - * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni, - * as well as the page containing the mft record, dirty. Also, mark the base - * vfs inode dirty. This ensures that any changes to the mft record are - * written out to disk. - * - * NOTE: We only set I_DIRTY_DATASYNC (and not I_DIRTY_PAGES) - * on the base vfs inode, because even though file data may have been modified, - * it is dirty in the inode meta data rather than the data page cache of the - * inode, and thus there are no data pages that need writing out. Therefore, a - * full mark_inode_dirty() is overkill. A mark_inode_dirty_sync(), on the - * other hand, is not sufficient, because ->write_inode needs to be called even - * in case of fdatasync. This needs to happen or the file data would not - * necessarily hit the device synchronously, even though the vfs inode has the - * O_SYNC flag set. Also, I_DIRTY_DATASYNC simply "feels" better than just - * I_DIRTY_SYNC, since the file data has not actually hit the block device yet, - * which is not what I_DIRTY_SYNC on its own would suggest. - */ -void __mark_mft_record_dirty(ntfs_inode *ni) -{ - ntfs_inode *base_ni; - - ntfs_debug("Entering for inode 0x%lx.", ni->mft_no); - BUG_ON(NInoAttr(ni)); - mark_ntfs_record_dirty(ni->page, ni->page_ofs); - /* Determine the base vfs inode and mark it dirty, too. */ - mutex_lock(&ni->extent_lock); - if (likely(ni->nr_extents >= 0)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - mutex_unlock(&ni->extent_lock); - __mark_inode_dirty(VFS_I(base_ni), I_DIRTY_DATASYNC); -} - -static const char *ntfs_please_email = "Please email " - "linux-ntfs-dev@lists.sourceforge.net and say that you saw " - "this message. 
Thank you."; - -/** - * ntfs_sync_mft_mirror_umount - synchronise an mft record to the mft mirror - * @vol: ntfs volume on which the mft record to synchronize resides - * @mft_no: mft record number of mft record to synchronize - * @m: mapped, mst protected (extent) mft record to synchronize - * - * Write the mapped, mst protected (extent) mft record @m with mft record - * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol, - * bypassing the page cache and the $MFTMirr inode itself. - * - * This function is only for use at umount time when the mft mirror inode has - * already been disposed off. We BUG() if we are called while the mft mirror - * inode is still attached to the volume. - * - * On success return 0. On error return -errno. - * - * NOTE: This function is not implemented yet as I am not convinced it can - * actually be triggered considering the sequence of commits we do in super.c:: - * ntfs_put_super(). But just in case we provide this place holder as the - * alternative would be either to BUG() or to get a NULL pointer dereference - * and Oops. - */ -static int ntfs_sync_mft_mirror_umount(ntfs_volume *vol, - const unsigned long mft_no, MFT_RECORD *m) -{ - BUG_ON(vol->mftmirr_ino); - ntfs_error(vol->sb, "Umount time mft mirror syncing is not " - "implemented yet. %s", ntfs_please_email); - return -EOPNOTSUPP; -} - -/** - * ntfs_sync_mft_mirror - synchronize an mft record to the mft mirror - * @vol: ntfs volume on which the mft record to synchronize resides - * @mft_no: mft record number of mft record to synchronize - * @m: mapped, mst protected (extent) mft record to synchronize - * @sync: if true, wait for i/o completion - * - * Write the mapped, mst protected (extent) mft record @m with mft record - * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol. - * - * On success return 0. On error return -errno and set the volume errors flag - * in the ntfs volume @vol. - * - * NOTE: We always perform synchronous i/o and ignore the @sync parameter. - * - * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just - * schedule i/o via ->writepage or do it via kntfsd or whatever. - */ -int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, - MFT_RECORD *m, int sync) -{ - struct page *page; - unsigned int blocksize = vol->sb->s_blocksize; - int max_bhs = vol->mft_record_size / blocksize; - struct buffer_head *bhs[MAX_BHS]; - struct buffer_head *bh, *head; - u8 *kmirr; - runlist_element *rl; - unsigned int block_start, block_end, m_start, m_end, page_ofs; - int i_bhs, nr_bhs, err = 0; - unsigned char blocksize_bits = vol->sb->s_blocksize_bits; - - ntfs_debug("Entering for inode 0x%lx.", mft_no); - BUG_ON(!max_bhs); - if (WARN_ON(max_bhs > MAX_BHS)) - return -EINVAL; - if (unlikely(!vol->mftmirr_ino)) { - /* This could happen during umount... */ - err = ntfs_sync_mft_mirror_umount(vol, mft_no, m); - if (likely(!err)) - return err; - goto err_out; - } - /* Get the page containing the mirror copy of the mft record @m. */ - page = ntfs_map_page(vol->mftmirr_ino->i_mapping, mft_no >> - (PAGE_SHIFT - vol->mft_record_size_bits)); - if (IS_ERR(page)) { - ntfs_error(vol->sb, "Failed to map mft mirror page."); - err = PTR_ERR(page); - goto err_out; - } - lock_page(page); - BUG_ON(!PageUptodate(page)); - ClearPageUptodate(page); - /* Offset of the mft mirror record inside the page. */ - page_ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_MASK; - /* The address in the page of the mirror copy of the mft record @m. 
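[Editor's note: the ntfs_map_page() index above, mft_no >> (PAGE_SHIFT - mft_record_size_bits), is the same quantity as the (mft_no << mft_record_size_bits) >> PAGE_SHIFT form used in map_mft_record_page(), just written as a single right shift, i.e. mft_no divided by the records-per-page count. With the assumed 1 KiB records and 4 KiB pages from the earlier sketch, record 10 lands on page 10 >> 2 = 2 at byte offset (10 << 10) & 0xfff = 0x800.]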
*/ - kmirr = page_address(page) + page_ofs; - /* Copy the mst protected mft record to the mirror. */ - memcpy(kmirr, m, vol->mft_record_size); - /* Create uptodate buffers if not present. */ - if (unlikely(!page_has_buffers(page))) { - struct buffer_head *tail; - - bh = head = alloc_page_buffers(page, blocksize, true); - do { - set_buffer_uptodate(bh); - tail = bh; - bh = bh->b_this_page; - } while (bh); - tail->b_this_page = head; - attach_page_private(page, head); - } - bh = head = page_buffers(page); - BUG_ON(!bh); - rl = NULL; - nr_bhs = 0; - block_start = 0; - m_start = kmirr - (u8*)page_address(page); - m_end = m_start + vol->mft_record_size; - do { - block_end = block_start + blocksize; - /* If the buffer is outside the mft record, skip it. */ - if (block_end <= m_start) - continue; - if (unlikely(block_start >= m_end)) - break; - /* Need to map the buffer if it is not mapped already. */ - if (unlikely(!buffer_mapped(bh))) { - VCN vcn; - LCN lcn; - unsigned int vcn_ofs; - - bh->b_bdev = vol->sb->s_bdev; - /* Obtain the vcn and offset of the current block. */ - vcn = ((VCN)mft_no << vol->mft_record_size_bits) + - (block_start - m_start); - vcn_ofs = vcn & vol->cluster_size_mask; - vcn >>= vol->cluster_size_bits; - if (!rl) { - down_read(&NTFS_I(vol->mftmirr_ino)-> - runlist.lock); - rl = NTFS_I(vol->mftmirr_ino)->runlist.rl; - /* - * $MFTMirr always has the whole of its runlist - * in memory. - */ - BUG_ON(!rl); - } - /* Seek to element containing target vcn. */ - while (rl->length && rl[1].vcn <= vcn) - rl++; - lcn = ntfs_rl_vcn_to_lcn(rl, vcn); - /* For $MFTMirr, only lcn >= 0 is a successful remap. */ - if (likely(lcn >= 0)) { - /* Setup buffer head to correct block. */ - bh->b_blocknr = ((lcn << - vol->cluster_size_bits) + - vcn_ofs) >> blocksize_bits; - set_buffer_mapped(bh); - } else { - bh->b_blocknr = -1; - ntfs_error(vol->sb, "Cannot write mft mirror " - "record 0x%lx because its " - "location on disk could not " - "be determined (error code " - "%lli).", mft_no, - (long long)lcn); - err = -EIO; - } - } - BUG_ON(!buffer_uptodate(bh)); - BUG_ON(!nr_bhs && (m_start != block_start)); - BUG_ON(nr_bhs >= max_bhs); - bhs[nr_bhs++] = bh; - BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end)); - } while (block_start = block_end, (bh = bh->b_this_page) != head); - if (unlikely(rl)) - up_read(&NTFS_I(vol->mftmirr_ino)->runlist.lock); - if (likely(!err)) { - /* Lock buffers and start synchronous write i/o on them. */ - for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { - struct buffer_head *tbh = bhs[i_bhs]; - - if (!trylock_buffer(tbh)) - BUG(); - BUG_ON(!buffer_uptodate(tbh)); - clear_buffer_dirty(tbh); - get_bh(tbh); - tbh->b_end_io = end_buffer_write_sync; - submit_bh(REQ_OP_WRITE, tbh); - } - /* Wait on i/o completion of buffers. */ - for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { - struct buffer_head *tbh = bhs[i_bhs]; - - wait_on_buffer(tbh); - if (unlikely(!buffer_uptodate(tbh))) { - err = -EIO; - /* - * Set the buffer uptodate so the page and - * buffer states do not become out of sync. - */ - set_buffer_uptodate(tbh); - } - } - } else /* if (unlikely(err)) */ { - /* Clean the buffers. */ - for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) - clear_buffer_dirty(bhs[i_bhs]); - } - /* Current state: all buffers are clean, unlocked, and uptodate. */ - /* Remove the mst protection fixups again. 
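[Editor's note: the buffer-mapping loop above relies on the runlist representation used throughout the driver: an array of {vcn, lcn, length} runs terminated by a zero-length element. Below is a simplified, self-contained version of the seek-and-translate step; the real ntfs_rl_vcn_to_lcn() also distinguishes holes and error LCNs, which this collapses into -1.]

typedef long long VCN, LCN;

struct run {                    /* simplified runlist element */
        VCN vcn;                /* first virtual cluster of the run          */
        LCN lcn;                /* first logical cluster, < 0 if not mapped  */
        long long length;       /* clusters in the run; 0 terminates         */
};

static LCN rl_vcn_to_lcn(const struct run *rl, VCN vcn)
{
        /* Seek to the element containing the target vcn, as above.
         * Assumes vcn is not below rl[0].vcn. */
        while (rl->length && rl[1].vcn <= vcn)
                rl++;
        if (!rl->length || rl->lcn < 0)
                return -1;      /* hole, unmapped, or past the end (collapsed) */
        return rl->lcn + (vcn - rl->vcn);
}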
*/ - post_write_mst_fixup((NTFS_RECORD*)kmirr); - flush_dcache_page(page); - SetPageUptodate(page); - unlock_page(page); - ntfs_unmap_page(page); - if (likely(!err)) { - ntfs_debug("Done."); - } else { - ntfs_error(vol->sb, "I/O error while writing mft mirror " - "record 0x%lx!", mft_no); -err_out: - ntfs_error(vol->sb, "Failed to synchronize $MFTMirr (error " - "code %i). Volume will be left marked dirty " - "on umount. Run ntfsfix on the partition " - "after umounting to correct this.", -err); - NVolSetErrors(vol); - } - return err; -} - -/** - * write_mft_record_nolock - write out a mapped (extent) mft record - * @ni: ntfs inode describing the mapped (extent) mft record - * @m: mapped (extent) mft record to write - * @sync: if true, wait for i/o completion - * - * Write the mapped (extent) mft record @m described by the (regular or extent) - * ntfs inode @ni to backing store. If the mft record @m has a counterpart in - * the mft mirror, that is also updated. - * - * We only write the mft record if the ntfs inode @ni is dirty and the first - * buffer belonging to its mft record is dirty, too. We ignore the dirty state - * of subsequent buffers because we could have raced with - * fs/ntfs/aops.c::mark_ntfs_record_dirty(). - * - * On success, clean the mft record and return 0. On error, leave the mft - * record dirty and return -errno. - * - * NOTE: We always perform synchronous i/o and ignore the @sync parameter. - * However, if the mft record has a counterpart in the mft mirror and @sync is - * true, we write the mft record, wait for i/o completion, and only then write - * the mft mirror copy. This ensures that if the system crashes either the mft - * or the mft mirror will contain a self-consistent mft record @m. If @sync is - * false on the other hand, we start i/o on both and then wait for completion - * on them. This provides a speedup but no longer guarantees that you will end - * up with a self-consistent mft record in the case of a crash but if you asked - * for asynchronous writing you probably do not care about that anyway. - * - * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just - * schedule i/o via ->writepage or do it via kntfsd or whatever. - */ -int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync) -{ - ntfs_volume *vol = ni->vol; - struct page *page = ni->page; - unsigned int blocksize = vol->sb->s_blocksize; - unsigned char blocksize_bits = vol->sb->s_blocksize_bits; - int max_bhs = vol->mft_record_size / blocksize; - struct buffer_head *bhs[MAX_BHS]; - struct buffer_head *bh, *head; - runlist_element *rl; - unsigned int block_start, block_end, m_start, m_end; - int i_bhs, nr_bhs, err = 0; - - ntfs_debug("Entering for inode 0x%lx.", ni->mft_no); - BUG_ON(NInoAttr(ni)); - BUG_ON(!max_bhs); - BUG_ON(!PageLocked(page)); - if (WARN_ON(max_bhs > MAX_BHS)) { - err = -EINVAL; - goto err_out; - } - /* - * If the ntfs_inode is clean no need to do anything. If it is dirty, - * mark it as clean now so that it can be redirtied later on if needed. - * There is no danger of races since the caller is holding the locks - * for the mft record @m and the page it is in. - */ - if (!NInoTestClearDirty(ni)) - goto done; - bh = head = page_buffers(page); - BUG_ON(!bh); - rl = NULL; - nr_bhs = 0; - block_start = 0; - m_start = ni->page_ofs; - m_end = m_start + vol->mft_record_size; - do { - block_end = block_start + blocksize; - /* If the buffer is outside the mft record, skip it. 
*/ - if (block_end <= m_start) - continue; - if (unlikely(block_start >= m_end)) - break; - /* - * If this block is not the first one in the record, we ignore - * the buffer's dirty state because we could have raced with a - * parallel mark_ntfs_record_dirty(). - */ - if (block_start == m_start) { - /* This block is the first one in the record. */ - if (!buffer_dirty(bh)) { - BUG_ON(nr_bhs); - /* Clean records are not written out. */ - break; - } - } - /* Need to map the buffer if it is not mapped already. */ - if (unlikely(!buffer_mapped(bh))) { - VCN vcn; - LCN lcn; - unsigned int vcn_ofs; - - bh->b_bdev = vol->sb->s_bdev; - /* Obtain the vcn and offset of the current block. */ - vcn = ((VCN)ni->mft_no << vol->mft_record_size_bits) + - (block_start - m_start); - vcn_ofs = vcn & vol->cluster_size_mask; - vcn >>= vol->cluster_size_bits; - if (!rl) { - down_read(&NTFS_I(vol->mft_ino)->runlist.lock); - rl = NTFS_I(vol->mft_ino)->runlist.rl; - BUG_ON(!rl); - } - /* Seek to element containing target vcn. */ - while (rl->length && rl[1].vcn <= vcn) - rl++; - lcn = ntfs_rl_vcn_to_lcn(rl, vcn); - /* For $MFT, only lcn >= 0 is a successful remap. */ - if (likely(lcn >= 0)) { - /* Setup buffer head to correct block. */ - bh->b_blocknr = ((lcn << - vol->cluster_size_bits) + - vcn_ofs) >> blocksize_bits; - set_buffer_mapped(bh); - } else { - bh->b_blocknr = -1; - ntfs_error(vol->sb, "Cannot write mft record " - "0x%lx because its location " - "on disk could not be " - "determined (error code %lli).", - ni->mft_no, (long long)lcn); - err = -EIO; - } - } - BUG_ON(!buffer_uptodate(bh)); - BUG_ON(!nr_bhs && (m_start != block_start)); - BUG_ON(nr_bhs >= max_bhs); - bhs[nr_bhs++] = bh; - BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end)); - } while (block_start = block_end, (bh = bh->b_this_page) != head); - if (unlikely(rl)) - up_read(&NTFS_I(vol->mft_ino)->runlist.lock); - if (!nr_bhs) - goto done; - if (unlikely(err)) - goto cleanup_out; - /* Apply the mst protection fixups. */ - err = pre_write_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size); - if (err) { - ntfs_error(vol->sb, "Failed to apply mst fixups!"); - goto cleanup_out; - } - flush_dcache_mft_record_page(ni); - /* Lock buffers and start synchronous write i/o on them. */ - for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { - struct buffer_head *tbh = bhs[i_bhs]; - - if (!trylock_buffer(tbh)) - BUG(); - BUG_ON(!buffer_uptodate(tbh)); - clear_buffer_dirty(tbh); - get_bh(tbh); - tbh->b_end_io = end_buffer_write_sync; - submit_bh(REQ_OP_WRITE, tbh); - } - /* Synchronize the mft mirror now if not @sync. */ - if (!sync && ni->mft_no < vol->mftmirr_size) - ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync); - /* Wait on i/o completion of buffers. */ - for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { - struct buffer_head *tbh = bhs[i_bhs]; - - wait_on_buffer(tbh); - if (unlikely(!buffer_uptodate(tbh))) { - err = -EIO; - /* - * Set the buffer uptodate so the page and buffer - * states do not become out of sync. - */ - if (PageUptodate(page)) - set_buffer_uptodate(tbh); - } - } - /* If @sync, now synchronize the mft mirror. */ - if (sync && ni->mft_no < vol->mftmirr_size) - ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync); - /* Remove the mst protection fixups again. */ - post_write_mst_fixup((NTFS_RECORD*)m); - flush_dcache_mft_record_page(ni); - if (unlikely(err)) { - /* I/O error during writing. This is really bad! */ - ntfs_error(vol->sb, "I/O error while writing mft record " - "0x%lx! Marking base inode as bad. 
You " - "should unmount the volume and run chkdsk.", - ni->mft_no); - goto err_out; - } -done: - ntfs_debug("Done."); - return 0; -cleanup_out: - /* Clean the buffers. */ - for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) - clear_buffer_dirty(bhs[i_bhs]); -err_out: - /* - * Current state: all buffers are clean, unlocked, and uptodate. - * The caller should mark the base inode as bad so that no more i/o - * happens. ->clear_inode() will still be invoked so all extent inodes - * and other allocated memory will be freed. - */ - if (err == -ENOMEM) { - ntfs_error(vol->sb, "Not enough memory to write mft record. " - "Redirtying so the write is retried later."); - mark_mft_record_dirty(ni); - err = 0; - } else - NVolSetErrors(vol); - return err; -} - -/** - * ntfs_may_write_mft_record - check if an mft record may be written out - * @vol: [IN] ntfs volume on which the mft record to check resides - * @mft_no: [IN] mft record number of the mft record to check - * @m: [IN] mapped mft record to check - * @locked_ni: [OUT] caller has to unlock this ntfs inode if one is returned - * - * Check if the mapped (base or extent) mft record @m with mft record number - * @mft_no belonging to the ntfs volume @vol may be written out. If necessary - * and possible the ntfs inode of the mft record is locked and the base vfs - * inode is pinned. The locked ntfs inode is then returned in @locked_ni. The - * caller is responsible for unlocking the ntfs inode and unpinning the base - * vfs inode. - * - * Return 'true' if the mft record may be written out and 'false' if not. - * - * The caller has locked the page and cleared the uptodate flag on it which - * means that we can safely write out any dirty mft records that do not have - * their inodes in icache as determined by ilookup5() as anyone - * opening/creating such an inode would block when attempting to map the mft - * record in read_cache_page() until we are finished with the write out. - * - * Here is a description of the tests we perform: - * - * If the inode is found in icache we know the mft record must be a base mft - * record. If it is dirty, we do not write it and return 'false' as the vfs - * inode write paths will result in the access times being updated which would - * cause the base mft record to be redirtied and written out again. (We know - * the access time update will modify the base mft record because Windows - * chkdsk complains if the standard information attribute is not in the base - * mft record.) - * - * If the inode is in icache and not dirty, we attempt to lock the mft record - * and if we find the lock was already taken, it is not safe to write the mft - * record and we return 'false'. - * - * If we manage to obtain the lock we have exclusive access to the mft record, - * which also allows us safe writeout of the mft record. We then set - * @locked_ni to the locked ntfs inode and return 'true'. - * - * Note we cannot just lock the mft record and sleep while waiting for the lock - * because this would deadlock due to lock reversal (normally the mft record is - * locked before the page is locked but we already have the page locked here - * when we try to lock the mft record). - * - * If the inode is not in icache we need to perform further checks. - * - * If the mft record is not a FILE record or it is a base mft record, we can - * safely write it and return 'true'. - * - * We now know the mft record is an extent mft record. We check if the inode - * corresponding to its base mft record is in icache and obtain a reference to - * it if it is. 
If it is not, we can safely write it and return 'true'. - * - * We now have the base inode for the extent mft record. We check if it has an - * ntfs inode for the extent mft record attached and if not it is safe to write - * the extent mft record and we return 'true'. - * - * The ntfs inode for the extent mft record is attached to the base inode so we - * attempt to lock the extent mft record and if we find the lock was already - * taken, it is not safe to write the extent mft record and we return 'false'. - * - * If we manage to obtain the lock we have exclusive access to the extent mft - * record, which also allows us safe writeout of the extent mft record. We - * set the ntfs inode of the extent mft record clean and then set @locked_ni to - * the now locked ntfs inode and return 'true'. - * - * Note, the reason for actually writing dirty mft records here and not just - * relying on the vfs inode dirty code paths is that we can have mft records - * modified without them ever having actual inodes in memory. Also we can have - * dirty mft records with clean ntfs inodes in memory. None of the described - * cases would result in the dirty mft records being written out if we only - * relied on the vfs inode dirty code paths. And these cases can really occur - * during allocation of new mft records and in particular when the - * initialized_size of the $MFT/$DATA attribute is extended and the new space - * is initialized using ntfs_mft_record_format(). The clean inode can then - * appear if the mft record is reused for a new inode before it got written - * out. - */ -bool ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no, - const MFT_RECORD *m, ntfs_inode **locked_ni) -{ - struct super_block *sb = vol->sb; - struct inode *mft_vi = vol->mft_ino; - struct inode *vi; - ntfs_inode *ni, *eni, **extent_nis; - int i; - ntfs_attr na; - - ntfs_debug("Entering for inode 0x%lx.", mft_no); - /* - * Normally we do not return a locked inode so set @locked_ni to NULL. - */ - BUG_ON(!locked_ni); - *locked_ni = NULL; - /* - * Check if the inode corresponding to this mft record is in the VFS - * inode cache and obtain a reference to it if it is. - */ - ntfs_debug("Looking for inode 0x%lx in icache.", mft_no); - na.mft_no = mft_no; - na.name = NULL; - na.name_len = 0; - na.type = AT_UNUSED; - /* - * Optimize inode 0, i.e. $MFT itself, since we have it in memory and - * we get here for it rather often. - */ - if (!mft_no) { - /* Balance the below iput(). */ - vi = igrab(mft_vi); - BUG_ON(vi != mft_vi); - } else { - /* - * Have to use ilookup5_nowait() since ilookup5() waits for the - * inode lock which causes ntfs to deadlock when a concurrent - * inode write via the inode dirty code paths and the page - * dirty code path of the inode dirty code path when writing - * $MFT occurs. - */ - vi = ilookup5_nowait(sb, mft_no, ntfs_test_inode, &na); - } - if (vi) { - ntfs_debug("Base inode 0x%lx is in icache.", mft_no); - /* The inode is in icache. */ - ni = NTFS_I(vi); - /* Take a reference to the ntfs inode. */ - atomic_inc(&ni->count); - /* If the inode is dirty, do not write this record. */ - if (NInoDirty(ni)) { - ntfs_debug("Inode 0x%lx is dirty, do not write it.", - mft_no); - atomic_dec(&ni->count); - iput(vi); - return false; - } - ntfs_debug("Inode 0x%lx is not dirty.", mft_no); - /* The inode is not dirty, try to take the mft record lock. 
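[Editor's note: the mutex_trylock() that follows is the key move: as the comment block above explains, the normal lock order is mrec_lock before the page lock, but here the page is already locked, so sleeping on mrec_lock could deadlock. Restated as a minimal pattern with a hypothetical helper; note the real code hands the still-locked inode back to the caller via @locked_ni instead of unlocking.]

/* Lock-order-safe write attempt while the page lock is already held.
 * Hypothetical helper illustrating the trylock pattern used below. */
#include <linux/mutex.h>
#include <linux/types.h>

static bool try_write_locked_record(struct mutex *mrec_lock)
{
        if (!mutex_trylock(mrec_lock))
                return false;           /* contended: skip, never sleep here */
        /* ... write the record while holding the lock ... */
        mutex_unlock(mrec_lock);
        return true;
}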
*/ - if (unlikely(!mutex_trylock(&ni->mrec_lock))) { - ntfs_debug("Mft record 0x%lx is already locked, do " - "not write it.", mft_no); - atomic_dec(&ni->count); - iput(vi); - return false; - } - ntfs_debug("Managed to lock mft record 0x%lx, write it.", - mft_no); - /* - * The write has to occur while we hold the mft record lock so - * return the locked ntfs inode. - */ - *locked_ni = ni; - return true; - } - ntfs_debug("Inode 0x%lx is not in icache.", mft_no); - /* The inode is not in icache. */ - /* Write the record if it is not a mft record (type "FILE"). */ - if (!ntfs_is_mft_record(m->magic)) { - ntfs_debug("Mft record 0x%lx is not a FILE record, write it.", - mft_no); - return true; - } - /* Write the mft record if it is a base inode. */ - if (!m->base_mft_record) { - ntfs_debug("Mft record 0x%lx is a base record, write it.", - mft_no); - return true; - } - /* - * This is an extent mft record. Check if the inode corresponding to - * its base mft record is in icache and obtain a reference to it if it - * is. - */ - na.mft_no = MREF_LE(m->base_mft_record); - ntfs_debug("Mft record 0x%lx is an extent record. Looking for base " - "inode 0x%lx in icache.", mft_no, na.mft_no); - if (!na.mft_no) { - /* Balance the below iput(). */ - vi = igrab(mft_vi); - BUG_ON(vi != mft_vi); - } else - vi = ilookup5_nowait(sb, na.mft_no, ntfs_test_inode, - &na); - if (!vi) { - /* - * The base inode is not in icache, write this extent mft - * record. - */ - ntfs_debug("Base inode 0x%lx is not in icache, write the " - "extent record.", na.mft_no); - return true; - } - ntfs_debug("Base inode 0x%lx is in icache.", na.mft_no); - /* - * The base inode is in icache. Check if it has the extent inode - * corresponding to this extent mft record attached. - */ - ni = NTFS_I(vi); - mutex_lock(&ni->extent_lock); - if (ni->nr_extents <= 0) { - /* - * The base inode has no attached extent inodes, write this - * extent mft record. - */ - mutex_unlock(&ni->extent_lock); - iput(vi); - ntfs_debug("Base inode 0x%lx has no attached extent inodes, " - "write the extent record.", na.mft_no); - return true; - } - /* Iterate over the attached extent inodes. */ - extent_nis = ni->ext.extent_ntfs_inos; - for (eni = NULL, i = 0; i < ni->nr_extents; ++i) { - if (mft_no == extent_nis[i]->mft_no) { - /* - * Found the extent inode corresponding to this extent - * mft record. - */ - eni = extent_nis[i]; - break; - } - } - /* - * If the extent inode was not attached to the base inode, write this - * extent mft record. - */ - if (!eni) { - mutex_unlock(&ni->extent_lock); - iput(vi); - ntfs_debug("Extent inode 0x%lx is not attached to its base " - "inode 0x%lx, write the extent record.", - mft_no, na.mft_no); - return true; - } - ntfs_debug("Extent inode 0x%lx is attached to its base inode 0x%lx.", - mft_no, na.mft_no); - /* Take a reference to the extent ntfs inode. */ - atomic_inc(&eni->count); - mutex_unlock(&ni->extent_lock); - /* - * Found the extent inode coresponding to this extent mft record. - * Try to take the mft record lock. - */ - if (unlikely(!mutex_trylock(&eni->mrec_lock))) { - atomic_dec(&eni->count); - iput(vi); - ntfs_debug("Extent mft record 0x%lx is already locked, do " - "not write it.", mft_no); - return false; - } - ntfs_debug("Managed to lock extent mft record 0x%lx, write it.", - mft_no); - if (NInoTestClearDirty(eni)) - ntfs_debug("Extent inode 0x%lx is dirty, marking it clean.", - mft_no); - /* - * The write has to occur while we hold the mft record lock so return - * the locked extent ntfs inode. 
- */ - *locked_ni = eni; - return true; -} - -static const char *es = " Leaving inconsistent metadata. Unmount and run " - "chkdsk."; - -/** - * ntfs_mft_bitmap_find_and_alloc_free_rec_nolock - see name - * @vol: volume on which to search for a free mft record - * @base_ni: open base inode if allocating an extent mft record or NULL - * - * Search for a free mft record in the mft bitmap attribute on the ntfs volume - * @vol. - * - * If @base_ni is NULL start the search at the default allocator position. - * - * If @base_ni is not NULL start the search at the mft record after the base - * mft record @base_ni. - * - * Return the free mft record on success and -errno on error. An error code of - * -ENOSPC means that there are no free mft records in the currently - * initialized mft bitmap. - * - * Locking: Caller must hold vol->mftbmp_lock for writing. - */ -static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol, - ntfs_inode *base_ni) -{ - s64 pass_end, ll, data_pos, pass_start, ofs, bit; - unsigned long flags; - struct address_space *mftbmp_mapping; - u8 *buf, *byte; - struct page *page; - unsigned int page_ofs, size; - u8 pass, b; - - ntfs_debug("Searching for free mft record in the currently " - "initialized mft bitmap."); - mftbmp_mapping = vol->mftbmp_ino->i_mapping; - /* - * Set the end of the pass making sure we do not overflow the mft - * bitmap. - */ - read_lock_irqsave(&NTFS_I(vol->mft_ino)->size_lock, flags); - pass_end = NTFS_I(vol->mft_ino)->allocated_size >> - vol->mft_record_size_bits; - read_unlock_irqrestore(&NTFS_I(vol->mft_ino)->size_lock, flags); - read_lock_irqsave(&NTFS_I(vol->mftbmp_ino)->size_lock, flags); - ll = NTFS_I(vol->mftbmp_ino)->initialized_size << 3; - read_unlock_irqrestore(&NTFS_I(vol->mftbmp_ino)->size_lock, flags); - if (pass_end > ll) - pass_end = ll; - pass = 1; - if (!base_ni) - data_pos = vol->mft_data_pos; - else - data_pos = base_ni->mft_no + 1; - if (data_pos < 24) - data_pos = 24; - if (data_pos >= pass_end) { - data_pos = 24; - pass = 2; - /* This happens on a freshly formatted volume. */ - if (data_pos >= pass_end) - return -ENOSPC; - } - pass_start = data_pos; - ntfs_debug("Starting bitmap search: pass %u, pass_start 0x%llx, " - "pass_end 0x%llx, data_pos 0x%llx.", pass, - (long long)pass_start, (long long)pass_end, - (long long)data_pos); - /* Loop until a free mft record is found. */ - for (; pass <= 2;) { - /* Cap size to pass_end. */ - ofs = data_pos >> 3; - page_ofs = ofs & ~PAGE_MASK; - size = PAGE_SIZE - page_ofs; - ll = ((pass_end + 7) >> 3) - ofs; - if (size > ll) - size = ll; - size <<= 3; - /* - * If we are still within the active pass, search the next page - * for a zero bit. - */ - if (size) { - page = ntfs_map_page(mftbmp_mapping, - ofs >> PAGE_SHIFT); - if (IS_ERR(page)) { - ntfs_error(vol->sb, "Failed to read mft " - "bitmap, aborting."); - return PTR_ERR(page); - } - buf = (u8*)page_address(page) + page_ofs; - bit = data_pos & 7; - data_pos &= ~7ull; - ntfs_debug("Before inner for loop: size 0x%x, " - "data_pos 0x%llx, bit 0x%llx", size, - (long long)data_pos, (long long)bit); - for (; bit < size && data_pos + bit < pass_end; - bit &= ~7ull, bit += 8) { - byte = buf + (bit >> 3); - if (*byte == 0xff) - continue; - b = ffz((unsigned long)*byte); - if (b < 8 && b >= (bit & 7)) { - ll = data_pos + (bit & ~7ull) + b; - if (unlikely(ll > (1ll << 32))) { - ntfs_unmap_page(page); - return -ENOSPC; - } - *byte |= 1 << b; - flush_dcache_page(page); - set_page_dirty(page); - ntfs_unmap_page(page); - ntfs_debug("Done. 
(Found and " - "allocated mft record " - "0x%llx.)", - (long long)ll); - return ll; - } - } - ntfs_debug("After inner for loop: size 0x%x, " - "data_pos 0x%llx, bit 0x%llx", size, - (long long)data_pos, (long long)bit); - data_pos += size; - ntfs_unmap_page(page); - /* - * If the end of the pass has not been reached yet, - * continue searching the mft bitmap for a zero bit. - */ - if (data_pos < pass_end) - continue; - } - /* Do the next pass. */ - if (++pass == 2) { - /* - * Starting the second pass, in which we scan the first - * part of the zone which we omitted earlier. - */ - pass_end = pass_start; - data_pos = pass_start = 24; - ntfs_debug("pass %i, pass_start 0x%llx, pass_end " - "0x%llx.", pass, (long long)pass_start, - (long long)pass_end); - if (data_pos >= pass_end) - break; - } - } - /* No free mft records in currently initialized mft bitmap. */ - ntfs_debug("Done. (No free mft records left in currently initialized " - "mft bitmap.)"); - return -ENOSPC; -} - -/** - * ntfs_mft_bitmap_extend_allocation_nolock - extend mft bitmap by a cluster - * @vol: volume on which to extend the mft bitmap attribute - * - * Extend the mft bitmap attribute on the ntfs volume @vol by one cluster. - * - * Note: Only changes allocated_size, i.e. does not touch initialized_size or - * data_size. - * - * Return 0 on success and -errno on error. - * - * Locking: - Caller must hold vol->mftbmp_lock for writing. - * - This function takes NTFS_I(vol->mftbmp_ino)->runlist.lock for - * writing and releases it before returning. - * - This function takes vol->lcnbmp_lock for writing and releases it - * before returning. - */ -static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol) -{ - LCN lcn; - s64 ll; - unsigned long flags; - struct page *page; - ntfs_inode *mft_ni, *mftbmp_ni; - runlist_element *rl, *rl2 = NULL; - ntfs_attr_search_ctx *ctx = NULL; - MFT_RECORD *mrec; - ATTR_RECORD *a = NULL; - int ret, mp_size; - u32 old_alen = 0; - u8 *b, tb; - struct { - u8 added_cluster:1; - u8 added_run:1; - u8 mp_rebuilt:1; - } status = { 0, 0, 0 }; - - ntfs_debug("Extending mft bitmap allocation."); - mft_ni = NTFS_I(vol->mft_ino); - mftbmp_ni = NTFS_I(vol->mftbmp_ino); - /* - * Determine the last lcn of the mft bitmap. The allocated size of the - * mft bitmap cannot be zero so we are ok to do this. - */ - down_write(&mftbmp_ni->runlist.lock); - read_lock_irqsave(&mftbmp_ni->size_lock, flags); - ll = mftbmp_ni->allocated_size; - read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); - rl = ntfs_attr_find_vcn_nolock(mftbmp_ni, - (ll - 1) >> vol->cluster_size_bits, NULL); - if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) { - up_write(&mftbmp_ni->runlist.lock); - ntfs_error(vol->sb, "Failed to determine last allocated " - "cluster of mft bitmap attribute."); - if (!IS_ERR(rl)) - ret = -EIO; - else - ret = PTR_ERR(rl); - return ret; - } - lcn = rl->lcn + rl->length; - ntfs_debug("Last lcn of mft bitmap attribute is 0x%llx.", - (long long)lcn); - /* - * Attempt to get the cluster following the last allocated cluster by - * hand as it may be in the MFT zone so the allocator would not give it - * to us. 
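[Editor's note: the bitmap search in ntfs_mft_bitmap_find_and_alloc_free_rec_nolock() above is, at its core, the classic find-first-zero-bit-and-set scan: skip 0xff bytes, take the first clear bit of the first non-full byte, set it, and return byte * 8 + bit. A freestanding restatement over a plain buffer; the driver additionally walks page-cache pages and respects the two pass boundaries.]

#include <stdint.h>
#include <stddef.h>

/* Find, set and return the first clear bit in buf[0..size-1]; -1 if none. */
static long long claim_first_zero_bit(uint8_t *buf, size_t size)
{
        for (size_t i = 0; i < size; i++) {
                if (buf[i] == 0xff)             /* all eight bits in use */
                        continue;
                unsigned b = 0;
                while (buf[i] & (1u << b))      /* open-coded ffz()      */
                        b++;
                buf[i] |= 1u << b;              /* claim the record      */
                return (long long)i * 8 + b;
        }
        return -1;                              /* caller maps to -ENOSPC */
}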
- */ - ll = lcn >> 3; - page = ntfs_map_page(vol->lcnbmp_ino->i_mapping, - ll >> PAGE_SHIFT); - if (IS_ERR(page)) { - up_write(&mftbmp_ni->runlist.lock); - ntfs_error(vol->sb, "Failed to read from lcn bitmap."); - return PTR_ERR(page); - } - b = (u8*)page_address(page) + (ll & ~PAGE_MASK); - tb = 1 << (lcn & 7ull); - down_write(&vol->lcnbmp_lock); - if (*b != 0xff && !(*b & tb)) { - /* Next cluster is free, allocate it. */ - *b |= tb; - flush_dcache_page(page); - set_page_dirty(page); - up_write(&vol->lcnbmp_lock); - ntfs_unmap_page(page); - /* Update the mft bitmap runlist. */ - rl->length++; - rl[1].vcn++; - status.added_cluster = 1; - ntfs_debug("Appending one cluster to mft bitmap."); - } else { - up_write(&vol->lcnbmp_lock); - ntfs_unmap_page(page); - /* Allocate a cluster from the DATA_ZONE. */ - rl2 = ntfs_cluster_alloc(vol, rl[1].vcn, 1, lcn, DATA_ZONE, - true); - if (IS_ERR(rl2)) { - up_write(&mftbmp_ni->runlist.lock); - ntfs_error(vol->sb, "Failed to allocate a cluster for " - "the mft bitmap."); - return PTR_ERR(rl2); - } - rl = ntfs_runlists_merge(mftbmp_ni->runlist.rl, rl2); - if (IS_ERR(rl)) { - up_write(&mftbmp_ni->runlist.lock); - ntfs_error(vol->sb, "Failed to merge runlists for mft " - "bitmap."); - if (ntfs_cluster_free_from_rl(vol, rl2)) { - ntfs_error(vol->sb, "Failed to deallocate " - "allocated cluster.%s", es); - NVolSetErrors(vol); - } - ntfs_free(rl2); - return PTR_ERR(rl); - } - mftbmp_ni->runlist.rl = rl; - status.added_run = 1; - ntfs_debug("Adding one run to mft bitmap."); - /* Find the last run in the new runlist. */ - for (; rl[1].length; rl++) - ; - } - /* - * Update the attribute record as well. Note: @rl is the last - * (non-terminator) runlist element of mft bitmap. - */ - mrec = map_mft_record(mft_ni); - if (IS_ERR(mrec)) { - ntfs_error(vol->sb, "Failed to map mft record."); - ret = PTR_ERR(mrec); - goto undo_alloc; - } - ctx = ntfs_attr_get_search_ctx(mft_ni, mrec); - if (unlikely(!ctx)) { - ntfs_error(vol->sb, "Failed to get search context."); - ret = -ENOMEM; - goto undo_alloc; - } - ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, - mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL, - 0, ctx); - if (unlikely(ret)) { - ntfs_error(vol->sb, "Failed to find last attribute extent of " - "mft bitmap attribute."); - if (ret == -ENOENT) - ret = -EIO; - goto undo_alloc; - } - a = ctx->attr; - ll = sle64_to_cpu(a->data.non_resident.lowest_vcn); - /* Search back for the previous last allocated cluster of mft bitmap. */ - for (rl2 = rl; rl2 > mftbmp_ni->runlist.rl; rl2--) { - if (ll >= rl2->vcn) - break; - } - BUG_ON(ll < rl2->vcn); - BUG_ON(ll >= rl2->vcn + rl2->length); - /* Get the size for the new mapping pairs array for this extent. */ - mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1); - if (unlikely(mp_size <= 0)) { - ntfs_error(vol->sb, "Get size for mapping pairs failed for " - "mft bitmap attribute extent."); - ret = mp_size; - if (!ret) - ret = -EIO; - goto undo_alloc; - } - /* Expand the attribute record if necessary. */ - old_alen = le32_to_cpu(a->length); - ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size + - le16_to_cpu(a->data.non_resident.mapping_pairs_offset)); - if (unlikely(ret)) { - if (ret != -ENOSPC) { - ntfs_error(vol->sb, "Failed to resize attribute " - "record for mft bitmap attribute."); - goto undo_alloc; - } - // TODO: Deal with this by moving this extent to a new mft - // record or by starting a new extent in a new mft record or by - // moving other attributes out of this mft record. 
- // Note: It will need to be a special mft record and if none of - // those are available it gets rather complicated... - ntfs_error(vol->sb, "Not enough space in this mft record to " - "accommodate extended mft bitmap attribute " - "extent. Cannot handle this yet."); - ret = -EOPNOTSUPP; - goto undo_alloc; - } - status.mp_rebuilt = 1; - /* Generate the mapping pairs array directly into the attr record. */ - ret = ntfs_mapping_pairs_build(vol, (u8*)a + - le16_to_cpu(a->data.non_resident.mapping_pairs_offset), - mp_size, rl2, ll, -1, NULL); - if (unlikely(ret)) { - ntfs_error(vol->sb, "Failed to build mapping pairs array for " - "mft bitmap attribute."); - goto undo_alloc; - } - /* Update the highest_vcn. */ - a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1); - /* - * We now have extended the mft bitmap allocated_size by one cluster. - * Reflect this in the ntfs_inode structure and the attribute record. - */ - if (a->data.non_resident.lowest_vcn) { - /* - * We are not in the first attribute extent, switch to it, but - * first ensure the changes will make it to disk later. - */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_reinit_search_ctx(ctx); - ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, - mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, - 0, ctx); - if (unlikely(ret)) { - ntfs_error(vol->sb, "Failed to find first attribute " - "extent of mft bitmap attribute."); - goto restore_undo_alloc; - } - a = ctx->attr; - } - write_lock_irqsave(&mftbmp_ni->size_lock, flags); - mftbmp_ni->allocated_size += vol->cluster_size; - a->data.non_resident.allocated_size = - cpu_to_sle64(mftbmp_ni->allocated_size); - write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); - /* Ensure the changes make it to disk. */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(mft_ni); - up_write(&mftbmp_ni->runlist.lock); - ntfs_debug("Done."); - return 0; -restore_undo_alloc: - ntfs_attr_reinit_search_ctx(ctx); - if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, - mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL, - 0, ctx)) { - ntfs_error(vol->sb, "Failed to find last attribute extent of " - "mft bitmap attribute.%s", es); - write_lock_irqsave(&mftbmp_ni->size_lock, flags); - mftbmp_ni->allocated_size += vol->cluster_size; - write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(mft_ni); - up_write(&mftbmp_ni->runlist.lock); - /* - * The only thing that is now wrong is ->allocated_size of the - * base attribute extent which chkdsk should be able to fix. - */ - NVolSetErrors(vol); - return ret; - } - a = ctx->attr; - a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 2); -undo_alloc: - if (status.added_cluster) { - /* Truncate the last run in the runlist by one cluster. */ - rl->length--; - rl[1].vcn--; - } else if (status.added_run) { - lcn = rl->lcn; - /* Remove the last run from the runlist. */ - rl->lcn = rl[1].lcn; - rl->length = 0; - } - /* Deallocate the cluster. 
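[Editor's note: the error unwinding here follows a pattern worth naming: record each side effect in the small status bitfield struct as it is made, then undo, in reverse order, only the steps that actually happened. A skeletal illustration with hypothetical stub steps; the real steps are the cluster allocation, runlist merge and mapping-pairs rebuild above.]

/* Skeletal rollback pattern; all step functions are hypothetical stubs. */
static int step_a(void) { return 0; }
static int step_b(void) { return 0; }
static void undo_a(void) { }
static void undo_b(void) { }

static int example_two_step_alloc(void)
{
        struct { unsigned a_done:1, b_done:1; } st = { 0, 0 };
        int err;

        err = step_a();
        if (err)
                goto undo;
        st.a_done = 1;
        err = step_b();
        if (err)
                goto undo;
        st.b_done = 1;
        return 0;
undo:
        if (st.b_done)          /* reverse order of the forward path */
                undo_b();
        if (st.a_done)
                undo_a();
        return err;
}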
*/ - down_write(&vol->lcnbmp_lock); - if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) { - ntfs_error(vol->sb, "Failed to free allocated cluster.%s", es); - NVolSetErrors(vol); - } - up_write(&vol->lcnbmp_lock); - if (status.mp_rebuilt) { - if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( - a->data.non_resident.mapping_pairs_offset), - old_alen - le16_to_cpu( - a->data.non_resident.mapping_pairs_offset), - rl2, ll, -1, NULL)) { - ntfs_error(vol->sb, "Failed to restore mapping pairs " - "array.%s", es); - NVolSetErrors(vol); - } - if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) { - ntfs_error(vol->sb, "Failed to restore attribute " - "record.%s", es); - NVolSetErrors(vol); - } - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - } - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (!IS_ERR(mrec)) - unmap_mft_record(mft_ni); - up_write(&mftbmp_ni->runlist.lock); - return ret; -} - -/** - * ntfs_mft_bitmap_extend_initialized_nolock - extend mftbmp initialized data - * @vol: volume on which to extend the mft bitmap attribute - * - * Extend the initialized portion of the mft bitmap attribute on the ntfs - * volume @vol by 8 bytes. - * - * Note: Only changes initialized_size and data_size, i.e. requires that - * allocated_size is big enough to fit the new initialized_size. - * - * Return 0 on success and -error on error. - * - * Locking: Caller must hold vol->mftbmp_lock for writing. - */ -static int ntfs_mft_bitmap_extend_initialized_nolock(ntfs_volume *vol) -{ - s64 old_data_size, old_initialized_size; - unsigned long flags; - struct inode *mftbmp_vi; - ntfs_inode *mft_ni, *mftbmp_ni; - ntfs_attr_search_ctx *ctx; - MFT_RECORD *mrec; - ATTR_RECORD *a; - int ret; - - ntfs_debug("Extending mft bitmap initiailized (and data) size."); - mft_ni = NTFS_I(vol->mft_ino); - mftbmp_vi = vol->mftbmp_ino; - mftbmp_ni = NTFS_I(mftbmp_vi); - /* Get the attribute record. */ - mrec = map_mft_record(mft_ni); - if (IS_ERR(mrec)) { - ntfs_error(vol->sb, "Failed to map mft record."); - return PTR_ERR(mrec); - } - ctx = ntfs_attr_get_search_ctx(mft_ni, mrec); - if (unlikely(!ctx)) { - ntfs_error(vol->sb, "Failed to get search context."); - ret = -ENOMEM; - goto unm_err_out; - } - ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, - mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(ret)) { - ntfs_error(vol->sb, "Failed to find first attribute extent of " - "mft bitmap attribute."); - if (ret == -ENOENT) - ret = -EIO; - goto put_err_out; - } - a = ctx->attr; - write_lock_irqsave(&mftbmp_ni->size_lock, flags); - old_data_size = i_size_read(mftbmp_vi); - old_initialized_size = mftbmp_ni->initialized_size; - /* - * We can simply update the initialized_size before filling the space - * with zeroes because the caller is holding the mft bitmap lock for - * writing which ensures that no one else is trying to access the data. - */ - mftbmp_ni->initialized_size += 8; - a->data.non_resident.initialized_size = - cpu_to_sle64(mftbmp_ni->initialized_size); - if (mftbmp_ni->initialized_size > old_data_size) { - i_size_write(mftbmp_vi, mftbmp_ni->initialized_size); - a->data.non_resident.data_size = - cpu_to_sle64(mftbmp_ni->initialized_size); - } - write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); - /* Ensure the changes make it to disk. */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(mft_ni); - /* Initialize the mft bitmap attribute value with zeroes. 
*/ - ret = ntfs_attr_set(mftbmp_ni, old_initialized_size, 8, 0); - if (likely(!ret)) { - ntfs_debug("Done. (Wrote eight initialized bytes to mft " - "bitmap."); - return 0; - } - ntfs_error(vol->sb, "Failed to write to mft bitmap."); - /* Try to recover from the error. */ - mrec = map_mft_record(mft_ni); - if (IS_ERR(mrec)) { - ntfs_error(vol->sb, "Failed to map mft record.%s", es); - NVolSetErrors(vol); - return ret; - } - ctx = ntfs_attr_get_search_ctx(mft_ni, mrec); - if (unlikely(!ctx)) { - ntfs_error(vol->sb, "Failed to get search context.%s", es); - NVolSetErrors(vol); - goto unm_err_out; - } - if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, - mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx)) { - ntfs_error(vol->sb, "Failed to find first attribute extent of " - "mft bitmap attribute.%s", es); - NVolSetErrors(vol); -put_err_out: - ntfs_attr_put_search_ctx(ctx); -unm_err_out: - unmap_mft_record(mft_ni); - goto err_out; - } - a = ctx->attr; - write_lock_irqsave(&mftbmp_ni->size_lock, flags); - mftbmp_ni->initialized_size = old_initialized_size; - a->data.non_resident.initialized_size = - cpu_to_sle64(old_initialized_size); - if (i_size_read(mftbmp_vi) != old_data_size) { - i_size_write(mftbmp_vi, old_data_size); - a->data.non_resident.data_size = cpu_to_sle64(old_data_size); - } - write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(mft_ni); -#ifdef DEBUG - read_lock_irqsave(&mftbmp_ni->size_lock, flags); - ntfs_debug("Restored status of mftbmp: allocated_size 0x%llx, " - "data_size 0x%llx, initialized_size 0x%llx.", - (long long)mftbmp_ni->allocated_size, - (long long)i_size_read(mftbmp_vi), - (long long)mftbmp_ni->initialized_size); - read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); -#endif /* DEBUG */ -err_out: - return ret; -} - -/** - * ntfs_mft_data_extend_allocation_nolock - extend mft data attribute - * @vol: volume on which to extend the mft data attribute - * - * Extend the mft data attribute on the ntfs volume @vol by 16 mft records - * worth of clusters or if not enough space for this by one mft record worth - * of clusters. - * - * Note: Only changes allocated_size, i.e. does not touch initialized_size or - * data_size. - * - * Return 0 on success and -errno on error. - * - * Locking: - Caller must hold vol->mftbmp_lock for writing. - * - This function takes NTFS_I(vol->mft_ino)->runlist.lock for - * writing and releases it before returning. - * - This function calls functions which take vol->lcnbmp_lock for - * writing and release it before returning. - */ -static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol) -{ - LCN lcn; - VCN old_last_vcn; - s64 min_nr, nr, ll; - unsigned long flags; - ntfs_inode *mft_ni; - runlist_element *rl, *rl2; - ntfs_attr_search_ctx *ctx = NULL; - MFT_RECORD *mrec; - ATTR_RECORD *a = NULL; - int ret, mp_size; - u32 old_alen = 0; - bool mp_rebuilt = false; - - ntfs_debug("Extending mft data allocation."); - mft_ni = NTFS_I(vol->mft_ino); - /* - * Determine the preferred allocation location, i.e. the last lcn of - * the mft data attribute. The allocated size of the mft data - * attribute cannot be zero so we are ok to do this. 
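[Editor's note: the cluster-count sizing a few lines below (min_nr and nr) is easiest to see with numbers. A standalone check, assuming hypothetical 1 KiB mft records and 4 KiB clusters, where one record is a fraction of a cluster.]

/* Illustrative only: the min_nr/nr sizing with assumed record/cluster sizes. */
#include <stdio.h>

int main(void)
{
        long long mft_record_size = 1024;       /* assumed                */
        unsigned cluster_size_bits = 12;        /* assumed 4 KiB clusters */
        long long min_nr, nr;

        min_nr = mft_record_size >> cluster_size_bits;  /* 0: sub-cluster */
        if (!min_nr)
                min_nr = 1;             /* at least one cluster           */
        nr = mft_record_size << 4 >> cluster_size_bits; /* 16 records = 4 */
        if (!nr)
                nr = min_nr;
        printf("min_nr=%lld nr=%lld\n", min_nr, nr);    /* 1 and 4        */
        return 0;
}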
- */ - down_write(&mft_ni->runlist.lock); - read_lock_irqsave(&mft_ni->size_lock, flags); - ll = mft_ni->allocated_size; - read_unlock_irqrestore(&mft_ni->size_lock, flags); - rl = ntfs_attr_find_vcn_nolock(mft_ni, - (ll - 1) >> vol->cluster_size_bits, NULL); - if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) { - up_write(&mft_ni->runlist.lock); - ntfs_error(vol->sb, "Failed to determine last allocated " - "cluster of mft data attribute."); - if (!IS_ERR(rl)) - ret = -EIO; - else - ret = PTR_ERR(rl); - return ret; - } - lcn = rl->lcn + rl->length; - ntfs_debug("Last lcn of mft data attribute is 0x%llx.", (long long)lcn); - /* Minimum allocation is one mft record worth of clusters. */ - min_nr = vol->mft_record_size >> vol->cluster_size_bits; - if (!min_nr) - min_nr = 1; - /* Want to allocate 16 mft records worth of clusters. */ - nr = vol->mft_record_size << 4 >> vol->cluster_size_bits; - if (!nr) - nr = min_nr; - /* Ensure we do not go above 2^32-1 mft records. */ - read_lock_irqsave(&mft_ni->size_lock, flags); - ll = mft_ni->allocated_size; - read_unlock_irqrestore(&mft_ni->size_lock, flags); - if (unlikely((ll + (nr << vol->cluster_size_bits)) >> - vol->mft_record_size_bits >= (1ll << 32))) { - nr = min_nr; - if (unlikely((ll + (nr << vol->cluster_size_bits)) >> - vol->mft_record_size_bits >= (1ll << 32))) { - ntfs_warning(vol->sb, "Cannot allocate mft record " - "because the maximum number of inodes " - "(2^32) has already been reached."); - up_write(&mft_ni->runlist.lock); - return -ENOSPC; - } - } - ntfs_debug("Trying mft data allocation with %s cluster count %lli.", - nr > min_nr ? "default" : "minimal", (long long)nr); - old_last_vcn = rl[1].vcn; - do { - rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE, - true); - if (!IS_ERR(rl2)) - break; - if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) { - ntfs_error(vol->sb, "Failed to allocate the minimal " - "number of clusters (%lli) for the " - "mft data attribute.", (long long)nr); - up_write(&mft_ni->runlist.lock); - return PTR_ERR(rl2); - } - /* - * There is not enough space to do the allocation, but there - * might be enough space to do a minimal allocation so try that - * before failing. - */ - nr = min_nr; - ntfs_debug("Retrying mft data allocation with minimal cluster " - "count %lli.", (long long)nr); - } while (1); - rl = ntfs_runlists_merge(mft_ni->runlist.rl, rl2); - if (IS_ERR(rl)) { - up_write(&mft_ni->runlist.lock); - ntfs_error(vol->sb, "Failed to merge runlists for mft data " - "attribute."); - if (ntfs_cluster_free_from_rl(vol, rl2)) { - ntfs_error(vol->sb, "Failed to deallocate clusters " - "from the mft data attribute.%s", es); - NVolSetErrors(vol); - } - ntfs_free(rl2); - return PTR_ERR(rl); - } - mft_ni->runlist.rl = rl; - ntfs_debug("Allocated %lli clusters.", (long long)nr); - /* Find the last run in the new runlist. */ - for (; rl[1].length; rl++) - ; - /* Update the attribute record as well. 
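[Editor's note: similarly, the 2^32 cap checked earlier in this function bounds mft record numbers to 32 bits. The test in isolation, with assumed sizes; all values here are hypothetical.]

/* Illustrative only: the 2^32-record cap test with assumed sizes. */
#include <stdio.h>

int main(void)
{
        long long ll = 1LL << 40;       /* hypothetical allocated_size   */
        long long nr = 4;               /* clusters about to be added    */
        unsigned cluster_size_bits = 12, mft_record_size_bits = 10;

        int over = ((ll + (nr << cluster_size_bits)) >>
                        mft_record_size_bits) >= (1LL << 32);
        printf("over=%d\n", over);      /* 0: still below 2^32 records   */
        return 0;
}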
*/ - mrec = map_mft_record(mft_ni); - if (IS_ERR(mrec)) { - ntfs_error(vol->sb, "Failed to map mft record."); - ret = PTR_ERR(mrec); - goto undo_alloc; - } - ctx = ntfs_attr_get_search_ctx(mft_ni, mrec); - if (unlikely(!ctx)) { - ntfs_error(vol->sb, "Failed to get search context."); - ret = -ENOMEM; - goto undo_alloc; - } - ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len, - CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx); - if (unlikely(ret)) { - ntfs_error(vol->sb, "Failed to find last attribute extent of " - "mft data attribute."); - if (ret == -ENOENT) - ret = -EIO; - goto undo_alloc; - } - a = ctx->attr; - ll = sle64_to_cpu(a->data.non_resident.lowest_vcn); - /* Search back for the previous last allocated cluster of mft data. */ - for (rl2 = rl; rl2 > mft_ni->runlist.rl; rl2--) { - if (ll >= rl2->vcn) - break; - } - BUG_ON(ll < rl2->vcn); - BUG_ON(ll >= rl2->vcn + rl2->length); - /* Get the size for the new mapping pairs array for this extent. */ - mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1); - if (unlikely(mp_size <= 0)) { - ntfs_error(vol->sb, "Get size for mapping pairs failed for " - "mft data attribute extent."); - ret = mp_size; - if (!ret) - ret = -EIO; - goto undo_alloc; - } - /* Expand the attribute record if necessary. */ - old_alen = le32_to_cpu(a->length); - ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size + - le16_to_cpu(a->data.non_resident.mapping_pairs_offset)); - if (unlikely(ret)) { - if (ret != -ENOSPC) { - ntfs_error(vol->sb, "Failed to resize attribute " - "record for mft data attribute."); - goto undo_alloc; - } - // TODO: Deal with this by moving this extent to a new mft - // record or by starting a new extent in a new mft record or by - // moving other attributes out of this mft record. - // Note: Use the special reserved mft records and ensure that - // this extent is not required to find the mft record in - // question. If no free special records are left we would need to - // move an existing record away, insert ours in its place, and - // then place the moved record into the newly allocated space - // and we would then need to update all references to this mft - // record appropriately. This is rather complicated... - ntfs_error(vol->sb, "Not enough space in this mft record to " - "accommodate extended mft data attribute " - "extent. Cannot handle this yet."); - ret = -EOPNOTSUPP; - goto undo_alloc; - } - mp_rebuilt = true; - /* Generate the mapping pairs array directly into the attr record. */ - ret = ntfs_mapping_pairs_build(vol, (u8*)a + - le16_to_cpu(a->data.non_resident.mapping_pairs_offset), - mp_size, rl2, ll, -1, NULL); - if (unlikely(ret)) { - ntfs_error(vol->sb, "Failed to build mapping pairs array of " - "mft data attribute."); - goto undo_alloc; - } - /* Update the highest_vcn. */ - a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1); - /* - * We have now extended the mft data allocated_size by nr clusters. - * Reflect this in the ntfs_inode structure and the attribute record. - * @rl is the last (non-terminator) runlist element of mft data - * attribute. - */ - if (a->data.non_resident.lowest_vcn) { - /* - * We are not in the first attribute extent, switch to it, but - * first ensure the changes will make it to disk later.
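The backwards walk shown above, which finds the runlist element containing the extent's lowest_vcn, can be sketched with a simplified runlist element; all values here are made up:

#include <stdio.h>

struct rle { long long vcn, lcn, length; };

int main(void)
{
	/* Stand-in runlist: three runs followed by the terminator. */
	struct rle rl[] = {
		{ 0, 100, 8 }, { 8, 300, 4 }, { 12, 500, 16 }, { 28, -1, 0 },
	};
	struct rle *p = &rl[2];	/* last real run, as after the merge */
	long long ll = 9;	/* pretend lowest_vcn of the extent */

	/* Walk back until we reach the run containing ll. */
	for (; p > rl; p--)
		if (ll >= p->vcn)
			break;
	printf("vcn %lld lives in the run starting at vcn %lld\n",
			ll, p->vcn);
	return 0;
}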
- */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_reinit_search_ctx(ctx); - ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name, - mft_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, - ctx); - if (unlikely(ret)) { - ntfs_error(vol->sb, "Failed to find first attribute " - "extent of mft data attribute."); - goto restore_undo_alloc; - } - a = ctx->attr; - } - write_lock_irqsave(&mft_ni->size_lock, flags); - mft_ni->allocated_size += nr << vol->cluster_size_bits; - a->data.non_resident.allocated_size = - cpu_to_sle64(mft_ni->allocated_size); - write_unlock_irqrestore(&mft_ni->size_lock, flags); - /* Ensure the changes make it to disk. */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(mft_ni); - up_write(&mft_ni->runlist.lock); - ntfs_debug("Done."); - return 0; -restore_undo_alloc: - ntfs_attr_reinit_search_ctx(ctx); - if (ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len, - CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx)) { - ntfs_error(vol->sb, "Failed to find last attribute extent of " - "mft data attribute.%s", es); - write_lock_irqsave(&mft_ni->size_lock, flags); - mft_ni->allocated_size += nr << vol->cluster_size_bits; - write_unlock_irqrestore(&mft_ni->size_lock, flags); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(mft_ni); - up_write(&mft_ni->runlist.lock); - /* - * The only thing that is now wrong is ->allocated_size of the - * base attribute extent which chkdsk should be able to fix. - */ - NVolSetErrors(vol); - return ret; - } - ctx->attr->data.non_resident.highest_vcn = - cpu_to_sle64(old_last_vcn - 1); -undo_alloc: - if (ntfs_cluster_free(mft_ni, old_last_vcn, -1, ctx) < 0) { - ntfs_error(vol->sb, "Failed to free clusters from mft data " - "attribute.%s", es); - NVolSetErrors(vol); - } - - if (ntfs_rl_truncate_nolock(vol, &mft_ni->runlist, old_last_vcn)) { - ntfs_error(vol->sb, "Failed to truncate mft data attribute " - "runlist.%s", es); - NVolSetErrors(vol); - } - if (ctx) { - a = ctx->attr; - if (mp_rebuilt && !IS_ERR(ctx->mrec)) { - if (ntfs_mapping_pairs_build(vol, (u8 *)a + le16_to_cpu( - a->data.non_resident.mapping_pairs_offset), - old_alen - le16_to_cpu( - a->data.non_resident.mapping_pairs_offset), - rl2, ll, -1, NULL)) { - ntfs_error(vol->sb, "Failed to restore mapping pairs " - "array.%s", es); - NVolSetErrors(vol); - } - if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) { - ntfs_error(vol->sb, "Failed to restore attribute " - "record.%s", es); - NVolSetErrors(vol); - } - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - } else if (IS_ERR(ctx->mrec)) { - ntfs_error(vol->sb, "Failed to restore attribute search " - "context.%s", es); - NVolSetErrors(vol); - } - ntfs_attr_put_search_ctx(ctx); - } - if (!IS_ERR(mrec)) - unmap_mft_record(mft_ni); - up_write(&mft_ni->runlist.lock); - return ret; -} - -/** - * ntfs_mft_record_layout - layout an mft record into a memory buffer - * @vol: volume to which the mft record will belong - * @mft_no: mft reference specifying the mft record number - * @m: destination buffer of size >= @vol->mft_record_size bytes - * - * Layout an empty, unused mft record with the mft record number @mft_no into - * the buffer @m. The volume @vol is needed because the mft record structure - * was modified in NTFS 3.1 so we need to know which volume version this mft - * record will be used on. - * - * Return 0 on success and -errno on error. 
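The header arithmetic this function performs can be previewed with plain integer math. A userspace sketch, assuming 1024-byte mft records, 512-byte ntfs blocks, and a made-up 48-byte header size standing in for sizeof(MFT_RECORD):

#include <stdio.h>

int main(void)
{
	unsigned mft_record_size = 1024, ntfs_block_size = 512;
	unsigned header_size = 48;	/* stand-in for sizeof(MFT_RECORD) */

	/* The update sequence array sits after the header, 2-byte aligned. */
	unsigned usa_ofs = (header_size + 1) & ~1U;
	/* One usn word plus one fixup word per 512-byte block. */
	unsigned usa_count = mft_record_size / ntfs_block_size + 1;
	/* Attributes start after the usa, 8-byte aligned. */
	unsigned attrs_offset = (usa_ofs + (usa_count << 1) + 7) & ~7U;
	/* Plus eight bytes for the termination attribute. */
	unsigned bytes_in_use = attrs_offset + 8;

	printf("usa_ofs %u, usa_count %u, attrs_offset %u, in use %u\n",
			usa_ofs, usa_count, attrs_offset, bytes_in_use);
	return 0;
}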
- */ -static int ntfs_mft_record_layout(const ntfs_volume *vol, const s64 mft_no, - MFT_RECORD *m) -{ - ATTR_RECORD *a; - - ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no); - if (mft_no >= (1ll << 32)) { - ntfs_error(vol->sb, "Mft record number 0x%llx exceeds " - "maximum of 2^32.", (long long)mft_no); - return -ERANGE; - } - /* Start by clearing the whole mft record to give us a clean slate. */ - memset(m, 0, vol->mft_record_size); - /* Aligned to 2-byte boundary. */ - if (vol->major_ver < 3 || (vol->major_ver == 3 && !vol->minor_ver)) - m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD_OLD) + 1) & ~1); - else { - m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD) + 1) & ~1); - /* - * Set the NTFS 3.1+ specific fields while we know that the - * volume version is 3.1+. - */ - m->reserved = 0; - m->mft_record_number = cpu_to_le32((u32)mft_no); - } - m->magic = magic_FILE; - if (vol->mft_record_size >= NTFS_BLOCK_SIZE) - m->usa_count = cpu_to_le16(vol->mft_record_size / - NTFS_BLOCK_SIZE + 1); - else { - m->usa_count = cpu_to_le16(1); - ntfs_warning(vol->sb, "Sector size is bigger than mft record " - "size. Setting usa_count to 1. If chkdsk " - "reports this as corruption, please email " - "linux-ntfs-dev@lists.sourceforge.net stating " - "that you saw this message and that the " - "modified filesystem created was corrupt. " - "Thank you."); - } - /* Set the update sequence number to 1. */ - *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = cpu_to_le16(1); - m->lsn = 0; - m->sequence_number = cpu_to_le16(1); - m->link_count = 0; - /* - * Place the attributes straight after the update sequence array, - * aligned to 8-byte boundary. - */ - m->attrs_offset = cpu_to_le16((le16_to_cpu(m->usa_ofs) + - (le16_to_cpu(m->usa_count) << 1) + 7) & ~7); - m->flags = 0; - /* - * Using attrs_offset plus eight bytes (for the termination attribute). - * attrs_offset is already aligned to 8-byte boundary, so no need to - * align again. - */ - m->bytes_in_use = cpu_to_le32(le16_to_cpu(m->attrs_offset) + 8); - m->bytes_allocated = cpu_to_le32(vol->mft_record_size); - m->base_mft_record = 0; - m->next_attr_instance = 0; - /* Add the termination attribute. */ - a = (ATTR_RECORD*)((u8*)m + le16_to_cpu(m->attrs_offset)); - a->type = AT_END; - a->length = 0; - ntfs_debug("Done."); - return 0; -} - -/** - * ntfs_mft_record_format - format an mft record on an ntfs volume - * @vol: volume on which to format the mft record - * @mft_no: mft record number to format - * - * Format the mft record @mft_no in $MFT/$DATA, i.e. lay out an empty, unused - * mft record into the appropriate place of the mft data attribute. This is - * used when extending the mft data attribute. - * - * Return 0 on success and -errno on error. - */ -static int ntfs_mft_record_format(const ntfs_volume *vol, const s64 mft_no) -{ - loff_t i_size; - struct inode *mft_vi = vol->mft_ino; - struct page *page; - MFT_RECORD *m; - pgoff_t index, end_index; - unsigned int ofs; - int err; - - ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no); - /* - * The index into the page cache and the offset within the page cache - * page of the wanted mft record. - */ - index = mft_no << vol->mft_record_size_bits >> PAGE_SHIFT; - ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_MASK; - /* The maximum valid index into the page cache for $MFT's data.
*/ - i_size = i_size_read(mft_vi); - end_index = i_size >> PAGE_SHIFT; - if (unlikely(index >= end_index)) { - if (unlikely(index > end_index || ofs + vol->mft_record_size >= - (i_size & ~PAGE_MASK))) { - ntfs_error(vol->sb, "Tried to format non-existent mft " - "record 0x%llx.", (long long)mft_no); - return -ENOENT; - } - } - /* Read, map, and pin the page containing the mft record. */ - page = ntfs_map_page(mft_vi->i_mapping, index); - if (IS_ERR(page)) { - ntfs_error(vol->sb, "Failed to map page containing mft record " - "to format 0x%llx.", (long long)mft_no); - return PTR_ERR(page); - } - lock_page(page); - BUG_ON(!PageUptodate(page)); - ClearPageUptodate(page); - m = (MFT_RECORD*)((u8*)page_address(page) + ofs); - err = ntfs_mft_record_layout(vol, mft_no, m); - if (unlikely(err)) { - ntfs_error(vol->sb, "Failed to layout mft record 0x%llx.", - (long long)mft_no); - SetPageUptodate(page); - unlock_page(page); - ntfs_unmap_page(page); - return err; - } - flush_dcache_page(page); - SetPageUptodate(page); - unlock_page(page); - /* - * Make sure the mft record is written out to disk. We could use - * ilookup5() to check if an inode is in icache and so on but this is - * unnecessary as ntfs_writepage() will write the dirty record anyway. - */ - mark_ntfs_record_dirty(page, ofs); - ntfs_unmap_page(page); - ntfs_debug("Done."); - return 0; -} - -/** - * ntfs_mft_record_alloc - allocate an mft record on an ntfs volume - * @vol: [IN] volume on which to allocate the mft record - * @mode: [IN] mode if we want a file or directory, i.e. base inode, or 0 - * @base_ni: [IN] open base inode if allocating an extent mft record or NULL - * @mrec: [OUT] on successful return this is the mapped mft record - * - * Allocate an mft record in $MFT/$DATA of an open ntfs volume @vol. - * - * If @base_ni is NULL make the mft record a base mft record, i.e. a file or - * directory inode, and allocate it at the default allocator position. In - * this case @mode is the file mode as given to us by the caller. We in - * particular use @mode to distinguish whether a file or a directory is being - * created (S_ISDIR(mode) and S_ISREG(mode), respectively). - * - * If @base_ni is not NULL make the allocated mft record an extent record, - * allocate it starting at the mft record after the base mft record and attach - * the allocated and opened ntfs inode to the base inode @base_ni. In this - * case @mode must be 0 as it is meaningless for extent inodes. - * - * You need to check the return value with IS_ERR(). If false, the function - * was successful and the return value is the now opened ntfs inode of the - * allocated mft record. *@mrec is then set to the allocated, mapped, pinned, - * and locked mft record. If IS_ERR() is true, the function failed and the - * error code is obtained from PTR_ERR(return value). *@mrec is undefined in - * this case. - * - * Allocation strategy: - * - * To find a free mft record, we scan the mft bitmap for a zero bit. To - * optimize this we start scanning at the place specified by @base_ni or if - * @base_ni is NULL we start where we last stopped and we perform wrap around - * when we reach the end. Note, we do not try to allocate mft records below - * number 24 because numbers 0 to 15 are the defined system files anyway and 16 - * to 24 are special in that they are used for storing extension mft records - * for the $DATA attribute of $MFT. This is required to avoid the possibility - * of creating a runlist with a circular dependency which once written to disk - * can never be read in again.
Windows will only use records 16 to 24 for - * normal files if the volume is completely out of space. We never use them - * which means that when the volume is really out of space we cannot create any - * more files while Windows can still create up to 8 small files. We can start - * doing this at some later time; it does not matter much for now. - * - * When scanning the mft bitmap, we only search up to the last allocated mft - * record. If there are no free records left in the range 24 to the number of - * allocated mft records, then we extend the $MFT/$DATA attribute in order to - * create free mft records. We extend the allocated size of $MFT/$DATA by 16 - * records at a time or one cluster, if cluster size is above 16kiB. If there - * is not sufficient space to do this, we try to extend by a single mft record - * or one cluster, if cluster size is above the mft record size. - * - * No matter how many mft records we allocate, we initialize only the first - * allocated mft record, incrementing mft data size and initialized size - * accordingly, open an ntfs_inode for it and return it to the caller, unless - * there are fewer than 24 mft records, in which case we allocate and initialize - * mft records until we reach record 24 which we consider as the first free mft - * record for use by normal files. - * - * If during any stage we overflow the initialized data in the mft bitmap, we - * extend the initialized size (and data size) by 8 bytes, allocating another - * cluster if required. The bitmap data size has to be at least equal to the - * number of mft records in the mft, but it can be bigger, in which case the - * superfluous bits are padded with zeroes. - * - * Thus, when we return successfully (IS_ERR() is false), we will have: - * - initialized / extended the mft bitmap if necessary, - * - initialized / extended the mft data if necessary, - * - set the bit corresponding to the mft record being allocated in the - * mft bitmap, - * - opened an ntfs_inode for the allocated mft record, and we will have - * - returned the ntfs_inode as well as the allocated mapped, pinned, and - * locked mft record. - * - * On error, the volume will be left in a consistent state and no record will - * be allocated. If rolling back a partial operation fails, we may leave some - * inconsistent metadata in which case we set NVolSetErrors() so the volume is - * left dirty when unmounted. - * - * Note, this function cannot make use of most of the normal functions, like - * for example for attribute resizing, etc, because when the run list overflows - * the base mft record and an attribute list is used, it is very important that - * the extension mft records used to store the $DATA attribute of $MFT can be - * reached without having to read the information contained inside them, as - * this would make it impossible to find them in the first place after the - * volume is unmounted. $MFT/$BITMAP probably does not need to follow this - * rule because the bitmap is not essential for finding the mft records, but on - * the other hand, handling the bitmap in this special way would make life - * easier because otherwise there might be circular invocations of functions - * when reading the bitmap.
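The scan this strategy describes amounts to a find-first-zero-bit over the initialized portion of the bitmap, never considering bits below 24. A minimal userspace sketch over a plain byte array, with made-up bitmap contents:

#include <stdint.h>
#include <stdio.h>

/* Find the first zero bit in [from, limit), or -1 if none. */
static long long find_free(const uint8_t *bmp, long long from, long long limit)
{
	for (long long bit = from; bit < limit; bit++)
		if (!(bmp[bit >> 3] & (1 << (bit & 7))))
			return bit;
	return -1;
}

int main(void)
{
	/* Records 0..27 in use, 28 free. */
	uint8_t bmp[8] = { 0xff, 0xff, 0xff, 0xef, 0, 0, 0, 0 };
	/* Bits 0..23 are never handed out, so start the scan at 24. */
	long long bit = find_free(bmp, 24, 32);

	printf("first free mft record: %lld\n", bit);	/* prints 28 */
	return 0;
}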
- */ -ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode, - ntfs_inode *base_ni, MFT_RECORD **mrec) -{ - s64 ll, bit, old_data_initialized, old_data_size; - unsigned long flags; - struct inode *vi; - struct page *page; - ntfs_inode *mft_ni, *mftbmp_ni, *ni; - ntfs_attr_search_ctx *ctx; - MFT_RECORD *m; - ATTR_RECORD *a; - pgoff_t index; - unsigned int ofs; - int err; - le16 seq_no, usn; - bool record_formatted = false; - - if (base_ni) { - ntfs_debug("Entering (allocating an extent mft record for " - "base mft record 0x%llx).", - (long long)base_ni->mft_no); - /* @mode and @base_ni are mutually exclusive. */ - BUG_ON(mode); - } else - ntfs_debug("Entering (allocating a base mft record)."); - if (mode) { - /* @mode and @base_ni are mutually exclusive. */ - BUG_ON(base_ni); - /* We only support creation of normal files and directories. */ - if (!S_ISREG(mode) && !S_ISDIR(mode)) - return ERR_PTR(-EOPNOTSUPP); - } - BUG_ON(!mrec); - mft_ni = NTFS_I(vol->mft_ino); - mftbmp_ni = NTFS_I(vol->mftbmp_ino); - down_write(&vol->mftbmp_lock); - bit = ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(vol, base_ni); - if (bit >= 0) { - ntfs_debug("Found and allocated free record (#1), bit 0x%llx.", - (long long)bit); - goto have_alloc_rec; - } - if (bit != -ENOSPC) { - up_write(&vol->mftbmp_lock); - return ERR_PTR(bit); - } - /* - * No free mft records left. If the mft bitmap already covers more - * than the currently used mft records, the next records are all free, - * so we can simply allocate the first unused mft record. - * Note: We also have to make sure that the mft bitmap at least covers - * the first 24 mft records as they are special and whilst they may not - * be in use, we do not allocate from them. - */ - read_lock_irqsave(&mft_ni->size_lock, flags); - ll = mft_ni->initialized_size >> vol->mft_record_size_bits; - read_unlock_irqrestore(&mft_ni->size_lock, flags); - read_lock_irqsave(&mftbmp_ni->size_lock, flags); - old_data_initialized = mftbmp_ni->initialized_size; - read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); - if (old_data_initialized << 3 > ll && old_data_initialized > 3) { - bit = ll; - if (bit < 24) - bit = 24; - if (unlikely(bit >= (1ll << 32))) - goto max_err_out; - ntfs_debug("Found free record (#2), bit 0x%llx.", - (long long)bit); - goto found_free_rec; - } - /* - * The mft bitmap needs to be expanded until it covers the first unused - * mft record that we can allocate. - * Note: The smallest mft record we allocate is mft record 24. - */ - bit = old_data_initialized << 3; - if (unlikely(bit >= (1ll << 32))) - goto max_err_out; - read_lock_irqsave(&mftbmp_ni->size_lock, flags); - old_data_size = mftbmp_ni->allocated_size; - ntfs_debug("Status of mftbmp before extension: allocated_size 0x%llx, " - "data_size 0x%llx, initialized_size 0x%llx.", - (long long)old_data_size, - (long long)i_size_read(vol->mftbmp_ino), - (long long)old_data_initialized); - read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); - if (old_data_initialized + 8 > old_data_size) { - /* Need to extend bitmap by one more cluster. 
*/ - ntfs_debug("mftbmp: initialized_size + 8 > allocated_size."); - err = ntfs_mft_bitmap_extend_allocation_nolock(vol); - if (unlikely(err)) { - up_write(&vol->mftbmp_lock); - goto err_out; - } -#ifdef DEBUG - read_lock_irqsave(&mftbmp_ni->size_lock, flags); - ntfs_debug("Status of mftbmp after allocation extension: " - "allocated_size 0x%llx, data_size 0x%llx, " - "initialized_size 0x%llx.", - (long long)mftbmp_ni->allocated_size, - (long long)i_size_read(vol->mftbmp_ino), - (long long)mftbmp_ni->initialized_size); - read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); -#endif /* DEBUG */ - } - /* - * We now have sufficient allocated space, extend the initialized_size - * as well as the data_size if necessary and fill the new space with - * zeroes. - */ - err = ntfs_mft_bitmap_extend_initialized_nolock(vol); - if (unlikely(err)) { - up_write(&vol->mftbmp_lock); - goto err_out; - } -#ifdef DEBUG - read_lock_irqsave(&mftbmp_ni->size_lock, flags); - ntfs_debug("Status of mftbmp after initialized extension: " - "allocated_size 0x%llx, data_size 0x%llx, " - "initialized_size 0x%llx.", - (long long)mftbmp_ni->allocated_size, - (long long)i_size_read(vol->mftbmp_ino), - (long long)mftbmp_ni->initialized_size); - read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); -#endif /* DEBUG */ - ntfs_debug("Found free record (#3), bit 0x%llx.", (long long)bit); -found_free_rec: - /* @bit is the found free mft record, allocate it in the mft bitmap. */ - ntfs_debug("At found_free_rec."); - err = ntfs_bitmap_set_bit(vol->mftbmp_ino, bit); - if (unlikely(err)) { - ntfs_error(vol->sb, "Failed to allocate bit in mft bitmap."); - up_write(&vol->mftbmp_lock); - goto err_out; - } - ntfs_debug("Set bit 0x%llx in mft bitmap.", (long long)bit); -have_alloc_rec: - /* - * The mft bitmap is now uptodate. Deal with mft data attribute now. - * Note, we keep hold of the mft bitmap lock for writing until all - * modifications to the mft data attribute are complete, too, as they - * will impact decisions for mft bitmap and mft record allocation done - * by a parallel allocation and if the lock is not maintained a - * parallel allocation could allocate the same mft record as this one. - */ - ll = (bit + 1) << vol->mft_record_size_bits; - read_lock_irqsave(&mft_ni->size_lock, flags); - old_data_initialized = mft_ni->initialized_size; - read_unlock_irqrestore(&mft_ni->size_lock, flags); - if (ll <= old_data_initialized) { - ntfs_debug("Allocated mft record already initialized."); - goto mft_rec_already_initialized; - } - ntfs_debug("Initializing allocated mft record."); - /* - * The mft record is outside the initialized data. Extend the mft data - * attribute until it covers the allocated record. The loop is only - * actually traversed more than once when a freshly formatted volume is - * first written to so it optimizes away nicely in the common case. 
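The covering condition the loop below tests reduces to byte arithmetic: the record is initialized once the byte offset just past it fits inside the allocation. A sketch of that math with made-up sizes:

#include <stdio.h>

int main(void)
{
	long long bit = 70;			/* record being allocated */
	unsigned mft_record_size_bits = 10;	/* assume 1kiB records */
	long long allocated_size = 64 << 10;	/* assume 64kiB of mft data */
	long long extend = 16 << 10;		/* 16 records per extension */

	/* Byte offset just past the allocated record. */
	long long ll = (bit + 1) << mft_record_size_bits;

	/* Keep extending until the record lies inside the allocation. */
	while (ll > allocated_size)
		allocated_size += extend;
	printf("allocated_size now %lld bytes\n", allocated_size);
	return 0;
}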
- */ - read_lock_irqsave(&mft_ni->size_lock, flags); - ntfs_debug("Status of mft data before extension: " - "allocated_size 0x%llx, data_size 0x%llx, " - "initialized_size 0x%llx.", - (long long)mft_ni->allocated_size, - (long long)i_size_read(vol->mft_ino), - (long long)mft_ni->initialized_size); - while (ll > mft_ni->allocated_size) { - read_unlock_irqrestore(&mft_ni->size_lock, flags); - err = ntfs_mft_data_extend_allocation_nolock(vol); - if (unlikely(err)) { - ntfs_error(vol->sb, "Failed to extend mft data " - "allocation."); - goto undo_mftbmp_alloc_nolock; - } - read_lock_irqsave(&mft_ni->size_lock, flags); - ntfs_debug("Status of mft data after allocation extension: " - "allocated_size 0x%llx, data_size 0x%llx, " - "initialized_size 0x%llx.", - (long long)mft_ni->allocated_size, - (long long)i_size_read(vol->mft_ino), - (long long)mft_ni->initialized_size); - } - read_unlock_irqrestore(&mft_ni->size_lock, flags); - /* - * Extend mft data initialized size (and data size of course) to reach - * the allocated mft record, formatting the mft records along the way. - * Note: We only modify the ntfs_inode structure as that is all that is - * needed by ntfs_mft_record_format(). We will update the attribute - * record itself in one fell swoop later on. - */ - write_lock_irqsave(&mft_ni->size_lock, flags); - old_data_initialized = mft_ni->initialized_size; - old_data_size = vol->mft_ino->i_size; - while (ll > mft_ni->initialized_size) { - s64 new_initialized_size, mft_no; - - new_initialized_size = mft_ni->initialized_size + - vol->mft_record_size; - mft_no = mft_ni->initialized_size >> vol->mft_record_size_bits; - if (new_initialized_size > i_size_read(vol->mft_ino)) - i_size_write(vol->mft_ino, new_initialized_size); - write_unlock_irqrestore(&mft_ni->size_lock, flags); - ntfs_debug("Initializing mft record 0x%llx.", - (long long)mft_no); - err = ntfs_mft_record_format(vol, mft_no); - if (unlikely(err)) { - ntfs_error(vol->sb, "Failed to format mft record."); - goto undo_data_init; - } - write_lock_irqsave(&mft_ni->size_lock, flags); - mft_ni->initialized_size = new_initialized_size; - } - write_unlock_irqrestore(&mft_ni->size_lock, flags); - record_formatted = true; - /* Update the mft data attribute record to reflect the new sizes. */ - m = map_mft_record(mft_ni); - if (IS_ERR(m)) { - ntfs_error(vol->sb, "Failed to map mft record."); - err = PTR_ERR(m); - goto undo_data_init; - } - ctx = ntfs_attr_get_search_ctx(mft_ni, m); - if (unlikely(!ctx)) { - ntfs_error(vol->sb, "Failed to get search context."); - err = -ENOMEM; - unmap_mft_record(mft_ni); - goto undo_data_init; - } - err = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - ntfs_error(vol->sb, "Failed to find first attribute extent of " - "mft data attribute."); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(mft_ni); - goto undo_data_init; - } - a = ctx->attr; - read_lock_irqsave(&mft_ni->size_lock, flags); - a->data.non_resident.initialized_size = - cpu_to_sle64(mft_ni->initialized_size); - a->data.non_resident.data_size = - cpu_to_sle64(i_size_read(vol->mft_ino)); - read_unlock_irqrestore(&mft_ni->size_lock, flags); - /* Ensure the changes make it to disk.
*/ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(mft_ni); - read_lock_irqsave(&mft_ni->size_lock, flags); - ntfs_debug("Status of mft data after mft record initialization: " - "allocated_size 0x%llx, data_size 0x%llx, " - "initialized_size 0x%llx.", - (long long)mft_ni->allocated_size, - (long long)i_size_read(vol->mft_ino), - (long long)mft_ni->initialized_size); - BUG_ON(i_size_read(vol->mft_ino) > mft_ni->allocated_size); - BUG_ON(mft_ni->initialized_size > i_size_read(vol->mft_ino)); - read_unlock_irqrestore(&mft_ni->size_lock, flags); -mft_rec_already_initialized: - /* - * We can finally drop the mft bitmap lock as the mft data attribute - * has been fully updated. The only disparity left is that the - * allocated mft record still needs to be marked as in use to match the - * set bit in the mft bitmap but this is actually not a problem since - * this mft record is not referenced from anywhere yet and the fact - * that it is allocated in the mft bitmap means that no-one will try to - * allocate it either. - */ - up_write(&vol->mftbmp_lock); - /* - * We now have allocated and initialized the mft record. Calculate the - * index of and the offset within the page cache page the record is in. - */ - index = bit << vol->mft_record_size_bits >> PAGE_SHIFT; - ofs = (bit << vol->mft_record_size_bits) & ~PAGE_MASK; - /* Read, map, and pin the page containing the mft record. */ - page = ntfs_map_page(vol->mft_ino->i_mapping, index); - if (IS_ERR(page)) { - ntfs_error(vol->sb, "Failed to map page containing allocated " - "mft record 0x%llx.", (long long)bit); - err = PTR_ERR(page); - goto undo_mftbmp_alloc; - } - lock_page(page); - BUG_ON(!PageUptodate(page)); - ClearPageUptodate(page); - m = (MFT_RECORD*)((u8*)page_address(page) + ofs); - /* If we just formatted the mft record no need to do it again. */ - if (!record_formatted) { - /* Sanity check that the mft record is really not in use. */ - if (ntfs_is_file_record(m->magic) && - (m->flags & MFT_RECORD_IN_USE)) { - ntfs_error(vol->sb, "Mft record 0x%llx was marked " - "free in mft bitmap but is marked " - "used itself. Corrupt filesystem. " - "Unmount and run chkdsk.", - (long long)bit); - err = -EIO; - SetPageUptodate(page); - unlock_page(page); - ntfs_unmap_page(page); - NVolSetErrors(vol); - goto undo_mftbmp_alloc; - } - /* - * We need to (re-)format the mft record, preserving the - * sequence number if it is not zero as well as the update - * sequence number if it is not zero or -1 (0xffff). This - * means we do not need to care whether or not something went - * wrong with the previous mft record. - */ - seq_no = m->sequence_number; - usn = *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)); - err = ntfs_mft_record_layout(vol, bit, m); - if (unlikely(err)) { - ntfs_error(vol->sb, "Failed to layout allocated mft " - "record 0x%llx.", (long long)bit); - SetPageUptodate(page); - unlock_page(page); - ntfs_unmap_page(page); - goto undo_mftbmp_alloc; - } - if (seq_no) - m->sequence_number = seq_no; - if (usn && le16_to_cpu(usn) != 0xffff) - *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = usn; - } - /* Set the mft record itself in use. */ - m->flags |= MFT_RECORD_IN_USE; - if (S_ISDIR(mode)) - m->flags |= MFT_RECORD_IS_DIRECTORY; - flush_dcache_page(page); - SetPageUptodate(page); - if (base_ni) { - MFT_RECORD *m_tmp; - - /* - * Setup the base mft record in the extent mft record. 
This - * completes initialization of the allocated extent mft record - * and we can simply use it with map_extent_mft_record(). - */ - m->base_mft_record = MK_LE_MREF(base_ni->mft_no, - base_ni->seq_no); - /* - * Allocate an extent inode structure for the new mft record, - * attach it to the base inode @base_ni and map, pin, and lock - * its, i.e. the allocated, mft record. - */ - m_tmp = map_extent_mft_record(base_ni, bit, &ni); - if (IS_ERR(m_tmp)) { - ntfs_error(vol->sb, "Failed to map allocated extent " - "mft record 0x%llx.", (long long)bit); - err = PTR_ERR(m_tmp); - /* Set the mft record itself not in use. */ - m->flags &= cpu_to_le16( - ~le16_to_cpu(MFT_RECORD_IN_USE)); - flush_dcache_page(page); - /* Make sure the mft record is written out to disk. */ - mark_ntfs_record_dirty(page, ofs); - unlock_page(page); - ntfs_unmap_page(page); - goto undo_mftbmp_alloc; - } - BUG_ON(m != m_tmp); - /* - * Make sure the allocated mft record is written out to disk. - * No need to set the inode dirty because the caller is going - * to do that anyway after finishing with the new extent mft - * record (e.g. at a minimum a new attribute will be added to - * the mft record. - */ - mark_ntfs_record_dirty(page, ofs); - unlock_page(page); - /* - * Need to unmap the page since map_extent_mft_record() mapped - * it as well so we have it mapped twice at the moment. - */ - ntfs_unmap_page(page); - } else { - /* - * Allocate a new VFS inode and set it up. NOTE: @vi->i_nlink - * is set to 1 but the mft record->link_count is 0. The caller - * needs to bear this in mind. - */ - vi = new_inode(vol->sb); - if (unlikely(!vi)) { - err = -ENOMEM; - /* Set the mft record itself not in use. */ - m->flags &= cpu_to_le16( - ~le16_to_cpu(MFT_RECORD_IN_USE)); - flush_dcache_page(page); - /* Make sure the mft record is written out to disk. */ - mark_ntfs_record_dirty(page, ofs); - unlock_page(page); - ntfs_unmap_page(page); - goto undo_mftbmp_alloc; - } - vi->i_ino = bit; - - /* The owner and group come from the ntfs volume. */ - vi->i_uid = vol->uid; - vi->i_gid = vol->gid; - - /* Initialize the ntfs specific part of @vi. */ - ntfs_init_big_inode(vi); - ni = NTFS_I(vi); - /* - * Set the appropriate mode, attribute type, and name. For - * directories, also setup the index values to the defaults. - */ - if (S_ISDIR(mode)) { - vi->i_mode = S_IFDIR | S_IRWXUGO; - vi->i_mode &= ~vol->dmask; - - NInoSetMstProtected(ni); - ni->type = AT_INDEX_ALLOCATION; - ni->name = I30; - ni->name_len = 4; - - ni->itype.index.block_size = 4096; - ni->itype.index.block_size_bits = ntfs_ffs(4096) - 1; - ni->itype.index.collation_rule = COLLATION_FILE_NAME; - if (vol->cluster_size <= ni->itype.index.block_size) { - ni->itype.index.vcn_size = vol->cluster_size; - ni->itype.index.vcn_size_bits = - vol->cluster_size_bits; - } else { - ni->itype.index.vcn_size = vol->sector_size; - ni->itype.index.vcn_size_bits = - vol->sector_size_bits; - } - } else { - vi->i_mode = S_IFREG | S_IRWXUGO; - vi->i_mode &= ~vol->fmask; - - ni->type = AT_DATA; - ni->name = NULL; - ni->name_len = 0; - } - if (IS_RDONLY(vi)) - vi->i_mode &= ~S_IWUGO; - - /* Set the inode times to the current time. */ - simple_inode_init_ts(vi); - /* - * Set the file size to 0, the ntfs inode sizes are set to 0 by - * the call to ntfs_init_big_inode() below. - */ - vi->i_size = 0; - vi->i_blocks = 0; - - /* Set the sequence number. 
*/ - vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number); - /* - * Manually map, pin, and lock the mft record as we already - * have its page mapped and it is very easy to do. - */ - atomic_inc(&ni->count); - mutex_lock(&ni->mrec_lock); - ni->page = page; - ni->page_ofs = ofs; - /* - * Make sure the allocated mft record is written out to disk. - * NOTE: We do not set the ntfs inode dirty because this would - * fail in ntfs_write_inode() because the inode does not have a - * standard information attribute yet. Also, there is no need - * to set the inode dirty because the caller is going to do - * that anyway after finishing with the new mft record (e.g. at - * a minimum some new attributes will be added to the mft - * record). - */ - mark_ntfs_record_dirty(page, ofs); - unlock_page(page); - - /* Add the inode to the inode hash for the superblock. */ - insert_inode_hash(vi); - - /* Update the default mft allocation position. */ - vol->mft_data_pos = bit + 1; - } - /* - * Return the opened, allocated inode of the allocated mft record as - * well as the mapped, pinned, and locked mft record. - */ - ntfs_debug("Returning opened, allocated %sinode 0x%llx.", - base_ni ? "extent " : "", (long long)bit); - *mrec = m; - return ni; -undo_data_init: - write_lock_irqsave(&mft_ni->size_lock, flags); - mft_ni->initialized_size = old_data_initialized; - i_size_write(vol->mft_ino, old_data_size); - write_unlock_irqrestore(&mft_ni->size_lock, flags); - goto undo_mftbmp_alloc_nolock; -undo_mftbmp_alloc: - down_write(&vol->mftbmp_lock); -undo_mftbmp_alloc_nolock: - if (ntfs_bitmap_clear_bit(vol->mftbmp_ino, bit)) { - ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es); - NVolSetErrors(vol); - } - up_write(&vol->mftbmp_lock); -err_out: - return ERR_PTR(err); -max_err_out: - ntfs_warning(vol->sb, "Cannot allocate mft record because the maximum " - "number of inodes (2^32) has already been reached."); - up_write(&vol->mftbmp_lock); - return ERR_PTR(-ENOSPC); -} - -/** - * ntfs_extent_mft_record_free - free an extent mft record on an ntfs volume - * @ni: ntfs inode of the mapped extent mft record to free - * @m: mapped extent mft record of the ntfs inode @ni - * - * Free the mapped extent mft record @m of the extent ntfs inode @ni. - * - * Note that this function unmaps the mft record and closes and destroys @ni - * internally and hence you cannot use either @ni or @m any more after this - * function returns success. - * - * On success return 0 and on error return -errno. @ni and @m are still valid - * in this case and have not been freed. - * - * For some errors an error message is displayed and the success code 0 is - * returned and the volume is then left dirty on umount. This makes sense in - * case we could not roll back the changes that were already done since the - * caller no longer wants to reference this mft record so it does not matter to - * the caller if something is wrong with it as long as it is properly detached - * from the base inode.
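One detail of the freeing path that follows is the sequence number update it performs on the record being released: increment, but skip zero (which means "not tracked") and wrap 0xffff back to 1. A standalone sketch of just that rule, assuming host-endian values for brevity:

#include <stdint.h>
#include <stdio.h>

/* Increment a sequence number, skipping zero, if it is not zero. */
static uint16_t bump_seq_no(uint16_t seq_no)
{
	if (seq_no == 0xffff)
		return 1;
	if (seq_no)
		return seq_no + 1;
	return 0;	/* zero is left alone */
}

int main(void)
{
	printf("%u %u %u\n", bump_seq_no(1), bump_seq_no(0xffff),
			bump_seq_no(0));	/* prints: 2 1 0 */
	return 0;
}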
- */ -int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m) -{ - unsigned long mft_no = ni->mft_no; - ntfs_volume *vol = ni->vol; - ntfs_inode *base_ni; - ntfs_inode **extent_nis; - int i, err; - le16 old_seq_no; - u16 seq_no; - - BUG_ON(NInoAttr(ni)); - BUG_ON(ni->nr_extents != -1); - - mutex_lock(&ni->extent_lock); - base_ni = ni->ext.base_ntfs_ino; - mutex_unlock(&ni->extent_lock); - - BUG_ON(base_ni->nr_extents <= 0); - - ntfs_debug("Entering for extent inode 0x%lx, base inode 0x%lx.\n", - mft_no, base_ni->mft_no); - - mutex_lock(&base_ni->extent_lock); - - /* Make sure we are holding the only reference to the extent inode. */ - if (atomic_read(&ni->count) > 2) { - ntfs_error(vol->sb, "Tried to free busy extent inode 0x%lx, " - "not freeing.", base_ni->mft_no); - mutex_unlock(&base_ni->extent_lock); - return -EBUSY; - } - - /* Dissociate the ntfs inode from the base inode. */ - extent_nis = base_ni->ext.extent_ntfs_inos; - err = -ENOENT; - for (i = 0; i < base_ni->nr_extents; i++) { - if (ni != extent_nis[i]) - continue; - extent_nis += i; - base_ni->nr_extents--; - memmove(extent_nis, extent_nis + 1, (base_ni->nr_extents - i) * - sizeof(ntfs_inode*)); - err = 0; - break; - } - - mutex_unlock(&base_ni->extent_lock); - - if (unlikely(err)) { - ntfs_error(vol->sb, "Extent inode 0x%lx is not attached to " - "its base inode 0x%lx.", mft_no, - base_ni->mft_no); - BUG(); - } - - /* - * The extent inode is no longer attached to the base inode so no one - * can get a reference to it any more. - */ - - /* Mark the mft record as not in use. */ - m->flags &= ~MFT_RECORD_IN_USE; - - /* Increment the sequence number, skipping zero, if it is not zero. */ - old_seq_no = m->sequence_number; - seq_no = le16_to_cpu(old_seq_no); - if (seq_no == 0xffff) - seq_no = 1; - else if (seq_no) - seq_no++; - m->sequence_number = cpu_to_le16(seq_no); - - /* - * Set the ntfs inode dirty and write it out. We do not need to worry - * about the base inode here since whatever caused the extent mft - * record to be freed is guaranteed to do it already. - */ - NInoSetDirty(ni); - err = write_mft_record(ni, m, 0); - if (unlikely(err)) { - ntfs_error(vol->sb, "Failed to write mft record 0x%lx, not " - "freeing.", mft_no); - goto rollback; - } -rollback_error: - /* Unmap and throw away the now freed extent inode. */ - unmap_extent_mft_record(ni); - ntfs_clear_extent_inode(ni); - - /* Clear the bit in the $MFT/$BITMAP corresponding to this record. */ - down_write(&vol->mftbmp_lock); - err = ntfs_bitmap_clear_bit(vol->mftbmp_ino, mft_no); - up_write(&vol->mftbmp_lock); - if (unlikely(err)) { - /* - * The extent inode is gone but we failed to deallocate it in - * the mft bitmap. Just emit a warning and leave the volume - * dirty on umount. - */ - ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es); - NVolSetErrors(vol); - } - return 0; -rollback: - /* Rollback what we did... 
*/ - mutex_lock(&base_ni->extent_lock); - extent_nis = base_ni->ext.extent_ntfs_inos; - if (!(base_ni->nr_extents & 3)) { - int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode*); - - extent_nis = kmalloc(new_size, GFP_NOFS); - if (unlikely(!extent_nis)) { - ntfs_error(vol->sb, "Failed to allocate internal " - "buffer during rollback.%s", es); - mutex_unlock(&base_ni->extent_lock); - NVolSetErrors(vol); - goto rollback_error; - } - if (base_ni->nr_extents) { - BUG_ON(!base_ni->ext.extent_ntfs_inos); - memcpy(extent_nis, base_ni->ext.extent_ntfs_inos, - new_size - 4 * sizeof(ntfs_inode*)); - kfree(base_ni->ext.extent_ntfs_inos); - } - base_ni->ext.extent_ntfs_inos = extent_nis; - } - m->flags |= MFT_RECORD_IN_USE; - m->sequence_number = old_seq_no; - extent_nis[base_ni->nr_extents++] = ni; - mutex_unlock(&base_ni->extent_lock); - mark_mft_record_dirty(ni); - return err; -} -#endif /* NTFS_RW */ diff --git a/fs/ntfs/mft.h b/fs/ntfs/mft.h deleted file mode 100644 index 49c001af16ed..000000000000 --- a/fs/ntfs/mft.h +++ /dev/null @@ -1,110 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * mft.h - Defines for mft record handling in NTFS Linux kernel driver. - * Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2004 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_MFT_H -#define _LINUX_NTFS_MFT_H - -#include <linux/fs.h> -#include <linux/highmem.h> -#include <linux/pagemap.h> - -#include "inode.h" - -extern MFT_RECORD *map_mft_record(ntfs_inode *ni); -extern void unmap_mft_record(ntfs_inode *ni); - -extern MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref, - ntfs_inode **ntfs_ino); - -static inline void unmap_extent_mft_record(ntfs_inode *ni) -{ - unmap_mft_record(ni); - return; -} - -#ifdef NTFS_RW - -/** - * flush_dcache_mft_record_page - flush_dcache_page() for mft records - * @ni: ntfs inode structure of mft record - * - * Call flush_dcache_page() for the page in which an mft record resides. - * - * This must be called every time an mft record is modified, just after the - * modification. - */ -static inline void flush_dcache_mft_record_page(ntfs_inode *ni) -{ - flush_dcache_page(ni->page); -} - -extern void __mark_mft_record_dirty(ntfs_inode *ni); - -/** - * mark_mft_record_dirty - set the mft record and the page containing it dirty - * @ni: ntfs inode describing the mapped mft record - * - * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni, - * as well as the page containing the mft record, dirty. Also, mark the base - * vfs inode dirty. This ensures that any changes to the mft record are - * written out to disk. - * - * NOTE: Do not do anything if the mft record is already marked dirty. - */ -static inline void mark_mft_record_dirty(ntfs_inode *ni) -{ - if (!NInoTestSetDirty(ni)) - __mark_mft_record_dirty(ni); -} - -extern int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, - MFT_RECORD *m, int sync); - -extern int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync); - -/** - * write_mft_record - write out a mapped (extent) mft record - * @ni: ntfs inode describing the mapped (extent) mft record - * @m: mapped (extent) mft record to write - * @sync: if true, wait for i/o completion - * - * This is just a wrapper for write_mft_record_nolock() (see mft.c), which - * locks the page for the duration of the write. 
This ensures that there are - * no race conditions between writing the mft record via the dirty inode code - * paths and via the page cache write back code paths or between writing - * neighbouring mft records residing in the same page. - * - * Locking the page also serializes us against ->read_folio() if the page is not - * uptodate. - * - * On success, clean the mft record and return 0. On error, leave the mft - * record dirty and return -errno. - */ -static inline int write_mft_record(ntfs_inode *ni, MFT_RECORD *m, int sync) -{ - struct page *page = ni->page; - int err; - - BUG_ON(!page); - lock_page(page); - err = write_mft_record_nolock(ni, m, sync); - unlock_page(page); - return err; -} - -extern bool ntfs_may_write_mft_record(ntfs_volume *vol, - const unsigned long mft_no, const MFT_RECORD *m, - ntfs_inode **locked_ni); - -extern ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode, - ntfs_inode *base_ni, MFT_RECORD **mrec); -extern int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m); - -#endif /* NTFS_RW */ - -#endif /* _LINUX_NTFS_MFT_H */ diff --git a/fs/ntfs/mst.c b/fs/ntfs/mst.c deleted file mode 100644 index 16b3c884abfc..000000000000 --- a/fs/ntfs/mst.c +++ /dev/null @@ -1,189 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * mst.c - NTFS multi sector transfer protection handling code. Part of the - * Linux-NTFS project. - * - * Copyright (c) 2001-2004 Anton Altaparmakov - */ - -#include "ntfs.h" - -/** - * post_read_mst_fixup - deprotect multi sector transfer protected data - * @b: pointer to the data to deprotect - * @size: size in bytes of @b - * - * Perform the necessary post read multi sector transfer fixup and detect the - * presence of incomplete multi sector transfers. - In that case, overwrite the - * magic of the ntfs record header being processed with "BAAD" (in memory only!) - * and abort processing. - * - * Return 0 on success and -EINVAL on error ("BAAD" magic will be present). - * - * NOTE: We consider the absence / invalidity of an update sequence array to - * mean that the structure is not protected at all and hence doesn't need to - * be fixed up. Thus, we return success and not failure in this case. This is - * in contrast to pre_write_mst_fixup(), see below. - */ -int post_read_mst_fixup(NTFS_RECORD *b, const u32 size) -{ - u16 usa_ofs, usa_count, usn; - u16 *usa_pos, *data_pos; - - /* Setup the variables. */ - usa_ofs = le16_to_cpu(b->usa_ofs); - /* Decrement usa_count to get number of fixups. */ - usa_count = le16_to_cpu(b->usa_count) - 1; - /* Size and alignment checks. */ - if ( size & (NTFS_BLOCK_SIZE - 1) || - usa_ofs & 1 || - usa_ofs + (usa_count * 2) > size || - (size >> NTFS_BLOCK_SIZE_BITS) != usa_count) - return 0; - /* Position of usn in update sequence array. */ - usa_pos = (u16*)b + usa_ofs/sizeof(u16); - /* - * The update sequence number which has to be equal to each of the - * u16 values before they are fixed up. Note no need to care for - * endianness since we are comparing and moving data for on disk - * structures which means the data is consistent. - If it is - * consistenty the wrong endianness it doesn't make any difference. - */ - usn = *usa_pos; - /* - * Position in protected data of first u16 that needs fixing up. - */ - data_pos = (u16*)b + NTFS_BLOCK_SIZE/sizeof(u16) - 1; - /* - * Check for incomplete multi sector transfer(s). - */ - while (usa_count--) { - if (*data_pos != usn) { - /* - * Incomplete multi sector transfer detected! )-: - * Set the magic to "BAAD" and return failure. 
- * Note that magic_BAAD is already converted to le32. - */ - b->magic = magic_BAAD; - return -EINVAL; - } - data_pos += NTFS_BLOCK_SIZE/sizeof(u16); - } - /* Re-setup the variables. */ - usa_count = le16_to_cpu(b->usa_count) - 1; - data_pos = (u16*)b + NTFS_BLOCK_SIZE/sizeof(u16) - 1; - /* Fixup all sectors. */ - while (usa_count--) { - /* - * Increment position in usa and restore original data from - * the usa into the data buffer. - */ - *data_pos = *(++usa_pos); - /* Increment position in data as well. */ - data_pos += NTFS_BLOCK_SIZE/sizeof(u16); - } - return 0; -} - -/** - * pre_write_mst_fixup - apply multi sector transfer protection - * @b: pointer to the data to protect - * @size: size in bytes of @b - * - * Perform the necessary pre write multi sector transfer fixup on the data - * pointed to by @b of size @size. - * - * Return 0 if fixup applied (success) or -EINVAL if no fixup was performed - * (assumed not needed). This is in contrast to post_read_mst_fixup() above. - * - * NOTE: We consider the absence / invalidity of an update sequence array to - * mean that the structure is not subject to protection and hence doesn't need - * to be fixed up. This means that you have to create a valid update sequence - * array header in the ntfs record before calling this function, otherwise it - * will fail (the header needs to contain the position of the update sequence - * array together with the number of elements in the array). You also need to - * initialise the update sequence number before calling this function - * otherwise a random word will be used (whatever was in the record at that - * position at that time). - */ -int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size) -{ - le16 *usa_pos, *data_pos; - u16 usa_ofs, usa_count, usn; - le16 le_usn; - - /* Sanity check + only fixup if it makes sense. */ - if (!b || ntfs_is_baad_record(b->magic) || - ntfs_is_hole_record(b->magic)) - return -EINVAL; - /* Setup the variables. */ - usa_ofs = le16_to_cpu(b->usa_ofs); - /* Decrement usa_count to get number of fixups. */ - usa_count = le16_to_cpu(b->usa_count) - 1; - /* Size and alignment checks. */ - if ( size & (NTFS_BLOCK_SIZE - 1) || - usa_ofs & 1 || - usa_ofs + (usa_count * 2) > size || - (size >> NTFS_BLOCK_SIZE_BITS) != usa_count) - return -EINVAL; - /* Position of usn in update sequence array. */ - usa_pos = (le16*)((u8*)b + usa_ofs); - /* - * Cyclically increment the update sequence number - * (skipping 0 and -1, i.e. 0xffff). - */ - usn = le16_to_cpup(usa_pos) + 1; - if (usn == 0xffff || !usn) - usn = 1; - le_usn = cpu_to_le16(usn); - *usa_pos = le_usn; - /* Position in data of first u16 that needs fixing up. */ - data_pos = (le16*)b + NTFS_BLOCK_SIZE/sizeof(le16) - 1; - /* Fixup all sectors. */ - while (usa_count--) { - /* - * Increment the position in the usa and save the - * original data from the data buffer into the usa. - */ - *(++usa_pos) = *data_pos; - /* Apply fixup to data. */ - *data_pos = le_usn; - /* Increment position in data as well. */ - data_pos += NTFS_BLOCK_SIZE/sizeof(le16); - } - return 0; -} - -/** - * post_write_mst_fixup - fast deprotect multi sector transfer protected data - * @b: pointer to the data to deprotect - * - * Perform the necessary post write multi sector transfer fixup, not checking - * for any errors, because we assume we have just used pre_write_mst_fixup(), - * thus the data will be fine or we would never have gotten here.
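The protection scheme these helpers implement is easy to demonstrate end to end in userspace. A toy round trip, assuming a 1024-byte record, 512-byte blocks, and an update sequence array at a made-up offset 8 (a real record stores usa_ofs and usa_count in its header):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define BLK 512

int main(void)
{
	uint8_t rec[1024];
	uint16_t *usa = (uint16_t *)(rec + 8);	/* usn + one slot per block */
	unsigned nblk = sizeof(rec) / BLK, i;

	memset(rec, 0xab, sizeof(rec));
	uint16_t usn = 1;

	/* pre-write: stash the last word of each block, stamp in the usn */
	usa[0] = usn;
	for (i = 0; i < nblk; i++) {
		uint16_t *last = (uint16_t *)(rec + (i + 1) * BLK) - 1;
		usa[1 + i] = *last;
		*last = usn;
	}
	/* post-read/post-write: restore the stashed words from the usa */
	for (i = 0; i < nblk; i++) {
		uint16_t *last = (uint16_t *)(rec + (i + 1) * BLK) - 1;
		*last = usa[1 + i];
	}
	printf("round trip ok: %d\n",
			*(uint16_t *)(rec + BLK - 2) == 0xabab);	/* 1 */
	return 0;
}

A torn write would leave one block's trailing word holding a stale usn, which is exactly what the post-read check above detects before restoring anything.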
- */ -void post_write_mst_fixup(NTFS_RECORD *b) -{ - le16 *usa_pos, *data_pos; - - u16 usa_ofs = le16_to_cpu(b->usa_ofs); - u16 usa_count = le16_to_cpu(b->usa_count) - 1; - - /* Position of usn in update sequence array. */ - usa_pos = (le16*)b + usa_ofs/sizeof(le16); - - /* Position in protected data of first u16 that needs fixing up. */ - data_pos = (le16*)b + NTFS_BLOCK_SIZE/sizeof(le16) - 1; - - /* Fixup all sectors. */ - while (usa_count--) { - /* - * Increment position in usa and restore original data from - * the usa into the data buffer. - */ - *data_pos = *(++usa_pos); - - /* Increment position in data as well. */ - data_pos += NTFS_BLOCK_SIZE/sizeof(le16); - } -} diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c deleted file mode 100644 index d7498ddc4a72..000000000000 --- a/fs/ntfs/namei.c +++ /dev/null @@ -1,392 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * namei.c - NTFS kernel directory inode operations. Part of the Linux-NTFS - * project. - * - * Copyright (c) 2001-2006 Anton Altaparmakov - */ - -#include <linux/dcache.h> -#include <linux/exportfs.h> -#include <linux/security.h> -#include <linux/slab.h> - -#include "attrib.h" -#include "debug.h" -#include "dir.h" -#include "mft.h" -#include "ntfs.h" - -/** - * ntfs_lookup - find the inode represented by a dentry in a directory inode - * @dir_ino: directory inode in which to look for the inode - * @dent: dentry representing the inode to look for - * @flags: lookup flags - * - * In short, ntfs_lookup() looks for the inode represented by the dentry @dent - * in the directory inode @dir_ino and if found attaches the inode to the - * dentry @dent. - * - * In more detail, the dentry @dent specifies which inode to look for by - * supplying the name of the inode in @dent->d_name.name. ntfs_lookup() - * converts the name to Unicode and walks the contents of the directory inode - * @dir_ino looking for the converted Unicode name. If the name is found in the - * directory, the corresponding inode is loaded by calling ntfs_iget() on its - * inode number and the inode is associated with the dentry @dent via a call to - * d_splice_alias(). - * - * If the name is not found in the directory, a NULL inode is inserted into the - * dentry @dent via a call to d_add(). The dentry is then termed a negative - * dentry. - * - * Only if an actual error occurs, do we return an error via ERR_PTR(). - * - * In order to handle the case insensitivity issues of NTFS with regards to the - * dcache and the dcache requiring only one dentry per directory, we deal with - * dentry aliases that only differ in case in ->ntfs_lookup() while maintaining - * a case sensitive dcache. This means that we get the full benefit of dcache - * speed when the file/directory is looked up with the same case as returned by - * ->ntfs_readdir() but that a lookup for any other case (or for the short file - * name) will not find anything in dcache and will enter ->ntfs_lookup() - * instead, where we search the directory for a fully matching file name - * (including case) and if that is not found, we search for a file name that - * matches with different case and if that has non-POSIX semantics we return - * that. We actually do only one search (case sensitive) and keep tabs on - * whether we have found a case insensitive match in the process. - * - * To simplify matters for us, we do not treat the short vs long filenames as - * two hard links but instead if the lookup matches a short filename, we - * return the dentry for the corresponding long filename instead. 
- * - * There are three cases we need to distinguish here: - * - * 1) @dent perfectly matches (i.e. including case) a directory entry with a - * file name in the WIN32 or POSIX namespaces. In this case - * ntfs_lookup_inode_by_name() will return with name set to NULL and we - * just d_splice_alias() @dent. - * 2) @dent matches (not including case) a directory entry with a file name in - * the WIN32 namespace. In this case ntfs_lookup_inode_by_name() will return - * with name set to point to a kmalloc()ed ntfs_name structure containing - * the properly cased little endian Unicode name. We convert the name to the - * current NLS code page, search if a dentry with this name already exists - * and if so return that instead of @dent. At this point things are - * complicated by the possibility of 'disconnected' dentries due to NFS - * which we deal with appropriately (see the code comments). The VFS will - * then destroy the old @dent and use the one we returned. If a dentry is - * not found, we allocate a new one, d_splice_alias() it, and return it as - * above. - * 3) @dent matches either perfectly or not (i.e. we don't care about case) a - * directory entry with a file name in the DOS namespace. In this case - * ntfs_lookup_inode_by_name() will return with name set to point to a - * kmalloc()ed ntfs_name structure containing the mft reference (cpu endian) - * of the inode. We use the mft reference to read the inode and to find the - * file name in the WIN32 namespace corresponding to the matched short file - * name. We then convert the name to the current NLS code page, and proceed - * searching for a dentry with this name, etc, as in case 2), above. - * - * Locking: Caller must hold i_mutex on the directory. - */ -static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent, - unsigned int flags) -{ - ntfs_volume *vol = NTFS_SB(dir_ino->i_sb); - struct inode *dent_inode; - ntfschar *uname; - ntfs_name *name = NULL; - MFT_REF mref; - unsigned long dent_ino; - int uname_len; - - ntfs_debug("Looking up %pd in directory inode 0x%lx.", - dent, dir_ino->i_ino); - /* Convert the name of the dentry to Unicode. */ - uname_len = ntfs_nlstoucs(vol, dent->d_name.name, dent->d_name.len, - &uname); - if (uname_len < 0) { - if (uname_len != -ENAMETOOLONG) - ntfs_error(vol->sb, "Failed to convert name to " - "Unicode."); - return ERR_PTR(uname_len); - } - mref = ntfs_lookup_inode_by_name(NTFS_I(dir_ino), uname, uname_len, - &name); - kmem_cache_free(ntfs_name_cache, uname); - if (!IS_ERR_MREF(mref)) { - dent_ino = MREF(mref); - ntfs_debug("Found inode 0x%lx. Calling ntfs_iget.", dent_ino); - dent_inode = ntfs_iget(vol->sb, dent_ino); - if (!IS_ERR(dent_inode)) { - /* Consistency check. */ - if (is_bad_inode(dent_inode) || MSEQNO(mref) == - NTFS_I(dent_inode)->seq_no || - dent_ino == FILE_MFT) { - /* Perfect WIN32/POSIX match. -- Case 1. */ - if (!name) { - ntfs_debug("Done. (Case 1.)"); - return d_splice_alias(dent_inode, dent); - } - /* - * We are too indented. Handle imperfect - * matches and short file names further below. - */ - goto handle_name; - } - ntfs_error(vol->sb, "Found stale reference to inode " - "0x%lx (reference sequence number = " - "0x%x, inode sequence number = 0x%x), " - "returning -EIO. Run chkdsk.", - dent_ino, MSEQNO(mref), - NTFS_I(dent_inode)->seq_no); - iput(dent_inode); - dent_inode = ERR_PTR(-EIO); - } else - ntfs_error(vol->sb, "ntfs_iget(0x%lx) failed with " - "error code %li.", dent_ino, - PTR_ERR(dent_inode)); - kfree(name); - /* Return the error code. 
*/ - return ERR_CAST(dent_inode); - } - /* It is guaranteed that @name is no longer allocated at this point. */ - if (MREF_ERR(mref) == -ENOENT) { - ntfs_debug("Entry was not found, adding negative dentry."); - /* The dcache will handle negative entries. */ - d_add(dent, NULL); - ntfs_debug("Done."); - return NULL; - } - ntfs_error(vol->sb, "ntfs_lookup_ino_by_name() failed with error " - "code %i.", -MREF_ERR(mref)); - return ERR_PTR(MREF_ERR(mref)); - // TODO: Consider moving this lot to a separate function! (AIA) -handle_name: - { - MFT_RECORD *m; - ntfs_attr_search_ctx *ctx; - ntfs_inode *ni = NTFS_I(dent_inode); - int err; - struct qstr nls_name; - - nls_name.name = NULL; - if (name->type != FILE_NAME_DOS) { /* Case 2. */ - ntfs_debug("Case 2."); - nls_name.len = (unsigned)ntfs_ucstonls(vol, - (ntfschar*)&name->name, name->len, - (unsigned char**)&nls_name.name, 0); - kfree(name); - } else /* if (name->type == FILE_NAME_DOS) */ { /* Case 3. */ - FILE_NAME_ATTR *fn; - - ntfs_debug("Case 3."); - kfree(name); - - /* Find the WIN32 name corresponding to the matched DOS name. */ - ni = NTFS_I(dent_inode); - m = map_mft_record(ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - ctx = NULL; - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - do { - ATTR_RECORD *a; - u32 val_len; - - err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, 0, 0, - NULL, 0, ctx); - if (unlikely(err)) { - ntfs_error(vol->sb, "Inode corrupt: No WIN32 " - "namespace counterpart to DOS " - "file name. Run chkdsk."); - if (err == -ENOENT) - err = -EIO; - goto err_out; - } - /* Consistency checks. */ - a = ctx->attr; - if (a->non_resident || a->flags) - goto eio_err_out; - val_len = le32_to_cpu(a->data.resident.value_length); - if (le16_to_cpu(a->data.resident.value_offset) + - val_len > le32_to_cpu(a->length)) - goto eio_err_out; - fn = (FILE_NAME_ATTR*)((u8*)ctx->attr + le16_to_cpu( - ctx->attr->data.resident.value_offset)); - if ((u32)(fn->file_name_length * sizeof(ntfschar) + - sizeof(FILE_NAME_ATTR)) > val_len) - goto eio_err_out; - } while (fn->file_name_type != FILE_NAME_WIN32); - - /* Convert the found WIN32 name to current NLS code page. */ - nls_name.len = (unsigned)ntfs_ucstonls(vol, - (ntfschar*)&fn->file_name, fn->file_name_length, - (unsigned char**)&nls_name.name, 0); - - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(ni); - } - m = NULL; - ctx = NULL; - - /* Check if a conversion error occurred. */ - if ((signed)nls_name.len < 0) { - err = (signed)nls_name.len; - goto err_out; - } - nls_name.hash = full_name_hash(dent, nls_name.name, nls_name.len); - - dent = d_add_ci(dent, dent_inode, &nls_name); - kfree(nls_name.name); - return dent; - -eio_err_out: - ntfs_error(vol->sb, "Illegal file name attribute. Run chkdsk."); - err = -EIO; -err_out: - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(ni); - iput(dent_inode); - ntfs_error(vol->sb, "Failed, returning error code %i.", err); - return ERR_PTR(err); - } -} - -/* - * Inode operations for directories. - */ -const struct inode_operations ntfs_dir_inode_ops = { - .lookup = ntfs_lookup, /* VFS: Lookup directory. */ -}; - -/** - * ntfs_get_parent - find the dentry of the parent of a given directory dentry - * @child_dent: dentry of the directory whose parent directory to find - * - * Find the dentry for the parent directory of the directory specified by the - * dentry @child_dent. 
This function is called from - * fs/exportfs/expfs.c::find_exported_dentry() which in turn is called from the - * default ->decode_fh() which is export_decode_fh() in the same file. - * - * The code is based on the ext3 ->get_parent() implementation found in - * fs/ext3/namei.c::ext3_get_parent(). - * - * Note: ntfs_get_parent() is called with @d_inode(child_dent)->i_mutex down. - * - * Return the dentry of the parent directory on success or the error code on - * error (IS_ERR() is true). - */ -static struct dentry *ntfs_get_parent(struct dentry *child_dent) -{ - struct inode *vi = d_inode(child_dent); - ntfs_inode *ni = NTFS_I(vi); - MFT_RECORD *mrec; - ntfs_attr_search_ctx *ctx; - ATTR_RECORD *attr; - FILE_NAME_ATTR *fn; - unsigned long parent_ino; - int err; - - ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); - /* Get the mft record of the inode belonging to the child dentry. */ - mrec = map_mft_record(ni); - if (IS_ERR(mrec)) - return ERR_CAST(mrec); - /* Find the first file name attribute in the mft record. */ - ctx = ntfs_attr_get_search_ctx(ni, mrec); - if (unlikely(!ctx)) { - unmap_mft_record(ni); - return ERR_PTR(-ENOMEM); - } -try_next: - err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, CASE_SENSITIVE, 0, NULL, - 0, ctx); - if (unlikely(err)) { - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(ni); - if (err == -ENOENT) - ntfs_error(vi->i_sb, "Inode 0x%lx does not have a " - "file name attribute. Run chkdsk.", - vi->i_ino); - return ERR_PTR(err); - } - attr = ctx->attr; - if (unlikely(attr->non_resident)) - goto try_next; - fn = (FILE_NAME_ATTR *)((u8 *)attr + - le16_to_cpu(attr->data.resident.value_offset)); - if (unlikely((u8 *)fn + le32_to_cpu(attr->data.resident.value_length) > - (u8*)attr + le32_to_cpu(attr->length))) - goto try_next; - /* Get the inode number of the parent directory. */ - parent_ino = MREF_LE(fn->parent_directory); - /* Release the search context and the mft record of the child. */ - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(ni); - - return d_obtain_alias(ntfs_iget(vi->i_sb, parent_ino)); -} - -static struct inode *ntfs_nfs_get_inode(struct super_block *sb, - u64 ino, u32 generation) -{ - struct inode *inode; - - inode = ntfs_iget(sb, ino); - if (!IS_ERR(inode)) { - if (is_bad_inode(inode) || inode->i_generation != generation) { - iput(inode); - inode = ERR_PTR(-ESTALE); - } - } - - return inode; -} - -static struct dentry *ntfs_fh_to_dentry(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) -{ - return generic_fh_to_dentry(sb, fid, fh_len, fh_type, - ntfs_nfs_get_inode); -} - -static struct dentry *ntfs_fh_to_parent(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) -{ - return generic_fh_to_parent(sb, fid, fh_len, fh_type, - ntfs_nfs_get_inode); -} - -/* - * Export operations allowing NFS exporting of mounted NTFS partitions. - * - * We use the default ->encode_fh() for now. Note that they - * use 32 bits to store the inode number which is an unsigned long so on 64-bit - * architectures is usually 64 bits so it would all fail horribly on huge - * volumes. I guess we need to define our own encode and decode fh functions - * that store 64-bit inode numbers at some point but for now we will ignore the - * problem... - * - * We also use the default ->get_name() helper (used by ->decode_fh() via - * fs/exportfs/expfs.c::find_exported_dentry()) as that is completely fs - * independent. 
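
The generation check performed by ntfs_nfs_get_inode() above is the standard defence against reused inode numbers in NFS file handles. A minimal userspace sketch of the idea (the fh32 struct is illustrative, loosely modelled on the ino-plus-generation pair a FILEID_INO32_GEN handle carries; the error value assumes Linux's ESTALE of 116):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical 32-bit-ino file handle: inode number plus generation. */
struct fh32 {
	uint32_t ino;
	uint32_t gen;
};

/* Accept the handle only while the inode's generation still matches;
 * otherwise the inode number was recycled and the handle is stale. */
static int fh_to_ino(const struct fh32 *fh, uint32_t current_gen,
		     uint32_t *ino)
{
	if (fh->gen != current_gen)
		return -116;	/* -ESTALE on Linux */
	*ino = fh->ino;
	return 0;
}

int main(void)
{
	struct fh32 fh = { .ino = 1234, .gen = 7 };
	uint32_t ino;

	printf("same generation: %d\n", fh_to_ino(&fh, 7, &ino));	/* 0 */
	printf("inode recycled:  %d\n", fh_to_ino(&fh, 8, &ino));	/* -116 */
	return 0;
}

The 32-bit ino field is also where the truncation problem described above bites: an inode number that does not fit in 32 bits cannot round-trip through such a handle.
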
- * - * The default ->get_parent() just returns -EACCES so we have to provide our - * own and the default ->get_dentry() is incompatible with NTFS due to not - * allowing the inode number 0 which is used in NTFS for the system file $MFT - * and due to using iget() whereas NTFS needs ntfs_iget(). - */ -const struct export_operations ntfs_export_ops = { - .encode_fh = generic_encode_ino32_fh, - .get_parent = ntfs_get_parent, /* Find the parent of a given - directory. */ - .fh_to_dentry = ntfs_fh_to_dentry, - .fh_to_parent = ntfs_fh_to_parent, -}; diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h deleted file mode 100644 index e81376ea9152..000000000000 --- a/fs/ntfs/ntfs.h +++ /dev/null @@ -1,150 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * ntfs.h - Defines for NTFS Linux kernel driver. - * - * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc. - * Copyright (C) 2002 Richard Russon - */ - -#ifndef _LINUX_NTFS_H -#define _LINUX_NTFS_H - -#include <linux/stddef.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/compiler.h> -#include <linux/fs.h> -#include <linux/nls.h> -#include <linux/smp.h> -#include <linux/pagemap.h> - -#include "types.h" -#include "volume.h" -#include "layout.h" - -typedef enum { - NTFS_BLOCK_SIZE = 512, - NTFS_BLOCK_SIZE_BITS = 9, - NTFS_SB_MAGIC = 0x5346544e, /* 'NTFS' */ - NTFS_MAX_NAME_LEN = 255, - NTFS_MAX_ATTR_NAME_LEN = 255, - NTFS_MAX_CLUSTER_SIZE = 64 * 1024, /* 64kiB */ - NTFS_MAX_PAGES_PER_CLUSTER = NTFS_MAX_CLUSTER_SIZE / PAGE_SIZE, -} NTFS_CONSTANTS; - -/* Global variables. */ - -/* Slab caches (from super.c). */ -extern struct kmem_cache *ntfs_name_cache; -extern struct kmem_cache *ntfs_inode_cache; -extern struct kmem_cache *ntfs_big_inode_cache; -extern struct kmem_cache *ntfs_attr_ctx_cache; -extern struct kmem_cache *ntfs_index_ctx_cache; - -/* The various operations structs defined throughout the driver files. */ -extern const struct address_space_operations ntfs_normal_aops; -extern const struct address_space_operations ntfs_compressed_aops; -extern const struct address_space_operations ntfs_mst_aops; - -extern const struct file_operations ntfs_file_ops; -extern const struct inode_operations ntfs_file_inode_ops; - -extern const struct file_operations ntfs_dir_ops; -extern const struct inode_operations ntfs_dir_inode_ops; - -extern const struct file_operations ntfs_empty_file_ops; -extern const struct inode_operations ntfs_empty_inode_ops; - -extern const struct export_operations ntfs_export_ops; - -/** - * NTFS_SB - return the ntfs volume given a vfs super block - * @sb: VFS super block - * - * NTFS_SB() returns the ntfs volume associated with the VFS super block @sb. - */ -static inline ntfs_volume *NTFS_SB(struct super_block *sb) -{ - return sb->s_fs_info; -} - -/* Declarations of functions and global variables. 
*/ - -/* From fs/ntfs/compress.c */ -extern int ntfs_read_compressed_block(struct page *page); -extern int allocate_compression_buffers(void); -extern void free_compression_buffers(void); - -/* From fs/ntfs/super.c */ -#define default_upcase_len 0x10000 -extern struct mutex ntfs_lock; - -typedef struct { - int val; - char *str; -} option_t; -extern const option_t on_errors_arr[]; - -/* From fs/ntfs/mst.c */ -extern int post_read_mst_fixup(NTFS_RECORD *b, const u32 size); -extern int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size); -extern void post_write_mst_fixup(NTFS_RECORD *b); - -/* From fs/ntfs/unistr.c */ -extern bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len, - const ntfschar *s2, size_t s2_len, - const IGNORE_CASE_BOOL ic, - const ntfschar *upcase, const u32 upcase_size); -extern int ntfs_collate_names(const ntfschar *name1, const u32 name1_len, - const ntfschar *name2, const u32 name2_len, - const int err_val, const IGNORE_CASE_BOOL ic, - const ntfschar *upcase, const u32 upcase_len); -extern int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n); -extern int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n, - const ntfschar *upcase, const u32 upcase_size); -extern void ntfs_upcase_name(ntfschar *name, u32 name_len, - const ntfschar *upcase, const u32 upcase_len); -extern void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr, - const ntfschar *upcase, const u32 upcase_len); -extern int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1, - FILE_NAME_ATTR *file_name_attr2, - const int err_val, const IGNORE_CASE_BOOL ic, - const ntfschar *upcase, const u32 upcase_len); -extern int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins, - const int ins_len, ntfschar **outs); -extern int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins, - const int ins_len, unsigned char **outs, int outs_len); - -/* From fs/ntfs/upcase.c */ -extern ntfschar *generate_default_upcase(void); - -static inline int ntfs_ffs(int x) -{ - int r = 1; - - if (!x) - return 0; - if (!(x & 0xffff)) { - x >>= 16; - r += 16; - } - if (!(x & 0xff)) { - x >>= 8; - r += 8; - } - if (!(x & 0xf)) { - x >>= 4; - r += 4; - } - if (!(x & 3)) { - x >>= 2; - r += 2; - } - if (!(x & 1)) { - x >>= 1; - r += 1; - } - return r; -} - -#endif /* _LINUX_NTFS_H */ diff --git a/fs/ntfs/quota.c b/fs/ntfs/quota.c deleted file mode 100644 index 9160480222fd..000000000000 --- a/fs/ntfs/quota.c +++ /dev/null @@ -1,103 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * quota.c - NTFS kernel quota ($Quota) handling. Part of the Linux-NTFS - * project. - * - * Copyright (c) 2004 Anton Altaparmakov - */ - -#ifdef NTFS_RW - -#include "index.h" -#include "quota.h" -#include "debug.h" -#include "ntfs.h" - -/** - * ntfs_mark_quotas_out_of_date - mark the quotas out of date on an ntfs volume - * @vol: ntfs volume on which to mark the quotas out of date - * - * Mark the quotas out of date on the ntfs volume @vol and return 'true' on - * success and 'false' on error. 
- */ -bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol) -{ - ntfs_index_context *ictx; - QUOTA_CONTROL_ENTRY *qce; - const le32 qid = QUOTA_DEFAULTS_ID; - int err; - - ntfs_debug("Entering."); - if (NVolQuotaOutOfDate(vol)) - goto done; - if (!vol->quota_ino || !vol->quota_q_ino) { - ntfs_error(vol->sb, "Quota inodes are not open."); - return false; - } - inode_lock(vol->quota_q_ino); - ictx = ntfs_index_ctx_get(NTFS_I(vol->quota_q_ino)); - if (!ictx) { - ntfs_error(vol->sb, "Failed to get index context."); - goto err_out; - } - err = ntfs_index_lookup(&qid, sizeof(qid), ictx); - if (err) { - if (err == -ENOENT) - ntfs_error(vol->sb, "Quota defaults entry is not " - "present."); - else - ntfs_error(vol->sb, "Lookup of quota defaults entry " - "failed."); - goto err_out; - } - if (ictx->data_len < offsetof(QUOTA_CONTROL_ENTRY, sid)) { - ntfs_error(vol->sb, "Quota defaults entry size is invalid. " - "Run chkdsk."); - goto err_out; - } - qce = (QUOTA_CONTROL_ENTRY*)ictx->data; - if (le32_to_cpu(qce->version) != QUOTA_VERSION) { - ntfs_error(vol->sb, "Quota defaults entry version 0x%x is not " - "supported.", le32_to_cpu(qce->version)); - goto err_out; - } - ntfs_debug("Quota defaults flags = 0x%x.", le32_to_cpu(qce->flags)); - /* If quotas are already marked out of date, no need to do anything. */ - if (qce->flags & QUOTA_FLAG_OUT_OF_DATE) - goto set_done; - /* - * If quota tracking is neither requested, nor enabled and there are no - * pending deletes, no need to mark the quotas out of date. - */ - if (!(qce->flags & (QUOTA_FLAG_TRACKING_ENABLED | - QUOTA_FLAG_TRACKING_REQUESTED | - QUOTA_FLAG_PENDING_DELETES))) - goto set_done; - /* - * Set the QUOTA_FLAG_OUT_OF_DATE bit thus marking quotas out of date. - * This is verified on WinXP to be sufficient to cause windows to - * rescan the volume on boot and update all quota entries. - */ - qce->flags |= QUOTA_FLAG_OUT_OF_DATE; - /* Ensure the modified flags are written to disk. */ - ntfs_index_entry_flush_dcache_page(ictx); - ntfs_index_entry_mark_dirty(ictx); -set_done: - ntfs_index_ctx_put(ictx); - inode_unlock(vol->quota_q_ino); - /* - * We set the flag so we do not try to mark the quotas out of date - * again on remount. - */ - NVolSetQuotaOutOfDate(vol); -done: - ntfs_debug("Done."); - return true; -err_out: - if (ictx) - ntfs_index_ctx_put(ictx); - inode_unlock(vol->quota_q_ino); - return false; -} - -#endif /* NTFS_RW */ diff --git a/fs/ntfs/quota.h b/fs/ntfs/quota.h deleted file mode 100644 index fe3132a3d6d2..000000000000 --- a/fs/ntfs/quota.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * quota.h - Defines for NTFS kernel quota ($Quota) handling. Part of the - * Linux-NTFS project. - * - * Copyright (c) 2004 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_QUOTA_H -#define _LINUX_NTFS_QUOTA_H - -#ifdef NTFS_RW - -#include "types.h" -#include "volume.h" - -extern bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol); - -#endif /* NTFS_RW */ - -#endif /* _LINUX_NTFS_QUOTA_H */ diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c deleted file mode 100644 index 0d448e9881f7..000000000000 --- a/fs/ntfs/runlist.c +++ /dev/null @@ -1,1893 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * runlist.c - NTFS runlist handling code. Part of the Linux-NTFS project. 
- * - * Copyright (c) 2001-2007 Anton Altaparmakov - * Copyright (c) 2002-2005 Richard Russon - */ - -#include "debug.h" -#include "dir.h" -#include "endian.h" -#include "malloc.h" -#include "ntfs.h" - -/** - * ntfs_rl_mm - runlist memmove - * - * It is up to the caller to serialize access to the runlist @base. - */ -static inline void ntfs_rl_mm(runlist_element *base, int dst, int src, - int size) -{ - if (likely((dst != src) && (size > 0))) - memmove(base + dst, base + src, size * sizeof(*base)); -} - -/** - * ntfs_rl_mc - runlist memory copy - * - * It is up to the caller to serialize access to the runlists @dstbase and - * @srcbase. - */ -static inline void ntfs_rl_mc(runlist_element *dstbase, int dst, - runlist_element *srcbase, int src, int size) -{ - if (likely(size > 0)) - memcpy(dstbase + dst, srcbase + src, size * sizeof(*dstbase)); -} - -/** - * ntfs_rl_realloc - Reallocate memory for runlists - * @rl: original runlist - * @old_size: number of runlist elements in the original runlist @rl - * @new_size: number of runlist elements we need space for - * - * As the runlists grow, more memory will be required. To prevent the - * kernel having to allocate and reallocate large numbers of small bits of - * memory, this function returns an entire page of memory. - * - * It is up to the caller to serialize access to the runlist @rl. - * - * N.B. If the new allocation doesn't require a different number of pages in - * memory, the function will return the original pointer. - * - * On success, return a pointer to the newly allocated, or recycled, memory. - * On error, return -errno. The following error codes are defined: - * -ENOMEM - Not enough memory to allocate runlist array. - * -EINVAL - Invalid parameters were passed in. - */ -static inline runlist_element *ntfs_rl_realloc(runlist_element *rl, - int old_size, int new_size) -{ - runlist_element *new_rl; - - old_size = PAGE_ALIGN(old_size * sizeof(*rl)); - new_size = PAGE_ALIGN(new_size * sizeof(*rl)); - if (old_size == new_size) - return rl; - - new_rl = ntfs_malloc_nofs(new_size); - if (unlikely(!new_rl)) - return ERR_PTR(-ENOMEM); - - if (likely(rl != NULL)) { - if (unlikely(old_size > new_size)) - old_size = new_size; - memcpy(new_rl, rl, old_size); - ntfs_free(rl); - } - return new_rl; -} - -/** - * ntfs_rl_realloc_nofail - Reallocate memory for runlists - * @rl: original runlist - * @old_size: number of runlist elements in the original runlist @rl - * @new_size: number of runlist elements we need space for - * - * As the runlists grow, more memory will be required. To prevent the - * kernel having to allocate and reallocate large numbers of small bits of - * memory, this function returns an entire page of memory. - * - * This function guarantees that the allocation will succeed. It will sleep - * for as long as it takes to complete the allocation. - * - * It is up to the caller to serialize access to the runlist @rl. - * - * N.B. If the new allocation doesn't require a different number of pages in - * memory, the function will return the original pointer. - * - * On success, return a pointer to the newly allocated, or recycled, memory. - * On error, return -errno. The following error codes are defined: - * -ENOMEM - Not enough memory to allocate runlist array. - * -EINVAL - Invalid parameters were passed in. 
- */ -static inline runlist_element *ntfs_rl_realloc_nofail(runlist_element *rl, - int old_size, int new_size) -{ - runlist_element *new_rl; - - old_size = PAGE_ALIGN(old_size * sizeof(*rl)); - new_size = PAGE_ALIGN(new_size * sizeof(*rl)); - if (old_size == new_size) - return rl; - - new_rl = ntfs_malloc_nofs_nofail(new_size); - BUG_ON(!new_rl); - - if (likely(rl != NULL)) { - if (unlikely(old_size > new_size)) - old_size = new_size; - memcpy(new_rl, rl, old_size); - ntfs_free(rl); - } - return new_rl; -} - -/** - * ntfs_are_rl_mergeable - test if two runlists can be joined together - * @dst: original runlist - * @src: new runlist to test for mergeability with @dst - * - * Test if two runlists can be joined together. For this, their VCNs and LCNs - * must be adjacent. - * - * It is up to the caller to serialize access to the runlists @dst and @src. - * - * Return: true Success, the runlists can be merged. - * false Failure, the runlists cannot be merged. - */ -static inline bool ntfs_are_rl_mergeable(runlist_element *dst, - runlist_element *src) -{ - BUG_ON(!dst); - BUG_ON(!src); - - /* We can merge unmapped regions even if they are misaligned. */ - if ((dst->lcn == LCN_RL_NOT_MAPPED) && (src->lcn == LCN_RL_NOT_MAPPED)) - return true; - /* If the runs are misaligned, we cannot merge them. */ - if ((dst->vcn + dst->length) != src->vcn) - return false; - /* If both runs are non-sparse and contiguous, we can merge them. */ - if ((dst->lcn >= 0) && (src->lcn >= 0) && - ((dst->lcn + dst->length) == src->lcn)) - return true; - /* If we are merging two holes, we can merge them. */ - if ((dst->lcn == LCN_HOLE) && (src->lcn == LCN_HOLE)) - return true; - /* Cannot merge. */ - return false; -} - -/** - * __ntfs_rl_merge - merge two runlists without testing if they can be merged - * @dst: original, destination runlist - * @src: new runlist to merge with @dst - * - * Merge the two runlists, writing into the destination runlist @dst. The - * caller must make sure the runlists can be merged or this will corrupt the - * destination runlist. - * - * It is up to the caller to serialize access to the runlists @dst and @src. - */ -static inline void __ntfs_rl_merge(runlist_element *dst, runlist_element *src) -{ - dst->length += src->length; -} - -/** - * ntfs_rl_append - append a runlist after a given element - * @dst: original runlist to be worked on - * @dsize: number of elements in @dst (including end marker) - * @src: runlist to be inserted into @dst - * @ssize: number of elements in @src (excluding end marker) - * @loc: append the new runlist @src after this element in @dst - * - * Append the runlist @src after element @loc in @dst. Merge the right end of - * the new runlist, if necessary. Adjust the size of the hole before the - * appended runlist. - * - * It is up to the caller to serialize access to the runlists @dst and @src. - * - * On success, return a pointer to the new, combined, runlist. Note, both - * runlists @dst and @src are deallocated before returning so you cannot use - * the pointers for anything any more. (Strictly speaking the returned runlist - * may be the same as @dst but this is irrelevant.) - * - * On error, return -errno. Both runlists are left unmodified. The following - * error codes are defined: - * -ENOMEM - Not enough memory to allocate runlist array. - * -EINVAL - Invalid parameters were passed in. 
- */ -static inline runlist_element *ntfs_rl_append(runlist_element *dst, - int dsize, runlist_element *src, int ssize, int loc) -{ - bool right = false; /* Right end of @src needs merging. */ - int marker; /* End of the inserted runs. */ - - BUG_ON(!dst); - BUG_ON(!src); - - /* First, check if the right hand end needs merging. */ - if ((loc + 1) < dsize) - right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1); - - /* Space required: @dst size + @src size, less one if we merged. */ - dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - right); - if (IS_ERR(dst)) - return dst; - /* - * We are guaranteed to succeed from here so can start modifying the - * original runlists. - */ - - /* First, merge the right hand end, if necessary. */ - if (right) - __ntfs_rl_merge(src + ssize - 1, dst + loc + 1); - - /* First run after the @src runs that have been inserted. */ - marker = loc + ssize + 1; - - /* Move the tail of @dst out of the way, then copy in @src. */ - ntfs_rl_mm(dst, marker, loc + 1 + right, dsize - (loc + 1 + right)); - ntfs_rl_mc(dst, loc + 1, src, 0, ssize); - - /* Adjust the size of the preceding hole. */ - dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn; - - /* We may have changed the length of the file, so fix the end marker */ - if (dst[marker].lcn == LCN_ENOENT) - dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length; - - return dst; -} - -/** - * ntfs_rl_insert - insert a runlist into another - * @dst: original runlist to be worked on - * @dsize: number of elements in @dst (including end marker) - * @src: new runlist to be inserted - * @ssize: number of elements in @src (excluding end marker) - * @loc: insert the new runlist @src before this element in @dst - * - * Insert the runlist @src before element @loc in the runlist @dst. Merge the - * left end of the new runlist, if necessary. Adjust the size of the hole - * after the inserted runlist. - * - * It is up to the caller to serialize access to the runlists @dst and @src. - * - * On success, return a pointer to the new, combined, runlist. Note, both - * runlists @dst and @src are deallocated before returning so you cannot use - * the pointers for anything any more. (Strictly speaking the returned runlist - * may be the same as @dst but this is irrelevant.) - * - * On error, return -errno. Both runlists are left unmodified. The following - * error codes are defined: - * -ENOMEM - Not enough memory to allocate runlist array. - * -EINVAL - Invalid parameters were passed in. - */ -static inline runlist_element *ntfs_rl_insert(runlist_element *dst, - int dsize, runlist_element *src, int ssize, int loc) -{ - bool left = false; /* Left end of @src needs merging. */ - bool disc = false; /* Discontinuity between @dst and @src. */ - int marker; /* End of the inserted runs. */ - - BUG_ON(!dst); - BUG_ON(!src); - - /* - * disc => Discontinuity between the end of @dst and the start of @src. - * This means we might need to insert a "not mapped" run. - */ - if (loc == 0) - disc = (src[0].vcn > 0); - else { - s64 merged_length; - - left = ntfs_are_rl_mergeable(dst + loc - 1, src); - - merged_length = dst[loc - 1].length; - if (left) - merged_length += src->length; - - disc = (src[0].vcn > dst[loc - 1].vcn + merged_length); - } - /* - * Space required: @dst size + @src size, less one if we merged, plus - * one if there was a discontinuity. 
- */ - dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - left + disc); - if (IS_ERR(dst)) - return dst; - /* - * We are guaranteed to succeed from here so can start modifying the - * original runlist. - */ - if (left) - __ntfs_rl_merge(dst + loc - 1, src); - /* - * First run after the @src runs that have been inserted. - * Nominally, @marker equals @loc + @ssize, i.e. location + number of - * runs in @src. However, if @left, then the first run in @src has - * been merged with one in @dst. And if @disc, then @dst and @src do - * not meet and we need an extra run to fill the gap. - */ - marker = loc + ssize - left + disc; - - /* Move the tail of @dst out of the way, then copy in @src. */ - ntfs_rl_mm(dst, marker, loc, dsize - loc); - ntfs_rl_mc(dst, loc + disc, src, left, ssize - left); - - /* Adjust the VCN of the first run after the insertion... */ - dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length; - /* ... and the length. */ - if (dst[marker].lcn == LCN_HOLE || dst[marker].lcn == LCN_RL_NOT_MAPPED) - dst[marker].length = dst[marker + 1].vcn - dst[marker].vcn; - - /* Writing beyond the end of the file and there is a discontinuity. */ - if (disc) { - if (loc > 0) { - dst[loc].vcn = dst[loc - 1].vcn + dst[loc - 1].length; - dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn; - } else { - dst[loc].vcn = 0; - dst[loc].length = dst[loc + 1].vcn; - } - dst[loc].lcn = LCN_RL_NOT_MAPPED; - } - return dst; -} - -/** - * ntfs_rl_replace - overwrite a runlist element with another runlist - * @dst: original runlist to be worked on - * @dsize: number of elements in @dst (including end marker) - * @src: new runlist to be inserted - * @ssize: number of elements in @src (excluding end marker) - * @loc: index in runlist @dst to overwrite with @src - * - * Replace the runlist element @dst at @loc with @src. Merge the left and - * right ends of the inserted runlist, if necessary. - * - * It is up to the caller to serialize access to the runlists @dst and @src. - * - * On success, return a pointer to the new, combined, runlist. Note, both - * runlists @dst and @src are deallocated before returning so you cannot use - * the pointers for anything any more. (Strictly speaking the returned runlist - * may be the same as @dst but this is irrelevant.) - * - * On error, return -errno. Both runlists are left unmodified. The following - * error codes are defined: - * -ENOMEM - Not enough memory to allocate runlist array. - * -EINVAL - Invalid parameters were passed in. - */ -static inline runlist_element *ntfs_rl_replace(runlist_element *dst, - int dsize, runlist_element *src, int ssize, int loc) -{ - signed delta; - bool left = false; /* Left end of @src needs merging. */ - bool right = false; /* Right end of @src needs merging. */ - int tail; /* Start of tail of @dst. */ - int marker; /* End of the inserted runs. */ - - BUG_ON(!dst); - BUG_ON(!src); - - /* First, see if the left and right ends need merging. */ - if ((loc + 1) < dsize) - right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1); - if (loc > 0) - left = ntfs_are_rl_mergeable(dst + loc - 1, src); - /* - * Allocate some space. We will need less if the left, right, or both - * ends get merged. The -1 accounts for the run being replaced. - */ - delta = ssize - 1 - left - right; - if (delta > 0) { - dst = ntfs_rl_realloc(dst, dsize, dsize + delta); - if (IS_ERR(dst)) - return dst; - } - /* - * We are guaranteed to succeed from here so can start modifying the - * original runlists. 
- */
-
-	/* First, merge the left and right ends, if necessary. */
-	if (right)
-		__ntfs_rl_merge(src + ssize - 1, dst + loc + 1);
-	if (left)
-		__ntfs_rl_merge(dst + loc - 1, src);
-	/*
-	 * Offset of the tail of @dst. This needs to be moved out of the way
-	 * to make space for the runs to be copied from @src, i.e. the first
-	 * run of the tail of @dst.
-	 * Nominally, @tail equals @loc + 1, i.e. location, skipping the
-	 * replaced run. However, if @right, then one of @dst's runs is
-	 * already merged into @src.
-	 */
-	tail = loc + right + 1;
-	/*
-	 * First run after the @src runs that have been inserted, i.e. where
-	 * the tail of @dst needs to be moved to.
-	 * Nominally, @marker equals @loc + @ssize, i.e. location + number of
-	 * runs in @src. However, if @left, then the first run in @src has
-	 * been merged with one in @dst.
-	 */
-	marker = loc + ssize - left;
-
-	/* Move the tail of @dst out of the way, then copy in @src. */
-	ntfs_rl_mm(dst, marker, tail, dsize - tail);
-	ntfs_rl_mc(dst, loc, src, left, ssize - left);
-
-	/* We may have changed the length of the file, so fix the end marker. */
-	if (dsize - tail > 0 && dst[marker].lcn == LCN_ENOENT)
-		dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length;
-	return dst;
-}
-
-/**
- * ntfs_rl_split - insert a runlist into the centre of a hole
- * @dst: original runlist to be worked on
- * @dsize: number of elements in @dst (including end marker)
- * @src: new runlist to be inserted
- * @ssize: number of elements in @src (excluding end marker)
- * @loc: index in runlist @dst at which to split and insert @src
- *
- * Split the runlist @dst at @loc into two and insert @src in between the two
- * fragments. No merging of runlists is necessary. Adjust the size of the
- * holes either side.
- *
- * It is up to the caller to serialize access to the runlists @dst and @src.
- *
- * On success, return a pointer to the new, combined, runlist. Note, both
- * runlists @dst and @src are deallocated before returning so you cannot use
- * the pointers for anything any more. (Strictly speaking the returned runlist
- * may be the same as @dst but this is irrelevant.)
- *
- * On error, return -errno. Both runlists are left unmodified. The following
- * error codes are defined:
- *	-ENOMEM	- Not enough memory to allocate runlist array.
- *	-EINVAL	- Invalid parameters were passed in.
- */
-static inline runlist_element *ntfs_rl_split(runlist_element *dst, int dsize,
-		runlist_element *src, int ssize, int loc)
-{
-	BUG_ON(!dst);
-	BUG_ON(!src);
-
-	/* Space required: @dst size + @src size + one new hole. */
-	dst = ntfs_rl_realloc(dst, dsize, dsize + ssize + 1);
-	if (IS_ERR(dst))
-		return dst;
-	/*
-	 * We are guaranteed to succeed from here so can start modifying the
-	 * original runlists.
-	 */
-
-	/* Move the tail of @dst out of the way, then copy in @src. */
-	ntfs_rl_mm(dst, loc + 1 + ssize, loc, dsize - loc);
-	ntfs_rl_mc(dst, loc + 1, src, 0, ssize);
-
-	/* Adjust the size of the holes either side of @src. */
-	dst[loc].length = dst[loc+1].vcn - dst[loc].vcn;
-	dst[loc+ssize+1].vcn = dst[loc+ssize].vcn + dst[loc+ssize].length;
-	dst[loc+ssize+1].length = dst[loc+ssize+2].vcn - dst[loc+ssize+1].vcn;
-
-	return dst;
-}
-
-/**
- * ntfs_runlists_merge - merge two runlists into one
- * @drl:	original runlist to be worked on
- * @srl:	new runlist to be merged into @drl
- *
- * First we sanity check the two runlists @srl and @drl to make sure that they
- * are sensible and can be merged.
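
The four splicing helpers above (append, insert, replace, split) all defer to the same adjacency test. A standalone rendering of that predicate, with a pared-down runlist_element and sentinel values assumed to match the driver's runlist.h (LCN_HOLE = -1, LCN_RL_NOT_MAPPED = -2, LCN_ENOENT = -3):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef int64_t VCN;
typedef int64_t LCN;
enum { LCN_HOLE = -1, LCN_RL_NOT_MAPPED = -2, LCN_ENOENT = -3 };

typedef struct { VCN vcn; LCN lcn; int64_t length; } runlist_element;

/* Same predicate as ntfs_are_rl_mergeable() above, minus the BUG_ON()s. */
static bool are_mergeable(const runlist_element *dst,
			  const runlist_element *src)
{
	/* Unmapped regions merge even when misaligned. */
	if (dst->lcn == LCN_RL_NOT_MAPPED && src->lcn == LCN_RL_NOT_MAPPED)
		return true;
	/* Otherwise the runs must be VCN-adjacent... */
	if (dst->vcn + dst->length != src->vcn)
		return false;
	/* ...and either LCN-contiguous or both holes. */
	if (dst->lcn >= 0 && src->lcn >= 0 &&
	    dst->lcn + dst->length == src->lcn)
		return true;
	return dst->lcn == LCN_HOLE && src->lcn == LCN_HOLE;
}

int main(void)
{
	runlist_element a = { 0, 100, 8 };	/* vcn 0-7  -> lcn 100-107 */
	runlist_element b = { 8, 108, 4 };	/* vcn 8-11 -> lcn 108-111 */
	runlist_element c = { 8, 200, 4 };	/* adjacent vcn, distant lcn */

	printf("a+b: %d, a+c: %d\n",
	       are_mergeable(&a, &b),	/* 1 */
	       are_mergeable(&a, &c));	/* 0 */
	return 0;
}
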
The runlist @srl must be either after the - * runlist @drl or completely within a hole (or unmapped region) in @drl. - * - * It is up to the caller to serialize access to the runlists @drl and @srl. - * - * Merging of runlists is necessary in two cases: - * 1. When attribute lists are used and a further extent is being mapped. - * 2. When new clusters are allocated to fill a hole or extend a file. - * - * There are four possible ways @srl can be merged. It can: - * - be inserted at the beginning of a hole, - * - split the hole in two and be inserted between the two fragments, - * - be appended at the end of a hole, or it can - * - replace the whole hole. - * It can also be appended to the end of the runlist, which is just a variant - * of the insert case. - * - * On success, return a pointer to the new, combined, runlist. Note, both - * runlists @drl and @srl are deallocated before returning so you cannot use - * the pointers for anything any more. (Strictly speaking the returned runlist - * may be the same as @dst but this is irrelevant.) - * - * On error, return -errno. Both runlists are left unmodified. The following - * error codes are defined: - * -ENOMEM - Not enough memory to allocate runlist array. - * -EINVAL - Invalid parameters were passed in. - * -ERANGE - The runlists overlap and cannot be merged. - */ -runlist_element *ntfs_runlists_merge(runlist_element *drl, - runlist_element *srl) -{ - int di, si; /* Current index into @[ds]rl. */ - int sstart; /* First index with lcn > LCN_RL_NOT_MAPPED. */ - int dins; /* Index into @drl at which to insert @srl. */ - int dend, send; /* Last index into @[ds]rl. */ - int dfinal, sfinal; /* The last index into @[ds]rl with - lcn >= LCN_HOLE. */ - int marker = 0; - VCN marker_vcn = 0; - -#ifdef DEBUG - ntfs_debug("dst:"); - ntfs_debug_dump_runlist(drl); - ntfs_debug("src:"); - ntfs_debug_dump_runlist(srl); -#endif - - /* Check for silly calling... */ - if (unlikely(!srl)) - return drl; - if (IS_ERR(srl) || IS_ERR(drl)) - return ERR_PTR(-EINVAL); - - /* Check for the case where the first mapping is being done now. */ - if (unlikely(!drl)) { - drl = srl; - /* Complete the source runlist if necessary. */ - if (unlikely(drl[0].vcn)) { - /* Scan to the end of the source runlist. */ - for (dend = 0; likely(drl[dend].length); dend++) - ; - dend++; - drl = ntfs_rl_realloc(drl, dend, dend + 1); - if (IS_ERR(drl)) - return drl; - /* Insert start element at the front of the runlist. */ - ntfs_rl_mm(drl, 1, 0, dend); - drl[0].vcn = 0; - drl[0].lcn = LCN_RL_NOT_MAPPED; - drl[0].length = drl[1].vcn; - } - goto finished; - } - - si = di = 0; - - /* Skip any unmapped start element(s) in the source runlist. */ - while (srl[si].length && srl[si].lcn < LCN_HOLE) - si++; - - /* Can't have an entirely unmapped source runlist. */ - BUG_ON(!srl[si].length); - - /* Record the starting points. */ - sstart = si; - - /* - * Skip forward in @drl until we reach the position where @srl needs to - * be inserted. If we reach the end of @drl, @srl just needs to be - * appended to @drl. - */ - for (; drl[di].length; di++) { - if (drl[di].vcn + drl[di].length > srl[sstart].vcn) - break; - } - dins = di; - - /* Sanity check for illegal overlaps. */ - if ((drl[di].vcn == srl[si].vcn) && (drl[di].lcn >= 0) && - (srl[si].lcn >= 0)) { - ntfs_error(NULL, "Run lists overlap. Cannot merge!"); - return ERR_PTR(-ERANGE); - } - - /* Scan to the end of both runlists in order to know their sizes. 
*/ - for (send = si; srl[send].length; send++) - ; - for (dend = di; drl[dend].length; dend++) - ; - - if (srl[send].lcn == LCN_ENOENT) - marker_vcn = srl[marker = send].vcn; - - /* Scan to the last element with lcn >= LCN_HOLE. */ - for (sfinal = send; sfinal >= 0 && srl[sfinal].lcn < LCN_HOLE; sfinal--) - ; - for (dfinal = dend; dfinal >= 0 && drl[dfinal].lcn < LCN_HOLE; dfinal--) - ; - - { - bool start; - bool finish; - int ds = dend + 1; /* Number of elements in drl & srl */ - int ss = sfinal - sstart + 1; - - start = ((drl[dins].lcn < LCN_RL_NOT_MAPPED) || /* End of file */ - (drl[dins].vcn == srl[sstart].vcn)); /* Start of hole */ - finish = ((drl[dins].lcn >= LCN_RL_NOT_MAPPED) && /* End of file */ - ((drl[dins].vcn + drl[dins].length) <= /* End of hole */ - (srl[send - 1].vcn + srl[send - 1].length))); - - /* Or we will lose an end marker. */ - if (finish && !drl[dins].length) - ss++; - if (marker && (drl[dins].vcn + drl[dins].length > srl[send - 1].vcn)) - finish = false; -#if 0 - ntfs_debug("dfinal = %i, dend = %i", dfinal, dend); - ntfs_debug("sstart = %i, sfinal = %i, send = %i", sstart, sfinal, send); - ntfs_debug("start = %i, finish = %i", start, finish); - ntfs_debug("ds = %i, ss = %i, dins = %i", ds, ss, dins); -#endif - if (start) { - if (finish) - drl = ntfs_rl_replace(drl, ds, srl + sstart, ss, dins); - else - drl = ntfs_rl_insert(drl, ds, srl + sstart, ss, dins); - } else { - if (finish) - drl = ntfs_rl_append(drl, ds, srl + sstart, ss, dins); - else - drl = ntfs_rl_split(drl, ds, srl + sstart, ss, dins); - } - if (IS_ERR(drl)) { - ntfs_error(NULL, "Merge failed."); - return drl; - } - ntfs_free(srl); - if (marker) { - ntfs_debug("Triggering marker code."); - for (ds = dend; drl[ds].length; ds++) - ; - /* We only need to care if @srl ended after @drl. */ - if (drl[ds].vcn <= marker_vcn) { - int slots = 0; - - if (drl[ds].vcn == marker_vcn) { - ntfs_debug("Old marker = 0x%llx, replacing " - "with LCN_ENOENT.", - (unsigned long long) - drl[ds].lcn); - drl[ds].lcn = LCN_ENOENT; - goto finished; - } - /* - * We need to create an unmapped runlist element in - * @drl or extend an existing one before adding the - * ENOENT terminator. - */ - if (drl[ds].lcn == LCN_ENOENT) { - ds--; - slots = 1; - } - if (drl[ds].lcn != LCN_RL_NOT_MAPPED) { - /* Add an unmapped runlist element. */ - if (!slots) { - drl = ntfs_rl_realloc_nofail(drl, ds, - ds + 2); - slots = 2; - } - ds++; - /* Need to set vcn if it isn't set already. */ - if (slots != 1) - drl[ds].vcn = drl[ds - 1].vcn + - drl[ds - 1].length; - drl[ds].lcn = LCN_RL_NOT_MAPPED; - /* We now used up a slot. */ - slots--; - } - drl[ds].length = marker_vcn - drl[ds].vcn; - /* Finally add the ENOENT terminator. */ - ds++; - if (!slots) - drl = ntfs_rl_realloc_nofail(drl, ds, ds + 1); - drl[ds].vcn = marker_vcn; - drl[ds].lcn = LCN_ENOENT; - drl[ds].length = (s64)0; - } - } - } - -finished: - /* The merge was completed successfully. */ - ntfs_debug("Merged runlist:"); - ntfs_debug_dump_runlist(drl); - return drl; -} - -/** - * ntfs_mapping_pairs_decompress - convert mapping pairs array to runlist - * @vol: ntfs volume on which the attribute resides - * @attr: attribute record whose mapping pairs array to decompress - * @old_rl: optional runlist in which to insert @attr's runlist - * - * It is up to the caller to serialize access to the runlist @old_rl. - * - * Decompress the attribute @attr's mapping pairs array into a runlist. On - * success, return the decompressed runlist. 
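
For reference, the on-disk format ntfs_mapping_pairs_decompress() parses is compact: each pair opens with a header byte whose low nibble gives the byte count of a signed little-endian run length and whose high nibble gives the byte count of a signed delta against the previous lcn; a zero high nibble encodes a sparse run and a zero header byte terminates the array. A self-contained userspace decoder over an invented example array (a sketch of the same algorithm without the driver's corruption checks):

#include <stdint.h>
#include <stdio.h>

/* Read an n-byte little-endian value and sign-extend it -- the same
 * arithmetic the driver performs byte by byte below. */
static int64_t read_sle(const uint8_t *p, int n)
{
	uint64_t v = 0;

	for (int i = n - 1; i >= 0; i--)
		v = (v << 8) | p[i];
	if (n < 8 && (p[n - 1] & 0x80))	/* negative: extend the sign */
		v |= ~0ULL << (8 * n);
	return (int64_t)v;
}

int main(void)
{
	/* Invented example: 0x20 clusters at lcn 0x120, a 0x10-cluster
	 * hole, 8 clusters at lcn 0x100 (delta -0x20), terminator. */
	const uint8_t mp[] = { 0x21, 0x20, 0x20, 0x01,
			       0x01, 0x10,
			       0x21, 0x08, 0xe0, 0xff,
			       0x00 };
	const uint8_t *p = mp;
	int64_t vcn = 0, lcn = 0;

	while (*p) {
		int len_sz = *p & 0xf;		/* low nibble: length bytes */
		int lcn_sz = (*p >> 4) & 0xf;	/* high nibble: delta bytes */
		int64_t len = read_sle(p + 1, len_sz);

		if (lcn_sz) {
			lcn += read_sle(p + 1 + len_sz, lcn_sz);
			printf("vcn 0x%llx, len 0x%llx -> lcn 0x%llx\n",
			       (unsigned long long)vcn,
			       (unsigned long long)len,
			       (unsigned long long)lcn);
		} else {
			printf("vcn 0x%llx, len 0x%llx -> sparse\n",
			       (unsigned long long)vcn,
			       (unsigned long long)len);
		}
		vcn += len;
		p += 1 + len_sz + lcn_sz;
	}
	return 0;
}

Storing lcns as deltas rather than absolute values is what keeps the array small: consecutive runs of a mildly fragmented file usually sit close together on disk, so most deltas fit in one or two bytes.
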
- *
- * If @old_rl is not NULL, the decompressed runlist is inserted into the
- * appropriate place in @old_rl and the resultant, combined runlist is
- * returned. The original @old_rl is deallocated.
- *
- * On error, return -errno. @old_rl is left unmodified in that case.
- *
- * The following error codes are defined:
- *	-ENOMEM	- Not enough memory to allocate runlist array.
- *	-EIO	- Corrupt runlist.
- *	-EINVAL	- Invalid parameters were passed in.
- *	-ERANGE	- The two runlists overlap.
- *
- * FIXME: For now we take the conceptually simplest approach of creating the
- * new runlist disregarding the already existing one and then splicing the
- * two into one, if that is possible (we check for overlap and discard the new
- * runlist if an overlap is present before returning ERR_PTR(-ERANGE)).
- */
-runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol,
-		const ATTR_RECORD *attr, runlist_element *old_rl)
-{
-	VCN vcn;		/* Current vcn. */
-	LCN lcn;		/* Current lcn. */
-	s64 deltaxcn;		/* Change in [vl]cn. */
-	runlist_element *rl;	/* The output runlist. */
-	u8 *buf;		/* Current position in mapping pairs array. */
-	u8 *attr_end;		/* End of attribute. */
-	int rlsize;		/* Size of runlist buffer. */
-	u16 rlpos;		/* Current runlist position in units of
-				   runlist_elements. */
-	u8 b;			/* Current byte offset in buf. */
-
-#ifdef DEBUG
-	/* Make sure attr exists and is non-resident. */
-	if (!attr || !attr->non_resident || sle64_to_cpu(
-			attr->data.non_resident.lowest_vcn) < (VCN)0) {
-		ntfs_error(vol->sb, "Invalid arguments.");
-		return ERR_PTR(-EINVAL);
-	}
-#endif
-	/* Start at vcn = lowest_vcn and lcn 0. */
-	vcn = sle64_to_cpu(attr->data.non_resident.lowest_vcn);
-	lcn = 0;
-	/* Get start of the mapping pairs array. */
-	buf = (u8*)attr + le16_to_cpu(
-			attr->data.non_resident.mapping_pairs_offset);
-	attr_end = (u8*)attr + le32_to_cpu(attr->length);
-	if (unlikely(buf < (u8*)attr || buf > attr_end)) {
-		ntfs_error(vol->sb, "Corrupt attribute.");
-		return ERR_PTR(-EIO);
-	}
-	/* If the mapping pairs array is valid but empty, nothing to do. */
-	if (!vcn && !*buf)
-		return old_rl;
-	/* Current position in runlist array. */
-	rlpos = 0;
-	/* Allocate first page and set current runlist size to one page. */
-	rl = ntfs_malloc_nofs(rlsize = PAGE_SIZE);
-	if (unlikely(!rl))
-		return ERR_PTR(-ENOMEM);
-	/* Insert unmapped starting element if necessary. */
-	if (vcn) {
-		rl->vcn = 0;
-		rl->lcn = LCN_RL_NOT_MAPPED;
-		rl->length = vcn;
-		rlpos++;
-	}
-	while (buf < attr_end && *buf) {
-		/*
-		 * Allocate more memory if needed, including space for the
-		 * not-mapped and terminator elements. ntfs_malloc_nofs()
-		 * operates on whole pages only.
-		 */
-		if (((rlpos + 3) * sizeof(*old_rl)) > rlsize) {
-			runlist_element *rl2;
-
-			rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE);
-			if (unlikely(!rl2)) {
-				ntfs_free(rl);
-				return ERR_PTR(-ENOMEM);
-			}
-			memcpy(rl2, rl, rlsize);
-			ntfs_free(rl);
-			rl = rl2;
-			rlsize += PAGE_SIZE;
-		}
-		/* Enter the current vcn into the current runlist element. */
-		rl[rlpos].vcn = vcn;
-		/*
-		 * Get the change in vcn, i.e. the run length in clusters.
-		 * Doing it this way ensures that we sign-extend negative values.
-		 * A negative run length doesn't make any sense, but hey, I
-		 * didn't make up the NTFS specs and Windows NT4 treats the run
-		 * length as a signed value so that's how it is...
- */ - b = *buf & 0xf; - if (b) { - if (unlikely(buf + b > attr_end)) - goto io_error; - for (deltaxcn = (s8)buf[b--]; b; b--) - deltaxcn = (deltaxcn << 8) + buf[b]; - } else { /* The length entry is compulsory. */ - ntfs_error(vol->sb, "Missing length entry in mapping " - "pairs array."); - deltaxcn = (s64)-1; - } - /* - * Assume a negative length to indicate data corruption and - * hence clean-up and return NULL. - */ - if (unlikely(deltaxcn < 0)) { - ntfs_error(vol->sb, "Invalid length in mapping pairs " - "array."); - goto err_out; - } - /* - * Enter the current run length into the current runlist - * element. - */ - rl[rlpos].length = deltaxcn; - /* Increment the current vcn by the current run length. */ - vcn += deltaxcn; - /* - * There might be no lcn change at all, as is the case for - * sparse clusters on NTFS 3.0+, in which case we set the lcn - * to LCN_HOLE. - */ - if (!(*buf & 0xf0)) - rl[rlpos].lcn = LCN_HOLE; - else { - /* Get the lcn change which really can be negative. */ - u8 b2 = *buf & 0xf; - b = b2 + ((*buf >> 4) & 0xf); - if (buf + b > attr_end) - goto io_error; - for (deltaxcn = (s8)buf[b--]; b > b2; b--) - deltaxcn = (deltaxcn << 8) + buf[b]; - /* Change the current lcn to its new value. */ - lcn += deltaxcn; -#ifdef DEBUG - /* - * On NTFS 1.2-, apparently can have lcn == -1 to - * indicate a hole. But we haven't verified ourselves - * whether it is really the lcn or the deltaxcn that is - * -1. So if either is found give us a message so we - * can investigate it further! - */ - if (vol->major_ver < 3) { - if (unlikely(deltaxcn == (LCN)-1)) - ntfs_error(vol->sb, "lcn delta == -1"); - if (unlikely(lcn == (LCN)-1)) - ntfs_error(vol->sb, "lcn == -1"); - } -#endif - /* Check lcn is not below -1. */ - if (unlikely(lcn < (LCN)-1)) { - ntfs_error(vol->sb, "Invalid LCN < -1 in " - "mapping pairs array."); - goto err_out; - } - /* Enter the current lcn into the runlist element. */ - rl[rlpos].lcn = lcn; - } - /* Get to the next runlist element. */ - rlpos++; - /* Increment the buffer position to the next mapping pair. */ - buf += (*buf & 0xf) + ((*buf >> 4) & 0xf) + 1; - } - if (unlikely(buf >= attr_end)) - goto io_error; - /* - * If there is a highest_vcn specified, it must be equal to the final - * vcn in the runlist - 1, or something has gone badly wrong. - */ - deltaxcn = sle64_to_cpu(attr->data.non_resident.highest_vcn); - if (unlikely(deltaxcn && vcn - 1 != deltaxcn)) { -mpa_err: - ntfs_error(vol->sb, "Corrupt mapping pairs array in " - "non-resident attribute."); - goto err_out; - } - /* Setup not mapped runlist element if this is the base extent. */ - if (!attr->data.non_resident.lowest_vcn) { - VCN max_cluster; - - max_cluster = ((sle64_to_cpu( - attr->data.non_resident.allocated_size) + - vol->cluster_size - 1) >> - vol->cluster_size_bits) - 1; - /* - * A highest_vcn of zero means this is a single extent - * attribute so simply terminate the runlist with LCN_ENOENT). - */ - if (deltaxcn) { - /* - * If there is a difference between the highest_vcn and - * the highest cluster, the runlist is either corrupt - * or, more likely, there are more extents following - * this one. 
- */ - if (deltaxcn < max_cluster) { - ntfs_debug("More extents to follow; deltaxcn " - "= 0x%llx, max_cluster = " - "0x%llx", - (unsigned long long)deltaxcn, - (unsigned long long) - max_cluster); - rl[rlpos].vcn = vcn; - vcn += rl[rlpos].length = max_cluster - - deltaxcn; - rl[rlpos].lcn = LCN_RL_NOT_MAPPED; - rlpos++; - } else if (unlikely(deltaxcn > max_cluster)) { - ntfs_error(vol->sb, "Corrupt attribute. " - "deltaxcn = 0x%llx, " - "max_cluster = 0x%llx", - (unsigned long long)deltaxcn, - (unsigned long long) - max_cluster); - goto mpa_err; - } - } - rl[rlpos].lcn = LCN_ENOENT; - } else /* Not the base extent. There may be more extents to follow. */ - rl[rlpos].lcn = LCN_RL_NOT_MAPPED; - - /* Setup terminating runlist element. */ - rl[rlpos].vcn = vcn; - rl[rlpos].length = (s64)0; - /* If no existing runlist was specified, we are done. */ - if (!old_rl) { - ntfs_debug("Mapping pairs array successfully decompressed:"); - ntfs_debug_dump_runlist(rl); - return rl; - } - /* Now combine the new and old runlists checking for overlaps. */ - old_rl = ntfs_runlists_merge(old_rl, rl); - if (!IS_ERR(old_rl)) - return old_rl; - ntfs_free(rl); - ntfs_error(vol->sb, "Failed to merge runlists."); - return old_rl; -io_error: - ntfs_error(vol->sb, "Corrupt attribute."); -err_out: - ntfs_free(rl); - return ERR_PTR(-EIO); -} - -/** - * ntfs_rl_vcn_to_lcn - convert a vcn into a lcn given a runlist - * @rl: runlist to use for conversion - * @vcn: vcn to convert - * - * Convert the virtual cluster number @vcn of an attribute into a logical - * cluster number (lcn) of a device using the runlist @rl to map vcns to their - * corresponding lcns. - * - * It is up to the caller to serialize access to the runlist @rl. - * - * Since lcns must be >= 0, we use negative return codes with special meaning: - * - * Return code Meaning / Description - * ================================================== - * LCN_HOLE Hole / not allocated on disk. - * LCN_RL_NOT_MAPPED This is part of the runlist which has not been - * inserted into the runlist yet. - * LCN_ENOENT There is no such vcn in the attribute. - * - * Locking: - The caller must have locked the runlist (for reading or writing). - * - This function does not touch the lock, nor does it modify the - * runlist. - */ -LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn) -{ - int i; - - BUG_ON(vcn < 0); - /* - * If rl is NULL, assume that we have found an unmapped runlist. The - * caller can then attempt to map it and fail appropriately if - * necessary. - */ - if (unlikely(!rl)) - return LCN_RL_NOT_MAPPED; - - /* Catch out of lower bounds vcn. */ - if (unlikely(vcn < rl[0].vcn)) - return LCN_ENOENT; - - for (i = 0; likely(rl[i].length); i++) { - if (unlikely(vcn < rl[i+1].vcn)) { - if (likely(rl[i].lcn >= (LCN)0)) - return rl[i].lcn + (vcn - rl[i].vcn); - return rl[i].lcn; - } - } - /* - * The terminator element is setup to the correct value, i.e. one of - * LCN_HOLE, LCN_RL_NOT_MAPPED, or LCN_ENOENT. - */ - if (likely(rl[i].lcn < (LCN)0)) - return rl[i].lcn; - /* Just in case... We could replace this with BUG() some day. */ - return LCN_ENOENT; -} - -#ifdef NTFS_RW - -/** - * ntfs_rl_find_vcn_nolock - find a vcn in a runlist - * @rl: runlist to search - * @vcn: vcn to find - * - * Find the virtual cluster number @vcn in the runlist @rl and return the - * address of the runlist element containing the @vcn on success. - * - * Return NULL if @rl is NULL or @vcn is in an unmapped part/out of bounds of - * the runlist. 
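
Stripped of the likely()/unlikely() annotations, the vcn-to-lcn conversion above is a plain interval walk over a sorted array. A compact userspace equivalent over a made-up runlist (same sentinel values as assumed earlier):

#include <stdint.h>
#include <stdio.h>

typedef int64_t VCN;
typedef int64_t LCN;
enum { LCN_HOLE = -1, LCN_RL_NOT_MAPPED = -2, LCN_ENOENT = -3 };

typedef struct {
	VCN vcn;	/* first virtual cluster of the run */
	LCN lcn;	/* first logical cluster, or a sentinel above */
	int64_t length;	/* run length in clusters, 0 = terminator */
} runlist_element;

/* Same walk as ntfs_rl_vcn_to_lcn() above: find the run containing @vcn
 * and add the offset into the run to the run's starting lcn. */
static LCN rl_vcn_to_lcn(const runlist_element *rl, VCN vcn)
{
	int i;

	if (!rl)
		return LCN_RL_NOT_MAPPED;
	if (vcn < rl[0].vcn)
		return LCN_ENOENT;
	for (i = 0; rl[i].length; i++)
		if (vcn < rl[i + 1].vcn)
			return rl[i].lcn >= 0 ?
				rl[i].lcn + (vcn - rl[i].vcn) : rl[i].lcn;
	/* The terminator element carries one of the negative sentinels. */
	return rl[i].lcn < 0 ? rl[i].lcn : LCN_ENOENT;
}

int main(void)
{
	const runlist_element rl[] = {
		{ 0, 0x100, 4 },	/* vcn 0-3  -> lcn 0x100-0x103 */
		{ 4, LCN_HOLE, 4 },	/* vcn 4-7  sparse */
		{ 8, 0x200, 8 },	/* vcn 8-15 -> lcn 0x200-0x207 */
		{ 16, LCN_ENOENT, 0 },	/* terminator */
	};

	printf("%lld %lld %lld\n",
	       (long long)rl_vcn_to_lcn(rl, 2),		/* 0x102 */
	       (long long)rl_vcn_to_lcn(rl, 5),		/* -1 (hole) */
	       (long long)rl_vcn_to_lcn(rl, 20));	/* -3 (no such vcn) */
	return 0;
}
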
- *
- * Locking: The runlist must be locked on entry.
- */
-runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl, const VCN vcn)
-{
-	BUG_ON(vcn < 0);
-	if (unlikely(!rl || vcn < rl[0].vcn))
-		return NULL;
-	while (likely(rl->length)) {
-		if (unlikely(vcn < rl[1].vcn)) {
-			if (likely(rl->lcn >= LCN_HOLE))
-				return rl;
-			return NULL;
-		}
-		rl++;
-	}
-	if (likely(rl->lcn == LCN_ENOENT))
-		return rl;
-	return NULL;
-}
-
-/**
- * ntfs_get_nr_significant_bytes - get number of bytes needed to store a number
- * @n: number for which to get the number of bytes
- *
- * Return the number of bytes required to store @n unambiguously as
- * a signed number.
- *
- * This is used in the context of the mapping pairs array to determine how
- * many bytes will be needed in the array to store a given logical cluster
- * number (lcn) or a specific run length.
- *
- * Return the number of bytes needed. This function cannot fail.
- */
-static inline int ntfs_get_nr_significant_bytes(const s64 n)
-{
-	s64 l = n;
-	int i;
-	s8 j;
-
-	i = 0;
-	do {
-		l >>= 8;
-		i++;
-	} while (l != 0 && l != -1);
-	j = (n >> 8 * (i - 1)) & 0xff;
-	/* If the sign bit is wrong, we need an extra byte. */
-	if ((n < 0 && j >= 0) || (n > 0 && j < 0))
-		i++;
-	return i;
-}
-
-/**
- * ntfs_get_size_for_mapping_pairs - get bytes needed for mapping pairs array
- * @vol: ntfs volume (needed for the ntfs version)
- * @rl: locked runlist to determine the size of the mapping pairs of
- * @first_vcn: first vcn which to include in the mapping pairs array
- * @last_vcn: last vcn which to include in the mapping pairs array
- *
- * Walk the locked runlist @rl and calculate the size in bytes of the mapping
- * pairs array corresponding to the runlist @rl, starting at vcn @first_vcn and
- * finishing with vcn @last_vcn.
- *
- * A @last_vcn of -1 means end of runlist and in that case the size of the
- * mapping pairs array corresponding to the runlist starting at vcn @first_vcn
- * and finishing at the end of the runlist is determined.
- *
- * This for example allows us to allocate a buffer of the right size when
- * building the mapping pairs array.
- *
- * If @rl is NULL, just return 1 (for the single terminator byte).
- *
- * Return the calculated size in bytes on success. On error, return -errno.
- * The following error codes are defined:
- *	-EINVAL	- Run list contains unmapped elements. Make sure to only pass
- *		  fully mapped runlists to this function.
- *	-EIO	- The runlist is corrupt.
- *
- * Locking: @rl must be locked on entry (either for reading or writing), it
- * remains locked throughout, and is left locked upon return.
- */
-int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol,
-		const runlist_element *rl, const VCN first_vcn,
-		const VCN last_vcn)
-{
-	LCN prev_lcn;
-	int rls;
-	bool the_end = false;
-
-	BUG_ON(first_vcn < 0);
-	BUG_ON(last_vcn < -1);
-	BUG_ON(last_vcn >= 0 && first_vcn > last_vcn);
-	if (!rl) {
-		BUG_ON(first_vcn);
-		BUG_ON(last_vcn > 0);
-		return 1;
-	}
-	/* Skip to runlist element containing @first_vcn. */
-	while (rl->length && first_vcn >= rl[1].vcn)
-		rl++;
-	if (unlikely((!rl->length && first_vcn > rl->vcn) ||
-			first_vcn < rl->vcn))
-		return -EINVAL;
-	prev_lcn = 0;
-	/* Always need the terminating zero byte. */
-	rls = 1;
-	/* Do the first partial run if present. */
-	if (first_vcn > rl->vcn) {
-		s64 delta, length = rl->length;
-
-		/* We know rl->length != 0 already.
*/ - if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) - goto err_out; - /* - * If @stop_vcn is given and finishes inside this run, cap the - * run length. - */ - if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { - s64 s1 = last_vcn + 1; - if (unlikely(rl[1].vcn > s1)) - length = s1 - rl->vcn; - the_end = true; - } - delta = first_vcn - rl->vcn; - /* Header byte + length. */ - rls += 1 + ntfs_get_nr_significant_bytes(length - delta); - /* - * If the logical cluster number (lcn) denotes a hole and we - * are on NTFS 3.0+, we don't store it at all, i.e. we need - * zero space. On earlier NTFS versions we just store the lcn. - * Note: this assumes that on NTFS 1.2-, holes are stored with - * an lcn of -1 and not a delta_lcn of -1 (unless both are -1). - */ - if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { - prev_lcn = rl->lcn; - if (likely(rl->lcn >= 0)) - prev_lcn += delta; - /* Change in lcn. */ - rls += ntfs_get_nr_significant_bytes(prev_lcn); - } - /* Go to next runlist element. */ - rl++; - } - /* Do the full runs. */ - for (; rl->length && !the_end; rl++) { - s64 length = rl->length; - - if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) - goto err_out; - /* - * If @stop_vcn is given and finishes inside this run, cap the - * run length. - */ - if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { - s64 s1 = last_vcn + 1; - if (unlikely(rl[1].vcn > s1)) - length = s1 - rl->vcn; - the_end = true; - } - /* Header byte + length. */ - rls += 1 + ntfs_get_nr_significant_bytes(length); - /* - * If the logical cluster number (lcn) denotes a hole and we - * are on NTFS 3.0+, we don't store it at all, i.e. we need - * zero space. On earlier NTFS versions we just store the lcn. - * Note: this assumes that on NTFS 1.2-, holes are stored with - * an lcn of -1 and not a delta_lcn of -1 (unless both are -1). - */ - if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { - /* Change in lcn. */ - rls += ntfs_get_nr_significant_bytes(rl->lcn - - prev_lcn); - prev_lcn = rl->lcn; - } - } - return rls; -err_out: - if (rl->lcn == LCN_RL_NOT_MAPPED) - rls = -EINVAL; - else - rls = -EIO; - return rls; -} - -/** - * ntfs_write_significant_bytes - write the significant bytes of a number - * @dst: destination buffer to write to - * @dst_max: pointer to last byte of destination buffer for bounds checking - * @n: number whose significant bytes to write - * - * Store in @dst, the minimum bytes of the number @n which are required to - * identify @n unambiguously as a signed number, taking care not to exceed - * @dest_max, the maximum position within @dst to which we are allowed to - * write. - * - * This is used when building the mapping pairs array of a runlist to compress - * a given logical cluster number (lcn) or a specific run length to the minimum - * size possible. - * - * Return the number of bytes written on success. On error, i.e. the - * destination buffer @dst is too small, return -ENOSPC. - */ -static inline int ntfs_write_significant_bytes(s8 *dst, const s8 *dst_max, - const s64 n) -{ - s64 l = n; - int i; - s8 j; - - i = 0; - do { - if (unlikely(dst > dst_max)) - goto err_out; - *dst++ = l & 0xffll; - l >>= 8; - i++; - } while (l != 0 && l != -1); - j = (n >> 8 * (i - 1)) & 0xff; - /* If the sign bit is wrong, we need an extra byte. 
*/ - if (n < 0 && j >= 0) { - if (unlikely(dst > dst_max)) - goto err_out; - i++; - *dst = (s8)-1; - } else if (n > 0 && j < 0) { - if (unlikely(dst > dst_max)) - goto err_out; - i++; - *dst = (s8)0; - } - return i; -err_out: - return -ENOSPC; -} - -/** - * ntfs_mapping_pairs_build - build the mapping pairs array from a runlist - * @vol: ntfs volume (needed for the ntfs version) - * @dst: destination buffer to which to write the mapping pairs array - * @dst_len: size of destination buffer @dst in bytes - * @rl: locked runlist for which to build the mapping pairs array - * @first_vcn: first vcn which to include in the mapping pairs array - * @last_vcn: last vcn which to include in the mapping pairs array - * @stop_vcn: first vcn outside destination buffer on success or -ENOSPC - * - * Create the mapping pairs array from the locked runlist @rl, starting at vcn - * @first_vcn and finishing with vcn @last_vcn and save the array in @dst. - * @dst_len is the size of @dst in bytes and it should be at least equal to the - * value obtained by calling ntfs_get_size_for_mapping_pairs(). - * - * A @last_vcn of -1 means end of runlist and in that case the mapping pairs - * array corresponding to the runlist starting at vcn @first_vcn and finishing - * at the end of the runlist is created. - * - * If @rl is NULL, just write a single terminator byte to @dst. - * - * On success or -ENOSPC error, if @stop_vcn is not NULL, *@stop_vcn is set to - * the first vcn outside the destination buffer. Note that on error, @dst has - * been filled with all the mapping pairs that will fit, thus it can be treated - * as partial success, in that a new attribute extent needs to be created or - * the next extent has to be used and the mapping pairs build has to be - * continued with @first_vcn set to *@stop_vcn. - * - * Return 0 on success and -errno on error. The following error codes are - * defined: - * -EINVAL - Run list contains unmapped elements. Make sure to only pass - * fully mapped runlists to this function. - * -EIO - The runlist is corrupt. - * -ENOSPC - The destination buffer is too small. - * - * Locking: @rl must be locked on entry (either for reading or writing), it - * remains locked throughout, and is left locked upon return. - */ -int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst, - const int dst_len, const runlist_element *rl, - const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn) -{ - LCN prev_lcn; - s8 *dst_max, *dst_next; - int err = -ENOSPC; - bool the_end = false; - s8 len_len, lcn_len; - - BUG_ON(first_vcn < 0); - BUG_ON(last_vcn < -1); - BUG_ON(last_vcn >= 0 && first_vcn > last_vcn); - BUG_ON(dst_len < 1); - if (!rl) { - BUG_ON(first_vcn); - BUG_ON(last_vcn > 0); - if (stop_vcn) - *stop_vcn = 0; - /* Terminator byte. */ - *dst = 0; - return 0; - } - /* Skip to runlist element containing @first_vcn. */ - while (rl->length && first_vcn >= rl[1].vcn) - rl++; - if (unlikely((!rl->length && first_vcn > rl->vcn) || - first_vcn < rl->vcn)) - return -EINVAL; - /* - * @dst_max is used for bounds checking in - * ntfs_write_significant_bytes(). - */ - dst_max = dst + dst_len - 1; - prev_lcn = 0; - /* Do the first partial run if present. */ - if (first_vcn > rl->vcn) { - s64 delta, length = rl->length; - - /* We know rl->length != 0 already. */ - if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) - goto err_out; - /* - * If @stop_vcn is given and finishes inside this run, cap the - * run length. 
- */ - if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { - s64 s1 = last_vcn + 1; - if (unlikely(rl[1].vcn > s1)) - length = s1 - rl->vcn; - the_end = true; - } - delta = first_vcn - rl->vcn; - /* Write length. */ - len_len = ntfs_write_significant_bytes(dst + 1, dst_max, - length - delta); - if (unlikely(len_len < 0)) - goto size_err; - /* - * If the logical cluster number (lcn) denotes a hole and we - * are on NTFS 3.0+, we don't store it at all, i.e. we need - * zero space. On earlier NTFS versions we just write the lcn - * change. FIXME: Do we need to write the lcn change or just - * the lcn in that case? Not sure as I have never seen this - * case on NT4. - We assume that we just need to write the lcn - * change until someone tells us otherwise... (AIA) - */ - if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { - prev_lcn = rl->lcn; - if (likely(rl->lcn >= 0)) - prev_lcn += delta; - /* Write change in lcn. */ - lcn_len = ntfs_write_significant_bytes(dst + 1 + - len_len, dst_max, prev_lcn); - if (unlikely(lcn_len < 0)) - goto size_err; - } else - lcn_len = 0; - dst_next = dst + len_len + lcn_len + 1; - if (unlikely(dst_next > dst_max)) - goto size_err; - /* Update header byte. */ - *dst = lcn_len << 4 | len_len; - /* Position at next mapping pairs array element. */ - dst = dst_next; - /* Go to next runlist element. */ - rl++; - } - /* Do the full runs. */ - for (; rl->length && !the_end; rl++) { - s64 length = rl->length; - - if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) - goto err_out; - /* - * If @stop_vcn is given and finishes inside this run, cap the - * run length. - */ - if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { - s64 s1 = last_vcn + 1; - if (unlikely(rl[1].vcn > s1)) - length = s1 - rl->vcn; - the_end = true; - } - /* Write length. */ - len_len = ntfs_write_significant_bytes(dst + 1, dst_max, - length); - if (unlikely(len_len < 0)) - goto size_err; - /* - * If the logical cluster number (lcn) denotes a hole and we - * are on NTFS 3.0+, we don't store it at all, i.e. we need - * zero space. On earlier NTFS versions we just write the lcn - * change. FIXME: Do we need to write the lcn change or just - * the lcn in that case? Not sure as I have never seen this - * case on NT4. - We assume that we just need to write the lcn - * change until someone tells us otherwise... (AIA) - */ - if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { - /* Write change in lcn. */ - lcn_len = ntfs_write_significant_bytes(dst + 1 + - len_len, dst_max, rl->lcn - prev_lcn); - if (unlikely(lcn_len < 0)) - goto size_err; - prev_lcn = rl->lcn; - } else - lcn_len = 0; - dst_next = dst + len_len + lcn_len + 1; - if (unlikely(dst_next > dst_max)) - goto size_err; - /* Update header byte. */ - *dst = lcn_len << 4 | len_len; - /* Position at next mapping pairs array element. */ - dst = dst_next; - } - /* Success. */ - err = 0; -size_err: - /* Set stop vcn. */ - if (stop_vcn) - *stop_vcn = rl->vcn; - /* Add terminator byte. */ - *dst = 0; - return err; -err_out: - if (rl->lcn == LCN_RL_NOT_MAPPED) - err = -EINVAL; - else - err = -EIO; - return err; -} - -/** - * ntfs_rl_truncate_nolock - truncate a runlist starting at a specified vcn - * @vol: ntfs volume (needed for error output) - * @runlist: runlist to truncate - * @new_length: the new length of the runlist in VCNs - * - * Truncate the runlist described by @runlist as well as the memory buffer - * holding the runlist elements to a length of @new_length VCNs. 
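
The byte-count rule used by ntfs_get_nr_significant_bytes() and ntfs_write_significant_bytes() above is worth seeing in isolation: a value needs as many bytes as it takes for the stored top byte's sign bit to agree with the value's sign. A userspace sketch mirroring the driver's arithmetic (which, like the kernel, relies on arithmetic right shifts of negative values):

#include <stdint.h>
#include <stdio.h>

/* Minimal byte count that still round-trips @n as a signed little-endian
 * value -- the rule applied when sizing mapping-pairs fields. */
static int nr_significant_bytes(int64_t n)
{
	int64_t l = n;
	int i = 0;
	int8_t top;

	do {
		l >>= 8;	/* arithmetic shift assumed for negatives */
		i++;
	} while (l != 0 && l != -1);
	/* If the would-be top byte has the wrong sign bit, add a byte. */
	top = (n >> (8 * (i - 1))) & 0xff;
	if ((n < 0 && top >= 0) || (n > 0 && top < 0))
		i++;
	return i;
}

int main(void)
{
	/* 0x7f fits in one byte, 0x80 needs two (its sign bit would read
	 * as negative); symmetrically, -0x80 fits but -0x81 does not. */
	printf("%d %d %d %d\n",
	       nr_significant_bytes(0x7f),	/* 1 */
	       nr_significant_bytes(0x80),	/* 2 */
	       nr_significant_bytes(-0x80),	/* 1 */
	       nr_significant_bytes(-0x81));	/* 2 */
	return 0;
}
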
- * - * If @new_length lies within the runlist, the runlist elements with VCNs of - * @new_length and above are discarded. As a special case if @new_length is - * zero, the runlist is discarded and set to NULL. - * - * If @new_length lies beyond the runlist, a sparse runlist element is added to - * the end of the runlist @runlist or if the last runlist element is a sparse - * one already, this is extended. - * - * Note, no checking is done for unmapped runlist elements. It is assumed that - * the caller has mapped any elements that need to be mapped already. - * - * Return 0 on success and -errno on error. - * - * Locking: The caller must hold @runlist->lock for writing. - */ -int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist, - const s64 new_length) -{ - runlist_element *rl; - int old_size; - - ntfs_debug("Entering for new_length 0x%llx.", (long long)new_length); - BUG_ON(!runlist); - BUG_ON(new_length < 0); - rl = runlist->rl; - if (!new_length) { - ntfs_debug("Freeing runlist."); - runlist->rl = NULL; - if (rl) - ntfs_free(rl); - return 0; - } - if (unlikely(!rl)) { - /* - * Create a runlist consisting of a sparse runlist element of - * length @new_length followed by a terminator runlist element. - */ - rl = ntfs_malloc_nofs(PAGE_SIZE); - if (unlikely(!rl)) { - ntfs_error(vol->sb, "Not enough memory to allocate " - "runlist element buffer."); - return -ENOMEM; - } - runlist->rl = rl; - rl[1].length = rl->vcn = 0; - rl->lcn = LCN_HOLE; - rl[1].vcn = rl->length = new_length; - rl[1].lcn = LCN_ENOENT; - return 0; - } - BUG_ON(new_length < rl->vcn); - /* Find @new_length in the runlist. */ - while (likely(rl->length && new_length >= rl[1].vcn)) - rl++; - /* - * If not at the end of the runlist we need to shrink it. - * If at the end of the runlist we need to expand it. - */ - if (rl->length) { - runlist_element *trl; - bool is_end; - - ntfs_debug("Shrinking runlist."); - /* Determine the runlist size. */ - trl = rl + 1; - while (likely(trl->length)) - trl++; - old_size = trl - runlist->rl + 1; - /* Truncate the run. */ - rl->length = new_length - rl->vcn; - /* - * If a run was partially truncated, make the following runlist - * element a terminator. - */ - is_end = false; - if (rl->length) { - rl++; - if (!rl->length) - is_end = true; - rl->vcn = new_length; - rl->length = 0; - } - rl->lcn = LCN_ENOENT; - /* Reallocate memory if necessary. */ - if (!is_end) { - int new_size = rl - runlist->rl + 1; - rl = ntfs_rl_realloc(runlist->rl, old_size, new_size); - if (IS_ERR(rl)) - ntfs_warning(vol->sb, "Failed to shrink " - "runlist buffer. This just " - "wastes a bit of memory " - "temporarily so we ignore it " - "and return success."); - else - runlist->rl = rl; - } - } else if (likely(/* !rl->length && */ new_length > rl->vcn)) { - ntfs_debug("Expanding runlist."); - /* - * If there is a previous runlist element and it is a sparse - * one, extend it. Otherwise need to add a new, sparse runlist - * element. - */ - if ((rl > runlist->rl) && ((rl - 1)->lcn == LCN_HOLE)) - (rl - 1)->length = new_length - (rl - 1)->vcn; - else { - /* Determine the runlist size. */ - old_size = rl - runlist->rl + 1; - /* Reallocate memory if necessary. */ - rl = ntfs_rl_realloc(runlist->rl, old_size, - old_size + 1); - if (IS_ERR(rl)) { - ntfs_error(vol->sb, "Failed to expand runlist " - "buffer, aborting."); - return PTR_ERR(rl); - } - runlist->rl = rl; - /* - * Set @rl to the same runlist element in the new - * runlist as before in the old runlist. 
- */ - rl += old_size - 1; - /* Add a new, sparse runlist element. */ - rl->lcn = LCN_HOLE; - rl->length = new_length - rl->vcn; - /* Add a new terminator runlist element. */ - rl++; - rl->length = 0; - } - rl->vcn = new_length; - rl->lcn = LCN_ENOENT; - } else /* if (unlikely(!rl->length && new_length == rl->vcn)) */ { - /* Runlist already has same size as requested. */ - rl->lcn = LCN_ENOENT; - } - ntfs_debug("Done."); - return 0; -} - -/** - * ntfs_rl_punch_nolock - punch a hole into a runlist - * @vol: ntfs volume (needed for error output) - * @runlist: runlist to punch a hole into - * @start: starting VCN of the hole to be created - * @length: size of the hole to be created in units of clusters - * - * Punch a hole into the runlist @runlist starting at VCN @start and of size - * @length clusters. - * - * Return 0 on success and -errno on error, in which case @runlist has not been - * modified. - * - * If @start and/or @start + @length are outside the runlist return error code - * -ENOENT. - * - * If the runlist contains unmapped or error elements between @start and @start - * + @length return error code -EINVAL. - * - * Locking: The caller must hold @runlist->lock for writing. - */ -int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist, - const VCN start, const s64 length) -{ - const VCN end = start + length; - s64 delta; - runlist_element *rl, *rl_end, *rl_real_end, *trl; - int old_size; - bool lcn_fixup = false; - - ntfs_debug("Entering for start 0x%llx, length 0x%llx.", - (long long)start, (long long)length); - BUG_ON(!runlist); - BUG_ON(start < 0); - BUG_ON(length < 0); - BUG_ON(end < 0); - rl = runlist->rl; - if (unlikely(!rl)) { - if (likely(!start && !length)) - return 0; - return -ENOENT; - } - /* Find @start in the runlist. */ - while (likely(rl->length && start >= rl[1].vcn)) - rl++; - rl_end = rl; - /* Find @end in the runlist. */ - while (likely(rl_end->length && end >= rl_end[1].vcn)) { - /* Verify there are no unmapped or error elements. */ - if (unlikely(rl_end->lcn < LCN_HOLE)) - return -EINVAL; - rl_end++; - } - /* Check the last element. */ - if (unlikely(rl_end->length && rl_end->lcn < LCN_HOLE)) - return -EINVAL; - /* This covers @start being out of bounds, too. */ - if (!rl_end->length && end > rl_end->vcn) - return -ENOENT; - if (!length) - return 0; - if (!rl->length) - return -ENOENT; - rl_real_end = rl_end; - /* Determine the runlist size. */ - while (likely(rl_real_end->length)) - rl_real_end++; - old_size = rl_real_end - runlist->rl + 1; - /* If @start is in a hole simply extend the hole. */ - if (rl->lcn == LCN_HOLE) { - /* - * If both @start and @end are in the same sparse run, we are - * done. - */ - if (end <= rl[1].vcn) { - ntfs_debug("Done (requested hole is already sparse)."); - return 0; - } -extend_hole: - /* Extend the hole. */ - rl->length = end - rl->vcn; - /* If @end is in a hole, merge it with the current one. */ - if (rl_end->lcn == LCN_HOLE) { - rl_end++; - rl->length = rl_end->vcn - rl->vcn; - } - /* We have done the hole. Now deal with the remaining tail. */ - rl++; - /* Cut out all runlist elements up to @end. */ - if (rl < rl_end) - memmove(rl, rl_end, (rl_real_end - rl_end + 1) * - sizeof(*rl)); - /* Adjust the beginning of the tail if necessary. */ - if (end > rl->vcn) { - delta = end - rl->vcn; - rl->vcn = end; - rl->length -= delta; - /* Only adjust the lcn if it is real. */ - if (rl->lcn >= 0) - rl->lcn += delta; - } -shrink_allocation: - /* Reallocate memory if the allocation changed. 
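The shrink/extend behaviour implemented above can be demonstrated on a plain array in user space. A minimal model (illustrative only; the values are made up and it ignores the merge-with-a-preceding-hole and reallocation details):

#include <stdio.h>

struct run { long long vcn, lcn, length; };
#define MODEL_LCN_HOLE		(-1LL)
#define MODEL_LCN_ENOENT	(-3LL)

int main(void)
{
	/* {vcn, lcn, length}, terminated by a length-0 element. */
	struct run rl[8] = {
		{ 0, 100, 4 }, { 4, 200, 4 }, { 8, MODEL_LCN_ENOENT, 0 },
	};
	long long new_length = 6;	/* try 6 (shrink) or 12 (extend) */
	int i = 0;

	while (rl[i].length && new_length >= rl[i + 1].vcn)
		i++;
	if (rl[i].length) {		/* shrink: cut inside run i */
		rl[i].length = new_length - rl[i].vcn;
		i += !!rl[i].length;	/* keep run i only if non-empty */
		rl[i].vcn = new_length;
		rl[i].lcn = MODEL_LCN_ENOENT;
		rl[i].length = 0;
	} else if (new_length > rl[i].vcn) {	/* extend with a hole */
		rl[i].lcn = MODEL_LCN_HOLE;
		rl[i].length = new_length - rl[i].vcn;
		i++;
		rl[i].vcn = new_length;
		rl[i].lcn = MODEL_LCN_ENOENT;
		rl[i].length = 0;
	}
	for (i = 0; ; i++) {
		printf("{vcn %lld, lcn %lld, len %lld}\n",
		       rl[i].vcn, rl[i].lcn, rl[i].length);
		if (!rl[i].length)
			break;
	}
	return 0;
}

With new_length = 6 this prints the shrunk list ending in a {6, -3, 0} terminator; with new_length = 12 it first appends an {8, -1, 4} sparse run, matching the two non-trivial cases described above.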
*/ - if (rl < rl_end) { - rl = ntfs_rl_realloc(runlist->rl, old_size, - old_size - (rl_end - rl)); - if (IS_ERR(rl)) - ntfs_warning(vol->sb, "Failed to shrink " - "runlist buffer. This just " - "wastes a bit of memory " - "temporarily so we ignore it " - "and return success."); - else - runlist->rl = rl; - } - ntfs_debug("Done (extend hole)."); - return 0; - } - /* - * If @start is at the beginning of a run things are easier as there is - * no need to split the first run. - */ - if (start == rl->vcn) { - /* - * @start is at the beginning of a run. - * - * If the previous run is sparse, extend its hole. - * - * If @end is not in the same run, switch the run to be sparse - * and extend the newly created hole. - * - * Thus both of these cases reduce the problem to the above - * case of "@start is in a hole". - */ - if (rl > runlist->rl && (rl - 1)->lcn == LCN_HOLE) { - rl--; - goto extend_hole; - } - if (end >= rl[1].vcn) { - rl->lcn = LCN_HOLE; - goto extend_hole; - } - /* - * The final case is when @end is in the same run as @start. - * For this need to split the run into two. One run for the - * sparse region between the beginning of the old run, i.e. - * @start, and @end and one for the remaining non-sparse - * region, i.e. between @end and the end of the old run. - */ - trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1); - if (IS_ERR(trl)) - goto enomem_out; - old_size++; - if (runlist->rl != trl) { - rl = trl + (rl - runlist->rl); - rl_end = trl + (rl_end - runlist->rl); - rl_real_end = trl + (rl_real_end - runlist->rl); - runlist->rl = trl; - } -split_end: - /* Shift all the runs up by one. */ - memmove(rl + 1, rl, (rl_real_end - rl + 1) * sizeof(*rl)); - /* Finally, setup the two split runs. */ - rl->lcn = LCN_HOLE; - rl->length = length; - rl++; - rl->vcn += length; - /* Only adjust the lcn if it is real. */ - if (rl->lcn >= 0 || lcn_fixup) - rl->lcn += length; - rl->length -= length; - ntfs_debug("Done (split one)."); - return 0; - } - /* - * @start is neither in a hole nor at the beginning of a run. - * - * If @end is in a hole, things are easier as simply truncating the run - * @start is in to end at @start - 1, deleting all runs after that up - * to @end, and finally extending the beginning of the run @end is in - * to be @start is all that is needed. - */ - if (rl_end->lcn == LCN_HOLE) { - /* Truncate the run containing @start. */ - rl->length = start - rl->vcn; - rl++; - /* Cut out all runlist elements up to @end. */ - if (rl < rl_end) - memmove(rl, rl_end, (rl_real_end - rl_end + 1) * - sizeof(*rl)); - /* Extend the beginning of the run @end is in to be @start. */ - rl->vcn = start; - rl->length = rl[1].vcn - start; - goto shrink_allocation; - } - /* - * If @end is not in a hole there are still two cases to distinguish. - * Either @end is or is not in the same run as @start. - * - * The second case is easier as it can be reduced to an already solved - * problem by truncating the run @start is in to end at @start - 1. - * Then, if @end is in the next run need to split the run into a sparse - * run followed by a non-sparse run (already covered above) and if @end - * is not in the next run switching it to be sparse, again reduces the - * problem to the already covered case of "@start is in a hole". - */ - if (end >= rl[1].vcn) { - /* - * If @end is not in the next run, reduce the problem to the - * case of "@start is in a hole". - */ - if (rl[1].length && end >= rl[2].vcn) { - /* Truncate the run containing @start. 
*/ - rl->length = start - rl->vcn; - rl++; - rl->vcn = start; - rl->lcn = LCN_HOLE; - goto extend_hole; - } - trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1); - if (IS_ERR(trl)) - goto enomem_out; - old_size++; - if (runlist->rl != trl) { - rl = trl + (rl - runlist->rl); - rl_end = trl + (rl_end - runlist->rl); - rl_real_end = trl + (rl_real_end - runlist->rl); - runlist->rl = trl; - } - /* Truncate the run containing @start. */ - rl->length = start - rl->vcn; - rl++; - /* - * @end is in the next run, reduce the problem to the case - * where "@start is at the beginning of a run and @end is in - * the same run as @start". - */ - delta = rl->vcn - start; - rl->vcn = start; - if (rl->lcn >= 0) { - rl->lcn -= delta; - /* Need this in case the lcn just became negative. */ - lcn_fixup = true; - } - rl->length += delta; - goto split_end; - } - /* - * The first case from above, i.e. @end is in the same run as @start. - * We need to split the run into three. One run for the non-sparse - * region between the beginning of the old run and @start, one for the - * sparse region between @start and @end, and one for the remaining - * non-sparse region, i.e. between @end and the end of the old run. - */ - trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 2); - if (IS_ERR(trl)) - goto enomem_out; - old_size += 2; - if (runlist->rl != trl) { - rl = trl + (rl - runlist->rl); - rl_end = trl + (rl_end - runlist->rl); - rl_real_end = trl + (rl_real_end - runlist->rl); - runlist->rl = trl; - } - /* Shift all the runs up by two. */ - memmove(rl + 2, rl, (rl_real_end - rl + 1) * sizeof(*rl)); - /* Finally, setup the three split runs. */ - rl->length = start - rl->vcn; - rl++; - rl->vcn = start; - rl->lcn = LCN_HOLE; - rl->length = length; - rl++; - delta = end - rl->vcn; - rl->vcn = end; - rl->lcn += delta; - rl->length -= delta; - ntfs_debug("Done (split both)."); - return 0; -enomem_out: - ntfs_error(vol->sb, "Not enough memory to extend runlist buffer."); - return -ENOMEM; -} - -#endif /* NTFS_RW */ diff --git a/fs/ntfs/runlist.h b/fs/ntfs/runlist.h deleted file mode 100644 index 38de0a375f59..000000000000 --- a/fs/ntfs/runlist.h +++ /dev/null @@ -1,88 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * runlist.h - Defines for runlist handling in NTFS Linux kernel driver. - * Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2005 Anton Altaparmakov - * Copyright (c) 2002 Richard Russon - */ - -#ifndef _LINUX_NTFS_RUNLIST_H -#define _LINUX_NTFS_RUNLIST_H - -#include "types.h" -#include "layout.h" -#include "volume.h" - -/** - * runlist_element - in memory vcn to lcn mapping array element - * @vcn: starting vcn of the current array element - * @lcn: starting lcn of the current array element - * @length: length in clusters of the current array element - * - * The last vcn (in fact the last vcn + 1) is reached when length == 0. - * - * When lcn == -1 this means that the count vcns starting at vcn are not - * physically allocated (i.e. this is a hole / data is sparse). - */ -typedef struct { /* In memory vcn to lcn mapping structure element. */ - VCN vcn; /* vcn = Starting virtual cluster number. */ - LCN lcn; /* lcn = Starting logical cluster number. */ - s64 length; /* Run length in clusters. 
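The case analysis above is easier to follow on concrete values. A worked example (made-up lcns):

/*
 * Worked example of ntfs_rl_punch_nolock() on the runlist
 * [ {vcn 0, lcn 100, len 10}, {vcn 10, LCN_ENOENT, len 0} ]:
 *
 * punch(start 0, length 4) - @start at the beginning of the run:
 *   the front of the run is switched to sparse and the run split in two:
 *   [ {0, LCN_HOLE, 4}, {4, 104, 6}, {10, LCN_ENOENT, 0} ]
 *
 * punch(start 4, length 6) - the hole reaches the end of the run:
 *   the run is truncated and a sparse run appended:
 *   [ {0, 100, 4}, {4, LCN_HOLE, 6}, {10, LCN_ENOENT, 0} ]
 *
 * punch(start 3, length 4) - the hole strictly inside the run, the
 *   "split both" case needing two extra elements:
 *   [ {0, 100, 3}, {3, LCN_HOLE, 4}, {7, 107, 3}, {10, LCN_ENOENT, 0} ]
 *
 * Punching inside an existing hole only ever extends that hole.
 */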
*/ -} runlist_element; - -/** - * runlist - in memory vcn to lcn mapping array including a read/write lock - * @rl: pointer to an array of runlist elements - * @lock: read/write spinlock for serializing access to @rl - * - */ -typedef struct { - runlist_element *rl; - struct rw_semaphore lock; -} runlist; - -static inline void ntfs_init_runlist(runlist *rl) -{ - rl->rl = NULL; - init_rwsem(&rl->lock); -} - -typedef enum { - LCN_HOLE = -1, /* Keep this as highest value or die! */ - LCN_RL_NOT_MAPPED = -2, - LCN_ENOENT = -3, - LCN_ENOMEM = -4, - LCN_EIO = -5, -} LCN_SPECIAL_VALUES; - -extern runlist_element *ntfs_runlists_merge(runlist_element *drl, - runlist_element *srl); - -extern runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol, - const ATTR_RECORD *attr, runlist_element *old_rl); - -extern LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn); - -#ifdef NTFS_RW - -extern runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl, - const VCN vcn); - -extern int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol, - const runlist_element *rl, const VCN first_vcn, - const VCN last_vcn); - -extern int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst, - const int dst_len, const runlist_element *rl, - const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn); - -extern int ntfs_rl_truncate_nolock(const ntfs_volume *vol, - runlist *const runlist, const s64 new_length); - -int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist, - const VCN start, const s64 length); - -#endif /* NTFS_RW */ - -#endif /* _LINUX_NTFS_RUNLIST_H */ diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c deleted file mode 100644 index 56a7d5bd33e4..000000000000 --- a/fs/ntfs/super.c +++ /dev/null @@ -1,3202 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc. - * Copyright (c) 2001,2002 Richard Russon - */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/stddef.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/spinlock.h> -#include <linux/blkdev.h> /* For bdev_logical_block_size(). */ -#include <linux/backing-dev.h> -#include <linux/buffer_head.h> -#include <linux/vfs.h> -#include <linux/moduleparam.h> -#include <linux/bitmap.h> - -#include "sysctl.h" -#include "logfile.h" -#include "quota.h" -#include "usnjrnl.h" -#include "dir.h" -#include "debug.h" -#include "index.h" -#include "inode.h" -#include "aops.h" -#include "layout.h" -#include "malloc.h" -#include "ntfs.h" - -/* Number of mounted filesystems which have compression enabled. */ -static unsigned long ntfs_nr_compression_users; - -/* A global default upcase table and a corresponding reference count. */ -static ntfschar *default_upcase; -static unsigned long ntfs_nr_upcase_users; - -/* Error constants/strings used in inode.c::ntfs_show_options(). */ -typedef enum { - /* One of these must be present, default is ON_ERRORS_CONTINUE. */ - ON_ERRORS_PANIC = 0x01, - ON_ERRORS_REMOUNT_RO = 0x02, - ON_ERRORS_CONTINUE = 0x04, - /* Optional, can be combined with any of the above. 
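ntfs_rl_vcn_to_lcn(), declared above, walks exactly this array. A minimal user-space version of the lookup, reconstructed from the structure definition above rather than copied from the driver (illustrative):

#include <stddef.h>

typedef long long VCN;
typedef long long LCN;
enum { RLE_LCN_HOLE = -1, RLE_LCN_RL_NOT_MAPPED = -2, RLE_LCN_ENOENT = -3 };

struct rle { VCN vcn; LCN lcn; long long length; };

/*
 * Map @vcn to an lcn using runlist @rl (terminated by a length-0
 * element). Returns the special lcn for holes/unmapped runs, the
 * run's lcn plus the offset within the run for real runs, and
 * RLE_LCN_ENOENT past the end of the runlist.
 */
static LCN rl_vcn_to_lcn(const struct rle *rl, VCN vcn)
{
	if (!rl)
		return RLE_LCN_RL_NOT_MAPPED;
	if (vcn < rl[0].vcn)
		return RLE_LCN_ENOENT;
	for (; rl->length; rl++) {
		if (vcn < rl[1].vcn) {
			if (rl->lcn >= 0)
				return rl->lcn + (vcn - rl->vcn);
			return rl->lcn;
		}
	}
	return RLE_LCN_ENOENT;
}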
*/ - ON_ERRORS_RECOVER = 0x10, -} ON_ERRORS_ACTIONS; - -const option_t on_errors_arr[] = { - { ON_ERRORS_PANIC, "panic" }, - { ON_ERRORS_REMOUNT_RO, "remount-ro", }, - { ON_ERRORS_CONTINUE, "continue", }, - { ON_ERRORS_RECOVER, "recover" }, - { 0, NULL } -}; - -/** - * simple_getbool - convert input string to a boolean value - * @s: input string to convert - * @setval: where to store the output boolean value - * - * Copied from old ntfs driver (which copied from vfat driver). - * - * "1", "yes", "true", or an empty string are converted to %true. - * "0", "no", and "false" are converted to %false. - * - * Return: %1 if the string is converted or was empty and *setval contains it; - * %0 if the string was not valid. - */ -static int simple_getbool(char *s, bool *setval) -{ - if (s) { - if (!strcmp(s, "1") || !strcmp(s, "yes") || !strcmp(s, "true")) - *setval = true; - else if (!strcmp(s, "0") || !strcmp(s, "no") || - !strcmp(s, "false")) - *setval = false; - else - return 0; - } else - *setval = true; - return 1; -} - -/** - * parse_options - parse the (re)mount options - * @vol: ntfs volume - * @opt: string containing the (re)mount options - * - * Parse the recognized options in @opt for the ntfs volume described by @vol. - */ -static bool parse_options(ntfs_volume *vol, char *opt) -{ - char *p, *v, *ov; - static char *utf8 = "utf8"; - int errors = 0, sloppy = 0; - kuid_t uid = INVALID_UID; - kgid_t gid = INVALID_GID; - umode_t fmask = (umode_t)-1, dmask = (umode_t)-1; - int mft_zone_multiplier = -1, on_errors = -1; - int show_sys_files = -1, case_sensitive = -1, disable_sparse = -1; - struct nls_table *nls_map = NULL, *old_nls; - - /* I am lazy... (-8 */ -#define NTFS_GETOPT_WITH_DEFAULT(option, variable, default_value) \ - if (!strcmp(p, option)) { \ - if (!v || !*v) \ - variable = default_value; \ - else { \ - variable = simple_strtoul(ov = v, &v, 0); \ - if (*v) \ - goto needs_val; \ - } \ - } -#define NTFS_GETOPT(option, variable) \ - if (!strcmp(p, option)) { \ - if (!v || !*v) \ - goto needs_arg; \ - variable = simple_strtoul(ov = v, &v, 0); \ - if (*v) \ - goto needs_val; \ - } -#define NTFS_GETOPT_UID(option, variable) \ - if (!strcmp(p, option)) { \ - uid_t uid_value; \ - if (!v || !*v) \ - goto needs_arg; \ - uid_value = simple_strtoul(ov = v, &v, 0); \ - if (*v) \ - goto needs_val; \ - variable = make_kuid(current_user_ns(), uid_value); \ - if (!uid_valid(variable)) \ - goto needs_val; \ - } -#define NTFS_GETOPT_GID(option, variable) \ - if (!strcmp(p, option)) { \ - gid_t gid_value; \ - if (!v || !*v) \ - goto needs_arg; \ - gid_value = simple_strtoul(ov = v, &v, 0); \ - if (*v) \ - goto needs_val; \ - variable = make_kgid(current_user_ns(), gid_value); \ - if (!gid_valid(variable)) \ - goto needs_val; \ - } -#define NTFS_GETOPT_OCTAL(option, variable) \ - if (!strcmp(p, option)) { \ - if (!v || !*v) \ - goto needs_arg; \ - variable = simple_strtoul(ov = v, &v, 8); \ - if (*v) \ - goto needs_val; \ - } -#define NTFS_GETOPT_BOOL(option, variable) \ - if (!strcmp(p, option)) { \ - bool val; \ - if (!simple_getbool(v, &val)) \ - goto needs_bool; \ - variable = val; \ - } -#define NTFS_GETOPT_OPTIONS_ARRAY(option, variable, opt_array) \ - if (!strcmp(p, option)) { \ - int _i; \ - if (!v || !*v) \ - goto needs_arg; \ - ov = v; \ - if (variable == -1) \ - variable = 0; \ - for (_i = 0; opt_array[_i].str && *opt_array[_i].str; _i++) \ - if (!strcmp(opt_array[_i].str, v)) { \ - variable |= opt_array[_i].val; \ - break; \ - } \ - if (!opt_array[_i].str || !*opt_array[_i].str) \ - goto 
needs_val; \ - } - if (!opt || !*opt) - goto no_mount_options; - ntfs_debug("Entering with mount options string: %s", opt); - while ((p = strsep(&opt, ","))) { - if ((v = strchr(p, '='))) - *v++ = 0; - NTFS_GETOPT_UID("uid", uid) - else NTFS_GETOPT_GID("gid", gid) - else NTFS_GETOPT_OCTAL("umask", fmask = dmask) - else NTFS_GETOPT_OCTAL("fmask", fmask) - else NTFS_GETOPT_OCTAL("dmask", dmask) - else NTFS_GETOPT("mft_zone_multiplier", mft_zone_multiplier) - else NTFS_GETOPT_WITH_DEFAULT("sloppy", sloppy, true) - else NTFS_GETOPT_BOOL("show_sys_files", show_sys_files) - else NTFS_GETOPT_BOOL("case_sensitive", case_sensitive) - else NTFS_GETOPT_BOOL("disable_sparse", disable_sparse) - else NTFS_GETOPT_OPTIONS_ARRAY("errors", on_errors, - on_errors_arr) - else if (!strcmp(p, "posix") || !strcmp(p, "show_inodes")) - ntfs_warning(vol->sb, "Ignoring obsolete option %s.", - p); - else if (!strcmp(p, "nls") || !strcmp(p, "iocharset")) { - if (!strcmp(p, "iocharset")) - ntfs_warning(vol->sb, "Option iocharset is " - "deprecated. Please use " - "option nls=<charsetname> in " - "the future."); - if (!v || !*v) - goto needs_arg; -use_utf8: - old_nls = nls_map; - nls_map = load_nls(v); - if (!nls_map) { - if (!old_nls) { - ntfs_error(vol->sb, "NLS character set " - "%s not found.", v); - return false; - } - ntfs_error(vol->sb, "NLS character set %s not " - "found. Using previous one %s.", - v, old_nls->charset); - nls_map = old_nls; - } else /* nls_map */ { - unload_nls(old_nls); - } - } else if (!strcmp(p, "utf8")) { - bool val = false; - ntfs_warning(vol->sb, "Option utf8 is no longer " - "supported, using option nls=utf8. Please " - "use option nls=utf8 in the future and " - "make sure utf8 is compiled either as a " - "module or into the kernel."); - if (!v || !*v) - val = true; - else if (!simple_getbool(v, &val)) - goto needs_bool; - if (val) { - v = utf8; - goto use_utf8; - } - } else { - ntfs_error(vol->sb, "Unrecognized mount option %s.", p); - if (errors < INT_MAX) - errors++; - } -#undef NTFS_GETOPT_OPTIONS_ARRAY -#undef NTFS_GETOPT_BOOL -#undef NTFS_GETOPT -#undef NTFS_GETOPT_WITH_DEFAULT - } -no_mount_options: - if (errors && !sloppy) - return false; - if (sloppy) - ntfs_warning(vol->sb, "Sloppy option given. Ignoring " - "unrecognized mount option(s) and continuing."); - /* Keep this first! */ - if (on_errors != -1) { - if (!on_errors) { - ntfs_error(vol->sb, "Invalid errors option argument " - "or bug in options parser."); - return false; - } - } - if (nls_map) { - if (vol->nls_map && vol->nls_map != nls_map) { - ntfs_error(vol->sb, "Cannot change NLS character set " - "on remount."); - return false; - } /* else (!vol->nls_map) */ - ntfs_debug("Using NLS character set %s.", nls_map->charset); - vol->nls_map = nls_map; - } else /* (!nls_map) */ { - if (!vol->nls_map) { - vol->nls_map = load_nls_default(); - if (!vol->nls_map) { - ntfs_error(vol->sb, "Failed to load default " - "NLS character set."); - return false; - } - ntfs_debug("Using default NLS character set (%s).", - vol->nls_map->charset); - } - } - if (mft_zone_multiplier != -1) { - if (vol->mft_zone_multiplier && vol->mft_zone_multiplier != - mft_zone_multiplier) { - ntfs_error(vol->sb, "Cannot change mft_zone_multiplier " - "on remount."); - return false; - } - if (mft_zone_multiplier < 1 || mft_zone_multiplier > 4) { - ntfs_error(vol->sb, "Invalid mft_zone_multiplier. " - "Using default value, i.e. 
1."); - mft_zone_multiplier = 1; - } - vol->mft_zone_multiplier = mft_zone_multiplier; - } - if (!vol->mft_zone_multiplier) - vol->mft_zone_multiplier = 1; - if (on_errors != -1) - vol->on_errors = on_errors; - if (!vol->on_errors || vol->on_errors == ON_ERRORS_RECOVER) - vol->on_errors |= ON_ERRORS_CONTINUE; - if (uid_valid(uid)) - vol->uid = uid; - if (gid_valid(gid)) - vol->gid = gid; - if (fmask != (umode_t)-1) - vol->fmask = fmask; - if (dmask != (umode_t)-1) - vol->dmask = dmask; - if (show_sys_files != -1) { - if (show_sys_files) - NVolSetShowSystemFiles(vol); - else - NVolClearShowSystemFiles(vol); - } - if (case_sensitive != -1) { - if (case_sensitive) - NVolSetCaseSensitive(vol); - else - NVolClearCaseSensitive(vol); - } - if (disable_sparse != -1) { - if (disable_sparse) - NVolClearSparseEnabled(vol); - else { - if (!NVolSparseEnabled(vol) && - vol->major_ver && vol->major_ver < 3) - ntfs_warning(vol->sb, "Not enabling sparse " - "support due to NTFS volume " - "version %i.%i (need at least " - "version 3.0).", vol->major_ver, - vol->minor_ver); - else - NVolSetSparseEnabled(vol); - } - } - return true; -needs_arg: - ntfs_error(vol->sb, "The %s option requires an argument.", p); - return false; -needs_bool: - ntfs_error(vol->sb, "The %s option requires a boolean argument.", p); - return false; -needs_val: - ntfs_error(vol->sb, "Invalid %s option argument: %s", p, ov); - return false; -} - -#ifdef NTFS_RW - -/** - * ntfs_write_volume_flags - write new flags to the volume information flags - * @vol: ntfs volume on which to modify the flags - * @flags: new flags value for the volume information flags - * - * Internal function. You probably want to use ntfs_{set,clear}_volume_flags() - * instead (see below). - * - * Replace the volume information flags on the volume @vol with the value - * supplied in @flags. Note, this overwrites the volume information flags, so - * make sure to combine the flags you want to modify with the old flags and use - * the result when calling ntfs_write_volume_flags(). - * - * Return 0 on success and -errno on error. - */ -static int ntfs_write_volume_flags(ntfs_volume *vol, const VOLUME_FLAGS flags) -{ - ntfs_inode *ni = NTFS_I(vol->vol_ino); - MFT_RECORD *m; - VOLUME_INFORMATION *vi; - ntfs_attr_search_ctx *ctx; - int err; - - ntfs_debug("Entering, old flags = 0x%x, new flags = 0x%x.", - le16_to_cpu(vol->vol_flags), le16_to_cpu(flags)); - if (vol->vol_flags == flags) - goto done; - BUG_ON(!ni); - m = map_mft_record(ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(ni, m); - if (!ctx) { - err = -ENOMEM; - goto put_unm_err_out; - } - err = ntfs_attr_lookup(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0, - ctx); - if (err) - goto put_unm_err_out; - vi = (VOLUME_INFORMATION*)((u8*)ctx->attr + - le16_to_cpu(ctx->attr->data.resident.value_offset)); - vol->vol_flags = vi->flags = flags; - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(ni); -done: - ntfs_debug("Done."); - return 0; -put_unm_err_out: - if (ctx) - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(ni); -err_out: - ntfs_error(vol->sb, "Failed with error code %i.", -err); - return err; -} - -/** - * ntfs_set_volume_flags - set bits in the volume information flags - * @vol: ntfs volume on which to modify the flags - * @flags: flags to set on the volume - * - * Set the bits in @flags in the volume information flags on the volume @vol. 
- * - * Return 0 on success and -errno on error. - */ -static inline int ntfs_set_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags) -{ - flags &= VOLUME_FLAGS_MASK; - return ntfs_write_volume_flags(vol, vol->vol_flags | flags); -} - -/** - * ntfs_clear_volume_flags - clear bits in the volume information flags - * @vol: ntfs volume on which to modify the flags - * @flags: flags to clear on the volume - * - * Clear the bits in @flags in the volume information flags on the volume @vol. - * - * Return 0 on success and -errno on error. - */ -static inline int ntfs_clear_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags) -{ - flags &= VOLUME_FLAGS_MASK; - flags = vol->vol_flags & cpu_to_le16(~le16_to_cpu(flags)); - return ntfs_write_volume_flags(vol, flags); -} - -#endif /* NTFS_RW */ - -/** - * ntfs_remount - change the mount options of a mounted ntfs filesystem - * @sb: superblock of mounted ntfs filesystem - * @flags: remount flags - * @opt: remount options string - * - * Change the mount options of an already mounted ntfs filesystem. - * - * NOTE: The VFS sets the @sb->s_flags remount flags to @flags after - * ntfs_remount() returns successfully (i.e. returns 0). Otherwise, - * @sb->s_flags are not changed. - */ -static int ntfs_remount(struct super_block *sb, int *flags, char *opt) -{ - ntfs_volume *vol = NTFS_SB(sb); - - ntfs_debug("Entering with remount options string: %s", opt); - - sync_filesystem(sb); - -#ifndef NTFS_RW - /* For read-only compiled driver, enforce read-only flag. */ - *flags |= SB_RDONLY; -#else /* NTFS_RW */ - /* - * For the read-write compiled driver, if we are remounting read-write, - * make sure there are no volume errors and that no unsupported volume - * flags are set. Also, empty the logfile journal as it would become - * stale as soon as something is written to the volume and mark the - * volume dirty so that chkdsk is run if the volume is not umounted - * cleanly. Finally, mark the quotas out of date so Windows rescans - * the volume on boot and updates them. - * - * When remounting read-only, mark the volume clean if no volume errors - * have occurred. - */ - if (sb_rdonly(sb) && !(*flags & SB_RDONLY)) { - static const char *es = ". Cannot remount read-write."; - - /* Remounting read-write. */ - if (NVolErrors(vol)) { - ntfs_error(sb, "Volume has errors and is read-only%s", - es); - return -EROFS; - } - if (vol->vol_flags & VOLUME_IS_DIRTY) { - ntfs_error(sb, "Volume is dirty and read-only%s", es); - return -EROFS; - } - if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) { - ntfs_error(sb, "Volume has been modified by chkdsk " - "and is read-only%s", es); - return -EROFS; - } - if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { - ntfs_error(sb, "Volume has unsupported flags set " - "(0x%x) and is read-only%s", - (unsigned)le16_to_cpu(vol->vol_flags), - es); - return -EROFS; - } - if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { - ntfs_error(sb, "Failed to set dirty bit in volume " - "information flags%s", es); - return -EROFS; - } -#if 0 - // TODO: Enable this code once we start modifying anything that - // is different between NTFS 1.2 and 3.x... - /* Set NT4 compatibility flag on newer NTFS version volumes. 
*/ - if ((vol->major_ver > 1)) { - if (ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) { - ntfs_error(sb, "Failed to set NT4 " - "compatibility flag%s", es); - NVolSetErrors(vol); - return -EROFS; - } - } -#endif - if (!ntfs_empty_logfile(vol->logfile_ino)) { - ntfs_error(sb, "Failed to empty journal $LogFile%s", - es); - NVolSetErrors(vol); - return -EROFS; - } - if (!ntfs_mark_quotas_out_of_date(vol)) { - ntfs_error(sb, "Failed to mark quotas out of date%s", - es); - NVolSetErrors(vol); - return -EROFS; - } - if (!ntfs_stamp_usnjrnl(vol)) { - ntfs_error(sb, "Failed to stamp transaction log " - "($UsnJrnl)%s", es); - NVolSetErrors(vol); - return -EROFS; - } - } else if (!sb_rdonly(sb) && (*flags & SB_RDONLY)) { - /* Remounting read-only. */ - if (!NVolErrors(vol)) { - if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY)) - ntfs_warning(sb, "Failed to clear dirty bit " - "in volume information " - "flags. Run chkdsk."); - } - } -#endif /* NTFS_RW */ - - // TODO: Deal with *flags. - - if (!parse_options(vol, opt)) - return -EINVAL; - - ntfs_debug("Done."); - return 0; -} - -/** - * is_boot_sector_ntfs - check whether a boot sector is a valid NTFS boot sector - * @sb: Super block of the device to which @b belongs. - * @b: Boot sector of device @sb to check. - * @silent: If 'true', all output will be silenced. - * - * is_boot_sector_ntfs() checks whether the boot sector @b is a valid NTFS boot - * sector. Returns 'true' if it is valid and 'false' if not. - * - * @sb is only needed for warning/error output, i.e. it can be NULL when silent - * is 'true'. - */ -static bool is_boot_sector_ntfs(const struct super_block *sb, - const NTFS_BOOT_SECTOR *b, const bool silent) -{ - /* - * Check that checksum == sum of u32 values from b to the checksum - * field. If checksum is zero, no checking is done. We will work when - * the checksum test fails, since some utilities update the boot sector - * ignoring the checksum which leaves the checksum out-of-date. We - * report a warning if this is the case. - */ - if ((void*)b < (void*)&b->checksum && b->checksum && !silent) { - le32 *u; - u32 i; - - for (i = 0, u = (le32*)b; u < (le32*)(&b->checksum); ++u) - i += le32_to_cpup(u); - if (le32_to_cpu(b->checksum) != i) - ntfs_warning(sb, "Invalid boot sector checksum."); - } - /* Check OEMidentifier is "NTFS " */ - if (b->oem_id != magicNTFS) - goto not_ntfs; - /* Check bytes per sector value is between 256 and 4096. */ - if (le16_to_cpu(b->bpb.bytes_per_sector) < 0x100 || - le16_to_cpu(b->bpb.bytes_per_sector) > 0x1000) - goto not_ntfs; - /* Check sectors per cluster value is valid. */ - switch (b->bpb.sectors_per_cluster) { - case 1: case 2: case 4: case 8: case 16: case 32: case 64: case 128: - break; - default: - goto not_ntfs; - } - /* Check the cluster size is not above the maximum (64kiB). */ - if ((u32)le16_to_cpu(b->bpb.bytes_per_sector) * - b->bpb.sectors_per_cluster > NTFS_MAX_CLUSTER_SIZE) - goto not_ntfs; - /* Check reserved/unused fields are really zero. */ - if (le16_to_cpu(b->bpb.reserved_sectors) || - le16_to_cpu(b->bpb.root_entries) || - le16_to_cpu(b->bpb.sectors) || - le16_to_cpu(b->bpb.sectors_per_fat) || - le32_to_cpu(b->bpb.large_sectors) || b->bpb.fats) - goto not_ntfs; - /* Check clusters per file mft record value is valid. 
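The clusters-per-mft-record field validated next is a signed byte: positive values are a power-of-two cluster count, while values in the 0xe1-0xf7 range are negative exponents (-31 to -9) giving the record size in bytes directly. A small decode sketch (illustrative):

#include <stdio.h>
#include <stdint.h>

/* Decode the boot sector's clusters_per_mft_record field. */
static unsigned decode_mft_record_size(int8_t field, unsigned cluster_size)
{
	if (field > 0)
		return cluster_size * (unsigned)field;
	/* Negative: record size is 2^(-field) bytes, e.g. 0xf6 -> 1024. */
	return 1u << -field;
}

int main(void)
{
	printf("%u\n", decode_mft_record_size((int8_t)0xf6, 4096)); /* 1024 */
	printf("%u\n", decode_mft_record_size(1, 512));		    /* 512 */
	return 0;
}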
*/ - if ((u8)b->clusters_per_mft_record < 0xe1 || - (u8)b->clusters_per_mft_record > 0xf7) - switch (b->clusters_per_mft_record) { - case 1: case 2: case 4: case 8: case 16: case 32: case 64: - break; - default: - goto not_ntfs; - } - /* Check clusters per index block value is valid. */ - if ((u8)b->clusters_per_index_record < 0xe1 || - (u8)b->clusters_per_index_record > 0xf7) - switch (b->clusters_per_index_record) { - case 1: case 2: case 4: case 8: case 16: case 32: case 64: - break; - default: - goto not_ntfs; - } - /* - * Check for valid end of sector marker. We will work without it, but - * many BIOSes will refuse to boot from a bootsector if the magic is - * incorrect, so we emit a warning. - */ - if (!silent && b->end_of_sector_marker != cpu_to_le16(0xaa55)) - ntfs_warning(sb, "Invalid end of sector marker."); - return true; -not_ntfs: - return false; -} - -/** - * read_ntfs_boot_sector - read the NTFS boot sector of a device - * @sb: super block of device to read the boot sector from - * @silent: if true, suppress all output - * - * Reads the boot sector from the device and validates it. If that fails, tries - * to read the backup boot sector, first from the end of the device a-la NT4 and - * later and then from the middle of the device a-la NT3.51 and before. - * - * If a valid boot sector is found but it is not the primary boot sector, we - * repair the primary boot sector silently (unless the device is read-only or - * the primary boot sector is not accessible). - * - * NOTE: To call this function, @sb must have the fields s_dev, the ntfs super - * block (u.ntfs_sb), nr_blocks and the device flags (s_flags) initialized - * to their respective values. - * - * Return the unlocked buffer head containing the boot sector or NULL on error. - */ -static struct buffer_head *read_ntfs_boot_sector(struct super_block *sb, - const int silent) -{ - const char *read_err_str = "Unable to read %s boot sector."; - struct buffer_head *bh_primary, *bh_backup; - sector_t nr_blocks = NTFS_SB(sb)->nr_blocks; - - /* Try to read primary boot sector. */ - if ((bh_primary = sb_bread(sb, 0))) { - if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*) - bh_primary->b_data, silent)) - return bh_primary; - if (!silent) - ntfs_error(sb, "Primary boot sector is invalid."); - } else if (!silent) - ntfs_error(sb, read_err_str, "primary"); - if (!(NTFS_SB(sb)->on_errors & ON_ERRORS_RECOVER)) { - if (bh_primary) - brelse(bh_primary); - if (!silent) - ntfs_error(sb, "Mount option errors=recover not used. " - "Aborting without trying to recover."); - return NULL; - } - /* Try to read NT4+ backup boot sector. */ - if ((bh_backup = sb_bread(sb, nr_blocks - 1))) { - if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*) - bh_backup->b_data, silent)) - goto hotfix_primary_boot_sector; - brelse(bh_backup); - } else if (!silent) - ntfs_error(sb, read_err_str, "backup"); - /* Try to read NT3.51- backup boot sector. */ - if ((bh_backup = sb_bread(sb, nr_blocks >> 1))) { - if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*) - bh_backup->b_data, silent)) - goto hotfix_primary_boot_sector; - if (!silent) - ntfs_error(sb, "Could not find a valid backup boot " - "sector."); - brelse(bh_backup); - } else if (!silent) - ntfs_error(sb, read_err_str, "backup"); - /* We failed. Cleanup and return. 
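Put together, the fallback order implemented above is fixed: the primary boot sector in sector 0, then the NT4-and-later backup in the last sector of the device, then the pre-NT4 backup in the middle. A trivial sketch of the candidate sectors (illustrative device size):

#include <stdio.h>

typedef unsigned long long sector_t;

/* The three sectors tried above, in order. */
int main(void)
{
	sector_t nr_blocks = 2097152;	/* e.g. 1 GiB device, 512-byte blocks */

	printf("primary:        sector 0\n");
	printf("NT4+ backup:    sector %llu\n", nr_blocks - 1);
	printf("NT3.51- backup: sector %llu\n", nr_blocks >> 1);
	return 0;
}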
*/ - if (bh_primary) - brelse(bh_primary); - return NULL; -hotfix_primary_boot_sector: - if (bh_primary) { - /* - * If we managed to read sector zero and the volume is not - * read-only, copy the found, valid backup boot sector to the - * primary boot sector. Note we only copy the actual boot - * sector structure, not the actual whole device sector as that - * may be bigger and would potentially damage the $Boot system - * file (FIXME: Would be nice to know if the backup boot sector - * on a large sector device contains the whole boot loader or - * just the first 512 bytes). - */ - if (!sb_rdonly(sb)) { - ntfs_warning(sb, "Hot-fix: Recovering invalid primary " - "boot sector from backup copy."); - memcpy(bh_primary->b_data, bh_backup->b_data, - NTFS_BLOCK_SIZE); - mark_buffer_dirty(bh_primary); - sync_dirty_buffer(bh_primary); - if (buffer_uptodate(bh_primary)) { - brelse(bh_backup); - return bh_primary; - } - ntfs_error(sb, "Hot-fix: Device write error while " - "recovering primary boot sector."); - } else { - ntfs_warning(sb, "Hot-fix: Recovery of primary boot " - "sector failed: Read-only mount."); - } - brelse(bh_primary); - } - ntfs_warning(sb, "Using backup boot sector."); - return bh_backup; -} - -/** - * parse_ntfs_boot_sector - parse the boot sector and store the data in @vol - * @vol: volume structure to initialise with data from boot sector - * @b: boot sector to parse - * - * Parse the ntfs boot sector @b and store all imporant information therein in - * the ntfs super block @vol. Return 'true' on success and 'false' on error. - */ -static bool parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b) -{ - unsigned int sectors_per_cluster_bits, nr_hidden_sects; - int clusters_per_mft_record, clusters_per_index_record; - s64 ll; - - vol->sector_size = le16_to_cpu(b->bpb.bytes_per_sector); - vol->sector_size_bits = ffs(vol->sector_size) - 1; - ntfs_debug("vol->sector_size = %i (0x%x)", vol->sector_size, - vol->sector_size); - ntfs_debug("vol->sector_size_bits = %i (0x%x)", vol->sector_size_bits, - vol->sector_size_bits); - if (vol->sector_size < vol->sb->s_blocksize) { - ntfs_error(vol->sb, "Sector size (%i) is smaller than the " - "device block size (%lu). This is not " - "supported. Sorry.", vol->sector_size, - vol->sb->s_blocksize); - return false; - } - ntfs_debug("sectors_per_cluster = 0x%x", b->bpb.sectors_per_cluster); - sectors_per_cluster_bits = ffs(b->bpb.sectors_per_cluster) - 1; - ntfs_debug("sectors_per_cluster_bits = 0x%x", - sectors_per_cluster_bits); - nr_hidden_sects = le32_to_cpu(b->bpb.hidden_sectors); - ntfs_debug("number of hidden sectors = 0x%x", nr_hidden_sects); - vol->cluster_size = vol->sector_size << sectors_per_cluster_bits; - vol->cluster_size_mask = vol->cluster_size - 1; - vol->cluster_size_bits = ffs(vol->cluster_size) - 1; - ntfs_debug("vol->cluster_size = %i (0x%x)", vol->cluster_size, - vol->cluster_size); - ntfs_debug("vol->cluster_size_mask = 0x%x", vol->cluster_size_mask); - ntfs_debug("vol->cluster_size_bits = %i", vol->cluster_size_bits); - if (vol->cluster_size < vol->sector_size) { - ntfs_error(vol->sb, "Cluster size (%i) is smaller than the " - "sector size (%i). This is not supported. 
" - "Sorry.", vol->cluster_size, vol->sector_size); - return false; - } - clusters_per_mft_record = b->clusters_per_mft_record; - ntfs_debug("clusters_per_mft_record = %i (0x%x)", - clusters_per_mft_record, clusters_per_mft_record); - if (clusters_per_mft_record > 0) - vol->mft_record_size = vol->cluster_size << - (ffs(clusters_per_mft_record) - 1); - else - /* - * When mft_record_size < cluster_size, clusters_per_mft_record - * = -log2(mft_record_size) bytes. mft_record_size normaly is - * 1024 bytes, which is encoded as 0xF6 (-10 in decimal). - */ - vol->mft_record_size = 1 << -clusters_per_mft_record; - vol->mft_record_size_mask = vol->mft_record_size - 1; - vol->mft_record_size_bits = ffs(vol->mft_record_size) - 1; - ntfs_debug("vol->mft_record_size = %i (0x%x)", vol->mft_record_size, - vol->mft_record_size); - ntfs_debug("vol->mft_record_size_mask = 0x%x", - vol->mft_record_size_mask); - ntfs_debug("vol->mft_record_size_bits = %i (0x%x)", - vol->mft_record_size_bits, vol->mft_record_size_bits); - /* - * We cannot support mft record sizes above the PAGE_SIZE since - * we store $MFT/$DATA, the table of mft records in the page cache. - */ - if (vol->mft_record_size > PAGE_SIZE) { - ntfs_error(vol->sb, "Mft record size (%i) exceeds the " - "PAGE_SIZE on your system (%lu). " - "This is not supported. Sorry.", - vol->mft_record_size, PAGE_SIZE); - return false; - } - /* We cannot support mft record sizes below the sector size. */ - if (vol->mft_record_size < vol->sector_size) { - ntfs_error(vol->sb, "Mft record size (%i) is smaller than the " - "sector size (%i). This is not supported. " - "Sorry.", vol->mft_record_size, - vol->sector_size); - return false; - } - clusters_per_index_record = b->clusters_per_index_record; - ntfs_debug("clusters_per_index_record = %i (0x%x)", - clusters_per_index_record, clusters_per_index_record); - if (clusters_per_index_record > 0) - vol->index_record_size = vol->cluster_size << - (ffs(clusters_per_index_record) - 1); - else - /* - * When index_record_size < cluster_size, - * clusters_per_index_record = -log2(index_record_size) bytes. - * index_record_size normaly equals 4096 bytes, which is - * encoded as 0xF4 (-12 in decimal). - */ - vol->index_record_size = 1 << -clusters_per_index_record; - vol->index_record_size_mask = vol->index_record_size - 1; - vol->index_record_size_bits = ffs(vol->index_record_size) - 1; - ntfs_debug("vol->index_record_size = %i (0x%x)", - vol->index_record_size, vol->index_record_size); - ntfs_debug("vol->index_record_size_mask = 0x%x", - vol->index_record_size_mask); - ntfs_debug("vol->index_record_size_bits = %i (0x%x)", - vol->index_record_size_bits, - vol->index_record_size_bits); - /* We cannot support index record sizes below the sector size. */ - if (vol->index_record_size < vol->sector_size) { - ntfs_error(vol->sb, "Index record size (%i) is smaller than " - "the sector size (%i). This is not " - "supported. Sorry.", vol->index_record_size, - vol->sector_size); - return false; - } - /* - * Get the size of the volume in clusters and check for 64-bit-ness. - * Windows currently only uses 32 bits to save the clusters so we do - * the same as it is much faster on 32-bit CPUs. - */ - ll = sle64_to_cpu(b->number_of_sectors) >> sectors_per_cluster_bits; - if ((u64)ll >= 1ULL << 32) { - ntfs_error(vol->sb, "Cannot handle 64-bit clusters. 
Sorry."); - return false; - } - vol->nr_clusters = ll; - ntfs_debug("vol->nr_clusters = 0x%llx", (long long)vol->nr_clusters); - /* - * On an architecture where unsigned long is 32-bits, we restrict the - * volume size to 2TiB (2^41). On a 64-bit architecture, the compiler - * will hopefully optimize the whole check away. - */ - if (sizeof(unsigned long) < 8) { - if ((ll << vol->cluster_size_bits) >= (1ULL << 41)) { - ntfs_error(vol->sb, "Volume size (%lluTiB) is too " - "large for this architecture. " - "Maximum supported is 2TiB. Sorry.", - (unsigned long long)ll >> (40 - - vol->cluster_size_bits)); - return false; - } - } - ll = sle64_to_cpu(b->mft_lcn); - if (ll >= vol->nr_clusters) { - ntfs_error(vol->sb, "MFT LCN (%lli, 0x%llx) is beyond end of " - "volume. Weird.", (unsigned long long)ll, - (unsigned long long)ll); - return false; - } - vol->mft_lcn = ll; - ntfs_debug("vol->mft_lcn = 0x%llx", (long long)vol->mft_lcn); - ll = sle64_to_cpu(b->mftmirr_lcn); - if (ll >= vol->nr_clusters) { - ntfs_error(vol->sb, "MFTMirr LCN (%lli, 0x%llx) is beyond end " - "of volume. Weird.", (unsigned long long)ll, - (unsigned long long)ll); - return false; - } - vol->mftmirr_lcn = ll; - ntfs_debug("vol->mftmirr_lcn = 0x%llx", (long long)vol->mftmirr_lcn); -#ifdef NTFS_RW - /* - * Work out the size of the mft mirror in number of mft records. If the - * cluster size is less than or equal to the size taken by four mft - * records, the mft mirror stores the first four mft records. If the - * cluster size is bigger than the size taken by four mft records, the - * mft mirror contains as many mft records as will fit into one - * cluster. - */ - if (vol->cluster_size <= (4 << vol->mft_record_size_bits)) - vol->mftmirr_size = 4; - else - vol->mftmirr_size = vol->cluster_size >> - vol->mft_record_size_bits; - ntfs_debug("vol->mftmirr_size = %i", vol->mftmirr_size); -#endif /* NTFS_RW */ - vol->serial_no = le64_to_cpu(b->volume_serial_number); - ntfs_debug("vol->serial_no = 0x%llx", - (unsigned long long)vol->serial_no); - return true; -} - -/** - * ntfs_setup_allocators - initialize the cluster and mft allocators - * @vol: volume structure for which to setup the allocators - * - * Setup the cluster (lcn) and mft allocators to the starting values. - */ -static void ntfs_setup_allocators(ntfs_volume *vol) -{ -#ifdef NTFS_RW - LCN mft_zone_size, mft_lcn; -#endif /* NTFS_RW */ - - ntfs_debug("vol->mft_zone_multiplier = 0x%x", - vol->mft_zone_multiplier); -#ifdef NTFS_RW - /* Determine the size of the MFT zone. */ - mft_zone_size = vol->nr_clusters; - switch (vol->mft_zone_multiplier) { /* % of volume size in clusters */ - case 4: - mft_zone_size >>= 1; /* 50% */ - break; - case 3: - mft_zone_size = (mft_zone_size + - (mft_zone_size >> 1)) >> 2; /* 37.5% */ - break; - case 2: - mft_zone_size >>= 2; /* 25% */ - break; - /* case 1: */ - default: - mft_zone_size >>= 3; /* 12.5% */ - break; - } - /* Setup the mft zone. */ - vol->mft_zone_start = vol->mft_zone_pos = vol->mft_lcn; - ntfs_debug("vol->mft_zone_pos = 0x%llx", - (unsigned long long)vol->mft_zone_pos); - /* - * Calculate the mft_lcn for an unmodified NTFS volume (see mkntfs - * source) and if the actual mft_lcn is in the expected place or even - * further to the front of the volume, extend the mft_zone to cover the - * beginning of the volume as well. This is in order to protect the - * area reserved for the mft bitmap as well within the mft_zone itself. 
- * On non-standard volumes we do not protect it as the overhead would - * be higher than the speed increase we would get by doing it. - */ - mft_lcn = (8192 + 2 * vol->cluster_size - 1) / vol->cluster_size; - if (mft_lcn * vol->cluster_size < 16 * 1024) - mft_lcn = (16 * 1024 + vol->cluster_size - 1) / - vol->cluster_size; - if (vol->mft_zone_start <= mft_lcn) - vol->mft_zone_start = 0; - ntfs_debug("vol->mft_zone_start = 0x%llx", - (unsigned long long)vol->mft_zone_start); - /* - * Need to cap the mft zone on non-standard volumes so that it does - * not point outside the boundaries of the volume. We do this by - * halving the zone size until we are inside the volume. - */ - vol->mft_zone_end = vol->mft_lcn + mft_zone_size; - while (vol->mft_zone_end >= vol->nr_clusters) { - mft_zone_size >>= 1; - vol->mft_zone_end = vol->mft_lcn + mft_zone_size; - } - ntfs_debug("vol->mft_zone_end = 0x%llx", - (unsigned long long)vol->mft_zone_end); - /* - * Set the current position within each data zone to the start of the - * respective zone. - */ - vol->data1_zone_pos = vol->mft_zone_end; - ntfs_debug("vol->data1_zone_pos = 0x%llx", - (unsigned long long)vol->data1_zone_pos); - vol->data2_zone_pos = 0; - ntfs_debug("vol->data2_zone_pos = 0x%llx", - (unsigned long long)vol->data2_zone_pos); - - /* Set the mft data allocation position to mft record 24. */ - vol->mft_data_pos = 24; - ntfs_debug("vol->mft_data_pos = 0x%llx", - (unsigned long long)vol->mft_data_pos); -#endif /* NTFS_RW */ -} - -#ifdef NTFS_RW - -/** - * load_and_init_mft_mirror - load and setup the mft mirror inode for a volume - * @vol: ntfs super block describing device whose mft mirror to load - * - * Return 'true' on success or 'false' on error. - */ -static bool load_and_init_mft_mirror(ntfs_volume *vol) -{ - struct inode *tmp_ino; - ntfs_inode *tmp_ni; - - ntfs_debug("Entering."); - /* Get mft mirror inode. */ - tmp_ino = ntfs_iget(vol->sb, FILE_MFTMirr); - if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) { - if (!IS_ERR(tmp_ino)) - iput(tmp_ino); - /* Caller will display error message. */ - return false; - } - /* - * Re-initialize some specifics about $MFTMirr's inode as - * ntfs_read_inode() will have set up the default ones. - */ - /* Set uid and gid to root. */ - tmp_ino->i_uid = GLOBAL_ROOT_UID; - tmp_ino->i_gid = GLOBAL_ROOT_GID; - /* Regular file. No access for anyone. */ - tmp_ino->i_mode = S_IFREG; - /* No VFS initiated operations allowed for $MFTMirr. */ - tmp_ino->i_op = &ntfs_empty_inode_ops; - tmp_ino->i_fop = &ntfs_empty_file_ops; - /* Put in our special address space operations. */ - tmp_ino->i_mapping->a_ops = &ntfs_mst_aops; - tmp_ni = NTFS_I(tmp_ino); - /* The $MFTMirr, like the $MFT is multi sector transfer protected. */ - NInoSetMstProtected(tmp_ni); - NInoSetSparseDisabled(tmp_ni); - /* - * Set up our little cheat allowing us to reuse the async read io - * completion handler for directories. - */ - tmp_ni->itype.index.block_size = vol->mft_record_size; - tmp_ni->itype.index.block_size_bits = vol->mft_record_size_bits; - vol->mftmirr_ino = tmp_ino; - ntfs_debug("Done."); - return true; -} - -/** - * check_mft_mirror - compare contents of the mft mirror with the mft - * @vol: ntfs super block describing device whose mft mirror to check - * - * Return 'true' on success or 'false' on error. - * - * Note, this function also results in the mft mirror runlist being completely - * mapped into memory. The mft mirror write code requires this and will BUG() - * should it find an unmapped runlist element. 
- */ -static bool check_mft_mirror(ntfs_volume *vol) -{ - struct super_block *sb = vol->sb; - ntfs_inode *mirr_ni; - struct page *mft_page, *mirr_page; - u8 *kmft, *kmirr; - runlist_element *rl, rl2[2]; - pgoff_t index; - int mrecs_per_page, i; - - ntfs_debug("Entering."); - /* Compare contents of $MFT and $MFTMirr. */ - mrecs_per_page = PAGE_SIZE / vol->mft_record_size; - BUG_ON(!mrecs_per_page); - BUG_ON(!vol->mftmirr_size); - mft_page = mirr_page = NULL; - kmft = kmirr = NULL; - index = i = 0; - do { - u32 bytes; - - /* Switch pages if necessary. */ - if (!(i % mrecs_per_page)) { - if (index) { - ntfs_unmap_page(mft_page); - ntfs_unmap_page(mirr_page); - } - /* Get the $MFT page. */ - mft_page = ntfs_map_page(vol->mft_ino->i_mapping, - index); - if (IS_ERR(mft_page)) { - ntfs_error(sb, "Failed to read $MFT."); - return false; - } - kmft = page_address(mft_page); - /* Get the $MFTMirr page. */ - mirr_page = ntfs_map_page(vol->mftmirr_ino->i_mapping, - index); - if (IS_ERR(mirr_page)) { - ntfs_error(sb, "Failed to read $MFTMirr."); - goto mft_unmap_out; - } - kmirr = page_address(mirr_page); - ++index; - } - /* Do not check the record if it is not in use. */ - if (((MFT_RECORD*)kmft)->flags & MFT_RECORD_IN_USE) { - /* Make sure the record is ok. */ - if (ntfs_is_baad_recordp((le32*)kmft)) { - ntfs_error(sb, "Incomplete multi sector " - "transfer detected in mft " - "record %i.", i); -mm_unmap_out: - ntfs_unmap_page(mirr_page); -mft_unmap_out: - ntfs_unmap_page(mft_page); - return false; - } - } - /* Do not check the mirror record if it is not in use. */ - if (((MFT_RECORD*)kmirr)->flags & MFT_RECORD_IN_USE) { - if (ntfs_is_baad_recordp((le32*)kmirr)) { - ntfs_error(sb, "Incomplete multi sector " - "transfer detected in mft " - "mirror record %i.", i); - goto mm_unmap_out; - } - } - /* Get the amount of data in the current record. */ - bytes = le32_to_cpu(((MFT_RECORD*)kmft)->bytes_in_use); - if (bytes < sizeof(MFT_RECORD_OLD) || - bytes > vol->mft_record_size || - ntfs_is_baad_recordp((le32*)kmft)) { - bytes = le32_to_cpu(((MFT_RECORD*)kmirr)->bytes_in_use); - if (bytes < sizeof(MFT_RECORD_OLD) || - bytes > vol->mft_record_size || - ntfs_is_baad_recordp((le32*)kmirr)) - bytes = vol->mft_record_size; - } - /* Compare the two records. */ - if (memcmp(kmft, kmirr, bytes)) { - ntfs_error(sb, "$MFT and $MFTMirr (record %i) do not " - "match. Run ntfsfix or chkdsk.", i); - goto mm_unmap_out; - } - kmft += vol->mft_record_size; - kmirr += vol->mft_record_size; - } while (++i < vol->mftmirr_size); - /* Release the last pages. */ - ntfs_unmap_page(mft_page); - ntfs_unmap_page(mirr_page); - - /* Construct the mft mirror runlist by hand. */ - rl2[0].vcn = 0; - rl2[0].lcn = vol->mftmirr_lcn; - rl2[0].length = (vol->mftmirr_size * vol->mft_record_size + - vol->cluster_size - 1) / vol->cluster_size; - rl2[1].vcn = rl2[0].length; - rl2[1].lcn = LCN_ENOENT; - rl2[1].length = 0; - /* - * Because we have just read all of the mft mirror, we know we have - * mapped the full runlist for it. - */ - mirr_ni = NTFS_I(vol->mftmirr_ino); - down_read(&mirr_ni->runlist.lock); - rl = mirr_ni->runlist.rl; - /* Compare the two runlists. They must be identical. */ - i = 0; - do { - if (rl2[i].vcn != rl[i].vcn || rl2[i].lcn != rl[i].lcn || - rl2[i].length != rl[i].length) { - ntfs_error(sb, "$MFTMirr location mismatch. 
" - "Run chkdsk."); - up_read(&mirr_ni->runlist.lock); - return false; - } - } while (rl2[i++].length); - up_read(&mirr_ni->runlist.lock); - ntfs_debug("Done."); - return true; -} - -/** - * load_and_check_logfile - load and check the logfile inode for a volume - * @vol: ntfs super block describing device whose logfile to load - * - * Return 'true' on success or 'false' on error. - */ -static bool load_and_check_logfile(ntfs_volume *vol, - RESTART_PAGE_HEADER **rp) -{ - struct inode *tmp_ino; - - ntfs_debug("Entering."); - tmp_ino = ntfs_iget(vol->sb, FILE_LogFile); - if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) { - if (!IS_ERR(tmp_ino)) - iput(tmp_ino); - /* Caller will display error message. */ - return false; - } - if (!ntfs_check_logfile(tmp_ino, rp)) { - iput(tmp_ino); - /* ntfs_check_logfile() will have displayed error output. */ - return false; - } - NInoSetSparseDisabled(NTFS_I(tmp_ino)); - vol->logfile_ino = tmp_ino; - ntfs_debug("Done."); - return true; -} - -#define NTFS_HIBERFIL_HEADER_SIZE 4096 - -/** - * check_windows_hibernation_status - check if Windows is suspended on a volume - * @vol: ntfs super block of device to check - * - * Check if Windows is hibernated on the ntfs volume @vol. This is done by - * looking for the file hiberfil.sys in the root directory of the volume. If - * the file is not present Windows is definitely not suspended. - * - * If hiberfil.sys exists and is less than 4kiB in size it means Windows is - * definitely suspended (this volume is not the system volume). Caveat: on a - * system with many volumes it is possible that the < 4kiB check is bogus but - * for now this should do fine. - * - * If hiberfil.sys exists and is larger than 4kiB in size, we need to read the - * hiberfil header (which is the first 4kiB). If this begins with "hibr", - * Windows is definitely suspended. If it is completely full of zeroes, - * Windows is definitely not hibernated. Any other case is treated as if - * Windows is suspended. This caters for the above mentioned caveat of a - * system with many volumes where no "hibr" magic would be present and there is - * no zero header. - * - * Return 0 if Windows is not hibernated on the volume, >0 if Windows is - * hibernated on the volume, and -errno on error. - */ -static int check_windows_hibernation_status(ntfs_volume *vol) -{ - MFT_REF mref; - struct inode *vi; - struct page *page; - u32 *kaddr, *kend; - ntfs_name *name = NULL; - int ret = 1; - static const ntfschar hiberfil[13] = { cpu_to_le16('h'), - cpu_to_le16('i'), cpu_to_le16('b'), - cpu_to_le16('e'), cpu_to_le16('r'), - cpu_to_le16('f'), cpu_to_le16('i'), - cpu_to_le16('l'), cpu_to_le16('.'), - cpu_to_le16('s'), cpu_to_le16('y'), - cpu_to_le16('s'), 0 }; - - ntfs_debug("Entering."); - /* - * Find the inode number for the hibernation file by looking up the - * filename hiberfil.sys in the root directory. - */ - inode_lock(vol->root_ino); - mref = ntfs_lookup_inode_by_name(NTFS_I(vol->root_ino), hiberfil, 12, - &name); - inode_unlock(vol->root_ino); - if (IS_ERR_MREF(mref)) { - ret = MREF_ERR(mref); - /* If the file does not exist, Windows is not hibernated. */ - if (ret == -ENOENT) { - ntfs_debug("hiberfil.sys not present. Windows is not " - "hibernated on the volume."); - return 0; - } - /* A real error occurred. */ - ntfs_error(vol->sb, "Failed to find inode number for " - "hiberfil.sys."); - return ret; - } - /* We do not care for the type of match that was found. */ - kfree(name); - /* Get the inode. 
*/ - vi = ntfs_iget(vol->sb, MREF(mref)); - if (IS_ERR(vi) || is_bad_inode(vi)) { - if (!IS_ERR(vi)) - iput(vi); - ntfs_error(vol->sb, "Failed to load hiberfil.sys."); - return IS_ERR(vi) ? PTR_ERR(vi) : -EIO; - } - if (unlikely(i_size_read(vi) < NTFS_HIBERFIL_HEADER_SIZE)) { - ntfs_debug("hiberfil.sys is smaller than 4kiB (0x%llx). " - "Windows is hibernated on the volume. This " - "is not the system volume.", i_size_read(vi)); - goto iput_out; - } - page = ntfs_map_page(vi->i_mapping, 0); - if (IS_ERR(page)) { - ntfs_error(vol->sb, "Failed to read from hiberfil.sys."); - ret = PTR_ERR(page); - goto iput_out; - } - kaddr = (u32*)page_address(page); - if (*(le32*)kaddr == cpu_to_le32(0x72626968)/*'hibr'*/) { - ntfs_debug("Magic \"hibr\" found in hiberfil.sys. Windows is " - "hibernated on the volume. This is the " - "system volume."); - goto unm_iput_out; - } - kend = kaddr + NTFS_HIBERFIL_HEADER_SIZE/sizeof(*kaddr); - do { - if (unlikely(*kaddr)) { - ntfs_debug("hiberfil.sys is larger than 4kiB " - "(0x%llx), does not contain the " - "\"hibr\" magic, and does not have a " - "zero header. Windows is hibernated " - "on the volume. This is not the " - "system volume.", i_size_read(vi)); - goto unm_iput_out; - } - } while (++kaddr < kend); - ntfs_debug("hiberfil.sys contains a zero header. Windows is not " - "hibernated on the volume. This is the system " - "volume."); - ret = 0; -unm_iput_out: - ntfs_unmap_page(page); -iput_out: - iput(vi); - return ret; -} - -/** - * load_and_init_quota - load and setup the quota file for a volume if present - * @vol: ntfs super block describing device whose quota file to load - * - * Return 'true' on success or 'false' on error. If $Quota is not present, we - * leave vol->quota_ino as NULL and return success. - */ -static bool load_and_init_quota(ntfs_volume *vol) -{ - MFT_REF mref; - struct inode *tmp_ino; - ntfs_name *name = NULL; - static const ntfschar Quota[7] = { cpu_to_le16('$'), - cpu_to_le16('Q'), cpu_to_le16('u'), - cpu_to_le16('o'), cpu_to_le16('t'), - cpu_to_le16('a'), 0 }; - static ntfschar Q[3] = { cpu_to_le16('$'), - cpu_to_le16('Q'), 0 }; - - ntfs_debug("Entering."); - /* - * Find the inode number for the quota file by looking up the filename - * $Quota in the extended system files directory $Extend. - */ - inode_lock(vol->extend_ino); - mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), Quota, 6, - &name); - inode_unlock(vol->extend_ino); - if (IS_ERR_MREF(mref)) { - /* - * If the file does not exist, quotas are disabled and have - * never been enabled on this volume, just return success. - */ - if (MREF_ERR(mref) == -ENOENT) { - ntfs_debug("$Quota not present. Volume does not have " - "quotas enabled."); - /* - * No need to try to set quotas out of date if they are - * not enabled. - */ - NVolSetQuotaOutOfDate(vol); - return true; - } - /* A real error occurred. */ - ntfs_error(vol->sb, "Failed to find inode number for $Quota."); - return false; - } - /* We do not care for the type of match that was found. */ - kfree(name); - /* Get the inode. */ - tmp_ino = ntfs_iget(vol->sb, MREF(mref)); - if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) { - if (!IS_ERR(tmp_ino)) - iput(tmp_ino); - ntfs_error(vol->sb, "Failed to load $Quota."); - return false; - } - vol->quota_ino = tmp_ino; - /* Get the $Q index allocation attribute. 
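The hibernation test above reduces to three cases on the first 4 KiB of hiberfil.sys; a self-contained sketch of the same decision, assuming the header has already been read into memory:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define HIBER_HDR_SIZE 4096

/* Returns 1 if the header indicates hibernation, 0 if not. */
static int hiberfil_header_says_hibernated(const uint8_t *hdr)
{
	size_t i;

	if (!memcmp(hdr, "hibr", 4))	/* 0x72626968 little-endian */
		return 1;		/* magic present: system volume, hibernated */
	for (i = 0; i < HIBER_HDR_SIZE; i++)
		if (hdr[i])
			return 1;	/* neither magic nor all-zero: assume hibernated */
	return 0;			/* zero header: not hibernated */
}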
*/ - tmp_ino = ntfs_index_iget(vol->quota_ino, Q, 2); - if (IS_ERR(tmp_ino)) { - ntfs_error(vol->sb, "Failed to load $Quota/$Q index."); - return false; - } - vol->quota_q_ino = tmp_ino; - ntfs_debug("Done."); - return true; -} - -/** - * load_and_init_usnjrnl - load and setup the transaction log if present - * @vol: ntfs super block describing device whose usnjrnl file to load - * - * Return 'true' on success or 'false' on error. - * - * If $UsnJrnl is not present or in the process of being disabled, we set - * NVolUsnJrnlStamped() and return success. - * - * If the $UsnJrnl $DATA/$J attribute has a size equal to the lowest valid usn, - * i.e. transaction logging has only just been enabled or the journal has been - * stamped and nothing has been logged since, we also set NVolUsnJrnlStamped() - * and return success. - */ -static bool load_and_init_usnjrnl(ntfs_volume *vol) -{ - MFT_REF mref; - struct inode *tmp_ino; - ntfs_inode *tmp_ni; - struct page *page; - ntfs_name *name = NULL; - USN_HEADER *uh; - static const ntfschar UsnJrnl[9] = { cpu_to_le16('$'), - cpu_to_le16('U'), cpu_to_le16('s'), - cpu_to_le16('n'), cpu_to_le16('J'), - cpu_to_le16('r'), cpu_to_le16('n'), - cpu_to_le16('l'), 0 }; - static ntfschar Max[5] = { cpu_to_le16('$'), - cpu_to_le16('M'), cpu_to_le16('a'), - cpu_to_le16('x'), 0 }; - static ntfschar J[3] = { cpu_to_le16('$'), - cpu_to_le16('J'), 0 }; - - ntfs_debug("Entering."); - /* - * Find the inode number for the transaction log file by looking up the - * filename $UsnJrnl in the extended system files directory $Extend. - */ - inode_lock(vol->extend_ino); - mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), UsnJrnl, 8, - &name); - inode_unlock(vol->extend_ino); - if (IS_ERR_MREF(mref)) { - /* - * If the file does not exist, transaction logging is disabled, - * just return success. - */ - if (MREF_ERR(mref) == -ENOENT) { - ntfs_debug("$UsnJrnl not present. Volume does not " - "have transaction logging enabled."); -not_enabled: - /* - * No need to try to stamp the transaction log if - * transaction logging is not enabled. - */ - NVolSetUsnJrnlStamped(vol); - return true; - } - /* A real error occurred. */ - ntfs_error(vol->sb, "Failed to find inode number for " - "$UsnJrnl."); - return false; - } - /* We do not care for the type of match that was found. */ - kfree(name); - /* Get the inode. */ - tmp_ino = ntfs_iget(vol->sb, MREF(mref)); - if (IS_ERR(tmp_ino) || unlikely(is_bad_inode(tmp_ino))) { - if (!IS_ERR(tmp_ino)) - iput(tmp_ino); - ntfs_error(vol->sb, "Failed to load $UsnJrnl."); - return false; - } - vol->usnjrnl_ino = tmp_ino; - /* - * If the transaction log is in the process of being deleted, we can - * ignore it. - */ - if (unlikely(vol->vol_flags & VOLUME_DELETE_USN_UNDERWAY)) { - ntfs_debug("$UsnJrnl in the process of being disabled. " - "Volume does not have transaction logging " - "enabled."); - goto not_enabled; - } - /* Get the $DATA/$Max attribute. */ - tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, Max, 4); - if (IS_ERR(tmp_ino)) { - ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$Max " - "attribute."); - return false; - } - vol->usnjrnl_max_ino = tmp_ino; - if (unlikely(i_size_read(tmp_ino) < sizeof(USN_HEADER))) { - ntfs_error(vol->sb, "Found corrupt $UsnJrnl/$DATA/$Max " - "attribute (size is 0x%llx but should be at " - "least 0x%zx bytes).", i_size_read(tmp_ino), - sizeof(USN_HEADER)); - return false; - } - /* Get the $DATA/$J attribute. 
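The hand-built ntfschar arrays above (hiberfil, Quota, Q, and the $UsnJrnl names below) spell their names in little-endian UTF-16. A hypothetical helper that widens ASCII names the same way, purely for illustration; the driver itself keeps the static arrays so the names live in read-only data:

#include <stddef.h>
#include <stdint.h>

typedef uint16_t ntfschar_le;	/* stand-in for the driver's le16 ntfschar */

#define TO_LE16(x) ((ntfschar_le)(x))	/* assumes a little-endian host */

/* Widen a 7-bit ASCII name into the NUL-terminated UTF-16LE form the
 * lookups expect; n is the capacity of dst excluding the terminator. */
static void ascii_to_ntfschar(ntfschar_le *dst, const char *src, size_t n)
{
	size_t i;

	for (i = 0; i < n && src[i]; i++)
		dst[i] = TO_LE16((uint8_t)src[i]);
	dst[i] = 0;
}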
*/ - tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, J, 2); - if (IS_ERR(tmp_ino)) { - ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$J " - "attribute."); - return false; - } - vol->usnjrnl_j_ino = tmp_ino; - /* Verify $J is non-resident and sparse. */ - tmp_ni = NTFS_I(vol->usnjrnl_j_ino); - if (unlikely(!NInoNonResident(tmp_ni) || !NInoSparse(tmp_ni))) { - ntfs_error(vol->sb, "$UsnJrnl/$DATA/$J attribute is resident " - "and/or not sparse."); - return false; - } - /* Read the USN_HEADER from $DATA/$Max. */ - page = ntfs_map_page(vol->usnjrnl_max_ino->i_mapping, 0); - if (IS_ERR(page)) { - ntfs_error(vol->sb, "Failed to read from $UsnJrnl/$DATA/$Max " - "attribute."); - return false; - } - uh = (USN_HEADER*)page_address(page); - /* Sanity check the $Max. */ - if (unlikely(sle64_to_cpu(uh->allocation_delta) > - sle64_to_cpu(uh->maximum_size))) { - ntfs_error(vol->sb, "Allocation delta (0x%llx) exceeds " - "maximum size (0x%llx). $UsnJrnl is corrupt.", - (long long)sle64_to_cpu(uh->allocation_delta), - (long long)sle64_to_cpu(uh->maximum_size)); - ntfs_unmap_page(page); - return false; - } - /* - * If the transaction log has been stamped and nothing has been written - * to it since, we do not need to stamp it. - */ - if (unlikely(sle64_to_cpu(uh->lowest_valid_usn) >= - i_size_read(vol->usnjrnl_j_ino))) { - if (likely(sle64_to_cpu(uh->lowest_valid_usn) == - i_size_read(vol->usnjrnl_j_ino))) { - ntfs_unmap_page(page); - ntfs_debug("$UsnJrnl is enabled but nothing has been " - "logged since it was last stamped. " - "Treating this as if the volume does " - "not have transaction logging " - "enabled."); - goto not_enabled; - } - ntfs_error(vol->sb, "$UsnJrnl has lowest valid usn (0x%llx) " - "which is out of bounds (0x%llx). $UsnJrnl " - "is corrupt.", - (long long)sle64_to_cpu(uh->lowest_valid_usn), - i_size_read(vol->usnjrnl_j_ino)); - ntfs_unmap_page(page); - return false; - } - ntfs_unmap_page(page); - ntfs_debug("Done."); - return true; -} - -/** - * load_and_init_attrdef - load the attribute definitions table for a volume - * @vol: ntfs super block describing device whose attrdef to load - * - * Return 'true' on success or 'false' on error. - */ -static bool load_and_init_attrdef(ntfs_volume *vol) -{ - loff_t i_size; - struct super_block *sb = vol->sb; - struct inode *ino; - struct page *page; - pgoff_t index, max_index; - unsigned int size; - - ntfs_debug("Entering."); - /* Read attrdef table and setup vol->attrdef and vol->attrdef_size. */ - ino = ntfs_iget(sb, FILE_AttrDef); - if (IS_ERR(ino) || is_bad_inode(ino)) { - if (!IS_ERR(ino)) - iput(ino); - goto failed; - } - NInoSetSparseDisabled(NTFS_I(ino)); - /* The size of FILE_AttrDef must be above 0 and fit inside 31 bits. */ - i_size = i_size_read(ino); - if (i_size <= 0 || i_size > 0x7fffffff) - goto iput_failed; - vol->attrdef = (ATTR_DEF*)ntfs_malloc_nofs(i_size); - if (!vol->attrdef) - goto iput_failed; - index = 0; - max_index = i_size >> PAGE_SHIFT; - size = PAGE_SIZE; - while (index < max_index) { - /* Read the attrdef table and copy it into the linear buffer. 
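The $UsnJrnl checks above boil down to comparing the $Max header's lowest_valid_usn against the size of the $DATA/$J attribute; a sketch of that three-way classification, with the enum names assumed:

#include <stdint.h>

enum usnjrnl_state { USN_ACTIVE, USN_STAMPED, USN_CORRUPT };

/* Below the $J data size means live journal records exist; equal means
 * freshly stamped with nothing logged since; greater is out of bounds
 * and hence corruption. */
static enum usnjrnl_state usnjrnl_classify(int64_t lowest_valid_usn,
					   int64_t j_size)
{
	if (lowest_valid_usn < j_size)
		return USN_ACTIVE;
	if (lowest_valid_usn == j_size)
		return USN_STAMPED;
	return USN_CORRUPT;
}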
*/ -read_partial_attrdef_page: - page = ntfs_map_page(ino->i_mapping, index); - if (IS_ERR(page)) - goto free_iput_failed; - memcpy((u8*)vol->attrdef + (index++ << PAGE_SHIFT), - page_address(page), size); - ntfs_unmap_page(page); - } - if (size == PAGE_SIZE) { - size = i_size & ~PAGE_MASK; - if (size) - goto read_partial_attrdef_page; - } - vol->attrdef_size = i_size; - ntfs_debug("Read %llu bytes from $AttrDef.", i_size); - iput(ino); - return true; -free_iput_failed: - ntfs_free(vol->attrdef); - vol->attrdef = NULL; -iput_failed: - iput(ino); -failed: - ntfs_error(sb, "Failed to initialize attribute definition table."); - return false; -} - -#endif /* NTFS_RW */ - -/** - * load_and_init_upcase - load the upcase table for an ntfs volume - * @vol: ntfs super block describing device whose upcase to load - * - * Return 'true' on success or 'false' on error. - */ -static bool load_and_init_upcase(ntfs_volume *vol) -{ - loff_t i_size; - struct super_block *sb = vol->sb; - struct inode *ino; - struct page *page; - pgoff_t index, max_index; - unsigned int size; - int i, max; - - ntfs_debug("Entering."); - /* Read upcase table and setup vol->upcase and vol->upcase_len. */ - ino = ntfs_iget(sb, FILE_UpCase); - if (IS_ERR(ino) || is_bad_inode(ino)) { - if (!IS_ERR(ino)) - iput(ino); - goto upcase_failed; - } - /* - * The upcase size must not be above 64k Unicode characters, must not - * be zero and must be a multiple of sizeof(ntfschar). - */ - i_size = i_size_read(ino); - if (!i_size || i_size & (sizeof(ntfschar) - 1) || - i_size > 64ULL * 1024 * sizeof(ntfschar)) - goto iput_upcase_failed; - vol->upcase = (ntfschar*)ntfs_malloc_nofs(i_size); - if (!vol->upcase) - goto iput_upcase_failed; - index = 0; - max_index = i_size >> PAGE_SHIFT; - size = PAGE_SIZE; - while (index < max_index) { - /* Read the upcase table and copy it into the linear buffer. */ -read_partial_upcase_page: - page = ntfs_map_page(ino->i_mapping, index); - if (IS_ERR(page)) - goto iput_upcase_failed; - memcpy((char*)vol->upcase + (index++ << PAGE_SHIFT), - page_address(page), size); - ntfs_unmap_page(page); - } - if (size == PAGE_SIZE) { - size = i_size & ~PAGE_MASK; - if (size) - goto read_partial_upcase_page; - } - vol->upcase_len = i_size >> UCHAR_T_SIZE_BITS; - ntfs_debug("Read %llu bytes from $UpCase (expected %zu bytes).", - i_size, 64 * 1024 * sizeof(ntfschar)); - iput(ino); - mutex_lock(&ntfs_lock); - if (!default_upcase) { - ntfs_debug("Using volume specified $UpCase since default is " - "not present."); - mutex_unlock(&ntfs_lock); - return true; - } - max = default_upcase_len; - if (max > vol->upcase_len) - max = vol->upcase_len; - for (i = 0; i < max; i++) - if (vol->upcase[i] != default_upcase[i]) - break; - if (i == max) { - ntfs_free(vol->upcase); - vol->upcase = default_upcase; - vol->upcase_len = max; - ntfs_nr_upcase_users++; - mutex_unlock(&ntfs_lock); - ntfs_debug("Volume specified $UpCase matches default. Using " - "default."); - return true; - } - mutex_unlock(&ntfs_lock); - ntfs_debug("Using volume specified $UpCase since it does not match " - "the default."); - return true; -iput_upcase_failed: - iput(ino); - ntfs_free(vol->upcase); - vol->upcase = NULL; -upcase_failed: - mutex_lock(&ntfs_lock); - if (default_upcase) { - vol->upcase = default_upcase; - vol->upcase_len = default_upcase_len; - ntfs_nr_upcase_users++; - mutex_unlock(&ntfs_lock); - ntfs_error(sb, "Failed to load $UpCase from the volume. 
Using " - "default."); - return true; - } - mutex_unlock(&ntfs_lock); - ntfs_error(sb, "Failed to initialize upcase table."); - return false; -} - -/* - * The lcn and mft bitmap inodes are NTFS-internal inodes with - * their own special locking rules: - */ -static struct lock_class_key - lcnbmp_runlist_lock_key, lcnbmp_mrec_lock_key, - mftbmp_runlist_lock_key, mftbmp_mrec_lock_key; - -/** - * load_system_files - open the system files using normal functions - * @vol: ntfs super block describing device whose system files to load - * - * Open the system files with normal access functions and complete setting up - * the ntfs super block @vol. - * - * Return 'true' on success or 'false' on error. - */ -static bool load_system_files(ntfs_volume *vol) -{ - struct super_block *sb = vol->sb; - MFT_RECORD *m; - VOLUME_INFORMATION *vi; - ntfs_attr_search_ctx *ctx; -#ifdef NTFS_RW - RESTART_PAGE_HEADER *rp; - int err; -#endif /* NTFS_RW */ - - ntfs_debug("Entering."); -#ifdef NTFS_RW - /* Get mft mirror inode compare the contents of $MFT and $MFTMirr. */ - if (!load_and_init_mft_mirror(vol) || !check_mft_mirror(vol)) { - static const char *es1 = "Failed to load $MFTMirr"; - static const char *es2 = "$MFTMirr does not match $MFT"; - static const char *es3 = ". Run ntfsfix and/or chkdsk."; - - /* If a read-write mount, convert it to a read-only mount. */ - if (!sb_rdonly(sb)) { - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=" - "continue nor on_errors=" - "remount-ro was specified%s", - !vol->mftmirr_ino ? es1 : es2, - es3); - goto iput_mirr_err_out; - } - sb->s_flags |= SB_RDONLY; - ntfs_error(sb, "%s. Mounting read-only%s", - !vol->mftmirr_ino ? es1 : es2, es3); - } else - ntfs_warning(sb, "%s. Will not be able to remount " - "read-write%s", - !vol->mftmirr_ino ? es1 : es2, es3); - /* This will prevent a read-write remount. */ - NVolSetErrors(vol); - } -#endif /* NTFS_RW */ - /* Get mft bitmap attribute inode. */ - vol->mftbmp_ino = ntfs_attr_iget(vol->mft_ino, AT_BITMAP, NULL, 0); - if (IS_ERR(vol->mftbmp_ino)) { - ntfs_error(sb, "Failed to load $MFT/$BITMAP attribute."); - goto iput_mirr_err_out; - } - lockdep_set_class(&NTFS_I(vol->mftbmp_ino)->runlist.lock, - &mftbmp_runlist_lock_key); - lockdep_set_class(&NTFS_I(vol->mftbmp_ino)->mrec_lock, - &mftbmp_mrec_lock_key); - /* Read upcase table and setup @vol->upcase and @vol->upcase_len. */ - if (!load_and_init_upcase(vol)) - goto iput_mftbmp_err_out; -#ifdef NTFS_RW - /* - * Read attribute definitions table and setup @vol->attrdef and - * @vol->attrdef_size. - */ - if (!load_and_init_attrdef(vol)) - goto iput_upcase_err_out; -#endif /* NTFS_RW */ - /* - * Get the cluster allocation bitmap inode and verify the size, no - * need for any locking at this stage as we are already running - * exclusively as we are mount in progress task. 
- */ - vol->lcnbmp_ino = ntfs_iget(sb, FILE_Bitmap); - if (IS_ERR(vol->lcnbmp_ino) || is_bad_inode(vol->lcnbmp_ino)) { - if (!IS_ERR(vol->lcnbmp_ino)) - iput(vol->lcnbmp_ino); - goto bitmap_failed; - } - lockdep_set_class(&NTFS_I(vol->lcnbmp_ino)->runlist.lock, - &lcnbmp_runlist_lock_key); - lockdep_set_class(&NTFS_I(vol->lcnbmp_ino)->mrec_lock, - &lcnbmp_mrec_lock_key); - - NInoSetSparseDisabled(NTFS_I(vol->lcnbmp_ino)); - if ((vol->nr_clusters + 7) >> 3 > i_size_read(vol->lcnbmp_ino)) { - iput(vol->lcnbmp_ino); -bitmap_failed: - ntfs_error(sb, "Failed to load $Bitmap."); - goto iput_attrdef_err_out; - } - /* - * Get the volume inode and setup our cache of the volume flags and - * version. - */ - vol->vol_ino = ntfs_iget(sb, FILE_Volume); - if (IS_ERR(vol->vol_ino) || is_bad_inode(vol->vol_ino)) { - if (!IS_ERR(vol->vol_ino)) - iput(vol->vol_ino); -volume_failed: - ntfs_error(sb, "Failed to load $Volume."); - goto iput_lcnbmp_err_out; - } - m = map_mft_record(NTFS_I(vol->vol_ino)); - if (IS_ERR(m)) { -iput_volume_failed: - iput(vol->vol_ino); - goto volume_failed; - } - if (!(ctx = ntfs_attr_get_search_ctx(NTFS_I(vol->vol_ino), m))) { - ntfs_error(sb, "Failed to get attribute search context."); - goto get_ctx_vol_failed; - } - if (ntfs_attr_lookup(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0, - ctx) || ctx->attr->non_resident || ctx->attr->flags) { -err_put_vol: - ntfs_attr_put_search_ctx(ctx); -get_ctx_vol_failed: - unmap_mft_record(NTFS_I(vol->vol_ino)); - goto iput_volume_failed; - } - vi = (VOLUME_INFORMATION*)((char*)ctx->attr + - le16_to_cpu(ctx->attr->data.resident.value_offset)); - /* Some bounds checks. */ - if ((u8*)vi < (u8*)ctx->attr || (u8*)vi + - le32_to_cpu(ctx->attr->data.resident.value_length) > - (u8*)ctx->attr + le32_to_cpu(ctx->attr->length)) - goto err_put_vol; - /* Copy the volume flags and version to the ntfs_volume structure. */ - vol->vol_flags = vi->flags; - vol->major_ver = vi->major_ver; - vol->minor_ver = vi->minor_ver; - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(NTFS_I(vol->vol_ino)); - pr_info("volume version %i.%i.\n", vol->major_ver, - vol->minor_ver); - if (vol->major_ver < 3 && NVolSparseEnabled(vol)) { - ntfs_warning(vol->sb, "Disabling sparse support due to NTFS " - "volume version %i.%i (need at least version " - "3.0).", vol->major_ver, vol->minor_ver); - NVolClearSparseEnabled(vol); - } -#ifdef NTFS_RW - /* Make sure that no unsupported volume flags are set. */ - if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { - static const char *es1a = "Volume is dirty"; - static const char *es1b = "Volume has been modified by chkdsk"; - static const char *es1c = "Volume has unsupported flags set"; - static const char *es2a = ". Run chkdsk and mount in Windows."; - static const char *es2b = ". Mount in Windows."; - const char *es1, *es2; - - es2 = es2a; - if (vol->vol_flags & VOLUME_IS_DIRTY) - es1 = es1a; - else if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) { - es1 = es1b; - es2 = es2b; - } else { - es1 = es1c; - ntfs_warning(sb, "Unsupported volume flags 0x%x " - "encountered.", - (unsigned)le16_to_cpu(vol->vol_flags)); - } - /* If a read-write mount, convert it to a read-only mount. */ - if (!sb_rdonly(sb)) { - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=" - "continue nor on_errors=" - "remount-ro was specified%s", - es1, es2); - goto iput_vol_err_out; - } - sb->s_flags |= SB_RDONLY; - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - } else - ntfs_warning(sb, "%s. 
Will not be able to remount " - "read-write%s", es1, es2); - /* - * Do not set NVolErrors() because ntfs_remount() re-checks the - * flags which we need to do in case any flags have changed. - */ - } - /* - * Get the inode for the logfile, check it and determine if the volume - * was shutdown cleanly. - */ - rp = NULL; - if (!load_and_check_logfile(vol, &rp) || - !ntfs_is_logfile_clean(vol->logfile_ino, rp)) { - static const char *es1a = "Failed to load $LogFile"; - static const char *es1b = "$LogFile is not clean"; - static const char *es2 = ". Mount in Windows."; - const char *es1; - - es1 = !vol->logfile_ino ? es1a : es1b; - /* If a read-write mount, convert it to a read-only mount. */ - if (!sb_rdonly(sb)) { - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=" - "continue nor on_errors=" - "remount-ro was specified%s", - es1, es2); - if (vol->logfile_ino) { - BUG_ON(!rp); - ntfs_free(rp); - } - goto iput_logfile_err_out; - } - sb->s_flags |= SB_RDONLY; - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - } else - ntfs_warning(sb, "%s. Will not be able to remount " - "read-write%s", es1, es2); - /* This will prevent a read-write remount. */ - NVolSetErrors(vol); - } - ntfs_free(rp); -#endif /* NTFS_RW */ - /* Get the root directory inode so we can do path lookups. */ - vol->root_ino = ntfs_iget(sb, FILE_root); - if (IS_ERR(vol->root_ino) || is_bad_inode(vol->root_ino)) { - if (!IS_ERR(vol->root_ino)) - iput(vol->root_ino); - ntfs_error(sb, "Failed to load root directory."); - goto iput_logfile_err_out; - } -#ifdef NTFS_RW - /* - * Check if Windows is suspended to disk on the target volume. If it - * is hibernated, we must not write *anything* to the disk so set - * NVolErrors() without setting the dirty volume flag and mount - * read-only. This will prevent read-write remounting and it will also - * prevent all writes. - */ - err = check_windows_hibernation_status(vol); - if (unlikely(err)) { - static const char *es1a = "Failed to determine if Windows is " - "hibernated"; - static const char *es1b = "Windows is hibernated"; - static const char *es2 = ". Run chkdsk."; - const char *es1; - - es1 = err < 0 ? es1a : es1b; - /* If a read-write mount, convert it to a read-only mount. */ - if (!sb_rdonly(sb)) { - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=" - "continue nor on_errors=" - "remount-ro was specified%s", - es1, es2); - goto iput_root_err_out; - } - sb->s_flags |= SB_RDONLY; - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - } else - ntfs_warning(sb, "%s. Will not be able to remount " - "read-write%s", es1, es2); - /* This will prevent a read-write remount. */ - NVolSetErrors(vol); - } - /* If (still) a read-write mount, mark the volume dirty. */ - if (!sb_rdonly(sb) && ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { - static const char *es1 = "Failed to set dirty bit in volume " - "information flags"; - static const char *es2 = ". Run chkdsk."; - - /* Convert to a read-only mount. */ - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=continue nor " - "on_errors=remount-ro was specified%s", - es1, es2); - goto iput_root_err_out; - } - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= SB_RDONLY; - /* - * Do not set NVolErrors() because ntfs_remount() might manage - * to set the dirty flag in which case all would be well. 
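The same on_errors fallback recurs throughout load_system_files() above for every system file ($MFTMirr, $LogFile, hibernation status, and $Quota/$UsnJrnl below). A hypothetical distillation of that policy, purely illustrative; the flag values are assumptions:

#include <stdbool.h>
#include <stdio.h>

#define ON_ERRORS_CONTINUE   0x01	/* assumed values */
#define ON_ERRORS_REMOUNT_RO 0x02

/* Returns true if the mount may proceed; *make_ro is set when the
 * caller should degrade the superblock to read-only. */
static bool mount_error_policy(bool already_rdonly, unsigned int on_errors,
			       bool *make_ro, const char *what)
{
	*make_ro = false;
	if (already_rdonly) {
		fprintf(stderr, "%s. Will not be able to remount read-write.\n",
			what);
		return true;		/* note the error, keep going */
	}
	if (!(on_errors & (ON_ERRORS_REMOUNT_RO | ON_ERRORS_CONTINUE)))
		return false;		/* fail the mount outright */
	*make_ro = true;		/* degrade to a read-only mount */
	return true;
}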
- */ - } -#if 0 - // TODO: Enable this code once we start modifying anything that is - // different between NTFS 1.2 and 3.x... - /* - * If (still) a read-write mount, set the NT4 compatibility flag on - * newer NTFS version volumes. - */ - if (!(sb->s_flags & SB_RDONLY) && (vol->major_ver > 1) && - ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) { - static const char *es1 = "Failed to set NT4 compatibility flag"; - static const char *es2 = ". Run chkdsk."; - - /* Convert to a read-only mount. */ - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=continue nor " - "on_errors=remount-ro was specified%s", - es1, es2); - goto iput_root_err_out; - } - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= SB_RDONLY; - NVolSetErrors(vol); - } -#endif - /* If (still) a read-write mount, empty the logfile. */ - if (!sb_rdonly(sb) && !ntfs_empty_logfile(vol->logfile_ino)) { - static const char *es1 = "Failed to empty $LogFile"; - static const char *es2 = ". Mount in Windows."; - - /* Convert to a read-only mount. */ - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=continue nor " - "on_errors=remount-ro was specified%s", - es1, es2); - goto iput_root_err_out; - } - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= SB_RDONLY; - NVolSetErrors(vol); - } -#endif /* NTFS_RW */ - /* If on NTFS versions before 3.0, we are done. */ - if (unlikely(vol->major_ver < 3)) - return true; - /* NTFS 3.0+ specific initialization. */ - /* Get the security descriptors inode. */ - vol->secure_ino = ntfs_iget(sb, FILE_Secure); - if (IS_ERR(vol->secure_ino) || is_bad_inode(vol->secure_ino)) { - if (!IS_ERR(vol->secure_ino)) - iput(vol->secure_ino); - ntfs_error(sb, "Failed to load $Secure."); - goto iput_root_err_out; - } - // TODO: Initialize security. - /* Get the extended system files' directory inode. */ - vol->extend_ino = ntfs_iget(sb, FILE_Extend); - if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino) || - !S_ISDIR(vol->extend_ino->i_mode)) { - if (!IS_ERR(vol->extend_ino)) - iput(vol->extend_ino); - ntfs_error(sb, "Failed to load $Extend."); - goto iput_sec_err_out; - } -#ifdef NTFS_RW - /* Find the quota file, load it if present, and set it up. */ - if (!load_and_init_quota(vol)) { - static const char *es1 = "Failed to load $Quota"; - static const char *es2 = ". Run chkdsk."; - - /* If a read-write mount, convert it to a read-only mount. */ - if (!sb_rdonly(sb)) { - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=" - "continue nor on_errors=" - "remount-ro was specified%s", - es1, es2); - goto iput_quota_err_out; - } - sb->s_flags |= SB_RDONLY; - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - } else - ntfs_warning(sb, "%s. Will not be able to remount " - "read-write%s", es1, es2); - /* This will prevent a read-write remount. */ - NVolSetErrors(vol); - } - /* If (still) a read-write mount, mark the quotas out of date. */ - if (!sb_rdonly(sb) && !ntfs_mark_quotas_out_of_date(vol)) { - static const char *es1 = "Failed to mark quotas out of date"; - static const char *es2 = ". Run chkdsk."; - - /* Convert to a read-only mount. 
*/ - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=continue nor " - "on_errors=remount-ro was specified%s", - es1, es2); - goto iput_quota_err_out; - } - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= SB_RDONLY; - NVolSetErrors(vol); - } - /* - * Find the transaction log file ($UsnJrnl), load it if present, check - * it, and set it up. - */ - if (!load_and_init_usnjrnl(vol)) { - static const char *es1 = "Failed to load $UsnJrnl"; - static const char *es2 = ". Run chkdsk."; - - /* If a read-write mount, convert it to a read-only mount. */ - if (!sb_rdonly(sb)) { - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=" - "continue nor on_errors=" - "remount-ro was specified%s", - es1, es2); - goto iput_usnjrnl_err_out; - } - sb->s_flags |= SB_RDONLY; - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - } else - ntfs_warning(sb, "%s. Will not be able to remount " - "read-write%s", es1, es2); - /* This will prevent a read-write remount. */ - NVolSetErrors(vol); - } - /* If (still) a read-write mount, stamp the transaction log. */ - if (!sb_rdonly(sb) && !ntfs_stamp_usnjrnl(vol)) { - static const char *es1 = "Failed to stamp transaction log " - "($UsnJrnl)"; - static const char *es2 = ". Run chkdsk."; - - /* Convert to a read-only mount. */ - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=continue nor " - "on_errors=remount-ro was specified%s", - es1, es2); - goto iput_usnjrnl_err_out; - } - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= SB_RDONLY; - NVolSetErrors(vol); - } -#endif /* NTFS_RW */ - return true; -#ifdef NTFS_RW -iput_usnjrnl_err_out: - iput(vol->usnjrnl_j_ino); - iput(vol->usnjrnl_max_ino); - iput(vol->usnjrnl_ino); -iput_quota_err_out: - iput(vol->quota_q_ino); - iput(vol->quota_ino); - iput(vol->extend_ino); -#endif /* NTFS_RW */ -iput_sec_err_out: - iput(vol->secure_ino); -iput_root_err_out: - iput(vol->root_ino); -iput_logfile_err_out: -#ifdef NTFS_RW - iput(vol->logfile_ino); -iput_vol_err_out: -#endif /* NTFS_RW */ - iput(vol->vol_ino); -iput_lcnbmp_err_out: - iput(vol->lcnbmp_ino); -iput_attrdef_err_out: - vol->attrdef_size = 0; - if (vol->attrdef) { - ntfs_free(vol->attrdef); - vol->attrdef = NULL; - } -#ifdef NTFS_RW -iput_upcase_err_out: -#endif /* NTFS_RW */ - vol->upcase_len = 0; - mutex_lock(&ntfs_lock); - if (vol->upcase == default_upcase) { - ntfs_nr_upcase_users--; - vol->upcase = NULL; - } - mutex_unlock(&ntfs_lock); - if (vol->upcase) { - ntfs_free(vol->upcase); - vol->upcase = NULL; - } -iput_mftbmp_err_out: - iput(vol->mftbmp_ino); -iput_mirr_err_out: -#ifdef NTFS_RW - iput(vol->mftmirr_ino); -#endif /* NTFS_RW */ - return false; -} - -/** - * ntfs_put_super - called by the vfs to unmount a volume - * @sb: vfs superblock of volume to unmount - * - * ntfs_put_super() is called by the VFS (from fs/super.c::do_umount()) when - * the volume is being unmounted (umount system call has been invoked) and it - * releases all inodes and memory belonging to the NTFS specific part of the - * super block. - */ -static void ntfs_put_super(struct super_block *sb) -{ - ntfs_volume *vol = NTFS_SB(sb); - - ntfs_debug("Entering."); - -#ifdef NTFS_RW - /* - * Commit all inodes while they are still open in case some of them - * cause others to be dirtied. - */ - ntfs_commit_inode(vol->vol_ino); - - /* NTFS 3.0+ specific. 
*/ - if (vol->major_ver >= 3) { - if (vol->usnjrnl_j_ino) - ntfs_commit_inode(vol->usnjrnl_j_ino); - if (vol->usnjrnl_max_ino) - ntfs_commit_inode(vol->usnjrnl_max_ino); - if (vol->usnjrnl_ino) - ntfs_commit_inode(vol->usnjrnl_ino); - if (vol->quota_q_ino) - ntfs_commit_inode(vol->quota_q_ino); - if (vol->quota_ino) - ntfs_commit_inode(vol->quota_ino); - if (vol->extend_ino) - ntfs_commit_inode(vol->extend_ino); - if (vol->secure_ino) - ntfs_commit_inode(vol->secure_ino); - } - - ntfs_commit_inode(vol->root_ino); - - down_write(&vol->lcnbmp_lock); - ntfs_commit_inode(vol->lcnbmp_ino); - up_write(&vol->lcnbmp_lock); - - down_write(&vol->mftbmp_lock); - ntfs_commit_inode(vol->mftbmp_ino); - up_write(&vol->mftbmp_lock); - - if (vol->logfile_ino) - ntfs_commit_inode(vol->logfile_ino); - - if (vol->mftmirr_ino) - ntfs_commit_inode(vol->mftmirr_ino); - ntfs_commit_inode(vol->mft_ino); - - /* - * If a read-write mount and no volume errors have occurred, mark the - * volume clean. Also, re-commit all affected inodes. - */ - if (!sb_rdonly(sb)) { - if (!NVolErrors(vol)) { - if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY)) - ntfs_warning(sb, "Failed to clear dirty bit " - "in volume information " - "flags. Run chkdsk."); - ntfs_commit_inode(vol->vol_ino); - ntfs_commit_inode(vol->root_ino); - if (vol->mftmirr_ino) - ntfs_commit_inode(vol->mftmirr_ino); - ntfs_commit_inode(vol->mft_ino); - } else { - ntfs_warning(sb, "Volume has errors. Leaving volume " - "marked dirty. Run chkdsk."); - } - } -#endif /* NTFS_RW */ - - iput(vol->vol_ino); - vol->vol_ino = NULL; - - /* NTFS 3.0+ specific clean up. */ - if (vol->major_ver >= 3) { -#ifdef NTFS_RW - if (vol->usnjrnl_j_ino) { - iput(vol->usnjrnl_j_ino); - vol->usnjrnl_j_ino = NULL; - } - if (vol->usnjrnl_max_ino) { - iput(vol->usnjrnl_max_ino); - vol->usnjrnl_max_ino = NULL; - } - if (vol->usnjrnl_ino) { - iput(vol->usnjrnl_ino); - vol->usnjrnl_ino = NULL; - } - if (vol->quota_q_ino) { - iput(vol->quota_q_ino); - vol->quota_q_ino = NULL; - } - if (vol->quota_ino) { - iput(vol->quota_ino); - vol->quota_ino = NULL; - } -#endif /* NTFS_RW */ - if (vol->extend_ino) { - iput(vol->extend_ino); - vol->extend_ino = NULL; - } - if (vol->secure_ino) { - iput(vol->secure_ino); - vol->secure_ino = NULL; - } - } - - iput(vol->root_ino); - vol->root_ino = NULL; - - down_write(&vol->lcnbmp_lock); - iput(vol->lcnbmp_ino); - vol->lcnbmp_ino = NULL; - up_write(&vol->lcnbmp_lock); - - down_write(&vol->mftbmp_lock); - iput(vol->mftbmp_ino); - vol->mftbmp_ino = NULL; - up_write(&vol->mftbmp_lock); - -#ifdef NTFS_RW - if (vol->logfile_ino) { - iput(vol->logfile_ino); - vol->logfile_ino = NULL; - } - if (vol->mftmirr_ino) { - /* Re-commit the mft mirror and mft just in case. */ - ntfs_commit_inode(vol->mftmirr_ino); - ntfs_commit_inode(vol->mft_ino); - iput(vol->mftmirr_ino); - vol->mftmirr_ino = NULL; - } - /* - * We should have no dirty inodes left, due to - * mft.c::ntfs_mft_writepage() cleaning all the dirty pages as - * the underlying mft records are written out and cleaned. - */ - ntfs_commit_inode(vol->mft_ino); - write_inode_now(vol->mft_ino, 1); -#endif /* NTFS_RW */ - - iput(vol->mft_ino); - vol->mft_ino = NULL; - - /* Throw away the table of attribute definitions. */ - vol->attrdef_size = 0; - if (vol->attrdef) { - ntfs_free(vol->attrdef); - vol->attrdef = NULL; - } - vol->upcase_len = 0; - /* - * Destroy the global default upcase table if necessary. Also decrease - * the number of upcase users if we are a user. 
- */ - mutex_lock(&ntfs_lock); - if (vol->upcase == default_upcase) { - ntfs_nr_upcase_users--; - vol->upcase = NULL; - } - if (!ntfs_nr_upcase_users && default_upcase) { - ntfs_free(default_upcase); - default_upcase = NULL; - } - if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users) - free_compression_buffers(); - mutex_unlock(&ntfs_lock); - if (vol->upcase) { - ntfs_free(vol->upcase); - vol->upcase = NULL; - } - - unload_nls(vol->nls_map); - - sb->s_fs_info = NULL; - kfree(vol); -} - -/** - * get_nr_free_clusters - return the number of free clusters on a volume - * @vol: ntfs volume for which to obtain free cluster count - * - * Calculate the number of free clusters on the mounted NTFS volume @vol. We - * actually calculate the number of clusters in use instead because this - * allows us to not care about partial pages as these will be just zero filled - * and hence not be counted as allocated clusters. - * - * The only particularity is that clusters beyond the end of the logical ntfs - * volume will be marked as allocated to prevent errors which means we have to - * discount those at the end. This is important as the cluster bitmap always - * has a size in multiples of 8 bytes, i.e. up to 63 clusters could be outside - * the logical volume and marked in use when they are not as they do not exist. - * - * If any pages cannot be read we assume all clusters in the erroring pages are - * in use. This means we return an underestimate on errors which is better than - * an overestimate. - */ -static s64 get_nr_free_clusters(ntfs_volume *vol) -{ - s64 nr_free = vol->nr_clusters; - struct address_space *mapping = vol->lcnbmp_ino->i_mapping; - struct page *page; - pgoff_t index, max_index; - - ntfs_debug("Entering."); - /* Serialize accesses to the cluster bitmap. */ - down_read(&vol->lcnbmp_lock); - /* - * Convert the number of bits into bytes rounded up, then convert into - * multiples of PAGE_SIZE, rounding up so that if we have one - * full and one partial page max_index = 2. - */ - max_index = (((vol->nr_clusters + 7) >> 3) + PAGE_SIZE - 1) >> - PAGE_SHIFT; - /* Use multiples of 4 bytes, thus max_size is PAGE_SIZE / 4. */ - ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.", - max_index, PAGE_SIZE / 4); - for (index = 0; index < max_index; index++) { - unsigned long *kaddr; - - /* - * Read the page from page cache, getting it from backing store - * if necessary, and increment the use count. - */ - page = read_mapping_page(mapping, index, NULL); - /* Ignore pages which errored synchronously. */ - if (IS_ERR(page)) { - ntfs_debug("read_mapping_page() error. Skipping " - "page (index 0x%lx).", index); - nr_free -= PAGE_SIZE * 8; - continue; - } - kaddr = kmap_atomic(page); - /* - * Subtract the number of set bits. If this - * is the last page and it is partial we don't really care as - * it just means we do a little extra work but it won't affect - * the result as all out of range bytes are set to zero by - * ntfs_readpage(). - */ - nr_free -= bitmap_weight(kaddr, - PAGE_SIZE * BITS_PER_BYTE); - kunmap_atomic(kaddr); - put_page(page); - } - ntfs_debug("Finished reading $Bitmap, last index = 0x%lx.", index - 1); - /* - * Fixup for eventual bits outside logical ntfs volume (see function - * description above). - */ - if (vol->nr_clusters & 63) - nr_free += 64 - (vol->nr_clusters & 63); - up_read(&vol->lcnbmp_lock); - /* If errors occurred we may well have gone below zero, fix this. 
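A self-contained sketch of the free-space accounting above, assuming the whole bitmap sits in one buffer rather than being paged through the page cache; count_set_bits() stands in for the kernel's bitmap_weight(), and the popcount builtin is GCC/Clang-specific:

#include <stdint.h>

static int64_t count_set_bits(const uint64_t *map, uint64_t nwords)
{
	int64_t w = 0;
	uint64_t i;

	for (i = 0; i < nwords; i++)
		w += __builtin_popcountll(map[i]);
	return w;
}

/* Subtract every allocated bit, then credit back the pad bits between
 * nr_clusters and the next 64-bit boundary, which the bitmap marks as
 * allocated although they lie beyond the end of the volume. */
static int64_t nr_free_clusters(const uint64_t *bitmap, int64_t nr_clusters)
{
	uint64_t nwords = ((uint64_t)nr_clusters + 63) / 64;
	int64_t nr_free = nr_clusters - count_set_bits(bitmap, nwords);

	if (nr_clusters & 63)
		nr_free += 64 - (nr_clusters & 63);
	return nr_free < 0 ? 0 : nr_free;
}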
*/ - if (nr_free < 0) - nr_free = 0; - ntfs_debug("Exiting."); - return nr_free; -} - -/** - * __get_nr_free_mft_records - return the number of free inodes on a volume - * @vol: ntfs volume for which to obtain free inode count - * @nr_free: number of mft records in filesystem - * @max_index: maximum number of pages containing set bits - * - * Calculate the number of free mft records (inodes) on the mounted NTFS - * volume @vol. We actually calculate the number of mft records in use instead - * because this allows us to not care about partial pages as these will be just - * zero filled and hence not be counted as allocated mft record. - * - * If any pages cannot be read we assume all mft records in the erroring pages - * are in use. This means we return an underestimate on errors which is better - * than an overestimate. - * - * NOTE: Caller must hold mftbmp_lock rw_semaphore for reading or writing. - */ -static unsigned long __get_nr_free_mft_records(ntfs_volume *vol, - s64 nr_free, const pgoff_t max_index) -{ - struct address_space *mapping = vol->mftbmp_ino->i_mapping; - struct page *page; - pgoff_t index; - - ntfs_debug("Entering."); - /* Use multiples of 4 bytes, thus max_size is PAGE_SIZE / 4. */ - ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = " - "0x%lx.", max_index, PAGE_SIZE / 4); - for (index = 0; index < max_index; index++) { - unsigned long *kaddr; - - /* - * Read the page from page cache, getting it from backing store - * if necessary, and increment the use count. - */ - page = read_mapping_page(mapping, index, NULL); - /* Ignore pages which errored synchronously. */ - if (IS_ERR(page)) { - ntfs_debug("read_mapping_page() error. Skipping " - "page (index 0x%lx).", index); - nr_free -= PAGE_SIZE * 8; - continue; - } - kaddr = kmap_atomic(page); - /* - * Subtract the number of set bits. If this - * is the last page and it is partial we don't really care as - * it just means we do a little extra work but it won't affect - * the result as all out of range bytes are set to zero by - * ntfs_readpage(). - */ - nr_free -= bitmap_weight(kaddr, - PAGE_SIZE * BITS_PER_BYTE); - kunmap_atomic(kaddr); - put_page(page); - } - ntfs_debug("Finished reading $MFT/$BITMAP, last index = 0x%lx.", - index - 1); - /* If errors occurred we may well have gone below zero, fix this. */ - if (nr_free < 0) - nr_free = 0; - ntfs_debug("Exiting."); - return nr_free; -} - -/** - * ntfs_statfs - return information about mounted NTFS volume - * @dentry: dentry from mounted volume - * @sfs: statfs structure in which to return the information - * - * Return information about the mounted NTFS volume @dentry in the statfs structure - * pointed to by @sfs (this is initialized with zeros before ntfs_statfs is - * called). We interpret the values to be correct of the moment in time at - * which we are called. Most values are variable otherwise and this isn't just - * the free values but the totals as well. For example we can increase the - * total number of file nodes if we run out and we can keep doing this until - * there is no more space on the volume left at all. - * - * Called from vfs_statfs which is used to handle the statfs, fstatfs, and - * ustat system calls. - * - * Return 0 on success or -errno on error. 
- */ -static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs) -{ - struct super_block *sb = dentry->d_sb; - s64 size; - ntfs_volume *vol = NTFS_SB(sb); - ntfs_inode *mft_ni = NTFS_I(vol->mft_ino); - pgoff_t max_index; - unsigned long flags; - - ntfs_debug("Entering."); - /* Type of filesystem. */ - sfs->f_type = NTFS_SB_MAGIC; - /* Optimal transfer block size. */ - sfs->f_bsize = PAGE_SIZE; - /* - * Total data blocks in filesystem in units of f_bsize and since - * inodes are also stored in data blocks ($MFT is a file) this is just - * the total clusters. - */ - sfs->f_blocks = vol->nr_clusters << vol->cluster_size_bits >> - PAGE_SHIFT; - /* Free data blocks in filesystem in units of f_bsize. */ - size = get_nr_free_clusters(vol) << vol->cluster_size_bits >> - PAGE_SHIFT; - if (size < 0LL) - size = 0LL; - /* Free blocks avail to non-superuser, same as above on NTFS. */ - sfs->f_bavail = sfs->f_bfree = size; - /* Serialize accesses to the inode bitmap. */ - down_read(&vol->mftbmp_lock); - read_lock_irqsave(&mft_ni->size_lock, flags); - size = i_size_read(vol->mft_ino) >> vol->mft_record_size_bits; - /* - * Convert the maximum number of set bits into bytes rounded up, then - * convert into multiples of PAGE_SIZE, rounding up so that if we - * have one full and one partial page max_index = 2. - */ - max_index = ((((mft_ni->initialized_size >> vol->mft_record_size_bits) - + 7) >> 3) + PAGE_SIZE - 1) >> PAGE_SHIFT; - read_unlock_irqrestore(&mft_ni->size_lock, flags); - /* Number of inodes in filesystem (at this point in time). */ - sfs->f_files = size; - /* Free inodes in fs (based on current total count). */ - sfs->f_ffree = __get_nr_free_mft_records(vol, size, max_index); - up_read(&vol->mftbmp_lock); - /* - * File system id. This is extremely *nix flavour dependent and even - * within Linux itself all fs do their own thing. I interpret this to - * mean a unique id associated with the mounted fs and not the id - * associated with the filesystem driver, the latter is already given - * by the filesystem type in sfs->f_type. Thus we use the 64-bit - * volume serial number splitting it into two 32-bit parts. We enter - * the least significant 32-bits in f_fsid[0] and the most significant - * 32-bits in f_fsid[1]. - */ - sfs->f_fsid = u64_to_fsid(vol->serial_no); - /* Maximum length of filenames. */ - sfs->f_namelen = NTFS_MAX_NAME_LEN; - return 0; -} - -#ifdef NTFS_RW -static int ntfs_write_inode(struct inode *vi, struct writeback_control *wbc) -{ - return __ntfs_write_inode(vi, wbc->sync_mode == WB_SYNC_ALL); -} -#endif - -/* - * The complete super operations. - */ -static const struct super_operations ntfs_sops = { - .alloc_inode = ntfs_alloc_big_inode, /* VFS: Allocate new inode. */ - .free_inode = ntfs_free_big_inode, /* VFS: Deallocate inode. */ -#ifdef NTFS_RW - .write_inode = ntfs_write_inode, /* VFS: Write dirty inode to - disk. */ -#endif /* NTFS_RW */ - .put_super = ntfs_put_super, /* Syscall: umount. */ - .statfs = ntfs_statfs, /* Syscall: statfs */ - .remount_fs = ntfs_remount, /* Syscall: mount -o remount. */ - .evict_inode = ntfs_evict_big_inode, /* VFS: Called when an inode is - removed from memory. */ - .show_options = ntfs_show_options, /* Show mount options in - proc. 
*/ -}; - -/** - * ntfs_fill_super - mount an ntfs filesystem - * @sb: super block of ntfs filesystem to mount - * @opt: string containing the mount options - * @silent: silence error output - * - * ntfs_fill_super() is called by the VFS to mount the device described by @sb - * with the mount options in @opt with the NTFS filesystem. - * - * If @silent is true, remain silent even if errors are detected. This is used - * during bootup, when the kernel tries to mount the root filesystem with all - * registered filesystems one after the other until one succeeds. This implies - * that all filesystems except the correct one will quite correctly and - * expectedly return an error, but nobody wants to see error messages when in - * fact this is what is supposed to happen. - * - * NOTE: @sb->s_flags contains the mount options flags. - */ -static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) -{ - ntfs_volume *vol; - struct buffer_head *bh; - struct inode *tmp_ino; - int blocksize, result; - - /* - * We do a pretty difficult piece of bootstrap by reading the - * MFT (and other metadata) from disk into memory. We'll only - * release this metadata during umount, so the locking patterns - * observed during bootstrap do not count. So turn off the - * observation of locking patterns (strictly for this context - * only) while mounting NTFS. [The validator is still active - * otherwise, even for this context: it will for example record - * lock class registrations.] - */ - lockdep_off(); - ntfs_debug("Entering."); -#ifndef NTFS_RW - sb->s_flags |= SB_RDONLY; -#endif /* ! NTFS_RW */ - /* Allocate a new ntfs_volume and place it in sb->s_fs_info. */ - sb->s_fs_info = kmalloc(sizeof(ntfs_volume), GFP_NOFS); - vol = NTFS_SB(sb); - if (!vol) { - if (!silent) - ntfs_error(sb, "Allocation of NTFS volume structure " - "failed. Aborting mount..."); - lockdep_on(); - return -ENOMEM; - } - /* Initialize ntfs_volume structure. */ - *vol = (ntfs_volume) { - .sb = sb, - /* - * Default is group and other don't have any access to files or - * directories while owner has full access. Further, files by - * default are not executable but directories are of course - * browseable. - */ - .fmask = 0177, - .dmask = 0077, - }; - init_rwsem(&vol->mftbmp_lock); - init_rwsem(&vol->lcnbmp_lock); - - /* By default, enable sparse support. */ - NVolSetSparseEnabled(vol); - - /* Important to get the mount options dealt with now. */ - if (!parse_options(vol, (char*)opt)) - goto err_out_now; - - /* We support sector sizes up to the PAGE_SIZE. */ - if (bdev_logical_block_size(sb->s_bdev) > PAGE_SIZE) { - if (!silent) - ntfs_error(sb, "Device has unsupported sector size " - "(%i). The maximum supported sector " - "size on this architecture is %lu " - "bytes.", - bdev_logical_block_size(sb->s_bdev), - PAGE_SIZE); - goto err_out_now; - } - /* - * Setup the device access block size to NTFS_BLOCK_SIZE or the hard - * sector size, whichever is bigger. - */ - blocksize = sb_min_blocksize(sb, NTFS_BLOCK_SIZE); - if (blocksize < NTFS_BLOCK_SIZE) { - if (!silent) - ntfs_error(sb, "Unable to set device block size."); - goto err_out_now; - } - BUG_ON(blocksize != sb->s_blocksize); - ntfs_debug("Set device block size to %i bytes (block size bits %i).", - blocksize, sb->s_blocksize_bits); - /* Determine the size of the device in units of block_size bytes. 
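The default masks above give owner-only access; a quick check of what they produce, assuming the driver masks a fully open 0777 base mode as the comment implies:

#include <stdio.h>

int main(void)
{
	unsigned int fmask = 0177, dmask = 0077;	/* the defaults above */

	printf("files: %04o\n", 0777 & ~fmask);	/* 0600: rw, owner only */
	printf("dirs:  %04o\n", 0777 & ~dmask);	/* 0700: rwx, owner only */
	return 0;
}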
*/ - vol->nr_blocks = sb_bdev_nr_blocks(sb); - if (!vol->nr_blocks) { - if (!silent) - ntfs_error(sb, "Unable to determine device size."); - goto err_out_now; - } - /* Read the boot sector and return unlocked buffer head to it. */ - if (!(bh = read_ntfs_boot_sector(sb, silent))) { - if (!silent) - ntfs_error(sb, "Not an NTFS volume."); - goto err_out_now; - } - /* - * Extract the data from the boot sector and setup the ntfs volume - * using it. - */ - result = parse_ntfs_boot_sector(vol, (NTFS_BOOT_SECTOR*)bh->b_data); - brelse(bh); - if (!result) { - if (!silent) - ntfs_error(sb, "Unsupported NTFS filesystem."); - goto err_out_now; - } - /* - * If the boot sector indicates a sector size bigger than the current - * device block size, switch the device block size to the sector size. - * TODO: It may be possible to support this case even when the set - * below fails, we would just be breaking up the i/o for each sector - * into multiple blocks for i/o purposes but otherwise it should just - * work. However it is safer to leave disabled until someone hits this - * error message and then we can get them to try it without the setting - * so we know for sure that it works. - */ - if (vol->sector_size > blocksize) { - blocksize = sb_set_blocksize(sb, vol->sector_size); - if (blocksize != vol->sector_size) { - if (!silent) - ntfs_error(sb, "Unable to set device block " - "size to sector size (%i).", - vol->sector_size); - goto err_out_now; - } - BUG_ON(blocksize != sb->s_blocksize); - vol->nr_blocks = sb_bdev_nr_blocks(sb); - ntfs_debug("Changed device block size to %i bytes (block size " - "bits %i) to match volume sector size.", - blocksize, sb->s_blocksize_bits); - } - /* Initialize the cluster and mft allocators. */ - ntfs_setup_allocators(vol); - /* Setup remaining fields in the super block. */ - sb->s_magic = NTFS_SB_MAGIC; - /* - * Ntfs allows 63 bits for the file size, i.e. correct would be: - * sb->s_maxbytes = ~0ULL >> 1; - * But the kernel uses a long as the page cache page index which on - * 32-bit architectures is only 32-bits. MAX_LFS_FILESIZE is kernel - * defined to the maximum the page cache page index can cope with - * without overflowing the index or to 2^63 - 1, whichever is smaller. - */ - sb->s_maxbytes = MAX_LFS_FILESIZE; - /* Ntfs measures time in 100ns intervals. */ - sb->s_time_gran = 100; - /* - * Now load the metadata required for the page cache and our address - * space operations to function. We do this by setting up a specialised - * read_inode method and then just calling the normal iget() to obtain - * the inode for $MFT which is sufficient to allow our normal inode - * operations and associated address space operations to function. - */ - sb->s_op = &ntfs_sops; - tmp_ino = new_inode(sb); - if (!tmp_ino) { - if (!silent) - ntfs_error(sb, "Failed to load essential metadata."); - goto err_out_now; - } - tmp_ino->i_ino = FILE_MFT; - insert_inode_hash(tmp_ino); - if (ntfs_read_inode_mount(tmp_ino) < 0) { - if (!silent) - ntfs_error(sb, "Failed to load essential metadata."); - goto iput_tmp_ino_err_out_now; - } - mutex_lock(&ntfs_lock); - /* - * The current mount is a compression user if the cluster size is - * less than or equal 4kiB. 
- */ - if (vol->cluster_size <= 4096 && !ntfs_nr_compression_users++) { - result = allocate_compression_buffers(); - if (result) { - ntfs_error(NULL, "Failed to allocate buffers " - "for compression engine."); - ntfs_nr_compression_users--; - mutex_unlock(&ntfs_lock); - goto iput_tmp_ino_err_out_now; - } - } - /* - * Generate the global default upcase table if necessary. Also - * temporarily increment the number of upcase users to avoid race - * conditions with concurrent (u)mounts. - */ - if (!default_upcase) - default_upcase = generate_default_upcase(); - ntfs_nr_upcase_users++; - mutex_unlock(&ntfs_lock); - /* - * From now on, ignore @silent parameter. If we fail below this line, - * it will be due to a corrupt fs or a system error, so we report it. - */ - /* - * Open the system files with normal access functions and complete - * setting up the ntfs super block. - */ - if (!load_system_files(vol)) { - ntfs_error(sb, "Failed to load system files."); - goto unl_upcase_iput_tmp_ino_err_out_now; - } - - /* We grab a reference, simulating an ntfs_iget(). */ - ihold(vol->root_ino); - if ((sb->s_root = d_make_root(vol->root_ino))) { - ntfs_debug("Exiting, status successful."); - /* Release the default upcase if it has no users. */ - mutex_lock(&ntfs_lock); - if (!--ntfs_nr_upcase_users && default_upcase) { - ntfs_free(default_upcase); - default_upcase = NULL; - } - mutex_unlock(&ntfs_lock); - sb->s_export_op = &ntfs_export_ops; - lockdep_on(); - return 0; - } - ntfs_error(sb, "Failed to allocate root directory."); - /* Clean up after the successful load_system_files() call from above. */ - // TODO: Use ntfs_put_super() instead of repeating all this code... - // FIXME: Should mark the volume clean as the error is most likely - // -ENOMEM. - iput(vol->vol_ino); - vol->vol_ino = NULL; - /* NTFS 3.0+ specific clean up. */ - if (vol->major_ver >= 3) { -#ifdef NTFS_RW - if (vol->usnjrnl_j_ino) { - iput(vol->usnjrnl_j_ino); - vol->usnjrnl_j_ino = NULL; - } - if (vol->usnjrnl_max_ino) { - iput(vol->usnjrnl_max_ino); - vol->usnjrnl_max_ino = NULL; - } - if (vol->usnjrnl_ino) { - iput(vol->usnjrnl_ino); - vol->usnjrnl_ino = NULL; - } - if (vol->quota_q_ino) { - iput(vol->quota_q_ino); - vol->quota_q_ino = NULL; - } - if (vol->quota_ino) { - iput(vol->quota_ino); - vol->quota_ino = NULL; - } -#endif /* NTFS_RW */ - if (vol->extend_ino) { - iput(vol->extend_ino); - vol->extend_ino = NULL; - } - if (vol->secure_ino) { - iput(vol->secure_ino); - vol->secure_ino = NULL; - } - } - iput(vol->root_ino); - vol->root_ino = NULL; - iput(vol->lcnbmp_ino); - vol->lcnbmp_ino = NULL; - iput(vol->mftbmp_ino); - vol->mftbmp_ino = NULL; -#ifdef NTFS_RW - if (vol->logfile_ino) { - iput(vol->logfile_ino); - vol->logfile_ino = NULL; - } - if (vol->mftmirr_ino) { - iput(vol->mftmirr_ino); - vol->mftmirr_ino = NULL; - } -#endif /* NTFS_RW */ - /* Throw away the table of attribute definitions. */ - vol->attrdef_size = 0; - if (vol->attrdef) { - ntfs_free(vol->attrdef); - vol->attrdef = NULL; - } - vol->upcase_len = 0; - mutex_lock(&ntfs_lock); - if (vol->upcase == default_upcase) { - ntfs_nr_upcase_users--; - vol->upcase = NULL; - } - mutex_unlock(&ntfs_lock); - if (vol->upcase) { - ntfs_free(vol->upcase); - vol->upcase = NULL; - } - if (vol->nls_map) { - unload_nls(vol->nls_map); - vol->nls_map = NULL; - } - /* Error exit code path. */ -unl_upcase_iput_tmp_ino_err_out_now: - /* - * Decrease the number of upcase users and destroy the global default - * upcase table if necessary. 
- */ - mutex_lock(&ntfs_lock); - if (!--ntfs_nr_upcase_users && default_upcase) { - ntfs_free(default_upcase); - default_upcase = NULL; - } - if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users) - free_compression_buffers(); - mutex_unlock(&ntfs_lock); -iput_tmp_ino_err_out_now: - iput(tmp_ino); - if (vol->mft_ino && vol->mft_ino != tmp_ino) - iput(vol->mft_ino); - vol->mft_ino = NULL; - /* Errors at this stage are irrelevant. */ -err_out_now: - sb->s_fs_info = NULL; - kfree(vol); - ntfs_debug("Failed, returning -EINVAL."); - lockdep_on(); - return -EINVAL; -} - -/* - * This is a slab cache to optimize allocations and deallocations of Unicode - * strings of the maximum length allowed by NTFS, which is NTFS_MAX_NAME_LEN - * (255) Unicode characters + a terminating NULL Unicode character. - */ -struct kmem_cache *ntfs_name_cache; - -/* Slab caches for efficient allocation/deallocation of inodes. */ -struct kmem_cache *ntfs_inode_cache; -struct kmem_cache *ntfs_big_inode_cache; - -/* Init once constructor for the inode slab cache. */ -static void ntfs_big_inode_init_once(void *foo) -{ - ntfs_inode *ni = (ntfs_inode *)foo; - - inode_init_once(VFS_I(ni)); -} - -/* - * Slab caches to optimize allocations and deallocations of attribute search - * contexts and index contexts, respectively. - */ -struct kmem_cache *ntfs_attr_ctx_cache; -struct kmem_cache *ntfs_index_ctx_cache; - -/* Driver wide mutex. */ -DEFINE_MUTEX(ntfs_lock); - -static struct dentry *ntfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_bdev(fs_type, flags, dev_name, data, ntfs_fill_super); -} - -static struct file_system_type ntfs_fs_type = { - .owner = THIS_MODULE, - .name = "ntfs", - .mount = ntfs_mount, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, -}; -MODULE_ALIAS_FS("ntfs"); - -/* Stable names for the slab caches. */ -static const char ntfs_index_ctx_cache_name[] = "ntfs_index_ctx_cache"; -static const char ntfs_attr_ctx_cache_name[] = "ntfs_attr_ctx_cache"; -static const char ntfs_name_cache_name[] = "ntfs_name_cache"; -static const char ntfs_inode_cache_name[] = "ntfs_inode_cache"; -static const char ntfs_big_inode_cache_name[] = "ntfs_big_inode_cache"; - -static int __init init_ntfs_fs(void) -{ - int err = 0; - - /* This may be ugly but it results in pretty output so who cares. 
(-8 */ - pr_info("driver " NTFS_VERSION " [Flags: R/" -#ifdef NTFS_RW - "W" -#else - "O" -#endif -#ifdef DEBUG - " DEBUG" -#endif -#ifdef MODULE - " MODULE" -#endif - "].\n"); - - ntfs_debug("Debug messages are enabled."); - - ntfs_index_ctx_cache = kmem_cache_create(ntfs_index_ctx_cache_name, - sizeof(ntfs_index_context), 0 /* offset */, - SLAB_HWCACHE_ALIGN, NULL /* ctor */); - if (!ntfs_index_ctx_cache) { - pr_crit("Failed to create %s!\n", ntfs_index_ctx_cache_name); - goto ictx_err_out; - } - ntfs_attr_ctx_cache = kmem_cache_create(ntfs_attr_ctx_cache_name, - sizeof(ntfs_attr_search_ctx), 0 /* offset */, - SLAB_HWCACHE_ALIGN, NULL /* ctor */); - if (!ntfs_attr_ctx_cache) { - pr_crit("NTFS: Failed to create %s!\n", - ntfs_attr_ctx_cache_name); - goto actx_err_out; - } - - ntfs_name_cache = kmem_cache_create(ntfs_name_cache_name, - (NTFS_MAX_NAME_LEN+1) * sizeof(ntfschar), 0, - SLAB_HWCACHE_ALIGN, NULL); - if (!ntfs_name_cache) { - pr_crit("Failed to create %s!\n", ntfs_name_cache_name); - goto name_err_out; - } - - ntfs_inode_cache = kmem_cache_create(ntfs_inode_cache_name, - sizeof(ntfs_inode), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); - if (!ntfs_inode_cache) { - pr_crit("Failed to create %s!\n", ntfs_inode_cache_name); - goto inode_err_out; - } - - ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name, - sizeof(big_ntfs_inode), 0, - SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| - SLAB_ACCOUNT, ntfs_big_inode_init_once); - if (!ntfs_big_inode_cache) { - pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name); - goto big_inode_err_out; - } - - /* Register the ntfs sysctls. */ - err = ntfs_sysctl(1); - if (err) { - pr_crit("Failed to register NTFS sysctls!\n"); - goto sysctl_err_out; - } - - err = register_filesystem(&ntfs_fs_type); - if (!err) { - ntfs_debug("NTFS driver registered successfully."); - return 0; /* Success! */ - } - pr_crit("Failed to register NTFS filesystem driver!\n"); - - /* Unregister the ntfs sysctls. */ - ntfs_sysctl(0); -sysctl_err_out: - kmem_cache_destroy(ntfs_big_inode_cache); -big_inode_err_out: - kmem_cache_destroy(ntfs_inode_cache); -inode_err_out: - kmem_cache_destroy(ntfs_name_cache); -name_err_out: - kmem_cache_destroy(ntfs_attr_ctx_cache); -actx_err_out: - kmem_cache_destroy(ntfs_index_ctx_cache); -ictx_err_out: - if (!err) { - pr_crit("Aborting NTFS filesystem driver registration...\n"); - err = -ENOMEM; - } - return err; -} - -static void __exit exit_ntfs_fs(void) -{ - ntfs_debug("Unregistering NTFS driver."); - - unregister_filesystem(&ntfs_fs_type); - - /* - * Make sure all delayed rcu free inodes are flushed before we - * destroy cache. - */ - rcu_barrier(); - kmem_cache_destroy(ntfs_big_inode_cache); - kmem_cache_destroy(ntfs_inode_cache); - kmem_cache_destroy(ntfs_name_cache); - kmem_cache_destroy(ntfs_attr_ctx_cache); - kmem_cache_destroy(ntfs_index_ctx_cache); - /* Unregister the ntfs sysctls. 
*/ - ntfs_sysctl(0); -} - -MODULE_AUTHOR("Anton Altaparmakov <anton@tuxera.com>"); -MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc."); -MODULE_VERSION(NTFS_VERSION); -MODULE_LICENSE("GPL"); -#ifdef DEBUG -module_param(debug_msgs, bint, 0); -MODULE_PARM_DESC(debug_msgs, "Enable debug messages."); -#endif - -module_init(init_ntfs_fs) -module_exit(exit_ntfs_fs) diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c deleted file mode 100644 index 4e980170d86a..000000000000 --- a/fs/ntfs/sysctl.c +++ /dev/null @@ -1,58 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * sysctl.c - Code for sysctl handling in NTFS Linux kernel driver. Part of - * the Linux-NTFS project. Adapted from the old NTFS driver, - * Copyright (C) 1997 Martin von Löwis, Régis Duchesne - * - * Copyright (c) 2002-2005 Anton Altaparmakov - */ - -#ifdef DEBUG - -#include <linux/module.h> - -#ifdef CONFIG_SYSCTL - -#include <linux/proc_fs.h> -#include <linux/sysctl.h> - -#include "sysctl.h" -#include "debug.h" - -/* Definition of the ntfs sysctl. */ -static struct ctl_table ntfs_sysctls[] = { - { - .procname = "ntfs-debug", - .data = &debug_msgs, /* Data pointer and size. */ - .maxlen = sizeof(debug_msgs), - .mode = 0644, /* Mode, proc handler. */ - .proc_handler = proc_dointvec - }, -}; - -/* Storage for the sysctls header. */ -static struct ctl_table_header *sysctls_root_table; - -/** - * ntfs_sysctl - add or remove the debug sysctl - * @add: add (1) or remove (0) the sysctl - * - * Add or remove the debug sysctl. Return 0 on success or -errno on error. - */ -int ntfs_sysctl(int add) -{ - if (add) { - BUG_ON(sysctls_root_table); - sysctls_root_table = register_sysctl("fs", ntfs_sysctls); - if (!sysctls_root_table) - return -ENOMEM; - } else { - BUG_ON(!sysctls_root_table); - unregister_sysctl_table(sysctls_root_table); - sysctls_root_table = NULL; - } - return 0; -} - -#endif /* CONFIG_SYSCTL */ -#endif /* DEBUG */ diff --git a/fs/ntfs/sysctl.h b/fs/ntfs/sysctl.h deleted file mode 100644 index 96bb2299d2d5..000000000000 --- a/fs/ntfs/sysctl.h +++ /dev/null @@ -1,27 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * sysctl.h - Defines for sysctl handling in NTFS Linux kernel driver. Part of - * the Linux-NTFS project. Adapted from the old NTFS driver, - * Copyright (C) 1997 Martin von Löwis, Régis Duchesne - * - * Copyright (c) 2002-2004 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_SYSCTL_H -#define _LINUX_NTFS_SYSCTL_H - - -#if defined(DEBUG) && defined(CONFIG_SYSCTL) - -extern int ntfs_sysctl(int add); - -#else - -/* Just return success. */ -static inline int ntfs_sysctl(int add) -{ - return 0; -} - -#endif /* DEBUG && CONFIG_SYSCTL */ -#endif /* _LINUX_NTFS_SYSCTL_H */ diff --git a/fs/ntfs/time.h b/fs/ntfs/time.h deleted file mode 100644 index 6b63261300cc..000000000000 --- a/fs/ntfs/time.h +++ /dev/null @@ -1,89 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * time.h - NTFS time conversion functions. Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2005 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_TIME_H -#define _LINUX_NTFS_TIME_H - -#include <linux/time.h> /* For current_kernel_time(). */ -#include <asm/div64.h> /* For do_div(). 
*/ - -#include "endian.h" - -#define NTFS_TIME_OFFSET ((s64)(369 * 365 + 89) * 24 * 3600 * 10000000) - -/** - * utc2ntfs - convert Linux UTC time to NTFS time - * @ts: Linux UTC time to convert to NTFS time - * - * Convert the Linux UTC time @ts to its corresponding NTFS time and return - * that in little endian format. - * - * Linux stores time in a struct timespec64 consisting of a time64_t tv_sec - * and a long tv_nsec where tv_sec is the number of 1-second intervals since - * 1st January 1970, 00:00:00 UTC and tv_nsec is the number of 1-nano-second - * intervals since the value of tv_sec. - * - * NTFS uses Microsoft's standard time format which is stored in a s64 and is - * measured as the number of 100-nano-second intervals since 1st January 1601, - * 00:00:00 UTC. - */ -static inline sle64 utc2ntfs(const struct timespec64 ts) -{ - /* - * Convert the seconds to 100ns intervals, add the nano-seconds - * converted to 100ns intervals, and then add the NTFS time offset. - */ - return cpu_to_sle64((s64)ts.tv_sec * 10000000 + ts.tv_nsec / 100 + - NTFS_TIME_OFFSET); -} - -/** - * get_current_ntfs_time - get the current time in little endian NTFS format - * - * Get the current time from the Linux kernel, convert it to its corresponding - * NTFS time and return that in little endian format. - */ -static inline sle64 get_current_ntfs_time(void) -{ - struct timespec64 ts; - - ktime_get_coarse_real_ts64(&ts); - return utc2ntfs(ts); -} - -/** - * ntfs2utc - convert NTFS time to Linux time - * @time: NTFS time (little endian) to convert to Linux UTC - * - * Convert the little endian NTFS time @time to its corresponding Linux UTC - * time and return that in cpu format. - * - * Linux stores time in a struct timespec64 consisting of a time64_t tv_sec - * and a long tv_nsec where tv_sec is the number of 1-second intervals since - * 1st January 1970, 00:00:00 UTC and tv_nsec is the number of 1-nano-second - * intervals since the value of tv_sec. - * - * NTFS uses Microsoft's standard time format which is stored in a s64 and is - * measured as the number of 100 nano-second intervals since 1st January 1601, - * 00:00:00 UTC. - */ -static inline struct timespec64 ntfs2utc(const sle64 time) -{ - struct timespec64 ts; - - /* Subtract the NTFS time offset. */ - u64 t = (u64)(sle64_to_cpu(time) - NTFS_TIME_OFFSET); - /* - * Convert the time to 1-second intervals and the remainder to - * 1-nano-second intervals. - */ - ts.tv_nsec = do_div(t, 10000000) * 100; - ts.tv_sec = t; - return ts; -} - -#endif /* _LINUX_NTFS_TIME_H */ diff --git a/fs/ntfs/types.h b/fs/ntfs/types.h deleted file mode 100644 index 9a47859e7a06..000000000000 --- a/fs/ntfs/types.h +++ /dev/null @@ -1,55 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * types.h - Defines for NTFS Linux kernel driver specific types. - * Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2005 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_TYPES_H -#define _LINUX_NTFS_TYPES_H - -#include <linux/types.h> - -typedef __le16 le16; -typedef __le32 le32; -typedef __le64 le64; -typedef __u16 __bitwise sle16; -typedef __u32 __bitwise sle32; -typedef __u64 __bitwise sle64; - -/* 2-byte Unicode character type. */ -typedef le16 ntfschar; -#define UCHAR_T_SIZE_BITS 1 - -/* - * Clusters are signed 64-bit values on NTFS volumes. We define two types, LCN - * and VCN, to allow for type checking and better code readability. 
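[Editor's aside] NTFS_TIME_OFFSET above is the gap from 1601-01-01 to 1970-01-01 expressed in 100 ns units: 369 years plus 89 leap days, i.e. 116444736000000000. A userspace sketch of the same round-trip arithmetic, with plain stdint types standing in for the kernel's s64/sle64 and byte order ignored:

#include <stdint.h>
#include <stdio.h>

#define NTFS_TIME_OFFSET ((int64_t)(369 * 365 + 89) * 24 * 3600 * 10000000)

int main(void)
{
	/* 2024-01-01T00:00:00 UTC is 1704067200 s after the Unix epoch. */
	int64_t ntfs = (int64_t)1704067200 * 10000000 + NTFS_TIME_OFFSET;

	/* And back: strip the offset, split into 1 s and 100 ns parts. */
	uint64_t t = (uint64_t)(ntfs - NTFS_TIME_OFFSET);
	printf("sec=%llu nsec=%llu\n",
	       (unsigned long long)(t / 10000000),
	       (unsigned long long)(t % 10000000 * 100));
	return 0;
}

The kernel version uses do_div() for the division and remainder because direct 64-bit division is not available on all 32-bit architectures.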
- */ -typedef s64 VCN; -typedef sle64 leVCN; -typedef s64 LCN; -typedef sle64 leLCN; - -/* - * The NTFS journal $LogFile uses log sequence numbers which are signed 64-bit - * values. We define our own type LSN, to allow for type checking and better - * code readability. - */ -typedef s64 LSN; -typedef sle64 leLSN; - -/* - * The NTFS transaction log $UsnJrnl uses usn which are signed 64-bit values. - * We define our own type USN, to allow for type checking and better code - * readability. - */ -typedef s64 USN; -typedef sle64 leUSN; - -typedef enum { - CASE_SENSITIVE = 0, - IGNORE_CASE = 1, -} IGNORE_CASE_BOOL; - -#endif /* _LINUX_NTFS_TYPES_H */ diff --git a/fs/ntfs/unistr.c b/fs/ntfs/unistr.c deleted file mode 100644 index a6b6c64f14a9..000000000000 --- a/fs/ntfs/unistr.c +++ /dev/null @@ -1,384 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * unistr.c - NTFS Unicode string handling. Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2006 Anton Altaparmakov - */ - -#include <linux/slab.h> - -#include "types.h" -#include "debug.h" -#include "ntfs.h" - -/* - * IMPORTANT - * ========= - * - * All these routines assume that the Unicode characters are in little endian - * encoding inside the strings!!! - */ - -/* - * This is used by the name collation functions to quickly determine what - * characters are (in)valid. - */ -static const u8 legal_ansi_char_array[0x40] = { - 0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, - 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, - - 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, - 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, - - 0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17, - 0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00, - - 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, - 0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18, -}; - -/** - * ntfs_are_names_equal - compare two Unicode names for equality - * @s1: name to compare to @s2 - * @s1_len: length in Unicode characters of @s1 - * @s2: name to compare to @s1 - * @s2_len: length in Unicode characters of @s2 - * @ic: ignore case bool - * @upcase: upcase table (only if @ic == IGNORE_CASE) - * @upcase_size: length in Unicode characters of @upcase (if present) - * - * Compare the names @s1 and @s2 and return 'true' (1) if the names are - * identical, or 'false' (0) if they are not identical. If @ic is IGNORE_CASE, - * the @upcase table is used to performa a case insensitive comparison. - */ -bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len, - const ntfschar *s2, size_t s2_len, const IGNORE_CASE_BOOL ic, - const ntfschar *upcase, const u32 upcase_size) -{ - if (s1_len != s2_len) - return false; - if (ic == CASE_SENSITIVE) - return !ntfs_ucsncmp(s1, s2, s1_len); - return !ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size); -} - -/** - * ntfs_collate_names - collate two Unicode names - * @name1: first Unicode name to compare - * @name2: second Unicode name to compare - * @err_val: if @name1 contains an invalid character return this value - * @ic: either CASE_SENSITIVE or IGNORE_CASE - * @upcase: upcase table (ignored if @ic is CASE_SENSITIVE) - * @upcase_len: upcase table size (ignored if @ic is CASE_SENSITIVE) - * - * ntfs_collate_names collates two Unicode names and returns: - * - * -1 if the first name collates before the second one, - * 0 if the names match, - * 1 if the second name collates before the first one, or - * @err_val if an invalid character is found in @name1 during the comparison. 
- * - * The following characters are considered invalid: '"', '*', '<', '>' and '?'. - */ -int ntfs_collate_names(const ntfschar *name1, const u32 name1_len, - const ntfschar *name2, const u32 name2_len, - const int err_val, const IGNORE_CASE_BOOL ic, - const ntfschar *upcase, const u32 upcase_len) -{ - u32 cnt, min_len; - u16 c1, c2; - - min_len = name1_len; - if (name1_len > name2_len) - min_len = name2_len; - for (cnt = 0; cnt < min_len; ++cnt) { - c1 = le16_to_cpu(*name1++); - c2 = le16_to_cpu(*name2++); - if (ic) { - if (c1 < upcase_len) - c1 = le16_to_cpu(upcase[c1]); - if (c2 < upcase_len) - c2 = le16_to_cpu(upcase[c2]); - } - if (c1 < 64 && legal_ansi_char_array[c1] & 8) - return err_val; - if (c1 < c2) - return -1; - if (c1 > c2) - return 1; - } - if (name1_len < name2_len) - return -1; - if (name1_len == name2_len) - return 0; - /* name1_len > name2_len */ - c1 = le16_to_cpu(*name1); - if (c1 < 64 && legal_ansi_char_array[c1] & 8) - return err_val; - return 1; -} - -/** - * ntfs_ucsncmp - compare two little endian Unicode strings - * @s1: first string - * @s2: second string - * @n: maximum unicode characters to compare - * - * Compare the first @n characters of the Unicode strings @s1 and @s2, - * The strings in little endian format and appropriate le16_to_cpu() - * conversion is performed on non-little endian machines. - * - * The function returns an integer less than, equal to, or greater than zero - * if @s1 (or the first @n Unicode characters thereof) is found, respectively, - * to be less than, to match, or be greater than @s2. - */ -int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n) -{ - u16 c1, c2; - size_t i; - - for (i = 0; i < n; ++i) { - c1 = le16_to_cpu(s1[i]); - c2 = le16_to_cpu(s2[i]); - if (c1 < c2) - return -1; - if (c1 > c2) - return 1; - if (!c1) - break; - } - return 0; -} - -/** - * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case - * @s1: first string - * @s2: second string - * @n: maximum unicode characters to compare - * @upcase: upcase table - * @upcase_size: upcase table size in Unicode characters - * - * Compare the first @n characters of the Unicode strings @s1 and @s2, - * ignoring case. The strings in little endian format and appropriate - * le16_to_cpu() conversion is performed on non-little endian machines. - * - * Each character is uppercased using the @upcase table before the comparison. - * - * The function returns an integer less than, equal to, or greater than zero - * if @s1 (or the first @n Unicode characters thereof) is found, respectively, - * to be less than, to match, or be greater than @s2. 
- */ -int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n, - const ntfschar *upcase, const u32 upcase_size) -{ - size_t i; - u16 c1, c2; - - for (i = 0; i < n; ++i) { - if ((c1 = le16_to_cpu(s1[i])) < upcase_size) - c1 = le16_to_cpu(upcase[c1]); - if ((c2 = le16_to_cpu(s2[i])) < upcase_size) - c2 = le16_to_cpu(upcase[c2]); - if (c1 < c2) - return -1; - if (c1 > c2) - return 1; - if (!c1) - break; - } - return 0; -} - -void ntfs_upcase_name(ntfschar *name, u32 name_len, const ntfschar *upcase, - const u32 upcase_len) -{ - u32 i; - u16 u; - - for (i = 0; i < name_len; i++) - if ((u = le16_to_cpu(name[i])) < upcase_len) - name[i] = upcase[u]; -} - -void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr, - const ntfschar *upcase, const u32 upcase_len) -{ - ntfs_upcase_name((ntfschar*)&file_name_attr->file_name, - file_name_attr->file_name_length, upcase, upcase_len); -} - -int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1, - FILE_NAME_ATTR *file_name_attr2, - const int err_val, const IGNORE_CASE_BOOL ic, - const ntfschar *upcase, const u32 upcase_len) -{ - return ntfs_collate_names((ntfschar*)&file_name_attr1->file_name, - file_name_attr1->file_name_length, - (ntfschar*)&file_name_attr2->file_name, - file_name_attr2->file_name_length, - err_val, ic, upcase, upcase_len); -} - -/** - * ntfs_nlstoucs - convert NLS string to little endian Unicode string - * @vol: ntfs volume which we are working with - * @ins: input NLS string buffer - * @ins_len: length of input string in bytes - * @outs: on return contains the allocated output Unicode string buffer - * - * Convert the input string @ins, which is in whatever format the loaded NLS - * map dictates, into a little endian, 2-byte Unicode string. - * - * This function allocates the string and the caller is responsible for - * calling kmem_cache_free(ntfs_name_cache, *@outs); when finished with it. - * - * On success the function returns the number of Unicode characters written to - * the output string *@outs (>= 0), not counting the terminating Unicode NULL - * character. *@outs is set to the allocated output string buffer. - * - * On error, a negative number corresponding to the error code is returned. In - * that case the output string is not allocated. Both *@outs and *@outs_len - * are then undefined. - * - * This might look a bit odd due to fast path optimization... - */ -int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins, - const int ins_len, ntfschar **outs) -{ - struct nls_table *nls = vol->nls_map; - ntfschar *ucs; - wchar_t wc; - int i, o, wc_len; - - /* We do not trust outside sources. 
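[Editor's aside] Per the ntfs_nlstoucs() kernel-doc above, the converted name is allocated from ntfs_name_cache and ownership passes to the caller. A hypothetical caller sketch:

	ntfschar *uname;
	int uname_len;

	uname_len = ntfs_nlstoucs(vol, name, name_len, &uname);
	if (uname_len < 0)
		return uname_len;	/* -EINVAL, -EILSEQ or -ENAMETOOLONG */
	/* ... use uname / uname_len for lookup or collation ... */
	kmem_cache_free(ntfs_name_cache, uname);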
*/ - if (likely(ins)) { - ucs = kmem_cache_alloc(ntfs_name_cache, GFP_NOFS); - if (likely(ucs)) { - for (i = o = 0; i < ins_len; i += wc_len) { - wc_len = nls->char2uni(ins + i, ins_len - i, - &wc); - if (likely(wc_len >= 0 && - o < NTFS_MAX_NAME_LEN)) { - if (likely(wc)) { - ucs[o++] = cpu_to_le16(wc); - continue; - } /* else if (!wc) */ - break; - } /* else if (wc_len < 0 || - o >= NTFS_MAX_NAME_LEN) */ - goto name_err; - } - ucs[o] = 0; - *outs = ucs; - return o; - } /* else if (!ucs) */ - ntfs_error(vol->sb, "Failed to allocate buffer for converted " - "name from ntfs_name_cache."); - return -ENOMEM; - } /* else if (!ins) */ - ntfs_error(vol->sb, "Received NULL pointer."); - return -EINVAL; -name_err: - kmem_cache_free(ntfs_name_cache, ucs); - if (wc_len < 0) { - ntfs_error(vol->sb, "Name using character set %s contains " - "characters that cannot be converted to " - "Unicode.", nls->charset); - i = -EILSEQ; - } else /* if (o >= NTFS_MAX_NAME_LEN) */ { - ntfs_error(vol->sb, "Name is too long (maximum length for a " - "name on NTFS is %d Unicode characters.", - NTFS_MAX_NAME_LEN); - i = -ENAMETOOLONG; - } - return i; -} - -/** - * ntfs_ucstonls - convert little endian Unicode string to NLS string - * @vol: ntfs volume which we are working with - * @ins: input Unicode string buffer - * @ins_len: length of input string in Unicode characters - * @outs: on return contains the (allocated) output NLS string buffer - * @outs_len: length of output string buffer in bytes - * - * Convert the input little endian, 2-byte Unicode string @ins, of length - * @ins_len into the string format dictated by the loaded NLS. - * - * If *@outs is NULL, this function allocates the string and the caller is - * responsible for calling kfree(*@outs); when finished with it. In this case - * @outs_len is ignored and can be 0. - * - * On success the function returns the number of bytes written to the output - * string *@outs (>= 0), not counting the terminating NULL byte. If the output - * string buffer was allocated, *@outs is set to it. - * - * On error, a negative number corresponding to the error code is returned. In - * that case the output string is not allocated. The contents of *@outs are - * then undefined. - * - * This might look a bit odd due to fast path optimization... - */ -int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins, - const int ins_len, unsigned char **outs, int outs_len) -{ - struct nls_table *nls = vol->nls_map; - unsigned char *ns; - int i, o, ns_len, wc; - - /* We don't trust outside sources. */ - if (ins) { - ns = *outs; - ns_len = outs_len; - if (ns && !ns_len) { - wc = -ENAMETOOLONG; - goto conversion_err; - } - if (!ns) { - ns_len = ins_len * NLS_MAX_CHARSET_SIZE; - ns = kmalloc(ns_len + 1, GFP_NOFS); - if (!ns) - goto mem_err_out; - } - for (i = o = 0; i < ins_len; i++) { -retry: wc = nls->uni2char(le16_to_cpu(ins[i]), ns + o, - ns_len - o); - if (wc > 0) { - o += wc; - continue; - } else if (!wc) - break; - else if (wc == -ENAMETOOLONG && ns != *outs) { - unsigned char *tc; - /* Grow in multiples of 64 bytes. */ - tc = kmalloc((ns_len + 64) & - ~63, GFP_NOFS); - if (tc) { - memcpy(tc, ns, ns_len); - ns_len = ((ns_len + 64) & ~63) - 1; - kfree(ns); - ns = tc; - goto retry; - } /* No memory so goto conversion_error; */ - } /* wc < 0, real error. 
*/ - goto conversion_err; - } - ns[o] = 0; - *outs = ns; - return o; - } /* else (!ins) */ - ntfs_error(vol->sb, "Received NULL pointer."); - return -EINVAL; -conversion_err: - ntfs_error(vol->sb, "Unicode name contains characters that cannot be " - "converted to character set %s. You might want to " - "try to use the mount option nls=utf8.", nls->charset); - if (ns != *outs) - kfree(ns); - if (wc != -ENAMETOOLONG) - wc = -EILSEQ; - return wc; -mem_err_out: - ntfs_error(vol->sb, "Failed to allocate name!"); - return -ENOMEM; -} diff --git a/fs/ntfs/upcase.c b/fs/ntfs/upcase.c deleted file mode 100644 index 4ebe84a78dea..000000000000 --- a/fs/ntfs/upcase.c +++ /dev/null @@ -1,73 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * upcase.c - Generate the full NTFS Unicode upcase table in little endian. - * Part of the Linux-NTFS project. - * - * Copyright (c) 2001 Richard Russon <ntfs@flatcap.org> - * Copyright (c) 2001-2006 Anton Altaparmakov - */ - -#include "malloc.h" -#include "ntfs.h" - -ntfschar *generate_default_upcase(void) -{ - static const int uc_run_table[][3] = { /* Start, End, Add */ - {0x0061, 0x007B, -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72, 74}, - {0x00E0, 0x00F7, -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76, 86}, - {0x00F8, 0x00FF, -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100}, - {0x0256, 0x0258, -205}, {0x1F00, 0x1F08, 8}, {0x1F78, 0x1F7A, 128}, - {0x028A, 0x028C, -217}, {0x1F10, 0x1F16, 8}, {0x1F7A, 0x1F7C, 112}, - {0x03AC, 0x03AD, -38}, {0x1F20, 0x1F28, 8}, {0x1F7C, 0x1F7E, 126}, - {0x03AD, 0x03B0, -37}, {0x1F30, 0x1F38, 8}, {0x1FB0, 0x1FB2, 8}, - {0x03B1, 0x03C2, -32}, {0x1F40, 0x1F46, 8}, {0x1FD0, 0x1FD2, 8}, - {0x03C2, 0x03C3, -31}, {0x1F51, 0x1F52, 8}, {0x1FE0, 0x1FE2, 8}, - {0x03C3, 0x03CC, -32}, {0x1F53, 0x1F54, 8}, {0x1FE5, 0x1FE6, 7}, - {0x03CC, 0x03CD, -64}, {0x1F55, 0x1F56, 8}, {0x2170, 0x2180, -16}, - {0x03CD, 0x03CF, -63}, {0x1F57, 0x1F58, 8}, {0x24D0, 0x24EA, -26}, - {0x0430, 0x0450, -32}, {0x1F60, 0x1F68, 8}, {0xFF41, 0xFF5B, -32}, - {0} - }; - - static const int uc_dup_table[][2] = { /* Start, End */ - {0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC}, - {0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB}, - {0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5}, - {0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9}, - {0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95}, - {0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9}, - {0} - }; - - static const int uc_word_table[][2] = { /* Offset, Value */ - {0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, {0x0269, 0x0196}, - {0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C}, - {0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D}, - {0x0188, 0x0187}, {0x01BD, 0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F}, - {0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9}, - {0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE}, - {0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7}, - {0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197}, - {0} - }; - - int i, r; - ntfschar *uc; - - uc = ntfs_malloc_nofs(default_upcase_len * sizeof(ntfschar)); - if (!uc) - return uc; - memset(uc, 0, default_upcase_len * sizeof(ntfschar)); - /* Generate the little endian Unicode upcase table used by ntfs. 
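[Editor's aside] The three tables above drive the loops that follow, layered over an identity mapping: uc_run_table adds a constant delta across a range ('a' 0x0061 gets -32 and becomes 'A' 0x0041), uc_dup_table steps through alternating upper/lower pairs, mapping each lowercase member down by one to its uppercase partner (U+0101 becomes U+0100), and uc_word_table patches individual code points (U+00FF becomes U+0178). A spot-check sketch against the generated table uc:

	assert(le16_to_cpu(uc[0x0061]) == 0x0041);	/* run rule  */
	assert(le16_to_cpu(uc[0x0101]) == 0x0100);	/* dup rule  */
	assert(le16_to_cpu(uc[0x00ff]) == 0x0178);	/* word rule */
	assert(le16_to_cpu(uc[0x0041]) == 0x0041);	/* no rule: identity */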
*/ - for (i = 0; i < default_upcase_len; i++) - uc[i] = cpu_to_le16(i); - for (r = 0; uc_run_table[r][0]; r++) - for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++) - le16_add_cpu(&uc[i], uc_run_table[r][2]); - for (r = 0; uc_dup_table[r][0]; r++) - for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2) - le16_add_cpu(&uc[i + 1], -1); - for (r = 0; uc_word_table[r][0]; r++) - uc[uc_word_table[r][0]] = cpu_to_le16(uc_word_table[r][1]); - return uc; -} diff --git a/fs/ntfs/usnjrnl.c b/fs/ntfs/usnjrnl.c deleted file mode 100644 index 9097a0b4ef25..000000000000 --- a/fs/ntfs/usnjrnl.c +++ /dev/null @@ -1,70 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * usnjrnl.h - NTFS kernel transaction log ($UsnJrnl) handling. Part of the - * Linux-NTFS project. - * - * Copyright (c) 2005 Anton Altaparmakov - */ - -#ifdef NTFS_RW - -#include <linux/fs.h> -#include <linux/highmem.h> -#include <linux/mm.h> - -#include "aops.h" -#include "debug.h" -#include "endian.h" -#include "time.h" -#include "types.h" -#include "usnjrnl.h" -#include "volume.h" - -/** - * ntfs_stamp_usnjrnl - stamp the transaction log ($UsnJrnl) on an ntfs volume - * @vol: ntfs volume on which to stamp the transaction log - * - * Stamp the transaction log ($UsnJrnl) on the ntfs volume @vol and return - * 'true' on success and 'false' on error. - * - * This function assumes that the transaction log has already been loaded and - * consistency checked by a call to fs/ntfs/super.c::load_and_init_usnjrnl(). - */ -bool ntfs_stamp_usnjrnl(ntfs_volume *vol) -{ - ntfs_debug("Entering."); - if (likely(!NVolUsnJrnlStamped(vol))) { - sle64 stamp; - struct page *page; - USN_HEADER *uh; - - page = ntfs_map_page(vol->usnjrnl_max_ino->i_mapping, 0); - if (IS_ERR(page)) { - ntfs_error(vol->sb, "Failed to read from " - "$UsnJrnl/$DATA/$Max attribute."); - return false; - } - uh = (USN_HEADER*)page_address(page); - stamp = get_current_ntfs_time(); - ntfs_debug("Stamping transaction log ($UsnJrnl): old " - "journal_id 0x%llx, old lowest_valid_usn " - "0x%llx, new journal_id 0x%llx, new " - "lowest_valid_usn 0x%llx.", - (long long)sle64_to_cpu(uh->journal_id), - (long long)sle64_to_cpu(uh->lowest_valid_usn), - (long long)sle64_to_cpu(stamp), - i_size_read(vol->usnjrnl_j_ino)); - uh->lowest_valid_usn = - cpu_to_sle64(i_size_read(vol->usnjrnl_j_ino)); - uh->journal_id = stamp; - flush_dcache_page(page); - set_page_dirty(page); - ntfs_unmap_page(page); - /* Set the flag so we do not have to do it again on remount. */ - NVolSetUsnJrnlStamped(vol); - } - ntfs_debug("Done."); - return true; -} - -#endif /* NTFS_RW */ diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h deleted file mode 100644 index 85f531b59395..000000000000 --- a/fs/ntfs/usnjrnl.h +++ /dev/null @@ -1,191 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * usnjrnl.h - Defines for NTFS kernel transaction log ($UsnJrnl) handling. - * Part of the Linux-NTFS project. - * - * Copyright (c) 2005 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_USNJRNL_H -#define _LINUX_NTFS_USNJRNL_H - -#ifdef NTFS_RW - -#include "types.h" -#include "endian.h" -#include "layout.h" -#include "volume.h" - -/* - * Transaction log ($UsnJrnl) organization: - * - * The transaction log records whenever a file is modified in any way. So for - * example it will record that file "blah" was written to at a particular time - * but not what was written. If will record that a file was deleted or - * created, that a file was truncated, etc. See below for all the reason - * codes used. 
- * - * The transaction log is in the $Extend directory which is in the root - * directory of each volume. If it is not present it means transaction - * logging is disabled. If it is present it means transaction logging is - * either enabled or in the process of being disabled in which case we can - * ignore it as it will go away as soon as Windows gets its hands on it. - * - * To determine whether the transaction logging is enabled or in the process - * of being disabled, need to check the volume flags in the - * $VOLUME_INFORMATION attribute in the $Volume system file (which is present - * in the root directory and has a fixed mft record number, see layout.h). - * If the flag VOLUME_DELETE_USN_UNDERWAY is set it means the transaction log - * is in the process of being disabled and if this flag is clear it means the - * transaction log is enabled. - * - * The transaction log consists of two parts; the $DATA/$Max attribute as well - * as the $DATA/$J attribute. $Max is a header describing the transaction - * log whilst $J is the transaction log data itself as a sequence of variable - * sized USN_RECORDs (see below for all the structures). - * - * We do not care about transaction logging at this point in time but we still - * need to let windows know that the transaction log is out of date. To do - * this we need to stamp the transaction log. This involves setting the - * lowest_valid_usn field in the $DATA/$Max attribute to the usn to be used - * for the next added USN_RECORD to the $DATA/$J attribute as well as - * generating a new journal_id in $DATA/$Max. - * - * The journal_id is as of the current version (2.0) of the transaction log - * simply the 64-bit timestamp of when the journal was either created or last - * stamped. - * - * To determine the next usn there are two ways. The first is to parse - * $DATA/$J and to find the last USN_RECORD in it and to add its record_length - * to its usn (which is the byte offset in the $DATA/$J attribute). The - * second is simply to take the data size of the attribute. Since the usns - * are simply byte offsets into $DATA/$J, this is exactly the next usn. For - * obvious reasons we use the second method as it is much simpler and faster. - * - * As an aside, note that to actually disable the transaction log, one would - * need to set the VOLUME_DELETE_USN_UNDERWAY flag (see above), then go - * through all the mft records on the volume and set the usn field in their - * $STANDARD_INFORMATION attribute to zero. Once that is done, one would need - * to delete the transaction log file, i.e. \$Extent\$UsnJrnl, and finally, - * one would need to clear the VOLUME_DELETE_USN_UNDERWAY flag. - * - * Note that if a volume is unmounted whilst the transaction log is being - * disabled, the process will continue the next time the volume is mounted. - * This is why we can safely mount read-write when we see a transaction log - * in the process of being deleted. - */ - -/* Some $UsnJrnl related constants. */ -#define UsnJrnlMajorVer 2 -#define UsnJrnlMinorVer 0 - -/* - * $DATA/$Max attribute. This is (always?) resident and has a fixed size of - * 32 bytes. It contains the header describing the transaction log. - */ -typedef struct { -/*Ofs*/ -/* 0*/sle64 maximum_size; /* The maximum on-disk size of the $DATA/$J - attribute. */ -/* 8*/sle64 allocation_delta; /* Number of bytes by which to increase the - size of the $DATA/$J attribute. */ -/*0x10*/sle64 journal_id; /* Current id of the transaction log. 
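[Editor's aside] ntfs_stamp_usnjrnl() above implements the "next usn" rule directly: since usns are byte offsets into $DATA/$J, the next usn to be handed out is simply the attribute's data size. Reduced to the one line that matters, field names as in the stamping code:

	uh->lowest_valid_usn = cpu_to_sle64(i_size_read(vol->usnjrnl_j_ino));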
*/ -/*0x18*/leUSN lowest_valid_usn; /* Lowest valid usn in $DATA/$J for the - current journal_id. */ -/* sizeof() = 32 (0x20) bytes */ -} __attribute__ ((__packed__)) USN_HEADER; - -/* - * Reason flags (32-bit). Cumulative flags describing the change(s) to the - * file since it was last opened. I think the names speak for themselves but - * if you disagree check out the descriptions in the Linux NTFS project NTFS - * documentation: http://www.linux-ntfs.org/ - */ -enum { - USN_REASON_DATA_OVERWRITE = cpu_to_le32(0x00000001), - USN_REASON_DATA_EXTEND = cpu_to_le32(0x00000002), - USN_REASON_DATA_TRUNCATION = cpu_to_le32(0x00000004), - USN_REASON_NAMED_DATA_OVERWRITE = cpu_to_le32(0x00000010), - USN_REASON_NAMED_DATA_EXTEND = cpu_to_le32(0x00000020), - USN_REASON_NAMED_DATA_TRUNCATION= cpu_to_le32(0x00000040), - USN_REASON_FILE_CREATE = cpu_to_le32(0x00000100), - USN_REASON_FILE_DELETE = cpu_to_le32(0x00000200), - USN_REASON_EA_CHANGE = cpu_to_le32(0x00000400), - USN_REASON_SECURITY_CHANGE = cpu_to_le32(0x00000800), - USN_REASON_RENAME_OLD_NAME = cpu_to_le32(0x00001000), - USN_REASON_RENAME_NEW_NAME = cpu_to_le32(0x00002000), - USN_REASON_INDEXABLE_CHANGE = cpu_to_le32(0x00004000), - USN_REASON_BASIC_INFO_CHANGE = cpu_to_le32(0x00008000), - USN_REASON_HARD_LINK_CHANGE = cpu_to_le32(0x00010000), - USN_REASON_COMPRESSION_CHANGE = cpu_to_le32(0x00020000), - USN_REASON_ENCRYPTION_CHANGE = cpu_to_le32(0x00040000), - USN_REASON_OBJECT_ID_CHANGE = cpu_to_le32(0x00080000), - USN_REASON_REPARSE_POINT_CHANGE = cpu_to_le32(0x00100000), - USN_REASON_STREAM_CHANGE = cpu_to_le32(0x00200000), - USN_REASON_CLOSE = cpu_to_le32(0x80000000), -}; - -typedef le32 USN_REASON_FLAGS; - -/* - * Source info flags (32-bit). Information about the source of the change(s) - * to the file. For detailed descriptions of what these mean, see the Linux - * NTFS project NTFS documentation: - * http://www.linux-ntfs.org/ - */ -enum { - USN_SOURCE_DATA_MANAGEMENT = cpu_to_le32(0x00000001), - USN_SOURCE_AUXILIARY_DATA = cpu_to_le32(0x00000002), - USN_SOURCE_REPLICATION_MANAGEMENT = cpu_to_le32(0x00000004), -}; - -typedef le32 USN_SOURCE_INFO_FLAGS; - -/* - * $DATA/$J attribute. This is always non-resident, is marked as sparse, and - * is of variabled size. It consists of a sequence of variable size - * USN_RECORDS. The minimum allocated_size is allocation_delta as - * specified in $DATA/$Max. When the maximum_size specified in $DATA/$Max is - * exceeded by more than allocation_delta bytes, allocation_delta bytes are - * allocated and appended to the $DATA/$J attribute and an equal number of - * bytes at the beginning of the attribute are freed and made sparse. Note the - * making sparse only happens at volume checkpoints and hence the actual - * $DATA/$J size can exceed maximum_size + allocation_delta temporarily. - */ -typedef struct { -/*Ofs*/ -/* 0*/le32 length; /* Byte size of this record (8-byte - aligned). */ -/* 4*/le16 major_ver; /* Major version of the transaction log used - for this record. */ -/* 6*/le16 minor_ver; /* Minor version of the transaction log used - for this record. */ -/* 8*/leMFT_REF mft_reference;/* The mft reference of the file (or - directory) described by this record. */ -/*0x10*/leMFT_REF parent_directory;/* The mft reference of the parent - directory of the file described by this - record. */ -/*0x18*/leUSN usn; /* The usn of this record. Equals the offset - within the $DATA/$J attribute. */ -/*0x20*/sle64 time; /* Time when this record was created. 
*/ -/*0x28*/USN_REASON_FLAGS reason;/* Reason flags (see above). */ -/*0x2c*/USN_SOURCE_INFO_FLAGS source_info;/* Source info flags (see above). */ -/*0x30*/le32 security_id; /* File security_id copied from - $STANDARD_INFORMATION. */ -/*0x34*/FILE_ATTR_FLAGS file_attributes; /* File attributes copied from - $STANDARD_INFORMATION or $FILE_NAME (not - sure which). */ -/*0x38*/le16 file_name_size; /* Size of the file name in bytes. */ -/*0x3a*/le16 file_name_offset; /* Offset to the file name in bytes from the - start of this record. */ -/*0x3c*/ntfschar file_name[0]; /* Use when creating only. When reading use - file_name_offset to determine the location - of the name. */ -/* sizeof() = 60 (0x3c) bytes */ -} __attribute__ ((__packed__)) USN_RECORD; - -extern bool ntfs_stamp_usnjrnl(ntfs_volume *vol); - -#endif /* NTFS_RW */ - -#endif /* _LINUX_NTFS_USNJRNL_H */ diff --git a/fs/ntfs/volume.h b/fs/ntfs/volume.h deleted file mode 100644 index 930a9ae8a053..000000000000 --- a/fs/ntfs/volume.h +++ /dev/null @@ -1,164 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * volume.h - Defines for volume structures in NTFS Linux kernel driver. Part - * of the Linux-NTFS project. - * - * Copyright (c) 2001-2006 Anton Altaparmakov - * Copyright (c) 2002 Richard Russon - */ - -#ifndef _LINUX_NTFS_VOLUME_H -#define _LINUX_NTFS_VOLUME_H - -#include <linux/rwsem.h> -#include <linux/uidgid.h> - -#include "types.h" -#include "layout.h" - -/* - * The NTFS in memory super block structure. - */ -typedef struct { - /* - * FIXME: Reorder to have commonly used together element within the - * same cache line, aiming at a cache line size of 32 bytes. Aim for - * 64 bytes for less commonly used together elements. Put most commonly - * used elements to front of structure. Obviously do this only when the - * structure has stabilized... (AIA) - */ - /* Device specifics. */ - struct super_block *sb; /* Pointer back to the super_block. */ - LCN nr_blocks; /* Number of sb->s_blocksize bytes - sized blocks on the device. */ - /* Configuration provided by user at mount time. */ - unsigned long flags; /* Miscellaneous flags, see below. */ - kuid_t uid; /* uid that files will be mounted as. */ - kgid_t gid; /* gid that files will be mounted as. */ - umode_t fmask; /* The mask for file permissions. */ - umode_t dmask; /* The mask for directory - permissions. */ - u8 mft_zone_multiplier; /* Initial mft zone multiplier. */ - u8 on_errors; /* What to do on filesystem errors. */ - /* NTFS bootsector provided information. */ - u16 sector_size; /* in bytes */ - u8 sector_size_bits; /* log2(sector_size) */ - u32 cluster_size; /* in bytes */ - u32 cluster_size_mask; /* cluster_size - 1 */ - u8 cluster_size_bits; /* log2(cluster_size) */ - u32 mft_record_size; /* in bytes */ - u32 mft_record_size_mask; /* mft_record_size - 1 */ - u8 mft_record_size_bits; /* log2(mft_record_size) */ - u32 index_record_size; /* in bytes */ - u32 index_record_size_mask; /* index_record_size - 1 */ - u8 index_record_size_bits; /* log2(index_record_size) */ - LCN nr_clusters; /* Volume size in clusters == number of - bits in lcn bitmap. */ - LCN mft_lcn; /* Cluster location of mft data. */ - LCN mftmirr_lcn; /* Cluster location of copy of mft. */ - u64 serial_no; /* The volume serial number. */ - /* Mount specific NTFS information. */ - u32 upcase_len; /* Number of entries in upcase[]. */ - ntfschar *upcase; /* The upcase table. */ - - s32 attrdef_size; /* Size of the attribute definition - table in bytes. 
*/ - ATTR_DEF *attrdef; /* Table of attribute definitions. - Obtained from FILE_AttrDef. */ - -#ifdef NTFS_RW - /* Variables used by the cluster and mft allocators. */ - s64 mft_data_pos; /* Mft record number at which to - allocate the next mft record. */ - LCN mft_zone_start; /* First cluster of the mft zone. */ - LCN mft_zone_end; /* First cluster beyond the mft zone. */ - LCN mft_zone_pos; /* Current position in the mft zone. */ - LCN data1_zone_pos; /* Current position in the first data - zone. */ - LCN data2_zone_pos; /* Current position in the second data - zone. */ -#endif /* NTFS_RW */ - - struct inode *mft_ino; /* The VFS inode of $MFT. */ - - struct inode *mftbmp_ino; /* Attribute inode for $MFT/$BITMAP. */ - struct rw_semaphore mftbmp_lock; /* Lock for serializing accesses to the - mft record bitmap ($MFT/$BITMAP). */ -#ifdef NTFS_RW - struct inode *mftmirr_ino; /* The VFS inode of $MFTMirr. */ - int mftmirr_size; /* Size of mft mirror in mft records. */ - - struct inode *logfile_ino; /* The VFS inode of $LogFile. */ -#endif /* NTFS_RW */ - - struct inode *lcnbmp_ino; /* The VFS inode of $Bitmap. */ - struct rw_semaphore lcnbmp_lock; /* Lock for serializing accesses to the - cluster bitmap ($Bitmap/$DATA). */ - - struct inode *vol_ino; /* The VFS inode of $Volume. */ - VOLUME_FLAGS vol_flags; /* Volume flags. */ - u8 major_ver; /* Ntfs major version of volume. */ - u8 minor_ver; /* Ntfs minor version of volume. */ - - struct inode *root_ino; /* The VFS inode of the root - directory. */ - struct inode *secure_ino; /* The VFS inode of $Secure (NTFS3.0+ - only, otherwise NULL). */ - struct inode *extend_ino; /* The VFS inode of $Extend (NTFS3.0+ - only, otherwise NULL). */ -#ifdef NTFS_RW - /* $Quota stuff is NTFS3.0+ specific. Unused/NULL otherwise. */ - struct inode *quota_ino; /* The VFS inode of $Quota. */ - struct inode *quota_q_ino; /* Attribute inode for $Quota/$Q. */ - /* $UsnJrnl stuff is NTFS3.0+ specific. Unused/NULL otherwise. */ - struct inode *usnjrnl_ino; /* The VFS inode of $UsnJrnl. */ - struct inode *usnjrnl_max_ino; /* Attribute inode for $UsnJrnl/$Max. */ - struct inode *usnjrnl_j_ino; /* Attribute inode for $UsnJrnl/$J. */ -#endif /* NTFS_RW */ - struct nls_table *nls_map; -} ntfs_volume; - -/* - * Defined bits for the flags field in the ntfs_volume structure. - */ -typedef enum { - NV_Errors, /* 1: Volume has errors, prevent remount rw. */ - NV_ShowSystemFiles, /* 1: Return system files in ntfs_readdir(). */ - NV_CaseSensitive, /* 1: Treat file names as case sensitive and - create filenames in the POSIX namespace. - Otherwise be case insensitive but still - create file names in POSIX namespace. */ - NV_LogFileEmpty, /* 1: $LogFile journal is empty. */ - NV_QuotaOutOfDate, /* 1: $Quota is out of date. */ - NV_UsnJrnlStamped, /* 1: $UsnJrnl has been stamped. */ - NV_SparseEnabled, /* 1: May create sparse files. */ -} ntfs_volume_flags; - -/* - * Macro tricks to expand the NVolFoo(), NVolSetFoo(), and NVolClearFoo() - * functions. - */ -#define DEFINE_NVOL_BIT_OPS(flag) \ -static inline int NVol##flag(ntfs_volume *vol) \ -{ \ - return test_bit(NV_##flag, &(vol)->flags); \ -} \ -static inline void NVolSet##flag(ntfs_volume *vol) \ -{ \ - set_bit(NV_##flag, &(vol)->flags); \ -} \ -static inline void NVolClear##flag(ntfs_volume *vol) \ -{ \ - clear_bit(NV_##flag, &(vol)->flags); \ -} - -/* Emit the ntfs volume bitops functions. 
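[Editor's aside] For reference, the first emission below, DEFINE_NVOL_BIT_OPS(Errors), expands to:

static inline int NVolErrors(ntfs_volume *vol)
{
	return test_bit(NV_Errors, &(vol)->flags);
}
static inline void NVolSetErrors(ntfs_volume *vol)
{
	set_bit(NV_Errors, &(vol)->flags);
}
static inline void NVolClearErrors(ntfs_volume *vol)
{
	clear_bit(NV_Errors, &(vol)->flags);
}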
*/ -DEFINE_NVOL_BIT_OPS(Errors) -DEFINE_NVOL_BIT_OPS(ShowSystemFiles) -DEFINE_NVOL_BIT_OPS(CaseSensitive) -DEFINE_NVOL_BIT_OPS(LogFileEmpty) -DEFINE_NVOL_BIT_OPS(QuotaOutOfDate) -DEFINE_NVOL_BIT_OPS(UsnJrnlStamped) -DEFINE_NVOL_BIT_OPS(SparseEnabled) - -#endif /* _LINUX_NTFS_VOLUME_H */ diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c index 63f70259edc0..7aadf5010999 100644 --- a/fs/ntfs3/attrib.c +++ b/fs/ntfs3/attrib.c @@ -886,7 +886,7 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn, struct runs_tree *run = &ni->file.run; struct ntfs_sb_info *sbi; u8 cluster_bits; - struct ATTRIB *attr = NULL, *attr_b; + struct ATTRIB *attr, *attr_b; struct ATTR_LIST_ENTRY *le, *le_b; struct mft_inode *mi, *mi_b; CLST hint, svcn, to_alloc, evcn1, next_svcn, asize, end, vcn0, alen; @@ -904,12 +904,8 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn, *len = 0; up_read(&ni->file.run_lock); - if (*len) { - if (*lcn != SPARSE_LCN || !new) - return 0; /* Fast normal way without allocation. */ - else if (clen > *len) - clen = *len; - } + if (*len && (*lcn != SPARSE_LCN || !new)) + return 0; /* Fast normal way without allocation. */ /* No cluster in cache or we need to allocate cluster in hole. */ sbi = ni->mi.sbi; @@ -918,6 +914,17 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn, ni_lock(ni); down_write(&ni->file.run_lock); + /* Repeat the code above (under write lock). */ + if (!run_lookup_entry(run, vcn, lcn, len, NULL)) + *len = 0; + + if (*len) { + if (*lcn != SPARSE_LCN || !new) + goto out; /* normal way without allocation. */ + if (clen > *len) + clen = *len; + } + le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); if (!attr_b) { @@ -1736,8 +1743,10 @@ repack: le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); - if (!attr_b) - return -ENOENT; + if (!attr_b) { + err = -ENOENT; + goto out; + } attr = attr_b; le = le_b; @@ -1818,13 +1827,15 @@ ins_ext: ok: run_truncate_around(run, vcn); out: - if (new_valid > data_size) - new_valid = data_size; + if (attr_b) { + if (new_valid > data_size) + new_valid = data_size; - valid_size = le64_to_cpu(attr_b->nres.valid_size); - if (new_valid != valid_size) { - attr_b->nres.valid_size = cpu_to_le64(valid_size); - mi_b->dirty = true; + valid_size = le64_to_cpu(attr_b->nres.valid_size); + if (new_valid != valid_size) { + attr_b->nres.valid_size = cpu_to_le64(valid_size); + mi_b->dirty = true; + } } return err; @@ -2073,7 +2084,7 @@ next_attr: /* Update inode size. 
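[Editor's aside] The attr_data_get_block() change above is a lock-upgrade pattern: the cheap lookup runs under the read lock, and because that lock is dropped before the write lock is taken, the lookup must be repeated before allocating anything. A stripped-down sketch of the shape, names as in the hunk and error handling omitted:

	down_read(&ni->file.run_lock);
	if (!run_lookup_entry(run, vcn, lcn, len, NULL))
		*len = 0;
	up_read(&ni->file.run_lock);

	if (*len && (*lcn != SPARSE_LCN || !new))
		return 0;		/* fast path, nothing to allocate */

	ni_lock(ni);
	down_write(&ni->file.run_lock);
	/* Repeat under the write lock: another task may have raced in. */
	if (!run_lookup_entry(run, vcn, lcn, len, NULL))
		*len = 0;
	if (*len && (*lcn != SPARSE_LCN || !new))
		goto out;		/* it did; the mapping exists now */
	/* ... allocate clusters while still holding the write lock ... */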
*/ ni->i_valid = valid_size; - ni->vfs_inode.i_size = data_size; + i_size_write(&ni->vfs_inode, data_size); inode_set_bytes(&ni->vfs_inode, total_size); ni->ni_flags |= NI_FLAG_UPDATE_PARENT; mark_inode_dirty(&ni->vfs_inode); @@ -2488,7 +2499,7 @@ int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes) mi_b->dirty = true; done: - ni->vfs_inode.i_size += bytes; + i_size_write(&ni->vfs_inode, ni->vfs_inode.i_size + bytes); ni->ni_flags |= NI_FLAG_UPDATE_PARENT; mark_inode_dirty(&ni->vfs_inode); diff --git a/fs/ntfs3/attrlist.c b/fs/ntfs3/attrlist.c index 7c01735d1219..9f4bd8d26090 100644 --- a/fs/ntfs3/attrlist.c +++ b/fs/ntfs3/attrlist.c @@ -29,7 +29,7 @@ static inline bool al_is_valid_le(const struct ntfs_inode *ni, void al_destroy(struct ntfs_inode *ni) { run_close(&ni->attr_list.run); - kfree(ni->attr_list.le); + kvfree(ni->attr_list.le); ni->attr_list.le = NULL; ni->attr_list.size = 0; ni->attr_list.dirty = false; @@ -127,12 +127,13 @@ struct ATTR_LIST_ENTRY *al_enumerate(struct ntfs_inode *ni, { size_t off; u16 sz; + const unsigned le_min_size = le_size(0); if (!le) { le = ni->attr_list.le; } else { sz = le16_to_cpu(le->size); - if (sz < sizeof(struct ATTR_LIST_ENTRY)) { + if (sz < le_min_size) { /* Impossible 'cause we should not return such le. */ return NULL; } @@ -141,7 +142,7 @@ struct ATTR_LIST_ENTRY *al_enumerate(struct ntfs_inode *ni, /* Check boundary. */ off = PtrOffset(ni->attr_list.le, le); - if (off + sizeof(struct ATTR_LIST_ENTRY) > ni->attr_list.size) { + if (off + le_min_size > ni->attr_list.size) { /* The regular end of list. */ return NULL; } @@ -149,8 +150,7 @@ struct ATTR_LIST_ENTRY *al_enumerate(struct ntfs_inode *ni, sz = le16_to_cpu(le->size); /* Check le for errors. */ - if (sz < sizeof(struct ATTR_LIST_ENTRY) || - off + sz > ni->attr_list.size || + if (sz < le_min_size || off + sz > ni->attr_list.size || sz < le->name_off + le->name_len * sizeof(short)) { return NULL; } @@ -318,7 +318,7 @@ int al_add_le(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, memcpy(ptr, al->le, off); memcpy(Add2Ptr(ptr, off + sz), le, old_size - off); le = Add2Ptr(ptr, off); - kfree(al->le); + kvfree(al->le); al->le = ptr; } else { memmove(Add2Ptr(le, sz), le, old_size - off); diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c index 63f14a0232f6..845f9b22deef 100644 --- a/fs/ntfs3/bitmap.c +++ b/fs/ntfs3/bitmap.c @@ -124,7 +124,7 @@ void wnd_close(struct wnd_bitmap *wnd) { struct rb_node *node, *next; - kfree(wnd->free_bits); + kvfree(wnd->free_bits); wnd->free_bits = NULL; run_close(&wnd->run); @@ -1360,7 +1360,7 @@ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits) memcpy(new_free, wnd->free_bits, wnd->nwnd * sizeof(short)); memset(new_free + wnd->nwnd, 0, (new_wnd - wnd->nwnd) * sizeof(short)); - kfree(wnd->free_bits); + kvfree(wnd->free_bits); wnd->free_bits = new_free; } diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c index ec0566b322d5..5cf3d9decf64 100644 --- a/fs/ntfs3/dir.c +++ b/fs/ntfs3/dir.c @@ -309,11 +309,31 @@ static inline int ntfs_filldir(struct ntfs_sb_info *sbi, struct ntfs_inode *ni, return 0; } - /* NTFS: symlinks are "dir + reparse" or "file + reparse" */ - if (fname->dup.fa & FILE_ATTRIBUTE_REPARSE_POINT) - dt_type = DT_LNK; - else - dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? DT_DIR : DT_REG; + /* + * NTFS: symlinks are "dir + reparse" or "file + reparse" + * Unfortunately reparse attribute is used for many purposes (several dozens). + * It is not possible here to know is this name symlink or not. 
+ * To get exactly the type of name we should to open inode (read mft). + * getattr for opened file (fstat) correctly returns symlink. + */ + dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? DT_DIR : DT_REG; + + /* + * It is not reliable to detect the type of name using duplicated information + * stored in parent directory. + * The only correct way to get the type of name - read MFT record and find ATTR_STD. + * The code below is not good idea. + * It does additional locks/reads just to get the type of name. + * Should we use additional mount option to enable branch below? + */ + if ((fname->dup.fa & FILE_ATTRIBUTE_REPARSE_POINT) && + ino != ni->mi.rno) { + struct inode *inode = ntfs_iget5(sbi->sb, &e->ref, NULL); + if (!IS_ERR_OR_NULL(inode)) { + dt_type = fs_umode_to_dtype(inode->i_mode); + iput(inode); + } + } return !dir_emit(ctx, (s8 *)name, name_len, ino, dt_type); } @@ -495,11 +515,9 @@ static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs, struct INDEX_HDR *hdr; const struct ATTR_FILE_NAME *fname; u32 e_size, off, end; - u64 vbo = 0; size_t drs = 0, fles = 0, bit = 0; - loff_t i_size = ni->vfs_inode.i_size; struct indx_node *node = NULL; - u8 index_bits = ni->dir.index_bits; + size_t max_indx = i_size_read(&ni->vfs_inode) >> ni->dir.index_bits; if (is_empty) *is_empty = true; @@ -518,8 +536,10 @@ static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs, e = Add2Ptr(hdr, off); e_size = le16_to_cpu(e->size); if (e_size < sizeof(struct NTFS_DE) || - off + e_size > end) + off + e_size > end) { + /* Looks like corruption. */ break; + } if (de_is_last(e)) break; @@ -543,7 +563,7 @@ static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs, fles += 1; } - if (vbo >= i_size) + if (bit >= max_indx) goto out; err = indx_used_bit(&ni->dir, ni, &bit); @@ -553,8 +573,7 @@ static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs, if (bit == MINUS_ONE_T) goto out; - vbo = (u64)bit << index_bits; - if (vbo >= i_size) + if (bit >= max_indx) goto out; err = indx_read(&ni->dir, ni, bit << ni->dir.idx2vbn_bits, @@ -564,7 +583,6 @@ static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs, hdr = &node->index->ihdr; bit += 1; - vbo = (u64)bit << ni->dir.idx2vbn_bits; } out: @@ -593,5 +611,9 @@ const struct file_operations ntfs_dir_operations = { .iterate_shared = ntfs_readdir, .fsync = generic_file_fsync, .open = ntfs_file_open, + .unlocked_ioctl = ntfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ntfs_compat_ioctl, +#endif }; // clang-format on diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index a5a30a24ce5d..5418662c80d8 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -48,7 +48,7 @@ static int ntfs_ioctl_fitrim(struct ntfs_sb_info *sbi, unsigned long arg) return 0; } -static long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg) +long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg) { struct inode *inode = file_inode(filp); struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info; @@ -61,7 +61,7 @@ static long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg) } #ifdef CONFIG_COMPAT -static long ntfs_compat_ioctl(struct file *filp, u32 cmd, unsigned long arg) +long ntfs_compat_ioctl(struct file *filp, u32 cmd, unsigned long arg) { return ntfs_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); @@ -188,6 +188,7 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) u32 bh_next, bh_off, to; sector_t iblock; struct folio *folio; + bool dirty = false; for (; idx < idx_end; 
idx += 1, from = 0) { page_off = (loff_t)idx << PAGE_SHIFT; @@ -223,29 +224,27 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) /* Ok, it's mapped. Make sure it's up-to-date. */ if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); - - if (!buffer_uptodate(bh)) { - err = bh_read(bh, 0); - if (err < 0) { - folio_unlock(folio); - folio_put(folio); - goto out; - } + else if (bh_read(bh, 0) < 0) { + err = -EIO; + folio_unlock(folio); + folio_put(folio); + goto out; } mark_buffer_dirty(bh); - } while (bh_off = bh_next, iblock += 1, head != (bh = bh->b_this_page)); folio_zero_segment(folio, from, to); + dirty = true; folio_unlock(folio); folio_put(folio); cond_resched(); } out: - mark_inode_dirty(inode); + if (dirty) + mark_inode_dirty(inode); return err; } @@ -261,6 +260,9 @@ static int ntfs_file_mmap(struct file *file, struct vm_area_struct *vma) bool rw = vma->vm_flags & VM_WRITE; int err; + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + if (is_encrypted(ni)) { ntfs_inode_warn(inode, "mmap encrypted not supported"); return -EOPNOTSUPP; @@ -499,10 +501,14 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) ni_lock(ni); err = attr_punch_hole(ni, vbo, len, &frame_size); ni_unlock(ni); + if (!err) + goto ok; + if (err != E_NTFS_NOTALIGNED) goto out; /* Process not aligned punch. */ + err = 0; mask = frame_size - 1; vbo_a = (vbo + mask) & ~mask; end_a = end & ~mask; @@ -525,6 +531,8 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) ni_lock(ni); err = attr_punch_hole(ni, vbo_a, end_a - vbo_a, NULL); ni_unlock(ni); + if (err) + goto out; } } else if (mode & FALLOC_FL_COLLAPSE_RANGE) { /* @@ -564,6 +572,8 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) ni_lock(ni); err = attr_insert_range(ni, vbo, len); ni_unlock(ni); + if (err) + goto out; } else { /* Check new size. 
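[Editor's aside] The punch-hole fallback above leans on power-of-two mask arithmetic: with a 4096-byte frame, mask is 0xfff, so rounding the start up and the end down isolates the fully covered frames, and the misaligned head and tail are zeroed rather than punched. Worked through for vbo = 5000, end = 20000:

	mask  = frame_size - 1;		/* 0xfff			  */
	vbo_a = (vbo + mask) & ~mask;	/* 5000  -> 8192  (round start up) */
	end_a = end & ~mask;		/* 20000 -> 16384 (round end down) */
	/* attr_punch_hole() then covers [8192, 16384) only. */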
*/ u8 cluster_bits = sbi->cluster_bits; @@ -633,11 +643,18 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) &ni->file.run, i_size, &ni->i_valid, true, NULL); ni_unlock(ni); + if (err) + goto out; } else if (new_size > i_size) { - inode->i_size = new_size; + i_size_write(inode, new_size); } } +ok: + err = file_modified(file); + if (err) + goto out; + out: if (map_locked) filemap_invalidate_unlock(mapping); @@ -663,6 +680,9 @@ int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode = inode->i_mode; int err; + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + err = setattr_prepare(idmap, dentry, attr); if (err) goto out; @@ -676,7 +696,7 @@ int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry, goto out; } inode_dio_wait(inode); - oldsize = inode->i_size; + oldsize = i_size_read(inode); newsize = attr->ia_size; if (newsize <= oldsize) @@ -688,7 +708,7 @@ int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry, goto out; ni->ni_flags |= NI_FLAG_UPDATE_PARENT; - inode->i_size = newsize; + i_size_write(inode, newsize); } setattr_copy(idmap, inode, attr); @@ -718,6 +738,9 @@ static ssize_t ntfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) struct inode *inode = file->f_mapping->host; struct ntfs_inode *ni = ntfs_i(inode); + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + if (is_encrypted(ni)) { ntfs_inode_warn(inode, "encrypted i/o not supported"); return -EOPNOTSUPP; @@ -752,6 +775,9 @@ static ssize_t ntfs_file_splice_read(struct file *in, loff_t *ppos, struct inode *inode = in->f_mapping->host; struct ntfs_inode *ni = ntfs_i(inode); + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + if (is_encrypted(ni)) { ntfs_inode_warn(inode, "encrypted i/o not supported"); return -EOPNOTSUPP; @@ -821,7 +847,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) size_t count = iov_iter_count(from); loff_t pos = iocb->ki_pos; struct inode *inode = file_inode(file); - loff_t i_size = inode->i_size; + loff_t i_size = i_size_read(inode); struct address_space *mapping = inode->i_mapping; struct ntfs_inode *ni = ntfs_i(inode); u64 valid = ni->i_valid; @@ -1028,6 +1054,8 @@ out: iocb->ki_pos += written; if (iocb->ki_pos > ni->i_valid) ni->i_valid = iocb->ki_pos; + if (iocb->ki_pos > i_size) + i_size_write(inode, iocb->ki_pos); return written; } @@ -1041,8 +1069,12 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; ssize_t ret; + int err; struct ntfs_inode *ni = ntfs_i(inode); + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + if (is_encrypted(ni)) { ntfs_inode_warn(inode, "encrypted i/o not supported"); return -EOPNOTSUPP; @@ -1068,6 +1100,12 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret <= 0) goto out; + err = file_modified(iocb->ki_filp); + if (err) { + ret = err; + goto out; + } + if (WARN_ON(ni->ni_flags & NI_FLAG_COMPRESSED_MASK)) { /* Should never be here, see ntfs_file_open(). 
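[Editor's aside] file_modified() is the VFS helper that strips setuid/setgid bits and updates the file times once a write-like operation is committed to proceed; the hunks above add it to both the fallocate and the write_iter paths. Its typical placement, sketched:

	ret = generic_write_checks(iocb, from);
	if (ret <= 0)
		goto out;

	err = file_modified(iocb->ki_filp);	/* kill suid/sgid, bump times */
	if (err) {
		ret = err;
		goto out;
	}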
*/ ret = -EOPNOTSUPP; @@ -1097,6 +1135,9 @@ int ntfs_file_open(struct inode *inode, struct file *file) { struct ntfs_inode *ni = ntfs_i(inode); + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + if (unlikely((is_compressed(ni) || is_encrypted(ni)) && (file->f_flags & O_DIRECT))) { return -EOPNOTSUPP; @@ -1138,7 +1179,8 @@ static int ntfs_file_release(struct inode *inode, struct file *file) down_write(&ni->file.run_lock); err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run, - inode->i_size, &ni->i_valid, false, NULL); + i_size_read(inode), &ni->i_valid, false, + NULL); up_write(&ni->file.run_lock); ni_unlock(ni); diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index 3df2d9e34b91..7f27382e0ce2 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -778,7 +778,7 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni) run_deallocate(sbi, &ni->attr_list.run, true); run_close(&ni->attr_list.run); ni->attr_list.size = 0; - kfree(ni->attr_list.le); + kvfree(ni->attr_list.le); ni->attr_list.le = NULL; ni->attr_list.dirty = false; @@ -927,7 +927,7 @@ int ni_create_attr_list(struct ntfs_inode *ni) return 0; out: - kfree(ni->attr_list.le); + kvfree(ni->attr_list.le); ni->attr_list.le = NULL; ni->attr_list.size = 0; return err; @@ -2099,7 +2099,7 @@ int ni_readpage_cmpr(struct ntfs_inode *ni, struct page *page) gfp_t gfp_mask; struct page *pg; - if (vbo >= ni->vfs_inode.i_size) { + if (vbo >= i_size_read(&ni->vfs_inode)) { SetPageUptodate(page); err = 0; goto out; @@ -2173,7 +2173,7 @@ int ni_decompress_file(struct ntfs_inode *ni) { struct ntfs_sb_info *sbi = ni->mi.sbi; struct inode *inode = &ni->vfs_inode; - loff_t i_size = inode->i_size; + loff_t i_size = i_size_read(inode); struct address_space *mapping = inode->i_mapping; gfp_t gfp_mask = mapping_gfp_mask(mapping); struct page **pages = NULL; @@ -2508,6 +2508,7 @@ int ni_read_frame(struct ntfs_inode *ni, u64 frame_vbo, struct page **pages, err = -EOPNOTSUPP; goto out1; #else + loff_t i_size = i_size_read(&ni->vfs_inode); u32 frame_bits = ni_ext_compress_bits(ni); u64 frame64 = frame_vbo >> frame_bits; u64 frames, vbo_data; @@ -2548,7 +2549,7 @@ int ni_read_frame(struct ntfs_inode *ni, u64 frame_vbo, struct page **pages, } } - frames = (ni->vfs_inode.i_size - 1) >> frame_bits; + frames = (i_size - 1) >> frame_bits; err = attr_wof_frame_info(ni, attr, run, frame64, frames, frame_bits, &ondisk_size, &vbo_data); @@ -2556,8 +2557,7 @@ int ni_read_frame(struct ntfs_inode *ni, u64 frame_vbo, struct page **pages, goto out2; if (frame64 == frames) { - unc_size = 1 + ((ni->vfs_inode.i_size - 1) & - (frame_size - 1)); + unc_size = 1 + ((i_size - 1) & (frame_size - 1)); ondisk_size = attr_size(attr) - vbo_data; } else { unc_size = frame_size; @@ -3259,6 +3259,9 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint) if (is_bad_inode(inode) || sb_rdonly(sb)) return 0; + if (unlikely(ntfs3_forced_shutdown(sb))) + return -EIO; + if (!ni_trylock(ni)) { /* 'ni' is under modification, skip for now. 
*/ mark_inode_dirty_sync(inode); @@ -3288,7 +3291,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint) modified = true; } - ts = inode_get_mtime(inode); + ts = inode_get_ctime(inode); dup.c_time = kernel2nt(&ts); if (std->c_time != dup.c_time) { std->c_time = dup.c_time; diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c index 98ccb6650858..855519713bf7 100644 --- a/fs/ntfs3/fslog.c +++ b/fs/ntfs3/fslog.c @@ -465,7 +465,7 @@ static inline bool is_rst_area_valid(const struct RESTART_HDR *rhdr) { const struct RESTART_AREA *ra; u16 cl, fl, ul; - u32 off, l_size, file_dat_bits, file_size_round; + u32 off, l_size, seq_bits; u16 ro = le16_to_cpu(rhdr->ra_off); u32 sys_page = le32_to_cpu(rhdr->sys_page_size); @@ -511,13 +511,15 @@ static inline bool is_rst_area_valid(const struct RESTART_HDR *rhdr) /* Make sure the sequence number bits match the log file size. */ l_size = le64_to_cpu(ra->l_size); - file_dat_bits = sizeof(u64) * 8 - le32_to_cpu(ra->seq_num_bits); - file_size_round = 1u << (file_dat_bits + 3); - if (file_size_round != l_size && - (file_size_round < l_size || (file_size_round / 2) > l_size)) { - return false; + seq_bits = sizeof(u64) * 8 + 3; + while (l_size) { + l_size >>= 1; + seq_bits -= 1; } + if (seq_bits != ra->seq_num_bits) + return false; + /* The log page data offset and record header length must be quad-aligned. */ if (!IS_ALIGNED(le16_to_cpu(ra->data_off), 8) || !IS_ALIGNED(le16_to_cpu(ra->rec_hdr_len), 8)) @@ -974,6 +976,16 @@ skip_looking: return e; } +struct restart_info { + u64 last_lsn; + struct RESTART_HDR *r_page; + u32 vbo; + bool chkdsk_was_run; + bool valid_page; + bool initialized; + bool restart; +}; + #define RESTART_SINGLE_PAGE_IO cpu_to_le16(0x0001) #define NTFSLOG_WRAPPED 0x00000001 @@ -987,6 +999,7 @@ struct ntfs_log { struct ntfs_inode *ni; u32 l_size; + u32 orig_file_size; u32 sys_page_size; u32 sys_page_mask; u32 page_size; @@ -1040,6 +1053,8 @@ struct ntfs_log { struct CLIENT_ID client_id; u32 client_undo_commit; + + struct restart_info rst_info, rst_info2; }; static inline u32 lsn_to_vbo(struct ntfs_log *log, const u64 lsn) @@ -1105,16 +1120,6 @@ static inline bool verify_client_lsn(struct ntfs_log *log, lsn <= le64_to_cpu(log->ra->current_lsn) && lsn; } -struct restart_info { - u64 last_lsn; - struct RESTART_HDR *r_page; - u32 vbo; - bool chkdsk_was_run; - bool valid_page; - bool initialized; - bool restart; -}; - static int read_log_page(struct ntfs_log *log, u32 vbo, struct RECORD_PAGE_HDR **buffer, bool *usa_error) { @@ -1176,7 +1181,7 @@ out: * restart page header. It will stop the first time we find a * valid page header. */ -static int log_read_rst(struct ntfs_log *log, u32 l_size, bool first, +static int log_read_rst(struct ntfs_log *log, bool first, struct restart_info *info) { u32 skip, vbo; @@ -1192,7 +1197,7 @@ static int log_read_rst(struct ntfs_log *log, u32 l_size, bool first, } /* Loop continuously until we succeed. */ - for (; vbo < l_size; vbo = 2 * vbo + skip, skip = 0) { + for (; vbo < log->l_size; vbo = 2 * vbo + skip, skip = 0) { bool usa_error; bool brst, bchk; struct RESTART_AREA *ra; @@ -1285,22 +1290,17 @@ check_result: /* * Ilog_init_pg_hdr - Init @log from restart page header. 
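[Editorial sketch] The rewritten size check in is_rst_area_valid() derives the expected sequence-number width directly from the log size, rather than reconstructing a rounded file size from ra->seq_num_bits. A standalone trace of the loop (seq_bits_for_size is a hypothetical helper mirroring the new code):

static u32 seq_bits_for_size(u64 l_size)
{
        u32 seq_bits = sizeof(u64) * 8 + 3;     /* 67 */

        /* Subtract one per significant bit: for a 4 MiB log
         * (l_size = 1 << 22) the loop runs 23 times, leaving
         * 67 - 23 = 44, which must match ra->seq_num_bits. */
        while (l_size) {
                l_size >>= 1;
                seq_bits -= 1;
        }
        return seq_bits;
}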
*/ -static void log_init_pg_hdr(struct ntfs_log *log, u32 sys_page_size, - u32 page_size, u16 major_ver, u16 minor_ver) +static void log_init_pg_hdr(struct ntfs_log *log, u16 major_ver, u16 minor_ver) { - log->sys_page_size = sys_page_size; - log->sys_page_mask = sys_page_size - 1; - log->page_size = page_size; - log->page_mask = page_size - 1; - log->page_bits = blksize_bits(page_size); + log->sys_page_size = log->page_size; + log->sys_page_mask = log->page_mask; log->clst_per_page = log->page_size >> log->ni->mi.sbi->cluster_bits; if (!log->clst_per_page) log->clst_per_page = 1; - log->first_page = major_ver >= 2 ? - 0x22 * page_size : - ((sys_page_size << 1) + (page_size << 1)); + log->first_page = major_ver >= 2 ? 0x22 * log->page_size : + 4 * log->page_size; log->major_ver = major_ver; log->minor_ver = minor_ver; } @@ -1308,12 +1308,11 @@ static void log_init_pg_hdr(struct ntfs_log *log, u32 sys_page_size, /* * log_create - Init @log in cases when we don't have a restart area to use. */ -static void log_create(struct ntfs_log *log, u32 l_size, const u64 last_lsn, +static void log_create(struct ntfs_log *log, const u64 last_lsn, u32 open_log_count, bool wrapped, bool use_multi_page) { - log->l_size = l_size; /* All file offsets must be quadword aligned. */ - log->file_data_bits = blksize_bits(l_size) - 3; + log->file_data_bits = blksize_bits(log->l_size) - 3; log->seq_num_mask = (8 << log->file_data_bits) - 1; log->seq_num_bits = sizeof(u64) * 8 - log->file_data_bits; log->seq_num = (last_lsn >> log->file_data_bits) + 2; @@ -3720,10 +3719,8 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) struct ntfs_sb_info *sbi = ni->mi.sbi; struct ntfs_log *log; - struct restart_info rst_info, rst_info2; - u64 rec_lsn, ra_lsn, checkpt_lsn = 0, rlsn = 0; + u64 rec_lsn, checkpt_lsn = 0, rlsn = 0; struct ATTR_NAME_ENTRY *attr_names = NULL; - struct ATTR_NAME_ENTRY *ane; struct RESTART_TABLE *dptbl = NULL; struct RESTART_TABLE *trtbl = NULL; const struct RESTART_TABLE *rt; @@ -3741,9 +3738,7 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) struct TRANSACTION_ENTRY *tr; struct DIR_PAGE_ENTRY *dp; u32 i, bytes_per_attr_entry; - u32 l_size = ni->vfs_inode.i_size; - u32 orig_file_size = l_size; - u32 page_size, vbo, tail, off, dlen; + u32 vbo, tail, off, dlen; u32 saved_len, rec_len, transact_id; bool use_second_page; struct RESTART_AREA *ra2, *ra = NULL; @@ -3758,52 +3753,50 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) u16 t16; u32 t32; - /* Get the size of page. NOTE: To replay we can use default page. */ -#if PAGE_SIZE >= DefaultLogPageSize && PAGE_SIZE <= DefaultLogPageSize * 2 - page_size = norm_file_page(PAGE_SIZE, &l_size, true); -#else - page_size = norm_file_page(PAGE_SIZE, &l_size, false); -#endif - if (!page_size) - return -EINVAL; - log = kzalloc(sizeof(struct ntfs_log), GFP_NOFS); if (!log) return -ENOMEM; log->ni = ni; - log->l_size = l_size; - log->one_page_buf = kmalloc(page_size, GFP_NOFS); + log->l_size = log->orig_file_size = ni->vfs_inode.i_size; + /* Get the size of page. NOTE: To replay we can use default page. 
*/ +#if PAGE_SIZE >= DefaultLogPageSize && PAGE_SIZE <= DefaultLogPageSize * 2 + log->page_size = norm_file_page(PAGE_SIZE, &log->l_size, true); +#else + log->page_size = norm_file_page(PAGE_SIZE, &log->l_size, false); +#endif + if (!log->page_size) { + err = -EINVAL; + goto out; + } + + log->one_page_buf = kmalloc(log->page_size, GFP_NOFS); if (!log->one_page_buf) { err = -ENOMEM; goto out; } - log->page_size = page_size; - log->page_mask = page_size - 1; - log->page_bits = blksize_bits(page_size); + log->page_mask = log->page_size - 1; + log->page_bits = blksize_bits(log->page_size); /* Look for a restart area on the disk. */ - memset(&rst_info, 0, sizeof(struct restart_info)); - err = log_read_rst(log, l_size, true, &rst_info); + err = log_read_rst(log, true, &log->rst_info); if (err) goto out; /* remember 'initialized' */ - *initialized = rst_info.initialized; + *initialized = log->rst_info.initialized; - if (!rst_info.restart) { - if (rst_info.initialized) { + if (!log->rst_info.restart) { + if (log->rst_info.initialized) { /* No restart area but the file is not initialized. */ err = -EINVAL; goto out; } - log_init_pg_hdr(log, page_size, page_size, 1, 1); - log_create(log, l_size, 0, get_random_u32(), false, false); - - log->ra = ra; + log_init_pg_hdr(log, 1, 1); + log_create(log, 0, get_random_u32(), false, false); ra = log_create_ra(log); if (!ra) { @@ -3820,25 +3813,26 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) * If the restart offset above wasn't zero then we won't * look for a second restart. */ - if (rst_info.vbo) + if (log->rst_info.vbo) goto check_restart_area; - memset(&rst_info2, 0, sizeof(struct restart_info)); - err = log_read_rst(log, l_size, false, &rst_info2); + err = log_read_rst(log, false, &log->rst_info2); if (err) goto out; /* Determine which restart area to use. */ - if (!rst_info2.restart || rst_info2.last_lsn <= rst_info.last_lsn) + if (!log->rst_info2.restart || + log->rst_info2.last_lsn <= log->rst_info.last_lsn) goto use_first_page; use_second_page = true; - if (rst_info.chkdsk_was_run && page_size != rst_info.vbo) { + if (log->rst_info.chkdsk_was_run && + log->page_size != log->rst_info.vbo) { struct RECORD_PAGE_HDR *sp = NULL; bool usa_error; - if (!read_log_page(log, page_size, &sp, &usa_error) && + if (!read_log_page(log, log->page_size, &sp, &usa_error) && sp->rhdr.sign == NTFS_CHKD_SIGNATURE) { use_second_page = false; } @@ -3846,52 +3840,43 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) } if (use_second_page) { - kfree(rst_info.r_page); - memcpy(&rst_info, &rst_info2, sizeof(struct restart_info)); - rst_info2.r_page = NULL; + kfree(log->rst_info.r_page); + memcpy(&log->rst_info, &log->rst_info2, + sizeof(struct restart_info)); + log->rst_info2.r_page = NULL; } use_first_page: - kfree(rst_info2.r_page); + kfree(log->rst_info2.r_page); check_restart_area: /* * If the restart area is at offset 0, we want * to write the second restart area first. */ - log->init_ra = !!rst_info.vbo; + log->init_ra = !!log->rst_info.vbo; /* If we have a valid page then grab a pointer to the restart area. */ - ra2 = rst_info.valid_page ? - Add2Ptr(rst_info.r_page, - le16_to_cpu(rst_info.r_page->ra_off)) : + ra2 = log->rst_info.valid_page ? 
+ Add2Ptr(log->rst_info.r_page, + le16_to_cpu(log->rst_info.r_page->ra_off)) : NULL; - if (rst_info.chkdsk_was_run || + if (log->rst_info.chkdsk_was_run || (ra2 && ra2->client_idx[1] == LFS_NO_CLIENT_LE)) { bool wrapped = false; bool use_multi_page = false; u32 open_log_count; /* Do some checks based on whether we have a valid log page. */ - if (!rst_info.valid_page) { - open_log_count = get_random_u32(); - goto init_log_instance; - } - open_log_count = le32_to_cpu(ra2->open_log_count); - - /* - * If the restart page size isn't changing then we want to - * check how much work we need to do. - */ - if (page_size != le32_to_cpu(rst_info.r_page->sys_page_size)) - goto init_log_instance; + open_log_count = log->rst_info.valid_page ? + le32_to_cpu(ra2->open_log_count) : + get_random_u32(); -init_log_instance: - log_init_pg_hdr(log, page_size, page_size, 1, 1); + log_init_pg_hdr(log, 1, 1); - log_create(log, l_size, rst_info.last_lsn, open_log_count, - wrapped, use_multi_page); + log_create(log, log->rst_info.last_lsn, open_log_count, wrapped, + use_multi_page); ra = log_create_ra(log); if (!ra) { @@ -3916,28 +3901,27 @@ init_log_instance: * use the log file. We must use the system page size instead of the * default size if there is not a clean shutdown. */ - t32 = le32_to_cpu(rst_info.r_page->sys_page_size); - if (page_size != t32) { - l_size = orig_file_size; - page_size = - norm_file_page(t32, &l_size, t32 == DefaultLogPageSize); + t32 = le32_to_cpu(log->rst_info.r_page->sys_page_size); + if (log->page_size != t32) { + log->l_size = log->orig_file_size; + log->page_size = norm_file_page(t32, &log->l_size, + t32 == DefaultLogPageSize); } - if (page_size != t32 || - page_size != le32_to_cpu(rst_info.r_page->page_size)) { + if (log->page_size != t32 || + log->page_size != le32_to_cpu(log->rst_info.r_page->page_size)) { err = -EINVAL; goto out; } /* If the file size has shrunk then we won't mount it. */ - if (l_size < le64_to_cpu(ra2->l_size)) { + if (log->l_size < le64_to_cpu(ra2->l_size)) { err = -EINVAL; goto out; } - log_init_pg_hdr(log, page_size, page_size, - le16_to_cpu(rst_info.r_page->major_ver), - le16_to_cpu(rst_info.r_page->minor_ver)); + log_init_pg_hdr(log, le16_to_cpu(log->rst_info.r_page->major_ver), + le16_to_cpu(log->rst_info.r_page->minor_ver)); log->l_size = le64_to_cpu(ra2->l_size); log->seq_num_bits = le32_to_cpu(ra2->seq_num_bits); @@ -3945,7 +3929,7 @@ init_log_instance: log->seq_num_mask = (8 << log->file_data_bits) - 1; log->last_lsn = le64_to_cpu(ra2->current_lsn); log->seq_num = log->last_lsn >> log->file_data_bits; - log->ra_off = le16_to_cpu(rst_info.r_page->ra_off); + log->ra_off = le16_to_cpu(log->rst_info.r_page->ra_off); log->restart_size = log->sys_page_size - log->ra_off; log->record_header_len = le16_to_cpu(ra2->rec_hdr_len); log->ra_size = le16_to_cpu(ra2->ra_len); @@ -4045,7 +4029,7 @@ find_oldest: log->current_avail = current_log_avail(log); /* Remember which restart area to write first. */ - log->init_ra = rst_info.vbo; + log->init_ra = log->rst_info.vbo; process_log: /* 1.0, 1.1, 2.0 log->major_ver/minor_ver - short values. */ @@ -4105,7 +4089,7 @@ process_log: log->client_id.seq_num = cr->seq_num; log->client_id.client_idx = client; - err = read_rst_area(log, &rst, &ra_lsn); + err = read_rst_area(log, &rst, &checkpt_lsn); if (err) goto out; @@ -4114,9 +4098,8 @@ process_log: bytes_per_attr_entry = !rst->major_ver ? 
0x2C : 0x28; - checkpt_lsn = le64_to_cpu(rst->check_point_start); - if (!checkpt_lsn) - checkpt_lsn = ra_lsn; + if (rst->check_point_start) + checkpt_lsn = le64_to_cpu(rst->check_point_start); /* Allocate and Read the Transaction Table. */ if (!rst->transact_table_len) @@ -4330,23 +4313,20 @@ check_attr_table: lcb = NULL; check_attribute_names2: - if (!rst->attr_names_len) - goto trace_attribute_table; - - ane = attr_names; - if (!oatbl) - goto trace_attribute_table; - while (ane->off) { - /* TODO: Clear table on exit! */ - oe = Add2Ptr(oatbl, le16_to_cpu(ane->off)); - t16 = le16_to_cpu(ane->name_bytes); - oe->name_len = t16 / sizeof(short); - oe->ptr = ane->name; - oe->is_attr_name = 2; - ane = Add2Ptr(ane, sizeof(struct ATTR_NAME_ENTRY) + t16); - } - -trace_attribute_table: + if (rst->attr_names_len && oatbl) { + struct ATTR_NAME_ENTRY *ane = attr_names; + while (ane->off) { + /* TODO: Clear table on exit! */ + oe = Add2Ptr(oatbl, le16_to_cpu(ane->off)); + t16 = le16_to_cpu(ane->name_bytes); + oe->name_len = t16 / sizeof(short); + oe->ptr = ane->name; + oe->is_attr_name = 2; + ane = Add2Ptr(ane, + sizeof(struct ATTR_NAME_ENTRY) + t16); + } + } + /* * If the checkpt_lsn is zero, then this is a freshly * formatted disk and we have no work to do. @@ -5189,7 +5169,7 @@ out: kfree(oatbl); kfree(dptbl); kfree(attr_names); - kfree(rst_info.r_page); + kfree(log->rst_info.r_page); kfree(ra); kfree(log->one_page_buf); diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c index fbfe21dbb425..ae2ef5c11868 100644 --- a/fs/ntfs3/fsntfs.c +++ b/fs/ntfs3/fsntfs.c @@ -853,7 +853,8 @@ void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait) /* * sb can be NULL here. In this case sbi->flags should be 0 too. */ - if (!sb || !(sbi->flags & NTFS_FLAGS_MFTMIRR)) + if (!sb || !(sbi->flags & NTFS_FLAGS_MFTMIRR) || + unlikely(ntfs3_forced_shutdown(sb))) return; blocksize = sb->s_blocksize; @@ -1006,6 +1007,30 @@ static inline __le32 security_hash(const void *sd, size_t bytes) return cpu_to_le32(hash); } +/* + * simple wrapper for sb_bread_unmovable. + */ +struct buffer_head *ntfs_bread(struct super_block *sb, sector_t block) +{ + struct ntfs_sb_info *sbi = sb->s_fs_info; + struct buffer_head *bh; + + if (unlikely(block >= sbi->volume.blocks)) { + /* prevent generic message "attempt to access beyond end of device" */ + ntfs_err(sb, "try to read out of volume at offset 0x%llx", + (u64)block << sb->s_blocksize_bits); + return NULL; + } + + bh = sb_bread_unmovable(sb, block); + if (bh) + return bh; + + ntfs_err(sb, "failed to read volume at offset 0x%llx", + (u64)block << sb->s_blocksize_bits); + return NULL; +} + int ntfs_sb_read(struct super_block *sb, u64 lbo, size_t bytes, void *buffer) { struct block_device *bdev = sb->s_bdev; @@ -2128,8 +2153,8 @@ int ntfs_insert_security(struct ntfs_sb_info *sbi, if (le32_to_cpu(d_security->size) == new_sec_size && d_security->key.hash == hash_key.hash && !memcmp(d_security + 1, sd, size_sd)) { - *security_id = d_security->key.sec_id; /* Such security already exists. 
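[Editorial sketch] ntfs_bread() above becomes an out-of-line wrapper that rejects reads beyond the end of the volume before calling sb_bread_unmovable(), so callers only ever see NULL or a referenced buffer_head. A minimal usage sketch (example_read_block is hypothetical):

static int example_read_block(struct super_block *sb, sector_t block)
{
        struct buffer_head *bh = ntfs_bread(sb, block);

        if (!bh)
                return -EIO;    /* out-of-range or I/O error, already logged */

        /* ... consume bh->b_data ... */

        brelse(bh);     /* drop the reference the read took */
        return 0;
}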
*/ + *security_id = d_security->key.sec_id; err = 0; goto out; } diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c index cf92b2433f7a..daabaad63aaf 100644 --- a/fs/ntfs3/index.c +++ b/fs/ntfs3/index.c @@ -1462,7 +1462,7 @@ static int indx_create_allocate(struct ntfs_index *indx, struct ntfs_inode *ni, goto out2; if (in->name == I30_NAME) { - ni->vfs_inode.i_size = data_size; + i_size_write(&ni->vfs_inode, data_size); inode_set_bytes(&ni->vfs_inode, alloc_size); } @@ -1544,7 +1544,7 @@ static int indx_add_allocate(struct ntfs_index *indx, struct ntfs_inode *ni, } if (in->name == I30_NAME) - ni->vfs_inode.i_size = data_size; + i_size_write(&ni->vfs_inode, data_size); *vbn = bit << indx->idx2vbn_bits; @@ -2090,7 +2090,7 @@ static int indx_shrink(struct ntfs_index *indx, struct ntfs_inode *ni, return err; if (in->name == I30_NAME) - ni->vfs_inode.i_size = new_data; + i_size_write(&ni->vfs_inode, new_data); bpb = bitmap_size(bit); if (bpb * 8 == nbits) @@ -2576,7 +2576,7 @@ int indx_delete_entry(struct ntfs_index *indx, struct ntfs_inode *ni, err = attr_set_size(ni, ATTR_ALLOC, in->name, in->name_len, &indx->alloc_run, 0, NULL, false, NULL); if (in->name == I30_NAME) - ni->vfs_inode.i_size = 0; + i_size_write(&ni->vfs_inode, 0); err = ni_remove_attr(ni, ATTR_ALLOC, in->name, in->name_len, false, NULL); diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 5e3d71374918..eb7a8c9fba01 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -345,9 +345,7 @@ next_attr: inode->i_size = le16_to_cpu(rp.SymbolicLinkReparseBuffer .PrintNameLength) / sizeof(u16); - ni->i_valid = inode->i_size; - /* Clear directory bit. */ if (ni->ni_flags & NI_FLAG_DIR) { indx_clear(&ni->dir); @@ -412,7 +410,6 @@ end_enum: goto out; if (!is_match && name) { - /* Reuse rec as buffer for ascii name. */ err = -ENOENT; goto out; } @@ -427,6 +424,7 @@ end_enum: if (names != le16_to_cpu(rec->hard_links)) { /* Correct minor error on the fly. Do not mark inode as dirty. 
*/ + ntfs_inode_warn(inode, "Correct links count -> %u.", names); rec->hard_links = cpu_to_le16(names); ni->mi.dirty = true; } @@ -653,9 +651,10 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo, off = vbo & (PAGE_SIZE - 1); folio_set_bh(bh, folio, off); - err = bh_read(bh, 0); - if (err < 0) + if (bh_read(bh, 0) < 0) { + err = -EIO; goto out; + } folio_zero_segment(folio, off + voff, off + block_size); } } @@ -853,9 +852,13 @@ static int ntfs_resident_writepage(struct folio *folio, struct writeback_control *wbc, void *data) { struct address_space *mapping = data; - struct ntfs_inode *ni = ntfs_i(mapping->host); + struct inode *inode = mapping->host; + struct ntfs_inode *ni = ntfs_i(inode); int ret; + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + ni_lock(ni); ret = attr_data_write_resident(ni, &folio->page); ni_unlock(ni); @@ -869,7 +872,12 @@ static int ntfs_resident_writepage(struct folio *folio, static int ntfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { - if (is_resident(ntfs_i(mapping->host))) + struct inode *inode = mapping->host; + + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + + if (is_resident(ntfs_i(inode))) return write_cache_pages(mapping, wbc, ntfs_resident_writepage, mapping); return mpage_writepages(mapping, wbc, ntfs_get_block); @@ -889,6 +897,9 @@ int ntfs_write_begin(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; struct ntfs_inode *ni = ntfs_i(inode); + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + *pagep = NULL; if (is_resident(ni)) { struct page *page = @@ -974,7 +985,7 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, } if (pos + err > inode->i_size) { - inode->i_size = pos + err; + i_size_write(inode, pos + err); dirty = true; } @@ -1306,6 +1317,11 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir, goto out1; } + if (unlikely(ntfs3_forced_shutdown(sb))) { + err = -EIO; + goto out2; + } + /* Mark rw ntfs as dirty. it will be cleared at umount. */ ntfs_set_state(sbi, NTFS_DIRTY_DIRTY); diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index ee3093be5170..084d19d78397 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -181,6 +181,9 @@ static int ntfs_unlink(struct inode *dir, struct dentry *dentry) struct ntfs_inode *ni = ntfs_i(dir); int err; + if (unlikely(ntfs3_forced_shutdown(dir->i_sb))) + return -EIO; + ni_lock_dir(ni); err = ntfs_unlink_inode(dir, dentry); @@ -199,6 +202,9 @@ static int ntfs_symlink(struct mnt_idmap *idmap, struct inode *dir, u32 size = strlen(symname); struct inode *inode; + if (unlikely(ntfs3_forced_shutdown(dir->i_sb))) + return -EIO; + inode = ntfs_create_inode(idmap, dir, dentry, NULL, S_IFLNK | 0777, 0, symname, size, NULL); @@ -227,6 +233,9 @@ static int ntfs_rmdir(struct inode *dir, struct dentry *dentry) struct ntfs_inode *ni = ntfs_i(dir); int err; + if (unlikely(ntfs3_forced_shutdown(dir->i_sb))) + return -EIO; + ni_lock_dir(ni); err = ntfs_unlink_inode(dir, dentry); @@ -264,6 +273,9 @@ static int ntfs_rename(struct mnt_idmap *idmap, struct inode *dir, 1024); static_assert(PATH_MAX >= 4 * 1024); + if (unlikely(ntfs3_forced_shutdown(sb))) + return -EIO; + if (flags & ~RENAME_NOREPLACE) return -EINVAL; @@ -419,7 +431,7 @@ static int ntfs_atomic_open(struct inode *dir, struct dentry *dentry, * fnd contains tree's path to insert to. * If fnd is not NULL then dir is locked. 
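[Editorial sketch] The ntfs_get_block_vbo() change in inode.c above is easy to misread: bh_read() returns 1 when the buffer was already uptodate, 0 after a successful read, and a negative errno on failure, so assigning its raw result to err risked returning a positive value. A sketch of the normalization (read_bh_once is hypothetical; the bh_read() return convention is the kernel's):

static int read_bh_once(struct buffer_head *bh)
{
        int ret = bh_read(bh, 0);       /* 1: was uptodate, 0: read done, <0: error */

        return ret < 0 ? -EIO : 0;      /* never propagate the positive cases */
}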
*/ - inode = ntfs_create_inode(mnt_idmap(file->f_path.mnt), dir, dentry, uni, + inode = ntfs_create_inode(file_mnt_idmap(file), dir, dentry, uni, mode, 0, NULL, 0, fnd); err = IS_ERR(inode) ? PTR_ERR(inode) : finish_open(file, dentry, ntfs_file_open); diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h index 86aecbb01a92..9c7478150a03 100644 --- a/fs/ntfs3/ntfs.h +++ b/fs/ntfs3/ntfs.h @@ -523,12 +523,10 @@ struct ATTR_LIST_ENTRY { __le64 vcn; // 0x08: Starting VCN of this attribute. struct MFT_REF ref; // 0x10: MFT record number with attribute. __le16 id; // 0x18: struct ATTRIB ID. - __le16 name[3]; // 0x1A: Just to align. To get real name can use bNameOffset. + __le16 name[]; // 0x1A: To get real name use name_off. }; // sizeof(0x20) -static_assert(sizeof(struct ATTR_LIST_ENTRY) == 0x20); - static inline u32 le_size(u8 name_len) { return ALIGN(offsetof(struct ATTR_LIST_ENTRY, name) + diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index f6706143d14b..79356fd29a14 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -61,6 +61,8 @@ enum utf16_endian; /* sbi->flags */ #define NTFS_FLAGS_NODISCARD 0x00000001 +/* ntfs in shutdown state. */ +#define NTFS_FLAGS_SHUTDOWN_BIT 0x00000002 /* == 4*/ /* Set when LogFile is replaying. */ #define NTFS_FLAGS_LOG_REPLAYING 0x00000008 /* Set when we changed first MFT's which copy must be updated in $MftMirr. */ @@ -226,7 +228,7 @@ struct ntfs_sb_info { u64 maxbytes; // Maximum size for normal files. u64 maxbytes_sparse; // Maximum size for sparse file. - u32 flags; // See NTFS_FLAGS_XXX. + unsigned long flags; // See NTFS_FLAGS_ CLST zone_max; // Maximum MFT zone length in clusters CLST bad_clusters; // The count of marked bad clusters. @@ -473,7 +475,7 @@ bool al_delete_le(struct ntfs_inode *ni, enum ATTR_TYPE type, CLST vcn, int al_update(struct ntfs_inode *ni, int sync); static inline size_t al_aligned(size_t size) { - return (size + 1023) & ~(size_t)1023; + return size_add(size, 1023) & ~(size_t)1023; } /* Globals from bitfunc.c */ @@ -500,6 +502,8 @@ int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry, int ntfs_file_open(struct inode *inode, struct file *file); int ntfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); +long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg); +long ntfs_compat_ioctl(struct file *filp, u32 cmd, unsigned long arg); extern const struct inode_operations ntfs_special_inode_operations; extern const struct inode_operations ntfs_file_inode_operations; extern const struct file_operations ntfs_file_operations; @@ -584,6 +588,7 @@ bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes); int log_replay(struct ntfs_inode *ni, bool *initialized); /* Globals from fsntfs.c */ +struct buffer_head *ntfs_bread(struct super_block *sb, sector_t block); bool ntfs_fix_pre_write(struct NTFS_RECORD_HEADER *rhdr, size_t bytes); int ntfs_fix_post_read(struct NTFS_RECORD_HEADER *rhdr, size_t bytes, bool simple); @@ -872,7 +877,7 @@ int ntfs_init_acl(struct mnt_idmap *idmap, struct inode *inode, int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry); ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size); -extern const struct xattr_handler * const ntfs_xattr_handlers[]; +extern const struct xattr_handler *const ntfs_xattr_handlers[]; int ntfs_save_wsl_perm(struct inode *inode, __le16 *ea_size); void ntfs_get_wsl_perm(struct inode *inode); @@ -999,6 +1004,11 @@ static inline struct ntfs_sb_info *ntfs_sb(struct super_block *sb) return 
sb->s_fs_info; } +static inline int ntfs3_forced_shutdown(struct super_block *sb) +{ + return test_bit(NTFS_FLAGS_SHUTDOWN_BIT, &ntfs_sb(sb)->flags); +} + /* * ntfs_up_cluster - Align up on cluster boundary. */ @@ -1025,19 +1035,6 @@ static inline u64 bytes_to_block(const struct super_block *sb, u64 size) return (size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; } -static inline struct buffer_head *ntfs_bread(struct super_block *sb, - sector_t block) -{ - struct buffer_head *bh = sb_bread(sb, block); - - if (bh) - return bh; - - ntfs_err(sb, "failed to read volume at offset 0x%llx", - (u64)block << sb->s_blocksize_bits); - return NULL; -} - static inline struct ntfs_inode *ntfs_i(struct inode *inode) { return container_of(inode, struct ntfs_inode, vfs_inode); diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c index 53629b1f65e9..6aa3a9d44df1 100644 --- a/fs/ntfs3/record.c +++ b/fs/ntfs3/record.c @@ -279,7 +279,7 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) if (t16 > asize) return NULL; - if (t16 + le32_to_cpu(attr->res.data_size) > asize) + if (le32_to_cpu(attr->res.data_size) > asize - t16) return NULL; t32 = sizeof(short) * attr->name_len; @@ -535,8 +535,20 @@ bool mi_remove_attr(struct ntfs_inode *ni, struct mft_inode *mi, return false; if (ni && is_attr_indexed(attr)) { - le16_add_cpu(&ni->mi.mrec->hard_links, -1); - ni->mi.dirty = true; + u16 links = le16_to_cpu(ni->mi.mrec->hard_links); + struct ATTR_FILE_NAME *fname = + attr->type != ATTR_NAME ? + NULL : + resident_data_ex(attr, + SIZEOF_ATTRIBUTE_FILENAME); + if (fname && fname->type == FILE_NAME_DOS) { + /* Do not decrease links count deleting DOS name. */ + } else if (!links) { + /* minor error. Not critical. */ + } else { + ni->mi.mrec->hard_links = cpu_to_le16(links - 1); + ni->mi.dirty = true; + } } used -= asize; diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 9153dffde950..9df7c20d066f 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -122,13 +122,12 @@ void ntfs_inode_printk(struct inode *inode, const char *fmt, ...) if (name) { struct dentry *de = d_find_alias(inode); - const u32 name_len = ARRAY_SIZE(s_name_buf) - 1; if (de) { spin_lock(&de->d_lock); - snprintf(name, name_len, " \"%s\"", de->d_name.name); + snprintf(name, sizeof(s_name_buf), " \"%s\"", + de->d_name.name); spin_unlock(&de->d_lock); - name[name_len] = 0; /* To be sure. 
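[Editorial sketch] The mi_enum_attr() hunk in record.c swaps "t16 + data_size > asize" for "data_size > asize - t16" because the addition can wrap in 32 bits when a corrupted record supplies a huge data_size, defeating the check; the subtraction is safe since t16 <= asize was verified just before. The general idiom, as a sketch (fits_in_buffer is hypothetical):

static bool fits_in_buffer(u32 off, u32 size, u32 limit)
{
        if (off > limit)
                return false;
        return size <= limit - off;     /* "off + size > limit" could overflow */
}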
*/ } else { name[0] = 0; } @@ -625,7 +624,7 @@ static void ntfs3_free_sbi(struct ntfs_sb_info *sbi) { kfree(sbi->new_rec); kvfree(ntfs_put_shared(sbi->upcase)); - kfree(sbi->def_table); + kvfree(sbi->def_table); kfree(sbi->compress.lznt); #ifdef CONFIG_NTFS3_LZX_XPRESS xpress_free_decompressor(sbi->compress.xpress); @@ -715,6 +714,14 @@ static int ntfs_show_options(struct seq_file *m, struct dentry *root) } /* + * ntfs_shutdown - super_operations::shutdown + */ +static void ntfs_shutdown(struct super_block *sb) +{ + set_bit(NTFS_FLAGS_SHUTDOWN_BIT, &ntfs_sb(sb)->flags); +} + +/* * ntfs_sync_fs - super_operations::sync_fs */ static int ntfs_sync_fs(struct super_block *sb, int wait) @@ -724,6 +731,9 @@ static int ntfs_sync_fs(struct super_block *sb, int wait) struct ntfs_inode *ni; struct inode *inode; + if (unlikely(ntfs3_forced_shutdown(sb))) + return -EIO; + ni = sbi->security.ni; if (ni) { inode = &ni->vfs_inode; @@ -763,6 +773,7 @@ static const struct super_operations ntfs_sops = { .put_super = ntfs_put_super, .statfs = ntfs_statfs, .show_options = ntfs_show_options, + .shutdown = ntfs_shutdown, .sync_fs = ntfs_sync_fs, .write_inode = ntfs3_write_inode, }; @@ -866,6 +877,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, u16 fn, ao; u8 cluster_bits; u32 boot_off = 0; + sector_t boot_block = 0; const char *hint = "Primary boot"; /* Save original dev_size. Used with alternative boot. */ @@ -873,11 +885,11 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, sbi->volume.blocks = dev_size >> PAGE_SHIFT; - bh = ntfs_bread(sb, 0); +read_boot: + bh = ntfs_bread(sb, boot_block); if (!bh) - return -EIO; + return boot_block ? -EINVAL : -EIO; -check_boot: err = -EINVAL; /* Corrupted image; do not read OOB */ @@ -1108,26 +1120,24 @@ check_boot: } out: - if (err == -EINVAL && !bh->b_blocknr && dev_size0 > PAGE_SHIFT) { + brelse(bh); + + if (err == -EINVAL && !boot_block && dev_size0 > PAGE_SHIFT) { u32 block_size = min_t(u32, sector_size, PAGE_SIZE); u64 lbo = dev_size0 - sizeof(*boot); - /* - * Try alternative boot (last sector) - */ - brelse(bh); - - sb_set_blocksize(sb, block_size); - bh = ntfs_bread(sb, lbo >> blksize_bits(block_size)); - if (!bh) - return -EINVAL; - + boot_block = lbo >> blksize_bits(block_size); boot_off = lbo & (block_size - 1); - hint = "Alternative boot"; - dev_size = dev_size0; /* restore original size. */ - goto check_boot; + if (boot_block && block_size >= boot_off + sizeof(*boot)) { + /* + * Try alternative boot (last sector) + */ + sb_set_blocksize(sb, block_size); + hint = "Alternative boot"; + dev_size = dev_size0; /* restore original size. 
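[Editorial sketch] The shutdown support added above is three small pieces: sbi->flags becomes an unsigned long so the atomic bitops apply, ntfs_shutdown() sets NTFS_FLAGS_SHUTDOWN_BIT from the ->shutdown super operation, and entry points return -EIO once ntfs3_forced_shutdown() observes the bit. A condensed sketch of the same wiring (all example_* names are hypothetical):

struct example_sb_info {
        unsigned long flags;    /* unsigned long: required by set_bit()/test_bit() */
};

#define EXAMPLE_SHUTDOWN_BIT    2       /* bit index, i.e. mask 0x4 */

static void example_shutdown(struct super_block *sb)
{
        struct example_sb_info *sbi = sb->s_fs_info;

        set_bit(EXAMPLE_SHUTDOWN_BIT, &sbi->flags);     /* atomic publish */
}

static int example_entry_point(struct super_block *sb)
{
        struct example_sb_info *sbi = sb->s_fs_info;

        if (unlikely(test_bit(EXAMPLE_SHUTDOWN_BIT, &sbi->flags)))
                return -EIO;    /* refuse all work after shutdown */
        /* ... normal processing ... */
        return 0;
}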
*/ + goto read_boot; + } } - brelse(bh); return err; } @@ -1815,7 +1825,7 @@ static int __init init_ntfs_fs(void) ntfs_inode_cachep = kmem_cache_create( "ntfs_inode_cache", sizeof(struct ntfs_inode), 0, - (SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT), + (SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT), init_once); if (!ntfs_inode_cachep) { err = -ENOMEM; diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index 4274b6f31cfa..53e7d1fa036a 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -219,6 +219,9 @@ static ssize_t ntfs_list_ea(struct ntfs_inode *ni, char *buffer, if (!ea->name_len) break; + if (ea->name_len > ea_size) + break; + if (buffer) { /* Check if we can use field ea->name */ if (off + ea_size > size) @@ -744,6 +747,9 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de, int err; struct ntfs_inode *ni = ntfs_i(inode); + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + /* Dispatch request. */ if (!strcmp(name, SYSTEM_DOS_ATTRIB)) { /* system.dos_attrib */ diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 4d7efefa98c5..1bde1281d514 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -213,7 +213,7 @@ struct o2hb_region { unsigned int hr_num_pages; struct page **hr_slot_data; - struct bdev_handle *hr_bdev_handle; + struct file *hr_bdev_file; struct o2hb_disk_slot *hr_slots; /* live node map of this region */ @@ -263,7 +263,7 @@ struct o2hb_region { static inline struct block_device *reg_bdev(struct o2hb_region *reg) { - return reg->hr_bdev_handle ? reg->hr_bdev_handle->bdev : NULL; + return reg->hr_bdev_file ? file_bdev(reg->hr_bdev_file) : NULL; } struct o2hb_bio_wait_ctxt { @@ -1509,8 +1509,8 @@ static void o2hb_region_release(struct config_item *item) kfree(reg->hr_slot_data); } - if (reg->hr_bdev_handle) - bdev_release(reg->hr_bdev_handle); + if (reg->hr_bdev_file) + fput(reg->hr_bdev_file); kfree(reg->hr_slots); @@ -1569,7 +1569,7 @@ static ssize_t o2hb_region_block_bytes_store(struct config_item *item, unsigned long block_bytes; unsigned int block_bits; - if (reg->hr_bdev_handle) + if (reg->hr_bdev_file) return -EINVAL; status = o2hb_read_block_input(reg, page, &block_bytes, @@ -1598,7 +1598,7 @@ static ssize_t o2hb_region_start_block_store(struct config_item *item, char *p = (char *)page; ssize_t ret; - if (reg->hr_bdev_handle) + if (reg->hr_bdev_file) return -EINVAL; ret = kstrtoull(p, 0, &tmp); @@ -1623,7 +1623,7 @@ static ssize_t o2hb_region_blocks_store(struct config_item *item, unsigned long tmp; char *p = (char *)page; - if (reg->hr_bdev_handle) + if (reg->hr_bdev_file) return -EINVAL; tmp = simple_strtoul(p, &p, 0); @@ -1642,7 +1642,7 @@ static ssize_t o2hb_region_dev_show(struct config_item *item, char *page) { unsigned int ret = 0; - if (to_o2hb_region(item)->hr_bdev_handle) + if (to_o2hb_region(item)->hr_bdev_file) ret = sprintf(page, "%pg\n", reg_bdev(to_o2hb_region(item))); return ret; @@ -1753,7 +1753,7 @@ out: } /* - * this is acting as commit; we set up all of hr_bdev_handle and hr_task or + * this is acting as commit; we set up all of hr_bdev_file and hr_task or * nothing */ static ssize_t o2hb_region_dev_store(struct config_item *item, @@ -1769,7 +1769,7 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, ssize_t ret = -EINVAL; int live_threshold; - if (reg->hr_bdev_handle) + if (reg->hr_bdev_file) goto out; /* We can't heartbeat without having had our node number @@ -1795,11 +1795,11 @@ static ssize_t o2hb_region_dev_store(struct 
config_item *item, if (!S_ISBLK(f.file->f_mapping->host->i_mode)) goto out2; - reg->hr_bdev_handle = bdev_open_by_dev(f.file->f_mapping->host->i_rdev, + reg->hr_bdev_file = bdev_file_open_by_dev(f.file->f_mapping->host->i_rdev, BLK_OPEN_WRITE | BLK_OPEN_READ, NULL, NULL); - if (IS_ERR(reg->hr_bdev_handle)) { - ret = PTR_ERR(reg->hr_bdev_handle); - reg->hr_bdev_handle = NULL; + if (IS_ERR(reg->hr_bdev_file)) { + ret = PTR_ERR(reg->hr_bdev_file); + reg->hr_bdev_file = NULL; goto out2; } @@ -1903,8 +1903,8 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, out3: if (ret < 0) { - bdev_release(reg->hr_bdev_handle); - reg->hr_bdev_handle = NULL; + fput(reg->hr_bdev_file); + reg->hr_bdev_file = NULL; } out2: fdput(f); diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 85215162c9dd..7fc0e920eda7 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -578,7 +578,7 @@ static int __init init_dlmfs_fs(void) dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache", sizeof(struct dlmfs_inode_private), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), + SLAB_ACCOUNT), dlmfs_init_once); if (!dlmfs_inode_cache) { status = -ENOMEM; diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c index f37174e79fad..6de944818c56 100644 --- a/fs/ocfs2/locks.c +++ b/fs/ocfs2/locks.c @@ -27,7 +27,7 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode, struct ocfs2_file_private *fp = file->private_data; struct ocfs2_lock_res *lockres = &fp->fp_flock; - if (fl->fl_type == F_WRLCK) + if (lock_is_write(fl)) level = 1; if (!IS_SETLKW(cmd)) trylock = 1; @@ -53,8 +53,8 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode, */ locks_init_lock(&request); - request.fl_type = F_UNLCK; - request.fl_flags = FL_FLOCK; + request.c.flc_type = F_UNLCK; + request.c.flc_flags = FL_FLOCK; locks_lock_file_wait(file, &request); ocfs2_file_unlock(file); @@ -100,14 +100,14 @@ int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl) struct inode *inode = file->f_mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - if (!(fl->fl_flags & FL_FLOCK)) + if (!(fl->c.flc_flags & FL_FLOCK)) return -ENOLCK; if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) || ocfs2_mount_local(osb)) return locks_lock_file_wait(file, fl); - if (fl->fl_type == F_UNLCK) + if (lock_is_unlock(fl)) return ocfs2_do_funlock(file, cmd, fl); else return ocfs2_do_flock(file, inode, cmd, fl); @@ -118,7 +118,7 @@ int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl) struct inode *inode = file->f_mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - if (!(fl->fl_flags & FL_POSIX)) + if (!(fl->c.flc_flags & FL_POSIX)) return -ENOLCK; return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl); diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 9b76ee66aeb2..c11406cd87a8 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -744,7 +744,7 @@ static int user_plock(struct ocfs2_cluster_connection *conn, return dlm_posix_cancel(conn->cc_lockspace, ino, file, fl); else if (IS_GETLK(cmd)) return dlm_posix_get(conn->cc_lockspace, ino, file, fl); - else if (fl->fl_type == F_UNLCK) + else if (lock_is_unlock(fl)) return dlm_posix_unlock(conn->cc_lockspace, ino, file, fl); else return dlm_posix_lock(conn->cc_lockspace, ino, file, cmd, fl); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 6b906424902b..b3f860888e93 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1706,18 +1706,17 @@ static int 
ocfs2_initialize_mem_caches(void) sizeof(struct ocfs2_inode_info), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), + SLAB_ACCOUNT), ocfs2_inode_init_once); ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache", sizeof(struct ocfs2_dquot), 0, - (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT), NULL); ocfs2_qf_chunk_cachep = kmem_cache_create("ocfs2_qf_chunk_cache", sizeof(struct ocfs2_quota_chunk), 0, - (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), + (SLAB_RECLAIM_ACCOUNT), NULL); if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep || !ocfs2_qf_chunk_cachep) { @@ -2027,8 +2026,8 @@ static int ocfs2_initialize_super(struct super_block *sb, cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits); bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits); - memcpy(&sb->s_uuid, di->id2.i_super.s_uuid, - sizeof(di->id2.i_super.s_uuid)); + super_set_uuid(sb, di->id2.i_super.s_uuid, + sizeof(di->id2.i_super.s_uuid)); osb->osb_dx_mask = (1 << (cbits - bbits)) - 1; diff --git a/fs/open.c b/fs/open.c index a84d21e55c39..ee8460c83c77 100644 --- a/fs/open.c +++ b/fs/open.c @@ -29,7 +29,6 @@ #include <linux/audit.h> #include <linux/falloc.h> #include <linux/fs_struct.h> -#include <linux/ima.h> #include <linux/dnotify.h> #include <linux/compat.h> #include <linux/mnt_idmapping.h> @@ -154,49 +153,52 @@ COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length } #endif -long do_sys_ftruncate(unsigned int fd, loff_t length, int small) +long do_ftruncate(struct file *file, loff_t length, int small) { struct inode *inode; struct dentry *dentry; - struct fd f; int error; - error = -EINVAL; - if (length < 0) - goto out; - error = -EBADF; - f = fdget(fd); - if (!f.file) - goto out; - /* explicitly opened as large or we are on 64-bit box */ - if (f.file->f_flags & O_LARGEFILE) + if (file->f_flags & O_LARGEFILE) small = 0; - dentry = f.file->f_path.dentry; + dentry = file->f_path.dentry; inode = dentry->d_inode; - error = -EINVAL; - if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE)) - goto out_putf; + if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE)) + return -EINVAL; - error = -EINVAL; /* Cannot ftruncate over 2^31 bytes without large file support */ if (small && length > MAX_NON_LFS) - goto out_putf; + return -EINVAL; - error = -EPERM; /* Check IS_APPEND on real upper inode */ - if (IS_APPEND(file_inode(f.file))) - goto out_putf; + if (IS_APPEND(file_inode(file))) + return -EPERM; sb_start_write(inode->i_sb); - error = security_file_truncate(f.file); + error = security_file_truncate(file); if (!error) - error = do_truncate(file_mnt_idmap(f.file), dentry, length, - ATTR_MTIME | ATTR_CTIME, f.file); + error = do_truncate(file_mnt_idmap(file), dentry, length, + ATTR_MTIME | ATTR_CTIME, file); sb_end_write(inode->i_sb); -out_putf: + + return error; +} + +long do_sys_ftruncate(unsigned int fd, loff_t length, int small) +{ + struct fd f; + int error; + + if (length < 0) + return -EINVAL; + f = fdget(fd); + if (!f.file) + return -EBADF; + + error = do_ftruncate(f.file, length, small); + fdput(f); -out: return error; } @@ -1364,7 +1366,7 @@ struct file *filp_open(const char *filename, int flags, umode_t mode) { struct filename *name = getname_kernel(filename); struct file *file = ERR_CAST(name); - + if (!IS_ERR(name)) { file = file_open_name(name, flags, mode); putname(name); diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 
c4b65a6d41cc..4a0779e3ef79 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -446,7 +446,7 @@ static int __init init_openprom_fs(void) sizeof(struct op_inode_info), 0, (SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD | SLAB_ACCOUNT), + SLAB_ACCOUNT), op_inode_init_once); if (!op_inode_cachep) return -ENOMEM; diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index b8e25ca51016..8586e2f5d243 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -265,20 +265,18 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry, if (IS_ERR(old_file)) return PTR_ERR(old_file); + /* Try to use clone_file_range to clone up within the same fs */ + cloned = vfs_clone_file_range(old_file, 0, new_file, 0, len, 0); + if (cloned == len) + goto out_fput; + + /* Couldn't clone, so now we try to copy the data */ error = rw_verify_area(READ, old_file, &old_pos, len); if (!error) error = rw_verify_area(WRITE, new_file, &new_pos, len); if (error) goto out_fput; - /* Try to use clone_file_range to clone up within the same fs */ - ovl_start_write(dentry); - cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0); - ovl_end_write(dentry); - if (cloned == len) - goto out_fput; - /* Couldn't clone, so now we try to copy the data */ - /* Check if lower fs supports seek operation */ if (old_file->f_mode & FMODE_LSEEK) skip_hole = true; diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 984ffdaeed6c..5764f91d283e 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -18,10 +18,11 @@ struct ovl_lookup_data { struct super_block *sb; - struct vfsmount *mnt; + const struct ovl_layer *layer; struct qstr name; bool is_dir; bool opaque; + bool xwhiteouts; bool stop; bool last; char *redirect; @@ -201,17 +202,13 @@ struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh, return real; } -static bool ovl_is_opaquedir(struct ovl_fs *ofs, const struct path *path) -{ - return ovl_path_check_dir_xattr(ofs, path, OVL_XATTR_OPAQUE); -} - static struct dentry *ovl_lookup_positive_unlocked(struct ovl_lookup_data *d, const char *name, struct dentry *base, int len, bool drop_negative) { - struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->mnt), name, base, len); + struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->layer->mnt), name, + base, len); if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { if (drop_negative && ret->d_lockref.count == 1) { @@ -232,10 +229,13 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, size_t prelen, const char *post, struct dentry **ret, bool drop_negative) { + struct ovl_fs *ofs = OVL_FS(d->sb); struct dentry *this; struct path path; int err; bool last_element = !post[0]; + bool is_upper = d->layer->idx == 0; + char val; this = ovl_lookup_positive_unlocked(d, name, base, namelen, drop_negative); if (IS_ERR(this)) { @@ -253,8 +253,8 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, } path.dentry = this; - path.mnt = d->mnt; - if (ovl_path_is_whiteout(OVL_FS(d->sb), &path)) { + path.mnt = d->layer->mnt; + if (ovl_path_is_whiteout(ofs, &path)) { d->stop = d->opaque = true; goto put_and_out; } @@ -272,7 +272,7 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, d->stop = true; goto put_and_out; } - err = ovl_check_metacopy_xattr(OVL_FS(d->sb), &path, NULL); + err = ovl_check_metacopy_xattr(ofs, &path, NULL); if (err < 0) goto out_err; @@ -292,7 +292,12 @@ static int ovl_lookup_single(struct dentry *base, struct 
ovl_lookup_data *d, if (d->last) goto out; - if (ovl_is_opaquedir(OVL_FS(d->sb), &path)) { + /* overlay.opaque=x means xwhiteouts directory */ + val = ovl_get_opaquedir_val(ofs, &path); + if (last_element && !is_upper && val == 'x') { + d->xwhiteouts = true; + ovl_layer_set_xwhiteouts(ofs, d->layer); + } else if (val == 'y') { d->stop = true; if (last_element) d->opaque = true; @@ -863,7 +868,8 @@ fail: * Returns next layer in stack starting from top. * Returns -1 if this is the last layer. */ -int ovl_path_next(int idx, struct dentry *dentry, struct path *path) +int ovl_path_next(int idx, struct dentry *dentry, struct path *path, + const struct ovl_layer **layer) { struct ovl_entry *oe = OVL_E(dentry); struct ovl_path *lowerstack = ovl_lowerstack(oe); @@ -871,13 +877,16 @@ int ovl_path_next(int idx, struct dentry *dentry, struct path *path) BUG_ON(idx < 0); if (idx == 0) { ovl_path_upper(dentry, path); - if (path->dentry) + if (path->dentry) { + *layer = &OVL_FS(dentry->d_sb)->layers[0]; return ovl_numlower(oe) ? 1 : -1; + } idx++; } BUG_ON(idx > ovl_numlower(oe)); path->dentry = lowerstack[idx - 1].dentry; - path->mnt = lowerstack[idx - 1].layer->mnt; + *layer = lowerstack[idx - 1].layer; + path->mnt = (*layer)->mnt; return (idx < ovl_numlower(oe)) ? idx + 1 : -1; } @@ -1055,7 +1064,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, old_cred = ovl_override_creds(dentry->d_sb); upperdir = ovl_dentry_upper(dentry->d_parent); if (upperdir) { - d.mnt = ovl_upper_mnt(ofs); + d.layer = &ofs->layers[0]; err = ovl_lookup_layer(upperdir, &d, &upperdentry, true); if (err) goto out; @@ -1111,7 +1120,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, else if (d.is_dir || !ofs->numdatalayer) d.last = lower.layer->idx == ovl_numlower(roe); - d.mnt = lower.layer->mnt; + d.layer = lower.layer; err = ovl_lookup_layer(lower.dentry, &d, &this, false); if (err) goto out_put; @@ -1278,6 +1287,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (upperopaque) ovl_dentry_set_opaque(dentry); + if (d.xwhiteouts) + ovl_dentry_set_xwhiteouts(dentry); if (upperdentry) ovl_dentry_set_upper_alias(dentry); diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 5ba11eb43767..ee949f3e7c77 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -50,7 +50,6 @@ enum ovl_xattr { OVL_XATTR_METACOPY, OVL_XATTR_PROTATTR, OVL_XATTR_XWHITEOUT, - OVL_XATTR_XWHITEOUTS, }; enum ovl_inode_flag { @@ -70,6 +69,8 @@ enum ovl_entry_flag { OVL_E_UPPER_ALIAS, OVL_E_OPAQUE, OVL_E_CONNECTED, + /* Lower stack may contain xwhiteout entries */ + OVL_E_XWHITEOUTS, }; enum { @@ -477,6 +478,10 @@ bool ovl_dentry_test_flag(unsigned long flag, struct dentry *dentry); bool ovl_dentry_is_opaque(struct dentry *dentry); bool ovl_dentry_is_whiteout(struct dentry *dentry); void ovl_dentry_set_opaque(struct dentry *dentry); +bool ovl_dentry_has_xwhiteouts(struct dentry *dentry); +void ovl_dentry_set_xwhiteouts(struct dentry *dentry); +void ovl_layer_set_xwhiteouts(struct ovl_fs *ofs, + const struct ovl_layer *layer); bool ovl_dentry_has_upper_alias(struct dentry *dentry); void ovl_dentry_set_upper_alias(struct dentry *dentry); bool ovl_dentry_needs_data_copy_up(struct dentry *dentry, int flags); @@ -494,11 +499,10 @@ struct file *ovl_path_open(const struct path *path, int flags); int ovl_copy_up_start(struct dentry *dentry, int flags); void ovl_copy_up_end(struct dentry *dentry); bool ovl_already_copied_up(struct dentry *dentry, int flags); -bool 
ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path, - enum ovl_xattr ox); +char ovl_get_dir_xattr_val(struct ovl_fs *ofs, const struct path *path, + enum ovl_xattr ox); bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path); bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path); -bool ovl_path_check_xwhiteouts_xattr(struct ovl_fs *ofs, const struct path *path); bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs, const struct path *upperpath); @@ -573,7 +577,13 @@ static inline bool ovl_is_impuredir(struct super_block *sb, .mnt = ovl_upper_mnt(ofs), }; - return ovl_path_check_dir_xattr(ofs, &upperpath, OVL_XATTR_IMPURE); + return ovl_get_dir_xattr_val(ofs, &upperpath, OVL_XATTR_IMPURE) == 'y'; +} + +static inline char ovl_get_opaquedir_val(struct ovl_fs *ofs, + const struct path *path) +{ + return ovl_get_dir_xattr_val(ofs, path, OVL_XATTR_OPAQUE); } static inline bool ovl_redirect_follow(struct ovl_fs *ofs) @@ -680,7 +690,8 @@ int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin, struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh); struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, struct dentry *origin, bool verify); -int ovl_path_next(int idx, struct dentry *dentry, struct path *path); +int ovl_path_next(int idx, struct dentry *dentry, struct path *path, + const struct ovl_layer **layer); int ovl_verify_lowerdata(struct dentry *dentry); struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 5fa9c58af65f..cb449ab310a7 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -40,6 +40,8 @@ struct ovl_layer { int idx; /* One fsid per unique underlying sb (upper fsid == 0) */ int fsid; + /* xwhiteouts were found on this layer */ + bool has_xwhiteouts; }; struct ovl_path { @@ -59,7 +61,7 @@ struct ovl_fs { unsigned int numfs; /* Number of data-only lower layers */ unsigned int numdatalayer; - const struct ovl_layer *layers; + struct ovl_layer *layers; struct ovl_sb *fs; /* workbasedir is the path at workdir= mount option */ struct dentry *workbasedir; diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index 112b4b12f825..36dcc530ac28 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -280,12 +280,20 @@ static int ovl_mount_dir_check(struct fs_context *fc, const struct path *path, { struct ovl_fs_context *ctx = fc->fs_private; - if (ovl_dentry_weird(path->dentry)) - return invalfc(fc, "filesystem on %s not supported", name); - if (!d_is_dir(path->dentry)) return invalfc(fc, "%s is not a directory", name); + /* + * Root dentries of case-insensitive capable filesystems might + * not have the dentry operations set, but still be incompatible + * with overlayfs. Check explicitly to prevent post-mount + * failures. 
+ */ + if (sb_has_encoding(path->mnt->mnt_sb)) + return invalfc(fc, "case-insensitive capable filesystem on %s not supported", name); + + if (ovl_dentry_weird(path->dentry)) + return invalfc(fc, "filesystem on %s not supported", name); /* * Check whether upper path is read-only here to report failures diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index e71156baa7bc..0ca8af060b0c 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -305,8 +305,6 @@ static inline int ovl_dir_read(const struct path *realpath, if (IS_ERR(realfile)) return PTR_ERR(realfile); - rdd->in_xwhiteouts_dir = rdd->dentry && - ovl_path_check_xwhiteouts_xattr(OVL_FS(rdd->dentry->d_sb), realpath); rdd->first_maybe_whiteout = NULL; rdd->ctx.pos = 0; do { @@ -359,10 +357,13 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list, .is_lowest = false, }; int idx, next; + const struct ovl_layer *layer; for (idx = 0; idx != -1; idx = next) { - next = ovl_path_next(idx, dentry, &realpath); + next = ovl_path_next(idx, dentry, &realpath, &layer); rdd.is_upper = ovl_dentry_upper(dentry) == realpath.dentry; + rdd.in_xwhiteouts_dir = layer->has_xwhiteouts && + ovl_dentry_has_xwhiteouts(dentry); if (next != -1) { err = ovl_dir_read(&realpath, &rdd); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 4ab66e3d4cff..a40fc7e05525 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -28,41 +28,38 @@ MODULE_LICENSE("GPL"); struct ovl_dir_cache; -static struct dentry *ovl_d_real(struct dentry *dentry, - const struct inode *inode) +static struct dentry *ovl_d_real(struct dentry *dentry, enum d_real_type type) { - struct dentry *real = NULL, *lower; + struct dentry *upper, *lower; int err; - /* - * vfs is only expected to call d_real() with NULL from d_real_inode() - * and with overlay inode from file_dentry() on an overlay file. - * - * TODO: remove @inode argument from d_real() API, remove code in this - * function that deals with non-NULL @inode and remove d_real() call - * from file_dentry(). - */ - if (inode && d_inode(dentry) == inode) - return dentry; - else if (inode) + switch (type) { + case D_REAL_DATA: + case D_REAL_METADATA: + break; + default: goto bug; + } if (!d_is_reg(dentry)) { /* d_real_inode() is only relevant for regular files */ return dentry; } - real = ovl_dentry_upper(dentry); - if (real && (inode == d_inode(real))) - return real; + upper = ovl_dentry_upper(dentry); + if (upper && (type == D_REAL_METADATA || + ovl_has_upperdata(d_inode(dentry)))) + return upper; - if (real && !inode && ovl_has_upperdata(d_inode(dentry))) - return real; + if (type == D_REAL_METADATA) { + lower = ovl_dentry_lower(dentry); + goto real_lower; + } /* - * Best effort lazy lookup of lowerdata for !inode case to return + * Best effort lazy lookup of lowerdata for D_REAL_DATA case to return * the real lowerdata dentry. The only current caller of d_real() with - * NULL inode is d_real_inode() from trace_uprobe and this caller is + * D_REAL_DATA is d_real_inode() from trace_uprobe and this caller is * likely going to be followed reading from the file, before placing * uprobes on offset within the file, so lowerdata should be available * when setting the uprobe. 
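[Editorial sketch] For context on the ovl_d_real() rewrite above: d_real() now takes an explicit enum d_real_type instead of an inode hint, so callers state whether they want the dentry backing the file's data or the one carrying its metadata. A sketch of the new calling convention (example_data_dentry is hypothetical):

/* D_REAL_DATA resolves to the dentry whose inode holds the file
 * contents (lowerdata for a metacopy file); D_REAL_METADATA resolves
 * to the dentry carrying the inode metadata. Non-overlay dentries are
 * returned unchanged. */
static struct dentry *example_data_dentry(struct dentry *dentry)
{
        return d_real(dentry, D_REAL_DATA);
}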
@@ -73,18 +70,13 @@ static struct dentry *ovl_d_real(struct dentry *dentry, lower = ovl_dentry_lowerdata(dentry); if (!lower) goto bug; - real = lower; - /* Handle recursion */ - real = d_real(real, inode); +real_lower: + /* Handle recursion into stacked lower fs */ + return d_real(lower, type); - if (!inode || inode == d_inode(real)) - return real; bug: - WARN(1, "%s(%pd4, %s:%lu): real dentry (%p/%lu) not found\n", - __func__, dentry, inode ? inode->i_sb->s_id : "NULL", - inode ? inode->i_ino : 0, real, - real && d_inode(real) ? d_inode(real)->i_ino : 0); + WARN(1, "%s(%pd4, %d): real dentry not found\n", __func__, dentry, type); return dentry; } @@ -1249,6 +1241,7 @@ static struct dentry *ovl_get_root(struct super_block *sb, struct ovl_entry *oe) { struct dentry *root; + struct ovl_fs *ofs = OVL_FS(sb); struct ovl_path *lowerpath = ovl_lowerstack(oe); unsigned long ino = d_inode(lowerpath->dentry)->i_ino; int fsid = lowerpath->layer->fsid; @@ -1270,6 +1263,20 @@ static struct dentry *ovl_get_root(struct super_block *sb, ovl_set_flag(OVL_IMPURE, d_inode(root)); } + /* Look for xwhiteouts marker except in the lowermost layer */ + for (int i = 0; i < ovl_numlower(oe) - 1; i++, lowerpath++) { + struct path path = { + .mnt = lowerpath->layer->mnt, + .dentry = lowerpath->dentry, + }; + + /* overlay.opaque=x means xwhiteouts directory */ + if (ovl_get_opaquedir_val(ofs, &path) == 'x') { + ovl_layer_set_xwhiteouts(ofs, lowerpath->layer); + ovl_dentry_set_xwhiteouts(root); + } + } + /* Root is always merge -> can have whiteouts */ ovl_set_flag(OVL_WHITEOUTS, d_inode(root)); ovl_dentry_set_flag(OVL_E_CONNECTED, root); @@ -1496,7 +1503,7 @@ static int __init ovl_init(void) ovl_inode_cachep = kmem_cache_create("ovl_inode", sizeof(struct ovl_inode), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), + SLAB_ACCOUNT), ovl_inode_init_once); if (ovl_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 0217094c23ea..d285d1d7baad 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -461,6 +461,33 @@ void ovl_dentry_set_opaque(struct dentry *dentry) ovl_dentry_set_flag(OVL_E_OPAQUE, dentry); } +bool ovl_dentry_has_xwhiteouts(struct dentry *dentry) +{ + return ovl_dentry_test_flag(OVL_E_XWHITEOUTS, dentry); +} + +void ovl_dentry_set_xwhiteouts(struct dentry *dentry) +{ + ovl_dentry_set_flag(OVL_E_XWHITEOUTS, dentry); +} + +/* + * ovl_layer_set_xwhiteouts() is called before adding the overlay dir + * dentry to dcache, while readdir of that same directory happens after + * the overlay dir dentry is in dcache, so if some cpu observes that + * ovl_dentry_is_xwhiteouts(), it will also observe layer->has_xwhiteouts + * for the layers where xwhiteouts marker was found in that merge dir. + */ +void ovl_layer_set_xwhiteouts(struct ovl_fs *ofs, + const struct ovl_layer *layer) +{ + if (layer->has_xwhiteouts) + return; + + /* Write once to read-mostly layer properties */ + ofs->layers[layer->idx].has_xwhiteouts = true; +} + /* * For hard links and decoded file handles, it's possible for ovl_dentry_upper() * to return positive, while there's no actual upper alias for the inode. 
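[Editorial sketch] Summarizing the mechanism these overlayfs hunks build: overlay.opaque is now read as a single character, where 'y' keeps the old opaque-directory meaning and 'x' marks a directory whose entries may be extended ("xwhiteout") whiteouts; finding 'x' sets both a per-layer and a per-dentry flag, which readdir later combines via rdd.in_xwhiteouts_dir. A condensed sketch of the consumer side (example_check_dir is hypothetical; the ovl_* names come from the patch):

static void example_check_dir(struct ovl_fs *ofs, struct dentry *dentry,
                              const struct ovl_layer *layer,
                              const struct path *path)
{
        char val = ovl_get_dir_xattr_val(ofs, path, OVL_XATTR_OPAQUE);

        if (val == 'x') {               /* may contain xwhiteout entries */
                ovl_layer_set_xwhiteouts(ofs, layer);
                ovl_dentry_set_xwhiteouts(dentry);
        } else if (val == 'y') {        /* classic opaque directory */
                ovl_dentry_set_opaque(dentry);
        }
        /* val == 0: xattr absent or path is not a directory */
}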
@@ -739,19 +766,6 @@ bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path) return res >= 0; } -bool ovl_path_check_xwhiteouts_xattr(struct ovl_fs *ofs, const struct path *path) -{ - struct dentry *dentry = path->dentry; - int res; - - /* xattr.whiteouts must be a directory */ - if (!d_is_dir(dentry)) - return false; - - res = ovl_path_getxattr(ofs, path, OVL_XATTR_XWHITEOUTS, NULL, 0); - return res >= 0; -} - /* * Load persistent uuid from xattr into s_uuid if found, or store a new * random generated value in s_uuid and in xattr. @@ -760,13 +774,14 @@ bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs, const struct path *upperpath) { bool set = false; + uuid_t uuid; int res; /* Try to load existing persistent uuid */ - res = ovl_path_getxattr(ofs, upperpath, OVL_XATTR_UUID, sb->s_uuid.b, + res = ovl_path_getxattr(ofs, upperpath, OVL_XATTR_UUID, uuid.b, UUID_SIZE); if (res == UUID_SIZE) - return true; + goto set_uuid; if (res != -ENODATA) goto fail; @@ -794,37 +809,37 @@ bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs, } /* Generate overlay instance uuid */ - uuid_gen(&sb->s_uuid); + uuid_gen(&uuid); /* Try to store persistent uuid */ set = true; - res = ovl_setxattr(ofs, upperpath->dentry, OVL_XATTR_UUID, sb->s_uuid.b, + res = ovl_setxattr(ofs, upperpath->dentry, OVL_XATTR_UUID, uuid.b, UUID_SIZE); - if (res == 0) - return true; + if (res) + goto fail; + +set_uuid: + super_set_uuid(sb, uuid.b, sizeof(uuid)); + return true; fail: - memset(sb->s_uuid.b, 0, UUID_SIZE); ofs->config.uuid = OVL_UUID_NULL; pr_warn("failed to %s uuid (%pd2, err=%i); falling back to uuid=null.\n", set ? "set" : "get", upperpath->dentry, res); return false; } -bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path, - enum ovl_xattr ox) +char ovl_get_dir_xattr_val(struct ovl_fs *ofs, const struct path *path, + enum ovl_xattr ox) { int res; char val; if (!d_is_dir(path->dentry)) - return false; + return 0; res = ovl_path_getxattr(ofs, path, ox, &val, 1); - if (res == 1 && val == 'y') - return true; - - return false; + return res == 1 ? 
val : 0; } #define OVL_XATTR_OPAQUE_POSTFIX "opaque" @@ -837,7 +852,6 @@ bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path, #define OVL_XATTR_METACOPY_POSTFIX "metacopy" #define OVL_XATTR_PROTATTR_POSTFIX "protattr" #define OVL_XATTR_XWHITEOUT_POSTFIX "whiteout" -#define OVL_XATTR_XWHITEOUTS_POSTFIX "whiteouts" #define OVL_XATTR_TAB_ENTRY(x) \ [x] = { [false] = OVL_XATTR_TRUSTED_PREFIX x ## _POSTFIX, \ @@ -854,7 +868,6 @@ const char *const ovl_xattr_table[][2] = { OVL_XATTR_TAB_ENTRY(OVL_XATTR_METACOPY), OVL_XATTR_TAB_ENTRY(OVL_XATTR_PROTATTR), OVL_XATTR_TAB_ENTRY(OVL_XATTR_XWHITEOUT), - OVL_XATTR_TAB_ENTRY(OVL_XATTR_XWHITEOUTS), }; int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry, diff --git a/fs/pidfs.c b/fs/pidfs.c new file mode 100644 index 000000000000..8fd71a00be9c --- /dev/null +++ b/fs/pidfs.c @@ -0,0 +1,290 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/anon_inodes.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/magic.h> +#include <linux/mount.h> +#include <linux/pid.h> +#include <linux/pidfs.h> +#include <linux/pid_namespace.h> +#include <linux/poll.h> +#include <linux/proc_fs.h> +#include <linux/proc_ns.h> +#include <linux/pseudo_fs.h> +#include <linux/seq_file.h> +#include <uapi/linux/pidfd.h> + +#include "internal.h" + +static int pidfd_release(struct inode *inode, struct file *file) +{ +#ifndef CONFIG_FS_PID + struct pid *pid = file->private_data; + + file->private_data = NULL; + put_pid(pid); +#endif + return 0; +} + +#ifdef CONFIG_PROC_FS +/** + * pidfd_show_fdinfo - print information about a pidfd + * @m: proc fdinfo file + * @f: file referencing a pidfd + * + * Pid: + * This function will print the pid that a given pidfd refers to in the + * pid namespace of the procfs instance. + * If the pid namespace of the process is not a descendant of the pid + * namespace of the procfs instance 0 will be shown as its pid. This is + * similar to calling getppid() on a process whose parent is outside of + * its pid namespace. + * + * NSpid: + * If pid namespaces are supported then this function will also print + * the pid of a given pidfd refers to for all descendant pid namespaces + * starting from the current pid namespace of the instance, i.e. the + * Pid field and the first entry in the NSpid field will be identical. + * If the pid namespace of the process is not a descendant of the pid + * namespace of the procfs instance 0 will be shown as its first NSpid + * entry and no others will be shown. + * Note that this differs from the Pid and NSpid fields in + * /proc/<pid>/status where Pid and NSpid are always shown relative to + * the pid namespace of the procfs instance. The difference becomes + * obvious when sending around a pidfd between pid namespaces from a + * different branch of the tree, i.e. 
where no ancestral relation is + * present between the pid namespaces: + * - create two new pid namespaces ns1 and ns2 in the initial pid + * namespace (also take care to create new mount namespaces in the + * new pid namespace and mount procfs) + * - create a process with a pidfd in ns1 + * - send pidfd from ns1 to ns2 + * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid + * have exactly one entry, which is 0 + */ +static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct pid *pid = pidfd_pid(f); + struct pid_namespace *ns; + pid_t nr = -1; + + if (likely(pid_has_task(pid, PIDTYPE_PID))) { + ns = proc_pid_ns(file_inode(m->file)->i_sb); + nr = pid_nr_ns(pid, ns); + } + + seq_put_decimal_ll(m, "Pid:\t", nr); + +#ifdef CONFIG_PID_NS + seq_put_decimal_ll(m, "\nNSpid:\t", nr); + if (nr > 0) { + int i; + + /* If nr is non-zero it means that 'pid' is valid and that + * ns, i.e. the pid namespace associated with the procfs + * instance, is in the pid namespace hierarchy of pid. + * Start at one below the already printed level. + */ + for (i = ns->level + 1; i <= pid->level; i++) + seq_put_decimal_ll(m, "\t", pid->numbers[i].nr); + } +#endif + seq_putc(m, '\n'); +} +#endif + +/* + * Poll support for process exit notification. + */ +static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) +{ + struct pid *pid = pidfd_pid(file); + bool thread = file->f_flags & PIDFD_THREAD; + struct task_struct *task; + __poll_t poll_flags = 0; + + poll_wait(file, &pid->wait_pidfd, pts); + /* + * Depending on PIDFD_THREAD, inform pollers when the thread + * or the whole thread-group exits. + */ + guard(rcu)(); + task = pid_task(pid, PIDTYPE_PID); + if (!task) + poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP; + else if (task->exit_state && (thread || thread_group_empty(task))) + poll_flags = EPOLLIN | EPOLLRDNORM; + + return poll_flags; +} + +static const struct file_operations pidfs_file_operations = { + .release = pidfd_release, + .poll = pidfd_poll, +#ifdef CONFIG_PROC_FS + .show_fdinfo = pidfd_show_fdinfo, +#endif +}; + +struct pid *pidfd_pid(const struct file *file) +{ + if (file->f_op != &pidfs_file_operations) + return ERR_PTR(-EBADF); +#ifdef CONFIG_FS_PID + return file_inode(file)->i_private; +#else + return file->private_data; +#endif +} + +#ifdef CONFIG_FS_PID +static struct vfsmount *pidfs_mnt __ro_after_init; + +/* + * The vfs falls back to simple_setattr() if i_op->setattr() isn't + * implemented. Let's reject it completely until we have a clean + * permission concept for pidfds. 
+ */ +static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr) +{ + return -EOPNOTSUPP; +} + +static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) +{ + struct inode *inode = d_inode(path->dentry); + + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); + return 0; +} + +static const struct inode_operations pidfs_inode_operations = { + .getattr = pidfs_getattr, + .setattr = pidfs_setattr, +}; + +static void pidfs_evict_inode(struct inode *inode) +{ + struct pid *pid = inode->i_private; + + clear_inode(inode); + put_pid(pid); +} + +static const struct super_operations pidfs_sops = { + .drop_inode = generic_delete_inode, + .evict_inode = pidfs_evict_inode, + .statfs = simple_statfs, +}; + +static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen) +{ + return dynamic_dname(buffer, buflen, "pidfd:[%lu]", + d_inode(dentry)->i_ino); +} + +static const struct dentry_operations pidfs_dentry_operations = { + .d_delete = always_delete_dentry, + .d_dname = pidfs_dname, + .d_prune = stashed_dentry_prune, +}; + +static void pidfs_init_inode(struct inode *inode, void *data) +{ + inode->i_private = data; + inode->i_flags |= S_PRIVATE; + inode->i_mode |= S_IRWXU; + inode->i_op = &pidfs_inode_operations; + inode->i_fop = &pidfs_file_operations; +} + +static void pidfs_put_data(void *data) +{ + struct pid *pid = data; + put_pid(pid); +} + +static const struct stashed_operations pidfs_stashed_ops = { + .init_inode = pidfs_init_inode, + .put_data = pidfs_put_data, +}; + +static int pidfs_init_fs_context(struct fs_context *fc) +{ + struct pseudo_fs_context *ctx; + + ctx = init_pseudo(fc, PID_FS_MAGIC); + if (!ctx) + return -ENOMEM; + + ctx->ops = &pidfs_sops; + ctx->dops = &pidfs_dentry_operations; + fc->s_fs_info = (void *)&pidfs_stashed_ops; + return 0; +} + +static struct file_system_type pidfs_type = { + .name = "pidfs", + .init_fs_context = pidfs_init_fs_context, + .kill_sb = kill_anon_super, +}; + +struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) +{ + + struct file *pidfd_file; + struct path path; + int ret; + + /* + * Inode numbering for pidfs start at RESERVED_PIDS + 1. + * This avoids collisions with the root inode which is 1 + * for pseudo filesystems. 
+ */ + ret = path_from_stashed(&pid->stashed, pid->ino, pidfs_mnt, + get_pid(pid), &path); + if (ret < 0) + return ERR_PTR(ret); + + pidfd_file = dentry_open(&path, flags, current_cred()); + path_put(&path); + return pidfd_file; +} + +void __init pidfs_init(void) +{ + pidfs_mnt = kern_mount(&pidfs_type); + if (IS_ERR(pidfs_mnt)) + panic("Failed to mount pidfs pseudo filesystem"); +} + +bool is_pidfs_sb(const struct super_block *sb) +{ + return sb == pidfs_mnt->mnt_sb; +} + +#else /* !CONFIG_FS_PID */ + +struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) +{ + struct file *pidfd_file; + + pidfd_file = anon_inode_getfile("[pidfd]", &pidfs_file_operations, pid, + flags | O_RDWR); + if (IS_ERR(pidfd_file)) + return pidfd_file; + + get_pid(pid); + return pidfd_file; +} + +void __init pidfs_init(void) { } +bool is_pidfs_sb(const struct super_block *sb) +{ + return false; +} +#endif diff --git a/fs/pipe.c b/fs/pipe.c index f1adbfe743d4..50c8a8596b52 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -76,18 +76,20 @@ static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09 */ -static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass) +#define cmp_int(l, r) ((l > r) - (l < r)) + +#ifdef CONFIG_PROVE_LOCKING +static int pipe_lock_cmp_fn(const struct lockdep_map *a, + const struct lockdep_map *b) { - if (pipe->files) - mutex_lock_nested(&pipe->mutex, subclass); + return cmp_int((unsigned long) a, (unsigned long) b); } +#endif void pipe_lock(struct pipe_inode_info *pipe) { - /* - * pipe_lock() nests non-pipe inode locks (for writing to a file) - */ - pipe_lock_nested(pipe, I_MUTEX_PARENT); + if (pipe->files) + mutex_lock(&pipe->mutex); } EXPORT_SYMBOL(pipe_lock); @@ -98,28 +100,16 @@ void pipe_unlock(struct pipe_inode_info *pipe) } EXPORT_SYMBOL(pipe_unlock); -static inline void __pipe_lock(struct pipe_inode_info *pipe) -{ - mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT); -} - -static inline void __pipe_unlock(struct pipe_inode_info *pipe) -{ - mutex_unlock(&pipe->mutex); -} - void pipe_double_lock(struct pipe_inode_info *pipe1, struct pipe_inode_info *pipe2) { BUG_ON(pipe1 == pipe2); - if (pipe1 < pipe2) { - pipe_lock_nested(pipe1, I_MUTEX_PARENT); - pipe_lock_nested(pipe2, I_MUTEX_CHILD); - } else { - pipe_lock_nested(pipe2, I_MUTEX_PARENT); - pipe_lock_nested(pipe1, I_MUTEX_CHILD); - } + if (pipe1 > pipe2) + swap(pipe1, pipe2); + + pipe_lock(pipe1); + pipe_lock(pipe2); } static void anon_pipe_buf_release(struct pipe_inode_info *pipe, @@ -271,7 +261,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) return 0; ret = 0; - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); /* * We only wake up writers if the pipe was full when we started @@ -368,7 +358,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) ret = -EAGAIN; break; } - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); /* * We only get here if we didn't actually read anything. 
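The pipe locking rework above drops the I_MUTEX_PARENT/I_MUTEX_CHILD lockdep subclasses in favour of a comparator: cmp_int() evaluates to -1, 0 or 1, and lock_set_cmp_fn() (installed in alloc_pipe_info() further down) tells lockdep that two pipe mutexes may only nest in ascending address order, which is exactly the order pipe_double_lock() now enforces. A minimal sketch of the same ABBA-avoidance pattern (lock_pipe_pair is an illustrative name; like pipe_double_lock(), it assumes the two pipes are distinct):

        /* Acquire two pipe mutexes deadlock-free by always taking the
         * lower-addressed one first. */
        static void lock_pipe_pair(struct pipe_inode_info *a,
                                   struct pipe_inode_info *b)
        {
                if (a > b)
                        swap(a, b);
                mutex_lock(&a->mutex);
                mutex_lock(&b->mutex);
        }
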
@@ -400,13 +390,13 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0) return -ERESTARTSYS; - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); wake_next_reader = true; } if (pipe_empty(pipe->head, pipe->tail)) wake_next_reader = false; - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); if (was_full) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); @@ -462,7 +452,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) if (unlikely(total_len == 0)) return 0; - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); if (!pipe->readers) { send_sig(SIGPIPE, current, 0); @@ -582,19 +572,19 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) * after waiting we need to re-check whether the pipe * become empty while we dropped the lock. */ - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); if (was_empty) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe)); - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); was_empty = pipe_empty(pipe->head, pipe->tail); wake_next_writer = true; } out: if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) wake_next_writer = false; - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); /* * If we do do a wakeup event, we do a 'sync' wakeup, because we @@ -629,7 +619,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) switch (cmd) { case FIONREAD: - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); count = 0; head = pipe->head; tail = pipe->tail; @@ -639,16 +629,16 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) count += pipe->bufs[tail & mask].len; tail++; } - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); return put_user(count, (int __user *)arg); #ifdef CONFIG_WATCH_QUEUE case IOC_WATCH_QUEUE_SET_SIZE: { int ret; - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); ret = watch_queue_set_size(pipe, arg); - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); return ret; } @@ -734,7 +724,7 @@ pipe_release(struct inode *inode, struct file *file) { struct pipe_inode_info *pipe = file->private_data; - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); if (file->f_mode & FMODE_READ) pipe->readers--; if (file->f_mode & FMODE_WRITE) @@ -747,7 +737,7 @@ pipe_release(struct inode *inode, struct file *file) kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); put_pipe_info(inode, pipe); return 0; @@ -759,7 +749,7 @@ pipe_fasync(int fd, struct file *filp, int on) struct pipe_inode_info *pipe = filp->private_data; int retval = 0; - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); if (filp->f_mode & FMODE_READ) retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); if ((filp->f_mode & FMODE_WRITE) && retval >= 0) { @@ -768,7 +758,7 @@ pipe_fasync(int fd, struct file *filp, int on) /* this can happen only if on == T */ fasync_helper(-1, filp, 0, &pipe->fasync_readers); } - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); return retval; } @@ -834,6 +824,7 @@ struct pipe_inode_info *alloc_pipe_info(void) pipe->nr_accounted = pipe_bufs; pipe->user = user; mutex_init(&pipe->mutex); + lock_set_cmp_fn(&pipe->mutex, pipe_lock_cmp_fn, NULL); return pipe; } @@ -1144,7 +1135,7 @@ static int fifo_open(struct inode 
*inode, struct file *filp) filp->private_data = pipe; /* OK, we have a pipe and it's pinned down */ - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); /* We can only do regular read/write on fifos */ stream_open(inode, filp); @@ -1214,7 +1205,7 @@ static int fifo_open(struct inode *inode, struct file *filp) } /* Ok! */ - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); return 0; err_rd: @@ -1230,7 +1221,7 @@ err_wr: goto err; err: - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); put_pipe_info(inode, pipe); return ret; @@ -1411,7 +1402,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg) if (!pipe) return -EBADF; - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); switch (cmd) { case F_SETPIPE_SZ: @@ -1425,7 +1416,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg) break; } - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); return ret; } diff --git a/fs/posix_acl.c b/fs/posix_acl.c index e1af20893ebe..3f87297dbfdb 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -26,7 +26,6 @@ #include <linux/mnt_idmapping.h> #include <linux/iversion.h> #include <linux/security.h> -#include <linux/evm.h> #include <linux/fsnotify.h> #include <linux/filelock.h> @@ -786,12 +785,12 @@ struct posix_acl *posix_acl_from_xattr(struct user_namespace *userns, return ERR_PTR(count); if (count == 0) return NULL; - + acl = posix_acl_alloc(count, GFP_NOFS); if (!acl) return ERR_PTR(-ENOMEM); acl_e = acl->a_entries; - + for (end = entry + count; entry != end; acl_e++, entry++) { acl_e->e_tag = le16_to_cpu(entry->e_tag); acl_e->e_perm = le16_to_cpu(entry->e_perm); @@ -1137,7 +1136,7 @@ retry_deleg: error = -EIO; if (!error) { fsnotify_xattr(dentry); - evm_inode_post_set_acl(dentry, acl_name, kacl); + security_inode_post_set_acl(dentry, acl_name, kacl); } out_inode_unlock: @@ -1245,7 +1244,7 @@ retry_deleg: error = -EIO; if (!error) { fsnotify_xattr(dentry); - evm_inode_post_remove_acl(idmap, dentry, acl_name); + security_inode_post_remove_acl(idmap, dentry, acl_name); } out_inode_unlock: diff --git a/fs/proc/array.c b/fs/proc/array.c index ff08a8957552..34a47fb0c57f 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -477,13 +477,13 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, int permitted; struct mm_struct *mm; unsigned long long start_time; - unsigned long cmin_flt = 0, cmaj_flt = 0; - unsigned long min_flt = 0, maj_flt = 0; - u64 cutime, cstime, utime, stime; - u64 cgtime, gtime; + unsigned long cmin_flt, cmaj_flt, min_flt, maj_flt; + u64 cutime, cstime, cgtime, utime, stime, gtime; unsigned long rsslim = 0; unsigned long flags; int exit_code = task->exit_code; + struct signal_struct *sig = task->signal; + unsigned int seq = 1; state = *get_task_state(task); vsize = eip = esp = 0; @@ -511,12 +511,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, sigemptyset(&sigign); sigemptyset(&sigcatch); - cutime = cstime = utime = stime = 0; - cgtime = gtime = 0; if (lock_task_sighand(task, &flags)) { - struct signal_struct *sig = task->signal; - if (sig->tty) { struct pid *pgrp = tty_get_pgrp(sig->tty); tty_pgrp = pid_nr_ns(pgrp, ns); @@ -527,28 +523,9 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, num_threads = get_nr_threads(task); collect_sigign_sigcatch(task, &sigign, &sigcatch); - cmin_flt = sig->cmin_flt; - cmaj_flt = sig->cmaj_flt; - cutime = sig->cutime; - cstime = sig->cstime; - cgtime = sig->cgtime; rsslim = READ_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur); - /* add up live thread stats at the 
group level */ if (whole) { - struct task_struct *t; - - __for_each_thread(sig, t) { - min_flt += t->min_flt; - maj_flt += t->maj_flt; - gtime += task_gtime(t); - } - - min_flt += sig->min_flt; - maj_flt += sig->maj_flt; - thread_group_cputime_adjusted(task, &utime, &stime); - gtime += sig->gtime; - if (sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_STOP_STOPPED)) exit_code = sig->group_exit_code; } @@ -562,10 +539,41 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, if (permitted && (!whole || num_threads < 2)) wchan = !task_is_running(task); - if (!whole) { + + do { + seq++; /* 2 on the 1st/lockless path, otherwise odd */ + flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); + + cmin_flt = sig->cmin_flt; + cmaj_flt = sig->cmaj_flt; + cutime = sig->cutime; + cstime = sig->cstime; + cgtime = sig->cgtime; + + if (whole) { + struct task_struct *t; + + min_flt = sig->min_flt; + maj_flt = sig->maj_flt; + gtime = sig->gtime; + + rcu_read_lock(); + __for_each_thread(sig, t) { + min_flt += t->min_flt; + maj_flt += t->maj_flt; + gtime += task_gtime(t); + } + rcu_read_unlock(); + } + } while (need_seqretry(&sig->stats_lock, seq)); + done_seqretry_irqrestore(&sig->stats_lock, seq, flags); + + if (whole) { + thread_group_cputime_adjusted(task, &utime, &stime); + } else { + task_cputime_adjusted(task, &utime, &stime); min_flt = task->min_flt; maj_flt = task->maj_flt; - task_cputime_adjusted(task, &utime, &stime); gtime = task_gtime(task); } diff --git a/fs/proc/base.c b/fs/proc/base.c index 98a031ac2648..18550c071d71 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1878,8 +1878,6 @@ void proc_pid_evict_inode(struct proc_inode *ei) hlist_del_init_rcu(&ei->sibling_inodes); spin_unlock(&pid->lock); } - - put_pid(pid); } struct inode *proc_pid_make_inode(struct super_block *sb, diff --git a/fs/proc/inode.c b/fs/proc/inode.c index b33e490e3fd9..dcd513dccf55 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -30,7 +30,6 @@ static void proc_evict_inode(struct inode *inode) { - struct proc_dir_entry *de; struct ctl_table_header *head; struct proc_inode *ei = PROC_I(inode); @@ -38,17 +37,8 @@ static void proc_evict_inode(struct inode *inode) clear_inode(inode); /* Stop tracking associated processes */ - if (ei->pid) { + if (ei->pid) proc_pid_evict_inode(ei); - ei->pid = NULL; - } - - /* Let go of any associated proc directory entry */ - de = ei->pde; - if (de) { - pde_put(de); - ei->pde = NULL; - } head = ei->sysctl; if (head) { @@ -80,6 +70,13 @@ static struct inode *proc_alloc_inode(struct super_block *sb) static void proc_free_inode(struct inode *inode) { + struct proc_inode *ei = PROC_I(inode); + + if (ei->pid) + put_pid(ei->pid); + /* Let go of any associated proc directory entry */ + if (ei->pde) + pde_put(ei->pde); kmem_cache_free(proc_inode_cachep, PROC_I(inode)); } @@ -95,7 +92,7 @@ void __init proc_init_kmemcache(void) proc_inode_cachep = kmem_cache_create("proc_inode_cache", sizeof(struct proc_inode), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT| + SLAB_ACCOUNT| SLAB_PANIC), init_once); pde_opener_cache = diff --git a/fs/proc/root.c b/fs/proc/root.c index b55dbc70287b..06a297a27ba3 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -271,7 +271,7 @@ static void proc_kill_sb(struct super_block *sb) kill_anon_super(sb); put_pid_ns(fs_info->pid_ns); - kfree(fs_info); + kfree_rcu(fs_info, rcu); } static struct file_system_type proc_fs_type = { diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index d0d9bfdad30c..56815799ce79 100644 --- a/fs/pstore/inode.c 
+++ b/fs/pstore/inode.c @@ -307,7 +307,6 @@ int pstore_put_backend_records(struct pstore_info *psi) { struct pstore_private *pos, *tmp; struct dentry *root; - int rc = 0; root = psinfo_lock_root(); if (!root) @@ -317,11 +316,8 @@ int pstore_put_backend_records(struct pstore_info *psi) list_for_each_entry_safe(pos, tmp, &records_list, list) { if (pos->record->psi == psi) { list_del_init(&pos->list); - rc = simple_unlink(d_inode(root), pos->dentry); - if (WARN_ON(rc)) - break; - d_drop(pos->dentry); - dput(pos->dentry); + d_invalidate(pos->dentry); + simple_unlink(d_inode(root), pos->dentry); pos->dentry = NULL; } } @@ -329,7 +325,7 @@ int pstore_put_backend_records(struct pstore_info *psi) inode_unlock(d_inode(root)); - return rc; + return 0; } /* diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 88b34fdbf759..b1a455f42e93 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -893,6 +893,7 @@ static const struct of_device_id dt_match[] = { { .compatible = "ramoops" }, {} }; +MODULE_DEVICE_TABLE(of, dt_match); static struct platform_driver ramoops_driver = { .probe = ramoops_probe, diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c index 2770746bb7aa..694db616663f 100644 --- a/fs/pstore/zone.c +++ b/fs/pstore/zone.c @@ -973,6 +973,8 @@ static ssize_t psz_kmsg_read(struct pstore_zone *zone, char *buf = kasprintf(GFP_KERNEL, "%s: Total %d times\n", kmsg_dump_reason_str(record->reason), record->count); + if (!buf) + return -ENOMEM; hlen = strlen(buf); record->buf = krealloc(buf, hlen + size, GFP_KERNEL); if (!record->buf) { @@ -1215,7 +1217,6 @@ static struct pstore_zone **psz_init_zones(enum pstore_type_id type, pr_err("allocate for zones %s failed\n", name); return ERR_PTR(-ENOMEM); } - memset(zones, 0, c * sizeof(*zones)); for (i = 0; i < c; i++) { zone = psz_init_zone(type, off, record_size); diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 6eb9bb369b57..d79841e94428 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -21,6 +21,7 @@ #include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/statfs.h> +#include <linux/fs_context.h> #include "qnx4.h" #define QNX4_VERSION 4 @@ -30,28 +31,33 @@ static const struct super_operations qnx4_sops; static struct inode *qnx4_alloc_inode(struct super_block *sb); static void qnx4_free_inode(struct inode *inode); -static int qnx4_remount(struct super_block *sb, int *flags, char *data); static int qnx4_statfs(struct dentry *, struct kstatfs *); +static int qnx4_get_tree(struct fs_context *fc); static const struct super_operations qnx4_sops = { .alloc_inode = qnx4_alloc_inode, .free_inode = qnx4_free_inode, .statfs = qnx4_statfs, - .remount_fs = qnx4_remount, }; -static int qnx4_remount(struct super_block *sb, int *flags, char *data) +static int qnx4_reconfigure(struct fs_context *fc) { + struct super_block *sb = fc->root->d_sb; struct qnx4_sb_info *qs; sync_filesystem(sb); qs = qnx4_sb(sb); qs->Version = QNX4_VERSION; - *flags |= SB_RDONLY; + fc->sb_flags |= SB_RDONLY; return 0; } +static const struct fs_context_operations qnx4_context_opts = { + .get_tree = qnx4_get_tree, + .reconfigure = qnx4_reconfigure, +}; + static int qnx4_get_block( struct inode *inode, sector_t iblock, struct buffer_head *bh, int create ) { unsigned long phys; @@ -183,12 +189,13 @@ static const char *qnx4_checkroot(struct super_block *sb, return "bitmap file not found."; } -static int qnx4_fill_super(struct super_block *s, void *data, int silent) +static int qnx4_fill_super(struct super_block *s, struct fs_context *fc) { struct buffer_head *bh; struct inode 
*root; const char *errmsg; struct qnx4_sb_info *qs; + int silent = fc->sb_flags & SB_SILENT; qs = kzalloc(sizeof(struct qnx4_sb_info), GFP_KERNEL); if (!qs) @@ -216,7 +223,7 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent) errmsg = qnx4_checkroot(s, (struct qnx4_super_block *) bh->b_data); brelse(bh); if (errmsg != NULL) { - if (!silent) + if (!silent) printk(KERN_ERR "qnx4: %s\n", errmsg); return -EINVAL; } @@ -235,6 +242,18 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent) return 0; } +static int qnx4_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, qnx4_fill_super); +} + +static int qnx4_init_fs_context(struct fs_context *fc) +{ + fc->ops = &qnx4_context_opts; + + return 0; +} + static void qnx4_kill_sb(struct super_block *sb) { struct qnx4_sb_info *qs = qnx4_sb(sb); @@ -359,7 +378,7 @@ static int init_inodecache(void) qnx4_inode_cachep = kmem_cache_create("qnx4_inode_cache", sizeof(struct qnx4_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), + SLAB_ACCOUNT), init_once); if (qnx4_inode_cachep == NULL) return -ENOMEM; @@ -376,18 +395,12 @@ static void destroy_inodecache(void) kmem_cache_destroy(qnx4_inode_cachep); } -static struct dentry *qnx4_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_bdev(fs_type, flags, dev_name, data, qnx4_fill_super); -} - static struct file_system_type qnx4_fs_type = { - .owner = THIS_MODULE, - .name = "qnx4", - .mount = qnx4_mount, - .kill_sb = qnx4_kill_sb, - .fs_flags = FS_REQUIRES_DEV, + .owner = THIS_MODULE, + .name = "qnx4", + .kill_sb = qnx4_kill_sb, + .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = qnx4_init_fs_context, }; MODULE_ALIAS_FS("qnx4"); diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index a286c545717f..405913f4faff 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -615,7 +615,7 @@ static int init_inodecache(void) qnx6_inode_cachep = kmem_cache_create("qnx6_inode_cache", sizeof(struct qnx6_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), + SLAB_ACCOUNT), init_once); if (!qnx6_inode_cachep) return -ENOMEM; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 1f0c754416b6..eb6e9d95dea1 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2984,7 +2984,7 @@ static int __init dquot_init(void) dquot_cachep = kmem_cache_create("dquot", sizeof(struct dquot), sizeof(unsigned long) * 4, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_PANIC), + SLAB_PANIC), NULL); order = 0; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 4ac05a9e25bc..8006faaaf0ec 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -102,11 +102,20 @@ ramfs_mknod(struct mnt_idmap *idmap, struct inode *dir, int error = -ENOSPC; if (inode) { + error = security_inode_init_security(inode, dir, + &dentry->d_name, NULL, + NULL); + if (error) { + iput(inode); + goto out; + } + d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ error = 0; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); } +out: return error; } @@ -134,6 +143,15 @@ static int ramfs_symlink(struct mnt_idmap *idmap, struct inode *dir, inode = ramfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0); if (inode) { int l = strlen(symname)+1; + + error = security_inode_init_security(inode, dir, + &dentry->d_name, NULL, + NULL); + if (error) { + iput(inode); + goto out; + } + error = page_symlink(inode, symname, l); if (!error) { d_instantiate(dentry, inode); @@ -143,6 +161,7 @@ static 
int ramfs_symlink(struct mnt_idmap *idmap, struct inode *dir, } else iput(inode); } +out: return error; } @@ -150,12 +169,23 @@ static int ramfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct inode *inode; + int error; inode = ramfs_get_inode(dir->i_sb, dir, mode, 0); if (!inode) return -ENOSPC; + + error = security_inode_init_security(inode, dir, + &file_dentry(file)->d_name, NULL, + NULL); + if (error) { + iput(inode); + goto out; + } + d_tmpfile(file, inode); - return finish_open_simple(file, 0); +out: + return finish_open_simple(file, error); } static const struct inode_operations ramfs_dir_inode_operations = { diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 171c912af50f..6474529c4253 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2386,7 +2386,7 @@ static int journal_read(struct super_block *sb) cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb); reiserfs_info(sb, "checking transaction log (%pg)\n", - journal->j_bdev_handle->bdev); + file_bdev(journal->j_bdev_file)); start = ktime_get_seconds(); /* @@ -2447,7 +2447,7 @@ static int journal_read(struct super_block *sb) * device and journal device to be the same */ d_bh = - reiserfs_breada(journal->j_bdev_handle->bdev, cur_dblock, + reiserfs_breada(file_bdev(journal->j_bdev_file), cur_dblock, sb->s_blocksize, SB_ONDISK_JOURNAL_1st_BLOCK(sb) + SB_ONDISK_JOURNAL_SIZE(sb)); @@ -2588,9 +2588,9 @@ static void journal_list_init(struct super_block *sb) static void release_journal_dev(struct reiserfs_journal *journal) { - if (journal->j_bdev_handle) { - bdev_release(journal->j_bdev_handle); - journal->j_bdev_handle = NULL; + if (journal->j_bdev_file) { + fput(journal->j_bdev_file); + journal->j_bdev_file = NULL; } } @@ -2605,7 +2605,7 @@ static int journal_init_dev(struct super_block *super, result = 0; - journal->j_bdev_handle = NULL; + journal->j_bdev_file = NULL; jdev = SB_ONDISK_JOURNAL_DEVICE(super) ? 
new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev; @@ -2616,37 +2616,37 @@ static int journal_init_dev(struct super_block *super, if ((!jdev_name || !jdev_name[0])) { if (jdev == super->s_dev) holder = NULL; - journal->j_bdev_handle = bdev_open_by_dev(jdev, blkdev_mode, + journal->j_bdev_file = bdev_file_open_by_dev(jdev, blkdev_mode, holder, NULL); - if (IS_ERR(journal->j_bdev_handle)) { - result = PTR_ERR(journal->j_bdev_handle); - journal->j_bdev_handle = NULL; + if (IS_ERR(journal->j_bdev_file)) { + result = PTR_ERR(journal->j_bdev_file); + journal->j_bdev_file = NULL; reiserfs_warning(super, "sh-458", "cannot init journal device unknown-block(%u,%u): %i", MAJOR(jdev), MINOR(jdev), result); return result; } else if (jdev != super->s_dev) - set_blocksize(journal->j_bdev_handle->bdev, + set_blocksize(file_bdev(journal->j_bdev_file), super->s_blocksize); return 0; } - journal->j_bdev_handle = bdev_open_by_path(jdev_name, blkdev_mode, + journal->j_bdev_file = bdev_file_open_by_path(jdev_name, blkdev_mode, holder, NULL); - if (IS_ERR(journal->j_bdev_handle)) { - result = PTR_ERR(journal->j_bdev_handle); - journal->j_bdev_handle = NULL; + if (IS_ERR(journal->j_bdev_file)) { + result = PTR_ERR(journal->j_bdev_file); + journal->j_bdev_file = NULL; reiserfs_warning(super, "sh-457", "journal_init_dev: Cannot open '%s': %i", jdev_name, result); return result; } - set_blocksize(journal->j_bdev_handle->bdev, super->s_blocksize); + set_blocksize(file_bdev(journal->j_bdev_file), super->s_blocksize); reiserfs_info(super, "journal_init_dev: journal device: %pg\n", - journal->j_bdev_handle->bdev); + file_bdev(journal->j_bdev_file)); return 0; } @@ -2804,7 +2804,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name, "journal header magic %x (device %pg) does " "not match to magic found in super block %x", jh->jh_journal.jp_journal_magic, - journal->j_bdev_handle->bdev, + file_bdev(journal->j_bdev_file), sb_jp_journal_magic(rs)); brelse(bhjh); goto free_and_return; @@ -2828,7 +2828,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name, reiserfs_info(sb, "journal params: device %pg, size %u, " "journal first block %u, max trans len %u, max batch %u, " "max commit age %u, max trans age %u\n", - journal->j_bdev_handle->bdev, + file_bdev(journal->j_bdev_file), SB_ONDISK_JOURNAL_SIZE(sb), SB_ONDISK_JOURNAL_1st_BLOCK(sb), journal->j_trans_max, diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 83cb9402e0f9..5c68a4a52d78 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -354,7 +354,7 @@ static int show_journal(struct seq_file *m, void *unused) "prepare: \t%12lu\n" "prepare_retry: \t%12lu\n", DJP(jp_journal_1st_block), - SB_JOURNAL(sb)->j_bdev_handle->bdev, + file_bdev(SB_JOURNAL(sb)->j_bdev_file), DJP(jp_journal_dev), DJP(jp_journal_size), DJP(jp_journal_trans_max), diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 725667880e62..0554903f42a9 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -299,7 +299,7 @@ struct reiserfs_journal { /* oldest journal block. 
start here for traverse */ struct reiserfs_journal_cnode *j_first; - struct bdev_handle *j_bdev_handle; + struct file *j_bdev_file; /* first block on s_dev of reserved area journal */ int j_1st_reserved_block; @@ -2810,10 +2810,10 @@ struct reiserfs_journal_header { /* We need these to make journal.c code more readable */ #define journal_find_get_block(s, block) __find_get_block(\ - SB_JOURNAL(s)->j_bdev_handle->bdev, block, s->s_blocksize) -#define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_bdev_handle->bdev,\ + file_bdev(SB_JOURNAL(s)->j_bdev_file), block, s->s_blocksize) +#define journal_getblk(s, block) __getblk(file_bdev(SB_JOURNAL(s)->j_bdev_file),\ block, s->s_blocksize) -#define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_bdev_handle->bdev,\ +#define journal_bread(s, block) __bread(file_bdev(SB_JOURNAL(s)->j_bdev_file),\ block, s->s_blocksize) enum reiserfs_bh_state_bits { diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 67b5510beded..2cc469d481a2 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -670,7 +670,6 @@ static int __init init_inodecache(void) sizeof(struct reiserfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD| SLAB_ACCOUNT), init_once); if (reiserfs_inode_cachep == NULL) diff --git a/fs/remap_range.c b/fs/remap_range.c index f8c1120b8311..de07f978ce3e 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -373,9 +373,9 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, } EXPORT_SYMBOL(generic_remap_file_range_prep); -loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t len, unsigned int remap_flags) +loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags) { loff_t ret; @@ -391,23 +391,6 @@ loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, if (!file_in->f_op->remap_file_range) return -EOPNOTSUPP; - ret = file_in->f_op->remap_file_range(file_in, pos_in, - file_out, pos_out, len, remap_flags); - if (ret < 0) - return ret; - - fsnotify_access(file_in); - fsnotify_modify(file_out); - return ret; -} -EXPORT_SYMBOL(do_clone_file_range); - -loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t len, unsigned int remap_flags) -{ - loff_t ret; - ret = remap_verify_area(file_in, pos_in, len, false); if (ret) return ret; @@ -417,10 +400,14 @@ loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, return ret; file_start_write(file_out); - ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len, - remap_flags); + ret = file_in->f_op->remap_file_range(file_in, pos_in, + file_out, pos_out, len, remap_flags); file_end_write(file_out); + if (ret < 0) + return ret; + fsnotify_access(file_in); + fsnotify_modify(file_out); return ret; } EXPORT_SYMBOL(vfs_clone_file_range); diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 545ad44f96b8..2be227532f39 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -594,7 +594,7 @@ static void romfs_kill_sb(struct super_block *sb) #ifdef CONFIG_ROMFS_ON_BLOCK if (sb->s_bdev) { sync_blockdev(sb->s_bdev); - bdev_release(sb->s_bdev_handle); + fput(sb->s_bdev_file); } #endif } @@ -630,8 +630,8 @@ static int __init init_romfs_fs(void) romfs_inode_cachep = kmem_cache_create("romfs_i", sizeof(struct romfs_inode_info), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | - SLAB_ACCOUNT, romfs_i_init_once); + SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, + 
romfs_i_init_once); if (!romfs_inode_cachep) { pr_err("Failed to initialise inode cache\n"); diff --git a/fs/select.c b/fs/select.c index 0ee55af1a55c..9515c3fa1a03 100644 --- a/fs/select.c +++ b/fs/select.c @@ -476,7 +476,7 @@ static inline void wait_key_set(poll_table *wait, unsigned long in, wait->_key |= POLLOUT_SET; } -static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) +static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) { ktime_t expire, *to = NULL; struct poll_wqueues table; @@ -839,7 +839,7 @@ SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg) struct poll_list { struct poll_list *next; - int len; + unsigned int len; struct pollfd entries[]; }; @@ -975,14 +975,15 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, struct timespec64 *end_time) { struct poll_wqueues table; - int err = -EFAULT, fdcount, len; + int err = -EFAULT, fdcount; /* Allocate small arguments on the stack to save memory and be faster - use long to make sure the buffer is aligned properly on 64 bit archs to avoid unaligned access */ long stack_pps[POLL_STACK_ALLOC/sizeof(long)]; struct poll_list *const head = (struct poll_list *)stack_pps; struct poll_list *walk = head; - unsigned long todo = nfds; + unsigned int todo = nfds; + unsigned int len; if (nfds > rlimit(RLIMIT_NOFILE)) return -EINVAL; @@ -998,9 +999,9 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, sizeof(struct pollfd) * walk->len)) goto out_fds; - todo -= walk->len; - if (!todo) + if (walk->len >= todo) break; + todo -= walk->len; len = min(todo, POLLFD_PER_PAGE); walk = walk->next = kmalloc(struct_size(walk, entries, len), @@ -1020,7 +1021,7 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, for (walk = head; walk; walk = walk->next) { struct pollfd *fds = walk->entries; - int j; + unsigned int j; for (j = walk->len; j; fds++, ufds++, j--) unsafe_put_user(fds->revents, &ufds->revents, Efault); diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index 971892620504..3de5047a7ff9 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -145,21 +145,27 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, struct cached_fid *cfid; struct cached_fids *cfids; const char *npath; + int retries = 0, cur_sleep = 1; if (tcon == NULL || tcon->cfids == NULL || tcon->nohandlecache || is_smb1_server(tcon->ses->server) || (dir_cache_timeout == 0)) return -EOPNOTSUPP; ses = tcon->ses; - server = cifs_pick_channel(ses); cfids = tcon->cfids; - if (!server->ops->new_lease_key) - return -EIO; - if (cifs_sb->root == NULL) return -ENOENT; +replay_again: + /* reinitialize for possible replay */ + flags = 0; + oplock = SMB2_OPLOCK_LEVEL_II; + server = cifs_pick_channel(ses); + + if (!server->ops->new_lease_key) + return -EIO; + utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); if (!utf16_path) return -ENOMEM; @@ -236,6 +242,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, .desired_access = FILE_READ_DATA | FILE_READ_ATTRIBUTES, .disposition = FILE_OPEN, .fid = pfid, + .replay = !!(retries), }; rc = SMB2_open_init(tcon, server, @@ -268,6 +275,11 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, */ cfid->has_lease = true; + if (retries) { + smb2_set_replay(server, &rqst[0]); + smb2_set_replay(server, &rqst[1]); + } + rc = compound_send_recv(xid, ses, server, flags, 2, rqst, resp_buftype, rsp_iov); @@ -367,6 +379,11 @@ out: 
atomic_inc(&tcon->num_remote_opens); } kfree(utf16_path); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c index ef4c2e3c9fa6..6322f0f68a17 100644 --- a/fs/smb/client/cifsencrypt.c +++ b/fs/smb/client/cifsencrypt.c @@ -572,7 +572,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, len = cifs_strtoUTF16(user, ses->user_name, len, nls_cp); UniStrupr(user); } else { - memset(user, '\0', 2); + *(u16 *)user = 0; } rc = crypto_shash_update(ses->server->secmech.hmacmd5, diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index e902de4e475a..e0d8c79cdde1 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -396,7 +396,7 @@ cifs_alloc_inode(struct super_block *sb) spin_lock_init(&cifs_inode->writers_lock); cifs_inode->writers = 0; cifs_inode->netfs.inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ - cifs_inode->server_eof = 0; + cifs_inode->netfs.remote_i_size = 0; cifs_inode->uniqueid = 0; cifs_inode->createtime = 0; cifs_inode->epoch = 0; @@ -1085,7 +1085,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence) } static int -cifs_setlease(struct file *file, int arg, struct file_lock **lease, void **priv) +cifs_setlease(struct file *file, int arg, struct file_lease **lease, void **priv) { /* * Note that this is called by vfs setlease with i_lock held to @@ -1094,9 +1094,6 @@ cifs_setlease(struct file *file, int arg, struct file_lock **lease, void **priv) struct inode *inode = file_inode(file); struct cifsFileInfo *cfile = file->private_data; - if (!(S_ISREG(inode->i_mode))) - return -EINVAL; - /* Check if file is oplocked if this is request for new lease */ if (arg == F_UNLCK || ((arg == F_RDLCK) && CIFS_CACHE_READ(CIFS_I(inode))) || @@ -1172,6 +1169,9 @@ const char *cifs_get_link(struct dentry *dentry, struct inode *inode, { char *target_path; + if (!dentry) + return ERR_PTR(-ECHILD); + target_path = kmalloc(PATH_MAX, GFP_KERNEL); if (!target_path) return ERR_PTR(-ENOMEM); @@ -1380,6 +1380,7 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, struct inode *src_inode = file_inode(src_file); struct inode *target_inode = file_inode(dst_file); struct cifsInodeInfo *src_cifsi = CIFS_I(src_inode); + struct cifsInodeInfo *target_cifsi = CIFS_I(target_inode); struct cifsFileInfo *smb_file_src; struct cifsFileInfo *smb_file_target; struct cifs_tcon *src_tcon; @@ -1428,7 +1429,7 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, * Advance the EOF marker after the flush above to the end of the range * if it's short of that. */ - if (src_cifsi->server_eof < off + len) { + if (src_cifsi->netfs.remote_i_size < off + len) { rc = cifs_precopy_set_eof(src_inode, src_cifsi, src_tcon, xid, off + len); if (rc < 0) goto unlock; @@ -1452,12 +1453,22 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, /* Discard all the folios that overlap the destination region. 
*/ truncate_inode_pages_range(&target_inode->i_data, fstart, fend); + fscache_invalidate(cifs_inode_cookie(target_inode), NULL, + i_size_read(target_inode), 0); + rc = file_modified(dst_file); if (!rc) { rc = target_tcon->ses->server->ops->copychunk_range(xid, smb_file_src, smb_file_target, off, len, destoff); - if (rc > 0 && destoff + rc > i_size_read(target_inode)) + if (rc > 0 && destoff + rc > i_size_read(target_inode)) { truncate_setsize(target_inode, destoff + rc); + netfs_resize_file(&target_cifsi->netfs, + i_size_read(target_inode), true); + fscache_resize_cookie(cifs_inode_cookie(target_inode), + i_size_read(target_inode)); + } + if (rc > 0 && destoff + rc > target_cifsi->netfs.zero_point) + target_cifsi->netfs.zero_point = destoff + rc; } file_accessed(src_file); @@ -1653,7 +1664,7 @@ cifs_init_inodecache(void) cifs_inode_cachep = kmem_cache_create("cifs_inode_cache", sizeof(struct cifsInodeInfo), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), + SLAB_ACCOUNT), cifs_init_once); if (cifs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 20036fb16cec..53c75cfb33ab 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -50,6 +50,11 @@ #define CIFS_DEF_ACTIMEO (1 * HZ) /* + * max sleep time before retry to server + */ +#define CIFS_MAX_SLEEP 2000 + +/* * max attribute cache timeout (jiffies) - 2^30 */ #define CIFS_MAX_ACTIMEO (1 << 30) @@ -82,7 +87,7 @@ #define SMB_INTERFACE_POLL_INTERVAL 600 /* maximum number of PDUs in one compound */ -#define MAX_COMPOUND 5 +#define MAX_COMPOUND 7 /* * Default number of credits to keep available for SMB3. @@ -1027,6 +1032,8 @@ struct cifs_chan { __u8 signkey[SMB3_SIGN_KEY_SIZE]; }; +#define CIFS_SES_FLAG_SCALE_CHANNELS (0x1) + /* * Session structure. One of these for each uid session with a particular host */ @@ -1059,6 +1066,7 @@ struct cifs_ses { enum securityEnum sectype; /* what security flavor was specified? */ bool sign; /* is signing required? 
*/ bool domainAuto:1; + unsigned int flags; __u16 session_flags; __u8 smb3signingkey[SMB3_SIGN_KEY_SIZE]; __u8 smb3encryptionkey[SMB3_ENC_DEC_KEY_SIZE]; @@ -1370,6 +1378,7 @@ struct cifs_open_parms { struct cifs_fid *fid; umode_t mode; bool reconnect:1; + bool replay:1; /* indicates that this open is for a replay */ }; struct cifs_fid { @@ -1501,6 +1510,7 @@ struct cifs_writedata { struct smbd_mr *mr; #endif struct cifs_credits credits; + bool replay; }; /* @@ -1561,7 +1571,6 @@ struct cifsInodeInfo { spinlock_t writers_lock; unsigned int writers; /* Number of writers on this inode */ unsigned long time; /* jiffies of last update of inode */ - u64 server_eof; /* current file size on server -- protected by i_lock */ u64 uniqueid; /* server inode number */ u64 createtime; /* creation time on server */ __u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for this inode */ @@ -1831,6 +1840,13 @@ static inline bool is_retryable_error(int error) return false; } +static inline bool is_replayable_error(int error) +{ + if (error == -EAGAIN || error == -ECONNABORTED) + return true; + return false; +} + /* cifs_get_writable_file() flags */ #define FIND_WR_ANY 0 diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 01e89070df5a..5eb83bafc7fd 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -2066,20 +2066,20 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon, parm_data = (struct cifs_posix_lock *) ((char *)&pSMBr->hdr.Protocol + data_offset); if (parm_data->lock_type == cpu_to_le16(CIFS_UNLCK)) - pLockData->fl_type = F_UNLCK; + pLockData->c.flc_type = F_UNLCK; else { if (parm_data->lock_type == cpu_to_le16(CIFS_RDLCK)) - pLockData->fl_type = F_RDLCK; + pLockData->c.flc_type = F_RDLCK; else if (parm_data->lock_type == cpu_to_le16(CIFS_WRLCK)) - pLockData->fl_type = F_WRLCK; + pLockData->c.flc_type = F_WRLCK; pLockData->fl_start = le64_to_cpu(parm_data->start); pLockData->fl_end = pLockData->fl_start + (le64_to_cpu(parm_data->length) ? 
le64_to_cpu(parm_data->length) - 1 : 0); - pLockData->fl_pid = -le32_to_cpu(parm_data->pid); + pLockData->c.flc_pid = -le32_to_cpu(parm_data->pid); } } diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index bfd568f89710..ac9595504f4b 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -233,6 +233,12 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, list_for_each_entry_safe(ses, nses, &pserver->smb_ses_list, smb_ses_list) { /* check if iface is still active */ spin_lock(&ses->chan_lock); + if (cifs_ses_get_chan_index(ses, server) == + CIFS_INVAL_CHAN_INDEX) { + spin_unlock(&ses->chan_lock); + continue; + } + if (!cifs_chan_is_iface_active(ses, server)) { spin_unlock(&ses->chan_lock); cifs_chan_update_iface(ses, server); @@ -3438,8 +3444,18 @@ int cifs_mount_get_tcon(struct cifs_mount_ctx *mnt_ctx) * the user on mount */ if ((cifs_sb->ctx->wsize == 0) || - (cifs_sb->ctx->wsize > server->ops->negotiate_wsize(tcon, ctx))) - cifs_sb->ctx->wsize = server->ops->negotiate_wsize(tcon, ctx); + (cifs_sb->ctx->wsize > server->ops->negotiate_wsize(tcon, ctx))) { + cifs_sb->ctx->wsize = + round_down(server->ops->negotiate_wsize(tcon, ctx), PAGE_SIZE); + /* + * in the very unlikely event that the server sent a max write size under PAGE_SIZE, + * (which would get rounded down to 0) then reset wsize to absolute minimum eg 4096 + */ + if (cifs_sb->ctx->wsize == 0) { + cifs_sb->ctx->wsize = PAGE_SIZE; + cifs_dbg(VFS, "wsize too small, reset to minimum ie PAGE_SIZE, usually 4096\n"); + } + } if ((cifs_sb->ctx->rsize == 0) || (cifs_sb->ctx->rsize > server->ops->negotiate_rsize(tcon, ctx))) cifs_sb->ctx->rsize = server->ops->negotiate_rsize(tcon, ctx); @@ -4228,6 +4244,11 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru /* only send once per connect */ spin_lock(&tcon->tc_lock); + + /* if tcon is marked for needing reconnect, update state */ + if (tcon->need_reconnect) + tcon->status = TID_NEED_TCON; + if (tcon->status == TID_GOOD) { spin_unlock(&tcon->tc_lock); return 0; diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c index a8a1d386da65..449c59830039 100644 --- a/fs/smb/client/dfs.c +++ b/fs/smb/client/dfs.c @@ -565,6 +565,11 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru /* only send once per connect */ spin_lock(&tcon->tc_lock); + + /* if tcon is marked for needing reconnect, update state */ + if (tcon->need_reconnect) + tcon->status = TID_NEED_TCON; + if (tcon->status == TID_GOOD) { spin_unlock(&tcon->tc_lock); return 0; @@ -625,8 +630,8 @@ out: spin_lock(&tcon->tc_lock); if (tcon->status == TID_IN_TCON) tcon->status = TID_GOOD; - spin_unlock(&tcon->tc_lock); tcon->need_reconnect = false; + spin_unlock(&tcon->tc_lock); } return rc; diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 3a213432775b..c3b8e7091a4d 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -87,7 +87,7 @@ void cifs_pages_written_back(struct inode *inode, loff_t start, unsigned int len continue; if (!folio_test_writeback(folio)) { WARN_ONCE(1, "bad %x @%llx page %lx %lx\n", - len, start, folio_index(folio), end); + len, start, folio->index, end); continue; } @@ -120,7 +120,7 @@ void cifs_pages_write_failed(struct inode *inode, loff_t start, unsigned int len continue; if (!folio_test_writeback(folio)) { WARN_ONCE(1, "bad %x @%llx page %lx %lx\n", - len, start, folio_index(folio), end); + len, start, folio->index, end); continue; } @@ -151,7 +151,7 @@ void 
cifs_pages_write_redirty(struct inode *inode, loff_t start, unsigned int le xas_for_each(&xas, folio, end) { if (!folio_test_writeback(folio)) { WARN_ONCE(1, "bad %x @%llx page %lx %lx\n", - len, start, folio_index(folio), end); + len, start, folio->index, end); continue; } @@ -175,6 +175,9 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) /* only send once per connect */ spin_lock(&tcon->tc_lock); + if (tcon->need_reconnect) + tcon->status = TID_NEED_RECON; + if (tcon->status != TID_NEED_RECON) { spin_unlock(&tcon->tc_lock); return; @@ -1312,20 +1315,20 @@ cifs_lock_test(struct cifsFileInfo *cfile, __u64 offset, __u64 length, down_read(&cinode->lock_sem); exist = cifs_find_lock_conflict(cfile, offset, length, type, - flock->fl_flags, &conf_lock, + flock->c.flc_flags, &conf_lock, CIFS_LOCK_OP); if (exist) { flock->fl_start = conf_lock->offset; flock->fl_end = conf_lock->offset + conf_lock->length - 1; - flock->fl_pid = conf_lock->pid; + flock->c.flc_pid = conf_lock->pid; if (conf_lock->type & server->vals->shared_lock_type) - flock->fl_type = F_RDLCK; + flock->c.flc_type = F_RDLCK; else - flock->fl_type = F_WRLCK; + flock->c.flc_type = F_WRLCK; } else if (!cinode->can_cache_brlcks) rc = 1; else - flock->fl_type = F_UNLCK; + flock->c.flc_type = F_UNLCK; up_read(&cinode->lock_sem); return rc; @@ -1401,16 +1404,16 @@ cifs_posix_lock_test(struct file *file, struct file_lock *flock) { int rc = 0; struct cifsInodeInfo *cinode = CIFS_I(file_inode(file)); - unsigned char saved_type = flock->fl_type; + unsigned char saved_type = flock->c.flc_type; - if ((flock->fl_flags & FL_POSIX) == 0) + if ((flock->c.flc_flags & FL_POSIX) == 0) return 1; down_read(&cinode->lock_sem); posix_test_lock(file, flock); - if (flock->fl_type == F_UNLCK && !cinode->can_cache_brlcks) { - flock->fl_type = saved_type; + if (lock_is_unlock(flock) && !cinode->can_cache_brlcks) { + flock->c.flc_type = saved_type; rc = 1; } @@ -1431,7 +1434,7 @@ cifs_posix_lock_set(struct file *file, struct file_lock *flock) struct cifsInodeInfo *cinode = CIFS_I(file_inode(file)); int rc = FILE_LOCK_DEFERRED + 1; - if ((flock->fl_flags & FL_POSIX) == 0) + if ((flock->c.flc_flags & FL_POSIX) == 0) return rc; cifs_down_write(&cinode->lock_sem); @@ -1581,7 +1584,9 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) el = locks_to_send.next; spin_lock(&flctx->flc_lock); - list_for_each_entry(flock, &flctx->flc_posix, fl_list) { + for_each_file_lock(flock, &flctx->flc_posix) { + unsigned char ftype = flock->c.flc_type; + if (el == &locks_to_send) { /* * The list ended. 
We don't have enough allocated @@ -1591,12 +1596,12 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) break; } length = cifs_flock_len(flock); - if (flock->fl_type == F_RDLCK || flock->fl_type == F_SHLCK) + if (ftype == F_RDLCK || ftype == F_SHLCK) type = CIFS_RDLCK; else type = CIFS_WRLCK; lck = list_entry(el, struct lock_to_push, llist); - lck->pid = hash_lockowner(flock->fl_owner); + lck->pid = hash_lockowner(flock->c.flc_owner); lck->netfid = cfile->fid.netfid; lck->length = length; lck->type = type; @@ -1663,42 +1668,43 @@ static void cifs_read_flock(struct file_lock *flock, __u32 *type, int *lock, int *unlock, bool *wait_flag, struct TCP_Server_Info *server) { - if (flock->fl_flags & FL_POSIX) + if (flock->c.flc_flags & FL_POSIX) cifs_dbg(FYI, "Posix\n"); - if (flock->fl_flags & FL_FLOCK) + if (flock->c.flc_flags & FL_FLOCK) cifs_dbg(FYI, "Flock\n"); - if (flock->fl_flags & FL_SLEEP) { + if (flock->c.flc_flags & FL_SLEEP) { cifs_dbg(FYI, "Blocking lock\n"); *wait_flag = true; } - if (flock->fl_flags & FL_ACCESS) + if (flock->c.flc_flags & FL_ACCESS) cifs_dbg(FYI, "Process suspended by mandatory locking - not implemented yet\n"); - if (flock->fl_flags & FL_LEASE) + if (flock->c.flc_flags & FL_LEASE) cifs_dbg(FYI, "Lease on file - not implemented yet\n"); - if (flock->fl_flags & + if (flock->c.flc_flags & (~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE | FL_CLOSE | FL_OFDLCK))) - cifs_dbg(FYI, "Unknown lock flags 0x%x\n", flock->fl_flags); + cifs_dbg(FYI, "Unknown lock flags 0x%x\n", + flock->c.flc_flags); *type = server->vals->large_lock_type; - if (flock->fl_type == F_WRLCK) { + if (lock_is_write(flock)) { cifs_dbg(FYI, "F_WRLCK\n"); *type |= server->vals->exclusive_lock_type; *lock = 1; - } else if (flock->fl_type == F_UNLCK) { + } else if (lock_is_unlock(flock)) { cifs_dbg(FYI, "F_UNLCK\n"); *type |= server->vals->unlock_lock_type; *unlock = 1; /* Check if unlock includes more than one lock range */ - } else if (flock->fl_type == F_RDLCK) { + } else if (lock_is_read(flock)) { cifs_dbg(FYI, "F_RDLCK\n"); *type |= server->vals->shared_lock_type; *lock = 1; - } else if (flock->fl_type == F_EXLCK) { + } else if (flock->c.flc_type == F_EXLCK) { cifs_dbg(FYI, "F_EXLCK\n"); *type |= server->vals->exclusive_lock_type; *lock = 1; - } else if (flock->fl_type == F_SHLCK) { + } else if (flock->c.flc_type == F_SHLCK) { cifs_dbg(FYI, "F_SHLCK\n"); *type |= server->vals->shared_lock_type; *lock = 1; @@ -1730,7 +1736,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type, else posix_lock_type = CIFS_WRLCK; rc = CIFSSMBPosixLock(xid, tcon, netfid, - hash_lockowner(flock->fl_owner), + hash_lockowner(flock->c.flc_owner), flock->fl_start, length, flock, posix_lock_type, wait_flag); return rc; @@ -1747,7 +1753,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type, if (rc == 0) { rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length, type, 0, 1, false); - flock->fl_type = F_UNLCK; + flock->c.flc_type = F_UNLCK; if (rc != 0) cifs_dbg(VFS, "Error unlocking previously locked range %d during test of lock\n", rc); @@ -1755,7 +1761,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type, } if (type & server->vals->shared_lock_type) { - flock->fl_type = F_WRLCK; + flock->c.flc_type = F_WRLCK; return 0; } @@ -1767,12 +1773,12 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type, if (rc == 0) { rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length, type | server->vals->shared_lock_type, 0, 1, false); - flock->fl_type 
= F_RDLCK; + flock->c.flc_type = F_RDLCK; if (rc != 0) cifs_dbg(VFS, "Error unlocking previously locked range %d during test of lock\n", rc); } else - flock->fl_type = F_WRLCK; + flock->c.flc_type = F_WRLCK; return 0; } @@ -1940,7 +1946,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, posix_lock_type = CIFS_UNLCK; rc = CIFSSMBPosixLock(xid, tcon, cfile->fid.netfid, - hash_lockowner(flock->fl_owner), + hash_lockowner(flock->c.flc_owner), flock->fl_start, length, NULL, posix_lock_type, wait_flag); goto out; @@ -1950,7 +1956,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, struct cifsLockInfo *lock; lock = cifs_lock_init(flock->fl_start, length, type, - flock->fl_flags); + flock->c.flc_flags); if (!lock) return -ENOMEM; @@ -1989,7 +1995,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, rc = server->ops->mand_unlock_range(cfile, flock, xid); out: - if ((flock->fl_flags & FL_POSIX) || (flock->fl_flags & FL_FLOCK)) { + if ((flock->c.flc_flags & FL_POSIX) || (flock->c.flc_flags & FL_FLOCK)) { /* * If this is a request to remove all locks because we * are closing the file, it doesn't matter if the @@ -1998,7 +2004,7 @@ out: */ if (rc) { cifs_dbg(VFS, "%s failed rc=%d\n", __func__, rc); - if (!(flock->fl_flags & FL_CLOSE)) + if (!(flock->c.flc_flags & FL_CLOSE)) return rc; } rc = locks_lock_file_wait(file, flock); @@ -2019,7 +2025,7 @@ int cifs_flock(struct file *file, int cmd, struct file_lock *fl) xid = get_xid(); - if (!(fl->fl_flags & FL_FLOCK)) { + if (!(fl->c.flc_flags & FL_FLOCK)) { rc = -ENOLCK; free_xid(xid); return rc; @@ -2070,7 +2076,8 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock) xid = get_xid(); cifs_dbg(FYI, "%s: %pD2 cmd=0x%x type=0x%x flags=0x%x r=%lld:%lld\n", __func__, file, cmd, - flock->fl_flags, flock->fl_type, (long long)flock->fl_start, + flock->c.flc_flags, flock->c.flc_type, + (long long)flock->fl_start, (long long)flock->fl_end); cfile = (struct cifsFileInfo *)file->private_data; @@ -2120,8 +2127,8 @@ cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, { loff_t end_of_write = offset + bytes_written; - if (end_of_write > cifsi->server_eof) - cifsi->server_eof = end_of_write; + if (end_of_write > cifsi->netfs.remote_i_size) + netfs_resize_file(&cifsi->netfs, end_of_write, true); } static ssize_t @@ -2651,7 +2658,7 @@ static void cifs_extend_writeback(struct address_space *mapping, continue; if (xa_is_value(folio)) break; - if (folio_index(folio) != index) + if (folio->index != index) break; if (!folio_try_get_rcu(folio)) { xas_reset(&xas); @@ -2899,7 +2906,7 @@ redo_folio: goto skip_write; } - if (folio_mapping(folio) != mapping || + if (folio->mapping != mapping || !folio_test_dirty(folio)) { start += folio_size(folio); folio_unlock(folio); @@ -2951,7 +2958,7 @@ skip_write: continue; } - folio_batch_release(&fbatch); + folio_batch_release(&fbatch); cond_resched(); } while (wbc->nr_to_write > 0); @@ -3247,8 +3254,8 @@ cifs_uncached_writev_complete(struct work_struct *work) spin_lock(&inode->i_lock); cifs_update_eof(cifsi, wdata->offset, wdata->bytes); - if (cifsi->server_eof > inode->i_size) - i_size_write(inode, cifsi->server_eof); + if (cifsi->netfs.remote_i_size > inode->i_size) + i_size_write(inode, cifsi->netfs.remote_i_size); spin_unlock(&inode->i_lock); complete(&wdata->done); @@ -3300,6 +3307,7 @@ cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list, if (wdata->cfile->invalidHandle) rc = -EAGAIN; else { + wdata->replay = true; #ifdef 
CONFIG_CIFS_SMB_DIRECT if (wdata->mr) { wdata->mr->need_invalidate = true; diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index 52cbef2eeb28..4b2f5aa2ea0e 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -211,7 +211,7 @@ cifs_parse_security_flavors(struct fs_context *fc, char *value, struct smb3_fs_c switch (match_token(value, cifs_secflavor_tokens, args)) { case Opt_sec_krb5p: - cifs_errorf(fc, "sec=krb5p is not supported!\n"); + cifs_errorf(fc, "sec=krb5p is not supported. Use sec=krb5,seal instead\n"); return 1; case Opt_sec_krb5i: ctx->sign = true; @@ -1111,6 +1111,17 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, case Opt_wsize: ctx->wsize = result.uint_32; ctx->got_wsize = true; + if (ctx->wsize % PAGE_SIZE != 0) { + ctx->wsize = round_down(ctx->wsize, PAGE_SIZE); + if (ctx->wsize == 0) { + ctx->wsize = PAGE_SIZE; + cifs_dbg(VFS, "wsize too small, reset to minimum %ld\n", PAGE_SIZE); + } else { + cifs_dbg(VFS, + "wsize rounded down to %d to multiple of PAGE_SIZE %ld\n", + ctx->wsize, PAGE_SIZE); + } + } break; case Opt_acregmax: ctx->acregmax = HZ * result.uint_32; diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index f0989484f2c6..d02f8ba29cb5 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -104,7 +104,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) fattr->cf_mtime = timestamp_truncate(fattr->cf_mtime, inode); mtime = inode_get_mtime(inode); if (timespec64_equal(&mtime, &fattr->cf_mtime) && - cifs_i->server_eof == fattr->cf_eof) { + cifs_i->netfs.remote_i_size == fattr->cf_eof) { cifs_dbg(FYI, "%s: inode %llu is unchanged\n", __func__, cifs_i->uniqueid); return; @@ -194,7 +194,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) else clear_bit(CIFS_INO_DELETE_PENDING, &cifs_i->flags); - cifs_i->server_eof = fattr->cf_eof; + cifs_i->netfs.remote_i_size = fattr->cf_eof; /* * Can't safely change the file size here if the client is writing to * it due to potential races. 
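The server_eof conversions in this file and in file.c above all follow one pattern: the private cifsi->server_eof cache of the server-reported file size is replaced by the remote_i_size field that the netfs library maintains. A minimal sketch of the write-side update, assuming the 6.8-era netfs API where netfs_resize_file(ctx, size, changed_on_server) records a size change; the helper name note_remote_eof() is illustrative, not a cifs function:

	#include <linux/netfs.h>

	/* Record a server-acknowledged end-of-file in the netfs layer.
	 * Mirrors what cifs_update_eof() does after this change: compare
	 * against the netfs-tracked remote size instead of server_eof.
	 */
	static void note_remote_eof(struct netfs_inode *ni, loff_t end_of_write)
	{
		if (end_of_write > ni->remote_i_size)
			netfs_resize_file(ni, end_of_write, true);
	}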
@@ -2858,7 +2858,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs, set_size_out: if (rc == 0) { - cifsInode->server_eof = attrs->ia_size; + netfs_resize_file(&cifsInode->netfs, attrs->ia_size, true); cifs_setsize(inode, attrs->ia_size); /* * i_blocks is not related to (i_size / i_blksize), but instead @@ -3011,6 +3011,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) if ((attrs->ia_valid & ATTR_SIZE) && attrs->ia_size != i_size_read(inode)) { truncate_setsize(inode, attrs->ia_size); + netfs_resize_file(&cifsInode->netfs, attrs->ia_size, true); fscache_resize_cookie(cifs_inode_cookie(inode), attrs->ia_size); } @@ -3210,6 +3211,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) if ((attrs->ia_valid & ATTR_SIZE) && attrs->ia_size != i_size_read(inode)) { truncate_setsize(inode, attrs->ia_size); + netfs_resize_file(&cifsInode->netfs, attrs->ia_size, true); fscache_resize_cookie(cifs_inode_cookie(inode), attrs->ia_size); } diff --git a/fs/smb/client/namespace.c b/fs/smb/client/namespace.c index a6968573b775..4a517b280f2b 100644 --- a/fs/smb/client/namespace.c +++ b/fs/smb/client/namespace.c @@ -168,6 +168,21 @@ static char *automount_fullpath(struct dentry *dentry, void *page) return s; } +static void fs_context_set_ids(struct smb3_fs_context *ctx) +{ + kuid_t uid = current_fsuid(); + kgid_t gid = current_fsgid(); + + if (ctx->multiuser) { + if (!ctx->uid_specified) + ctx->linux_uid = uid; + if (!ctx->gid_specified) + ctx->linux_gid = gid; + } + if (!ctx->cruid_specified) + ctx->cred_uid = uid; +} + /* * Create a vfsmount that we can automount */ @@ -205,6 +220,7 @@ static struct vfsmount *cifs_do_automount(struct path *path) tmp.leaf_fullpath = NULL; tmp.UNC = tmp.prepath = NULL; tmp.dfs_root_ses = NULL; + fs_context_set_ids(&tmp); rc = smb3_fs_context_dup(ctx, &tmp); if (rc) { diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c index 94255401b38d..b520eea7bfce 100644 --- a/fs/smb/client/readdir.c +++ b/fs/smb/client/readdir.c @@ -141,7 +141,7 @@ retry: if (likely(reparse_inode_match(inode, fattr))) { fattr->cf_mode = inode->i_mode; fattr->cf_rdev = inode->i_rdev; - fattr->cf_eof = CIFS_I(inode)->server_eof; + fattr->cf_eof = CIFS_I(inode)->netfs.remote_i_size; fattr->cf_symlink_target = NULL; } else { CIFS_I(inode)->time = 0; @@ -307,14 +307,16 @@ cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info, } static void cifs_fulldir_info_to_fattr(struct cifs_fattr *fattr, - SEARCH_ID_FULL_DIR_INFO *info, + const void *info, struct cifs_sb_info *cifs_sb) { + const FILE_FULL_DIRECTORY_INFO *di = info; + __dir_info_to_fattr(fattr, info); - /* See MS-FSCC 2.4.19 FileIdFullDirectoryInformation */ + /* See MS-FSCC 2.4.14, 2.4.19 */ if (fattr->cf_cifsattrs & ATTR_REPARSE) - fattr->cf_cifstag = le32_to_cpu(info->EaSize); + fattr->cf_cifstag = le32_to_cpu(di->EaSize); cifs_fill_common_info(fattr, cifs_sb); } @@ -396,7 +398,7 @@ ffirst_retry: } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { cifsFile->srch_inf.info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO; } else /* not srvinos - BB fixme add check for backlevel? 
*/ { - cifsFile->srch_inf.info_level = SMB_FIND_FILE_DIRECTORY_INFO; + cifsFile->srch_inf.info_level = SMB_FIND_FILE_FULL_DIRECTORY_INFO; } search_flags = CIFS_SEARCH_CLOSE_AT_END | CIFS_SEARCH_RETURN_RESUME; @@ -987,10 +989,9 @@ static int cifs_filldir(char *find_entry, struct file *file, (FIND_FILE_STANDARD_INFO *)find_entry, cifs_sb); break; + case SMB_FIND_FILE_FULL_DIRECTORY_INFO: case SMB_FIND_FILE_ID_FULL_DIR_INFO: - cifs_fulldir_info_to_fattr(&fattr, - (SEARCH_ID_FULL_DIR_INFO *)find_entry, - cifs_sb); + cifs_fulldir_info_to_fattr(&fattr, find_entry, cifs_sb); break; default: cifs_dir_info_to_fattr(&fattr, diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index cde81042bebd..8f37373fd333 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -75,6 +75,10 @@ cifs_ses_get_chan_index(struct cifs_ses *ses, { unsigned int i; + /* if the channel is waiting for termination */ + if (server && server->terminate) + return CIFS_INVAL_CHAN_INDEX; + for (i = 0; i < ses->chan_count; i++) { if (ses->chans[i].server == server) return i; @@ -84,7 +88,6 @@ cifs_ses_get_chan_index(struct cifs_ses *ses, if (server) cifs_dbg(VFS, "unable to get chan index for server: 0x%llx", server->conn_id); - WARN_ON(1); return CIFS_INVAL_CHAN_INDEX; } @@ -269,6 +272,8 @@ int cifs_try_adding_channels(struct cifs_ses *ses) &iface->sockaddr, rc); kref_put(&iface->refcount, release_iface); + /* failure to add chan should increase weight */ + iface->weight_fulfilled++; continue; } diff --git a/fs/smb/client/smb2file.c b/fs/smb/client/smb2file.c index e0ee96d69d49..c23478ab1cf8 100644 --- a/fs/smb/client/smb2file.c +++ b/fs/smb/client/smb2file.c @@ -228,7 +228,7 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, * flock and OFD lock are associated with an open * file description, not the process. 
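The fl_* to c.flc_* renames running through these lock hunks reflect struct file_lock now embedding a struct file_lock_core (the c member); type checks go through helpers such as lock_is_read(), lock_is_write() and lock_is_unlock(), as used elsewhere in this diff. A short sketch of the new access pattern under that <linux/filelock.h> layout; lock_kind() is an illustrative helper, not kernel code:

	#include <linux/filelock.h>

	/* Classify a lock via the embedded file_lock_core instead of the
	 * old fl_type/fl_flags fields.
	 */
	static const char *lock_kind(struct file_lock *fl)
	{
		if (lock_is_read(fl))		/* c.flc_type == F_RDLCK */
			return "read";
		if (lock_is_write(fl))		/* c.flc_type == F_WRLCK */
			return "write";
		if (lock_is_unlock(fl))		/* c.flc_type == F_UNLCK */
			return "unlock";
		return "other";			/* e.g. F_EXLCK, F_SHLCK */
	}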
*/ - if (!(flock->fl_flags & (FL_FLOCK | FL_OFDLCK))) + if (!(flock->c.flc_flags & (FL_FLOCK | FL_OFDLCK))) continue; if (cinode->can_cache_brlcks) { /* diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index a652200540c8..05818cd6d932 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -120,6 +120,14 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, unsigned int size[2]; void *data[2]; int len; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + oplock = SMB2_OPLOCK_LEVEL_NONE; + num_rqst = 0; + server = cifs_pick_channel(ses); vars = kzalloc(sizeof(*vars), GFP_ATOMIC); if (vars == NULL) @@ -127,8 +135,6 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, rqst = &vars->rqst[0]; rsp_iov = &vars->rsp_iov[0]; - server = cifs_pick_channel(ses); - if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; @@ -463,15 +469,24 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, num_rqst++; if (cfile) { + if (retries) + for (i = 1; i < num_rqst - 2; i++) + smb2_set_replay(server, &rqst[i]); + rc = compound_send_recv(xid, ses, server, flags, num_rqst - 2, &rqst[1], &resp_buftype[1], &rsp_iov[1]); - } else + } else { + if (retries) + for (i = 0; i < num_rqst; i++) + smb2_set_replay(server, &rqst[i]); + rc = compound_send_recv(xid, ses, server, flags, num_rqst, rqst, resp_buftype, rsp_iov); + } finished: num_rqst = 0; @@ -620,9 +635,6 @@ finished: } SMB2_close_free(&rqst[num_rqst]); - if (cfile) - cifsFileInfo_put(cfile); - num_cmds += 2; if (out_iov && out_buftype) { memcpy(out_iov, rsp_iov, num_cmds * sizeof(*out_iov)); @@ -632,7 +644,16 @@ finished: for (i = 0; i < num_cmds; i++) free_rsp_buf(resp_buftype[i], rsp_iov[i].iov_base); } + num_cmds -= 2; /* correct num_cmds as there could be a retry */ kfree(vars); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + + if (cfile) + cifsFileInfo_put(cfile); + return rc; } diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index d9553c2556a2..4695433fcf39 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -619,7 +619,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, goto out; } - while (bytes_left >= sizeof(*p)) { + while (bytes_left >= (ssize_t)sizeof(*p)) { memset(&tmp_iface, 0, sizeof(tmp_iface)); tmp_iface.speed = le64_to_cpu(p->LinkSpeed); tmp_iface.rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE) ? 
1 : 0; @@ -1108,7 +1108,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, { struct smb2_compound_vars *vars; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; struct smb_rqst *rqst; struct kvec *rsp_iov; __le16 *utf16_path = NULL; @@ -1124,6 +1124,13 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, struct smb2_file_full_ea_info *ea = NULL; struct smb2_query_info_rsp *rsp; int rc, used_len = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = CIFS_CP_CREATE_CLOSE_OP; + oplock = SMB2_OPLOCK_LEVEL_NONE; + server = cifs_pick_channel(ses); if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; @@ -1197,6 +1204,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, .disposition = FILE_OPEN, .create_options = cifs_create_options(cifs_sb, 0), .fid = &fid, + .replay = !!(retries), }; rc = SMB2_open_init(tcon, server, @@ -1244,6 +1252,12 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, goto sea_exit; smb2_set_related(&rqst[2]); + if (retries) { + smb2_set_replay(server, &rqst[0]); + smb2_set_replay(server, &rqst[1]); + smb2_set_replay(server, &rqst[2]); + } + rc = compound_send_recv(xid, ses, server, flags, 3, rqst, resp_buftype, rsp_iov); @@ -1260,6 +1274,11 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, kfree(vars); out_free_path: kfree(utf16_path); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } #endif @@ -1484,7 +1503,7 @@ smb2_ioctl_query_info(const unsigned int xid, struct smb_rqst *rqst; struct kvec *rsp_iov; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; char __user *arg = (char __user *)p; struct smb_query_info qi; struct smb_query_info __user *pqi; @@ -1501,6 +1520,13 @@ smb2_ioctl_query_info(const unsigned int xid, void *data[2]; int create_options = is_dir ? 
CREATE_NOT_FILE : CREATE_NOT_DIR; void (*free_req1_func)(struct smb_rqst *r); + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = CIFS_CP_CREATE_CLOSE_OP; + oplock = SMB2_OPLOCK_LEVEL_NONE; + server = cifs_pick_channel(ses); vars = kzalloc(sizeof(*vars), GFP_ATOMIC); if (vars == NULL) @@ -1544,6 +1570,7 @@ smb2_ioctl_query_info(const unsigned int xid, .disposition = FILE_OPEN, .create_options = cifs_create_options(cifs_sb, create_options), .fid = &fid, + .replay = !!(retries), }; if (qi.flags & PASSTHRU_FSCTL) { @@ -1641,6 +1668,12 @@ smb2_ioctl_query_info(const unsigned int xid, goto free_req_1; smb2_set_related(&rqst[2]); + if (retries) { + smb2_set_replay(server, &rqst[0]); + smb2_set_replay(server, &rqst[1]); + smb2_set_replay(server, &rqst[2]); + } + rc = compound_send_recv(xid, ses, server, flags, 3, rqst, resp_buftype, rsp_iov); @@ -1701,6 +1734,11 @@ free_output_buffer: kfree(buffer); free_vars: kfree(vars); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -2227,8 +2265,14 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_open_parms oparms; struct smb2_query_directory_rsp *qd_rsp = NULL; struct smb2_create_rsp *op_rsp = NULL; - struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses); - int retry_count = 0; + struct TCP_Server_Info *server; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + oplock = SMB2_OPLOCK_LEVEL_NONE; + server = cifs_pick_channel(tcon->ses); utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); if (!utf16_path) @@ -2253,6 +2297,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, .disposition = FILE_OPEN, .create_options = cifs_create_options(cifs_sb, 0), .fid = fid, + .replay = !!(retries), }; rc = SMB2_open_init(tcon, server, @@ -2278,14 +2323,15 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, smb2_set_related(&rqst[1]); -again: + if (retries) { + smb2_set_replay(server, &rqst[0]); + smb2_set_replay(server, &rqst[1]); + } + rc = compound_send_recv(xid, tcon->ses, server, flags, 2, rqst, resp_buftype, rsp_iov); - if (rc == -EAGAIN && retry_count++ < 10) - goto again; - /* If the open failed there is nothing to do */ op_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base; if (op_rsp == NULL || op_rsp->hdr.Status != STATUS_SUCCESS) { @@ -2333,6 +2379,11 @@ again: SMB2_query_directory_free(&rqst[1]); free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -2458,6 +2509,22 @@ smb2_oplock_response(struct cifs_tcon *tcon, __u64 persistent_fid, } void +smb2_set_replay(struct TCP_Server_Info *server, struct smb_rqst *rqst) +{ + struct smb2_hdr *shdr; + + if (server->dialect < SMB30_PROT_ID) + return; + + shdr = (struct smb2_hdr *)(rqst->rq_iov[0].iov_base); + if (shdr == NULL) { + cifs_dbg(FYI, "shdr NULL in smb2_set_replay\n"); + return; + } + shdr->Flags |= SMB2_FLAGS_REPLAY_OPERATION; +} + +void smb2_set_related(struct smb_rqst *rqst) { struct smb2_hdr *shdr; @@ -2530,6 +2597,27 @@ smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst) } /* + * helper function for exponential backoff and for checking whether a request is replayable + */ +bool smb2_should_replay(struct cifs_tcon *tcon, + int *pretries, + int *pcur_sleep) +{ + if (!pretries || 
!pcur_sleep) + return false; + + if (tcon->retry || (*pretries)++ < tcon->ses->server->retrans) { + msleep(*pcur_sleep); + (*pcur_sleep) = ((*pcur_sleep) << 1); + if ((*pcur_sleep) > CIFS_MAX_SLEEP) + (*pcur_sleep) = CIFS_MAX_SLEEP; + return true; + } + + return false; +} + +/* * Passes the query info response back to the caller on success. * Caller need to free this with free_rsp_buf(). */ @@ -2542,7 +2630,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, { struct smb2_compound_vars *vars; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; int flags = CIFS_CP_CREATE_CLOSE_OP; struct smb_rqst *rqst; int resp_buftype[3]; @@ -2553,6 +2641,13 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, int rc; __le16 *utf16_path; struct cached_fid *cfid = NULL; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = CIFS_CP_CREATE_CLOSE_OP; + oplock = SMB2_OPLOCK_LEVEL_NONE; + server = cifs_pick_channel(ses); if (!path) path = ""; @@ -2589,6 +2684,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, .disposition = FILE_OPEN, .create_options = cifs_create_options(cifs_sb, 0), .fid = &fid, + .replay = !!(retries), }; rc = SMB2_open_init(tcon, server, @@ -2633,6 +2729,14 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, goto qic_exit; smb2_set_related(&rqst[2]); + if (retries) { + if (!cfid) { + smb2_set_replay(server, &rqst[0]); + smb2_set_replay(server, &rqst[2]); + } + smb2_set_replay(server, &rqst[1]); + } + if (cfid) { rc = compound_send_recv(xid, ses, server, flags, 1, &rqst[1], @@ -2665,6 +2769,11 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, kfree(vars); out_free_path: kfree(utf16_path); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -3213,6 +3322,9 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, cfile->fid.volatile_fid, cfile->pid, new_size); if (rc >= 0) { truncate_setsize(inode, new_size); + netfs_resize_file(&cifsi->netfs, new_size, true); + if (offset < cifsi->netfs.zero_point) + cifsi->netfs.zero_point = offset; fscache_resize_cookie(cifs_inode_cookie(inode), new_size); } } @@ -3436,7 +3548,7 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, cfile->pid, new_eof); if (rc == 0) { - cifsi->server_eof = new_eof; + netfs_resize_file(&cifsi->netfs, new_eof, true); cifs_setsize(inode, new_eof); cifs_truncate_page(inode->i_mapping, inode->i_size); truncate_setsize(inode, new_eof); @@ -3528,8 +3640,9 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon, int rc; unsigned int xid; struct inode *inode = file_inode(file); - struct cifsFileInfo *cfile = file->private_data; struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct cifsFileInfo *cfile = file->private_data; + struct netfs_inode *ictx = &cifsi->netfs; loff_t old_eof, new_eof; xid = get_xid(); @@ -3549,6 +3662,7 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon, goto out_2; truncate_pagecache_range(inode, off, old_eof); + ictx->zero_point = old_eof; rc = smb2_copychunk_range(xid, cfile, cfile, off + len, old_eof - off - len, off); @@ -3563,9 +3677,10 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon, rc = 0; - 
cifsi->server_eof = i_size_read(inode) - len; - truncate_setsize(inode, cifsi->server_eof); - fscache_resize_cookie(cifs_inode_cookie(inode), cifsi->server_eof); + truncate_setsize(inode, new_eof); + netfs_resize_file(&cifsi->netfs, new_eof, true); + ictx->zero_point = new_eof; + fscache_resize_cookie(cifs_inode_cookie(inode), new_eof); out_2: filemap_invalidate_unlock(inode->i_mapping); out: @@ -3581,6 +3696,7 @@ static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon, unsigned int xid; struct cifsFileInfo *cfile = file->private_data; struct inode *inode = file_inode(file); + struct cifsInodeInfo *cifsi = CIFS_I(inode); __u64 count, old_eof, new_eof; xid = get_xid(); @@ -3608,6 +3724,7 @@ static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon, goto out_2; truncate_setsize(inode, new_eof); + netfs_resize_file(&cifsi->netfs, i_size_read(inode), true); fscache_resize_cookie(cifs_inode_cookie(inode), i_size_read(inode)); rc = smb2_copychunk_range(xid, cfile, cfile, off, count, off + len); @@ -5100,7 +5217,7 @@ static int smb2_create_reparse_symlink(const unsigned int xid, struct inode *new; struct kvec iov; __le16 *path; - char *sym; + char *sym, sep = CIFS_DIR_SEP(cifs_sb); u16 len, plen; int rc = 0; @@ -5114,7 +5231,8 @@ static int smb2_create_reparse_symlink(const unsigned int xid, .symlink_target = sym, }; - path = cifs_convert_path_to_utf16(symname, cifs_sb); + convert_delimiter(sym, sep); + path = cifs_convert_path_to_utf16(sym, cifs_sb); if (!path) { rc = -ENOMEM; goto out; @@ -5137,7 +5255,10 @@ static int smb2_create_reparse_symlink(const unsigned int xid, buf->PrintNameLength = cpu_to_le16(plen); memcpy(buf->PathBuffer, path, plen); buf->Flags = cpu_to_le32(*symname != '/' ? SYMLINK_FLAG_RELATIVE : 0); + if (*sym != sep) + buf->Flags = cpu_to_le32(SYMLINK_FLAG_RELATIVE); + convert_delimiter(sym, '/'); iov.iov_base = buf; iov.iov_len = len; new = smb2_get_reparse_inode(&data, inode->i_sb, xid, diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 288199f0b987..608ee05491e2 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -178,6 +178,7 @@ cifs_chan_skip_or_disable(struct cifs_ses *ses, } ses->chans[chan_index].server = NULL; + server->terminate = true; spin_unlock(&ses->chan_lock); /* @@ -188,14 +189,12 @@ cifs_chan_skip_or_disable(struct cifs_ses *ses, */ cifs_put_tcp_session(server, from_reconnect); - server->terminate = true; cifs_signal_cifsd_for_reconnect(server, false); /* mark primary server as needing reconnect */ pserver = server->primary_server; cifs_signal_cifsd_for_reconnect(pserver, false); skip_terminate: - mutex_unlock(&ses->session_mutex); return -EHOSTDOWN; } @@ -400,6 +399,15 @@ skip_sess_setup: goto out; } + spin_lock(&ses->ses_lock); + if (ses->flags & CIFS_SES_FLAG_SCALE_CHANNELS) { + spin_unlock(&ses->ses_lock); + mutex_unlock(&ses->session_mutex); + goto skip_add_channels; + } + ses->flags |= CIFS_SES_FLAG_SCALE_CHANNELS; + spin_unlock(&ses->ses_lock); + if (!rc && (server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) { mutex_unlock(&ses->session_mutex); @@ -411,7 +419,7 @@ skip_sess_setup: rc = SMB3_request_interfaces(xid, tcon, false); free_xid(xid); - if (rc == -EOPNOTSUPP) { + if (rc == -EOPNOTSUPP && ses->chan_count > 1) { /* * some servers like Azure SMB server do not advertise * that multichannel has been disabled with server @@ -429,17 +437,22 @@ skip_sess_setup: if (ses->chan_max > ses->chan_count && ses->iface_count && !SERVER_IS_CHAN(server)) { - if (ses->chan_count == 1) + if 
(ses->chan_count == 1) { cifs_server_dbg(VFS, "supports multichannel now\n"); + queue_delayed_work(cifsiod_wq, &tcon->query_interfaces, + (SMB_INTERFACE_POLL_INTERVAL * HZ)); + } cifs_try_adding_channels(ses); - queue_delayed_work(cifsiod_wq, &tcon->query_interfaces, - (SMB_INTERFACE_POLL_INTERVAL * HZ)); } } else { mutex_unlock(&ses->session_mutex); } + skip_add_channels: + spin_lock(&ses->ses_lock); + ses->flags &= ~CIFS_SES_FLAG_SCALE_CHANNELS; + spin_unlock(&ses->ses_lock); if (smb2_command != SMB2_INTERNAL_CMD) mod_delayed_work(cifsiod_wq, &server->reconnect, 0); @@ -2391,8 +2404,13 @@ create_durable_v2_buf(struct cifs_open_parms *oparms) */ buf->dcontext.Timeout = cpu_to_le32(oparms->tcon->handle_timeout); buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT); - generate_random_uuid(buf->dcontext.CreateGuid); - memcpy(pfid->create_guid, buf->dcontext.CreateGuid, 16); + + /* for replay, we should not overwrite the existing create guid */ + if (!oparms->replay) { + generate_random_uuid(buf->dcontext.CreateGuid); + memcpy(pfid->create_guid, buf->dcontext.CreateGuid, 16); + } else + memcpy(buf->dcontext.CreateGuid, pfid->create_guid, 16); /* SMB2_CREATE_DURABLE_HANDLE_REQUEST is "DH2Q" */ buf->Name[0] = 'D'; @@ -2765,7 +2783,14 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, int flags = 0; unsigned int total_len; __le16 *utf16_path = NULL; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + n_iov = 2; + server = cifs_pick_channel(ses); cifs_dbg(FYI, "mkdir\n"); @@ -2869,6 +2894,10 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, /* no need to inc num_remote_opens because we close it just below */ trace_smb3_posix_mkdir_enter(xid, tcon->tid, ses->Suid, full_path, CREATE_NOT_FILE, FILE_WRITE_ATTRIBUTES); + + if (retries) + smb2_set_replay(server, &rqst); + /* resource #4: response buffer */ rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); @@ -2906,6 +2935,11 @@ err_free_req: cifs_small_buf_release(req); err_free_path: kfree(utf16_path); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -3101,12 +3135,19 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, struct smb2_create_rsp *rsp = NULL; struct cifs_tcon *tcon = oparms->tcon; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; struct kvec iov[SMB2_CREATE_IOV_SIZE]; struct kvec rsp_iov = {NULL, 0}; int resp_buftype = CIFS_NO_BUFFER; int rc = 0; int flags = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + server = cifs_pick_channel(ses); + oparms->replay = !!(retries); cifs_dbg(FYI, "create/open\n"); if (!ses || !server) @@ -3128,6 +3169,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, trace_smb3_open_enter(xid, tcon->tid, tcon->ses->Suid, oparms->path, oparms->create_options, oparms->desired_access); + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); @@ -3181,6 +3225,11 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, creat_exit: SMB2_open_free(&rqst); free_rsp_buf(resp_buftype, rsp); + + if (is_replayable_error(rc) && + 
smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -3305,15 +3354,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, int resp_buftype = CIFS_NO_BUFFER; int rc = 0; int flags = 0; - - cifs_dbg(FYI, "SMB2 IOCTL\n"); - - if (out_data != NULL) - *out_data = NULL; - - /* zero out returned data len, in case of error */ - if (plen) - *plen = 0; + int retries = 0, cur_sleep = 1; if (!tcon) return -EIO; @@ -3322,10 +3363,23 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, if (!ses) return -EIO; +replay_again: + /* reinitialize for possible replay */ + flags = 0; server = cifs_pick_channel(ses); + if (!server) return -EIO; + cifs_dbg(FYI, "SMB2 IOCTL\n"); + + if (out_data != NULL) + *out_data = NULL; + + /* zero out returned data len, in case of error */ + if (plen) + *plen = 0; + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; @@ -3340,6 +3394,9 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, if (rc) goto ioctl_exit; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); @@ -3409,6 +3466,11 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, ioctl_exit: SMB2_ioctl_free(&rqst); free_rsp_buf(resp_buftype, rsp); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -3480,13 +3542,20 @@ __SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, struct smb_rqst rqst; struct smb2_close_rsp *rsp = NULL; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; struct kvec iov[1]; struct kvec rsp_iov; int resp_buftype = CIFS_NO_BUFFER; int rc = 0; int flags = 0; bool query_attrs = false; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + query_attrs = false; + server = cifs_pick_channel(ses); cifs_dbg(FYI, "Close\n"); @@ -3512,6 +3581,9 @@ __SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, if (rc) goto close_exit; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); rsp = (struct smb2_close_rsp *)rsp_iov.iov_base; @@ -3545,6 +3617,11 @@ close_exit: cifs_dbg(VFS, "handle cancelled close fid 0x%llx returned error %d\n", persistent_fid, tmp_rc); } + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -3675,12 +3752,19 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, struct TCP_Server_Info *server; int flags = 0; bool allocated = false; + int retries = 0, cur_sleep = 1; cifs_dbg(FYI, "Query Info\n"); if (!ses) return -EIO; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + allocated = false; server = cifs_pick_channel(ses); + if (!server) return -EIO; @@ -3702,6 +3786,9 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, trace_smb3_query_info_enter(xid, persistent_fid, tcon->tid, ses->Suid, info_class, (__u32)info_type); + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base; @@ -3744,6 +3831,11 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, qinf_exit: SMB2_query_info_free(&rqst); free_rsp_buf(resp_buftype, rsp); + + if 
(is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -3844,7 +3936,7 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, u32 *plen /* returned data len */) { struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; struct smb_rqst rqst; struct smb2_change_notify_rsp *smb_rsp; struct kvec iov[1]; @@ -3852,6 +3944,12 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, int resp_buftype = CIFS_NO_BUFFER; int flags = 0; int rc = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + server = cifs_pick_channel(ses); cifs_dbg(FYI, "change notify\n"); if (!ses || !server) @@ -3876,6 +3974,10 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, trace_smb3_notify_enter(xid, persistent_fid, tcon->tid, ses->Suid, (u8)watch_tree, completion_filter); + + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); @@ -3910,6 +4012,11 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, if (rqst.rq_iov) cifs_small_buf_release(rqst.rq_iov[0].iov_base); /* request */ free_rsp_buf(resp_buftype, rsp_iov.iov_base); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -4152,10 +4259,16 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, struct smb_rqst rqst; struct kvec iov[1]; struct kvec rsp_iov = {NULL, 0}; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; int resp_buftype = CIFS_NO_BUFFER; int flags = 0; int rc = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + server = cifs_pick_channel(ses); cifs_dbg(FYI, "flush\n"); if (!ses || !(ses->server)) @@ -4175,6 +4288,10 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, goto flush_exit; trace_smb3_flush_enter(xid, persistent_fid, tcon->tid, ses->Suid); + + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); @@ -4189,6 +4306,11 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, flush_exit: SMB2_flush_free(&rqst); free_rsp_buf(resp_buftype, rsp_iov.iov_base); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -4668,7 +4790,7 @@ smb2_async_writev(struct cifs_writedata *wdata, struct cifs_io_parms *io_parms = NULL; int credit_request; - if (!wdata->server) + if (!wdata->server || wdata->replay) server = wdata->server = cifs_pick_channel(tcon->ses); /* @@ -4753,6 +4875,8 @@ smb2_async_writev(struct cifs_writedata *wdata, rqst.rq_nvec = 1; rqst.rq_iter = wdata->iter; rqst.rq_iter_size = iov_iter_count(&rqst.rq_iter); + if (wdata->replay) + smb2_set_replay(server, &rqst); #ifdef CONFIG_CIFS_SMB_DIRECT if (wdata->mr) iov[0].iov_len += sizeof(struct smbd_buffer_descriptor_v1); @@ -4826,18 +4950,21 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, int flags = 0; unsigned int total_len; struct TCP_Server_Info *server; + int retries = 0, cur_sleep = 1; +replay_again: + /* reinitialize for possible replay */ + flags = 0; *nbytes = 0; - - if (n_vec < 1) - return rc; - if (!io_parms->server) io_parms->server = 
cifs_pick_channel(io_parms->tcon->ses); server = io_parms->server; if (server == NULL) return -ECONNABORTED; + if (n_vec < 1) + return rc; + rc = smb2_plain_req_init(SMB2_WRITE, io_parms->tcon, server, (void **) &req, &total_len); if (rc) @@ -4871,6 +4998,9 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, rqst.rq_iov = iov; rqst.rq_nvec = n_vec + 1; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, io_parms->tcon->ses, server, &rqst, &resp_buftype, flags, &rsp_iov); @@ -4895,6 +5025,11 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, cifs_small_buf_release(req); free_rsp_buf(resp_buftype, rsp); + + if (is_replayable_error(rc) && + smb2_should_replay(io_parms->tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -5077,6 +5212,9 @@ int SMB2_query_directory_init(const unsigned int xid, case SMB_FIND_FILE_POSIX_INFO: req->FileInformationClass = SMB_FIND_FILE_POSIX_INFO; break; + case SMB_FIND_FILE_FULL_DIRECTORY_INFO: + req->FileInformationClass = FILE_FULL_DIRECTORY_INFORMATION; + break; default: cifs_tcon_dbg(VFS, "info level %u isn't supported\n", info_level); @@ -5146,6 +5284,9 @@ smb2_parse_query_directory(struct cifs_tcon *tcon, /* note that posix payload are variable size */ info_buf_size = sizeof(struct smb2_posix_info); break; + case SMB_FIND_FILE_FULL_DIRECTORY_INFO: + info_buf_size = sizeof(FILE_FULL_DIRECTORY_INFO); + break; default: cifs_tcon_dbg(VFS, "info level %u isn't supported\n", srch_inf->info_level); @@ -5206,8 +5347,14 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, struct kvec rsp_iov; int rc = 0; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; int flags = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + server = cifs_pick_channel(ses); if (!ses || !(ses->server)) return -EIO; @@ -5227,6 +5374,9 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, if (rc) goto qdir_exit; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); rsp = (struct smb2_query_directory_rsp *)rsp_iov.iov_base; @@ -5261,6 +5411,11 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, qdir_exit: SMB2_query_directory_free(&rqst); free_rsp_buf(resp_buftype, rsp); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -5327,8 +5482,14 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; int resp_buftype; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; int flags = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + server = cifs_pick_channel(ses); if (!ses || !server) return -EIO; @@ -5356,6 +5517,8 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, return rc; } + if (retries) + smb2_set_replay(server, &rqst); rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, @@ -5371,6 +5534,11 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, free_rsp_buf(resp_buftype, rsp); kfree(iov); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -5423,12 +5591,18 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, int 
rc; struct smb2_oplock_break *req = NULL; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; int flags = CIFS_OBREAK_OP; unsigned int total_len; struct kvec iov[1]; struct kvec rsp_iov; int resp_buf_type; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = CIFS_OBREAK_OP; + server = cifs_pick_channel(ses); cifs_dbg(FYI, "SMB2_oplock_break\n"); rc = smb2_plain_req_init(SMB2_OPLOCK_BREAK, tcon, server, @@ -5453,15 +5627,21 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, rqst.rq_iov = iov; rqst.rq_nvec = 1; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); - if (rc) { cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE); cifs_dbg(FYI, "Send error in Oplock Break = %d\n", rc); } + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -5547,9 +5727,15 @@ SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; int resp_buftype; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; FILE_SYSTEM_POSIX_INFO *info = NULL; int flags = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + server = cifs_pick_channel(ses); rc = build_qfs_info_req(&iov, tcon, server, FS_POSIX_INFORMATION, @@ -5565,6 +5751,9 @@ SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon, rqst.rq_iov = &iov; rqst.rq_nvec = 1; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); free_qfs_info_req(&iov); @@ -5584,6 +5773,11 @@ SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon, posix_qfsinf_exit: free_rsp_buf(resp_buftype, rsp_iov.iov_base); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -5598,9 +5792,15 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; int resp_buftype; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; struct smb2_fs_full_size_info *info = NULL; int flags = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + server = cifs_pick_channel(ses); rc = build_qfs_info_req(&iov, tcon, server, FS_FULL_SIZE_INFORMATION, @@ -5616,6 +5816,9 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, rqst.rq_iov = &iov; rqst.rq_nvec = 1; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); free_qfs_info_req(&iov); @@ -5635,6 +5838,11 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, qfsinf_exit: free_rsp_buf(resp_buftype, rsp_iov.iov_base); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -5649,9 +5857,15 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; int resp_buftype, max_len, min_len; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; unsigned int rsp_len, offset; int flags = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for 
possible replay */ + flags = 0; + server = cifs_pick_channel(ses); if (level == FS_DEVICE_INFORMATION) { max_len = sizeof(FILE_SYSTEM_DEVICE_INFO); @@ -5683,6 +5897,9 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, rqst.rq_iov = &iov; rqst.rq_nvec = 1; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); free_qfs_info_req(&iov); @@ -5720,6 +5937,11 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, qfsattr_exit: free_rsp_buf(resp_buftype, rsp_iov.iov_base); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -5737,7 +5959,13 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, unsigned int count; int flags = CIFS_NO_RSP_BUF; unsigned int total_len; - struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses); + struct TCP_Server_Info *server; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = CIFS_NO_RSP_BUF; + server = cifs_pick_channel(tcon->ses); cifs_dbg(FYI, "smb2_lockv num lock %d\n", num_lock); @@ -5768,6 +5996,9 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, rqst.rq_iov = iov; rqst.rq_nvec = 2; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, tcon->ses, server, &rqst, &resp_buf_type, flags, &rsp_iov); @@ -5779,6 +6010,10 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, tcon->ses->Suid, rc); } + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h index 0034b537b0b3..b3069911e9dd 100644 --- a/fs/smb/client/smb2proto.h +++ b/fs/smb/client/smb2proto.h @@ -122,6 +122,11 @@ extern unsigned long smb_rqst_len(struct TCP_Server_Info *server, extern void smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst); extern void smb2_set_related(struct smb_rqst *rqst); +extern void smb2_set_replay(struct TCP_Server_Info *server, + struct smb_rqst *rqst); +extern bool smb2_should_replay(struct cifs_tcon *tcon, + int *pretries, + int *pcur_sleep); /* * SMB2 Worker functions - most of protocol specific implementation details diff --git a/fs/smb/client/smbencrypt.c b/fs/smb/client/smbencrypt.c index f0ce26414f17..1d1ee9f18f37 100644 --- a/fs/smb/client/smbencrypt.c +++ b/fs/smb/client/smbencrypt.c @@ -26,13 +26,6 @@ #include "cifsproto.h" #include "../common/md4.h" -#ifndef false -#define false 0 -#endif -#ifndef true -#define true 1 -#endif - /* following came from the other byteorder.h to avoid include conflicts */ #define CVAL(buf,pos) (((unsigned char *)(buf))[pos]) #define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8) diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index 4f717ad7c21b..994d70193432 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -400,10 +400,17 @@ unmask: server->conn_id, server->hostname); } smbd_done: - if (rc < 0 && rc != -EINTR) + /* + * there's hardly any use for the layers above to know the + * actual error code here. All they should do at this point is + * to retry the connection and hope it goes away. 
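Nearly every synchronous SMB2 request path in this diff gains the same shape: a replay_again label, per-call retries/cur_sleep counters, smb2_set_replay() marking re-sent frames, and smb2_should_replay() sleeping with exponential backoff (doubling cur_sleep, capped at CIFS_MAX_SLEEP) until the retry budget runs out. A compact model of that control flow; issue_once() and issue_with_replay() are hypothetical stand-ins, not cifs functions:

	/* Model of the replay loop wrapped around SMB2 requests.
	 * issue_once() stands in for request setup plus cifs_send_recv();
	 * smb2_should_replay() (added in smb2ops.c) sleeps and doubles
	 * the backoff internally.
	 */
	static int issue_once(struct cifs_tcon *tcon);	/* hypothetical */

	static int issue_with_replay(struct cifs_tcon *tcon)
	{
		int rc, retries = 0, cur_sleep = 1;

	replay_again:
		rc = issue_once(tcon);
		if (is_replayable_error(rc) &&
		    smb2_should_replay(tcon, &retries, &cur_sleep))
			goto replay_again;
		return rc;
	}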
+ */ + if (rc < 0 && rc != -EINTR && rc != -EAGAIN) { cifs_server_dbg(VFS, "Error %d sending data on socket to server\n", rc); - else if (rc > 0) + rc = -ECONNABORTED; + cifs_signal_cifsd_for_reconnect(server, false); + } else if (rc > 0) rc = 0; out: cifs_in_send_dec(server); @@ -428,8 +435,8 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, if (!(flags & CIFS_TRANSFORM_REQ)) return __smb_send_rqst(server, num_rqst, rqst); - if (num_rqst > MAX_COMPOUND - 1) - return -ENOMEM; + if (WARN_ON_ONCE(num_rqst > MAX_COMPOUND - 1)) + return -EIO; if (!server->ops->init_transform_rq) { cifs_server_dbg(VFS, "Encryption requested but transform callback is missing\n"); @@ -1026,6 +1033,9 @@ struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses) if (!server || server->terminate) continue; + if (CIFS_CHAN_NEEDS_RECONNECT(ses, i)) + continue; + /* * strictly speaking, we should pick up req_lock to read * server->in_flight. But it shouldn't matter much here if we diff --git a/fs/smb/server/ksmbd_netlink.h b/fs/smb/server/ksmbd_netlink.h index b7521e41402e..0ebf91ffa236 100644 --- a/fs/smb/server/ksmbd_netlink.h +++ b/fs/smb/server/ksmbd_netlink.h @@ -304,7 +304,8 @@ enum ksmbd_event { KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST, KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE = 15, - KSMBD_EVENT_MAX + __KSMBD_EVENT_MAX, + KSMBD_EVENT_MAX = __KSMBD_EVENT_MAX - 1 }; /* diff --git a/fs/smb/server/misc.c b/fs/smb/server/misc.c index 9e8afaa686e3..1a5faa6f6e7b 100644 --- a/fs/smb/server/misc.c +++ b/fs/smb/server/misc.c @@ -261,6 +261,7 @@ out_ascii: /** * ksmbd_extract_sharename() - get share name from tree connect request + * @um: pointer to a unicode_map structure for character encoding handling * @treename: buffer containing tree name and share name * * Return: share name on success, otherwise error diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index ba7a72a6a4f4..089527a8b4ff 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -6173,8 +6173,10 @@ static noinline int smb2_read_pipe(struct ksmbd_work *work) err = ksmbd_iov_pin_rsp_read(work, (void *)rsp, offsetof(struct smb2_read_rsp, Buffer), aux_payload_buf, nbytes); - if (err) + if (err) { + kvfree(aux_payload_buf); goto out; + } kvfree(rpc_resp); } else { err = ksmbd_iov_pin_rsp(work, (void *)rsp, @@ -6384,8 +6386,10 @@ int smb2_read(struct ksmbd_work *work) err = ksmbd_iov_pin_rsp_read(work, (void *)rsp, offsetof(struct smb2_read_rsp, Buffer), aux_payload_buf, nbytes); - if (err) + if (err) { + kvfree(aux_payload_buf); goto out; + } ksmbd_fd_put(work, fp); return 0; @@ -6760,10 +6764,10 @@ struct file_lock *smb_flock_init(struct file *f) locks_init_lock(fl); - fl->fl_owner = f; - fl->fl_pid = current->tgid; - fl->fl_file = f; - fl->fl_flags = FL_POSIX; + fl->c.flc_owner = f; + fl->c.flc_pid = current->tgid; + fl->c.flc_file = f; + fl->c.flc_flags = FL_POSIX; fl->fl_ops = NULL; fl->fl_lmops = NULL; @@ -6780,30 +6784,30 @@ static int smb2_set_flock_flags(struct file_lock *flock, int flags) case SMB2_LOCKFLAG_SHARED: ksmbd_debug(SMB, "received shared request\n"); cmd = F_SETLKW; - flock->fl_type = F_RDLCK; - flock->fl_flags |= FL_SLEEP; + flock->c.flc_type = F_RDLCK; + flock->c.flc_flags |= FL_SLEEP; break; case SMB2_LOCKFLAG_EXCLUSIVE: ksmbd_debug(SMB, "received exclusive request\n"); cmd = F_SETLKW; - flock->fl_type = F_WRLCK; - flock->fl_flags |= FL_SLEEP; + flock->c.flc_type = F_WRLCK; + flock->c.flc_flags |= FL_SLEEP; break; case SMB2_LOCKFLAG_SHARED | SMB2_LOCKFLAG_FAIL_IMMEDIATELY: ksmbd_debug(SMB, "received shared 
& fail immediately request\n"); cmd = F_SETLK; - flock->fl_type = F_RDLCK; + flock->c.flc_type = F_RDLCK; break; case SMB2_LOCKFLAG_EXCLUSIVE | SMB2_LOCKFLAG_FAIL_IMMEDIATELY: ksmbd_debug(SMB, "received exclusive & fail immediately request\n"); cmd = F_SETLK; - flock->fl_type = F_WRLCK; + flock->c.flc_type = F_WRLCK; break; case SMB2_LOCKFLAG_UNLOCK: ksmbd_debug(SMB, "received unlock request\n"); - flock->fl_type = F_UNLCK; + flock->c.flc_type = F_UNLCK; cmd = F_SETLK; break; } @@ -6841,13 +6845,13 @@ static void smb2_remove_blocked_lock(void **argv) struct file_lock *flock = (struct file_lock *)argv[0]; ksmbd_vfs_posix_lock_unblock(flock); - wake_up(&flock->fl_wait); + locks_wake_up(flock); } static inline bool lock_defer_pending(struct file_lock *fl) { /* check pending lock waiters */ - return waitqueue_active(&fl->fl_wait); + return waitqueue_active(&fl->c.flc_wait); } /** @@ -6938,8 +6942,8 @@ int smb2_lock(struct ksmbd_work *work) list_for_each_entry(cmp_lock, &lock_list, llist) { if (cmp_lock->fl->fl_start <= flock->fl_start && cmp_lock->fl->fl_end >= flock->fl_end) { - if (cmp_lock->fl->fl_type != F_UNLCK && - flock->fl_type != F_UNLCK) { + if (cmp_lock->fl->c.flc_type != F_UNLCK && + flock->c.flc_type != F_UNLCK) { pr_err("conflict two locks in one request\n"); err = -EINVAL; locks_free_lock(flock); @@ -6987,12 +6991,12 @@ int smb2_lock(struct ksmbd_work *work) list_for_each_entry(conn, &conn_list, conns_list) { spin_lock(&conn->llist_lock); list_for_each_entry_safe(cmp_lock, tmp2, &conn->lock_list, clist) { - if (file_inode(cmp_lock->fl->fl_file) != - file_inode(smb_lock->fl->fl_file)) + if (file_inode(cmp_lock->fl->c.flc_file) != + file_inode(smb_lock->fl->c.flc_file)) continue; - if (smb_lock->fl->fl_type == F_UNLCK) { - if (cmp_lock->fl->fl_file == smb_lock->fl->fl_file && + if (lock_is_unlock(smb_lock->fl)) { + if (cmp_lock->fl->c.flc_file == smb_lock->fl->c.flc_file && cmp_lock->start == smb_lock->start && cmp_lock->end == smb_lock->end && !lock_defer_pending(cmp_lock->fl)) { @@ -7009,7 +7013,7 @@ int smb2_lock(struct ksmbd_work *work) continue; } - if (cmp_lock->fl->fl_file == smb_lock->fl->fl_file) { + if (cmp_lock->fl->c.flc_file == smb_lock->fl->c.flc_file) { if (smb_lock->flags & SMB2_LOCKFLAG_SHARED) continue; } else { @@ -7051,7 +7055,7 @@ int smb2_lock(struct ksmbd_work *work) } up_read(&conn_list_lock); out_check_cl: - if (smb_lock->fl->fl_type == F_UNLCK && nolock) { + if (lock_is_unlock(smb_lock->fl) && nolock) { pr_err("Try to unlock nolocked range\n"); rsp->hdr.Status = STATUS_RANGE_NOT_LOCKED; goto out; @@ -7175,7 +7179,7 @@ out: struct file_lock *rlock = NULL; rlock = smb_flock_init(filp); - rlock->fl_type = F_UNLCK; + rlock->c.flc_type = F_UNLCK; rlock->fl_start = smb_lock->start; rlock->fl_end = smb_lock->end; diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c index b49d47bdafc9..f29bb03f0dc4 100644 --- a/fs/smb/server/transport_ipc.c +++ b/fs/smb/server/transport_ipc.c @@ -74,7 +74,7 @@ static int handle_unsupported_event(struct sk_buff *skb, struct genl_info *info) static int handle_generic_event(struct sk_buff *skb, struct genl_info *info); static int ksmbd_ipc_heartbeat_request(void); -static const struct nla_policy ksmbd_nl_policy[KSMBD_EVENT_MAX] = { +static const struct nla_policy ksmbd_nl_policy[KSMBD_EVENT_MAX + 1] = { [KSMBD_EVENT_UNSPEC] = { .len = 0, }, @@ -403,7 +403,7 @@ static int handle_generic_event(struct sk_buff *skb, struct genl_info *info) return -EPERM; #endif - if (type >= KSMBD_EVENT_MAX) { + if (type > 
KSMBD_EVENT_MAX) { WARN_ON(1); return -EINVAL; } diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c index 9d4222154dcc..002a3f0dc7c5 100644 --- a/fs/smb/server/transport_tcp.c +++ b/fs/smb/server/transport_tcp.c @@ -365,6 +365,7 @@ static int ksmbd_tcp_readv(struct tcp_transport *t, struct kvec *iov_orig, * @t: TCP transport instance * @buf: buffer to store read data from socket * @to_read: number of bytes to read from socket + * @max_retries: number of retries if reading from socket fails * * Return: on success return number of bytes read from socket, * otherwise return error number @@ -416,6 +417,7 @@ static void tcp_destroy_socket(struct socket *ksmbd_socket) /** * create_socket - create socket for ksmbd/0 + * @iface: interface to bind the created socket to * * Return: 0 on success, error number otherwise */ diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index a6961bfe3e13..c487e834331a 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -337,18 +337,18 @@ static int check_lock_range(struct file *filp, loff_t start, loff_t end, return 0; spin_lock(&ctx->flc_lock); - list_for_each_entry(flock, &ctx->flc_posix, fl_list) { + for_each_file_lock(flock, &ctx->flc_posix) { /* check conflict locks */ if (flock->fl_end >= start && end >= flock->fl_start) { - if (flock->fl_type == F_RDLCK) { + if (lock_is_read(flock)) { if (type == WRITE) { pr_err("not allow write by shared lock\n"); error = 1; goto out; } - } else if (flock->fl_type == F_WRLCK) { + } else if (lock_is_write(flock)) { /* check owner in lock */ - if (flock->fl_file != filp) { + if (flock->c.flc_file != filp) { error = 1; pr_err("not allow rw access by exclusive lock from other opens\n"); goto out; @@ -1837,13 +1837,13 @@ int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work, void ksmbd_vfs_posix_lock_wait(struct file_lock *flock) { - wait_event(flock->fl_wait, !flock->fl_blocker); + wait_event(flock->c.flc_wait, !flock->c.flc_blocker); } int ksmbd_vfs_posix_lock_wait_timeout(struct file_lock *flock, long timeout) { - return wait_event_interruptible_timeout(flock->fl_wait, - !flock->fl_blocker, + return wait_event_interruptible_timeout(flock->c.flc_wait, + !flock->c.flc_blocker, timeout); } diff --git a/fs/super.c b/fs/super.c index d35e85295489..ee05ab6b37e7 100644 --- a/fs/super.c +++ b/fs/super.c @@ -274,9 +274,10 @@ static void destroy_super_work(struct work_struct *work) { struct super_block *s = container_of(work, struct super_block, destroy_work); - int i; - - for (i = 0; i < SB_FREEZE_LEVELS; i++) + security_sb_free(s); + put_user_ns(s->s_user_ns); + kfree(s->s_subtype); + for (int i = 0; i < SB_FREEZE_LEVELS; i++) percpu_free_rwsem(&s->s_writers.rw_sem[i]); kfree(s); } @@ -296,9 +297,6 @@ static void destroy_unused_super(struct super_block *s) super_unlock_excl(s); list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); - security_sb_free(s); - put_user_ns(s->s_user_ns); - kfree(s->s_subtype); shrinker_free(s->s_shrink); /* no delays needed */ destroy_super_work(&s->destroy_work); @@ -409,9 +407,6 @@ static void __put_super(struct super_block *s) WARN_ON(s->s_dentry_lru.node); WARN_ON(s->s_inode_lru.node); WARN_ON(!list_empty(&s->s_mounts)); - security_sb_free(s); - put_user_ns(s->s_user_ns); - kfree(s->s_subtype); call_rcu(&s->rcu, destroy_super_rcu); } } @@ -1532,16 +1527,16 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, struct fs_context *fc) { blk_mode_t mode = sb_open_mode(sb_flags); - struct bdev_handle *bdev_handle; + struct file *bdev_file; 
struct block_device *bdev; - bdev_handle = bdev_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops); - if (IS_ERR(bdev_handle)) { + bdev_file = bdev_file_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops); + if (IS_ERR(bdev_file)) { if (fc) errorf(fc, "%s: Can't open blockdev", fc->source); - return PTR_ERR(bdev_handle); + return PTR_ERR(bdev_file); } - bdev = bdev_handle->bdev; + bdev = file_bdev(bdev_file); /* * This really should be in blkdev_get_by_dev, but right now can't due @@ -1549,7 +1544,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, * writable from userspace even for a read-only block device. */ if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) { - bdev_release(bdev_handle); + fput(bdev_file); return -EACCES; } @@ -1560,11 +1555,11 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, if (atomic_read(&bdev->bd_fsfreeze_count) > 0) { if (fc) warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); - bdev_release(bdev_handle); + fput(bdev_file); return -EBUSY; } spin_lock(&sb_lock); - sb->s_bdev_handle = bdev_handle; + sb->s_bdev_file = bdev_file; sb->s_bdev = bdev; sb->s_bdi = bdi_get(bdev->bd_disk->bdi); if (bdev_stable_writes(bdev)) @@ -1680,7 +1675,7 @@ void kill_block_super(struct super_block *sb) generic_shutdown_super(sb); if (bdev) { sync_blockdev(bdev); - bdev_release(sb->s_bdev_handle); + fput(sb->s_bdev_file); } } diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 5a915b2e68f5..76bc2d5e75a9 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -336,7 +336,7 @@ int __init sysv_init_icache(void) { sysv_inode_cachep = kmem_cache_create("sysv_inode_cache", sizeof(struct sysv_inode_info), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT, + SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, init_once); if (!sysv_inode_cachep) return -ENOMEM; diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index 410ab2a44d2f..19bcb51a2203 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -83,9 +83,6 @@ static inline sysv_zone_t *block_end(struct buffer_head *bh) return (sysv_zone_t*)((char*)bh->b_data + bh->b_size); } -/* - * Requires read_lock(&pointers_lock) or write_lock(&pointers_lock) - */ static Indirect *get_branch(struct inode *inode, int depth, int offsets[], @@ -105,15 +102,18 @@ static Indirect *get_branch(struct inode *inode, bh = sb_bread(sb, block); if (!bh) goto failure; + read_lock(&pointers_lock); if (!verify_chain(chain, p)) goto changed; add_chain(++p, bh, (sysv_zone_t*)bh->b_data + *++offsets); + read_unlock(&pointers_lock); if (!p->key) goto no_block; } return NULL; changed: + read_unlock(&pointers_lock); brelse(bh); *err = -EAGAIN; goto no_block; @@ -219,9 +219,7 @@ static int get_block(struct inode *inode, sector_t iblock, struct buffer_head *b goto out; reread: - read_lock(&pointers_lock); partial = get_branch(inode, depth, offsets, chain, &err); - read_unlock(&pointers_lock); /* Simplest case - block found, no allocation needed */ if (!partial) { @@ -291,9 +289,9 @@ static Indirect *find_shared(struct inode *inode, *top = 0; for (k = depth; k > 1 && !offsets[k-1]; k--) ; + partial = get_branch(inode, k, offsets, chain, &err); write_lock(&pointers_lock); - partial = get_branch(inode, k, offsets, chain, &err); if (!partial) partial = chain + k-1; /* diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 6795fda2af19..110e8a272189 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -34,7 +34,15 @@ static DEFINE_MUTEX(eventfs_mutex); /* Choose something "unique" ;-) */ #define EVENTFS_FILE_INODE_INO 0x12c4e37 -#define 
EVENTFS_DIR_INODE_INO 0x134b2f5 + +/* Just try to make something consistent and unique */ +static int eventfs_dir_ino(struct eventfs_inode *ei) +{ + if (!ei->ino) + ei->ino = get_next_ino(); + + return ei->ino; +} /* * The eventfs_inode (ei) itself is protected by SRCU. It is released from @@ -54,6 +62,46 @@ enum { #define EVENTFS_MODE_MASK (EVENTFS_SAVE_MODE - 1) +/* + * eventfs_inode reference count management. + * + * NOTE! We count only references from dentries, in the + * form 'dentry->d_fsdata'. There are also references from + * directory inodes ('ti->private'), but the dentry reference + * count is always a superset of the inode reference count. + */ +static void release_ei(struct kref *ref) +{ + struct eventfs_inode *ei = container_of(ref, struct eventfs_inode, kref); + + WARN_ON_ONCE(!ei->is_freed); + + kfree(ei->entry_attrs); + kfree_const(ei->name); + kfree_rcu(ei, rcu); +} + +static inline void put_ei(struct eventfs_inode *ei) +{ + if (ei) + kref_put(&ei->kref, release_ei); +} + +static inline void free_ei(struct eventfs_inode *ei) +{ + if (ei) { + ei->is_freed = 1; + put_ei(ei); + } +} + +static inline struct eventfs_inode *get_ei(struct eventfs_inode *ei) +{ + if (ei) + kref_get(&ei->kref); + return ei; +} + static struct dentry *eventfs_root_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); @@ -148,33 +196,30 @@ static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry, return ret; } -static void update_top_events_attr(struct eventfs_inode *ei, struct dentry *dentry) +static void update_top_events_attr(struct eventfs_inode *ei, struct super_block *sb) { - struct inode *inode; + struct inode *root; /* Only update if the "events" was on the top level */ if (!ei || !(ei->attr.mode & EVENTFS_TOPLEVEL)) return; /* Get the tracefs root inode. */ - inode = d_inode(dentry->d_sb->s_root); - ei->attr.uid = inode->i_uid; - ei->attr.gid = inode->i_gid; + root = d_inode(sb->s_root); + ei->attr.uid = root->i_uid; + ei->attr.gid = root->i_gid; } static void set_top_events_ownership(struct inode *inode) { struct tracefs_inode *ti = get_tracefs(inode); struct eventfs_inode *ei = ti->private; - struct dentry *dentry; /* The top events directory doesn't get automatically updated */ if (!ei || !ei->is_events || !(ei->attr.mode & EVENTFS_TOPLEVEL)) return; - dentry = ei->dentry; - - update_top_events_attr(ei, dentry); + update_top_events_attr(ei, inode->i_sb); if (!(ei->attr.mode & EVENTFS_SAVE_UID)) inode->i_uid = ei->attr.uid; @@ -225,10 +270,11 @@ static struct eventfs_inode *eventfs_find_events(struct dentry *dentry) { struct eventfs_inode *ei; - mutex_lock(&eventfs_mutex); do { - /* The parent always has an ei, except for events itself */ - ei = dentry->d_parent->d_fsdata; + // The parent is stable because we do not do renames + dentry = dentry->d_parent; + // ... 
and directories always have d_fsdata + ei = dentry->d_fsdata; /* * If the ei is being freed, the ownership of the children @@ -238,12 +284,10 @@ static struct eventfs_inode *eventfs_find_events(struct dentry *dentry) ei = NULL; break; } - - dentry = ei->dentry; + // Walk upwards until you find the events inode } while (!ei->is_events); - mutex_unlock(&eventfs_mutex); - update_top_events_attr(ei, dentry); + update_top_events_attr(ei, dentry->d_sb); return ei; } @@ -273,50 +317,11 @@ static void update_inode_attr(struct dentry *dentry, struct inode *inode, inode->i_gid = attr->gid; } -static void update_gid(struct eventfs_inode *ei, kgid_t gid, int level) -{ - struct eventfs_inode *ei_child; - - /* at most we have events/system/event */ - if (WARN_ON_ONCE(level > 3)) - return; - - ei->attr.gid = gid; - - if (ei->entry_attrs) { - for (int i = 0; i < ei->nr_entries; i++) { - ei->entry_attrs[i].gid = gid; - } - } - - /* - * Only eventfs_inode with dentries are updated, make sure - * all eventfs_inodes are updated. If one of the children - * do not have a dentry, this function must traverse it. - */ - list_for_each_entry_srcu(ei_child, &ei->children, list, - srcu_read_lock_held(&eventfs_srcu)) { - if (!ei_child->dentry) - update_gid(ei_child, gid, level + 1); - } -} - -void eventfs_update_gid(struct dentry *dentry, kgid_t gid) -{ - struct eventfs_inode *ei = dentry->d_fsdata; - int idx; - - idx = srcu_read_lock(&eventfs_srcu); - update_gid(ei, gid, 0); - srcu_read_unlock(&eventfs_srcu, idx); -} - /** - * create_file - create a file in the tracefs filesystem - * @name: the name of the file to create. + * lookup_file - look up a file in the tracefs filesystem + * @dentry: the dentry to look up * @mode: the permission that the file should have. * @attr: saved attributes changed by user - * @parent: parent dentry for this file. * @data: something that the caller will want to get to later on. * @fop: struct file_operations that should be used for this file. * @@ -324,30 +329,25 @@ void eventfs_update_gid(struct dentry *dentry, kgid_t gid) * directory. The inode.i_private pointer will point to @data in the open() * call. 
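The kref scheme introduced above replaces the old web of cached dentry pointers: every dentry holds a counted reference in d_fsdata, while the ti->private pointers are deliberately left uncounted since, per the NOTE in this hunk, the dentry references are always a superset of them. Condensed to its moves, using the helpers from this hunk:

    kref_init(&ei->kref);            /* creator's reference, in alloc_ei() */
    dentry->d_fsdata = get_ei(ei);   /* one kref_get() per dentry */
    put_ei(ei);                      /* kref_put(&ei->kref, release_ei) */

    /* deletion path: mark dead, then drop the creator's reference */
    ei->is_freed = 1;
    put_ei(ei);                      /* final put -> release_ei() -> kfree_rcu() */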
*/ -static struct dentry *create_file(const char *name, umode_t mode, +static struct dentry *lookup_file(struct eventfs_inode *parent_ei, + struct dentry *dentry, + umode_t mode, struct eventfs_attr *attr, - struct dentry *parent, void *data, + void *data, const struct file_operations *fop) { struct tracefs_inode *ti; - struct dentry *dentry; struct inode *inode; if (!(mode & S_IFMT)) mode |= S_IFREG; if (WARN_ON_ONCE(!S_ISREG(mode))) - return NULL; - - WARN_ON_ONCE(!parent); - dentry = eventfs_start_creating(name, parent); - - if (IS_ERR(dentry)) - return dentry; + return ERR_PTR(-EIO); inode = tracefs_get_inode(dentry->d_sb); if (unlikely(!inode)) - return eventfs_failed_creating(dentry); + return ERR_PTR(-ENOMEM); /* If the user updated the directory's attributes, use them */ update_inode_attr(dentry, inode, attr, mode); @@ -361,32 +361,31 @@ static struct dentry *create_file(const char *name, umode_t mode, ti = get_tracefs(inode); ti->flags |= TRACEFS_EVENT_INODE; - d_instantiate(dentry, inode); - fsnotify_create(dentry->d_parent->d_inode, dentry); - return eventfs_end_creating(dentry); + + // Files have their parent's ei as their fsdata + dentry->d_fsdata = get_ei(parent_ei); + + d_add(dentry, inode); + return NULL; }; /** - * create_dir - create a dir in the tracefs filesystem + * lookup_dir_entry - look up a dir in the tracefs filesystem + * @dentry: the directory to look up * @ei: the eventfs_inode that represents the directory to create - * @parent: parent dentry for this file. * - * This function will create a dentry for a directory represented by + * This function will look up a dentry for a directory represented by * a eventfs_inode. */ -static struct dentry *create_dir(struct eventfs_inode *ei, struct dentry *parent) +static struct dentry *lookup_dir_entry(struct dentry *dentry, + struct eventfs_inode *pei, struct eventfs_inode *ei) { struct tracefs_inode *ti; - struct dentry *dentry; struct inode *inode; - dentry = eventfs_start_creating(ei->name, parent); - if (IS_ERR(dentry)) - return dentry; - inode = tracefs_get_inode(dentry->d_sb); if (unlikely(!inode)) - return eventfs_failed_creating(dentry); + return ERR_PTR(-ENOMEM); /* If the user updated the directory's attributes, use them */ update_inode_attr(dentry, inode, &ei->attr, @@ -396,68 +395,50 @@ static struct dentry *create_dir(struct eventfs_inode *ei, struct dentry *parent inode->i_fop = &eventfs_file_operations; /* All directories will have the same inode number */ - inode->i_ino = EVENTFS_DIR_INODE_INO; + inode->i_ino = eventfs_dir_ino(ei); ti = get_tracefs(inode); ti->flags |= TRACEFS_EVENT_INODE; + /* Only directories have ti->private set to an ei, not files */ + ti->private = ei; - inc_nlink(inode); - d_instantiate(dentry, inode); - inc_nlink(dentry->d_parent->d_inode); - fsnotify_mkdir(dentry->d_parent->d_inode, dentry); - return eventfs_end_creating(dentry); + dentry->d_fsdata = get_ei(ei); + + d_add(dentry, inode); + return NULL; } -static void free_ei(struct eventfs_inode *ei) +static inline struct eventfs_inode *alloc_ei(const char *name) { - kfree_const(ei->name); - kfree(ei->d_children); - kfree(ei->entry_attrs); - kfree(ei); + struct eventfs_inode *ei = kzalloc(sizeof(*ei), GFP_KERNEL); + + if (!ei) + return NULL; + + ei->name = kstrdup_const(name, GFP_KERNEL); + if (!ei->name) { + kfree(ei); + return NULL; + } + kref_init(&ei->kref); + return ei; } /** - * eventfs_set_ei_status_free - remove the dentry reference from an eventfs_inode - * @ti: the tracefs_inode of the dentry + * eventfs_d_release - 
dentry is going away * @dentry: dentry which has the reference to remove. * * Remove the association between a dentry from an eventfs_inode. */ -void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry) +void eventfs_d_release(struct dentry *dentry) { - struct eventfs_inode *ei; - int i; - - mutex_lock(&eventfs_mutex); - - ei = dentry->d_fsdata; - if (!ei) - goto out; - - /* This could belong to one of the files of the ei */ - if (ei->dentry != dentry) { - for (i = 0; i < ei->nr_entries; i++) { - if (ei->d_children[i] == dentry) - break; - } - if (WARN_ON_ONCE(i == ei->nr_entries)) - goto out; - ei->d_children[i] = NULL; - } else if (ei->is_freed) { - free_ei(ei); - } else { - ei->dentry = NULL; - } - - dentry->d_fsdata = NULL; - out: - mutex_unlock(&eventfs_mutex); + put_ei(dentry->d_fsdata); } /** - * create_file_dentry - create a dentry for a file of an eventfs_inode + * lookup_file_dentry - create a dentry for a file of an eventfs_inode * @ei: the eventfs_inode that the file will be created under - * @idx: the index into the d_children[] of the @ei + * @idx: the index into the entry_attrs[] of the @ei * @parent: The parent dentry of the created file. * @name: The name of the file to create * @mode: The mode of the file. @@ -468,163 +449,17 @@ void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry) * address located at @e_dentry. */ static struct dentry * -create_file_dentry(struct eventfs_inode *ei, int idx, - struct dentry *parent, const char *name, umode_t mode, void *data, +lookup_file_dentry(struct dentry *dentry, + struct eventfs_inode *ei, int idx, + umode_t mode, void *data, const struct file_operations *fops) { struct eventfs_attr *attr = NULL; - struct dentry **e_dentry = &ei->d_children[idx]; - struct dentry *dentry; - - WARN_ON_ONCE(!inode_is_locked(parent->d_inode)); - mutex_lock(&eventfs_mutex); - if (ei->is_freed) { - mutex_unlock(&eventfs_mutex); - return NULL; - } - /* If the e_dentry already has a dentry, use it */ - if (*e_dentry) { - dget(*e_dentry); - mutex_unlock(&eventfs_mutex); - return *e_dentry; - } - - /* ei->entry_attrs are protected by SRCU */ if (ei->entry_attrs) attr = &ei->entry_attrs[idx]; - mutex_unlock(&eventfs_mutex); - - dentry = create_file(name, mode, attr, parent, data, fops); - - mutex_lock(&eventfs_mutex); - - if (IS_ERR_OR_NULL(dentry)) { - /* - * When the mutex was released, something else could have - * created the dentry for this e_dentry. In which case - * use that one. - * - * If ei->is_freed is set, the e_dentry is currently on its - * way to being freed, don't return it. If e_dentry is NULL - * it means it was already freed. - */ - if (ei->is_freed) { - dentry = NULL; - } else { - dentry = *e_dentry; - dget(dentry); - } - mutex_unlock(&eventfs_mutex); - return dentry; - } - - if (!*e_dentry && !ei->is_freed) { - *e_dentry = dentry; - dentry->d_fsdata = ei; - } else { - /* - * Should never happen unless we get here due to being freed. - * Otherwise it means two dentries exist with the same name. 
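With lifetimes owned by krefs, cleanup hooks into the dcache instead of the old bookkeeping: eventfs_d_release() above simply drops the reference that lookup stashed in d_fsdata. The pairing, in sketch form (the my_ prefix is illustrative):

    /* at lookup time */
    dentry->d_fsdata = get_ei(ei);
    d_add(dentry, inode);

    /* when the dcache finally discards the dentry */
    static void my_d_release(struct dentry *dentry)
    {
            put_ei(dentry->d_fsdata);   /* balances the get_ei() above */
    }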
- */ - WARN_ON_ONCE(!ei->is_freed); - dentry = NULL; - } - mutex_unlock(&eventfs_mutex); - - return dentry; -} - -/** - * eventfs_post_create_dir - post create dir routine - * @ei: eventfs_inode of recently created dir - * - * Map the meta-data of files within an eventfs dir to their parent dentry - */ -static void eventfs_post_create_dir(struct eventfs_inode *ei) -{ - struct eventfs_inode *ei_child; - struct tracefs_inode *ti; - - lockdep_assert_held(&eventfs_mutex); - - /* srcu lock already held */ - /* fill parent-child relation */ - list_for_each_entry_srcu(ei_child, &ei->children, list, - srcu_read_lock_held(&eventfs_srcu)) { - ei_child->d_parent = ei->dentry; - } - - ti = get_tracefs(ei->dentry->d_inode); - ti->private = ei; -} - -/** - * create_dir_dentry - Create a directory dentry for the eventfs_inode - * @pei: The eventfs_inode parent of ei. - * @ei: The eventfs_inode to create the directory for - * @parent: The dentry of the parent of this directory - * - * This creates and attaches a directory dentry to the eventfs_inode @ei. - */ -static struct dentry * -create_dir_dentry(struct eventfs_inode *pei, struct eventfs_inode *ei, - struct dentry *parent) -{ - struct dentry *dentry = NULL; - - WARN_ON_ONCE(!inode_is_locked(parent->d_inode)); - - mutex_lock(&eventfs_mutex); - if (pei->is_freed || ei->is_freed) { - mutex_unlock(&eventfs_mutex); - return NULL; - } - if (ei->dentry) { - /* If the eventfs_inode already has a dentry, use it */ - dentry = ei->dentry; - dget(dentry); - mutex_unlock(&eventfs_mutex); - return dentry; - } - mutex_unlock(&eventfs_mutex); - - dentry = create_dir(ei, parent); - - mutex_lock(&eventfs_mutex); - - if (IS_ERR_OR_NULL(dentry) && !ei->is_freed) { - /* - * When the mutex was released, something else could have - * created the dentry for this e_dentry. In which case - * use that one. - * - * If ei->is_freed is set, the e_dentry is currently on its - * way to being freed. - */ - dentry = ei->dentry; - if (dentry) - dget(dentry); - mutex_unlock(&eventfs_mutex); - return dentry; - } - - if (!ei->dentry && !ei->is_freed) { - ei->dentry = dentry; - eventfs_post_create_dir(ei); - dentry->d_fsdata = ei; - } else { - /* - * Should never happen unless we get here due to being freed. - * Otherwise it means two dentries exist with the same name. - */ - WARN_ON_ONCE(!ei->is_freed); - dentry = NULL; - } - mutex_unlock(&eventfs_mutex); - - return dentry; + return lookup_file(ei, dentry, mode, attr, data, fops); } /** @@ -641,79 +476,50 @@ static struct dentry *eventfs_root_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - const struct file_operations *fops; - const struct eventfs_entry *entry; struct eventfs_inode *ei_child; struct tracefs_inode *ti; struct eventfs_inode *ei; - struct dentry *ei_dentry = NULL; - struct dentry *ret = NULL; - struct dentry *d; const char *name = dentry->d_name.name; - umode_t mode; - void *data; - int idx; - int i; - int r; + struct dentry *result = NULL; ti = get_tracefs(dir); if (!(ti->flags & TRACEFS_EVENT_INODE)) - return NULL; - - /* Grab srcu to prevent the ei from going away */ - idx = srcu_read_lock(&eventfs_srcu); + return ERR_PTR(-EIO); - /* - * Grab the eventfs_mutex to consistent value from ti->private. 
- * This s - */ mutex_lock(&eventfs_mutex); - ei = READ_ONCE(ti->private); - if (ei && !ei->is_freed) - ei_dentry = READ_ONCE(ei->dentry); - mutex_unlock(&eventfs_mutex); - if (!ei || !ei_dentry) + ei = ti->private; + if (!ei || ei->is_freed) goto out; - data = ei->data; - - list_for_each_entry_srcu(ei_child, &ei->children, list, - srcu_read_lock_held(&eventfs_srcu)) { + list_for_each_entry(ei_child, &ei->children, list) { if (strcmp(ei_child->name, name) != 0) continue; - ret = simple_lookup(dir, dentry, flags); - if (IS_ERR(ret)) + if (ei_child->is_freed) goto out; - d = create_dir_dentry(ei, ei_child, ei_dentry); - dput(d); + result = lookup_dir_entry(dentry, ei, ei_child); goto out; } - for (i = 0; i < ei->nr_entries; i++) { - entry = &ei->entries[i]; - if (strcmp(name, entry->name) == 0) { - void *cdata = data; - mutex_lock(&eventfs_mutex); - /* If ei->is_freed, then the event itself may be too */ - if (!ei->is_freed) - r = entry->callback(name, &mode, &cdata, &fops); - else - r = -1; - mutex_unlock(&eventfs_mutex); - if (r <= 0) - continue; - ret = simple_lookup(dir, dentry, flags); - if (IS_ERR(ret)) - goto out; - d = create_file_dentry(ei, i, ei_dentry, name, mode, cdata, fops); - dput(d); - break; - } + for (int i = 0; i < ei->nr_entries; i++) { + void *data; + umode_t mode; + const struct file_operations *fops; + const struct eventfs_entry *entry = &ei->entries[i]; + + if (strcmp(name, entry->name) != 0) + continue; + + data = ei->data; + if (entry->callback(name, &mode, &data, &fops) <= 0) + goto out; + + result = lookup_file_dentry(dentry, ei, i, mode, data, fops); + goto out; } out: - srcu_read_unlock(&eventfs_srcu, idx); - return ret; + mutex_unlock(&eventfs_mutex); + return result; } /* @@ -802,7 +608,7 @@ static int eventfs_iterate(struct file *file, struct dir_context *ctx) name = ei_child->name; - ino = EVENTFS_DIR_INODE_INO; + ino = eventfs_dir_ino(ei_child); if (!dir_emit(ctx, name, strlen(name), ino, DT_DIR)) goto out_dec; @@ -863,25 +669,10 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode if (!parent) return ERR_PTR(-EINVAL); - ei = kzalloc(sizeof(*ei), GFP_KERNEL); + ei = alloc_ei(name); if (!ei) return ERR_PTR(-ENOMEM); - ei->name = kstrdup_const(name, GFP_KERNEL); - if (!ei->name) { - kfree(ei); - return ERR_PTR(-ENOMEM); - } - - if (size) { - ei->d_children = kcalloc(size, sizeof(*ei->d_children), GFP_KERNEL); - if (!ei->d_children) { - kfree_const(ei->name); - kfree(ei); - return ERR_PTR(-ENOMEM); - } - } - ei->entries = entries; ei->nr_entries = size; ei->data = data; @@ -889,10 +680,8 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode INIT_LIST_HEAD(&ei->list); mutex_lock(&eventfs_mutex); - if (!parent->is_freed) { + if (!parent->is_freed) list_add_tail(&ei->list, &parent->children); - ei->d_parent = parent->dentry; - } mutex_unlock(&eventfs_mutex); /* Was the parent freed? 
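eventfs_create_dir() now funnels allocation through alloc_ei(), which pairs kstrdup_const() with the kfree_const() in release_ei(): when the passed-in name points into .rodata no copy is made, and the matching free is a no-op. The idiom in isolation:

    const char *name = kstrdup_const(src, GFP_KERNEL);  /* may alias .rodata */

    if (!name)
            return -ENOMEM;
    /* ... use name ... */
    kfree_const(name);   /* frees only if a real copy was made */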
*/ @@ -932,28 +721,20 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry if (IS_ERR(dentry)) return ERR_CAST(dentry); - ei = kzalloc(sizeof(*ei), GFP_KERNEL); + ei = alloc_ei(name); if (!ei) - goto fail_ei; + goto fail; inode = tracefs_get_inode(dentry->d_sb); if (unlikely(!inode)) goto fail; - if (size) { - ei->d_children = kcalloc(size, sizeof(*ei->d_children), GFP_KERNEL); - if (!ei->d_children) - goto fail; - } - - ei->dentry = dentry; + // Note: we have a ref to the dentry from tracefs_start_creating() + ei->events_dir = dentry; ei->entries = entries; ei->nr_entries = size; ei->is_events = 1; ei->data = data; - ei->name = kstrdup_const(name, GFP_KERNEL); - if (!ei->name) - goto fail; /* Save the ownership of this directory */ uid = d_inode(dentry->d_parent)->i_uid; @@ -984,11 +765,19 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry inode->i_op = &eventfs_root_dir_inode_operations; inode->i_fop = &eventfs_file_operations; - dentry->d_fsdata = ei; + dentry->d_fsdata = get_ei(ei); - /* directory inodes start off with i_nlink == 2 (for "." entry) */ - inc_nlink(inode); + /* + * Keep all eventfs directories with i_nlink == 1. + * Due to the dynamic nature of the dentry creations and not + * wanting to add a pointer to the parent eventfs_inode in the + * eventfs_inode structure, keeping the i_nlink in sync with the + * number of directories would cause too much complexity for + * something not worth much. Keeping directory links at 1 + * tells userspace not to trust the link number. + */ d_instantiate(dentry, inode); + /* The dentry of the "events" parent does keep track though */ inc_nlink(dentry->d_parent->d_inode); fsnotify_mkdir(dentry->d_parent->d_inode, dentry); tracefs_end_creating(dentry); @@ -996,72 +785,11 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry return ei; fail: - kfree(ei->d_children); - kfree(ei); - fail_ei: + free_ei(ei); tracefs_failed_creating(dentry); return ERR_PTR(-ENOMEM); } -static LLIST_HEAD(free_list); - -static void eventfs_workfn(struct work_struct *work) -{ - struct eventfs_inode *ei, *tmp; - struct llist_node *llnode; - - llnode = llist_del_all(&free_list); - llist_for_each_entry_safe(ei, tmp, llnode, llist) { - /* This dput() matches the dget() from unhook_dentry() */ - for (int i = 0; i < ei->nr_entries; i++) { - if (ei->d_children[i]) - dput(ei->d_children[i]); - } - /* This should only get here if it had a dentry */ - if (!WARN_ON_ONCE(!ei->dentry)) - dput(ei->dentry); - } -} - -static DECLARE_WORK(eventfs_work, eventfs_workfn); - -static void free_rcu_ei(struct rcu_head *head) -{ - struct eventfs_inode *ei = container_of(head, struct eventfs_inode, rcu); - - if (ei->dentry) { - /* Do not free the ei until all references of dentry are gone */ - if (llist_add(&ei->llist, &free_list)) - queue_work(system_unbound_wq, &eventfs_work); - return; - } - - /* If the ei doesn't have a dentry, neither should its children */ - for (int i = 0; i < ei->nr_entries; i++) { - WARN_ON_ONCE(ei->d_children[i]); - } - - free_ei(ei); -} - -static void unhook_dentry(struct dentry *dentry) -{ - if (!dentry) - return; - /* - * Need to add a reference to the dentry that is expected by - * simple_recursive_removal(), which will include a dput(). - */ - dget(dentry); - - /* - * Also add a reference for the dput() in eventfs_workfn(). - * That is required as that dput() will free the ei after - * the SRCU grace period is over. 
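The machinery being deleted here, eventfs_workfn() plus free_rcu_ei(), was an instance of the lockless-list deferred-free idiom: producers push nodes onto an llist from any context, and the push that finds the list empty schedules a worker to drain it. It is no longer needed once krefs own the lifetime, but for reference, a generic sketch (struct item and its llnode member are illustrative):

    static LLIST_HEAD(free_list);

    static void free_workfn(struct work_struct *work)
    {
            struct item *it, *tmp;
            struct llist_node *node = llist_del_all(&free_list);

            llist_for_each_entry_safe(it, tmp, node, llnode)
                    kfree(it);
    }
    static DECLARE_WORK(free_work, free_workfn);

    static void defer_free(struct item *it)
    {
            /* llist_add() returns true only if the list was empty */
            if (llist_add(&it->llnode, &free_list))
                    queue_work(system_unbound_wq, &free_work);
    }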
- */ - dget(dentry); -} - /** * eventfs_remove_rec - remove eventfs dir or file from list * @ei: eventfs_inode to be removed. @@ -1074,8 +802,6 @@ static void eventfs_remove_rec(struct eventfs_inode *ei, int level) { struct eventfs_inode *ei_child; - if (!ei) - return; /* * Check recursion depth. It should never be greater than 3: * 0 - events/ @@ -1087,28 +813,11 @@ static void eventfs_remove_rec(struct eventfs_inode *ei, int level) return; /* search for nested folders or files */ - list_for_each_entry_srcu(ei_child, &ei->children, list, - lockdep_is_held(&eventfs_mutex)) { - /* Children only have dentry if parent does */ - WARN_ON_ONCE(ei_child->dentry && !ei->dentry); + list_for_each_entry(ei_child, &ei->children, list) eventfs_remove_rec(ei_child, level + 1); - } - - - ei->is_freed = 1; - for (int i = 0; i < ei->nr_entries; i++) { - if (ei->d_children[i]) { - /* Children only have dentry if parent does */ - WARN_ON_ONCE(!ei->dentry); - unhook_dentry(ei->d_children[i]); - } - } - - unhook_dentry(ei->dentry); - - list_del_rcu(&ei->list); - call_srcu(&eventfs_srcu, &ei->rcu, free_rcu_ei); + list_del(&ei->list); + free_ei(ei); } /** @@ -1119,22 +828,12 @@ static void eventfs_remove_rec(struct eventfs_inode *ei, int level) */ void eventfs_remove_dir(struct eventfs_inode *ei) { - struct dentry *dentry; - if (!ei) return; mutex_lock(&eventfs_mutex); - dentry = ei->dentry; eventfs_remove_rec(ei, 0); mutex_unlock(&eventfs_mutex); - - /* - * If any of the ei children has a dentry, then the ei itself - * must have a dentry. - */ - if (dentry) - simple_recursive_removal(dentry, NULL); } /** @@ -1147,7 +846,11 @@ void eventfs_remove_events_dir(struct eventfs_inode *ei) { struct dentry *dentry; - dentry = ei->dentry; + dentry = ei->events_dir; + if (!dentry) + return; + + ei->events_dir = NULL; eventfs_remove_dir(ei); /* @@ -1157,5 +860,6 @@ void eventfs_remove_events_dir(struct eventfs_inode *ei) * sticks around while the other ei->dentry are created * and destroyed dynamically. */ + d_invalidate(dentry); dput(dentry); } diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index e1b172c0e091..5545e6bf7d26 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -38,8 +38,6 @@ static struct inode *tracefs_alloc_inode(struct super_block *sb) if (!ti) return NULL; - ti->flags = 0; - return &ti->vfs_inode; } @@ -379,21 +377,30 @@ static const struct super_operations tracefs_super_operations = { .show_options = tracefs_show_options, }; -static void tracefs_dentry_iput(struct dentry *dentry, struct inode *inode) +/* + * It would be cleaner if eventfs had its own dentry ops. + * + * Note that d_revalidate is called potentially under RCU, + * so it can't take the eventfs mutex etc. It's fine - if + * we open a file just as it's marked dead, things will + * still work just fine, and just see the old stale case. 
+ */ +static void tracefs_d_release(struct dentry *dentry) { - struct tracefs_inode *ti; + if (dentry->d_fsdata) + eventfs_d_release(dentry); +} - if (!dentry || !inode) - return; +static int tracefs_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct eventfs_inode *ei = dentry->d_fsdata; - ti = get_tracefs(inode); - if (ti && ti->flags & TRACEFS_EVENT_INODE) - eventfs_set_ei_status_free(ti, dentry); - iput(inode); + return !(ei && ei->is_freed); } static const struct dentry_operations tracefs_dentry_operations = { - .d_iput = tracefs_dentry_iput, + .d_revalidate = tracefs_d_revalidate, + .d_release = tracefs_d_release, }; static int trace_fill_super(struct super_block *sb, void *data, int silent) @@ -497,75 +504,6 @@ struct dentry *tracefs_end_creating(struct dentry *dentry) return dentry; } -/** - * eventfs_start_creating - start the process of creating a dentry - * @name: Name of the file created for the dentry - * @parent: The parent dentry where this dentry will be created - * - * This is a simple helper function for the dynamically created eventfs - * files. When the directory of the eventfs files are accessed, their - * dentries are created on the fly. This function is used to start that - * process. - */ -struct dentry *eventfs_start_creating(const char *name, struct dentry *parent) -{ - struct dentry *dentry; - int error; - - /* Must always have a parent. */ - if (WARN_ON_ONCE(!parent)) - return ERR_PTR(-EINVAL); - - error = simple_pin_fs(&trace_fs_type, &tracefs_mount, - &tracefs_mount_count); - if (error) - return ERR_PTR(error); - - if (unlikely(IS_DEADDIR(parent->d_inode))) - dentry = ERR_PTR(-ENOENT); - else - dentry = lookup_one_len(name, parent, strlen(name)); - - if (!IS_ERR(dentry) && dentry->d_inode) { - dput(dentry); - dentry = ERR_PTR(-EEXIST); - } - - if (IS_ERR(dentry)) - simple_release_fs(&tracefs_mount, &tracefs_mount_count); - - return dentry; -} - -/** - * eventfs_failed_creating - clean up a failed eventfs dentry creation - * @dentry: The dentry to clean up - * - * If after calling eventfs_start_creating(), a failure is detected, the - * resources created by eventfs_start_creating() needs to be cleaned up. In - * that case, this function should be called to perform that clean up. - */ -struct dentry *eventfs_failed_creating(struct dentry *dentry) -{ - dput(dentry); - simple_release_fs(&tracefs_mount, &tracefs_mount_count); - return NULL; -} - -/** - * eventfs_end_creating - Finish the process of creating a eventfs dentry - * @dentry: The dentry that has successfully been created. - * - * This function is currently just a place holder to match - * eventfs_start_creating(). In case any synchronization needs to be added, - * this function will be used to implement that without having to modify - * the callers of eventfs_start_creating(). 
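Earlier in this hunk tracefs swaps its lone ->d_iput hook for a ->d_revalidate/->d_release pair: revalidation returns 0 for a dead eventfs_inode, forcing a fresh lookup, and release drops the d_fsdata reference. Reduced to a sketch (my_ names illustrative; tracefs installs its table superblock-wide from trace_fill_super()):

    static int my_d_revalidate(struct dentry *dentry, unsigned int flags)
    {
            struct eventfs_inode *ei = dentry->d_fsdata;

            /* may run under RCU: no locks, no sleeping */
            return !(ei && ei->is_freed);   /* 0 == stale, look it up again */
    }

    static const struct dentry_operations my_dops = {
            .d_revalidate   = my_d_revalidate,
            .d_release      = my_d_release,
    };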
- */ -struct dentry *eventfs_end_creating(struct dentry *dentry) -{ - return dentry; -} - /* Find the inode that this will use for default */ static struct inode *instance_inode(struct dentry *parent, struct inode *inode) { @@ -779,7 +717,11 @@ static void init_once(void *foo) { struct tracefs_inode *ti = (struct tracefs_inode *) foo; + /* inode_init_once() calls memset() on the vfs_inode portion */ inode_init_once(&ti->vfs_inode); + + /* Zero out the rest */ + memset_after(ti, 0, vfs_inode); } static int __init tracefs_init(void) @@ -789,7 +731,6 @@ static int __init tracefs_init(void) tracefs_inode_cachep = kmem_cache_create("tracefs_inode_cache", sizeof(struct tracefs_inode), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD| SLAB_ACCOUNT), init_once); if (!tracefs_inode_cachep) diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h index 12b7d0150ae9..beb3dcd0e434 100644 --- a/fs/tracefs/internal.h +++ b/fs/tracefs/internal.h @@ -11,9 +11,10 @@ enum { }; struct tracefs_inode { + struct inode vfs_inode; + /* The below gets initialized with memset_after(ti, 0, vfs_inode) */ unsigned long flags; void *private; - struct inode vfs_inode; }; /* @@ -31,42 +32,37 @@ struct eventfs_attr { /* * struct eventfs_inode - hold the properties of the eventfs directories. * @list: link list into the parent directory + * @rcu: Union with @list for freeing + * @children: link list into the child eventfs_inode * @entries: the array of entries representing the files in the directory * @name: the name of the directory to create - * @children: link list into the child eventfs_inode - * @dentry: the dentry of the directory - * @d_parent: pointer to the parent's dentry - * @d_children: The array of dentries to represent the files when created + * @events_dir: the dentry of the events directory * @entry_attrs: Saved mode and ownership of the @d_children - * @attr: Saved mode and ownership of eventfs_inode itself * @data: The private data to pass to the callbacks + * @attr: Saved mode and ownership of eventfs_inode itself * @is_freed: Flag set if the eventfs is on its way to be freed * Note if is_freed is set, then dentry is corrupted. 
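The tracefs_inode reshuffle above exists to serve its slab constructor: with vfs_inode placed first, memset_after(ti, 0, vfs_inode) wipes every member that follows it, which is why tracefs_alloc_inode() no longer clears ti->flags by hand. The pattern in miniature:

    struct tracefs_inode {
            struct inode vfs_inode;     /* must stay first */
            /* everything below is zeroed by init_once() */
            unsigned long flags;
            void *private;
    };

    static void init_once(void *foo)
    {
            struct tracefs_inode *ti = foo;

            inode_init_once(&ti->vfs_inode);    /* memsets the inode part */
            memset_after(ti, 0, vfs_inode);     /* zero the trailing members */
    }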
+ * @is_events: Flag set for only the top level "events" directory * @nr_entries: The number of items in @entries + * @ino: The saved inode number */ struct eventfs_inode { - struct list_head list; + union { + struct list_head list; + struct rcu_head rcu; + }; + struct list_head children; const struct eventfs_entry *entries; const char *name; - struct list_head children; - struct dentry *dentry; /* Check is_freed to access */ - struct dentry *d_parent; - struct dentry **d_children; + struct dentry *events_dir; struct eventfs_attr *entry_attrs; - struct eventfs_attr attr; void *data; - /* - * Union - used for deletion - * @llist: for calling dput() if needed after RCU - * @rcu: eventfs_inode to delete in RCU - */ - union { - struct llist_node llist; - struct rcu_head rcu; - }; + struct eventfs_attr attr; + struct kref kref; unsigned int is_freed:1; unsigned int is_events:1; unsigned int nr_entries:30; + unsigned int ino; }; static inline struct tracefs_inode *get_tracefs(const struct inode *inode) @@ -78,10 +74,7 @@ struct dentry *tracefs_start_creating(const char *name, struct dentry *parent); struct dentry *tracefs_end_creating(struct dentry *dentry); struct dentry *tracefs_failed_creating(struct dentry *dentry); struct inode *tracefs_get_inode(struct super_block *sb); -struct dentry *eventfs_start_creating(const char *name, struct dentry *parent); -struct dentry *eventfs_failed_creating(struct dentry *dentry); -struct dentry *eventfs_end_creating(struct dentry *dentry); -void eventfs_update_gid(struct dentry *dentry, kgid_t gid); -void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry); + +void eventfs_d_release(struct dentry *dentry); #endif /* _TRACEFS_INTERNAL_H */ diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index e413a9cf8ee3..551148de66cd 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -205,7 +205,6 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, dbg_gen("'%pd' in dir ino %lu", dentry, dir->i_ino); err = fscrypt_prepare_lookup(dir, dentry, &nm); - generic_set_encrypted_ci_d_ops(dentry); if (err == -ENOENT) return d_splice_alias(NULL, dentry); if (err) diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 09e270d6ed02..7f4031a15f4d 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2239,13 +2239,14 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) goto out_umount; } + generic_set_sb_d_ops(sb); sb->s_root = d_make_root(root); if (!sb->s_root) { err = -ENOMEM; goto out_umount; } - import_uuid(&sb->s_uuid, c->uuid); + super_set_uuid(sb, c->uuid, sizeof(c->uuid)); mutex_unlock(&c->umount_mutex); return 0; @@ -2433,8 +2434,8 @@ static int __init ubifs_init(void) ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab", sizeof(struct ubifs_inode), 0, - SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT | - SLAB_ACCOUNT, &inode_slab_ctor); + SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, + &inode_slab_ctor); if (!ubifs_inode_slab) return -ENOMEM; diff --git a/fs/udf/super.c b/fs/udf/super.c index 928a04d9d9e0..6f420f4ca005 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -177,7 +177,6 @@ static int __init init_inodecache(void) udf_inode_cachep = kmem_cache_create("udf_inode_cache", sizeof(struct udf_inode_info), 0, (SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD | SLAB_ACCOUNT), init_once); if (!udf_inode_cachep) diff --git a/fs/ufs/super.c b/fs/ufs/super.c index a480810cd4e3..44666afc6209 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1470,8 +1470,7 @@ static int __init init_inodecache(void) { ufs_inode_cachep = 
kmem_cache_create_usercopy("ufs_inode_cache", sizeof(struct ufs_inode_info), 0, - (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| - SLAB_ACCOUNT), + (SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT), offsetof(struct ufs_inode_info, i_u1.i_symlink), sizeof_field(struct ufs_inode_info, i_u1.i_symlink), diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c index 1fb8f4df60cb..cabe8ac4fefc 100644 --- a/fs/vboxsf/super.c +++ b/fs/vboxsf/super.c @@ -339,8 +339,7 @@ static int vboxsf_setup(void) vboxsf_inode_cachep = kmem_cache_create("vboxsf_inode_cache", sizeof(struct vboxsf_inode), 0, - (SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | - SLAB_ACCOUNT), + SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, vboxsf_inode_init_once); if (!vboxsf_inode_cachep) { err = -ENOMEM; diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index a6a6b2749241..b3506f56e180 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -69,7 +69,6 @@ struct fsverity_info { u8 file_digest[FS_VERITY_MAX_DIGEST_SIZE]; const struct inode *inode; unsigned long *hash_block_verified; - spinlock_t hash_page_init_lock; }; #define FS_VERITY_MAX_SIGNATURE_SIZE (FS_VERITY_MAX_DESCRIPTOR_SIZE - \ diff --git a/fs/verity/measure.c b/fs/verity/measure.c index bf7a5f4cccaf..3969d54158d1 100644 --- a/fs/verity/measure.c +++ b/fs/verity/measure.c @@ -159,9 +159,9 @@ __bpf_kfunc int bpf_get_fsverity_digest(struct file *file, struct bpf_dynptr_ker __bpf_kfunc_end_defs(); -BTF_SET8_START(fsverity_set_ids) +BTF_KFUNCS_START(fsverity_set_ids) BTF_ID_FLAGS(func, bpf_get_fsverity_digest, KF_TRUSTED_ARGS) -BTF_SET8_END(fsverity_set_ids) +BTF_KFUNCS_END(fsverity_set_ids) static int bpf_get_fsverity_digest_filter(const struct bpf_prog *prog, u32 kfunc_id) { diff --git a/fs/verity/open.c b/fs/verity/open.c index 6c31a871b84b..fdeb95eca3af 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -239,7 +239,6 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode, err = -ENOMEM; goto fail; } - spin_lock_init(&vi->hash_page_init_lock); } return vi; diff --git a/fs/verity/verify.c b/fs/verity/verify.c index 904ccd7e8e16..4fcad0825a12 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -19,7 +19,6 @@ static struct workqueue_struct *fsverity_read_workqueue; static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage, unsigned long hblock_idx) { - bool verified; unsigned int blocks_per_page; unsigned int i; @@ -43,12 +42,20 @@ static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage, * re-instantiated from the backing storage are re-verified. To do * this, we use PG_checked again, but now it doesn't really mean * "checked". Instead, now it just serves as an indicator for whether - * the hash page is newly instantiated or not. + * the hash page is newly instantiated or not. If the page is new, as + * indicated by PG_checked=0, we clear the bitmap bits for the page's + * blocks since they are untrustworthy, then set PG_checked=1. + * Otherwise we return the bitmap bit for the requested block. * - * The first thread that sees PG_checked=0 must clear the corresponding - * bitmap bits, then set PG_checked=1. This requires a spinlock. To - * avoid having to take this spinlock in the common case of - * PG_checked=1, we start with an opportunistic lockless read. + * Multiple threads may execute this code concurrently on the same page. 
+ * This is safe because we use memory barriers to ensure that if a + * thread sees PG_checked=1, then it also sees the associated bitmap + * clearing to have occurred. Also, all writes and their corresponding + * reads are atomic, and all writes are safe to repeat in the event that + * multiple threads get into the PG_checked=0 section. (Clearing a + * bitmap bit again at worst causes a hash block to be verified + * redundantly. That event should be very rare, so it's not worth using + * a lock to avoid. Setting PG_checked again has no effect.) */ if (PageChecked(hpage)) { /* @@ -58,24 +65,17 @@ static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage, smp_rmb(); return test_bit(hblock_idx, vi->hash_block_verified); } - spin_lock(&vi->hash_page_init_lock); - if (PageChecked(hpage)) { - verified = test_bit(hblock_idx, vi->hash_block_verified); - } else { - blocks_per_page = vi->tree_params.blocks_per_page; - hblock_idx = round_down(hblock_idx, blocks_per_page); - for (i = 0; i < blocks_per_page; i++) - clear_bit(hblock_idx + i, vi->hash_block_verified); - /* - * A write memory barrier is needed here to give RELEASE - * semantics to the below SetPageChecked() operation. - */ - smp_wmb(); - SetPageChecked(hpage); - verified = false; - } - spin_unlock(&vi->hash_page_init_lock); - return verified; + blocks_per_page = vi->tree_params.blocks_per_page; + hblock_idx = round_down(hblock_idx, blocks_per_page); + for (i = 0; i < blocks_per_page; i++) + clear_bit(hblock_idx + i, vi->hash_block_verified); + /* + * A write memory barrier is needed here to give RELEASE semantics to + * the below SetPageChecked() operation. + */ + smp_wmb(); + SetPageChecked(hpage); + return false; } /* diff --git a/fs/xattr.c b/fs/xattr.c index 09d927603433..f8b643f91a98 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -16,7 +16,6 @@ #include <linux/mount.h> #include <linux/namei.h> #include <linux/security.h> -#include <linux/evm.h> #include <linux/syscalls.h> #include <linux/export.h> #include <linux/fsnotify.h> @@ -552,11 +551,11 @@ __vfs_removexattr_locked(struct mnt_idmap *idmap, goto out; error = __vfs_removexattr(idmap, dentry, name); + if (error) + return error; - if (!error) { - fsnotify_xattr(dentry); - evm_inode_post_removexattr(dentry, name); - } + fsnotify_xattr(dentry); + security_inode_post_removexattr(dentry, name); out: return error; diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 9976a00a73f9..e965a48e7db9 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -421,10 +421,10 @@ xfs_attr_complete_op( bool do_replace = args->op_flags & XFS_DA_OP_REPLACE; args->op_flags &= ~XFS_DA_OP_REPLACE; - if (do_replace) { - args->attr_filter &= ~XFS_ATTR_INCOMPLETE; + args->attr_filter &= ~XFS_ATTR_INCOMPLETE; + if (do_replace) return replace_state; - } + return XFS_DAS_DONE; } diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 31100120b2c5..e31663cb7b43 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1119,20 +1119,6 @@ xfs_rtbitmap_blockcount( } /* - * Compute the maximum level number of the realtime summary file, as defined by - * mkfs. The historic use of highbit32 on a 64-bit quantity prohibited correct - * use of rt volumes with more than 2^32 extents. 
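The fsverity rework above retires hash_page_init_lock in favor of a publish/consume barrier pairing: whichever thread finds PG_checked clear resets the per-block bits, issues smp_wmb(), then sets the flag; any reader that observes the flag issues smp_rmb() before trusting the bitmap. In miniature (hpage, bitmap and the index math are as in the hunk):

    /* initializer: runs while PG_checked is still 0 */
    for (i = 0; i < blocks_per_page; i++)
            clear_bit(base + i, bitmap);
    smp_wmb();                  /* publish the cleared bits ...       */
    SetPageChecked(hpage);      /* ... before anyone can see the flag */

    /* reader */
    if (PageChecked(hpage)) {
            smp_rmb();          /* pairs with the smp_wmb() above */
            return test_bit(idx, bitmap);
    }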
- */ -uint8_t -xfs_compute_rextslog( - xfs_rtbxlen_t rtextents) -{ - if (!rtextents) - return 0; - return xfs_highbit64(rtextents); -} - -/* * Compute the number of rtbitmap words needed to populate every block of a * bitmap that is large enough to track the given number of rt extents. */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h index 274dc7dae1fa..152a66750af5 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.h +++ b/fs/xfs/libxfs/xfs_rtbitmap.h @@ -351,20 +351,6 @@ xfs_rtfree_extent( int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno, xfs_filblks_t rtlen); -uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents); - -/* Do we support an rt volume having this number of rtextents? */ -static inline bool -xfs_validate_rtextents( - xfs_rtbxlen_t rtextents) -{ - /* No runt rt volumes */ - if (rtextents == 0) - return false; - - return true; -} - xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents); unsigned long long xfs_rtbitmap_wordcount(struct xfs_mount *mp, @@ -383,8 +369,6 @@ unsigned long long xfs_rtsummary_wordcount(struct xfs_mount *mp, # define xfs_rtsummary_read_buf(a,b) (-ENOSYS) # define xfs_rtbuf_cache_relse(a) (0) # define xfs_rtalloc_extent_is_free(m,t,s,l,i) (-ENOSYS) -# define xfs_compute_rextslog(rtx) (0) -# define xfs_validate_rtextents(rtx) (false) static inline xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents) { diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 4a9e8588f4c9..5bb6e2bd6dee 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -1377,3 +1377,17 @@ xfs_validate_stripe_geometry( } return true; } + +/* + * Compute the maximum level number of the realtime summary file, as defined by + * mkfs. The historic use of highbit32 on a 64-bit quantity prohibited correct + * use of rt volumes with more than 2^32 extents. + */ +uint8_t +xfs_compute_rextslog( + xfs_rtbxlen_t rtextents) +{ + if (!rtextents) + return 0; + return xfs_highbit64(rtextents); +} diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h index 19134b23c10b..2e8e8d63d4eb 100644 --- a/fs/xfs/libxfs/xfs_sb.h +++ b/fs/xfs/libxfs/xfs_sb.h @@ -38,4 +38,6 @@ extern int xfs_sb_get_secondary(struct xfs_mount *mp, extern bool xfs_validate_stripe_geometry(struct xfs_mount *mp, __s64 sunit, __s64 swidth, int sectorsize, bool silent); +uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents); + #endif /* __XFS_SB_H__ */ diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 20b5375f2d9c..62e02d5380ad 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -251,4 +251,16 @@ bool xfs_verify_fileoff(struct xfs_mount *mp, xfs_fileoff_t off); bool xfs_verify_fileext(struct xfs_mount *mp, xfs_fileoff_t off, xfs_fileoff_t len); +/* Do we support an rt volume having this number of rtextents? 
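xfs_compute_rextslog() moves to xfs_sb.c above; what it computes is floor(log2(rtextents)) over the full 64-bit range, with 0 for an empty rt volume. A quick worked example of why the historic highbit32() was wrong:

    /* rtextents = 1ULL << 40 (a volume with 2^40 rt extents): */
    xfs_highbit64(1ULL << 40);  /* == 40, the correct rextslog           */
    /* highbit32() saw only the low 32 bits (zero here), so volumes      */
    /* beyond 2^32 extents computed a bogus summary-file level.          */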
*/ +static inline bool +xfs_validate_rtextents( + xfs_rtbxlen_t rtextents) +{ + /* No runt rt volumes */ + if (rtextents == 0) + return false; + + return true; +} + #endif /* __XFS_TYPES_H__ */ diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 441ca9977652..46583517377f 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -15,6 +15,7 @@ #include "xfs_inode.h" #include "xfs_bmap.h" #include "xfs_bit.h" +#include "xfs_sb.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/repair.h" diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c index fabd0ed9dfa6..b1ff4f33324a 100644 --- a/fs/xfs/scrub/rtsummary.c +++ b/fs/xfs/scrub/rtsummary.c @@ -16,6 +16,7 @@ #include "xfs_rtbitmap.h" #include "xfs_bit.h" #include "xfs_bmap.h" +#include "xfs_sb.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 813f85156b0c..1698507d1ac7 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -112,7 +112,7 @@ xfs_end_ioend( * longer dirty. If we don't remove delalloc blocks here, they become * stale and can corrupt free space accounting on unmount. */ - error = blk_status_to_errno(ioend->io_bio->bi_status); + error = blk_status_to_errno(ioend->io_bio.bi_status); if (unlikely(error)) { if (ioend->io_flags & IOMAP_F_SHARED) { xfs_reflink_cancel_cow_range(ip, offset, size, true); @@ -179,7 +179,7 @@ STATIC void xfs_end_bio( struct bio *bio) { - struct iomap_ioend *ioend = bio->bi_private; + struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); struct xfs_inode *ip = XFS_I(ioend->io_inode); unsigned long flags; @@ -276,7 +276,8 @@ static int xfs_map_blocks( struct iomap_writepage_ctx *wpc, struct inode *inode, - loff_t offset) + loff_t offset, + unsigned int len) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -444,7 +445,7 @@ xfs_prepare_ioend( /* send ioends that might require a transaction to the completion wq */ if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN || (ioend->io_flags & IOMAP_F_SHARED)) - ioend->io_bio->bi_end_io = xfs_end_bio; + ioend->io_bio.bi_end_io = xfs_end_bio; return status; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 8e5bd50d29fe..01b41fabbe3c 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1951,7 +1951,7 @@ xfs_free_buftarg( fs_put_dax(btp->bt_daxdev, btp->bt_mount); /* the main block device is closed by kill_block_super */ if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev) - bdev_release(btp->bt_bdev_handle); + fput(btp->bt_bdev_file); kmem_free(btp); } @@ -1994,7 +1994,7 @@ xfs_setsize_buftarg_early( struct xfs_buftarg * xfs_alloc_buftarg( struct xfs_mount *mp, - struct bdev_handle *bdev_handle) + struct file *bdev_file) { xfs_buftarg_t *btp; const struct dax_holder_operations *ops = NULL; @@ -2005,9 +2005,9 @@ xfs_alloc_buftarg( btp = kmem_zalloc(sizeof(*btp), KM_NOFS); btp->bt_mount = mp; - btp->bt_bdev_handle = bdev_handle; - btp->bt_dev = bdev_handle->bdev->bd_dev; - btp->bt_bdev = bdev_handle->bdev; + btp->bt_bdev_file = bdev_file; + btp->bt_bdev = file_bdev(bdev_file); + btp->bt_dev = btp->bt_bdev->bd_dev; btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off, mp, ops); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index b470de08a46c..304e858d04fb 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -98,7 +98,7 @@ typedef unsigned int xfs_buf_flags_t; */ typedef struct xfs_buftarg { dev_t bt_dev; - struct bdev_handle *bt_bdev_handle; + 
struct file *bt_bdev_file; struct block_device *bt_bdev; struct dax_device *bt_daxdev; u64 bt_dax_part_off; @@ -366,7 +366,7 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset) * Handling of buftargs. */ struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp, - struct bdev_handle *bdev_handle); + struct file *bdev_file); extern void xfs_free_buftarg(struct xfs_buftarg *); extern void xfs_buftarg_wait(struct xfs_buftarg *); extern void xfs_buftarg_drain(struct xfs_buftarg *); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index aabb25dc3efa..57fa21ad7912 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -62,7 +62,7 @@ xfs_uuid_mount( int hole, i; /* Publish UUID in struct super_block */ - uuid_copy(&mp->m_super->s_uuid, uuid); + super_set_uuid(mp->m_super, uuid->b, sizeof(*uuid)); if (xfs_has_nouuid(mp)) return 0; @@ -706,6 +706,8 @@ xfs_mountfs( /* enable fail_at_unmount as default */ mp->m_fail_unmount = true; + super_set_sysfs_name_id(mp->m_super); + error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_super->s_id); if (error) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index aff20ddd4a9f..59c8c0541bdd 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -350,7 +350,6 @@ xfs_setup_dax_always( return -EINVAL; } - xfs_warn(mp, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); return 0; disable_dax: @@ -362,16 +361,16 @@ STATIC int xfs_blkdev_get( xfs_mount_t *mp, const char *name, - struct bdev_handle **handlep) + struct file **bdev_filep) { int error = 0; - *handlep = bdev_open_by_path(name, + *bdev_filep = bdev_file_open_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES, mp->m_super, &fs_holder_ops); - if (IS_ERR(*handlep)) { - error = PTR_ERR(*handlep); - *handlep = NULL; + if (IS_ERR(*bdev_filep)) { + error = PTR_ERR(*bdev_filep); + *bdev_filep = NULL; xfs_warn(mp, "Invalid device [%s], error=%d", name, error); } @@ -436,26 +435,26 @@ xfs_open_devices( { struct super_block *sb = mp->m_super; struct block_device *ddev = sb->s_bdev; - struct bdev_handle *logdev_handle = NULL, *rtdev_handle = NULL; + struct file *logdev_file = NULL, *rtdev_file = NULL; int error; /* * Open real time and log devices - order is important. 
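xfs_blkdev_get() above is the by-path flavor of the conversion seen in fs/super.c: the log and realtime devices are opened as files with bdev_file_open_by_path(), BLK_OPEN_RESTRICT_WRITES keeps other writers out while mounted, and device-identity checks now compare file_bdev() results, as the body below does. Stripped to a sketch (error paths trimmed, conflict label illustrative):

    struct file *rtdev_file = bdev_file_open_by_path(mp->m_rtname,
                    BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES,
                    mp->m_super, &fs_holder_ops);

    if (IS_ERR(rtdev_file))
            return PTR_ERR(rtdev_file);
    if (file_bdev(rtdev_file) == ddev)      /* same disk as the data dev? */
            goto conflict;
    /* ... and fput(rtdev_file) on every teardown path */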
*/ if (mp->m_logname) { - error = xfs_blkdev_get(mp, mp->m_logname, &logdev_handle); + error = xfs_blkdev_get(mp, mp->m_logname, &logdev_file); if (error) return error; } if (mp->m_rtname) { - error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev_handle); + error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev_file); if (error) goto out_close_logdev; - if (rtdev_handle->bdev == ddev || - (logdev_handle && - rtdev_handle->bdev == logdev_handle->bdev)) { + if (file_bdev(rtdev_file) == ddev || + (logdev_file && + file_bdev(rtdev_file) == file_bdev(logdev_file))) { xfs_warn(mp, "Cannot mount filesystem with identical rtdev and ddev/logdev."); error = -EINVAL; @@ -467,25 +466,25 @@ xfs_open_devices( * Setup xfs_mount buffer target pointers */ error = -ENOMEM; - mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_handle); + mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_file); if (!mp->m_ddev_targp) goto out_close_rtdev; - if (rtdev_handle) { - mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev_handle); + if (rtdev_file) { + mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev_file); if (!mp->m_rtdev_targp) goto out_free_ddev_targ; } - if (logdev_handle && logdev_handle->bdev != ddev) { - mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev_handle); + if (logdev_file && file_bdev(logdev_file) != ddev) { + mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev_file); if (!mp->m_logdev_targp) goto out_free_rtdev_targ; } else { mp->m_logdev_targp = mp->m_ddev_targp; /* Handle won't be used, drop it */ - if (logdev_handle) - bdev_release(logdev_handle); + if (logdev_file) + fput(logdev_file); } return 0; @@ -496,11 +495,11 @@ xfs_open_devices( out_free_ddev_targ: xfs_free_buftarg(mp->m_ddev_targp); out_close_rtdev: - if (rtdev_handle) - bdev_release(rtdev_handle); + if (rtdev_file) + fput(rtdev_file); out_close_logdev: - if (logdev_handle) - bdev_release(logdev_handle); + if (logdev_file) + fput(logdev_file); return error; } @@ -1496,6 +1495,18 @@ xfs_fs_fill_super( mp->m_super = sb; + /* + * Copy VFS mount flags from the context now that all parameter parsing + * is guaranteed to have been completed by either the old mount API or + * the newer fsopen/fsconfig API. + */ + if (fc->sb_flags & SB_RDONLY) + set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); + if (fc->sb_flags & SB_DIRSYNC) + mp->m_features |= XFS_FEAT_DIRSYNC; + if (fc->sb_flags & SB_SYNCHRONOUS) + mp->m_features |= XFS_FEAT_WSYNC; + error = xfs_fs_validate_params(mp); if (error) return error; @@ -1965,6 +1976,11 @@ static const struct fs_context_operations xfs_context_ops = { .free = xfs_fs_free, }; +/* + * WARNING: do not initialise any parameters in this function that depend on + * mount option parsing having already been performed as this can be called from + * fsopen() before any parameters have been set. + */ static int xfs_init_fs_context( struct fs_context *fc) { @@ -1996,16 +2012,6 @@ static int xfs_init_fs_context( mp->m_logbsize = -1; mp->m_allocsize_log = 16; /* 64k */ - /* - * Copy binary VFS mount flags we are interested in. 
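The relocation above fixes an ordering assumption: under fsopen()/fsconfig(), xfs_init_fs_context() runs before a single mount option exists, so the SB_RDONLY/SB_DIRSYNC/SB_SYNCHRONOUS bits can only be trusted once fill_super() runs. Roughly, the new-API sequence as driven from userspace (illustrative, raw syscalls):

    fd = fsopen("xfs", 0);                    /* -> xfs_init_fs_context(),
                                                 no parameters parsed yet  */
    fsconfig(fd, FSCONFIG_SET_FLAG, "ro", NULL, 0);
    fsconfig(fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
                                              /* -> get_tree() ->
                                                 xfs_fs_fill_super();
                                                 fc->sb_flags now final    */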
- */ - if (fc->sb_flags & SB_RDONLY) - set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); - if (fc->sb_flags & SB_DIRSYNC) - mp->m_features |= XFS_FEAT_DIRSYNC; - if (fc->sb_flags & SB_SYNCHRONOUS) - mp->m_features |= XFS_FEAT_WSYNC; - fc->s_fs_info = mp; fc->ops = &xfs_context_ops; @@ -2037,8 +2043,7 @@ xfs_init_caches(void) xfs_buf_cache = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0, SLAB_HWCACHE_ALIGN | - SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD, + SLAB_RECLAIM_ACCOUNT, NULL); if (!xfs_buf_cache) goto out; @@ -2103,14 +2108,14 @@ xfs_init_caches(void) sizeof(struct xfs_inode), 0, (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD | SLAB_ACCOUNT), + SLAB_ACCOUNT), xfs_fs_inode_init_once); if (!xfs_inode_cache) goto out_destroy_efi_cache; xfs_ili_cache = kmem_cache_create("xfs_ili", sizeof(struct xfs_inode_log_item), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + SLAB_RECLAIM_ACCOUNT, NULL); if (!xfs_ili_cache) goto out_destroy_inode_cache; diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 6ab2318a9c8e..3b103715acc9 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -125,7 +125,8 @@ static void zonefs_readahead(struct readahead_control *rac) * which implies that the page range can only be within the fixed inode size. */ static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc, - struct inode *inode, loff_t offset) + struct inode *inode, loff_t offset, + unsigned int len) { struct zonefs_zone *z = zonefs_inode_zone(inode); @@ -348,7 +349,12 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, struct zonefs_inode_info *zi = ZONEFS_I(inode); if (error) { - zonefs_io_error(inode, true); + /* + * For Sync IOs, error recovery is called from + * zonefs_file_dio_write(). + */ + if (!is_sync_kiocb(iocb)) + zonefs_io_error(inode, true); return error; } @@ -491,6 +497,14 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) ret = -EINVAL; goto inode_unlock; } + /* + * Advance the zone write pointer offset. This assumes that the + * IO will succeed, which is OK to do because we do not allow + * partial writes (IOMAP_DIO_PARTIAL is not set) and if the IO + * fails, the error path will correct the write pointer offset. + */ + z->z_wpoffset += count; + zonefs_inode_account_active(inode); mutex_unlock(&zi->i_truncate_mutex); } @@ -504,20 +518,19 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) if (ret == -ENOTBLK) ret = -EBUSY; - if (zonefs_zone_is_seq(z) && - (ret > 0 || ret == -EIOCBQUEUED)) { - if (ret > 0) - count = ret; - - /* - * Update the zone write pointer offset assuming the write - * operation succeeded. If it did not, the error recovery path - * will correct it. Also do active seq file accounting. - */ - mutex_lock(&zi->i_truncate_mutex); - z->z_wpoffset += count; - zonefs_inode_account_active(inode); - mutex_unlock(&zi->i_truncate_mutex); + /* + * For a failed IO or partial completion, trigger error recovery + * to update the zone write pointer offset to a correct value. + * For asynchronous IOs, zonefs_file_write_dio_end_io() may already + * have executed error recovery if the IO already completed when we + * reach here. However, we cannot know that and execute error recovery + * again (that will not change anything). 
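Alongside these zonefs changes, note that zonefs_write_map_blocks() at the top of this file, like xfs_map_blocks() earlier, gained a length argument: iomap writeback now tells ->map_blocks how much of the range it is being asked to map. The prototype shift in sketch form (my_ name illustrative):

    /* before */
    static int my_map_blocks(struct iomap_writepage_ctx *wpc,
                             struct inode *inode, loff_t offset);
    /* after */
    static int my_map_blocks(struct iomap_writepage_ctx *wpc,
                             struct inode *inode, loff_t offset,
                             unsigned int len);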
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index 6ab2318a9c8e..3b103715acc9 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -125,7 +125,8 @@ static void zonefs_readahead(struct readahead_control *rac)
  * which implies that the page range can only be within the fixed inode size.
  */
 static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
-				   struct inode *inode, loff_t offset)
+				   struct inode *inode, loff_t offset,
+				   unsigned int len)
 {
 	struct zonefs_zone *z = zonefs_inode_zone(inode);
 
@@ -348,7 +349,12 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
 	struct zonefs_inode_info *zi = ZONEFS_I(inode);
 
 	if (error) {
-		zonefs_io_error(inode, true);
+		/*
+		 * For sync IOs, error recovery is called from
+		 * zonefs_file_dio_write().
+		 */
+		if (!is_sync_kiocb(iocb))
+			zonefs_io_error(inode, true);
 		return error;
 	}
 
@@ -491,6 +497,14 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
 			ret = -EINVAL;
 			goto inode_unlock;
 		}
+		/*
+		 * Advance the zone write pointer offset. This assumes that the
+		 * IO will succeed, which is OK to do because we do not allow
+		 * partial writes (IOMAP_DIO_PARTIAL is not set) and if the IO
+		 * fails, the error path will correct the write pointer offset.
+		 */
+		z->z_wpoffset += count;
+		zonefs_inode_account_active(inode);
 		mutex_unlock(&zi->i_truncate_mutex);
 	}
 
@@ -504,20 +518,19 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
 	if (ret == -ENOTBLK)
 		ret = -EBUSY;
 
-	if (zonefs_zone_is_seq(z) &&
-	    (ret > 0 || ret == -EIOCBQUEUED)) {
-		if (ret > 0)
-			count = ret;
-
-		/*
-		 * Update the zone write pointer offset assuming the write
-		 * operation succeeded. If it did not, the error recovery path
-		 * will correct it. Also do active seq file accounting.
-		 */
-		mutex_lock(&zi->i_truncate_mutex);
-		z->z_wpoffset += count;
-		zonefs_inode_account_active(inode);
-		mutex_unlock(&zi->i_truncate_mutex);
+	/*
+	 * For a failed IO or partial completion, trigger error recovery
+	 * to update the zone write pointer offset to a correct value.
+	 * For asynchronous IOs, zonefs_file_write_dio_end_io() may already
+	 * have executed error recovery if the IO had completed by the time
+	 * we reach here; we cannot know that, so we simply run error recovery
+	 * again, which is harmless as it will not change anything.
+	 */
+	if (zonefs_zone_is_seq(z)) {
+		if (ret > 0 && ret != count)
+			ret = -EIO;
+		if (ret < 0 && ret != -EIOCBQUEUED)
+			zonefs_io_error(inode, true);
 	}
 
 inode_unlock:
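The net effect of the file.c hunks is to move the write pointer accounting before the IO is issued and to centralise the rollback in the error path. A simplified, hypothetical kernel-style sketch of the resulting pattern, where submit_dio() and recover_wpoffset() are stand-ins for iomap_dio_rw() and zonefs_io_error():

#include <linux/types.h>
#include <linux/mutex.h>
#include <linux/errno.h>

struct zone_state {
	struct mutex lock;
	loff_t wpoffset;	/* software copy of the zone write pointer */
};

/* Stand-ins, not real zonefs interfaces. */
ssize_t submit_dio(size_t count);
void recover_wpoffset(struct zone_state *z);

ssize_t zone_dio_write(struct zone_state *z, size_t count)
{
	ssize_t ret;

	mutex_lock(&z->lock);
	/*
	 * Optimistic accounting: partial writes are not allowed, so the
	 * IO either fully succeeds or error recovery below recomputes
	 * wpoffset from a fresh zone report.
	 */
	z->wpoffset += count;
	mutex_unlock(&z->lock);

	ret = submit_dio(count);

	/* A short write to a sequential zone is an error. */
	if (ret > 0 && ret != count)
		ret = -EIO;
	if (ret < 0 && ret != -EIOCBQUEUED)
		recover_wpoffset(z);
	return ret;
}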
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 93971742613a..c6a124e8d565 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -15,12 +15,13 @@
 #include <linux/writeback.h>
 #include <linux/quotaops.h>
 #include <linux/seq_file.h>
-#include <linux/parser.h>
 #include <linux/uio.h>
 #include <linux/mman.h>
 #include <linux/sched/mm.h>
 #include <linux/crc32.h>
 #include <linux/task_io_accounting_ops.h>
+#include <linux/fs_parser.h>
+#include <linux/fs_context.h>
 
 #include "zonefs.h"
 
@@ -113,7 +114,7 @@ static int zonefs_zone_mgmt(struct super_block *sb,
 	trace_zonefs_zone_mgmt(sb, z, op);
 	ret = blkdev_zone_mgmt(sb->s_bdev, op, z->z_sector,
-			       z->z_size >> SECTOR_SHIFT, GFP_NOFS);
+			       z->z_size >> SECTOR_SHIFT);
 	if (ret) {
 		zonefs_err(sb,
 			   "Zone management operation %s at %llu failed %d\n",
@@ -246,16 +247,18 @@ static void zonefs_inode_update_mode(struct inode *inode)
 	z->z_mode = inode->i_mode;
 }
 
-struct zonefs_ioerr_data {
-	struct inode	*inode;
-	bool		write;
-};
-
 static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
 			      void *data)
 {
-	struct zonefs_ioerr_data *err = data;
-	struct inode *inode = err->inode;
+	struct blk_zone *z = data;
+
+	*z = *zone;
+	return 0;
+}
+
+static void zonefs_handle_io_error(struct inode *inode, struct blk_zone *zone,
+				   bool write)
+{
 	struct zonefs_zone *z = zonefs_inode_zone(inode);
 	struct super_block *sb = inode->i_sb;
 	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
@@ -270,8 +273,8 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
 	data_size = zonefs_check_zone_condition(sb, z, zone);
 	isize = i_size_read(inode);
 	if (!(z->z_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE)) &&
-	    !err->write && isize == data_size)
-		return 0;
+	    !write && isize == data_size)
+		return;
 
 	/*
 	 * At this point, we detected either a bad zone or an inconsistency
@@ -292,7 +295,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
 	 * In all cases, warn about inode size inconsistency and handle the
 	 * IO error according to the zone condition and to the mount options.
 	 */
-	if (zonefs_zone_is_seq(z) && isize != data_size)
+	if (isize != data_size)
 		zonefs_warn(sb,
 			    "inode %lu: invalid size %lld (should be %lld)\n",
 			    inode->i_ino, isize, data_size);
@@ -352,8 +355,6 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
 	zonefs_i_size_write(inode, data_size);
 	z->z_wpoffset = data_size;
 	zonefs_inode_account_active(inode);
-
-	return 0;
 }
 
 /*
@@ -367,23 +368,25 @@ void __zonefs_io_error(struct inode *inode, bool write)
 {
 	struct zonefs_zone *z = zonefs_inode_zone(inode);
 	struct super_block *sb = inode->i_sb;
-	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
 	unsigned int noio_flag;
-	unsigned int nr_zones = 1;
-	struct zonefs_ioerr_data err = {
-		.inode = inode,
-		.write = write,
-	};
+	struct blk_zone zone;
 	int ret;
 
 	/*
-	 * The only files that have more than one zone are conventional zone
-	 * files with aggregated conventional zones, for which the inode zone
-	 * size is always larger than the device zone size.
+	 * Conventional zones have no write pointer and cannot become read-only
+	 * or offline. So simply fake a report for a single or aggregated zone
+	 * and let zonefs_handle_io_error() correct the zone inode information
+	 * according to the mount options.
 	 */
-	if (z->z_size > bdev_zone_sectors(sb->s_bdev))
-		nr_zones = z->z_size >>
-			(sbi->s_zone_sectors_shift + SECTOR_SHIFT);
+	if (!zonefs_zone_is_seq(z)) {
+		zone.start = z->z_sector;
+		zone.len = z->z_size >> SECTOR_SHIFT;
+		zone.wp = zone.start + zone.len;
+		zone.type = BLK_ZONE_TYPE_CONVENTIONAL;
+		zone.cond = BLK_ZONE_COND_NOT_WP;
+		zone.capacity = zone.len;
+		goto handle_io_error;
+	}
 
 	/*
 	 * Memory allocations in blkdev_report_zones() can trigger a memory
@@ -394,12 +397,20 @@ void __zonefs_io_error(struct inode *inode, bool write)
 	 * the GFP_NOIO context avoids both problems.
 	 */
 	noio_flag = memalloc_noio_save();
-	ret = blkdev_report_zones(sb->s_bdev, z->z_sector, nr_zones,
-				  zonefs_io_error_cb, &err);
-	if (ret != nr_zones)
+	ret = blkdev_report_zones(sb->s_bdev, z->z_sector, 1,
+				  zonefs_io_error_cb, &zone);
+	memalloc_noio_restore(noio_flag);
+
+	if (ret != 1) {
 		zonefs_err(sb, "Get inode %lu zone information failed %d\n",
 			   inode->i_ino, ret);
-	memalloc_noio_restore(noio_flag);
+		zonefs_warn(sb, "remounting filesystem read-only\n");
+		sb->s_flags |= SB_RDONLY;
+		return;
+	}
+
+handle_io_error:
+	zonefs_handle_io_error(inode, &zone, write);
 }
 
 static struct kmem_cache *zonefs_inode_cachep;
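Since the refactor above always reports exactly one zone, zonefs_io_error_cb() collapses to a copy-out. For reference, a condensed, hypothetical helper showing the same snapshot pattern around blkdev_report_zones(), which invokes the callback once per reported zone:

#include <linux/blkdev.h>

static int snapshot_zone_cb(struct blk_zone *zone, unsigned int idx,
			    void *data)
{
	/* Copy the reported zone out to the caller's buffer. */
	*(struct blk_zone *)data = *zone;
	return 0;
}

/* Returns 0 on success, a negative errno otherwise. */
static int snapshot_one_zone(struct block_device *bdev, sector_t sector,
			     struct blk_zone *zone)
{
	int ret = blkdev_report_zones(bdev, sector, 1, snapshot_zone_cb, zone);

	if (ret < 0)
		return ret;
	return ret == 1 ? 0 : -EIO;
}

Splitting the zone lookup from zonefs_handle_io_error() is also what lets the conventional-zone case above bypass the device round trip entirely with a faked report.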
@@ -460,58 +471,47 @@ static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf)
 }
 
 enum {
-	Opt_errors_ro, Opt_errors_zro, Opt_errors_zol, Opt_errors_repair,
-	Opt_explicit_open, Opt_err,
+	Opt_errors, Opt_explicit_open,
 };
 
-static const match_table_t tokens = {
-	{ Opt_errors_ro,	"errors=remount-ro"},
-	{ Opt_errors_zro,	"errors=zone-ro"},
-	{ Opt_errors_zol,	"errors=zone-offline"},
-	{ Opt_errors_repair,	"errors=repair"},
-	{ Opt_explicit_open,	"explicit-open" },
-	{ Opt_err,		NULL}
+struct zonefs_context {
+	unsigned long	s_mount_opts;
 };
 
-static int zonefs_parse_options(struct super_block *sb, char *options)
-{
-	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
-	substring_t args[MAX_OPT_ARGS];
-	char *p;
-
-	if (!options)
-		return 0;
-
-	while ((p = strsep(&options, ",")) != NULL) {
-		int token;
+static const struct constant_table zonefs_param_errors[] = {
+	{"remount-ro",		ZONEFS_MNTOPT_ERRORS_RO},
+	{"zone-ro",		ZONEFS_MNTOPT_ERRORS_ZRO},
+	{"zone-offline",	ZONEFS_MNTOPT_ERRORS_ZOL},
+	{"repair",		ZONEFS_MNTOPT_ERRORS_REPAIR},
+	{}
+};
 
-		if (!*p)
-			continue;
+static const struct fs_parameter_spec zonefs_param_spec[] = {
+	fsparam_enum	("errors",		Opt_errors, zonefs_param_errors),
+	fsparam_flag	("explicit-open",	Opt_explicit_open),
+	{}
+};
 
-		token = match_token(p, tokens, args);
-		switch (token) {
-		case Opt_errors_ro:
-			sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
-			sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_RO;
-			break;
-		case Opt_errors_zro:
-			sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
-			sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_ZRO;
-			break;
-		case Opt_errors_zol:
-			sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
-			sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_ZOL;
-			break;
-		case Opt_errors_repair:
-			sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
-			sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_REPAIR;
-			break;
-		case Opt_explicit_open:
-			sbi->s_mount_opts |= ZONEFS_MNTOPT_EXPLICIT_OPEN;
-			break;
-		default:
-			return -EINVAL;
-		}
+static int zonefs_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+	struct zonefs_context *ctx = fc->fs_private;
+	struct fs_parse_result result;
+	int opt;
+
+	opt = fs_parse(fc, zonefs_param_spec, param, &result);
+	if (opt < 0)
+		return opt;
+
+	switch (opt) {
+	case Opt_errors:
+		ctx->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
+		ctx->s_mount_opts |= result.uint_32;
+		break;
+	case Opt_explicit_open:
+		ctx->s_mount_opts |= ZONEFS_MNTOPT_EXPLICIT_OPEN;
+		break;
+	default:
+		return -EINVAL;
 	}
 
 	return 0;
@@ -533,13 +533,6 @@ static int zonefs_show_options(struct seq_file *seq, struct dentry *root)
 	return 0;
 }
 
-static int zonefs_remount(struct super_block *sb, int *flags, char *data)
-{
-	sync_filesystem(sb);
-
-	return zonefs_parse_options(sb, data);
-}
-
 static int zonefs_inode_setattr(struct mnt_idmap *idmap,
 				struct dentry *dentry, struct iattr *iattr)
 {
@@ -1195,7 +1188,6 @@ static const struct super_operations zonefs_sops = {
 	.alloc_inode	= zonefs_alloc_inode,
 	.free_inode	= zonefs_free_inode,
 	.statfs		= zonefs_statfs,
-	.remount_fs	= zonefs_remount,
 	.show_options	= zonefs_show_options,
 };
 
@@ -1240,9 +1232,10 @@ static void zonefs_release_zgroup_inodes(struct super_block *sb)
  * sub-directories and files according to the device zone configuration and
  * format options.
  */
-static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
+static int zonefs_fill_super(struct super_block *sb, struct fs_context *fc)
 {
 	struct zonefs_sb_info *sbi;
+	struct zonefs_context *ctx = fc->fs_private;
 	struct inode *inode;
 	enum zonefs_ztype ztype;
 	int ret;
@@ -1279,7 +1272,7 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_uid = GLOBAL_ROOT_UID;
 	sbi->s_gid = GLOBAL_ROOT_GID;
 	sbi->s_perm = 0640;
-	sbi->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO;
+	sbi->s_mount_opts = ctx->s_mount_opts;
 
 	atomic_set(&sbi->s_wro_seq_files, 0);
 	sbi->s_max_wro_seq_files = bdev_max_open_zones(sb->s_bdev);
@@ -1290,10 +1283,6 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
 	if (ret)
 		return ret;
 
-	ret = zonefs_parse_options(sb, data);
-	if (ret)
-		return ret;
-
 	zonefs_info(sb, "Mounting %u zones", bdev_nr_zones(sb->s_bdev));
 
 	if (!sbi->s_max_wro_seq_files &&
@@ -1354,12 +1343,6 @@ cleanup:
 	return ret;
 }
 
-static struct dentry *zonefs_mount(struct file_system_type *fs_type,
-				   int flags, const char *dev_name, void *data)
-{
-	return mount_bdev(fs_type, flags, dev_name, data, zonefs_fill_super);
-}
-
 static void zonefs_kill_super(struct super_block *sb)
 {
 	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
@@ -1374,22 +1357,72 @@ static void zonefs_kill_super(struct super_block *sb)
 	kfree(sbi);
 }
 
+static void zonefs_free_fc(struct fs_context *fc)
+{
+	struct zonefs_context *ctx = fc->fs_private;
+
+	kfree(ctx);
+}
+
+static int zonefs_get_tree(struct fs_context *fc)
+{
+	return get_tree_bdev(fc, zonefs_fill_super);
+}
+
+static int zonefs_reconfigure(struct fs_context *fc)
+{
+	struct zonefs_context *ctx = fc->fs_private;
+	struct super_block *sb = fc->root->d_sb;
+	struct zonefs_sb_info *sbi = sb->s_fs_info;
+
+	sync_filesystem(sb);
+	/* Copy new options from ctx into sbi. */
+	sbi->s_mount_opts = ctx->s_mount_opts;
+
+	return 0;
+}
+
+static const struct fs_context_operations zonefs_context_ops = {
+	.parse_param	= zonefs_parse_param,
+	.get_tree	= zonefs_get_tree,
+	.reconfigure	= zonefs_reconfigure,
+	.free		= zonefs_free_fc,
+};
+
+/*
+ * Set up the filesystem mount context.
+ */
+static int zonefs_init_fs_context(struct fs_context *fc)
+{
+	struct zonefs_context *ctx;
+
+	ctx = kzalloc(sizeof(struct zonefs_context), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+	ctx->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO;
+	fc->ops = &zonefs_context_ops;
+	fc->fs_private = ctx;
+
+	return 0;
+}
+
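With ->reconfigure wired into zonefs_context_ops, a remount becomes an ordinary fs_context operation instead of the removed .remount_fs hook. A hypothetical userspace sketch (mount point and option value are examples) of how the new API reaches zonefs_reconfigure():

#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/mount.h>

int remount_zonefs_errors(const char *mntpoint, const char *mode)
{
	/* Pick up the existing superblock's fs_context. */
	int fd = syscall(SYS_fspick, AT_FDCWD, mntpoint, FSPICK_CLOEXEC);
	int ret;

	if (fd < 0)
		return -1;
	/* Parsed by zonefs_parse_param(), e.g. mode = "zone-ro". */
	syscall(SYS_fsconfig, fd, FSCONFIG_SET_STRING, "errors", mode, 0);
	/* Triggers zonefs_reconfigure() on the picked superblock. */
	ret = syscall(SYS_fsconfig, fd, FSCONFIG_CMD_RECONFIGURE,
		      NULL, NULL, 0);
	close(fd);
	return ret;
}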
 /*
  * File system definition and registration.
  */
 static struct file_system_type zonefs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "zonefs",
-	.mount		= zonefs_mount,
-	.kill_sb	= zonefs_kill_super,
-	.fs_flags	= FS_REQUIRES_DEV,
+	.owner			= THIS_MODULE,
+	.name			= "zonefs",
+	.kill_sb		= zonefs_kill_super,
+	.fs_flags		= FS_REQUIRES_DEV,
+	.init_fs_context	= zonefs_init_fs_context,
+	.parameters		= zonefs_param_spec,
 };
 
 static int __init zonefs_init_inodecache(void)
 {
 	zonefs_inode_cachep = kmem_cache_create("zonefs_inode_cache",
 			sizeof(struct zonefs_inode_info), 0,
-			(SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT),
+			SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
 			NULL);
 	if (zonefs_inode_cachep == NULL)
 		return -ENOMEM;
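Nothing should change for existing users of the converted filesystems: the legacy mount(2) path is routed through the same fs_context code, so the old option strings keep working. A minimal, hypothetical usage example (device and mount point are placeholders):

#include <sys/mount.h>

int mount_zonefs(void)
{
	/* Options are parsed by zonefs_parse_param() via the legacy shim. */
	return mount("/dev/nullb0", "/mnt/zonefs", "zonefs", 0,
		     "errors=zone-ro,explicit-open");
}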