diff options
Diffstat (limited to 'fs')
234 files changed, 4785 insertions, 2167 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index f08fbbfafd9a..d1ad3935fb85 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -166,7 +166,7 @@ config TMPFS space. If you unmount a tmpfs instance, everything stored therein is lost. - See <file:Documentation/filesystems/tmpfs.txt> for details. + See <file:Documentation/filesystems/tmpfs.rst> for details. config TMPFS_POSIX_ACL bool "Tmpfs POSIX Access Control Lists" diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 62dc4f577ba1..04f86b8c100e 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -36,6 +36,12 @@ config COMPAT_BINFMT_ELF config ARCH_BINFMT_ELF_STATE bool +config ARCH_HAVE_ELF_PROT + bool + +config ARCH_USE_GNU_PROPERTY + bool + config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" default y if !BINFMT_ELF @@ -72,7 +78,7 @@ config CORE_DUMP_DEFAULT_ELF_HEADERS The core dump behavior can be controlled per process using the /proc/PID/coredump_filter pseudo-file; this setting is - inherited. See Documentation/filesystems/proc.txt for details. + inherited. See Documentation/filesystems/proc.rst for details. This config option changes the default setting of coredump_filter seen at boot time. If unsure, say Y. diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig index df4650dccf68..44738fed6625 100644 --- a/fs/adfs/Kconfig +++ b/fs/adfs/Kconfig @@ -12,7 +12,7 @@ config ADFS_FS The ADFS partition should be the first partition (i.e., /dev/[hs]d?1) on each of your drives. Please read the file - <file:Documentation/filesystems/adfs.txt> for further details. + <file:Documentation/filesystems/adfs.rst> for further details. To compile this code as a module, choose M here: the module will be called adfs. diff --git a/fs/affs/Kconfig b/fs/affs/Kconfig index 84c46b9025c5..eb9d0ab850cb 100644 --- a/fs/affs/Kconfig +++ b/fs/affs/Kconfig @@ -9,7 +9,7 @@ config AFFS_FS FFS partition on your hard drive. Amiga floppies however cannot be read with this driver due to an incompatibility of the floppy controller used in an Amiga and the standard floppy controller in - PCs and workstations. Read <file:Documentation/filesystems/affs.txt> + PCs and workstations. Read <file:Documentation/filesystems/affs.rst> and <file:fs/affs/Changes>. With this driver you can also mount disk files used by Bernd diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig index 3fb1f559e317..1ad211d72b3b 100644 --- a/fs/afs/Kconfig +++ b/fs/afs/Kconfig @@ -8,7 +8,7 @@ config AFS_FS If you say Y here, you will get an experimental Andrew File System driver. It currently only supports unsecured read-only AFS access. - See <file:Documentation/filesystems/afs.txt> for more information. + See <file:Documentation/filesystems/afs.rst> for more information. If unsure, say N. @@ -18,7 +18,7 @@ config AFS_DEBUG help Say Y here to make runtime controllable debugging messages appear. - See <file:Documentation/filesystems/afs.txt> for more information. + See <file:Documentation/filesystems/afs.rst> for more information. If unsure, say N. @@ -37,6 +37,6 @@ config AFS_DEBUG_CURSOR the dmesg log if the server rotation algorithm fails to successfully contact a server. - See <file:Documentation/filesystems/afs.txt> for more information. + See <file:Documentation/filesystems/afs.rst> for more information. If unsure, say N. diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 6765949b3aab..380ad5ace7cf 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -169,7 +169,7 @@ static int afs_record_cm_probe(struct afs_call *call, struct afs_server *server) spin_lock(&server->probe_lock); - if (!test_bit(AFS_SERVER_FL_HAVE_EPOCH, &server->flags)) { + if (!test_and_set_bit(AFS_SERVER_FL_HAVE_EPOCH, &server->flags)) { server->cm_epoch = call->epoch; server->probe.cm_epoch = call->epoch; goto out; diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c index e1b9ed679045..37d1bba57b00 100644 --- a/fs/afs/fs_probe.c +++ b/fs/afs/fs_probe.c @@ -32,9 +32,8 @@ void afs_fileserver_probe_result(struct afs_call *call) struct afs_server *server = call->server; unsigned int server_index = call->server_index; unsigned int index = call->addr_ix; - unsigned int rtt = UINT_MAX; + unsigned int rtt_us = 0; bool have_result = false; - u64 _rtt; int ret = call->error; _enter("%pU,%u", &server->uuid, index); @@ -93,15 +92,9 @@ responded: } } - /* Get the RTT and scale it to fit into a 32-bit value that represents - * over a minute of time so that we can access it with one instruction - * on a 32-bit system. - */ - _rtt = rxrpc_kernel_get_rtt(call->net->socket, call->rxcall); - _rtt /= 64; - rtt = (_rtt > UINT_MAX) ? UINT_MAX : _rtt; - if (rtt < server->probe.rtt) { - server->probe.rtt = rtt; + rtt_us = rxrpc_kernel_get_srtt(call->net->socket, call->rxcall); + if (rtt_us < server->probe.rtt) { + server->probe.rtt = rtt_us; alist->preferred = index; have_result = true; } @@ -113,15 +106,11 @@ out: spin_unlock(&server->probe_lock); _debug("probe [%u][%u] %pISpc rtt=%u ret=%d", - server_index, index, &alist->addrs[index].transport, - (unsigned int)rtt, ret); + server_index, index, &alist->addrs[index].transport, rtt_us, ret); have_result |= afs_fs_probe_done(server); - if (have_result) { - server->probe.have_result = true; - wake_up_var(&server->probe.have_result); + if (have_result) wake_up_all(&server->probe_wq); - } } /* diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 68fc46634346..d2b3798c1932 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -385,8 +385,6 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) ASSERTCMP(req->offset, <=, PAGE_SIZE); if (req->offset == PAGE_SIZE) { req->offset = 0; - if (req->page_done) - req->page_done(req); req->index++; if (req->remain > 0) goto begin_page; @@ -440,11 +438,13 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) if (req->offset < PAGE_SIZE) zero_user_segment(req->pages[req->index], req->offset, PAGE_SIZE); - if (req->page_done) - req->page_done(req); req->offset = 0; } + if (req->page_done) + for (req->index = 0; req->index < req->nr_pages; req->index++) + req->page_done(req); + _leave(" = 0 [done]"); return 0; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index ef732dd4e7ef..80255513e230 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -533,12 +533,10 @@ struct afs_server { u32 abort_code; u32 cm_epoch; short error; - bool have_result; bool responded:1; bool is_yfs:1; bool not_yfs:1; bool local_failure:1; - bool no_epoch:1; bool cm_probed:1; bool said_rebooted:1; bool said_inconsistent:1; @@ -1335,7 +1333,7 @@ extern struct afs_volume *afs_create_volume(struct afs_fs_context *); extern void afs_activate_volume(struct afs_volume *); extern void afs_deactivate_volume(struct afs_volume *); extern void afs_put_volume(struct afs_cell *, struct afs_volume *); -extern int afs_check_volume_status(struct afs_volume *, struct key *); +extern int afs_check_volume_status(struct afs_volume *, struct afs_fs_cursor *); /* * write.c diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index 172ba569cd60..2a3305e42b14 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -192,7 +192,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) write_unlock(&vnode->volume->servers_lock); set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags); - error = afs_check_volume_status(vnode->volume, fc->key); + error = afs_check_volume_status(vnode->volume, fc); if (error < 0) goto failed_set_error; @@ -281,7 +281,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) set_bit(AFS_VOLUME_WAIT, &vnode->volume->flags); set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags); - error = afs_check_volume_status(vnode->volume, fc->key); + error = afs_check_volume_status(vnode->volume, fc); if (error < 0) goto failed_set_error; @@ -341,7 +341,7 @@ start: /* See if we need to do an update of the volume record. Note that the * volume may have moved or even have been deleted. */ - error = afs_check_volume_status(vnode->volume, fc->key); + error = afs_check_volume_status(vnode->volume, fc); if (error < 0) goto failed_set_error; diff --git a/fs/afs/server.c b/fs/afs/server.c index b7f3cb2130ca..11b90ac7ea30 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -594,12 +594,9 @@ retry: } ret = wait_on_bit(&server->flags, AFS_SERVER_FL_UPDATING, - TASK_INTERRUPTIBLE); + (fc->flags & AFS_FS_CURSOR_INTR) ? + TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); if (ret == -ERESTARTSYS) { - if (!(fc->flags & AFS_FS_CURSOR_INTR) && server->addresses) { - _leave(" = t [intr]"); - return true; - } fc->error = ret; _leave(" = f [intr]"); return false; diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c index 858498cc1b05..e3aa013c2177 100644 --- a/fs/afs/vl_probe.c +++ b/fs/afs/vl_probe.c @@ -31,10 +31,9 @@ void afs_vlserver_probe_result(struct afs_call *call) struct afs_addr_list *alist = call->alist; struct afs_vlserver *server = call->vlserver; unsigned int server_index = call->server_index; + unsigned int rtt_us = 0; unsigned int index = call->addr_ix; - unsigned int rtt = UINT_MAX; bool have_result = false; - u64 _rtt; int ret = call->error; _enter("%s,%u,%u,%d,%d", server->name, server_index, index, ret, call->abort_code); @@ -93,15 +92,9 @@ responded: } } - /* Get the RTT and scale it to fit into a 32-bit value that represents - * over a minute of time so that we can access it with one instruction - * on a 32-bit system. - */ - _rtt = rxrpc_kernel_get_rtt(call->net->socket, call->rxcall); - _rtt /= 64; - rtt = (_rtt > UINT_MAX) ? UINT_MAX : _rtt; - if (rtt < server->probe.rtt) { - server->probe.rtt = rtt; + rtt_us = rxrpc_kernel_get_srtt(call->net->socket, call->rxcall); + if (rtt_us < server->probe.rtt) { + server->probe.rtt = rtt_us; alist->preferred = index; have_result = true; } @@ -113,8 +106,7 @@ out: spin_unlock(&server->probe_lock); _debug("probe [%u][%u] %pISpc rtt=%u ret=%d", - server_index, index, &alist->addrs[index].transport, - (unsigned int)rtt, ret); + server_index, index, &alist->addrs[index].transport, rtt_us, ret); have_result |= afs_vl_probe_done(server); if (have_result) { diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c index 9a5ce9687779..72eacc14e6e1 100644 --- a/fs/afs/vl_rotate.c +++ b/fs/afs/vl_rotate.c @@ -302,8 +302,8 @@ static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc) pr_notice("VC: - nr=%u/%u/%u pf=%u\n", a->nr_ipv4, a->nr_addrs, a->max_addrs, a->preferred); - pr_notice("VC: - pr=%lx R=%lx F=%lx\n", - a->probed, a->responded, a->failed); + pr_notice("VC: - R=%lx F=%lx\n", + a->responded, a->failed); if (a == vc->ac.alist) pr_notice("VC: - current\n"); } diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 92ca5e27573b..4310336b9bb8 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -281,7 +281,7 @@ error: /* * Make sure the volume record is up to date. */ -int afs_check_volume_status(struct afs_volume *volume, struct key *key) +int afs_check_volume_status(struct afs_volume *volume, struct afs_fs_cursor *fc) { time64_t now = ktime_get_real_seconds(); int ret, retries = 0; @@ -299,7 +299,7 @@ retry: } if (!test_and_set_bit_lock(AFS_VOLUME_UPDATING, &volume->flags)) { - ret = afs_update_volume_status(volume, key); + ret = afs_update_volume_status(volume, fc->key); clear_bit_unlock(AFS_VOLUME_WAIT, &volume->flags); clear_bit_unlock(AFS_VOLUME_UPDATING, &volume->flags); wake_up_bit(&volume->flags, AFS_VOLUME_WAIT); @@ -312,7 +312,9 @@ retry: return 0; } - ret = wait_on_bit(&volume->flags, AFS_VOLUME_WAIT, TASK_INTERRUPTIBLE); + ret = wait_on_bit(&volume->flags, AFS_VOLUME_WAIT, + (fc->flags & AFS_FS_CURSOR_INTR) ? + TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); if (ret == -ERESTARTSYS) { _leave(" = %d", ret); return ret; diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index b5b45c57e1b1..fe413e7a5cf4 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -497,8 +497,6 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) ASSERTCMP(req->offset, <=, PAGE_SIZE); if (req->offset == PAGE_SIZE) { req->offset = 0; - if (req->page_done) - req->page_done(req); req->index++; if (req->remain > 0) goto begin_page; @@ -556,11 +554,13 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) if (req->offset < PAGE_SIZE) zero_user_segment(req->pages[req->index], req->offset, PAGE_SIZE); - if (req->page_done) - req->page_done(req); req->offset = 0; } + if (req->page_done) + for (req->index = 0; req->index < req->nr_pages; req->index++) + req->page_done(req); + _leave(" = 0 [done]"); return 0; } @@ -176,6 +176,7 @@ struct fsync_iocb { struct file *file; struct work_struct work; bool datasync; + struct cred *creds; }; struct poll_iocb { @@ -1589,8 +1590,11 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb, static void aio_fsync_work(struct work_struct *work) { struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work); + const struct cred *old_cred = override_creds(iocb->fsync.creds); iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync); + revert_creds(old_cred); + put_cred(iocb->fsync.creds); iocb_put(iocb); } @@ -1604,6 +1608,10 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb, if (unlikely(!req->file->f_op->fsync)) return -EINVAL; + req->creds = prepare_creds(); + if (!req->creds) + return -ENOMEM; + req->datasync = datasync; INIT_WORK(&req->work, aio_fsync_work); schedule_work(&req->work); diff --git a/fs/bfs/Kconfig b/fs/bfs/Kconfig index 3e1247f07913..3a757805b585 100644 --- a/fs/bfs/Kconfig +++ b/fs/bfs/Kconfig @@ -11,7 +11,7 @@ config BFS_FS on your /stand slice from within Linux. You then also need to say Y to "UnixWare slices support", below. More information about the BFS file system is contained in the file - <file:Documentation/filesystems/bfs.txt>. + <file:Documentation/filesystems/bfs.rst>. If you don't know what this is about, say N. diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 13f25e241ac4..8945671fe0e5 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -40,12 +40,18 @@ #include <linux/sched/coredump.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> +#include <linux/sizes.h> +#include <linux/types.h> #include <linux/cred.h> #include <linux/dax.h> #include <linux/uaccess.h> #include <asm/param.h> #include <asm/page.h> +#ifndef ELF_COMPAT +#define ELF_COMPAT 0 +#endif + #ifndef user_long_t #define user_long_t long #endif @@ -539,7 +545,8 @@ static inline int arch_check_elf(struct elfhdr *ehdr, bool has_interp, #endif /* !CONFIG_ARCH_BINFMT_ELF_STATE */ -static inline int make_prot(u32 p_flags) +static inline int make_prot(u32 p_flags, struct arch_elf_state *arch_state, + bool has_interp, bool is_interp) { int prot = 0; @@ -549,7 +556,8 @@ static inline int make_prot(u32 p_flags) prot |= PROT_WRITE; if (p_flags & PF_X) prot |= PROT_EXEC; - return prot; + + return arch_elf_adjust_prot(prot, arch_state, has_interp, is_interp); } /* This is much more generalized than the library routine read function, @@ -559,7 +567,8 @@ static inline int make_prot(u32 p_flags) static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, struct file *interpreter, - unsigned long no_base, struct elf_phdr *interp_elf_phdata) + unsigned long no_base, struct elf_phdr *interp_elf_phdata, + struct arch_elf_state *arch_state) { struct elf_phdr *eppnt; unsigned long load_addr = 0; @@ -591,7 +600,8 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) { if (eppnt->p_type == PT_LOAD) { int elf_type = MAP_PRIVATE | MAP_DENYWRITE; - int elf_prot = make_prot(eppnt->p_flags); + int elf_prot = make_prot(eppnt->p_flags, arch_state, + true, true); unsigned long vaddr = 0; unsigned long k, map_addr; @@ -682,6 +692,111 @@ out: * libraries. There is no binary dependent code anywhere else. */ +static int parse_elf_property(const char *data, size_t *off, size_t datasz, + struct arch_elf_state *arch, + bool have_prev_type, u32 *prev_type) +{ + size_t o, step; + const struct gnu_property *pr; + int ret; + + if (*off == datasz) + return -ENOENT; + + if (WARN_ON_ONCE(*off > datasz || *off % ELF_GNU_PROPERTY_ALIGN)) + return -EIO; + o = *off; + datasz -= *off; + + if (datasz < sizeof(*pr)) + return -ENOEXEC; + pr = (const struct gnu_property *)(data + o); + o += sizeof(*pr); + datasz -= sizeof(*pr); + + if (pr->pr_datasz > datasz) + return -ENOEXEC; + + WARN_ON_ONCE(o % ELF_GNU_PROPERTY_ALIGN); + step = round_up(pr->pr_datasz, ELF_GNU_PROPERTY_ALIGN); + if (step > datasz) + return -ENOEXEC; + + /* Properties are supposed to be unique and sorted on pr_type: */ + if (have_prev_type && pr->pr_type <= *prev_type) + return -ENOEXEC; + *prev_type = pr->pr_type; + + ret = arch_parse_elf_property(pr->pr_type, data + o, + pr->pr_datasz, ELF_COMPAT, arch); + if (ret) + return ret; + + *off = o + step; + return 0; +} + +#define NOTE_DATA_SZ SZ_1K +#define GNU_PROPERTY_TYPE_0_NAME "GNU" +#define NOTE_NAME_SZ (sizeof(GNU_PROPERTY_TYPE_0_NAME)) + +static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr, + struct arch_elf_state *arch) +{ + union { + struct elf_note nhdr; + char data[NOTE_DATA_SZ]; + } note; + loff_t pos; + ssize_t n; + size_t off, datasz; + int ret; + bool have_prev_type; + u32 prev_type; + + if (!IS_ENABLED(CONFIG_ARCH_USE_GNU_PROPERTY) || !phdr) + return 0; + + /* load_elf_binary() shouldn't call us unless this is true... */ + if (WARN_ON_ONCE(phdr->p_type != PT_GNU_PROPERTY)) + return -ENOEXEC; + + /* If the properties are crazy large, that's too bad (for now): */ + if (phdr->p_filesz > sizeof(note)) + return -ENOEXEC; + + pos = phdr->p_offset; + n = kernel_read(f, ¬e, phdr->p_filesz, &pos); + + BUILD_BUG_ON(sizeof(note) < sizeof(note.nhdr) + NOTE_NAME_SZ); + if (n < 0 || n < sizeof(note.nhdr) + NOTE_NAME_SZ) + return -EIO; + + if (note.nhdr.n_type != NT_GNU_PROPERTY_TYPE_0 || + note.nhdr.n_namesz != NOTE_NAME_SZ || + strncmp(note.data + sizeof(note.nhdr), + GNU_PROPERTY_TYPE_0_NAME, n - sizeof(note.nhdr))) + return -ENOEXEC; + + off = round_up(sizeof(note.nhdr) + NOTE_NAME_SZ, + ELF_GNU_PROPERTY_ALIGN); + if (off > n) + return -ENOEXEC; + + if (note.nhdr.n_descsz > n - off) + return -ENOEXEC; + datasz = off + note.nhdr.n_descsz; + + have_prev_type = false; + do { + ret = parse_elf_property(note.data, &off, datasz, arch, + have_prev_type, &prev_type); + have_prev_type = true; + } while (!ret); + + return ret == -ENOENT ? 0 : ret; +} + static int load_elf_binary(struct linux_binprm *bprm) { struct file *interpreter = NULL; /* to shut gcc up */ @@ -689,6 +804,7 @@ static int load_elf_binary(struct linux_binprm *bprm) int load_addr_set = 0; unsigned long error; struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; + struct elf_phdr *elf_property_phdata = NULL; unsigned long elf_bss, elf_brk; int bss_prot = 0; int retval, i; @@ -726,6 +842,11 @@ static int load_elf_binary(struct linux_binprm *bprm) for (i = 0; i < elf_ex->e_phnum; i++, elf_ppnt++) { char *elf_interpreter; + if (elf_ppnt->p_type == PT_GNU_PROPERTY) { + elf_property_phdata = elf_ppnt; + continue; + } + if (elf_ppnt->p_type != PT_INTERP) continue; @@ -819,9 +940,14 @@ out_free_interp: goto out_free_dentry; /* Pass PT_LOPROC..PT_HIPROC headers to arch code */ + elf_property_phdata = NULL; elf_ppnt = interp_elf_phdata; for (i = 0; i < interp_elf_ex->e_phnum; i++, elf_ppnt++) switch (elf_ppnt->p_type) { + case PT_GNU_PROPERTY: + elf_property_phdata = elf_ppnt; + break; + case PT_LOPROC ... PT_HIPROC: retval = arch_elf_pt_proc(interp_elf_ex, elf_ppnt, interpreter, @@ -832,6 +958,11 @@ out_free_interp: } } + retval = parse_elf_properties(interpreter ?: bprm->file, + elf_property_phdata, &arch_state); + if (retval) + goto out_free_dentry; + /* * Allow arch code to reject the ELF at this point, whilst it's * still possible to return an error to the code that invoked @@ -913,7 +1044,8 @@ out_free_interp: } } - elf_prot = make_prot(elf_ppnt->p_flags); + elf_prot = make_prot(elf_ppnt->p_flags, &arch_state, + !!interpreter, false); elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE; @@ -1056,7 +1188,8 @@ out_free_interp: if (interpreter) { elf_entry = load_elf_interp(interp_elf_ex, interpreter, - load_bias, interp_elf_phdata); + load_bias, interp_elf_phdata, + &arch_state); if (!IS_ERR((void *)elf_entry)) { /* * load_elf_interp() returns relocation @@ -1355,7 +1488,6 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) { u32 __user *header = (u32 __user *) vma->vm_start; u32 word; - mm_segment_t fs = get_fs(); /* * Doing it this way gets the constant folded by GCC. */ @@ -1368,14 +1500,8 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, magic.elfmag[EI_MAG1] = ELFMAG1; magic.elfmag[EI_MAG2] = ELFMAG2; magic.elfmag[EI_MAG3] = ELFMAG3; - /* - * Switch to the user "segment" for get_user(), - * then put back what elf_core_dump() had in place. - */ - set_fs(USER_DS); if (unlikely(get_user(word, header))) word = 0; - set_fs(fs); if (word == magic.cmp) return PAGE_SIZE; } @@ -1556,10 +1682,7 @@ static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm) static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata, const kernel_siginfo_t *siginfo) { - mm_segment_t old_fs = get_fs(); - set_fs(KERNEL_DS); - copy_siginfo_to_user((user_siginfo_t __user *) csigdata, siginfo); - set_fs(old_fs); + copy_siginfo_to_external(csigdata, siginfo); fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata); } @@ -1733,7 +1856,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, (!regset->active || regset->active(t->task, regset) > 0)) { int ret; size_t size = regset_size(t->task, regset); - void *data = kmalloc(size, GFP_KERNEL); + void *data = kzalloc(size, GFP_KERNEL); if (unlikely(!data)) return 0; ret = regset->get(t->task, regset, @@ -2186,7 +2309,6 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, static int elf_core_dump(struct coredump_params *cprm) { int has_dumped = 0; - mm_segment_t fs; int segs, i; size_t vma_data_size = 0; struct vm_area_struct *vma, *gate_vma; @@ -2235,13 +2357,10 @@ static int elf_core_dump(struct coredump_params *cprm) * notes. This also sets up the file header. */ if (!fill_note_info(&elf, e_phnum, &info, cprm->siginfo, cprm->regs)) - goto cleanup; + goto end_coredump; has_dumped = 1; - fs = get_fs(); - set_fs(KERNEL_DS); - offset += sizeof(elf); /* Elf header */ offset += segs * sizeof(struct elf_phdr); /* Program headers */ @@ -2369,9 +2488,6 @@ static int elf_core_dump(struct coredump_params *cprm) } end_coredump: - set_fs(fs); - -cleanup: free_note_info(&info); kfree(shdr4extnum); kvfree(vma_filesz); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 240f66663543..d9501a86cec9 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1549,7 +1549,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) { #define NUM_NOTES 6 int has_dumped = 0; - mm_segment_t fs; int segs; int i; struct vm_area_struct *vma; @@ -1589,31 +1588,31 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) /* alloc memory for large data structures: too large to be on stack */ elf = kmalloc(sizeof(*elf), GFP_KERNEL); if (!elf) - goto cleanup; + goto end_coredump; prstatus = kzalloc(sizeof(*prstatus), GFP_KERNEL); if (!prstatus) - goto cleanup; + goto end_coredump; psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); if (!psinfo) - goto cleanup; + goto end_coredump; notes = kmalloc_array(NUM_NOTES, sizeof(struct memelfnote), GFP_KERNEL); if (!notes) - goto cleanup; + goto end_coredump; fpu = kmalloc(sizeof(*fpu), GFP_KERNEL); if (!fpu) - goto cleanup; + goto end_coredump; #ifdef ELF_CORE_COPY_XFPREGS xfpu = kmalloc(sizeof(*xfpu), GFP_KERNEL); if (!xfpu) - goto cleanup; + goto end_coredump; #endif for (ct = current->mm->core_state->dumper.next; ct; ct = ct->next) { tmp = kzalloc(sizeof(*tmp), GFP_KERNEL); if (!tmp) - goto cleanup; + goto end_coredump; tmp->thread = ct->task; list_add(&tmp->list, &thread_list); @@ -1678,9 +1677,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) "LINUX", ELF_CORE_XFPREG_TYPE, sizeof(*xfpu), xfpu); #endif - fs = get_fs(); - set_fs(KERNEL_DS); - offset += sizeof(*elf); /* Elf header */ offset += segs * sizeof(struct elf_phdr); /* Program headers */ @@ -1788,9 +1784,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) } end_coredump: - set_fs(fs); - -cleanup: while (!list_empty(&thread_list)) { struct list_head *tmp = thread_list.next; list_del(tmp); diff --git a/fs/block_dev.c b/fs/block_dev.c index d1e08bba925a..632538d6f1dc 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -614,10 +614,9 @@ static int blkdev_readpage(struct file * file, struct page * page) return block_read_full_page(page, blkdev_get_block); } -static int blkdev_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void blkdev_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block); + mpage_readahead(rac, blkdev_get_block); } static int blkdev_write_begin(struct file *file, struct address_space *mapping, @@ -2006,8 +2005,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) if (bdev_read_only(I_BDEV(bd_inode))) return -EPERM; - /* uswsusp needs write permission to the swap */ - if (IS_SWAPFILE(bd_inode) && !hibernation_available()) + if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode)) return -ETXTBSY; if (!iov_iter_count(from)) @@ -2068,7 +2066,7 @@ static int blkdev_writepages(struct address_space *mapping, static const struct address_space_operations def_blk_aops = { .readpage = blkdev_readpage, - .readpages = blkdev_readpages, + .readahead = blkdev_readahead, .writepage = blkdev_writepage, .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 9c380e7edf62..0cc02577577b 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -391,7 +391,7 @@ static int is_shared_data_backref(struct preftrees *preftrees, u64 bytenr) struct rb_node **p = &preftrees->direct.root.rb_root.rb_node; struct rb_node *parent = NULL; struct prelim_ref *ref = NULL; - struct prelim_ref target = {0}; + struct prelim_ref target = {}; int result; target.parent = bytenr; diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 47f66c6a7d7f..696f47103cfc 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -916,7 +916,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; - goto out; + goto out_put_group; } /* @@ -954,7 +954,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) { btrfs_add_delayed_iput(inode); - goto out; + goto out_put_group; } clear_nlink(inode); /* One for the block groups ref */ @@ -977,13 +977,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); if (ret < 0) - goto out; + goto out_put_group; if (ret > 0) btrfs_release_path(path); if (ret == 0) { ret = btrfs_del_item(trans, tree_root, path); if (ret) - goto out; + goto out_put_group; btrfs_release_path(path); } @@ -1102,9 +1102,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, ret = remove_block_group_free_space(trans, block_group); if (ret) - goto out; + goto out_put_group; - btrfs_put_block_group(block_group); + /* Once for the block groups rbtree */ btrfs_put_block_group(block_group); ret = btrfs_search_slot(trans, root, &key, path, -1, 1); @@ -1127,6 +1127,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, /* once for the tree */ free_extent_map(em); } + +out_put_group: + /* Once for the lookup reference */ + btrfs_put_block_group(block_group); out: if (remove_rsv) btrfs_delayed_refs_rsv_release(fs_info, 1); @@ -1288,11 +1292,15 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans, if (ret) goto err; mutex_unlock(&fs_info->unused_bg_unpin_mutex); + if (prev_trans) + btrfs_put_transaction(prev_trans); return true; err: mutex_unlock(&fs_info->unused_bg_unpin_mutex); + if (prev_trans) + btrfs_put_transaction(prev_trans); btrfs_dec_block_group_ro(bg); return false; } diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h index 21a15776dac4..353228d62f5a 100644 --- a/fs/btrfs/discard.h +++ b/fs/btrfs/discard.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef BTRFS_DISCARD_H #define BTRFS_DISCARD_H diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a6cb5cbbdb9f..7278789ff8a7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -980,9 +980,7 @@ static void btree_invalidatepage(struct page *page, unsigned int offset, btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info, "page private not zero on page %llu", (unsigned long long)page_offset(page)); - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); + detach_page_private(page); } } @@ -2036,9 +2034,6 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) for (i = 0; i < ret; i++) btrfs_drop_and_free_fs_root(fs_info, gang[i]); } - - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) - btrfs_free_log_root_tree(NULL, fs_info); } static void btrfs_init_scrub(struct btrfs_fs_info *fs_info) @@ -3888,7 +3883,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, spin_unlock(&fs_info->fs_roots_radix_lock); if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { - btrfs_free_log(NULL, root); + ASSERT(root->log_root == NULL); if (root->reloc_root) { btrfs_put_root(root->reloc_root); root->reloc_root = NULL; @@ -4211,6 +4206,36 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) up_write(&fs_info->cleanup_work_sem); } +static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *gang[8]; + u64 root_objectid = 0; + int ret; + + spin_lock(&fs_info->fs_roots_radix_lock); + while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, root_objectid, + ARRAY_SIZE(gang))) != 0) { + int i; + + for (i = 0; i < ret; i++) + gang[i] = btrfs_grab_root(gang[i]); + spin_unlock(&fs_info->fs_roots_radix_lock); + + for (i = 0; i < ret; i++) { + if (!gang[i]) + continue; + root_objectid = gang[i]->root_key.objectid; + btrfs_free_log(NULL, gang[i]); + btrfs_put_root(gang[i]); + } + root_objectid++; + spin_lock(&fs_info->fs_roots_radix_lock); + } + spin_unlock(&fs_info->fs_roots_radix_lock); + btrfs_free_log_root_tree(NULL, fs_info); +} + static void btrfs_destroy_ordered_extents(struct btrfs_root *root) { struct btrfs_ordered_extent *ordered; @@ -4603,6 +4628,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) btrfs_destroy_delayed_inodes(fs_info); btrfs_assert_delayed_root_empty(fs_info); btrfs_destroy_all_delalloc_inodes(fs_info); + btrfs_drop_all_logs(fs_info); mutex_unlock(&fs_info->transaction_kthread_mutex); return 0; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 39e45b8a5031..e12eb32d9e17 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3076,22 +3076,16 @@ static int submit_extent_page(unsigned int opf, static void attach_extent_buffer_page(struct extent_buffer *eb, struct page *page) { - if (!PagePrivate(page)) { - SetPagePrivate(page); - get_page(page); - set_page_private(page, (unsigned long)eb); - } else { + if (!PagePrivate(page)) + attach_page_private(page, eb); + else WARN_ON(page->private != (unsigned long)eb); - } } void set_page_extent_mapped(struct page *page) { - if (!PagePrivate(page)) { - SetPagePrivate(page); - get_page(page); - set_page_private(page, EXTENT_PAGE_PRIVATE); - } + if (!PagePrivate(page)) + attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE); } static struct extent_map * @@ -4367,51 +4361,32 @@ int extent_writepages(struct address_space *mapping, return ret; } -int extent_readpages(struct address_space *mapping, struct list_head *pages, - unsigned nr_pages) +void extent_readahead(struct readahead_control *rac) { struct bio *bio = NULL; unsigned long bio_flags = 0; struct page *pagepool[16]; struct extent_map *em_cached = NULL; - int nr = 0; u64 prev_em_start = (u64)-1; + int nr; - while (!list_empty(pages)) { - u64 contig_end = 0; - - for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) { - struct page *page = lru_to_page(pages); - - prefetchw(&page->flags); - list_del(&page->lru); - if (add_to_page_cache_lru(page, mapping, page->index, - readahead_gfp_mask(mapping))) { - put_page(page); - break; - } - - pagepool[nr++] = page; - contig_end = page_offset(page) + PAGE_SIZE - 1; - } + while ((nr = readahead_page_batch(rac, pagepool))) { + u64 contig_start = page_offset(pagepool[0]); + u64 contig_end = page_offset(pagepool[nr - 1]) + PAGE_SIZE - 1; - if (nr) { - u64 contig_start = page_offset(pagepool[0]); + ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end); - ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end); - - contiguous_readpages(pagepool, nr, contig_start, - contig_end, &em_cached, &bio, &bio_flags, - &prev_em_start); - } + contiguous_readpages(pagepool, nr, contig_start, contig_end, + &em_cached, &bio, &bio_flags, &prev_em_start); } if (em_cached) free_extent_map(em_cached); - if (bio) - return submit_one_bio(bio, 0, bio_flags); - return 0; + if (bio) { + if (submit_one_bio(bio, 0, bio_flags)) + return; + } } /* @@ -4929,10 +4904,7 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) * We need to make sure we haven't be attached * to a new eb. */ - ClearPagePrivate(page); - set_page_private(page, 0); - /* One for the page private */ - put_page(page); + detach_page_private(page); } if (mapped) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 2ed65bd0760e..25594e09fdcd 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -198,8 +198,7 @@ int extent_writepages(struct address_space *mapping, struct writeback_control *wbc); int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc); -int extent_readpages(struct address_space *mapping, struct list_head *pages, - unsigned nr_pages); +void extent_readahead(struct readahead_control *rac); int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); void set_page_extent_mapped(struct page *page); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 320d1062068d..8b3489f229c7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4856,8 +4856,8 @@ static void evict_inode_truncate_pages(struct inode *inode) /* * Keep looping until we have no more ranges in the io tree. - * We can have ongoing bios started by readpages (called from readahead) - * that have their endio callback (extent_io.c:end_bio_extent_readpage) + * We can have ongoing bios started by readahead that have + * their endio callback (extent_io.c:end_bio_extent_readpage) * still in progress (unlocked the pages in the bio but did not yet * unlocked the ranges in the io tree). Therefore this means some * ranges can still be locked and eviction started because before @@ -7050,11 +7050,11 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, * for it to complete) and then invalidate the pages for * this range (through invalidate_inode_pages2_range()), * but that can lead us to a deadlock with a concurrent - * call to readpages() (a buffered read or a defrag call + * call to readahead (a buffered read or a defrag call * triggered a readahead) on a page lock due to an * ordered dio extent we created before but did not have * yet a corresponding bio submitted (whence it can not - * complete), which makes readpages() wait for that + * complete), which makes readahead wait for that * ordered extent to complete while holding a lock on * that page. */ @@ -8293,21 +8293,16 @@ static int btrfs_writepages(struct address_space *mapping, return extent_writepages(mapping, wbc); } -static int -btrfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void btrfs_readahead(struct readahead_control *rac) { - return extent_readpages(mapping, pages, nr_pages); + extent_readahead(rac); } static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) { int ret = try_release_extent_mapping(page, gfp_flags); - if (ret == 1) { - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); - } + if (ret == 1) + detach_page_private(page); return ret; } @@ -8329,14 +8324,8 @@ static int btrfs_migratepage(struct address_space *mapping, if (ret != MIGRATEPAGE_SUCCESS) return ret; - if (page_has_private(page)) { - ClearPagePrivate(page); - get_page(newpage); - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); - put_page(page); - SetPagePrivate(newpage); - } + if (page_has_private(page)) + attach_page_private(newpage, detach_page_private(page)); if (PagePrivate2(page)) { ClearPagePrivate2(page); @@ -8458,11 +8447,7 @@ again: } ClearPageChecked(page); - if (PagePrivate(page)) { - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); - } + detach_page_private(page); } /* @@ -10553,7 +10538,7 @@ static const struct address_space_operations btrfs_aops = { .readpage = btrfs_readpage, .writepage = btrfs_writepage, .writepages = btrfs_writepages, - .readpages = btrfs_readpages, + .readahead = btrfs_readahead, .direct_IO = btrfs_direct_IO, .invalidatepage = btrfs_invalidatepage, .releasepage = btrfs_releasepage, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index d35936c934ab..03bc7134e8cb 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4559,6 +4559,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) if (IS_ERR(fs_root)) { err = PTR_ERR(fs_root); list_add_tail(&reloc_root->root_list, &reloc_roots); + btrfs_end_transaction(trans); goto out_unset; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index c5f41bd86765..6a92ecf9eaa2 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -7065,13 +7065,6 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) goto out; } - if (!access_ok(arg->clone_sources, - sizeof(*arg->clone_sources) * - arg->clone_sources_count)) { - ret = -EFAULT; - goto out; - } - if (arg->flags & ~BTRFS_SEND_FLAG_MASK) { ret = -EINVAL; goto out; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 8cede6eb9843..2d5498136e5e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -662,10 +662,19 @@ again: } got_it: - btrfs_record_root_in_trans(h, root); - if (!current->journal_info) current->journal_info = h; + + /* + * btrfs_record_root_in_trans() needs to alloc new extents, and may + * call btrfs_join_transaction() while we're also starting a + * transaction. + * + * Thus it need to be called after current->journal_info initialized, + * or we can deadlock. + */ + btrfs_record_root_in_trans(h, root); + return h; join_fail: diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ec36a7c6ba3d..02ebdd9edc19 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4226,6 +4226,9 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, const u64 ino = btrfs_ino(inode); struct btrfs_path *dst_path = NULL; bool dropped_extents = false; + u64 truncate_offset = i_size; + struct extent_buffer *leaf; + int slot; int ins_nr = 0; int start_slot; int ret; @@ -4240,9 +4243,43 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, if (ret < 0) goto out; + /* + * We must check if there is a prealloc extent that starts before the + * i_size and crosses the i_size boundary. This is to ensure later we + * truncate down to the end of that extent and not to the i_size, as + * otherwise we end up losing part of the prealloc extent after a log + * replay and with an implicit hole if there is another prealloc extent + * that starts at an offset beyond i_size. + */ + ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY); + if (ret < 0) + goto out; + + if (ret == 0) { + struct btrfs_file_extent_item *ei; + + leaf = path->nodes[0]; + slot = path->slots[0]; + ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, ei) == + BTRFS_FILE_EXTENT_PREALLOC) { + u64 extent_end; + + btrfs_item_key_to_cpu(leaf, &key, slot); + extent_end = key.offset + + btrfs_file_extent_num_bytes(leaf, ei); + + if (extent_end > i_size) + truncate_offset = extent_end; + } + } else { + ret = 0; + } + while (true) { - struct extent_buffer *leaf = path->nodes[0]; - int slot = path->slots[0]; + leaf = path->nodes[0]; + slot = path->slots[0]; if (slot >= btrfs_header_nritems(leaf)) { if (ins_nr > 0) { @@ -4280,7 +4317,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, ret = btrfs_truncate_inode_items(trans, root->log_root, &inode->vfs_inode, - i_size, + truncate_offset, BTRFS_EXTENT_DATA_KEY); } while (ret == -EAGAIN); if (ret) diff --git a/fs/buffer.c b/fs/buffer.c index a60f60396cfa..64fe82ec65ff 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -123,14 +123,6 @@ void __wait_on_buffer(struct buffer_head * bh) } EXPORT_SYMBOL(__wait_on_buffer); -static void -__clear_page_buffers(struct page *page) -{ - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); -} - static void buffer_io_error(struct buffer_head *bh, char *msg) { if (!test_bit(BH_Quiet, &bh->b_state)) @@ -906,7 +898,7 @@ link_dev_buffers(struct page *page, struct buffer_head *head) bh = bh->b_this_page; } while (bh); tail->b_this_page = head; - attach_page_buffers(page, head); + attach_page_private(page, head); } static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size) @@ -1154,12 +1146,19 @@ EXPORT_SYMBOL(mark_buffer_dirty); void mark_buffer_write_io_error(struct buffer_head *bh) { + struct super_block *sb; + set_buffer_write_io_error(bh); /* FIXME: do we need to set this in both places? */ if (bh->b_page && bh->b_page->mapping) mapping_set_error(bh->b_page->mapping, -EIO); if (bh->b_assoc_map) mapping_set_error(bh->b_assoc_map, -EIO); + rcu_read_lock(); + sb = READ_ONCE(bh->b_bdev->bd_super); + if (sb) + errseq_set(&sb->s_wb_err, -EIO); + rcu_read_unlock(); } EXPORT_SYMBOL(mark_buffer_write_io_error); @@ -1580,7 +1579,7 @@ void create_empty_buffers(struct page *page, bh = bh->b_this_page; } while (bh != head); } - attach_page_buffers(page, head); + attach_page_private(page, head); spin_unlock(&page->mapping->private_lock); } EXPORT_SYMBOL(create_empty_buffers); @@ -2567,7 +2566,7 @@ static void attach_nobh_buffers(struct page *page, struct buffer_head *head) bh->b_this_page = head; bh = bh->b_this_page; } while (bh != head); - attach_page_buffers(page, head); + attach_page_private(page, head); spin_unlock(&page->mapping->private_lock); } @@ -3227,7 +3226,7 @@ drop_buffers(struct page *page, struct buffer_head **buffers_to_free) bh = next; } while (bh != head); *buffers_to_free = head; - __clear_page_buffers(page); + detach_page_private(page); return 1; failed: return 0; diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig index ae559ed5b3b3..ff9ca55a9ae9 100644 --- a/fs/cachefiles/Kconfig +++ b/fs/cachefiles/Kconfig @@ -8,7 +8,7 @@ config CACHEFILES filesystems - primarily networking filesystems - thus allowing fast local disk to enhance the speed of slower devices. - See Documentation/filesystems/caching/cachefiles.txt for more + See Documentation/filesystems/caching/cachefiles.rst for more information. config CACHEFILES_DEBUG @@ -36,5 +36,5 @@ config CACHEFILES_HISTOGRAM bouncing between CPUs. On the other hand, the histogram may be useful for debugging purposes. Saying 'N' here is recommended. - See Documentation/filesystems/caching/cachefiles.txt for more + See Documentation/filesystems/caching/cachefiles.rst for more information. diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 1dc97f2d6201..e7726f5f1241 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -60,9 +60,9 @@ static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode, object = container_of(op->op.object, struct cachefiles_object, fscache); spin_lock(&object->work_lock); list_add_tail(&monitor->op_link, &op->to_do); + fscache_enqueue_retrieval(op); spin_unlock(&object->work_lock); - fscache_enqueue_retrieval(op); fscache_put_retrieval(op); return 0; } @@ -398,7 +398,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, struct inode *inode; sector_t block; unsigned shift; - int ret; + int ret, ret2; object = container_of(op->op.object, struct cachefiles_object, fscache); @@ -430,8 +430,8 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, block = page->index; block <<= shift; - ret = bmap(inode, &block); - ASSERT(ret < 0); + ret2 = bmap(inode, &block); + ASSERT(ret2 == 0); _debug("%llx -> %llx", (unsigned long long) (page->index << shift), @@ -739,8 +739,8 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op, block = page->index; block <<= shift; - ret = bmap(inode, &block); - ASSERT(!ret); + ret2 = bmap(inode, &block); + ASSERT(ret2 == 0); _debug("%llx -> %llx", (unsigned long long) (page->index << shift), diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 185db76300b3..f1acde6fb9a6 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2749,7 +2749,7 @@ int ceph_try_get_caps(struct inode *inode, int need, int want, ret = try_get_cap_refs(inode, need, want, 0, flags, got); /* three special error codes */ - if (ret == -EAGAIN || ret == -EFBIG || ret == -EAGAIN) + if (ret == -EAGAIN || ret == -EFBIG || ret == -ESTALE) ret = 0; return ret; } @@ -3746,6 +3746,7 @@ retry: WARN_ON(1); tsession = NULL; target = -1; + mutex_lock(&session->s_mutex); } goto retry; @@ -3990,7 +3991,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, __ceph_queue_cap_release(session, cap); spin_unlock(&session->s_cap_lock); } - goto done; + goto flush_cap_releases; } /* these will work even if we don't have a cap yet */ diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 486f91f9685b..7c63abf5bea9 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3251,8 +3251,7 @@ static void handle_session(struct ceph_mds_session *session, void *end = p + msg->front.iov_len; struct ceph_mds_session_head *h; u32 op; - u64 seq; - unsigned long features = 0; + u64 seq, features = 0; int wake = 0; bool blacklisted = false; @@ -3271,9 +3270,8 @@ static void handle_session(struct ceph_mds_session *session, goto bad; /* version >= 3, feature bits */ ceph_decode_32_safe(&p, end, len, bad); - ceph_decode_need(&p, end, len, bad); - memcpy(&features, p, min_t(size_t, len, sizeof(features))); - p += len; + ceph_decode_64_safe(&p, end, features, bad); + p += len - sizeof(features); } mutex_lock(&mdsc->mutex); diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index de56dee60540..19507e2fdb57 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -159,8 +159,8 @@ static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc, } if (IS_ERR(in)) { - pr_warn("Can't lookup inode %llx (err: %ld)\n", - realm->ino, PTR_ERR(in)); + dout("Can't lookup inode %llx (err: %ld)\n", + realm->ino, PTR_ERR(in)); qri->timeout = jiffies + msecs_to_jiffies(60 * 1000); /* XXX */ } else { qri->timeout = 0; diff --git a/fs/char_dev.c b/fs/char_dev.c index c5e6eff5a381..ba0ded7842a7 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -483,6 +483,9 @@ int cdev_add(struct cdev *p, dev_t dev, unsigned count) p->dev = dev; p->count = count; + if (WARN_ON(dev == WHITEOUT_DEV)) + return -EBUSY; + error = kobj_map(cdev_map, dev, count, NULL, exact_match, exact_lock, p); if (error) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 05dd3dea684b..39b708d9d86d 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1891,7 +1891,8 @@ GLOBAL_EXTERN struct list_head cifs_tcp_ses_list; /* * This lock protects the cifs_tcp_ses_list, the list of smb sessions per * tcp session, and the list of tcon's per smb session. It also protects - * the reference counters for the server, smb session, and tcon. Finally, + * the reference counters for the server, smb session, and tcon. It also + * protects some fields in the TCP_Server_Info struct such as dstaddr. Finally, * changes to the tcon->tidStatus should be done while holding this lock. * generally the locks should be taken in order tcp_ses_lock before * tcon->open_file_lock and that before file->file_info_lock since the diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 182b864b3075..5014a82391ff 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -2152,8 +2152,8 @@ cifs_writev_requeue(struct cifs_writedata *wdata) } } + kref_put(&wdata2->refcount, cifs_writedata_release); if (rc) { - kref_put(&wdata2->refcount, cifs_writedata_release); if (is_retryable_error(rc)) continue; i += nr_pages; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 95b3ab0ca8c0..28268ed461b8 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -375,8 +375,10 @@ static int reconn_set_ipaddr(struct TCP_Server_Info *server) return rc; } + spin_lock(&cifs_tcp_ses_lock); rc = cifs_convert_address((struct sockaddr *)&server->dstaddr, ipaddr, strlen(ipaddr)); + spin_unlock(&cifs_tcp_ses_lock); kfree(ipaddr); return !rc ? -1 : 0; @@ -3373,6 +3375,10 @@ cifs_find_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) spin_lock(&cifs_tcp_ses_lock); list_for_each(tmp, &ses->tcon_list) { tcon = list_entry(tmp, struct cifs_tcon, tcon_list); +#ifdef CONFIG_CIFS_DFS_UPCALL + if (tcon->dfs_path) + continue; +#endif if (!match_tcon(tcon, volume_info)) continue; ++tcon->tc_count; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 0b1528edebcf..75ddce8ef456 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -4060,7 +4060,7 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset) * than it negotiated since it will refuse the read * then. */ - if ((tcon->ses) && !(tcon->ses->capabilities & + if (!(tcon->ses->capabilities & tcon->ses->server->vals->cap_large_files)) { current_read_size = min_t(uint, current_read_size, CIFSMaxBufSize); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 390d2b15ef6e..5d2965a23730 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -730,7 +730,7 @@ static __u64 simple_hashstr(const char *str) * cifs_backup_query_path_info - SMB1 fallback code to get ino * * Fallback code to get file metadata when we don't have access to - * @full_path (EACCESS) and have backup creds. + * @full_path (EACCES) and have backup creds. * * @data will be set to search info result buffer * @resp_buf will be set to cifs resp buf and needs to be freed with diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index a456febd4109..550ce9020a3e 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -1025,51 +1025,99 @@ int copy_path_name(char *dst, const char *src) } struct super_cb_data { - struct TCP_Server_Info *server; + void *data; struct super_block *sb; }; -static void super_cb(struct super_block *sb, void *arg) +static void tcp_super_cb(struct super_block *sb, void *arg) { - struct super_cb_data *d = arg; + struct super_cb_data *sd = arg; + struct TCP_Server_Info *server = sd->data; struct cifs_sb_info *cifs_sb; struct cifs_tcon *tcon; - if (d->sb) + if (sd->sb) return; cifs_sb = CIFS_SB(sb); tcon = cifs_sb_master_tcon(cifs_sb); - if (tcon->ses->server == d->server) - d->sb = sb; + if (tcon->ses->server == server) + sd->sb = sb; } -struct super_block *cifs_get_tcp_super(struct TCP_Server_Info *server) +static struct super_block *__cifs_get_super(void (*f)(struct super_block *, void *), + void *data) { - struct super_cb_data d = { - .server = server, + struct super_cb_data sd = { + .data = data, .sb = NULL, }; - iterate_supers_type(&cifs_fs_type, super_cb, &d); + iterate_supers_type(&cifs_fs_type, f, &sd); - if (unlikely(!d.sb)) - return ERR_PTR(-ENOENT); + if (!sd.sb) + return ERR_PTR(-EINVAL); /* * Grab an active reference in order to prevent automounts (DFS links) * of expiring and then freeing up our cifs superblock pointer while * we're doing failover. */ - cifs_sb_active(d.sb); - return d.sb; + cifs_sb_active(sd.sb); + return sd.sb; } -void cifs_put_tcp_super(struct super_block *sb) +static void __cifs_put_super(struct super_block *sb) { if (!IS_ERR_OR_NULL(sb)) cifs_sb_deactive(sb); } +struct super_block *cifs_get_tcp_super(struct TCP_Server_Info *server) +{ + return __cifs_get_super(tcp_super_cb, server); +} + +void cifs_put_tcp_super(struct super_block *sb) +{ + __cifs_put_super(sb); +} + +#ifdef CONFIG_CIFS_DFS_UPCALL +static void tcon_super_cb(struct super_block *sb, void *arg) +{ + struct super_cb_data *sd = arg; + struct cifs_tcon *tcon = sd->data; + struct cifs_sb_info *cifs_sb; + + if (sd->sb) + return; + + cifs_sb = CIFS_SB(sb); + if (tcon->dfs_path && cifs_sb->origin_fullpath && + !strcasecmp(tcon->dfs_path, cifs_sb->origin_fullpath)) + sd->sb = sb; +} + +static inline struct super_block *cifs_get_tcon_super(struct cifs_tcon *tcon) +{ + return __cifs_get_super(tcon_super_cb, tcon); +} + +static inline void cifs_put_tcon_super(struct super_block *sb) +{ + __cifs_put_super(sb); +} +#else +static inline struct super_block *cifs_get_tcon_super(struct cifs_tcon *tcon) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void cifs_put_tcon_super(struct super_block *sb) +{ +} +#endif + int update_super_prepath(struct cifs_tcon *tcon, const char *prefix, size_t prefix_len) { @@ -1077,7 +1125,7 @@ int update_super_prepath(struct cifs_tcon *tcon, const char *prefix, struct cifs_sb_info *cifs_sb; int rc = 0; - sb = cifs_get_tcp_super(tcon->ses->server); + sb = cifs_get_tcon_super(tcon); if (IS_ERR(sb)) return PTR_ERR(sb); @@ -1099,6 +1147,6 @@ int update_super_prepath(struct cifs_tcon *tcon, const char *prefix, cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH; out: - cifs_put_tcp_super(sb); + cifs_put_tcon_super(sb); return rc; } diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index b36c46f48705..f829f4165d38 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -687,6 +687,11 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; + if (!server->ops->new_lease_key) + return -EIO; + + server->ops->new_lease_key(pfid); + memset(rqst, 0, sizeof(rqst)); resp_buftype[0] = resp_buftype[1] = CIFS_NO_BUFFER; memset(rsp_iov, 0, sizeof(rsp_iov)); diff --git a/fs/coda/Kconfig b/fs/coda/Kconfig index ae6759f9594a..c3477eeafb3f 100644 --- a/fs/coda/Kconfig +++ b/fs/coda/Kconfig @@ -15,7 +15,7 @@ config CODA_FS *client*. You will need user level code as well, both for the client and server. Servers are currently user level, i.e. they need no kernel support. Please read - <file:Documentation/filesystems/coda.txt> and check out the Coda + <file:Documentation/filesystems/coda.rst> and check out the Coda home page <http://www.coda.cs.cmu.edu/>. To compile the coda client support as a module, choose M here: the diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index aaad4ca1217e..e61f3fe8e32a 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -17,6 +17,8 @@ #include <linux/elfcore-compat.h> #include <linux/time.h> +#define ELF_COMPAT 1 + /* * Rename the basic ELF layout types to refer to the 32-bit class of files. */ @@ -28,18 +30,20 @@ #undef elf_shdr #undef elf_note #undef elf_addr_t +#undef ELF_GNU_PROPERTY_ALIGN #define elfhdr elf32_hdr #define elf_phdr elf32_phdr #define elf_shdr elf32_shdr #define elf_note elf32_note #define elf_addr_t Elf32_Addr +#define ELF_GNU_PROPERTY_ALIGN ELF32_GNU_PROPERTY_ALIGN /* * Some data types as stored in coredump. */ #define user_long_t compat_long_t #define user_siginfo_t compat_siginfo_t -#define copy_siginfo_to_user copy_siginfo_to_user32 +#define copy_siginfo_to_external copy_siginfo_to_external32 /* * The machine-dependent core note format types are defined in elfcore-compat.h, diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index cf7b7e1d5bd7..cb733652ecca 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1519,6 +1519,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) spin_lock(&configfs_dirent_lock); configfs_detach_rollback(dentry); spin_unlock(&configfs_dirent_lock); + config_item_put(parent_item); return -EINTR; } frag->frag_dead = true; diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index fd0b5dd68f9e..8bd6a883c94c 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -9,7 +9,7 @@ * * configfs Copyright (C) 2005 Oracle. All rights reserved. * - * Please see Documentation/filesystems/configfs/configfs.txt for more + * Please see Documentation/filesystems/configfs.rst for more * information. */ diff --git a/fs/configfs/item.c b/fs/configfs/item.c index 6e0f1fcb8a5b..704a4356f137 100644 --- a/fs/configfs/item.c +++ b/fs/configfs/item.c @@ -9,7 +9,7 @@ * * configfs Copyright (C) 2005 Oracle. All rights reserved. * - * Please see the file Documentation/filesystems/configfs/configfs.txt for + * Please see the file Documentation/filesystems/configfs.rst for * critical information about using the config_item interface. */ diff --git a/fs/coredump.c b/fs/coredump.c index f8296a82d01d..478a0d810136 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -211,6 +211,8 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm, return -ENOMEM; (*argv)[(*argc)++] = 0; ++pat_ptr; + if (!(*pat_ptr)) + return -ENOMEM; } /* Repeat as long as we have more pattern to process and more output @@ -786,6 +788,14 @@ void do_coredump(const kernel_siginfo_t *siginfo) if (displaced) put_files_struct(displaced); if (!dump_interrupted()) { + /* + * umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would + * have this set to NULL. + */ + if (!cprm.file) { + pr_info("Core dump to |%s disabled\n", cn.corename); + goto close_fail; + } file_start_write(cprm.file); core_dumped = binfmt->core_dump(&cprm); file_end_write(cprm.file); diff --git a/fs/cramfs/Kconfig b/fs/cramfs/Kconfig index c8bebb70a971..d98cef0dbb6b 100644 --- a/fs/cramfs/Kconfig +++ b/fs/cramfs/Kconfig @@ -9,7 +9,7 @@ config CRAMFS limited to 256MB file systems (with 16MB files), and doesn't support 16/32 bits uid/gid, hard links and timestamps. - See <file:Documentation/filesystems/cramfs.txt> and + See <file:Documentation/filesystems/cramfs.rst> and <file:fs/cramfs/README> for further information. To compile this as a module, choose M here: the module will be called diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 1ecaac7ee3cb..ed015cb66c7c 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -54,6 +54,7 @@ struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags) /** * fscrypt_free_bounce_page() - free a ciphertext bounce page + * @bounce_page: the bounce page to free, or NULL * * Free a bounce page that was allocated by fscrypt_encrypt_pagecache_blocks(), * or by fscrypt_alloc_bounce_page() directly. @@ -76,8 +77,12 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num, memset(iv, 0, ci->ci_mode->ivsize); if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) { - WARN_ON_ONCE((u32)lblk_num != lblk_num); + WARN_ON_ONCE(lblk_num > U32_MAX); + WARN_ON_ONCE(ci->ci_inode->i_ino > U32_MAX); lblk_num |= (u64)ci->ci_inode->i_ino << 32; + } else if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) { + WARN_ON_ONCE(lblk_num > U32_MAX); + lblk_num = (u32)(ci->ci_hashed_ino + lblk_num); } else if (flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) { memcpy(iv->nonce, ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE); } @@ -132,7 +137,8 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw, } /** - * fscrypt_encrypt_pagecache_blocks() - Encrypt filesystem blocks from a pagecache page + * fscrypt_encrypt_pagecache_blocks() - Encrypt filesystem blocks from a + * pagecache page * @page: The locked pagecache page containing the block(s) to encrypt * @len: Total size of the block(s) to encrypt. Must be a nonzero * multiple of the filesystem's block size. @@ -222,7 +228,8 @@ int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, EXPORT_SYMBOL(fscrypt_encrypt_block_inplace); /** - * fscrypt_decrypt_pagecache_blocks() - Decrypt filesystem blocks in a pagecache page + * fscrypt_decrypt_pagecache_blocks() - Decrypt filesystem blocks in a + * pagecache page * @page: The locked pagecache page containing the block(s) to decrypt * @len: Total size of the block(s) to decrypt. Must be a nonzero * multiple of the filesystem's block size. @@ -346,6 +353,8 @@ void fscrypt_msg(const struct inode *inode, const char *level, /** * fscrypt_init() - Set up for fs encryption. + * + * Return: 0 on success; -errno on failure */ static int __init fscrypt_init(void) { diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 4c212442a8f7..83ca5f1e7934 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -18,7 +18,7 @@ #include <crypto/skcipher.h> #include "fscrypt_private.h" -/** +/* * struct fscrypt_nokey_name - identifier for directory entry when key is absent * * When userspace lists an encrypted directory without access to the key, the @@ -83,13 +83,8 @@ static int fscrypt_do_sha256(const u8 *data, unsigned int data_len, u8 *result) tfm = prev_tfm; } } - { - SHASH_DESC_ON_STACK(desc, tfm); - desc->tfm = tfm; - - return crypto_shash_digest(desc, data, data_len, result); - } + return crypto_shash_tfm_digest(tfm, data, data_len, result); } static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) @@ -105,9 +100,12 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) /** * fscrypt_fname_encrypt() - encrypt a filename - * - * The output buffer must be at least as large as the input buffer. - * Any extra space is filled with NUL padding before encryption. + * @inode: inode of the parent directory (for regular filenames) + * or of the symlink (for symlink targets) + * @iname: the filename to encrypt + * @out: (output) the encrypted filename + * @olen: size of the encrypted filename. It must be at least @iname->len. + * Any extra space is filled with NUL padding before encryption. * * Return: 0 on success, -errno on failure */ @@ -157,8 +155,11 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, /** * fname_decrypt() - decrypt a filename - * - * The caller must have allocated sufficient memory for the @oname string. + * @inode: inode of the parent directory (for regular filenames) + * or of the symlink (for symlink targets) + * @iname: the encrypted filename to decrypt + * @oname: (output) the decrypted filename. The caller must have allocated + * enough space for this, e.g. using fscrypt_fname_alloc_buffer(). * * Return: 0 on success, -errno on failure */ @@ -206,7 +207,10 @@ static const char lookup_table[65] = #define BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) /** - * base64_encode() - + * base64_encode() - base64-encode some bytes + * @src: the bytes to encode + * @len: number of bytes to encode + * @dst: (output) the base64-encoded string. Not NUL-terminated. * * Encodes the input string using characters from the set [A-Za-z0-9+,]. * The encoded string is roughly 4/3 times the size of the input string. @@ -272,7 +276,12 @@ bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, } /** - * fscrypt_fname_alloc_buffer - allocate a buffer for presented filenames + * fscrypt_fname_alloc_buffer() - allocate a buffer for presented filenames + * @inode: inode of the parent directory (for regular filenames) + * or of the symlink (for symlink targets) + * @max_encrypted_len: maximum length of encrypted filenames the buffer will be + * used to present + * @crypto_str: (output) buffer to allocate * * Allocate a buffer that is large enough to hold any decrypted or encoded * filename (null-terminated), for the given maximum encrypted filename length. @@ -297,9 +306,10 @@ int fscrypt_fname_alloc_buffer(const struct inode *inode, EXPORT_SYMBOL(fscrypt_fname_alloc_buffer); /** - * fscrypt_fname_free_buffer - free the buffer for presented filenames + * fscrypt_fname_free_buffer() - free a buffer for presented filenames + * @crypto_str: the buffer to free * - * Free the buffer allocated by fscrypt_fname_alloc_buffer(). + * Free a buffer that was allocated by fscrypt_fname_alloc_buffer(). */ void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str) { @@ -311,10 +321,19 @@ void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str) EXPORT_SYMBOL(fscrypt_fname_free_buffer); /** - * fscrypt_fname_disk_to_usr() - converts a filename from disk space to user - * space - * - * The caller must have allocated sufficient memory for the @oname string. + * fscrypt_fname_disk_to_usr() - convert an encrypted filename to + * user-presentable form + * @inode: inode of the parent directory (for regular filenames) + * or of the symlink (for symlink targets) + * @hash: first part of the name's dirhash, if applicable. This only needs to + * be provided if the filename is located in an indexed directory whose + * encryption key may be unavailable. Not needed for symlink targets. + * @minor_hash: second part of the name's dirhash, if applicable + * @iname: encrypted filename to convert. May also be "." or "..", which + * aren't actually encrypted. + * @oname: output buffer for the user-presentable filename. The caller must + * have allocated enough space for this, e.g. using + * fscrypt_fname_alloc_buffer(). * * If the key is available, we'll decrypt the disk name. Otherwise, we'll * encode it for presentation in fscrypt_nokey_name format. diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index dbced2937ec8..eb7fcd2b7fb8 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -43,7 +43,7 @@ struct fscrypt_context_v2 { u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; }; -/** +/* * fscrypt_context - the encryption context of an inode * * This is the on-disk equivalent of an fscrypt_policy, stored alongside each @@ -157,7 +157,7 @@ fscrypt_policy_flags(const union fscrypt_policy *policy) BUG(); } -/** +/* * For encrypted symlinks, the ciphertext length is stored at the beginning * of the string in little-endian format. */ @@ -222,6 +222,9 @@ struct fscrypt_info { /* This inode's nonce, copied from the fscrypt_context */ u8 ci_nonce[FS_KEY_DERIVATION_NONCE_SIZE]; + + /* Hashed inode number. Only set for IV_INO_LBLK_32 */ + u32 ci_hashed_ino; }; typedef enum { @@ -231,15 +234,14 @@ typedef enum { /* crypto.c */ extern struct kmem_cache *fscrypt_info_cachep; -extern int fscrypt_initialize(unsigned int cop_flags); -extern int fscrypt_crypt_block(const struct inode *inode, - fscrypt_direction_t rw, u64 lblk_num, - struct page *src_page, struct page *dest_page, - unsigned int len, unsigned int offs, - gfp_t gfp_flags); -extern struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags); - -extern void __printf(3, 4) __cold +int fscrypt_initialize(unsigned int cop_flags); +int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw, + u64 lblk_num, struct page *src_page, + struct page *dest_page, unsigned int len, + unsigned int offs, gfp_t gfp_flags); +struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags); + +void __printf(3, 4) __cold fscrypt_msg(const struct inode *inode, const char *level, const char *fmt, ...); #define fscrypt_warn(inode, fmt, ...) \ @@ -264,12 +266,10 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num, const struct fscrypt_info *ci); /* fname.c */ -extern int fscrypt_fname_encrypt(const struct inode *inode, - const struct qstr *iname, - u8 *out, unsigned int olen); -extern bool fscrypt_fname_encrypted_size(const struct inode *inode, - u32 orig_len, u32 max_len, - u32 *encrypted_len_ret); +int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, + u8 *out, unsigned int olen); +bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, + u32 max_len, u32 *encrypted_len_ret); extern const struct dentry_operations fscrypt_d_ops; /* hkdf.c */ @@ -278,8 +278,8 @@ struct fscrypt_hkdf { struct crypto_shash *hmac_tfm; }; -extern int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, - unsigned int master_key_size); +int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, + unsigned int master_key_size); /* * The list of contexts in which fscrypt uses HKDF. These values are used as @@ -293,12 +293,14 @@ extern int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, #define HKDF_CONTEXT_DIRECT_KEY 3 #define HKDF_CONTEXT_IV_INO_LBLK_64_KEY 4 #define HKDF_CONTEXT_DIRHASH_KEY 5 +#define HKDF_CONTEXT_IV_INO_LBLK_32_KEY 6 +#define HKDF_CONTEXT_INODE_HASH_KEY 7 -extern int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, - const u8 *info, unsigned int infolen, - u8 *okm, unsigned int okmlen); +int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, + const u8 *info, unsigned int infolen, + u8 *okm, unsigned int okmlen); -extern void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf); +void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf); /* keyring.c */ @@ -389,14 +391,17 @@ struct fscrypt_master_key { struct list_head mk_decrypted_inodes; spinlock_t mk_decrypted_inodes_lock; - /* Crypto API transforms for DIRECT_KEY policies, allocated on-demand */ - struct crypto_skcipher *mk_direct_tfms[__FSCRYPT_MODE_MAX + 1]; - /* - * Crypto API transforms for filesystem-layer implementation of - * IV_INO_LBLK_64 policies, allocated on-demand. + * Per-mode encryption keys for the various types of encryption policies + * that use them. Allocated and derived on-demand. */ - struct crypto_skcipher *mk_iv_ino_lblk_64_tfms[__FSCRYPT_MODE_MAX + 1]; + struct crypto_skcipher *mk_direct_keys[__FSCRYPT_MODE_MAX + 1]; + struct crypto_skcipher *mk_iv_ino_lblk_64_keys[__FSCRYPT_MODE_MAX + 1]; + struct crypto_skcipher *mk_iv_ino_lblk_32_keys[__FSCRYPT_MODE_MAX + 1]; + + /* Hash key for inode numbers. Initialized only when needed. */ + siphash_key_t mk_ino_hash_key; + bool mk_ino_hash_key_initialized; } __randomize_layout; @@ -436,14 +441,17 @@ static inline int master_key_spec_len(const struct fscrypt_key_specifier *spec) return 0; } -extern struct key * +struct key * fscrypt_find_master_key(struct super_block *sb, const struct fscrypt_key_specifier *mk_spec); -extern int fscrypt_verify_key_added(struct super_block *sb, - const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); +int fscrypt_add_test_dummy_key(struct super_block *sb, + struct fscrypt_key_specifier *key_spec); + +int fscrypt_verify_key_added(struct super_block *sb, + const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); -extern int __init fscrypt_init_keyring(void); +int __init fscrypt_init_keyring(void); /* keysetup.c */ @@ -457,33 +465,32 @@ struct fscrypt_mode { extern struct fscrypt_mode fscrypt_modes[]; -extern struct crypto_skcipher * -fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key, - const struct inode *inode); +struct crypto_skcipher *fscrypt_allocate_skcipher(struct fscrypt_mode *mode, + const u8 *raw_key, + const struct inode *inode); -extern int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, - const u8 *raw_key); +int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key); -extern int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, - const struct fscrypt_master_key *mk); +int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, + const struct fscrypt_master_key *mk); /* keysetup_v1.c */ -extern void fscrypt_put_direct_key(struct fscrypt_direct_key *dk); +void fscrypt_put_direct_key(struct fscrypt_direct_key *dk); + +int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, + const u8 *raw_master_key); -extern int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, - const u8 *raw_master_key); +int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci); -extern int fscrypt_setup_v1_file_key_via_subscribed_keyrings( - struct fscrypt_info *ci); /* policy.c */ -extern bool fscrypt_policies_equal(const union fscrypt_policy *policy1, - const union fscrypt_policy *policy2); -extern bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, - const struct inode *inode); -extern int fscrypt_policy_from_context(union fscrypt_policy *policy_u, - const union fscrypt_context *ctx_u, - int ctx_size); +bool fscrypt_policies_equal(const union fscrypt_policy *policy1, + const union fscrypt_policy *policy2); +bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, + const struct inode *inode); +int fscrypt_policy_from_context(union fscrypt_policy *policy_u, + const union fscrypt_context *ctx_u, + int ctx_size); #endif /* _FSCRYPT_PRIVATE_H */ diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c index efb95bd19a89..0cba7928446d 100644 --- a/fs/crypto/hkdf.c +++ b/fs/crypto/hkdf.c @@ -44,17 +44,13 @@ static int hkdf_extract(struct crypto_shash *hmac_tfm, const u8 *ikm, unsigned int ikmlen, u8 prk[HKDF_HASHLEN]) { static const u8 default_salt[HKDF_HASHLEN]; - SHASH_DESC_ON_STACK(desc, hmac_tfm); int err; err = crypto_shash_setkey(hmac_tfm, default_salt, HKDF_HASHLEN); if (err) return err; - desc->tfm = hmac_tfm; - err = crypto_shash_digest(desc, ikm, ikmlen, prk); - shash_desc_zero(desc); - return err; + return crypto_shash_tfm_digest(hmac_tfm, ikm, ikmlen, prk); } /* diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 5ef861742921..09fb8aa0f2e9 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -10,7 +10,7 @@ #include "fscrypt_private.h" /** - * fscrypt_file_open - prepare to open a possibly-encrypted regular file + * fscrypt_file_open() - prepare to open a possibly-encrypted regular file * @inode: the inode being opened * @filp: the struct file being set up * @@ -262,7 +262,7 @@ err_free_sd: EXPORT_SYMBOL_GPL(__fscrypt_encrypt_symlink); /** - * fscrypt_get_symlink - get the target of an encrypted symlink + * fscrypt_get_symlink() - get the target of an encrypted symlink * @inode: the symlink inode * @caddr: the on-disk contents of the symlink * @max_size: size of @caddr buffer diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index ab41b25d4fa1..e24eb48bfbe1 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -20,6 +20,7 @@ #include <crypto/skcipher.h> #include <linux/key-type.h> +#include <linux/random.h> #include <linux/seq_file.h> #include "fscrypt_private.h" @@ -44,8 +45,9 @@ static void free_master_key(struct fscrypt_master_key *mk) wipe_master_key_secret(&mk->mk_secret); for (i = 0; i <= __FSCRYPT_MODE_MAX; i++) { - crypto_free_skcipher(mk->mk_direct_tfms[i]); - crypto_free_skcipher(mk->mk_iv_ino_lblk_64_tfms[i]); + crypto_free_skcipher(mk->mk_direct_keys[i]); + crypto_free_skcipher(mk->mk_iv_ino_lblk_64_keys[i]); + crypto_free_skcipher(mk->mk_iv_ino_lblk_32_keys[i]); } key_put(mk->mk_users); @@ -424,9 +426,9 @@ static int add_existing_master_key(struct fscrypt_master_key *mk, return 0; } -static int add_master_key(struct super_block *sb, - struct fscrypt_master_key_secret *secret, - const struct fscrypt_key_specifier *mk_spec) +static int do_add_master_key(struct super_block *sb, + struct fscrypt_master_key_secret *secret, + const struct fscrypt_key_specifier *mk_spec) { static DEFINE_MUTEX(fscrypt_add_key_mutex); struct key *key; @@ -465,6 +467,35 @@ out_unlock: return err; } +static int add_master_key(struct super_block *sb, + struct fscrypt_master_key_secret *secret, + struct fscrypt_key_specifier *key_spec) +{ + int err; + + if (key_spec->type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) { + err = fscrypt_init_hkdf(&secret->hkdf, secret->raw, + secret->size); + if (err) + return err; + + /* + * Now that the HKDF context is initialized, the raw key is no + * longer needed. + */ + memzero_explicit(secret->raw, secret->size); + + /* Calculate the key identifier */ + err = fscrypt_hkdf_expand(&secret->hkdf, + HKDF_CONTEXT_KEY_IDENTIFIER, NULL, 0, + key_spec->u.identifier, + FSCRYPT_KEY_IDENTIFIER_SIZE); + if (err) + return err; + } + return do_add_master_key(sb, secret, key_spec); +} + static int fscrypt_provisioning_key_preparse(struct key_preparsed_payload *prep) { const struct fscrypt_provisioning_key_payload *payload = prep->data; @@ -609,6 +640,15 @@ int fscrypt_ioctl_add_key(struct file *filp, void __user *_uarg) if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved))) return -EINVAL; + /* + * Only root can add keys that are identified by an arbitrary descriptor + * rather than by a cryptographic hash --- since otherwise a malicious + * user could add the wrong key. + */ + if (arg.key_spec.type == FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR && + !capable(CAP_SYS_ADMIN)) + return -EACCES; + memset(&secret, 0, sizeof(secret)); if (arg.key_id) { if (arg.raw_size != 0) @@ -626,48 +666,17 @@ int fscrypt_ioctl_add_key(struct file *filp, void __user *_uarg) goto out_wipe_secret; } - switch (arg.key_spec.type) { - case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: - /* - * Only root can add keys that are identified by an arbitrary - * descriptor rather than by a cryptographic hash --- since - * otherwise a malicious user could add the wrong key. - */ - err = -EACCES; - if (!capable(CAP_SYS_ADMIN)) - goto out_wipe_secret; - break; - case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER: - err = fscrypt_init_hkdf(&secret.hkdf, secret.raw, secret.size); - if (err) - goto out_wipe_secret; - - /* - * Now that the HKDF context is initialized, the raw key is no - * longer needed. - */ - memzero_explicit(secret.raw, secret.size); - - /* Calculate the key identifier and return it to userspace. */ - err = fscrypt_hkdf_expand(&secret.hkdf, - HKDF_CONTEXT_KEY_IDENTIFIER, - NULL, 0, arg.key_spec.u.identifier, - FSCRYPT_KEY_IDENTIFIER_SIZE); - if (err) - goto out_wipe_secret; - err = -EFAULT; - if (copy_to_user(uarg->key_spec.u.identifier, - arg.key_spec.u.identifier, - FSCRYPT_KEY_IDENTIFIER_SIZE)) - goto out_wipe_secret; - break; - default: - WARN_ON(1); - err = -EINVAL; + err = add_master_key(sb, &secret, &arg.key_spec); + if (err) goto out_wipe_secret; - } - err = add_master_key(sb, &secret, &arg.key_spec); + /* Return the key identifier to userspace, if applicable */ + err = -EFAULT; + if (arg.key_spec.type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER && + copy_to_user(uarg->key_spec.u.identifier, arg.key_spec.u.identifier, + FSCRYPT_KEY_IDENTIFIER_SIZE)) + goto out_wipe_secret; + err = 0; out_wipe_secret: wipe_master_key_secret(&secret); return err; @@ -675,6 +684,29 @@ out_wipe_secret: EXPORT_SYMBOL_GPL(fscrypt_ioctl_add_key); /* + * Add the key for '-o test_dummy_encryption' to the filesystem keyring. + * + * Use a per-boot random key to prevent people from misusing this option. + */ +int fscrypt_add_test_dummy_key(struct super_block *sb, + struct fscrypt_key_specifier *key_spec) +{ + static u8 test_key[FSCRYPT_MAX_KEY_SIZE]; + struct fscrypt_master_key_secret secret; + int err; + + get_random_once(test_key, FSCRYPT_MAX_KEY_SIZE); + + memset(&secret, 0, sizeof(secret)); + secret.size = FSCRYPT_MAX_KEY_SIZE; + memcpy(secret.raw, test_key, FSCRYPT_MAX_KEY_SIZE); + + err = add_master_key(sb, &secret, key_spec); + wipe_master_key_secret(&secret); + return err; +} + +/* * Verify that the current user has added a master key with the given identifier * (returns -ENOKEY if not). This is needed to prevent a user from encrypting * their files using some other user's key which they don't actually know. diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 302375e9f719..1129adfa097d 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -46,6 +46,8 @@ struct fscrypt_mode fscrypt_modes[] = { }, }; +static DEFINE_MUTEX(fscrypt_mode_key_setup_mutex); + static struct fscrypt_mode * select_encryption_mode(const union fscrypt_policy *policy, const struct inode *inode) @@ -130,7 +132,7 @@ static int setup_per_mode_enc_key(struct fscrypt_info *ci, const struct super_block *sb = inode->i_sb; struct fscrypt_mode *mode = ci->ci_mode; const u8 mode_num = mode - fscrypt_modes; - struct crypto_skcipher *tfm, *prev_tfm; + struct crypto_skcipher *tfm; u8 mode_key[FSCRYPT_MAX_KEY_SIZE]; u8 hkdf_info[sizeof(mode_num) + sizeof(sb->s_uuid)]; unsigned int hkdf_infolen = 0; @@ -139,10 +141,17 @@ static int setup_per_mode_enc_key(struct fscrypt_info *ci, if (WARN_ON(mode_num > __FSCRYPT_MODE_MAX)) return -EINVAL; - /* pairs with cmpxchg() below */ + /* pairs with smp_store_release() below */ tfm = READ_ONCE(tfms[mode_num]); - if (likely(tfm != NULL)) - goto done; + if (likely(tfm != NULL)) { + ci->ci_ctfm = tfm; + return 0; + } + + mutex_lock(&fscrypt_mode_key_setup_mutex); + + if (tfms[mode_num]) + goto done_unlock; BUILD_BUG_ON(sizeof(mode_num) != 1); BUILD_BUG_ON(sizeof(sb->s_uuid) != 16); @@ -157,21 +166,21 @@ static int setup_per_mode_enc_key(struct fscrypt_info *ci, hkdf_context, hkdf_info, hkdf_infolen, mode_key, mode->keysize); if (err) - return err; + goto out_unlock; tfm = fscrypt_allocate_skcipher(mode, mode_key, inode); memzero_explicit(mode_key, mode->keysize); - if (IS_ERR(tfm)) - return PTR_ERR(tfm); - - /* pairs with READ_ONCE() above */ - prev_tfm = cmpxchg(&tfms[mode_num], NULL, tfm); - if (prev_tfm != NULL) { - crypto_free_skcipher(tfm); - tfm = prev_tfm; + if (IS_ERR(tfm)) { + err = PTR_ERR(tfm); + goto out_unlock; } -done: + /* pairs with READ_ONCE() above */ + smp_store_release(&tfms[mode_num], tfm); +done_unlock: ci->ci_ctfm = tfm; - return 0; + err = 0; +out_unlock: + mutex_unlock(&fscrypt_mode_key_setup_mutex); + return err; } int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, @@ -189,6 +198,43 @@ int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, return 0; } +static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_info *ci, + struct fscrypt_master_key *mk) +{ + int err; + + err = setup_per_mode_enc_key(ci, mk, mk->mk_iv_ino_lblk_32_keys, + HKDF_CONTEXT_IV_INO_LBLK_32_KEY, true); + if (err) + return err; + + /* pairs with smp_store_release() below */ + if (!smp_load_acquire(&mk->mk_ino_hash_key_initialized)) { + + mutex_lock(&fscrypt_mode_key_setup_mutex); + + if (mk->mk_ino_hash_key_initialized) + goto unlock; + + err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, + HKDF_CONTEXT_INODE_HASH_KEY, NULL, 0, + (u8 *)&mk->mk_ino_hash_key, + sizeof(mk->mk_ino_hash_key)); + if (err) + goto unlock; + /* pairs with smp_load_acquire() above */ + smp_store_release(&mk->mk_ino_hash_key_initialized, true); +unlock: + mutex_unlock(&fscrypt_mode_key_setup_mutex); + if (err) + return err; + } + + ci->ci_hashed_ino = (u32)siphash_1u64(ci->ci_inode->i_ino, + &mk->mk_ino_hash_key); + return 0; +} + static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, struct fscrypt_master_key *mk) { @@ -203,7 +249,7 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, * encryption key. This ensures that the master key is * consistently used only for HKDF, avoiding key reuse issues. */ - err = setup_per_mode_enc_key(ci, mk, mk->mk_direct_tfms, + err = setup_per_mode_enc_key(ci, mk, mk->mk_direct_keys, HKDF_CONTEXT_DIRECT_KEY, false); } else if (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) { @@ -211,11 +257,14 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, * IV_INO_LBLK_64: encryption keys are derived from (master_key, * mode_num, filesystem_uuid), and inode number is included in * the IVs. This format is optimized for use with inline - * encryption hardware compliant with the UFS or eMMC standards. + * encryption hardware compliant with the UFS standard. */ - err = setup_per_mode_enc_key(ci, mk, mk->mk_iv_ino_lblk_64_tfms, + err = setup_per_mode_enc_key(ci, mk, mk->mk_iv_ino_lblk_64_keys, HKDF_CONTEXT_IV_INO_LBLK_64_KEY, true); + } else if (ci->ci_policy.v2.flags & + FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) { + err = fscrypt_setup_iv_ino_lblk_32_key(ci, mk); } else { u8 derived_key[FSCRYPT_MAX_KEY_SIZE]; @@ -395,21 +444,18 @@ int fscrypt_get_encryption_info(struct inode *inode) res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (res < 0) { - if (!fscrypt_dummy_context_enabled(inode) || - IS_ENCRYPTED(inode)) { + const union fscrypt_context *dummy_ctx = + fscrypt_get_dummy_context(inode->i_sb); + + if (IS_ENCRYPTED(inode) || !dummy_ctx) { fscrypt_warn(inode, "Error %d getting encryption context", res); return res; } /* Fake up a context for an unencrypted directory */ - memset(&ctx, 0, sizeof(ctx)); - ctx.version = FSCRYPT_CONTEXT_V1; - ctx.v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; - ctx.v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; - memset(ctx.v1.master_key_descriptor, 0x42, - FSCRYPT_KEY_DESCRIPTOR_SIZE); - res = sizeof(ctx.v1); + res = fscrypt_context_size(dummy_ctx); + memcpy(&ctx, dummy_ctx, res); } crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_NOFS); @@ -475,7 +521,8 @@ out: EXPORT_SYMBOL(fscrypt_get_encryption_info); /** - * fscrypt_put_encryption_info - free most of an inode's fscrypt data + * fscrypt_put_encryption_info() - free most of an inode's fscrypt data + * @inode: an inode being evicted * * Free the inode's fscrypt_info. Filesystems must call this when the inode is * being evicted. An RCU grace period need not have elapsed yet. @@ -488,7 +535,8 @@ void fscrypt_put_encryption_info(struct inode *inode) EXPORT_SYMBOL(fscrypt_put_encryption_info); /** - * fscrypt_free_inode - free an inode's fscrypt data requiring RCU delay + * fscrypt_free_inode() - free an inode's fscrypt data requiring RCU delay + * @inode: an inode being freed * * Free the inode's cached decrypted symlink target, if any. Filesystems must * call this after an RCU grace period, just before they free the inode. @@ -503,7 +551,8 @@ void fscrypt_free_inode(struct inode *inode) EXPORT_SYMBOL(fscrypt_free_inode); /** - * fscrypt_drop_inode - check whether the inode's master key has been removed + * fscrypt_drop_inode() - check whether the inode's master key has been removed + * @inode: an inode being considered for eviction * * Filesystems supporting fscrypt must call this from their ->drop_inode() * method so that encrypted inodes are evicted as soon as they're no longer in diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 10ccf945020c..d23ff162c78b 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -11,12 +11,15 @@ */ #include <linux/random.h> +#include <linux/seq_file.h> #include <linux/string.h> #include <linux/mount.h> #include "fscrypt_private.h" /** - * fscrypt_policies_equal - check whether two encryption policies are the same + * fscrypt_policies_equal() - check whether two encryption policies are the same + * @policy1: the first policy + * @policy2: the second policy * * Return: %true if equal, else %false */ @@ -66,18 +69,14 @@ static bool supported_direct_key_modes(const struct inode *inode, return true; } -static bool supported_iv_ino_lblk_64_policy( - const struct fscrypt_policy_v2 *policy, - const struct inode *inode) +static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy, + const struct inode *inode, + const char *type, + int max_ino_bits, int max_lblk_bits) { struct super_block *sb = inode->i_sb; int ino_bits = 64, lblk_bits = 64; - if (policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) { - fscrypt_warn(inode, - "The DIRECT_KEY and IV_INO_LBLK_64 flags are mutually exclusive"); - return false; - } /* * It's unsafe to include inode numbers in the IVs if the filesystem can * potentially renumber inodes, e.g. via filesystem shrinking. @@ -85,16 +84,22 @@ static bool supported_iv_ino_lblk_64_policy( if (!sb->s_cop->has_stable_inodes || !sb->s_cop->has_stable_inodes(sb)) { fscrypt_warn(inode, - "Can't use IV_INO_LBLK_64 policy on filesystem '%s' because it doesn't have stable inode numbers", - sb->s_id); + "Can't use %s policy on filesystem '%s' because it doesn't have stable inode numbers", + type, sb->s_id); return false; } if (sb->s_cop->get_ino_and_lblk_bits) sb->s_cop->get_ino_and_lblk_bits(sb, &ino_bits, &lblk_bits); - if (ino_bits > 32 || lblk_bits > 32) { + if (ino_bits > max_ino_bits) { + fscrypt_warn(inode, + "Can't use %s policy on filesystem '%s' because its inode numbers are too long", + type, sb->s_id); + return false; + } + if (lblk_bits > max_lblk_bits) { fscrypt_warn(inode, - "Can't use IV_INO_LBLK_64 policy on filesystem '%s' because it doesn't use 32-bit inode and block numbers", - sb->s_id); + "Can't use %s policy on filesystem '%s' because its block numbers are too long", + type, sb->s_id); return false; } return true; @@ -137,6 +142,8 @@ static bool fscrypt_supported_v1_policy(const struct fscrypt_policy_v1 *policy, static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy, const struct inode *inode) { + int count = 0; + if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, policy->filenames_encryption_mode)) { fscrypt_warn(inode, @@ -152,13 +159,29 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy, return false; } + count += !!(policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY); + count += !!(policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64); + count += !!(policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32); + if (count > 1) { + fscrypt_warn(inode, "Mutually exclusive encryption flags (0x%02x)", + policy->flags); + return false; + } + if ((policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) && !supported_direct_key_modes(inode, policy->contents_encryption_mode, policy->filenames_encryption_mode)) return false; if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) && - !supported_iv_ino_lblk_64_policy(policy, inode)) + !supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_64", + 32, 32)) + return false; + + if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) && + /* This uses hashed inode numbers, so ino_bits doesn't matter. */ + !supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_32", + INT_MAX, 32)) return false; if (memchr_inv(policy->__reserved, 0, sizeof(policy->__reserved))) { @@ -170,7 +193,9 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy, } /** - * fscrypt_supported_policy - check whether an encryption policy is supported + * fscrypt_supported_policy() - check whether an encryption policy is supported + * @policy_u: the encryption policy + * @inode: the inode on which the policy will be used * * Given an encryption policy, check whether all its encryption modes and other * settings are supported by this kernel on the given inode. (But we don't @@ -192,7 +217,10 @@ bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, } /** - * fscrypt_new_context_from_policy - create a new fscrypt_context from a policy + * fscrypt_new_context_from_policy() - create a new fscrypt_context from + * an fscrypt_policy + * @ctx_u: output context + * @policy_u: input policy * * Create an fscrypt_context for an inode that is being assigned the given * encryption policy. A new nonce is randomly generated. @@ -242,7 +270,11 @@ static int fscrypt_new_context_from_policy(union fscrypt_context *ctx_u, } /** - * fscrypt_policy_from_context - convert an fscrypt_context to an fscrypt_policy + * fscrypt_policy_from_context() - convert an fscrypt_context to + * an fscrypt_policy + * @policy_u: output policy + * @ctx_u: input context + * @ctx_size: size of input context in bytes * * Given an fscrypt_context, build the corresponding fscrypt_policy. * @@ -354,6 +386,9 @@ static int set_encryption_policy(struct inode *inode, policy->v2.master_key_identifier); if (err) return err; + if (policy->v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) + pr_warn_once("%s (pid %d) is setting an IV_INO_LBLK_32 encryption policy. This should only be used if there are certain hardware limitations.\n", + current->comm, current->pid); break; default: WARN_ON(1); @@ -605,3 +640,127 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child, return preload ? fscrypt_get_encryption_info(child): 0; } EXPORT_SYMBOL(fscrypt_inherit_context); + +/** + * fscrypt_set_test_dummy_encryption() - handle '-o test_dummy_encryption' + * @sb: the filesystem on which test_dummy_encryption is being specified + * @arg: the argument to the test_dummy_encryption option. + * If no argument was specified, then @arg->from == NULL. + * @dummy_ctx: the filesystem's current dummy context (input/output, see below) + * + * Handle the test_dummy_encryption mount option by creating a dummy encryption + * context, saving it in @dummy_ctx, and adding the corresponding dummy + * encryption key to the filesystem. If the @dummy_ctx is already set, then + * instead validate that it matches @arg. Don't support changing it via + * remount, as that is difficult to do safely. + * + * The reason we use an fscrypt_context rather than an fscrypt_policy is because + * we mustn't generate a new nonce each time we access a dummy-encrypted + * directory, as that would change the way filenames are encrypted. + * + * Return: 0 on success (dummy context set, or the same context is already set); + * -EEXIST if a different dummy context is already set; + * or another -errno value. + */ +int fscrypt_set_test_dummy_encryption(struct super_block *sb, + const substring_t *arg, + struct fscrypt_dummy_context *dummy_ctx) +{ + const char *argstr = "v2"; + const char *argstr_to_free = NULL; + struct fscrypt_key_specifier key_spec = { 0 }; + int version; + union fscrypt_context *ctx = NULL; + int err; + + if (arg->from) { + argstr = argstr_to_free = match_strdup(arg); + if (!argstr) + return -ENOMEM; + } + + if (!strcmp(argstr, "v1")) { + version = FSCRYPT_CONTEXT_V1; + key_spec.type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR; + memset(key_spec.u.descriptor, 0x42, + FSCRYPT_KEY_DESCRIPTOR_SIZE); + } else if (!strcmp(argstr, "v2")) { + version = FSCRYPT_CONTEXT_V2; + key_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER; + /* key_spec.u.identifier gets filled in when adding the key */ + } else { + err = -EINVAL; + goto out; + } + + if (dummy_ctx->ctx) { + /* + * Note: if we ever make test_dummy_encryption support + * specifying other encryption settings, such as the encryption + * modes, we'll need to compare those settings here. + */ + if (dummy_ctx->ctx->version == version) + err = 0; + else + err = -EEXIST; + goto out; + } + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) { + err = -ENOMEM; + goto out; + } + + err = fscrypt_add_test_dummy_key(sb, &key_spec); + if (err) + goto out; + + ctx->version = version; + switch (ctx->version) { + case FSCRYPT_CONTEXT_V1: + ctx->v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; + ctx->v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; + memcpy(ctx->v1.master_key_descriptor, key_spec.u.descriptor, + FSCRYPT_KEY_DESCRIPTOR_SIZE); + break; + case FSCRYPT_CONTEXT_V2: + ctx->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; + ctx->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; + memcpy(ctx->v2.master_key_identifier, key_spec.u.identifier, + FSCRYPT_KEY_IDENTIFIER_SIZE); + break; + default: + WARN_ON(1); + err = -EINVAL; + goto out; + } + dummy_ctx->ctx = ctx; + ctx = NULL; + err = 0; +out: + kfree(ctx); + kfree(argstr_to_free); + return err; +} +EXPORT_SYMBOL_GPL(fscrypt_set_test_dummy_encryption); + +/** + * fscrypt_show_test_dummy_encryption() - show '-o test_dummy_encryption' + * @seq: the seq_file to print the option to + * @sep: the separator character to use + * @sb: the filesystem whose options are being shown + * + * Show the test_dummy_encryption mount option, if it was specified. + * This is mainly used for /proc/mounts. + */ +void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep, + struct super_block *sb) +{ + const union fscrypt_context *ctx = fscrypt_get_dummy_context(sb); + + if (!ctx) + return; + seq_printf(seq, "%ctest_dummy_encryption=v%d", sep, ctx->version); +} +EXPORT_SYMBOL_GPL(fscrypt_show_test_dummy_encryption); diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 2d357680094c..ae49a55bda00 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -506,20 +506,11 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n"); * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. - * - * This function will return a pointer to a dentry if it succeeds. This - * pointer must be passed to the debugfs_remove() function when the file is - * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be - * returned. - * - * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will - * be returned. */ -struct dentry *debugfs_create_u32(const char *name, umode_t mode, - struct dentry *parent, u32 *value) +void debugfs_create_u32(const char *name, umode_t mode, struct dentry *parent, + u32 *value) { - return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u32, + debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u32, &fops_u32_ro, &fops_u32_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u32); diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig index 522c35d5292b..1bdeaa6d5790 100644 --- a/fs/ecryptfs/Kconfig +++ b/fs/ecryptfs/Kconfig @@ -7,7 +7,7 @@ config ECRYPT_FS select CRYPTO_MD5 help Encrypted filesystem that operates on the VFS layer. See - <file:Documentation/filesystems/ecryptfs.txt> to learn more about + <file:Documentation/filesystems/ecryptfs.rst> to learn more about eCryptfs. Userspace components are required and can be obtained from <http://ecryptfs.sf.net>. diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 2c449aed1b92..0681540c48d9 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -48,18 +48,6 @@ void ecryptfs_from_hex(char *dst, char *src, int dst_size) } } -static int ecryptfs_hash_digest(struct crypto_shash *tfm, - char *src, int len, char *dst) -{ - SHASH_DESC_ON_STACK(desc, tfm); - int err; - - desc->tfm = tfm; - err = crypto_shash_digest(desc, src, len, dst); - shash_desc_zero(desc); - return err; -} - /** * ecryptfs_calculate_md5 - calculates the md5 of @src * @dst: Pointer to 16 bytes of allocated memory @@ -74,11 +62,8 @@ static int ecryptfs_calculate_md5(char *dst, struct ecryptfs_crypt_stat *crypt_stat, char *src, int len) { - struct crypto_shash *tfm; - int rc = 0; + int rc = crypto_shash_tfm_digest(crypt_stat->hash_tfm, src, len, dst); - tfm = crypt_stat->hash_tfm; - rc = ecryptfs_hash_digest(tfm, src, len, dst); if (rc) { printk(KERN_ERR "%s: Error computing crypto hash; rc = [%d]\n", diff --git a/fs/erofs/data.c b/fs/erofs/data.c index fc3a8d8064f8..d0542151e8c4 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -280,47 +280,36 @@ static int erofs_raw_access_readpage(struct file *file, struct page *page) return 0; } -static int erofs_raw_access_readpages(struct file *filp, - struct address_space *mapping, - struct list_head *pages, - unsigned int nr_pages) +static void erofs_raw_access_readahead(struct readahead_control *rac) { erofs_off_t last_block; struct bio *bio = NULL; - gfp_t gfp = readahead_gfp_mask(mapping); - struct page *page = list_last_entry(pages, struct page, lru); - - trace_erofs_readpages(mapping->host, page, nr_pages, true); + struct page *page; - for (; nr_pages; --nr_pages) { - page = list_entry(pages->prev, struct page, lru); + trace_erofs_readpages(rac->mapping->host, readahead_index(rac), + readahead_count(rac), true); + while ((page = readahead_page(rac))) { prefetchw(&page->flags); - list_del(&page->lru); - if (!add_to_page_cache_lru(page, mapping, page->index, gfp)) { - bio = erofs_read_raw_page(bio, mapping, page, - &last_block, nr_pages, true); + bio = erofs_read_raw_page(bio, rac->mapping, page, &last_block, + readahead_count(rac), true); - /* all the page errors are ignored when readahead */ - if (IS_ERR(bio)) { - pr_err("%s, readahead error at page %lu of nid %llu\n", - __func__, page->index, - EROFS_I(mapping->host)->nid); + /* all the page errors are ignored when readahead */ + if (IS_ERR(bio)) { + pr_err("%s, readahead error at page %lu of nid %llu\n", + __func__, page->index, + EROFS_I(rac->mapping->host)->nid); - bio = NULL; - } + bio = NULL; } - /* pages could still be locked */ put_page(page); } - DBG_BUGON(!list_empty(pages)); /* the rare case (end in gaps) */ if (bio) submit_bio(bio); - return 0; } static int erofs_get_block(struct inode *inode, sector_t iblock, @@ -358,7 +347,7 @@ static sector_t erofs_bmap(struct address_space *mapping, sector_t block) /* for uncompressed (aligned) files and raw access for other files */ const struct address_space_operations erofs_raw_access_aops = { .readpage = erofs_raw_access_readpage, - .readpages = erofs_raw_access_readpages, + .readahead = erofs_raw_access_readahead, .bmap = erofs_bmap, }; diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 5d2d81940679..7628816f2453 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -274,7 +274,7 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq, i = 0; while (1) { - dst = vm_map_ram(rq->out, nrpages_out, -1, PAGE_KERNEL); + dst = vm_map_ram(rq->out, nrpages_out, -1); /* retry two more times (totally 3 times) */ if (dst || ++i >= 3) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index c4b6c9aa87ec..187f93b4900e 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1305,28 +1305,23 @@ static bool should_decompress_synchronously(struct erofs_sb_info *sbi, return nr <= sbi->max_sync_decompress_pages; } -static int z_erofs_readpages(struct file *filp, struct address_space *mapping, - struct list_head *pages, unsigned int nr_pages) +static void z_erofs_readahead(struct readahead_control *rac) { - struct inode *const inode = mapping->host; + struct inode *const inode = rac->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); - bool sync = should_decompress_synchronously(sbi, nr_pages); + bool sync = should_decompress_synchronously(sbi, readahead_count(rac)); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); - gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); - struct page *head = NULL; + struct page *page, *head = NULL; LIST_HEAD(pagepool); - trace_erofs_readpages(mapping->host, lru_to_page(pages), - nr_pages, false); + trace_erofs_readpages(inode, readahead_index(rac), + readahead_count(rac), false); - f.headoffset = (erofs_off_t)lru_to_page(pages)->index << PAGE_SHIFT; - - for (; nr_pages; --nr_pages) { - struct page *page = lru_to_page(pages); + f.headoffset = readahead_pos(rac); + while ((page = readahead_page(rac))) { prefetchw(&page->flags); - list_del(&page->lru); /* * A pure asynchronous readahead is indicated if @@ -1335,11 +1330,6 @@ static int z_erofs_readpages(struct file *filp, struct address_space *mapping, */ sync &= !(PageReadahead(page) && !head); - if (add_to_page_cache_lru(page, mapping, page->index, gfp)) { - list_add(&page->lru, &pagepool); - continue; - } - set_page_private(page, (unsigned long)head); head = page; } @@ -1368,11 +1358,10 @@ static int z_erofs_readpages(struct file *filp, struct address_space *mapping, /* clean up the remaining free pages */ put_pages_list(&pagepool); - return 0; } const struct address_space_operations z_erofs_aops = { .readpage = z_erofs_readpage, - .readpages = z_erofs_readpages, + .readahead = z_erofs_readahead, }; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 8c596641a72b..12eebcdea9c8 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1171,6 +1171,10 @@ static inline bool chain_epi_lockless(struct epitem *epi) { struct eventpoll *ep = epi->ep; + /* Fast preliminary check */ + if (epi->next != EP_UNACTIVE_PTR) + return false; + /* Check that the same epi has not been just chained from another CPU */ if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR) return false; @@ -1237,16 +1241,12 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v * chained in ep->ovflist and requeued later on. */ if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) { - if (epi->next == EP_UNACTIVE_PTR && - chain_epi_lockless(epi)) + if (chain_epi_lockless(epi)) + ep_pm_stay_awake_rcu(epi); + } else if (!ep_is_linked(epi)) { + /* In the usual case, add event to ready list. */ + if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) ep_pm_stay_awake_rcu(epi); - goto out_unlock; - } - - /* If this file is already in the ready list we exit soon */ - if (!ep_is_linked(epi) && - list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) { - ep_pm_stay_awake_rcu(epi); } /* @@ -1822,7 +1822,6 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, { int res = 0, eavail, timed_out = 0; u64 slack = 0; - bool waiter = false; wait_queue_entry_t wait; ktime_t expires, *to = NULL; @@ -1867,55 +1866,75 @@ fetch_events: */ ep_reset_busy_poll_napi_id(ep); - /* - * We don't have any available event to return to the caller. We need - * to sleep here, and we will be woken by ep_poll_callback() when events - * become available. - */ - if (!waiter) { - waiter = true; - init_waitqueue_entry(&wait, current); + do { + /* + * Internally init_wait() uses autoremove_wake_function(), + * thus wait entry is removed from the wait queue on each + * wakeup. Why it is important? In case of several waiters + * each new wakeup will hit the next waiter, giving it the + * chance to harvest new event. Otherwise wakeup can be + * lost. This is also good performance-wise, because on + * normal wakeup path no need to call __remove_wait_queue() + * explicitly, thus ep->lock is not taken, which halts the + * event delivery. + */ + init_wait(&wait); write_lock_irq(&ep->lock); - __add_wait_queue_exclusive(&ep->wq, &wait); - write_unlock_irq(&ep->lock); - } - - for (;;) { /* - * We don't want to sleep if the ep_poll_callback() sends us - * a wakeup in between. That's why we set the task state - * to TASK_INTERRUPTIBLE before doing the checks. + * Barrierless variant, waitqueue_active() is called under + * the same lock on wakeup ep_poll_callback() side, so it + * is safe to avoid an explicit barrier. */ - set_current_state(TASK_INTERRUPTIBLE); + __set_current_state(TASK_INTERRUPTIBLE); + /* - * Always short-circuit for fatal signals to allow - * threads to make a timely exit without the chance of - * finding more events available and fetching - * repeatedly. + * Do the final check under the lock. ep_scan_ready_list() + * plays with two lists (->rdllist and ->ovflist) and there + * is always a race when both lists are empty for short + * period of time although events are pending, so lock is + * important. */ - if (fatal_signal_pending(current)) { - res = -EINTR; - break; + eavail = ep_events_available(ep); + if (!eavail) { + if (signal_pending(current)) + res = -EINTR; + else + __add_wait_queue_exclusive(&ep->wq, &wait); } + write_unlock_irq(&ep->lock); - eavail = ep_events_available(ep); - if (eavail) + if (eavail || res) break; - if (signal_pending(current)) { - res = -EINTR; - break; - } if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) { timed_out = 1; break; } - } + + /* We were woken up, thus go and try to harvest some events */ + eavail = 1; + + } while (0); __set_current_state(TASK_RUNNING); + if (!list_empty_careful(&wait.entry)) { + write_lock_irq(&ep->lock); + __remove_wait_queue(&ep->wq, &wait); + write_unlock_irq(&ep->lock); + } + send_events: + if (fatal_signal_pending(current)) { + /* + * Always short-circuit for fatal signals to allow + * threads to make a timely exit without the chance of + * finding more events available and fetching + * repeatedly. + */ + res = -EINTR; + } /* * Try to transfer events to user space. In case we get 0 events and * there's still timeout left over, we go trying again in search of @@ -1925,12 +1944,6 @@ send_events: !(res = ep_send_events(ep, events, maxevents)) && !timed_out) goto fetch_events; - if (waiter) { - write_lock_irq(&ep->lock); - __remove_wait_queue(&ep->wq, &wait); - write_unlock_irq(&ep->lock); - } - return res; } diff --git a/fs/exec.c b/fs/exec.c index 06b4c550af5d..2c465119affc 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1317,6 +1317,8 @@ int flush_old_exec(struct linux_binprm * bprm) */ set_mm_exe_file(bprm->mm, bprm->file); + would_dump(bprm, bprm->file); + /* * Release all of the old mmap stuff */ @@ -1876,8 +1878,6 @@ static int __do_execve_file(int fd, struct filename *filename, if (retval < 0) goto out; - would_dump(bprm, bprm->file); - retval = exec_binprm(bprm); if (retval < 0) goto out; diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c index 6a04cc02565a..6774a5a6ded8 100644 --- a/fs/exfat/balloc.c +++ b/fs/exfat/balloc.c @@ -91,7 +91,6 @@ static int exfat_allocate_bitmap(struct super_block *sb, } } - sbi->pbr_bh = NULL; return 0; } @@ -137,8 +136,6 @@ void exfat_free_bitmap(struct exfat_sb_info *sbi) { int i; - brelse(sbi->pbr_bh); - for (i = 0; i < sbi->map_sectors; i++) __brelse(sbi->vol_amap[i]); diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 67d4e46fb810..d67fb8a6f770 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -507,6 +507,7 @@ void exfat_msg(struct super_block *sb, const char *lv, const char *fmt, ...) __printf(3, 4) __cold; void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, u8 tz, __le16 time, __le16 date, u8 time_ms); +void exfat_truncate_atime(struct timespec64 *ts); void exfat_set_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, u8 *tz, __le16 *time, __le16 *date, u8 *time_ms); unsigned short exfat_calc_chksum_2byte(void *data, int len, diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 483f683757aa..c9db8eb0cfc3 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -273,6 +273,7 @@ int exfat_getattr(const struct path *path, struct kstat *stat, struct exfat_inode_info *ei = EXFAT_I(inode); generic_fillattr(inode, stat); + exfat_truncate_atime(&stat->atime); stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = ei->i_crtime.tv_sec; stat->btime.tv_nsec = ei->i_crtime.tv_nsec; @@ -339,6 +340,7 @@ int exfat_setattr(struct dentry *dentry, struct iattr *attr) } setattr_copy(inode, attr); + exfat_truncate_atime(&inode->i_atime); mark_inode_dirty(inode); out: @@ -346,12 +348,13 @@ out: } const struct file_operations exfat_file_operations = { - .llseek = generic_file_llseek, - .read_iter = generic_file_read_iter, - .write_iter = generic_file_write_iter, - .mmap = generic_file_mmap, - .fsync = generic_file_fsync, - .splice_read = generic_file_splice_read, + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .mmap = generic_file_mmap, + .fsync = generic_file_fsync, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, }; const struct inode_operations exfat_file_inode_operations = { diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 06887492f54b..785ead346543 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -372,10 +372,9 @@ static int exfat_readpage(struct file *file, struct page *page) return mpage_readpage(page, exfat_get_block); } -static int exfat_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned int nr_pages) +static void exfat_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, exfat_get_block); + mpage_readahead(rac, exfat_get_block); } static int exfat_writepage(struct page *page, struct writeback_control *wbc) @@ -502,7 +501,7 @@ int exfat_block_truncate_page(struct inode *inode, loff_t from) static const struct address_space_operations exfat_aops = { .readpage = exfat_readpage, - .readpages = exfat_readpages, + .readahead = exfat_readahead, .writepage = exfat_writepage, .writepages = exfat_writepages, .write_begin = exfat_write_begin, diff --git a/fs/exfat/misc.c b/fs/exfat/misc.c index 14a3300848f6..ebd2cbe3cbc1 100644 --- a/fs/exfat/misc.c +++ b/fs/exfat/misc.c @@ -88,7 +88,8 @@ void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, if (time_ms) { ts->tv_sec += time_ms / 100; ts->tv_nsec = (time_ms % 100) * 10 * NSEC_PER_MSEC; - } + } else + ts->tv_nsec = 0; if (tz & EXFAT_TZ_VALID) /* Adjust timezone to UTC0. */ @@ -124,6 +125,17 @@ void exfat_set_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, *tz = EXFAT_TZ_VALID; } +/* + * The timestamp for access_time has double seconds granularity. + * (There is no 10msIncrement field for access_time unlike create/modify_time) + * atime also has only a 2-second resolution. + */ +void exfat_truncate_atime(struct timespec64 *ts) +{ + ts->tv_sec = round_down(ts->tv_sec, 2); + ts->tv_nsec = 0; +} + unsigned short exfat_calc_chksum_2byte(void *data, int len, unsigned short chksum, int type) { diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index a8681d91f569..a2659a8a68a1 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -595,6 +595,7 @@ static int exfat_create(struct inode *dir, struct dentry *dentry, umode_t mode, inode_inc_iversion(inode); inode->i_mtime = inode->i_atime = inode->i_ctime = EXFAT_I(inode)->i_crtime = current_time(inode); + exfat_truncate_atime(&inode->i_atime); /* timestamp is already written, so mark_inode_dirty() is unneeded. */ d_instantiate(dentry, inode); @@ -691,6 +692,7 @@ static int exfat_find(struct inode *dir, struct qstr *qname, exfat_fs_error(sb, "non-zero size file starts with zero cluster (size : %llu, p_dir : %u, entry : 0x%08x)", i_size_read(dir), ei->dir.dir, ei->entry); + kfree(es); return -EIO; } @@ -854,6 +856,7 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry) inode_inc_iversion(dir); dir->i_mtime = dir->i_atime = current_time(dir); + exfat_truncate_atime(&dir->i_atime); if (IS_DIRSYNC(dir)) exfat_sync_inode(dir); else @@ -861,6 +864,7 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry) clear_nlink(inode); inode->i_mtime = inode->i_atime = current_time(inode); + exfat_truncate_atime(&inode->i_atime); exfat_unhash_inode(inode); exfat_d_version_set(dentry, inode_query_iversion(dir)); unlock: @@ -903,6 +907,7 @@ static int exfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inode_inc_iversion(inode); inode->i_mtime = inode->i_atime = inode->i_ctime = EXFAT_I(inode)->i_crtime = current_time(inode); + exfat_truncate_atime(&inode->i_atime); /* timestamp is already written, so mark_inode_dirty() is unneeded. */ d_instantiate(dentry, inode); @@ -1019,6 +1024,7 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry) inode_inc_iversion(dir); dir->i_mtime = dir->i_atime = current_time(dir); + exfat_truncate_atime(&dir->i_atime); if (IS_DIRSYNC(dir)) exfat_sync_inode(dir); else @@ -1027,6 +1033,7 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry) clear_nlink(inode); inode->i_mtime = inode->i_atime = current_time(inode); + exfat_truncate_atime(&inode->i_atime); exfat_unhash_inode(inode); exfat_d_version_set(dentry, inode_query_iversion(dir)); unlock: @@ -1387,6 +1394,7 @@ static int exfat_rename(struct inode *old_dir, struct dentry *old_dentry, inode_inc_iversion(new_dir); new_dir->i_ctime = new_dir->i_mtime = new_dir->i_atime = EXFAT_I(new_dir)->i_crtime = current_time(new_dir); + exfat_truncate_atime(&new_dir->i_atime); if (IS_DIRSYNC(new_dir)) exfat_sync_inode(new_dir); else diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 16ed202ef527..a846ff555656 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -49,6 +49,7 @@ static void exfat_put_super(struct super_block *sb) sync_blockdev(sb->s_bdev); exfat_set_vol_flags(sb, VOL_CLEAN); exfat_free_bitmap(sbi); + brelse(sbi->pbr_bh); mutex_unlock(&sbi->s_lock); call_rcu(&sbi->rcu, exfat_delayed_free); @@ -100,7 +101,7 @@ static int exfat_statfs(struct dentry *dentry, struct kstatfs *buf) int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flag) { struct exfat_sb_info *sbi = EXFAT_SB(sb); - struct pbr64 *bpb; + struct pbr64 *bpb = (struct pbr64 *)sbi->pbr_bh->b_data; bool sync = 0; /* flags are not changed */ @@ -115,15 +116,6 @@ int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flag) if (sb_rdonly(sb)) return 0; - if (!sbi->pbr_bh) { - sbi->pbr_bh = sb_bread(sb, 0); - if (!sbi->pbr_bh) { - exfat_msg(sb, KERN_ERR, "failed to read boot sector"); - return -ENOMEM; - } - } - - bpb = (struct pbr64 *)sbi->pbr_bh->b_data; bpb->bsx.vol_flags = cpu_to_le16(new_flag); if (new_flag == VOL_DIRTY && !buffer_dirty(sbi->pbr_bh)) @@ -159,7 +151,6 @@ static int exfat_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",iocharset=utf8"); else if (sbi->nls_io) seq_printf(m, ",iocharset=%s", sbi->nls_io->charset); - seq_printf(m, ",bps=%ld", sb->s_blocksize); if (opts->errors == EXFAT_ERRORS_CONT) seq_puts(m, ",errors=continue"); else if (opts->errors == EXFAT_ERRORS_PANIC) @@ -212,6 +203,12 @@ enum { Opt_errors, Opt_discard, Opt_time_offset, + + /* Deprecated options */ + Opt_utf8, + Opt_debug, + Opt_namecase, + Opt_codepage, }; static const struct constant_table exfat_param_enums[] = { @@ -232,6 +229,14 @@ static const struct fs_parameter_spec exfat_parameters[] = { fsparam_enum("errors", Opt_errors, exfat_param_enums), fsparam_flag("discard", Opt_discard), fsparam_s32("time_offset", Opt_time_offset), + __fsparam(NULL, "utf8", Opt_utf8, fs_param_deprecated, + NULL), + __fsparam(NULL, "debug", Opt_debug, fs_param_deprecated, + NULL), + __fsparam(fs_param_is_u32, "namecase", Opt_namecase, + fs_param_deprecated, NULL), + __fsparam(fs_param_is_u32, "codepage", Opt_codepage, + fs_param_deprecated, NULL), {} }; @@ -287,6 +292,11 @@ static int exfat_parse_param(struct fs_context *fc, struct fs_parameter *param) return -EINVAL; opts->time_offset = result.int_32; break; + case Opt_utf8: + case Opt_debug: + case Opt_namecase: + case Opt_codepage: + break; default: return -EINVAL; } @@ -351,14 +361,15 @@ static int exfat_read_root(struct inode *inode) exfat_save_attr(inode, ATTR_SUBDIR); inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime = current_time(inode); + exfat_truncate_atime(&inode->i_atime); exfat_cache_init_inode(inode); return 0; } -static struct pbr *exfat_read_pbr_with_logical_sector(struct super_block *sb, - struct buffer_head **prev_bh) +static struct pbr *exfat_read_pbr_with_logical_sector(struct super_block *sb) { - struct pbr *p_pbr = (struct pbr *) (*prev_bh)->b_data; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct pbr *p_pbr = (struct pbr *) (sbi->pbr_bh)->b_data; unsigned short logical_sect = 0; logical_sect = 1 << p_pbr->bsx.f64.sect_size_bits; @@ -378,26 +389,23 @@ static struct pbr *exfat_read_pbr_with_logical_sector(struct super_block *sb, } if (logical_sect > sb->s_blocksize) { - struct buffer_head *bh = NULL; - - __brelse(*prev_bh); - *prev_bh = NULL; + brelse(sbi->pbr_bh); + sbi->pbr_bh = NULL; if (!sb_set_blocksize(sb, logical_sect)) { exfat_msg(sb, KERN_ERR, "unable to set blocksize %u", logical_sect); return NULL; } - bh = sb_bread(sb, 0); - if (!bh) { + sbi->pbr_bh = sb_bread(sb, 0); + if (!sbi->pbr_bh) { exfat_msg(sb, KERN_ERR, "unable to read boot sector (logical sector size = %lu)", sb->s_blocksize); return NULL; } - *prev_bh = bh; - p_pbr = (struct pbr *) bh->b_data; + p_pbr = (struct pbr *)sbi->pbr_bh->b_data; } return p_pbr; } @@ -408,21 +416,20 @@ static int __exfat_fill_super(struct super_block *sb) int ret; struct pbr *p_pbr; struct pbr64 *p_bpb; - struct buffer_head *bh; struct exfat_sb_info *sbi = EXFAT_SB(sb); /* set block size to read super block */ sb_min_blocksize(sb, 512); /* read boot sector */ - bh = sb_bread(sb, 0); - if (!bh) { + sbi->pbr_bh = sb_bread(sb, 0); + if (!sbi->pbr_bh) { exfat_msg(sb, KERN_ERR, "unable to read boot sector"); return -EIO; } /* PRB is read */ - p_pbr = (struct pbr *)bh->b_data; + p_pbr = (struct pbr *)sbi->pbr_bh->b_data; /* check the validity of PBR */ if (le16_to_cpu((p_pbr->signature)) != PBR_SIGNATURE) { @@ -433,7 +440,7 @@ static int __exfat_fill_super(struct super_block *sb) /* check logical sector size */ - p_pbr = exfat_read_pbr_with_logical_sector(sb, &bh); + p_pbr = exfat_read_pbr_with_logical_sector(sb); if (!p_pbr) { ret = -EIO; goto free_bh; @@ -514,7 +521,7 @@ free_alloc_bitmap: free_upcase_table: exfat_free_upcase_table(sbi); free_bh: - brelse(bh); + brelse(sbi->pbr_bh); return ret; } @@ -531,17 +538,18 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc) if (opts->discard) { struct request_queue *q = bdev_get_queue(sb->s_bdev); - if (!blk_queue_discard(q)) + if (!blk_queue_discard(q)) { exfat_msg(sb, KERN_WARNING, "mounting with \"discard\" option, but the device does not support discard"); - opts->discard = 0; + opts->discard = 0; + } } sb->s_flags |= SB_NODIRATIME; sb->s_magic = EXFAT_SUPER_MAGIC; sb->s_op = &exfat_sops; - sb->s_time_gran = 1; + sb->s_time_gran = 10 * NSEC_PER_MSEC; sb->s_time_min = EXFAT_MIN_TIMESTAMP_SECS; sb->s_time_max = EXFAT_MAX_TIMESTAMP_SECS; @@ -605,6 +613,7 @@ put_inode: free_table: exfat_free_upcase_table(sbi); exfat_free_bitmap(sbi); + brelse(sbi->pbr_bh); check_nls_io: unload_nls(sbi->nls_io); @@ -717,6 +726,7 @@ static void __exit exit_exfat_fs(void) module_init(init_exfat_fs); module_exit(exit_exfat_fs); +MODULE_ALIAS_FS("exfat"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("exFAT filesystem support"); MODULE_AUTHOR("Samsung Electronics Co., Ltd."); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index c885cf7d724b..2875c0a705b5 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -877,11 +877,9 @@ static int ext2_readpage(struct file *file, struct page *page) return mpage_readpage(page, ext2_get_block); } -static int -ext2_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void ext2_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, ext2_get_block); + mpage_readahead(rac, ext2_get_block); } static int @@ -967,7 +965,7 @@ ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc const struct address_space_operations ext2_aops = { .readpage = ext2_readpage, - .readpages = ext2_readpages, + .readahead = ext2_readahead, .writepage = ext2_writepage, .write_begin = ext2_write_begin, .write_end = ext2_write_end, @@ -981,7 +979,7 @@ const struct address_space_operations ext2_aops = { const struct address_space_operations ext2_nobh_aops = { .readpage = ext2_readpage, - .readpages = ext2_readpages, + .readahead = ext2_readahead, .writepage = ext2_nobh_writepage, .write_begin = ext2_nobh_write_begin, .write_end = nobh_write_end, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 91eb4381cae5..15b062efcff1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -722,7 +722,7 @@ enum { #define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF /* Max logical block we can support */ -#define EXT4_MAX_LOGICAL_BLOCK 0xFFFFFFFF +#define EXT4_MAX_LOGICAL_BLOCK 0xFFFFFFFE /* * Structure of an inode on the disk @@ -1357,11 +1357,9 @@ struct ext4_super_block { */ #define EXT4_MF_MNTDIR_SAMPLED 0x0001 #define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ -#define EXT4_MF_TEST_DUMMY_ENCRYPTION 0x0004 #ifdef CONFIG_FS_ENCRYPTION -#define DUMMY_ENCRYPTION_ENABLED(sbi) (unlikely((sbi)->s_mount_flags & \ - EXT4_MF_TEST_DUMMY_ENCRYPTION)) +#define DUMMY_ENCRYPTION_ENABLED(sbi) ((sbi)->s_dummy_enc_ctx.ctx != NULL) #else #define DUMMY_ENCRYPTION_ENABLED(sbi) (0) #endif @@ -1551,6 +1549,9 @@ struct ext4_sb_info { struct ratelimit_state s_warning_ratelimit_state; struct ratelimit_state s_msg_ratelimit_state; + /* Encryption context for '-o test_dummy_encryption' */ + struct fscrypt_dummy_context s_dummy_enc_ctx; + /* * Barrier between writepages ops and changing any inode's JOURNAL_DATA * or EXTENTS flag. @@ -3316,9 +3317,8 @@ static inline void ext4_set_de_type(struct super_block *sb, } /* readpages.c */ -extern int ext4_mpage_readpages(struct address_space *mapping, - struct list_head *pages, struct page *page, - unsigned nr_pages, bool is_readahead); +extern int ext4_mpage_readpages(struct inode *inode, + struct readahead_control *rac, struct page *page); extern int __init ext4_init_post_read_processing(void); extern void ext4_exit_post_read_processing(void); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f2b577b315a0..2b4b94542e34 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4832,6 +4832,28 @@ static const struct iomap_ops ext4_iomap_xattr_ops = { .iomap_begin = ext4_iomap_xattr_begin, }; +static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len) +{ + u64 maxbytes; + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + maxbytes = inode->i_sb->s_maxbytes; + else + maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; + + if (*len == 0) + return -EINVAL; + if (start > maxbytes) + return -EFBIG; + + /* + * Shrink request scope to what the fs can actually handle. + */ + if (*len > maxbytes || (maxbytes - *len) < start) + *len = maxbytes - start; + return 0; +} + static int _ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, bool from_es_cache) { @@ -4852,6 +4874,15 @@ static int _ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (fiemap_check_flags(fieinfo, ext4_fiemap_flags)) return -EBADR; + /* + * For bitmap files the maximum size limit could be smaller than + * s_maxbytes, so check len here manually instead of just relying on the + * generic check. + */ + error = ext4_fiemap_check_ranges(inode, start, &len); + if (error) + return error; + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR; error = iomap_fiemap(inode, fieinfo, start, len, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2a4aae6acdcb..52be85f96159 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3224,23 +3224,20 @@ static int ext4_readpage(struct file *file, struct page *page) ret = ext4_readpage_inline(inode, page); if (ret == -EAGAIN) - return ext4_mpage_readpages(page->mapping, NULL, page, 1, - false); + return ext4_mpage_readpages(inode, NULL, page); return ret; } -static int -ext4_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void ext4_readahead(struct readahead_control *rac) { - struct inode *inode = mapping->host; + struct inode *inode = rac->mapping->host; - /* If the file has inline data, no need to do readpages. */ + /* If the file has inline data, no need to do readahead. */ if (ext4_has_inline_data(inode)) - return 0; + return; - return ext4_mpage_readpages(mapping, pages, NULL, nr_pages, true); + ext4_mpage_readpages(inode, rac, NULL); } static void ext4_invalidatepage(struct page *page, unsigned int offset, @@ -3605,7 +3602,7 @@ static int ext4_set_page_dirty(struct page *page) static const struct address_space_operations ext4_aops = { .readpage = ext4_readpage, - .readpages = ext4_readpages, + .readahead = ext4_readahead, .writepage = ext4_writepage, .writepages = ext4_writepages, .write_begin = ext4_write_begin, @@ -3622,7 +3619,7 @@ static const struct address_space_operations ext4_aops = { static const struct address_space_operations ext4_journalled_aops = { .readpage = ext4_readpage, - .readpages = ext4_readpages, + .readahead = ext4_readahead, .writepage = ext4_writepage, .writepages = ext4_writepages, .write_begin = ext4_write_begin, @@ -3638,7 +3635,7 @@ static const struct address_space_operations ext4_journalled_aops = { static const struct address_space_operations ext4_da_aops = { .readpage = ext4_readpage, - .readpages = ext4_readpages, + .readahead = ext4_readahead, .writepage = ext4_writepage, .writepages = ext4_writepages, .write_begin = ext4_da_write_begin, diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index bfc1281fc4cb..0746532ba463 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -733,29 +733,6 @@ static void ext4_fill_fsxattr(struct inode *inode, struct fsxattr *fa) fa->fsx_projid = from_kprojid(&init_user_ns, ei->i_projid); } -/* copied from fs/ioctl.c */ -static int fiemap_check_ranges(struct super_block *sb, - u64 start, u64 len, u64 *new_len) -{ - u64 maxbytes = (u64) sb->s_maxbytes; - - *new_len = len; - - if (len == 0) - return -EINVAL; - - if (start > maxbytes) - return -EFBIG; - - /* - * Shrink request scope to what the fs can actually handle. - */ - if (len > maxbytes || (maxbytes - len) < start) - *new_len = maxbytes - start; - - return 0; -} - /* So that the fiemap access checks can't overflow on 32 bit machines. */ #define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent)) @@ -765,8 +742,6 @@ static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg) struct fiemap __user *ufiemap = (struct fiemap __user *) arg; struct fiemap_extent_info fieinfo = { 0, }; struct inode *inode = file_inode(filp); - struct super_block *sb = inode->i_sb; - u64 len; int error; if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap))) @@ -775,11 +750,6 @@ static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg) if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) return -EINVAL; - error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length, - &len); - if (error) - return error; - fieinfo.fi_flags = fiemap.fm_flags; fieinfo.fi_extents_max = fiemap.fm_extent_count; fieinfo.fi_extents_start = ufiemap->fm_extents; @@ -792,7 +762,8 @@ static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg) if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC) filemap_write_and_wait(inode->i_mapping); - error = ext4_get_es_cache(inode, &fieinfo, fiemap.fm_start, len); + error = ext4_get_es_cache(inode, &fieinfo, fiemap.fm_start, + fiemap.fm_length); fiemap.fm_flags = fieinfo.fi_flags; fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap))) diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index c1769afbf799..5761e9961682 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -7,8 +7,8 @@ * * This was originally taken from fs/mpage.c * - * The intent is the ext4_mpage_readpages() function here is intended - * to replace mpage_readpages() in the general case, not just for + * The ext4_mpage_readpages() function here is intended to + * replace mpage_readahead() in the general case, not just for * encrypted files. It has some limitations (see below), where it * will fall back to read_block_full_page(), but these limitations * should only be hit when page_size != block_size. @@ -221,14 +221,12 @@ static inline loff_t ext4_readpage_limit(struct inode *inode) return i_size_read(inode); } -int ext4_mpage_readpages(struct address_space *mapping, - struct list_head *pages, struct page *page, - unsigned nr_pages, bool is_readahead) +int ext4_mpage_readpages(struct inode *inode, + struct readahead_control *rac, struct page *page) { struct bio *bio = NULL; sector_t last_block_in_bio = 0; - struct inode *inode = mapping->host; const unsigned blkbits = inode->i_blkbits; const unsigned blocks_per_page = PAGE_SIZE >> blkbits; const unsigned blocksize = 1 << blkbits; @@ -241,6 +239,7 @@ int ext4_mpage_readpages(struct address_space *mapping, int length; unsigned relative_block = 0; struct ext4_map_blocks map; + unsigned int nr_pages = rac ? readahead_count(rac) : 1; map.m_pblk = 0; map.m_lblk = 0; @@ -251,14 +250,9 @@ int ext4_mpage_readpages(struct address_space *mapping, int fully_mapped = 1; unsigned first_hole = blocks_per_page; - if (pages) { - page = lru_to_page(pages); - + if (rac) { + page = readahead_page(rac); prefetchw(&page->flags); - list_del(&page->lru); - if (add_to_page_cache_lru(page, mapping, page->index, - readahead_gfp_mask(mapping))) - goto next_page; } if (page_has_buffers(page)) @@ -381,7 +375,7 @@ int ext4_mpage_readpages(struct address_space *mapping, bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); bio->bi_end_io = mpage_end_io; bio_set_op_attrs(bio, REQ_OP_READ, - is_readahead ? REQ_RAHEAD : 0); + rac ? REQ_RAHEAD : 0); } length = first_hole << blkbits; @@ -406,10 +400,9 @@ int ext4_mpage_readpages(struct address_space *mapping, else unlock_page(page); next_page: - if (pages) + if (rac) put_page(page); } - BUG_ON(pages && !list_empty(pages)); if (bio) submit_bio(bio); return 0; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 629a56b5c859..9824cd8203e8 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1106,6 +1106,7 @@ static void ext4_put_super(struct super_block *sb) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->s_blockgroup_lock); fs_put_dax(sbi->s_daxdev); + fscrypt_free_dummy_context(&sbi->s_dummy_enc_ctx); #ifdef CONFIG_UNICODE utf8_unload(sbi->s_encoding); #endif @@ -1389,9 +1390,10 @@ retry: return res; } -static bool ext4_dummy_context(struct inode *inode) +static const union fscrypt_context * +ext4_get_dummy_context(struct super_block *sb) { - return DUMMY_ENCRYPTION_ENABLED(EXT4_SB(inode->i_sb)); + return EXT4_SB(sb)->s_dummy_enc_ctx.ctx; } static bool ext4_has_stable_inodes(struct super_block *sb) @@ -1410,7 +1412,7 @@ static const struct fscrypt_operations ext4_cryptops = { .key_prefix = "ext4:", .get_context = ext4_get_context, .set_context = ext4_set_context, - .dummy_context = ext4_dummy_context, + .get_dummy_context = ext4_get_dummy_context, .empty_dir = ext4_empty_dir, .max_namelen = EXT4_NAME_LEN, .has_stable_inodes = ext4_has_stable_inodes, @@ -1605,6 +1607,7 @@ static const match_table_t tokens = { {Opt_init_itable, "init_itable"}, {Opt_noinit_itable, "noinit_itable"}, {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, + {Opt_test_dummy_encryption, "test_dummy_encryption=%s"}, {Opt_test_dummy_encryption, "test_dummy_encryption"}, {Opt_nombcache, "nombcache"}, {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */ @@ -1816,7 +1819,7 @@ static const struct mount_opts { {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, {Opt_max_dir_size_kb, 0, MOPT_GTE0}, - {Opt_test_dummy_encryption, 0, MOPT_GTE0}, + {Opt_test_dummy_encryption, 0, MOPT_STRING}, {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, {Opt_err, 0, 0} }; @@ -1851,6 +1854,48 @@ static int ext4_sb_read_encoding(const struct ext4_super_block *es, } #endif +static int ext4_set_test_dummy_encryption(struct super_block *sb, + const char *opt, + const substring_t *arg, + bool is_remount) +{ +#ifdef CONFIG_FS_ENCRYPTION + struct ext4_sb_info *sbi = EXT4_SB(sb); + int err; + + /* + * This mount option is just for testing, and it's not worthwhile to + * implement the extra complexity (e.g. RCU protection) that would be + * needed to allow it to be set or changed during remount. We do allow + * it to be specified during remount, but only if there is no change. + */ + if (is_remount && !sbi->s_dummy_enc_ctx.ctx) { + ext4_msg(sb, KERN_WARNING, + "Can't set test_dummy_encryption on remount"); + return -1; + } + err = fscrypt_set_test_dummy_encryption(sb, arg, &sbi->s_dummy_enc_ctx); + if (err) { + if (err == -EEXIST) + ext4_msg(sb, KERN_WARNING, + "Can't change test_dummy_encryption on remount"); + else if (err == -EINVAL) + ext4_msg(sb, KERN_WARNING, + "Value of option \"%s\" is unrecognized", opt); + else + ext4_msg(sb, KERN_WARNING, + "Error processing option \"%s\" [%d]", + opt, err); + return -1; + } + ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled"); +#else + ext4_msg(sb, KERN_WARNING, + "Test dummy encryption mount option ignored"); +#endif + return 1; +} + static int handle_mount_opt(struct super_block *sb, char *opt, int token, substring_t *args, unsigned long *journal_devnum, unsigned int *journal_ioprio, int is_remount) @@ -2047,14 +2092,8 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); } else if (token == Opt_test_dummy_encryption) { -#ifdef CONFIG_FS_ENCRYPTION - sbi->s_mount_flags |= EXT4_MF_TEST_DUMMY_ENCRYPTION; - ext4_msg(sb, KERN_WARNING, - "Test dummy encryption mode enabled"); -#else - ext4_msg(sb, KERN_WARNING, - "Test dummy encryption mount option ignored"); -#endif + return ext4_set_test_dummy_encryption(sb, opt, &args[0], + is_remount); } else if (m->flags & MOPT_DATAJ) { if (is_remount) { if (!sbi->s_journal) @@ -2311,8 +2350,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb); if (test_opt(sb, DATA_ERR_ABORT)) SEQ_OPTS_PUTS("data_err=abort"); - if (DUMMY_ENCRYPTION_ENABLED(sbi)) - SEQ_OPTS_PUTS("test_dummy_encryption"); + + fscrypt_show_test_dummy_encryption(seq, sep, sb); ext4_show_quota_options(seq, sb); return 0; @@ -4780,6 +4819,7 @@ failed_mount: for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(get_qf_name(sb, sbi, i)); #endif + fscrypt_free_dummy_context(&sbi->s_dummy_enc_ctx); ext4_blkdev_remove(sbi); brelse(bh); out_fail: diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 04bfaf63752c..6c9fc9e21c13 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -293,6 +293,7 @@ EXT4_ATTR_FEATURE(batched_discard); EXT4_ATTR_FEATURE(meta_bg_resize); #ifdef CONFIG_FS_ENCRYPTION EXT4_ATTR_FEATURE(encryption); +EXT4_ATTR_FEATURE(test_dummy_encryption_v2); #endif #ifdef CONFIG_UNICODE EXT4_ATTR_FEATURE(casefold); @@ -308,6 +309,7 @@ static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(meta_bg_resize), #ifdef CONFIG_FS_ENCRYPTION ATTR_LIST(encryption), + ATTR_LIST(test_dummy_encryption_v2), #endif #ifdef CONFIG_UNICODE ATTR_LIST(casefold), diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index dc5ec724d889..dec1244dd062 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -342,37 +342,6 @@ static int ext4_get_verity_descriptor(struct inode *inode, void *buf, return desc_size; } -/* - * Prefetch some pages from the file's Merkle tree. - * - * This is basically a stripped-down version of __do_page_cache_readahead() - * which works on pages past i_size. - */ -static void ext4_merkle_tree_readahead(struct address_space *mapping, - pgoff_t start_index, unsigned long count) -{ - LIST_HEAD(pages); - unsigned int nr_pages = 0; - struct page *page; - pgoff_t index; - struct blk_plug plug; - - for (index = start_index; index < start_index + count; index++) { - page = xa_load(&mapping->i_pages, index); - if (!page || xa_is_value(page)) { - page = __page_cache_alloc(readahead_gfp_mask(mapping)); - if (!page) - break; - page->index = index; - list_add(&page->lru, &pages); - nr_pages++; - } - } - blk_start_plug(&plug); - ext4_mpage_readpages(mapping, &pages, NULL, nr_pages, true); - blk_finish_plug(&plug); -} - static struct page *ext4_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) @@ -386,8 +355,8 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode, if (page) put_page(page); else if (num_ra_pages > 1) - ext4_merkle_tree_readahead(inode->i_mapping, index, - num_ra_pages); + page_cache_readahead_unbounded(inode->i_mapping, NULL, + index, num_ra_pages, 0); page = read_mapping_page(inode->i_mapping, index, NULL); } return page; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index cdf2f626bea7..03ec97f28235 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2177,13 +2177,11 @@ out: * use ->readpage() or do the necessary surgery to decouple ->readpages() * from read-ahead. */ -int f2fs_mpage_readpages(struct address_space *mapping, - struct list_head *pages, struct page *page, - unsigned nr_pages, bool is_readahead) +static int f2fs_mpage_readpages(struct inode *inode, + struct readahead_control *rac, struct page *page) { struct bio *bio = NULL; sector_t last_block_in_bio = 0; - struct inode *inode = mapping->host; struct f2fs_map_blocks map; #ifdef CONFIG_F2FS_FS_COMPRESSION struct compress_ctx cc = { @@ -2197,6 +2195,7 @@ int f2fs_mpage_readpages(struct address_space *mapping, .nr_cpages = 0, }; #endif + unsigned nr_pages = rac ? readahead_count(rac) : 1; unsigned max_nr_pages = nr_pages; int ret = 0; @@ -2210,15 +2209,9 @@ int f2fs_mpage_readpages(struct address_space *mapping, map.m_may_create = false; for (; nr_pages; nr_pages--) { - if (pages) { - page = list_last_entry(pages, struct page, lru); - + if (rac) { + page = readahead_page(rac); prefetchw(&page->flags); - list_del(&page->lru); - if (add_to_page_cache_lru(page, mapping, - page_index(page), - readahead_gfp_mask(mapping))) - goto next_page; } #ifdef CONFIG_F2FS_FS_COMPRESSION @@ -2228,7 +2221,7 @@ int f2fs_mpage_readpages(struct address_space *mapping, ret = f2fs_read_multi_pages(&cc, &bio, max_nr_pages, &last_block_in_bio, - is_readahead, false); + rac != NULL, false); f2fs_destroy_compress_ctx(&cc); if (ret) goto set_error_page; @@ -2251,7 +2244,7 @@ read_single_page: #endif ret = f2fs_read_single_page(inode, page, max_nr_pages, &map, - &bio, &last_block_in_bio, is_readahead); + &bio, &last_block_in_bio, rac); if (ret) { #ifdef CONFIG_F2FS_FS_COMPRESSION set_error_page: @@ -2260,8 +2253,10 @@ set_error_page: zero_user_segment(page, 0, PAGE_SIZE); unlock_page(page); } +#ifdef CONFIG_F2FS_FS_COMPRESSION next_page: - if (pages) +#endif + if (rac) put_page(page); #ifdef CONFIG_F2FS_FS_COMPRESSION @@ -2271,16 +2266,15 @@ next_page: ret = f2fs_read_multi_pages(&cc, &bio, max_nr_pages, &last_block_in_bio, - is_readahead, false); + rac != NULL, false); f2fs_destroy_compress_ctx(&cc); } } #endif } - BUG_ON(pages && !list_empty(pages)); if (bio) __submit_bio(F2FS_I_SB(inode), bio, DATA); - return pages ? 0 : ret; + return ret; } static int f2fs_read_data_page(struct file *file, struct page *page) @@ -2299,28 +2293,24 @@ static int f2fs_read_data_page(struct file *file, struct page *page) if (f2fs_has_inline_data(inode)) ret = f2fs_read_inline_data(inode, page); if (ret == -EAGAIN) - ret = f2fs_mpage_readpages(page_file_mapping(page), - NULL, page, 1, false); + ret = f2fs_mpage_readpages(inode, NULL, page); return ret; } -static int f2fs_read_data_pages(struct file *file, - struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void f2fs_readahead(struct readahead_control *rac) { - struct inode *inode = mapping->host; - struct page *page = list_last_entry(pages, struct page, lru); + struct inode *inode = rac->mapping->host; - trace_f2fs_readpages(inode, page, nr_pages); + trace_f2fs_readpages(inode, readahead_index(rac), readahead_count(rac)); if (!f2fs_is_compress_backend_ready(inode)) - return 0; + return; /* If the file has inline data, skip readpages */ if (f2fs_has_inline_data(inode)) - return 0; + return; - return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages, true); + f2fs_mpage_readpages(inode, rac, NULL); } int f2fs_encrypt_one_page(struct f2fs_io_info *fio) @@ -3805,7 +3795,7 @@ static void f2fs_swap_deactivate(struct file *file) const struct address_space_operations f2fs_dblock_aops = { .readpage = f2fs_read_data_page, - .readpages = f2fs_read_data_pages, + .readahead = f2fs_readahead, .writepage = f2fs_write_data_page, .writepages = f2fs_write_data_pages, .write_begin = f2fs_write_begin, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ba470d5687fe..5c0149d2f46a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -138,7 +138,7 @@ struct f2fs_mount_info { int fsync_mode; /* fsync policy */ int fs_mode; /* fs mode: LFS or ADAPTIVE */ int bggc_mode; /* bggc mode: off, on or sync */ - bool test_dummy_encryption; /* test dummy encryption */ + struct fscrypt_dummy_context dummy_enc_ctx; /* test dummy encryption */ block_t unusable_cap; /* Amount of space allowed to be * unusable when disabling checkpoint */ @@ -1259,7 +1259,7 @@ enum fsync_mode { #ifdef CONFIG_FS_ENCRYPTION #define DUMMY_ENCRYPTION_ENABLED(sbi) \ - (unlikely(F2FS_OPTION(sbi).test_dummy_encryption)) + (unlikely(F2FS_OPTION(sbi).dummy_enc_ctx.ctx != NULL)) #else #define DUMMY_ENCRYPTION_ENABLED(sbi) (0) #endif @@ -3051,19 +3051,12 @@ static inline void f2fs_set_page_private(struct page *page, if (PagePrivate(page)) return; - get_page(page); - SetPagePrivate(page); - set_page_private(page, data); + attach_page_private(page, (void *)data); } static inline void f2fs_clear_page_private(struct page *page) { - if (!PagePrivate(page)) - return; - - set_page_private(page, 0); - ClearPagePrivate(page); - f2fs_put_page(page, 0); + detach_page_private(page); } /* @@ -3373,9 +3366,6 @@ int f2fs_reserve_new_block(struct dnode_of_data *dn); int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from); int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); -int f2fs_mpage_readpages(struct address_space *mapping, - struct list_head *pages, struct page *page, - unsigned nr_pages, bool is_readahead); struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, int op_flags, bool for_write); struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index); diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index 5bc4dcd8fc03..8c4ea5003ef8 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -12,7 +12,6 @@ #include <linux/types.h> #include <linux/fs.h> #include <linux/f2fs_fs.h> -#include <linux/cryptohash.h> #include <linux/pagemap.h> #include <linux/unicode.h> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f2dfc21c6abb..8a9955902d84 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -202,6 +202,7 @@ static match_table_t f2fs_tokens = { {Opt_whint, "whint_mode=%s"}, {Opt_alloc, "alloc_mode=%s"}, {Opt_fsync, "fsync_mode=%s"}, + {Opt_test_dummy_encryption, "test_dummy_encryption=%s"}, {Opt_test_dummy_encryption, "test_dummy_encryption"}, {Opt_checkpoint_disable, "checkpoint=disable"}, {Opt_checkpoint_disable_cap, "checkpoint=disable:%u"}, @@ -394,7 +395,52 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) } #endif -static int parse_options(struct super_block *sb, char *options) +static int f2fs_set_test_dummy_encryption(struct super_block *sb, + const char *opt, + const substring_t *arg, + bool is_remount) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); +#ifdef CONFIG_FS_ENCRYPTION + int err; + + if (!f2fs_sb_has_encrypt(sbi)) { + f2fs_err(sbi, "Encrypt feature is off"); + return -EINVAL; + } + + /* + * This mount option is just for testing, and it's not worthwhile to + * implement the extra complexity (e.g. RCU protection) that would be + * needed to allow it to be set or changed during remount. We do allow + * it to be specified during remount, but only if there is no change. + */ + if (is_remount && !F2FS_OPTION(sbi).dummy_enc_ctx.ctx) { + f2fs_warn(sbi, "Can't set test_dummy_encryption on remount"); + return -EINVAL; + } + err = fscrypt_set_test_dummy_encryption( + sb, arg, &F2FS_OPTION(sbi).dummy_enc_ctx); + if (err) { + if (err == -EEXIST) + f2fs_warn(sbi, + "Can't change test_dummy_encryption on remount"); + else if (err == -EINVAL) + f2fs_warn(sbi, "Value of option \"%s\" is unrecognized", + opt); + else + f2fs_warn(sbi, "Error processing option \"%s\" [%d]", + opt, err); + return -EINVAL; + } + f2fs_warn(sbi, "Test dummy encryption mode enabled"); +#else + f2fs_warn(sbi, "Test dummy encryption mount option ignored"); +#endif + return 0; +} + +static int parse_options(struct super_block *sb, char *options, bool is_remount) { struct f2fs_sb_info *sbi = F2FS_SB(sb); substring_t args[MAX_OPT_ARGS]; @@ -403,9 +449,7 @@ static int parse_options(struct super_block *sb, char *options) int arg = 0, ext_cnt; kuid_t uid; kgid_t gid; -#ifdef CONFIG_QUOTA int ret; -#endif if (!options) return 0; @@ -778,17 +822,10 @@ static int parse_options(struct super_block *sb, char *options) kvfree(name); break; case Opt_test_dummy_encryption: -#ifdef CONFIG_FS_ENCRYPTION - if (!f2fs_sb_has_encrypt(sbi)) { - f2fs_err(sbi, "Encrypt feature is off"); - return -EINVAL; - } - - F2FS_OPTION(sbi).test_dummy_encryption = true; - f2fs_info(sbi, "Test dummy encryption mode enabled"); -#else - f2fs_info(sbi, "Test dummy encryption mount option ignored"); -#endif + ret = f2fs_set_test_dummy_encryption(sb, p, &args[0], + is_remount); + if (ret) + return ret; break; case Opt_checkpoint_disable_cap_perc: if (args->from && match_int(args, &arg)) @@ -1213,6 +1250,7 @@ static void f2fs_put_super(struct super_block *sb) for (i = 0; i < MAXQUOTAS; i++) kvfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif + fscrypt_free_dummy_context(&F2FS_OPTION(sbi).dummy_enc_ctx); destroy_percpu_info(sbi); for (i = 0; i < NR_PAGE_TYPE; i++) kvfree(sbi->write_io[i]); @@ -1543,10 +1581,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",whint_mode=%s", "user-based"); else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) seq_printf(seq, ",whint_mode=%s", "fs-based"); -#ifdef CONFIG_FS_ENCRYPTION - if (F2FS_OPTION(sbi).test_dummy_encryption) - seq_puts(seq, ",test_dummy_encryption"); -#endif + + fscrypt_show_test_dummy_encryption(seq, ',', sbi->sb); if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_DEFAULT) seq_printf(seq, ",alloc_mode=%s", "default"); @@ -1575,7 +1611,6 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; - F2FS_OPTION(sbi).test_dummy_encryption = false; F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4; @@ -1734,7 +1769,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) default_options(sbi); /* parse mount options */ - err = parse_options(sb, data); + err = parse_options(sb, data, true); if (err) goto restore_opts; checkpoint_changed = @@ -2410,9 +2445,10 @@ static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, ctx, len, fs_data, XATTR_CREATE); } -static bool f2fs_dummy_context(struct inode *inode) +static const union fscrypt_context * +f2fs_get_dummy_context(struct super_block *sb) { - return DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(inode)); + return F2FS_OPTION(F2FS_SB(sb)).dummy_enc_ctx.ctx; } static bool f2fs_has_stable_inodes(struct super_block *sb) @@ -2431,7 +2467,7 @@ static const struct fscrypt_operations f2fs_cryptops = { .key_prefix = "f2fs:", .get_context = f2fs_get_context, .set_context = f2fs_set_context, - .dummy_context = f2fs_dummy_context, + .get_dummy_context = f2fs_get_dummy_context, .empty_dir = f2fs_empty_dir, .max_namelen = F2FS_NAME_LEN, .has_stable_inodes = f2fs_has_stable_inodes, @@ -3366,7 +3402,7 @@ try_onemore: goto free_sb_buf; } - err = parse_options(sb, options); + err = parse_options(sb, options, false); if (err) goto free_options; @@ -3769,6 +3805,7 @@ free_options: for (i = 0; i < MAXQUOTAS; i++) kvfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif + fscrypt_free_dummy_context(&F2FS_OPTION(sbi).dummy_enc_ctx); kvfree(options); free_sb_buf: kvfree(raw_super); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index e3bbbef9b4f0..3162f46b3c9b 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -446,6 +446,7 @@ enum feat_id { FEAT_SB_CHECKSUM, FEAT_CASEFOLD, FEAT_COMPRESSION, + FEAT_TEST_DUMMY_ENCRYPTION_V2, }; static ssize_t f2fs_feature_show(struct f2fs_attr *a, @@ -466,6 +467,7 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a, case FEAT_SB_CHECKSUM: case FEAT_CASEFOLD: case FEAT_COMPRESSION: + case FEAT_TEST_DUMMY_ENCRYPTION_V2: return sprintf(buf, "supported\n"); } return 0; @@ -563,6 +565,7 @@ F2FS_GENERAL_RO_ATTR(avg_vblocks); #ifdef CONFIG_FS_ENCRYPTION F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO); +F2FS_FEATURE_RO_ATTR(test_dummy_encryption_v2, FEAT_TEST_DUMMY_ENCRYPTION_V2); #endif #ifdef CONFIG_BLK_DEV_ZONED F2FS_FEATURE_RO_ATTR(block_zoned, FEAT_BLKZONED); @@ -647,6 +650,7 @@ ATTRIBUTE_GROUPS(f2fs); static struct attribute *f2fs_feat_attrs[] = { #ifdef CONFIG_FS_ENCRYPTION ATTR_LIST(encryption), + ATTR_LIST(test_dummy_encryption_v2), #endif #ifdef CONFIG_BLK_DEV_ZONED ATTR_LIST(block_zoned), diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index d7d430a6f130..865c9fb774fb 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -222,37 +222,6 @@ static int f2fs_get_verity_descriptor(struct inode *inode, void *buf, return size; } -/* - * Prefetch some pages from the file's Merkle tree. - * - * This is basically a stripped-down version of __do_page_cache_readahead() - * which works on pages past i_size. - */ -static void f2fs_merkle_tree_readahead(struct address_space *mapping, - pgoff_t start_index, unsigned long count) -{ - LIST_HEAD(pages); - unsigned int nr_pages = 0; - struct page *page; - pgoff_t index; - struct blk_plug plug; - - for (index = start_index; index < start_index + count; index++) { - page = xa_load(&mapping->i_pages, index); - if (!page || xa_is_value(page)) { - page = __page_cache_alloc(readahead_gfp_mask(mapping)); - if (!page) - break; - page->index = index; - list_add(&page->lru, &pages); - nr_pages++; - } - } - blk_start_plug(&plug); - f2fs_mpage_readpages(mapping, &pages, NULL, nr_pages, true); - blk_finish_plug(&plug); -} - static struct page *f2fs_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) @@ -266,8 +235,8 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode, if (page) put_page(page); else if (num_ra_pages > 1) - f2fs_merkle_tree_readahead(inode->i_mapping, index, - num_ra_pages); + page_cache_readahead_unbounded(inode->i_mapping, NULL, + index, num_ra_pages, 0); page = read_mapping_page(inode->i_mapping, index, NULL); } return page; diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig index 718163d0c621..ca31993dcb47 100644 --- a/fs/fat/Kconfig +++ b/fs/fat/Kconfig @@ -69,7 +69,7 @@ config VFAT_FS The VFAT support enlarges your kernel by about 10 KB and it only works if you said Y to the "DOS FAT fs support" above. Please read - the file <file:Documentation/filesystems/vfat.txt> for details. If + the file <file:Documentation/filesystems/vfat.rst> for details. If unsure, say Y. To compile this as a module, choose M here: the module will be called @@ -82,7 +82,7 @@ config FAT_DEFAULT_CODEPAGE help This option should be set to the codepage of your FAT filesystems. It can be overridden with the "codepage" mount option. - See <file:Documentation/filesystems/vfat.txt> for more information. + See <file:Documentation/filesystems/vfat.rst> for more information. config FAT_DEFAULT_IOCHARSET string "Default iocharset for FAT" @@ -96,7 +96,7 @@ config FAT_DEFAULT_IOCHARSET Note that "utf8" is not recommended for FAT filesystems. If unsure, you shouldn't set "utf8" here - select the next option instead if you would like to use UTF-8 encoded file names by default. - See <file:Documentation/filesystems/vfat.txt> for more information. + See <file:Documentation/filesystems/vfat.rst> for more information. Enable any character sets you need in File Systems/Native Language Support. @@ -114,4 +114,4 @@ config FAT_DEFAULT_UTF8 Say Y if you use UTF-8 encoding for file names, N otherwise. - See <file:Documentation/filesystems/vfat.txt> for more information. + See <file:Documentation/filesystems/vfat.rst> for more information. diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 054acd9fd033..b4ddf48fa444 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -804,8 +804,6 @@ static long fat_dir_ioctl(struct file *filp, unsigned int cmd, return fat_generic_ioctl(filp, cmd, arg); } - if (!access_ok(d1, sizeof(struct __fat_dirent[2]))) - return -EFAULT; /* * Yes, we don't need this put_user() absolutely. However old * code didn't return the right value. So, app use this value, @@ -844,8 +842,6 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd, return fat_generic_ioctl(filp, cmd, (unsigned long)arg); } - if (!access_ok(d1, sizeof(struct compat_dirent[2]))) - return -EFAULT; /* * Yes, we don't need this put_user() absolutely. However old * code didn't return the right value. So, app use this value, diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 71946da84388..e6e68b2274a5 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -210,10 +210,9 @@ static int fat_readpage(struct file *file, struct page *page) return mpage_readpage(page, fat_get_block); } -static int fat_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void fat_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, fat_get_block); + mpage_readahead(rac, fat_get_block); } static void fat_write_failed(struct address_space *mapping, loff_t to) @@ -344,7 +343,7 @@ int fat_block_truncate_page(struct inode *inode, loff_t from) static const struct address_space_operations fat_aops = { .readpage = fat_readpage, - .readpages = fat_readpages, + .readahead = fat_readahead, .writepage = fat_writepage, .writepages = fat_writepages, .write_begin = fat_write_begin, diff --git a/fs/file.c b/fs/file.c index c8a4e4c86e55..abb8b7081d7a 100644 --- a/fs/file.c +++ b/fs/file.c @@ -70,7 +70,7 @@ static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt, */ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) { - unsigned int cpy, set; + size_t cpy, set; BUG_ON(nfdt->max_fds < ofdt->max_fds); diff --git a/fs/file_table.c b/fs/file_table.c index 30d55c9a1744..676e620948d2 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -198,6 +198,7 @@ static struct file *alloc_file(const struct path *path, int flags, file->f_inode = path->dentry->d_inode; file->f_mapping = path->dentry->d_inode->i_mapping; file->f_wb_err = filemap_sample_wb_err(file->f_mapping); + file->f_sb_err = file_sample_sb_err(file); if ((file->f_mode & FMODE_READ) && likely(fop->read || fop->read_iter)) file->f_mode |= FMODE_CAN_READ; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d85323607b49..a750381d554a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1070,7 +1070,6 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, static unsigned long get_nr_dirty_pages(void) { return global_node_page_state(NR_FILE_DIRTY) + - global_node_page_state(NR_UNSTABLE_NFS) + get_nr_dirty_inodes(); } diff --git a/fs/fs_context.c b/fs/fs_context.c index fc9f6ef93b55..7d5c5dd2b1d5 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -42,7 +42,6 @@ static const struct constant_table common_set_sb_flag[] = { { "dirsync", SB_DIRSYNC }, { "lazytime", SB_LAZYTIME }, { "mand", SB_MANDLOCK }, - { "posixacl", SB_POSIXACL }, { "ro", SB_RDONLY }, { "sync", SB_SYNCHRONOUS }, { }, @@ -53,44 +52,15 @@ static const struct constant_table common_clear_sb_flag[] = { { "nolazytime", SB_LAZYTIME }, { "nomand", SB_MANDLOCK }, { "rw", SB_RDONLY }, - { "silent", SB_SILENT }, { }, }; -static const char *const forbidden_sb_flag[] = { - "bind", - "dev", - "exec", - "move", - "noatime", - "nodev", - "nodiratime", - "noexec", - "norelatime", - "nostrictatime", - "nosuid", - "private", - "rec", - "relatime", - "remount", - "shared", - "slave", - "strictatime", - "suid", - "unbindable", -}; - /* * Check for a common mount option that manipulates s_flags. */ static int vfs_parse_sb_flag(struct fs_context *fc, const char *key) { unsigned int token; - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(forbidden_sb_flag); i++) - if (strcmp(key, forbidden_sb_flag[i]) == 0) - return -EINVAL; token = lookup_constant(common_set_sb_flag, key, 0); if (token) { diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig index 506c5e643f0d..5e796e6c38e5 100644 --- a/fs/fscache/Kconfig +++ b/fs/fscache/Kconfig @@ -8,7 +8,7 @@ config FSCACHE Different sorts of caches can be plugged in, depending on the resources available. - See Documentation/filesystems/caching/fscache.txt for more information. + See Documentation/filesystems/caching/fscache.rst for more information. config FSCACHE_STATS bool "Gather statistical information on local caching" @@ -25,7 +25,7 @@ config FSCACHE_STATS between CPUs. On the other hand, the stats are very useful for debugging purposes. Saying 'Y' here is recommended. - See Documentation/filesystems/caching/fscache.txt for more information. + See Documentation/filesystems/caching/fscache.rst for more information. config FSCACHE_HISTOGRAM bool "Gather latency information on local caching" @@ -42,7 +42,7 @@ config FSCACHE_HISTOGRAM bouncing between CPUs. On the other hand, the histogram may be useful for debugging purposes. Saying 'N' here is recommended. - See Documentation/filesystems/caching/fscache.txt for more information. + See Documentation/filesystems/caching/fscache.rst for more information. config FSCACHE_DEBUG bool "Debug FS-Cache" @@ -52,7 +52,7 @@ config FSCACHE_DEBUG management module. If this is set, the debugging output may be enabled by setting bits in /sys/modules/fscache/parameter/debug. - See Documentation/filesystems/caching/fscache.txt for more information. + See Documentation/filesystems/caching/fscache.rst for more information. config FSCACHE_OBJECT_LIST bool "Maintain global object list for debugging purposes" diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c index f78793f3d21e..fcc136361415 100644 --- a/fs/fscache/cache.c +++ b/fs/fscache/cache.c @@ -172,7 +172,7 @@ no_preference: * * Initialise a record of a cache and fill in the name. * - * See Documentation/filesystems/caching/backend-api.txt for a complete + * See Documentation/filesystems/caching/backend-api.rst for a complete * description. */ void fscache_init_cache(struct fscache_cache *cache, @@ -207,7 +207,7 @@ EXPORT_SYMBOL(fscache_init_cache); * * Add a cache to the system, making it available for netfs's to use. * - * See Documentation/filesystems/caching/backend-api.txt for a complete + * See Documentation/filesystems/caching/backend-api.rst for a complete * description. */ int fscache_add_cache(struct fscache_cache *cache, @@ -307,7 +307,7 @@ EXPORT_SYMBOL(fscache_add_cache); * Note that an I/O error occurred in a cache and that it should no longer be * used for anything. This also reports the error into the kernel log. * - * See Documentation/filesystems/caching/backend-api.txt for a complete + * See Documentation/filesystems/caching/backend-api.rst for a complete * description. */ void fscache_io_error(struct fscache_cache *cache) @@ -355,7 +355,7 @@ static void fscache_withdraw_all_objects(struct fscache_cache *cache, * Withdraw a cache from service, unbinding all its cache objects from the * netfs cookies they're currently representing. * - * See Documentation/filesystems/caching/backend-api.txt for a complete + * See Documentation/filesystems/caching/backend-api.rst for a complete * description. */ void fscache_withdraw_cache(struct fscache_cache *cache) diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 0ce39658a620..751bc5b1cddf 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -4,7 +4,7 @@ * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * - * See Documentation/filesystems/caching/netfs-api.txt for more information on + * See Documentation/filesystems/caching/netfs-api.rst for more information on * the netfs API. */ diff --git a/fs/fscache/object.c b/fs/fscache/object.c index cfeba839a0f2..cb2146e02cd5 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -4,7 +4,7 @@ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * - * See Documentation/filesystems/caching/object.txt for a description of the + * See Documentation/filesystems/caching/object.rst for a description of the * object state machine and the in-kernel representations. */ @@ -295,7 +295,7 @@ static void fscache_object_work_func(struct work_struct *work) * * Initialise a cache object description to its basic values. * - * See Documentation/filesystems/caching/backend-api.txt for a complete + * See Documentation/filesystems/caching/backend-api.rst for a complete * description. */ void fscache_object_init(struct fscache_object *object, diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 1a22a55f75a0..4a5651d4904e 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -4,7 +4,7 @@ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * - * See Documentation/filesystems/caching/operations.txt + * See Documentation/filesystems/caching/operations.rst */ #define FSCACHE_DEBUG_LEVEL OPERATION diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index eb2a585572dc..774b2618018a 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -12,7 +12,7 @@ config FUSE_FS although chances are your distribution already has that library installed if you've installed the "fuse" package itself. - See <file:Documentation/filesystems/fuse.txt> for more information. + See <file:Documentation/filesystems/fuse.rst> for more information. See <file:Documentation/Changes> for needed library/utility version. If you want to develop a userspace FS, or if you want to use diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 97eec7522bf2..c7a65cf2bcca 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2081,7 +2081,7 @@ static void end_polls(struct fuse_conn *fc) * The same effect is usually achievable through killing the filesystem daemon * and all users of the filesystem. The exception is the combination of an * asynchronous request and the tricky deadlock (see - * Documentation/filesystems/fuse.txt). + * Documentation/filesystems/fuse.rst). * * Aborting requests under I/O goes as follows: 1: Separate out unlocked * requests, they should be finished off immediately. Locked requests will be diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 9d67b830fb7a..bac51c32d660 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -915,84 +915,40 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) fuse_readpages_end(fc, &ap->args, err); } -struct fuse_fill_data { - struct fuse_io_args *ia; - struct file *file; - struct inode *inode; - unsigned int nr_pages; - unsigned int max_pages; -}; - -static int fuse_readpages_fill(void *_data, struct page *page) +static void fuse_readahead(struct readahead_control *rac) { - struct fuse_fill_data *data = _data; - struct fuse_io_args *ia = data->ia; - struct fuse_args_pages *ap = &ia->ap; - struct inode *inode = data->inode; + struct inode *inode = rac->mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); + unsigned int i, max_pages, nr_pages = 0; - fuse_wait_on_page_writeback(inode, page->index); - - if (ap->num_pages && - (ap->num_pages == fc->max_pages || - (ap->num_pages + 1) * PAGE_SIZE > fc->max_read || - ap->pages[ap->num_pages - 1]->index + 1 != page->index)) { - data->max_pages = min_t(unsigned int, data->nr_pages, - fc->max_pages); - fuse_send_readpages(ia, data->file); - data->ia = ia = fuse_io_alloc(NULL, data->max_pages); - if (!ia) { - unlock_page(page); - return -ENOMEM; - } - ap = &ia->ap; - } - - if (WARN_ON(ap->num_pages >= data->max_pages)) { - unlock_page(page); - fuse_io_free(ia); - return -EIO; - } - - get_page(page); - ap->pages[ap->num_pages] = page; - ap->descs[ap->num_pages].length = PAGE_SIZE; - ap->num_pages++; - data->nr_pages--; - return 0; -} - -static int fuse_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) -{ - struct inode *inode = mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_fill_data data; - int err; - - err = -EIO; if (is_bad_inode(inode)) - goto out; + return; - data.file = file; - data.inode = inode; - data.nr_pages = nr_pages; - data.max_pages = min_t(unsigned int, nr_pages, fc->max_pages); -; - data.ia = fuse_io_alloc(NULL, data.max_pages); - err = -ENOMEM; - if (!data.ia) - goto out; + max_pages = min_t(unsigned int, fc->max_pages, + fc->max_read / PAGE_SIZE); - err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); - if (!err) { - if (data.ia->ap.num_pages) - fuse_send_readpages(data.ia, file); - else - fuse_io_free(data.ia); + for (;;) { + struct fuse_io_args *ia; + struct fuse_args_pages *ap; + + nr_pages = readahead_count(rac) - nr_pages; + if (nr_pages > max_pages) + nr_pages = max_pages; + if (nr_pages == 0) + break; + ia = fuse_io_alloc(NULL, nr_pages); + if (!ia) + return; + ap = &ia->ap; + nr_pages = __readahead_batch(rac, ap->pages, nr_pages); + for (i = 0; i < nr_pages; i++) { + fuse_wait_on_page_writeback(inode, + readahead_index(rac) + i); + ap->descs[i].length = PAGE_SIZE; + } + ap->num_pages = nr_pages; + fuse_send_readpages(ia, rac->file); } -out: - return err; } static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to) @@ -3373,10 +3329,10 @@ static const struct file_operations fuse_file_operations = { static const struct address_space_operations fuse_file_aops = { .readpage = fuse_readpage, + .readahead = fuse_readahead, .writepage = fuse_writepage, .writepages = fuse_writepages, .launder_page = fuse_launder_page, - .readpages = fuse_readpages, .set_page_dirty = __set_page_dirty_nobuffers, .bmap = fuse_bmap, .direct_IO = fuse_direct_IO, diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 786c1ce8f030..72c9560f4467 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -577,7 +577,7 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos, } /** - * gfs2_readpages - Read a bunch of pages at once + * gfs2_readahead - Read a bunch of pages at once * @file: The file to read from * @mapping: Address space info * @pages: List of pages to read @@ -590,31 +590,24 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos, * obviously not something we'd want to do on too regular a basis. * Any I/O we ignore at this time will be done via readpage later. * 2. We don't handle stuffed files here we let readpage do the honours. - * 3. mpage_readpages() does most of the heavy lifting in the common case. + * 3. mpage_readahead() does most of the heavy lifting in the common case. * 4. gfs2_block_map() is relied upon to set BH_Boundary in the right places. */ -static int gfs2_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void gfs2_readahead(struct readahead_control *rac) { - struct inode *inode = mapping->host; + struct inode *inode = rac->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_holder gh; - int ret; gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); - ret = gfs2_glock_nq(&gh); - if (unlikely(ret)) + if (gfs2_glock_nq(&gh)) goto out_uninit; if (!gfs2_is_stuffed(ip)) - ret = mpage_readpages(mapping, pages, nr_pages, gfs2_block_map); + mpage_readahead(rac, gfs2_block_map); gfs2_glock_dq(&gh); out_uninit: gfs2_holder_uninit(&gh); - if (unlikely(gfs2_withdrawn(sdp))) - ret = -EIO; - return ret; } /** @@ -833,7 +826,7 @@ static const struct address_space_operations gfs2_aops = { .writepage = gfs2_writepage, .writepages = gfs2_writepages, .readpage = gfs2_readpage, - .readpages = gfs2_readpages, + .readahead = gfs2_readahead, .bmap = gfs2_bmap, .invalidatepage = gfs2_invalidatepage, .releasepage = gfs2_releasepage, @@ -847,7 +840,7 @@ static const struct address_space_operations gfs2_jdata_aops = { .writepage = gfs2_jdata_writepage, .writepages = gfs2_jdata_writepages, .readpage = gfs2_readpage, - .readpages = gfs2_readpages, + .readahead = gfs2_readahead, .set_page_dirty = jdata_set_page_dirty, .bmap = gfs2_bmap, .invalidatepage = gfs2_invalidatepage, diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 936a8ec6b48e..6306eaae378b 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -528,10 +528,12 @@ lower_metapath: /* Advance in metadata tree. */ (mp->mp_list[hgt])++; - if (mp->mp_list[hgt] >= sdp->sd_inptrs) { - if (!hgt) + if (hgt) { + if (mp->mp_list[hgt] >= sdp->sd_inptrs) + goto lower_metapath; + } else { + if (mp->mp_list[hgt] >= sdp->sd_diptrs) break; - goto lower_metapath; } fill_up_metapath: @@ -876,10 +878,9 @@ static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, ret = -ENOENT; goto unlock; } else { - /* report a hole */ iomap->offset = pos; iomap->length = length; - goto do_alloc; + goto hole_found; } } iomap->length = size; @@ -933,8 +934,6 @@ unlock: return ret; do_alloc: - iomap->addr = IOMAP_NULL_ADDR; - iomap->type = IOMAP_HOLE; if (flags & IOMAP_REPORT) { if (pos >= size) ret = -ENOENT; @@ -956,6 +955,9 @@ do_alloc: if (pos < size && height == ip->i_height) ret = gfs2_hole_size(inode, lblock, len, mp, iomap); } +hole_found: + iomap->addr = IOMAP_NULL_ADDR; + iomap->type = IOMAP_HOLE; goto out; } diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index c3f7732415be..c0f2875c946c 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -354,7 +354,7 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip) hc = kmalloc(hsize, GFP_NOFS | __GFP_NOWARN); if (hc == NULL) - hc = __vmalloc(hsize, GFP_NOFS, PAGE_KERNEL); + hc = __vmalloc(hsize, GFP_NOFS); if (hc == NULL) return ERR_PTR(-ENOMEM); @@ -1166,7 +1166,7 @@ static int dir_double_exhash(struct gfs2_inode *dip) hc2 = kmalloc_array(hsize_bytes, 2, GFP_NOFS | __GFP_NOWARN); if (hc2 == NULL) - hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS, PAGE_KERNEL); + hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS); if (!hc2) return -ENOMEM; @@ -1327,7 +1327,7 @@ static void *gfs2_alloc_sort_buffer(unsigned size) if (size < KMALLOC_MAX_SIZE) ptr = kmalloc(size, GFP_NOFS | __GFP_NOWARN); if (!ptr) - ptr = __vmalloc(size, GFP_NOFS, PAGE_KERNEL); + ptr = __vmalloc(size, GFP_NOFS); return ptr; } @@ -1987,8 +1987,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, ht = kzalloc(size, GFP_NOFS | __GFP_NOWARN); if (ht == NULL) - ht = __vmalloc(size, GFP_NOFS | __GFP_NOWARN | __GFP_ZERO, - PAGE_KERNEL); + ht = __vmalloc(size, GFP_NOFS | __GFP_NOWARN | __GFP_ZERO); if (!ht) return -ENOMEM; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 29f9b6684b74..bf70e3b14938 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -613,7 +613,7 @@ __acquires(&gl->gl_lockref.lock) fs_err(sdp, "Error %d syncing glock \n", ret); gfs2_dump_glock(NULL, gl, true); } - return; + goto skip_inval; } } if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) { @@ -633,6 +633,7 @@ __acquires(&gl->gl_lockref.lock) clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); } +skip_inval: gfs2_glock_hold(gl); /* * Check for an error encountered since we called go_sync and go_inval. @@ -722,9 +723,6 @@ __acquires(&gl->gl_lockref.lock) goto out_unlock; if (nonblock) goto out_sched; - smp_mb(); - if (atomic_read(&gl->gl_revokes) != 0) - goto out_sched; set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE); gl->gl_target = gl->gl_demote_state; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 70b2d3a1e866..5acd3ce30759 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -622,7 +622,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = finish_no_open(file, NULL); } gfs2_glock_dq_uninit(ghs); - return error; + goto fail; } else if (error != -ENOENT) { goto fail_gunlock; } @@ -764,9 +764,11 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = finish_open(file, dentry, gfs2_open_common); } gfs2_glock_dq_uninit(ghs); + gfs2_qa_put(ip); gfs2_glock_dq_uninit(ghs + 1); clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags); gfs2_glock_put(io_gl); + gfs2_qa_put(dip); return error; fail_gunlock3: @@ -776,7 +778,6 @@ fail_gunlock2: clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags); gfs2_glock_put(io_gl); fail_free_inode: - gfs2_qa_put(ip); if (ip->i_gl) { glock_clear_object(ip->i_gl, ip); gfs2_glock_put(ip->i_gl); @@ -1005,7 +1006,7 @@ out_gunlock: out_child: gfs2_glock_dq(ghs); out_parent: - gfs2_qa_put(ip); + gfs2_qa_put(dip); gfs2_holder_uninit(ghs); gfs2_holder_uninit(ghs + 1); return error; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 3a75843ae580..0644e58c6191 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -669,13 +669,13 @@ void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) struct buffer_head *bh = bd->bd_bh; struct gfs2_glock *gl = bd->bd_gl; + sdp->sd_log_num_revoke++; + if (atomic_inc_return(&gl->gl_revokes) == 1) + gfs2_glock_hold(gl); bh->b_private = NULL; bd->bd_blkno = bh->b_blocknr; gfs2_remove_from_ail(bd); /* drops ref on bh */ bd->bd_bh = NULL; - sdp->sd_log_num_revoke++; - if (atomic_inc_return(&gl->gl_revokes) == 1) - gfs2_glock_hold(gl); set_bit(GLF_LFLUSH, &gl->gl_flags); list_add(&bd->bd_list, &sdp->sd_log_revokes); } @@ -1131,6 +1131,10 @@ int gfs2_logd(void *data) while (!kthread_should_stop()) { + if (gfs2_withdrawn(sdp)) { + msleep_interruptible(HZ); + continue; + } /* Check for errors writing to the journal */ if (sdp->sd_log_error) { gfs2_lm(sdp, @@ -1139,6 +1143,7 @@ int gfs2_logd(void *data) "prevent further damage.\n", sdp->sd_fsname, sdp->sd_log_error); gfs2_withdraw(sdp); + continue; } did_flush = false; diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 5ea96757afc4..cb2a11b458c6 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -263,7 +263,7 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno, struct super_block *sb = sdp->sd_vfs; struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); - bio->bi_iter.bi_sector = blkno << (sb->s_blocksize_bits - 9); + bio->bi_iter.bi_sector = blkno << sdp->sd_fsb2bb_shift; bio_set_dev(bio, sb->s_bdev); bio->bi_end_io = end_io; bio->bi_private = sdp; @@ -509,12 +509,12 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, unsigned int bsize = sdp->sd_sb.sb_bsize, off; unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift; unsigned int shift = PAGE_SHIFT - bsize_shift; - unsigned int readahead_blocks = BIO_MAX_PAGES << shift; + unsigned int max_blocks = 2 * 1024 * 1024 >> bsize_shift; struct gfs2_journal_extent *je; int sz, ret = 0; struct bio *bio = NULL; struct page *page = NULL; - bool bio_chained = false, done = false; + bool done = false; errseq_t since; memset(head, 0, sizeof(*head)); @@ -537,30 +537,30 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, off = 0; } - if (!bio || (bio_chained && !off)) { - /* start new bio */ - } else { - sz = bio_add_page(bio, page, bsize, off); - if (sz == bsize) - goto block_added; + if (bio && (off || block < blocks_submitted + max_blocks)) { + sector_t sector = dblock << sdp->sd_fsb2bb_shift; + + if (bio_end_sector(bio) == sector) { + sz = bio_add_page(bio, page, bsize, off); + if (sz == bsize) + goto block_added; + } if (off) { unsigned int blocks = (PAGE_SIZE - off) >> bsize_shift; bio = gfs2_chain_bio(bio, blocks); - bio_chained = true; goto add_block_to_new_bio; } } if (bio) { - blocks_submitted = block + 1; + blocks_submitted = block; submit_bio(bio); } bio = gfs2_log_alloc_bio(sdp, dblock, gfs2_end_log_read); bio->bi_opf = REQ_OP_READ; - bio_chained = false; add_block_to_new_bio: sz = bio_add_page(bio, page, bsize, off); BUG_ON(sz != bsize); @@ -568,7 +568,7 @@ block_added: off += bsize; if (off == PAGE_SIZE) page = NULL; - if (blocks_submitted < blocks_read + readahead_blocks) { + if (blocks_submitted <= blocks_read + max_blocks) { /* Keep at least one bio in flight */ continue; } diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 4b72abcf83b2..9856cc2e0795 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -252,7 +252,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, int num = 0; if (unlikely(gfs2_withdrawn(sdp)) && - (!sdp->sd_jdesc || (blkno != sdp->sd_jdesc->jd_no_addr))) { + (!sdp->sd_jdesc || gl != sdp->sd_jinode_gl)) { *bhp = NULL; return -EIO; } diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index cc0c4b5800be..4b67d47a7e00 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1051,8 +1051,7 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) u32 x; int error = 0; - if (capable(CAP_SYS_RESOURCE) || - sdp->sd_args.ar_quota != GFS2_QUOTA_ON) + if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON) return 0; error = gfs2_quota_hold(ip, uid, gid); @@ -1125,7 +1124,7 @@ void gfs2_quota_unlock(struct gfs2_inode *ip) int found; if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags)) - goto out; + return; for (x = 0; x < ip->i_qadata->qa_qd_num; x++) { struct gfs2_quota_data *qd; @@ -1162,7 +1161,6 @@ void gfs2_quota_unlock(struct gfs2_inode *ip) qd_unlock(qda[x]); } -out: gfs2_quota_unhold(ip); } @@ -1210,9 +1208,6 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid, if (!test_bit(GIF_QD_LOCKED, &ip->i_flags)) return 0; - if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON) - return 0; - for (x = 0; x < ip->i_qadata->qa_qd_num; x++) { qd = ip->i_qadata->qa_qd[x]; @@ -1270,7 +1265,9 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change, if (ip->i_diskflags & GFS2_DIF_SYSTEM) return; - BUG_ON(ip->i_qadata->qa_ref <= 0); + if (gfs2_assert_withdraw(sdp, ip->i_qadata && + ip->i_qadata->qa_ref > 0)) + return; for (x = 0; x < ip->i_qadata->qa_qd_num; x++) { qd = ip->i_qadata->qa_qd[x]; @@ -1368,7 +1365,7 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) sdp->sd_quota_bitmap = kzalloc(bm_size, GFP_NOFS | __GFP_NOWARN); if (sdp->sd_quota_bitmap == NULL) sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS | - __GFP_ZERO, PAGE_KERNEL); + __GFP_ZERO); if (!sdp->sd_quota_bitmap) return error; diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 7f9ca8ef40fc..21ada332d555 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -44,7 +44,8 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip, int ret; ap->allowed = UINT_MAX; /* Assume we are permitted a whole lot */ - if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) + if (capable(CAP_SYS_RESOURCE) || + sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) return 0; ret = gfs2_quota_lock(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); if (ret) diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 37fc41632aa2..956fced0a8ec 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1404,7 +1404,6 @@ out: if (ip->i_qadata) gfs2_assert_warn(sdp, ip->i_qadata->qa_ref == 0); gfs2_rs_delete(ip, NULL); - gfs2_qa_put(ip); gfs2_ordered_del_inode(ip); clear_inode(inode); gfs2_dir_hash_inval(ip); diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 9b64d40ab379..aa087a5675af 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -119,6 +119,12 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) if (!sb_rdonly(sdp->sd_vfs)) ret = gfs2_make_fs_ro(sdp); + if (sdp->sd_lockstruct.ls_ops->lm_lock == NULL) { /* lock_nolock */ + if (!ret) + ret = -EIO; + clear_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags); + goto skip_recovery; + } /* * Drop the glock for our journal so another node can recover it. */ @@ -159,10 +165,6 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) wait_on_bit(&gl->gl_flags, GLF_FREEING, TASK_UNINTERRUPTIBLE); } - if (sdp->sd_lockstruct.ls_ops->lm_lock == NULL) { /* lock_nolock */ - clear_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags); - goto skip_recovery; - } /* * Dequeue the "live" glock, but keep a reference so it's never freed. */ diff --git a/fs/hfs/Kconfig b/fs/hfs/Kconfig index 44f6e89bcb75..129926b5142d 100644 --- a/fs/hfs/Kconfig +++ b/fs/hfs/Kconfig @@ -6,7 +6,7 @@ config HFS_FS help If you say Y here, you will be able to mount Macintosh-formatted floppy disks and hard drive partitions with full read-write access. - Please read <file:Documentation/filesystems/hfs.txt> to learn about + Please read <file:Documentation/filesystems/hfs.rst> to learn about the available mount options. To compile this file system support as a module, choose M here: the diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig index 56aa0336254a..2b36dc6f0a10 100644 --- a/fs/hpfs/Kconfig +++ b/fs/hpfs/Kconfig @@ -9,7 +9,7 @@ config HPFS_FS write files to an OS/2 HPFS partition on your hard drive. OS/2 floppies however are in regular MSDOS format, so you don't need this option in order to be able to read them. Read - <file:Documentation/filesystems/hpfs.txt>. + <file:Documentation/filesystems/hpfs.rst>. To compile this file system support as a module, choose M here: the module will be called hpfs. If unsure, say N. diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index b36abf9cb345..2de0d3492d15 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -125,10 +125,9 @@ static int hpfs_writepage(struct page *page, struct writeback_control *wbc) return block_write_full_page(page, hpfs_get_block, wbc); } -static int hpfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void hpfs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, hpfs_get_block); + mpage_readahead(rac, hpfs_get_block); } static int hpfs_writepages(struct address_space *mapping, @@ -198,7 +197,7 @@ static int hpfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, const struct address_space_operations hpfs_aops = { .readpage = hpfs_readpage, .writepage = hpfs_writepage, - .readpages = hpfs_readpages, + .readahead = hpfs_readahead, .writepages = hpfs_writepages, .write_begin = hpfs_write_begin, .write_end = hpfs_write_end, diff --git a/fs/inode.c b/fs/inode.c index 93d9252a00ab..37226a9cfa4f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1606,14 +1606,14 @@ EXPORT_SYMBOL(iput); * @inode: inode owning the block number being requested * @block: pointer containing the block to find * - * Replaces the value in *block with the block number on the device holding + * Replaces the value in ``*block`` with the block number on the device holding * corresponding to the requested block number in the file. * That is, asked for block 4 of inode 1 the function will replace the - * 4 in *block, with disk block relative to the disk start that holds that + * 4 in ``*block``, with disk block relative to the disk start that holds that * block of the file. * * Returns -EINVAL in case of error, 0 otherwise. If mapping falls into a - * hole, returns 0 and *block is also set to 0. + * hole, returns 0 and ``*block`` is also set to 0. */ int bmap(struct inode *inode, sector_t *block) { diff --git a/fs/internal.h b/fs/internal.h index aa5d45524e87..0d467e32dd7e 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -126,7 +126,6 @@ extern struct open_how build_open_how(int flags, umode_t mode); extern int build_open_flags(const struct open_how *how, struct open_flags *op); long do_sys_ftruncate(unsigned int fd, loff_t length, int small); -long do_faccessat(int dfd, const char __user *filename, int mode); int do_fchmodat(int dfd, const char __user *filename, umode_t mode); int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag); diff --git a/fs/io_uring.c b/fs/io_uring.c index 381d50becd04..bb25e3997d41 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -524,6 +524,7 @@ enum { REQ_F_OVERFLOW_BIT, REQ_F_POLLED_BIT, REQ_F_BUFFER_SELECTED_BIT, + REQ_F_NO_FILE_TABLE_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -577,6 +578,8 @@ enum { REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), /* buffer already selected */ REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), + /* doesn't need file table for this request */ + REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT), }; struct async_poll { @@ -616,6 +619,8 @@ struct io_kiocb { bool needs_fixed_file; u8 opcode; + u16 buf_index; + struct io_ring_ctx *ctx; struct list_head list; unsigned int flags; @@ -677,8 +682,6 @@ struct io_op_def { unsigned needs_mm : 1; /* needs req->file assigned */ unsigned needs_file : 1; - /* needs req->file assigned IFF fd is >= 0 */ - unsigned fd_non_neg : 1; /* hash wq insertion if file is a regular file */ unsigned hash_reg_file : 1; /* unbound wq insertion if file is a non-regular file */ @@ -781,8 +784,6 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, }, [IORING_OP_OPENAT] = { - .needs_file = 1, - .fd_non_neg = 1, .file_table = 1, .needs_fs = 1, }, @@ -796,9 +797,8 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_STATX] = { .needs_mm = 1, - .needs_file = 1, - .fd_non_neg = 1, .needs_fs = 1, + .file_table = 1, }, [IORING_OP_READ] = { .needs_mm = 1, @@ -833,8 +833,6 @@ static const struct io_op_def io_op_defs[] = { .buffer_select = 1, }, [IORING_OP_OPENAT2] = { - .needs_file = 1, - .fd_non_neg = 1, .file_table = 1, .needs_fs = 1, }, @@ -928,6 +926,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) goto err; ctx->flags = p->flags; + init_waitqueue_head(&ctx->sqo_wait); init_waitqueue_head(&ctx->cq_wait); INIT_LIST_HEAD(&ctx->cq_overflow_list); init_completion(&ctx->completions[0]); @@ -1291,7 +1290,7 @@ static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx) struct io_kiocb *req; req = ctx->fallback_req; - if (!test_and_set_bit_lock(0, (unsigned long *) ctx->fallback_req)) + if (!test_and_set_bit_lock(0, (unsigned long *) &ctx->fallback_req)) return req; return NULL; @@ -1378,7 +1377,7 @@ static void __io_free_req(struct io_kiocb *req) if (likely(!io_is_fallback_req(req))) kmem_cache_free(req_cachep, req); else - clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req); + clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req); } struct req_batch { @@ -1398,10 +1397,6 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb) for (i = 0; i < rb->to_free; i++) { struct io_kiocb *req = rb->reqs[i]; - if (req->flags & REQ_F_FIXED_FILE) { - req->file = NULL; - percpu_ref_put(req->fixed_file_refs); - } if (req->flags & REQ_F_INFLIGHT) inflight++; __io_req_aux_free(req); @@ -1674,7 +1669,7 @@ static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req) if ((req->flags & REQ_F_LINK_HEAD) || io_is_fallback_req(req)) return false; - if (!(req->flags & REQ_F_FIXED_FILE) || req->io) + if (req->file || req->io) rb->need_iter++; rb->reqs[rb->to_free++] = req; @@ -2034,7 +2029,7 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd) * any file. For now, just ensure that anything potentially problematic is done * inline. */ -static bool io_file_supports_async(struct file *file) +static bool io_file_supports_async(struct file *file, int rw) { umode_t mode = file_inode(file)->i_mode; @@ -2043,7 +2038,13 @@ static bool io_file_supports_async(struct file *file) if (S_ISREG(mode) && file->f_op != &io_uring_fops) return true; - return false; + if (!(file->f_mode & FMODE_NOWAIT)) + return false; + + if (rw == READ) + return file->f_op->read_iter != NULL; + + return file->f_op->write_iter != NULL; } static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, @@ -2102,9 +2103,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, req->rw.addr = READ_ONCE(sqe->addr); req->rw.len = READ_ONCE(sqe->len); - /* we own ->private, reuse it for the buffer index / buffer ID */ - req->rw.kiocb.private = (void *) (unsigned long) - READ_ONCE(sqe->buf_index); + req->buf_index = READ_ONCE(sqe->buf_index); return 0; } @@ -2147,7 +2146,7 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw, struct io_ring_ctx *ctx = req->ctx; size_t len = req->rw.len; struct io_mapped_ubuf *imu; - unsigned index, buf_index; + u16 index, buf_index; size_t offset; u64 buf_addr; @@ -2155,7 +2154,7 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw, if (unlikely(!ctx->user_bufs)) return -EFAULT; - buf_index = (unsigned long) req->rw.kiocb.private; + buf_index = req->buf_index; if (unlikely(buf_index >= ctx->nr_user_bufs)) return -EFAULT; @@ -2271,10 +2270,10 @@ static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len, bool needs_lock) { struct io_buffer *kbuf; - int bgid; + u16 bgid; kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; - bgid = (int) (unsigned long) req->rw.kiocb.private; + bgid = req->buf_index; kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock); if (IS_ERR(kbuf)) return kbuf; @@ -2365,7 +2364,7 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, } /* buffer index only valid with fixed read/write, or buffer select */ - if (req->rw.kiocb.private && !(req->flags & REQ_F_BUFFER_SELECT)) + if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { @@ -2571,7 +2570,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock) * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so * we know to async punt it even if it was opened O_NONBLOCK */ - if (force_nonblock && !io_file_supports_async(req->file)) + if (force_nonblock && !io_file_supports_async(req->file, READ)) goto copy_iov; iov_count = iov_iter_count(&iter); @@ -2594,7 +2593,8 @@ copy_iov: if (ret) goto out_free; /* any defer here is final, must blocking retry */ - if (!(req->flags & REQ_F_NOWAIT)) + if (!(req->flags & REQ_F_NOWAIT) && + !file_can_poll(req->file)) req->flags |= REQ_F_MUST_PUNT; return -EAGAIN; } @@ -2662,7 +2662,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock) * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so * we know to async punt it even if it was opened O_NONBLOCK */ - if (force_nonblock && !io_file_supports_async(req->file)) + if (force_nonblock && !io_file_supports_async(req->file, WRITE)) goto copy_iov; /* file path doesn't support NOWAIT for non-direct_IO */ @@ -2716,7 +2716,8 @@ copy_iov: if (ret) goto out_free; /* any defer here is final, must blocking retry */ - req->flags |= REQ_F_MUST_PUNT; + if (!file_can_poll(req->file)) + req->flags |= REQ_F_MUST_PUNT; return -EAGAIN; } } @@ -2756,15 +2757,6 @@ static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static bool io_splice_punt(struct file *file) -{ - if (get_pipe_info(file)) - return false; - if (!io_file_supports_async(file)) - return true; - return !(file->f_flags & O_NONBLOCK); -} - static int io_splice(struct io_kiocb *req, bool force_nonblock) { struct io_splice *sp = &req->splice; @@ -2772,19 +2764,16 @@ static int io_splice(struct io_kiocb *req, bool force_nonblock) struct file *out = sp->file_out; unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; loff_t *poff_in, *poff_out; - long ret; + long ret = 0; - if (force_nonblock) { - if (io_splice_punt(in) || io_splice_punt(out)) - return -EAGAIN; - flags |= SPLICE_F_NONBLOCK; - } + if (force_nonblock) + return -EAGAIN; poff_in = (sp->off_in == -1) ? NULL : &sp->off_in; poff_out = (sp->off_out == -1) ? NULL : &sp->off_out; - ret = do_splice(in, poff_in, out, poff_out, sp->len, flags); - if (force_nonblock && ret == -EAGAIN) - return -EAGAIN; + + if (sp->len) + ret = do_splice(in, poff_in, out, poff_out, sp->len, flags); io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED)); req->flags &= ~REQ_F_NEED_CLEANUP; @@ -3355,8 +3344,12 @@ static int io_statx(struct io_kiocb *req, bool force_nonblock) struct kstat stat; int ret; - if (force_nonblock) + if (force_nonblock) { + /* only need file table for an actual valid fd */ + if (ctx->dfd == -1 || ctx->dfd == AT_FDCWD) + req->flags |= REQ_F_NO_FILE_TABLE; return -EAGAIN; + } if (vfs_stat_set_lookup_flags(&lookup_flags, ctx->how.flags)) return -EINVAL; @@ -3502,7 +3495,7 @@ static void io_sync_file_range_finish(struct io_wq_work **workptr) if (io_req_cancelled(req)) return; __io_sync_file_range(req); - io_put_req(req); /* put submission ref */ + io_steal_work(req, workptr); } static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock) @@ -4142,12 +4135,14 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, req->result = mask; init_task_work(&req->task_work, func); /* - * If this fails, then the task is exiting. Punt to one of the io-wq - * threads to ensure the work gets run, we can't always rely on exit - * cancelation taking care of this. + * If this fails, then the task is exiting. When a task exits, the + * work gets canceled, so just cancel this request as well instead + * of executing it. We can't safely execute it anyway, as we may not + * have the needed state needed for it anyway. */ ret = task_work_add(tsk, &req->task_work, true); if (unlikely(ret)) { + WRITE_ONCE(poll->canceled, true); tsk = io_wq_get_task(req->ctx->io_wq); task_work_add(tsk, &req->task_work, true); } @@ -4200,17 +4195,17 @@ static void io_async_task_func(struct callback_head *cb) spin_unlock_irq(&ctx->completion_lock); + /* restore ->work in case we need to retry again */ + memcpy(&req->work, &apoll->work, sizeof(req->work)); + if (canceled) { kfree(apoll); io_cqring_ev_posted(ctx); req_set_fail_links(req); - io_put_req(req); + io_double_put_req(req); return; } - /* restore ->work in case we need to retry again */ - memcpy(&req->work, &apoll->work, sizeof(req->work)); - __set_current_state(TASK_RUNNING); mutex_lock(&ctx->uring_lock); __io_queue_sqe(req, NULL); @@ -4369,7 +4364,7 @@ static bool io_poll_remove_one(struct io_kiocb *req) hash_del(&req->hash_node); - if (apoll) { + if (do_complete && apoll) { /* * restore ->work because we need to call io_req_work_drop_env. */ @@ -5015,15 +5010,16 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) int ret; /* Still need defer if there is pending req in defer list. */ - if (!req_need_defer(req) && list_empty(&ctx->defer_list)) + if (!req_need_defer(req) && list_empty_careful(&ctx->defer_list)) return 0; - if (!req->io && io_alloc_async_ctx(req)) - return -EAGAIN; - - ret = io_req_defer_prep(req, sqe); - if (ret < 0) - return ret; + if (!req->io) { + if (io_alloc_async_ctx(req)) + return -EAGAIN; + ret = io_req_defer_prep(req, sqe); + if (ret < 0) + return ret; + } spin_lock_irq(&ctx->completion_lock); if (!req_need_defer(req) && list_empty(&ctx->defer_list)) { @@ -5310,7 +5306,8 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) return ret; - if (ctx->flags & IORING_SETUP_IOPOLL) { + /* If the op doesn't have a file, we're not polling for it */ + if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) { const bool in_async = io_wq_current_is_worker(); if (req->result == -EAGAIN) @@ -5364,15 +5361,6 @@ static void io_wq_submit_work(struct io_wq_work **workptr) io_steal_work(req, workptr); } -static int io_req_needs_file(struct io_kiocb *req, int fd) -{ - if (!io_op_defs[req->opcode].needs_file) - return 0; - if ((fd == -1 || fd == AT_FDCWD) && io_op_defs[req->opcode].fd_non_neg) - return 0; - return 1; -} - static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, int index) { @@ -5410,14 +5398,11 @@ static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, } static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, - int fd, unsigned int flags) + int fd) { bool fixed; - if (!io_req_needs_file(req, fd)) - return 0; - - fixed = (flags & IOSQE_FIXED_FILE); + fixed = (req->flags & REQ_F_FIXED_FILE) != 0; if (unlikely(!fixed && req->needs_fixed_file)) return -EBADF; @@ -5429,7 +5414,7 @@ static int io_grab_files(struct io_kiocb *req) int ret = -EBADF; struct io_ring_ctx *ctx = req->ctx; - if (req->work.files) + if (req->work.files || (req->flags & REQ_F_NO_FILE_TABLE)) return 0; if (!ctx->ring_file) return -EBADF; @@ -5623,9 +5608,15 @@ fail_req: io_double_put_req(req); } } else if (req->flags & REQ_F_FORCE_ASYNC) { - ret = io_req_defer_prep(req, sqe); - if (unlikely(ret < 0)) - goto fail_req; + if (!req->io) { + ret = -EAGAIN; + if (io_alloc_async_ctx(req)) + goto fail_req; + ret = io_req_defer_prep(req, sqe); + if (unlikely(ret < 0)) + goto fail_req; + } + /* * Never try inline submit of IOSQE_ASYNC is set, go straight * to async execution. @@ -5794,7 +5785,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, struct io_submit_state *state, bool async) { unsigned int sqe_flags; - int id, fd; + int id; /* * All io need record the previous position, if LINK vs DARIN, @@ -5846,8 +5837,10 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, IOSQE_ASYNC | IOSQE_FIXED_FILE | IOSQE_BUFFER_SELECT | IOSQE_IO_LINK); - fd = READ_ONCE(sqe->fd); - return io_req_set_file(state, req, fd, sqe_flags); + if (!io_op_defs[req->opcode].needs_file) + return 0; + + return io_req_set_file(state, req, READ_ONCE(sqe->fd)); } static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, @@ -6039,6 +6032,7 @@ static int io_sq_thread(void *data) finish_wait(&ctx->sqo_wait, &wait); ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; + ret = 0; continue; } finish_wait(&ctx->sqo_wait, &wait); @@ -6852,7 +6846,6 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, { int ret; - init_waitqueue_head(&ctx->sqo_wait); mmgrab(current->mm); ctx->sqo_mm = current->mm; @@ -7327,7 +7320,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) * it could cause shutdown to hang. */ while (ctx->sqo_thread && !wq_has_sleeper(&ctx->sqo_wait)) - cpu_relax(); + cond_resched(); io_kill_timeouts(ctx); io_poll_remove_all(ctx); @@ -7356,11 +7349,9 @@ static int io_uring_release(struct inode *inode, struct file *file) static void io_uring_cancel_files(struct io_ring_ctx *ctx, struct files_struct *files) { - struct io_kiocb *req; - DEFINE_WAIT(wait); - while (!list_empty_careful(&ctx->inflight_list)) { - struct io_kiocb *cancel_req = NULL; + struct io_kiocb *cancel_req = NULL, *req; + DEFINE_WAIT(wait); spin_lock_irq(&ctx->inflight_lock); list_for_each_entry(req, &ctx->inflight_list, inflight_entry) { @@ -7400,6 +7391,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, */ if (refcount_sub_and_test(2, &cancel_req->refs)) { io_put_req(cancel_req); + finish_wait(&ctx->inflight_wait, &wait); continue; } } @@ -7407,8 +7399,8 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, io_wq_cancel_work(ctx->io_wq, &cancel_req->work); io_put_req(cancel_req); schedule(); + finish_wait(&ctx->inflight_wait, &wait); } - finish_wait(&ctx->inflight_wait, &wait); } static int io_uring_flush(struct file *file, void *data) @@ -7757,7 +7749,8 @@ err: return ret; } -static int io_uring_create(unsigned entries, struct io_uring_params *p) +static int io_uring_create(unsigned entries, struct io_uring_params *p, + struct io_uring_params __user *params) { struct user_struct *user = NULL; struct io_ring_ctx *ctx; @@ -7849,6 +7842,14 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); p->cq_off.cqes = offsetof(struct io_rings, cqes); + p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | + IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | + IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL; + + if (copy_to_user(params, p, sizeof(*p))) { + ret = -EFAULT; + goto err; + } /* * Install ring fd as the very last thing, so we don't risk someone * having closed it before we finish setup @@ -7857,9 +7858,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) if (ret < 0) goto err; - p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | - IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | - IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: @@ -7875,7 +7873,6 @@ err: static long io_uring_setup(u32 entries, struct io_uring_params __user *params) { struct io_uring_params p; - long ret; int i; if (copy_from_user(&p, params, sizeof(p))) @@ -7890,14 +7887,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ)) return -EINVAL; - ret = io_uring_create(entries, &p); - if (ret < 0) - return ret; - - if (copy_to_user(params, &p, sizeof(p))) - return -EFAULT; - - return ret; + return io_uring_create(entries, &p, params); } SYSCALL_DEFINE2(io_uring_setup, u32, entries, diff --git a/fs/ioctl.c b/fs/ioctl.c index 282d45be6f45..5e80b40bc1b5 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -55,6 +55,7 @@ EXPORT_SYMBOL(vfs_ioctl); static int ioctl_fibmap(struct file *filp, int __user *p) { struct inode *inode = file_inode(filp); + struct super_block *sb = inode->i_sb; int error, ur_block; sector_t block; @@ -71,6 +72,13 @@ static int ioctl_fibmap(struct file *filp, int __user *p) block = ur_block; error = bmap(inode, &block); + if (block > INT_MAX) { + error = -ERANGE; + pr_warn_ratelimited("[%s/%d] FS: %s File: %pD4 would truncate fibmap result\n", + current->comm, task_pid_nr(current), + sb->s_id, filp); + } + if (error) ur_block = 0; else diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 89e21961d1ad..a1ed7620fbac 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -59,24 +59,19 @@ iomap_page_create(struct inode *inode, struct page *page) * migrate_page_move_mapping() assumes that pages with private data have * their count elevated by 1. */ - get_page(page); - set_page_private(page, (unsigned long)iop); - SetPagePrivate(page); + attach_page_private(page, iop); return iop; } static void iomap_page_release(struct page *page) { - struct iomap_page *iop = to_iomap_page(page); + struct iomap_page *iop = detach_page_private(page); if (!iop) return; WARN_ON_ONCE(atomic_read(&iop->read_count)); WARN_ON_ONCE(atomic_read(&iop->write_count)); - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); kfree(iop); } @@ -214,9 +209,8 @@ iomap_read_end_io(struct bio *bio) struct iomap_readpage_ctx { struct page *cur_page; bool cur_page_in_bio; - bool is_readahead; struct bio *bio; - struct list_head *pages; + struct readahead_control *rac; }; static void @@ -308,7 +302,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, if (ctx->bio) submit_bio(ctx->bio); - if (ctx->is_readahead) /* same as readahead_gfp_mask */ + if (ctx->rac) /* same as readahead_gfp_mask */ gfp |= __GFP_NORETRY | __GFP_NOWARN; ctx->bio = bio_alloc(gfp, min(BIO_MAX_PAGES, nr_vecs)); /* @@ -319,7 +313,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, if (!ctx->bio) ctx->bio = bio_alloc(orig_gfp, 1); ctx->bio->bi_opf = REQ_OP_READ; - if (ctx->is_readahead) + if (ctx->rac) ctx->bio->bi_opf |= REQ_RAHEAD; ctx->bio->bi_iter.bi_sector = sector; bio_set_dev(ctx->bio, iomap->bdev); @@ -367,7 +361,7 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops) } /* - * Just like mpage_readpages and block_read_full_page we always + * Just like mpage_readahead and block_read_full_page we always * return 0 and just mark the page as PageError on errors. This * should be cleaned up all through the stack eventually. */ @@ -375,36 +369,8 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops) } EXPORT_SYMBOL_GPL(iomap_readpage); -static struct page * -iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos, - loff_t length, loff_t *done) -{ - while (!list_empty(pages)) { - struct page *page = lru_to_page(pages); - - if (page_offset(page) >= (u64)pos + length) - break; - - list_del(&page->lru); - if (!add_to_page_cache_lru(page, inode->i_mapping, page->index, - GFP_NOFS)) - return page; - - /* - * If we already have a page in the page cache at index we are - * done. Upper layers don't care if it is uptodate after the - * readpages call itself as every page gets checked again once - * actually needed. - */ - *done += PAGE_SIZE; - put_page(page); - } - - return NULL; -} - static loff_t -iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length, +iomap_readahead_actor(struct inode *inode, loff_t pos, loff_t length, void *data, struct iomap *iomap, struct iomap *srcmap) { struct iomap_readpage_ctx *ctx = data; @@ -418,10 +384,7 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length, ctx->cur_page = NULL; } if (!ctx->cur_page) { - ctx->cur_page = iomap_next_page(inode, ctx->pages, - pos, length, &done); - if (!ctx->cur_page) - break; + ctx->cur_page = readahead_page(ctx->rac); ctx->cur_page_in_bio = false; } ret = iomap_readpage_actor(inode, pos + done, length - done, @@ -431,32 +394,43 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length, return done; } -int -iomap_readpages(struct address_space *mapping, struct list_head *pages, - unsigned nr_pages, const struct iomap_ops *ops) +/** + * iomap_readahead - Attempt to read pages from a file. + * @rac: Describes the pages to be read. + * @ops: The operations vector for the filesystem. + * + * This function is for filesystems to call to implement their readahead + * address_space operation. + * + * Context: The @ops callbacks may submit I/O (eg to read the addresses of + * blocks from disc), and may wait for it. The caller may be trying to + * access a different page, and so sleeping excessively should be avoided. + * It may allocate memory, but should avoid costly allocations. This + * function is called with memalloc_nofs set, so allocations will not cause + * the filesystem to be reentered. + */ +void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) { + struct inode *inode = rac->mapping->host; + loff_t pos = readahead_pos(rac); + loff_t length = readahead_length(rac); struct iomap_readpage_ctx ctx = { - .pages = pages, - .is_readahead = true, + .rac = rac, }; - loff_t pos = page_offset(list_entry(pages->prev, struct page, lru)); - loff_t last = page_offset(list_entry(pages->next, struct page, lru)); - loff_t length = last - pos + PAGE_SIZE, ret = 0; - trace_iomap_readpages(mapping->host, nr_pages); + trace_iomap_readahead(inode, readahead_count(rac)); while (length > 0) { - ret = iomap_apply(mapping->host, pos, length, 0, ops, - &ctx, iomap_readpages_actor); + loff_t ret = iomap_apply(inode, pos, length, 0, ops, + &ctx, iomap_readahead_actor); if (ret <= 0) { WARN_ON_ONCE(ret == 0); - goto done; + break; } pos += ret; length -= ret; } - ret = 0; -done: + if (ctx.bio) submit_bio(ctx.bio); if (ctx.cur_page) { @@ -464,15 +438,8 @@ done: unlock_page(ctx.cur_page); put_page(ctx.cur_page); } - - /* - * Check that we didn't lose a page due to the arcance calling - * conventions.. - */ - WARN_ON_ONCE(!ret && !list_empty(ctx.pages)); - return ret; } -EXPORT_SYMBOL_GPL(iomap_readpages); +EXPORT_SYMBOL_GPL(iomap_readahead); /* * iomap_is_partially_uptodate checks whether blocks within a page are @@ -554,14 +521,8 @@ iomap_migrate_page(struct address_space *mapping, struct page *newpage, if (ret != MIGRATEPAGE_SUCCESS) return ret; - if (page_has_private(page)) { - ClearPagePrivate(page); - get_page(newpage); - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); - put_page(page); - SetPagePrivate(newpage); - } + if (page_has_private(page)) + attach_page_private(newpage, detach_page_private(page)); if (mode != MIGRATE_SYNC_NO_COPY) migrate_page_copy(newpage, page); diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c index bccf305ea9ce..d55e8f491a5e 100644 --- a/fs/iomap/fiemap.c +++ b/fs/iomap/fiemap.c @@ -117,10 +117,7 @@ iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length, if (iomap->type == IOMAP_MAPPED) { addr = (pos - iomap->offset + iomap->addr) >> inode->i_blkbits; - if (addr > INT_MAX) - WARN(1, "would truncate bmap result\n"); - else - *bno = addr; + *bno = addr; } return 0; } diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index 4df19c66f597..5693a39d52fb 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -39,7 +39,7 @@ DEFINE_EVENT(iomap_readpage_class, name, \ TP_PROTO(struct inode *inode, int nr_pages), \ TP_ARGS(inode, nr_pages)) DEFINE_READPAGE_EVENT(iomap_readpage); -DEFINE_READPAGE_EVENT(iomap_readpages); +DEFINE_READPAGE_EVENT(iomap_readahead); DECLARE_EVENT_CLASS(iomap_range_class, TP_PROTO(struct inode *inode, unsigned long off, unsigned int len), diff --git a/fs/isofs/Kconfig b/fs/isofs/Kconfig index 5e7419599f50..08ffd37b9bb8 100644 --- a/fs/isofs/Kconfig +++ b/fs/isofs/Kconfig @@ -8,7 +8,7 @@ config ISO9660_FS long Unix filenames and symbolic links are also supported by this driver. If you have a CD-ROM drive and want to do more with it than just listen to audio CDs and watch its LEDs, say Y (and read - <file:Documentation/filesystems/isofs.txt> and the CD-ROM-HOWTO, + <file:Documentation/filesystems/isofs.rst> and the CD-ROM-HOWTO, available from <http://www.tldp.org/docs.html#howto>), thereby enlarging your kernel by about 27 KB; otherwise say N. diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 276107cdaaf1..d634561f871a 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -1183,10 +1183,9 @@ static int isofs_readpage(struct file *file, struct page *page) return mpage_readpage(page, isofs_get_block); } -static int isofs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void isofs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, isofs_get_block); + mpage_readahead(rac, isofs_get_block); } static sector_t _isofs_bmap(struct address_space *mapping, sector_t block) @@ -1196,7 +1195,7 @@ static sector_t _isofs_bmap(struct address_space *mapping, sector_t block) static const struct address_space_operations isofs_aops = { .readpage = isofs_readpage, - .readpages = isofs_readpages, + .readahead = isofs_readahead, .bmap = _isofs_bmap }; diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 9486afcdac76..6f65bfa9f18d 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -296,10 +296,9 @@ static int jfs_readpage(struct file *file, struct page *page) return mpage_readpage(page, jfs_get_block); } -static int jfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void jfs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, jfs_get_block); + mpage_readahead(rac, jfs_get_block); } static void jfs_write_failed(struct address_space *mapping, loff_t to) @@ -358,7 +357,7 @@ static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) const struct address_space_operations jfs_aops = { .readpage = jfs_readpage, - .readpages = jfs_readpages, + .readahead = jfs_readahead, .writepage = jfs_writepage, .writepages = jfs_writepages, .write_begin = jfs_write_begin, diff --git a/fs/locks.c b/fs/locks.c index b8a31c1c4fff..1d4f4d5da704 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -61,7 +61,7 @@ * * Initial implementation of mandatory locks. SunOS turned out to be * a rotten model, so I implemented the "obvious" semantics. - * See 'Documentation/filesystems/mandatory-locking.txt' for details. + * See 'Documentation/filesystems/mandatory-locking.rst' for details. * Andy Walker (andy@lysaker.kvaerner.no), April 06, 1996. * * Don't allow mandatory locks on mmap()'ed files. Added simple functions to diff --git a/fs/mount.h b/fs/mount.h index 711a4093e475..c7abb7b394d8 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -9,7 +9,13 @@ struct mnt_namespace { atomic_t count; struct ns_common ns; struct mount * root; + /* + * Traversal and modification of .list is protected by either + * - taking namespace_sem for write, OR + * - taking namespace_sem for read AND taking .ns_lock. + */ struct list_head list; + spinlock_t ns_lock; struct user_namespace *user_ns; struct ucounts *ucounts; u64 seq; /* Sequence number to prevent loops */ @@ -133,9 +139,7 @@ struct proc_mounts { struct mnt_namespace *ns; struct path root; int (*show)(struct seq_file *, struct vfsmount *); - void *cached_mount; - u64 cached_event; - loff_t cached_index; + struct mount cursor; }; extern const struct seq_operations mounts_op; @@ -153,3 +157,5 @@ static inline bool is_anon_ns(struct mnt_namespace *ns) { return ns->seq == 0; } + +extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor); diff --git a/fs/mpage.c b/fs/mpage.c index ccba3c4c4479..830e6cc2a9e7 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -91,7 +91,7 @@ mpage_alloc(struct block_device *bdev, } /* - * support function for mpage_readpages. The fs supplied get_block might + * support function for mpage_readahead. The fs supplied get_block might * return an up to date buffer. This is used to map that buffer into * the page, which allows readpage to avoid triggering a duplicate call * to get_block. @@ -338,13 +338,8 @@ confused: } /** - * mpage_readpages - populate an address space with some pages & start reads against them - * @mapping: the address_space - * @pages: The address of a list_head which contains the target pages. These - * pages have their ->index populated and are otherwise uninitialised. - * The page at @pages->prev has the lowest file offset, and reads should be - * issued in @pages->prev to @pages->next order. - * @nr_pages: The number of pages at *@pages + * mpage_readahead - start reads against pages + * @rac: Describes which pages to read. * @get_block: The filesystem's block mapper function. * * This function walks the pages and the blocks within each page, building and @@ -381,36 +376,25 @@ confused: * * This all causes the disk requests to be issued in the correct order. */ -int -mpage_readpages(struct address_space *mapping, struct list_head *pages, - unsigned nr_pages, get_block_t get_block) +void mpage_readahead(struct readahead_control *rac, get_block_t get_block) { + struct page *page; struct mpage_readpage_args args = { .get_block = get_block, .is_readahead = true, }; - unsigned page_idx; - - for (page_idx = 0; page_idx < nr_pages; page_idx++) { - struct page *page = lru_to_page(pages); + while ((page = readahead_page(rac))) { prefetchw(&page->flags); - list_del(&page->lru); - if (!add_to_page_cache_lru(page, mapping, - page->index, - readahead_gfp_mask(mapping))) { - args.page = page; - args.nr_pages = nr_pages - page_idx; - args.bio = do_mpage_readpage(&args); - } + args.page = page; + args.nr_pages = readahead_count(rac); + args.bio = do_mpage_readpage(&args); put_page(page); } - BUG_ON(!list_empty(pages)); if (args.bio) mpage_bio_submit(REQ_OP_READ, REQ_RAHEAD, args.bio); - return 0; } -EXPORT_SYMBOL(mpage_readpages); +EXPORT_SYMBOL(mpage_readahead); /* * This isn't called much at all @@ -563,7 +547,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, * Page has buffers, but they are all unmapped. The page was * created by pagein or read over a hole which was handled by * block_read_full_page(). If this address_space is also - * using mpage_readpages then this can rarely happen. + * using mpage_readahead then this can rarely happen. */ goto confused; } diff --git a/fs/namei.c b/fs/namei.c index a320371899cf..d81f73ff1a8b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3505,12 +3505,14 @@ EXPORT_SYMBOL(user_path_create); int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { + bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV; int error = may_create(dir, dentry); if (error) return error; - if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD)) + if ((S_ISCHR(mode) || S_ISBLK(mode)) && !is_whiteout && + !capable(CAP_MKNOD)) return -EPERM; if (!dir->i_op->mknod) @@ -4345,9 +4347,6 @@ static int do_renameat2(int olddfd, const char __user *oldname, int newdfd, (flags & RENAME_EXCHANGE)) return -EINVAL; - if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD)) - return -EPERM; - if (flags & RENAME_EXCHANGE) target_flags = 0; @@ -4483,20 +4482,6 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna return do_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); } -int vfs_whiteout(struct inode *dir, struct dentry *dentry) -{ - int error = may_create(dir, dentry); - if (error) - return error; - - if (!dir->i_op->mknod) - return -EPERM; - - return dir->i_op->mknod(dir, dentry, - S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); -} -EXPORT_SYMBOL(vfs_whiteout); - int readlink_copy(char __user *buffer, int buflen, const char *link) { int len = PTR_ERR(link); diff --git a/fs/namespace.c b/fs/namespace.c index a28e4db075ed..a6baee3c7904 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -648,6 +648,21 @@ struct vfsmount *lookup_mnt(const struct path *path) return m; } +static inline void lock_ns_list(struct mnt_namespace *ns) +{ + spin_lock(&ns->ns_lock); +} + +static inline void unlock_ns_list(struct mnt_namespace *ns) +{ + spin_unlock(&ns->ns_lock); +} + +static inline bool mnt_is_cursor(struct mount *mnt) +{ + return mnt->mnt.mnt_flags & MNT_CURSOR; +} + /* * __is_local_mountpoint - Test to see if dentry is a mountpoint in the * current mount namespace. @@ -673,11 +688,15 @@ bool __is_local_mountpoint(struct dentry *dentry) goto out; down_read(&namespace_sem); + lock_ns_list(ns); list_for_each_entry(mnt, &ns->list, mnt_list) { + if (mnt_is_cursor(mnt)) + continue; is_covered = (mnt->mnt_mountpoint == dentry); if (is_covered) break; } + unlock_ns_list(ns); up_read(&namespace_sem); out: return is_covered; @@ -1245,46 +1264,71 @@ struct vfsmount *mnt_clone_internal(const struct path *path) } #ifdef CONFIG_PROC_FS +static struct mount *mnt_list_next(struct mnt_namespace *ns, + struct list_head *p) +{ + struct mount *mnt, *ret = NULL; + + lock_ns_list(ns); + list_for_each_continue(p, &ns->list) { + mnt = list_entry(p, typeof(*mnt), mnt_list); + if (!mnt_is_cursor(mnt)) { + ret = mnt; + break; + } + } + unlock_ns_list(ns); + + return ret; +} + /* iterator; we want it to have access to namespace_sem, thus here... */ static void *m_start(struct seq_file *m, loff_t *pos) { struct proc_mounts *p = m->private; + struct list_head *prev; down_read(&namespace_sem); - if (p->cached_event == p->ns->event) { - void *v = p->cached_mount; - if (*pos == p->cached_index) - return v; - if (*pos == p->cached_index + 1) { - v = seq_list_next(v, &p->ns->list, &p->cached_index); - return p->cached_mount = v; - } + if (!*pos) { + prev = &p->ns->list; + } else { + prev = &p->cursor.mnt_list; + + /* Read after we'd reached the end? */ + if (list_empty(prev)) + return NULL; } - p->cached_event = p->ns->event; - p->cached_mount = seq_list_start(&p->ns->list, *pos); - p->cached_index = *pos; - return p->cached_mount; + return mnt_list_next(p->ns, prev); } static void *m_next(struct seq_file *m, void *v, loff_t *pos) { struct proc_mounts *p = m->private; + struct mount *mnt = v; - p->cached_mount = seq_list_next(v, &p->ns->list, pos); - p->cached_index = *pos; - return p->cached_mount; + ++*pos; + return mnt_list_next(p->ns, &mnt->mnt_list); } static void m_stop(struct seq_file *m, void *v) { + struct proc_mounts *p = m->private; + struct mount *mnt = v; + + lock_ns_list(p->ns); + if (mnt) + list_move_tail(&p->cursor.mnt_list, &mnt->mnt_list); + else + list_del_init(&p->cursor.mnt_list); + unlock_ns_list(p->ns); up_read(&namespace_sem); } static int m_show(struct seq_file *m, void *v) { struct proc_mounts *p = m->private; - struct mount *r = list_entry(v, struct mount, mnt_list); + struct mount *r = v; return p->show(m, &r->mnt); } @@ -1294,6 +1338,15 @@ const struct seq_operations mounts_op = { .stop = m_stop, .show = m_show, }; + +void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor) +{ + down_read(&namespace_sem); + lock_ns_list(ns); + list_del(&cursor->mnt_list); + unlock_ns_list(ns); + up_read(&namespace_sem); +} #endif /* CONFIG_PROC_FS */ /** @@ -3202,6 +3255,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a atomic_set(&new_ns->count, 1); INIT_LIST_HEAD(&new_ns->list); init_waitqueue_head(&new_ns->poll); + spin_lock_init(&new_ns->ns_lock); new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; return new_ns; @@ -3595,7 +3649,7 @@ EXPORT_SYMBOL(path_is_under); * file system may be mounted on put_old. After all, new_root is a mountpoint. * * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem. - * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives + * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives * in this situation. * * Notes: @@ -3842,10 +3896,14 @@ static bool mnt_already_visible(struct mnt_namespace *ns, bool visible = false; down_read(&namespace_sem); + lock_ns_list(ns); list_for_each_entry(mnt, &ns->list, mnt_list) { struct mount *child; int mnt_flags; + if (mnt_is_cursor(mnt)) + continue; + if (mnt->mnt.mnt_sb->s_type != sb->s_type) continue; @@ -3893,6 +3951,7 @@ static bool mnt_already_visible(struct mnt_namespace *ns, next: ; } found: + unlock_ns_list(ns); up_read(&namespace_sem); return visible; } diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c index 7a57ff2528af..8f7cff7a4293 100644 --- a/fs/nfs/blocklayout/extent_tree.c +++ b/fs/nfs/blocklayout/extent_tree.c @@ -582,7 +582,7 @@ retry: if (!arg->layoutupdate_pages) return -ENOMEM; - start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL); + start_p = __vmalloc(buffer_size, GFP_NOFS); if (!start_p) { kfree(arg->layoutupdate_pages); return -ENOMEM; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 1abf126c2df4..a60df88efc40 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -118,8 +118,6 @@ void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int nfss->fscache_key = NULL; nfss->fscache = NULL; - if (!(nfss->options & NFS_OPTION_FSCACHE)) - return; if (!uniq) { uniq = ""; ulen = 1; @@ -188,7 +186,8 @@ void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int /* create a cache index for looking up filehandles */ nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache, &nfs_fscache_super_index_def, - key, sizeof(*key) + ulen, + &key->key, + sizeof(key->key) + ulen, NULL, 0, nfss, 0, true); dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n", @@ -226,6 +225,19 @@ void nfs_fscache_release_super_cookie(struct super_block *sb) } } +static void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata *auxdata, + struct nfs_inode *nfsi) +{ + memset(auxdata, 0, sizeof(*auxdata)); + auxdata->mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec; + auxdata->mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec; + auxdata->ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec; + auxdata->ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec; + + if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) + auxdata->change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode); +} + /* * Initialise the per-inode cache cookie pointer for an NFS inode. */ @@ -239,14 +251,7 @@ void nfs_fscache_init_inode(struct inode *inode) if (!(nfss->fscache && S_ISREG(inode->i_mode))) return; - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec; - auxdata.mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec; - auxdata.ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec; - auxdata.ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec; - - if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) - auxdata.change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode); + nfs_fscache_update_auxdata(&auxdata, nfsi); nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache, &nfs_fscache_inode_object_def, @@ -266,11 +271,7 @@ void nfs_fscache_clear_inode(struct inode *inode) dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", nfsi, cookie); - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec; - auxdata.mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec; - auxdata.ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec; - auxdata.ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec; + nfs_fscache_update_auxdata(&auxdata, nfsi); fscache_relinquish_cookie(cookie, &auxdata, false); nfsi->fscache = NULL; } @@ -310,11 +311,7 @@ void nfs_fscache_open_file(struct inode *inode, struct file *filp) if (!fscache_cookie_valid(cookie)) return; - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec; - auxdata.mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec; - auxdata.ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec; - auxdata.ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec; + nfs_fscache_update_auxdata(&auxdata, nfsi); if (inode_is_open_for_write(inode)) { dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 1f32a9fbfdaf..6673a77884d9 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -668,7 +668,8 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize) } /* - * Record the page as unstable and mark its inode as dirty. + * Record the page as unstable (an extra writeback period) and mark its + * inode as dirty. */ static inline void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo) @@ -676,8 +677,11 @@ void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo) if (!cinfo->dreq) { struct inode *inode = page_file_mapping(page)->host; - inc_node_page_state(page, NR_UNSTABLE_NFS); - inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE); + /* This page is really still in write-back - just that the + * writeback is happening on the server now. + */ + inc_node_page_state(page, NR_WRITEBACK); + inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); } } diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 35c8cb2d7637..dda5c3e65d8d 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -30,6 +30,7 @@ #define encode_dirpath_sz (1 + XDR_QUADLEN(MNTPATHLEN)) #define MNT_status_sz (1) #define MNT_fhandle_sz XDR_QUADLEN(NFS2_FHSIZE) +#define MNT_fhandlev3_sz XDR_QUADLEN(NFS3_FHSIZE) #define MNT_authflav3_sz (1 + NFS_MAX_SECFLAVORS) /* @@ -37,7 +38,7 @@ */ #define MNT_enc_dirpath_sz encode_dirpath_sz #define MNT_dec_mountres_sz (MNT_status_sz + MNT_fhandle_sz) -#define MNT_dec_mountres3_sz (MNT_status_sz + MNT_fhandle_sz + \ +#define MNT_dec_mountres3_sz (MNT_status_sz + MNT_fhandlev3_sz + \ MNT_authflav3_sz) /* diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index c5c3fc6e6c60..26c94b32d6f4 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -253,37 +253,45 @@ int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - struct posix_acl *alloc = NULL, *dfacl = NULL; + struct posix_acl *orig = acl, *dfacl = NULL, *alloc; int status; if (S_ISDIR(inode->i_mode)) { switch(type) { case ACL_TYPE_ACCESS: - alloc = dfacl = get_acl(inode, ACL_TYPE_DEFAULT); + alloc = get_acl(inode, ACL_TYPE_DEFAULT); if (IS_ERR(alloc)) goto fail; + dfacl = alloc; break; case ACL_TYPE_DEFAULT: - dfacl = acl; - alloc = acl = get_acl(inode, ACL_TYPE_ACCESS); + alloc = get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(alloc)) goto fail; + dfacl = acl; + acl = alloc; break; } } if (acl == NULL) { - alloc = acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); + alloc = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); if (IS_ERR(alloc)) goto fail; + acl = alloc; } status = __nfs3_proc_setacls(inode, acl, dfacl); - posix_acl_release(alloc); +out: + if (acl != orig) + posix_acl_release(acl); + if (dfacl != orig) + posix_acl_release(dfacl); return status; fail: - return PTR_ERR(alloc); + status = PTR_ERR(alloc); + goto out; } const struct xattr_handler *nfs3_xattr_handlers[] = { diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 512afb1c7867..9056f3dd380e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6347,7 +6347,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, .rpc_client = server->client, .rpc_message = &msg, .callback_ops = &nfs4_delegreturn_ops, - .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF | RPC_TASK_TIMEOUT, + .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT, }; int status = 0; @@ -7891,6 +7891,7 @@ static void nfs4_bind_one_conn_to_session_done(struct rpc_task *task, void *calldata) { struct nfs41_bind_conn_to_session_args *args = task->tk_msg.rpc_argp; + struct nfs41_bind_conn_to_session_res *res = task->tk_msg.rpc_resp; struct nfs_client *clp = args->client; switch (task->tk_status) { @@ -7899,6 +7900,12 @@ nfs4_bind_one_conn_to_session_done(struct rpc_task *task, void *calldata) nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); } + if (args->dir == NFS4_CDFC4_FORE_OR_BOTH && + res->dir != NFS4_CDFS4_BOTH) { + rpc_task_close_connection(task); + if (args->retries++ < MAX_BIND_CONN_TO_SESSION_RETRIES) + rpc_restart_call(task); + } } static const struct rpc_call_ops nfs4_bind_one_conn_to_session_ops = { @@ -7921,6 +7928,7 @@ int nfs4_proc_bind_one_conn_to_session(struct rpc_clnt *clnt, struct nfs41_bind_conn_to_session_args args = { .client = clp, .dir = NFS4_CDFC4_FORE_OR_BOTH, + .retries = 0, }; struct nfs41_bind_conn_to_session_res res; struct rpc_message msg = { @@ -9191,8 +9199,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout) nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0, 0); task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) - return ERR_CAST(task); + status = rpc_wait_for_completion_task(task); if (status != 0) goto out; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index ac93715c05a4..a8dc25ce48bb 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -734,9 +734,9 @@ nfs4_get_open_state(struct inode *inode, struct nfs4_state_owner *owner) state = new; state->owner = owner; atomic_inc(&owner->so_count); - list_add_rcu(&state->inode_states, &nfsi->open_states); ihold(inode); state->inode = inode; + list_add_rcu(&state->inode_states, &nfsi->open_states); spin_unlock(&inode->i_lock); /* Note: The reclaim code dictates that we add stateless * and read-only stateids to the end of the list */ diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index f61f96603df7..6ca421cbe19c 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -752,7 +752,7 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, .callback_ops = call_ops, .callback_data = hdr, .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF | flags, + .flags = RPC_TASK_ASYNC | flags, }; hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how); @@ -950,7 +950,8 @@ static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) hdr->cred, NFS_PROTO(hdr->inode), desc->pg_rpc_callops, - desc->pg_ioflags, 0); + desc->pg_ioflags, + RPC_TASK_CRED_NOREF); return ret; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index b8d78f393365..dd2e14f5875d 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1332,13 +1332,15 @@ _pnfs_return_layout(struct inode *ino) !valid_layout) { spin_unlock(&ino->i_lock); dprintk("NFS: %s no layout segments to return\n", __func__); - goto out_put_layout_hdr; + goto out_wait_layoutreturn; } send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, NULL); spin_unlock(&ino->i_lock); if (send) status = pnfs_send_layoutreturn(lo, &stateid, &cred, IOMODE_ANY, true); +out_wait_layoutreturn: + wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, TASK_UNINTERRUPTIBLE); out_put_layout_hdr: pnfs_free_lseg_list(&tmp_list); pnfs_put_layout_hdr(lo); @@ -1456,18 +1458,15 @@ retry: /* lo ref dropped in pnfs_roc_release() */ layoutreturn = pnfs_prepare_layoutreturn(lo, &stateid, &lc_cred, &iomode); /* If the creds don't match, we can't compound the layoutreturn */ - if (!layoutreturn) + if (!layoutreturn || cred_fscmp(cred, lc_cred) != 0) goto out_noroc; - if (cred_fscmp(cred, lc_cred) != 0) - goto out_noroc_put_cred; roc = layoutreturn; pnfs_init_layoutreturn_args(args, lo, &stateid, iomode); res->lrs_present = 0; layoutreturn = false; - -out_noroc_put_cred: put_cred(lc_cred); + out_noroc: spin_unlock(&ino->i_lock); rcu_read_unlock(); diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index e7ddbce48321..679767ac258d 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -536,7 +536,8 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, nfs_init_commit(data, NULL, NULL, cinfo); nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(data->inode), - data->mds_ops, how, 0); + data->mds_ops, how, + RPC_TASK_CRED_NOREF); } else { nfs_init_commit(data, NULL, data->lseg, cinfo); initiate_commit(data, how); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 59ef3b13ccca..7a70287f21a2 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -185,7 +185,7 @@ static int __nfs_list_for_each_server(struct list_head *head, rcu_read_lock(); list_for_each_entry_rcu(server, head, client_link) { - if (!nfs_sb_active(server->super)) + if (!(server->super && nfs_sb_active(server->super))) continue; rcu_read_unlock(); if (last) @@ -1189,7 +1189,6 @@ static void nfs_get_cache_cookie(struct super_block *sb, uniq = ctx->fscache_uniq; ulen = strlen(ctx->fscache_uniq); } - return; } nfs_fscache_get_super_cookie(sb, uniq, ulen); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index df4b87c30ac9..639c34fec04a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -946,9 +946,9 @@ nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, static void nfs_clear_page_commit(struct page *page) { - dec_node_page_state(page, NR_UNSTABLE_NFS); + dec_node_page_state(page, NR_WRITEBACK); dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb, - WB_RECLAIMABLE); + WB_WRITEBACK); } /* Called holding the request lock on @req */ @@ -1695,7 +1695,7 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, .callback_ops = call_ops, .callback_data = data, .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF | flags, + .flags = RPC_TASK_ASYNC | flags, .priority = priority, }; /* Set up the initial task struct. */ @@ -1813,7 +1813,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, nfs_init_commit(data, head, NULL, cinfo); atomic_inc(&cinfo->mds->rpcs_out); return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode), - data->mds_ops, how, 0); + data->mds_ops, how, RPC_TASK_CRED_NOREF); } /* diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index c3b11a715082..5cf91322de0f 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1312,6 +1312,7 @@ nfsd4_run_cb_work(struct work_struct *work) container_of(work, struct nfsd4_callback, cb_work); struct nfs4_client *clp = cb->cb_clp; struct rpc_clnt *clnt; + int flags; if (cb->cb_need_restart) { cb->cb_need_restart = false; @@ -1340,7 +1341,8 @@ nfsd4_run_cb_work(struct work_struct *work) } cb->cb_msg.rpc_cred = clp->cl_cb_cred; - rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN, + flags = clp->cl_minorversion ? RPC_TASK_NOCONNECT : RPC_TASK_SOFTCONN; + rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | flags, cb->cb_ops ? &nfsd4_cb_ops : &nfsd4_cb_probe_ops, cb); } diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index a8fb18609146..9e40dfecf1b1 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -127,16 +127,8 @@ nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname) goto out; } - { - SHASH_DESC_ON_STACK(desc, tfm); - - desc->tfm = tfm; - - status = crypto_shash_digest(desc, clname->data, clname->len, - cksum.data); - shash_desc_zero(desc); - } - + status = crypto_shash_tfm_digest(tfm, clname->data, clname->len, + cksum.data); if (status) goto out; @@ -1148,7 +1140,6 @@ nfsd4_cld_create_v2(struct nfs4_client *clp) struct crypto_shash *tfm = cn->cn_tfm; struct xdr_netobj cksum; char *principal = NULL; - SHASH_DESC_ON_STACK(desc, tfm); /* Don't upcall if it's already stored */ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) @@ -1170,16 +1161,14 @@ nfsd4_cld_create_v2(struct nfs4_client *clp) else if (clp->cl_cred.cr_principal) principal = clp->cl_cred.cr_principal; if (principal) { - desc->tfm = tfm; cksum.len = crypto_shash_digestsize(tfm); cksum.data = kmalloc(cksum.len, GFP_KERNEL); if (cksum.data == NULL) { ret = -ENOMEM; goto out; } - ret = crypto_shash_digest(desc, principal, strlen(principal), - cksum.data); - shash_desc_zero(desc); + ret = crypto_shash_tfm_digest(tfm, principal, strlen(principal), + cksum.data); if (ret) { kfree(cksum.data); goto out; @@ -1343,7 +1332,6 @@ nfsd4_cld_check_v2(struct nfs4_client *clp) struct crypto_shash *tfm = cn->cn_tfm; struct xdr_netobj cksum; char *principal = NULL; - SHASH_DESC_ON_STACK(desc, tfm); /* did we already find that this client is stable? */ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) @@ -1381,14 +1369,12 @@ found: principal = clp->cl_cred.cr_principal; if (principal == NULL) return -ENOENT; - desc->tfm = tfm; cksum.len = crypto_shash_digestsize(tfm); cksum.data = kmalloc(cksum.len, GFP_KERNEL); if (cksum.data == NULL) return -ENOENT; - status = crypto_shash_digest(desc, principal, strlen(principal), - cksum.data); - shash_desc_zero(desc); + status = crypto_shash_tfm_digest(tfm, principal, + strlen(principal), cksum.data); if (status) { kfree(cksum.data); return -ENOENT; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index e32ecedece0f..c107caa56525 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -267,6 +267,8 @@ find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh, if (!nbl) { nbl= kmalloc(sizeof(*nbl), GFP_KERNEL); if (nbl) { + INIT_LIST_HEAD(&nbl->nbl_list); + INIT_LIST_HEAD(&nbl->nbl_lru); fh_copy_shallow(&nbl->nbl_fh, fh); locks_init_lock(&nbl->nbl_lock); nfsd4_init_cb(&nbl->nbl_cb, lo->lo_owner.so_client, diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 0aa02eb18bd3..c3fbab1753ec 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -979,12 +979,13 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) /* - * We want less throttling in balance_dirty_pages() - * and shrink_inactive_list() so that nfs to + * We want throttling in balance_dirty_pages() + * and shrink_inactive_list() to only consider + * the backingdev we are writing to, so that nfs to * localhost doesn't cause nfsd to lock up due to all * the client's dirty pages or its congested queue. */ - current->flags |= PF_LESS_THROTTLE; + current->flags |= PF_LOCAL_THROTTLE; exp = fhp->fh_export; use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp); @@ -1037,7 +1038,7 @@ out_nfserr: nfserr = nfserrno(host_err); } if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) - current_restore_flags(pflags, PF_LESS_THROTTLE); + current_restore_flags(pflags, PF_LOCAL_THROTTLE); return nfserr; } diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 671085512e0f..ceeb3b441844 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -145,18 +145,9 @@ static int nilfs_readpage(struct file *file, struct page *page) return mpage_readpage(page, nilfs_get_block); } -/** - * nilfs_readpages() - implement readpages() method of nilfs_aops {} - * address_space_operations. - * @file - file struct of the file to be read - * @mapping - address_space struct used for reading multiple pages - * @pages - the pages to be read - * @nr_pages - number of pages to be read - */ -static int nilfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned int nr_pages) +static void nilfs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block); + mpage_readahead(rac, nilfs_get_block); } static int nilfs_writepages(struct address_space *mapping, @@ -308,7 +299,7 @@ const struct address_space_operations nilfs_aops = { .readpage = nilfs_readpage, .writepages = nilfs_writepages, .set_page_dirty = nilfs_set_page_dirty, - .readpages = nilfs_readpages, + .readahead = nilfs_readahead, .write_begin = nilfs_write_begin, .write_end = nilfs_write_end, /* .releasepage = nilfs_releasepage, */ diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 5435a40f82be..c18459cea6f4 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -520,7 +520,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC); BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM); - BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 20); + BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 19); mask = fanotify_group_event_mask(group, iter_info, mask, data, data_type); diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig index 6736e47d94d8..7715fadd5fff 100644 --- a/fs/notify/inotify/Kconfig +++ b/fs/notify/inotify/Kconfig @@ -12,6 +12,6 @@ config INOTIFY_USER new features including multiple file events, one-shot support, and unmount notification. - For more information, see <file:Documentation/filesystems/inotify.txt> + For more information, see <file:Documentation/filesystems/inotify.rst> If unsure, say Y. diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig index de9fb5cff226..1667a7e590d8 100644 --- a/fs/ntfs/Kconfig +++ b/fs/ntfs/Kconfig @@ -18,7 +18,7 @@ config NTFS_FS the Linux 2.4 kernel series is separately available as a patch from the project web site. - For more information see <file:Documentation/filesystems/ntfs.txt> + For more information see <file:Documentation/filesystems/ntfs.rst> and <http://www.linux-ntfs.org/>. To compile this file system support as a module, choose M here: the diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 554b744f41bf..bb0a43860ad2 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -1732,7 +1732,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { bh = bh->b_this_page; } while (bh); tail->b_this_page = head; - attach_page_buffers(page, head); + attach_page_private(page, head); } else buffers_to_free = bh; } diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h index 842b0bfc3ac9..7068425735f1 100644 --- a/fs/ntfs/malloc.h +++ b/fs/ntfs/malloc.h @@ -34,7 +34,7 @@ static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask) /* return (void *)__get_free_page(gfp_mask); */ } if (likely((size >> PAGE_SHIFT) < totalram_pages())) - return __vmalloc(size, gfp_mask, PAGE_KERNEL); + return __vmalloc(size, gfp_mask); return NULL; } diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 3aac5c917afe..fbb9f1bc623d 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -504,7 +504,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, bh = bh->b_this_page; } while (bh); tail->b_this_page = head; - attach_page_buffers(page, head); + attach_page_private(page, head); } bh = head = page_buffers(page); BUG_ON(!bh); diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig index 46bba20da6b5..1177c33df895 100644 --- a/fs/ocfs2/Kconfig +++ b/fs/ocfs2/Kconfig @@ -21,7 +21,7 @@ config OCFS2_FS OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/ For more information on OCFS2, see the file - <file:Documentation/filesystems/ocfs2.txt>. + <file:Documentation/filesystems/ocfs2.rst>. config OCFS2_FS_O2CB tristate "O2CB Kernelspace Clustering" diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 3a67a6518ddf..3bfb4147895a 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -350,14 +350,11 @@ out: * grow out to a tree. If need be, detecting boundary extents could * trivially be added in a future version of ocfs2_get_block(). */ -static int ocfs2_readpages(struct file *filp, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void ocfs2_readahead(struct readahead_control *rac) { - int ret, err = -EIO; - struct inode *inode = mapping->host; + int ret; + struct inode *inode = rac->mapping->host; struct ocfs2_inode_info *oi = OCFS2_I(inode); - loff_t start; - struct page *last; /* * Use the nonblocking flag for the dlm code to avoid page @@ -365,36 +362,31 @@ static int ocfs2_readpages(struct file *filp, struct address_space *mapping, */ ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK); if (ret) - return err; + return; - if (down_read_trylock(&oi->ip_alloc_sem) == 0) { - ocfs2_inode_unlock(inode, 0); - return err; - } + if (down_read_trylock(&oi->ip_alloc_sem) == 0) + goto out_unlock; /* * Don't bother with inline-data. There isn't anything * to read-ahead in that case anyway... */ if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) - goto out_unlock; + goto out_up; /* * Check whether a remote node truncated this file - we just * drop out in that case as it's not worth handling here. */ - last = lru_to_page(pages); - start = (loff_t)last->index << PAGE_SHIFT; - if (start >= i_size_read(inode)) - goto out_unlock; + if (readahead_pos(rac) >= i_size_read(inode)) + goto out_up; - err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block); + mpage_readahead(rac, ocfs2_get_block); -out_unlock: +out_up: up_read(&oi->ip_alloc_sem); +out_unlock: ocfs2_inode_unlock(inode, 0); - - return err; } /* Note: Because we don't support holes, our allocation has @@ -2474,7 +2466,7 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) const struct address_space_operations ocfs2_aops = { .readpage = ocfs2_readpage, - .readpages = ocfs2_readpages, + .readahead = ocfs2_readahead, .writepage = ocfs2_writepage, .write_begin = ocfs2_write_begin, .write_end = ocfs2_write_end, diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 55a6512e9fde..f105746063ed 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2760,6 +2760,7 @@ leave: * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped */ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) + __must_hold(&dlm->spinlock) { int ret; int lock_dropped = 0; diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 8e4f1ace467c..ea868c6f9800 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -227,7 +227,7 @@ static ssize_t dlmfs_file_read(struct file *filp, loff_t *ppos) { int bytes_left; - ssize_t readlen, got; + ssize_t got; char *lvb_buf; struct inode *inode = file_inode(filp); @@ -237,36 +237,31 @@ static ssize_t dlmfs_file_read(struct file *filp, if (*ppos >= i_size_read(inode)) return 0; + /* don't read past the lvb */ + if (count > i_size_read(inode) - *ppos) + count = i_size_read(inode) - *ppos; + if (!count) return 0; - if (!access_ok(buf, count)) - return -EFAULT; - - /* don't read past the lvb */ - if ((count + *ppos) > i_size_read(inode)) - readlen = i_size_read(inode) - *ppos; - else - readlen = count; - - lvb_buf = kmalloc(readlen, GFP_NOFS); + lvb_buf = kmalloc(count, GFP_NOFS); if (!lvb_buf) return -ENOMEM; - got = user_dlm_read_lvb(inode, lvb_buf, readlen); + got = user_dlm_read_lvb(inode, lvb_buf, count); if (got) { - BUG_ON(got != readlen); - bytes_left = __copy_to_user(buf, lvb_buf, readlen); - readlen -= bytes_left; + BUG_ON(got != count); + bytes_left = copy_to_user(buf, lvb_buf, count); + count -= bytes_left; } else - readlen = 0; + count = 0; kfree(lvb_buf); - *ppos = *ppos + readlen; + *ppos = *ppos + count; - mlog(0, "read %zd bytes\n", readlen); - return readlen; + mlog(0, "read %zu bytes\n", count); + return count; } static ssize_t dlmfs_file_write(struct file *filp, @@ -275,7 +270,6 @@ static ssize_t dlmfs_file_write(struct file *filp, loff_t *ppos) { int bytes_left; - ssize_t writelen; char *lvb_buf; struct inode *inode = file_inode(filp); @@ -285,32 +279,27 @@ static ssize_t dlmfs_file_write(struct file *filp, if (*ppos >= i_size_read(inode)) return -ENOSPC; + /* don't write past the lvb */ + if (count > i_size_read(inode) - *ppos) + count = i_size_read(inode) - *ppos; + if (!count) return 0; - if (!access_ok(buf, count)) - return -EFAULT; - - /* don't write past the lvb */ - if ((count + *ppos) > i_size_read(inode)) - writelen = i_size_read(inode) - *ppos; - else - writelen = count - *ppos; - - lvb_buf = kmalloc(writelen, GFP_NOFS); + lvb_buf = kmalloc(count, GFP_NOFS); if (!lvb_buf) return -ENOMEM; - bytes_left = copy_from_user(lvb_buf, buf, writelen); - writelen -= bytes_left; - if (writelen) - user_dlm_write_lvb(inode, lvb_buf, writelen); + bytes_left = copy_from_user(lvb_buf, buf, count); + count -= bytes_left; + if (count) + user_dlm_write_lvb(inode, lvb_buf, count); kfree(lvb_buf); - *ppos = *ppos + writelen; - mlog(0, "wrote %zd bytes\n", writelen); - return writelen; + *ppos = *ppos + count; + mlog(0, "wrote %zu bytes\n", count); + return count; } static void dlmfs_init_once(void *foo) diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 9150cfa4df7d..ee5d98516212 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -279,6 +279,7 @@ enum ocfs2_mount_options OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */ OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */ OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */ + OCFS2_MOUNT_NOCLUSTER = 1 << 18, /* No cluster aware filesystem mount */ }; #define OCFS2_OSB_SOFT_RO 0x0001 @@ -673,7 +674,8 @@ static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb) static inline int ocfs2_mount_local(struct ocfs2_super *osb) { - return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT); + return ((osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT) + || (osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER)); } static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb) diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 8caeceeaeda7..4da0e4b1e79b 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -254,14 +254,16 @@ static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, int i, ret = -ENOSPC; if ((preferred >= 0) && (preferred < si->si_num_slots)) { - if (!si->si_slots[preferred].sl_valid) { + if (!si->si_slots[preferred].sl_valid || + !si->si_slots[preferred].sl_node_num) { ret = preferred; goto out; } } for(i = 0; i < si->si_num_slots; i++) { - if (!si->si_slots[i].sl_valid) { + if (!si->si_slots[i].sl_valid || + !si->si_slots[i].sl_node_num) { ret = i; break; } @@ -456,24 +458,30 @@ int ocfs2_find_slot(struct ocfs2_super *osb) spin_lock(&osb->osb_lock); ocfs2_update_slot_info(si); - /* search for ourselves first and take the slot if it already - * exists. Perhaps we need to mark this in a variable for our - * own journal recovery? Possibly not, though we certainly - * need to warn to the user */ - slot = __ocfs2_node_num_to_slot(si, osb->node_num); - if (slot < 0) { - /* if no slot yet, then just take 1st available - * one. */ - slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); + if (ocfs2_mount_local(osb)) + /* use slot 0 directly in local mode */ + slot = 0; + else { + /* search for ourselves first and take the slot if it already + * exists. Perhaps we need to mark this in a variable for our + * own journal recovery? Possibly not, though we certainly + * need to warn to the user */ + slot = __ocfs2_node_num_to_slot(si, osb->node_num); if (slot < 0) { - spin_unlock(&osb->osb_lock); - mlog(ML_ERROR, "no free slots available!\n"); - status = -EINVAL; - goto bail; - } - } else - printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already " - "allocated to this node!\n", slot, osb->dev_str); + /* if no slot yet, then just take 1st available + * one. */ + slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); + if (slot < 0) { + spin_unlock(&osb->osb_lock); + mlog(ML_ERROR, "no free slots available!\n"); + status = -EINVAL; + goto bail; + } + } else + printk(KERN_INFO "ocfs2: Slot %d on device (%s) was " + "already allocated to this node!\n", + slot, osb->dev_str); + } ocfs2_set_slot(si, slot, osb->node_num); osb->slot_num = slot; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index ac61eeaf3837..71ea9ce71a6b 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -175,6 +175,7 @@ enum { Opt_dir_resv_level, Opt_journal_async_commit, Opt_err_cont, + Opt_nocluster, Opt_err, }; @@ -208,6 +209,7 @@ static const match_table_t tokens = { {Opt_dir_resv_level, "dir_resv_level=%u"}, {Opt_journal_async_commit, "journal_async_commit"}, {Opt_err_cont, "errors=continue"}, + {Opt_nocluster, "nocluster"}, {Opt_err, NULL} }; @@ -619,6 +621,13 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) goto out; } + tmp = OCFS2_MOUNT_NOCLUSTER; + if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) { + ret = -EINVAL; + mlog(ML_ERROR, "Cannot change nocluster option on remount\n"); + goto out; + } + tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | OCFS2_MOUNT_HB_NONE; if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) { @@ -859,6 +868,7 @@ static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb, } if (ocfs2_userspace_stack(osb) && + !(osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER) && strncmp(osb->osb_cluster_stack, mopt->cluster_stack, OCFS2_STACK_LABEL_LEN)) { mlog(ML_ERROR, @@ -1139,6 +1149,11 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" : "ordered"); + if ((osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER) && + !(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT)) + printk(KERN_NOTICE "ocfs2: The shared device (%s) is mounted " + "without cluster aware mode.\n", osb->dev_str); + atomic_set(&osb->vol_state, VOLUME_MOUNTED); wake_up(&osb->osb_mount_event); @@ -1445,6 +1460,9 @@ static int ocfs2_parse_options(struct super_block *sb, case Opt_journal_async_commit: mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT; break; + case Opt_nocluster: + mopt->mount_opt |= OCFS2_MOUNT_NOCLUSTER; + break; default: mlog(ML_ERROR, "Unrecognized mount option \"%s\" " @@ -1556,6 +1574,9 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) if (opts & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT) seq_printf(s, ",journal_async_commit"); + if (opts & OCFS2_MOUNT_NOCLUSTER) + seq_printf(s, ",nocluster"); + return 0; } diff --git a/fs/omfs/file.c b/fs/omfs/file.c index d640b9388238..d7b5f09d298c 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -289,10 +289,9 @@ static int omfs_readpage(struct file *file, struct page *page) return block_read_full_page(page, omfs_get_block); } -static int omfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void omfs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, omfs_get_block); + mpage_readahead(rac, omfs_get_block); } static int omfs_writepage(struct page *page, struct writeback_control *wbc) @@ -373,7 +372,7 @@ const struct inode_operations omfs_file_inops = { const struct address_space_operations omfs_aops = { .readpage = omfs_readpage, - .readpages = omfs_readpages, + .readahead = omfs_readahead, .writepage = omfs_writepage, .writepages = omfs_writepages, .write_begin = omfs_write_begin, diff --git a/fs/open.c b/fs/open.c index 719b320ede52..6cd48a61cda3 100644 --- a/fs/open.c +++ b/fs/open.c @@ -345,21 +345,14 @@ SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) * We do this by temporarily clearing all FS-related capabilities and * switching the fsuid/fsgid around to the real ones. */ -long do_faccessat(int dfd, const char __user *filename, int mode) +static const struct cred *access_override_creds(void) { const struct cred *old_cred; struct cred *override_cred; - struct path path; - struct inode *inode; - int res; - unsigned int lookup_flags = LOOKUP_FOLLOW; - - if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ - return -EINVAL; override_cred = prepare_creds(); if (!override_cred) - return -ENOMEM; + return NULL; override_cred->fsuid = override_cred->uid; override_cred->fsgid = override_cred->gid; @@ -394,6 +387,38 @@ long do_faccessat(int dfd, const char __user *filename, int mode) override_cred->non_rcu = 1; old_cred = override_creds(override_cred); + + /* override_cred() gets its own ref */ + put_cred(override_cred); + + return old_cred; +} + +long do_faccessat(int dfd, const char __user *filename, int mode, int flags) +{ + struct path path; + struct inode *inode; + int res; + unsigned int lookup_flags = LOOKUP_FOLLOW; + const struct cred *old_cred = NULL; + + if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ + return -EINVAL; + + if (flags & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) + return -EINVAL; + + if (flags & AT_SYMLINK_NOFOLLOW) + lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + + if (!(flags & AT_EACCESS)) { + old_cred = access_override_creds(); + if (!old_cred) + return -ENOMEM; + } + retry: res = user_path_at(dfd, filename, lookup_flags, &path); if (res) @@ -435,19 +460,26 @@ out_path_release: goto retry; } out: - revert_creds(old_cred); - put_cred(override_cred); + if (old_cred) + revert_creds(old_cred); + return res; } SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) { - return do_faccessat(dfd, filename, mode); + return do_faccessat(dfd, filename, mode, 0); +} + +SYSCALL_DEFINE4(faccessat2, int, dfd, const char __user *, filename, int, mode, + int, flags) +{ + return do_faccessat(dfd, filename, mode, flags); } SYSCALL_DEFINE2(access, const char __user *, filename, int, mode) { - return do_faccessat(AT_FDCWD, filename, mode); + return do_faccessat(AT_FDCWD, filename, mode, 0); } int ksys_chdir(const char __user *filename) @@ -743,9 +775,8 @@ static int do_dentry_open(struct file *f, path_get(&f->f_path); f->f_inode = inode; f->f_mapping = inode->i_mapping; - - /* Ensure that we skip any errors that predate opening of the file */ f->f_wb_err = filemap_sample_wb_err(f->f_mapping); + f->f_sb_err = file_sample_sb_err(f); if (unlikely(f->f_flags & O_PATH)) { f->f_mode = FMODE_PATH | FMODE_OPENED; diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 12ae630fbed7..48f0547d4850 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -62,12 +62,7 @@ static int orangefs_writepage_locked(struct page *page, } else { ret = 0; } - if (wr) { - kfree(wr); - set_page_private(page, 0); - ClearPagePrivate(page); - put_page(page); - } + kfree(detach_page_private(page)); return ret; } @@ -409,9 +404,7 @@ static int orangefs_write_begin(struct file *file, wr->len = len; wr->uid = current_fsuid(); wr->gid = current_fsgid(); - SetPagePrivate(page); - set_page_private(page, (unsigned long)wr); - get_page(page); + attach_page_private(page, wr); okay: return 0; } @@ -459,18 +452,12 @@ static void orangefs_invalidatepage(struct page *page, wr = (struct orangefs_write_range *)page_private(page); if (offset == 0 && length == PAGE_SIZE) { - kfree((struct orangefs_write_range *)page_private(page)); - set_page_private(page, 0); - ClearPagePrivate(page); - put_page(page); + kfree(detach_page_private(page)); return; /* write range entirely within invalidate range (or equal) */ } else if (page_offset(page) + offset <= wr->pos && wr->pos + wr->len <= page_offset(page) + offset + length) { - kfree((struct orangefs_write_range *)page_private(page)); - set_page_private(page, 0); - ClearPagePrivate(page); - put_page(page); + kfree(detach_page_private(page)); /* XXX is this right? only caller in fs */ cancel_dirty_page(page); return; @@ -535,12 +522,7 @@ static int orangefs_releasepage(struct page *page, gfp_t foo) static void orangefs_freepage(struct page *page) { - if (PagePrivate(page)) { - kfree((struct orangefs_write_range *)page_private(page)); - set_page_private(page, 0); - ClearPagePrivate(page); - put_page(page); - } + kfree(detach_page_private(page)); } static int orangefs_launder_page(struct page *page) @@ -740,9 +722,7 @@ vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf) wr->len = PAGE_SIZE; wr->uid = current_fsuid(); wr->gid = current_fsgid(); - SetPagePrivate(page); - set_page_private(page, (unsigned long)wr); - get_page(page); + attach_page_private(page, wr); okay: file_update_time(vmf->vma->vm_file); diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index 714c14c47ca5..dd188c7996b3 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -9,7 +9,7 @@ config OVERLAY_FS 'lower' filesystem is either hidden or, in the case of directories, merged with the 'upper' object. - For more information see Documentation/filesystems/overlayfs.txt + For more information see Documentation/filesystems/overlayfs.rst config OVERLAY_FS_REDIRECT_DIR bool "Overlayfs: turn on redirect directory feature by default" @@ -38,7 +38,7 @@ config OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW If backward compatibility is not an issue, then it is safe and recommended to say N here. - For more information, see Documentation/filesystems/overlayfs.txt + For more information, see Documentation/filesystems/overlayfs.rst If unsure, say Y. @@ -103,7 +103,7 @@ config OVERLAY_FS_XINO_AUTO If compatibility with applications that expect 32bit inodes is not an issue, then it is safe and recommended to say Y here. - For more information, see Documentation/filesystems/overlayfs.txt + For more information, see Documentation/filesystems/overlayfs.rst If unsure, say N. diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 475c61f53f0f..ed5c1078919c 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -783,6 +783,9 @@ static struct ovl_fh *ovl_fid_to_fh(struct fid *fid, int buflen, int fh_type) if (fh_type != OVL_FILEID_V0) return ERR_PTR(-EINVAL); + if (buflen <= OVL_FH_WIRE_OFFSET) + return ERR_PTR(-EINVAL); + fh = kzalloc(buflen, GFP_KERNEL); if (!fh) return ERR_PTR(-ENOMEM); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index b0d42ece4d7c..981f11ec51bc 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -58,6 +58,24 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) attr->ia_valid &= ~ATTR_MODE; + /* + * We might have to translate ovl file into real file object + * once use cases emerge. For now, simply don't let underlying + * filesystem rely on attr->ia_file + */ + attr->ia_valid &= ~ATTR_FILE; + + /* + * If open(O_TRUNC) is done, VFS calls ->setattr with ATTR_OPEN + * set. Overlayfs does not pass O_TRUNC flag to underlying + * filesystem during open -> do not pass ATTR_OPEN. This + * disables optimization in fuse which assumes open(O_TRUNC) + * already set file size to 0. But we never passed O_TRUNC to + * fuse. So by clearing ATTR_OPEN, fuse will be forced to send + * setattr request to server. + */ + attr->ia_valid &= ~ATTR_OPEN; + inode_lock(upperdentry->d_inode); old_cred = ovl_override_creds(dentry->d_sb); err = notify_change(upperdentry, attr, NULL); diff --git a/fs/pnode.c b/fs/pnode.c index 49f6d7ff2139..1106137c747a 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -261,14 +261,13 @@ static int propagate_one(struct mount *m) child = copy_tree(last_source, last_source->mnt.mnt_root, type); if (IS_ERR(child)) return PTR_ERR(child); + read_seqlock_excl(&mount_lock); mnt_set_mountpoint(m, mp, child); + if (m->mnt_master != dest_master) + SET_MNT_MARK(m->mnt_master); + read_sequnlock_excl(&mount_lock); last_dest = m; last_source = child; - if (m->mnt_master != dest_master) { - read_seqlock_excl(&mount_lock); - SET_MNT_MARK(m->mnt_master); - read_sequnlock_excl(&mount_lock); - } hlist_add_head(&child->mnt_hash, list); return count_mounts(m->mnt_ns, child); } diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 27ef84d99f59..971a42f6357d 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -23,7 +23,7 @@ config PROC_FS /proc" or the equivalent line in /etc/fstab does the job. The /proc file system is explained in the file - <file:Documentation/filesystems/proc.txt> and on the proc(5) manpage + <file:Documentation/filesystems/proc.rst> and on the proc(5) manpage ("man 5 proc"). This option will enlarge your kernel by about 67 KB. Several @@ -95,7 +95,7 @@ config PROC_CHILDREN default n help Provides a fast way to retrieve first level children pids of a task. See - <file:Documentation/filesystems/proc.txt> for more information. + <file:Documentation/filesystems/proc.rst> for more information. Say Y if you are running any user-space software which takes benefit from this interface. For example, rkt is such a piece of software. diff --git a/fs/proc/base.c b/fs/proc/base.c index 572898dd16a0..eb2255e95f62 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3286,7 +3286,6 @@ static const struct inode_operations proc_tgid_base_inode_operations = { void proc_flush_pid(struct pid *pid) { proc_invalidate_siblings_dcache(&pid->inodes, &pid->lock); - put_pid(pid); } static struct dentry *proc_pid_instantiate(struct dentry * dentry, diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 8c1f1bb1a5ce..ecc63ce01be7 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -103,11 +103,14 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "SUnreclaim: ", sunreclaim); seq_printf(m, "KernelStack: %8lu kB\n", global_zone_page_state(NR_KERNEL_STACK_KB)); +#ifdef CONFIG_SHADOW_CALL_STACK + seq_printf(m, "ShadowCallStack:%8lu kB\n", + global_zone_page_state(NR_KERNEL_SCS_KB)); +#endif show_val_kb(m, "PageTables: ", global_zone_page_state(NR_PAGETABLE)); - show_val_kb(m, "NFS_Unstable: ", - global_node_page_state(NR_UNSTABLE_NFS)); + show_val_kb(m, "NFS_Unstable: ", 0); show_val_kb(m, "Bounce: ", global_zone_page_state(NR_BOUNCE)); show_val_kb(m, "WritebackTmp: ", diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8d382d4ec067..6ad407d5efe2 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -546,10 +546,17 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); - struct page *page; + struct page *page = NULL; - /* FOLL_DUMP will return -EFAULT on huge zero page */ - page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP); + if (pmd_present(*pmd)) { + /* FOLL_DUMP will return -EFAULT on huge zero page */ + page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP); + } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) { + swp_entry_t entry = pmd_to_swp_entry(*pmd); + + if (is_migration_entry(entry)) + page = migration_entry_to_page(entry); + } if (IS_ERR_OR_NULL(page)) return; if (PageAnon(page)) @@ -578,8 +585,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { - if (pmd_present(*pmd)) - smaps_pmd_entry(pmd, addr, walk); + smaps_pmd_entry(pmd, addr, walk); spin_unlock(ptl); goto out; } @@ -622,9 +628,6 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_GROWSDOWN)] = "gd", [ilog2(VM_PFNMAP)] = "pf", [ilog2(VM_DENYWRITE)] = "dw", -#ifdef CONFIG_X86_INTEL_MPX - [ilog2(VM_MPX)] = "mp", -#endif [ilog2(VM_LOCKED)] = "lo", [ilog2(VM_IO)] = "io", [ilog2(VM_SEQ_READ)] = "sr", @@ -638,6 +641,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_ARCH_1)] = "ar", [ilog2(VM_WIPEONFORK)] = "wf", [ilog2(VM_DONTDUMP)] = "dd", +#ifdef CONFIG_ARM64_BTI + [ilog2(VM_ARM64_BTI)] = "bt", +#endif #ifdef CONFIG_MEM_SOFT_DIRTY [ilog2(VM_SOFTDIRTY)] = "sd", #endif diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 7dc800cce354..c663202da8de 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -266,7 +266,8 @@ static int vmcoredd_mmap_dumps(struct vm_area_struct *vma, unsigned long dst, if (start < offset + dump->size) { tsz = min(offset + (u64)dump->size - start, (u64)size); buf = dump->buf + start - offset; - if (remap_vmalloc_range_partial(vma, dst, buf, tsz)) { + if (remap_vmalloc_range_partial(vma, dst, buf, 0, + tsz)) { ret = -EFAULT; goto out_unlock; } @@ -624,7 +625,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)start, size); kaddr = elfnotes_buf + start - elfcorebuf_sz - vmcoredd_orig_sz; if (remap_vmalloc_range_partial(vma, vma->vm_start + len, - kaddr, tsz)) + kaddr, 0, tsz)) goto fail; size -= tsz; diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 273ee82d8aa9..e4d70c0dffe9 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -279,7 +279,8 @@ static int mounts_open_common(struct inode *inode, struct file *file, p->ns = ns; p->root = root; p->show = show; - p->cached_event = ~0ULL; + INIT_LIST_HEAD(&p->cursor.mnt_list); + p->cursor.mnt.mnt_flags = MNT_CURSOR; return 0; @@ -296,6 +297,7 @@ static int mounts_release(struct inode *inode, struct file *file) struct seq_file *m = file->private_data; struct proc_mounts *p = m->private; path_put(&p->root); + mnt_cursor_del(p->ns, &p->cursor); put_mnt_ns(p->ns); return seq_release_private(inode, file); } diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index 8f0369aad22a..e16a49ebfe54 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -153,3 +153,112 @@ config PSTORE_RAM "ramoops.ko". For more information, see Documentation/admin-guide/ramoops.rst. + +config PSTORE_ZONE + tristate + depends on PSTORE + help + The common layer for pstore/blk (and pstore/ram in the future) + to manage storage in zones. + +config PSTORE_BLK + tristate "Log panic/oops to a block device" + depends on PSTORE + depends on BLOCK + select PSTORE_ZONE + default n + help + This enables panic and oops message to be logged to a block dev + where it can be read back at some later point. + + For more information, see Documentation/admin-guide/pstore-blk.rst + + If unsure, say N. + +config PSTORE_BLK_BLKDEV + string "block device identifier" + depends on PSTORE_BLK + default "" + help + Which block device should be used for pstore/blk. + + It accepts the following variants: + 1) <hex_major><hex_minor> device number in hexadecimal representation, + with no leading 0x, for example b302. + 2) /dev/<disk_name> represents the device name of disk + 3) /dev/<disk_name><decimal> represents the device name and number + of partition - device number of disk plus the partition number + 4) /dev/<disk_name>p<decimal> - same as the above, this form is + used when disk name of partitioned disk ends with a digit. + 5) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the + unique id of a partition if the partition table provides it. + The UUID may be either an EFI/GPT UUID, or refer to an MSDOS + partition using the format SSSSSSSS-PP, where SSSSSSSS is a zero- + filled hex representation of the 32-bit "NT disk signature", and PP + is a zero-filled hex representation of the 1-based partition number. + 6) PARTUUID=<UUID>/PARTNROFF=<int> to select a partition in relation + to a partition with a known unique id. + 7) <major>:<minor> major and minor number of the device separated by + a colon. + + NOTE that, both Kconfig and module parameters can configure + pstore/blk, but module parameters have priority over Kconfig. + +config PSTORE_BLK_KMSG_SIZE + int "Size in Kbytes of kmsg dump log to store" + depends on PSTORE_BLK + default 64 + help + This just sets size of kmsg dump (oops, panic, etc) log for + pstore/blk. The size is in KB and must be a multiple of 4. + + NOTE that, both Kconfig and module parameters can configure + pstore/blk, but module parameters have priority over Kconfig. + +config PSTORE_BLK_MAX_REASON + int "Maximum kmsg dump reason to store" + depends on PSTORE_BLK + default 2 + help + The maximum reason for kmsg dumps to store. The default is + 2 (KMSG_DUMP_OOPS), see include/linux/kmsg_dump.h's + enum kmsg_dump_reason for more details. + + NOTE that, both Kconfig and module parameters can configure + pstore/blk, but module parameters have priority over Kconfig. + +config PSTORE_BLK_PMSG_SIZE + int "Size in Kbytes of pmsg to store" + depends on PSTORE_BLK + depends on PSTORE_PMSG + default 64 + help + This just sets size of pmsg (pmsg_size) for pstore/blk. The size is + in KB and must be a multiple of 4. + + NOTE that, both Kconfig and module parameters can configure + pstore/blk, but module parameters have priority over Kconfig. + +config PSTORE_BLK_CONSOLE_SIZE + int "Size in Kbytes of console log to store" + depends on PSTORE_BLK + depends on PSTORE_CONSOLE + default 64 + help + This just sets size of console log (console_size) to store via + pstore/blk. The size is in KB and must be a multiple of 4. + + NOTE that, both Kconfig and module parameters can configure + pstore/blk, but module parameters have priority over Kconfig. + +config PSTORE_BLK_FTRACE_SIZE + int "Size in Kbytes of ftrace log to store" + depends on PSTORE_BLK + depends on PSTORE_FTRACE + default 64 + help + This just sets size of ftrace log (ftrace_size) for pstore/blk. The + size is in KB and must be a multiple of 4. + + NOTE that, both Kconfig and module parameters can configure + pstore/blk, but module parameters have priority over Kconfig. diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile index 967b5891f325..c270467aeece 100644 --- a/fs/pstore/Makefile +++ b/fs/pstore/Makefile @@ -12,3 +12,9 @@ pstore-$(CONFIG_PSTORE_PMSG) += pmsg.o ramoops-objs += ram.o ram_core.o obj-$(CONFIG_PSTORE_RAM) += ramoops.o + +pstore_zone-objs += zone.o +obj-$(CONFIG_PSTORE_ZONE) += pstore_zone.o + +pstore_blk-objs += blk.o +obj-$(CONFIG_PSTORE_BLK) += pstore_blk.o diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c new file mode 100644 index 000000000000..fcd5563dde06 --- /dev/null +++ b/fs/pstore/blk.c @@ -0,0 +1,517 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Implements pstore backend driver that write to block (or non-block) storage + * devices, using the pstore/zone API. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include "../../block/blk.h" +#include <linux/blkdev.h> +#include <linux/string.h> +#include <linux/of.h> +#include <linux/of_address.h> +#include <linux/platform_device.h> +#include <linux/pstore_blk.h> +#include <linux/mount.h> +#include <linux/uio.h> + +static long kmsg_size = CONFIG_PSTORE_BLK_KMSG_SIZE; +module_param(kmsg_size, long, 0400); +MODULE_PARM_DESC(kmsg_size, "kmsg dump record size in kbytes"); + +static int max_reason = CONFIG_PSTORE_BLK_MAX_REASON; +module_param(max_reason, int, 0400); +MODULE_PARM_DESC(max_reason, + "maximum reason for kmsg dump (default 2: Oops and Panic)"); + +#if IS_ENABLED(CONFIG_PSTORE_PMSG) +static long pmsg_size = CONFIG_PSTORE_BLK_PMSG_SIZE; +#else +static long pmsg_size = -1; +#endif +module_param(pmsg_size, long, 0400); +MODULE_PARM_DESC(pmsg_size, "pmsg size in kbytes"); + +#if IS_ENABLED(CONFIG_PSTORE_CONSOLE) +static long console_size = CONFIG_PSTORE_BLK_CONSOLE_SIZE; +#else +static long console_size = -1; +#endif +module_param(console_size, long, 0400); +MODULE_PARM_DESC(console_size, "console size in kbytes"); + +#if IS_ENABLED(CONFIG_PSTORE_FTRACE) +static long ftrace_size = CONFIG_PSTORE_BLK_FTRACE_SIZE; +#else +static long ftrace_size = -1; +#endif +module_param(ftrace_size, long, 0400); +MODULE_PARM_DESC(ftrace_size, "ftrace size in kbytes"); + +static bool best_effort; +module_param(best_effort, bool, 0400); +MODULE_PARM_DESC(best_effort, "use best effort to write (i.e. do not require storage driver pstore support, default: off)"); + +/* + * blkdev - the block device to use for pstore storage + * + * Usually, this will be a partition of a block device. + * + * blkdev accepts the following variants: + * 1) <hex_major><hex_minor> device number in hexadecimal representation, + * with no leading 0x, for example b302. + * 2) /dev/<disk_name> represents the device number of disk + * 3) /dev/<disk_name><decimal> represents the device number + * of partition - device number of disk plus the partition number + * 4) /dev/<disk_name>p<decimal> - same as the above, that form is + * used when disk name of partitioned disk ends on a digit. + * 5) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the + * unique id of a partition if the partition table provides it. + * The UUID may be either an EFI/GPT UUID, or refer to an MSDOS + * partition using the format SSSSSSSS-PP, where SSSSSSSS is a zero- + * filled hex representation of the 32-bit "NT disk signature", and PP + * is a zero-filled hex representation of the 1-based partition number. + * 6) PARTUUID=<UUID>/PARTNROFF=<int> to select a partition in relation to + * a partition with a known unique id. + * 7) <major>:<minor> major and minor number of the device separated by + * a colon. + */ +static char blkdev[80] = CONFIG_PSTORE_BLK_BLKDEV; +module_param_string(blkdev, blkdev, 80, 0400); +MODULE_PARM_DESC(blkdev, "block device for pstore storage"); + +/* + * All globals must only be accessed under the pstore_blk_lock + * during the register/unregister functions. + */ +static DEFINE_MUTEX(pstore_blk_lock); +static struct block_device *psblk_bdev; +static struct pstore_zone_info *pstore_zone_info; +static pstore_blk_panic_write_op blkdev_panic_write; + +struct bdev_info { + dev_t devt; + sector_t nr_sects; + sector_t start_sect; +}; + +#define check_size(name, alignsize) ({ \ + long _##name_ = (name); \ + _##name_ = _##name_ <= 0 ? 0 : (_##name_ * 1024); \ + if (_##name_ & ((alignsize) - 1)) { \ + pr_info(#name " must align to %d\n", \ + (alignsize)); \ + _##name_ = ALIGN(name, (alignsize)); \ + } \ + _##name_; \ +}) + +static int __register_pstore_device(struct pstore_device_info *dev) +{ + int ret; + + lockdep_assert_held(&pstore_blk_lock); + + if (!dev || !dev->total_size || !dev->read || !dev->write) + return -EINVAL; + + /* someone already registered before */ + if (pstore_zone_info) + return -EBUSY; + + pstore_zone_info = kzalloc(sizeof(struct pstore_zone_info), GFP_KERNEL); + if (!pstore_zone_info) + return -ENOMEM; + + /* zero means not limit on which backends to attempt to store. */ + if (!dev->flags) + dev->flags = UINT_MAX; + +#define verify_size(name, alignsize, enabled) { \ + long _##name_; \ + if (enabled) \ + _##name_ = check_size(name, alignsize); \ + else \ + _##name_ = 0; \ + name = _##name_ / 1024; \ + pstore_zone_info->name = _##name_; \ + } + + verify_size(kmsg_size, 4096, dev->flags & PSTORE_FLAGS_DMESG); + verify_size(pmsg_size, 4096, dev->flags & PSTORE_FLAGS_PMSG); + verify_size(console_size, 4096, dev->flags & PSTORE_FLAGS_CONSOLE); + verify_size(ftrace_size, 4096, dev->flags & PSTORE_FLAGS_FTRACE); +#undef verify_size + + pstore_zone_info->total_size = dev->total_size; + pstore_zone_info->max_reason = max_reason; + pstore_zone_info->read = dev->read; + pstore_zone_info->write = dev->write; + pstore_zone_info->erase = dev->erase; + pstore_zone_info->panic_write = dev->panic_write; + pstore_zone_info->name = KBUILD_MODNAME; + pstore_zone_info->owner = THIS_MODULE; + + ret = register_pstore_zone(pstore_zone_info); + if (ret) { + kfree(pstore_zone_info); + pstore_zone_info = NULL; + } + return ret; +} +/** + * register_pstore_device() - register non-block device to pstore/blk + * + * @dev: non-block device information + * + * Return: + * * 0 - OK + * * Others - something error. + */ +int register_pstore_device(struct pstore_device_info *dev) +{ + int ret; + + mutex_lock(&pstore_blk_lock); + ret = __register_pstore_device(dev); + mutex_unlock(&pstore_blk_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(register_pstore_device); + +static void __unregister_pstore_device(struct pstore_device_info *dev) +{ + lockdep_assert_held(&pstore_blk_lock); + if (pstore_zone_info && pstore_zone_info->read == dev->read) { + unregister_pstore_zone(pstore_zone_info); + kfree(pstore_zone_info); + pstore_zone_info = NULL; + } +} + +/** + * unregister_pstore_device() - unregister non-block device from pstore/blk + * + * @dev: non-block device information + */ +void unregister_pstore_device(struct pstore_device_info *dev) +{ + mutex_lock(&pstore_blk_lock); + __unregister_pstore_device(dev); + mutex_unlock(&pstore_blk_lock); +} +EXPORT_SYMBOL_GPL(unregister_pstore_device); + +/** + * psblk_get_bdev() - open block device + * + * @holder: Exclusive holder identifier + * @info: Information about bdev to fill in + * + * Return: pointer to block device on success and others on error. + * + * On success, the returned block_device has reference count of one. + */ +static struct block_device *psblk_get_bdev(void *holder, + struct bdev_info *info) +{ + struct block_device *bdev = ERR_PTR(-ENODEV); + fmode_t mode = FMODE_READ | FMODE_WRITE; + sector_t nr_sects; + + lockdep_assert_held(&pstore_blk_lock); + + if (pstore_zone_info) + return ERR_PTR(-EBUSY); + + if (!blkdev[0]) + return ERR_PTR(-ENODEV); + + if (holder) + mode |= FMODE_EXCL; + bdev = blkdev_get_by_path(blkdev, mode, holder); + if (IS_ERR(bdev)) { + dev_t devt; + + devt = name_to_dev_t(blkdev); + if (devt == 0) + return ERR_PTR(-ENODEV); + bdev = blkdev_get_by_dev(devt, mode, holder); + if (IS_ERR(bdev)) + return bdev; + } + + nr_sects = part_nr_sects_read(bdev->bd_part); + if (!nr_sects) { + pr_err("not enough space for '%s'\n", blkdev); + blkdev_put(bdev, mode); + return ERR_PTR(-ENOSPC); + } + + if (info) { + info->devt = bdev->bd_dev; + info->nr_sects = nr_sects; + info->start_sect = get_start_sect(bdev); + } + + return bdev; +} + +static void psblk_put_bdev(struct block_device *bdev, void *holder) +{ + fmode_t mode = FMODE_READ | FMODE_WRITE; + + lockdep_assert_held(&pstore_blk_lock); + + if (!bdev) + return; + + if (holder) + mode |= FMODE_EXCL; + blkdev_put(bdev, mode); +} + +static ssize_t psblk_generic_blk_read(char *buf, size_t bytes, loff_t pos) +{ + struct block_device *bdev = psblk_bdev; + struct file file; + struct kiocb kiocb; + struct iov_iter iter; + struct kvec iov = {.iov_base = buf, .iov_len = bytes}; + + if (!bdev) + return -ENODEV; + + memset(&file, 0, sizeof(struct file)); + file.f_mapping = bdev->bd_inode->i_mapping; + file.f_flags = O_DSYNC | __O_SYNC | O_NOATIME; + file.f_inode = bdev->bd_inode; + file_ra_state_init(&file.f_ra, file.f_mapping); + + init_sync_kiocb(&kiocb, &file); + kiocb.ki_pos = pos; + iov_iter_kvec(&iter, READ, &iov, 1, bytes); + + return generic_file_read_iter(&kiocb, &iter); +} + +static ssize_t psblk_generic_blk_write(const char *buf, size_t bytes, + loff_t pos) +{ + struct block_device *bdev = psblk_bdev; + struct iov_iter iter; + struct kiocb kiocb; + struct file file; + ssize_t ret; + struct kvec iov = {.iov_base = (void *)buf, .iov_len = bytes}; + + if (!bdev) + return -ENODEV; + + /* Console/Ftrace backend may handle buffer until flush dirty zones */ + if (in_interrupt() || irqs_disabled()) + return -EBUSY; + + memset(&file, 0, sizeof(struct file)); + file.f_mapping = bdev->bd_inode->i_mapping; + file.f_flags = O_DSYNC | __O_SYNC | O_NOATIME; + file.f_inode = bdev->bd_inode; + + init_sync_kiocb(&kiocb, &file); + kiocb.ki_pos = pos; + iov_iter_kvec(&iter, WRITE, &iov, 1, bytes); + + inode_lock(bdev->bd_inode); + ret = generic_write_checks(&kiocb, &iter); + if (ret > 0) + ret = generic_perform_write(&file, &iter, pos); + inode_unlock(bdev->bd_inode); + + if (likely(ret > 0)) { + const struct file_operations f_op = {.fsync = blkdev_fsync}; + + file.f_op = &f_op; + kiocb.ki_pos += ret; + ret = generic_write_sync(&kiocb, ret); + } + return ret; +} + +static ssize_t psblk_blk_panic_write(const char *buf, size_t size, + loff_t off) +{ + int ret; + + if (!blkdev_panic_write) + return -EOPNOTSUPP; + + /* size and off must align to SECTOR_SIZE for block device */ + ret = blkdev_panic_write(buf, off >> SECTOR_SHIFT, + size >> SECTOR_SHIFT); + /* try next zone */ + if (ret == -ENOMSG) + return ret; + return ret ? -EIO : size; +} + +static int __register_pstore_blk(struct pstore_blk_info *info) +{ + char bdev_name[BDEVNAME_SIZE]; + struct block_device *bdev; + struct pstore_device_info dev; + struct bdev_info binfo; + void *holder = blkdev; + int ret = -ENODEV; + + lockdep_assert_held(&pstore_blk_lock); + + /* hold bdev exclusively */ + memset(&binfo, 0, sizeof(binfo)); + bdev = psblk_get_bdev(holder, &binfo); + if (IS_ERR(bdev)) { + pr_err("failed to open '%s'!\n", blkdev); + return PTR_ERR(bdev); + } + + /* only allow driver matching the @blkdev */ + if (!binfo.devt || (!best_effort && + MAJOR(binfo.devt) != info->major)) { + pr_debug("invalid major %u (expect %u)\n", + info->major, MAJOR(binfo.devt)); + ret = -ENODEV; + goto err_put_bdev; + } + + /* psblk_bdev must be assigned before register to pstore/blk */ + psblk_bdev = bdev; + blkdev_panic_write = info->panic_write; + + /* Copy back block device details. */ + info->devt = binfo.devt; + info->nr_sects = binfo.nr_sects; + info->start_sect = binfo.start_sect; + + memset(&dev, 0, sizeof(dev)); + dev.total_size = info->nr_sects << SECTOR_SHIFT; + dev.flags = info->flags; + dev.read = psblk_generic_blk_read; + dev.write = psblk_generic_blk_write; + dev.erase = NULL; + dev.panic_write = info->panic_write ? psblk_blk_panic_write : NULL; + + ret = __register_pstore_device(&dev); + if (ret) + goto err_put_bdev; + + bdevname(bdev, bdev_name); + pr_info("attached %s%s\n", bdev_name, + info->panic_write ? "" : " (no dedicated panic_write!)"); + return 0; + +err_put_bdev: + psblk_bdev = NULL; + blkdev_panic_write = NULL; + psblk_put_bdev(bdev, holder); + return ret; +} + +/** + * register_pstore_blk() - register block device to pstore/blk + * + * @info: details on the desired block device interface + * + * Return: + * * 0 - OK + * * Others - something error. + */ +int register_pstore_blk(struct pstore_blk_info *info) +{ + int ret; + + mutex_lock(&pstore_blk_lock); + ret = __register_pstore_blk(info); + mutex_unlock(&pstore_blk_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(register_pstore_blk); + +static void __unregister_pstore_blk(unsigned int major) +{ + struct pstore_device_info dev = { .read = psblk_generic_blk_read }; + void *holder = blkdev; + + lockdep_assert_held(&pstore_blk_lock); + if (psblk_bdev && MAJOR(psblk_bdev->bd_dev) == major) { + __unregister_pstore_device(&dev); + psblk_put_bdev(psblk_bdev, holder); + blkdev_panic_write = NULL; + psblk_bdev = NULL; + } +} + +/** + * unregister_pstore_blk() - unregister block device from pstore/blk + * + * @major: the major device number of device + */ +void unregister_pstore_blk(unsigned int major) +{ + mutex_lock(&pstore_blk_lock); + __unregister_pstore_blk(major); + mutex_unlock(&pstore_blk_lock); +} +EXPORT_SYMBOL_GPL(unregister_pstore_blk); + +/* get information of pstore/blk */ +int pstore_blk_get_config(struct pstore_blk_config *info) +{ + strncpy(info->device, blkdev, 80); + info->max_reason = max_reason; + info->kmsg_size = check_size(kmsg_size, 4096); + info->pmsg_size = check_size(pmsg_size, 4096); + info->ftrace_size = check_size(ftrace_size, 4096); + info->console_size = check_size(console_size, 4096); + + return 0; +} +EXPORT_SYMBOL_GPL(pstore_blk_get_config); + +static int __init pstore_blk_init(void) +{ + struct pstore_blk_info info = { }; + int ret = 0; + + mutex_lock(&pstore_blk_lock); + if (!pstore_zone_info && best_effort && blkdev[0]) + ret = __register_pstore_blk(&info); + mutex_unlock(&pstore_blk_lock); + + return ret; +} +late_initcall(pstore_blk_init); + +static void __exit pstore_blk_exit(void) +{ + mutex_lock(&pstore_blk_lock); + if (psblk_bdev) + __unregister_pstore_blk(MAJOR(psblk_bdev->bd_dev)); + else { + struct pstore_device_info dev = { }; + + if (pstore_zone_info) + dev.read = pstore_zone_info->read; + __unregister_pstore_device(&dev); + } + mutex_unlock(&pstore_blk_lock); +} +module_exit(pstore_blk_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("WeiXiong Liao <liaoweixiong@allwinnertech.com>"); +MODULE_AUTHOR("Kees Cook <keescook@chromium.org>"); +MODULE_DESCRIPTION("pstore backend for block devices"); diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c index bfbfc2698070..5c0450701293 100644 --- a/fs/pstore/ftrace.c +++ b/fs/pstore/ftrace.c @@ -16,6 +16,7 @@ #include <linux/debugfs.h> #include <linux/err.h> #include <linux/cache.h> +#include <linux/slab.h> #include <asm/barrier.h> #include "internal.h" @@ -132,3 +133,56 @@ void pstore_unregister_ftrace(void) debugfs_remove_recursive(pstore_ftrace_dir); } + +ssize_t pstore_ftrace_combine_log(char **dest_log, size_t *dest_log_size, + const char *src_log, size_t src_log_size) +{ + size_t dest_size, src_size, total, dest_off, src_off; + size_t dest_idx = 0, src_idx = 0, merged_idx = 0; + void *merged_buf; + struct pstore_ftrace_record *drec, *srec, *mrec; + size_t record_size = sizeof(struct pstore_ftrace_record); + + dest_off = *dest_log_size % record_size; + dest_size = *dest_log_size - dest_off; + + src_off = src_log_size % record_size; + src_size = src_log_size - src_off; + + total = dest_size + src_size; + merged_buf = kmalloc(total, GFP_KERNEL); + if (!merged_buf) + return -ENOMEM; + + drec = (struct pstore_ftrace_record *)(*dest_log + dest_off); + srec = (struct pstore_ftrace_record *)(src_log + src_off); + mrec = (struct pstore_ftrace_record *)(merged_buf); + + while (dest_size > 0 && src_size > 0) { + if (pstore_ftrace_read_timestamp(&drec[dest_idx]) < + pstore_ftrace_read_timestamp(&srec[src_idx])) { + mrec[merged_idx++] = drec[dest_idx++]; + dest_size -= record_size; + } else { + mrec[merged_idx++] = srec[src_idx++]; + src_size -= record_size; + } + } + + while (dest_size > 0) { + mrec[merged_idx++] = drec[dest_idx++]; + dest_size -= record_size; + } + + while (src_size > 0) { + mrec[merged_idx++] = srec[src_idx++]; + src_size -= record_size; + } + + kfree(*dest_log); + *dest_log = merged_buf; + *dest_log_size = total; + + return 0; +} +EXPORT_SYMBOL_GPL(pstore_ftrace_combine_log); diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index d99b5d39aa90..c331efe8de95 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -22,18 +22,21 @@ #include <linux/magic.h> #include <linux/pstore.h> #include <linux/slab.h> -#include <linux/spinlock.h> #include <linux/uaccess.h> #include "internal.h" #define PSTORE_NAMELEN 64 -static DEFINE_SPINLOCK(allpstore_lock); -static LIST_HEAD(allpstore); +static DEFINE_MUTEX(records_list_lock); +static LIST_HEAD(records_list); + +static DEFINE_MUTEX(pstore_sb_lock); +static struct super_block *pstore_sb; struct pstore_private { struct list_head list; + struct dentry *dentry; struct pstore_record *record; size_t total_size; }; @@ -178,10 +181,22 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) { struct pstore_private *p = d_inode(dentry)->i_private; struct pstore_record *record = p->record; + int rc = 0; if (!record->psi->erase) return -EPERM; + /* Make sure we can't race while removing this file. */ + mutex_lock(&records_list_lock); + if (!list_empty(&p->list)) + list_del_init(&p->list); + else + rc = -ENOENT; + p->dentry = NULL; + mutex_unlock(&records_list_lock); + if (rc) + return rc; + mutex_lock(&record->psi->read_mutex); record->psi->erase(record); mutex_unlock(&record->psi->read_mutex); @@ -192,15 +207,9 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) static void pstore_evict_inode(struct inode *inode) { struct pstore_private *p = inode->i_private; - unsigned long flags; clear_inode(inode); - if (p) { - spin_lock_irqsave(&allpstore_lock, flags); - list_del(&p->list); - spin_unlock_irqrestore(&allpstore_lock, flags); - free_pstore_private(p); - } + free_pstore_private(p); } static const struct inode_operations pstore_dir_inode_operations = { @@ -278,11 +287,54 @@ static const struct super_operations pstore_ops = { .show_options = pstore_show_options, }; -static struct super_block *pstore_sb; +static struct dentry *psinfo_lock_root(void) +{ + struct dentry *root; -bool pstore_is_mounted(void) + mutex_lock(&pstore_sb_lock); + /* + * Having no backend is fine -- no records appear. + * Not being mounted is fine -- nothing to do. + */ + if (!psinfo || !pstore_sb) { + mutex_unlock(&pstore_sb_lock); + return NULL; + } + + root = pstore_sb->s_root; + inode_lock(d_inode(root)); + mutex_unlock(&pstore_sb_lock); + + return root; +} + +int pstore_put_backend_records(struct pstore_info *psi) { - return pstore_sb != NULL; + struct pstore_private *pos, *tmp; + struct dentry *root; + int rc = 0; + + root = psinfo_lock_root(); + if (!root) + return 0; + + mutex_lock(&records_list_lock); + list_for_each_entry_safe(pos, tmp, &records_list, list) { + if (pos->record->psi == psi) { + list_del_init(&pos->list); + rc = simple_unlink(d_inode(root), pos->dentry); + if (WARN_ON(rc)) + break; + d_drop(pos->dentry); + dput(pos->dentry); + pos->dentry = NULL; + } + } + mutex_unlock(&records_list_lock); + + inode_unlock(d_inode(root)); + + return rc; } /* @@ -297,23 +349,20 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record) int rc = 0; char name[PSTORE_NAMELEN]; struct pstore_private *private, *pos; - unsigned long flags; size_t size = record->size + record->ecc_notice_size; - WARN_ON(!inode_is_locked(d_inode(root))); + if (WARN_ON(!inode_is_locked(d_inode(root)))) + return -EINVAL; - spin_lock_irqsave(&allpstore_lock, flags); - list_for_each_entry(pos, &allpstore, list) { + rc = -EEXIST; + /* Skip records that are already present in the filesystem. */ + mutex_lock(&records_list_lock); + list_for_each_entry(pos, &records_list, list) { if (pos->record->type == record->type && pos->record->id == record->id && - pos->record->psi == record->psi) { - rc = -EEXIST; - break; - } + pos->record->psi == record->psi) + goto fail; } - spin_unlock_irqrestore(&allpstore_lock, flags); - if (rc) - return rc; rc = -ENOMEM; inode = pstore_get_inode(root->d_sb); @@ -334,6 +383,7 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record) if (!dentry) goto fail_private; + private->dentry = dentry; private->record = record; inode->i_size = private->total_size = size; inode->i_private = private; @@ -343,9 +393,8 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record) d_add(dentry, inode); - spin_lock_irqsave(&allpstore_lock, flags); - list_add(&private->list, &allpstore); - spin_unlock_irqrestore(&allpstore_lock, flags); + list_add(&private->list, &records_list); + mutex_unlock(&records_list_lock); return 0; @@ -353,8 +402,8 @@ fail_private: free_pstore_private(private); fail_inode: iput(inode); - fail: + mutex_unlock(&records_list_lock); return rc; } @@ -366,16 +415,13 @@ fail: */ void pstore_get_records(int quiet) { - struct pstore_info *psi = psinfo; struct dentry *root; - if (!psi || !pstore_sb) + root = psinfo_lock_root(); + if (!root) return; - root = pstore_sb->s_root; - - inode_lock(d_inode(root)); - pstore_get_backend_records(psi, root, quiet); + pstore_get_backend_records(psinfo, root, quiet); inode_unlock(d_inode(root)); } @@ -383,8 +429,6 @@ static int pstore_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; - pstore_sb = sb; - sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; @@ -405,6 +449,10 @@ static int pstore_fill_super(struct super_block *sb, void *data, int silent) if (!sb->s_root) return -ENOMEM; + mutex_lock(&pstore_sb_lock); + pstore_sb = sb; + mutex_unlock(&pstore_sb_lock); + pstore_get_records(0); return 0; @@ -418,8 +466,17 @@ static struct dentry *pstore_mount(struct file_system_type *fs_type, static void pstore_kill_sb(struct super_block *sb) { + mutex_lock(&pstore_sb_lock); + WARN_ON(pstore_sb != sb); + kill_litter_super(sb); pstore_sb = NULL; + + mutex_lock(&records_list_lock); + INIT_LIST_HEAD(&records_list); + mutex_unlock(&records_list_lock); + + mutex_unlock(&pstore_sb_lock); } static struct file_system_type pstore_fs_type = { diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 7062ea4bc57c..7fb219042f13 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -12,9 +12,18 @@ extern unsigned long kmsg_bytes; #ifdef CONFIG_PSTORE_FTRACE extern void pstore_register_ftrace(void); extern void pstore_unregister_ftrace(void); +ssize_t pstore_ftrace_combine_log(char **dest_log, size_t *dest_log_size, + const char *src_log, size_t src_log_size); #else static inline void pstore_register_ftrace(void) {} static inline void pstore_unregister_ftrace(void) {} +static inline ssize_t +pstore_ftrace_combine_log(char **dest_log, size_t *dest_log_size, + const char *src_log, size_t src_log_size) +{ + *dest_log_size = 0; + return 0; +} #endif #ifdef CONFIG_PSTORE_PMSG @@ -31,9 +40,9 @@ extern void pstore_set_kmsg_bytes(int); extern void pstore_get_records(int); extern void pstore_get_backend_records(struct pstore_info *psi, struct dentry *root, int quiet); +extern int pstore_put_backend_records(struct pstore_info *psi); extern int pstore_mkfile(struct dentry *root, struct pstore_record *record); -extern bool pstore_is_mounted(void); extern void pstore_record_init(struct pstore_record *record, struct pstore_info *psi); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 408277ee3cdb..a9e297eefdff 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -44,7 +44,7 @@ static int pstore_update_ms = -1; module_param_named(update_ms, pstore_update_ms, int, 0600); MODULE_PARM_DESC(update_ms, "milliseconds before pstore updates its content " "(default is -1, which means runtime updates are disabled; " - "enabling this option is not safe, it may lead to further " + "enabling this option may not be safe; it may lead to further " "corruption on Oopses)"); /* Names should be in the same order as the enum pstore_type_id */ @@ -69,19 +69,25 @@ static void pstore_dowork(struct work_struct *); static DECLARE_WORK(pstore_work, pstore_dowork); /* - * pstore_lock just protects "psinfo" during - * calls to pstore_register() + * psinfo_lock protects "psinfo" during calls to + * pstore_register(), pstore_unregister(), and + * the filesystem mount/unmount routines. */ -static DEFINE_SPINLOCK(pstore_lock); +static DEFINE_MUTEX(psinfo_lock); struct pstore_info *psinfo; static char *backend; +module_param(backend, charp, 0444); +MODULE_PARM_DESC(backend, "specific backend to use"); + static char *compress = #ifdef CONFIG_PSTORE_COMPRESS_DEFAULT CONFIG_PSTORE_COMPRESS_DEFAULT; #else NULL; #endif +module_param(compress, charp, 0444); +MODULE_PARM_DESC(compress, "compression to use"); /* Compression parameters */ static struct crypto_comp *tfm; @@ -129,24 +135,12 @@ enum pstore_type_id pstore_name_to_type(const char *name) } EXPORT_SYMBOL_GPL(pstore_name_to_type); -static const char *get_reason_str(enum kmsg_dump_reason reason) +static void pstore_timer_kick(void) { - switch (reason) { - case KMSG_DUMP_PANIC: - return "Panic"; - case KMSG_DUMP_OOPS: - return "Oops"; - case KMSG_DUMP_EMERG: - return "Emergency"; - case KMSG_DUMP_RESTART: - return "Restart"; - case KMSG_DUMP_HALT: - return "Halt"; - case KMSG_DUMP_POWEROFF: - return "Poweroff"; - default: - return "Unknown"; - } + if (pstore_update_ms < 0) + return; + + mod_timer(&pstore_timer, jiffies + msecs_to_jiffies(pstore_update_ms)); } /* @@ -393,7 +387,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, unsigned int part = 1; int ret; - why = get_reason_str(reason); + why = kmsg_dump_reason_str(reason); if (down_trylock(&psinfo->buf_lock)) { /* Failed to acquire lock: give up if we cannot wait. */ @@ -459,8 +453,10 @@ static void pstore_dump(struct kmsg_dumper *dumper, } ret = psinfo->write(&record); - if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted()) + if (ret == 0 && reason == KMSG_DUMP_OOPS) { pstore_new_entry = 1; + pstore_timer_kick(); + } total += record.size; part++; @@ -503,14 +499,20 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c) } static struct console pstore_console = { - .name = "pstore", .write = pstore_console_write, - .flags = CON_PRINTBUFFER | CON_ENABLED | CON_ANYTIME, .index = -1, }; static void pstore_register_console(void) { + /* Show which backend is going to get console writes. */ + strscpy(pstore_console.name, psinfo->name, + sizeof(pstore_console.name)); + /* + * Always initialize flags here since prior unregister_console() + * calls may have changed settings (specifically CON_ENABLED). + */ + pstore_console.flags = CON_PRINTBUFFER | CON_ENABLED | CON_ANYTIME; register_console(&pstore_console); } @@ -555,8 +557,6 @@ out: */ int pstore_register(struct pstore_info *psi) { - struct module *owner = psi->owner; - if (backend && strcmp(backend, psi->name)) { pr_warn("ignoring unexpected backend '%s'\n", psi->name); return -EPERM; @@ -576,11 +576,11 @@ int pstore_register(struct pstore_info *psi) return -EINVAL; } - spin_lock(&pstore_lock); + mutex_lock(&psinfo_lock); if (psinfo) { pr_warn("backend '%s' already loaded: ignoring '%s'\n", psinfo->name, psi->name); - spin_unlock(&pstore_lock); + mutex_unlock(&psinfo_lock); return -EBUSY; } @@ -589,21 +589,16 @@ int pstore_register(struct pstore_info *psi) psinfo = psi; mutex_init(&psinfo->read_mutex); sema_init(&psinfo->buf_lock, 1); - spin_unlock(&pstore_lock); - - if (owner && !try_module_get(owner)) { - psinfo = NULL; - return -EINVAL; - } if (psi->flags & PSTORE_FLAGS_DMESG) allocate_buf_for_compression(); - if (pstore_is_mounted()) - pstore_get_records(0); + pstore_get_records(0); - if (psi->flags & PSTORE_FLAGS_DMESG) + if (psi->flags & PSTORE_FLAGS_DMESG) { + pstore_dumper.max_reason = psinfo->max_reason; pstore_register_kmsg(); + } if (psi->flags & PSTORE_FLAGS_CONSOLE) pstore_register_console(); if (psi->flags & PSTORE_FLAGS_FTRACE) @@ -612,33 +607,36 @@ int pstore_register(struct pstore_info *psi) pstore_register_pmsg(); /* Start watching for new records, if desired. */ - if (pstore_update_ms >= 0) { - pstore_timer.expires = jiffies + - msecs_to_jiffies(pstore_update_ms); - add_timer(&pstore_timer); - } + pstore_timer_kick(); /* * Update the module parameter backend, so it is visible * through /sys/module/pstore/parameters/backend */ - backend = psi->name; + backend = kstrdup(psi->name, GFP_KERNEL); pr_info("Registered %s as persistent store backend\n", psi->name); - module_put(owner); - + mutex_unlock(&psinfo_lock); return 0; } EXPORT_SYMBOL_GPL(pstore_register); void pstore_unregister(struct pstore_info *psi) { - /* Stop timer and make sure all work has finished. */ - pstore_update_ms = -1; - del_timer_sync(&pstore_timer); - flush_work(&pstore_work); + /* It's okay to unregister nothing. */ + if (!psi) + return; + + mutex_lock(&psinfo_lock); + + /* Only one backend can be registered at a time. */ + if (WARN_ON(psi != psinfo)) { + mutex_unlock(&psinfo_lock); + return; + } + /* Unregister all callbacks. */ if (psi->flags & PSTORE_FLAGS_PMSG) pstore_unregister_pmsg(); if (psi->flags & PSTORE_FLAGS_FTRACE) @@ -648,10 +646,19 @@ void pstore_unregister(struct pstore_info *psi) if (psi->flags & PSTORE_FLAGS_DMESG) pstore_unregister_kmsg(); + /* Stop timer and make sure all work has finished. */ + del_timer_sync(&pstore_timer); + flush_work(&pstore_work); + + /* Remove all backend records from filesystem tree. */ + pstore_put_backend_records(psi); + free_buf_for_compression(); psinfo = NULL; + kfree(backend); backend = NULL; + mutex_unlock(&psinfo_lock); } EXPORT_SYMBOL_GPL(pstore_unregister); @@ -788,9 +795,7 @@ static void pstore_timefunc(struct timer_list *unused) schedule_work(&pstore_work); } - if (pstore_update_ms >= 0) - mod_timer(&pstore_timer, - jiffies + msecs_to_jiffies(pstore_update_ms)); + pstore_timer_kick(); } static void __init pstore_choose_compression(void) @@ -835,11 +840,5 @@ static void __exit pstore_exit(void) } module_exit(pstore_exit) -module_param(compress, charp, 0444); -MODULE_PARM_DESC(compress, "Pstore compression to use"); - -module_param(backend, charp, 0444); -MODULE_PARM_DESC(backend, "Pstore backend to use"); - MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>"); MODULE_LICENSE("GPL"); diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 795622190c01..ca6d8a867285 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -21,6 +21,7 @@ #include <linux/pstore_ram.h> #include <linux/of.h> #include <linux/of_address.h> +#include "internal.h" #define RAMOOPS_KERNMSG_HDR "====" #define MIN_MEM_SIZE 4096UL @@ -53,22 +54,27 @@ MODULE_PARM_DESC(mem_size, "size of reserved RAM used to store oops/panic logs"); static unsigned int mem_type; -module_param(mem_type, uint, 0600); +module_param(mem_type, uint, 0400); MODULE_PARM_DESC(mem_type, "set to 1 to try to use unbuffered memory (default 0)"); -static int dump_oops = 1; -module_param(dump_oops, int, 0600); -MODULE_PARM_DESC(dump_oops, - "set to 1 to dump oopses, 0 to only dump panics (default 1)"); +static int ramoops_max_reason = -1; +module_param_named(max_reason, ramoops_max_reason, int, 0400); +MODULE_PARM_DESC(max_reason, + "maximum reason for kmsg dump (default 2: Oops and Panic) "); static int ramoops_ecc; -module_param_named(ecc, ramoops_ecc, int, 0600); +module_param_named(ecc, ramoops_ecc, int, 0400); MODULE_PARM_DESC(ramoops_ecc, "if non-zero, the option enables ECC support and specifies " "ECC buffer size in bytes (1 is a special value, means 16 " "bytes ECC)"); +static int ramoops_dump_oops = -1; +module_param_named(dump_oops, ramoops_dump_oops, int, 0400); +MODULE_PARM_DESC(dump_oops, + "(deprecated: use max_reason instead) set to 1 to dump oopses & panics, 0 to only dump panics"); + struct ramoops_context { struct persistent_ram_zone **dprzs; /* Oops dump zones */ struct persistent_ram_zone *cprz; /* Console zone */ @@ -81,7 +87,6 @@ struct ramoops_context { size_t console_size; size_t ftrace_size; size_t pmsg_size; - int dump_oops; u32 flags; struct persistent_ram_ecc_info ecc_info; unsigned int max_dump_cnt; @@ -168,58 +173,6 @@ static bool prz_ok(struct persistent_ram_zone *prz) persistent_ram_ecc_string(prz, NULL, 0)); } -static ssize_t ftrace_log_combine(struct persistent_ram_zone *dest, - struct persistent_ram_zone *src) -{ - size_t dest_size, src_size, total, dest_off, src_off; - size_t dest_idx = 0, src_idx = 0, merged_idx = 0; - void *merged_buf; - struct pstore_ftrace_record *drec, *srec, *mrec; - size_t record_size = sizeof(struct pstore_ftrace_record); - - dest_off = dest->old_log_size % record_size; - dest_size = dest->old_log_size - dest_off; - - src_off = src->old_log_size % record_size; - src_size = src->old_log_size - src_off; - - total = dest_size + src_size; - merged_buf = kmalloc(total, GFP_KERNEL); - if (!merged_buf) - return -ENOMEM; - - drec = (struct pstore_ftrace_record *)(dest->old_log + dest_off); - srec = (struct pstore_ftrace_record *)(src->old_log + src_off); - mrec = (struct pstore_ftrace_record *)(merged_buf); - - while (dest_size > 0 && src_size > 0) { - if (pstore_ftrace_read_timestamp(&drec[dest_idx]) < - pstore_ftrace_read_timestamp(&srec[src_idx])) { - mrec[merged_idx++] = drec[dest_idx++]; - dest_size -= record_size; - } else { - mrec[merged_idx++] = srec[src_idx++]; - src_size -= record_size; - } - } - - while (dest_size > 0) { - mrec[merged_idx++] = drec[dest_idx++]; - dest_size -= record_size; - } - - while (src_size > 0) { - mrec[merged_idx++] = srec[src_idx++]; - src_size -= record_size; - } - - kfree(dest->old_log); - dest->old_log = merged_buf; - dest->old_log_size = total; - - return 0; -} - static ssize_t ramoops_pstore_read(struct pstore_record *record) { ssize_t size = 0; @@ -291,7 +244,12 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record) tmp_prz->corrected_bytes += prz_next->corrected_bytes; tmp_prz->bad_blocks += prz_next->bad_blocks; - size = ftrace_log_combine(tmp_prz, prz_next); + + size = pstore_ftrace_combine_log( + &tmp_prz->old_log, + &tmp_prz->old_log_size, + prz_next->old_log, + prz_next->old_log_size); if (size) goto out; } @@ -382,16 +340,14 @@ static int notrace ramoops_pstore_write(struct pstore_record *record) return -EINVAL; /* - * Out of the various dmesg dump types, ramoops is currently designed - * to only store crash logs, rather than storing general kernel logs. + * We could filter on record->reason here if we wanted to (which + * would duplicate what happened before the "max_reason" setting + * was added), but that would defeat the purpose of a system + * changing printk.always_kmsg_dump, so instead log everything that + * the kmsg dumper sends us, since it should be doing the filtering + * based on the combination of printk.always_kmsg_dump and our + * requested "max_reason". */ - if (record->reason != KMSG_DUMP_OOPS && - record->reason != KMSG_DUMP_PANIC) - return -EINVAL; - - /* Skip Oopes when configured to do so. */ - if (record->reason == KMSG_DUMP_OOPS && !cxt->dump_oops) - return -EINVAL; /* * Explicitly only take the first part of any new crash. @@ -644,19 +600,25 @@ static int ramoops_init_prz(const char *name, return 0; } -static int ramoops_parse_dt_size(struct platform_device *pdev, - const char *propname, u32 *value) +/* Read a u32 from a dt property and make sure it's safe for an int. */ +static int ramoops_parse_dt_u32(struct platform_device *pdev, + const char *propname, + u32 default_value, u32 *value) { u32 val32 = 0; int ret; ret = of_property_read_u32(pdev->dev.of_node, propname, &val32); - if (ret < 0 && ret != -EINVAL) { + if (ret == -EINVAL) { + /* field is missing, use default value. */ + val32 = default_value; + } else if (ret < 0) { dev_err(&pdev->dev, "failed to parse property %s: %d\n", propname, ret); return ret; } + /* Sanity check our results. */ if (val32 > INT_MAX) { dev_err(&pdev->dev, "%s %u > INT_MAX\n", propname, val32); return -EOVERFLOW; @@ -687,23 +649,32 @@ static int ramoops_parse_dt(struct platform_device *pdev, pdata->mem_size = resource_size(res); pdata->mem_address = res->start; pdata->mem_type = of_property_read_bool(of_node, "unbuffered"); - pdata->dump_oops = !of_property_read_bool(of_node, "no-dump-oops"); - -#define parse_size(name, field) { \ - ret = ramoops_parse_dt_size(pdev, name, &value); \ + /* + * Setting "no-dump-oops" is deprecated and will be ignored if + * "max_reason" is also specified. + */ + if (of_property_read_bool(of_node, "no-dump-oops")) + pdata->max_reason = KMSG_DUMP_PANIC; + else + pdata->max_reason = KMSG_DUMP_OOPS; + +#define parse_u32(name, field, default_value) { \ + ret = ramoops_parse_dt_u32(pdev, name, default_value, \ + &value); \ if (ret < 0) \ return ret; \ field = value; \ } - parse_size("record-size", pdata->record_size); - parse_size("console-size", pdata->console_size); - parse_size("ftrace-size", pdata->ftrace_size); - parse_size("pmsg-size", pdata->pmsg_size); - parse_size("ecc-size", pdata->ecc_info.ecc_size); - parse_size("flags", pdata->flags); + parse_u32("record-size", pdata->record_size, 0); + parse_u32("console-size", pdata->console_size, 0); + parse_u32("ftrace-size", pdata->ftrace_size, 0); + parse_u32("pmsg-size", pdata->pmsg_size, 0); + parse_u32("ecc-size", pdata->ecc_info.ecc_size, 0); + parse_u32("flags", pdata->flags, 0); + parse_u32("max-reason", pdata->max_reason, pdata->max_reason); -#undef parse_size +#undef parse_u32 /* * Some old Chromebooks relied on the kernel setting the @@ -785,7 +756,6 @@ static int ramoops_probe(struct platform_device *pdev) cxt->console_size = pdata->console_size; cxt->ftrace_size = pdata->ftrace_size; cxt->pmsg_size = pdata->pmsg_size; - cxt->dump_oops = pdata->dump_oops; cxt->flags = pdata->flags; cxt->ecc_info = pdata->ecc_info; @@ -828,8 +798,10 @@ static int ramoops_probe(struct platform_device *pdev) * the single region size is how to check. */ cxt->pstore.flags = 0; - if (cxt->max_dump_cnt) + if (cxt->max_dump_cnt) { cxt->pstore.flags |= PSTORE_FLAGS_DMESG; + cxt->pstore.max_reason = pdata->max_reason; + } if (cxt->console_size) cxt->pstore.flags |= PSTORE_FLAGS_CONSOLE; if (cxt->max_ftrace_cnt) @@ -865,7 +837,7 @@ static int ramoops_probe(struct platform_device *pdev) mem_size = pdata->mem_size; mem_address = pdata->mem_address; record_size = pdata->record_size; - dump_oops = pdata->dump_oops; + ramoops_max_reason = pdata->max_reason; ramoops_console_size = pdata->console_size; ramoops_pmsg_size = pdata->pmsg_size; ramoops_ftrace_size = pdata->ftrace_size; @@ -948,7 +920,16 @@ static void __init ramoops_register_dummy(void) pdata.console_size = ramoops_console_size; pdata.ftrace_size = ramoops_ftrace_size; pdata.pmsg_size = ramoops_pmsg_size; - pdata.dump_oops = dump_oops; + /* If "max_reason" is set, its value has priority over "dump_oops". */ + if (ramoops_max_reason >= 0) + pdata.max_reason = ramoops_max_reason; + /* Otherwise, if "dump_oops" is set, parse it into "max_reason". */ + else if (ramoops_dump_oops != -1) + pdata.max_reason = ramoops_dump_oops ? KMSG_DUMP_OOPS + : KMSG_DUMP_PANIC; + /* And if neither are explicitly set, use the default. */ + else + pdata.max_reason = KMSG_DUMP_OOPS; pdata.flags = RAMOOPS_FLAG_FTRACE_PER_CPU; /* diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index c917c191e78c..aa8e0b65ff1a 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -283,7 +283,7 @@ static int notrace persistent_ram_update_user(struct persistent_ram_zone *prz, const void __user *s, unsigned int start, unsigned int count) { struct persistent_ram_buffer *buffer = prz->buffer; - int ret = unlikely(__copy_from_user(buffer->data + start, s, count)) ? + int ret = unlikely(copy_from_user(buffer->data + start, s, count)) ? -EFAULT : 0; persistent_ram_update_ecc(prz, start, count); return ret; @@ -348,8 +348,6 @@ int notrace persistent_ram_write_user(struct persistent_ram_zone *prz, int rem, ret = 0, c = count; size_t start; - if (unlikely(!access_ok(s, count))) - return -EFAULT; if (unlikely(c > prz->buffer_size)) { s += c - prz->buffer_size; c = prz->buffer_size; diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c new file mode 100644 index 000000000000..819428dfa32f --- /dev/null +++ b/fs/pstore/zone.c @@ -0,0 +1,1465 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Provide a pstore intermediate backend, organized into kernel memory + * allocated zones that are then mapped and flushed into a single + * contiguous region on a storage backend of some kind (block, mtd, etc). + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/mount.h> +#include <linux/printk.h> +#include <linux/fs.h> +#include <linux/pstore_zone.h> +#include <linux/kdev_t.h> +#include <linux/device.h> +#include <linux/namei.h> +#include <linux/fcntl.h> +#include <linux/uio.h> +#include <linux/writeback.h> +#include "internal.h" + +/** + * struct psz_head - header of zone to flush to storage + * + * @sig: signature to indicate header (PSZ_SIG xor PSZONE-type value) + * @datalen: length of data in @data + * @start: offset into @data where the beginning of the stored bytes begin + * @data: zone data. + */ +struct psz_buffer { +#define PSZ_SIG (0x43474244) /* DBGC */ + uint32_t sig; + atomic_t datalen; + atomic_t start; + uint8_t data[]; +}; + +/** + * struct psz_kmsg_header - kmsg dump-specific header to flush to storage + * + * @magic: magic num for kmsg dump header + * @time: kmsg dump trigger time + * @compressed: whether conpressed + * @counter: kmsg dump counter + * @reason: the kmsg dump reason (e.g. oops, panic, etc) + * @data: pointer to log data + * + * This is a sub-header for a kmsg dump, trailing after &psz_buffer. + */ +struct psz_kmsg_header { +#define PSTORE_KMSG_HEADER_MAGIC 0x4dfc3ae5 /* Just a random number */ + uint32_t magic; + struct timespec64 time; + bool compressed; + uint32_t counter; + enum kmsg_dump_reason reason; + uint8_t data[]; +}; + +/** + * struct pstore_zone - single stored buffer + * + * @off: zone offset of storage + * @type: front-end type for this zone + * @name: front-end name for this zone + * @buffer: pointer to data buffer managed by this zone + * @oldbuf: pointer to old data buffer + * @buffer_size: bytes in @buffer->data + * @should_recover: whether this zone should recover from storage + * @dirty: whether the data in @buffer dirty + * + * zone structure in memory. + */ +struct pstore_zone { + loff_t off; + const char *name; + enum pstore_type_id type; + + struct psz_buffer *buffer; + struct psz_buffer *oldbuf; + size_t buffer_size; + bool should_recover; + atomic_t dirty; +}; + +/** + * struct psz_context - all about running state of pstore/zone + * + * @kpszs: kmsg dump storage zones + * @ppsz: pmsg storage zone + * @cpsz: console storage zone + * @fpszs: ftrace storage zones + * @kmsg_max_cnt: max count of @kpszs + * @kmsg_read_cnt: counter of total read kmsg dumps + * @kmsg_write_cnt: counter of total kmsg dump writes + * @pmsg_read_cnt: counter of total read pmsg zone + * @console_read_cnt: counter of total read console zone + * @ftrace_max_cnt: max count of @fpszs + * @ftrace_read_cnt: counter of max read ftrace zone + * @oops_counter: counter of oops dumps + * @panic_counter: counter of panic dumps + * @recovered: whether finished recovering data from storage + * @on_panic: whether panic is happening + * @pstore_zone_info_lock: lock to @pstore_zone_info + * @pstore_zone_info: information from backend + * @pstore: structure for pstore + */ +struct psz_context { + struct pstore_zone **kpszs; + struct pstore_zone *ppsz; + struct pstore_zone *cpsz; + struct pstore_zone **fpszs; + unsigned int kmsg_max_cnt; + unsigned int kmsg_read_cnt; + unsigned int kmsg_write_cnt; + unsigned int pmsg_read_cnt; + unsigned int console_read_cnt; + unsigned int ftrace_max_cnt; + unsigned int ftrace_read_cnt; + /* + * These counters should be calculated during recovery. + * It records the oops/panic times after crashes rather than boots. + */ + unsigned int oops_counter; + unsigned int panic_counter; + atomic_t recovered; + atomic_t on_panic; + + /* + * pstore_zone_info_lock protects this entire structure during calls + * to register_pstore_zone()/unregister_pstore_zone(). + */ + struct mutex pstore_zone_info_lock; + struct pstore_zone_info *pstore_zone_info; + struct pstore_info pstore; +}; +static struct psz_context pstore_zone_cxt; + +static void psz_flush_all_dirty_zones(struct work_struct *); +static DECLARE_DELAYED_WORK(psz_cleaner, psz_flush_all_dirty_zones); + +/** + * enum psz_flush_mode - flush mode for psz_zone_write() + * + * @FLUSH_NONE: do not flush to storage but update data on memory + * @FLUSH_PART: just flush part of data including meta data to storage + * @FLUSH_META: just flush meta data of zone to storage + * @FLUSH_ALL: flush all of zone + */ +enum psz_flush_mode { + FLUSH_NONE = 0, + FLUSH_PART, + FLUSH_META, + FLUSH_ALL, +}; + +static inline int buffer_datalen(struct pstore_zone *zone) +{ + return atomic_read(&zone->buffer->datalen); +} + +static inline int buffer_start(struct pstore_zone *zone) +{ + return atomic_read(&zone->buffer->start); +} + +static inline bool is_on_panic(void) +{ + return atomic_read(&pstore_zone_cxt.on_panic); +} + +static ssize_t psz_zone_read_buffer(struct pstore_zone *zone, char *buf, + size_t len, unsigned long off) +{ + if (!buf || !zone || !zone->buffer) + return -EINVAL; + if (off > zone->buffer_size) + return -EINVAL; + len = min_t(size_t, len, zone->buffer_size - off); + memcpy(buf, zone->buffer->data + off, len); + return len; +} + +static int psz_zone_read_oldbuf(struct pstore_zone *zone, char *buf, + size_t len, unsigned long off) +{ + if (!buf || !zone || !zone->oldbuf) + return -EINVAL; + if (off > zone->buffer_size) + return -EINVAL; + len = min_t(size_t, len, zone->buffer_size - off); + memcpy(buf, zone->oldbuf->data + off, len); + return 0; +} + +static int psz_zone_write(struct pstore_zone *zone, + enum psz_flush_mode flush_mode, const char *buf, + size_t len, unsigned long off) +{ + struct pstore_zone_info *info = pstore_zone_cxt.pstore_zone_info; + ssize_t wcnt = 0; + ssize_t (*writeop)(const char *buf, size_t bytes, loff_t pos); + size_t wlen; + + if (off > zone->buffer_size) + return -EINVAL; + + wlen = min_t(size_t, len, zone->buffer_size - off); + if (buf && wlen) { + memcpy(zone->buffer->data + off, buf, wlen); + atomic_set(&zone->buffer->datalen, wlen + off); + } + + /* avoid to damage old records */ + if (!is_on_panic() && !atomic_read(&pstore_zone_cxt.recovered)) + goto dirty; + + writeop = is_on_panic() ? info->panic_write : info->write; + if (!writeop) + goto dirty; + + switch (flush_mode) { + case FLUSH_NONE: + if (unlikely(buf && wlen)) + goto dirty; + return 0; + case FLUSH_PART: + wcnt = writeop((const char *)zone->buffer->data + off, wlen, + zone->off + sizeof(*zone->buffer) + off); + if (wcnt != wlen) + goto dirty; + fallthrough; + case FLUSH_META: + wlen = sizeof(struct psz_buffer); + wcnt = writeop((const char *)zone->buffer, wlen, zone->off); + if (wcnt != wlen) + goto dirty; + break; + case FLUSH_ALL: + wlen = zone->buffer_size + sizeof(*zone->buffer); + wcnt = writeop((const char *)zone->buffer, wlen, zone->off); + if (wcnt != wlen) + goto dirty; + break; + } + + return 0; +dirty: + /* no need to mark dirty if going to try next zone */ + if (wcnt == -ENOMSG) + return -ENOMSG; + atomic_set(&zone->dirty, true); + /* flush dirty zones nicely */ + if (wcnt == -EBUSY && !is_on_panic()) + schedule_delayed_work(&psz_cleaner, msecs_to_jiffies(500)); + return -EBUSY; +} + +static int psz_flush_dirty_zone(struct pstore_zone *zone) +{ + int ret; + + if (unlikely(!zone)) + return -EINVAL; + + if (unlikely(!atomic_read(&pstore_zone_cxt.recovered))) + return -EBUSY; + + if (!atomic_xchg(&zone->dirty, false)) + return 0; + + ret = psz_zone_write(zone, FLUSH_ALL, NULL, 0, 0); + if (ret) + atomic_set(&zone->dirty, true); + return ret; +} + +static int psz_flush_dirty_zones(struct pstore_zone **zones, unsigned int cnt) +{ + int i, ret; + struct pstore_zone *zone; + + if (!zones) + return -EINVAL; + + for (i = 0; i < cnt; i++) { + zone = zones[i]; + if (!zone) + return -EINVAL; + ret = psz_flush_dirty_zone(zone); + if (ret) + return ret; + } + return 0; +} + +static int psz_move_zone(struct pstore_zone *old, struct pstore_zone *new) +{ + const char *data = (const char *)old->buffer->data; + int ret; + + ret = psz_zone_write(new, FLUSH_ALL, data, buffer_datalen(old), 0); + if (ret) { + atomic_set(&new->buffer->datalen, 0); + atomic_set(&new->dirty, false); + return ret; + } + atomic_set(&old->buffer->datalen, 0); + return 0; +} + +static void psz_flush_all_dirty_zones(struct work_struct *work) +{ + struct psz_context *cxt = &pstore_zone_cxt; + int ret = 0; + + if (cxt->ppsz) + ret |= psz_flush_dirty_zone(cxt->ppsz); + if (cxt->cpsz) + ret |= psz_flush_dirty_zone(cxt->cpsz); + if (cxt->kpszs) + ret |= psz_flush_dirty_zones(cxt->kpszs, cxt->kmsg_max_cnt); + if (cxt->fpszs) + ret |= psz_flush_dirty_zones(cxt->fpszs, cxt->ftrace_max_cnt); + if (ret && cxt->pstore_zone_info) + schedule_delayed_work(&psz_cleaner, msecs_to_jiffies(1000)); +} + +static int psz_kmsg_recover_data(struct psz_context *cxt) +{ + struct pstore_zone_info *info = cxt->pstore_zone_info; + struct pstore_zone *zone = NULL; + struct psz_buffer *buf; + unsigned long i; + ssize_t rcnt; + + if (!info->read) + return -EINVAL; + + for (i = 0; i < cxt->kmsg_max_cnt; i++) { + zone = cxt->kpszs[i]; + if (unlikely(!zone)) + return -EINVAL; + if (atomic_read(&zone->dirty)) { + unsigned int wcnt = cxt->kmsg_write_cnt; + struct pstore_zone *new = cxt->kpszs[wcnt]; + int ret; + + ret = psz_move_zone(zone, new); + if (ret) { + pr_err("move zone from %lu to %d failed\n", + i, wcnt); + return ret; + } + cxt->kmsg_write_cnt = (wcnt + 1) % cxt->kmsg_max_cnt; + } + if (!zone->should_recover) + continue; + buf = zone->buffer; + rcnt = info->read((char *)buf, zone->buffer_size + sizeof(*buf), + zone->off); + if (rcnt != zone->buffer_size + sizeof(*buf)) + return (int)rcnt < 0 ? (int)rcnt : -EIO; + } + return 0; +} + +static int psz_kmsg_recover_meta(struct psz_context *cxt) +{ + struct pstore_zone_info *info = cxt->pstore_zone_info; + struct pstore_zone *zone; + size_t rcnt, len; + struct psz_buffer *buf; + struct psz_kmsg_header *hdr; + struct timespec64 time = { }; + unsigned long i; + /* + * Recover may on panic, we can't allocate any memory by kmalloc. + * So, we use local array instead. + */ + char buffer_header[sizeof(*buf) + sizeof(*hdr)] = {0}; + + if (!info->read) + return -EINVAL; + + len = sizeof(*buf) + sizeof(*hdr); + buf = (struct psz_buffer *)buffer_header; + for (i = 0; i < cxt->kmsg_max_cnt; i++) { + zone = cxt->kpszs[i]; + if (unlikely(!zone)) + return -EINVAL; + + rcnt = info->read((char *)buf, len, zone->off); + if (rcnt == -ENOMSG) { + pr_debug("%s with id %lu may be broken, skip\n", + zone->name, i); + continue; + } else if (rcnt != len) { + pr_err("read %s with id %lu failed\n", zone->name, i); + return (int)rcnt < 0 ? (int)rcnt : -EIO; + } + + if (buf->sig != zone->buffer->sig) { + pr_debug("no valid data in kmsg dump zone %lu\n", i); + continue; + } + + if (zone->buffer_size < atomic_read(&buf->datalen)) { + pr_info("found overtop zone: %s: id %lu, off %lld, size %zu\n", + zone->name, i, zone->off, + zone->buffer_size); + continue; + } + + hdr = (struct psz_kmsg_header *)buf->data; + if (hdr->magic != PSTORE_KMSG_HEADER_MAGIC) { + pr_info("found invalid zone: %s: id %lu, off %lld, size %zu\n", + zone->name, i, zone->off, + zone->buffer_size); + continue; + } + + /* + * we get the newest zone, and the next one must be the oldest + * or unused zone, because we do write one by one like a circle. + */ + if (hdr->time.tv_sec >= time.tv_sec) { + time.tv_sec = hdr->time.tv_sec; + cxt->kmsg_write_cnt = (i + 1) % cxt->kmsg_max_cnt; + } + + if (hdr->reason == KMSG_DUMP_OOPS) + cxt->oops_counter = + max(cxt->oops_counter, hdr->counter); + else if (hdr->reason == KMSG_DUMP_PANIC) + cxt->panic_counter = + max(cxt->panic_counter, hdr->counter); + + if (!atomic_read(&buf->datalen)) { + pr_debug("found erased zone: %s: id %lu, off %lld, size %zu, datalen %d\n", + zone->name, i, zone->off, + zone->buffer_size, + atomic_read(&buf->datalen)); + continue; + } + + if (!is_on_panic()) + zone->should_recover = true; + pr_debug("found nice zone: %s: id %lu, off %lld, size %zu, datalen %d\n", + zone->name, i, zone->off, + zone->buffer_size, atomic_read(&buf->datalen)); + } + + return 0; +} + +static int psz_kmsg_recover(struct psz_context *cxt) +{ + int ret; + + if (!cxt->kpszs) + return 0; + + ret = psz_kmsg_recover_meta(cxt); + if (ret) + goto recover_fail; + + ret = psz_kmsg_recover_data(cxt); + if (ret) + goto recover_fail; + + return 0; +recover_fail: + pr_debug("psz_recover_kmsg failed\n"); + return ret; +} + +static int psz_recover_zone(struct psz_context *cxt, struct pstore_zone *zone) +{ + struct pstore_zone_info *info = cxt->pstore_zone_info; + struct psz_buffer *oldbuf, tmpbuf; + int ret = 0; + char *buf; + ssize_t rcnt, len, start, off; + + if (!zone || zone->oldbuf) + return 0; + + if (is_on_panic()) { + /* save data as much as possible */ + psz_flush_dirty_zone(zone); + return 0; + } + + if (unlikely(!info->read)) + return -EINVAL; + + len = sizeof(struct psz_buffer); + rcnt = info->read((char *)&tmpbuf, len, zone->off); + if (rcnt != len) { + pr_debug("read zone %s failed\n", zone->name); + return (int)rcnt < 0 ? (int)rcnt : -EIO; + } + + if (tmpbuf.sig != zone->buffer->sig) { + pr_debug("no valid data in zone %s\n", zone->name); + return 0; + } + + if (zone->buffer_size < atomic_read(&tmpbuf.datalen) || + zone->buffer_size < atomic_read(&tmpbuf.start)) { + pr_info("found overtop zone: %s: off %lld, size %zu\n", + zone->name, zone->off, zone->buffer_size); + /* just keep going */ + return 0; + } + + if (!atomic_read(&tmpbuf.datalen)) { + pr_debug("found erased zone: %s: off %lld, size %zu, datalen %d\n", + zone->name, zone->off, zone->buffer_size, + atomic_read(&tmpbuf.datalen)); + return 0; + } + + pr_debug("found nice zone: %s: off %lld, size %zu, datalen %d\n", + zone->name, zone->off, zone->buffer_size, + atomic_read(&tmpbuf.datalen)); + + len = atomic_read(&tmpbuf.datalen) + sizeof(*oldbuf); + oldbuf = kzalloc(len, GFP_KERNEL); + if (!oldbuf) + return -ENOMEM; + + memcpy(oldbuf, &tmpbuf, sizeof(*oldbuf)); + buf = (char *)oldbuf + sizeof(*oldbuf); + len = atomic_read(&oldbuf->datalen); + start = atomic_read(&oldbuf->start); + off = zone->off + sizeof(*oldbuf); + + /* get part of data */ + rcnt = info->read(buf, len - start, off + start); + if (rcnt != len - start) { + pr_err("read zone %s failed\n", zone->name); + ret = (int)rcnt < 0 ? (int)rcnt : -EIO; + goto free_oldbuf; + } + + /* get the rest of data */ + rcnt = info->read(buf + len - start, start, off); + if (rcnt != start) { + pr_err("read zone %s failed\n", zone->name); + ret = (int)rcnt < 0 ? (int)rcnt : -EIO; + goto free_oldbuf; + } + + zone->oldbuf = oldbuf; + psz_flush_dirty_zone(zone); + return 0; + +free_oldbuf: + kfree(oldbuf); + return ret; +} + +static int psz_recover_zones(struct psz_context *cxt, + struct pstore_zone **zones, unsigned int cnt) +{ + int ret; + unsigned int i; + struct pstore_zone *zone; + + if (!zones) + return 0; + + for (i = 0; i < cnt; i++) { + zone = zones[i]; + if (unlikely(!zone)) + continue; + ret = psz_recover_zone(cxt, zone); + if (ret) + goto recover_fail; + } + + return 0; +recover_fail: + pr_debug("recover %s[%u] failed\n", zone->name, i); + return ret; +} + +/** + * psz_recovery() - recover data from storage + * @cxt: the context of pstore/zone + * + * recovery means reading data back from storage after rebooting + * + * Return: 0 on success, others on failure. + */ +static inline int psz_recovery(struct psz_context *cxt) +{ + int ret; + + if (atomic_read(&cxt->recovered)) + return 0; + + ret = psz_kmsg_recover(cxt); + if (ret) + goto out; + + ret = psz_recover_zone(cxt, cxt->ppsz); + if (ret) + goto out; + + ret = psz_recover_zone(cxt, cxt->cpsz); + if (ret) + goto out; + + ret = psz_recover_zones(cxt, cxt->fpszs, cxt->ftrace_max_cnt); + +out: + if (unlikely(ret)) + pr_err("recover failed\n"); + else { + pr_debug("recover end!\n"); + atomic_set(&cxt->recovered, 1); + } + return ret; +} + +static int psz_pstore_open(struct pstore_info *psi) +{ + struct psz_context *cxt = psi->data; + + cxt->kmsg_read_cnt = 0; + cxt->pmsg_read_cnt = 0; + cxt->console_read_cnt = 0; + cxt->ftrace_read_cnt = 0; + return 0; +} + +static inline bool psz_old_ok(struct pstore_zone *zone) +{ + if (zone && zone->oldbuf && atomic_read(&zone->oldbuf->datalen)) + return true; + return false; +} + +static inline bool psz_ok(struct pstore_zone *zone) +{ + if (zone && zone->buffer && buffer_datalen(zone)) + return true; + return false; +} + +static inline int psz_kmsg_erase(struct psz_context *cxt, + struct pstore_zone *zone, struct pstore_record *record) +{ + struct psz_buffer *buffer = zone->buffer; + struct psz_kmsg_header *hdr = + (struct psz_kmsg_header *)buffer->data; + size_t size; + + if (unlikely(!psz_ok(zone))) + return 0; + + /* this zone is already updated, no need to erase */ + if (record->count != hdr->counter) + return 0; + + size = buffer_datalen(zone) + sizeof(*zone->buffer); + atomic_set(&zone->buffer->datalen, 0); + if (cxt->pstore_zone_info->erase) + return cxt->pstore_zone_info->erase(size, zone->off); + else + return psz_zone_write(zone, FLUSH_META, NULL, 0, 0); +} + +static inline int psz_record_erase(struct psz_context *cxt, + struct pstore_zone *zone) +{ + if (unlikely(!psz_old_ok(zone))) + return 0; + + kfree(zone->oldbuf); + zone->oldbuf = NULL; + /* + * if there are new data in zone buffer, that means the old data + * are already invalid. It is no need to flush 0 (erase) to + * block device. + */ + if (!buffer_datalen(zone)) + return psz_zone_write(zone, FLUSH_META, NULL, 0, 0); + psz_flush_dirty_zone(zone); + return 0; +} + +static int psz_pstore_erase(struct pstore_record *record) +{ + struct psz_context *cxt = record->psi->data; + + switch (record->type) { + case PSTORE_TYPE_DMESG: + if (record->id >= cxt->kmsg_max_cnt) + return -EINVAL; + return psz_kmsg_erase(cxt, cxt->kpszs[record->id], record); + case PSTORE_TYPE_PMSG: + return psz_record_erase(cxt, cxt->ppsz); + case PSTORE_TYPE_CONSOLE: + return psz_record_erase(cxt, cxt->cpsz); + case PSTORE_TYPE_FTRACE: + if (record->id >= cxt->ftrace_max_cnt) + return -EINVAL; + return psz_record_erase(cxt, cxt->fpszs[record->id]); + default: return -EINVAL; + } +} + +static void psz_write_kmsg_hdr(struct pstore_zone *zone, + struct pstore_record *record) +{ + struct psz_context *cxt = record->psi->data; + struct psz_buffer *buffer = zone->buffer; + struct psz_kmsg_header *hdr = + (struct psz_kmsg_header *)buffer->data; + + hdr->magic = PSTORE_KMSG_HEADER_MAGIC; + hdr->compressed = record->compressed; + hdr->time.tv_sec = record->time.tv_sec; + hdr->time.tv_nsec = record->time.tv_nsec; + hdr->reason = record->reason; + if (hdr->reason == KMSG_DUMP_OOPS) + hdr->counter = ++cxt->oops_counter; + else if (hdr->reason == KMSG_DUMP_PANIC) + hdr->counter = ++cxt->panic_counter; + else + hdr->counter = 0; +} + +/* + * In case zone is broken, which may occur to MTD device, we try each zones, + * start at cxt->kmsg_write_cnt. + */ +static inline int notrace psz_kmsg_write_record(struct psz_context *cxt, + struct pstore_record *record) +{ + size_t size, hlen; + struct pstore_zone *zone; + unsigned int i; + + for (i = 0; i < cxt->kmsg_max_cnt; i++) { + unsigned int zonenum, len; + int ret; + + zonenum = (cxt->kmsg_write_cnt + i) % cxt->kmsg_max_cnt; + zone = cxt->kpszs[zonenum]; + if (unlikely(!zone)) + return -ENOSPC; + + /* avoid destroying old data, allocate a new one */ + len = zone->buffer_size + sizeof(*zone->buffer); + zone->oldbuf = zone->buffer; + zone->buffer = kzalloc(len, GFP_KERNEL); + if (!zone->buffer) { + zone->buffer = zone->oldbuf; + return -ENOMEM; + } + zone->buffer->sig = zone->oldbuf->sig; + + pr_debug("write %s to zone id %d\n", zone->name, zonenum); + psz_write_kmsg_hdr(zone, record); + hlen = sizeof(struct psz_kmsg_header); + size = min_t(size_t, record->size, zone->buffer_size - hlen); + ret = psz_zone_write(zone, FLUSH_ALL, record->buf, size, hlen); + if (likely(!ret || ret != -ENOMSG)) { + cxt->kmsg_write_cnt = zonenum + 1; + cxt->kmsg_write_cnt %= cxt->kmsg_max_cnt; + /* no need to try next zone, free last zone buffer */ + kfree(zone->oldbuf); + zone->oldbuf = NULL; + return ret; + } + + pr_debug("zone %u may be broken, try next dmesg zone\n", + zonenum); + kfree(zone->buffer); + zone->buffer = zone->oldbuf; + zone->oldbuf = NULL; + } + + return -EBUSY; +} + +static int notrace psz_kmsg_write(struct psz_context *cxt, + struct pstore_record *record) +{ + int ret; + + /* + * Explicitly only take the first part of any new crash. + * If our buffer is larger than kmsg_bytes, this can never happen, + * and if our buffer is smaller than kmsg_bytes, we don't want the + * report split across multiple records. + */ + if (record->part != 1) + return -ENOSPC; + + if (!cxt->kpszs) + return -ENOSPC; + + ret = psz_kmsg_write_record(cxt, record); + if (!ret && is_on_panic()) { + /* ensure all data are flushed to storage when panic */ + pr_debug("try to flush other dirty zones\n"); + psz_flush_all_dirty_zones(NULL); + } + + /* always return 0 as we had handled it on buffer */ + return 0; +} + +static int notrace psz_record_write(struct pstore_zone *zone, + struct pstore_record *record) +{ + size_t start, rem; + bool is_full_data = false; + char *buf; + int cnt; + + if (!zone || !record) + return -ENOSPC; + + if (atomic_read(&zone->buffer->datalen) >= zone->buffer_size) + is_full_data = true; + + cnt = record->size; + buf = record->buf; + if (unlikely(cnt > zone->buffer_size)) { + buf += cnt - zone->buffer_size; + cnt = zone->buffer_size; + } + + start = buffer_start(zone); + rem = zone->buffer_size - start; + if (unlikely(rem < cnt)) { + psz_zone_write(zone, FLUSH_PART, buf, rem, start); + buf += rem; + cnt -= rem; + start = 0; + is_full_data = true; + } + + atomic_set(&zone->buffer->start, cnt + start); + psz_zone_write(zone, FLUSH_PART, buf, cnt, start); + + /** + * psz_zone_write will set datalen as start + cnt. + * It work if actual data length lesser than buffer size. + * If data length greater than buffer size, pmsg will rewrite to + * beginning of zone, which make buffer->datalen wrongly. + * So we should reset datalen as buffer size once actual data length + * greater than buffer size. + */ + if (is_full_data) { + atomic_set(&zone->buffer->datalen, zone->buffer_size); + psz_zone_write(zone, FLUSH_META, NULL, 0, 0); + } + return 0; +} + +static int notrace psz_pstore_write(struct pstore_record *record) +{ + struct psz_context *cxt = record->psi->data; + + if (record->type == PSTORE_TYPE_DMESG && + record->reason == KMSG_DUMP_PANIC) + atomic_set(&cxt->on_panic, 1); + + /* + * if on panic, do not write except panic records + * Fix case that panic_write prints log which wakes up console backend. + */ + if (is_on_panic() && record->type != PSTORE_TYPE_DMESG) + return -EBUSY; + + switch (record->type) { + case PSTORE_TYPE_DMESG: + return psz_kmsg_write(cxt, record); + case PSTORE_TYPE_CONSOLE: + return psz_record_write(cxt->cpsz, record); + case PSTORE_TYPE_PMSG: + return psz_record_write(cxt->ppsz, record); + case PSTORE_TYPE_FTRACE: { + int zonenum = smp_processor_id(); + + if (!cxt->fpszs) + return -ENOSPC; + return psz_record_write(cxt->fpszs[zonenum], record); + } + default: + return -EINVAL; + } +} + +static struct pstore_zone *psz_read_next_zone(struct psz_context *cxt) +{ + struct pstore_zone *zone = NULL; + + while (cxt->kmsg_read_cnt < cxt->kmsg_max_cnt) { + zone = cxt->kpszs[cxt->kmsg_read_cnt++]; + if (psz_ok(zone)) + return zone; + } + + if (cxt->ftrace_read_cnt < cxt->ftrace_max_cnt) + /* + * No need psz_old_ok(). Let psz_ftrace_read() do so for + * combination. psz_ftrace_read() should traverse over + * all zones in case of some zone without data. + */ + return cxt->fpszs[cxt->ftrace_read_cnt++]; + + if (cxt->pmsg_read_cnt == 0) { + cxt->pmsg_read_cnt++; + zone = cxt->ppsz; + if (psz_old_ok(zone)) + return zone; + } + + if (cxt->console_read_cnt == 0) { + cxt->console_read_cnt++; + zone = cxt->cpsz; + if (psz_old_ok(zone)) + return zone; + } + + return NULL; +} + +static int psz_kmsg_read_hdr(struct pstore_zone *zone, + struct pstore_record *record) +{ + struct psz_buffer *buffer = zone->buffer; + struct psz_kmsg_header *hdr = + (struct psz_kmsg_header *)buffer->data; + + if (hdr->magic != PSTORE_KMSG_HEADER_MAGIC) + return -EINVAL; + record->compressed = hdr->compressed; + record->time.tv_sec = hdr->time.tv_sec; + record->time.tv_nsec = hdr->time.tv_nsec; + record->reason = hdr->reason; + record->count = hdr->counter; + return 0; +} + +static ssize_t psz_kmsg_read(struct pstore_zone *zone, + struct pstore_record *record) +{ + ssize_t size, hlen = 0; + + size = buffer_datalen(zone); + /* Clear and skip this kmsg dump record if it has no valid header */ + if (psz_kmsg_read_hdr(zone, record)) { + atomic_set(&zone->buffer->datalen, 0); + atomic_set(&zone->dirty, 0); + return -ENOMSG; + } + size -= sizeof(struct psz_kmsg_header); + + if (!record->compressed) { + char *buf = kasprintf(GFP_KERNEL, "%s: Total %d times\n", + kmsg_dump_reason_str(record->reason), + record->count); + hlen = strlen(buf); + record->buf = krealloc(buf, hlen + size, GFP_KERNEL); + if (!record->buf) { + kfree(buf); + return -ENOMEM; + } + } else { + record->buf = kmalloc(size, GFP_KERNEL); + if (!record->buf) + return -ENOMEM; + } + + size = psz_zone_read_buffer(zone, record->buf + hlen, size, + sizeof(struct psz_kmsg_header)); + if (unlikely(size < 0)) { + kfree(record->buf); + return -ENOMSG; + } + + return size + hlen; +} + +/* try to combine all ftrace zones */ +static ssize_t psz_ftrace_read(struct pstore_zone *zone, + struct pstore_record *record) +{ + struct psz_context *cxt; + struct psz_buffer *buf; + int ret; + + if (!zone || !record) + return -ENOSPC; + + if (!psz_old_ok(zone)) + goto out; + + buf = (struct psz_buffer *)zone->oldbuf; + if (!buf) + return -ENOMSG; + + ret = pstore_ftrace_combine_log(&record->buf, &record->size, + (char *)buf->data, atomic_read(&buf->datalen)); + if (unlikely(ret)) + return ret; + +out: + cxt = record->psi->data; + if (cxt->ftrace_read_cnt < cxt->ftrace_max_cnt) + /* then, read next ftrace zone */ + return -ENOMSG; + record->id = 0; + return record->size ? record->size : -ENOMSG; +} + +static ssize_t psz_record_read(struct pstore_zone *zone, + struct pstore_record *record) +{ + size_t len; + struct psz_buffer *buf; + + if (!zone || !record) + return -ENOSPC; + + buf = (struct psz_buffer *)zone->oldbuf; + if (!buf) + return -ENOMSG; + + len = atomic_read(&buf->datalen); + record->buf = kmalloc(len, GFP_KERNEL); + if (!record->buf) + return -ENOMEM; + + if (unlikely(psz_zone_read_oldbuf(zone, record->buf, len, 0))) { + kfree(record->buf); + return -ENOMSG; + } + + return len; +} + +static ssize_t psz_pstore_read(struct pstore_record *record) +{ + struct psz_context *cxt = record->psi->data; + ssize_t (*readop)(struct pstore_zone *zone, + struct pstore_record *record); + struct pstore_zone *zone; + ssize_t ret; + + /* before read, we must recover from storage */ + ret = psz_recovery(cxt); + if (ret) + return ret; + +next_zone: + zone = psz_read_next_zone(cxt); + if (!zone) + return 0; + + record->type = zone->type; + switch (record->type) { + case PSTORE_TYPE_DMESG: + readop = psz_kmsg_read; + record->id = cxt->kmsg_read_cnt - 1; + break; + case PSTORE_TYPE_FTRACE: + readop = psz_ftrace_read; + break; + case PSTORE_TYPE_CONSOLE: + fallthrough; + case PSTORE_TYPE_PMSG: + readop = psz_record_read; + break; + default: + goto next_zone; + } + + ret = readop(zone, record); + if (ret == -ENOMSG) + goto next_zone; + return ret; +} + +static struct psz_context pstore_zone_cxt = { + .pstore_zone_info_lock = + __MUTEX_INITIALIZER(pstore_zone_cxt.pstore_zone_info_lock), + .recovered = ATOMIC_INIT(0), + .on_panic = ATOMIC_INIT(0), + .pstore = { + .owner = THIS_MODULE, + .open = psz_pstore_open, + .read = psz_pstore_read, + .write = psz_pstore_write, + .erase = psz_pstore_erase, + }, +}; + +static void psz_free_zone(struct pstore_zone **pszone) +{ + struct pstore_zone *zone = *pszone; + + if (!zone) + return; + + kfree(zone->buffer); + kfree(zone); + *pszone = NULL; +} + +static void psz_free_zones(struct pstore_zone ***pszones, unsigned int *cnt) +{ + struct pstore_zone **zones = *pszones; + + if (!zones) + return; + + while (*cnt > 0) { + (*cnt)--; + psz_free_zone(&(zones[*cnt])); + } + kfree(zones); + *pszones = NULL; +} + +static void psz_free_all_zones(struct psz_context *cxt) +{ + if (cxt->kpszs) + psz_free_zones(&cxt->kpszs, &cxt->kmsg_max_cnt); + if (cxt->ppsz) + psz_free_zone(&cxt->ppsz); + if (cxt->cpsz) + psz_free_zone(&cxt->cpsz); + if (cxt->fpszs) + psz_free_zones(&cxt->fpszs, &cxt->ftrace_max_cnt); +} + +static struct pstore_zone *psz_init_zone(enum pstore_type_id type, + loff_t *off, size_t size) +{ + struct pstore_zone_info *info = pstore_zone_cxt.pstore_zone_info; + struct pstore_zone *zone; + const char *name = pstore_type_to_name(type); + + if (!size) + return NULL; + + if (*off + size > info->total_size) { + pr_err("no room for %s (0x%zx@0x%llx over 0x%lx)\n", + name, size, *off, info->total_size); + return ERR_PTR(-ENOMEM); + } + + zone = kzalloc(sizeof(struct pstore_zone), GFP_KERNEL); + if (!zone) + return ERR_PTR(-ENOMEM); + + zone->buffer = kmalloc(size, GFP_KERNEL); + if (!zone->buffer) { + kfree(zone); + return ERR_PTR(-ENOMEM); + } + memset(zone->buffer, 0xFF, size); + zone->off = *off; + zone->name = name; + zone->type = type; + zone->buffer_size = size - sizeof(struct psz_buffer); + zone->buffer->sig = type ^ PSZ_SIG; + zone->oldbuf = NULL; + atomic_set(&zone->dirty, 0); + atomic_set(&zone->buffer->datalen, 0); + atomic_set(&zone->buffer->start, 0); + + *off += size; + + pr_debug("pszone %s: off 0x%llx, %zu header, %zu data\n", zone->name, + zone->off, sizeof(*zone->buffer), zone->buffer_size); + return zone; +} + +static struct pstore_zone **psz_init_zones(enum pstore_type_id type, + loff_t *off, size_t total_size, ssize_t record_size, + unsigned int *cnt) +{ + struct pstore_zone_info *info = pstore_zone_cxt.pstore_zone_info; + struct pstore_zone **zones, *zone; + const char *name = pstore_type_to_name(type); + int c, i; + + *cnt = 0; + if (!total_size || !record_size) + return NULL; + + if (*off + total_size > info->total_size) { + pr_err("no room for zones %s (0x%zx@0x%llx over 0x%lx)\n", + name, total_size, *off, info->total_size); + return ERR_PTR(-ENOMEM); + } + + c = total_size / record_size; + zones = kcalloc(c, sizeof(*zones), GFP_KERNEL); + if (!zones) { + pr_err("allocate for zones %s failed\n", name); + return ERR_PTR(-ENOMEM); + } + memset(zones, 0, c * sizeof(*zones)); + + for (i = 0; i < c; i++) { + zone = psz_init_zone(type, off, record_size); + if (!zone || IS_ERR(zone)) { + pr_err("initialize zones %s failed\n", name); + psz_free_zones(&zones, &i); + return (void *)zone; + } + zones[i] = zone; + } + + *cnt = c; + return zones; +} + +static int psz_alloc_zones(struct psz_context *cxt) +{ + struct pstore_zone_info *info = cxt->pstore_zone_info; + loff_t off = 0; + int err; + size_t off_size = 0; + + off_size += info->pmsg_size; + cxt->ppsz = psz_init_zone(PSTORE_TYPE_PMSG, &off, info->pmsg_size); + if (IS_ERR(cxt->ppsz)) { + err = PTR_ERR(cxt->ppsz); + cxt->ppsz = NULL; + goto free_out; + } + + off_size += info->console_size; + cxt->cpsz = psz_init_zone(PSTORE_TYPE_CONSOLE, &off, + info->console_size); + if (IS_ERR(cxt->cpsz)) { + err = PTR_ERR(cxt->cpsz); + cxt->cpsz = NULL; + goto free_out; + } + + off_size += info->ftrace_size; + cxt->fpszs = psz_init_zones(PSTORE_TYPE_FTRACE, &off, + info->ftrace_size, + info->ftrace_size / nr_cpu_ids, + &cxt->ftrace_max_cnt); + if (IS_ERR(cxt->fpszs)) { + err = PTR_ERR(cxt->fpszs); + cxt->fpszs = NULL; + goto free_out; + } + + cxt->kpszs = psz_init_zones(PSTORE_TYPE_DMESG, &off, + info->total_size - off_size, + info->kmsg_size, &cxt->kmsg_max_cnt); + if (IS_ERR(cxt->kpszs)) { + err = PTR_ERR(cxt->kpszs); + cxt->kpszs = NULL; + goto free_out; + } + + return 0; +free_out: + psz_free_all_zones(cxt); + return err; +} + +/** + * register_pstore_zone() - register to pstore/zone + * + * @info: back-end driver information. See &struct pstore_zone_info. + * + * Only one back-end at one time. + * + * Return: 0 on success, others on failure. + */ +int register_pstore_zone(struct pstore_zone_info *info) +{ + int err = -EINVAL; + struct psz_context *cxt = &pstore_zone_cxt; + + if (info->total_size < 4096) { + pr_warn("total_size must be >= 4096\n"); + return -EINVAL; + } + + if (!info->kmsg_size && !info->pmsg_size && !info->console_size && + !info->ftrace_size) { + pr_warn("at least one record size must be non-zero\n"); + return -EINVAL; + } + + if (!info->name || !info->name[0]) + return -EINVAL; + +#define check_size(name, size) { \ + if (info->name > 0 && info->name < (size)) { \ + pr_err(#name " must be over %d\n", (size)); \ + return -EINVAL; \ + } \ + if (info->name & (size - 1)) { \ + pr_err(#name " must be a multiple of %d\n", \ + (size)); \ + return -EINVAL; \ + } \ + } + + check_size(total_size, 4096); + check_size(kmsg_size, SECTOR_SIZE); + check_size(pmsg_size, SECTOR_SIZE); + check_size(console_size, SECTOR_SIZE); + check_size(ftrace_size, SECTOR_SIZE); + +#undef check_size + + /* + * the @read and @write must be applied. + * if no @read, pstore may mount failed. + * if no @write, pstore do not support to remove record file. + */ + if (!info->read || !info->write) { + pr_err("no valid general read/write interface\n"); + return -EINVAL; + } + + mutex_lock(&cxt->pstore_zone_info_lock); + if (cxt->pstore_zone_info) { + pr_warn("'%s' already loaded: ignoring '%s'\n", + cxt->pstore_zone_info->name, info->name); + mutex_unlock(&cxt->pstore_zone_info_lock); + return -EBUSY; + } + cxt->pstore_zone_info = info; + + pr_debug("register %s with properties:\n", info->name); + pr_debug("\ttotal size : %ld Bytes\n", info->total_size); + pr_debug("\tkmsg size : %ld Bytes\n", info->kmsg_size); + pr_debug("\tpmsg size : %ld Bytes\n", info->pmsg_size); + pr_debug("\tconsole size : %ld Bytes\n", info->console_size); + pr_debug("\tftrace size : %ld Bytes\n", info->ftrace_size); + + err = psz_alloc_zones(cxt); + if (err) { + pr_err("alloc zones failed\n"); + goto fail_out; + } + + if (info->kmsg_size) { + cxt->pstore.bufsize = cxt->kpszs[0]->buffer_size - + sizeof(struct psz_kmsg_header); + cxt->pstore.buf = kzalloc(cxt->pstore.bufsize, GFP_KERNEL); + if (!cxt->pstore.buf) { + err = -ENOMEM; + goto fail_free; + } + } + cxt->pstore.data = cxt; + + pr_info("registered %s as backend for", info->name); + cxt->pstore.max_reason = info->max_reason; + cxt->pstore.name = info->name; + if (info->kmsg_size) { + cxt->pstore.flags |= PSTORE_FLAGS_DMESG; + pr_cont(" kmsg(%s", + kmsg_dump_reason_str(cxt->pstore.max_reason)); + if (cxt->pstore_zone_info->panic_write) + pr_cont(",panic_write"); + pr_cont(")"); + } + if (info->pmsg_size) { + cxt->pstore.flags |= PSTORE_FLAGS_PMSG; + pr_cont(" pmsg"); + } + if (info->console_size) { + cxt->pstore.flags |= PSTORE_FLAGS_CONSOLE; + pr_cont(" console"); + } + if (info->ftrace_size) { + cxt->pstore.flags |= PSTORE_FLAGS_FTRACE; + pr_cont(" ftrace"); + } + pr_cont("\n"); + + err = pstore_register(&cxt->pstore); + if (err) { + pr_err("registering with pstore failed\n"); + goto fail_free; + } + mutex_unlock(&pstore_zone_cxt.pstore_zone_info_lock); + + return 0; + +fail_free: + kfree(cxt->pstore.buf); + cxt->pstore.buf = NULL; + cxt->pstore.bufsize = 0; + psz_free_all_zones(cxt); +fail_out: + pstore_zone_cxt.pstore_zone_info = NULL; + mutex_unlock(&pstore_zone_cxt.pstore_zone_info_lock); + return err; +} +EXPORT_SYMBOL_GPL(register_pstore_zone); + +/** + * unregister_pstore_zone() - unregister to pstore/zone + * + * @info: back-end driver information. See struct pstore_zone_info. + */ +void unregister_pstore_zone(struct pstore_zone_info *info) +{ + struct psz_context *cxt = &pstore_zone_cxt; + + mutex_lock(&cxt->pstore_zone_info_lock); + if (!cxt->pstore_zone_info) { + mutex_unlock(&cxt->pstore_zone_info_lock); + return; + } + + /* Stop incoming writes from pstore. */ + pstore_unregister(&cxt->pstore); + + /* Flush any pending writes. */ + psz_flush_all_dirty_zones(NULL); + flush_delayed_work(&psz_cleaner); + + /* Clean up allocations. */ + kfree(cxt->pstore.buf); + cxt->pstore.buf = NULL; + cxt->pstore.bufsize = 0; + cxt->pstore_zone_info = NULL; + + psz_free_all_zones(cxt); + + /* Clear counters and zone state. */ + cxt->oops_counter = 0; + cxt->panic_counter = 0; + atomic_set(&cxt->recovered, 0); + atomic_set(&cxt->on_panic, 0); + + mutex_unlock(&cxt->pstore_zone_info_lock); +} +EXPORT_SYMBOL_GPL(unregister_pstore_zone); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("WeiXiong Liao <liaoweixiong@allwinnertech.com>"); +MODULE_AUTHOR("Kees Cook <keescook@chromium.org>"); +MODULE_DESCRIPTION("Storage Manager for pstore/blk"); diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 345db56c98fd..755293c8c71a 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -99,10 +99,9 @@ static int qnx6_readpage(struct file *file, struct page *page) return mpage_readpage(page, qnx6_get_block); } -static int qnx6_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void qnx6_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, qnx6_get_block); + mpage_readahead(rac, qnx6_get_block); } /* @@ -499,7 +498,7 @@ static sector_t qnx6_bmap(struct address_space *mapping, sector_t block) } static const struct address_space_operations qnx6_aops = { .readpage = qnx6_readpage, - .readpages = qnx6_readpages, + .readahead = qnx6_readahead, .bmap = qnx6_bmap }; diff --git a/fs/readdir.c b/fs/readdir.c index de2eceffdee8..a49f07c11cfb 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -157,17 +157,18 @@ static int fillonedir(struct dir_context *ctx, const char *name, int namlen, } buf->result++; dirent = buf->dirent; - if (!access_ok(dirent, + if (!user_write_access_begin(dirent, (unsigned long)(dirent->d_name + namlen + 1) - (unsigned long)dirent)) goto efault; - if ( __put_user(d_ino, &dirent->d_ino) || - __put_user(offset, &dirent->d_offset) || - __put_user(namlen, &dirent->d_namlen) || - __copy_to_user(dirent->d_name, name, namlen) || - __put_user(0, dirent->d_name + namlen)) - goto efault; + unsafe_put_user(d_ino, &dirent->d_ino, efault_end); + unsafe_put_user(offset, &dirent->d_offset, efault_end); + unsafe_put_user(namlen, &dirent->d_namlen, efault_end); + unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); + user_write_access_end(); return 0; +efault_end: + user_write_access_end(); efault: buf->result = -EFAULT; return -EFAULT; @@ -242,7 +243,7 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen, return -EINTR; dirent = buf->current_dir; prev = (void __user *) dirent - prev_reclen; - if (!user_access_begin(prev, reclen + prev_reclen)) + if (!user_write_access_begin(prev, reclen + prev_reclen)) goto efault; /* This might be 'dirent->d_off', but if so it will get overwritten */ @@ -251,14 +252,14 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen, unsafe_put_user(reclen, &dirent->d_reclen, efault_end); unsafe_put_user(d_type, (char __user *) dirent + reclen - 1, efault_end); unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); - user_access_end(); + user_write_access_end(); buf->current_dir = (void __user *)dirent + reclen; buf->prev_reclen = reclen; buf->count -= reclen; return 0; efault_end: - user_access_end(); + user_write_access_end(); efault: buf->error = -EFAULT; return -EFAULT; @@ -275,9 +276,6 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd, }; int error; - if (!access_ok(dirent, count)) - return -EFAULT; - f = fdget_pos(fd); if (!f.file) return -EBADF; @@ -327,7 +325,7 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen, return -EINTR; dirent = buf->current_dir; prev = (void __user *)dirent - prev_reclen; - if (!user_access_begin(prev, reclen + prev_reclen)) + if (!user_write_access_begin(prev, reclen + prev_reclen)) goto efault; /* This might be 'dirent->d_off', but if so it will get overwritten */ @@ -336,7 +334,7 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen, unsafe_put_user(reclen, &dirent->d_reclen, efault_end); unsafe_put_user(d_type, &dirent->d_type, efault_end); unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); - user_access_end(); + user_write_access_end(); buf->prev_reclen = reclen; buf->current_dir = (void __user *)dirent + reclen; @@ -344,7 +342,7 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen, return 0; efault_end: - user_access_end(); + user_write_access_end(); efault: buf->error = -EFAULT; return -EFAULT; @@ -361,9 +359,6 @@ int ksys_getdents64(unsigned int fd, struct linux_dirent64 __user *dirent, }; int error; - if (!access_ok(dirent, count)) - return -EFAULT; - f = fdget_pos(fd); if (!f.file) return -EBADF; @@ -376,7 +371,7 @@ int ksys_getdents64(unsigned int fd, struct linux_dirent64 __user *dirent, typeof(lastdirent->d_off) d_off = buf.ctx.pos; lastdirent = (void __user *) buf.current_dir - buf.prev_reclen; - if (__put_user(d_off, &lastdirent->d_off)) + if (put_user(d_off, &lastdirent->d_off)) error = -EFAULT; else error = count - buf.count; @@ -424,17 +419,18 @@ static int compat_fillonedir(struct dir_context *ctx, const char *name, } buf->result++; dirent = buf->dirent; - if (!access_ok(dirent, + if (!user_write_access_begin(dirent, (unsigned long)(dirent->d_name + namlen + 1) - (unsigned long)dirent)) goto efault; - if ( __put_user(d_ino, &dirent->d_ino) || - __put_user(offset, &dirent->d_offset) || - __put_user(namlen, &dirent->d_namlen) || - __copy_to_user(dirent->d_name, name, namlen) || - __put_user(0, dirent->d_name + namlen)) - goto efault; + unsafe_put_user(d_ino, &dirent->d_ino, efault_end); + unsafe_put_user(offset, &dirent->d_offset, efault_end); + unsafe_put_user(namlen, &dirent->d_namlen, efault_end); + unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); + user_write_access_end(); return 0; +efault_end: + user_write_access_end(); efault: buf->result = -EFAULT; return -EFAULT; @@ -471,7 +467,7 @@ struct compat_linux_dirent { struct compat_getdents_callback { struct dir_context ctx; struct compat_linux_dirent __user *current_dir; - struct compat_linux_dirent __user *previous; + int prev_reclen; int count; int error; }; @@ -479,13 +475,17 @@ struct compat_getdents_callback { static int compat_filldir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { - struct compat_linux_dirent __user * dirent; + struct compat_linux_dirent __user *dirent, *prev; struct compat_getdents_callback *buf = container_of(ctx, struct compat_getdents_callback, ctx); compat_ulong_t d_ino; int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) + namlen + 2, sizeof(compat_long_t)); + int prev_reclen; + buf->error = verify_dirent_name(name, namlen); + if (unlikely(buf->error)) + return buf->error; buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) return -EINVAL; @@ -494,29 +494,27 @@ static int compat_filldir(struct dir_context *ctx, const char *name, int namlen, buf->error = -EOVERFLOW; return -EOVERFLOW; } - dirent = buf->previous; - if (dirent) { - if (signal_pending(current)) - return -EINTR; - if (__put_user(offset, &dirent->d_off)) - goto efault; - } + prev_reclen = buf->prev_reclen; + if (prev_reclen && signal_pending(current)) + return -EINTR; dirent = buf->current_dir; - if (__put_user(d_ino, &dirent->d_ino)) - goto efault; - if (__put_user(reclen, &dirent->d_reclen)) - goto efault; - if (copy_to_user(dirent->d_name, name, namlen)) - goto efault; - if (__put_user(0, dirent->d_name + namlen)) - goto efault; - if (__put_user(d_type, (char __user *) dirent + reclen - 1)) + prev = (void __user *) dirent - prev_reclen; + if (!user_write_access_begin(prev, reclen + prev_reclen)) goto efault; - buf->previous = dirent; - dirent = (void __user *)dirent + reclen; - buf->current_dir = dirent; + + unsafe_put_user(offset, &prev->d_off, efault_end); + unsafe_put_user(d_ino, &dirent->d_ino, efault_end); + unsafe_put_user(reclen, &dirent->d_reclen, efault_end); + unsafe_put_user(d_type, (char __user *) dirent + reclen - 1, efault_end); + unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); + user_write_access_end(); + + buf->prev_reclen = reclen; + buf->current_dir = (void __user *)dirent + reclen; buf->count -= reclen; return 0; +efault_end: + user_write_access_end(); efault: buf->error = -EFAULT; return -EFAULT; @@ -526,7 +524,6 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, struct compat_linux_dirent __user *, dirent, unsigned int, count) { struct fd f; - struct compat_linux_dirent __user * lastdirent; struct compat_getdents_callback buf = { .ctx.actor = compat_filldir, .current_dir = dirent, @@ -534,9 +531,6 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, }; int error; - if (!access_ok(dirent, count)) - return -EFAULT; - f = fdget_pos(fd); if (!f.file) return -EBADF; @@ -544,8 +538,10 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, error = iterate_dir(f.file, &buf.ctx); if (error >= 0) error = buf.error; - lastdirent = buf.previous; - if (lastdirent) { + if (buf.prev_reclen) { + struct compat_linux_dirent __user * lastdirent; + lastdirent = (void __user *)buf.current_dir - buf.prev_reclen; + if (put_user(buf.ctx.pos, &lastdirent->d_off)) error = -EFAULT; else diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 6419e6dacc39..0031070b3692 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1160,11 +1160,9 @@ failure: return retval; } -static int -reiserfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void reiserfs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block); + mpage_readahead(rac, reiserfs_get_block); } /* @@ -3434,7 +3432,7 @@ out: const struct address_space_operations reiserfs_address_space_operations = { .writepage = reiserfs_writepage, .readpage = reiserfs_readpage, - .readpages = reiserfs_readpages, + .readahead = reiserfs_readahead, .releasepage = reiserfs_releasepage, .invalidatepage = reiserfs_invalidatepage, .write_begin = reiserfs_write_begin, diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig index ad4c45788896..9737b8e68878 100644 --- a/fs/romfs/Kconfig +++ b/fs/romfs/Kconfig @@ -6,7 +6,7 @@ config ROMFS_FS This is a very small read-only file system mainly intended for initial ram disks of installation disks, but it could be used for other read-only media as well. Read - <file:Documentation/filesystems/romfs.txt> for details. + <file:Documentation/filesystems/romfs.rst> for details. To compile this file system support as a module, choose M here: the module will be called romfs. Note that the file system of your diff --git a/fs/splice.c b/fs/splice.c index 4735defc46ee..4e53efbd621d 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1118,6 +1118,10 @@ long do_splice(struct file *in, loff_t __user *off_in, loff_t offset; long ret; + if (unlikely(!(in->f_mode & FMODE_READ) || + !(out->f_mode & FMODE_WRITE))) + return -EBADF; + ipipe = get_pipe_info(in); opipe = get_pipe_info(out); @@ -1125,12 +1129,6 @@ long do_splice(struct file *in, loff_t __user *off_in, if (off_in || off_out) return -ESPIPE; - if (!(in->f_mode & FMODE_READ)) - return -EBADF; - - if (!(out->f_mode & FMODE_WRITE)) - return -EBADF; - /* Splicing to self would be fun, but... */ if (ipipe == opipe) return -EINVAL; @@ -1153,9 +1151,6 @@ long do_splice(struct file *in, loff_t __user *off_in, offset = out->f_pos; } - if (unlikely(!(out->f_mode & FMODE_WRITE))) - return -EBADF; - if (unlikely(out->f_flags & O_APPEND)) return -EINVAL; @@ -1440,15 +1435,11 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, error = -EBADF; in = fdget(fd_in); if (in.file) { - if (in.file->f_mode & FMODE_READ) { - out = fdget(fd_out); - if (out.file) { - if (out.file->f_mode & FMODE_WRITE) - error = do_splice(in.file, off_in, - out.file, off_out, - len, flags); - fdput(out); - } + out = fdget(fd_out); + if (out.file) { + error = do_splice(in.file, off_in, out.file, off_out, + len, flags); + fdput(out); } fdput(in); } @@ -1503,7 +1494,7 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) * Check pipe occupancy without the inode lock first. This function * is speculative anyways, so missing one is ok. */ - if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) return 0; ret = 0; @@ -1770,6 +1761,10 @@ static long do_tee(struct file *in, struct file *out, size_t len, struct pipe_inode_info *opipe = get_pipe_info(out); int ret = -EINVAL; + if (unlikely(!(in->f_mode & FMODE_READ) || + !(out->f_mode & FMODE_WRITE))) + return -EBADF; + /* * Duplicate the contents of ipipe to opipe without actually * copying the data. @@ -1795,7 +1790,7 @@ static long do_tee(struct file *in, struct file *out, size_t len, SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) { - struct fd in; + struct fd in, out; int error; if (unlikely(flags & ~SPLICE_F_ALL)) @@ -1807,14 +1802,10 @@ SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) error = -EBADF; in = fdget(fdin); if (in.file) { - if (in.file->f_mode & FMODE_READ) { - struct fd out = fdget(fdout); - if (out.file) { - if (out.file->f_mode & FMODE_WRITE) - error = do_tee(in.file, out.file, - len, flags); - fdput(out); - } + out = fdget(fdout); + if (out.file) { + error = do_tee(in.file, out.file, len, flags); + fdput(out); } fdput(in); } diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 4f9b9fb59362..64f61330564a 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -13,6 +13,7 @@ * datablocks and metadata blocks. */ +#include <linux/blkdev.h> #include <linux/fs.h> #include <linux/vfs.h> #include <linux/slab.h> @@ -27,44 +28,103 @@ #include "page_actor.h" /* - * Read the metadata block length, this is stored in the first two - * bytes of the metadata block. + * Returns the amount of bytes copied to the page actor. */ -static struct buffer_head *get_block_length(struct super_block *sb, - u64 *cur_index, int *offset, int *length) +static int copy_bio_to_actor(struct bio *bio, + struct squashfs_page_actor *actor, + int offset, int req_length) +{ + void *actor_addr = squashfs_first_page(actor); + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); + int copied_bytes = 0; + int actor_offset = 0; + + if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all))) + return 0; + + while (copied_bytes < req_length) { + int bytes_to_copy = min_t(int, bvec->bv_len - offset, + PAGE_SIZE - actor_offset); + + bytes_to_copy = min_t(int, bytes_to_copy, + req_length - copied_bytes); + memcpy(actor_addr + actor_offset, + page_address(bvec->bv_page) + bvec->bv_offset + offset, + bytes_to_copy); + + actor_offset += bytes_to_copy; + copied_bytes += bytes_to_copy; + offset += bytes_to_copy; + + if (actor_offset >= PAGE_SIZE) { + actor_addr = squashfs_next_page(actor); + if (!actor_addr) + break; + actor_offset = 0; + } + if (offset >= bvec->bv_len) { + if (!bio_next_segment(bio, &iter_all)) + break; + offset = 0; + } + } + squashfs_finish_page(actor); + return copied_bytes; +} + +static int squashfs_bio_read(struct super_block *sb, u64 index, int length, + struct bio **biop, int *block_offset) { struct squashfs_sb_info *msblk = sb->s_fs_info; - struct buffer_head *bh; - - bh = sb_bread(sb, *cur_index); - if (bh == NULL) - return NULL; - - if (msblk->devblksize - *offset == 1) { - *length = (unsigned char) bh->b_data[*offset]; - put_bh(bh); - bh = sb_bread(sb, ++(*cur_index)); - if (bh == NULL) - return NULL; - *length |= (unsigned char) bh->b_data[0] << 8; - *offset = 1; - } else { - *length = (unsigned char) bh->b_data[*offset] | - (unsigned char) bh->b_data[*offset + 1] << 8; - *offset += 2; - - if (*offset == msblk->devblksize) { - put_bh(bh); - bh = sb_bread(sb, ++(*cur_index)); - if (bh == NULL) - return NULL; - *offset = 0; + const u64 read_start = round_down(index, msblk->devblksize); + const sector_t block = read_start >> msblk->devblksize_log2; + const u64 read_end = round_up(index + length, msblk->devblksize); + const sector_t block_end = read_end >> msblk->devblksize_log2; + int offset = read_start - round_down(index, PAGE_SIZE); + int total_len = (block_end - block) << msblk->devblksize_log2; + const int page_count = DIV_ROUND_UP(total_len + offset, PAGE_SIZE); + int error, i; + struct bio *bio; + + bio = bio_alloc(GFP_NOIO, page_count); + if (!bio) + return -ENOMEM; + + bio_set_dev(bio, sb->s_bdev); + bio->bi_opf = READ; + bio->bi_iter.bi_sector = block * (msblk->devblksize >> SECTOR_SHIFT); + + for (i = 0; i < page_count; ++i) { + unsigned int len = + min_t(unsigned int, PAGE_SIZE - offset, total_len); + struct page *page = alloc_page(GFP_NOIO); + + if (!page) { + error = -ENOMEM; + goto out_free_bio; + } + if (!bio_add_page(bio, page, len, offset)) { + error = -EIO; + goto out_free_bio; } + offset = 0; + total_len -= len; } - return bh; -} + error = submit_bio_wait(bio); + if (error) + goto out_free_bio; + *biop = bio; + *block_offset = index & ((1 << msblk->devblksize_log2) - 1); + return 0; + +out_free_bio: + bio_free_pages(bio); + bio_put(bio); + return error; +} /* * Read and decompress a metadata block or datablock. Length is non-zero @@ -76,129 +136,88 @@ static struct buffer_head *get_block_length(struct super_block *sb, * algorithms). */ int squashfs_read_data(struct super_block *sb, u64 index, int length, - u64 *next_index, struct squashfs_page_actor *output) + u64 *next_index, struct squashfs_page_actor *output) { struct squashfs_sb_info *msblk = sb->s_fs_info; - struct buffer_head **bh; - int offset = index & ((1 << msblk->devblksize_log2) - 1); - u64 cur_index = index >> msblk->devblksize_log2; - int bytes, compressed, b = 0, k = 0, avail, i; - - bh = kcalloc(((output->length + msblk->devblksize - 1) - >> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL); - if (bh == NULL) - return -ENOMEM; + struct bio *bio = NULL; + int compressed; + int res; + int offset; if (length) { /* * Datablock. */ - bytes = -offset; compressed = SQUASHFS_COMPRESSED_BLOCK(length); length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length); - if (next_index) - *next_index = index + length; - TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n", index, compressed ? "" : "un", length, output->length); - - if (length < 0 || length > output->length || - (index + length) > msblk->bytes_used) - goto read_failure; - - for (b = 0; bytes < length; b++, cur_index++) { - bh[b] = sb_getblk(sb, cur_index); - if (bh[b] == NULL) - goto block_release; - bytes += msblk->devblksize; - } - ll_rw_block(REQ_OP_READ, 0, b, bh); } else { /* * Metadata block. */ - if ((index + 2) > msblk->bytes_used) - goto read_failure; + const u8 *data; + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); - bh[0] = get_block_length(sb, &cur_index, &offset, &length); - if (bh[0] == NULL) - goto read_failure; - b = 1; + if (index + 2 > msblk->bytes_used) { + res = -EIO; + goto out; + } + res = squashfs_bio_read(sb, index, 2, &bio, &offset); + if (res) + goto out; + + if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all))) { + res = -EIO; + goto out_free_bio; + } + /* Extract the length of the metadata block */ + data = page_address(bvec->bv_page) + bvec->bv_offset; + length = data[offset]; + if (offset <= bvec->bv_len - 1) { + length |= data[offset + 1] << 8; + } else { + if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all))) { + res = -EIO; + goto out_free_bio; + } + data = page_address(bvec->bv_page) + bvec->bv_offset; + length |= data[0] << 8; + } + bio_free_pages(bio); + bio_put(bio); - bytes = msblk->devblksize - offset; compressed = SQUASHFS_COMPRESSED(length); length = SQUASHFS_COMPRESSED_SIZE(length); - if (next_index) - *next_index = index + length + 2; + index += 2; TRACE("Block @ 0x%llx, %scompressed size %d\n", index, - compressed ? "" : "un", length); - - if (length < 0 || length > output->length || - (index + length) > msblk->bytes_used) - goto block_release; - - for (; bytes < length; b++) { - bh[b] = sb_getblk(sb, ++cur_index); - if (bh[b] == NULL) - goto block_release; - bytes += msblk->devblksize; - } - ll_rw_block(REQ_OP_READ, 0, b - 1, bh + 1); + compressed ? "" : "un", length); } + if (next_index) + *next_index = index + length; - for (i = 0; i < b; i++) { - wait_on_buffer(bh[i]); - if (!buffer_uptodate(bh[i])) - goto block_release; - } + res = squashfs_bio_read(sb, index, length, &bio, &offset); + if (res) + goto out; if (compressed) { - if (!msblk->stream) - goto read_failure; - length = squashfs_decompress(msblk, bh, b, offset, length, - output); - if (length < 0) - goto read_failure; - } else { - /* - * Block is uncompressed. - */ - int in, pg_offset = 0; - void *data = squashfs_first_page(output); - - for (bytes = length; k < b; k++) { - in = min(bytes, msblk->devblksize - offset); - bytes -= in; - while (in) { - if (pg_offset == PAGE_SIZE) { - data = squashfs_next_page(output); - pg_offset = 0; - } - avail = min_t(int, in, PAGE_SIZE - - pg_offset); - memcpy(data + pg_offset, bh[k]->b_data + offset, - avail); - in -= avail; - pg_offset += avail; - offset += avail; - } - offset = 0; - put_bh(bh[k]); + if (!msblk->stream) { + res = -EIO; + goto out_free_bio; } - squashfs_finish_page(output); + res = squashfs_decompress(msblk, bio, offset, length, output); + } else { + res = copy_bio_to_actor(bio, output, offset, length); } - kfree(bh); - return length; - -block_release: - for (; k < b; k++) - put_bh(bh[k]); +out_free_bio: + bio_free_pages(bio); + bio_put(bio); +out: + if (res < 0) + ERROR("Failed to read block 0x%llx: %d\n", index, res); -read_failure: - ERROR("squashfs_read_data failed to read block 0x%llx\n", - (unsigned long long) index); - kfree(bh); - return -EIO; + return res; } diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h index ec8617523e56..1b9ccfd0aa51 100644 --- a/fs/squashfs/decompressor.h +++ b/fs/squashfs/decompressor.h @@ -10,13 +10,14 @@ * decompressor.h */ +#include <linux/bio.h> + struct squashfs_decompressor { void *(*init)(struct squashfs_sb_info *, void *); void *(*comp_opts)(struct squashfs_sb_info *, void *, int); void (*free)(void *); int (*decompress)(struct squashfs_sb_info *, void *, - struct buffer_head **, int, int, int, - struct squashfs_page_actor *); + struct bio *, int, int, struct squashfs_page_actor *); int id; char *name; int supported; diff --git a/fs/squashfs/decompressor_multi.c b/fs/squashfs/decompressor_multi.c index c181dee235bb..db9f12a3ea05 100644 --- a/fs/squashfs/decompressor_multi.c +++ b/fs/squashfs/decompressor_multi.c @@ -6,7 +6,7 @@ #include <linux/types.h> #include <linux/mutex.h> #include <linux/slab.h> -#include <linux/buffer_head.h> +#include <linux/bio.h> #include <linux/sched.h> #include <linux/wait.h> #include <linux/cpumask.h> @@ -180,14 +180,15 @@ wait: } -int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh, - int b, int offset, int length, struct squashfs_page_actor *output) +int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, + int offset, int length, + struct squashfs_page_actor *output) { int res; struct squashfs_stream *stream = msblk->stream; struct decomp_stream *decomp_stream = get_decomp_stream(msblk, stream); res = msblk->decompressor->decompress(msblk, decomp_stream->stream, - bh, b, offset, length, output); + bio, offset, length, output); put_decomp_stream(decomp_stream, stream); if (res < 0) ERROR("%s decompression failed, data probably corrupt\n", diff --git a/fs/squashfs/decompressor_multi_percpu.c b/fs/squashfs/decompressor_multi_percpu.c index 2a2a2d106440..b881b9283b7f 100644 --- a/fs/squashfs/decompressor_multi_percpu.c +++ b/fs/squashfs/decompressor_multi_percpu.c @@ -8,6 +8,7 @@ #include <linux/slab.h> #include <linux/percpu.h> #include <linux/buffer_head.h> +#include <linux/local_lock.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -20,7 +21,8 @@ */ struct squashfs_stream { - void *stream; + void *stream; + local_lock_t lock; }; void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, @@ -41,6 +43,7 @@ void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, err = PTR_ERR(stream->stream); goto out; } + local_lock_init(&stream->lock); } kfree(comp_opts); @@ -72,15 +75,19 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) } } -int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh, - int b, int offset, int length, struct squashfs_page_actor *output) +int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, + int offset, int length, struct squashfs_page_actor *output) { - struct squashfs_stream __percpu *percpu = - (struct squashfs_stream __percpu *) msblk->stream; - struct squashfs_stream *stream = get_cpu_ptr(percpu); - int res = msblk->decompressor->decompress(msblk, stream->stream, bh, b, - offset, length, output); - put_cpu_ptr(stream); + struct squashfs_stream *stream; + int res; + + local_lock(&msblk->stream->lock); + stream = this_cpu_ptr(msblk->stream); + + res = msblk->decompressor->decompress(msblk, stream->stream, bio, + offset, length, output); + + local_unlock(&msblk->stream->lock); if (res < 0) ERROR("%s decompression failed, data probably corrupt\n", diff --git a/fs/squashfs/decompressor_single.c b/fs/squashfs/decompressor_single.c index 550c3e592032..4eb3d083d45e 100644 --- a/fs/squashfs/decompressor_single.c +++ b/fs/squashfs/decompressor_single.c @@ -7,7 +7,7 @@ #include <linux/types.h> #include <linux/mutex.h> #include <linux/slab.h> -#include <linux/buffer_head.h> +#include <linux/bio.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -59,14 +59,15 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) } } -int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh, - int b, int offset, int length, struct squashfs_page_actor *output) +int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, + int offset, int length, + struct squashfs_page_actor *output) { int res; struct squashfs_stream *stream = msblk->stream; mutex_lock(&stream->mutex); - res = msblk->decompressor->decompress(msblk, stream->stream, bh, b, + res = msblk->decompressor->decompress(msblk, stream->stream, bio, offset, length, output); mutex_unlock(&stream->mutex); diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c index c4e47e0588c7..233d5582fbee 100644 --- a/fs/squashfs/lz4_wrapper.c +++ b/fs/squashfs/lz4_wrapper.c @@ -4,7 +4,7 @@ * Phillip Lougher <phillip@squashfs.org.uk> */ -#include <linux/buffer_head.h> +#include <linux/bio.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/vmalloc.h> @@ -89,20 +89,23 @@ static void lz4_free(void *strm) static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm, - struct buffer_head **bh, int b, int offset, int length, + struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); struct squashfs_lz4 *stream = strm; void *buff = stream->input, *data; - int avail, i, bytes = length, res; + int bytes = length, res; - for (i = 0; i < b; i++) { - avail = min(bytes, msblk->devblksize - offset); - memcpy(buff, bh[i]->b_data + offset, avail); + while (bio_next_segment(bio, &iter_all)) { + int avail = min(bytes, ((int)bvec->bv_len) - offset); + + data = page_address(bvec->bv_page) + bvec->bv_offset; + memcpy(buff, data + offset, avail); buff += avail; bytes -= avail; offset = 0; - put_bh(bh[i]); } res = LZ4_decompress_safe(stream->input, stream->output, diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c index aa3c3dafc33d..97bb7d92ddcd 100644 --- a/fs/squashfs/lzo_wrapper.c +++ b/fs/squashfs/lzo_wrapper.c @@ -9,7 +9,7 @@ */ #include <linux/mutex.h> -#include <linux/buffer_head.h> +#include <linux/bio.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/lzo.h> @@ -63,21 +63,24 @@ static void lzo_free(void *strm) static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm, - struct buffer_head **bh, int b, int offset, int length, + struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); struct squashfs_lzo *stream = strm; void *buff = stream->input, *data; - int avail, i, bytes = length, res; + int bytes = length, res; size_t out_len = output->length; - for (i = 0; i < b; i++) { - avail = min(bytes, msblk->devblksize - offset); - memcpy(buff, bh[i]->b_data + offset, avail); + while (bio_next_segment(bio, &iter_all)) { + int avail = min(bytes, ((int)bvec->bv_len) - offset); + + data = page_address(bvec->bv_page) + bvec->bv_offset; + memcpy(buff, data + offset, avail); buff += avail; bytes -= avail; offset = 0; - put_bh(bh[i]); } res = lzo1x_decompress_safe(stream->input, (size_t)length, diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 2797763ed046..9783e01c8100 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -40,8 +40,8 @@ extern void *squashfs_decompressor_setup(struct super_block *, unsigned short); /* decompressor_xxx.c */ extern void *squashfs_decompressor_create(struct squashfs_sb_info *, void *); extern void squashfs_decompressor_destroy(struct squashfs_sb_info *); -extern int squashfs_decompress(struct squashfs_sb_info *, struct buffer_head **, - int, int, int, struct squashfs_page_actor *); +extern int squashfs_decompress(struct squashfs_sb_info *, struct bio *, + int, int, struct squashfs_page_actor *); extern int squashfs_max_decompressors(void); /* export.c */ diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c index 4b2f2051a6dc..e80419aed862 100644 --- a/fs/squashfs/xz_wrapper.c +++ b/fs/squashfs/xz_wrapper.c @@ -10,7 +10,7 @@ #include <linux/mutex.h> -#include <linux/buffer_head.h> +#include <linux/bio.h> #include <linux/slab.h> #include <linux/xz.h> #include <linux/bitops.h> @@ -117,11 +117,12 @@ static void squashfs_xz_free(void *strm) static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, - struct buffer_head **bh, int b, int offset, int length, + struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { - enum xz_ret xz_err; - int avail, total = 0, k = 0; + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); + int total = 0, error = 0; struct squashfs_xz *stream = strm; xz_dec_reset(stream->state); @@ -131,11 +132,23 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, stream->buf.out_size = PAGE_SIZE; stream->buf.out = squashfs_first_page(output); - do { - if (stream->buf.in_pos == stream->buf.in_size && k < b) { - avail = min(length, msblk->devblksize - offset); + for (;;) { + enum xz_ret xz_err; + + if (stream->buf.in_pos == stream->buf.in_size) { + const void *data; + int avail; + + if (!bio_next_segment(bio, &iter_all)) { + /* XZ_STREAM_END must be reached. */ + error = -EIO; + break; + } + + avail = min(length, ((int)bvec->bv_len) - offset); + data = page_address(bvec->bv_page) + bvec->bv_offset; length -= avail; - stream->buf.in = bh[k]->b_data + offset; + stream->buf.in = data + offset; stream->buf.in_size = avail; stream->buf.in_pos = 0; offset = 0; @@ -150,23 +163,17 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, } xz_err = xz_dec_run(stream->state, &stream->buf); - - if (stream->buf.in_pos == stream->buf.in_size && k < b) - put_bh(bh[k++]); - } while (xz_err == XZ_OK); + if (xz_err == XZ_STREAM_END) + break; + if (xz_err != XZ_OK) { + error = -EIO; + break; + } + } squashfs_finish_page(output); - if (xz_err != XZ_STREAM_END || k < b) - goto out; - - return total + stream->buf.out_pos; - -out: - for (; k < b; k++) - put_bh(bh[k]); - - return -EIO; + return error ? error : total + stream->buf.out_pos; } const struct squashfs_decompressor squashfs_xz_comp_ops = { diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c index f2226afa1625..bcb881ec47f2 100644 --- a/fs/squashfs/zlib_wrapper.c +++ b/fs/squashfs/zlib_wrapper.c @@ -10,7 +10,7 @@ #include <linux/mutex.h> -#include <linux/buffer_head.h> +#include <linux/bio.h> #include <linux/slab.h> #include <linux/zlib.h> #include <linux/vmalloc.h> @@ -50,21 +50,35 @@ static void zlib_free(void *strm) static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, - struct buffer_head **bh, int b, int offset, int length, + struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { - int zlib_err, zlib_init = 0, k = 0; + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); + int zlib_init = 0, error = 0; z_stream *stream = strm; stream->avail_out = PAGE_SIZE; stream->next_out = squashfs_first_page(output); stream->avail_in = 0; - do { - if (stream->avail_in == 0 && k < b) { - int avail = min(length, msblk->devblksize - offset); + for (;;) { + int zlib_err; + + if (stream->avail_in == 0) { + const void *data; + int avail; + + if (!bio_next_segment(bio, &iter_all)) { + /* Z_STREAM_END must be reached. */ + error = -EIO; + break; + } + + avail = min(length, ((int)bvec->bv_len) - offset); + data = page_address(bvec->bv_page) + bvec->bv_offset; length -= avail; - stream->next_in = bh[k]->b_data + offset; + stream->next_in = data + offset; stream->avail_in = avail; offset = 0; } @@ -78,37 +92,28 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, if (!zlib_init) { zlib_err = zlib_inflateInit(stream); if (zlib_err != Z_OK) { - squashfs_finish_page(output); - goto out; + error = -EIO; + break; } zlib_init = 1; } zlib_err = zlib_inflate(stream, Z_SYNC_FLUSH); - - if (stream->avail_in == 0 && k < b) - put_bh(bh[k++]); - } while (zlib_err == Z_OK); + if (zlib_err == Z_STREAM_END) + break; + if (zlib_err != Z_OK) { + error = -EIO; + break; + } + } squashfs_finish_page(output); - if (zlib_err != Z_STREAM_END) - goto out; - - zlib_err = zlib_inflateEnd(stream); - if (zlib_err != Z_OK) - goto out; - - if (k < b) - goto out; - - return stream->total_out; - -out: - for (; k < b; k++) - put_bh(bh[k]); + if (!error) + if (zlib_inflateEnd(stream) != Z_OK) + error = -EIO; - return -EIO; + return error ? error : stream->total_out; } const struct squashfs_decompressor squashfs_zlib_comp_ops = { diff --git a/fs/squashfs/zstd_wrapper.c b/fs/squashfs/zstd_wrapper.c index b448c2a1d0ed..b7cb1faa652d 100644 --- a/fs/squashfs/zstd_wrapper.c +++ b/fs/squashfs/zstd_wrapper.c @@ -9,7 +9,7 @@ */ #include <linux/mutex.h> -#include <linux/buffer_head.h> +#include <linux/bio.h> #include <linux/slab.h> #include <linux/zstd.h> #include <linux/vmalloc.h> @@ -59,33 +59,44 @@ static void zstd_free(void *strm) static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, - struct buffer_head **bh, int b, int offset, int length, + struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { struct workspace *wksp = strm; ZSTD_DStream *stream; size_t total_out = 0; - size_t zstd_err; - int k = 0; + int error = 0; ZSTD_inBuffer in_buf = { NULL, 0, 0 }; ZSTD_outBuffer out_buf = { NULL, 0, 0 }; + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); stream = ZSTD_initDStream(wksp->window_size, wksp->mem, wksp->mem_size); if (!stream) { ERROR("Failed to initialize zstd decompressor\n"); - goto out; + return -EIO; } out_buf.size = PAGE_SIZE; out_buf.dst = squashfs_first_page(output); - do { - if (in_buf.pos == in_buf.size && k < b) { - int avail = min(length, msblk->devblksize - offset); + for (;;) { + size_t zstd_err; + if (in_buf.pos == in_buf.size) { + const void *data; + int avail; + + if (!bio_next_segment(bio, &iter_all)) { + error = -EIO; + break; + } + + avail = min(length, ((int)bvec->bv_len) - offset); + data = page_address(bvec->bv_page) + bvec->bv_offset; length -= avail; - in_buf.src = bh[k]->b_data + offset; + in_buf.src = data + offset; in_buf.size = avail; in_buf.pos = 0; offset = 0; @@ -97,8 +108,8 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, /* Shouldn't run out of pages * before stream is done. */ - squashfs_finish_page(output); - goto out; + error = -EIO; + break; } out_buf.pos = 0; out_buf.size = PAGE_SIZE; @@ -107,29 +118,20 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, total_out -= out_buf.pos; zstd_err = ZSTD_decompressStream(stream, &out_buf, &in_buf); total_out += out_buf.pos; /* add the additional data produced */ - - if (in_buf.pos == in_buf.size && k < b) - put_bh(bh[k++]); - } while (zstd_err != 0 && !ZSTD_isError(zstd_err)); - - squashfs_finish_page(output); - - if (ZSTD_isError(zstd_err)) { - ERROR("zstd decompression error: %d\n", - (int)ZSTD_getErrorCode(zstd_err)); - goto out; + if (zstd_err == 0) + break; + + if (ZSTD_isError(zstd_err)) { + ERROR("zstd decompression error: %d\n", + (int)ZSTD_getErrorCode(zstd_err)); + error = -EIO; + break; + } } - if (k < b) - goto out; - - return (int)total_out; - -out: - for (; k < b; k++) - put_bh(bh[k]); + squashfs_finish_page(output); - return -EIO; + return error ? error : total_out; } const struct squashfs_decompressor squashfs_zstd_comp_ops = { diff --git a/fs/stat.c b/fs/stat.c index 030008796479..b9faa6cafafe 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -22,6 +22,7 @@ #include <asm/unistd.h> #include "internal.h" +#include "mount.h" /** * generic_fillattr - Fill in the basic attributes from the inode struct @@ -70,11 +71,11 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, memset(stat, 0, sizeof(*stat)); stat->result_mask |= STATX_BASIC_STATS; - request_mask &= STATX_ALL; query_flags &= KSTAT_QUERY_FLAGS; /* allow the fs to override these if it really wants to */ - if (IS_NOATIME(inode)) + /* SB_NOATIME means filesystem supplies dummy atime value */ + if (inode->i_sb->s_flags & SB_NOATIME) stat->result_mask &= ~STATX_ATIME; if (IS_AUTOMOUNT(inode)) stat->attributes |= STATX_ATTR_AUTOMOUNT; @@ -199,6 +200,11 @@ retry: goto out; error = vfs_getattr(&path, stat, request_mask, flags); + stat->mnt_id = real_mount(path.mnt)->mnt_id; + stat->result_mask |= STATX_MNT_ID; + if (path.mnt->mnt_root == path.dentry) + stat->attributes |= STATX_ATTR_MOUNT_ROOT; + stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT; path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -563,6 +569,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer) tmp.stx_rdev_minor = MINOR(stat->rdev); tmp.stx_dev_major = MAJOR(stat->dev); tmp.stx_dev_minor = MINOR(stat->dev); + tmp.stx_mnt_id = stat->mnt_id; return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0; } diff --git a/fs/super.c b/fs/super.c index 4991f441988e..bf3b7685b52a 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1302,8 +1302,8 @@ int get_tree_bdev(struct fs_context *fc, mutex_lock(&bdev->bd_fsfreeze_mutex); if (bdev->bd_fsfreeze_count > 0) { mutex_unlock(&bdev->bd_fsfreeze_mutex); - blkdev_put(bdev, mode); warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); + blkdev_put(bdev, mode); return -EBUSY; } diff --git a/fs/sync.c b/fs/sync.c index 4d1ff010bc5a..c6f6f5be5682 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -161,7 +161,7 @@ SYSCALL_DEFINE1(syncfs, int, fd) { struct fd f = fdget(fd); struct super_block *sb; - int ret; + int ret, ret2; if (!f.file) return -EBADF; @@ -171,8 +171,10 @@ SYSCALL_DEFINE1(syncfs, int, fd) ret = sync_filesystem(sb); up_read(&sb->s_umount); + ret2 = errseq_check_and_advance(&sb->s_wb_err, &f.file->f_sb_err); + fdput(f); - return ret; + return ret ? ret : ret2; } /** diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index aa85f2874a9f..59dffd5ca517 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -6,7 +6,7 @@ * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * - * Please see Documentation/filesystems/sysfs.txt for more information. + * Please see Documentation/filesystems/sysfs.rst for more information. */ #define pr_fmt(fmt) "sysfs: " fmt diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 26bbf960e2a2..f275fcda62fb 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -6,7 +6,7 @@ * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * - * Please see Documentation/filesystems/sysfs.txt for more information. + * Please see Documentation/filesystems/sysfs.rst for more information. */ #include <linux/module.h> diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index db81cfbab9d6..e747c135c1d1 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -6,7 +6,7 @@ * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * - * Please see Documentation/filesystems/sysfs.txt for more information. + * Please see Documentation/filesystems/sysfs.rst for more information. */ #include <linux/fs.h> diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index c4deecc80f67..5603530a1a52 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -6,7 +6,7 @@ * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * - * Please see Documentation/filesystems/sysfs.txt for more information. + * Please see Documentation/filesystems/sysfs.rst for more information. */ #include <linux/fs.h> diff --git a/fs/sysv/Kconfig b/fs/sysv/Kconfig index d4edf7d9ae10..b4e23e03fbeb 100644 --- a/fs/sysv/Kconfig +++ b/fs/sysv/Kconfig @@ -28,7 +28,7 @@ config SYSV_FS tar" or preferably "info tar"). Note also that this option has nothing whatsoever to do with the option "System V IPC". Read about the System V file system in - <file:Documentation/filesystems/sysv-fs.txt>. + <file:Documentation/filesystems/sysv-fs.rst>. Saying Y here will enlarge your kernel by about 27 KB. To compile this as a module, choose M here: the module will be called diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c index 8cdbd53d780c..cc5c0abfd536 100644 --- a/fs/ubifs/auth.c +++ b/fs/ubifs/auth.c @@ -31,15 +31,9 @@ int __ubifs_node_calc_hash(const struct ubifs_info *c, const void *node, u8 *hash) { const struct ubifs_ch *ch = node; - SHASH_DESC_ON_STACK(shash, c->hash_tfm); - int err; - - shash->tfm = c->hash_tfm; - err = crypto_shash_digest(shash, node, le32_to_cpu(ch->len), hash); - if (err < 0) - return err; - return 0; + return crypto_shash_tfm_digest(c->hash_tfm, node, le32_to_cpu(ch->len), + hash); } /** @@ -53,15 +47,7 @@ int __ubifs_node_calc_hash(const struct ubifs_info *c, const void *node, static int ubifs_hash_calc_hmac(const struct ubifs_info *c, const u8 *hash, u8 *hmac) { - SHASH_DESC_ON_STACK(shash, c->hmac_tfm); - int err; - - shash->tfm = c->hmac_tfm; - - err = crypto_shash_digest(shash, hash, c->hash_len, hmac); - if (err < 0) - return err; - return 0; + return crypto_shash_tfm_digest(c->hmac_tfm, hash, c->hash_len, hmac); } /** @@ -79,13 +65,9 @@ int ubifs_prepare_auth_node(struct ubifs_info *c, void *node, struct shash_desc *inhash) { struct ubifs_auth_node *auth = node; - u8 *hash; + u8 hash[UBIFS_HASH_ARR_SZ]; int err; - hash = kmalloc(crypto_shash_descsize(c->hash_tfm), GFP_NOFS); - if (!hash) - return -ENOMEM; - { SHASH_DESC_ON_STACK(hash_desc, c->hash_tfm); @@ -94,21 +76,16 @@ int ubifs_prepare_auth_node(struct ubifs_info *c, void *node, err = crypto_shash_final(hash_desc, hash); if (err) - goto out; + return err; } err = ubifs_hash_calc_hmac(c, hash, auth->hmac); if (err) - goto out; + return err; auth->ch.node_type = UBIFS_AUTH_NODE; ubifs_prepare_node(c, auth, ubifs_auth_node_sz(c), 0); - - err = 0; -out: - kfree(hash); - - return err; + return 0; } static struct shash_desc *ubifs_get_desc(const struct ubifs_info *c, diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 0f5a480fe264..31288d8fa2ce 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -815,7 +815,7 @@ void ubifs_dump_leb(const struct ubifs_info *c, int lnum) pr_err("(pid %d) start dumping LEB %d\n", current->pid, lnum); - buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) { ubifs_err(c, "cannot allocate memory for dumping LEB %d", lnum); return; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 743928efffc1..49fe062ce45e 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1375,7 +1375,6 @@ int ubifs_update_time(struct inode *inode, struct timespec64 *time, struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_budget_req req = { .dirtied_ino = 1, .dirtied_ino_d = ALIGN(ui->data_len, 8) }; - int iflags = I_DIRTY_TIME; int err, release; if (!IS_ENABLED(CONFIG_UBIFS_ATIME_SUPPORT)) @@ -1393,11 +1392,8 @@ int ubifs_update_time(struct inode *inode, struct timespec64 *time, if (flags & S_MTIME) inode->i_mtime = *time; - if (!(inode->i_sb->s_flags & SB_LAZYTIME)) - iflags |= I_DIRTY_SYNC; - release = ui->dirty; - __mark_inode_dirty(inode, iflags); + __mark_inode_dirty(inode, I_DIRTY_SYNC); mutex_unlock(&ui->ui_mutex); if (release) ubifs_release_budget(c, &req); diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index 29826c51883a..22bfda158f7f 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -1095,7 +1095,7 @@ static int scan_check_cb(struct ubifs_info *c, return LPT_SCAN_CONTINUE; } - buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) return -ENOMEM; diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index ff5e0411cf2d..d76a19e460cd 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -1596,7 +1596,7 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum) if (!dbg_is_chk_lprops(c)) return 0; - buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = p = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) { ubifs_err(c, "cannot allocate memory for ltab checking"); return 0; @@ -1845,7 +1845,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum) void *buf, *p; pr_err("(pid %d) start dumping LEB %d\n", current->pid, lnum); - buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = p = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) { ubifs_err(c, "cannot allocate memory to dump LPT"); return; diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c index 52a85c01397e..911d0555b9f2 100644 --- a/fs/ubifs/master.c +++ b/fs/ubifs/master.c @@ -68,12 +68,9 @@ static int mst_node_check_hash(const struct ubifs_info *c, u8 calc[UBIFS_MAX_HASH_LEN]; const void *node = mst; - SHASH_DESC_ON_STACK(shash, c->hash_tfm); - - shash->tfm = c->hash_tfm; - - crypto_shash_digest(shash, node + sizeof(struct ubifs_ch), - UBIFS_MST_NODE_SZ - sizeof(struct ubifs_ch), calc); + crypto_shash_tfm_digest(c->hash_tfm, node + sizeof(struct ubifs_ch), + UBIFS_MST_NODE_SZ - sizeof(struct ubifs_ch), + calc); if (ubifs_check_hash(c, expected, calc)) return -EPERM; diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 283f9eb48410..2c294085ffed 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -977,7 +977,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci) if (c->no_orphs) return 0; - buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) { ubifs_err(c, "cannot allocate memory to check orphans"); return 0; diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index b28ac4dfb407..b69ffac7e415 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -558,7 +558,7 @@ static int is_last_bud(struct ubifs_info *c, struct ubifs_bud *bud) return data == 0xFFFFFFFF; } -/* authenticate_sleb_hash and authenticate_sleb_hmac are split out for stack usage */ +/* authenticate_sleb_hash is split out for stack usage */ static int authenticate_sleb_hash(struct ubifs_info *c, struct shash_desc *log_hash, u8 *hash) { SHASH_DESC_ON_STACK(hash_desc, c->hash_tfm); @@ -569,15 +569,6 @@ static int authenticate_sleb_hash(struct ubifs_info *c, struct shash_desc *log_h return crypto_shash_final(hash_desc, hash); } -static int authenticate_sleb_hmac(struct ubifs_info *c, u8 *hash, u8 *hmac) -{ - SHASH_DESC_ON_STACK(hmac_desc, c->hmac_tfm); - - hmac_desc->tfm = c->hmac_tfm; - - return crypto_shash_digest(hmac_desc, hash, c->hash_len, hmac); -} - /** * authenticate_sleb - authenticate one scan LEB * @c: UBIFS file-system description object @@ -601,18 +592,12 @@ static int authenticate_sleb(struct ubifs_info *c, struct ubifs_scan_leb *sleb, struct ubifs_scan_node *snod; int n_nodes = 0; int err; - u8 *hash, *hmac; + u8 hash[UBIFS_HASH_ARR_SZ]; + u8 hmac[UBIFS_HMAC_ARR_SZ]; if (!ubifs_authenticated(c)) return sleb->nodes_cnt; - hash = kmalloc(crypto_shash_descsize(c->hash_tfm), GFP_NOFS); - hmac = kmalloc(c->hmac_desc_len, GFP_NOFS); - if (!hash || !hmac) { - err = -ENOMEM; - goto out; - } - list_for_each_entry(snod, &sleb->nodes, list) { n_nodes++; @@ -624,7 +609,8 @@ static int authenticate_sleb(struct ubifs_info *c, struct ubifs_scan_leb *sleb, if (err) goto out; - err = authenticate_sleb_hmac(c, hash, hmac); + err = crypto_shash_tfm_digest(c->hmac_tfm, hash, + c->hash_len, hmac); if (err) goto out; @@ -662,9 +648,6 @@ static int authenticate_sleb(struct ubifs_info *c, struct ubifs_scan_leb *sleb, err = 0; } out: - kfree(hash); - kfree(hmac); - return err ? err : n_nodes - n_not_auth; } diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig index 6848de581ce1..26e1a49f3ba7 100644 --- a/fs/udf/Kconfig +++ b/fs/udf/Kconfig @@ -9,7 +9,7 @@ config UDF_FS compatible with standard unix file systems, it is also suitable for removable USB disks. Say Y if you intend to mount DVD discs or CDRW's written in packet mode, or if you want to use UDF for removable USB - disks. Please read <file:Documentation/filesystems/udf.txt>. + disks. Please read <file:Documentation/filesystems/udf.rst>. To compile this file system support as a module, choose M here: the module will be called udf. diff --git a/fs/udf/inode.c b/fs/udf/inode.c index e875bc5668ee..adaba8e8b326 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -195,10 +195,9 @@ static int udf_readpage(struct file *file, struct page *page) return mpage_readpage(page, udf_get_block); } -static int udf_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void udf_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, udf_get_block); + mpage_readahead(rac, udf_get_block); } static int udf_write_begin(struct file *file, struct address_space *mapping, @@ -234,7 +233,7 @@ static sector_t udf_bmap(struct address_space *mapping, sector_t block) const struct address_space_operations udf_aops = { .readpage = udf_readpage, - .readpages = udf_readpages, + .readahead = udf_readahead, .writepage = udf_writepage, .writepages = udf_writepages, .write_begin = udf_write_begin, diff --git a/fs/utimes.c b/fs/utimes.c index 1d17ce98cb80..b7b927502d6e 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -95,13 +95,13 @@ long do_utimes(int dfd, const char __user *filename, struct timespec64 *times, goto out; } - if (flags & ~AT_SYMLINK_NOFOLLOW) + if (flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) goto out; if (filename == NULL && dfd != AT_FDCWD) { struct fd f; - if (flags & AT_SYMLINK_NOFOLLOW) + if (flags) goto out; f = fdget(dfd); @@ -117,6 +117,8 @@ long do_utimes(int dfd, const char __user *filename, struct timespec64 *times, if (!(flags & AT_SYMLINK_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (error) diff --git a/fs/verity/enable.c b/fs/verity/enable.c index d98bea308fd7..5ab3bbec8108 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -329,6 +329,8 @@ rollback: /** * fsverity_ioctl_enable() - enable verity on a file + * @filp: file to enable verity on + * @uarg: user pointer to fsverity_enable_arg * * Enable fs-verity on a file. See the "FS_IOC_ENABLE_VERITY" section of * Documentation/filesystems/fsverity.rst for the documentation. diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index 74768cf539da..e96d99d5145e 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -61,7 +61,7 @@ struct merkle_tree_params { u64 level_start[FS_VERITY_MAX_LEVELS]; }; -/** +/* * fsverity_info - cached verity metadata for an inode * * When a verity file is first opened, an instance of this struct is allocated @@ -134,7 +134,7 @@ void __init fsverity_check_hash_algs(void); /* init.c */ -extern void __printf(3, 4) __cold +void __printf(3, 4) __cold fsverity_msg(const struct inode *inode, const char *level, const char *fmt, ...); diff --git a/fs/verity/measure.c b/fs/verity/measure.c index 05049b68c745..df409a5682ed 100644 --- a/fs/verity/measure.c +++ b/fs/verity/measure.c @@ -11,6 +11,8 @@ /** * fsverity_ioctl_measure() - get a verity file's measurement + * @filp: file to get measurement of + * @_uarg: user pointer to fsverity_digest * * Retrieve the file measurement that the kernel is enforcing for reads from a * verity file. See the "FS_IOC_MEASURE_VERITY" section of diff --git a/fs/verity/open.c b/fs/verity/open.c index c5fe6948e262..d007db0c9304 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -330,6 +330,7 @@ EXPORT_SYMBOL_GPL(fsverity_prepare_setattr); /** * fsverity_cleanup_inode() - free the inode's verity info, if present + * @inode: an inode being evicted * * Filesystems must call this on inode eviction to free ->i_verity_info. */ diff --git a/fs/verity/signature.c b/fs/verity/signature.c index c8b255232de5..b14ed96387ec 100644 --- a/fs/verity/signature.c +++ b/fs/verity/signature.c @@ -28,6 +28,9 @@ static struct key *fsverity_keyring; /** * fsverity_verify_signature() - check a verity file's signature + * @vi: the file's fsverity_info + * @desc: the file's fsverity_descriptor + * @desc_size: size of @desc * * If the file's fs-verity descriptor includes a signature of the file * measurement, verify it against the certificates in the fs-verity keyring. diff --git a/fs/verity/verify.c b/fs/verity/verify.c index e0cb62da3864..a8b68c6f663d 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -179,6 +179,7 @@ out: /** * fsverity_verify_page() - verify a data page + * @page: the page to verity * * Verify a page that has just been read from a verity file. The page must be a * pagecache page that is still locked and not yet uptodate. @@ -206,6 +207,7 @@ EXPORT_SYMBOL_GPL(fsverity_verify_page); #ifdef CONFIG_BLOCK /** * fsverity_verify_bio() - verify a 'read' bio that has just completed + * @bio: the bio to verify * * Verify a set of pages that have just been read from a verity file. The pages * must be pagecache pages that are still locked and not yet uptodate. Pages @@ -264,6 +266,7 @@ EXPORT_SYMBOL_GPL(fsverity_verify_bio); /** * fsverity_enqueue_verify_work() - enqueue work on the fs-verity workqueue + * @work: the work to enqueue * * Enqueue verification work for asynchronous processing. */ diff --git a/fs/xattr.c b/fs/xattr.c index e13265e65871..91608d9bfc6a 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -876,6 +876,9 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, struct simple_xattr *new_xattr = NULL; int err = 0; + if (removed_size) + *removed_size = -1; + /* value == NULL means remove */ if (value) { new_xattr = simple_xattr_alloc(value, size); @@ -914,9 +917,6 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, list_add(&new_xattr->list, &xattrs->head); xattr = NULL; } - - if (removed_size) - *removed_size = -1; out: spin_unlock(&xattrs->lock); if (xattr) { diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 1da94237a8cf..f1366475c389 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -48,7 +48,7 @@ __kmem_vmalloc(size_t size, xfs_km_flags_t flags) if (flags & KM_NOFS) nofs_flag = memalloc_nofs_save(); - ptr = __vmalloc(size, lflags, PAGE_KERNEL); + ptr = __vmalloc(size, lflags); if (flags & KM_NOFS) memalloc_nofs_restore(nofs_flag); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 9d9cebf18726..1fd4fb7a607c 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -621,14 +621,11 @@ xfs_vm_readpage( return iomap_readpage(page, &xfs_read_iomap_ops); } -STATIC int -xfs_vm_readpages( - struct file *unused, - struct address_space *mapping, - struct list_head *pages, - unsigned nr_pages) +STATIC void +xfs_vm_readahead( + struct readahead_control *rac) { - return iomap_readpages(mapping, pages, nr_pages, &xfs_read_iomap_ops); + iomap_readahead(rac, &xfs_read_iomap_ops); } static int @@ -644,7 +641,7 @@ xfs_iomap_swapfile_activate( const struct address_space_operations xfs_address_space_operations = { .readpage = xfs_vm_readpage, - .readpages = xfs_vm_readpages, + .readahead = xfs_vm_readahead, .writepage = xfs_vm_writepage, .writepages = xfs_vm_writepages, .set_page_dirty = iomap_set_page_dirty, diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 9ec3eaf1c618..65538d18e64f 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -477,7 +477,7 @@ _xfs_buf_map_pages( nofs_flag = memalloc_nofs_save(); do { bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, - -1, PAGE_KERNEL); + -1); if (bp->b_addr) break; vm_unmap_aliases(); diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 25afcf55aa41..d79b821ed1c7 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -79,10 +79,9 @@ static int zonefs_readpage(struct file *unused, struct page *page) return iomap_readpage(page, &zonefs_iomap_ops); } -static int zonefs_readpages(struct file *unused, struct address_space *mapping, - struct list_head *pages, unsigned int nr_pages) +static void zonefs_readahead(struct readahead_control *rac) { - return iomap_readpages(mapping, pages, nr_pages, &zonefs_iomap_ops); + iomap_readahead(rac, &zonefs_iomap_ops); } /* @@ -129,7 +128,7 @@ static int zonefs_writepages(struct address_space *mapping, static const struct address_space_operations zonefs_file_aops = { .readpage = zonefs_readpage, - .readpages = zonefs_readpages, + .readahead = zonefs_readahead, .writepage = zonefs_writepage, .writepages = zonefs_writepages, .set_page_dirty = iomap_set_page_dirty, |