From 2724273e8fd00b512596a77ee063f49b25f36507 Mon Sep 17 00:00:00 2001 From: Rahul Lakkireddy Date: Wed, 2 May 2018 15:17:17 +0530 Subject: vmcore: add API to collect hardware dump in second kernel The sequence of actions done by device drivers to append their device specific hardware/firmware logs to /proc/vmcore are as follows: 1. During probe (before hardware is initialized), device drivers register to the vmcore module (via vmcore_add_device_dump()), with callback function, along with buffer size and log name needed for firmware/hardware log collection. 2. vmcore module allocates the buffer with requested size. It adds an Elf note and invokes the device driver's registered callback function. 3. Device driver collects all hardware/firmware logs into the buffer and returns control back to vmcore module. Ensure that the device dump buffer size is always aligned to page size so that it can be mmaped. Also, rename alloc_elfnotes_buf() to vmcore_alloc_buf() to make it more generic and reserve NT_VMCOREDD note type to indicate vmcore device dump. Suggested-by: Eric Biederman . Signed-off-by: Rahul Lakkireddy Signed-off-by: Ganesh Goudar Signed-off-by: David S. Miller --- fs/proc/Kconfig | 15 +++++++ fs/proc/vmcore.c | 135 +++++++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 141 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 1ade1206bb89..0eaeb41453f5 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -43,6 +43,21 @@ config PROC_VMCORE help Exports the dump image of crashed kernel in ELF format. +config PROC_VMCORE_DEVICE_DUMP + bool "Device Hardware/Firmware Log Collection" + depends on PROC_VMCORE + default n + help + After kernel panic, device drivers can collect the device + specific snapshot of their hardware or firmware before the + underlying devices are initialized in crash recovery kernel. + Note that the device driver must be present in the crash + recovery kernel's initramfs to collect its underlying device + snapshot. + + If you say Y here, the collected device dumps will be added + as ELF notes to /proc/vmcore. + config PROC_SYSCTL bool "Sysctl support (/proc/sys)" if EXPERT depends on PROC_FS diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index a45f0af22a60..abb3dba0fa49 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -44,6 +45,12 @@ static u64 vmcore_size; static struct proc_dir_entry *proc_vmcore; +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP +/* Device Dump list and mutex to synchronize access to list */ +static LIST_HEAD(vmcoredd_list); +static DEFINE_MUTEX(vmcoredd_mutex); +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ + /* * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error * The called function has to take care of module refcounting. @@ -302,10 +309,8 @@ static const struct vm_operations_struct vmcore_mmap_ops = { }; /** - * alloc_elfnotes_buf - allocate buffer for ELF note segment in - * vmalloc memory - * - * @notes_sz: size of buffer + * vmcore_alloc_buf - allocate buffer in vmalloc memory + * @sizez: size of buffer * * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap * the buffer to user-space by means of remap_vmalloc_range(). @@ -313,12 +318,12 @@ static const struct vm_operations_struct vmcore_mmap_ops = { * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is * disabled and there's no need to allow users to mmap the buffer. */ -static inline char *alloc_elfnotes_buf(size_t notes_sz) +static inline char *vmcore_alloc_buf(size_t size) { #ifdef CONFIG_MMU - return vmalloc_user(notes_sz); + return vmalloc_user(size); #else - return vzalloc(notes_sz); + return vzalloc(size); #endif } @@ -665,7 +670,7 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, return rc; *notes_sz = roundup(phdr_sz, PAGE_SIZE); - *notes_buf = alloc_elfnotes_buf(*notes_sz); + *notes_buf = vmcore_alloc_buf(*notes_sz); if (!*notes_buf) return -ENOMEM; @@ -851,7 +856,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, return rc; *notes_sz = roundup(phdr_sz, PAGE_SIZE); - *notes_buf = alloc_elfnotes_buf(*notes_sz); + *notes_buf = vmcore_alloc_buf(*notes_sz); if (!*notes_buf) return -ENOMEM; @@ -1145,6 +1150,115 @@ static int __init parse_crash_elf_headers(void) return 0; } +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP +/** + * vmcoredd_write_header - Write vmcore device dump header at the + * beginning of the dump's buffer. + * @buf: Output buffer where the note is written + * @data: Dump info + * @size: Size of the dump + * + * Fills beginning of the dump's buffer with vmcore device dump header. + */ +static void vmcoredd_write_header(void *buf, struct vmcoredd_data *data, + u32 size) +{ + struct vmcoredd_header *vdd_hdr = (struct vmcoredd_header *)buf; + + vdd_hdr->n_namesz = sizeof(vdd_hdr->name); + vdd_hdr->n_descsz = size + sizeof(vdd_hdr->dump_name); + vdd_hdr->n_type = NT_VMCOREDD; + + strncpy((char *)vdd_hdr->name, VMCOREDD_NOTE_NAME, + sizeof(vdd_hdr->name)); + memcpy(vdd_hdr->dump_name, data->dump_name, sizeof(vdd_hdr->dump_name)); +} + +/** + * vmcore_add_device_dump - Add a buffer containing device dump to vmcore + * @data: dump info. + * + * Allocate a buffer and invoke the calling driver's dump collect routine. + * Write Elf note at the beginning of the buffer to indicate vmcore device + * dump and add the dump to global list. + */ +int vmcore_add_device_dump(struct vmcoredd_data *data) +{ + struct vmcoredd_node *dump; + void *buf = NULL; + size_t data_size; + int ret; + + if (!data || !strlen(data->dump_name) || + !data->vmcoredd_callback || !data->size) + return -EINVAL; + + dump = vzalloc(sizeof(*dump)); + if (!dump) { + ret = -ENOMEM; + goto out_err; + } + + /* Keep size of the buffer page aligned so that it can be mmaped */ + data_size = roundup(sizeof(struct vmcoredd_header) + data->size, + PAGE_SIZE); + + /* Allocate buffer for driver's to write their dumps */ + buf = vmcore_alloc_buf(data_size); + if (!buf) { + ret = -ENOMEM; + goto out_err; + } + + vmcoredd_write_header(buf, data, data_size - + sizeof(struct vmcoredd_header)); + + /* Invoke the driver's dump collection routing */ + ret = data->vmcoredd_callback(data, buf + + sizeof(struct vmcoredd_header)); + if (ret) + goto out_err; + + dump->buf = buf; + dump->size = data_size; + + /* Add the dump to driver sysfs list */ + mutex_lock(&vmcoredd_mutex); + list_add_tail(&dump->list, &vmcoredd_list); + mutex_unlock(&vmcoredd_mutex); + + return 0; + +out_err: + if (buf) + vfree(buf); + + if (dump) + vfree(dump); + + return ret; +} +EXPORT_SYMBOL(vmcore_add_device_dump); +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ + +/* Free all dumps in vmcore device dump list */ +static void vmcore_free_device_dumps(void) +{ +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP + mutex_lock(&vmcoredd_mutex); + while (!list_empty(&vmcoredd_list)) { + struct vmcoredd_node *dump; + + dump = list_first_entry(&vmcoredd_list, struct vmcoredd_node, + list); + list_del(&dump->list); + vfree(dump->buf); + vfree(dump); + } + mutex_unlock(&vmcoredd_mutex); +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ +} + /* Init function for vmcore module. */ static int __init vmcore_init(void) { @@ -1192,4 +1306,7 @@ void vmcore_cleanup(void) kfree(m); } free_elfcorebuf(); + + /* clear vmcore device dump list */ + vmcore_free_device_dumps(); } -- cgit v1.2.3 From 7efe48df8a3df6d089d2e9a1d429c82cc5dcf2de Mon Sep 17 00:00:00 2001 From: Rahul Lakkireddy Date: Wed, 2 May 2018 15:17:18 +0530 Subject: vmcore: append device dumps to vmcore as elf notes Update read and mmap logic to append device dumps as additional notes before the other elf notes. We add device dumps before other elf notes because the other elf notes may not fill the elf notes buffer completely and we will end up with zero-filled data between the elf notes and the device dumps. Tools will then try to decode this zero-filled data as valid notes and we don't want that. Hence, adding device dumps before the other elf notes ensure that zero-filled data can be avoided. This also ensures that the device dumps and the other elf notes can be properly mmaped at page aligned address. Incorporate device dump size into the total vmcore size. Also update offsets for other program headers after the device dumps are added. Suggested-by: Eric Biederman . Signed-off-by: Rahul Lakkireddy Signed-off-by: Ganesh Goudar Signed-off-by: David S. Miller --- fs/proc/vmcore.c | 247 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 243 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index abb3dba0fa49..247c3499e5bd 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -39,6 +39,8 @@ static size_t elfcorebuf_sz_orig; static char *elfnotes_buf; static size_t elfnotes_sz; +/* Size of all notes minus the device dump notes */ +static size_t elfnotes_orig_sz; /* Total size of vmcore file. */ static u64 vmcore_size; @@ -51,6 +53,9 @@ static LIST_HEAD(vmcoredd_list); static DEFINE_MUTEX(vmcoredd_mutex); #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ +/* Device Dump Size */ +static size_t vmcoredd_orig_sz; + /* * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error * The called function has to take care of module refcounting. @@ -185,6 +190,77 @@ static int copy_to(void *target, void *src, size_t size, int userbuf) return 0; } +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP +static int vmcoredd_copy_dumps(void *dst, u64 start, size_t size, int userbuf) +{ + struct vmcoredd_node *dump; + u64 offset = 0; + int ret = 0; + size_t tsz; + char *buf; + + mutex_lock(&vmcoredd_mutex); + list_for_each_entry(dump, &vmcoredd_list, list) { + if (start < offset + dump->size) { + tsz = min(offset + (u64)dump->size - start, (u64)size); + buf = dump->buf + start - offset; + if (copy_to(dst, buf, tsz, userbuf)) { + ret = -EFAULT; + goto out_unlock; + } + + size -= tsz; + start += tsz; + dst += tsz; + + /* Leave now if buffer filled already */ + if (!size) + goto out_unlock; + } + offset += dump->size; + } + +out_unlock: + mutex_unlock(&vmcoredd_mutex); + return ret; +} + +static int vmcoredd_mmap_dumps(struct vm_area_struct *vma, unsigned long dst, + u64 start, size_t size) +{ + struct vmcoredd_node *dump; + u64 offset = 0; + int ret = 0; + size_t tsz; + char *buf; + + mutex_lock(&vmcoredd_mutex); + list_for_each_entry(dump, &vmcoredd_list, list) { + if (start < offset + dump->size) { + tsz = min(offset + (u64)dump->size - start, (u64)size); + buf = dump->buf + start - offset; + if (remap_vmalloc_range_partial(vma, dst, buf, tsz)) { + ret = -EFAULT; + goto out_unlock; + } + + size -= tsz; + start += tsz; + dst += tsz; + + /* Leave now if buffer filled already */ + if (!size) + goto out_unlock; + } + offset += dump->size; + } + +out_unlock: + mutex_unlock(&vmcoredd_mutex); + return ret; +} +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ + /* Read from the ELF header and then the crash dump. On error, negative value is * returned otherwise number of bytes read are returned. */ @@ -222,10 +298,41 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos, if (*fpos < elfcorebuf_sz + elfnotes_sz) { void *kaddr; + /* We add device dumps before other elf notes because the + * other elf notes may not fill the elf notes buffer + * completely and we will end up with zero-filled data + * between the elf notes and the device dumps. Tools will + * then try to decode this zero-filled data as valid notes + * and we don't want that. Hence, adding device dumps before + * the other elf notes ensure that zero-filled data can be + * avoided. + */ +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP + /* Read device dumps */ + if (*fpos < elfcorebuf_sz + vmcoredd_orig_sz) { + tsz = min(elfcorebuf_sz + vmcoredd_orig_sz - + (size_t)*fpos, buflen); + start = *fpos - elfcorebuf_sz; + if (vmcoredd_copy_dumps(buffer, start, tsz, userbuf)) + return -EFAULT; + + buflen -= tsz; + *fpos += tsz; + buffer += tsz; + acc += tsz; + + /* leave now if filled buffer already */ + if (!buflen) + return acc; + } +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ + + /* Read remaining elf notes */ tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen); - kaddr = elfnotes_buf + *fpos - elfcorebuf_sz; + kaddr = elfnotes_buf + *fpos - elfcorebuf_sz - vmcoredd_orig_sz; if (copy_to(buffer, kaddr, tsz, userbuf)) return -EFAULT; + buflen -= tsz; *fpos += tsz; buffer += tsz; @@ -451,11 +558,46 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) if (start < elfcorebuf_sz + elfnotes_sz) { void *kaddr; + /* We add device dumps before other elf notes because the + * other elf notes may not fill the elf notes buffer + * completely and we will end up with zero-filled data + * between the elf notes and the device dumps. Tools will + * then try to decode this zero-filled data as valid notes + * and we don't want that. Hence, adding device dumps before + * the other elf notes ensure that zero-filled data can be + * avoided. This also ensures that the device dumps and + * other elf notes can be properly mmaped at page aligned + * address. + */ +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP + /* Read device dumps */ + if (start < elfcorebuf_sz + vmcoredd_orig_sz) { + u64 start_off; + + tsz = min(elfcorebuf_sz + vmcoredd_orig_sz - + (size_t)start, size); + start_off = start - elfcorebuf_sz; + if (vmcoredd_mmap_dumps(vma, vma->vm_start + len, + start_off, tsz)) + goto fail; + + size -= tsz; + start += tsz; + len += tsz; + + /* leave now if filled buffer already */ + if (!size) + return 0; + } +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ + + /* Read remaining elf notes */ tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)start, size); - kaddr = elfnotes_buf + start - elfcorebuf_sz; + kaddr = elfnotes_buf + start - elfcorebuf_sz - vmcoredd_orig_sz; if (remap_vmalloc_range_partial(vma, vma->vm_start + len, kaddr, tsz)) goto fail; + size -= tsz; start += tsz; len += tsz; @@ -703,6 +845,11 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, /* Modify e_phnum to reflect merged headers. */ ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; + /* Store the size of all notes. We need this to update the note + * header when the device dumps will be added. + */ + elfnotes_orig_sz = phdr.p_memsz; + return 0; } @@ -889,6 +1036,11 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, /* Modify e_phnum to reflect merged headers. */ ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; + /* Store the size of all notes. We need this to update the note + * header when the device dumps will be added. + */ + elfnotes_orig_sz = phdr.p_memsz; + return 0; } @@ -981,8 +1133,8 @@ static int __init process_ptload_program_headers_elf32(char *elfptr, } /* Sets offset fields of vmcore elements. */ -static void __init set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz, - struct list_head *vc_list) +static void set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz, + struct list_head *vc_list) { loff_t vmcore_off; struct vmcore *m; @@ -1174,6 +1326,92 @@ static void vmcoredd_write_header(void *buf, struct vmcoredd_data *data, memcpy(vdd_hdr->dump_name, data->dump_name, sizeof(vdd_hdr->dump_name)); } +/** + * vmcoredd_update_program_headers - Update all Elf program headers + * @elfptr: Pointer to elf header + * @elfnotesz: Size of elf notes aligned to page size + * @vmcoreddsz: Size of device dumps to be added to elf note header + * + * Determine type of Elf header (Elf64 or Elf32) and update the elf note size. + * Also update the offsets of all the program headers after the elf note header. + */ +static void vmcoredd_update_program_headers(char *elfptr, size_t elfnotesz, + size_t vmcoreddsz) +{ + unsigned char *e_ident = (unsigned char *)elfptr; + u64 start, end, size; + loff_t vmcore_off; + u32 i; + + vmcore_off = elfcorebuf_sz + elfnotesz; + + if (e_ident[EI_CLASS] == ELFCLASS64) { + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)elfptr; + Elf64_Phdr *phdr = (Elf64_Phdr *)(elfptr + sizeof(Elf64_Ehdr)); + + /* Update all program headers */ + for (i = 0; i < ehdr->e_phnum; i++, phdr++) { + if (phdr->p_type == PT_NOTE) { + /* Update note size */ + phdr->p_memsz = elfnotes_orig_sz + vmcoreddsz; + phdr->p_filesz = phdr->p_memsz; + continue; + } + + start = rounddown(phdr->p_offset, PAGE_SIZE); + end = roundup(phdr->p_offset + phdr->p_memsz, + PAGE_SIZE); + size = end - start; + phdr->p_offset = vmcore_off + (phdr->p_offset - start); + vmcore_off += size; + } + } else { + Elf32_Ehdr *ehdr = (Elf32_Ehdr *)elfptr; + Elf32_Phdr *phdr = (Elf32_Phdr *)(elfptr + sizeof(Elf32_Ehdr)); + + /* Update all program headers */ + for (i = 0; i < ehdr->e_phnum; i++, phdr++) { + if (phdr->p_type == PT_NOTE) { + /* Update note size */ + phdr->p_memsz = elfnotes_orig_sz + vmcoreddsz; + phdr->p_filesz = phdr->p_memsz; + continue; + } + + start = rounddown(phdr->p_offset, PAGE_SIZE); + end = roundup(phdr->p_offset + phdr->p_memsz, + PAGE_SIZE); + size = end - start; + phdr->p_offset = vmcore_off + (phdr->p_offset - start); + vmcore_off += size; + } + } +} + +/** + * vmcoredd_update_size - Update the total size of the device dumps and update + * Elf header + * @dump_size: Size of the current device dump to be added to total size + * + * Update the total size of all the device dumps and update the Elf program + * headers. Calculate the new offsets for the vmcore list and update the + * total vmcore size. + */ +static void vmcoredd_update_size(size_t dump_size) +{ + vmcoredd_orig_sz += dump_size; + elfnotes_sz = roundup(elfnotes_orig_sz, PAGE_SIZE) + vmcoredd_orig_sz; + vmcoredd_update_program_headers(elfcorebuf, elfnotes_sz, + vmcoredd_orig_sz); + + /* Update vmcore list offsets */ + set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list); + + vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz, + &vmcore_list); + proc_vmcore->size = vmcore_size; +} + /** * vmcore_add_device_dump - Add a buffer containing device dump to vmcore * @data: dump info. @@ -1227,6 +1465,7 @@ int vmcore_add_device_dump(struct vmcoredd_data *data) list_add_tail(&dump->list, &vmcoredd_list); mutex_unlock(&vmcoredd_mutex); + vmcoredd_update_size(data_size); return 0; out_err: -- cgit v1.2.3 From 44c752fe584d8b9f6e0756ecffa8691677471862 Mon Sep 17 00:00:00 2001 From: Rahul Lakkireddy Date: Mon, 21 May 2018 19:07:50 +0530 Subject: vmcore: move get_vmcore_size out of __init Fix below build warning: WARNING: vmlinux.o(.text+0x422bb8): Section mismatch in reference from the function vmcore_add_device_dump() to the function .init.text:get_vmcore_size.constprop.5() The function vmcore_add_device_dump() references the function __init get_vmcore_size.constprop.5(). This is often because vmcore_add_device_dump lacks a __init annotation or the annotation of get_vmcore_size.constprop.5 is wrong. Fixes: 7efe48df8a3d ("vmcore: append device dumps to vmcore as elf notes") Signed-off-by: Rahul Lakkireddy Signed-off-by: Ganesh Goudar Signed-off-by: David S. Miller --- fs/proc/vmcore.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 247c3499e5bd..cfb6674331fd 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -649,8 +649,8 @@ static struct vmcore* __init get_new_element(void) return kzalloc(sizeof(struct vmcore), GFP_KERNEL); } -static u64 __init get_vmcore_size(size_t elfsz, size_t elfnotesegsz, - struct list_head *vc_list) +static u64 get_vmcore_size(size_t elfsz, size_t elfnotesegsz, + struct list_head *vc_list) { u64 size; struct vmcore *m; -- cgit v1.2.3 From 449325b52b7a6208f65ed67d3484fd7b7184477b Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 21 May 2018 19:22:29 -0700 Subject: umh: introduce fork_usermode_blob() helper Introduce helper: int fork_usermode_blob(void *data, size_t len, struct umh_info *info); struct umh_info { struct file *pipe_to_umh; struct file *pipe_from_umh; pid_t pid; }; that GPLed kernel modules (signed or unsigned) can use it to execute part of its own data as swappable user mode process. The kernel will do: - allocate a unique file in tmpfs - populate that file with [data, data + len] bytes - user-mode-helper code will do_execve that file and, before the process starts, the kernel will create two unix pipes for bidirectional communication between kernel module and umh - close tmpfs file, effectively deleting it - the fork_usermode_blob will return zero on success and populate 'struct umh_info' with two unix pipes and the pid of the user process As the first step in the development of the bpfilter project the fork_usermode_blob() helper is introduced to allow user mode code to be invoked from a kernel module. The idea is that user mode code plus normal kernel module code are built as part of the kernel build and installed as traditional kernel module into distro specified location, such that from a distribution point of view, there is no difference between regular kernel modules and kernel modules + umh code. Such modules can be signed, modprobed, rmmod, etc. The use of this new helper by a kernel module doesn't make it any special from kernel and user space tooling point of view. Such approach enables kernel to delegate functionality traditionally done by the kernel modules into the user space processes (either root or !root) and reduces security attack surface of the new code. The buggy umh code would crash the user process, but not the kernel. Another advantage is that umh code of the kernel module can be debugged and tested out of user space (e.g. opening the possibility to run clang sanitizers, fuzzers or user space test suites on the umh code). In case of the bpfilter project such architecture allows complex control plane to be done in the user space while bpf based data plane stays in the kernel. Since umh can crash, can be oom-ed by the kernel, killed by the admin, the kernel module that uses them (like bpfilter) needs to manage life time of umh on its own via two unix pipes and the pid of umh. The exit code of such kernel module should kill the umh it started, so that rmmod of the kernel module will cleanup the corresponding umh. Just like if the kernel module does kmalloc() it should kfree() it in the exit code. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- fs/exec.c | 38 +++++++++++---- include/linux/binfmts.h | 1 + include/linux/umh.h | 12 +++++ kernel/umh.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 164 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 183059c427b9..30a36c2a39bf 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1706,14 +1706,13 @@ static int exec_binprm(struct linux_binprm *bprm) /* * sys_execve() executes a new program. */ -static int do_execveat_common(int fd, struct filename *filename, - struct user_arg_ptr argv, - struct user_arg_ptr envp, - int flags) +static int __do_execve_file(int fd, struct filename *filename, + struct user_arg_ptr argv, + struct user_arg_ptr envp, + int flags, struct file *file) { char *pathbuf = NULL; struct linux_binprm *bprm; - struct file *file; struct files_struct *displaced; int retval; @@ -1752,7 +1751,8 @@ static int do_execveat_common(int fd, struct filename *filename, check_unsafe_exec(bprm); current->in_execve = 1; - file = do_open_execat(fd, filename, flags); + if (!file) + file = do_open_execat(fd, filename, flags); retval = PTR_ERR(file); if (IS_ERR(file)) goto out_unmark; @@ -1760,7 +1760,9 @@ static int do_execveat_common(int fd, struct filename *filename, sched_exec(); bprm->file = file; - if (fd == AT_FDCWD || filename->name[0] == '/') { + if (!filename) { + bprm->filename = "none"; + } else if (fd == AT_FDCWD || filename->name[0] == '/') { bprm->filename = filename->name; } else { if (filename->name[0] == '\0') @@ -1826,7 +1828,8 @@ static int do_execveat_common(int fd, struct filename *filename, task_numa_free(current); free_bprm(bprm); kfree(pathbuf); - putname(filename); + if (filename) + putname(filename); if (displaced) put_files_struct(displaced); return retval; @@ -1849,10 +1852,27 @@ out_files: if (displaced) reset_files_struct(displaced); out_ret: - putname(filename); + if (filename) + putname(filename); return retval; } +static int do_execveat_common(int fd, struct filename *filename, + struct user_arg_ptr argv, + struct user_arg_ptr envp, + int flags) +{ + return __do_execve_file(fd, filename, argv, envp, flags, NULL); +} + +int do_execve_file(struct file *file, void *__argv, void *__envp) +{ + struct user_arg_ptr argv = { .ptr.native = __argv }; + struct user_arg_ptr envp = { .ptr.native = __envp }; + + return __do_execve_file(AT_FDCWD, NULL, argv, envp, 0, file); +} + int do_execve(struct filename *filename, const char __user *const __user *__argv, const char __user *const __user *__envp) diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 4955e0863b83..c05f24fac4f6 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -150,5 +150,6 @@ extern int do_execveat(int, struct filename *, const char __user * const __user *, const char __user * const __user *, int); +int do_execve_file(struct file *file, void *__argv, void *__envp); #endif /* _LINUX_BINFMTS_H */ diff --git a/include/linux/umh.h b/include/linux/umh.h index 244aff638220..5c812acbb80a 100644 --- a/include/linux/umh.h +++ b/include/linux/umh.h @@ -22,8 +22,10 @@ struct subprocess_info { const char *path; char **argv; char **envp; + struct file *file; int wait; int retval; + pid_t pid; int (*init)(struct subprocess_info *info, struct cred *new); void (*cleanup)(struct subprocess_info *info); void *data; @@ -38,6 +40,16 @@ call_usermodehelper_setup(const char *path, char **argv, char **envp, int (*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *), void *data); +struct subprocess_info *call_usermodehelper_setup_file(struct file *file, + int (*init)(struct subprocess_info *info, struct cred *new), + void (*cleanup)(struct subprocess_info *), void *data); +struct umh_info { + struct file *pipe_to_umh; + struct file *pipe_from_umh; + pid_t pid; +}; +int fork_usermode_blob(void *data, size_t len, struct umh_info *info); + extern int call_usermodehelper_exec(struct subprocess_info *info, int wait); diff --git a/kernel/umh.c b/kernel/umh.c index f76b3ff876cf..30db93fd7e39 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include @@ -97,9 +99,13 @@ static int call_usermodehelper_exec_async(void *data) commit_creds(new); - retval = do_execve(getname_kernel(sub_info->path), - (const char __user *const __user *)sub_info->argv, - (const char __user *const __user *)sub_info->envp); + if (sub_info->file) + retval = do_execve_file(sub_info->file, + sub_info->argv, sub_info->envp); + else + retval = do_execve(getname_kernel(sub_info->path), + (const char __user *const __user *)sub_info->argv, + (const char __user *const __user *)sub_info->envp); out: sub_info->retval = retval; /* @@ -185,6 +191,8 @@ static void call_usermodehelper_exec_work(struct work_struct *work) if (pid < 0) { sub_info->retval = pid; umh_complete(sub_info); + } else { + sub_info->pid = pid; } } } @@ -393,6 +401,117 @@ struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv, } EXPORT_SYMBOL(call_usermodehelper_setup); +struct subprocess_info *call_usermodehelper_setup_file(struct file *file, + int (*init)(struct subprocess_info *info, struct cred *new), + void (*cleanup)(struct subprocess_info *info), void *data) +{ + struct subprocess_info *sub_info; + + sub_info = kzalloc(sizeof(struct subprocess_info), GFP_KERNEL); + if (!sub_info) + return NULL; + + INIT_WORK(&sub_info->work, call_usermodehelper_exec_work); + sub_info->path = "none"; + sub_info->file = file; + sub_info->init = init; + sub_info->cleanup = cleanup; + sub_info->data = data; + return sub_info; +} + +static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) +{ + struct umh_info *umh_info = info->data; + struct file *from_umh[2]; + struct file *to_umh[2]; + int err; + + /* create pipe to send data to umh */ + err = create_pipe_files(to_umh, 0); + if (err) + return err; + err = replace_fd(0, to_umh[0], 0); + fput(to_umh[0]); + if (err < 0) { + fput(to_umh[1]); + return err; + } + + /* create pipe to receive data from umh */ + err = create_pipe_files(from_umh, 0); + if (err) { + fput(to_umh[1]); + replace_fd(0, NULL, 0); + return err; + } + err = replace_fd(1, from_umh[1], 0); + fput(from_umh[1]); + if (err < 0) { + fput(to_umh[1]); + replace_fd(0, NULL, 0); + fput(from_umh[0]); + return err; + } + + umh_info->pipe_to_umh = to_umh[1]; + umh_info->pipe_from_umh = from_umh[0]; + return 0; +} + +static void umh_save_pid(struct subprocess_info *info) +{ + struct umh_info *umh_info = info->data; + + umh_info->pid = info->pid; +} + +/** + * fork_usermode_blob - fork a blob of bytes as a usermode process + * @data: a blob of bytes that can be do_execv-ed as a file + * @len: length of the blob + * @info: information about usermode process (shouldn't be NULL) + * + * Returns either negative error or zero which indicates success + * in executing a blob of bytes as a usermode process. In such + * case 'struct umh_info *info' is populated with two pipes + * and a pid of the process. The caller is responsible for health + * check of the user process, killing it via pid, and closing the + * pipes when user process is no longer needed. + */ +int fork_usermode_blob(void *data, size_t len, struct umh_info *info) +{ + struct subprocess_info *sub_info; + struct file *file; + ssize_t written; + loff_t pos = 0; + int err; + + file = shmem_kernel_file_setup("", len, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + + written = kernel_write(file, data, len, &pos); + if (written != len) { + err = written; + if (err >= 0) + err = -ENOMEM; + goto out; + } + + err = -ENOMEM; + sub_info = call_usermodehelper_setup_file(file, umh_pipe_setup, + umh_save_pid, info); + if (!sub_info) + goto out; + + err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); +out: + fput(file); + return err; +} +EXPORT_SYMBOL_GPL(fork_usermode_blob); + /** * call_usermodehelper_exec - start a usermode application * @sub_info: information about the subprocessa -- cgit v1.2.3 From 1a025028d400b23477341aa7ec2ce55f8b39b554 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sun, 3 Jun 2018 02:17:39 +0100 Subject: rxrpc: Fix handling of call quietly cancelled out on server Sometimes an in-progress call will stop responding on the fileserver when the fileserver quietly cancels the call with an internally marked abort (RX_CALL_DEAD), without sending an ABORT to the client. This causes the client's call to eventually expire from lack of incoming packets directed its way, which currently leads to it being cancelled locally with ETIME. Note that it's not currently clear as to why this happens as it's really hard to reproduce. The rotation policy implement by kAFS, however, doesn't differentiate between ETIME meaning we didn't get any response from the server and ETIME meaning the call got cancelled mid-flow. The latter leads to an oops when fetching data as the rotation partially resets the afs_read descriptor, which can result in a cleared page pointer being dereferenced because that page has already been filled. Handle this by the following means: (1) Set a flag on a call when we receive a packet for it. (2) Store the highest packet serial number so far received for a call (bearing in mind this may wrap). (3) If, when the "not received anything recently" timeout expires on a call, we've received at least one packet for a call and the connection as a whole has received packets more recently than that call, then cancel the call locally with ECONNRESET rather than ETIME. This indicates that the call was definitely in progress on the server. (4) In kAFS, if the rotation algorithm sees ECONNRESET rather than ETIME, don't try the next server, but rather abort the call. This avoids the oops as we don't try to reuse the afs_read struct. Rather, as-yet ungotten pages will be reread at a later data. Also: (5) Add an rxrpc tracepoint to log detection of the call being reset. Without this, I occasionally see an oops like the following: general protection fault: 0000 [#1] SMP PTI ... RIP: 0010:_copy_to_iter+0x204/0x310 RSP: 0018:ffff8800cae0f828 EFLAGS: 00010206 RAX: 0000000000000560 RBX: 0000000000000560 RCX: 0000000000000560 RDX: ffff8800cae0f968 RSI: ffff8800d58b3312 RDI: 0005080000000000 RBP: ffff8800cae0f968 R08: 0000000000000560 R09: ffff8800ca00f400 R10: ffff8800c36f28d4 R11: 00000000000008c4 R12: ffff8800cae0f958 R13: 0000000000000560 R14: ffff8800d58b3312 R15: 0000000000000560 FS: 00007fdaef108080(0000) GS:ffff8800ca680000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fb28a8fa000 CR3: 00000000d2a76002 CR4: 00000000001606e0 Call Trace: skb_copy_datagram_iter+0x14e/0x289 rxrpc_recvmsg_data.isra.0+0x6f3/0xf68 ? trace_buffer_unlock_commit_regs+0x4f/0x89 rxrpc_kernel_recv_data+0x149/0x421 afs_extract_data+0x1e0/0x798 ? afs_wait_for_call_to_complete+0xc9/0x52e afs_deliver_fs_fetch_data+0x33a/0x5ab afs_deliver_to_call+0x1ee/0x5e0 ? afs_wait_for_call_to_complete+0xc9/0x52e afs_wait_for_call_to_complete+0x12b/0x52e ? wake_up_q+0x54/0x54 afs_make_call+0x287/0x462 ? afs_fs_fetch_data+0x3e6/0x3ed ? rcu_read_lock_sched_held+0x5d/0x63 afs_fs_fetch_data+0x3e6/0x3ed afs_fetch_data+0xbb/0x14a afs_readpages+0x317/0x40d __do_page_cache_readahead+0x203/0x2ba ? ondemand_readahead+0x3a7/0x3c1 ondemand_readahead+0x3a7/0x3c1 generic_file_buffered_read+0x18b/0x62f __vfs_read+0xdb/0xfe vfs_read+0xb2/0x137 ksys_read+0x50/0x8c do_syscall_64+0x7d/0x1a0 entry_SYSCALL_64_after_hwframe+0x49/0xbe Note the weird value in RDI which is a result of trying to kmap() a NULL page pointer. Signed-off-by: David Howells Signed-off-by: David S. Miller --- fs/afs/rotate.c | 4 ++++ include/trace/events/rxrpc.h | 32 ++++++++++++++++++++++++++++++++ net/rxrpc/ar-internal.h | 2 ++ net/rxrpc/call_event.c | 8 +++++++- net/rxrpc/input.c | 10 ++++++++-- 5 files changed, 53 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index e065bc0768e6..1faef56b12bd 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -310,6 +310,10 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) case -ETIME: _debug("no conn"); goto iterate_address; + + case -ECONNRESET: + _debug("call reset"); + goto failed; } restart_from_beginning: diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 077e664ac9a2..4fff00e9da8a 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -1459,6 +1459,38 @@ TRACE_EVENT(rxrpc_tx_fail, __print_symbolic(__entry->what, rxrpc_tx_fail_traces)) ); +TRACE_EVENT(rxrpc_call_reset, + TP_PROTO(struct rxrpc_call *call), + + TP_ARGS(call), + + TP_STRUCT__entry( + __field(unsigned int, debug_id ) + __field(u32, cid ) + __field(u32, call_id ) + __field(rxrpc_serial_t, call_serial ) + __field(rxrpc_serial_t, conn_serial ) + __field(rxrpc_seq_t, tx_seq ) + __field(rxrpc_seq_t, rx_seq ) + ), + + TP_fast_assign( + __entry->debug_id = call->debug_id; + __entry->cid = call->cid; + __entry->call_id = call->call_id; + __entry->call_serial = call->rx_serial; + __entry->conn_serial = call->conn->hi_serial; + __entry->tx_seq = call->tx_hard_ack; + __entry->rx_seq = call->ackr_seen; + ), + + TP_printk("c=%08x %08x:%08x r=%08x/%08x tx=%08x rx=%08x", + __entry->debug_id, + __entry->cid, __entry->call_id, + __entry->call_serial, __entry->conn_serial, + __entry->tx_seq, __entry->rx_seq) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 19975d2ca9a2..b2393113a251 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -477,6 +477,7 @@ enum rxrpc_call_flag { RXRPC_CALL_PINGING, /* Ping in process */ RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */ RXRPC_CALL_BEGAN_RX_TIMER, /* We began the expect_rx_by timer */ + RXRPC_CALL_RX_HEARD, /* The peer responded at least once to this call */ }; /* @@ -624,6 +625,7 @@ struct rxrpc_call { */ rxrpc_seq_t rx_top; /* Highest Rx slot allocated. */ rxrpc_seq_t rx_expect_next; /* Expected next packet sequence number */ + rxrpc_serial_t rx_serial; /* Highest serial received for this call */ u8 rx_winsize; /* Size of Rx window */ u8 tx_winsize; /* Maximum size of Tx window */ bool tx_phase; /* T if transmission phase, F if receive phase */ diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 6e0d788b4dc4..20210418904b 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -392,7 +392,13 @@ recheck_state: /* Process events */ if (test_and_clear_bit(RXRPC_CALL_EV_EXPIRED, &call->events)) { - rxrpc_abort_call("EXP", call, 0, RX_USER_ABORT, -ETIME); + if (test_bit(RXRPC_CALL_RX_HEARD, &call->flags) && + (int)call->conn->hi_serial - (int)call->rx_serial > 0) { + trace_rxrpc_call_reset(call); + rxrpc_abort_call("EXP", call, 0, RX_USER_ABORT, -ECONNRESET); + } else { + rxrpc_abort_call("EXP", call, 0, RX_USER_ABORT, -ETIME); + } set_bit(RXRPC_CALL_EV_ABORT, &call->events); goto recheck_state; } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index b5fd6381313d..608d078a4981 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1278,8 +1278,14 @@ void rxrpc_data_ready(struct sock *udp_sk) call = NULL; } - if (call && sp->hdr.serviceId != call->service_id) - call->service_id = sp->hdr.serviceId; + if (call) { + if (sp->hdr.serviceId != call->service_id) + call->service_id = sp->hdr.serviceId; + if ((int)sp->hdr.serial - (int)call->rx_serial > 0) + call->rx_serial = sp->hdr.serial; + if (!test_bit(RXRPC_CALL_RX_HEARD, &call->flags)) + set_bit(RXRPC_CALL_RX_HEARD, &call->flags); + } } else { skew = 0; call = NULL; -- cgit v1.2.3