From 65e68edce0db433aa0c2b26d7dc14fbbbeb89fbb Mon Sep 17 00:00:00 2001 From: Marta Rybczynska Date: Tue, 24 Sep 2019 15:14:52 +0200 Subject: nvme: allow 64-bit results in passthru commands It is not possible to get 64-bit results from the passthru commands, what prevents from getting for the Capabilities (CAP) property value. As a result, it is not possible to implement IOL's NVMe Conformance test 4.3 Case 1 for Fabrics targets [1] (page 123). This issue has been already discussed [2], but without a solution. This patch solves the problem by adding new ioctls with a new passthru structure, including 64-bit results. The older ioctls stay unchanged. [1] https://www.iol.unh.edu/sites/default/files/testsuites/nvme/UNH-IOL_NVMe_Conformance_Test_Suite_v11.0.pdf [2] http://lists.infradead.org/pipermail/linux-nvme/2018-June/018791.html Signed-off-by: Marta Rybczynska Reviewed-by: Keith Busch Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- include/uapi/linux/nvme_ioctl.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/nvme_ioctl.h b/include/uapi/linux/nvme_ioctl.h index 1c215ea1798e..e168dc59e9a0 100644 --- a/include/uapi/linux/nvme_ioctl.h +++ b/include/uapi/linux/nvme_ioctl.h @@ -45,6 +45,27 @@ struct nvme_passthru_cmd { __u32 result; }; +struct nvme_passthru_cmd64 { + __u8 opcode; + __u8 flags; + __u16 rsvd1; + __u32 nsid; + __u32 cdw2; + __u32 cdw3; + __u64 metadata; + __u64 addr; + __u32 metadata_len; + __u32 data_len; + __u32 cdw10; + __u32 cdw11; + __u32 cdw12; + __u32 cdw13; + __u32 cdw14; + __u32 cdw15; + __u32 timeout_ms; + __u64 result; +}; + #define nvme_admin_cmd nvme_passthru_cmd #define NVME_IOCTL_ID _IO('N', 0x40) @@ -54,5 +75,7 @@ struct nvme_passthru_cmd { #define NVME_IOCTL_RESET _IO('N', 0x44) #define NVME_IOCTL_SUBSYS_RESET _IO('N', 0x45) #define NVME_IOCTL_RESCAN _IO('N', 0x46) +#define NVME_IOCTL_ADMIN64_CMD _IOWR('N', 0x47, struct nvme_passthru_cmd64) +#define NVME_IOCTL_IO64_CMD _IOWR('N', 0x48, struct nvme_passthru_cmd64) #endif /* _UAPI_LINUX_NVME_IOCTL_H */ -- cgit v1.2.3 From 833b45de69a6016c4b0cebe6765d526a31a81580 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 30 Sep 2019 18:48:44 +0200 Subject: kvm: x86, powerpc: do not allow clearing largepages debugfs entry The largepages debugfs entry is incremented/decremented as shadow pages are created or destroyed. Clearing it will result in an underflow, which is harmless to KVM but ugly (and could be misinterpreted by tools that use debugfs information), so make this particular statistic read-only. Cc: kvm-ppc@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/book3s.c | 8 ++++---- arch/x86/kvm/x86.c | 6 +++--- include/linux/kvm_host.h | 2 ++ virt/kvm/kvm_main.c | 10 +++++++--- 4 files changed, 16 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index d7fcdfa7fee4..ec2547cc5ecb 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -36,8 +36,8 @@ #include "book3s.h" #include "trace.h" -#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM -#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU +#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ +#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ /* #define EXIT_DEBUG */ @@ -69,8 +69,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "pthru_all", VCPU_STAT(pthru_all) }, { "pthru_host", VCPU_STAT(pthru_host) }, { "pthru_bad_aff", VCPU_STAT(pthru_bad_aff) }, - { "largepages_2M", VM_STAT(num_2M_pages) }, - { "largepages_1G", VM_STAT(num_1G_pages) }, + { "largepages_2M", VM_STAT(num_2M_pages, .mode = 0444) }, + { "largepages_1G", VM_STAT(num_1G_pages, .mode = 0444) }, { NULL } }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 180c7e88577a..8072acaaf028 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -92,8 +92,8 @@ u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #endif -#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM -#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU +#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ +#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) @@ -212,7 +212,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, { "mmu_unsync", VM_STAT(mmu_unsync) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, - { "largepages", VM_STAT(lpages) }, + { "largepages", VM_STAT(lpages, .mode = 0444) }, { "max_mmu_page_hash_collisions", VM_STAT(max_mmu_page_hash_collisions) }, { NULL } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index fcb46b3374c6..719fc3e15ea4 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1090,6 +1090,7 @@ enum kvm_stat_kind { struct kvm_stat_data { int offset; + int mode; struct kvm *kvm; }; @@ -1097,6 +1098,7 @@ struct kvm_stats_debugfs_item { const char *name; int offset; enum kvm_stat_kind kind; + int mode; }; extern struct kvm_stats_debugfs_item debugfs_entries[]; extern struct dentry *kvm_debugfs_dir; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e6de3159e682..fd68fbe0a75d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -617,8 +617,9 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) stat_data->kvm = kvm; stat_data->offset = p->offset; + stat_data->mode = p->mode ? p->mode : 0644; kvm->debugfs_stat_data[p - debugfs_entries] = stat_data; - debugfs_create_file(p->name, 0644, kvm->debugfs_dentry, + debugfs_create_file(p->name, stat_data->mode, kvm->debugfs_dentry, stat_data, stat_fops_per_vm[p->kind]); } return 0; @@ -3929,7 +3930,9 @@ static int kvm_debugfs_open(struct inode *inode, struct file *file, if (!refcount_inc_not_zero(&stat_data->kvm->users_count)) return -ENOENT; - if (simple_attr_open(inode, file, get, set, fmt)) { + if (simple_attr_open(inode, file, get, + stat_data->mode & S_IWUGO ? set : NULL, + fmt)) { kvm_put_kvm(stat_data->kvm); return -ENOMEM; } @@ -4177,7 +4180,8 @@ static void kvm_init_debug(void) kvm_debugfs_num_entries = 0; for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) { - debugfs_create_file(p->name, 0644, kvm_debugfs_dir, + int mode = p->mode ? p->mode : 0644; + debugfs_create_file(p->name, mode, kvm_debugfs_dir, (void *)(long)p->offset, stat_fops[p->kind]); } -- cgit v1.2.3 From 61129dd29f7962f278b618a2a3e8fdb986a66dc8 Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Tue, 17 Sep 2019 09:18:53 +0200 Subject: sched: Add __ASSEMBLY__ guards around struct clone_args The addition of struct clone_args to uapi/linux/sched.h is not protected by __ASSEMBLY__ guards, causing a failure to build from source for glibc on RISC-V. Add the guards to fix this. Fixes: 7f192e3cd316 ("fork: add clone3") Signed-off-by: Seth Forshee Cc: Acked-by: Ingo Molnar Link: https://lore.kernel.org/r/20190917071853.12385-1-seth.forshee@canonical.com Signed-off-by: Christian Brauner --- include/uapi/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index b3105ac1381a..851ff1feadd5 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -33,6 +33,7 @@ #define CLONE_NEWNET 0x40000000 /* New network namespace */ #define CLONE_IO 0x80000000 /* Clone io context */ +#ifndef __ASSEMBLY__ /* * Arguments for the clone3 syscall */ @@ -46,6 +47,7 @@ struct clone_args { __aligned_u64 stack_size; __aligned_u64 tls; }; +#endif /* * Scheduling policies -- cgit v1.2.3 From f5a1a536fa14895ccff4e94e6a5af90901ce86aa Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 1 Oct 2019 11:10:52 +1000 Subject: lib: introduce copy_struct_from_user() helper A common pattern for syscall extensions is increasing the size of a struct passed from userspace, such that the zero-value of the new fields result in the old kernel behaviour (allowing for a mix of userspace and kernel vintages to operate on one another in most cases). While this interface exists for communication in both directions, only one interface is straightforward to have reasonable semantics for (userspace passing a struct to the kernel). For kernel returns to userspace, what the correct semantics are (whether there should be an error if userspace is unaware of a new extension) is very syscall-dependent and thus probably cannot be unified between syscalls (a good example of this problem is [1]). Previously there was no common lib/ function that implemented the necessary extension-checking semantics (and different syscalls implemented them slightly differently or incompletely[2]). Future patches replace common uses of this pattern to make use of copy_struct_from_user(). Some in-kernel selftests that insure that the handling of alignment and various byte patterns are all handled identically to memchr_inv() usage. [1]: commit 1251201c0d34 ("sched/core: Fix uclamp ABI bug, clean up and robustify sched_read_attr() ABI logic and code") [2]: For instance {sched_setattr,perf_event_open,clone3}(2) all do do similar checks to copy_struct_from_user() while rt_sigprocmask(2) always rejects differently-sized struct arguments. Suggested-by: Rasmus Villemoes Signed-off-by: Aleksa Sarai Reviewed-by: Kees Cook Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20191001011055.19283-2-cyphar@cyphar.com Signed-off-by: Christian Brauner --- include/linux/bitops.h | 7 +++ include/linux/uaccess.h | 70 +++++++++++++++++++++++++ lib/strnlen_user.c | 8 +-- lib/test_user_copy.c | 136 +++++++++++++++++++++++++++++++++++++++++++++--- lib/usercopy.c | 55 ++++++++++++++++++++ 5 files changed, 263 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index cf074bce3eb3..c94a9ff9f082 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -4,6 +4,13 @@ #include #include +/* Set bits in the first 'n' bytes when loaded from memory */ +#ifdef __LITTLE_ENDIAN +# define aligned_byte_mask(n) ((1UL << 8*(n))-1) +#else +# define aligned_byte_mask(n) (~0xffUL << (BITS_PER_LONG - 8 - 8*(n))) +#endif + #define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE) #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(long)) diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 70bbdc38dc37..e47d0522a1f4 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -231,6 +231,76 @@ __copy_from_user_inatomic_nocache(void *to, const void __user *from, #endif /* ARCH_HAS_NOCACHE_UACCESS */ +extern __must_check int check_zeroed_user(const void __user *from, size_t size); + +/** + * copy_struct_from_user: copy a struct from userspace + * @dst: Destination address, in kernel space. This buffer must be @ksize + * bytes long. + * @ksize: Size of @dst struct. + * @src: Source address, in userspace. + * @usize: (Alleged) size of @src struct. + * + * Copies a struct from userspace to kernel space, in a way that guarantees + * backwards-compatibility for struct syscall arguments (as long as future + * struct extensions are made such that all new fields are *appended* to the + * old struct, and zeroed-out new fields have the same meaning as the old + * struct). + * + * @ksize is just sizeof(*dst), and @usize should've been passed by userspace. + * The recommended usage is something like the following: + * + * SYSCALL_DEFINE2(foobar, const struct foo __user *, uarg, size_t, usize) + * { + * int err; + * struct foo karg = {}; + * + * if (usize > PAGE_SIZE) + * return -E2BIG; + * if (usize < FOO_SIZE_VER0) + * return -EINVAL; + * + * err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize); + * if (err) + * return err; + * + * // ... + * } + * + * There are three cases to consider: + * * If @usize == @ksize, then it's copied verbatim. + * * If @usize < @ksize, then the userspace has passed an old struct to a + * newer kernel. The rest of the trailing bytes in @dst (@ksize - @usize) + * are to be zero-filled. + * * If @usize > @ksize, then the userspace has passed a new struct to an + * older kernel. The trailing bytes unknown to the kernel (@usize - @ksize) + * are checked to ensure they are zeroed, otherwise -E2BIG is returned. + * + * Returns (in all cases, some data may have been copied): + * * -E2BIG: (@usize > @ksize) and there are non-zero trailing bytes in @src. + * * -EFAULT: access to userspace failed. + */ +static __always_inline __must_check int +copy_struct_from_user(void *dst, size_t ksize, const void __user *src, + size_t usize) +{ + size_t size = min(ksize, usize); + size_t rest = max(ksize, usize) - size; + + /* Deal with trailing bytes. */ + if (usize < ksize) { + memset(dst + size, 0, rest); + } else if (usize > ksize) { + int ret = check_zeroed_user(src + size, rest); + if (ret <= 0) + return ret ?: -E2BIG; + } + /* Copy the interoperable parts of the struct. */ + if (copy_from_user(dst, src, size)) + return -EFAULT; + return 0; +} + /* * probe_kernel_read(): safely attempt to read from a location * @dst: pointer to the buffer that shall take the data diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c index 28ff554a1be8..6c0005d5dd5c 100644 --- a/lib/strnlen_user.c +++ b/lib/strnlen_user.c @@ -3,16 +3,10 @@ #include #include #include +#include #include -/* Set bits in the first 'n' bytes when loaded from memory */ -#ifdef __LITTLE_ENDIAN -# define aligned_byte_mask(n) ((1ul << 8*(n))-1) -#else -# define aligned_byte_mask(n) (~0xfful << (BITS_PER_LONG - 8 - 8*(n))) -#endif - /* * Do a strnlen, return length of string *with* final '\0'. * 'count' is the user-supplied count, while 'max' is the diff --git a/lib/test_user_copy.c b/lib/test_user_copy.c index 67bcd5dfd847..950ee88cd6ac 100644 --- a/lib/test_user_copy.c +++ b/lib/test_user_copy.c @@ -31,14 +31,133 @@ # define TEST_U64 #endif -#define test(condition, msg) \ -({ \ - int cond = (condition); \ - if (cond) \ - pr_warn("%s\n", msg); \ - cond; \ +#define test(condition, msg, ...) \ +({ \ + int cond = (condition); \ + if (cond) \ + pr_warn("[%d] " msg "\n", __LINE__, ##__VA_ARGS__); \ + cond; \ }) +static bool is_zeroed(void *from, size_t size) +{ + return memchr_inv(from, 0x0, size) == NULL; +} + +static int test_check_nonzero_user(char *kmem, char __user *umem, size_t size) +{ + int ret = 0; + size_t start, end, i; + size_t zero_start = size / 4; + size_t zero_end = size - zero_start; + + /* + * We conduct a series of check_nonzero_user() tests on a block of memory + * with the following byte-pattern (trying every possible [start,end] + * pair): + * + * [ 00 ff 00 ff ... 00 00 00 00 ... ff 00 ff 00 ] + * + * And we verify that check_nonzero_user() acts identically to memchr_inv(). + */ + + memset(kmem, 0x0, size); + for (i = 1; i < zero_start; i += 2) + kmem[i] = 0xff; + for (i = zero_end; i < size; i += 2) + kmem[i] = 0xff; + + ret |= test(copy_to_user(umem, kmem, size), + "legitimate copy_to_user failed"); + + for (start = 0; start <= size; start++) { + for (end = start; end <= size; end++) { + size_t len = end - start; + int retval = check_zeroed_user(umem + start, len); + int expected = is_zeroed(kmem + start, len); + + ret |= test(retval != expected, + "check_nonzero_user(=%d) != memchr_inv(=%d) mismatch (start=%zu, end=%zu)", + retval, expected, start, end); + } + } + + return ret; +} + +static int test_copy_struct_from_user(char *kmem, char __user *umem, + size_t size) +{ + int ret = 0; + char *umem_src = NULL, *expected = NULL; + size_t ksize, usize; + + umem_src = kmalloc(size, GFP_KERNEL); + if (ret |= test(umem_src == NULL, "kmalloc failed")) + goto out_free; + + expected = kmalloc(size, GFP_KERNEL); + if (ret |= test(expected == NULL, "kmalloc failed")) + goto out_free; + + /* Fill umem with a fixed byte pattern. */ + memset(umem_src, 0x3e, size); + ret |= test(copy_to_user(umem, umem_src, size), + "legitimate copy_to_user failed"); + + /* Check basic case -- (usize == ksize). */ + ksize = size; + usize = size; + + memcpy(expected, umem_src, ksize); + + memset(kmem, 0x0, size); + ret |= test(copy_struct_from_user(kmem, ksize, umem, usize), + "copy_struct_from_user(usize == ksize) failed"); + ret |= test(memcmp(kmem, expected, ksize), + "copy_struct_from_user(usize == ksize) gives unexpected copy"); + + /* Old userspace case -- (usize < ksize). */ + ksize = size; + usize = size / 2; + + memcpy(expected, umem_src, usize); + memset(expected + usize, 0x0, ksize - usize); + + memset(kmem, 0x0, size); + ret |= test(copy_struct_from_user(kmem, ksize, umem, usize), + "copy_struct_from_user(usize < ksize) failed"); + ret |= test(memcmp(kmem, expected, ksize), + "copy_struct_from_user(usize < ksize) gives unexpected copy"); + + /* New userspace (-E2BIG) case -- (usize > ksize). */ + ksize = size / 2; + usize = size; + + memset(kmem, 0x0, size); + ret |= test(copy_struct_from_user(kmem, ksize, umem, usize) != -E2BIG, + "copy_struct_from_user(usize > ksize) didn't give E2BIG"); + + /* New userspace (success) case -- (usize > ksize). */ + ksize = size / 2; + usize = size; + + memcpy(expected, umem_src, ksize); + ret |= test(clear_user(umem + ksize, usize - ksize), + "legitimate clear_user failed"); + + memset(kmem, 0x0, size); + ret |= test(copy_struct_from_user(kmem, ksize, umem, usize), + "copy_struct_from_user(usize > ksize) failed"); + ret |= test(memcmp(kmem, expected, ksize), + "copy_struct_from_user(usize > ksize) gives unexpected copy"); + +out_free: + kfree(expected); + kfree(umem_src); + return ret; +} + static int __init test_user_copy_init(void) { int ret = 0; @@ -106,6 +225,11 @@ static int __init test_user_copy_init(void) #endif #undef test_legit + /* Test usage of check_nonzero_user(). */ + ret |= test_check_nonzero_user(kmem, usermem, 2 * PAGE_SIZE); + /* Test usage of copy_struct_from_user(). */ + ret |= test_copy_struct_from_user(kmem, usermem, 2 * PAGE_SIZE); + /* * Invalid usage: none of these copies should succeed. */ diff --git a/lib/usercopy.c b/lib/usercopy.c index c2bfbcaeb3dc..cbb4d9ec00f2 100644 --- a/lib/usercopy.c +++ b/lib/usercopy.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include /* out-of-line parts */ @@ -31,3 +32,57 @@ unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n) } EXPORT_SYMBOL(_copy_to_user); #endif + +/** + * check_zeroed_user: check if a userspace buffer only contains zero bytes + * @from: Source address, in userspace. + * @size: Size of buffer. + * + * This is effectively shorthand for "memchr_inv(from, 0, size) == NULL" for + * userspace addresses (and is more efficient because we don't care where the + * first non-zero byte is). + * + * Returns: + * * 0: There were non-zero bytes present in the buffer. + * * 1: The buffer was full of zero bytes. + * * -EFAULT: access to userspace failed. + */ +int check_zeroed_user(const void __user *from, size_t size) +{ + unsigned long val; + uintptr_t align = (uintptr_t) from % sizeof(unsigned long); + + if (unlikely(size == 0)) + return 1; + + from -= align; + size += align; + + if (!user_access_begin(from, size)) + return -EFAULT; + + unsafe_get_user(val, (unsigned long __user *) from, err_fault); + if (align) + val &= ~aligned_byte_mask(align); + + while (size > sizeof(unsigned long)) { + if (unlikely(val)) + goto done; + + from += sizeof(unsigned long); + size -= sizeof(unsigned long); + + unsafe_get_user(val, (unsigned long __user *) from, err_fault); + } + + if (size < sizeof(unsigned long)) + val &= aligned_byte_mask(size); + +done: + user_access_end(); + return (val == 0); +err_fault: + user_access_end(); + return -EFAULT; +} +EXPORT_SYMBOL(check_zeroed_user); -- cgit v1.2.3 From f14c234b4bc5184fd40d9a47830e5b32c3b36d49 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 1 Oct 2019 11:10:53 +1000 Subject: clone3: switch to copy_struct_from_user() Switch clone3() syscall from it's own copying struct clone_args from userspace to the new dedicated copy_struct_from_user() helper. The change is very straightforward, and helps unify the syscall interface for struct-from-userspace syscalls. Additionally, explicitly define CLONE_ARGS_SIZE_VER0 to match the other users of the struct-extension pattern. Signed-off-by: Aleksa Sarai Reviewed-by: Kees Cook Reviewed-by: Christian Brauner [christian.brauner@ubuntu.com: improve commit message] Link: https://lore.kernel.org/r/20191001011055.19283-3-cyphar@cyphar.com Signed-off-by: Christian Brauner --- include/uapi/linux/sched.h | 2 ++ kernel/fork.c | 34 +++++++--------------------------- 2 files changed, 9 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index b3105ac1381a..0945805982b4 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -47,6 +47,8 @@ struct clone_args { __aligned_u64 tls; }; +#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */ + /* * Scheduling policies */ diff --git a/kernel/fork.c b/kernel/fork.c index f9572f416126..2ef529869c64 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2525,39 +2525,19 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, #ifdef __ARCH_WANT_SYS_CLONE3 noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, struct clone_args __user *uargs, - size_t size) + size_t usize) { + int err; struct clone_args args; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(usize > PAGE_SIZE)) return -E2BIG; - - if (unlikely(size < sizeof(struct clone_args))) + if (unlikely(usize < CLONE_ARGS_SIZE_VER0)) return -EINVAL; - if (unlikely(!access_ok(uargs, size))) - return -EFAULT; - - if (size > sizeof(struct clone_args)) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - - addr = (void __user *)uargs + sizeof(struct clone_args); - end = (void __user *)uargs + size; - - for (; addr < end; addr++) { - if (get_user(val, addr)) - return -EFAULT; - if (val) - return -E2BIG; - } - - size = sizeof(struct clone_args); - } - - if (copy_from_user(&args, uargs, size)) - return -EFAULT; + err = copy_struct_from_user(&args, sizeof(args), uargs, usize); + if (err) + return err; /* * Verify that higher 32bits of exit_signal are unset and that -- cgit v1.2.3 From 895b5c9f206eb7d25dc1360a8ccfc5958895eb89 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 29 Sep 2019 20:54:03 +0200 Subject: netfilter: drop bridge nf reset from nf_reset commit 174e23810cd31 ("sk_buff: drop all skb extensions on free and skb scrubbing") made napi recycle always drop skb extensions. The additional skb_ext_del() that is performed via nf_reset on napi skb recycle is not needed anymore. Most nf_reset() calls in the stack are there so queued skb won't block 'rmmod nf_conntrack' indefinitely. This removes the skb_ext_del from nf_reset, and renames it to a more fitting nf_reset_ct(). In a few selected places, add a call to skb_ext_reset to make sure that no active extensions remain. I am submitting this for "net", because we're still early in the release cycle. The patch applies to net-next too, but I think the rename causes needless divergence between those trees. Suggested-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- drivers/net/ppp/pptp.c | 4 ++-- drivers/net/tun.c | 2 +- drivers/net/virtio_net.c | 2 +- drivers/net/vrf.c | 8 ++++---- drivers/net/wireless/mac80211_hwsim.c | 4 ++-- drivers/staging/octeon/ethernet-tx.c | 6 ++---- include/linux/skbuff.h | 5 +---- net/batman-adv/soft-interface.c | 2 +- net/core/skbuff.c | 2 +- net/dccp/ipv4.c | 2 +- net/ipv4/ip_input.c | 2 +- net/ipv4/ipmr.c | 4 ++-- net/ipv4/netfilter/nf_dup_ipv4.c | 2 +- net/ipv4/raw.c | 2 +- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/udp.c | 4 ++-- net/ipv6/ip6_input.c | 2 +- net/ipv6/netfilter/nf_dup_ipv6.c | 2 +- net/ipv6/raw.c | 2 +- net/l2tp/l2tp_core.c | 2 +- net/l2tp/l2tp_eth.c | 2 +- net/l2tp/l2tp_ip.c | 2 +- net/l2tp/l2tp_ip6.c | 2 +- net/netfilter/ipvs/ip_vs_xmit.c | 2 +- net/openvswitch/vport-internal_dev.c | 2 +- net/packet/af_packet.c | 4 ++-- net/sctp/input.c | 2 +- net/xfrm/xfrm_input.c | 2 +- net/xfrm/xfrm_interface.c | 2 +- net/xfrm/xfrm_output.c | 2 +- net/xfrm/xfrm_policy.c | 2 +- 31 files changed, 40 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c index 734de7de03f7..e1fabb3e3246 100644 --- a/drivers/net/ppp/pptp.c +++ b/drivers/net/ppp/pptp.c @@ -238,7 +238,7 @@ static int pptp_xmit(struct ppp_channel *chan, struct sk_buff *skb) skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); - nf_reset(skb); + nf_reset_ct(skb); skb->ip_summed = CHECKSUM_NONE; ip_select_ident(net, skb, NULL); @@ -358,7 +358,7 @@ static int pptp_rcv(struct sk_buff *skb) po = lookup_chan(htons(header->call_id), iph->saddr); if (po) { skb_dst_drop(skb); - nf_reset(skb); + nf_reset_ct(skb); return sk_receive_skb(sk_pppox(po), skb, 0); } drop: diff --git a/drivers/net/tun.c b/drivers/net/tun.c index aab0be40d443..812dc3a65efb 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1104,7 +1104,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) */ skb_orphan(skb); - nf_reset(skb); + nf_reset_ct(skb); if (ptr_ring_produce(&tfile->tx_ring, skb)) goto drop; diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index ba98e0971b84..5a635f028bdc 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1585,7 +1585,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) /* Don't wait up for transmitted skbs to be freed. */ if (!use_napi) { skb_orphan(skb); - nf_reset(skb); + nf_reset_ct(skb); } /* If running out of space, stop queue to avoid getting packets that we diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index a4b38a980c3c..ee52bde058df 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -366,7 +366,7 @@ static int vrf_finish_output6(struct net *net, struct sock *sk, struct neighbour *neigh; int ret; - nf_reset(skb); + nf_reset_ct(skb); skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; @@ -459,7 +459,7 @@ static struct sk_buff *vrf_ip6_out_direct(struct net_device *vrf_dev, /* reset skb device */ if (likely(err == 1)) - nf_reset(skb); + nf_reset_ct(skb); else skb = NULL; @@ -560,7 +560,7 @@ static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *s bool is_v6gw = false; int ret = -EINVAL; - nf_reset(skb); + nf_reset_ct(skb); /* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { @@ -670,7 +670,7 @@ static struct sk_buff *vrf_ip_out_direct(struct net_device *vrf_dev, /* reset skb device */ if (likely(err == 1)) - nf_reset(skb); + nf_reset_ct(skb); else skb = NULL; diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 635956024e88..45c73a6f09a1 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -1261,8 +1261,8 @@ static bool mac80211_hwsim_tx_frame_no_nl(struct ieee80211_hw *hw, skb_orphan(skb); skb_dst_drop(skb); skb->mark = 0; - secpath_reset(skb); - nf_reset(skb); + skb_ext_reset(skb); + nf_reset_ct(skb); /* * Get absolute mactime here so all HWs RX at the "same time", and diff --git a/drivers/staging/octeon/ethernet-tx.c b/drivers/staging/octeon/ethernet-tx.c index c64728fc21f2..a62057555d1b 100644 --- a/drivers/staging/octeon/ethernet-tx.c +++ b/drivers/staging/octeon/ethernet-tx.c @@ -349,10 +349,8 @@ int cvm_oct_xmit(struct sk_buff *skb, struct net_device *dev) */ dst_release(skb_dst(skb)); skb_dst_set(skb, NULL); -#ifdef CONFIG_XFRM - secpath_reset(skb); -#endif - nf_reset(skb); + skb_ext_reset(skb); + nf_reset_ct(skb); #ifdef CONFIG_NET_SCHED skb->tc_index = 0; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e7d3b1a513ef..4351577b14d7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4160,15 +4160,12 @@ static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {} static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {} #endif /* CONFIG_SKB_EXTENSIONS */ -static inline void nf_reset(struct sk_buff *skb) +static inline void nf_reset_ct(struct sk_buff *skb) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) nf_conntrack_put(skb_nfct(skb)); skb->_nfct = 0; #endif -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - skb_ext_del(skb, SKB_EXT_BRIDGE_NF); -#endif } static inline void nf_reset_trace(struct sk_buff *skb) diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index a1146cb10919..9cbed6f5a85a 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -436,7 +436,7 @@ void batadv_interface_rx(struct net_device *soft_iface, /* clean the netfilter state now that the batman-adv header has been * removed */ - nf_reset(skb); + nf_reset_ct(skb); if (unlikely(!pskb_may_pull(skb, ETH_HLEN))) goto dropped; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 01d65206f4fb..529133611ea2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5120,7 +5120,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) skb->ignore_df = 0; skb_dst_drop(skb); skb_ext_reset(skb); - nf_reset(skb); + nf_reset_ct(skb); nf_reset_trace(skb); #ifdef CONFIG_NET_SWITCHDEV diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index b685bc82f8d0..d9b4200ed12d 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -871,7 +871,7 @@ lookup: if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; - nf_reset(skb); + nf_reset_ct(skb); return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4, refcounted); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 1e2392b7c64e..c59a78a267c3 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -199,7 +199,7 @@ resubmit: kfree_skb(skb); return; } - nf_reset(skb); + nf_reset_ct(skb); } ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv, skb); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 313470f6bb14..716d5472c022 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1794,7 +1794,7 @@ static void ip_encap(struct net *net, struct sk_buff *skb, ip_send_check(iph); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - nf_reset(skb); + nf_reset_ct(skb); } static inline int ipmr_forward_finish(struct net *net, struct sock *sk, @@ -2140,7 +2140,7 @@ int ip_mr_input(struct sk_buff *skb) mroute_sk = rcu_dereference(mrt->mroute_sk); if (mroute_sk) { - nf_reset(skb); + nf_reset_ct(skb); raw_rcv(mroute_sk, skb); return 0; } diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c index af3fbf76dbd3..6cc5743c553a 100644 --- a/net/ipv4/netfilter/nf_dup_ipv4.c +++ b/net/ipv4/netfilter/nf_dup_ipv4.c @@ -65,7 +65,7 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum, #if IS_ENABLED(CONFIG_NF_CONNTRACK) /* Avoid counting cloned packets towards the original connection. */ - nf_reset(skb); + nf_reset_ct(skb); nf_ct_set(skb, NULL, IP_CT_UNTRACKED); #endif /* diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 80da5a66d5d7..3183413ebc6c 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -332,7 +332,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb) kfree_skb(skb); return NET_RX_DROP; } - nf_reset(skb); + nf_reset_ct(skb); skb_push(skb, skb->data - skb_network_header(skb)); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2ee45e3755e9..bf124b1742df 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1916,7 +1916,7 @@ process: if (tcp_v4_inbound_md5_hash(sk, skb)) goto discard_and_relse; - nf_reset(skb); + nf_reset_ct(skb); if (tcp_filter(sk, skb)) goto discard_and_relse; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index cf755156a684..e8443cc5c1ab 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1969,7 +1969,7 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) */ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto drop; - nf_reset(skb); + nf_reset_ct(skb); if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) { int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); @@ -2298,7 +2298,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; - nf_reset(skb); + nf_reset_ct(skb); /* No socket. Drop packet silently, if checksum is wrong */ if (udp_lib_checksum_complete(skb)) diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index d432d0011c16..7e5df23cbe7b 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -371,7 +371,7 @@ resubmit_final: /* Free reference early: we don't need it any more, and it may hold ip_conntrack module loaded indefinitely. */ - nf_reset(skb); + nf_reset_ct(skb); skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c index e6c9da9866b1..a0a2de30be3e 100644 --- a/net/ipv6/netfilter/nf_dup_ipv6.c +++ b/net/ipv6/netfilter/nf_dup_ipv6.c @@ -54,7 +54,7 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum, return; #if IS_ENABLED(CONFIG_NF_CONNTRACK) - nf_reset(skb); + nf_reset_ct(skb); nf_ct_set(skb, NULL, IP_CT_UNTRACKED); #endif if (hooknum == NF_INET_PRE_ROUTING || diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 6e1888ee4036..a77f6b7d3a7c 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -215,7 +215,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) /* Not releasing hash table! */ if (clone) { - nf_reset(clone); + nf_reset_ct(clone); rawv6_rcv(sk, clone); } } diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 105e5a7092e7..f82ea12bac37 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1078,7 +1078,7 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); - nf_reset(skb); + nf_reset_ct(skb); bh_lock_sock(sk); if (sock_owned_by_user(sk)) { diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index bd3f39349d40..fd5ac2788e45 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -151,7 +151,7 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, skb->ip_summed = CHECKSUM_NONE; skb_dst_drop(skb); - nf_reset(skb); + nf_reset_ct(skb); rcu_read_lock(); dev = rcu_dereference(spriv->dev); diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 622833317dcb..0d7c887a2b75 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -193,7 +193,7 @@ pass_up: if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_put; - nf_reset(skb); + nf_reset_ct(skb); return sk_receive_skb(sk, skb, 1); diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 687e23a8b326..802f19aba7e3 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -206,7 +206,7 @@ pass_up: if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_put; - nf_reset(skb); + nf_reset_ct(skb); return sk_receive_skb(sk, skb, 1); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 9c464d24beec..888d3068a492 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -613,7 +613,7 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb, if (unlikely(cp->flags & IP_VS_CONN_F_NFCT)) ret = ip_vs_confirm_conntrack(skb); if (ret == NF_ACCEPT) { - nf_reset(skb); + nf_reset_ct(skb); skb_forward_csum(skb); } return ret; diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index d2437b5b2f6a..21c90d3a7ebf 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -237,7 +237,7 @@ static netdev_tx_t internal_dev_recv(struct sk_buff *skb) } skb_dst_drop(skb); - nf_reset(skb); + nf_reset_ct(skb); secpath_reset(skb); skb->pkt_type = PACKET_HOST; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index e2742b006d25..82a50e850245 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1821,7 +1821,7 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, skb_dst_drop(skb); /* drop conntrack reference */ - nf_reset(skb); + nf_reset_ct(skb); spkt = &PACKET_SKB_CB(skb)->sa.pkt; @@ -2121,7 +2121,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, skb_dst_drop(skb); /* drop conntrack reference */ - nf_reset(skb); + nf_reset_ct(skb); spin_lock(&sk->sk_receive_queue.lock); po->stats.stats1.tp_packets++; diff --git a/net/sctp/input.c b/net/sctp/input.c index 1008cdc44dd6..5a070fb5b278 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -201,7 +201,7 @@ int sctp_rcv(struct sk_buff *skb) if (!xfrm_policy_check(sk, XFRM_POLICY_IN, skb, family)) goto discard_release; - nf_reset(skb); + nf_reset_ct(skb); if (sk_filter(sk, skb)) goto discard_release; diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 6088bc2dc11e..9b599ed66d97 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -706,7 +706,7 @@ resume: if (err) goto drop; - nf_reset(skb); + nf_reset_ct(skb); if (decaps) { sp = skb_sec_path(skb); diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 2ab4859df55a..0f5131bc3342 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -185,7 +185,7 @@ static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) skb->skb_iif = 0; skb->ignore_df = 0; skb_dst_drop(skb); - nf_reset(skb); + nf_reset_ct(skb); nf_reset_trace(skb); if (!xnet) diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 9499b35feb92..b1db55b50ba1 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -502,7 +502,7 @@ int xfrm_output_resume(struct sk_buff *skb, int err) struct net *net = xs_net(skb_dst(skb)->xfrm); while (likely((err = xfrm_output_one(skb, err)) == 0)) { - nf_reset(skb); + nf_reset_ct(skb); err = skb_dst(skb)->ops->local_out(net, skb->sk, skb); if (unlikely(err != 1)) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 21e939235b39..f2d1e573ea55 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2808,7 +2808,7 @@ static void xfrm_policy_queue_process(struct timer_list *t) continue; } - nf_reset(skb); + nf_reset_ct(skb); skb_dst_drop(skb); skb_dst_set(skb, dst); -- cgit v1.2.3 From 30945d31e5761436d9eba6b8cff468a5f7c9c266 Mon Sep 17 00:00:00 2001 From: Nuno Sá Date: Tue, 24 Sep 2019 14:49:43 +0200 Subject: hwmon: Fix HWMON_P_MIN_ALARM mask MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both HWMON_P_MIN_ALARM and HWMON_P_MAX_ALARM were using BIT(hwmon_power_max_alarm). Fixes: aa7f29b07c870 ("hwmon: Add support for power min, lcrit, min_alarm and lcrit_alarm") CC: Signed-off-by: Nuno Sá Link: https://lore.kernel.org/r/20190924124945.491326-2-nuno.sa@analog.com Signed-off-by: Guenter Roeck --- include/linux/hwmon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h index 04c36b7a61dd..72579168189d 100644 --- a/include/linux/hwmon.h +++ b/include/linux/hwmon.h @@ -235,7 +235,7 @@ enum hwmon_power_attributes { #define HWMON_P_LABEL BIT(hwmon_power_label) #define HWMON_P_ALARM BIT(hwmon_power_alarm) #define HWMON_P_CAP_ALARM BIT(hwmon_power_cap_alarm) -#define HWMON_P_MIN_ALARM BIT(hwmon_power_max_alarm) +#define HWMON_P_MIN_ALARM BIT(hwmon_power_min_alarm) #define HWMON_P_MAX_ALARM BIT(hwmon_power_max_alarm) #define HWMON_P_LCRIT_ALARM BIT(hwmon_power_lcrit_alarm) #define HWMON_P_CRIT_ALARM BIT(hwmon_power_crit_alarm) -- cgit v1.2.3 From 09515706857a7d5a2ffb5ce6a44c0bc7859a745b Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 1 Oct 2019 10:25:34 +0200 Subject: xen/efi: have a common runtime setup function Today the EFI runtime functions are setup in architecture specific code (x86 and arm), with the functions themselves living in drivers/xen as they are not architecture dependent. As the setup is exactly the same for arm and x86 move the setup to drivers/xen, too. This at once removes the need to make the single functions global visible. Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich [boris: "Dropped EXPORT_SYMBOL_GPL(xen_efi_runtime_setup)"] Signed-off-by: Boris Ostrovsky --- arch/arm/include/asm/xen/xen-ops.h | 6 --- arch/arm/xen/Makefile | 1 - arch/arm/xen/efi.c | 30 ------------- arch/arm/xen/enlighten.c | 1 - arch/arm64/include/asm/xen/xen-ops.h | 7 --- arch/arm64/xen/Makefile | 1 - arch/x86/xen/efi.c | 16 +------ drivers/xen/efi.c | 84 ++++++++++++++++++++---------------- include/xen/xen-ops.h | 25 +---------- 9 files changed, 49 insertions(+), 122 deletions(-) delete mode 100644 arch/arm/include/asm/xen/xen-ops.h delete mode 100644 arch/arm/xen/efi.c delete mode 100644 arch/arm64/include/asm/xen/xen-ops.h (limited to 'include') diff --git a/arch/arm/include/asm/xen/xen-ops.h b/arch/arm/include/asm/xen/xen-ops.h deleted file mode 100644 index ec154e719b11..000000000000 --- a/arch/arm/include/asm/xen/xen-ops.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _ASM_XEN_OPS_H -#define _ASM_XEN_OPS_H - -void xen_efi_runtime_setup(void); - -#endif /* _ASM_XEN_OPS_H */ diff --git a/arch/arm/xen/Makefile b/arch/arm/xen/Makefile index 7ed28982c4c3..c32d04713ba0 100644 --- a/arch/arm/xen/Makefile +++ b/arch/arm/xen/Makefile @@ -1,3 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y := enlighten.o hypercall.o grant-table.o p2m.o mm.o -obj-$(CONFIG_XEN_EFI) += efi.o diff --git a/arch/arm/xen/efi.c b/arch/arm/xen/efi.c deleted file mode 100644 index cb2aaf98e243..000000000000 --- a/arch/arm/xen/efi.c +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (c) 2015, Linaro Limited, Shannon Zhao - */ - -#include -#include -#include - -/* Set XEN EFI runtime services function pointers. Other fields of struct efi, - * e.g. efi.systab, will be set like normal EFI. - */ -void __init xen_efi_runtime_setup(void) -{ - efi.get_time = xen_efi_get_time; - efi.set_time = xen_efi_set_time; - efi.get_wakeup_time = xen_efi_get_wakeup_time; - efi.set_wakeup_time = xen_efi_set_wakeup_time; - efi.get_variable = xen_efi_get_variable; - efi.get_next_variable = xen_efi_get_next_variable; - efi.set_variable = xen_efi_set_variable; - efi.set_variable_nonblocking = xen_efi_set_variable; - efi.query_variable_info = xen_efi_query_variable_info; - efi.query_variable_info_nonblocking = xen_efi_query_variable_info; - efi.update_capsule = xen_efi_update_capsule; - efi.query_capsule_caps = xen_efi_query_capsule_caps; - efi.get_next_high_mono_count = xen_efi_get_next_high_mono_count; - efi.reset_system = xen_efi_reset_system; -} -EXPORT_SYMBOL_GPL(xen_efi_runtime_setup); diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c index 522c97d43ef8..dd6804a64f1a 100644 --- a/arch/arm/xen/enlighten.c +++ b/arch/arm/xen/enlighten.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/arm64/include/asm/xen/xen-ops.h b/arch/arm64/include/asm/xen/xen-ops.h deleted file mode 100644 index e6e784051932..000000000000 --- a/arch/arm64/include/asm/xen/xen-ops.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_XEN_OPS_H -#define _ASM_XEN_OPS_H - -void xen_efi_runtime_setup(void); - -#endif /* _ASM_XEN_OPS_H */ diff --git a/arch/arm64/xen/Makefile b/arch/arm64/xen/Makefile index a4fc65f3928d..b66215e8658e 100644 --- a/arch/arm64/xen/Makefile +++ b/arch/arm64/xen/Makefile @@ -1,4 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only xen-arm-y += $(addprefix ../../arm/xen/, enlighten.o grant-table.o p2m.o mm.o) obj-y := xen-arm.o hypercall.o -obj-$(CONFIG_XEN_EFI) += $(addprefix ../../arm/xen/, efi.o) diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c index 7e3eb70f411a..a04551ee5568 100644 --- a/arch/x86/xen/efi.c +++ b/arch/x86/xen/efi.c @@ -57,21 +57,7 @@ static efi_system_table_t __init *xen_efi_probe(void) return NULL; /* Here we know that Xen runs on EFI platform. */ - - efi.get_time = xen_efi_get_time; - efi.set_time = xen_efi_set_time; - efi.get_wakeup_time = xen_efi_get_wakeup_time; - efi.set_wakeup_time = xen_efi_set_wakeup_time; - efi.get_variable = xen_efi_get_variable; - efi.get_next_variable = xen_efi_get_next_variable; - efi.set_variable = xen_efi_set_variable; - efi.set_variable_nonblocking = xen_efi_set_variable; - efi.query_variable_info = xen_efi_query_variable_info; - efi.query_variable_info_nonblocking = xen_efi_query_variable_info; - efi.update_capsule = xen_efi_update_capsule; - efi.query_capsule_caps = xen_efi_query_capsule_caps; - efi.get_next_high_mono_count = xen_efi_get_next_high_mono_count; - efi.reset_system = xen_efi_reset_system; + xen_efi_runtime_setup(); efi_systab_xen.tables = info->cfg.addr; efi_systab_xen.nr_tables = info->cfg.nent; diff --git a/drivers/xen/efi.c b/drivers/xen/efi.c index 89d60f8e3c18..d1ff2186ebb4 100644 --- a/drivers/xen/efi.c +++ b/drivers/xen/efi.c @@ -40,7 +40,7 @@ #define efi_data(op) (op.u.efi_runtime_call) -efi_status_t xen_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) +static efi_status_t xen_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) { struct xen_platform_op op = INIT_EFI_OP(get_time); @@ -61,9 +61,8 @@ efi_status_t xen_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) return efi_data(op).status; } -EXPORT_SYMBOL_GPL(xen_efi_get_time); -efi_status_t xen_efi_set_time(efi_time_t *tm) +static efi_status_t xen_efi_set_time(efi_time_t *tm) { struct xen_platform_op op = INIT_EFI_OP(set_time); @@ -75,10 +74,10 @@ efi_status_t xen_efi_set_time(efi_time_t *tm) return efi_data(op).status; } -EXPORT_SYMBOL_GPL(xen_efi_set_time); -efi_status_t xen_efi_get_wakeup_time(efi_bool_t *enabled, efi_bool_t *pending, - efi_time_t *tm) +static efi_status_t xen_efi_get_wakeup_time(efi_bool_t *enabled, + efi_bool_t *pending, + efi_time_t *tm) { struct xen_platform_op op = INIT_EFI_OP(get_wakeup_time); @@ -98,9 +97,8 @@ efi_status_t xen_efi_get_wakeup_time(efi_bool_t *enabled, efi_bool_t *pending, return efi_data(op).status; } -EXPORT_SYMBOL_GPL(xen_efi_get_wakeup_time); -efi_status_t xen_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm) +static efi_status_t xen_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm) { struct xen_platform_op op = INIT_EFI_OP(set_wakeup_time); @@ -117,11 +115,10 @@ efi_status_t xen_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm) return efi_data(op).status; } -EXPORT_SYMBOL_GPL(xen_efi_set_wakeup_time); -efi_status_t xen_efi_get_variable(efi_char16_t *name, efi_guid_t *vendor, - u32 *attr, unsigned long *data_size, - void *data) +static efi_status_t xen_efi_get_variable(efi_char16_t *name, efi_guid_t *vendor, + u32 *attr, unsigned long *data_size, + void *data) { struct xen_platform_op op = INIT_EFI_OP(get_variable); @@ -141,11 +138,10 @@ efi_status_t xen_efi_get_variable(efi_char16_t *name, efi_guid_t *vendor, return efi_data(op).status; } -EXPORT_SYMBOL_GPL(xen_efi_get_variable); -efi_status_t xen_efi_get_next_variable(unsigned long *name_size, - efi_char16_t *name, - efi_guid_t *vendor) +static efi_status_t xen_efi_get_next_variable(unsigned long *name_size, + efi_char16_t *name, + efi_guid_t *vendor) { struct xen_platform_op op = INIT_EFI_OP(get_next_variable_name); @@ -165,11 +161,10 @@ efi_status_t xen_efi_get_next_variable(unsigned long *name_size, return efi_data(op).status; } -EXPORT_SYMBOL_GPL(xen_efi_get_next_variable); -efi_status_t xen_efi_set_variable(efi_char16_t *name, efi_guid_t *vendor, - u32 attr, unsigned long data_size, - void *data) +static efi_status_t xen_efi_set_variable(efi_char16_t *name, efi_guid_t *vendor, + u32 attr, unsigned long data_size, + void *data) { struct xen_platform_op op = INIT_EFI_OP(set_variable); @@ -186,11 +181,10 @@ efi_status_t xen_efi_set_variable(efi_char16_t *name, efi_guid_t *vendor, return efi_data(op).status; } -EXPORT_SYMBOL_GPL(xen_efi_set_variable); -efi_status_t xen_efi_query_variable_info(u32 attr, u64 *storage_space, - u64 *remaining_space, - u64 *max_variable_size) +static efi_status_t xen_efi_query_variable_info(u32 attr, u64 *storage_space, + u64 *remaining_space, + u64 *max_variable_size) { struct xen_platform_op op = INIT_EFI_OP(query_variable_info); @@ -208,9 +202,8 @@ efi_status_t xen_efi_query_variable_info(u32 attr, u64 *storage_space, return efi_data(op).status; } -EXPORT_SYMBOL_GPL(xen_efi_query_variable_info); -efi_status_t xen_efi_get_next_high_mono_count(u32 *count) +static efi_status_t xen_efi_get_next_high_mono_count(u32 *count) { struct xen_platform_op op = INIT_EFI_OP(get_next_high_monotonic_count); @@ -221,10 +214,9 @@ efi_status_t xen_efi_get_next_high_mono_count(u32 *count) return efi_data(op).status; } -EXPORT_SYMBOL_GPL(xen_efi_get_next_high_mono_count); -efi_status_t xen_efi_update_capsule(efi_capsule_header_t **capsules, - unsigned long count, unsigned long sg_list) +static efi_status_t xen_efi_update_capsule(efi_capsule_header_t **capsules, + unsigned long count, unsigned long sg_list) { struct xen_platform_op op = INIT_EFI_OP(update_capsule); @@ -241,11 +233,9 @@ efi_status_t xen_efi_update_capsule(efi_capsule_header_t **capsules, return efi_data(op).status; } -EXPORT_SYMBOL_GPL(xen_efi_update_capsule); -efi_status_t xen_efi_query_capsule_caps(efi_capsule_header_t **capsules, - unsigned long count, u64 *max_size, - int *reset_type) +static efi_status_t xen_efi_query_capsule_caps(efi_capsule_header_t **capsules, + unsigned long count, u64 *max_size, int *reset_type) { struct xen_platform_op op = INIT_EFI_OP(query_capsule_capabilities); @@ -264,10 +254,9 @@ efi_status_t xen_efi_query_capsule_caps(efi_capsule_header_t **capsules, return efi_data(op).status; } -EXPORT_SYMBOL_GPL(xen_efi_query_capsule_caps); -void xen_efi_reset_system(int reset_type, efi_status_t status, - unsigned long data_size, efi_char16_t *data) +static void xen_efi_reset_system(int reset_type, efi_status_t status, + unsigned long data_size, efi_char16_t *data) { switch (reset_type) { case EFI_RESET_COLD: @@ -281,4 +270,25 @@ void xen_efi_reset_system(int reset_type, efi_status_t status, BUG(); } } -EXPORT_SYMBOL_GPL(xen_efi_reset_system); + +/* + * Set XEN EFI runtime services function pointers. Other fields of struct efi, + * e.g. efi.systab, will be set like normal EFI. + */ +void __init xen_efi_runtime_setup(void) +{ + efi.get_time = xen_efi_get_time; + efi.set_time = xen_efi_set_time; + efi.get_wakeup_time = xen_efi_get_wakeup_time; + efi.set_wakeup_time = xen_efi_set_wakeup_time; + efi.get_variable = xen_efi_get_variable; + efi.get_next_variable = xen_efi_get_next_variable; + efi.set_variable = xen_efi_set_variable; + efi.set_variable_nonblocking = xen_efi_set_variable; + efi.query_variable_info = xen_efi_query_variable_info; + efi.query_variable_info_nonblocking = xen_efi_query_variable_info; + efi.update_capsule = xen_efi_update_capsule; + efi.query_capsule_caps = xen_efi_query_capsule_caps; + efi.get_next_high_mono_count = xen_efi_get_next_high_mono_count; + efi.reset_system = xen_efi_reset_system; +} diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index 98b30c1613b2..d89969aa9942 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -212,30 +212,7 @@ int xen_xlate_map_ballooned_pages(xen_pfn_t **pfns, void **vaddr, bool xen_running_on_version_or_later(unsigned int major, unsigned int minor); -efi_status_t xen_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc); -efi_status_t xen_efi_set_time(efi_time_t *tm); -efi_status_t xen_efi_get_wakeup_time(efi_bool_t *enabled, efi_bool_t *pending, - efi_time_t *tm); -efi_status_t xen_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm); -efi_status_t xen_efi_get_variable(efi_char16_t *name, efi_guid_t *vendor, - u32 *attr, unsigned long *data_size, - void *data); -efi_status_t xen_efi_get_next_variable(unsigned long *name_size, - efi_char16_t *name, efi_guid_t *vendor); -efi_status_t xen_efi_set_variable(efi_char16_t *name, efi_guid_t *vendor, - u32 attr, unsigned long data_size, - void *data); -efi_status_t xen_efi_query_variable_info(u32 attr, u64 *storage_space, - u64 *remaining_space, - u64 *max_variable_size); -efi_status_t xen_efi_get_next_high_mono_count(u32 *count); -efi_status_t xen_efi_update_capsule(efi_capsule_header_t **capsules, - unsigned long count, unsigned long sg_list); -efi_status_t xen_efi_query_capsule_caps(efi_capsule_header_t **capsules, - unsigned long count, u64 *max_size, - int *reset_type); -void xen_efi_reset_system(int reset_type, efi_status_t status, - unsigned long data_size, efi_char16_t *data); +void xen_efi_runtime_setup(void); #ifdef CONFIG_PREEMPT -- cgit v1.2.3 From 3e8db7e56082156a37b71d7334860c10fcea8025 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 1 Oct 2019 21:58:19 +0300 Subject: net: dsa: sja1105: Fix sleeping while atomic in .port_hwtstamp_set Currently this stack trace can be seen with CONFIG_DEBUG_ATOMIC_SLEEP=y: [ 41.568348] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:909 [ 41.576757] in_atomic(): 1, irqs_disabled(): 0, pid: 208, name: ptp4l [ 41.583212] INFO: lockdep is turned off. [ 41.587123] CPU: 1 PID: 208 Comm: ptp4l Not tainted 5.3.0-rc6-01445-ge950f2d4bc7f-dirty #1827 [ 41.599873] [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [ 41.607584] [] (show_stack) from [] (dump_stack+0xd4/0x100) [ 41.614863] [] (dump_stack) from [] (___might_sleep+0x1c8/0x2b4) [ 41.622574] [] (___might_sleep) from [] (__mutex_lock+0x48/0xab8) [ 41.630368] [] (__mutex_lock) from [] (mutex_lock_nested+0x1c/0x24) [ 41.638340] [] (mutex_lock_nested) from [] (sja1105_static_config_reload+0x30/0x27c) [ 41.647779] [] (sja1105_static_config_reload) from [] (sja1105_hwtstamp_set+0x108/0x1cc) [ 41.657562] [] (sja1105_hwtstamp_set) from [] (dev_ifsioc+0x18c/0x330) [ 41.665788] [] (dev_ifsioc) from [] (dev_ioctl+0x320/0x6e8) [ 41.673064] [] (dev_ioctl) from [] (sock_ioctl+0x334/0x5e8) [ 41.680340] [] (sock_ioctl) from [] (do_vfs_ioctl+0xb0/0xa10) [ 41.687789] [] (do_vfs_ioctl) from [] (ksys_ioctl+0x34/0x58) [ 41.695151] [] (ksys_ioctl) from [] (ret_fast_syscall+0x0/0x28) [ 41.702768] Exception stack(0xe8495fa8 to 0xe8495ff0) [ 41.707796] 5fa0: beff4a8c 00000001 00000011 000089b0 beff4a8c beff4a80 [ 41.715933] 5fc0: beff4a8c 00000001 0000000c 00000036 b6fa98c8 004e19c1 00000001 00000000 [ 41.724069] 5fe0: 004dcedc beff4a6c 004c0738 b6e7af4c [ 41.729860] BUG: scheduling while atomic: ptp4l/208/0x00000002 [ 41.735682] INFO: lockdep is turned off. Enabling RX timestamping will logically disturb the fastpath (processing of meta frames). Replace bool hwts_rx_en with a bit that is checked atomically from the fastpath and temporarily unset from the sleepable context during a change of the RX timestamping process (a destructive operation anyways, requires switch reset). If found unset, the fastpath (net/dsa/tag_sja1105.c) will just drop any received meta frame and not take the meta_lock at all. Fixes: a602afd200f5 ("net: dsa: sja1105: Expose PTP timestamping ioctls to userspace") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_main.c | 19 +++++++++++-------- include/linux/dsa/sja1105.h | 4 +++- net/dsa/tag_sja1105.c | 12 +++++++++++- 3 files changed, 25 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index ea2e7f4f96d0..7687ddcae159 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -1897,7 +1897,9 @@ static int sja1105_set_ageing_time(struct dsa_switch *ds, return sja1105_static_config_reload(priv); } -/* Caller must hold priv->tagger_data.meta_lock */ +/* Must be called only with priv->tagger_data.state bit + * SJA1105_HWTS_RX_EN cleared + */ static int sja1105_change_rxtstamping(struct sja1105_private *priv, bool on) { @@ -1954,16 +1956,17 @@ static int sja1105_hwtstamp_set(struct dsa_switch *ds, int port, break; } - if (rx_on != priv->tagger_data.hwts_rx_en) { - spin_lock(&priv->tagger_data.meta_lock); + if (rx_on != test_bit(SJA1105_HWTS_RX_EN, &priv->tagger_data.state)) { + clear_bit(SJA1105_HWTS_RX_EN, &priv->tagger_data.state); + rc = sja1105_change_rxtstamping(priv, rx_on); - spin_unlock(&priv->tagger_data.meta_lock); if (rc < 0) { dev_err(ds->dev, "Failed to change RX timestamping: %d\n", rc); - return -EFAULT; + return rc; } - priv->tagger_data.hwts_rx_en = rx_on; + if (rx_on) + set_bit(SJA1105_HWTS_RX_EN, &priv->tagger_data.state); } if (copy_to_user(ifr->ifr_data, &config, sizeof(config))) @@ -1982,7 +1985,7 @@ static int sja1105_hwtstamp_get(struct dsa_switch *ds, int port, config.tx_type = HWTSTAMP_TX_ON; else config.tx_type = HWTSTAMP_TX_OFF; - if (priv->tagger_data.hwts_rx_en) + if (test_bit(SJA1105_HWTS_RX_EN, &priv->tagger_data.state)) config.rx_filter = HWTSTAMP_FILTER_PTP_V2_L2_EVENT; else config.rx_filter = HWTSTAMP_FILTER_NONE; @@ -2031,7 +2034,7 @@ static bool sja1105_port_rxtstamp(struct dsa_switch *ds, int port, struct sja1105_private *priv = ds->priv; struct sja1105_tagger_data *data = &priv->tagger_data; - if (!data->hwts_rx_en) + if (!test_bit(SJA1105_HWTS_RX_EN, &data->state)) return false; /* We need to read the full PTP clock to reconstruct the Rx diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index 79435cfc20eb..897e799dbcb9 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -31,6 +31,8 @@ #define SJA1105_META_SMAC 0x222222222222ull #define SJA1105_META_DMAC 0x0180C200000Eull +#define SJA1105_HWTS_RX_EN 0 + /* Global tagger data: each struct sja1105_port has a reference to * the structure defined in struct sja1105_private. */ @@ -42,7 +44,7 @@ struct sja1105_tagger_data { * from taggers running on multiple ports on SMP systems */ spinlock_t meta_lock; - bool hwts_rx_en; + unsigned long state; }; struct sja1105_skb_cb { diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 9c9aff3e52cf..63ef2a14c934 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -156,7 +156,11 @@ static struct sk_buff /* Step 1: A timestampable frame was received. * Buffer it until we get its meta frame. */ - if (is_link_local && sp->data->hwts_rx_en) { + if (is_link_local) { + if (!test_bit(SJA1105_HWTS_RX_EN, &sp->data->state)) + /* Do normal processing. */ + return skb; + spin_lock(&sp->data->meta_lock); /* Was this a link-local frame instead of the meta * that we were expecting? @@ -187,6 +191,12 @@ static struct sk_buff } else if (is_meta) { struct sk_buff *stampable_skb; + /* Drop the meta frame if we're not in the right state + * to process it. + */ + if (!test_bit(SJA1105_HWTS_RX_EN, &sp->data->state)) + return NULL; + spin_lock(&sp->data->meta_lock); stampable_skb = sp->data->stampable_skb; -- cgit v1.2.3 From 815fb4c9d7da862a47c9d2a9765a4759826c5783 Mon Sep 17 00:00:00 2001 From: Marek Olšák Date: Tue, 24 Sep 2019 17:53:25 -0400 Subject: drm/amdgpu: return tcc_disabled_mask to userspace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit UMDs need this for correct programming of harvested chips. Signed-off-by: Marek Olšák Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 2 ++ drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 12 ++++++++++++ include/uapi/drm/amdgpu_drm.h | 2 ++ 5 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 264677ab248a..6f8aaf655a9f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -81,9 +81,10 @@ * - 3.32.0 - Add syncobj timeline support to AMDGPU_CS. * - 3.33.0 - Fixes for GDS ENOMEM failures in AMDGPU_CS. * - 3.34.0 - Non-DC can flip correctly between buffers with different pitches + * - 3.35.0 - Add drm_amdgpu_info_device::tcc_disabled_mask */ #define KMS_DRIVER_MAJOR 3 -#define KMS_DRIVER_MINOR 34 +#define KMS_DRIVER_MINOR 35 #define KMS_DRIVER_PATCHLEVEL 0 #define AMDGPU_MAX_TIMEOUT_PARAM_LENTH 256 diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index 554a59b3c4a6..6ee4021910e2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -165,6 +165,7 @@ struct amdgpu_gfx_config { uint32_t num_sc_per_sh; uint32_t num_packer_per_sc; uint32_t pa_sc_tile_steering_override; + uint64_t tcc_disabled_mask; }; struct amdgpu_cu_info { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index f6147528be64..f2c097983f48 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -787,6 +787,8 @@ static int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file dev_info.pa_sc_tile_steering_override = adev->gfx.config.pa_sc_tile_steering_override; + dev_info.tcc_disabled_mask = adev->gfx.config.tcc_disabled_mask; + return copy_to_user(out, &dev_info, min((size_t)size, sizeof(dev_info))) ? -EFAULT : 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index 638c821611ab..957811b73672 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -1691,6 +1691,17 @@ static void gfx_v10_0_tcp_harvest(struct amdgpu_device *adev) } } +static void gfx_v10_0_get_tcc_info(struct amdgpu_device *adev) +{ + /* TCCs are global (not instanced). */ + uint32_t tcc_disable = RREG32_SOC15(GC, 0, mmCGTS_TCC_DISABLE) | + RREG32_SOC15(GC, 0, mmCGTS_USER_TCC_DISABLE); + + adev->gfx.config.tcc_disabled_mask = + REG_GET_FIELD(tcc_disable, CGTS_TCC_DISABLE, TCC_DISABLE) | + (REG_GET_FIELD(tcc_disable, CGTS_TCC_DISABLE, HI_TCC_DISABLE) << 16); +} + static void gfx_v10_0_constants_init(struct amdgpu_device *adev) { u32 tmp; @@ -1702,6 +1713,7 @@ static void gfx_v10_0_constants_init(struct amdgpu_device *adev) gfx_v10_0_setup_rb(adev); gfx_v10_0_get_cu_info(adev, &adev->gfx.cu_info); + gfx_v10_0_get_tcc_info(adev); adev->gfx.config.pa_sc_tile_steering_override = gfx_v10_0_init_pa_sc_tile_steering_override(adev); diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index c99b4f2482c6..4fe35d600ab8 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -1003,6 +1003,8 @@ struct drm_amdgpu_info_device { __u64 high_va_max; /* gfx10 pa_sc_tile_steering_override */ __u32 pa_sc_tile_steering_override; + /* disabled TCCs */ + __u64 tcc_disabled_mask; }; struct drm_amdgpu_info_hw_ip { -- cgit v1.2.3 From 3a4b46c3bc73240b31cd324b3551fc4231d9f1d5 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 20 Jul 2019 21:05:25 +0900 Subject: block: pg: add header include guard Add a header include guard just in case. Signed-off-by: Masahiro Yamada Signed-off-by: Jens Axboe --- include/uapi/linux/pg.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/pg.h b/include/uapi/linux/pg.h index 364c350e85cd..62b6f69bd9fb 100644 --- a/include/uapi/linux/pg.h +++ b/include/uapi/linux/pg.h @@ -35,6 +35,9 @@ */ +#ifndef _UAPI_LINUX_PG_H +#define _UAPI_LINUX_PG_H + #define PG_MAGIC 'P' #define PG_RESET 'Z' #define PG_COMMAND 'C' @@ -61,4 +64,4 @@ struct pg_read_hdr { }; -/* end of pg.h */ +#endif /* _UAPI_LINUX_PG_H */ -- cgit v1.2.3 From 78f6face5af344f12f4bd48b32faa6f499a06f36 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 27 Sep 2019 17:29:05 +0200 Subject: sched: add kernel-doc for struct clone_args Add kernel-doc for struct clone_args for the clone3() syscall. Link: https://lore.kernel.org/r/20191001114701.24661-3-christian.brauner@ubuntu.com Signed-off-by: Christian Brauner --- include/uapi/linux/sched.h | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 851ff1feadd5..3d8f03bfd428 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -34,8 +34,30 @@ #define CLONE_IO 0x80000000 /* Clone io context */ #ifndef __ASSEMBLY__ -/* - * Arguments for the clone3 syscall +/** + * struct clone_args - arguments for the clone3 syscall + * @flags: Flags for the new process as listed above. + * All flags are valid except for CSIGNAL and + * CLONE_DETACHED. + * @pidfd: If CLONE_PIDFD is set, a pidfd will be + * returned in this argument. + * @child_tid: If CLONE_CHILD_SETTID is set, the TID of the + * child process will be returned in the child's + * memory. + * @parent_tid: If CLONE_PARENT_SETTID is set, the TID of + * the child process will be returned in the + * parent's memory. + * @exit_signal: The exit_signal the parent process will be + * sent when the child exits. + * @stack: Specify the location of the stack for the + * child process. + * @stack_size: The size of the stack for the child process. + * @tls: If CLONE_SETTLS is set, the tls descriptor + * is set to tls. + * + * The structure is versioned by size and thus extensible. + * New struct members must go at the end of the struct and + * must be properly 64bit aligned. */ struct clone_args { __aligned_u64 flags; -- cgit v1.2.3 From 47934ef7f1883209120781b59d78eaf8b83e2fb7 Mon Sep 17 00:00:00 2001 From: Stefan-gabriel Mirea Date: Fri, 4 Oct 2019 13:51:13 +0000 Subject: tty: serial: Fix PORT_LINFLEXUART definition The port type macros should have different values for different devices. Currently, PORT_LINFLEXUART conflicts with PORT_SUNIX. Fixes: 09864c1cdf5c ("tty: serial: Add linflexuart driver for S32V234") Signed-off-by: Stefan-Gabriel Mirea Link: https://lore.kernel.org/r/20191004135058.18007-1-stefan-gabriel.mirea@nxp.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/serial_core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h index 0f4f87a6fd54..e7fe550b6038 100644 --- a/include/uapi/linux/serial_core.h +++ b/include/uapi/linux/serial_core.h @@ -291,6 +291,6 @@ #define PORT_SUNIX 121 /* Freescale Linflex UART */ -#define PORT_LINFLEXUART 121 +#define PORT_LINFLEXUART 122 #endif /* _UAPILINUX_SERIAL_CORE_H */ -- cgit v1.2.3 From db9b2e0af605e7c994784527abfd9276cabd718a Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 3 Oct 2019 17:44:44 +0100 Subject: rxrpc: Fix rxrpc_recvmsg tracepoint Fix the rxrpc_recvmsg tracepoint to handle being called with a NULL call parameter. Fixes: a25e21f0bcd2 ("rxrpc, afs: Use debug_ids rather than pointers in traces") Signed-off-by: David Howells Signed-off-by: David S. Miller --- include/trace/events/rxrpc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index a13a62db3565..edc5c887a44c 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -1068,7 +1068,7 @@ TRACE_EVENT(rxrpc_recvmsg, ), TP_fast_assign( - __entry->call = call->debug_id; + __entry->call = call ? call->debug_id : 0; __entry->why = why; __entry->seq = seq; __entry->offset = offset; -- cgit v1.2.3 From 4cf6c57e61fee954f7b7685de31b80ec26843d27 Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 4 Oct 2019 17:05:58 +0100 Subject: net: phy: fix write to mii-ctrl1000 register When userspace writes to the MII_ADVERTISE register, we update phylib's advertising mask and trigger a renegotiation. However, writing to the MII_CTRL1000 register, which contains the gigabit advertisement, does neither. This can lead to phylib's copy of the advertisement becoming de-synced with the values in the PHY register set, which can result in incorrect negotiation resolution. Fixes: 5502b218e001 ("net: phy: use phy_resolve_aneg_linkmode in genphy_read_status") Reviewed-by: Andrew Lunn Signed-off-by: Russell King Signed-off-by: David S. Miller --- drivers/net/phy/phy.c | 5 +++++ include/linux/mii.h | 9 +++++++++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 7c92afd36bbe..119e6f466056 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -457,6 +457,11 @@ int phy_mii_ioctl(struct phy_device *phydev, struct ifreq *ifr, int cmd) val); change_autoneg = true; break; + case MII_CTRL1000: + mii_ctrl1000_mod_linkmode_adv_t(phydev->advertising, + val); + change_autoneg = true; + break; default: /* do nothing */ break; diff --git a/include/linux/mii.h b/include/linux/mii.h index 5cd824c1c0ca..4ce8901a1af6 100644 --- a/include/linux/mii.h +++ b/include/linux/mii.h @@ -455,6 +455,15 @@ static inline void mii_lpa_mod_linkmode_lpa_t(unsigned long *lp_advertising, lp_advertising, lpa & LPA_LPACK); } +static inline void mii_ctrl1000_mod_linkmode_adv_t(unsigned long *advertising, + u32 ctrl1000) +{ + linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Half_BIT, advertising, + ctrl1000 & ADVERTISE_1000HALF); + linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, advertising, + ctrl1000 & ADVERTISE_1000FULL); +} + /** * linkmode_adv_to_lcl_adv_t * @advertising:pointer to linkmode advertising -- cgit v1.2.3 From 8d3dc3ac9dd6801c732a72ca6979698c38451b4f Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 4 Oct 2019 17:06:04 +0100 Subject: net: phy: extract link partner advertisement reading Move reading the link partner advertisement out of genphy_read_status() into its own separate function. This will allow re-use of this code by PHY drivers that are able to read the resolved status from the PHY. Tested-by: tinywrkb Signed-off-by: Russell King Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/phy_device.c | 65 +++++++++++++++++++++++++++----------------- include/linux/phy.h | 1 + 2 files changed, 41 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index d347ddcac45b..9d2bbb13293e 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1783,32 +1783,9 @@ done: } EXPORT_SYMBOL(genphy_update_link); -/** - * genphy_read_status - check the link status and update current link state - * @phydev: target phy_device struct - * - * Description: Check the link, then figure out the current state - * by comparing what we advertise with what the link partner - * advertises. Start by checking the gigabit possibilities, - * then move on to 10/100. - */ -int genphy_read_status(struct phy_device *phydev) +int genphy_read_lpa(struct phy_device *phydev) { - int lpa, lpagb, err, old_link = phydev->link; - - /* Update the link, but return if there was an error */ - err = genphy_update_link(phydev); - if (err) - return err; - - /* why bother the PHY if nothing can have changed */ - if (phydev->autoneg == AUTONEG_ENABLE && old_link && phydev->link) - return 0; - - phydev->speed = SPEED_UNKNOWN; - phydev->duplex = DUPLEX_UNKNOWN; - phydev->pause = 0; - phydev->asym_pause = 0; + int lpa, lpagb; if (phydev->autoneg == AUTONEG_ENABLE && phydev->autoneg_complete) { if (phydev->is_gigabit_capable) { @@ -1838,6 +1815,44 @@ int genphy_read_status(struct phy_device *phydev) return lpa; mii_lpa_mod_linkmode_lpa_t(phydev->lp_advertising, lpa); + } + + return 0; +} +EXPORT_SYMBOL(genphy_read_lpa); + +/** + * genphy_read_status - check the link status and update current link state + * @phydev: target phy_device struct + * + * Description: Check the link, then figure out the current state + * by comparing what we advertise with what the link partner + * advertises. Start by checking the gigabit possibilities, + * then move on to 10/100. + */ +int genphy_read_status(struct phy_device *phydev) +{ + int err, old_link = phydev->link; + + /* Update the link, but return if there was an error */ + err = genphy_update_link(phydev); + if (err) + return err; + + /* why bother the PHY if nothing can have changed */ + if (phydev->autoneg == AUTONEG_ENABLE && old_link && phydev->link) + return 0; + + phydev->speed = SPEED_UNKNOWN; + phydev->duplex = DUPLEX_UNKNOWN; + phydev->pause = 0; + phydev->asym_pause = 0; + + err = genphy_read_lpa(phydev); + if (err < 0) + return err; + + if (phydev->autoneg == AUTONEG_ENABLE && phydev->autoneg_complete) { phy_resolve_aneg_linkmode(phydev); } else if (phydev->autoneg == AUTONEG_DISABLE) { int bmcr = phy_read(phydev, MII_BMCR); diff --git a/include/linux/phy.h b/include/linux/phy.h index a7ecbe0e55aa..7abee820d05c 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1076,6 +1076,7 @@ int genphy_config_eee_advert(struct phy_device *phydev); int __genphy_config_aneg(struct phy_device *phydev, bool changed); int genphy_aneg_done(struct phy_device *phydev); int genphy_update_link(struct phy_device *phydev); +int genphy_read_lpa(struct phy_device *phydev); int genphy_read_status(struct phy_device *phydev); int genphy_suspend(struct phy_device *phydev); int genphy_resume(struct phy_device *phydev); -- cgit v1.2.3 From 2d880b8709c013d47472f85a9d42ea1aca3bce47 Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 4 Oct 2019 17:06:09 +0100 Subject: net: phy: extract pause mode Extract the update of phylib's software pause mode state from genphy_read_status(), so that we can re-use this functionality with PHYs that have alternative ways to read the negotiation results. Tested-by: tinywrkb Reviewed-by: Andrew Lunn Signed-off-by: Russell King Signed-off-by: David S. Miller --- drivers/net/phy/phy-core.c | 20 +++++++++++++------- include/linux/phy.h | 1 + 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c index 369903d9b6ec..9412669b579c 100644 --- a/drivers/net/phy/phy-core.c +++ b/drivers/net/phy/phy-core.c @@ -283,6 +283,18 @@ void of_set_phy_eee_broken(struct phy_device *phydev) phydev->eee_broken_modes = broken; } +void phy_resolve_aneg_pause(struct phy_device *phydev) +{ + if (phydev->duplex == DUPLEX_FULL) { + phydev->pause = linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, + phydev->lp_advertising); + phydev->asym_pause = linkmode_test_bit( + ETHTOOL_LINK_MODE_Asym_Pause_BIT, + phydev->lp_advertising); + } +} +EXPORT_SYMBOL_GPL(phy_resolve_aneg_pause); + /** * phy_resolve_aneg_linkmode - resolve the advertisements into phy settings * @phydev: The phy_device struct @@ -305,13 +317,7 @@ void phy_resolve_aneg_linkmode(struct phy_device *phydev) break; } - if (phydev->duplex == DUPLEX_FULL) { - phydev->pause = linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, - phydev->lp_advertising); - phydev->asym_pause = linkmode_test_bit( - ETHTOOL_LINK_MODE_Asym_Pause_BIT, - phydev->lp_advertising); - } + phy_resolve_aneg_pause(phydev); } EXPORT_SYMBOL_GPL(phy_resolve_aneg_linkmode); diff --git a/include/linux/phy.h b/include/linux/phy.h index 7abee820d05c..9a0e981df502 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -678,6 +678,7 @@ static inline bool phy_is_started(struct phy_device *phydev) return phydev->state >= PHY_UP; } +void phy_resolve_aneg_pause(struct phy_device *phydev); void phy_resolve_aneg_linkmode(struct phy_device *phydev); /** -- cgit v1.2.3 From f1da567f1dc1b55d178b8f2d0cfe8353858aac19 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 5 Oct 2019 23:04:47 +0200 Subject: driver core: platform: Add platform_get_irq_byname_optional() Some drivers (e.g dwc3) first try to get an IRQ byname and then fall back to the one at index 0. In this case we do not want the error(s) printed by platform_get_irq_byname(). This commit adds a new platform_get_irq_byname_optional(), which does not print errors, for this. While at it also improve the kdoc text for platform_get_irq_byname() a bit. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=205037 Signed-off-by: Hans de Goede Reviewed-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20191005210449.3926-2-hdegoede@redhat.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/platform.c | 46 ++++++++++++++++++++++++++++++++++------- include/linux/platform_device.h | 2 ++ 2 files changed, 41 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/base/platform.c b/drivers/base/platform.c index b6c6c7d97d5b..b230beb6ccb4 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -241,12 +241,8 @@ struct resource *platform_get_resource_byname(struct platform_device *dev, } EXPORT_SYMBOL_GPL(platform_get_resource_byname); -/** - * platform_get_irq_byname - get an IRQ for a device by name - * @dev: platform device - * @name: IRQ name - */ -int platform_get_irq_byname(struct platform_device *dev, const char *name) +static int __platform_get_irq_byname(struct platform_device *dev, + const char *name) { struct resource *r; @@ -262,11 +258,47 @@ int platform_get_irq_byname(struct platform_device *dev, const char *name) if (r) return r->start; - dev_err(&dev->dev, "IRQ %s not found\n", name); return -ENXIO; } + +/** + * platform_get_irq_byname - get an IRQ for a device by name + * @dev: platform device + * @name: IRQ name + * + * Get an IRQ like platform_get_irq(), but then by name rather then by index. + * + * Return: IRQ number on success, negative error number on failure. + */ +int platform_get_irq_byname(struct platform_device *dev, const char *name) +{ + int ret; + + ret = __platform_get_irq_byname(dev, name); + if (ret < 0 && ret != -EPROBE_DEFER) + dev_err(&dev->dev, "IRQ %s not found\n", name); + + return ret; +} EXPORT_SYMBOL_GPL(platform_get_irq_byname); +/** + * platform_get_irq_byname_optional - get an optional IRQ for a device by name + * @dev: platform device + * @name: IRQ name + * + * Get an optional IRQ by name like platform_get_irq_byname(). Except that it + * does not print an error message if an IRQ can not be obtained. + * + * Return: IRQ number on success, negative error number on failure. + */ +int platform_get_irq_byname_optional(struct platform_device *dev, + const char *name) +{ + return __platform_get_irq_byname(dev, name); +} +EXPORT_SYMBOL_GPL(platform_get_irq_byname_optional); + /** * platform_add_devices - add a numbers of platform devices * @devs: array of platform devices to add diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 1b5cec067533..f2688404d1cd 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -64,6 +64,8 @@ extern struct resource *platform_get_resource_byname(struct platform_device *, unsigned int, const char *); extern int platform_get_irq_byname(struct platform_device *, const char *); +extern int platform_get_irq_byname_optional(struct platform_device *dev, + const char *name); extern int platform_add_devices(struct platform_device **, int); struct platform_device_info { -- cgit v1.2.3 From 047d50aee341d940350897c85799e56ae57c3849 Mon Sep 17 00:00:00 2001 From: Peter Jones Date: Wed, 2 Oct 2019 18:59:00 +0200 Subject: efi/tpm: Don't access event->count when it isn't mapped Some machines generate a lot of event log entries. When we're iterating over them, the code removes the old mapping and adds a new one, so once we cross the page boundary we're unmapping the page with the count on it. Hilarity ensues. This patch keeps the info from the header in local variables so we don't need to access that page again or keep track of if it's mapped. Tested-by: Lyude Paul Signed-off-by: Peter Jones Signed-off-by: Jarkko Sakkinen Signed-off-by: Ard Biesheuvel Reviewed-by: Jarkko Sakkinen Acked-by: Matthew Garrett Acked-by: Ard Biesheuvel Cc: Ben Dooks Cc: Dave Young Cc: Jerry Snitselaar Cc: Linus Torvalds Cc: Lukas Wunner Cc: Octavian Purdila Cc: Peter Zijlstra Cc: Scott Talbert Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Cc: linux-integrity@vger.kernel.org Cc: stable@vger.kernel.org Fixes: 44038bc514a2 ("tpm: Abstract crypto agile event size calculations") Link: https://lkml.kernel.org/r/20191002165904.8819-4-ard.biesheuvel@linaro.org [ Minor edits. ] Signed-off-by: Ingo Molnar --- include/linux/tpm_eventlog.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/tpm_eventlog.h b/include/linux/tpm_eventlog.h index 63238c84dc0b..b50cc3adca18 100644 --- a/include/linux/tpm_eventlog.h +++ b/include/linux/tpm_eventlog.h @@ -170,6 +170,7 @@ static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event, u16 halg; int i; int j; + u32 count, event_type; marker = event; marker_start = marker; @@ -190,16 +191,22 @@ static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event, } event = (struct tcg_pcr_event2_head *)mapping; + /* + * The loop below will unmap these fields if the log is larger than + * one page, so save them here for reference: + */ + count = READ_ONCE(event->count); + event_type = READ_ONCE(event->event_type); efispecid = (struct tcg_efi_specid_event_head *)event_header->event; /* Check if event is malformed. */ - if (event->count > efispecid->num_algs) { + if (count > efispecid->num_algs) { size = 0; goto out; } - for (i = 0; i < event->count; i++) { + for (i = 0; i < count; i++) { halg_size = sizeof(event->digests[i].alg_id); /* Map the digest's algorithm identifier */ @@ -256,8 +263,9 @@ static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event, + event_field->event_size; size = marker - marker_start; - if ((event->event_type == 0) && (event_field->event_size == 0)) + if (event_type == 0 && event_field->event_size == 0) size = 0; + out: if (do_mapping) TPM_MEMUNMAP(mapping, mapping_size); -- cgit v1.2.3 From e658c82be5561412c5e83b5e74e9da4830593f3e Mon Sep 17 00:00:00 2001 From: Jerry Snitselaar Date: Wed, 2 Oct 2019 18:59:02 +0200 Subject: efi/tpm: Only set 'efi_tpm_final_log_size' after successful event log parsing If __calc_tpm2_event_size() fails to parse an event it will return 0, resulting tpm2_calc_event_log_size() returning -1. Currently there is no check of this return value, and 'efi_tpm_final_log_size' can end up being set to this negative value resulting in a crash like this one: BUG: unable to handle page fault for address: ffffbc8fc00866ad #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page RIP: 0010:memcpy_erms+0x6/0x10 Call Trace: tpm_read_log_efi() tpm_bios_log_setup() tpm_chip_register() tpm_tis_core_init.cold.9+0x28c/0x466 tpm_tis_plat_probe() platform_drv_probe() ... Also __calc_tpm2_event_size() returns a size of 0 when it fails to parse an event, so update function documentation to reflect this. The root cause of the issue that caused the failure of event parsing in this case is resolved by Peter Jone's patchset dealing with large event logs where crossing over a page boundary causes the page with the event count to be unmapped. Signed-off-by: Jerry Snitselaar Signed-off-by: Ard Biesheuvel Cc: Ben Dooks Cc: Dave Young Cc: Jarkko Sakkinen Cc: Linus Torvalds Cc: Lukas Wunner Cc: Lyude Paul Cc: Matthew Garrett Cc: Octavian Purdila Cc: Peter Jones Cc: Peter Zijlstra Cc: Scott Talbert Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Cc: linux-integrity@vger.kernel.org Cc: stable@vger.kernel.org Fixes: c46f3405692de ("tpm: Reserve the TPM final events table") Link: https://lkml.kernel.org/r/20191002165904.8819-6-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar --- drivers/firmware/efi/tpm.c | 9 ++++++++- include/linux/tpm_eventlog.h | 2 +- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/firmware/efi/tpm.c b/drivers/firmware/efi/tpm.c index b9ae5c6f9b9c..703469c1ab8e 100644 --- a/drivers/firmware/efi/tpm.c +++ b/drivers/firmware/efi/tpm.c @@ -85,11 +85,18 @@ int __init efi_tpm_eventlog_init(void) final_tbl->nr_events, log_tbl->log); } + + if (tbl_size < 0) { + pr_err(FW_BUG "Failed to parse event in TPM Final Events Log\n"); + goto out_calc; + } + memblock_reserve((unsigned long)final_tbl, tbl_size + sizeof(*final_tbl)); - early_memunmap(final_tbl, sizeof(*final_tbl)); efi_tpm_final_log_size = tbl_size; +out_calc: + early_memunmap(final_tbl, sizeof(*final_tbl)); out: early_memunmap(log_tbl, sizeof(*log_tbl)); return ret; diff --git a/include/linux/tpm_eventlog.h b/include/linux/tpm_eventlog.h index b50cc3adca18..131ea1bad458 100644 --- a/include/linux/tpm_eventlog.h +++ b/include/linux/tpm_eventlog.h @@ -152,7 +152,7 @@ struct tcg_algorithm_info { * total. Once we've done this we know the offset of the data length field, * and can calculate the total size of the event. * - * Return: size of the event on success, <0 on failure + * Return: size of the event on success, 0 on failure */ static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event, -- cgit v1.2.3 From bf70b0503abd19194dba25fe383d143d0229dc6a Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 3 Oct 2019 16:58:21 +0900 Subject: module: swap the order of symbol.namespace Currently, EXPORT_SYMBOL_NS(_GPL) constructs the kernel symbol as follows: __ksymtab_SYMBOL.NAMESPACE The sym_extract_namespace() in modpost allocates memory for the part SYMBOL.NAMESPACE when '.' is contained. One problem is that the pointer returned by strdup() is lost because the symbol name will be copied to malloc'ed memory by alloc_symbol(). No one will keep track of the pointer of strdup'ed memory. sym->namespace still points to the NAMESPACE part. So, you can free it with complicated code like this: free(sym->namespace - strlen(sym->name) - 1); It complicates memory free. To fix it elegantly, I swapped the order of the symbol and the namespace as follows: __ksymtab_NAMESPACE.SYMBOL then, simplified sym_extract_namespace() so that it allocates memory only for the NAMESPACE part. I prefer this order because it is intuitive and also matches to major languages. For example, NAMESPACE::NAME in C++, MODULE.NAME in Python. Reviewed-by: Matthias Maennich Signed-off-by: Masahiro Yamada Signed-off-by: Jessica Yu --- include/linux/export.h | 4 ++-- scripts/mod/modpost.c | 16 +++++++--------- 2 files changed, 9 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/export.h b/include/linux/export.h index 95f55b7f83a0..0695d4e847d9 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -52,7 +52,7 @@ extern struct module __this_module; __ADDRESSABLE(sym) \ asm(" .section \"___ksymtab" sec "+" #sym "\", \"a\" \n" \ " .balign 4 \n" \ - "__ksymtab_" #sym NS_SEPARATOR #ns ": \n" \ + "__ksymtab_" #ns NS_SEPARATOR #sym ": \n" \ " .long " #sym "- . \n" \ " .long __kstrtab_" #sym "- . \n" \ " .long __kstrtab_ns_" #sym "- . \n" \ @@ -76,7 +76,7 @@ struct kernel_symbol { #else #define __KSYMTAB_ENTRY_NS(sym, sec, ns) \ static const struct kernel_symbol __ksymtab_##sym##__##ns \ - asm("__ksymtab_" #sym NS_SEPARATOR #ns) \ + asm("__ksymtab_" #ns NS_SEPARATOR #sym) \ __attribute__((section("___ksymtab" sec "+" #sym), used)) \ __aligned(sizeof(void *)) \ = { (unsigned long)&sym, __kstrtab_##sym, __kstrtab_ns_##sym } diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 3961941e8e7a..c4b536ac1327 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -350,18 +350,16 @@ static enum export export_from_sec(struct elf_info *elf, unsigned int sec) static const char *sym_extract_namespace(const char **symname) { - size_t n; - char *dupsymname; + char *namespace = NULL; + char *ns_separator; - n = strcspn(*symname, "."); - if (n < strlen(*symname) - 1) { - dupsymname = NOFAIL(strdup(*symname)); - dupsymname[n] = '\0'; - *symname = dupsymname; - return dupsymname + n + 1; + ns_separator = strchr(*symname, '.'); + if (ns_separator) { + namespace = NOFAIL(strndup(*symname, ns_separator - *symname)); + *symname = ns_separator + 1; } - return NULL; + return namespace; } /** -- cgit v1.2.3 From fa6643cdc5cd726b10d30eec45ff8dca267de735 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 3 Oct 2019 16:58:23 +0900 Subject: module: rename __kstrtab_ns_* to __kstrtabns_* to avoid symbol conflict The module namespace produces __strtab_ns_ symbols to store namespace strings, but it does not guarantee the name uniqueness. This is a potential problem because we have exported symbols starting with "ns_". For example, kernel/capability.c exports the following symbols: EXPORT_SYMBOL(ns_capable); EXPORT_SYMBOL(capable); Assume a situation where those are converted as follows: EXPORT_SYMBOL_NS(ns_capable, some_namespace); EXPORT_SYMBOL_NS(capable, some_namespace); The former expands to "__kstrtab_ns_capable" and "__kstrtab_ns_ns_capable", and the latter to "__kstrtab_capable" and "__kstrtab_ns_capable". Then, we have the duplicated "__kstrtab_ns_capable". To ensure the uniqueness, rename "__kstrtab_ns_*" to "__kstrtabns_*". Reviewed-by: Matthias Maennich Signed-off-by: Masahiro Yamada Signed-off-by: Jessica Yu --- include/linux/export.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/export.h b/include/linux/export.h index 0695d4e847d9..621158ecd2e2 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -55,7 +55,7 @@ extern struct module __this_module; "__ksymtab_" #ns NS_SEPARATOR #sym ": \n" \ " .long " #sym "- . \n" \ " .long __kstrtab_" #sym "- . \n" \ - " .long __kstrtab_ns_" #sym "- . \n" \ + " .long __kstrtabns_" #sym "- . \n" \ " .previous \n") #define __KSYMTAB_ENTRY(sym, sec) \ @@ -79,7 +79,7 @@ struct kernel_symbol { asm("__ksymtab_" #ns NS_SEPARATOR #sym) \ __attribute__((section("___ksymtab" sec "+" #sym), used)) \ __aligned(sizeof(void *)) \ - = { (unsigned long)&sym, __kstrtab_##sym, __kstrtab_ns_##sym } + = { (unsigned long)&sym, __kstrtab_##sym, __kstrtabns_##sym } #define __KSYMTAB_ENTRY(sym, sec) \ static const struct kernel_symbol __ksymtab_##sym \ @@ -112,7 +112,7 @@ struct kernel_symbol { /* For every exported symbol, place a struct in the __ksymtab section */ #define ___EXPORT_SYMBOL_NS(sym, sec, ns) \ ___export_symbol_common(sym, sec); \ - static const char __kstrtab_ns_##sym[] \ + static const char __kstrtabns_##sym[] \ __attribute__((section("__ksymtab_strings"), used, aligned(1))) \ = #ns; \ __KSYMTAB_ENTRY_NS(sym, sec, ns) -- cgit v1.2.3 From c512c69187197fe08026cb5bbe7b9709f4f89b73 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 7 Oct 2019 12:56:48 -0700 Subject: uaccess: implement a proper unsafe_copy_to_user() and switch filldir over to it In commit 9f79b78ef744 ("Convert filldir[64]() from __put_user() to unsafe_put_user()") I made filldir() use unsafe_put_user(), which improves code generation on x86 enormously. But because we didn't have a "unsafe_copy_to_user()", the dirent name copy was also done by hand with unsafe_put_user() in a loop, and it turns out that a lot of other architectures didn't like that, because unlike x86, they have various alignment issues. Most non-x86 architectures trap and fix it up, and some (like xtensa) will just fail unaligned put_user() accesses unconditionally. Which makes that "copy using put_user() in a loop" not work for them at all. I could make that code do explicit alignment etc, but the architectures that don't like unaligned accesses also don't really use the fancy "user_access_begin/end()" model, so they might just use the regular old __copy_to_user() interface. So this commit takes that looping implementation, turns it into the x86 version of "unsafe_copy_to_user()", and makes other architectures implement the unsafe copy version as __copy_to_user() (the same way they do for the other unsafe_xyz() accessor functions). Note that it only does this for the copying _to_ user space, and we still don't have a unsafe version of copy_from_user(). That's partly because we have no current users of it, but also partly because the copy_from_user() case is slightly different and cannot efficiently be implemented in terms of a unsafe_get_user() loop (because gcc can't do asm goto with outputs). It would be trivial to do this using "rep movsb", which would work really nicely on newer x86 cores, but really badly on some older ones. Al Viro is looking at cleaning up all our user copy routines to make this all a non-issue, but for now we have this simple-but-stupid version for x86 that works fine for the dirent name copy case because those names are short strings and we simply don't need anything fancier. Fixes: 9f79b78ef744 ("Convert filldir[64]() from __put_user() to unsafe_put_user()") Reported-by: Guenter Roeck Reported-and-tested-by: Tony Luck Cc: Al Viro Cc: Max Filippov Signed-off-by: Linus Torvalds --- arch/x86/include/asm/uaccess.h | 23 ++++++++++++++++++++++ fs/readdir.c | 44 ++---------------------------------------- include/linux/uaccess.h | 6 ++++-- 3 files changed, 29 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 35c225ede0e4..61d93f062a36 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -734,5 +734,28 @@ do { \ if (unlikely(__gu_err)) goto err_label; \ } while (0) +/* + * We want the unsafe accessors to always be inlined and use + * the error labels - thus the macro games. + */ +#define unsafe_copy_loop(dst, src, len, type, label) \ + while (len >= sizeof(type)) { \ + unsafe_put_user(*(type *)src,(type __user *)dst,label); \ + dst += sizeof(type); \ + src += sizeof(type); \ + len -= sizeof(type); \ + } + +#define unsafe_copy_to_user(_dst,_src,_len,label) \ +do { \ + char __user *__ucu_dst = (_dst); \ + const char *__ucu_src = (_src); \ + size_t __ucu_len = (_len); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u64, label); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u32, label); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u16, label); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, label); \ +} while (0) + #endif /* _ASM_X86_UACCESS_H */ diff --git a/fs/readdir.c b/fs/readdir.c index 19bea591c3f1..6e2623e57b2e 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -27,53 +27,13 @@ /* * Note the "unsafe_put_user() semantics: we goto a * label for errors. - * - * Also note how we use a "while()" loop here, even though - * only the biggest size needs to loop. The compiler (well, - * at least gcc) is smart enough to turn the smaller sizes - * into just if-statements, and this way we don't need to - * care whether 'u64' or 'u32' is the biggest size. - */ -#define unsafe_copy_loop(dst, src, len, type, label) \ - while (len >= sizeof(type)) { \ - unsafe_put_user(get_unaligned((type *)src), \ - (type __user *)dst, label); \ - dst += sizeof(type); \ - src += sizeof(type); \ - len -= sizeof(type); \ - } - -/* - * We avoid doing 64-bit copies on 32-bit architectures. They - * might be better, but the component names are mostly small, - * and the 64-bit cases can end up being much more complex and - * put much more register pressure on the code, so it's likely - * not worth the pain of unaligned accesses etc. - * - * So limit the copies to "unsigned long" size. I did verify - * that at least the x86-32 case is ok without this limiting, - * but I worry about random other legacy 32-bit cases that - * might not do as well. - */ -#define unsafe_copy_type(dst, src, len, type, label) do { \ - if (sizeof(type) <= sizeof(unsigned long)) \ - unsafe_copy_loop(dst, src, len, type, label); \ -} while (0) - -/* - * Copy the dirent name to user space, and NUL-terminate - * it. This should not be a function call, since we're doing - * the copy inside a "user_access_begin/end()" section. */ #define unsafe_copy_dirent_name(_dst, _src, _len, label) do { \ char __user *dst = (_dst); \ const char *src = (_src); \ size_t len = (_len); \ - unsafe_copy_type(dst, src, len, u64, label); \ - unsafe_copy_type(dst, src, len, u32, label); \ - unsafe_copy_type(dst, src, len, u16, label); \ - unsafe_copy_type(dst, src, len, u8, label); \ - unsafe_put_user(0, dst, label); \ + unsafe_put_user(0, dst+len, label); \ + unsafe_copy_to_user(dst, src, len, label); \ } while (0) diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index e47d0522a1f4..d4ee6e942562 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -355,8 +355,10 @@ extern long strnlen_unsafe_user(const void __user *unsafe_addr, long count); #ifndef user_access_begin #define user_access_begin(ptr,len) access_ok(ptr, len) #define user_access_end() do { } while (0) -#define unsafe_get_user(x, ptr, err) do { if (unlikely(__get_user(x, ptr))) goto err; } while (0) -#define unsafe_put_user(x, ptr, err) do { if (unlikely(__put_user(x, ptr))) goto err; } while (0) +#define unsafe_op_wrap(op, err) do { if (unlikely(op)) goto err; } while (0) +#define unsafe_get_user(x,p,e) unsafe_op_wrap(__get_user(x,p),e) +#define unsafe_put_user(x,p,e) unsafe_op_wrap(__put_user(x,p),e) +#define unsafe_copy_to_user(d,s,l,e) unsafe_op_wrap(__copy_to_user(d,s,l),e) static inline unsigned long user_access_save(void) { return 0UL; } static inline void user_access_restore(unsigned long flags) { } #endif -- cgit v1.2.3 From 08d1d0e6d0a00c6e687201774f3bf61177741e80 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Sun, 6 Oct 2019 17:58:15 -0700 Subject: memcg: only record foreign writebacks with dirty pages when memcg is not disabled In kdump kernel, memcg usually is disabled with 'cgroup_disable=memory' for saving memory. Now kdump kernel will always panic when dump vmcore to local disk: BUG: kernel NULL pointer dereference, address: 0000000000000ab8 Oops: 0000 [#1] SMP NOPTI CPU: 0 PID: 598 Comm: makedumpfile Not tainted 5.3.0+ #26 Hardware name: HPE ProLiant DL385 Gen10/ProLiant DL385 Gen10, BIOS A40 10/02/2018 RIP: 0010:mem_cgroup_track_foreign_dirty_slowpath+0x38/0x140 Call Trace: __set_page_dirty+0x52/0xc0 iomap_set_page_dirty+0x50/0x90 iomap_write_end+0x6e/0x270 iomap_write_actor+0xce/0x170 iomap_apply+0xba/0x11e iomap_file_buffered_write+0x62/0x90 xfs_file_buffered_aio_write+0xca/0x320 [xfs] new_sync_write+0x12d/0x1d0 vfs_write+0xa5/0x1a0 ksys_write+0x59/0xd0 do_syscall_64+0x59/0x1e0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 And this will corrupt the 1st kernel too with 'cgroup_disable=memory'. Via the trace and with debugging, it is pointing to commit 97b27821b485 ("writeback, memcg: Implement foreign dirty flushing") which introduced this regression. Disabling memcg causes the null pointer dereference at uninitialized data in function mem_cgroup_track_foreign_dirty_slowpath(). Fix it by returning directly if memcg is disabled, but not trying to record the foreign writebacks with dirty pages. Link: http://lkml.kernel.org/r/20190924141928.GD31919@MiWiFi-R3L-srv Fixes: 97b27821b485 ("writeback, memcg: Implement foreign dirty flushing") Signed-off-by: Baoquan He Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Jan Kara Cc: Tejun Heo Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9b60863429cc..98380779f6d5 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1264,6 +1264,9 @@ void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, static inline void mem_cgroup_track_foreign_dirty(struct page *page, struct bdi_writeback *wb) { + if (mem_cgroup_disabled()) + return; + if (unlikely(&page->mem_cgroup->css != wb->memcg_css)) mem_cgroup_track_foreign_dirty_slowpath(page, wb); } -- cgit v1.2.3 From 9783aa9917f8ae24759e67bf882f1aba32fe4ea1 Mon Sep 17 00:00:00 2001 From: Chris Down Date: Sun, 6 Oct 2019 17:58:32 -0700 Subject: mm, memcg: proportional memory.{low,min} reclaim cgroup v2 introduces two memory protection thresholds: memory.low (best-effort) and memory.min (hard protection). While they generally do what they say on the tin, there is a limitation in their implementation that makes them difficult to use effectively: that cliff behaviour often manifests when they become eligible for reclaim. This patch implements more intuitive and usable behaviour, where we gradually mount more reclaim pressure as cgroups further and further exceed their protection thresholds. This cliff edge behaviour happens because we only choose whether or not to reclaim based on whether the memcg is within its protection limits (see the use of mem_cgroup_protected in shrink_node), but we don't vary our reclaim behaviour based on this information. Imagine the following timeline, with the numbers the lruvec size in this zone: 1. memory.low=1000000, memory.current=999999. 0 pages may be scanned. 2. memory.low=1000000, memory.current=1000000. 0 pages may be scanned. 3. memory.low=1000000, memory.current=1000001. 1000001* pages may be scanned. (?!) * Of course, we won't usually scan all available pages in the zone even without this patch because of scan control priority, over-reclaim protection, etc. However, as shown by the tests at the end, these techniques don't sufficiently throttle such an extreme change in input, so cliff-like behaviour isn't really averted by their existence alone. Here's an example of how this plays out in practice. At Facebook, we are trying to protect various workloads from "system" software, like configuration management tools, metric collectors, etc (see this[0] case study). In order to find a suitable memory.low value, we start by determining the expected memory range within which the workload will be comfortable operating. This isn't an exact science -- memory usage deemed "comfortable" will vary over time due to user behaviour, differences in composition of work, etc, etc. As such we need to ballpark memory.low, but doing this is currently problematic: 1. If we end up setting it too low for the workload, it won't have *any* effect (see discussion above). The group will receive the full weight of reclaim and won't have any priority while competing with the less important system software, as if we had no memory.low configured at all. 2. Because of this behaviour, we end up erring on the side of setting it too high, such that the comfort range is reliably covered. However, protected memory is completely unavailable to the rest of the system, so we might cause undue memory and IO pressure there when we *know* we have some elasticity in the workload. 3. Even if we get the value totally right, smack in the middle of the comfort zone, we get extreme jumps between no pressure and full pressure that cause unpredictable pressure spikes in the workload due to the current binary reclaim behaviour. With this patch, we can set it to our ballpark estimation without too much worry. Any undesirable behaviour, such as too much or too little reclaim pressure on the workload or system will be proportional to how far our estimation is off. This means we can set memory.low much more conservatively and thus waste less resources *without* the risk of the workload falling off a cliff if we overshoot. As a more abstract technical description, this unintuitive behaviour results in having to give high-priority workloads a large protection buffer on top of their expected usage to function reliably, as otherwise we have abrupt periods of dramatically increased memory pressure which hamper performance. Having to set these thresholds so high wastes resources and generally works against the principle of work conservation. In addition, having proportional memory reclaim behaviour has other benefits. Most notably, before this patch it's basically mandatory to set memory.low to a higher than desirable value because otherwise as soon as you exceed memory.low, all protection is lost, and all pages are eligible to scan again. By contrast, having a gradual ramp in reclaim pressure means that you now still get some protection when thresholds are exceeded, which means that one can now be more comfortable setting memory.low to lower values without worrying that all protection will be lost. This is important because workingset size is really hard to know exactly, especially with variable workloads, so at least getting *some* protection if your workingset size grows larger than you expect increases user confidence in setting memory.low without a huge buffer on top being needed. Thanks a lot to Johannes Weiner and Tejun Heo for their advice and assistance in thinking about how to make this work better. In testing these changes, I intended to verify that: 1. Changes in page scanning become gradual and proportional instead of binary. To test this, I experimented stepping further and further down memory.low protection on a workload that floats around 19G workingset when under memory.low protection, watching page scan rates for the workload cgroup: +------------+-----------------+--------------------+--------------+ | memory.low | test (pgscan/s) | control (pgscan/s) | % of control | +------------+-----------------+--------------------+--------------+ | 21G | 0 | 0 | N/A | | 17G | 867 | 3799 | 23% | | 12G | 1203 | 3543 | 34% | | 8G | 2534 | 3979 | 64% | | 4G | 3980 | 4147 | 96% | | 0 | 3799 | 3980 | 95% | +------------+-----------------+--------------------+--------------+ As you can see, the test kernel (with a kernel containing this patch) ramps up page scanning significantly more gradually than the control kernel (without this patch). 2. More gradual ramp up in reclaim aggression doesn't result in premature OOMs. To test this, I wrote a script that slowly increments the number of pages held by stress(1)'s --vm-keep mode until a production system entered severe overall memory contention. This script runs in a highly protected slice taking up the majority of available system memory. Watching vmstat revealed that page scanning continued essentially nominally between test and control, without causing forward reclaim progress to become arrested. [0]: https://facebookmicrosites.github.io/cgroup2/docs/overview.html#case-study-the-fbtax2-project [akpm@linux-foundation.org: reflow block comments to fit in 80 cols] [chris@chrisdown.name: handle cgroup_disable=memory when getting memcg protection] Link: http://lkml.kernel.org/r/20190201045711.GA18302@chrisdown.name Link: http://lkml.kernel.org/r/20190124014455.GA6396@chrisdown.name Signed-off-by: Chris Down Acked-by: Johannes Weiner Reviewed-by: Roman Gushchin Cc: Michal Hocko Cc: Tejun Heo Cc: Dennis Zhou Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 20 +++++--- include/linux/memcontrol.h | 20 ++++++++ mm/memcontrol.c | 5 ++ mm/vmscan.c | 82 ++++++++++++++++++++++++++++++--- 4 files changed, 115 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 0fa8c0e615c2..5361ebec3361 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -615,8 +615,8 @@ on an IO device and is an example of this type. Protections ----------- -A cgroup is protected to be allocated upto the configured amount of -the resource if the usages of all its ancestors are under their +A cgroup is protected upto the configured amount of the resource +as long as the usages of all its ancestors are under their protected levels. Protections can be hard guarantees or best effort soft boundaries. Protections can also be over-committed in which case only upto the amount available to the parent is protected among @@ -1096,7 +1096,10 @@ PAGE_SIZE multiple when read back. is within its effective min boundary, the cgroup's memory won't be reclaimed under any conditions. If there is no unprotected reclaimable memory available, OOM killer - is invoked. + is invoked. Above the effective min boundary (or + effective low boundary if it is higher), pages are reclaimed + proportionally to the overage, reducing reclaim pressure for + smaller overages. Effective min boundary is limited by memory.min values of all ancestor cgroups. If there is memory.min overcommitment @@ -1118,7 +1121,10 @@ PAGE_SIZE multiple when read back. Best-effort memory protection. If the memory usage of a cgroup is within its effective low boundary, the cgroup's memory won't be reclaimed unless memory can be reclaimed - from unprotected cgroups. + from unprotected cgroups. Above the effective low boundary (or + effective min boundary if it is higher), pages are reclaimed + proportionally to the overage, reducing reclaim pressure for + smaller overages. Effective low boundary is limited by memory.low values of all ancestor cgroups. If there is memory.low overcommitment @@ -2482,8 +2488,10 @@ system performance due to overreclaim, to the point where the feature becomes self-defeating. The memory.low boundary on the other hand is a top-down allocated -reserve. A cgroup enjoys reclaim protection when it's within its low, -which makes delegation of subtrees possible. +reserve. A cgroup enjoys reclaim protection when it's within its +effective low, which makes delegation of subtrees possible. It also +enjoys having reclaim pressure proportional to its overage when +above its effective low. The original high boundary, the hard limit, is defined as a strict limit that can not budge, even if the OOM killer has to be called. diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 98380779f6d5..fa9ba2edf7e0 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -356,6 +356,14 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } +static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg) +{ + if (mem_cgroup_disabled()) + return 0; + + return max(READ_ONCE(memcg->memory.emin), READ_ONCE(memcg->memory.elow)); +} + enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, struct mem_cgroup *memcg); @@ -537,6 +545,8 @@ void mem_cgroup_handle_over_high(void); unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); +unsigned long mem_cgroup_size(struct mem_cgroup *memcg); + void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p); @@ -829,6 +839,11 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, { } +static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg) +{ + return 0; +} + static inline enum mem_cgroup_protection mem_cgroup_protected( struct mem_cgroup *root, struct mem_cgroup *memcg) { @@ -968,6 +983,11 @@ static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) return 0; } +static inline unsigned long mem_cgroup_size(struct mem_cgroup *memcg) +{ + return 0; +} + static inline void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c313c49074ca..bdac56009a38 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1567,6 +1567,11 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) return max; } +unsigned long mem_cgroup_size(struct mem_cgroup *memcg) +{ + return page_counter_read(&memcg->memory); +} + static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, int order) { diff --git a/mm/vmscan.c b/mm/vmscan.c index e5d52d6a24af..dfefa1d99d1b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2459,17 +2459,80 @@ out: *lru_pages = 0; for_each_evictable_lru(lru) { int file = is_file_lru(lru); - unsigned long size; + unsigned long lruvec_size; unsigned long scan; + unsigned long protection; + + lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); + protection = mem_cgroup_protection(memcg); + + if (protection > 0) { + /* + * Scale a cgroup's reclaim pressure by proportioning + * its current usage to its memory.low or memory.min + * setting. + * + * This is important, as otherwise scanning aggression + * becomes extremely binary -- from nothing as we + * approach the memory protection threshold, to totally + * nominal as we exceed it. This results in requiring + * setting extremely liberal protection thresholds. It + * also means we simply get no protection at all if we + * set it too low, which is not ideal. + */ + unsigned long cgroup_size = mem_cgroup_size(memcg); + unsigned long baseline = 0; + + /* + * During the reclaim first pass, we only consider + * cgroups in excess of their protection setting, but if + * that doesn't produce free pages, we come back for a + * second pass where we reclaim from all groups. + * + * To maintain fairness in both cases, the first pass + * targets groups in proportion to their overage, and + * the second pass targets groups in proportion to their + * protection utilization. + * + * So on the first pass, a group whose size is 130% of + * its protection will be targeted at 30% of its size. + * On the second pass, a group whose size is at 40% of + * its protection will be + * targeted at 40% of its size. + */ + if (!sc->memcg_low_reclaim) + baseline = lruvec_size; + scan = lruvec_size * cgroup_size / protection - baseline; + + /* + * Don't allow the scan target to exceed the lruvec + * size, which otherwise could happen if we have >200% + * overage in the normal case, or >100% overage when + * sc->memcg_low_reclaim is set. + * + * This is important because other cgroups without + * memory.low have their scan target initially set to + * their lruvec size, so allowing values >100% of the + * lruvec size here could result in penalising cgroups + * with memory.low set even *more* than their peers in + * some cases in the case of large overages. + * + * Also, minimally target SWAP_CLUSTER_MAX pages to keep + * reclaim moving forwards. + */ + scan = clamp(scan, SWAP_CLUSTER_MAX, lruvec_size); + } else { + scan = lruvec_size; + } + + scan >>= sc->priority; - size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); - scan = size >> sc->priority; /* * If the cgroup's already been deleted, make sure to * scrape out the remaining cache. */ if (!scan && !mem_cgroup_online(memcg)) - scan = min(size, SWAP_CLUSTER_MAX); + scan = min(lruvec_size, SWAP_CLUSTER_MAX); switch (scan_balance) { case SCAN_EQUAL: @@ -2489,7 +2552,7 @@ out: case SCAN_ANON: /* Scan one type exclusively */ if ((scan_balance == SCAN_FILE) != file) { - size = 0; + lruvec_size = 0; scan = 0; } break; @@ -2498,7 +2561,7 @@ out: BUG(); } - *lru_pages += size; + *lru_pages += lruvec_size; nr[lru] = scan; } } @@ -2742,6 +2805,13 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) memcg_memory_event(memcg, MEMCG_LOW); break; case MEMCG_PROT_NONE: + /* + * All protection thresholds breached. We may + * still choose to vary the scan pressure + * applied based on by how much the cgroup in + * question has exceeded its protection + * thresholds (see get_scan_count). + */ break; } -- cgit v1.2.3 From 9de7ca46ad2688bd51e80f7119fefa301ad7f3fa Mon Sep 17 00:00:00 2001 From: Chris Down Date: Sun, 6 Oct 2019 17:58:35 -0700 Subject: mm, memcg: make memory.emin the baseline for utilisation determination Roman points out that when when we do the low reclaim pass, we scale the reclaim pressure relative to position between 0 and the maximum protection threshold. However, if the maximum protection is based on memory.elow, and memory.emin is above zero, this means we still may get binary behaviour on second-pass low reclaim. This is because we scale starting at 0, not starting at memory.emin, and since we don't scan at all below emin, we end up with cliff behaviour. This should be a fairly uncommon case since usually we don't go into the second pass, but it makes sense to scale our low reclaim pressure starting at emin. You can test this by catting two large sparse files, one in a cgroup with emin set to some moderate size compared to physical RAM, and another cgroup without any emin. In both cgroups, set an elow larger than 50% of physical RAM. The one with emin will have less page scanning, as reclaim pressure is lower. Rebase on top of and apply the same idea as what was applied to handle cgroup_memory=disable properly for the original proportional patch http://lkml.kernel.org/r/20190201045711.GA18302@chrisdown.name ("mm, memcg: Handle cgroup_disable=memory when getting memcg protection"). Link: http://lkml.kernel.org/r/20190201051810.GA18895@chrisdown.name Signed-off-by: Chris Down Suggested-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Cc: Dennis Zhou Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 19 +++++++++++----- mm/vmscan.c | 55 +++++++++++++++++++++++++++------------------- 2 files changed, 46 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index fa9ba2edf7e0..1cbad1248e5a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -356,12 +356,17 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } -static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg) +static inline void mem_cgroup_protection(struct mem_cgroup *memcg, + unsigned long *min, unsigned long *low) { - if (mem_cgroup_disabled()) - return 0; + if (mem_cgroup_disabled()) { + *min = 0; + *low = 0; + return; + } - return max(READ_ONCE(memcg->memory.emin), READ_ONCE(memcg->memory.elow)); + *min = READ_ONCE(memcg->memory.emin); + *low = READ_ONCE(memcg->memory.elow); } enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, @@ -839,9 +844,11 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, { } -static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg) +static inline void mem_cgroup_protection(struct mem_cgroup *memcg, + unsigned long *min, unsigned long *low) { - return 0; + *min = 0; + *low = 0; } static inline enum mem_cgroup_protection mem_cgroup_protected( diff --git a/mm/vmscan.c b/mm/vmscan.c index dfefa1d99d1b..70347d626fb3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2461,12 +2461,12 @@ out: int file = is_file_lru(lru); unsigned long lruvec_size; unsigned long scan; - unsigned long protection; + unsigned long min, low; lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); - protection = mem_cgroup_protection(memcg); + mem_cgroup_protection(memcg, &min, &low); - if (protection > 0) { + if (min || low) { /* * Scale a cgroup's reclaim pressure by proportioning * its current usage to its memory.low or memory.min @@ -2481,28 +2481,38 @@ out: * set it too low, which is not ideal. */ unsigned long cgroup_size = mem_cgroup_size(memcg); - unsigned long baseline = 0; /* - * During the reclaim first pass, we only consider - * cgroups in excess of their protection setting, but if - * that doesn't produce free pages, we come back for a - * second pass where we reclaim from all groups. + * If there is any protection in place, we adjust scan + * pressure in proportion to how much a group's current + * usage exceeds that, in percent. * - * To maintain fairness in both cases, the first pass - * targets groups in proportion to their overage, and - * the second pass targets groups in proportion to their - * protection utilization. - * - * So on the first pass, a group whose size is 130% of - * its protection will be targeted at 30% of its size. - * On the second pass, a group whose size is at 40% of - * its protection will be - * targeted at 40% of its size. + * There is one special case: in the first reclaim pass, + * we skip over all groups that are within their low + * protection. If that fails to reclaim enough pages to + * satisfy the reclaim goal, we come back and override + * the best-effort low protection. However, we still + * ideally want to honor how well-behaved groups are in + * that case instead of simply punishing them all + * equally. As such, we reclaim them based on how much + * of their best-effort protection they are using. Usage + * below memory.min is excluded from consideration when + * calculating utilisation, as it isn't ever + * reclaimable, so it might as well not exist for our + * purposes. */ - if (!sc->memcg_low_reclaim) - baseline = lruvec_size; - scan = lruvec_size * cgroup_size / protection - baseline; + if (sc->memcg_low_reclaim && low > min) { + /* + * Reclaim according to utilisation between min + * and low + */ + scan = lruvec_size * (cgroup_size - min) / + (low - min); + } else { + /* Reclaim according to protection overage */ + scan = lruvec_size * cgroup_size / + max(min, low) - lruvec_size; + } /* * Don't allow the scan target to exceed the lruvec @@ -2518,7 +2528,8 @@ out: * some cases in the case of large overages. * * Also, minimally target SWAP_CLUSTER_MAX pages to keep - * reclaim moving forwards. + * reclaim moving forwards, avoiding decremeting + * sc->priority further than desirable. */ scan = clamp(scan, SWAP_CLUSTER_MAX, lruvec_size); } else { -- cgit v1.2.3 From 1bc63fb1272be0773e925f78c0fbd06c89701d55 Mon Sep 17 00:00:00 2001 From: Chris Down Date: Sun, 6 Oct 2019 17:58:38 -0700 Subject: mm, memcg: make scan aggression always exclude protection This patch is an incremental improvement on the existing memory.{low,min} relative reclaim work to base its scan pressure calculations on how much protection is available compared to the current usage, rather than how much the current usage is over some protection threshold. This change doesn't change the experience for the user in the normal case too much. One benefit is that it replaces the (somewhat arbitrary) 100% cutoff with an indefinite slope, which makes it easier to ballpark a memory.low value. As well as this, the old methodology doesn't quite apply generically to machines with varying amounts of physical memory. Let's say we have a top level cgroup, workload.slice, and another top level cgroup, system-management.slice. We want to roughly give 12G to system-management.slice, so on a 32GB machine we set memory.low to 20GB in workload.slice, and on a 64GB machine we set memory.low to 52GB. However, because these are relative amounts to the total machine size, while the amount of memory we want to generally be willing to yield to system.slice is absolute (12G), we end up putting more pressure on system.slice just because we have a larger machine and a larger workload to fill it, which seems fairly unintuitive. With this new behaviour, we don't end up with this unintended side effect. Previously the way that memory.low protection works is that if you are 50% over a certain baseline, you get 50% of your normal scan pressure. This is certainly better than the previous cliff-edge behaviour, but it can be improved even further by always considering memory under the currently enforced protection threshold to be out of bounds. This means that we can set relatively low memory.low thresholds for variable or bursty workloads while still getting a reasonable level of protection, whereas with the previous version we may still trivially hit the 100% clamp. The previous 100% clamp is also somewhat arbitrary, whereas this one is more concretely based on the currently enforced protection threshold, which is likely easier to reason about. There is also a subtle issue with the way that proportional reclaim worked previously -- it promotes having no memory.low, since it makes pressure higher during low reclaim. This happens because we base our scan pressure modulation on how far memory.current is between memory.min and memory.low, but if memory.low is unset, we only use the overage method. In most cromulent configurations, this then means that we end up with *more* pressure than with no memory.low at all when we're in low reclaim, which is not really very usable or expected. With this patch, memory.low and memory.min affect reclaim pressure in a more understandable and composable way. For example, from a user standpoint, "protected" memory now remains untouchable from a reclaim aggression standpoint, and users can also have more confidence that bursty workloads will still receive some amount of guaranteed protection. Link: http://lkml.kernel.org/r/20190322160307.GA3316@chrisdown.name Signed-off-by: Chris Down Reviewed-by: Roman Gushchin Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Cc: Dennis Zhou Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 25 +++++++++---------- mm/vmscan.c | 61 +++++++++++++++------------------------------- 2 files changed, 32 insertions(+), 54 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1cbad1248e5a..ae703ea3ef48 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -356,17 +356,17 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } -static inline void mem_cgroup_protection(struct mem_cgroup *memcg, - unsigned long *min, unsigned long *low) +static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg, + bool in_low_reclaim) { - if (mem_cgroup_disabled()) { - *min = 0; - *low = 0; - return; - } + if (mem_cgroup_disabled()) + return 0; + + if (in_low_reclaim) + return READ_ONCE(memcg->memory.emin); - *min = READ_ONCE(memcg->memory.emin); - *low = READ_ONCE(memcg->memory.elow); + return max(READ_ONCE(memcg->memory.emin), + READ_ONCE(memcg->memory.elow)); } enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, @@ -844,11 +844,10 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, { } -static inline void mem_cgroup_protection(struct mem_cgroup *memcg, - unsigned long *min, unsigned long *low) +static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg, + bool in_low_reclaim) { - *min = 0; - *low = 0; + return 0; } static inline enum mem_cgroup_protection mem_cgroup_protected( diff --git a/mm/vmscan.c b/mm/vmscan.c index 70347d626fb3..c6659bb758a4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2461,12 +2461,13 @@ out: int file = is_file_lru(lru); unsigned long lruvec_size; unsigned long scan; - unsigned long min, low; + unsigned long protection; lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); - mem_cgroup_protection(memcg, &min, &low); + protection = mem_cgroup_protection(memcg, + sc->memcg_low_reclaim); - if (min || low) { + if (protection) { /* * Scale a cgroup's reclaim pressure by proportioning * its current usage to its memory.low or memory.min @@ -2479,13 +2480,10 @@ out: * setting extremely liberal protection thresholds. It * also means we simply get no protection at all if we * set it too low, which is not ideal. - */ - unsigned long cgroup_size = mem_cgroup_size(memcg); - - /* - * If there is any protection in place, we adjust scan - * pressure in proportion to how much a group's current - * usage exceeds that, in percent. + * + * If there is any protection in place, we reduce scan + * pressure by how much of the total memory used is + * within protection thresholds. * * There is one special case: in the first reclaim pass, * we skip over all groups that are within their low @@ -2495,43 +2493,24 @@ out: * ideally want to honor how well-behaved groups are in * that case instead of simply punishing them all * equally. As such, we reclaim them based on how much - * of their best-effort protection they are using. Usage - * below memory.min is excluded from consideration when - * calculating utilisation, as it isn't ever - * reclaimable, so it might as well not exist for our - * purposes. + * memory they are using, reducing the scan pressure + * again by how much of the total memory used is under + * hard protection. */ - if (sc->memcg_low_reclaim && low > min) { - /* - * Reclaim according to utilisation between min - * and low - */ - scan = lruvec_size * (cgroup_size - min) / - (low - min); - } else { - /* Reclaim according to protection overage */ - scan = lruvec_size * cgroup_size / - max(min, low) - lruvec_size; - } + unsigned long cgroup_size = mem_cgroup_size(memcg); + + /* Avoid TOCTOU with earlier protection check */ + cgroup_size = max(cgroup_size, protection); + + scan = lruvec_size - lruvec_size * protection / + cgroup_size; /* - * Don't allow the scan target to exceed the lruvec - * size, which otherwise could happen if we have >200% - * overage in the normal case, or >100% overage when - * sc->memcg_low_reclaim is set. - * - * This is important because other cgroups without - * memory.low have their scan target initially set to - * their lruvec size, so allowing values >100% of the - * lruvec size here could result in penalising cgroups - * with memory.low set even *more* than their peers in - * some cases in the case of large overages. - * - * Also, minimally target SWAP_CLUSTER_MAX pages to keep + * Minimally target SWAP_CLUSTER_MAX pages to keep * reclaim moving forwards, avoiding decremeting * sc->priority further than desirable. */ - scan = clamp(scan, SWAP_CLUSTER_MAX, lruvec_size); + scan = max(scan, SWAP_CLUSTER_MAX); } else { scan = lruvec_size; } -- cgit v1.2.3 From 59bb47985c1db229ccff8c5deebecd54fc77d2a9 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Sun, 6 Oct 2019 17:58:45 -0700 Subject: mm, sl[aou]b: guarantee natural alignment for kmalloc(power-of-two) In most configurations, kmalloc() happens to return naturally aligned (i.e. aligned to the block size itself) blocks for power of two sizes. That means some kmalloc() users might unknowingly rely on that alignment, until stuff breaks when the kernel is built with e.g. CONFIG_SLUB_DEBUG or CONFIG_SLOB, and blocks stop being aligned. Then developers have to devise workaround such as own kmem caches with specified alignment [1], which is not always practical, as recently evidenced in [2]. The topic has been discussed at LSF/MM 2019 [3]. Adding a 'kmalloc_aligned()' variant would not help with code unknowingly relying on the implicit alignment. For slab implementations it would either require creating more kmalloc caches, or allocate a larger size and only give back part of it. That would be wasteful, especially with a generic alignment parameter (in contrast with a fixed alignment to size). Ideally we should provide to mm users what they need without difficult workarounds or own reimplementations, so let's make the kmalloc() alignment to size explicitly guaranteed for power-of-two sizes under all configurations. What this means for the three available allocators? * SLAB object layout happens to be mostly unchanged by the patch. The implicitly provided alignment could be compromised with CONFIG_DEBUG_SLAB due to redzoning, however SLAB disables redzoning for caches with alignment larger than unsigned long long. Practically on at least x86 this includes kmalloc caches as they use cache line alignment, which is larger than that. Still, this patch ensures alignment on all arches and cache sizes. * SLUB layout is also unchanged unless redzoning is enabled through CONFIG_SLUB_DEBUG and boot parameter for the particular kmalloc cache. With this patch, explicit alignment is guaranteed with redzoning as well. This will result in more memory being wasted, but that should be acceptable in a debugging scenario. * SLOB has no implicit alignment so this patch adds it explicitly for kmalloc(). The potential downside is increased fragmentation. While pathological allocation scenarios are certainly possible, in my testing, after booting a x86_64 kernel+userspace with virtme, around 16MB memory was consumed by slab pages both before and after the patch, with difference in the noise. [1] https://lore.kernel.org/linux-btrfs/c3157c8e8e0e7588312b40c853f65c02fe6c957a.1566399731.git.christophe.leroy@c-s.fr/ [2] https://lore.kernel.org/linux-fsdevel/20190225040904.5557-1-ming.lei@redhat.com/ [3] https://lwn.net/Articles/787740/ [akpm@linux-foundation.org: documentation fixlet, per Matthew] Link: http://lkml.kernel.org/r/20190826111627.7505-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Matthew Wilcox (Oracle) Acked-by: Michal Hocko Acked-by: Kirill A. Shutemov Acked-by: Christoph Hellwig Cc: David Sterba Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Ming Lei Cc: Dave Chinner Cc: "Darrick J . Wong" Cc: Christoph Hellwig Cc: James Bottomley Cc: Vlastimil Babka Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/core-api/memory-allocation.rst | 4 +++ include/linux/slab.h | 4 +++ mm/slab_common.c | 11 +++++++- mm/slob.c | 42 ++++++++++++++++++++-------- 4 files changed, 49 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/Documentation/core-api/memory-allocation.rst b/Documentation/core-api/memory-allocation.rst index 7744aa3bf2e0..939e3dfc86e9 100644 --- a/Documentation/core-api/memory-allocation.rst +++ b/Documentation/core-api/memory-allocation.rst @@ -98,6 +98,10 @@ limited. The actual limit depends on the hardware and the kernel configuration, but it is a good practice to use `kmalloc` for objects smaller than page size. +The address of a chunk allocated with `kmalloc` is aligned to at least +ARCH_KMALLOC_MINALIGN bytes. For sizes which are a power of two, the +alignment is also guaranteed to be at least the respective size. + For large allocations you can use :c:func:`vmalloc` and :c:func:`vzalloc`, or directly request pages from the page allocator. The memory allocated by `vmalloc` and related functions is diff --git a/include/linux/slab.h b/include/linux/slab.h index ab2b98ad76e1..4d2a2fa55ed5 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -493,6 +493,10 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) * kmalloc is the normal method of allocating memory * for objects smaller than page size in the kernel. * + * The allocated object address is aligned to at least ARCH_KMALLOC_MINALIGN + * bytes. For @size of power of two bytes, the alignment is also guaranteed + * to be at least to the size. + * * The @flags argument may be one of the GFP flags defined at * include/linux/gfp.h and described at * :ref:`Documentation/core-api/mm-api.rst ` diff --git a/mm/slab_common.c b/mm/slab_common.c index 0a94cf858aa4..c29f03adca91 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1030,10 +1030,19 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, unsigned int useroffset, unsigned int usersize) { int err; + unsigned int align = ARCH_KMALLOC_MINALIGN; s->name = name; s->size = s->object_size = size; - s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size); + + /* + * For power of two sizes, guarantee natural alignment for kmalloc + * caches, regardless of SL*B debugging options. + */ + if (is_power_of_2(size)) + align = max(align, size); + s->align = calculate_alignment(flags, align, size); + s->useroffset = useroffset; s->usersize = usersize; diff --git a/mm/slob.c b/mm/slob.c index 835088d55645..fa53e9f73893 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -224,6 +224,7 @@ static void slob_free_pages(void *b, int order) * @sp: Page to look in. * @size: Size of the allocation. * @align: Allocation alignment. + * @align_offset: Offset in the allocated block that will be aligned. * @page_removed_from_list: Return parameter. * * Tries to find a chunk of memory at least @size bytes big within @page. @@ -234,7 +235,7 @@ static void slob_free_pages(void *b, int order) * true (set to false otherwise). */ static void *slob_page_alloc(struct page *sp, size_t size, int align, - bool *page_removed_from_list) + int align_offset, bool *page_removed_from_list) { slob_t *prev, *cur, *aligned = NULL; int delta = 0, units = SLOB_UNITS(size); @@ -243,8 +244,17 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align, for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) { slobidx_t avail = slob_units(cur); + /* + * 'aligned' will hold the address of the slob block so that the + * address 'aligned'+'align_offset' is aligned according to the + * 'align' parameter. This is for kmalloc() which prepends the + * allocated block with its size, so that the block itself is + * aligned when needed. + */ if (align) { - aligned = (slob_t *)ALIGN((unsigned long)cur, align); + aligned = (slob_t *) + (ALIGN((unsigned long)cur + align_offset, align) + - align_offset); delta = aligned - cur; } if (avail >= units + delta) { /* room enough? */ @@ -288,7 +298,8 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align, /* * slob_alloc: entry point into the slob allocator. */ -static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) +static void *slob_alloc(size_t size, gfp_t gfp, int align, int node, + int align_offset) { struct page *sp; struct list_head *slob_list; @@ -319,7 +330,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) if (sp->units < SLOB_UNITS(size)) continue; - b = slob_page_alloc(sp, size, align, &page_removed_from_list); + b = slob_page_alloc(sp, size, align, align_offset, &page_removed_from_list); if (!b) continue; @@ -356,7 +367,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) INIT_LIST_HEAD(&sp->slab_list); set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); set_slob_page_free(sp, slob_list); - b = slob_page_alloc(sp, size, align, &_unused); + b = slob_page_alloc(sp, size, align, align_offset, &_unused); BUG_ON(!b); spin_unlock_irqrestore(&slob_lock, flags); } @@ -458,7 +469,7 @@ static __always_inline void * __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) { unsigned int *m; - int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + int minalign = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); void *ret; gfp &= gfp_allowed_mask; @@ -466,19 +477,28 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) fs_reclaim_acquire(gfp); fs_reclaim_release(gfp); - if (size < PAGE_SIZE - align) { + if (size < PAGE_SIZE - minalign) { + int align = minalign; + + /* + * For power of two sizes, guarantee natural alignment for + * kmalloc()'d objects. + */ + if (is_power_of_2(size)) + align = max(minalign, (int) size); + if (!size) return ZERO_SIZE_PTR; - m = slob_alloc(size + align, gfp, align, node); + m = slob_alloc(size + minalign, gfp, align, node, minalign); if (!m) return NULL; *m = size; - ret = (void *)m + align; + ret = (void *)m + minalign; trace_kmalloc_node(caller, ret, - size, size + align, gfp, node); + size, size + minalign, gfp, node); } else { unsigned int order = get_order(size); @@ -579,7 +599,7 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) fs_reclaim_release(flags); if (c->size < PAGE_SIZE) { - b = slob_alloc(c->size, flags, c->align, node); + b = slob_alloc(c->size, flags, c->align, node, 0); trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size, SLOB_UNITS(c->size) * SLOB_UNIT, flags, node); -- cgit v1.2.3 From bec500777089b3c96c53681fc0aa6fee59711d4a Mon Sep 17 00:00:00 2001 From: Arvind Sankar Date: Mon, 7 Oct 2019 18:00:02 -0400 Subject: lib/string: Make memzero_explicit() inline instead of external With the use of the barrier implied by barrier_data(), there is no need for memzero_explicit() to be extern. Making it inline saves the overhead of a function call, and allows the code to be reused in arch/*/purgatory without having to duplicate the implementation. Tested-by: Hans de Goede Signed-off-by: Arvind Sankar Reviewed-by: Hans de Goede Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: H . Peter Anvin Cc: Herbert Xu Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephan Mueller Cc: Thomas Gleixner Cc: linux-crypto@vger.kernel.org Cc: linux-s390@vger.kernel.org Fixes: 906a4bb97f5d ("crypto: sha256 - Use get/put_unaligned_be32 to get input, memzero_explicit") Link: https://lkml.kernel.org/r/20191007220000.GA408752@rani.riverdale.lan Signed-off-by: Ingo Molnar --- include/linux/string.h | 21 ++++++++++++++++++++- lib/string.c | 21 --------------------- 2 files changed, 20 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/string.h b/include/linux/string.h index b2f9df7f0761..b6ccdc2c7f02 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -227,7 +227,26 @@ static inline bool strstarts(const char *str, const char *prefix) } size_t memweight(const void *ptr, size_t bytes); -void memzero_explicit(void *s, size_t count); + +/** + * memzero_explicit - Fill a region of memory (e.g. sensitive + * keying data) with 0s. + * @s: Pointer to the start of the area. + * @count: The size of the area. + * + * Note: usually using memset() is just fine (!), but in cases + * where clearing out _local_ data at the end of a scope is + * necessary, memzero_explicit() should be used instead in + * order to prevent the compiler from optimising away zeroing. + * + * memzero_explicit() doesn't need an arch-specific version as + * it just invokes the one of memset() implicitly. + */ +static inline void memzero_explicit(void *s, size_t count) +{ + memset(s, 0, count); + barrier_data(s); +} /** * kbasename - return the last part of a pathname. diff --git a/lib/string.c b/lib/string.c index cd7a10c19210..08ec58cc673b 100644 --- a/lib/string.c +++ b/lib/string.c @@ -748,27 +748,6 @@ void *memset(void *s, int c, size_t count) EXPORT_SYMBOL(memset); #endif -/** - * memzero_explicit - Fill a region of memory (e.g. sensitive - * keying data) with 0s. - * @s: Pointer to the start of the area. - * @count: The size of the area. - * - * Note: usually using memset() is just fine (!), but in cases - * where clearing out _local_ data at the end of a scope is - * necessary, memzero_explicit() should be used instead in - * order to prevent the compiler from optimising away zeroing. - * - * memzero_explicit() doesn't need an arch-specific version as - * it just invokes the one of memset() implicitly. - */ -void memzero_explicit(void *s, size_t count) -{ - memset(s, 0, count); - barrier_data(s); -} -EXPORT_SYMBOL(memzero_explicit); - #ifndef __HAVE_ARCH_MEMSET16 /** * memset16() - Fill a memory area with a uint16_t -- cgit v1.2.3 From e3f1271474182682638654021123b94e6ec1626b Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Wed, 2 Oct 2019 07:40:42 -0500 Subject: leds: core: Fix leds.h structure documentation Update the leds.h structure documentation to define the correct arguments. Signed-off-by: Dan Murphy Signed-off-by: Jacek Anaszewski --- include/linux/leds.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/leds.h b/include/linux/leds.h index b8df71193329..efb309dba914 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -247,7 +247,7 @@ extern void led_set_brightness(struct led_classdev *led_cdev, /** * led_set_brightness_sync - set LED brightness synchronously * @led_cdev: the LED to set - * @brightness: the brightness to set it to + * @value: the brightness to set it to * * Set an LED's brightness immediately. This function will block * the caller for the time required for accessing device registers, @@ -301,8 +301,7 @@ extern void led_sysfs_enable(struct led_classdev *led_cdev); /** * led_compose_name - compose LED class device name * @dev: LED controller device object - * @child: child fwnode_handle describing a LED or a group of synchronized LEDs; - * it must be provided only for fwnode based LEDs + * @init_data: the LED class device initialization data * @led_classdev_name: composed LED class device name * * Create LED class device name basing on the provided init_data argument. -- cgit v1.2.3 From af84537dbd1b39505d1f3d8023029b4a59666513 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Wed, 2 Oct 2019 10:40:55 -0400 Subject: SUNRPC: fix race to sk_err after xs_error_report Since commit 4f8943f80883 ("SUNRPC: Replace direct task wakeups from softirq context") there has been a race to the value of the sk_err if both XPRT_SOCK_WAKE_ERROR and XPRT_SOCK_WAKE_DISCONNECT are set. In that case, we may end up losing the sk_err value that existed when xs_error_report was called. Fix this by reverting to the previous behavior: instead of using SO_ERROR to retrieve the value at a later time (which might also return sk_err_soft), copy the sk_err value onto struct sock_xprt, and use that value to wake pending tasks. Signed-off-by: Benjamin Coddington Fixes: 4f8943f80883 ("SUNRPC: Replace direct task wakeups from softirq context") Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprtsock.h | 1 + net/sunrpc/xprtsock.c | 17 ++++++++--------- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h index 7638dbe7bc50..a940de03808d 100644 --- a/include/linux/sunrpc/xprtsock.h +++ b/include/linux/sunrpc/xprtsock.h @@ -61,6 +61,7 @@ struct sock_xprt { struct mutex recv_mutex; struct sockaddr_storage srcaddr; unsigned short srcport; + int xprt_err; /* * UDP socket buffer size parameters diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 9ac88722fa83..70e52f567b2a 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1249,19 +1249,21 @@ static void xs_error_report(struct sock *sk) { struct sock_xprt *transport; struct rpc_xprt *xprt; - int err; read_lock_bh(&sk->sk_callback_lock); if (!(xprt = xprt_from_sock(sk))) goto out; transport = container_of(xprt, struct sock_xprt, xprt); - err = -sk->sk_err; - if (err == 0) + transport->xprt_err = -sk->sk_err; + if (transport->xprt_err == 0) goto out; dprintk("RPC: xs_error_report client %p, error=%d...\n", - xprt, -err); - trace_rpc_socket_error(xprt, sk->sk_socket, err); + xprt, -transport->xprt_err); + trace_rpc_socket_error(xprt, sk->sk_socket, transport->xprt_err); + + /* barrier ensures xprt_err is set before XPRT_SOCK_WAKE_ERROR */ + smp_mb__before_atomic(); xs_run_error_worker(transport, XPRT_SOCK_WAKE_ERROR); out: read_unlock_bh(&sk->sk_callback_lock); @@ -2476,7 +2478,6 @@ static void xs_wake_write(struct sock_xprt *transport) static void xs_wake_error(struct sock_xprt *transport) { int sockerr; - int sockerr_len = sizeof(sockerr); if (!test_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state)) return; @@ -2485,9 +2486,7 @@ static void xs_wake_error(struct sock_xprt *transport) goto out; if (!test_and_clear_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state)) goto out; - if (kernel_getsockopt(transport->sock, SOL_SOCKET, SO_ERROR, - (char *)&sockerr, &sockerr_len) != 0) - goto out; + sockerr = xchg(&transport->xprt_err, 0); if (sockerr < 0) xprt_wake_pending_tasks(&transport->xprt, sockerr); out: -- cgit v1.2.3 From 294f69e662d1570703e9b56e95be37a9fd3afba5 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sat, 5 Oct 2019 09:46:42 -0700 Subject: compiler_attributes.h: Add 'fallthrough' pseudo keyword for switch/case use Reserve the pseudo keyword 'fallthrough' for the ability to convert the various case block /* fallthrough */ style comments to appear to be an actual reserved word with the same gcc case block missing fallthrough warning capability. All switch/case blocks now should end in one of: break; fallthrough; goto