From 44d79142ede8162fd67bf8ca4ddbda1fbcfa94f1 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Thu, 14 Mar 2024 17:49:31 +0000 Subject: bpf: Temporarily disable atomic operations in BPF arena Currently, the x86 JIT handling PROBE_MEM32 tagged accesses is not equipped to handle atomic accesses into PTR_TO_ARENA, as no PROBE_MEM32 tagging is performed and no handling is enabled for them. This will lead to unsafety as the offset into arena will dereferenced directly without turning it into a base + offset access into the arena region. Since the changes to the x86 JIT will be fairly involved, for now, temporarily disallow use of PTR_TO_ARENA as the destination operand for atomics until support is added to the JIT backend. Fixes: 2fe99eb0ccf2 ("bpf: Add x86-64 JIT support for PROBE_MEM32 pseudo instructions.") Reported-by: Kumar Kartikeya Dwivedi Signed-off-by: Puranjay Mohan Acked-by: Kumar Kartikeya Dwivedi Message-ID: <20240314174931.98702-1-puranjay12@gmail.com> Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 63749ad5ac6b..1dd3b99d1bb9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5682,6 +5682,13 @@ static bool is_flow_key_reg(struct bpf_verifier_env *env, int regno) return reg->type == PTR_TO_FLOW_KEYS; } +static bool is_arena_reg(struct bpf_verifier_env *env, int regno) +{ + const struct bpf_reg_state *reg = reg_state(env, regno); + + return reg->type == PTR_TO_ARENA; +} + static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = { #ifdef CONFIG_NET [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], @@ -7019,7 +7026,8 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i if (is_ctx_reg(env, insn->dst_reg) || is_pkt_reg(env, insn->dst_reg) || is_flow_key_reg(env, insn->dst_reg) || - is_sk_reg(env, insn->dst_reg)) { + is_sk_reg(env, insn->dst_reg) || + is_arena_reg(env, insn->dst_reg)) { verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", insn->dst_reg, reg_type_str(env, reg_state(env, insn->dst_reg)->type)); -- cgit v1.2.3 From 8076972468584d4a21dab9aa50e388b3ea9ad8c7 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Mon, 26 Feb 2024 13:07:24 +0106 Subject: printk: Update @console_may_schedule in console_trylock_spinning() console_trylock_spinning() may takeover the console lock from a schedulable context. Update @console_may_schedule to make sure it reflects a trylock acquire. Reported-by: Mukesh Ojha Closes: https://lore.kernel.org/lkml/20240222090538.23017-1-quic_mojha@quicinc.com Fixes: dbdda842fe96 ("printk: Add console owner and waiter logic to load balance console writes") Signed-off-by: John Ogness Reviewed-by: Mukesh Ojha Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/875xybmo2z.fsf@jogness.linutronix.de Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index b06f63e276c1..612c73333848 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2009,6 +2009,12 @@ static int console_trylock_spinning(void) */ mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_); + /* + * Update @console_may_schedule for trylock because the previous + * owner may have been schedulable. + */ + console_may_schedule = 0; + return 1; } -- cgit v1.2.3 From ee498a38f3177d9ee0213839d3a05b94272aa48c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Mar 2024 19:18:31 -0700 Subject: bpf: Clarify bpf_arena comments. Clarify two bpf_arena comments, use existing SZ_4G #define, improve page_cnt check. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20240315021834.62988-2-alexei.starovoitov@gmail.com --- kernel/bpf/arena.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 86571e760dd6..343c3456c8dd 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -38,7 +38,7 @@ /* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */ #define GUARD_SZ (1ull << sizeof(((struct bpf_insn *)0)->off) * 8) -#define KERN_VM_SZ ((1ull << 32) + GUARD_SZ) +#define KERN_VM_SZ (SZ_4G + GUARD_SZ) struct bpf_arena { struct bpf_map map; @@ -110,7 +110,7 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr) return ERR_PTR(-EINVAL); vm_range = (u64)attr->max_entries * PAGE_SIZE; - if (vm_range > (1ull << 32)) + if (vm_range > SZ_4G) return ERR_PTR(-E2BIG); if ((attr->map_extra >> 32) != ((attr->map_extra + vm_range - 1) >> 32)) @@ -301,7 +301,7 @@ static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long ad if (pgoff) return -EINVAL; - if (len > (1ull << 32)) + if (len > SZ_4G) return -E2BIG; /* if user_vm_start was specified at arena creation time */ @@ -322,7 +322,7 @@ static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long ad if (WARN_ON_ONCE(arena->user_vm_start)) /* checks at map creation time should prevent this */ return -EFAULT; - return round_up(ret, 1ull << 32); + return round_up(ret, SZ_4G); } static int arena_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) @@ -346,7 +346,7 @@ static int arena_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) return -EBUSY; /* Earlier checks should prevent this */ - if (WARN_ON_ONCE(vma->vm_end - vma->vm_start > (1ull << 32) || vma->vm_pgoff)) + if (WARN_ON_ONCE(vma->vm_end - vma->vm_start > SZ_4G || vma->vm_pgoff)) return -EFAULT; if (remember_vma(arena, vma)) @@ -420,7 +420,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt if (uaddr & ~PAGE_MASK) return 0; pgoff = compute_pgoff(arena, uaddr); - if (pgoff + page_cnt > page_cnt_max) + if (pgoff > page_cnt_max - page_cnt) /* requested address will be outside of user VMA */ return 0; } @@ -447,7 +447,13 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt goto out; uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE); - /* Earlier checks make sure that uaddr32 + page_cnt * PAGE_SIZE will not overflow 32-bit */ + /* Earlier checks made sure that uaddr32 + page_cnt * PAGE_SIZE - 1 + * will not overflow 32-bit. Lower 32-bit need to represent + * contiguous user address range. + * Map these pages at kern_vm_start base. + * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow + * lower 32-bit and it's ok. + */ ret = vm_area_map_pages(arena->kern_vm, kern_vm_start + uaddr32, kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE, pages); if (ret) { @@ -510,6 +516,11 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt) if (!page) continue; if (page_cnt == 1 && page_mapped(page)) /* mapped by some user process */ + /* Optimization for the common case of page_cnt==1: + * If page wasn't mapped into some user vma there + * is no need to call zap_pages which is slow. When + * page_cnt is big it's faster to do the batched zap. + */ zap_pages(arena, full_uaddr, 1); vm_area_unmap_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE); __free_page(page); -- cgit v1.2.3 From 203a6763ab699da0568fd2b76303d03bb121abd4 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 13 Mar 2024 16:32:27 -0700 Subject: Revert "crypto: pkcs7 - remove sha1 support" This reverts commit 16ab7cb5825fc3425c16ad2c6e53d827f382d7c6 because it broke iwd. iwd uses the KEYCTL_PKEY_* UAPIs via its dependency libell, and apparently it is relying on SHA-1 signature support. These UAPIs are fairly obscure, and their documentation does not mention which algorithms they support. iwd really should be using a properly supported userspace crypto library instead. Regardless, since something broke we have to revert the change. It may be possible that some parts of this commit can be reinstated without breaking iwd (e.g. probably the removal of MODULE_SIG_SHA1), but for now this just does a full revert to get things working again. Reported-by: Karel Balej Closes: https://lore.kernel.org/r/CZSHRUIJ4RKL.34T4EASV5DNJM@matfyz.cz Cc: Dimitri John Ledkov Signed-off-by: Eric Biggers Tested-by: Karel Balej Signed-off-by: Herbert Xu --- crypto/asymmetric_keys/mscode_parser.c | 3 ++ crypto/asymmetric_keys/pkcs7_parser.c | 4 ++ crypto/asymmetric_keys/public_key.c | 3 +- crypto/asymmetric_keys/signature.c | 2 +- crypto/asymmetric_keys/x509_cert_parser.c | 8 ++++ crypto/testmgr.h | 80 +++++++++++++++++++++++++++++++ include/linux/oid_registry.h | 4 ++ kernel/module/Kconfig | 5 ++ 8 files changed, 107 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/crypto/asymmetric_keys/mscode_parser.c b/crypto/asymmetric_keys/mscode_parser.c index 05402ef8964e..8aecbe4637f3 100644 --- a/crypto/asymmetric_keys/mscode_parser.c +++ b/crypto/asymmetric_keys/mscode_parser.c @@ -75,6 +75,9 @@ int mscode_note_digest_algo(void *context, size_t hdrlen, oid = look_up_OID(value, vlen); switch (oid) { + case OID_sha1: + ctx->digest_algo = "sha1"; + break; case OID_sha256: ctx->digest_algo = "sha256"; break; diff --git a/crypto/asymmetric_keys/pkcs7_parser.c b/crypto/asymmetric_keys/pkcs7_parser.c index 5b08c50722d0..231ad7b3789d 100644 --- a/crypto/asymmetric_keys/pkcs7_parser.c +++ b/crypto/asymmetric_keys/pkcs7_parser.c @@ -227,6 +227,9 @@ int pkcs7_sig_note_digest_algo(void *context, size_t hdrlen, struct pkcs7_parse_context *ctx = context; switch (ctx->last_oid) { + case OID_sha1: + ctx->sinfo->sig->hash_algo = "sha1"; + break; case OID_sha256: ctx->sinfo->sig->hash_algo = "sha256"; break; @@ -278,6 +281,7 @@ int pkcs7_sig_note_pkey_algo(void *context, size_t hdrlen, ctx->sinfo->sig->pkey_algo = "rsa"; ctx->sinfo->sig->encoding = "pkcs1"; break; + case OID_id_ecdsa_with_sha1: case OID_id_ecdsa_with_sha224: case OID_id_ecdsa_with_sha256: case OID_id_ecdsa_with_sha384: diff --git a/crypto/asymmetric_keys/public_key.c b/crypto/asymmetric_keys/public_key.c index e5f22691febd..e314fd57e6f8 100644 --- a/crypto/asymmetric_keys/public_key.c +++ b/crypto/asymmetric_keys/public_key.c @@ -115,7 +115,8 @@ software_key_determine_akcipher(const struct public_key *pkey, */ if (!hash_algo) return -EINVAL; - if (strcmp(hash_algo, "sha224") != 0 && + if (strcmp(hash_algo, "sha1") != 0 && + strcmp(hash_algo, "sha224") != 0 && strcmp(hash_algo, "sha256") != 0 && strcmp(hash_algo, "sha384") != 0 && strcmp(hash_algo, "sha512") != 0 && diff --git a/crypto/asymmetric_keys/signature.c b/crypto/asymmetric_keys/signature.c index 398983be77e8..2deff81f8af5 100644 --- a/crypto/asymmetric_keys/signature.c +++ b/crypto/asymmetric_keys/signature.c @@ -115,7 +115,7 @@ EXPORT_SYMBOL_GPL(decrypt_blob); * Sign the specified data blob using the private key specified by params->key. * The signature is wrapped in an encoding if params->encoding is specified * (eg. "pkcs1"). If the encoding needs to know the digest type, this can be - * passed through params->hash_algo (eg. "sha512"). + * passed through params->hash_algo (eg. "sha1"). * * Returns the length of the data placed in the signature buffer or an error. */ diff --git a/crypto/asymmetric_keys/x509_cert_parser.c b/crypto/asymmetric_keys/x509_cert_parser.c index 487204d39426..bb0bffa271b5 100644 --- a/crypto/asymmetric_keys/x509_cert_parser.c +++ b/crypto/asymmetric_keys/x509_cert_parser.c @@ -198,6 +198,10 @@ int x509_note_sig_algo(void *context, size_t hdrlen, unsigned char tag, default: return -ENOPKG; /* Unsupported combination */ + case OID_sha1WithRSAEncryption: + ctx->cert->sig->hash_algo = "sha1"; + goto rsa_pkcs1; + case OID_sha256WithRSAEncryption: ctx->cert->sig->hash_algo = "sha256"; goto rsa_pkcs1; @@ -214,6 +218,10 @@ int x509_note_sig_algo(void *context, size_t hdrlen, unsigned char tag, ctx->cert->sig->hash_algo = "sha224"; goto rsa_pkcs1; + case OID_id_ecdsa_with_sha1: + ctx->cert->sig->hash_algo = "sha1"; + goto ecdsa; + case OID_id_rsassa_pkcs1_v1_5_with_sha3_256: ctx->cert->sig->hash_algo = "sha3-256"; goto rsa_pkcs1; diff --git a/crypto/testmgr.h b/crypto/testmgr.h index 986f331a5fc2..12e1c892f366 100644 --- a/crypto/testmgr.h +++ b/crypto/testmgr.h @@ -653,6 +653,30 @@ static const struct akcipher_testvec rsa_tv_template[] = { static const struct akcipher_testvec ecdsa_nist_p192_tv_template[] = { { .key = + "\x04\xf7\x46\xf8\x2f\x15\xf6\x22\x8e\xd7\x57\x4f\xcc\xe7\xbb\xc1" + "\xd4\x09\x73\xcf\xea\xd0\x15\x07\x3d\xa5\x8a\x8a\x95\x43\xe4\x68" + "\xea\xc6\x25\xc1\xc1\x01\x25\x4c\x7e\xc3\x3c\xa6\x04\x0a\xe7\x08" + "\x98", + .key_len = 49, + .params = + "\x30\x13\x06\x07\x2a\x86\x48\xce\x3d\x02\x01\x06\x08\x2a\x86\x48" + "\xce\x3d\x03\x01\x01", + .param_len = 21, + .m = + "\xcd\xb9\xd2\x1c\xb7\x6f\xcd\x44\xb3\xfd\x63\xea\xa3\x66\x7f\xae" + "\x63\x85\xe7\x82", + .m_size = 20, + .algo = OID_id_ecdsa_with_sha1, + .c = + "\x30\x35\x02\x19\x00\xba\xe5\x93\x83\x6e\xb6\x3b\x63\xa0\x27\x91" + "\xc6\xf6\x7f\xc3\x09\xad\x59\xad\x88\x27\xd6\x92\x6b\x02\x18\x10" + "\x68\x01\x9d\xba\xce\x83\x08\xef\x95\x52\x7b\xa0\x0f\xe4\x18\x86" + "\x80\x6f\xa5\x79\x77\xda\xd0", + .c_size = 55, + .public_key_vec = true, + .siggen_sigver_test = true, + }, { + .key = "\x04\xb6\x4b\xb1\xd1\xac\xba\x24\x8f\x65\xb2\x60\x00\x90\xbf\xbd" "\x78\x05\x73\xe9\x79\x1d\x6f\x7c\x0b\xd2\xc3\x93\xa7\x28\xe1\x75" "\xf7\xd5\x95\x1d\x28\x10\xc0\x75\x50\x5c\x1a\x4f\x3f\x8f\xa5\xee" @@ -756,6 +780,32 @@ static const struct akcipher_testvec ecdsa_nist_p192_tv_template[] = { static const struct akcipher_testvec ecdsa_nist_p256_tv_template[] = { { .key = + "\x04\xb9\x7b\xbb\xd7\x17\x64\xd2\x7e\xfc\x81\x5d\x87\x06\x83\x41" + "\x22\xd6\x9a\xaa\x87\x17\xec\x4f\x63\x55\x2f\x94\xba\xdd\x83\xe9" + "\x34\x4b\xf3\xe9\x91\x13\x50\xb6\xcb\xca\x62\x08\xe7\x3b\x09\xdc" + "\xc3\x63\x4b\x2d\xb9\x73\x53\xe4\x45\xe6\x7c\xad\xe7\x6b\xb0\xe8" + "\xaf", + .key_len = 65, + .params = + "\x30\x13\x06\x07\x2a\x86\x48\xce\x3d\x02\x01\x06\x08\x2a\x86\x48" + "\xce\x3d\x03\x01\x07", + .param_len = 21, + .m = + "\xc2\x2b\x5f\x91\x78\x34\x26\x09\x42\x8d\x6f\x51\xb2\xc5\xaf\x4c" + "\x0b\xde\x6a\x42", + .m_size = 20, + .algo = OID_id_ecdsa_with_sha1, + .c = + "\x30\x46\x02\x21\x00\xf9\x25\xce\x9f\x3a\xa6\x35\x81\xcf\xd4\xe7" + "\xb7\xf0\x82\x56\x41\xf7\xd4\xad\x8d\x94\x5a\x69\x89\xee\xca\x6a" + "\x52\x0e\x48\x4d\xcc\x02\x21\x00\xd7\xe4\xef\x52\x66\xd3\x5b\x9d" + "\x8a\xfa\x54\x93\x29\xa7\x70\x86\xf1\x03\x03\xf3\x3b\xe2\x73\xf7" + "\xfb\x9d\x8b\xde\xd4\x8d\x6f\xad", + .c_size = 72, + .public_key_vec = true, + .siggen_sigver_test = true, + }, { + .key = "\x04\x8b\x6d\xc0\x33\x8e\x2d\x8b\x67\xf5\xeb\xc4\x7f\xa0\xf5\xd9" "\x7b\x03\xa5\x78\x9a\xb5\xea\x14\xe4\x23\xd0\xaf\xd7\x0e\x2e\xa0" "\xc9\x8b\xdb\x95\xf8\xb3\xaf\xac\x00\x2c\x2c\x1f\x7a\xfd\x95\x88" @@ -866,6 +916,36 @@ static const struct akcipher_testvec ecdsa_nist_p256_tv_template[] = { static const struct akcipher_testvec ecdsa_nist_p384_tv_template[] = { { + .key = /* secp384r1(sha1) */ + "\x04\x89\x25\xf3\x97\x88\xcb\xb0\x78\xc5\x72\x9a\x14\x6e\x7a\xb1" + "\x5a\xa5\x24\xf1\x95\x06\x9e\x28\xfb\xc4\xb9\xbe\x5a\x0d\xd9\x9f" + "\xf3\xd1\x4d\x2d\x07\x99\xbd\xda\xa7\x66\xec\xbb\xea\xba\x79\x42" + "\xc9\x34\x89\x6a\xe7\x0b\xc3\xf2\xfe\x32\x30\xbe\xba\xf9\xdf\x7e" + "\x4b\x6a\x07\x8e\x26\x66\x3f\x1d\xec\xa2\x57\x91\x51\xdd\x17\x0e" + "\x0b\x25\xd6\x80\x5c\x3b\xe6\x1a\x98\x48\x91\x45\x7a\x73\xb0\xc3" + "\xf1", + .key_len = 97, + .params = + "\x30\x10\x06\x07\x2a\x86\x48\xce\x3d\x02\x01\x06\x05\x2b\x81\x04" + "\x00\x22", + .param_len = 18, + .m = + "\x12\x55\x28\xf0\x77\xd5\xb6\x21\x71\x32\x48\xcd\x28\xa8\x25\x22" + "\x3a\x69\xc1\x93", + .m_size = 20, + .algo = OID_id_ecdsa_with_sha1, + .c = + "\x30\x66\x02\x31\x00\xf5\x0f\x24\x4c\x07\x93\x6f\x21\x57\x55\x07" + "\x20\x43\x30\xde\xa0\x8d\x26\x8e\xae\x63\x3f\xbc\x20\x3a\xc6\xf1" + "\x32\x3c\xce\x70\x2b\x78\xf1\x4c\x26\xe6\x5b\x86\xcf\xec\x7c\x7e" + "\xd0\x87\xd7\xd7\x6e\x02\x31\x00\xcd\xbb\x7e\x81\x5d\x8f\x63\xc0" + "\x5f\x63\xb1\xbe\x5e\x4c\x0e\xa1\xdf\x28\x8c\x1b\xfa\xf9\x95\x88" + "\x74\xa0\x0f\xbf\xaf\xc3\x36\x76\x4a\xa1\x59\xf1\x1c\xa4\x58\x26" + "\x79\x12\x2a\xb7\xc5\x15\x92\xc5", + .c_size = 104, + .public_key_vec = true, + .siggen_sigver_test = true, + }, { .key = /* secp384r1(sha224) */ "\x04\x69\x6c\xcf\x62\xee\xd0\x0d\xe5\xb5\x2f\x70\x54\xcf\x26\xa0" "\xd9\x98\x8d\x92\x2a\xab\x9b\x11\xcb\x48\x18\xa1\xa9\x0d\xd5\x18" diff --git a/include/linux/oid_registry.h b/include/linux/oid_registry.h index 3921fbed0b28..51421fdbb0ba 100644 --- a/include/linux/oid_registry.h +++ b/include/linux/oid_registry.h @@ -17,10 +17,12 @@ * build_OID_registry.pl to generate the data for look_up_OID(). */ enum OID { + OID_id_dsa_with_sha1, /* 1.2.840.10030.4.3 */ OID_id_dsa, /* 1.2.840.10040.4.1 */ OID_id_ecPublicKey, /* 1.2.840.10045.2.1 */ OID_id_prime192v1, /* 1.2.840.10045.3.1.1 */ OID_id_prime256v1, /* 1.2.840.10045.3.1.7 */ + OID_id_ecdsa_with_sha1, /* 1.2.840.10045.4.1 */ OID_id_ecdsa_with_sha224, /* 1.2.840.10045.4.3.1 */ OID_id_ecdsa_with_sha256, /* 1.2.840.10045.4.3.2 */ OID_id_ecdsa_with_sha384, /* 1.2.840.10045.4.3.3 */ @@ -28,6 +30,7 @@ enum OID { /* PKCS#1 {iso(1) member-body(2) us(840) rsadsi(113549) pkcs(1) pkcs-1(1)} */ OID_rsaEncryption, /* 1.2.840.113549.1.1.1 */ + OID_sha1WithRSAEncryption, /* 1.2.840.113549.1.1.5 */ OID_sha256WithRSAEncryption, /* 1.2.840.113549.1.1.11 */ OID_sha384WithRSAEncryption, /* 1.2.840.113549.1.1.12 */ OID_sha512WithRSAEncryption, /* 1.2.840.113549.1.1.13 */ @@ -64,6 +67,7 @@ enum OID { OID_PKU2U, /* 1.3.5.1.5.2.7 */ OID_Scram, /* 1.3.6.1.5.5.14 */ OID_certAuthInfoAccess, /* 1.3.6.1.5.5.7.1.1 */ + OID_sha1, /* 1.3.14.3.2.26 */ OID_id_ansip384r1, /* 1.3.132.0.34 */ OID_sha256, /* 2.16.840.1.101.3.4.2.1 */ OID_sha384, /* 2.16.840.1.101.3.4.2.2 */ diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index 0ea1b2970a23..28db5b7589eb 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -236,6 +236,10 @@ choice possible to load a signed module containing the algorithm to check the signature on that module. +config MODULE_SIG_SHA1 + bool "Sign modules with SHA-1" + select CRYPTO_SHA1 + config MODULE_SIG_SHA256 bool "Sign modules with SHA-256" select CRYPTO_SHA256 @@ -265,6 +269,7 @@ endchoice config MODULE_SIG_HASH string depends on MODULE_SIG || IMA_APPRAISE_MODSIG + default "sha1" if MODULE_SIG_SHA1 default "sha256" if MODULE_SIG_SHA256 default "sha384" if MODULE_SIG_SHA384 default "sha512" if MODULE_SIG_SHA512 -- cgit v1.2.3 From f7f5d1808b1b66935a24dd796dd1a0612ca9c147 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Thu, 21 Mar 2024 15:39:39 +0000 Subject: bpf: verifier: fix addr_space_cast from as(1) to as(0) The verifier currently converts addr_space_cast from as(1) to as(0) that is: BPF_ALU64 | BPF_MOV | BPF_X with off=1 and imm=1 to BPF_ALU | BPF_MOV | BPF_X with imm=1 (32-bit mov) Because of this imm=1, the JITs that have bpf_jit_needs_zext() == true, interpret the converted instruction as BPF_ZEXT_REG(DST) which is a special form of mov32, used for doing explicit zero extension on dst. These JITs will just zero extend the dst reg and will not move the src to dst before the zext. Fix do_misc_fixups() to set imm=0 when converting addr_space_cast to a normal mov32. The JITs that have bpf_jit_needs_zext() == true rely on the verifier to emit zext instructions. Mark dst_reg as subreg when doing cast from as(1) to as(0) so the verifier emits a zext instruction after the mov. Fixes: 6082b6c328b5 ("bpf: Recognize addr_space_cast instruction in the verifier.") Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20240321153939.113996-1-puranjay12@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1dd3b99d1bb9..2cd7e0e79283 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14054,8 +14054,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (insn->imm) { /* off == BPF_ADDR_SPACE_CAST */ mark_reg_unknown(env, regs, insn->dst_reg); - if (insn->imm == 1) /* cast from as(1) to as(0) */ + if (insn->imm == 1) { /* cast from as(1) to as(0) */ dst_reg->type = PTR_TO_ARENA; + /* PTR_TO_ARENA is 32-bit */ + dst_reg->subreg_def = env->insn_idx + 1; + } } else if (insn->off == 0) { /* case: R1 = R2 * copy register state to dest reg @@ -19609,8 +19612,9 @@ static int do_misc_fixups(struct bpf_verifier_env *env) (((struct bpf_map *)env->prog->aux->arena)->map_flags & BPF_F_NO_USER_CONV)) { /* convert to 32-bit mov that clears upper 32-bit */ insn->code = BPF_ALU | BPF_MOV | BPF_X; - /* clear off, so it's a normal 'wX = wY' from JIT pov */ + /* clear off and imm, so it's a normal 'wX = wY' from JIT pov */ insn->off = 0; + insn->imm = 0; } /* cast from as(0) to as(1) should be handled by JIT */ goto next_insn; } -- cgit v1.2.3 From 122fdbd2a030a95128737fc77e47df15a8f170c3 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 22 Mar 2024 15:35:18 +0000 Subject: bpf: verifier: reject addr_space_cast insn without arena The verifier allows using the addr_space_cast instruction in a program that doesn't have an associated arena. This was caught in the form an invalid memory access in do_misc_fixups() when while converting addr_space_cast to a normal 32-bit mov, env->prog->aux->arena was dereferenced to check for BPF_F_NO_USER_CONV flag. Reject programs that include the addr_space_cast instruction but don't have an associated arena. root@rv-tester:~# ./reproducer Unable to handle kernel access to user memory without uaccess routines at virtual address 0000000000000030 Oops [#1] [] do_misc_fixups+0x43c/0x1168 [] bpf_check+0xda8/0x22b6 [] bpf_prog_load+0x486/0x8dc [] __sys_bpf+0xbd8/0x214e [] __riscv_sys_bpf+0x22/0x2a [] do_trap_ecall_u+0x102/0x17c [] ret_from_exception+0x0/0x64 Fixes: 6082b6c328b5 ("bpf: Recognize addr_space_cast instruction in the verifier.") Reported-by: xingwei lee Reported-by: yue sun Closes: https://lore.kernel.org/bpf/CABOYnLz09O1+2gGVJuCxd_24a-7UueXzV-Ff+Fr+h5EKFDiYCQ@mail.gmail.com/ Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20240322153518.11555-1-puranjay12@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2cd7e0e79283..0bfc0050db28 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14022,6 +14022,10 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) verbose(env, "addr_space_cast insn can only convert between address space 1 and 0\n"); return -EINVAL; } + if (!env->prog->aux->arena) { + verbose(env, "addr_space_cast insn can only be used in a program that has an associated arena\n"); + return -EINVAL; + } } else { if ((insn->off != 0 && insn->off != 8 && insn->off != 16 && insn->off != 32) || insn->imm) { -- cgit v1.2.3 From 0add699ad068d26e5b1da9ff28b15461fc4005df Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Wed, 20 Mar 2024 17:10:38 +0900 Subject: tracing: probes: Fix to zero initialize a local variable Fix to initialize 'val' local variable with zero. Dan reported that Smatch static code checker reports an error that a local 'val' variable needs to be initialized. Actually, the 'val' is expected to be initialized by FETCH_OP_ARG in the same loop, but it is not obvious. So initialize it with zero. Link: https://lore.kernel.org/all/171092223833.237219.17304490075697026697.stgit@devnote2/ Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/b010488e-68aa-407c-add0-3e059254aaa0@moroto.mountain/ Fixes: 25f00e40ce79 ("tracing/probes: Support $argN in return probe (kprobe and fprobe)") Reviewed-by: Steven Rostedt (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_probe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 217169de0920..dfe3ee6035ec 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -839,7 +839,7 @@ out: void store_trace_entry_data(void *edata, struct trace_probe *tp, struct pt_regs *regs) { struct probe_entry_arg *earg = tp->entry_arg; - unsigned long val; + unsigned long val = 0; int i; if (!earg) -- cgit v1.2.3 From c2ddeb29612f7ca84ed10c6d4f3ac99705135447 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 25 Mar 2024 13:58:08 +0100 Subject: genirq: Introduce IRQF_COND_ONESHOT and use it in pinctrl-amd There is a problem when a driver requests a shared interrupt line to run a threaded handler on it without IRQF_ONESHOT set if that flag has been set already for the IRQ in question by somebody else. Namely, the request fails which usually leads to a probe failure even though the driver might have worked just fine with IRQF_ONESHOT, but it does not want to use it by default. Currently, the only way to handle this is to try to request the IRQ without IRQF_ONESHOT, but with IRQF_PROBE_SHARED set and if this fails, try again with IRQF_ONESHOT set. However, this is a bit cumbersome and not very clean. When commit 7a36b901a6eb ("ACPI: OSL: Use a threaded interrupt handler for SCI") switched the ACPI subsystem over to using a threaded interrupt handler for the SCI, it had to use IRQF_ONESHOT for it because that's required due to the way the SCI handler works (it needs to walk all of the enabled GPEs before the interrupt line can be unmasked). The SCI interrupt line is not shared with other users very often due to the SCI handling overhead, but on sone systems it is shared and when the other user of it attempts to install a threaded handler, a flags mismatch related to IRQF_ONESHOT may occur. As it turned out, that happened to the pinctrl-amd driver and so commit 4451e8e8415e ("pinctrl: amd: Add IRQF_ONESHOT to the interrupt request") attempted to address the issue by adding IRQF_ONESHOT to the interrupt flags in that driver, but this is now causing an IRQF_ONESHOT-related mismatch to occur on another system which cannot boot as a result of it. Clearly, pinctrl-amd can work with IRQF_ONESHOT if need be, but it should not set that flag by default, so it needs a way to indicate that to the interrupt subsystem. To that end, introdcuce a new interrupt flag, IRQF_COND_ONESHOT, which will only have effect when the IRQ line is shared and IRQF_ONESHOT has been set for it already, in which case it will be promoted to the latter. This is sufficient for drivers sharing the interrupt line with the SCI as it is requested by the ACPI subsystem before any drivers are probed, so they will always see IRQF_ONESHOT set for the interrupt in question. Fixes: 4451e8e8415e ("pinctrl: amd: Add IRQF_ONESHOT to the interrupt request") Reported-by: Francisco Ayala Le Brun Signed-off-by: Rafael J. Wysocki Signed-off-by: Thomas Gleixner Reviewed-by: Linus Walleij Cc: 6.8+ # 6.8+ Closes: https://lore.kernel.org/lkml/CAN-StX1HqWqi+YW=t+V52-38Mfp5fAz7YHx4aH-CQjgyNiKx3g@mail.gmail.com/ Link: https://lore.kernel.org/r/12417336.O9o76ZdvQC@kreacher --- drivers/pinctrl/pinctrl-amd.c | 2 +- include/linux/interrupt.h | 3 +++ kernel/irq/manage.c | 9 +++++++-- 3 files changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c index 49f89b70dcec..7f66ec73199a 100644 --- a/drivers/pinctrl/pinctrl-amd.c +++ b/drivers/pinctrl/pinctrl-amd.c @@ -1159,7 +1159,7 @@ static int amd_gpio_probe(struct platform_device *pdev) } ret = devm_request_irq(&pdev->dev, gpio_dev->irq, amd_gpio_irq_handler, - IRQF_SHARED | IRQF_ONESHOT, KBUILD_MODNAME, gpio_dev); + IRQF_SHARED | IRQF_COND_ONESHOT, KBUILD_MODNAME, gpio_dev); if (ret) goto out2; diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 76121c2bb4f8..5c9bdd3ffccc 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -67,6 +67,8 @@ * later. * IRQF_NO_DEBUG - Exclude from runnaway detection for IPI and similar handlers, * depends on IRQF_PERCPU. + * IRQF_COND_ONESHOT - Agree to do IRQF_ONESHOT if already set for a shared + * interrupt. */ #define IRQF_SHARED 0x00000080 #define IRQF_PROBE_SHARED 0x00000100 @@ -82,6 +84,7 @@ #define IRQF_COND_SUSPEND 0x00040000 #define IRQF_NO_AUTOEN 0x00080000 #define IRQF_NO_DEBUG 0x00100000 +#define IRQF_COND_ONESHOT 0x00200000 #define IRQF_TIMER (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ad3eaf2ab959..bf9ae8a8686f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1643,8 +1643,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } if (!((old->flags & new->flags) & IRQF_SHARED) || - (oldtype != (new->flags & IRQF_TRIGGER_MASK)) || - ((old->flags ^ new->flags) & IRQF_ONESHOT)) + (oldtype != (new->flags & IRQF_TRIGGER_MASK))) + goto mismatch; + + if ((old->flags & IRQF_ONESHOT) && + (new->flags & IRQF_COND_ONESHOT)) + new->flags |= IRQF_ONESHOT; + else if ((old->flags ^ new->flags) & IRQF_ONESHOT) goto mismatch; /* All handlers must agree on per-cpuness */ -- cgit v1.2.3 From d5aad4c2ca057e760a92a9a7d65bd38d72963f27 Mon Sep 17 00:00:00 2001 From: Zev Weiss Date: Mon, 26 Feb 2024 17:35:41 -0800 Subject: prctl: generalize PR_SET_MDWE support check to be per-arch Patch series "ARM: prctl: Reject PR_SET_MDWE where not supported". I noticed after a recent kernel update that my ARM926 system started segfaulting on any execve() after calling prctl(PR_SET_MDWE). After some investigation it appears that ARMv5 is incapable of providing the appropriate protections for MDWE, since any readable memory is also implicitly executable. The prctl_set_mdwe() function already had some special-case logic added disabling it on PARISC (commit 793838138c15, "prctl: Disable prctl(PR_SET_MDWE) on parisc"); this patch series (1) generalizes that check to use an arch_*() function, and (2) adds a corresponding override for ARM to disable MDWE on pre-ARMv6 CPUs. With the series applied, prctl(PR_SET_MDWE) is rejected on ARMv5 and subsequent execve() calls (as well as mmap(PROT_READ|PROT_WRITE)) can succeed instead of unconditionally failing; on ARMv6 the prctl works as it did previously. [0] https://lore.kernel.org/all/2023112456-linked-nape-bf19@gregkh/ This patch (of 2): There exist systems other than PARISC where MDWE may not be feasible to support; rather than cluttering up the generic code with additional arch-specific logic let's add a generic function for checking MDWE support and allow each arch to override it as needed. Link: https://lkml.kernel.org/r/20240227013546.15769-4-zev@bewilderbeest.net Link: https://lkml.kernel.org/r/20240227013546.15769-5-zev@bewilderbeest.net Signed-off-by: Zev Weiss Acked-by: Helge Deller [parisc] Cc: Borislav Petkov Cc: David Hildenbrand Cc: Florent Revest Cc: "James E.J. Bottomley" Cc: Josh Triplett Cc: Kees Cook Cc: Miguel Ojeda Cc: Mike Rapoport (IBM) Cc: Oleg Nesterov Cc: Ondrej Mosnacek Cc: Rick Edgecombe Cc: Russell King (Oracle) Cc: Sam James Cc: Stefan Roesch Cc: Yang Shi Cc: Yin Fengwei Cc: [6.3+] Signed-off-by: Andrew Morton --- arch/parisc/include/asm/mman.h | 14 ++++++++++++++ include/linux/mman.h | 8 ++++++++ kernel/sys.c | 7 +++++-- 3 files changed, 27 insertions(+), 2 deletions(-) create mode 100644 arch/parisc/include/asm/mman.h (limited to 'kernel') diff --git a/arch/parisc/include/asm/mman.h b/arch/parisc/include/asm/mman.h new file mode 100644 index 000000000000..47c5a1991d10 --- /dev/null +++ b/arch/parisc/include/asm/mman.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_MMAN_H__ +#define __ASM_MMAN_H__ + +#include + +/* PARISC cannot allow mdwe as it needs writable stacks */ +static inline bool arch_memory_deny_write_exec_supported(void) +{ + return false; +} +#define arch_memory_deny_write_exec_supported arch_memory_deny_write_exec_supported + +#endif /* __ASM_MMAN_H__ */ diff --git a/include/linux/mman.h b/include/linux/mman.h index dc7048824be8..bcb201ab7a41 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -162,6 +162,14 @@ calc_vm_flag_bits(unsigned long flags) unsigned long vm_commit_limit(void); +#ifndef arch_memory_deny_write_exec_supported +static inline bool arch_memory_deny_write_exec_supported(void) +{ + return true; +} +#define arch_memory_deny_write_exec_supported arch_memory_deny_write_exec_supported +#endif + /* * Denies creating a writable executable mapping or gaining executable permissions. * diff --git a/kernel/sys.c b/kernel/sys.c index f8e543f1e38a..8bb106a56b3a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2408,8 +2408,11 @@ static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3, if (bits & PR_MDWE_NO_INHERIT && !(bits & PR_MDWE_REFUSE_EXEC_GAIN)) return -EINVAL; - /* PARISC cannot allow mdwe as it needs writable stacks */ - if (IS_ENABLED(CONFIG_PARISC)) + /* + * EOPNOTSUPP might be more appropriate here in principle, but + * existing userspace depends on EINVAL specifically. + */ + if (!arch_memory_deny_write_exec_supported()) return -EINVAL; current_bits = get_current_mdwe(); -- cgit v1.2.3 From 32fbe5246582af4f611ccccee33fd6e559087252 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 25 Mar 2024 09:50:50 +0800 Subject: crash: use macro to add crashk_res into iomem early for specific arch There are regression reports[1][2] that crashkernel region on x86_64 can't be added into iomem tree sometime. This causes the later failure of kdump loading. This happened after commit 4a693ce65b18 ("kdump: defer the insertion of crashkernel resources") was merged. Even though, these reported issues are proved to be related to other component, they are just exposed after above commmit applied, I still would like to keep crashk_res and crashk_low_res being added into iomem early as before because the early adding has been always there on x86_64 and working very well. For safety of kdump, Let's change it back. Here, add a macro HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY to limit that only ARCH defining the macro can have the early adding crashk_res/_low_res into iomem. Then define HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY on x86 to enable it. Note: In reserve_crashkernel_low(), there's a remnant of crashk_low_res handling which was mistakenly added back in commit 85fcde402db1 ("kexec: split crashkernel reservation code out from crash_core.c"). [1] [PATCH V2] x86/kexec: do not update E820 kexec table for setup_data https://lore.kernel.org/all/Zfv8iCL6CT2JqLIC@darkstar.users.ipa.redhat.com/T/#u [2] Question about Address Range Validation in Crash Kernel Allocation https://lore.kernel.org/all/4eeac1f733584855965a2ea62fa4da58@huawei.com/T/#u Link: https://lkml.kernel.org/r/ZgDYemRQ2jxjLkq+@MiWiFi-R3L-srv Fixes: 4a693ce65b18 ("kdump: defer the insertion of crashkernel resources") Signed-off-by: Baoquan He Cc: Dave Young Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiri Bohac Cc: Li Huafei Cc: Signed-off-by: Andrew Morton --- arch/x86/include/asm/crash_reserve.h | 2 ++ kernel/crash_reserve.c | 7 +++++++ 2 files changed, 9 insertions(+) (limited to 'kernel') diff --git a/arch/x86/include/asm/crash_reserve.h b/arch/x86/include/asm/crash_reserve.h index 152239f95541..7835b2cdff04 100644 --- a/arch/x86/include/asm/crash_reserve.h +++ b/arch/x86/include/asm/crash_reserve.h @@ -39,4 +39,6 @@ static inline unsigned long crash_low_size_default(void) #endif } +#define HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY + #endif /* _X86_CRASH_RESERVE_H */ diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c index bbb6c3cb00e4..066668799f75 100644 --- a/kernel/crash_reserve.c +++ b/kernel/crash_reserve.c @@ -366,7 +366,9 @@ static int __init reserve_crashkernel_low(unsigned long long low_size) crashk_low_res.start = low_base; crashk_low_res.end = low_base + low_size - 1; +#ifdef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY insert_resource(&iomem_resource, &crashk_low_res); +#endif #endif return 0; } @@ -448,8 +450,12 @@ retry: crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; +#ifdef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY + insert_resource(&iomem_resource, &crashk_res); +#endif } +#ifndef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY static __init int insert_crashkernel_resources(void) { if (crashk_res.start < crashk_res.end) @@ -462,3 +468,4 @@ static __init int insert_crashkernel_resources(void) } early_initcall(insert_crashkernel_resources); #endif +#endif -- cgit v1.2.3 From 96b98a6552a90690d7bc18dd71b66312c9ded1fb Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Tue, 19 Mar 2024 13:31:52 +0530 Subject: bpf: fix warning for crash_kexec With [1], crash dump specific code is moved out of CONFIG_KEXEC_CORE and placed under CONFIG_CRASH_DUMP, where it is more appropriate. And since CONFIG_KEXEC & !CONFIG_CRASH_DUMP build option is supported with that, it led to the below warning: "WARN: resolve_btfids: unresolved symbol crash_kexec" Fix it by using the appropriate #ifdef. [1] https://lore.kernel.org/all/20240124051254.67105-1-bhe@redhat.com/ Acked-by: Baoquan He Fixes: 02aff8480533 ("crash: split crash dumping code out from kexec_core.c") Acked-by: Jiri Olsa Acked-by: Stanislav Fomichev Signed-off-by: Hari Bathini Link: https://lore.kernel.org/r/20240319080152.36987-1-hbathini@linux.ibm.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a89587859571..449b9a5d3fe3 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2548,7 +2548,7 @@ __bpf_kfunc void bpf_throw(u64 cookie) __bpf_kfunc_end_defs(); BTF_KFUNCS_START(generic_btf_ids) -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_DUMP BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) #endif BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) -- cgit v1.2.3 From 5b4cdd9c5676559b8a7c944ac5269b914b8c0bb8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 26 Mar 2024 14:59:48 -0700 Subject: Fix memory leak in posix_clock_open() If the clk ops.open() function returns an error, we don't release the pccontext we allocated for this clock. Re-organize the code slightly to make it all more obvious. Reported-by: Rohit Keshri Acked-by: Oleg Nesterov Fixes: 60c6946675fc ("posix-clock: introduce posix_clock_context concept") Cc: Jakub Kicinski Cc: David S. Miller Cc: Thomas Gleixner Signed-off-by: Linus Torvalds --- kernel/time/posix-clock.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 9de66bbbb3d1..4782edcbe7b9 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -129,15 +129,17 @@ static int posix_clock_open(struct inode *inode, struct file *fp) goto out; } pccontext->clk = clk; - fp->private_data = pccontext; - if (clk->ops.open) + if (clk->ops.open) { err = clk->ops.open(pccontext, fp->f_mode); - else - err = 0; - - if (!err) { - get_device(clk->dev); + if (err) { + kfree(pccontext); + goto out; + } } + + fp->private_data = pccontext; + get_device(clk->dev); + err = 0; out: up_read(&clk->rwsem); return err; -- cgit v1.2.3 From a8d89feba7e54e691ca7c4efc2a6264fa83f3687 Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Tue, 26 Mar 2024 22:42:44 -0400 Subject: bpf: Check bloom filter map value size This patch adds a missing check to bloom filter creating, rejecting values above KMALLOC_MAX_SIZE. This brings the bloom map in line with many other map types. The lack of this protection can cause kernel crashes for value sizes that overflow int's. Such a crash was caught by syzkaller. The next patch adds more guard-rails at a lower level. Signed-off-by: Andrei Matei Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240327024245.318299-2-andreimatei1@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bloom_filter.c | 13 +++++++++++++ tools/testing/selftests/bpf/prog_tests/bloom_filter_map.c | 6 ++++++ 2 files changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c index addf3dd57b59..35e1ddca74d2 100644 --- a/kernel/bpf/bloom_filter.c +++ b/kernel/bpf/bloom_filter.c @@ -80,6 +80,18 @@ static int bloom_map_get_next_key(struct bpf_map *map, void *key, void *next_key return -EOPNOTSUPP; } +/* Called from syscall */ +static int bloom_map_alloc_check(union bpf_attr *attr) +{ + if (attr->value_size > KMALLOC_MAX_SIZE) + /* if value_size is bigger, the user space won't be able to + * access the elements. + */ + return -E2BIG; + + return 0; +} + static struct bpf_map *bloom_map_alloc(union bpf_attr *attr) { u32 bitset_bytes, bitset_mask, nr_hash_funcs, nr_bits; @@ -191,6 +203,7 @@ static u64 bloom_map_mem_usage(const struct bpf_map *map) BTF_ID_LIST_SINGLE(bpf_bloom_map_btf_ids, struct, bpf_bloom_filter) const struct bpf_map_ops bloom_filter_map_ops = { .map_meta_equal = bpf_map_meta_equal, + .map_alloc_check = bloom_map_alloc_check, .map_alloc = bloom_map_alloc, .map_free = bloom_map_free, .map_get_next_key = bloom_map_get_next_key, diff --git a/tools/testing/selftests/bpf/prog_tests/bloom_filter_map.c b/tools/testing/selftests/bpf/prog_tests/bloom_filter_map.c index 053f4d6da77a..cc184e4420f6 100644 --- a/tools/testing/selftests/bpf/prog_tests/bloom_filter_map.c +++ b/tools/testing/selftests/bpf/prog_tests/bloom_filter_map.c @@ -2,6 +2,7 @@ /* Copyright (c) 2021 Facebook */ #include +#include #include #include "bloom_filter_map.skel.h" @@ -21,6 +22,11 @@ static void test_fail_cases(void) if (!ASSERT_LT(fd, 0, "bpf_map_create bloom filter invalid value size 0")) close(fd); + /* Invalid value size: too big */ + fd = bpf_map_create(BPF_MAP_TYPE_BLOOM_FILTER, NULL, 0, INT32_MAX, 100, NULL); + if (!ASSERT_LT(fd, 0, "bpf_map_create bloom filter invalid value too large")) + close(fd); + /* Invalid max entries size */ fd = bpf_map_create(BPF_MAP_TYPE_BLOOM_FILTER, NULL, 0, sizeof(value), 0, NULL); if (!ASSERT_LT(fd, 0, "bpf_map_create bloom filter invalid max entries size")) -- cgit v1.2.3 From ecc6a2101840177e57c925c102d2d29f260d37c8 Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Tue, 26 Mar 2024 22:42:45 -0400 Subject: bpf: Protect against int overflow for stack access size This patch re-introduces protection against the size of access to stack memory being negative; the access size can appear negative as a result of overflowing its signed int representation. This should not actually happen, as there are other protections along the way, but we should protect against it anyway. One code path was missing such protections (fixed in the previous patch in the series), causing out-of-bounds array accesses in check_stack_range_initialized(). This patch causes the verification of a program with such a non-sensical access size to fail. This check used to exist in a more indirect way, but was inadvertendly removed in a833a17aeac7. Fixes: a833a17aeac7 ("bpf: Fix verification of indirect var-off stack access") Reported-by: syzbot+33f4297b5f927648741a@syzkaller.appspotmail.com Reported-by: syzbot+aafd0513053a1cbf52ef@syzkaller.appspotmail.com Closes: https://lore.kernel.org/bpf/CAADnVQLORV5PT0iTAhRER+iLBTkByCYNBYyvBSgjN1T31K+gOw@mail.gmail.com/ Acked-by: Andrii Nakryiko Signed-off-by: Andrei Matei Link: https://lore.kernel.org/r/20240327024245.318299-3-andreimatei1@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0bfc0050db28..353985b2b6a2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6701,6 +6701,11 @@ static int check_stack_access_within_bounds( err = check_stack_slot_within_bounds(env, min_off, state, type); if (!err && max_off > 0) err = -EINVAL; /* out of stack access into non-negative offsets */ + if (!err && access_size < 0) + /* access_size should not be negative (or overflow an int); others checks + * along the way should have prevented such an access. + */ + err = -EFAULT; /* invalid negative access size; integer overflow? */ if (err) { if (tnum_is_const(reg->var_off)) { -- cgit v1.2.3 From e9c856cabefb71d47b2eeb197f72c9c88e9b45b0 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 27 Mar 2024 22:24:25 -0700 Subject: bpf: put uprobe link's path and task in release callback There is no need to delay putting either path or task to deallocation step. It can be done right after bpf_uprobe_unregister. Between release and dealloc, there could be still some running BPF programs, but they don't access either task or path, only data in link->uprobes, so it is safe to do. On the other hand, doing path_put() in dealloc callback makes this dealloc sleepable because path_put() itself might sleep. Which is problematic due to the need to call uprobe's dealloc through call_rcu(), which is what is done in the next bug fix patch. So solve the problem by releasing these resources early. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240328052426.3042617-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0a5c4efc73c3..0b73fe5f7206 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3157,6 +3157,9 @@ static void bpf_uprobe_multi_link_release(struct bpf_link *link) umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); bpf_uprobe_unregister(&umulti_link->path, umulti_link->uprobes, umulti_link->cnt); + if (umulti_link->task) + put_task_struct(umulti_link->task); + path_put(&umulti_link->path); } static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link) @@ -3164,9 +3167,6 @@ static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link) struct bpf_uprobe_multi_link *umulti_link; umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); - if (umulti_link->task) - put_task_struct(umulti_link->task); - path_put(&umulti_link->path); kvfree(umulti_link->uprobes); kfree(umulti_link); } -- cgit v1.2.3 From 1a80dbcb2dbaf6e4c216e62e30fa7d3daa8001ce Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 27 Mar 2024 22:24:26 -0700 Subject: bpf: support deferring bpf_link dealloc to after RCU grace period BPF link for some program types is passed as a "context" which can be used by those BPF programs to look up additional information. E.g., for multi-kprobes and multi-uprobes, link is used to fetch BPF cookie values. Because of this runtime dependency, when bpf_link refcnt drops to zero there could still be active BPF programs running accessing link data. This patch adds generic support to defer bpf_link dealloc callback to after RCU GP, if requested. This is done by exposing two different deallocation callbacks, one synchronous and one deferred. If deferred one is provided, bpf_link_free() will schedule dealloc_deferred() callback to happen after RCU GP. BPF is using two flavors of RCU: "classic" non-sleepable one and RCU tasks trace one. The latter is used when sleepable BPF programs are used. bpf_link_free() accommodates that by checking underlying BPF program's sleepable flag, and goes either through normal RCU GP only for non-sleepable, or through RCU tasks trace GP *and* then normal RCU GP (taking into account rcu_trace_implies_rcu_gp() optimization), if BPF program is sleepable. We use this for multi-kprobe and multi-uprobe links, which dereference link during program run. We also preventively switch raw_tp link to use deferred dealloc callback, as upcoming changes in bpf-next tree expose raw_tp link data (specifically, cookie value) to BPF program at runtime as well. Fixes: 0dcac2725406 ("bpf: Add multi kprobe link") Fixes: 89ae89f53d20 ("bpf: Add multi uprobe link") Reported-by: syzbot+981935d9485a560bfbcb@syzkaller.appspotmail.com Reported-by: syzbot+2cb5a6c573e98db598cc@syzkaller.appspotmail.com Reported-by: syzbot+62d8b26793e8a2bd0516@syzkaller.appspotmail.com Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20240328052426.3042617-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 16 +++++++++++++++- kernel/bpf/syscall.c | 35 ++++++++++++++++++++++++++++++++--- kernel/trace/bpf_trace.c | 4 ++-- 3 files changed, 49 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4f20f62f9d63..890e152d553e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1574,12 +1574,26 @@ struct bpf_link { enum bpf_link_type type; const struct bpf_link_ops *ops; struct bpf_prog *prog; - struct work_struct work; + /* rcu is used before freeing, work can be used to schedule that + * RCU-based freeing before that, so they never overlap + */ + union { + struct rcu_head rcu; + struct work_struct work; + }; }; struct bpf_link_ops { void (*release)(struct bpf_link *link); + /* deallocate link resources callback, called without RCU grace period + * waiting + */ void (*dealloc)(struct bpf_link *link); + /* deallocate link resources callback, called after RCU grace period; + * if underlying BPF program is sleepable we go through tasks trace + * RCU GP and then "classic" RCU GP + */ + void (*dealloc_deferred)(struct bpf_link *link); int (*detach)(struct bpf_link *link); int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ae2ff73bde7e..c287925471f6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3024,17 +3024,46 @@ void bpf_link_inc(struct bpf_link *link) atomic64_inc(&link->refcnt); } +static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu) +{ + struct bpf_link *link = container_of(rcu, struct bpf_link, rcu); + + /* free bpf_link and its containing memory */ + link->ops->dealloc_deferred(link); +} + +static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu) +{ + if (rcu_trace_implies_rcu_gp()) + bpf_link_defer_dealloc_rcu_gp(rcu); + else + call_rcu(rcu, bpf_link_defer_dealloc_rcu_gp); +} + /* bpf_link_free is guaranteed to be called from process context */ static void bpf_link_free(struct bpf_link *link) { + bool sleepable = false; + bpf_link_free_id(link->id); if (link->prog) { + sleepable = link->prog->sleepable; /* detach BPF program, clean up used resources */ link->ops->release(link); bpf_prog_put(link->prog); } - /* free bpf_link and its containing memory */ - link->ops->dealloc(link); + if (link->ops->dealloc_deferred) { + /* schedule BPF link deallocation; if underlying BPF program + * is sleepable, we need to first wait for RCU tasks trace + * sync, then go through "classic" RCU grace period + */ + if (sleepable) + call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp); + else + call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp); + } + if (link->ops->dealloc) + link->ops->dealloc(link); } static void bpf_link_put_deferred(struct work_struct *work) @@ -3544,7 +3573,7 @@ static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link, static const struct bpf_link_ops bpf_raw_tp_link_lops = { .release = bpf_raw_tp_link_release, - .dealloc = bpf_raw_tp_link_dealloc, + .dealloc_deferred = bpf_raw_tp_link_dealloc, .show_fdinfo = bpf_raw_tp_link_show_fdinfo, .fill_link_info = bpf_raw_tp_link_fill_link_info, }; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0b73fe5f7206..9dc605f08a23 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2728,7 +2728,7 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link, static const struct bpf_link_ops bpf_kprobe_multi_link_lops = { .release = bpf_kprobe_multi_link_release, - .dealloc = bpf_kprobe_multi_link_dealloc, + .dealloc_deferred = bpf_kprobe_multi_link_dealloc, .fill_link_info = bpf_kprobe_multi_link_fill_link_info, }; @@ -3242,7 +3242,7 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link, static const struct bpf_link_ops bpf_uprobe_multi_link_lops = { .release = bpf_uprobe_multi_link_release, - .dealloc = bpf_uprobe_multi_link_dealloc, + .dealloc_deferred = bpf_uprobe_multi_link_dealloc, .fill_link_info = bpf_uprobe_multi_link_fill_link_info, }; -- cgit v1.2.3 From 6dae957c8eef6eae5b386462767de97303235d5c Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Fri, 29 Mar 2024 07:11:06 +0000 Subject: bpf: fix possible file descriptor leaks in verifier The resolve_pseudo_ldimm64() function might have leaked file descriptors when BPF_MAP_TYPE_ARENA was used in a program (some error paths missed a corresponding fdput). Add missing fdputs. v2: remove unrelated changes from the fix Fixes: 6082b6c328b5 ("bpf: Recognize addr_space_cast instruction in the verifier.") Signed-off-by: Anton Protopopov Acked-by: Yonghong Song Acked-by: Shung-Hsi Yu Link: https://lore.kernel.org/r/20240329071106.67968-1-aspsk@isovalent.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 353985b2b6a2..98188379d5c7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18379,15 +18379,18 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) } if (!env->prog->jit_requested) { verbose(env, "JIT is required to use arena\n"); + fdput(f); return -EOPNOTSUPP; } if (!bpf_jit_supports_arena()) { verbose(env, "JIT doesn't support arena\n"); + fdput(f); return -EOPNOTSUPP; } env->prog->aux->arena = (void *)map; if (!bpf_arena_get_user_vm_start(env->prog->aux->arena)) { verbose(env, "arena's user address must be set via map_extra or mmap()\n"); + fdput(f); return -EINVAL; } } -- cgit v1.2.3 From c40845e3195d074b34f8f8e400e28c9403a06588 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 26 Mar 2024 15:47:16 +0100 Subject: kbuild: make -Woverride-init warnings more consistent The -Woverride-init warn about code that may be intentional or not, but the inintentional ones tend to be real bugs, so there is a bit of disagreement on whether this warning option should be enabled by default and we have multiple settings in scripts/Makefile.extrawarn as well as individual subsystems. Older versions of clang only supported -Wno-initializer-overrides with the same meaning as gcc's -Woverride-init, though all supported versions now work with both. Because of this difference, an earlier cleanup of mine accidentally turned the clang warning off for W=1 builds and only left it on for W=2, while it's still enabled for gcc with W=1. There is also one driver that only turns the warning off for newer versions of gcc but not other compilers, and some but not all the Makefiles still use a cc-disable-warning conditional that is no longer needed with supported compilers here. Address all of the above by removing the special cases for clang and always turning the warning off unconditionally where it got in the way, using the syntax that is supported by both compilers. Fixes: 2cd3271b7a31 ("kbuild: avoid duplicate warning options") Signed-off-by: Arnd Bergmann Acked-by: Hamza Mahfooz Acked-by: Jani Nikula Acked-by: Andrew Jeffery Signed-off-by: Jani Nikula Reviewed-by: Linus Walleij Signed-off-by: Masahiro Yamada --- drivers/gpu/drm/amd/display/dc/dce110/Makefile | 2 +- drivers/gpu/drm/amd/display/dc/dce112/Makefile | 2 +- drivers/gpu/drm/amd/display/dc/dce120/Makefile | 2 +- drivers/gpu/drm/amd/display/dc/dce60/Makefile | 2 +- drivers/gpu/drm/amd/display/dc/dce80/Makefile | 2 +- drivers/gpu/drm/i915/Makefile | 6 +++--- drivers/gpu/drm/xe/Makefile | 4 ++-- drivers/net/ethernet/renesas/sh_eth.c | 2 +- drivers/pinctrl/aspeed/Makefile | 2 +- fs/proc/Makefile | 2 +- kernel/bpf/Makefile | 2 +- mm/Makefile | 3 +-- scripts/Makefile.extrawarn | 10 +++------- 13 files changed, 18 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/drivers/gpu/drm/amd/display/dc/dce110/Makefile b/drivers/gpu/drm/amd/display/dc/dce110/Makefile index f0777d61c2cb..c307f040e48f 100644 --- a/drivers/gpu/drm/amd/display/dc/dce110/Makefile +++ b/drivers/gpu/drm/amd/display/dc/dce110/Makefile @@ -23,7 +23,7 @@ # Makefile for the 'controller' sub-component of DAL. # It provides the control and status of HW CRTC block. -CFLAGS_$(AMDDALPATH)/dc/dce110/dce110_resource.o = $(call cc-disable-warning, override-init) +CFLAGS_$(AMDDALPATH)/dc/dce110/dce110_resource.o = -Wno-override-init DCE110 = dce110_timing_generator.o \ dce110_compressor.o dce110_opp_regamma_v.o \ diff --git a/drivers/gpu/drm/amd/display/dc/dce112/Makefile b/drivers/gpu/drm/amd/display/dc/dce112/Makefile index 7e92effec894..683866797709 100644 --- a/drivers/gpu/drm/amd/display/dc/dce112/Makefile +++ b/drivers/gpu/drm/amd/display/dc/dce112/Makefile @@ -23,7 +23,7 @@ # Makefile for the 'controller' sub-component of DAL. # It provides the control and status of HW CRTC block. -CFLAGS_$(AMDDALPATH)/dc/dce112/dce112_resource.o = $(call cc-disable-warning, override-init) +CFLAGS_$(AMDDALPATH)/dc/dce112/dce112_resource.o = -Wno-override-init DCE112 = dce112_compressor.o diff --git a/drivers/gpu/drm/amd/display/dc/dce120/Makefile b/drivers/gpu/drm/amd/display/dc/dce120/Makefile index 1e3ef68a452a..8f508e662748 100644 --- a/drivers/gpu/drm/amd/display/dc/dce120/Makefile +++ b/drivers/gpu/drm/amd/display/dc/dce120/Makefile @@ -24,7 +24,7 @@ # It provides the control and status of HW CRTC block. -CFLAGS_$(AMDDALPATH)/dc/dce120/dce120_resource.o = $(call cc-disable-warning, override-init) +CFLAGS_$(AMDDALPATH)/dc/dce120/dce120_resource.o = -Wno-override-init DCE120 = dce120_timing_generator.o diff --git a/drivers/gpu/drm/amd/display/dc/dce60/Makefile b/drivers/gpu/drm/amd/display/dc/dce60/Makefile index fee331accc0e..eede83ad91fa 100644 --- a/drivers/gpu/drm/amd/display/dc/dce60/Makefile +++ b/drivers/gpu/drm/amd/display/dc/dce60/Makefile @@ -23,7 +23,7 @@ # Makefile for the 'controller' sub-component of DAL. # It provides the control and status of HW CRTC block. -CFLAGS_$(AMDDALPATH)/dc/dce60/dce60_resource.o = $(call cc-disable-warning, override-init) +CFLAGS_$(AMDDALPATH)/dc/dce60/dce60_resource.o = -Wno-override-init DCE60 = dce60_timing_generator.o dce60_hw_sequencer.o \ dce60_resource.o diff --git a/drivers/gpu/drm/amd/display/dc/dce80/Makefile b/drivers/gpu/drm/amd/display/dc/dce80/Makefile index 7eefffbdc925..fba189d26652 100644 --- a/drivers/gpu/drm/amd/display/dc/dce80/Makefile +++ b/drivers/gpu/drm/amd/display/dc/dce80/Makefile @@ -23,7 +23,7 @@ # Makefile for the 'controller' sub-component of DAL. # It provides the control and status of HW CRTC block. -CFLAGS_$(AMDDALPATH)/dc/dce80/dce80_resource.o = $(call cc-disable-warning, override-init) +CFLAGS_$(AMDDALPATH)/dc/dce80/dce80_resource.o = -Wno-override-init DCE80 = dce80_timing_generator.o diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile index 3ef6ed41e62b..4c2f85632391 100644 --- a/drivers/gpu/drm/i915/Makefile +++ b/drivers/gpu/drm/i915/Makefile @@ -33,9 +33,9 @@ endif subdir-ccflags-$(CONFIG_DRM_I915_WERROR) += -Werror # Fine grained warnings disable -CFLAGS_i915_pci.o = $(call cc-disable-warning, override-init) -CFLAGS_display/intel_display_device.o = $(call cc-disable-warning, override-init) -CFLAGS_display/intel_fbdev.o = $(call cc-disable-warning, override-init) +CFLAGS_i915_pci.o = -Wno-override-init +CFLAGS_display/intel_display_device.o = -Wno-override-init +CFLAGS_display/intel_fbdev.o = -Wno-override-init # Support compiling the display code separately for both i915 and xe # drivers. Define I915 when building i915. diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile index 5a428ca00f10..c29a850859ad 100644 --- a/drivers/gpu/drm/xe/Makefile +++ b/drivers/gpu/drm/xe/Makefile @@ -172,8 +172,8 @@ subdir-ccflags-$(CONFIG_DRM_XE_DISPLAY) += \ -Ddrm_i915_gem_object=xe_bo \ -Ddrm_i915_private=xe_device -CFLAGS_i915-display/intel_fbdev.o = $(call cc-disable-warning, override-init) -CFLAGS_i915-display/intel_display_device.o = $(call cc-disable-warning, override-init) +CFLAGS_i915-display/intel_fbdev.o = -Wno-override-init +CFLAGS_i915-display/intel_display_device.o = -Wno-override-init # Rule to build SOC code shared with i915 $(obj)/i915-soc/%.o: $(srctree)/drivers/gpu/drm/i915/soc/%.c FORCE diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index 475e1e8c1d35..0786eb0da391 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -50,7 +50,7 @@ * the macros available to do this only define GCC 8. */ __diag_push(); -__diag_ignore(GCC, 8, "-Woverride-init", +__diag_ignore_all("-Woverride-init", "logic to initialize all and then override some is OK"); static const u16 sh_eth_offset_gigabit[SH_ETH_MAX_REGISTER_OFFSET] = { SH_ETH_OFFSET_DEFAULTS, diff --git a/drivers/pinctrl/aspeed/Makefile b/drivers/pinctrl/aspeed/Makefile index 489ea1778353..db2a7600ae2b 100644 --- a/drivers/pinctrl/aspeed/Makefile +++ b/drivers/pinctrl/aspeed/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only # Aspeed pinctrl support -ccflags-y += $(call cc-option,-Woverride-init) +ccflags-y += -Woverride-init obj-$(CONFIG_PINCTRL_ASPEED) += pinctrl-aspeed.o pinmux-aspeed.o obj-$(CONFIG_PINCTRL_ASPEED_G4) += pinctrl-aspeed-g4.o obj-$(CONFIG_PINCTRL_ASPEED_G5) += pinctrl-aspeed-g5.o diff --git a/fs/proc/Makefile b/fs/proc/Makefile index bd08616ed8ba..7b4db9c56e6a 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -5,7 +5,7 @@ obj-y += proc.o -CFLAGS_task_mmu.o += $(call cc-option,-Wno-override-init,) +CFLAGS_task_mmu.o += -Wno-override-init proc-y := nommu.o task_nommu.o proc-$(CONFIG_MMU) := task_mmu.o diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 368c5d86b5b7..e497011261b8 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -4,7 +4,7 @@ ifneq ($(CONFIG_BPF_JIT_ALWAYS_ON),y) # ___bpf_prog_run() needs GCSE disabled on x86; see 3193c0836f203 for details cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse endif -CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy) +CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o diff --git a/mm/Makefile b/mm/Makefile index e4b5b75aaec9..4abb40b911ec 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -29,8 +29,7 @@ KCOV_INSTRUMENT_mmzone.o := n KCOV_INSTRUMENT_vmstat.o := n KCOV_INSTRUMENT_failslab.o := n -CFLAGS_init-mm.o += $(call cc-disable-warning, override-init) -CFLAGS_init-mm.o += $(call cc-disable-warning, initializer-overrides) +CFLAGS_init-mm.o += -Wno-override-init mmu-y := nommu.o mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \ diff --git a/scripts/Makefile.extrawarn b/scripts/Makefile.extrawarn index 3ce5d503a6da..c5af566e911a 100644 --- a/scripts/Makefile.extrawarn +++ b/scripts/Makefile.extrawarn @@ -114,6 +114,8 @@ KBUILD_CFLAGS += $(call cc-disable-warning, format-overflow) KBUILD_CFLAGS += $(call cc-disable-warning, format-truncation) KBUILD_CFLAGS += $(call cc-disable-warning, stringop-truncation) +KBUILD_CFLAGS += -Wno-override-init # alias for -Wno-initializer-overrides in clang + ifdef CONFIG_CC_IS_CLANG # Clang before clang-16 would warn on default argument promotions. ifneq ($(call clang-min-version, 160000),y) @@ -151,10 +153,6 @@ KBUILD_CFLAGS += -Wtype-limits KBUILD_CFLAGS += $(call cc-option, -Wmaybe-uninitialized) KBUILD_CFLAGS += $(call cc-option, -Wunused-macros) -ifdef CONFIG_CC_IS_CLANG -KBUILD_CFLAGS += -Winitializer-overrides -endif - KBUILD_CPPFLAGS += -DKBUILD_EXTRA_WARN2 else @@ -164,9 +162,7 @@ KBUILD_CFLAGS += -Wno-missing-field-initializers KBUILD_CFLAGS += -Wno-type-limits KBUILD_CFLAGS += -Wno-shift-negative-value -ifdef CONFIG_CC_IS_CLANG -KBUILD_CFLAGS += -Wno-initializer-overrides -else +ifdef CONFIG_CC_IS_GCC KBUILD_CFLAGS += -Wno-maybe-uninitialized endif -- cgit v1.2.3 From f29536bf1721802d2ebdc7893ed2991d4da0a4b6 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 31 Mar 2024 10:26:50 -0700 Subject: tick/sched: Fix various kernel-doc warnings Fix a slew of kernel-doc warnings in tick-sched.c: tick-sched.c:650: warning: Function parameter or struct member 'now' not described in 'tick_nohz_update_jiffies' tick-sched.c:741: warning: No description found for return value of 'get_cpu_idle_time_us' tick-sched.c:767: warning: No description found for return value of 'get_cpu_iowait_time_us' tick-sched.c:1210: warning: No description found for return value of 'tick_nohz_idle_got_tick' tick-sched.c:1228: warning: No description found for return value of 'tick_nohz_get_next_hrtimer' tick-sched.c:1243: warning: No description found for return value of 'tick_nohz_get_sleep_length' tick-sched.c:1282: warning: Function parameter or struct member 'cpu' not described in 'tick_nohz_get_idle_calls_cpu' tick-sched.c:1282: warning: No description found for return value of 'tick_nohz_get_idle_calls_cpu' tick-sched.c:1294: warning: No description found for return value of 'tick_nohz_get_idle_calls' tick-sched.c:1577: warning: Function parameter or struct member 'hrtimer' not described in 'tick_setup_sched_timer' tick-sched.c:1577: warning: Excess function parameter 'mode' description in 'tick_setup_sched_timer' Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240331172652.14086-5-rdunlap@infradead.org --- kernel/time/tick-sched.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 269e21590df5..1331216a9cae 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -697,6 +697,7 @@ bool tick_nohz_tick_stopped_cpu(int cpu) /** * tick_nohz_update_jiffies - update jiffies when idle was interrupted + * @now: current ktime_t * * Called from interrupt entry when the CPU was idle * @@ -794,7 +795,7 @@ static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime, * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. * - * This function returns -1 if NOHZ is not enabled. + * Return: -1 if NOHZ is not enabled, else total idle time of the @cpu */ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) { @@ -820,7 +821,7 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. * - * This function returns -1 if NOHZ is not enabled. + * Return: -1 if NOHZ is not enabled, else total iowait time of @cpu */ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) { @@ -1287,6 +1288,8 @@ void tick_nohz_irq_exit(void) /** * tick_nohz_idle_got_tick - Check whether or not the tick handler has run + * + * Return: %true if the tick handler has run, otherwise %false */ bool tick_nohz_idle_got_tick(void) { @@ -1305,6 +1308,8 @@ bool tick_nohz_idle_got_tick(void) * stopped, it returns the next hrtimer. * * Called from power state control code with interrupts disabled + * + * Return: the next expiration time */ ktime_t tick_nohz_get_next_hrtimer(void) { @@ -1320,6 +1325,8 @@ ktime_t tick_nohz_get_next_hrtimer(void) * The return value of this function and/or the value returned by it through the * @delta_next pointer can be negative which must be taken into account by its * callers. + * + * Return: the expected length of the current sleep */ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) { @@ -1357,8 +1364,11 @@ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) /** * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value * for a particular CPU. + * @cpu: target CPU number * * Called from the schedutil frequency scaling governor in scheduler context. + * + * Return: the current idle calls counter value for @cpu */ unsigned long tick_nohz_get_idle_calls_cpu(int cpu) { @@ -1371,6 +1381,8 @@ unsigned long tick_nohz_get_idle_calls_cpu(int cpu) * tick_nohz_get_idle_calls - return the current idle calls counter value * * Called from the schedutil frequency scaling governor in scheduler context. + * + * Return: the current idle calls counter value for the current CPU */ unsigned long tick_nohz_get_idle_calls(void) { @@ -1559,7 +1571,7 @@ early_param("skew_tick", skew_tick); /** * tick_setup_sched_timer - setup the tick emulation timer - * @mode: tick_nohz_mode to setup for + * @hrtimer: whether to use the hrtimer or not */ void tick_setup_sched_timer(bool hrtimer) { -- cgit v1.2.3 From ba6ad57b803e33ed509213a5e840427dbef501d6 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 31 Mar 2024 10:26:51 -0700 Subject: tick/sched: Fix struct tick_sched doc warnings Fix kernel-doc warnings in struct tick_sched: tick-sched.h:103: warning: Function parameter or struct member 'idle_sleeptime_seq' not described in 'tick_sched' tick-sched.h:104: warning: Excess struct member 'nohz_mode' description in 'tick_sched' Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240331172652.14086-6-rdunlap@infradead.org --- kernel/time/tick-sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index e11c4dc65bcb..b4a7822f495d 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -46,8 +46,8 @@ struct tick_device { * @next_tick: Next tick to be fired when in dynticks mode. * @idle_jiffies: jiffies at the entry to idle for idle time accounting * @idle_waketime: Time when the idle was interrupted + * @idle_sleeptime_seq: sequence counter for data consistency * @idle_entrytime: Time when the idle call was entered - * @nohz_mode: Mode - one state of tick_nohz_mode * @last_jiffies: Base jiffies snapshot when next event was last computed * @timer_expires_base: Base time clock monotonic for @timer_expires * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped) -- cgit v1.2.3 From 9e643ab59d7ee4332994671720a9528bac62e9b7 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 31 Mar 2024 10:26:52 -0700 Subject: timers: Fix text inconsistencies and spelling Fix some text for consistency: s/lvl/level/ in a comment and use correct/full function names in comments. Correct spelling errors as reported by codespell. Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240331172652.14086-7-rdunlap@infradead.org --- kernel/time/timer.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index dee29f1f5b75..3baf2fbe6848 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -64,15 +64,15 @@ EXPORT_SYMBOL(jiffies_64); /* * The timer wheel has LVL_DEPTH array levels. Each level provides an array of - * LVL_SIZE buckets. Each level is driven by its own clock and therefor each + * LVL_SIZE buckets. Each level is driven by its own clock and therefore each * level has a different granularity. * - * The level granularity is: LVL_CLK_DIV ^ lvl + * The level granularity is: LVL_CLK_DIV ^ level * The level clock frequency is: HZ / (LVL_CLK_DIV ^ level) * * The array level of a newly armed timer depends on the relative expiry * time. The farther the expiry time is away the higher the array level and - * therefor the granularity becomes. + * therefore the granularity becomes. * * Contrary to the original timer wheel implementation, which aims for 'exact' * expiry of the timers, this implementation removes the need for recascading @@ -207,7 +207,7 @@ EXPORT_SYMBOL(jiffies_64); * struct timer_base - Per CPU timer base (number of base depends on config) * @lock: Lock protecting the timer_base * @running_timer: When expiring timers, the lock is dropped. To make - * sure not to race agains deleting/modifying a + * sure not to race against deleting/modifying a * currently running timer, the pointer is set to the * timer, which expires at the moment. If no timer is * running, the pointer is NULL. @@ -737,7 +737,7 @@ static bool timer_is_static_object(void *addr) } /* - * fixup_init is called when: + * timer_fixup_init is called when: * - an active object is initialized */ static bool timer_fixup_init(void *addr, enum debug_obj_state state) @@ -761,7 +761,7 @@ static void stub_timer(struct timer_list *unused) } /* - * fixup_activate is called when: + * timer_fixup_activate is called when: * - an active object is activated * - an unknown non-static object is activated */ @@ -783,7 +783,7 @@ static bool timer_fixup_activate(void *addr, enum debug_obj_state state) } /* - * fixup_free is called when: + * timer_fixup_free is called when: * - an active object is freed */ static bool timer_fixup_free(void *addr, enum debug_obj_state state) @@ -801,7 +801,7 @@ static bool timer_fixup_free(void *addr, enum debug_obj_state state) } /* - * fixup_assert_init is called when: + * timer_fixup_assert_init is called when: * - an untracked/uninit-ed object is found */ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state) @@ -914,7 +914,7 @@ static void do_init_timer(struct timer_list *timer, * @key: lockdep class key of the fake lock used for tracking timer * sync lock dependencies * - * init_timer_key() must be done to a timer prior calling *any* of the + * init_timer_key() must be done to a timer prior to calling *any* of the * other timer functions. */ void init_timer_key(struct timer_list *timer, @@ -1417,7 +1417,7 @@ static int __timer_delete(struct timer_list *timer, bool shutdown) * If @shutdown is set then the lock has to be taken whether the * timer is pending or not to protect against a concurrent rearm * which might hit between the lockless pending check and the lock - * aquisition. By taking the lock it is ensured that such a newly + * acquisition. By taking the lock it is ensured that such a newly * enqueued timer is dequeued and cannot end up with * timer->function == NULL in the expiry code. * @@ -2306,7 +2306,7 @@ static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem, /* * When timer base is not set idle, undo the effect of - * tmigr_cpu_deactivate() to prevent inconsitent states - active + * tmigr_cpu_deactivate() to prevent inconsistent states - active * timer base but inactive timer migration hierarchy. * * When timer base was already marked idle, nothing will be -- cgit v1.2.3 From 61f7fdf8fd00ce33d30ca3fae8d643c0850ce945 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 1 Apr 2024 23:48:59 +0200 Subject: timers/migration: Fix ignored event due to missing CPU update When a group event is updated with its expiry unchanged but a different CPU, that target change may go unnoticed and the event may be propagated up with a stale CPU value. The following depicts a scenario that has been actually observed: [GRP2:0] migrator = GRP1:1 active = GRP1:1 nextevt = TGRP1:0 (T0) / \ [GRP1:0] [GRP1:1] migrator = NONE [...] active = NONE nextevt = TGRP0:0 (T0) / \ [GRP0:0] [...] migrator = NONE active = NONE nextevt = T0 / \ 0 (T0) 1 (T1) idle idle 0) The hierarchy has 3 levels. The left part (GRP1:0) is all idle, including CPU 0 and CPU 1 which have a timer each: T0 and T1. They have the same expiry value. [GRP2:0] migrator = GRP1:1 active = GRP1:1 nextevt = KTIME_MAX / \ [GRP1:0] [GRP1:1] migrator = NONE [...] active = NONE nextevt = TGRP0:0 (T0) / \ [GRP0:0] [...] migrator = NONE active = NONE nextevt = T0 / \ 0 (T0) 1 (T1) idle idle 1) The migrator in GRP1:1 handles remotely T0. The event is dequeued from the top and T0 executed. [GRP2:0] migrator = GRP1:1 active = GRP1:1 nextevt = KTIME_MAX / \ [GRP1:0] [GRP1:1] migrator = NONE [...] active = NONE nextevt = TGRP0:0 (T0) / \ [GRP0:0] [...] migrator = NONE active = NONE nextevt = T1 / \ 0 1 (T1) idle idle 2) The migrator in GRP1:1 fetches the next timer for CPU 0 and finds none. But it updates the events from its groups, starting with GRP0:0 which now has T1 as its next event. So far so good. [GRP2:0] migrator = GRP1:1 active = GRP1:1 nextevt = KTIME_MAX / \ [GRP1:0] [GRP1:1] migrator = NONE [...] active = NONE nextevt = TGRP0:0 (T0) / \ [GRP0:0] [...] migrator = NONE active = NONE nextevt = T1 / \ 0 1 (T1) idle idle 3) The migrator in GRP1:1 proceeds upward and updates the events in GRP1:0. The child event TGRP0:0 is found queued with the same expiry as before. And therefore it is left unchanged. However the target CPU is not the same but that fact is ignored so TGRP0:0 still points to CPU 0 when it should point to CPU 1. [GRP2:0] migrator = GRP1:1 active = GRP1:1 nextevt = TGRP1:0 (T0) / \ [GRP1:0] [GRP1:1] migrator = NONE [...] active = NONE nextevt = TGRP0:0 (T0) / \ [GRP0:0] [...] migrator = NONE active = NONE nextevt = T1 / \ 0 1 (T1) idle idle 4) The propagation has reached the top level and TGRP1:0, having TGRP0:0 as its first event, also wrongly points to CPU 0. TGRP1:0 is added to the top level group. [GRP2:0] migrator = GRP1:1 active = GRP1:1 nextevt = KTIME_MAX / \ [GRP1:0] [GRP1:1] migrator = NONE [...] active = NONE nextevt = TGRP0:0 (T0) / \ [GRP0:0] [...] migrator = NONE active = NONE nextevt = T1 / \ 0 1 (T1) idle idle 5) The migrator in GRP1:1 dequeues the next event in top level pointing to CPU 0. But since it actually doesn't see any real event in CPU 0, it early returns. 6) T1 is left unhandled until either CPU 0 or CPU 1 wake up. Some other bad scenario may involve trees with just two levels. Fix this with unconditionally updating the CPU of the child event before considering to early return while updating a queued event with an unchanged expiry value. Fixes: 7ee988770326 ("timers: Implement the hierarchical pull model") Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Reviewed-by: Anna-Maria Behnsen Link: https://lore.kernel.org/r/Zg2Ct6M2RJAYHgCB@localhost.localdomain --- kernel/time/timer_migration.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index c63a0afdcebe..e3075e40cb43 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -762,8 +762,11 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, * queue when the expiry time changed only or when it could be ignored. */ if (timerqueue_node_queued(&evt->nextevt)) { - if ((evt->nextevt.expires == nextexp) && !evt->ignore) + if ((evt->nextevt.expires == nextexp) && !evt->ignore) { + /* Make sure not to miss a new CPU event with the same expiry */ + evt->cpu = first_childevt->cpu; goto check_toplvl; + } if (!timerqueue_del(&group->events, &evt->nextevt)) WRITE_ONCE(group->next_expiry, KTIME_MAX); -- cgit v1.2.3 From 7a96a84bfbee96871bb16c70ee3e93d564e190f4 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Fri, 5 Apr 2024 10:53:21 +0200 Subject: timers/migration: Return early on deactivation Commit 4b6f4c5a67c0 ("timer/migration: Remove buggy early return on deactivation") removed the logic to return early in tmigr_update_events() on deactivation. With this the problem with a not properly updated first global event in a hierarchy containing only a single group was fixed. But when having a look at this code path with a hierarchy with more than a single level, now unnecessary work is done (example is partially copied from the message of the commit mentioned above): [GRP1:0] migrator = GRP0:0 active = GRP0:0 nextevt = T0:0i, T0:1 / \ [GRP0:0] [GRP0:1] migrator = 0 migrator = NONE active = 0 active = NONE nextevt = T0i, T1 nextevt = T2 / \ / \ 0 (T0i) 1 (T1) 2 (T2) 3 active idle idle idle 0) CPU 0 is active thus its event is ignored (the letter 'i') and so are upper levels' events. CPU 1 is idle and has the timer T1 enqueued. CPU 2 also has a timer. The expiry order is T0 (ignored) < T1 < T2 [GRP1:0] migrator = GRP0:0 active = GRP0:0 nextevt = T0:0i, T0:1 / \ [GRP0:0] [GRP0:1] migrator = NONE migrator = NONE active = NONE active = NONE nextevt = T1 nextevt = T2 / \ / \ 0 (T0i) 1 (T1) 2 (T2) 3 idle idle idle idle 1) CPU 0 goes idle without global event queued. Therefore KTIME_MAX is pushed as its next expiry and its own event kept as "ignore". Without this early return the following steps happen in tmigr_update_events() when child = null and group = GRP0:0 : lock(GRP0:0->lock); timerqueue_del(GRP0:0, T0i); unlock(GRP0:0->lock); [GRP1:0] migrator = NONE active = NONE nextevt = T0:0, T0:1 / \ [GRP0:0] [GRP0:1] migrator = NONE migrator = NONE active = NONE active = NONE nextevt = T1 nextevt = T2 / \ / \ 0 (T0i) 1 (T1) 2 (T2) 3 idle idle idle idle 2) The change now propagates up to the top. Then tmigr_update_events() updates the group event of GRP0:0 and executes the following steps (child = GRP0:0 and group = GRP0:0): lock(GRP0:0->lock); lock(GRP1:0->lock); evt = tmigr_next_groupevt(GRP0:0); -> this removes the ignored events in GRP0:0 ... update GRP1:0 group event and timerqueue ... unlock(GRP1:0->lock); unlock(GRP0:0->lock); So the dance in 1) with locking the GRP0:0->lock and removing the T0i from the timerqueue is redundand as this is done nevertheless in 2) when tmigr_next_groupevt(GRP0:0) is executed. Revert commit 4b6f4c5a67c0 ("timer/migration: Remove buggy early return on deactivation") and add a condition into return path to skip the return only, when hierarchy contains a single group. Adapt comments accordingly. Fixes: 4b6f4c5a67c0 ("timer/migration: Remove buggy early return on deactivation") Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/87cyr49on2.fsf@somnus --- kernel/time/timer_migration.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index e3075e40cb43..ccba875d2234 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -751,6 +751,33 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, first_childevt = evt = data->evt; + /* + * Walking the hierarchy is required in any case when a + * remote expiry was done before. This ensures to not lose + * already queued events in non active groups (see section + * "Required event and timerqueue update after a remote + * expiry" in the documentation at the top). + * + * The two call sites which are executed without a remote expiry + * before, are not prevented from propagating changes through + * the hierarchy by the return: + * - When entering this path by tmigr_new_timer(), @evt->ignore + * is never set. + * - tmigr_inactive_up() takes care of the propagation by + * itself and ignores the return value. But an immediate + * return is possible if there is a parent, sparing group + * locking at this level, because the upper walking call to + * the parent will take care about removing this event from + * within the group and update next_expiry accordingly. + * + * However if there is no parent, ie: the hierarchy has only a + * single level so @group is the top level group, make sure the + * first event information of the group is updated properly and + * also handled properly, so skip this fast return path. + */ + if (evt->ignore && !remote && group->parent) + return true; + raw_spin_lock(&group->lock); childstate.state = 0; -- cgit v1.2.3