From 95fa145479fbc0a0c1fd3274ceb42ec03c042a4a Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 19 Jul 2019 10:29:22 -0700 Subject: bpf: sockmap/tls, close can race with map free When a map free is called and in parallel a socket is closed we have two paths that can potentially reset the socket prot ops, the bpf close() path and the map free path. This creates a problem with which prot ops should be used from the socket closed side. If the map_free side completes first then we want to call the original lowest level ops. However, if the tls path runs first we want to call the sockmap ops. Additionally there was no locking around prot updates in TLS code paths so the prot ops could be changed multiple times once from TLS path and again from sockmap side potentially leaving ops pointed at either TLS or sockmap when psock and/or tls context have already been destroyed. To fix this race first only update ops inside callback lock so that TLS, sockmap and lowest level all agree on prot state. Second and a ULP callback update() so that lower layers can inform the upper layer when they are being removed allowing the upper layer to reset prot ops. This gets us close to allowing sockmap and tls to be stacked in arbitrary order but will save that patch for *next trees. v4: - make sure we don't free things for device; - remove the checks which swap the callbacks back only if TLS is at the top. Reported-by: syzbot+06537213db7ba2745c4a@syzkaller.appspotmail.com Fixes: 02c558b2d5d6 ("bpf: sockmap, support for msg_peek in sk_msg with redirect ingress") Signed-off-by: John Fastabend Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: Daniel Borkmann --- include/linux/skmsg.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 50ced8aba9db..e4b3fb4bb77c 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -354,7 +354,13 @@ static inline void sk_psock_restore_proto(struct sock *sk, sk->sk_write_space = psock->saved_write_space; if (psock->sk_proto) { - sk->sk_prot = psock->sk_proto; + struct inet_connection_sock *icsk = inet_csk(sk); + bool has_ulp = !!icsk->icsk_ulp_data; + + if (has_ulp) + tcp_update_ulp(sk, psock->sk_proto); + else + sk->sk_prot = psock->sk_proto; psock->sk_proto = NULL; } } -- cgit v1.2.3 From 6ee82ef04e38b0d8b09b04bc1b068673deed6582 Mon Sep 17 00:00:00 2001 From: Sylwester Nawrocki Date: Mon, 1 Jul 2019 13:46:51 +0200 Subject: clk: Add missing documentation of devm_clk_bulk_get_optional() argument Fix an incomplete devm_clk_bulk_get_optional() function documentation by adding description of the num_clks argument as in other *clk_bulk* functions. Fixes: 9bd5ef0bd874 ("clk: Add devm_clk_bulk_get_optional() function") Reported-by: kbuild test robot Signed-off-by: Sylwester Nawrocki Signed-off-by: Stephen Boyd --- include/linux/clk.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/clk.h b/include/linux/clk.h index 3c096c7a51dc..853a8f181394 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -359,6 +359,7 @@ int __must_check devm_clk_bulk_get(struct device *dev, int num_clks, /** * devm_clk_bulk_get_optional - managed get multiple optional consumer clocks * @dev: device for clock "consumer" + * @num_clks: the number of clk_bulk_data * @clks: pointer to the clk_bulk_data table of consumer * * Behaves the same as devm_clk_bulk_get() except where there is no clock -- cgit v1.2.3 From d9b8aadaffa65809d146cf0f8632a22a946367d7 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 19 Jul 2019 11:18:15 +0200 Subject: bpf: fix narrower loads on s390 The very first check in test_pkt_md_access is failing on s390, which happens because loading a part of a struct __sk_buff field produces an incorrect result. The preprocessed code of the check is: { __u8 tmp = *((volatile __u8 *)&skb->len + ((sizeof(skb->len) - sizeof(__u8)) / sizeof(__u8))); if (tmp != ((*(volatile __u32 *)&skb->len) & 0xFF)) return 2; }; clang generates the following code for it: 0: 71 21 00 03 00 00 00 00 r2 = *(u8 *)(r1 + 3) 1: 61 31 00 00 00 00 00 00 r3 = *(u32 *)(r1 + 0) 2: 57 30 00 00 00 00 00 ff r3 &= 255 3: 5d 23 00 1d 00 00 00 00 if r2 != r3 goto +29 Finally, verifier transforms it to: 0: (61) r2 = *(u32 *)(r1 +104) 1: (bc) w2 = w2 2: (74) w2 >>= 24 3: (bc) w2 = w2 4: (54) w2 &= 255 5: (bc) w2 = w2 The problem is that when verifier emits the code to replace a partial load of a struct __sk_buff field (*(u8 *)(r1 + 3)) with a full load of struct sk_buff field (*(u32 *)(r1 + 104)), an optional shift and a bitwise AND, it assumes that the machine is little endian and incorrectly decides to use a shift. Adjust shift count calculation to account for endianness. Fixes: 31fd85816dbe ("bpf: permits narrower load from bpf program context fields") Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 13 +++++++++++++ kernel/bpf/verifier.c | 4 ++-- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index ff65d22cf336..92c6e31fb008 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -24,6 +24,7 @@ #include +#include #include #include @@ -747,6 +748,18 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default) return size <= size_default && (size & (size - 1)) == 0; } +static inline u8 +bpf_ctx_narrow_load_shift(u32 off, u32 size, u32 size_default) +{ + u8 load_off = off & (size_default - 1); + +#ifdef __LITTLE_ENDIAN + return load_off * 8; +#else + return (size_default - (load_off + size)) * 8; +#endif +} + #define bpf_ctx_wide_access_ok(off, size, type, field) \ (size == sizeof(__u64) && \ off >= offsetof(type, field) && \ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5900cbb966b1..c84d83f86141 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8616,8 +8616,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } if (is_narrower_load && size < target_size) { - u8 shift = (off & (size_default - 1)) * 8; - + u8 shift = bpf_ctx_narrow_load_shift(off, size, + size_default); if (ctx_field_size <= 4) { if (shift) insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH, -- cgit v1.2.3 From 8732d85a69a0411f16a4b78df8fdc7b09c50a849 Mon Sep 17 00:00:00 2001 From: Mattias Jacobsson <2pi@mok.nu> Date: Fri, 19 Jul 2019 19:51:45 +0200 Subject: platform/x86: wmi: add missing struct parameter description Add a description for the context parameter in the struct wmi_device_id. Reported-by: kbuild test robot Fixes: a48e23385fcf ("platform/x86: wmi: add context pointer field to struct wmi_device_id") Signed-off-by: Mattias Jacobsson <2pi@mok.nu> Signed-off-by: Andy Shevchenko --- include/linux/mod_devicetable.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index b2c1648f7e5d..5714fd35a83c 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -814,6 +814,7 @@ struct tee_client_device_id { /** * struct wmi_device_id - WMI device identifier * @guid_string: 36 char string of the form fa50ff2b-f2e8-45de-83fa-65417f2f49ba + * @context: pointer to driver specific data */ struct wmi_device_id { const char guid_string[UUID_STRING_LEN+1]; -- cgit v1.2.3 From f8be17b81d44aed1f9ea68c3fc70f501c9616e2d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 23 Jul 2019 10:22:48 +0300 Subject: lib/dim: Fix -Wunused-const-variable warnings DIM causes to the following warnings during kernel compilation which indicates that tx_profile and rx_profile are supposed to be declared in *.c and not in *.h files. In file included from ./include/rdma/ib_verbs.h:64, from ./include/linux/mlx5/device.h:37, from ./include/linux/mlx5/driver.h:51, from ./include/linux/mlx5/vport.h:36, from drivers/infiniband/hw/mlx5/ib_virt.c:34: ./include/linux/dim.h:326:1: warning: _tx_profile_ defined but not used [-Wunused-const-variable=] 326 | tx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = { | ^~~~~~~~~~ ./include/linux/dim.h:320:1: warning: _rx_profile_ defined but not used [-Wunused-const-variable=] 320 | rx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = { | ^~~~~~~~~~ Fixes: 4f75da3666c0 ("linux/dim: Move implementation to .c files") Signed-off-by: Leon Romanovsky Reviewed-by: Bart Van Assche Acked-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/dim.h | 56 ----------------------------------------------------- lib/dim/net_dim.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dim.h b/include/linux/dim.h index d3a0fbfff2bb..9fa4b3f88c39 100644 --- a/include/linux/dim.h +++ b/include/linux/dim.h @@ -272,62 +272,6 @@ dim_update_sample_with_comps(u16 event_ctr, u64 packets, u64 bytes, u64 comps, /* Net DIM */ -/* - * Net DIM profiles: - * There are different set of profiles for each CQ period mode. - * There are different set of profiles for RX/TX CQs. - * Each profile size must be of NET_DIM_PARAMS_NUM_PROFILES - */ -#define NET_DIM_PARAMS_NUM_PROFILES 5 -#define NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE 256 -#define NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE 128 -#define NET_DIM_DEF_PROFILE_CQE 1 -#define NET_DIM_DEF_PROFILE_EQE 1 - -#define NET_DIM_RX_EQE_PROFILES { \ - {1, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ - {8, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ - {64, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ - {128, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ - {256, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ -} - -#define NET_DIM_RX_CQE_PROFILES { \ - {2, 256}, \ - {8, 128}, \ - {16, 64}, \ - {32, 64}, \ - {64, 64} \ -} - -#define NET_DIM_TX_EQE_PROFILES { \ - {1, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ - {8, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ - {32, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ - {64, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ - {128, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE} \ -} - -#define NET_DIM_TX_CQE_PROFILES { \ - {5, 128}, \ - {8, 64}, \ - {16, 32}, \ - {32, 32}, \ - {64, 32} \ -} - -static const struct dim_cq_moder -rx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = { - NET_DIM_RX_EQE_PROFILES, - NET_DIM_RX_CQE_PROFILES, -}; - -static const struct dim_cq_moder -tx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = { - NET_DIM_TX_EQE_PROFILES, - NET_DIM_TX_CQE_PROFILES, -}; - /** * net_dim_get_rx_moderation - provide a CQ moderation object for the given RX profile * @cq_period_mode: CQ period mode diff --git a/lib/dim/net_dim.c b/lib/dim/net_dim.c index 5bcc902c5388..a4db51c21266 100644 --- a/lib/dim/net_dim.c +++ b/lib/dim/net_dim.c @@ -5,6 +5,62 @@ #include +/* + * Net DIM profiles: + * There are different set of profiles for each CQ period mode. + * There are different set of profiles for RX/TX CQs. + * Each profile size must be of NET_DIM_PARAMS_NUM_PROFILES + */ +#define NET_DIM_PARAMS_NUM_PROFILES 5 +#define NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE 256 +#define NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE 128 +#define NET_DIM_DEF_PROFILE_CQE 1 +#define NET_DIM_DEF_PROFILE_EQE 1 + +#define NET_DIM_RX_EQE_PROFILES { \ + {1, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {8, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {64, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {128, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {256, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ +} + +#define NET_DIM_RX_CQE_PROFILES { \ + {2, 256}, \ + {8, 128}, \ + {16, 64}, \ + {32, 64}, \ + {64, 64} \ +} + +#define NET_DIM_TX_EQE_PROFILES { \ + {1, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {8, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {32, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {64, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {128, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE} \ +} + +#define NET_DIM_TX_CQE_PROFILES { \ + {5, 128}, \ + {8, 64}, \ + {16, 32}, \ + {32, 32}, \ + {64, 32} \ +} + +static const struct dim_cq_moder +rx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = { + NET_DIM_RX_EQE_PROFILES, + NET_DIM_RX_CQE_PROFILES, +}; + +static const struct dim_cq_moder +tx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = { + NET_DIM_TX_EQE_PROFILES, + NET_DIM_TX_CQE_PROFILES, +}; + struct dim_cq_moder net_dim_get_rx_moderation(u8 cq_period_mode, int ix) { -- cgit v1.2.3 From 02712bc3250849c1cf99d626aea98f610e695f34 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 Jul 2019 08:52:53 +0200 Subject: mm/hmm: move hmm_vma_range_done and hmm_vma_fault to nouveau These two functions are marked as a legacy APIs to get rid of, but seem to suit the current nouveau flow. Move it to the only user in preparation for fixing a locking bug involving caller and callee. All comments referring to the old API have been removed as this now is a driver private helper. Link: https://lore.kernel.org/r/20190724065258.16603-3-hch@lst.de Tested-by: Ralph Campbell Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/gpu/drm/nouveau/nouveau_svm.c | 46 +++++++++++++++++++++++++++-- include/linux/hmm.h | 54 ----------------------------------- 2 files changed, 44 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index 8c92374afcf2..6c1b04de0db8 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -475,6 +475,48 @@ nouveau_svm_fault_cache(struct nouveau_svm *svm, fault->inst, fault->addr, fault->access); } +static inline bool +nouveau_range_done(struct hmm_range *range) +{ + bool ret = hmm_range_valid(range); + + hmm_range_unregister(range); + return ret; +} + +static int +nouveau_range_fault(struct hmm_mirror *mirror, struct hmm_range *range, + bool block) +{ + long ret; + + range->default_flags = 0; + range->pfn_flags_mask = -1UL; + + ret = hmm_range_register(range, mirror, + range->start, range->end, + PAGE_SHIFT); + if (ret) + return (int)ret; + + if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) { + up_read(&range->vma->vm_mm->mmap_sem); + return -EAGAIN; + } + + ret = hmm_range_fault(range, block); + if (ret <= 0) { + if (ret == -EBUSY || !ret) { + up_read(&range->vma->vm_mm->mmap_sem); + ret = -EBUSY; + } else if (ret == -EAGAIN) + ret = -EBUSY; + hmm_range_unregister(range); + return ret; + } + return 0; +} + static int nouveau_svm_fault(struct nvif_notify *notify) { @@ -649,10 +691,10 @@ nouveau_svm_fault(struct nvif_notify *notify) range.values = nouveau_svm_pfn_values; range.pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT; again: - ret = hmm_vma_fault(&svmm->mirror, &range, true); + ret = nouveau_range_fault(&svmm->mirror, &range, true); if (ret == 0) { mutex_lock(&svmm->mutex); - if (!hmm_vma_range_done(&range)) { + if (!nouveau_range_done(&range)) { mutex_unlock(&svmm->mutex); goto again; } diff --git a/include/linux/hmm.h b/include/linux/hmm.h index b8a08b2a10ca..7ef56dc18050 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -484,60 +484,6 @@ long hmm_range_dma_unmap(struct hmm_range *range, */ #define HMM_RANGE_DEFAULT_TIMEOUT 1000 -/* This is a temporary helper to avoid merge conflict between trees. */ -static inline bool hmm_vma_range_done(struct hmm_range *range) -{ - bool ret = hmm_range_valid(range); - - hmm_range_unregister(range); - return ret; -} - -/* This is a temporary helper to avoid merge conflict between trees. */ -static inline int hmm_vma_fault(struct hmm_mirror *mirror, - struct hmm_range *range, bool block) -{ - long ret; - - /* - * With the old API the driver must set each individual entries with - * the requested flags (valid, write, ...). So here we set the mask to - * keep intact the entries provided by the driver and zero out the - * default_flags. - */ - range->default_flags = 0; - range->pfn_flags_mask = -1UL; - - ret = hmm_range_register(range, mirror, - range->start, range->end, - PAGE_SHIFT); - if (ret) - return (int)ret; - - if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) { - /* - * The mmap_sem was taken by driver we release it here and - * returns -EAGAIN which correspond to mmap_sem have been - * drop in the old API. - */ - up_read(&range->vma->vm_mm->mmap_sem); - return -EAGAIN; - } - - ret = hmm_range_fault(range, block); - if (ret <= 0) { - if (ret == -EBUSY || !ret) { - /* Same as above, drop mmap_sem to match old API. */ - up_read(&range->vma->vm_mm->mmap_sem); - ret = -EBUSY; - } else if (ret == -EAGAIN) - ret = -EBUSY; - hmm_range_unregister(range); - return ret; - } - return 0; -} - /* Below are for HMM internal use only! Not to be used by device driver! */ static inline void hmm_mm_init(struct mm_struct *mm) { -- cgit v1.2.3 From 7a32f2962c56d9d8a836b4469855caeee8766bd4 Mon Sep 17 00:00:00 2001 From: Edward Srouji Date: Tue, 23 Jul 2019 10:12:55 +0300 Subject: net/mlx5: Fix modify_cq_in alignment Fix modify_cq_in alignment to match the device specification. After this fix the 'cq_umem_valid' field will be in the right offset. Cc: # 4.19 Fixes: bd37197554eb ("net/mlx5: Update mlx5_ifc with DEVX UID bits") Signed-off-by: Edward Srouji Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index b3d5752657d9..ec571fd7fcf8 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -5975,10 +5975,12 @@ struct mlx5_ifc_modify_cq_in_bits { struct mlx5_ifc_cqc_bits cq_context; - u8 reserved_at_280[0x40]; + u8 reserved_at_280[0x60]; u8 cq_umem_valid[0x1]; - u8 reserved_at_2c1[0x5bf]; + u8 reserved_at_2e1[0x1f]; + + u8 reserved_at_300[0x580]; u8 pas[0][0x40]; }; -- cgit v1.2.3 From 90bb769291161cf25a818d69cf608c181654473e Mon Sep 17 00:00:00 2001 From: Ariel Levkovich Date: Sat, 6 Jul 2019 18:06:15 +0300 Subject: net/mlx5e: Prevent encap flow counter update async to user query This patch prevents a race between user invoked cached counters query and a neighbor last usage updater. The cached flow counter stats can be queried by calling "mlx5_fc_query_cached" which provides the number of bytes and packets that passed via this flow since the last time this counter was queried. It does so by reducting the last saved stats from the current, cached stats and then updating the last saved stats with the cached stats. It also provide the lastuse value for that flow. Since "mlx5e_tc_update_neigh_used_value" needs to retrieve the last usage time of encapsulation flows, it calls the flow counter query method periodically and async to user queries of the flow counter using cls_flower. This call is causing the driver to update the last reported bytes and packets from the cache and therefore, future user queries of the flow stats will return lower than expected number for bytes and packets since the last saved stats in the driver was updated async to the last saved stats in cls_flower. This causes wrong stats presentation of encapsulation flows to user. Since the neighbor usage updater only needs the lastuse stats from the cached counter, the fix is to use a dedicated lastuse query call that returns the lastuse value without synching between the cached stats and the last saved stats. Fixes: f6dfb4c3f216 ("net/mlx5e: Update neighbour 'used' state using HW flow rules counters") Signed-off-by: Ariel Levkovich Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c | 5 +++++ include/linux/mlx5/fs.h | 1 + 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index cc096f6011d9..7ecfc53cf5f6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1230,13 +1230,13 @@ static struct mlx5_fc *mlx5e_tc_get_counter(struct mlx5e_tc_flow *flow) void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe) { struct mlx5e_neigh *m_neigh = &nhe->m_neigh; - u64 bytes, packets, lastuse = 0; struct mlx5e_tc_flow *flow; struct mlx5e_encap_entry *e; struct mlx5_fc *counter; struct neigh_table *tbl; bool neigh_used = false; struct neighbour *n; + u64 lastuse; if (m_neigh->family == AF_INET) tbl = &arp_tbl; @@ -1256,7 +1256,7 @@ void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe) encaps[efi->index]); if (flow->flags & MLX5E_TC_FLOW_OFFLOADED) { counter = mlx5e_tc_get_counter(flow); - mlx5_fc_query_cached(counter, &bytes, &packets, &lastuse); + lastuse = mlx5_fc_query_lastuse(counter); if (time_after((unsigned long)lastuse, nhe->reported_lastuse)) { neigh_used = true; break; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c index b3762123a69c..1834d9f3aa1c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c @@ -369,6 +369,11 @@ int mlx5_fc_query(struct mlx5_core_dev *dev, struct mlx5_fc *counter, } EXPORT_SYMBOL(mlx5_fc_query); +u64 mlx5_fc_query_lastuse(struct mlx5_fc *counter) +{ + return counter->cache.lastuse; +} + void mlx5_fc_query_cached(struct mlx5_fc *counter, u64 *bytes, u64 *packets, u64 *lastuse) { diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 04a569568eac..f049af3f3cd8 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -220,6 +220,7 @@ int mlx5_modify_rule_destination(struct mlx5_flow_handle *handler, struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging); void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter); +u64 mlx5_fc_query_lastuse(struct mlx5_fc *counter); void mlx5_fc_query_cached(struct mlx5_fc *counter, u64 *bytes, u64 *packets, u64 *lastuse); int mlx5_fc_query(struct mlx5_core_dev *dev, struct mlx5_fc *counter, -- cgit v1.2.3 From a7cf3d24ee6081930feb4c830a7f6f16ebe31c49 Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Thu, 25 Jul 2019 12:07:12 -0600 Subject: net: qualcomm: rmnet: Fix incorrect UL checksum offload logic The udp_ip4_ind bit is set only for IPv4 UDP non-fragmented packets so that the hardware can flip the checksum to 0xFFFF if the computed checksum is 0 per RFC768. However, this bit had to be set for IPv6 UDP non fragmented packets as well per hardware requirements. Otherwise, IPv6 UDP packets with computed checksum as 0 were transmitted by hardware and were dropped in the network. In addition to setting this bit for IPv6 UDP, the field is also appropriately renamed to udp_ind as part of this change. Fixes: 5eb5f8608ef1 ("net: qualcomm: rmnet: Add support for TX checksum offload") Cc: Sean Tranchetti Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: David S. Miller --- drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c | 13 +++++++++---- include/linux/if_rmnet.h | 4 ++-- 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c index 60189923737a..21d38167f961 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c @@ -206,9 +206,9 @@ rmnet_map_ipv4_ul_csum_header(void *iphdr, ul_header->csum_insert_offset = skb->csum_offset; ul_header->csum_enabled = 1; if (ip4h->protocol == IPPROTO_UDP) - ul_header->udp_ip4_ind = 1; + ul_header->udp_ind = 1; else - ul_header->udp_ip4_ind = 0; + ul_header->udp_ind = 0; /* Changing remaining fields to network order */ hdr++; @@ -239,6 +239,7 @@ rmnet_map_ipv6_ul_csum_header(void *ip6hdr, struct rmnet_map_ul_csum_header *ul_header, struct sk_buff *skb) { + struct ipv6hdr *ip6h = (struct ipv6hdr *)ip6hdr; __be16 *hdr = (__be16 *)ul_header, offset; offset = htons((__force u16)(skb_transport_header(skb) - @@ -246,7 +247,11 @@ rmnet_map_ipv6_ul_csum_header(void *ip6hdr, ul_header->csum_start_offset = offset; ul_header->csum_insert_offset = skb->csum_offset; ul_header->csum_enabled = 1; - ul_header->udp_ip4_ind = 0; + + if (ip6h->nexthdr == IPPROTO_UDP) + ul_header->udp_ind = 1; + else + ul_header->udp_ind = 0; /* Changing remaining fields to network order */ hdr++; @@ -419,7 +424,7 @@ sw_csum: ul_header->csum_start_offset = 0; ul_header->csum_insert_offset = 0; ul_header->csum_enabled = 0; - ul_header->udp_ip4_ind = 0; + ul_header->udp_ind = 0; priv->stats.csum_sw++; } diff --git a/include/linux/if_rmnet.h b/include/linux/if_rmnet.h index b4f5403383fc..9661416a9bb4 100644 --- a/include/linux/if_rmnet.h +++ b/include/linux/if_rmnet.h @@ -41,11 +41,11 @@ struct rmnet_map_ul_csum_header { __be16 csum_start_offset; #if defined(__LITTLE_ENDIAN_BITFIELD) u16 csum_insert_offset:14; - u16 udp_ip4_ind:1; + u16 udp_ind:1; u16 csum_enabled:1; #elif defined (__BIG_ENDIAN_BITFIELD) u16 csum_enabled:1; - u16 udp_ip4_ind:1; + u16 udp_ind:1; u16 csum_insert_offset:14; #else #error "Please fix " -- cgit v1.2.3 From ffe0bbabb0cffceceae07484fde1ec2a63b1537c Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 8 Jul 2019 10:23:43 +0200 Subject: gpio: don't WARN() on NULL descs if gpiolib is disabled If gpiolib is disabled, we use the inline stubs from gpio/consumer.h instead of regular definitions of GPIO API. The stubs for 'optional' variants of gpiod_get routines return NULL in this case as if the relevant GPIO wasn't found. This is correct so far. Calling other (non-gpio_get) stubs from this header triggers a warning because the GPIO descriptor couldn't have been requested. The warning however is unconditional (WARN_ON(1)) and is emitted even if the passed descriptor pointer is NULL. We don't want to force the users of 'optional' gpio_get to check the returned pointer before calling e.g. gpiod_set_value() so let's only WARN on non-NULL descriptors. Cc: stable@vger.kernel.org Reported-by: Claus H. Stovgaard Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/consumer.h | 64 +++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 9ddcf50a3c59..a7f08fb0f865 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -247,7 +247,7 @@ static inline void gpiod_put(struct gpio_desc *desc) might_sleep(); /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); } static inline void devm_gpiod_unhinge(struct device *dev, @@ -256,7 +256,7 @@ static inline void devm_gpiod_unhinge(struct device *dev, might_sleep(); /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); } static inline void gpiod_put_array(struct gpio_descs *descs) @@ -264,7 +264,7 @@ static inline void gpiod_put_array(struct gpio_descs *descs) might_sleep(); /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(descs); } static inline struct gpio_desc *__must_check @@ -317,7 +317,7 @@ static inline void devm_gpiod_put(struct device *dev, struct gpio_desc *desc) might_sleep(); /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); } static inline void devm_gpiod_put_array(struct device *dev, @@ -326,32 +326,32 @@ static inline void devm_gpiod_put_array(struct device *dev, might_sleep(); /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(descs); } static inline int gpiod_get_direction(const struct gpio_desc *desc) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); return -ENOSYS; } static inline int gpiod_direction_input(struct gpio_desc *desc) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); return -ENOSYS; } static inline int gpiod_direction_output(struct gpio_desc *desc, int value) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); return -ENOSYS; } static inline int gpiod_direction_output_raw(struct gpio_desc *desc, int value) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); return -ENOSYS; } @@ -359,7 +359,7 @@ static inline int gpiod_direction_output_raw(struct gpio_desc *desc, int value) static inline int gpiod_get_value(const struct gpio_desc *desc) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); return 0; } static inline int gpiod_get_array_value(unsigned int array_size, @@ -368,13 +368,13 @@ static inline int gpiod_get_array_value(unsigned int array_size, unsigned long *value_bitmap) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc_array); return 0; } static inline void gpiod_set_value(struct gpio_desc *desc, int value) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); } static inline int gpiod_set_array_value(unsigned int array_size, struct gpio_desc **desc_array, @@ -382,13 +382,13 @@ static inline int gpiod_set_array_value(unsigned int array_size, unsigned long *value_bitmap) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc_array); return 0; } static inline int gpiod_get_raw_value(const struct gpio_desc *desc) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); return 0; } static inline int gpiod_get_raw_array_value(unsigned int array_size, @@ -397,13 +397,13 @@ static inline int gpiod_get_raw_array_value(unsigned int array_size, unsigned long *value_bitmap) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc_array); return 0; } static inline void gpiod_set_raw_value(struct gpio_desc *desc, int value) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); } static inline int gpiod_set_raw_array_value(unsigned int array_size, struct gpio_desc **desc_array, @@ -411,14 +411,14 @@ static inline int gpiod_set_raw_array_value(unsigned int array_size, unsigned long *value_bitmap) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc_array); return 0; } static inline int gpiod_get_value_cansleep(const struct gpio_desc *desc) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); return 0; } static inline int gpiod_get_array_value_cansleep(unsigned int array_size, @@ -427,13 +427,13 @@ static inline int gpiod_get_array_value_cansleep(unsigned int array_size, unsigned long *value_bitmap) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc_array); return 0; } static inline void gpiod_set_value_cansleep(struct gpio_desc *desc, int value) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); } static inline int gpiod_set_array_value_cansleep(unsigned int array_size, struct gpio_desc **desc_array, @@ -441,13 +441,13 @@ static inline int gpiod_set_array_value_cansleep(unsigned int array_size, unsigned long *value_bitmap) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc_array); return 0; } static inline int gpiod_get_raw_value_cansleep(const struct gpio_desc *desc) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); return 0; } static inline int gpiod_get_raw_array_value_cansleep(unsigned int array_size, @@ -456,14 +456,14 @@ static inline int gpiod_get_raw_array_value_cansleep(unsigned int array_size, unsigned long *value_bitmap) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc_array); return 0; } static inline void gpiod_set_raw_value_cansleep(struct gpio_desc *desc, int value) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); } static inline int gpiod_set_raw_array_value_cansleep(unsigned int array_size, struct gpio_desc **desc_array, @@ -471,41 +471,41 @@ static inline int gpiod_set_raw_array_value_cansleep(unsigned int array_size, unsigned long *value_bitmap) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc_array); return 0; } static inline int gpiod_set_debounce(struct gpio_desc *desc, unsigned debounce) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); return -ENOSYS; } static inline int gpiod_set_transitory(struct gpio_desc *desc, bool transitory) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); return -ENOSYS; } static inline int gpiod_is_active_low(const struct gpio_desc *desc) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); return 0; } static inline int gpiod_cansleep(const struct gpio_desc *desc) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); return 0; } static inline int gpiod_to_irq(const struct gpio_desc *desc) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); return -EINVAL; } @@ -513,7 +513,7 @@ static inline int gpiod_set_consumer_name(struct gpio_desc *desc, const char *name) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); return -EINVAL; } @@ -525,7 +525,7 @@ static inline struct gpio_desc *gpio_to_desc(unsigned gpio) static inline int desc_to_gpio(const struct gpio_desc *desc) { /* GPIO can never have been requested */ - WARN_ON(1); + WARN_ON(desc); return -EINVAL; } -- cgit v1.2.3 From 89e524c04fa966330e2e80ab2bc50b9944c5847a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 30 Jul 2019 13:10:14 +0200 Subject: loop: Fix mount(2) failure due to race with LOOP_SET_FD Commit 33ec3e53e7b1 ("loop: Don't change loop device under exclusive opener") made LOOP_SET_FD ioctl acquire exclusive block device reference while it updates loop device binding. However this can make perfectly valid mount(2) fail with EBUSY due to racing LOOP_SET_FD holding temporarily the exclusive bdev reference in cases like this: for i in {a..z}{a..z}; do dd if=/dev/zero of=$i.image bs=1k count=0 seek=1024 mkfs.ext2 $i.image mkdir mnt$i done echo "Run" for i in {a..z}{a..z}; do mount -o loop -t ext2 $i.image mnt$i & done Fix the problem by not getting full exclusive bdev reference in LOOP_SET_FD but instead just mark the bdev as being claimed while we update the binding information. This just blocks new exclusive openers instead of failing them with EBUSY thus fixing the problem. Fixes: 33ec3e53e7b1 ("loop: Don't change loop device under exclusive opener") Cc: stable@vger.kernel.org Tested-by: Kai-Heng Feng Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/block/loop.c | 16 +++++----- fs/block_dev.c | 83 ++++++++++++++++++++++++++++++++++++---------------- include/linux/fs.h | 6 ++++ 3 files changed, 73 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 44c9985f352a..3036883fc9f8 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -924,6 +924,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, struct file *file; struct inode *inode; struct address_space *mapping; + struct block_device *claimed_bdev = NULL; int lo_flags = 0; int error; loff_t size; @@ -942,10 +943,11 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, * here to avoid changing device under exclusive owner. */ if (!(mode & FMODE_EXCL)) { - bdgrab(bdev); - error = blkdev_get(bdev, mode | FMODE_EXCL, loop_set_fd); - if (error) + claimed_bdev = bd_start_claiming(bdev, loop_set_fd); + if (IS_ERR(claimed_bdev)) { + error = PTR_ERR(claimed_bdev); goto out_putf; + } } error = mutex_lock_killable(&loop_ctl_mutex); @@ -1015,15 +1017,15 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, mutex_unlock(&loop_ctl_mutex); if (partscan) loop_reread_partitions(lo, bdev); - if (!(mode & FMODE_EXCL)) - blkdev_put(bdev, mode | FMODE_EXCL); + if (claimed_bdev) + bd_abort_claiming(bdev, claimed_bdev, loop_set_fd); return 0; out_unlock: mutex_unlock(&loop_ctl_mutex); out_bdev: - if (!(mode & FMODE_EXCL)) - blkdev_put(bdev, mode | FMODE_EXCL); + if (claimed_bdev) + bd_abort_claiming(bdev, claimed_bdev, loop_set_fd); out_putf: fput(file); out: diff --git a/fs/block_dev.c b/fs/block_dev.c index c2a85b587922..22591bad9353 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1181,8 +1181,7 @@ static struct gendisk *bdev_get_gendisk(struct block_device *bdev, int *partno) * Pointer to the block device containing @bdev on success, ERR_PTR() * value on failure. */ -static struct block_device *bd_start_claiming(struct block_device *bdev, - void *holder) +struct block_device *bd_start_claiming(struct block_device *bdev, void *holder) { struct gendisk *disk; struct block_device *whole; @@ -1229,6 +1228,62 @@ static struct block_device *bd_start_claiming(struct block_device *bdev, return ERR_PTR(err); } } +EXPORT_SYMBOL(bd_start_claiming); + +static void bd_clear_claiming(struct block_device *whole, void *holder) +{ + lockdep_assert_held(&bdev_lock); + /* tell others that we're done */ + BUG_ON(whole->bd_claiming != holder); + whole->bd_claiming = NULL; + wake_up_bit(&whole->bd_claiming, 0); +} + +/** + * bd_finish_claiming - finish claiming of a block device + * @bdev: block device of interest + * @whole: whole block device (returned from bd_start_claiming()) + * @holder: holder that has claimed @bdev + * + * Finish exclusive open of a block device. Mark the device as exlusively + * open by the holder and wake up all waiters for exclusive open to finish. + */ +void bd_finish_claiming(struct block_device *bdev, struct block_device *whole, + void *holder) +{ + spin_lock(&bdev_lock); + BUG_ON(!bd_may_claim(bdev, whole, holder)); + /* + * Note that for a whole device bd_holders will be incremented twice, + * and bd_holder will be set to bd_may_claim before being set to holder + */ + whole->bd_holders++; + whole->bd_holder = bd_may_claim; + bdev->bd_holders++; + bdev->bd_holder = holder; + bd_clear_claiming(whole, holder); + spin_unlock(&bdev_lock); +} +EXPORT_SYMBOL(bd_finish_claiming); + +/** + * bd_abort_claiming - abort claiming of a block device + * @bdev: block device of interest + * @whole: whole block device (returned from bd_start_claiming()) + * @holder: holder that has claimed @bdev + * + * Abort claiming of a block device when the exclusive open failed. This can be + * also used when exclusive open is not actually desired and we just needed + * to block other exclusive openers for a while. + */ +void bd_abort_claiming(struct block_device *bdev, struct block_device *whole, + void *holder) +{ + spin_lock(&bdev_lock); + bd_clear_claiming(whole, holder); + spin_unlock(&bdev_lock); +} +EXPORT_SYMBOL(bd_abort_claiming); #ifdef CONFIG_SYSFS struct bd_holder_disk { @@ -1698,29 +1753,7 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) /* finish claiming */ mutex_lock(&bdev->bd_mutex); - spin_lock(&bdev_lock); - - if (!res) { - BUG_ON(!bd_may_claim(bdev, whole, holder)); - /* - * Note that for a whole device bd_holders - * will be incremented twice, and bd_holder - * will be set to bd_may_claim before being - * set to holder - */ - whole->bd_holders++; - whole->bd_holder = bd_may_claim; - bdev->bd_holders++; - bdev->bd_holder = holder; - } - - /* tell others that we're done */ - BUG_ON(whole->bd_claiming != holder); - whole->bd_claiming = NULL; - wake_up_bit(&whole->bd_claiming, 0); - - spin_unlock(&bdev_lock); - + bd_finish_claiming(bdev, whole, holder); /* * Block event polling for write claims if requested. Any * write holder makes the write_holder state stick until diff --git a/include/linux/fs.h b/include/linux/fs.h index 56b8e358af5c..997a530ff4e9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2598,6 +2598,12 @@ extern struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, void *holder); extern struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder); +extern struct block_device *bd_start_claiming(struct block_device *bdev, + void *holder); +extern void bd_finish_claiming(struct block_device *bdev, + struct block_device *whole, void *holder); +extern void bd_abort_claiming(struct block_device *bdev, + struct block_device *whole, void *holder); extern void blkdev_put(struct block_device *bdev, fmode_t mode); extern int __blkdev_reread_part(struct block_device *bdev); extern int blkdev_reread_part(struct block_device *bdev); -- cgit v1.2.3 From 055d88242a6046a1ceac3167290f054c72571cd9 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 30 Jul 2019 21:25:20 +0200 Subject: compat_ioctl: pppoe: fix PPPOEIOCSFWD handling Support for handling the PPPOEIOCSFWD ioctl in compat mode was added in linux-2.5.69 along with hundreds of other commands, but was always broken sincen only the structure is compatible, but the command number is not, due to the size being sizeof(size_t), or at first sizeof(sizeof((struct sockaddr_pppox)), which is different on 64-bit architectures. Guillaume Nault adds: And the implementation was broken until 2016 (see 29e73269aa4d ("pppoe: fix reference counting in PPPoE proxy")), and nobody ever noticed. I should probably have removed this ioctl entirely instead of fixing it. Clearly, it has never been used. Fix it by adding a compat_ioctl handler for all pppoe variants that translates the command number and then calls the regular ioctl function. All other ioctl commands handled by pppoe are compatible between 32-bit and 64-bit, and require compat_ptr() conversion. This should apply to all stable kernels. Acked-by: Guillaume Nault Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- drivers/net/ppp/pppoe.c | 3 +++ drivers/net/ppp/pppox.c | 13 +++++++++++++ drivers/net/ppp/pptp.c | 3 +++ fs/compat_ioctl.c | 3 --- include/linux/if_pppox.h | 3 +++ net/l2tp/l2tp_ppp.c | 3 +++ 6 files changed, 25 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index 1d902ecb4aa8..a44dd3c8af63 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -1115,6 +1115,9 @@ static const struct proto_ops pppoe_ops = { .recvmsg = pppoe_recvmsg, .mmap = sock_no_mmap, .ioctl = pppox_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = pppox_compat_ioctl, +#endif }; static const struct pppox_proto pppoe_proto = { diff --git a/drivers/net/ppp/pppox.c b/drivers/net/ppp/pppox.c index 5ef422a43d70..08364f10a43f 100644 --- a/drivers/net/ppp/pppox.c +++ b/drivers/net/ppp/pppox.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -98,6 +99,18 @@ int pppox_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) EXPORT_SYMBOL(pppox_ioctl); +#ifdef CONFIG_COMPAT +int pppox_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + if (cmd == PPPOEIOCSFWD32) + cmd = PPPOEIOCSFWD; + + return pppox_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); +} + +EXPORT_SYMBOL(pppox_compat_ioctl); +#endif + static int pppox_create(struct net *net, struct socket *sock, int protocol, int kern) { diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c index a8e52c8e4128..734de7de03f7 100644 --- a/drivers/net/ppp/pptp.c +++ b/drivers/net/ppp/pptp.c @@ -623,6 +623,9 @@ static const struct proto_ops pptp_ops = { .recvmsg = sock_no_recvmsg, .mmap = sock_no_mmap, .ioctl = pppox_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = pppox_compat_ioctl, +#endif }; static const struct pppox_proto pppox_pptp_proto = { diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 6e30949d9f77..a7ec2d3dff92 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -638,9 +638,6 @@ COMPATIBLE_IOCTL(PPPIOCDISCONN) COMPATIBLE_IOCTL(PPPIOCATTCHAN) COMPATIBLE_IOCTL(PPPIOCGCHAN) COMPATIBLE_IOCTL(PPPIOCGL2TPSTATS) -/* PPPOX */ -COMPATIBLE_IOCTL(PPPOEIOCSFWD) -COMPATIBLE_IOCTL(PPPOEIOCDFWD) /* Big A */ /* sparc only */ /* Big Q for sound/OSS */ diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h index 8b728750a625..69e813bcb947 100644 --- a/include/linux/if_pppox.h +++ b/include/linux/if_pppox.h @@ -80,6 +80,9 @@ extern int register_pppox_proto(int proto_num, const struct pppox_proto *pp); extern void unregister_pppox_proto(int proto_num); extern void pppox_unbind_sock(struct sock *sk);/* delete ppp-channel binding */ extern int pppox_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); +extern int pppox_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); + +#define PPPOEIOCSFWD32 _IOW(0xB1 ,0, compat_size_t) /* PPPoX socket states */ enum { diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 1d0e5904dedf..c54cb59593ef 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -1681,6 +1681,9 @@ static const struct proto_ops pppol2tp_ops = { .recvmsg = pppol2tp_recvmsg, .mmap = sock_no_mmap, .ioctl = pppox_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = pppox_compat_ioctl, +#endif }; static const struct pppox_proto pppol2tp_proto = { -- cgit v1.2.3 From b877ac9815a8fe7e5f6d7fdde3dc34652408840a Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 14 Jun 2019 07:46:04 +0200 Subject: xen/swiotlb: remember having called xen_create_contiguous_region() Instead of always calling xen_destroy_contiguous_region() in case the memory is DMA-able for the used device, do so only in case it has been made DMA-able via xen_create_contiguous_region() before. This will avoid a lot of xen_destroy_contiguous_region() calls for 64-bit capable devices. As the memory in question is owned by swiotlb-xen the PG_owner_priv_1 flag of the first allocated page can be used for remembering. Signed-off-by: Juergen Gross Acked-by: Konrad Rzeszutek Wilk Signed-off-by: Juergen Gross --- drivers/xen/swiotlb-xen.c | 4 +++- include/linux/page-flags.h | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 37ddcfcfbb21..ceb681cf64bb 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -322,6 +322,7 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, xen_free_coherent_pages(hwdev, size, ret, (dma_addr_t)phys, attrs); return NULL; } + SetPageXenRemapped(virt_to_page(ret)); } memset(ret, 0, size); return ret; @@ -346,7 +347,8 @@ xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, size = 1UL << (order + XEN_PAGE_SHIFT); if (!WARN_ON((dev_addr + size - 1 > dma_mask) || - range_straddles_page_boundary(phys, size))) + range_straddles_page_boundary(phys, size)) && + TestClearPageXenRemapped(virt_to_page(vaddr))) xen_destroy_contiguous_region(phys, order); xen_free_coherent_pages(hwdev, size, vaddr, (dma_addr_t)phys, attrs); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index b848517da64c..f91cb8898ff0 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -152,6 +152,8 @@ enum pageflags { PG_savepinned = PG_dirty, /* Has a grant mapping of another (foreign) domain's page. */ PG_foreign = PG_owner_priv_1, + /* Remapped by swiotlb-xen. */ + PG_xen_remapped = PG_owner_priv_1, /* SLOB */ PG_slob_free = PG_private, @@ -329,6 +331,8 @@ PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND) TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND) PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND); PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND); +PAGEFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND) + TESTCLEARFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND) PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) -- cgit v1.2.3 From 9f00baf74e4b6f79a3a3dfab44fb7bb2e797b551 Mon Sep 17 00:00:00 2001 From: Gary R Hook Date: Tue, 30 Jul 2019 16:05:24 +0000 Subject: crypto: ccp - Add support for valid authsize values less than 16 AES GCM encryption allows for authsize values of 4, 8, and 12-16 bytes. Validate the requested authsize, and retain it to save in the request context. Fixes: 36cf515b9bbe2 ("crypto: ccp - Enable support for AES GCM on v5 CCPs") Cc: Signed-off-by: Gary R Hook Signed-off-by: Herbert Xu --- drivers/crypto/ccp/ccp-crypto-aes-galois.c | 14 ++++++++++++++ drivers/crypto/ccp/ccp-ops.c | 26 +++++++++++++++++++++----- include/linux/ccp.h | 2 ++ 3 files changed, 37 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/crypto/ccp/ccp-crypto-aes-galois.c b/drivers/crypto/ccp/ccp-crypto-aes-galois.c index d22631cb2bb3..02eba84028b3 100644 --- a/drivers/crypto/ccp/ccp-crypto-aes-galois.c +++ b/drivers/crypto/ccp/ccp-crypto-aes-galois.c @@ -58,6 +58,19 @@ static int ccp_aes_gcm_setkey(struct crypto_aead *tfm, const u8 *key, static int ccp_aes_gcm_setauthsize(struct crypto_aead *tfm, unsigned int authsize) { + switch (authsize) { + case 16: + case 15: + case 14: + case 13: + case 12: + case 8: + case 4: + break; + default: + return -EINVAL; + } + return 0; } @@ -104,6 +117,7 @@ static int ccp_aes_gcm_crypt(struct aead_request *req, bool encrypt) memset(&rctx->cmd, 0, sizeof(rctx->cmd)); INIT_LIST_HEAD(&rctx->cmd.entry); rctx->cmd.engine = CCP_ENGINE_AES; + rctx->cmd.u.aes.authsize = crypto_aead_authsize(tfm); rctx->cmd.u.aes.type = ctx->u.aes.type; rctx->cmd.u.aes.mode = ctx->u.aes.mode; rctx->cmd.u.aes.action = encrypt; diff --git a/drivers/crypto/ccp/ccp-ops.c b/drivers/crypto/ccp/ccp-ops.c index 59f9849c3662..ef723e2722a8 100644 --- a/drivers/crypto/ccp/ccp-ops.c +++ b/drivers/crypto/ccp/ccp-ops.c @@ -622,6 +622,7 @@ static int ccp_run_aes_gcm_cmd(struct ccp_cmd_queue *cmd_q, unsigned long long *final; unsigned int dm_offset; + unsigned int authsize; unsigned int jobid; unsigned int ilen; bool in_place = true; /* Default value */ @@ -643,6 +644,21 @@ static int ccp_run_aes_gcm_cmd(struct ccp_cmd_queue *cmd_q, if (!aes->key) /* Gotta have a key SGL */ return -EINVAL; + /* Zero defaults to 16 bytes, the maximum size */ + authsize = aes->authsize ? aes->authsize : AES_BLOCK_SIZE; + switch (authsize) { + case 16: + case 15: + case 14: + case 13: + case 12: + case 8: + case 4: + break; + default: + return -EINVAL; + } + /* First, decompose the source buffer into AAD & PT, * and the destination buffer into AAD, CT & tag, or * the input into CT & tag. @@ -657,7 +673,7 @@ static int ccp_run_aes_gcm_cmd(struct ccp_cmd_queue *cmd_q, p_tag = scatterwalk_ffwd(sg_tag, p_outp, ilen); } else { /* Input length for decryption includes tag */ - ilen = aes->src_len - AES_BLOCK_SIZE; + ilen = aes->src_len - authsize; p_tag = scatterwalk_ffwd(sg_tag, p_inp, ilen); } @@ -839,19 +855,19 @@ static int ccp_run_aes_gcm_cmd(struct ccp_cmd_queue *cmd_q, if (aes->action == CCP_AES_ACTION_ENCRYPT) { /* Put the ciphered tag after the ciphertext. */ - ccp_get_dm_area(&final_wa, 0, p_tag, 0, AES_BLOCK_SIZE); + ccp_get_dm_area(&final_wa, 0, p_tag, 0, authsize); } else { /* Does this ciphered tag match the input? */ - ret = ccp_init_dm_workarea(&tag, cmd_q, AES_BLOCK_SIZE, + ret = ccp_init_dm_workarea(&tag, cmd_q, authsize, DMA_BIDIRECTIONAL); if (ret) goto e_tag; - ret = ccp_set_dm_area(&tag, 0, p_tag, 0, AES_BLOCK_SIZE); + ret = ccp_set_dm_area(&tag, 0, p_tag, 0, authsize); if (ret) goto e_tag; ret = crypto_memneq(tag.address, final_wa.address, - AES_BLOCK_SIZE) ? -EBADMSG : 0; + authsize) ? -EBADMSG : 0; ccp_dm_free(&tag); } diff --git a/include/linux/ccp.h b/include/linux/ccp.h index 7e9c991c95e0..43ed9e77cf81 100644 --- a/include/linux/ccp.h +++ b/include/linux/ccp.h @@ -173,6 +173,8 @@ struct ccp_aes_engine { enum ccp_aes_mode mode; enum ccp_aes_action action; + u32 authsize; + struct scatterlist *key; u32 key_len; /* In bytes */ -- cgit v1.2.3 From ee38d94a0ad89890b770f6c876263cf9fcbfde84 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 2 Aug 2019 21:49:02 -0700 Subject: page flags: prioritize kasan bits over last-cpuid ARM64 randdconfig builds regularly run into a build error, especially when NUMA_BALANCING and SPARSEMEM are enabled but not SPARSEMEM_VMEMMAP: #error "KASAN: not enough bits in page flags for tag" The last-cpuid bits are already contitional on the available space, so the result of the calculation is a bit random on whether they were already left out or not. Adding the kasan tag bits before last-cpuid makes it much more likely to end up with a successful build here, and should be reliable for randconfig at least, as long as that does not randomize NR_CPUS or NODES_SHIFT but uses the defaults. In order for the modified check to not trigger in the x86 vdso32 code where all constants are wrong (building with -m32), enclose all the definitions with an #ifdef. [arnd@arndb.de: build fix] Link: http://lkml.kernel.org/r/CAK8P3a3Mno1SWTcuAOT0Wa9VS15pdU6EfnkxLbDpyS55yO04+g@mail.gmail.com Link: http://lkml.kernel.org/r/20190722115520.3743282-1-arnd@arndb.de Link: https://lore.kernel.org/lkml/20190618095347.3850490-1-arnd@arndb.de/ Fixes: 2813b9c02962 ("kasan, mm, arm64: tag non slab memory allocated via pagealloc") Signed-off-by: Arnd Bergmann Signed-off-by: Arnd Bergmann Reviewed-by: Andrey Konovalov Reviewed-by: Andrey Ryabinin Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Will Deacon Cc: Christoph Lameter Cc: Mark Rutland Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/vdso/vdso.h | 1 + include/linux/page-flags-layout.h | 18 +++++++++++------- 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/arch/mips/vdso/vdso.h b/arch/mips/vdso/vdso.h index 14b1931be69c..b65b169778e3 100644 --- a/arch/mips/vdso/vdso.h +++ b/arch/mips/vdso/vdso.h @@ -9,6 +9,7 @@ #if _MIPS_SIM != _MIPS_SIM_ABI64 && defined(CONFIG_64BIT) /* Building 32-bit VDSO for the 64-bit kernel. Fake a 32-bit Kconfig. */ +#define BUILD_VDSO32_64 #undef CONFIG_64BIT #define CONFIG_32BIT 1 #ifndef __ASSEMBLY__ diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h index 1dda31825ec4..71283739ffd2 100644 --- a/include/linux/page-flags-layout.h +++ b/include/linux/page-flags-layout.h @@ -32,6 +32,7 @@ #endif /* CONFIG_SPARSEMEM */ +#ifndef BUILD_VDSO32_64 /* * page->flags layout: * @@ -76,20 +77,22 @@ #define LAST_CPUPID_SHIFT 0 #endif -#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS +#ifdef CONFIG_KASAN_SW_TAGS +#define KASAN_TAG_WIDTH 8 +#else +#define KASAN_TAG_WIDTH 0 +#endif + +#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_CPUPID_SHIFT+KASAN_TAG_WIDTH \ + <= BITS_PER_LONG - NR_PAGEFLAGS #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT #else #define LAST_CPUPID_WIDTH 0 #endif -#ifdef CONFIG_KASAN_SW_TAGS -#define KASAN_TAG_WIDTH 8 #if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH+LAST_CPUPID_WIDTH+KASAN_TAG_WIDTH \ > BITS_PER_LONG - NR_PAGEFLAGS -#error "KASAN: not enough bits in page flags for tag" -#endif -#else -#define KASAN_TAG_WIDTH 0 +#error "Not enough bits in page flags" #endif /* @@ -104,4 +107,5 @@ #define LAST_CPUPID_NOT_IN_PAGE_FLAGS #endif +#endif #endif /* _LINUX_PAGE_FLAGS_LAYOUT */ -- cgit v1.2.3 From 17e433b54393a6269acbcb792da97791fe1592d8 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 5 Aug 2019 10:03:19 +0800 Subject: KVM: Fix leak vCPU's VMCS value into other pCPU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After commit d73eb57b80b (KVM: Boost vCPUs that are delivering interrupts), a five years old bug is exposed. Running ebizzy benchmark in three 80 vCPUs VMs on one 80 pCPUs Skylake server, a lot of rcu_sched stall warning splatting in the VMs after stress testing: INFO: rcu_sched detected stalls on CPUs/tasks: { 4 41 57 62 77} (detected by 15, t=60004 jiffies, g=899, c=898, q=15073) Call Trace: flush_tlb_mm_range+0x68/0x140 tlb_flush_mmu.part.75+0x37/0xe0 tlb_finish_mmu+0x55/0x60 zap_page_range+0x142/0x190 SyS_madvise+0x3cd/0x9c0 system_call_fastpath+0x1c/0x21 swait_active() sustains to be true before finish_swait() is called in kvm_vcpu_block(), voluntarily preempted vCPUs are taken into account by kvm_vcpu_on_spin() loop greatly increases the probability condition kvm_arch_vcpu_runnable(vcpu) is checked and can be true, when APICv is enabled the yield-candidate vCPU's VMCS RVI field leaks(by vmx_sync_pir_to_irr()) into spinning-on-a-taken-lock vCPU's current VMCS. This patch fixes it by checking conservatively a subset of events. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Christian Borntraeger Cc: Marc Zyngier Cc: stable@vger.kernel.org Fixes: 98f4a1467 (KVM: add kvm_arch_vcpu_runnable() test to kvm_vcpu_on_spin() loop) Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/powerpc.c | 5 +++++ arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 6 ++++++ arch/x86/kvm/vmx/vmx.c | 6 ++++++ arch/x86/kvm/x86.c | 16 ++++++++++++++++ include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 25 ++++++++++++++++++++++++- 7 files changed, 59 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 0dba7eb24f92..3e34d5fa6708 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -50,6 +50,11 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) return !!(v->arch.pending_exceptions) || kvm_request_pending(v); } +bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) +{ + return kvm_arch_vcpu_runnable(vcpu); +} + bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) { return false; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e74f0711eaaf..fc046ca89d32 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1175,6 +1175,7 @@ struct kvm_x86_ops { int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); + bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu); int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, bool *expired); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 7eafc6907861..d685491fce4d 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5190,6 +5190,11 @@ static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) kvm_vcpu_wake_up(vcpu); } +static bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu) +{ + return false; +} + static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) { unsigned long flags; @@ -7314,6 +7319,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .pmu_ops = &amd_pmu_ops, .deliver_posted_interrupt = svm_deliver_avic_intr, + .dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt, .update_pi_irte = svm_update_pi_irte, .setup_mce = svm_setup_mce, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 074385c86c09..42ed3faa6af8 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6117,6 +6117,11 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) return max_irr; } +static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu) +{ + return pi_test_on(vcpu_to_pi_desc(vcpu)); +} + static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { if (!kvm_vcpu_apicv_active(vcpu)) @@ -7726,6 +7731,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, .sync_pir_to_irr = vmx_sync_pir_to_irr, .deliver_posted_interrupt = vmx_deliver_posted_interrupt, + .dy_apicv_has_pending_interrupt = vmx_dy_apicv_has_pending_interrupt, .set_tss_addr = vmx_set_tss_addr, .set_identity_map_addr = vmx_set_identity_map_addr, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c6d951cbd76c..93b0bd45ac73 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9698,6 +9698,22 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); } +bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) +{ + if (READ_ONCE(vcpu->arch.pv.pv_unhalted)) + return true; + + if (kvm_test_request(KVM_REQ_NMI, vcpu) || + kvm_test_request(KVM_REQ_SMI, vcpu) || + kvm_test_request(KVM_REQ_EVENT, vcpu)) + return true; + + if (vcpu->arch.apicv_active && kvm_x86_ops->dy_apicv_has_pending_interrupt(vcpu)) + return true; + + return false; +} + bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) { return vcpu->arch.preempted_in_kernel; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5c5b5867024c..9e4c2bb90297 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -872,6 +872,7 @@ int kvm_arch_check_processor_compat(void); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); +bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu); #ifndef __KVM_HAVE_ARCH_VM_ALLOC /* diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ed061d8a457c..1f05aeb9da27 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2477,6 +2477,29 @@ static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) #endif } +/* + * Unlike kvm_arch_vcpu_runnable, this function is called outside + * a vcpu_load/vcpu_put pair. However, for most architectures + * kvm_arch_vcpu_runnable does not require vcpu_load. + */ +bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) +{ + return kvm_arch_vcpu_runnable(vcpu); +} + +static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu) +{ + if (kvm_arch_dy_runnable(vcpu)) + return true; + +#ifdef CONFIG_KVM_ASYNC_PF + if (!list_empty_careful(&vcpu->async_pf.done)) + return true; +#endif + + return false; +} + void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) { struct kvm *kvm = me->kvm; @@ -2506,7 +2529,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) continue; if (vcpu == me) continue; - if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu)) + if (swait_active(&vcpu->wq) && !vcpu_dy_runnable(vcpu)) continue; if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode && !kvm_arch_vcpu_in_kernel(vcpu)) -- cgit v1.2.3 From 741cbbae0768b828be2d48331eb371a4f08bbea8 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sat, 3 Aug 2019 08:14:25 +0200 Subject: KVM: remove kvm_arch_has_vcpu_debugfs() There is no need for this function as all arches have to implement kvm_arch_create_vcpu_debugfs() no matter what. A #define symbol let us actually simplify the code. Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 10 ---------- arch/powerpc/kvm/powerpc.c | 10 ---------- arch/s390/kvm/kvm-s390.c | 10 ---------- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/debugfs.c | 5 ----- include/linux/kvm_host.h | 3 ++- virt/kvm/arm/arm.c | 5 ----- virt/kvm/kvm_main.c | 5 ++--- 8 files changed, 6 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 2cfe839f0b3a..1109924560d8 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -150,16 +150,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) return 0; } -bool kvm_arch_has_vcpu_debugfs(void) -{ - return false; -} - -int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) -{ - return 0; -} - void kvm_mips_free_vcpus(struct kvm *kvm) { unsigned int i; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 3e34d5fa6708..3e566c2e6066 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -457,16 +457,6 @@ err_out: return -EINVAL; } -bool kvm_arch_has_vcpu_debugfs(void) -{ - return false; -} - -int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) -{ - return 0; -} - void kvm_arch_destroy_vm(struct kvm *kvm) { unsigned int i; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 3f520cd837fb..f329dcb3f44c 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2516,16 +2516,6 @@ out_err: return rc; } -bool kvm_arch_has_vcpu_debugfs(void) -{ - return false; -} - -int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) -{ - return 0; -} - void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { VCPU_EVENT(vcpu, 3, "%s", "free cpu"); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fc046ca89d32..e92725b2a46f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -35,6 +35,8 @@ #include #include +#define __KVM_HAVE_ARCH_VCPU_DEBUGFS + #define KVM_MAX_VCPUS 288 #define KVM_SOFT_MAX_VCPUS 240 #define KVM_MAX_VCPU_ID 1023 diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index 329361b69d5e..9bd93e0d5f63 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -8,11 +8,6 @@ #include #include "lapic.h" -bool kvm_arch_has_vcpu_debugfs(void) -{ - return true; -} - static int vcpu_get_timer_advance_ns(void *data, u64 *val) { struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9e4c2bb90297..8d34db3c8bc6 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -861,8 +861,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); -bool kvm_arch_has_vcpu_debugfs(void); +#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu); +#endif int kvm_arch_hardware_enable(void); void kvm_arch_hardware_disable(void); diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index acc43242a310..13f5a1aa6d79 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -144,11 +144,6 @@ out_fail_alloc: return ret; } -bool kvm_arch_has_vcpu_debugfs(void) -{ - return false; -} - int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) { return 0; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1f05aeb9da27..4afb1a234018 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2617,12 +2617,10 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu) static int kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) { +#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS char dir_name[ITOA_MAX_LEN * 2]; int ret; - if (!kvm_arch_has_vcpu_debugfs()) - return 0; - if (!debugfs_initialized()) return 0; @@ -2637,6 +2635,7 @@ static int kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) debugfs_remove_recursive(vcpu->debugfs_dentry); return ret; } +#endif return 0; } -- cgit v1.2.3 From 3e7093d045196b1016517631645e874fe903db7e Mon Sep 17 00:00:00 2001 From: Greg KH Date: Wed, 31 Jul 2019 20:56:20 +0200 Subject: KVM: no need to check return value of debugfs_create functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When calling debugfs functions, there is no need to ever check the return value. The function can work or not, but the code logic should never do something different based on this. Also, when doing this, change kvm_arch_create_vcpu_debugfs() to return void instead of an integer, as we should not care at all about if this function actually does anything or not. Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Cc: Signed-off-by: Greg Kroah-Hartman Signed-off-by: Paolo Bonzini --- arch/x86/kvm/debugfs.c | 41 +++++++++++++---------------------------- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 21 +++++---------------- 3 files changed, 19 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index 9bd93e0d5f63..018aebce33ff 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -43,37 +43,22 @@ static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_frac_fops, vcpu_get_tsc_scaling_frac_bits, NULL, "%llu\n"); -int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) +void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) { - struct dentry *ret; + debugfs_create_file("tsc-offset", 0444, vcpu->debugfs_dentry, vcpu, + &vcpu_tsc_offset_fops); - ret = debugfs_create_file("tsc-offset", 0444, - vcpu->debugfs_dentry, - vcpu, &vcpu_tsc_offset_fops); - if (!ret) - return -ENOMEM; - - if (lapic_in_kernel(vcpu)) { - ret = debugfs_create_file("lapic_timer_advance_ns", 0444, - vcpu->debugfs_dentry, - vcpu, &vcpu_timer_advance_ns_fops); - if (!ret) - return -ENOMEM; - } + if (lapic_in_kernel(vcpu)) + debugfs_create_file("lapic_timer_advance_ns", 0444, + vcpu->debugfs_dentry, vcpu, + &vcpu_timer_advance_ns_fops); if (kvm_has_tsc_control) { - ret = debugfs_create_file("tsc-scaling-ratio", 0444, - vcpu->debugfs_dentry, - vcpu, &vcpu_tsc_scaling_fops); - if (!ret) - return -ENOMEM; - ret = debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444, - vcpu->debugfs_dentry, - vcpu, &vcpu_tsc_scaling_frac_fops); - if (!ret) - return -ENOMEM; - + debugfs_create_file("tsc-scaling-ratio", 0444, + vcpu->debugfs_dentry, vcpu, + &vcpu_tsc_scaling_fops); + debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444, + vcpu->debugfs_dentry, vcpu, + &vcpu_tsc_scaling_frac_fops); } - - return 0; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8d34db3c8bc6..fcb46b3374c6 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -862,7 +862,7 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS -int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu); +void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu); #endif int kvm_arch_hardware_enable(void); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4afb1a234018..4feceaa03fb1 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2615,29 +2615,20 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu) return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC); } -static int kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) +static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) { #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS char dir_name[ITOA_MAX_LEN * 2]; - int ret; if (!debugfs_initialized()) - return 0; + return; snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id); vcpu->debugfs_dentry = debugfs_create_dir(dir_name, - vcpu->kvm->debugfs_dentry); - if (!vcpu->debugfs_dentry) - return -ENOMEM; + vcpu->kvm->debugfs_dentry); - ret = kvm_arch_create_vcpu_debugfs(vcpu); - if (ret < 0) { - debugfs_remove_recursive(vcpu->debugfs_dentry); - return ret; - } + kvm_arch_create_vcpu_debugfs(vcpu); #endif - - return 0; } /* @@ -2672,9 +2663,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) if (r) goto vcpu_destroy; - r = kvm_create_vcpu_debugfs(vcpu); - if (r) - goto vcpu_destroy; + kvm_create_vcpu_debugfs(vcpu); mutex_lock(&kvm->lock); if (kvm_get_vcpu_by_id(kvm, id)) { -- cgit v1.2.3