From 101ca8d05913b7d1e6e8b9dd792193d4082fff86 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Mon, 2 Jan 2023 17:06:30 -0600 Subject: rtc: efi: Enable SET/GET WAKEUP services as optional The current implementation of rtc-efi is expecting all the 4 time services GET{SET}_TIME{WAKEUP} must be supported by UEFI firmware. As per the EFI_RT_PROPERTIES_TABLE, the platform specific implementations can choose to enable selective time services based on the RTC device capabilities. This patch does the following changes to provide GET/SET RTC services on platforms that do not support the WAKEUP feature. 1) Relax time services cap check when creating a platform device. 2) Clear RTC_FEATURE_ALARM bit in the absence of WAKEUP services. 3) Conditional alarm entries in '/proc/driver/rtc'. Cc: # v6.0+ Signed-off-by: Shanker Donthineni Link: https://lore.kernel.org/r/20230102230630.192911-1-sdonthineni@nvidia.com Signed-off-by: Alexandre Belloni --- include/linux/efi.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/efi.h b/include/linux/efi.h index 4b27519143f5..98598bd1d2fa 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -668,7 +668,8 @@ extern struct efi { #define EFI_RT_SUPPORTED_ALL 0x3fff -#define EFI_RT_SUPPORTED_TIME_SERVICES 0x000f +#define EFI_RT_SUPPORTED_TIME_SERVICES 0x0003 +#define EFI_RT_SUPPORTED_WAKEUP_SERVICES 0x000c #define EFI_RT_SUPPORTED_VARIABLE_SERVICES 0x0070 extern struct mm_struct efi_mm; -- cgit v1.2.3 From 569653f022a29a1a44ea9de5308b657228303fa5 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 27 Jan 2023 10:40:09 +0000 Subject: nvmem: core: remove nvmem_config wp_gpio No one provides wp_gpio, so let's remove it to avoid issues with the nvmem core putting this gpio. Cc: stable@vger.kernel.org Signed-off-by: Russell King (Oracle) Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20230127104015.23839-5-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/nvmem/core.c | 4 +--- include/linux/nvmem-provider.h | 2 -- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c index 7394a7598efa..608f3ad2e2e4 100644 --- a/drivers/nvmem/core.c +++ b/drivers/nvmem/core.c @@ -772,9 +772,7 @@ struct nvmem_device *nvmem_register(const struct nvmem_config *config) nvmem->id = rval; - if (config->wp_gpio) - nvmem->wp_gpio = config->wp_gpio; - else if (!config->ignore_wp) + if (!config->ignore_wp) nvmem->wp_gpio = gpiod_get_optional(config->dev, "wp", GPIOD_OUT_HIGH); if (IS_ERR(nvmem->wp_gpio)) { diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h index 50caa117cb62..bb15c9234e21 100644 --- a/include/linux/nvmem-provider.h +++ b/include/linux/nvmem-provider.h @@ -70,7 +70,6 @@ struct nvmem_keepout { * @word_size: Minimum read/write access granularity. * @stride: Minimum read/write access stride. * @priv: User context passed to read/write callbacks. - * @wp-gpio: Write protect pin * @ignore_wp: Write Protect pin is managed by the provider. * * Note: A default "nvmem" name will be assigned to the device if @@ -85,7 +84,6 @@ struct nvmem_config { const char *name; int id; struct module *owner; - struct gpio_desc *wp_gpio; const struct nvmem_cell_info *cells; int ncells; const struct nvmem_keepout *keepout; -- cgit v1.2.3 From a23eaf9368aafa4defcc8904b20391b6ea07bb1e Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Fri, 27 Jan 2023 07:54:48 +0800 Subject: KVM: arm64: Add helper vgic_write_guest_lock() Currently, the unknown no-running-vcpu sites are reported when a dirty page is tracked by mark_page_dirty_in_slot(). Until now, the only known no-running-vcpu site is saving vgic/its tables through KVM_DEV_ARM_{VGIC_GRP_CTRL, ITS_SAVE_TABLES} command on KVM device "kvm-arm-vgic-its". Unfortunately, there are more unknown sites to be handled and no-running-vcpu context will be allowed in these sites: (1) KVM_DEV_ARM_{VGIC_GRP_CTRL, ITS_RESTORE_TABLES} command on KVM device "kvm-arm-vgic-its" to restore vgic/its tables. The vgic3 LPI pending status could be restored. (2) Save vgic3 pending table through KVM_DEV_ARM_{VGIC_GRP_CTRL, VGIC_SAVE_PENDING_TABLES} command on KVM device "kvm-arm-vgic-v3". In order to handle those unknown cases, we need a unified helper vgic_write_guest_lock(). struct vgic_dist::save_its_tables_in_progress is also renamed to struct vgic_dist::save_tables_in_progress. No functional change intended. Suggested-by: Oliver Upton Signed-off-by: Gavin Shan Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230126235451.469087-3-gshan@redhat.com --- arch/arm64/kvm/vgic/vgic-its.c | 13 +++++-------- arch/arm64/kvm/vgic/vgic.h | 14 ++++++++++++++ include/kvm/arm_vgic.h | 2 +- 3 files changed, 20 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 94a666dd1443..2642e9ce2819 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -2187,7 +2187,7 @@ static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev, ((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) | ite->collection->collection_id; val = cpu_to_le64(val); - return kvm_write_guest_lock(kvm, gpa, &val, ite_esz); + return vgic_write_guest_lock(kvm, gpa, &val, ite_esz); } /** @@ -2339,7 +2339,7 @@ static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev, (itt_addr_field << KVM_ITS_DTE_ITTADDR_SHIFT) | (dev->num_eventid_bits - 1)); val = cpu_to_le64(val); - return kvm_write_guest_lock(kvm, ptr, &val, dte_esz); + return vgic_write_guest_lock(kvm, ptr, &val, dte_esz); } /** @@ -2526,7 +2526,7 @@ static int vgic_its_save_cte(struct vgic_its *its, ((u64)collection->target_addr << KVM_ITS_CTE_RDBASE_SHIFT) | collection->collection_id); val = cpu_to_le64(val); - return kvm_write_guest_lock(its->dev->kvm, gpa, &val, esz); + return vgic_write_guest_lock(its->dev->kvm, gpa, &val, esz); } /* @@ -2607,7 +2607,7 @@ static int vgic_its_save_collection_table(struct vgic_its *its) */ val = 0; BUG_ON(cte_esz > sizeof(val)); - ret = kvm_write_guest_lock(its->dev->kvm, gpa, &val, cte_esz); + ret = vgic_write_guest_lock(its->dev->kvm, gpa, &val, cte_esz); return ret; } @@ -2743,7 +2743,6 @@ static int vgic_its_has_attr(struct kvm_device *dev, static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr) { const struct vgic_its_abi *abi = vgic_its_get_abi(its); - struct vgic_dist *dist = &kvm->arch.vgic; int ret = 0; if (attr == KVM_DEV_ARM_VGIC_CTRL_INIT) /* Nothing to do */ @@ -2763,9 +2762,7 @@ static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr) vgic_its_reset(kvm, its); break; case KVM_DEV_ARM_ITS_SAVE_TABLES: - dist->save_its_tables_in_progress = true; ret = abi->save_tables(its); - dist->save_its_tables_in_progress = false; break; case KVM_DEV_ARM_ITS_RESTORE_TABLES: ret = abi->restore_tables(its); @@ -2792,7 +2789,7 @@ bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; - return dist->save_its_tables_in_progress; + return dist->table_write_in_progress; } static int vgic_its_set_attr(struct kvm_device *dev, diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 23e280fa0a16..7f7f3c5ed85a 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -6,6 +6,7 @@ #define __KVM_ARM_VGIC_NEW_H__ #include +#include #define PRODUCT_ID_KVM 0x4b /* ASCII code K */ #define IMPLEMENTER_ARM 0x43b @@ -131,6 +132,19 @@ static inline bool vgic_irq_is_multi_sgi(struct vgic_irq *irq) return vgic_irq_get_lr_count(irq) > 1; } +static inline int vgic_write_guest_lock(struct kvm *kvm, gpa_t gpa, + const void *data, unsigned long len) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + int ret; + + dist->table_write_in_progress = true; + ret = kvm_write_guest_lock(kvm, gpa, data, len); + dist->table_write_in_progress = false; + + return ret; +} + /* * This struct provides an intermediate representation of the fields contained * in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 9270cd87da3f..6470f67e63c4 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -263,7 +263,7 @@ struct vgic_dist { struct vgic_io_device dist_iodev; bool has_its; - bool save_its_tables_in_progress; + bool table_write_in_progress; /* * Contains the attributes and gpa of the LPI configuration table. -- cgit v1.2.3 From 4f64a6c9f6f11e8b7314f8e27e2c4568706009e6 Mon Sep 17 00:00:00 2001 From: James Clark Date: Fri, 27 Jan 2023 14:31:41 +0000 Subject: perf: Fix perf_event_pmu_context serialization Syzkaller triggered a WARN in put_pmu_ctx(). WARNING: CPU: 1 PID: 2245 at kernel/events/core.c:4925 put_pmu_ctx+0x1f0/0x278 This is because there is no locking around the access of "if (!epc->ctx)" in find_get_pmu_context() and when it is set to NULL in put_pmu_ctx(). The decrement of the reference count in put_pmu_ctx() also happens outside of the spinlock, leading to the possibility of this order of events, and the context being cleared in put_pmu_ctx(), after its refcount is non zero: CPU0 CPU1 find_get_pmu_context() if (!epc->ctx) == false put_pmu_ctx() atomic_dec_and_test(&epc->refcount) == true epc->refcount == 0 atomic_inc(&epc->refcount); epc->refcount == 1 list_del_init(&epc->pmu_ctx_entry); epc->ctx = NULL; Another issue is that WARN_ON for no active PMU events in put_pmu_ctx() is outside of the lock. If the perf_event_pmu_context is an embedded one, even after clearing it, it won't be deleted and can be re-used. So the warning can trigger. For this reason it also needs to be moved inside the lock. The above warning is very quick to trigger on Arm by running these two commands at the same time: while true; do perf record -- ls; done while true; do perf record -- ls; done [peterz: atomic_dec_and_raw_lock*()] Fixes: bd2756811766 ("perf: Rewrite core context handling") Reported-by: syzbot+697196bc0265049822bd@syzkaller.appspotmail.com Signed-off-by: James Clark Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ravi Bangoria Link: https://lore.kernel.org/r/20230127143141.1782804-2-james.clark@arm.com --- include/linux/spinlock.h | 9 +++++++++ kernel/events/core.c | 39 +++++++++++++++++---------------------- lib/dec_and_lock.c | 31 +++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 1341f7d62da4..be48f1cb1878 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -476,6 +476,15 @@ extern int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock, #define atomic_dec_and_lock_irqsave(atomic, lock, flags) \ __cond_lock(lock, _atomic_dec_and_lock_irqsave(atomic, lock, &(flags))) +extern int _atomic_dec_and_raw_lock(atomic_t *atomic, raw_spinlock_t *lock); +#define atomic_dec_and_raw_lock(atomic, lock) \ + __cond_lock(lock, _atomic_dec_and_raw_lock(atomic, lock)) + +extern int _atomic_dec_and_raw_lock_irqsave(atomic_t *atomic, raw_spinlock_t *lock, + unsigned long *flags); +#define atomic_dec_and_raw_lock_irqsave(atomic, lock, flags) \ + __cond_lock(lock, _atomic_dec_and_raw_lock_irqsave(atomic, lock, &(flags))) + int __alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *lock_mask, size_t max_size, unsigned int cpu_mult, gfp_t gfp, const char *name, diff --git a/kernel/events/core.c b/kernel/events/core.c index d56328e5080e..c4be13e50547 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4813,19 +4813,17 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu); epc = &cpc->epc; - + raw_spin_lock_irq(&ctx->lock); if (!epc->ctx) { atomic_set(&epc->refcount, 1); epc->embedded = 1; - raw_spin_lock_irq(&ctx->lock); list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); epc->ctx = ctx; - raw_spin_unlock_irq(&ctx->lock); } else { WARN_ON_ONCE(epc->ctx != ctx); atomic_inc(&epc->refcount); } - + raw_spin_unlock_irq(&ctx->lock); return epc; } @@ -4896,33 +4894,30 @@ static void free_epc_rcu(struct rcu_head *head) static void put_pmu_ctx(struct perf_event_pmu_context *epc) { + struct perf_event_context *ctx = epc->ctx; unsigned long flags; - if (!atomic_dec_and_test(&epc->refcount)) + /* + * XXX + * + * lockdep_assert_held(&ctx->mutex); + * + * can't because of the call-site in _free_event()/put_event() + * which isn't always called under ctx->mutex. + */ + if (!atomic_dec_and_raw_lock_irqsave(&epc->refcount, &ctx->lock, flags)) return; - if (epc->ctx) { - struct perf_event_context *ctx = epc->ctx; + WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry)); - /* - * XXX - * - * lockdep_assert_held(&ctx->mutex); - * - * can't because of the call-site in _free_event()/put_event() - * which isn't always called under ctx->mutex. - */ - - WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry)); - raw_spin_lock_irqsave(&ctx->lock, flags); - list_del_init(&epc->pmu_ctx_entry); - epc->ctx = NULL; - raw_spin_unlock_irqrestore(&ctx->lock, flags); - } + list_del_init(&epc->pmu_ctx_entry); + epc->ctx = NULL; WARN_ON_ONCE(!list_empty(&epc->pinned_active)); WARN_ON_ONCE(!list_empty(&epc->flexible_active)); + raw_spin_unlock_irqrestore(&ctx->lock, flags); + if (epc->embedded) return; diff --git a/lib/dec_and_lock.c b/lib/dec_and_lock.c index 9555b68bb774..1dcca8f2e194 100644 --- a/lib/dec_and_lock.c +++ b/lib/dec_and_lock.c @@ -49,3 +49,34 @@ int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock, return 0; } EXPORT_SYMBOL(_atomic_dec_and_lock_irqsave); + +int _atomic_dec_and_raw_lock(atomic_t *atomic, raw_spinlock_t *lock) +{ + /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ + if (atomic_add_unless(atomic, -1, 1)) + return 0; + + /* Otherwise do it the slow way */ + raw_spin_lock(lock); + if (atomic_dec_and_test(atomic)) + return 1; + raw_spin_unlock(lock); + return 0; +} +EXPORT_SYMBOL(_atomic_dec_and_raw_lock); + +int _atomic_dec_and_raw_lock_irqsave(atomic_t *atomic, raw_spinlock_t *lock, + unsigned long *flags) +{ + /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ + if (atomic_add_unless(atomic, -1, 1)) + return 0; + + /* Otherwise do it the slow way */ + raw_spin_lock_irqsave(lock, *flags); + if (atomic_dec_and_test(atomic)) + return 1; + raw_spin_unlock_irqrestore(lock, *flags); + return 0; +} +EXPORT_SYMBOL(_atomic_dec_and_raw_lock_irqsave); -- cgit v1.2.3 From 55ab834a86a9934c4f17825c115f7dc16a89aae7 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 16 Dec 2022 10:46:33 +0100 Subject: Revert "mm: add nodes= arg to memory.reclaim" This reverts commit 12a5d3955227b0d7e04fb793ccceeb2a1dd275c5. Although it is recognized that a finer grained pro-active reclaim is something we need and want the semantic of this implementation is really ambiguous. In a follow up discussion it became clear that there are two essential usecases here. One is to use memory.reclaim to pro-actively reclaim memory and expectation is that the requested and reported amount of memory is uncharged from the memcg. Another usecase focuses on pro-active demotion when the memory is merely shuffled around to demotion targets while the overall charged memory stays unchanged. The current implementation considers demoted pages as reclaimed and that break both usecases. [1] has tried to address the reporting part but there are more issues with that summarized in [2] and follow up emails. Let's revert the nodemask based extension of the memcg pro-active reclaim for now until we settle with a more robust semantic. [1] http://lkml.kernel.org/r/http://lkml.kernel.org/r/20221206023406.3182800-1-almasrymina@google.com [2] http://lkml.kernel.org/r/Y5bsmpCyeryu3Zz1@dhcp22.suse.cz Link: https://lkml.kernel.org/r/Y5xASNe1x8cusiTx@dhcp22.suse.cz Fixes: 12a5d3955227b0d ("mm: add nodes= arg to memory.reclaim") Signed-off-by: Michal Hocko Cc: Bagas Sanjaya Cc: Huang Ying Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Mina Almasry Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Tejun Heo Cc: Wei Xu Cc: Yang Shi Cc: Yosry Ahmed Cc: zefan li Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v2.rst | 15 +++----- include/linux/swap.h | 3 +- mm/memcontrol.c | 67 +++++++-------------------------- mm/vmscan.c | 4 +- 4 files changed, 21 insertions(+), 68 deletions(-) (limited to 'include') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index c8ae7c897f14..74cec76be9f2 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1245,13 +1245,17 @@ PAGE_SIZE multiple when read back. This is a simple interface to trigger memory reclaim in the target cgroup. - This file accepts a string which contains the number of bytes to - reclaim. + This file accepts a single key, the number of bytes to reclaim. + No nested keys are currently supported. Example:: echo "1G" > memory.reclaim + The interface can be later extended with nested keys to + configure the reclaim behavior. For example, specify the + type of memory to reclaim from (anon, file, ..). + Please note that the kernel can over or under reclaim from the target cgroup. If less bytes are reclaimed than the specified amount, -EAGAIN is returned. @@ -1263,13 +1267,6 @@ PAGE_SIZE multiple when read back. This means that the networking layer will not adapt based on reclaim induced by memory.reclaim. - This file also allows the user to specify the nodes to reclaim from, - via the 'nodes=' key, for example:: - - echo "1G nodes=0,1" > memory.reclaim - - The above instructs the kernel to reclaim memory from nodes 0,1. - memory.peak A read-only single value file which exists on non-root cgroups. diff --git a/include/linux/swap.h b/include/linux/swap.h index 2787b84eaf12..0ceed49516ad 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -418,8 +418,7 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - unsigned int reclaim_options, - nodemask_t *nodemask); + unsigned int reclaim_options); extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, pg_data_t *pgdat, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ab457f0394ab..73afff8062f9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -63,7 +63,6 @@ #include #include #include -#include #include "internal.h" #include #include @@ -2393,8 +2392,7 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg, psi_memstall_enter(&pflags); nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, - MEMCG_RECLAIM_MAY_SWAP, - NULL); + MEMCG_RECLAIM_MAY_SWAP); psi_memstall_leave(&pflags); } while ((memcg = parent_mem_cgroup(memcg)) && !mem_cgroup_is_root(memcg)); @@ -2685,8 +2683,7 @@ retry: psi_memstall_enter(&pflags); nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, - gfp_mask, reclaim_options, - NULL); + gfp_mask, reclaim_options); psi_memstall_leave(&pflags); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) @@ -3506,8 +3503,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg, } if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, - memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, - NULL)) { + memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) { ret = -EBUSY; break; } @@ -3618,8 +3614,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) return -EINTR; if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, - MEMCG_RECLAIM_MAY_SWAP, - NULL)) + MEMCG_RECLAIM_MAY_SWAP)) nr_retries--; } @@ -6429,8 +6424,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, } reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, - NULL); + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP); if (!reclaimed && !nr_retries--) break; @@ -6479,8 +6473,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (nr_reclaims) { if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, - NULL)) + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP)) nr_reclaims--; continue; } @@ -6603,54 +6596,21 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of, return nbytes; } -enum { - MEMORY_RECLAIM_NODES = 0, - MEMORY_RECLAIM_NULL, -}; - -static const match_table_t if_tokens = { - { MEMORY_RECLAIM_NODES, "nodes=%s" }, - { MEMORY_RECLAIM_NULL, NULL }, -}; - static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); unsigned int nr_retries = MAX_RECLAIM_RETRIES; unsigned long nr_to_reclaim, nr_reclaimed = 0; - unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP | - MEMCG_RECLAIM_PROACTIVE; - char *old_buf, *start; - substring_t args[MAX_OPT_ARGS]; - int token; - char value[256]; - nodemask_t nodemask = NODE_MASK_ALL; - - buf = strstrip(buf); - - old_buf = buf; - nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE; - if (buf == old_buf) - return -EINVAL; + unsigned int reclaim_options; + int err; buf = strstrip(buf); + err = page_counter_memparse(buf, "", &nr_to_reclaim); + if (err) + return err; - while ((start = strsep(&buf, " ")) != NULL) { - if (!strlen(start)) - continue; - token = match_token(start, if_tokens, args); - match_strlcpy(value, args, sizeof(value)); - switch (token) { - case MEMORY_RECLAIM_NODES: - if (nodelist_parse(value, nodemask) < 0) - return -EINVAL; - break; - default: - return -EINVAL; - } - } - + reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE; while (nr_reclaimed < nr_to_reclaim) { unsigned long reclaimed; @@ -6667,8 +6627,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_to_reclaim - nr_reclaimed, - GFP_KERNEL, reclaim_options, - &nodemask); + GFP_KERNEL, reclaim_options); if (!reclaimed && !nr_retries--) return -EAGAIN; diff --git a/mm/vmscan.c b/mm/vmscan.c index bd6637fcd8f9..e83d2a74e942 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -6754,8 +6754,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - unsigned int reclaim_options, - nodemask_t *nodemask) + unsigned int reclaim_options) { unsigned long nr_reclaimed; unsigned int noreclaim_flag; @@ -6770,7 +6769,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .may_unmap = 1, .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP), .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE), - .nodemask = nodemask, }; /* * Traverse the ZONELIST_FALLBACK zonelist of the current node to put -- cgit v1.2.3 From 3489dbb696d25602aea8c3e669a6d43b76bd5358 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Thu, 26 Jan 2023 14:27:20 -0800 Subject: mm: hugetlb: proc: check for hugetlb shared PMD in /proc/PID/smaps Patch series "Fixes for hugetlb mapcount at most 1 for shared PMDs". This issue of mapcount in hugetlb pages referenced by shared PMDs was discussed in [1]. The following two patches address user visible behavior caused by this issue. [1] https://lore.kernel.org/linux-mm/Y9BF+OCdWnCSilEu@monkey/ This patch (of 2): A hugetlb page will have a mapcount of 1 if mapped by multiple processes via a shared PMD. This is because only the first process increases the map count, and subsequent processes just add the shared PMD page to their page table. page_mapcount is being used to decide if a hugetlb page is shared or private in /proc/PID/smaps. Pages referenced via a shared PMD were incorrectly being counted as private. To fix, check for a shared PMD if mapcount is 1. If a shared PMD is found count the hugetlb page as shared. A new helper to check for a shared PMD is added. [akpm@linux-foundation.org: simplification, per David] [akpm@linux-foundation.org: hugetlb.h: include page_ref.h for page_count()] Link: https://lkml.kernel.org/r/20230126222721.222195-2-mike.kravetz@oracle.com Fixes: 25ee01a2fca0 ("mm: hugetlb: proc: add hugetlb-related fields to /proc/PID/smaps") Signed-off-by: Mike Kravetz Acked-by: Peter Xu Cc: David Hildenbrand Cc: James Houghton Cc: Matthew Wilcox Cc: Michal Hocko Cc: Muchun Song Cc: Naoya Horiguchi Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 4 +--- include/linux/hugetlb.h | 13 +++++++++++++ 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e35a0398db63..af1c49ae11b1 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -745,9 +745,7 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, page = pfn_swap_entry_to_page(swpent); } if (page) { - int mapcount = page_mapcount(page); - - if (mapcount >= 2) + if (page_mapcount(page) >= 2 || hugetlb_pmd_shared(pte)) mss->shared_hugetlb += huge_page_size(hstate_vma(vma)); else mss->private_hugetlb += huge_page_size(hstate_vma(vma)); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 551834cd5299..db194e2ba69f 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -1187,6 +1188,18 @@ static inline __init void hugetlb_cma_reserve(int order) } #endif +#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE +static inline bool hugetlb_pmd_shared(pte_t *pte) +{ + return page_count(virt_to_page(pte)) > 1; +} +#else +static inline bool hugetlb_pmd_shared(pte_t *pte) +{ + return false; +} +#endif + bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr); #ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE -- cgit v1.2.3 From 88d7b12068b95731c280af8ce88e8ee9561f96de Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 26 Jan 2023 20:07:27 +0000 Subject: highmem: round down the address passed to kunmap_flush_on_unmap() We already round down the address in kunmap_local_indexed() which is the other implementation of __kunmap_local(). The only implementation of kunmap_flush_on_unmap() is PA-RISC which is expecting a page-aligned address. This may be causing PA-RISC to be flushing the wrong addresses currently. Link: https://lkml.kernel.org/r/20230126200727.1680362-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Fixes: 298fa1ad5571 ("highmem: Provide generic variant of kmap_atomic*") Reviewed-by: Ira Weiny Cc: "Fabio M. De Francesco" Cc: Al Viro Cc: Thomas Gleixner Cc: Helge Deller Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Bagas Sanjaya Cc: David Sterba Cc: Kees Cook Cc: Sebastian Andrzej Siewior Cc: Tony Luck Cc: Signed-off-by: Andrew Morton --- include/linux/highmem-internal.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index 034b1106d022..e098f38422af 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -200,7 +200,7 @@ static inline void *kmap_local_pfn(unsigned long pfn) static inline void __kunmap_local(const void *addr) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP - kunmap_flush_on_unmap(addr); + kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE)); #endif } @@ -227,7 +227,7 @@ static inline void *kmap_atomic_pfn(unsigned long pfn) static inline void __kunmap_atomic(const void *addr) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP - kunmap_flush_on_unmap(addr); + kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE)); #endif pagefault_enable(); if (IS_ENABLED(CONFIG_PREEMPT_RT)) -- cgit v1.2.3 From ac86f547ca1002aec2ef66b9e64d03f45bbbfbb9 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Sun, 29 Jan 2023 12:09:45 +0800 Subject: mm: memcg: fix NULL pointer in mem_cgroup_track_foreign_dirty_slowpath() As commit 18365225f044 ("hwpoison, memcg: forcibly uncharge LRU pages"), hwpoison will forcibly uncharg a LRU hwpoisoned page, the folio_memcg could be NULl, then, mem_cgroup_track_foreign_dirty_slowpath() could occurs a NULL pointer dereference, let's do not record the foreign writebacks for folio memcg is null in mem_cgroup_track_foreign_dirty() to fix it. Link: https://lkml.kernel.org/r/20230129040945.180629-1-wangkefeng.wang@huawei.com Fixes: 97b27821b485 ("writeback, memcg: Implement foreign dirty flushing") Signed-off-by: Kefeng Wang Reported-by: Ma Wupeng Tested-by: Miko Larsson Acked-by: Michal Hocko Cc: Jan Kara Cc: Jens Axboe Cc: Kefeng Wang Cc: Ma Wupeng Cc: Naoya Horiguchi Cc: Shakeel Butt Cc: Tejun Heo Cc: Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d3c8203cab6c..85dc9b88ea37 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1666,10 +1666,13 @@ void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio, static inline void mem_cgroup_track_foreign_dirty(struct folio *folio, struct bdi_writeback *wb) { + struct mem_cgroup *memcg; + if (mem_cgroup_disabled()) return; - if (unlikely(&folio_memcg(folio)->css != wb->memcg_css)) + memcg = folio_memcg(folio); + if (unlikely(memcg && &memcg->css != wb->memcg_css)) mem_cgroup_track_foreign_dirty_slowpath(folio, wb); } -- cgit v1.2.3 From b38b17b6a01ca4e738af097a1529910646ef4270 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 1 Feb 2023 09:36:44 +0800 Subject: ceph: move mount state enum to super.h These flags are only used in ceph filesystem in fs/ceph, so just move it to the place it should be. Signed-off-by: Xiubo Li Reviewed-by: Venky Shankar Signed-off-by: Ilya Dryomov --- fs/ceph/super.h | 10 ++++++++++ include/linux/ceph/libceph.h | 10 ---------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 0ed3be75bb9a..cd95b426ee00 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -100,6 +100,16 @@ struct ceph_mount_options { char *mon_addr; }; +/* mount state */ +enum { + CEPH_MOUNT_MOUNTING, + CEPH_MOUNT_MOUNTED, + CEPH_MOUNT_UNMOUNTING, + CEPH_MOUNT_UNMOUNTED, + CEPH_MOUNT_SHUTDOWN, + CEPH_MOUNT_RECOVER, +}; + #define CEPH_ASYNC_CREATE_CONFLICT_BITS 8 struct ceph_fs_client { diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 00af2c98da75..4497d0a6772c 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -99,16 +99,6 @@ struct ceph_options { #define CEPH_AUTH_NAME_DEFAULT "guest" -/* mount state */ -enum { - CEPH_MOUNT_MOUNTING, - CEPH_MOUNT_MOUNTED, - CEPH_MOUNT_UNMOUNTING, - CEPH_MOUNT_UNMOUNTED, - CEPH_MOUNT_SHUTDOWN, - CEPH_MOUNT_RECOVER, -}; - static inline unsigned long ceph_timeout_jiffies(unsigned long timeout) { return timeout ?: MAX_SCHEDULE_TIMEOUT; -- cgit v1.2.3 From 03702d4d29be4e2510ec80b248dbbde4e57030d9 Mon Sep 17 00:00:00 2001 From: "Herton R. Krzesinski" Date: Fri, 3 Feb 2023 13:04:48 -0300 Subject: uapi: add missing ip/ipv6 header dependencies for linux/stddef.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit 58e0be1ef6118 ("net: use struct_group to copy ip/ipv6 header addresses"), ip and ipv6 headers started to use the __struct_group definition, which is defined at include/uapi/linux/stddef.h. However, linux/stddef.h isn't explicitly included in include/uapi/linux/{ip,ipv6}.h, which breaks build of xskxceiver bpf selftest if you install the uapi headers in the system: $ make V=1 xskxceiver -C tools/testing/selftests/bpf ... make: Entering directory '(...)/tools/testing/selftests/bpf' gcc -g -O0 -rdynamic -Wall -Werror (...) In file included from xskxceiver.c:79: /usr/include/linux/ip.h:103:9: error: expected specifier-qualifier-list before ‘__struct_group’ 103 | __struct_group(/* no tag */, addrs, /* no attrs */, | ^~~~~~~~~~~~~~ ... Include the missing dependency in ip.h and do the same for the ipv6.h header. Fixes: 58e0be1ef611 ("net: use struct_group to copy ip/ipv6 header addresses") Signed-off-by: Herton R. Krzesinski Reviewed-by: Carlos O'Donell Tested-by: Carlos O'Donell Signed-off-by: David S. Miller --- include/uapi/linux/ip.h | 1 + include/uapi/linux/ipv6.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h index 874a92349bf5..283dec7e3645 100644 --- a/include/uapi/linux/ip.h +++ b/include/uapi/linux/ip.h @@ -18,6 +18,7 @@ #ifndef _UAPI_LINUX_IP_H #define _UAPI_LINUX_IP_H #include +#include #include #define IPTOS_TOS_MASK 0x1E diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 81f4243bebb1..53326dfc59ec 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -4,6 +4,7 @@ #include #include +#include #include #include -- cgit v1.2.3 From c3bdbaea654d8df39112de33037106134a520dc7 Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Sun, 22 Jan 2023 21:09:40 +0200 Subject: net/mlx5: Store page counters in a single array Currently, an independent page counter is used for tracking memory usage for each function type such as VF, PF and host PF (DPU). For better code-readibilty, use a single array that stores the number of allocated memory pages for each function type. Signed-off-by: Maher Sanalla Reviewed-by: Shay Drory Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/debugfs.c | 4 +-- drivers/net/ethernet/mellanox/mlx5/core/ecpf.c | 2 +- .../net/ethernet/mellanox/mlx5/core/pagealloc.c | 37 ++++++++++++---------- drivers/net/ethernet/mellanox/mlx5/core/sriov.c | 2 +- include/linux/mlx5/driver.h | 12 +++++-- 5 files changed, 34 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c index 3e232a65a0c3..c3e7c24a0971 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c @@ -245,8 +245,8 @@ void mlx5_pages_debugfs_init(struct mlx5_core_dev *dev) pages = dev->priv.dbg.pages_debugfs; debugfs_create_u32("fw_pages_total", 0400, pages, &dev->priv.fw_pages); - debugfs_create_u32("fw_pages_vfs", 0400, pages, &dev->priv.vfs_pages); - debugfs_create_u32("fw_pages_host_pf", 0400, pages, &dev->priv.host_pf_pages); + debugfs_create_u32("fw_pages_vfs", 0400, pages, &dev->priv.page_counters[MLX5_VF]); + debugfs_create_u32("fw_pages_host_pf", 0400, pages, &dev->priv.page_counters[MLX5_HOST_PF]); debugfs_create_u32("fw_pages_alloc_failed", 0400, pages, &dev->priv.fw_pages_alloc_failed); debugfs_create_u32("fw_pages_give_dropped", 0400, pages, &dev->priv.give_pages_dropped); debugfs_create_u32("fw_pages_reclaim_discard", 0400, pages, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c b/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c index 464eb3a18450..cdc87ecae5d3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c @@ -87,7 +87,7 @@ void mlx5_ec_cleanup(struct mlx5_core_dev *dev) mlx5_host_pf_cleanup(dev); - err = mlx5_wait_for_pages(dev, &dev->priv.host_pf_pages); + err = mlx5_wait_for_pages(dev, &dev->priv.page_counters[MLX5_HOST_PF]); if (err) mlx5_core_warn(dev, "Timeout reclaiming external host PF pages err(%d)\n", err); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c index 60596357bfc7..9f99292ab5ce 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c @@ -74,6 +74,14 @@ static u32 get_function(u16 func_id, bool ec_function) return (u32)func_id | (ec_function << 16); } +static u16 func_id_to_type(struct mlx5_core_dev *dev, u16 func_id, bool ec_function) +{ + if (!func_id) + return mlx5_core_is_ecpf(dev) && !ec_function ? MLX5_HOST_PF : MLX5_PF; + + return MLX5_VF; +} + static struct rb_root *page_root_per_function(struct mlx5_core_dev *dev, u32 function) { struct rb_root *root; @@ -332,6 +340,7 @@ static int give_pages(struct mlx5_core_dev *dev, u16 func_id, int npages, u32 out[MLX5_ST_SZ_DW(manage_pages_out)] = {0}; int inlen = MLX5_ST_SZ_BYTES(manage_pages_in); int notify_fail = event; + u16 func_type; u64 addr; int err; u32 *in; @@ -383,11 +392,9 @@ retry: goto out_dropped; } + func_type = func_id_to_type(dev, func_id, ec_function); + dev->priv.page_counters[func_type] += npages; dev->priv.fw_pages += npages; - if (func_id) - dev->priv.vfs_pages += npages; - else if (mlx5_core_is_ecpf(dev) && !ec_function) - dev->priv.host_pf_pages += npages; mlx5_core_dbg(dev, "npages %d, ec_function %d, func_id 0x%x, err %d\n", npages, ec_function, func_id, err); @@ -414,6 +421,7 @@ static void release_all_pages(struct mlx5_core_dev *dev, u16 func_id, struct rb_root *root; struct rb_node *p; int npages = 0; + u16 func_type; root = xa_load(&dev->priv.page_root_xa, function); if (WARN_ON_ONCE(!root)) @@ -428,11 +436,9 @@ static void release_all_pages(struct mlx5_core_dev *dev, u16 func_id, free_fwp(dev, fwp, fwp->free_count); } + func_type = func_id_to_type(dev, func_id, ec_function); + dev->priv.page_counters[func_type] -= npages; dev->priv.fw_pages -= npages; - if (func_id) - dev->priv.vfs_pages -= npages; - else if (mlx5_core_is_ecpf(dev) && !ec_function) - dev->priv.host_pf_pages -= npages; mlx5_core_dbg(dev, "npages %d, ec_function %d, func_id 0x%x\n", npages, ec_function, func_id); @@ -498,6 +504,7 @@ static int reclaim_pages(struct mlx5_core_dev *dev, u16 func_id, int npages, int outlen = MLX5_ST_SZ_BYTES(manage_pages_out); u32 in[MLX5_ST_SZ_DW(manage_pages_in)] = {}; int num_claimed; + u16 func_type; u32 *out; int err; int i; @@ -549,11 +556,9 @@ static int reclaim_pages(struct mlx5_core_dev *dev, u16 func_id, int npages, if (nclaimed) *nclaimed = num_claimed; + func_type = func_id_to_type(dev, func_id, ec_function); + dev->priv.page_counters[func_type] -= num_claimed; dev->priv.fw_pages -= num_claimed; - if (func_id) - dev->priv.vfs_pages -= num_claimed; - else if (mlx5_core_is_ecpf(dev) && !ec_function) - dev->priv.host_pf_pages -= num_claimed; out_free: kvfree(out); @@ -706,12 +711,12 @@ int mlx5_reclaim_startup_pages(struct mlx5_core_dev *dev) WARN(dev->priv.fw_pages, "FW pages counter is %d after reclaiming all pages\n", dev->priv.fw_pages); - WARN(dev->priv.vfs_pages, + WARN(dev->priv.page_counters[MLX5_VF], "VFs FW pages counter is %d after reclaiming all pages\n", - dev->priv.vfs_pages); - WARN(dev->priv.host_pf_pages, + dev->priv.page_counters[MLX5_VF]); + WARN(dev->priv.page_counters[MLX5_HOST_PF], "External host PF FW pages counter is %d after reclaiming all pages\n", - dev->priv.host_pf_pages); + dev->priv.page_counters[MLX5_HOST_PF]); return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c index c0e6c487c63c..3008e9ce2bbf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c @@ -147,7 +147,7 @@ mlx5_device_disable_sriov(struct mlx5_core_dev *dev, int num_vfs, bool clear_vf) mlx5_eswitch_disable_sriov(dev->priv.eswitch, clear_vf); - if (mlx5_wait_for_pages(dev, &dev->priv.vfs_pages)) + if (mlx5_wait_for_pages(dev, &dev->priv.page_counters[MLX5_VF])) mlx5_core_warn(dev, "timeout reclaiming VFs pages\n"); } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 76ef2e4fde38..82a9bd4274b8 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -573,6 +573,13 @@ struct mlx5_debugfs_entries { struct dentry *lag_debugfs; }; +enum mlx5_func_type { + MLX5_PF, + MLX5_VF, + MLX5_HOST_PF, + MLX5_FUNC_TYPE_NUM, +}; + struct mlx5_ft_pool; struct mlx5_priv { /* IRQ table valid only for real pci devices PF or VF */ @@ -583,11 +590,10 @@ struct mlx5_priv { struct mlx5_nb pg_nb; struct workqueue_struct *pg_wq; struct xarray page_root_xa; - u32 fw_pages; atomic_t reg_pages; struct list_head free_list; - u32 vfs_pages; - u32 host_pf_pages; + u32 fw_pages; + u32 page_counters[MLX5_FUNC_TYPE_NUM]; u32 fw_pages_alloc_failed; u32 give_pages_dropped; u32 reclaim_pages_discard; -- cgit v1.2.3 From 9965bbebae59b3563a4d95e4aed121e8965dfdc2 Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Sun, 22 Jan 2023 23:24:56 +0200 Subject: net/mlx5: Expose SF firmware pages counter Currently, each core device has VF pages counter which stores number of fw pages used by its VFs and SFs. The current design led to a hang when performing firmware reset on DPU, where the DPU PFs stalled in sriov unload flow due to waiting on release of SFs pages instead of waiting on only VFs pages. Thus, Add a separate counter for SF firmware pages, which will prevent the stall scenario described above. Fixes: 1958fc2f0712 ("net/mlx5: SF, Add auxiliary device driver") Signed-off-by: Maher Sanalla Reviewed-by: Shay Drory Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/debugfs.c | 1 + drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c | 2 +- include/linux/mlx5/driver.h | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c index c3e7c24a0971..bb95b40d25eb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c @@ -246,6 +246,7 @@ void mlx5_pages_debugfs_init(struct mlx5_core_dev *dev) debugfs_create_u32("fw_pages_total", 0400, pages, &dev->priv.fw_pages); debugfs_create_u32("fw_pages_vfs", 0400, pages, &dev->priv.page_counters[MLX5_VF]); + debugfs_create_u32("fw_pages_sfs", 0400, pages, &dev->priv.page_counters[MLX5_SF]); debugfs_create_u32("fw_pages_host_pf", 0400, pages, &dev->priv.page_counters[MLX5_HOST_PF]); debugfs_create_u32("fw_pages_alloc_failed", 0400, pages, &dev->priv.fw_pages_alloc_failed); debugfs_create_u32("fw_pages_give_dropped", 0400, pages, &dev->priv.give_pages_dropped); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c index 9f99292ab5ce..0eb50be175cc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c @@ -79,7 +79,7 @@ static u16 func_id_to_type(struct mlx5_core_dev *dev, u16 func_id, bool ec_funct if (!func_id) return mlx5_core_is_ecpf(dev) && !ec_function ? MLX5_HOST_PF : MLX5_PF; - return MLX5_VF; + return func_id <= mlx5_core_max_vfs(dev) ? MLX5_VF : MLX5_SF; } static struct rb_root *page_root_per_function(struct mlx5_core_dev *dev, u32 function) diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 82a9bd4274b8..333c1fec72f8 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -576,6 +576,7 @@ struct mlx5_debugfs_entries { enum mlx5_func_type { MLX5_PF, MLX5_VF, + MLX5_SF, MLX5_HOST_PF, MLX5_FUNC_TYPE_NUM, }; -- cgit v1.2.3