From 9b12f050c76f090cc6d0aebe0ef76fed79ec3f15 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 22 Feb 2023 10:23:02 +0100 Subject: char: pcmcia: remove all the drivers These char PCMCIA drivers are buggy[1] and receive only minimal care. It was concluded[2], that we should try to remove most pcmcia drivers completely. Let's start with these char broken one. Note that I also removed a UAPI header: include/uapi/linux/cm4000_cs.h. I found only coccinelle tests mentioning some ioctl constants from that file. But they are not actually used. Anyway, should someone complain, we may reintroduce the header (or its parts). [1] https://lore.kernel.org/all/f41c2765-80e0-48bc-b1e4-8cfd3230fd4a@www.fastmail.com/ [2] https://lore.kernel.org/all/c5b39544-a4fb-4796-a046-0b9be9853787@app.fastmail.com/ Signed-off-by: Jiri Slaby (SUSE) Cc: "Hyunwoo Kim" Cc: Harald Welte Cc: Lubomir Rintel Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Acked-by: Dominik Brodowski Reviewed-by: Arnd Bergmann Link: https://lore.kernel.org/r/20230222092302.6348-2-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/cm4000_cs.h | 64 ------------------------------------------ 1 file changed, 64 deletions(-) delete mode 100644 include/uapi/linux/cm4000_cs.h (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/cm4000_cs.h b/include/uapi/linux/cm4000_cs.h deleted file mode 100644 index c70a62ec8a49..000000000000 --- a/include/uapi/linux/cm4000_cs.h +++ /dev/null @@ -1,64 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _UAPI_CM4000_H_ -#define _UAPI_CM4000_H_ - -#include -#include - -#define MAX_ATR 33 - -#define CM4000_MAX_DEV 4 - -/* those two structures are passed via ioctl() from/to userspace. They are - * used by existing userspace programs, so I kepth the awkward "bIFSD" naming - * not to break compilation of userspace apps. -HW */ - -typedef struct atreq { - __s32 atr_len; - unsigned char atr[64]; - __s32 power_act; - unsigned char bIFSD; - unsigned char bIFSC; -} atreq_t; - - -/* what is particularly stupid in the original driver is the arch-dependent - * member sizes. This leads to CONFIG_COMPAT breakage, since 32bit userspace - * will lay out the structure members differently than the 64bit kernel. - * - * I've changed "ptsreq.protocol" from "unsigned long" to "__u32". - * On 32bit this will make no difference. With 64bit kernels, it will make - * 32bit apps work, too. 
- */ - -typedef struct ptsreq { - __u32 protocol; /*T=0: 2^0, T=1: 2^1*/ - unsigned char flags; - unsigned char pts1; - unsigned char pts2; - unsigned char pts3; -} ptsreq_t; - -#define CM_IOC_MAGIC 'c' -#define CM_IOC_MAXNR 255 - -#define CM_IOCGSTATUS _IOR (CM_IOC_MAGIC, 0, unsigned char *) -#define CM_IOCGATR _IOWR(CM_IOC_MAGIC, 1, atreq_t *) -#define CM_IOCSPTS _IOW (CM_IOC_MAGIC, 2, ptsreq_t *) -#define CM_IOCSRDR _IO (CM_IOC_MAGIC, 3) -#define CM_IOCARDOFF _IO (CM_IOC_MAGIC, 4) - -#define CM_IOSDBGLVL _IOW(CM_IOC_MAGIC, 250, int*) - -/* card and device states */ -#define CM_CARD_INSERTED 0x01 -#define CM_CARD_POWERED 0x02 -#define CM_ATR_PRESENT 0x04 -#define CM_ATR_VALID 0x08 -#define CM_STATE_VALID 0x0f -/* extra info only from CM4000 */ -#define CM_NO_READER 0x10 -#define CM_BAD_CARD 0x20 - - -#endif /* _UAPI_CM4000_H_ */ -- cgit v1.2.3 From c5edd753a0bd6243a597f5199c227a50457ee179 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Wed, 8 Feb 2023 15:01:02 +0100 Subject: KVM: x86: Remove the KVM_GET_NR_MMU_PAGES ioctl The KVM_GET_NR_MMU_PAGES ioctl is quite questionable on 64-bit hosts since it fails to return the full 64 bits of the value that can be set with the corresponding KVM_SET_NR_MMU_PAGES call. Its "long" return value is truncated into an "int" in the kvm_arch_vm_ioctl() function. Since this ioctl also never has been used by userspace applications (QEMU, Google's internal VMM, kvmtool and CrosVM have been checked), it's likely the best if we remove this badly designed ioctl before anybody really tries to use it. Signed-off-by: Thomas Huth Reviewed-by: Sean Christopherson Message-Id: <20230208140105.655814-4-thuth@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 8 -------- include/uapi/linux/kvm.h | 2 +- tools/include/uapi/linux/kvm.h | 2 +- 3 files changed, 2 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ff7f398a0c6a..4282daeaee84 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6021,11 +6021,6 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, return 0; } -static unsigned long kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) -{ - return kvm->arch.n_max_mmu_pages; -} - static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) { struct kvm_pic *pic = kvm->arch.vpic; @@ -6711,9 +6706,6 @@ set_identity_unlock: case KVM_SET_NR_MMU_PAGES: r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); break; - case KVM_GET_NR_MMU_PAGES: - r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); - break; case KVM_CREATE_IRQCHIP: { mutex_lock(&kvm->lock); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index d77aef872a0a..4003a166328c 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1451,7 +1451,7 @@ struct kvm_vfio_spapr_tce { #define KVM_CREATE_VCPU _IO(KVMIO, 0x41) #define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) #define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44) -#define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45) +#define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45) /* deprecated */ #define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46, \ struct kvm_userspace_memory_region) #define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index d77aef872a0a..4003a166328c 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1451,7 +1451,7 @@ struct kvm_vfio_spapr_tce { #define KVM_CREATE_VCPU _IO(KVMIO, 0x41) #define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, 
struct kvm_dirty_log) #define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44) -#define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45) +#define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45) /* deprecated */ #define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46, \ struct kvm_userspace_memory_region) #define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) -- cgit v1.2.3 From 1fb1ea0d9cb8359ae9d6f67f22666c74d5b9f47d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 10 Mar 2023 19:07:47 +0200 Subject: mei: Move uuid.h to the MEI namespace There is only a single user of the UUID uAPI, let's make it part of that user. The way it's done is to prevent compilation time breakage for the user space that does #include In the future MEI user space tools can switch over to use mei_uuid.h. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20230310170747.22782-1-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- MAINTAINERS | 1 + drivers/misc/mei/bus-fixup.c | 2 +- drivers/misc/mei/hdcp/mei_hdcp.c | 2 +- drivers/misc/mei/hw.h | 2 +- drivers/misc/mei/main.c | 1 - drivers/misc/mei/pxp/mei_pxp.c | 2 +- include/linux/mod_devicetable.h | 1 + include/linux/uuid.h | 3 --- include/uapi/linux/mei.h | 2 +- include/uapi/linux/mei_uuid.h | 29 +++++++++++++++++++++++++++++ include/uapi/linux/uuid.h | 31 +------------------------------ 11 files changed, 37 insertions(+), 39 deletions(-) create mode 100644 include/uapi/linux/mei_uuid.h (limited to 'include/uapi/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 7ee30a356e89..619c30ec36dc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10459,6 +10459,7 @@ F: drivers/watchdog/mei_wdt.c F: include/linux/mei_aux.h F: include/linux/mei_cl_bus.h F: include/uapi/linux/mei.h +F: include/uapi/linux/mei_uuid.h F: include/uapi/linux/uuid.h F: samples/mei/* diff --git a/drivers/misc/mei/bus-fixup.c b/drivers/misc/mei/bus-fixup.c index 211536109308..31e3c74ca1f1 100644 --- a/drivers/misc/mei/bus-fixup.c +++ b/drivers/misc/mei/bus-fixup.c @@ -9,8 +9,8 @@ #include #include #include -#include +#include #include #include "mei_dev.h" diff --git a/drivers/misc/mei/hdcp/mei_hdcp.c b/drivers/misc/mei/hdcp/mei_hdcp.c index e0dcd5c114db..45e3d4d27797 100644 --- a/drivers/misc/mei/hdcp/mei_hdcp.c +++ b/drivers/misc/mei/hdcp/mei_hdcp.c @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/misc/mei/hw.h b/drivers/misc/mei/hw.h index 319418ddf4fb..e910302fcd1f 100644 --- a/drivers/misc/mei/hw.h +++ b/drivers/misc/mei/hw.h @@ -7,7 +7,7 @@ #ifndef _MEI_HW_TYPES_H_ #define _MEI_HW_TYPES_H_ -#include +#include /* * Timeouts in Seconds diff --git a/drivers/misc/mei/main.c b/drivers/misc/mei/main.c index 632d4ae21e46..c64291741d73 100644 --- a/drivers/misc/mei/main.c +++ b/drivers/misc/mei/main.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/misc/mei/pxp/mei_pxp.c b/drivers/misc/mei/pxp/mei_pxp.c index 7ee1fa7b1cb3..3bf560bbdee0 100644 --- a/drivers/misc/mei/pxp/mei_pxp.c +++ b/drivers/misc/mei/pxp/mei_pxp.c @@ -13,7 +13,7 @@ #include #include -#include +#include #include #include #include diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 549590e9c644..35203ca3fc37 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -9,6 +9,7 @@ #define LINUX_MOD_DEVICETABLE_H #ifdef __KERNEL__ +#include #include #include typedef unsigned long kernel_ulong_t; diff --git a/include/linux/uuid.h b/include/linux/uuid.h index 6b1a3efa1e0b..43d4a79b273d 
100644 --- a/include/linux/uuid.h +++ b/include/linux/uuid.h @@ -107,7 +107,4 @@ extern const u8 uuid_index[16]; int guid_parse(const char *uuid, guid_t *u); int uuid_parse(const char *uuid, uuid_t *u); -/* MEI UUID type, don't use anywhere else */ -#include - #endif diff --git a/include/uapi/linux/mei.h b/include/uapi/linux/mei.h index 4f3638489d01..6e57743628c0 100644 --- a/include/uapi/linux/mei.h +++ b/include/uapi/linux/mei.h @@ -7,7 +7,7 @@ #ifndef _LINUX_MEI_H #define _LINUX_MEI_H -#include +#include /* * This IOCTL is used to associate the current file descriptor with a diff --git a/include/uapi/linux/mei_uuid.h b/include/uapi/linux/mei_uuid.h new file mode 100644 index 000000000000..676ebe12d623 --- /dev/null +++ b/include/uapi/linux/mei_uuid.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * MEI UUID definition + * + * Copyright (C) 2010, Intel Corp. + * Huang Ying + */ + +#ifndef _UAPI_LINUX_MEI_UUID_H_ +#define _UAPI_LINUX_MEI_UUID_H_ + +#include + +typedef struct { + __u8 b[16]; +} uuid_le; + +#define UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \ +((uuid_le) \ +{{ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \ + (b) & 0xff, ((b) >> 8) & 0xff, \ + (c) & 0xff, ((c) >> 8) & 0xff, \ + (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }}) + +#define NULL_UUID_LE \ + UUID_LE(0x00000000, 0x0000, 0x0000, 0x00, 0x00, 0x00, 0x00, \ + 0x00, 0x00, 0x00, 0x00) + +#endif /* _UAPI_LINUX_MEI_UUID_H_ */ diff --git a/include/uapi/linux/uuid.h b/include/uapi/linux/uuid.h index 96ac684a4b2f..8443738f4bb2 100644 --- a/include/uapi/linux/uuid.h +++ b/include/uapi/linux/uuid.h @@ -1,30 +1 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* DO NOT USE in new code! This is solely for MEI due to legacy reasons */ -/* - * MEI UUID definition - * - * Copyright (C) 2010, Intel Corp. - * Huang Ying - */ - -#ifndef _UAPI_LINUX_UUID_H_ -#define _UAPI_LINUX_UUID_H_ - -#include - -typedef struct { - __u8 b[16]; -} uuid_le; - -#define UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \ -((uuid_le) \ -{{ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \ - (b) & 0xff, ((b) >> 8) & 0xff, \ - (c) & 0xff, ((c) >> 8) & 0xff, \ - (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }}) - -#define NULL_UUID_LE \ - UUID_LE(0x00000000, 0x0000, 0x0000, 0x00, 0x00, 0x00, 0x00, \ - 0x00, 0x00, 0x00, 0x00) - -#endif /* _UAPI_LINUX_UUID_H_ */ +#include -- cgit v1.2.3 From 4ca589661d964840d0d5de4b3baabbef78f453e3 Mon Sep 17 00:00:00 2001 From: Daniel Starke Date: Wed, 15 Mar 2023 11:53:52 +0100 Subject: tty: n_gsm: add ioctl for DLC specific parameter configuration Parameter negotiation has been introduced with commit 92f1f0c3290d ("tty: n_gsm: add parameter negotiation support") However, means to set individual parameters per DLCI are not yet implemented. Furthermore, it is currently not possible to keep a DLCI half open until the user application sets the right parameters for it. This is required to allow a user application to set its specific parameters before the underlying link is established. Otherwise, the link is opened and re-established right afterwards if the user application sets incompatible parameters. This may be an unexpected behavior for the peer. Add parameter 'wait_config' to 'gsm_config' to support setups where the DLCI specific user application sets its specific parameters after open() and before the link gets fully established. 
Setting this to zero disables the user application specific DLCI configuration option. Add the ioctls 'GSMIOC_GETCONF_DLCI' and 'GSMIOC_SETCONF_DLCI' for the ldisc and virtual ttys. This gets/sets the DLCI specific parameters and may trigger a reconnect of the DLCI if incompatible values have been set. Only the parameters for the DLCI associated with the virtual tty can be set or retrieved if called on these. Add remark within the documentation to introduce the new ioctls. Link: https://lore.kernel.org/oe-kbuild-all/202302281856.S9Lz4gHB-lkp@intel.com/ Reported-by: kernel test robot Signed-off-by: Daniel Starke Link: https://lore.kernel.org/r/20230315105354.6234-1-daniel.starke@siemens.com Signed-off-by: Greg Kroah-Hartman --- Documentation/driver-api/tty/n_gsm.rst | 4 + drivers/tty/n_gsm.c | 192 ++++++++++++++++++++++++++++++++- include/uapi/linux/gsmmux.h | 17 ++- 3 files changed, 207 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/driver-api/tty/n_gsm.rst b/Documentation/driver-api/tty/n_gsm.rst index 9447b8a3b8e2..c2624546fb8f 100644 --- a/Documentation/driver-api/tty/n_gsm.rst +++ b/Documentation/driver-api/tty/n_gsm.rst @@ -29,6 +29,8 @@ Config Initiator #. Configure the mux using ``GSMIOC_GETCONF``/``GSMIOC_SETCONF`` ioctl. +#. Configure DLCs using ``GSMIOC_GETCONF_DLCI``/``GSMIOC_SETCONF_DLCI`` ioctl for non-defaults. + #. Obtain base gsmtty number for the used serial port. Major parts of the initialization program @@ -120,6 +122,8 @@ Config Requester #. Configure the mux using ``GSMIOC_GETCONF``/``GSMIOC_SETCONF`` ioctl. +#. Configure DLCs using ``GSMIOC_GETCONF_DLCI``/``GSMIOC_SETCONF_DLCI`` ioctl for non-defaults. + #. Obtain base gsmtty number for the used serial port:: #include diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c index 7aa05ebed8e1..9d0e7701ffd4 100644 --- a/drivers/tty/n_gsm.c +++ b/drivers/tty/n_gsm.c @@ -128,6 +128,7 @@ struct gsm_msg { enum gsm_dlci_state { DLCI_CLOSED, + DLCI_WAITING_CONFIG, /* Waiting for DLCI configuration from user */ DLCI_CONFIGURE, /* Sending PN (for adaption > 1) */ DLCI_OPENING, /* Sending SABM not seen UA */ DLCI_OPEN, /* SABM/UA complete */ @@ -330,6 +331,7 @@ struct gsm_mux { unsigned int t3; /* Power wake-up timer in seconds. 
*/ int n2; /* Retry count */ u8 k; /* Window size */ + bool wait_config; /* Wait for configuration by ioctl before DLCI open */ u32 keep_alive; /* Control channel keep-alive in 10ms */ /* Statistics (not currently exposed) */ @@ -451,6 +453,7 @@ static int gsm_modem_update(struct gsm_dlci *dlci, u8 brk); static struct gsm_msg *gsm_data_alloc(struct gsm_mux *gsm, u8 addr, int len, u8 ctrl); static int gsm_send_packet(struct gsm_mux *gsm, struct gsm_msg *msg); +static struct gsm_dlci *gsm_dlci_alloc(struct gsm_mux *gsm, int addr); static void gsmld_write_trigger(struct gsm_mux *gsm); static void gsmld_write_task(struct work_struct *work); @@ -2280,6 +2283,7 @@ static void gsm_dlci_begin_open(struct gsm_dlci *dlci) switch (dlci->state) { case DLCI_CLOSED: + case DLCI_WAITING_CONFIG: case DLCI_CLOSING: dlci->retries = gsm->n2; if (!need_pn) { @@ -2311,6 +2315,7 @@ static void gsm_dlci_set_opening(struct gsm_dlci *dlci) { switch (dlci->state) { case DLCI_CLOSED: + case DLCI_WAITING_CONFIG: case DLCI_CLOSING: dlci->state = DLCI_OPENING; break; @@ -2319,6 +2324,24 @@ static void gsm_dlci_set_opening(struct gsm_dlci *dlci) } } +/** + * gsm_dlci_set_wait_config - wait for channel configuration + * @dlci: DLCI to configure + * + * Wait for a DLCI configuration from the application. + */ +static void gsm_dlci_set_wait_config(struct gsm_dlci *dlci) +{ + switch (dlci->state) { + case DLCI_CLOSED: + case DLCI_CLOSING: + dlci->state = DLCI_WAITING_CONFIG; + break; + default: + break; + } +} + /** * gsm_dlci_begin_close - start channel open procedure * @dlci: DLCI to open @@ -2453,6 +2476,128 @@ static void gsm_kick_timer(struct timer_list *t) pr_info("%s TX queue stalled\n", __func__); } +/** + * gsm_dlci_copy_config_values - copy DLCI configuration + * @dlci: source DLCI + * @dc: configuration structure to fill + */ +static void gsm_dlci_copy_config_values(struct gsm_dlci *dlci, struct gsm_dlci_config *dc) +{ + memset(dc, 0, sizeof(*dc)); + dc->channel = (u32)dlci->addr; + dc->adaption = (u32)dlci->adaption; + dc->mtu = (u32)dlci->mtu; + dc->priority = (u32)dlci->prio; + if (dlci->ftype == UIH) + dc->i = 1; + else + dc->i = 2; + dc->k = (u32)dlci->k; +} + +/** + * gsm_dlci_config - configure DLCI from configuration + * @dlci: DLCI to configure + * @dc: DLCI configuration + * @open: open DLCI after configuration? + */ +static int gsm_dlci_config(struct gsm_dlci *dlci, struct gsm_dlci_config *dc, int open) +{ + struct gsm_mux *gsm; + bool need_restart = false; + bool need_open = false; + unsigned int i; + + /* + * Check that userspace doesn't put stuff in here to prevent breakages + * in the future. 
+ */ + for (i = 0; i < ARRAY_SIZE(dc->reserved); i++) + if (dc->reserved[i]) + return -EINVAL; + + if (!dlci) + return -EINVAL; + gsm = dlci->gsm; + + /* Stuff we don't support yet - I frame transport */ + if (dc->adaption != 1 && dc->adaption != 2) + return -EOPNOTSUPP; + if (dc->mtu > MAX_MTU || dc->mtu < MIN_MTU || dc->mtu > gsm->mru) + return -EINVAL; + if (dc->priority >= 64) + return -EINVAL; + if (dc->i == 0 || dc->i > 2) /* UIH and UI only */ + return -EINVAL; + if (dc->k > 7) + return -EINVAL; + + /* + * See what is needed for reconfiguration + */ + /* Framing fields */ + if (dc->adaption != dlci->adaption) + need_restart = true; + if (dc->mtu != dlci->mtu) + need_restart = true; + if (dc->i != dlci->ftype) + need_restart = true; + /* Requires care */ + if (dc->priority != dlci->prio) + need_restart = true; + + if ((open && gsm->wait_config) || need_restart) + need_open = true; + if (dlci->state == DLCI_WAITING_CONFIG) { + need_restart = false; + need_open = true; + } + + /* + * Close down what is needed, restart and initiate the new + * configuration. + */ + if (need_restart) { + gsm_dlci_begin_close(dlci); + wait_event_interruptible(gsm->event, dlci->state == DLCI_CLOSED); + if (signal_pending(current)) + return -EINTR; + } + /* + * Setup the new configuration values + */ + dlci->adaption = (int)dc->adaption; + + if (dc->mtu) + dlci->mtu = (unsigned int)dc->mtu; + else + dlci->mtu = gsm->mtu; + + if (dc->priority) + dlci->prio = (u8)dc->priority; + else + dlci->prio = roundup(dlci->addr + 1, 8) - 1; + + if (dc->i == 1) + dlci->ftype = UIH; + else if (dc->i == 2) + dlci->ftype = UI; + + if (dc->k) + dlci->k = (u8)dc->k; + else + dlci->k = gsm->k; + + if (need_open) { + if (gsm->initiator) + gsm_dlci_begin_open(dlci); + else + gsm_dlci_set_opening(dlci); + } + + return 0; +} + /* * Allocate/Free DLCI channels */ @@ -3078,6 +3223,7 @@ static struct gsm_mux *gsm_alloc_mux(void) gsm->mru = 64; /* Default to encoding 1 so these should be 64 */ gsm->mtu = 64; gsm->dead = true; /* Avoid early tty opens */ + gsm->wait_config = false; /* Disabled */ gsm->keep_alive = 0; /* Disabled */ /* Store the instance to the mux array or abort if no space is @@ -3218,6 +3364,7 @@ static void gsm_copy_config_ext_values(struct gsm_mux *gsm, struct gsm_config_ext *ce) { memset(ce, 0, sizeof(*ce)); + ce->wait_config = gsm->wait_config ? 1 : 0; ce->keep_alive = gsm->keep_alive; } @@ -3233,7 +3380,12 @@ static int gsm_config_ext(struct gsm_mux *gsm, struct gsm_config_ext *ce) if (ce->reserved[i]) return -EINVAL; + /* + * Setup the new configuration values + */ + gsm->wait_config = ce->wait_config ? true : false; gsm->keep_alive = ce->keep_alive; + return 0; } @@ -3436,6 +3588,14 @@ static int gsmld_open(struct tty_struct *tty) gsm->encoding = GSM_ADV_OPT; gsmld_attach_gsm(tty, gsm); + /* The mux will not be activated yet, we wait for correct + * configuration first. + */ + if (gsm->encoding == GSM_BASIC_OPT) + gsm->receive = gsm0_receive; + else + gsm->receive = gsm1_receive; + return 0; } @@ -4008,7 +4168,6 @@ static int gsmtty_open(struct tty_struct *tty, struct file *filp) { struct gsm_dlci *dlci = tty->driver_data; struct tty_port *port = &dlci->port; - struct gsm_mux *gsm = dlci->gsm; port->count++; tty_port_tty_set(port, tty); @@ -4018,10 +4177,15 @@ static int gsmtty_open(struct tty_struct *tty, struct file *filp) a DM straight back. 
This is ok as that will have caused a hangup */ tty_port_set_initialized(port, true); /* Start sending off SABM messages */ - if (gsm->initiator) - gsm_dlci_begin_open(dlci); - else - gsm_dlci_set_opening(dlci); + if (!dlci->gsm->wait_config) { + /* Start sending off SABM messages */ + if (dlci->gsm->initiator) + gsm_dlci_begin_open(dlci); + else + gsm_dlci_set_opening(dlci); + } else { + gsm_dlci_set_wait_config(dlci); + } /* And wait for virtual carrier */ return tty_port_block_til_ready(port, tty, filp); } @@ -4142,6 +4306,7 @@ static int gsmtty_ioctl(struct tty_struct *tty, { struct gsm_dlci *dlci = tty->driver_data; struct gsm_netconfig nc; + struct gsm_dlci_config dc; int index; if (dlci->state == DLCI_CLOSED) @@ -4165,6 +4330,23 @@ static int gsmtty_ioctl(struct tty_struct *tty, gsm_destroy_network(dlci); mutex_unlock(&dlci->mutex); return 0; + case GSMIOC_GETCONF_DLCI: + if (copy_from_user(&dc, (void __user *)arg, sizeof(dc))) + return -EFAULT; + if (dc.channel != dlci->addr) + return -EPERM; + gsm_dlci_copy_config_values(dlci, &dc); + if (copy_to_user((void __user *)arg, &dc, sizeof(dc))) + return -EFAULT; + return 0; + case GSMIOC_SETCONF_DLCI: + if (copy_from_user(&dc, (void __user *)arg, sizeof(dc))) + return -EFAULT; + if (dc.channel >= NUM_DLCI) + return -EINVAL; + if (dc.channel != 0 && dc.channel != dlci->addr) + return -EPERM; + return gsm_dlci_config(dlci, &dc, 1); case TIOCMIWAIT: return gsm_wait_modem_change(dlci, (u32)arg); default: diff --git a/include/uapi/linux/gsmmux.h b/include/uapi/linux/gsmmux.h index a703780aa095..eb67884e5f38 100644 --- a/include/uapi/linux/gsmmux.h +++ b/include/uapi/linux/gsmmux.h @@ -43,10 +43,25 @@ struct gsm_config_ext { __u32 keep_alive; /* Control channel keep-alive in 1/100th of a * second (0 to disable) */ - __u32 reserved[7]; /* For future use, must be initialized to zero */ + __u32 wait_config; /* Wait for DLCI config before opening virtual link? */ + __u32 reserved[6]; /* For future use, must be initialized to zero */ }; #define GSMIOC_GETCONF_EXT _IOR('G', 5, struct gsm_config_ext) #define GSMIOC_SETCONF_EXT _IOW('G', 6, struct gsm_config_ext) +/* Set channel accordingly before calling GSMIOC_GETCONF_DLCI. */ +struct gsm_dlci_config { + __u32 channel; /* DLCI (0 for the associated DLCI) */ + __u32 adaption; /* Convergence layer type */ + __u32 mtu; /* Maximum transfer unit */ + __u32 priority; /* Priority (0 for default value) */ + __u32 i; /* Frame type (1 = UIH, 2 = UI) */ + __u32 k; /* Window size (0 for default value) */ + __u32 reserved[8]; /* For future use, must be initialized to zero */ +}; + +#define GSMIOC_GETCONF_DLCI _IOWR('G', 7, struct gsm_dlci_config) +#define GSMIOC_SETCONF_DLCI _IOW('G', 8, struct gsm_dlci_config) + #endif -- cgit v1.2.3 From e5a26a4048eeb9558e5c84f340a989c78db4adf4 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Tue, 28 Mar 2023 16:52:08 -0700 Subject: tracing/user_events: Split header into uapi and kernel The UAPI parts need to be split out from the kernel parts of user_events now that other parts of the kernel will reference it. Do so by moving the existing include/linux/user_events.h into include/uapi/linux/user_events.h. 
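As an illustrative sketch (not part of this patch), a user program built against the relocated header keeps using the same installed path. A minimal registration against the ABI as it stands at this commit (status_bit still present), assuming tracefs is mounted at /sys/kernel/tracing and using the "test u32 count" event format from Documentation/trace/user_events.rst, might look like:

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/user_events.h>  /* now installed from include/uapi */

    int main(void)
    {
            struct user_reg reg;
            int fd = open("/sys/kernel/tracing/user_events_data", O_RDWR);

            if (fd < 0)
                    return 1;

            memset(&reg, 0, sizeof(reg));
            reg.size = sizeof(reg);
            reg.name_args = (uint64_t)(uintptr_t)"test u32 count";

            /* Register the event; the kernel fills in the output fields */
            if (ioctl(fd, DIAG_IOCSREG, &reg) < 0) {
                    perror("DIAG_IOCSREG");
                    return 1;
            }

            printf("write_index=%u status_bit=%u\n",
                   reg.write_index, reg.status_bit);
            return 0;
    }

The tracefs path and the event format string above are assumptions for illustration; the struct fields and ioctl are exactly those in the moved header.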
Link: https://lkml.kernel.org/r/20230328235219.203-2-beaub@linux.microsoft.com Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- include/linux/user_events.h | 52 +++++----------------------------------- include/uapi/linux/user_events.h | 48 +++++++++++++++++++++++++++++++++++++ kernel/trace/trace_events_user.c | 5 ---- 3 files changed, 54 insertions(+), 51 deletions(-) create mode 100644 include/uapi/linux/user_events.h (limited to 'include/uapi/linux') diff --git a/include/linux/user_events.h b/include/linux/user_events.h index 592a3fbed98e..13689589d36e 100644 --- a/include/linux/user_events.h +++ b/include/linux/user_events.h @@ -1,54 +1,14 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (c) 2021, Microsoft Corporation. + * Copyright (c) 2022, Microsoft Corporation. * * Authors: * Beau Belgrave */ -#ifndef _UAPI_LINUX_USER_EVENTS_H -#define _UAPI_LINUX_USER_EVENTS_H -#include -#include +#ifndef _LINUX_USER_EVENTS_H +#define _LINUX_USER_EVENTS_H -#ifdef __KERNEL__ -#include -#else -#include -#endif +#include -#define USER_EVENTS_SYSTEM "user_events" -#define USER_EVENTS_PREFIX "u:" - -/* Create dynamic location entry within a 32-bit value */ -#define DYN_LOC(offset, size) ((size) << 16 | (offset)) - -/* - * Describes an event registration and stores the results of the registration. - * This structure is passed to the DIAG_IOCSREG ioctl, callers at a minimum - * must set the size and name_args before invocation. - */ -struct user_reg { - - /* Input: Size of the user_reg structure being used */ - __u32 size; - - /* Input: Pointer to string with event name, description and flags */ - __u64 name_args; - - /* Output: Bitwise index of the event within the status page */ - __u32 status_bit; - - /* Output: Index of the event to use when writing data */ - __u32 write_index; -} __attribute__((__packed__)); - -#define DIAG_IOC_MAGIC '*' - -/* Requests to register a user_event */ -#define DIAG_IOCSREG _IOWR(DIAG_IOC_MAGIC, 0, struct user_reg*) - -/* Requests to delete a user_event */ -#define DIAG_IOCSDEL _IOW(DIAG_IOC_MAGIC, 1, char*) - -#endif /* _UAPI_LINUX_USER_EVENTS_H */ +#endif /* _LINUX_USER_EVENTS_H */ diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h new file mode 100644 index 000000000000..03f92366068d --- /dev/null +++ b/include/uapi/linux/user_events.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright (c) 2021-2022, Microsoft Corporation. + * + * Authors: + * Beau Belgrave + */ +#ifndef _UAPI_LINUX_USER_EVENTS_H +#define _UAPI_LINUX_USER_EVENTS_H + +#include +#include + +#define USER_EVENTS_SYSTEM "user_events" +#define USER_EVENTS_PREFIX "u:" + +/* Create dynamic location entry within a 32-bit value */ +#define DYN_LOC(offset, size) ((size) << 16 | (offset)) + +/* + * Describes an event registration and stores the results of the registration. + * This structure is passed to the DIAG_IOCSREG ioctl, callers at a minimum + * must set the size and name_args before invocation. 
+ */ +struct user_reg { + + /* Input: Size of the user_reg structure being used */ + __u32 size; + + /* Input: Pointer to string with event name, description and flags */ + __u64 name_args; + + /* Output: Bitwise index of the event within the status page */ + __u32 status_bit; + + /* Output: Index of the event to use when writing data */ + __u32 write_index; +} __attribute__((__packed__)); + +#define DIAG_IOC_MAGIC '*' + +/* Request to register a user_event */ +#define DIAG_IOCSREG _IOWR(DIAG_IOC_MAGIC, 0, struct user_reg *) + +/* Request to delete a user_event */ +#define DIAG_IOCSDEL _IOW(DIAG_IOC_MAGIC, 1, char *) + +#endif /* _UAPI_LINUX_USER_EVENTS_H */ diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 908e8a13c675..070551480747 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -19,12 +19,7 @@ #include #include #include -/* Reminder to move to uapi when everything works */ -#ifdef CONFIG_COMPILE_TEST #include -#else -#include -#endif #include "trace.h" #include "trace_dynevent.h" -- cgit v1.2.3 From 7235759084a4f8524a46bd2638885ff3b34ce279 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Tue, 28 Mar 2023 16:52:10 -0700 Subject: tracing/user_events: Use remote writes for event enablement As part of the discussions for user_events aligned with user space tracers, it was determined that user programs should register a aligned value to set or clear a bit when an event becomes enabled. Currently a shared page is being used that requires mmap(). Remove the shared page implementation and move to a user registered address implementation. In this new model during the event registration from user programs 3 new values are specified. The first is the address to update when the event is either enabled or disabled. The second is the bit to set/clear to reflect the event being enabled. The third is the size of the value at the specified address. This allows for a local 32/64-bit value in user programs to support both kernel and user tracers. As an example, setting bit 31 for kernel tracers when the event becomes enabled allows for user tracers to use the other bits for ref counts or other flags. The kernel side updates the bit atomically, user programs need to also update these values atomically. User provided addresses must be aligned on a natural boundary, this allows for single page checking and prevents odd behaviors such as a enable value straddling 2 pages instead of a single page. Currently page faults are only logged, future patches will handle these. 
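As an illustrative sketch (not part of this patch), a user program under this new ABI registers a naturally aligned enable word together with a bit for the kernel to update. The helper name register_event, the tracefs mount point, and the "test u32 count" format string are assumptions; the user_reg fields are those added by this patch:

    #include <stdint.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/user_events.h>

    /* 32-bit and naturally aligned, so it cannot straddle two pages */
    static uint32_t enabled;

    static int register_event(int data_fd)
    {
            struct user_reg reg;

            memset(&reg, 0, sizeof(reg));
            reg.size = sizeof(reg);
            reg.enable_bit = 31;                /* bit left to the kernel tracer */
            reg.enable_size = sizeof(enabled);  /* 4, or 8 on 64-bit kernels */
            reg.enable_addr = (uint64_t)(uintptr_t)&enabled;
            reg.name_args = (uint64_t)(uintptr_t)"test u32 count";

            if (ioctl(data_fd, DIAG_IOCSREG, &reg) < 0)
                    return -1;

            /* Output: index to use when writing event data */
            return (int)reg.write_index;
    }

Before emitting an event the program would then test something like (__atomic_load_n(&enabled, __ATOMIC_RELAXED) & (1U << 31)), leaving the remaining bits free for user-space tracers' ref counts or flags as described above.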
Link: https://lkml.kernel.org/r/20230328235219.203-4-beaub@linux.microsoft.com Suggested-by: Mathieu Desnoyers Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- include/linux/user_events.h | 53 +++- include/uapi/linux/user_events.h | 15 +- kernel/trace/Kconfig | 5 +- kernel/trace/trace_events_user.c | 586 ++++++++++++++++++++++++++++++--------- 4 files changed, 517 insertions(+), 142 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/user_events.h b/include/linux/user_events.h index 3d747c45d2fa..0120b3dd5b03 100644 --- a/include/linux/user_events.h +++ b/include/linux/user_events.h @@ -9,13 +9,63 @@ #ifndef _LINUX_USER_EVENTS_H #define _LINUX_USER_EVENTS_H +#include +#include +#include +#include #include #ifdef CONFIG_USER_EVENTS struct user_event_mm { + struct list_head link; + struct list_head enablers; + struct mm_struct *mm; + struct user_event_mm *next; + refcount_t refcnt; + refcount_t tasks; + struct rcu_work put_rwork; }; -#endif +extern void user_event_mm_dup(struct task_struct *t, + struct user_event_mm *old_mm); + +extern void user_event_mm_remove(struct task_struct *t); + +static inline void user_events_fork(struct task_struct *t, + unsigned long clone_flags) +{ + struct user_event_mm *old_mm; + + if (!t || !current->user_event_mm) + return; + + old_mm = current->user_event_mm; + + if (clone_flags & CLONE_VM) { + t->user_event_mm = old_mm; + refcount_inc(&old_mm->tasks); + return; + } + + user_event_mm_dup(t, old_mm); +} + +static inline void user_events_execve(struct task_struct *t) +{ + if (!t || !t->user_event_mm) + return; + + user_event_mm_remove(t); +} + +static inline void user_events_exit(struct task_struct *t) +{ + if (!t || !t->user_event_mm) + return; + + user_event_mm_remove(t); +} +#else static inline void user_events_fork(struct task_struct *t, unsigned long clone_flags) { @@ -28,5 +78,6 @@ static inline void user_events_execve(struct task_struct *t) static inline void user_events_exit(struct task_struct *t) { } +#endif /* CONFIG_USER_EVENTS */ #endif /* _LINUX_USER_EVENTS_H */ diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h index 03f92366068d..22521bc622db 100644 --- a/include/uapi/linux/user_events.h +++ b/include/uapi/linux/user_events.h @@ -27,12 +27,21 @@ struct user_reg { /* Input: Size of the user_reg structure being used */ __u32 size; + /* Input: Bit in enable address to use */ + __u8 enable_bit; + + /* Input: Enable size in bytes at address */ + __u8 enable_size; + + /* Input: Flags for future use, set to 0 */ + __u16 flags; + + /* Input: Address to update when enabled */ + __u64 enable_addr; + /* Input: Pointer to string with event name, description and flags */ __u64 name_args; - /* Output: Bitwise index of the event within the status page */ - __u32 status_bit; - /* Output: Index of the event to use when writing data */ __u32 write_index; } __attribute__((__packed__)); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5b1e7fa41ca8..c7020e071bf9 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -798,9 +798,10 @@ config USER_EVENTS can be used like an existing kernel trace event. User trace events are generated by writing to a tracefs file. User processes can determine if their tracing events should be - generated by memory mapping a tracefs file and checking for - an associated byte being non-zero. + generated by registering a value and bit with the kernel + that reflects when it is enabled or not. + See Documentation/trace/user_events.rst. 
If in doubt, say N. config HIST_TRIGGERS diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 070551480747..553a82ee7aeb 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include "trace.h" #include "trace_dynevent.h" @@ -29,34 +30,11 @@ #define FIELD_DEPTH_NAME 1 #define FIELD_DEPTH_SIZE 2 -/* - * Limits how many trace_event calls user processes can create: - * Must be a power of two of PAGE_SIZE. - */ -#define MAX_PAGE_ORDER 0 -#define MAX_PAGES (1 << MAX_PAGE_ORDER) -#define MAX_BYTES (MAX_PAGES * PAGE_SIZE) -#define MAX_EVENTS (MAX_BYTES * 8) - /* Limit how long of an event name plus args within the subsystem. */ #define MAX_EVENT_DESC 512 #define EVENT_NAME(user_event) ((user_event)->tracepoint.name) #define MAX_FIELD_ARRAY_SIZE 1024 -/* - * The MAP_STATUS_* macros are used for taking a index and determining the - * appropriate byte and the bit in the byte to set/reset for an event. - * - * The lower 3 bits of the index decide which bit to set. - * The remaining upper bits of the index decide which byte to use for the bit. - * - * This is used when an event has a probe attached/removed to reflect live - * status of the event wanting tracing or not to user-programs via shared - * memory maps. - */ -#define MAP_STATUS_BYTE(index) ((index) >> 3) -#define MAP_STATUS_MASK(index) BIT((index) & 7) - /* * Internal bits (kernel side only) to keep track of connected probes: * These are used when status is requested in text form about an event. These @@ -70,20 +48,14 @@ #define EVENT_STATUS_OTHER BIT(7) /* - * Stores the pages, tables, and locks for a group of events. - * Each logical grouping of events has its own group, with a - * matching page for status checks within user programs. This - * allows for isolation of events to user programs by various - * means. + * Stores the system name, tables, and locks for a group of events. This + * allows isolation for events by various means. */ struct user_event_group { - struct page *pages; - char *register_page_data; char *system_name; struct hlist_node node; struct mutex reg_mutex; DECLARE_HASHTABLE(register_table, 8); - DECLARE_BITMAP(page_bitmap, MAX_EVENTS); }; /* Group for init_user_ns mapping, top-most group */ @@ -106,12 +78,34 @@ struct user_event { struct list_head fields; struct list_head validators; refcount_t refcnt; - int index; - int flags; int min_size; char status; }; +/* + * Stores per-mm/event properties that enable an address to be + * updated properly for each task. As tasks are forked, we use + * these to track enablement sites that are tied to an event. + */ +struct user_event_enabler { + struct list_head link; + struct user_event *event; + unsigned long addr; + + /* Track enable bit, flags, etc. Aligned for bitops. */ + unsigned int values; +}; + +/* Bits 0-5 are for the bit to update upon enable/disable (0-63 allowed) */ +#define ENABLE_VAL_BIT_MASK 0x3F + +/* Only duplicate the bit value */ +#define ENABLE_VAL_DUP_MASK ENABLE_VAL_BIT_MASK + +/* Global list of memory descriptors using user_events */ +static LIST_HEAD(user_event_mms); +static DEFINE_SPINLOCK(user_event_mms_lock); + /* * Stores per-file events references, as users register events * within a file this structure is modified and freed via RCU. 
@@ -145,33 +139,17 @@ static int user_event_parse(struct user_event_group *group, char *name, char *args, char *flags, struct user_event **newuser); +static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm); +static struct user_event_mm *user_event_mm_get_all(struct user_event *user); +static void user_event_mm_put(struct user_event_mm *mm); + static u32 user_event_key(char *name) { return jhash(name, strlen(name), 0); } -static void set_page_reservations(char *pages, bool set) -{ - int page; - - for (page = 0; page < MAX_PAGES; ++page) { - void *addr = pages + (PAGE_SIZE * page); - - if (set) - SetPageReserved(virt_to_page(addr)); - else - ClearPageReserved(virt_to_page(addr)); - } -} - static void user_event_group_destroy(struct user_event_group *group) { - if (group->register_page_data) - set_page_reservations(group->register_page_data, false); - - if (group->pages) - __free_pages(group->pages, MAX_PAGE_ORDER); - kfree(group->system_name); kfree(group); } @@ -242,19 +220,6 @@ static struct user_event_group if (!group->system_name) goto error; - group->pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER); - - if (!group->pages) - goto error; - - group->register_page_data = page_address(group->pages); - - set_page_reservations(group->register_page_data, true); - - /* Zero all bits beside 0 (which is reserved for failures) */ - bitmap_zero(group->page_bitmap, MAX_EVENTS); - set_bit(0, group->page_bitmap); - mutex_init(&group->reg_mutex); hash_init(group->register_table); @@ -266,20 +231,367 @@ error: return NULL; }; -static __always_inline -void user_event_register_set(struct user_event *user) +static void user_event_enabler_destroy(struct user_event_enabler *enabler) +{ + list_del_rcu(&enabler->link); + + /* No longer tracking the event via the enabler */ + refcount_dec(&enabler->event->refcnt); + + kfree(enabler); +} + +static int user_event_mm_fault_in(struct user_event_mm *mm, unsigned long uaddr) +{ + bool unlocked; + int ret; + + mmap_read_lock(mm->mm); + + /* Ensure MM has tasks, cannot use after exit_mm() */ + if (refcount_read(&mm->tasks) == 0) { + ret = -ENOENT; + goto out; + } + + ret = fixup_user_fault(mm->mm, uaddr, FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE, + &unlocked); +out: + mmap_read_unlock(mm->mm); + + return ret; +} + +static int user_event_enabler_write(struct user_event_mm *mm, + struct user_event_enabler *enabler) +{ + unsigned long uaddr = enabler->addr; + unsigned long *ptr; + struct page *page; + void *kaddr; + int ret; + + lockdep_assert_held(&event_mutex); + mmap_assert_locked(mm->mm); + + /* Ensure MM has tasks, cannot use after exit_mm() */ + if (refcount_read(&mm->tasks) == 0) + return -ENOENT; + + ret = pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT, + &page, NULL, NULL); + + if (ret <= 0) { + pr_warn("user_events: Enable write failed\n"); + return -EFAULT; + } + + kaddr = kmap_local_page(page); + ptr = kaddr + (uaddr & ~PAGE_MASK); + + /* Update bit atomically, user tracers must be atomic as well */ + if (enabler->event && enabler->event->status) + set_bit(enabler->values & ENABLE_VAL_BIT_MASK, ptr); + else + clear_bit(enabler->values & ENABLE_VAL_BIT_MASK, ptr); + + kunmap_local(kaddr); + unpin_user_pages_dirty_lock(&page, 1, true); + + return 0; +} + +static void user_event_enabler_update(struct user_event *user) +{ + struct user_event_enabler *enabler; + struct user_event_mm *mm = user_event_mm_get_all(user); + struct user_event_mm *next; + + while (mm) { + next = mm->next; + mmap_read_lock(mm->mm); + 
rcu_read_lock(); + + list_for_each_entry_rcu(enabler, &mm->enablers, link) + if (enabler->event == user) + user_event_enabler_write(mm, enabler); + + rcu_read_unlock(); + mmap_read_unlock(mm->mm); + user_event_mm_put(mm); + mm = next; + } +} + +static bool user_event_enabler_dup(struct user_event_enabler *orig, + struct user_event_mm *mm) +{ + struct user_event_enabler *enabler; + + enabler = kzalloc(sizeof(*enabler), GFP_NOWAIT); + + if (!enabler) + return false; + + enabler->event = orig->event; + enabler->addr = orig->addr; + + /* Only dup part of value (ignore future flags, etc) */ + enabler->values = orig->values & ENABLE_VAL_DUP_MASK; + + refcount_inc(&enabler->event->refcnt); + list_add_rcu(&enabler->link, &mm->enablers); + + return true; +} + +static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm) +{ + refcount_inc(&mm->refcnt); + + return mm; +} + +static struct user_event_mm *user_event_mm_get_all(struct user_event *user) +{ + struct user_event_mm *found = NULL; + struct user_event_enabler *enabler; + struct user_event_mm *mm; + + /* + * We do not want to block fork/exec while enablements are being + * updated, so we use RCU to walk the current tasks that have used + * user_events ABI for 1 or more events. Each enabler found in each + * task that matches the event being updated has a write to reflect + * the kernel state back into the process. Waits/faults must not occur + * during this. So we scan the list under RCU for all the mm that have + * the event within it. This is needed because mm_read_lock() can wait. + * Each user mm returned has a ref inc to handle remove RCU races. + */ + rcu_read_lock(); + + list_for_each_entry_rcu(mm, &user_event_mms, link) + list_for_each_entry_rcu(enabler, &mm->enablers, link) + if (enabler->event == user) { + mm->next = found; + found = user_event_mm_get(mm); + break; + } + + rcu_read_unlock(); + + return found; +} + +static struct user_event_mm *user_event_mm_create(struct task_struct *t) +{ + struct user_event_mm *user_mm; + unsigned long flags; + + user_mm = kzalloc(sizeof(*user_mm), GFP_KERNEL); + + if (!user_mm) + return NULL; + + user_mm->mm = t->mm; + INIT_LIST_HEAD(&user_mm->enablers); + refcount_set(&user_mm->refcnt, 1); + refcount_set(&user_mm->tasks, 1); + + spin_lock_irqsave(&user_event_mms_lock, flags); + list_add_rcu(&user_mm->link, &user_event_mms); + spin_unlock_irqrestore(&user_event_mms_lock, flags); + + t->user_event_mm = user_mm; + + /* + * The lifetime of the memory descriptor can slightly outlast + * the task lifetime if a ref to the user_event_mm is taken + * between list_del_rcu() and call_rcu(). Therefore we need + * to take a reference to it to ensure it can live this long + * under this corner case. This can also occur in clones that + * outlast the parent. 
+ */ + mmgrab(user_mm->mm); + + return user_mm; +} + +static struct user_event_mm *current_user_event_mm(void) +{ + struct user_event_mm *user_mm = current->user_event_mm; + + if (user_mm) + goto inc; + + user_mm = user_event_mm_create(current); + + if (!user_mm) + goto error; +inc: + refcount_inc(&user_mm->refcnt); +error: + return user_mm; +} + +static void user_event_mm_destroy(struct user_event_mm *mm) +{ + struct user_event_enabler *enabler, *next; + + list_for_each_entry_safe(enabler, next, &mm->enablers, link) + user_event_enabler_destroy(enabler); + + mmdrop(mm->mm); + kfree(mm); +} + +static void user_event_mm_put(struct user_event_mm *mm) +{ + if (mm && refcount_dec_and_test(&mm->refcnt)) + user_event_mm_destroy(mm); +} + +static void delayed_user_event_mm_put(struct work_struct *work) +{ + struct user_event_mm *mm; + + mm = container_of(to_rcu_work(work), struct user_event_mm, put_rwork); + user_event_mm_put(mm); +} + +void user_event_mm_remove(struct task_struct *t) { - int i = user->index; + struct user_event_mm *mm; + unsigned long flags; + + might_sleep(); + + mm = t->user_event_mm; + t->user_event_mm = NULL; + + /* Clone will increment the tasks, only remove if last clone */ + if (!refcount_dec_and_test(&mm->tasks)) + return; + + /* Remove the mm from the list, so it can no longer be enabled */ + spin_lock_irqsave(&user_event_mms_lock, flags); + list_del_rcu(&mm->link); + spin_unlock_irqrestore(&user_event_mms_lock, flags); + + /* + * We need to wait for currently occurring writes to stop within + * the mm. This is required since exit_mm() snaps the current rss + * stats and clears them. On the final mmdrop(), check_mm() will + * report a bug if these increment. + * + * All writes/pins are done under mmap_read lock, take the write + * lock to ensure in-progress faults have completed. Faults that + * are pending but yet to run will check the task count and skip + * the fault since the mm is going away. + */ + mmap_write_lock(mm->mm); + mmap_write_unlock(mm->mm); - user->group->register_page_data[MAP_STATUS_BYTE(i)] |= MAP_STATUS_MASK(i); + /* + * Put for mm must be done after RCU delay to handle new refs in + * between the list_del_rcu() and now. This ensures any get refs + * during rcu_read_lock() are accounted for during list removal. + * + * CPU A | CPU B + * --------------------------------------------------------------- + * user_event_mm_remove() | rcu_read_lock(); + * list_del_rcu() | list_for_each_entry_rcu(); + * call_rcu() | refcount_inc(); + * . | rcu_read_unlock(); + * schedule_work() | . + * user_event_mm_put() | . + * + * mmdrop() cannot be called in the softirq context of call_rcu() + * so we use a work queue after call_rcu() to run within. 
+ */ + INIT_RCU_WORK(&mm->put_rwork, delayed_user_event_mm_put); + queue_rcu_work(system_wq, &mm->put_rwork); } -static __always_inline -void user_event_register_clear(struct user_event *user) +void user_event_mm_dup(struct task_struct *t, struct user_event_mm *old_mm) { - int i = user->index; + struct user_event_mm *mm = user_event_mm_create(t); + struct user_event_enabler *enabler; - user->group->register_page_data[MAP_STATUS_BYTE(i)] &= ~MAP_STATUS_MASK(i); + if (!mm) + return; + + rcu_read_lock(); + + list_for_each_entry_rcu(enabler, &old_mm->enablers, link) + if (!user_event_enabler_dup(enabler, mm)) + goto error; + + rcu_read_unlock(); + + return; +error: + rcu_read_unlock(); + user_event_mm_remove(t); +} + +static struct user_event_enabler +*user_event_enabler_create(struct user_reg *reg, struct user_event *user, + int *write_result) +{ + struct user_event_enabler *enabler; + struct user_event_mm *user_mm; + unsigned long uaddr = (unsigned long)reg->enable_addr; + + user_mm = current_user_event_mm(); + + if (!user_mm) + return NULL; + + enabler = kzalloc(sizeof(*enabler), GFP_KERNEL); + + if (!enabler) + goto out; + + enabler->event = user; + enabler->addr = uaddr; + enabler->values = reg->enable_bit; +retry: + /* Prevents state changes from racing with new enablers */ + mutex_lock(&event_mutex); + + /* Attempt to reflect the current state within the process */ + mmap_read_lock(user_mm->mm); + *write_result = user_event_enabler_write(user_mm, enabler); + mmap_read_unlock(user_mm->mm); + + /* + * If the write works, then we will track the enabler. A ref to the + * underlying user_event is held by the enabler to prevent it going + * away while the enabler is still in use by a process. The ref is + * removed when the enabler is destroyed. This means a event cannot + * be forcefully deleted from the system until all tasks using it + * exit or run exec(), which includes forks and clones. + */ + if (!*write_result) { + refcount_inc(&enabler->event->refcnt); + list_add_rcu(&enabler->link, &user_mm->enablers); + } + + mutex_unlock(&event_mutex); + + if (*write_result) { + /* Attempt to fault-in and retry if it worked */ + if (!user_event_mm_fault_in(user_mm, uaddr)) + goto retry; + + kfree(enabler); + enabler = NULL; + } +out: + user_event_mm_put(user_mm); + + return enabler; } static __always_inline __must_check @@ -824,9 +1136,6 @@ static int destroy_user_event(struct user_event *user) return ret; dyn_event_remove(&user->devent); - - user_event_register_clear(user); - clear_bit(user->index, user->group->page_bitmap); hash_del(&user->node); user_event_destroy_validators(user); @@ -972,9 +1281,9 @@ discard: #endif /* - * Update the register page that is shared between user processes. + * Update the enabled bit among all user processes. 
*/ -static void update_reg_page_for(struct user_event *user) +static void update_enable_bit_for(struct user_event *user) { struct tracepoint *tp = &user->tracepoint; char status = 0; @@ -1005,12 +1314,9 @@ static void update_reg_page_for(struct user_event *user) rcu_read_unlock_sched(); } - if (status) - user_event_register_set(user); - else - user_event_register_clear(user); - user->status = status; + + user_event_enabler_update(user); } /* @@ -1067,10 +1373,10 @@ static int user_event_reg(struct trace_event_call *call, return ret; inc: refcount_inc(&user->refcnt); - update_reg_page_for(user); + update_enable_bit_for(user); return 0; dec: - update_reg_page_for(user); + update_enable_bit_for(user); refcount_dec(&user->refcnt); return 0; } @@ -1266,7 +1572,6 @@ static int user_event_parse(struct user_event_group *group, char *name, struct user_event **newuser) { int ret; - int index; u32 key; struct user_event *user; @@ -1285,11 +1590,6 @@ static int user_event_parse(struct user_event_group *group, char *name, return 0; } - index = find_first_zero_bit(group->page_bitmap, MAX_EVENTS); - - if (index == MAX_EVENTS) - return -EMFILE; - user = kzalloc(sizeof(*user), GFP_KERNEL); if (!user) @@ -1335,14 +1635,11 @@ static int user_event_parse(struct user_event_group *group, char *name, if (ret) goto put_user_lock; - user->index = index; - /* Ensure we track self ref and caller ref (2) */ refcount_set(&user->refcnt, 2); dyn_event_init(&user->devent, &user_event_dops); dyn_event_add(&user->devent, &user->call); - set_bit(user->index, group->page_bitmap); hash_add(group->register_table, &user->node, key); mutex_unlock(&event_mutex); @@ -1559,6 +1856,37 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg) if (ret) return ret; + /* Ensure no flags, since we don't support any yet */ + if (kreg->flags != 0) + return -EINVAL; + + /* Ensure supported size */ + switch (kreg->enable_size) { + case 4: + /* 32-bit */ + break; +#if BITS_PER_LONG >= 64 + case 8: + /* 64-bit */ + break; +#endif + default: + return -EINVAL; + } + + /* Ensure natural alignment */ + if (kreg->enable_addr % kreg->enable_size) + return -EINVAL; + + /* Ensure bit range for size */ + if (kreg->enable_bit > (kreg->enable_size * BITS_PER_BYTE) - 1) + return -EINVAL; + + /* Ensure accessible */ + if (!access_ok((const void __user *)(uintptr_t)kreg->enable_addr, + kreg->enable_size)) + return -EFAULT; + kreg->size = size; return 0; @@ -1573,8 +1901,10 @@ static long user_events_ioctl_reg(struct user_event_file_info *info, struct user_reg __user *ureg = (struct user_reg __user *)uarg; struct user_reg reg; struct user_event *user; + struct user_event_enabler *enabler; char *name; long ret; + int write_result; ret = user_reg_get(ureg, ®); @@ -1605,8 +1935,28 @@ static long user_events_ioctl_reg(struct user_event_file_info *info, if (ret < 0) return ret; + /* + * user_events_ref_add succeeded: + * At this point we have a user_event, it's lifetime is bound by the + * reference count, not this file. If anything fails, the user_event + * still has a reference until the file is released. During release + * any remaining references (from user_events_ref_add) are decremented. + * + * Attempt to create an enabler, which too has a lifetime tied in the + * same way for the event. Once the task that caused the enabler to be + * created exits or issues exec() then the enablers it has created + * will be destroyed and the ref to the event will be decremented. 
+ */ + enabler = user_event_enabler_create(®, user, &write_result); + + if (!enabler) + return -ENOMEM; + + /* Write failed/faulted, give error back to caller */ + if (write_result) + return write_result; + put_user((u32)ret, &ureg->write_index); - put_user(user->index, &ureg->status_bit); return 0; } @@ -1720,38 +2070,6 @@ static const struct file_operations user_data_fops = { .release = user_events_release, }; -static struct user_event_group *user_status_group(struct file *file) -{ - struct seq_file *m = file->private_data; - - if (!m) - return NULL; - - return m->private; -} - -/* - * Maps the shared page into the user process for checking if event is enabled. - */ -static int user_status_mmap(struct file *file, struct vm_area_struct *vma) -{ - char *pages; - struct user_event_group *group = user_status_group(file); - unsigned long size = vma->vm_end - vma->vm_start; - - if (size != MAX_BYTES) - return -EINVAL; - - if (!group) - return -EINVAL; - - pages = group->register_page_data; - - return remap_pfn_range(vma, vma->vm_start, - virt_to_phys(pages) >> PAGE_SHIFT, - size, vm_get_page_prot(VM_READ)); -} - static void *user_seq_start(struct seq_file *m, loff_t *pos) { if (*pos) @@ -1775,7 +2093,7 @@ static int user_seq_show(struct seq_file *m, void *p) struct user_event_group *group = m->private; struct user_event *user; char status; - int i, active = 0, busy = 0, flags; + int i, active = 0, busy = 0; if (!group) return -EINVAL; @@ -1784,11 +2102,10 @@ static int user_seq_show(struct seq_file *m, void *p) hash_for_each(group->register_table, i, user, node) { status = user->status; - flags = user->flags; - seq_printf(m, "%d:%s", user->index, EVENT_NAME(user)); + seq_printf(m, "%s", EVENT_NAME(user)); - if (flags != 0 || status != 0) + if (status != 0) seq_puts(m, " #"); if (status != 0) { @@ -1811,7 +2128,6 @@ static int user_seq_show(struct seq_file *m, void *p) seq_puts(m, "\n"); seq_printf(m, "Active: %d\n", active); seq_printf(m, "Busy: %d\n", busy); - seq_printf(m, "Max: %ld\n", MAX_EVENTS); return 0; } @@ -1847,7 +2163,6 @@ static int user_status_open(struct inode *node, struct file *file) static const struct file_operations user_status_fops = { .open = user_status_open, - .mmap = user_status_mmap, .read = seq_read, .llseek = seq_lseek, .release = seq_release, @@ -1868,8 +2183,7 @@ static int create_user_tracefs(void) goto err; } - /* mmap with MAP_SHARED requires writable fd */ - emmap = tracefs_create_file("user_events_status", TRACE_MODE_WRITE, + emmap = tracefs_create_file("user_events_status", TRACE_MODE_READ, NULL, NULL, &user_status_fops); if (!emmap) { -- cgit v1.2.3 From dcb8177c13953872c9e5ce4a99b63a87a3c2f683 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Tue, 28 Mar 2023 16:52:12 -0700 Subject: tracing/user_events: Add ioctl for disabling addresses Enablements are now tracked by the lifetime of the task/mm. User processes need to be able to disable their addresses if tracing is requested to be turned off. Before unmapping the page would suffice. However, we now need a stronger contract. Add an ioctl to enable this. A new flag bit is added, freeing, to user_event_enabler to ensure that if the event is attempted to be removed while a fault is being handled that the remove is delayed until after the fault is reattempted. 
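As an illustrative sketch (not part of this patch), tearing down an enablement with the new ioctl might look like the following; the helper name unregister_event is an assumption, and disable_addr/disable_bit must match the values given at registration:

    #include <stdint.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/user_events.h>

    static int unregister_event(int data_fd, void *addr, unsigned char bit)
    {
            struct user_unreg unreg;

            /* memset() also zeroes the reserved fields, as required */
            memset(&unreg, 0, sizeof(unreg));
            unreg.size = sizeof(unreg);
            unreg.disable_bit = bit;  /* must match the registered enable_bit */
            unreg.disable_addr = (uint64_t)(uintptr_t)addr;

            return ioctl(data_fd, DIAG_IOCSUNREG, &unreg);
    }

After this call the kernel stops updating the bit at that address, which is the stronger contract the message above describes now that unmapping the old shared page no longer applies.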
Link: https://lkml.kernel.org/r/20230328235219.203-6-beaub@linux.microsoft.com Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- include/uapi/linux/user_events.h | 24 ++++++++++ kernel/trace/trace_events_user.c | 97 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 119 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h index 22521bc622db..3e7275e3234a 100644 --- a/include/uapi/linux/user_events.h +++ b/include/uapi/linux/user_events.h @@ -46,6 +46,27 @@ struct user_reg { __u32 write_index; } __attribute__((__packed__)); +/* + * Describes an event unregister, callers must set the size, address and bit. + * This structure is passed to the DIAG_IOCSUNREG ioctl to disable bit updates. + */ +struct user_unreg { + /* Input: Size of the user_unreg structure being used */ + __u32 size; + + /* Input: Bit to unregister */ + __u8 disable_bit; + + /* Input: Reserved, set to 0 */ + __u8 __reserved; + + /* Input: Reserved, set to 0 */ + __u16 __reserved2; + + /* Input: Address to unregister */ + __u64 disable_addr; +} __attribute__((__packed__)); + #define DIAG_IOC_MAGIC '*' /* Request to register a user_event */ @@ -54,4 +75,7 @@ struct user_reg { /* Request to delete a user_event */ #define DIAG_IOCSDEL _IOW(DIAG_IOC_MAGIC, 1, char *) +/* Requests to unregister a user_event */ +#define DIAG_IOCSUNREG _IOW(DIAG_IOC_MAGIC, 2, struct user_unreg*) + #endif /* _UAPI_LINUX_USER_EVENTS_H */ diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 86bda1660536..f88bab3f1fe1 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -102,6 +102,9 @@ struct user_event_enabler { /* Bit 6 is for faulting status of enablement */ #define ENABLE_VAL_FAULTING_BIT 6 +/* Bit 7 is for freeing status of enablement */ +#define ENABLE_VAL_FREEING_BIT 7 + /* Only duplicate the bit value */ #define ENABLE_VAL_DUP_MASK ENABLE_VAL_BIT_MASK @@ -301,6 +304,12 @@ static void user_event_enabler_fault_fixup(struct work_struct *work) /* Prevent state changes from racing */ mutex_lock(&event_mutex); + /* User asked for enabler to be removed during fault */ + if (test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler))) { + user_event_enabler_destroy(enabler); + goto out; + } + /* * If we managed to get the page, re-issue the write. 
We do not * want to get into a possible infinite loop, which is why we only @@ -315,7 +324,7 @@ static void user_event_enabler_fault_fixup(struct work_struct *work) user_event_enabler_write(mm, enabler, true); mmap_read_unlock(mm->mm); } - +out: mutex_unlock(&event_mutex); /* In all cases we no longer need the mm or fault */ @@ -370,7 +379,8 @@ static int user_event_enabler_write(struct user_event_mm *mm, if (refcount_read(&mm->tasks) == 0) return -ENOENT; - if (unlikely(test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler)))) + if (unlikely(test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler)) || + test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler)))) return -EBUSY; ret = pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT, @@ -428,6 +438,10 @@ static bool user_event_enabler_dup(struct user_event_enabler *orig, { struct user_event_enabler *enabler; + /* Skip pending frees */ + if (unlikely(test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(orig)))) + return true; + enabler = kzalloc(sizeof(*enabler), GFP_NOWAIT); if (!enabler) @@ -2086,6 +2100,79 @@ static long user_events_ioctl_del(struct user_event_file_info *info, return ret; } +static long user_unreg_get(struct user_unreg __user *ureg, + struct user_unreg *kreg) +{ + u32 size; + long ret; + + ret = get_user(size, &ureg->size); + + if (ret) + return ret; + + if (size > PAGE_SIZE) + return -E2BIG; + + if (size < offsetofend(struct user_unreg, disable_addr)) + return -EINVAL; + + ret = copy_struct_from_user(kreg, sizeof(*kreg), ureg, size); + + /* Ensure no reserved values, since we don't support any yet */ + if (kreg->__reserved || kreg->__reserved2) + return -EINVAL; + + return ret; +} + +/* + * Unregisters an enablement address/bit within a task/user mm. + */ +static long user_events_ioctl_unreg(unsigned long uarg) +{ + struct user_unreg __user *ureg = (struct user_unreg __user *)uarg; + struct user_event_mm *mm = current->user_event_mm; + struct user_event_enabler *enabler, *next; + struct user_unreg reg; + long ret; + + ret = user_unreg_get(ureg, &reg); + + if (ret) + return ret; + + if (!mm) + return -ENOENT; + + ret = -ENOENT; + + /* + * Flags freeing and faulting are used to indicate if the enabler is in + * use at all. When faulting is set, a page fault is occurring asynchronously. + * During an async fault, if freeing is set, the enabler will be destroyed. + * If no async fault is happening, we can destroy it now since we hold + * the event_mutex during these checks. + */ + mutex_lock(&event_mutex); + + list_for_each_entry_safe(enabler, next, &mm->enablers, link) + if (enabler->addr == reg.disable_addr && + (enabler->values & ENABLE_VAL_BIT_MASK) == reg.disable_bit) { + set_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler)); + + if (!test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler))) + user_event_enabler_destroy(enabler); + + /* Removed at least one */ + ret = 0; + } + + mutex_unlock(&event_mutex); + + return ret; +} + /* * Handles the ioctl from user mode to register or alter operations.
*/ @@ -2108,6 +2195,12 @@ static long user_events_ioctl(struct file *file, unsigned int cmd, ret = user_events_ioctl_del(info, uarg); mutex_unlock(&group->reg_mutex); break; + + case DIAG_IOCSUNREG: + mutex_lock(&group->reg_mutex); + ret = user_events_ioctl_unreg(uarg); + mutex_unlock(&group->reg_mutex); + break; } return ret; -- cgit v1.2.3 From a4c40c1349e32f9510707ed09e0961626980d8cb Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Tue, 28 Mar 2023 16:52:19 -0700 Subject: tracing/user_events: Align structs with tabs for readability Add tabs to make struct members easier to read and unify the style of the code. Link: https://lkml.kernel.org/r/20230328235219.203-13-beaub@linux.microsoft.com Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- include/linux/user_events.h | 14 +++---- include/uapi/linux/user_events.h | 24 ++++++------ kernel/trace/trace_events_user.c | 82 ++++++++++++++++++++-------------------- 3 files changed, 60 insertions(+), 60 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/user_events.h b/include/linux/user_events.h index 0120b3dd5b03..2847f5a18a86 100644 --- a/include/linux/user_events.h +++ b/include/linux/user_events.h @@ -17,13 +17,13 @@ #ifdef CONFIG_USER_EVENTS struct user_event_mm { - struct list_head link; - struct list_head enablers; - struct mm_struct *mm; - struct user_event_mm *next; - refcount_t refcnt; - refcount_t tasks; - struct rcu_work put_rwork; + struct list_head link; + struct list_head enablers; + struct mm_struct *mm; + struct user_event_mm *next; + refcount_t refcnt; + refcount_t tasks; + struct rcu_work put_rwork; }; extern void user_event_mm_dup(struct task_struct *t, diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h index 3e7275e3234a..2984aae4a2b4 100644 --- a/include/uapi/linux/user_events.h +++ b/include/uapi/linux/user_events.h @@ -25,25 +25,25 @@ struct user_reg { /* Input: Size of the user_reg structure being used */ - __u32 size; + __u32 size; /* Input: Bit in enable address to use */ - __u8 enable_bit; + __u8 enable_bit; /* Input: Enable size in bytes at address */ - __u8 enable_size; + __u8 enable_size; /* Input: Flags for future use, set to 0 */ - __u16 flags; + __u16 flags; /* Input: Address to update when enabled */ - __u64 enable_addr; + __u64 enable_addr; /* Input: Pointer to string with event name, description and flags */ - __u64 name_args; + __u64 name_args; /* Output: Index of the event to use when writing data */ - __u32 write_index; + __u32 write_index; } __attribute__((__packed__)); /* @@ -52,19 +52,19 @@ struct user_reg { */ struct user_unreg { /* Input: Size of the user_unreg structure being used */ - __u32 size; + __u32 size; /* Input: Bit to unregister */ - __u8 disable_bit; + __u8 disable_bit; /* Input: Reserved, set to 0 */ - __u8 __reserved; + __u8 __reserved; /* Input: Reserved, set to 0 */ - __u16 __reserved2; + __u16 __reserved2; /* Input: Address to unregister */ - __u64 disable_addr; + __u64 disable_addr; } __attribute__((__packed__)); #define DIAG_IOC_MAGIC '*' diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 9b43a02e1597..67cb7b53caf6 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -53,9 +53,9 @@ * allows isolation for events by various means. 
*/ struct user_event_group { - char *system_name; - struct hlist_node node; - struct mutex reg_mutex; + char *system_name; + struct hlist_node node; + struct mutex reg_mutex; DECLARE_HASHTABLE(register_table, 8); }; @@ -76,17 +76,17 @@ static unsigned int current_user_events; * refcnt reaches one. */ struct user_event { - struct user_event_group *group; - struct tracepoint tracepoint; - struct trace_event_call call; - struct trace_event_class class; - struct dyn_event devent; - struct hlist_node node; - struct list_head fields; - struct list_head validators; - refcount_t refcnt; - int min_size; - char status; + struct user_event_group *group; + struct tracepoint tracepoint; + struct trace_event_call call; + struct trace_event_class class; + struct dyn_event devent; + struct hlist_node node; + struct list_head fields; + struct list_head validators; + refcount_t refcnt; + int min_size; + char status; }; /* @@ -95,12 +95,12 @@ struct user_event { * these to track enablement sites that are tied to an event. */ struct user_event_enabler { - struct list_head link; - struct user_event *event; - unsigned long addr; + struct list_head link; + struct user_event *event; + unsigned long addr; /* Track enable bit, flags, etc. Aligned for bitops. */ - unsigned int values; + unsigned int values; }; /* Bits 0-5 are for the bit to update upon enable/disable (0-63 allowed) */ @@ -119,9 +119,9 @@ struct user_event_enabler { /* Used for asynchronous faulting in of pages */ struct user_event_enabler_fault { - struct work_struct work; - struct user_event_mm *mm; - struct user_event_enabler *enabler; + struct work_struct work; + struct user_event_mm *mm; + struct user_event_enabler *enabler; }; static struct kmem_cache *fault_cache; @@ -137,23 +137,23 @@ static DEFINE_SPINLOCK(user_event_mms_lock); * These are not shared and only accessible by the file that created it. 
struct user_event_refs { - struct rcu_head rcu; - int count; - struct user_event *events[]; + struct rcu_head rcu; + int count; + struct user_event *events[]; }; struct user_event_file_info { - struct user_event_group *group; - struct user_event_refs *refs; + struct user_event_group *group; + struct user_event_refs *refs; }; #define VALIDATOR_ENSURE_NULL (1 << 0) #define VALIDATOR_REL (1 << 1) struct user_event_validator { - struct list_head link; - int offset; - int flags; + struct list_head link; + int offset; + int flags; }; typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i, @@ -2276,11 +2276,11 @@ out: } static const struct file_operations user_data_fops = { - .open = user_events_open, - .write = user_events_write, - .write_iter = user_events_write_iter, + .open = user_events_open, + .write = user_events_write, + .write_iter = user_events_write_iter, .unlocked_ioctl = user_events_ioctl, - .release = user_events_release, + .release = user_events_release, }; static void *user_seq_start(struct seq_file *m, loff_t *pos) @@ -2346,10 +2346,10 @@ static int user_seq_show(struct seq_file *m, void *p) } static const struct seq_operations user_seq_ops = { - .start = user_seq_start, - .next = user_seq_next, - .stop = user_seq_stop, - .show = user_seq_show, + .start = user_seq_start, + .next = user_seq_next, + .stop = user_seq_stop, + .show = user_seq_show, }; static int user_status_open(struct inode *node, struct file *file) @@ -2375,10 +2375,10 @@ static int user_status_open(struct inode *node, struct file *file) } static const struct file_operations user_status_fops = { - .open = user_status_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, + .open = user_status_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, }; /* -- cgit v1.2.3 From 30ec7997d175cd689fc61bfc4059f4d35b11858c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Mar 2023 18:47:47 +0100 Subject: KVM: arm64: timers: Allow userspace to set the global counter offset And this is the moment you have all been waiting for: setting the counter offset from userspace. We expose a brand new capability that reports the ability to set the offset for both the virtual and physical sides. In keeping with the architecture, the offset is expressed as a delta that is subtracted from the physical counter value. Once this new API is used, there is no going back, and the counters cannot be written to set the offsets implicitly (the writes are instead ignored).
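[Editor's note: a rough sketch of the intended userspace flow, assuming a VM fd obtained via KVM_CREATE_VM; the capability and ioctl names come from the diff below, while the helper itself is illustrative.]

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int set_counter_offset(int vm_fd, __u64 delta)
{
	struct kvm_arm_counter_offset off = {
		.counter_offset = delta,	/* subtracted from the physical counter */
		.reserved = 0,			/* must be zero, else -EINVAL */
	};

	/* Only available when KVM_CAP_COUNTER_OFFSET is reported */
	if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_COUNTER_OFFSET) <= 0)
		return -1;

	/* VM-wide: the offset applies to both virtual and physical views */
	return ioctl(vm_fd, KVM_ARM_SET_COUNTER_OFFSET, &off);
}

Note that, per the arch_timer.c hunks below, once this flag is set, later writes to KVM_REG_ARM_TIMER_CNT / KVM_REG_ARM_PTIMER_CNT no longer adjust the offsets.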
Reviewed-by: Colton Lewis Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230330174800.2677007-8-maz@kernel.org --- arch/arm64/include/asm/kvm_host.h | 4 +++ arch/arm64/include/uapi/asm/kvm.h | 9 +++++++ arch/arm64/kvm/arch_timer.c | 54 +++++++++++++++++++++++++++++++++++---- arch/arm64/kvm/arm.c | 8 ++++++ include/uapi/linux/kvm.h | 3 +++ 5 files changed, 73 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 002a10cbade2..116233a390e9 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -221,6 +221,8 @@ struct kvm_arch { #define KVM_ARCH_FLAG_EL1_32BIT 4 /* PSCI SYSTEM_SUSPEND enabled for the guest */ #define KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED 5 + /* VM counter offset */ +#define KVM_ARCH_FLAG_VM_COUNTER_OFFSET 6 unsigned long flags; @@ -1010,6 +1012,8 @@ int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, struct kvm_arm_copy_mte_tags *copy_tags); +int kvm_vm_ioctl_set_counter_offset(struct kvm *kvm, + struct kvm_arm_counter_offset *offset); /* Guest/host FPSIMD coordination helpers */ int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index f8129c624b07..12fb0d8a760a 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -198,6 +198,15 @@ struct kvm_arm_copy_mte_tags { __u64 reserved[2]; }; +/* + * Counter/Timer offset structure. Describe the virtual/physical offset. + * To be used with KVM_ARM_SET_COUNTER_OFFSET. + */ +struct kvm_arm_counter_offset { + __u64 counter_offset; + __u64 reserved; +}; + #define KVM_ARM_TAGS_TO_GUEST 0 #define KVM_ARM_TAGS_FROM_GUEST 1 diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index bb64a71ae193..771504c79711 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -851,9 +851,11 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) ptimer->vcpu = vcpu; ptimer->offset.vm_offset = &vcpu->kvm->arch.timer_data.poffset; - /* Synchronize cntvoff across all vtimers of a VM. 
*/ - timer_set_offset(vtimer, kvm_phys_timer_read()); - timer_set_offset(ptimer, 0); + /* Synchronize offsets across timers of a VM if not already provided */ + if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &vcpu->kvm->arch.flags)) { + timer_set_offset(vtimer, kvm_phys_timer_read()); + timer_set_offset(ptimer, 0); + } hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); timer->bg_timer.function = kvm_bg_timer_expire; @@ -897,8 +899,11 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value); break; case KVM_REG_ARM_TIMER_CNT: - timer = vcpu_vtimer(vcpu); - timer_set_offset(timer, kvm_phys_timer_read() - value); + if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, + &vcpu->kvm->arch.flags)) { + timer = vcpu_vtimer(vcpu); + timer_set_offset(timer, kvm_phys_timer_read() - value); + } break; case KVM_REG_ARM_TIMER_CVAL: timer = vcpu_vtimer(vcpu); @@ -908,6 +913,13 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) timer = vcpu_ptimer(vcpu); kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value); break; + case KVM_REG_ARM_PTIMER_CNT: + if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, + &vcpu->kvm->arch.flags)) { + timer = vcpu_ptimer(vcpu); + timer_set_offset(timer, kvm_phys_timer_read() - value); + } + break; case KVM_REG_ARM_PTIMER_CVAL: timer = vcpu_ptimer(vcpu); kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value); @@ -1443,3 +1455,35 @@ int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) return -ENXIO; } + +int kvm_vm_ioctl_set_counter_offset(struct kvm *kvm, + struct kvm_arm_counter_offset *offset) +{ + int ret = 0; + + if (offset->reserved) + return -EINVAL; + + mutex_lock(&kvm->lock); + + if (lock_all_vcpus(kvm)) { + set_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &kvm->arch.flags); + + /* + * If userspace decides to set the offset using this + * API rather than merely restoring the counter + * values, the offset applies to both the virtual and + * physical views. 
+ */ + kvm->arch.timer_data.voffset = offset->counter_offset; + kvm->arch.timer_data.poffset = offset->counter_offset; + + unlock_all_vcpus(kvm); + } else { + ret = -EBUSY; + } + + mutex_unlock(&kvm->lock); + + return ret; +} diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index ae5110cc3bad..1c8a4bbae684 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -220,6 +220,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_VCPU_ATTRIBUTES: case KVM_CAP_PTP_KVM: case KVM_CAP_ARM_SYSTEM_SUSPEND: + case KVM_CAP_COUNTER_OFFSET: r = 1; break; case KVM_CAP_SET_GUEST_DEBUG2: @@ -1479,6 +1480,13 @@ long kvm_arch_vm_ioctl(struct file *filp, return -EFAULT; return kvm_vm_ioctl_mte_copy_tags(kvm, &copy_tags); } + case KVM_ARM_SET_COUNTER_OFFSET: { + struct kvm_arm_counter_offset offset; + + if (copy_from_user(&offset, argp, sizeof(offset))) + return -EFAULT; + return kvm_vm_ioctl_set_counter_offset(kvm, &offset); + } default: return -EINVAL; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index d77aef872a0a..6a7e1a0ecf04 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1184,6 +1184,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_S390_PROTECTED_ASYNC_DISABLE 224 #define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 225 #define KVM_CAP_PMU_EVENT_MASKED_EVENTS 226 +#define KVM_CAP_COUNTER_OFFSET 227 #ifdef KVM_CAP_IRQ_ROUTING @@ -1543,6 +1544,8 @@ struct kvm_s390_ucas_mapping { #define KVM_SET_PMU_EVENT_FILTER _IOW(KVMIO, 0xb2, struct kvm_pmu_event_filter) #define KVM_PPC_SVM_OFF _IO(KVMIO, 0xb3) #define KVM_ARM_MTE_COPY_TAGS _IOR(KVMIO, 0xb4, struct kvm_arm_copy_mte_tags) +/* Available with KVM_CAP_COUNTER_OFFSET */ +#define KVM_ARM_SET_COUNTER_OFFSET _IOW(KVMIO, 0xb5, struct kvm_arm_counter_offset) /* ioctl for vm fd */ #define KVM_CREATE_DEVICE _IOWR(KVMIO, 0xe0, struct kvm_create_device) -- cgit v1.2.3 From 9e410fe3dc9a938bc47f71dff254be7419bd40d2 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 3 Mar 2023 13:34:11 -0800 Subject: dmaengine: idxd: Add descriptor definitions for 16 bytes of pattern in memory fill operation The memory fill operation (0x04) can fill memory with either an 8-byte or a 16-byte pattern. To fill memory with a 16-byte pattern, the first 8 bytes are provided in pattern lower in bytes 16-23 of the descriptor and the next 8 bytes in pattern upper in bytes 40-47. Currently, only the 8-byte pattern is enabled. Add descriptor definitions for pattern lower and pattern upper so that users can fill memory with a 16-byte pattern.
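[Editor's note: a hedged sketch of how a submitter might populate such a descriptor. DSA_OPCODE_MEMFILL and the field names are from include/uapi/linux/idxd.h; flag selection, completion-record setup and the ENQCMD submission path are omitted and depend on the application's portal mapping.]

#include <stdint.h>
#include <string.h>
#include <linux/idxd.h>

/* Prepare (but do not submit) a 16-byte-pattern memory fill. */
static void prep_fill_16(struct dsa_hw_desc *desc, void *dst,
			 uint32_t len, uint64_t pat_lo, uint64_t pat_hi)
{
	memset(desc, 0, sizeof(*desc));
	desc->opcode = DSA_OPCODE_MEMFILL;
	desc->dst_addr = (uint64_t)(uintptr_t)dst;
	desc->xfer_size = len;
	desc->pattern_lower = pat_lo;	/* descriptor bytes 16-23 */
	desc->pattern_upper = pat_hi;	/* descriptor bytes 40-47 */
}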
Signed-off-by: Fenghua Yu Reviewed-by: Dave Jiang Link: https://lore.kernel.org/r/20230303213413.3357431-2-fenghua.yu@intel.com Signed-off-by: Vinod Koul --- include/uapi/linux/idxd.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index 1d553bedbdb5..c43d7df5fc15 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -180,6 +180,7 @@ struct dsa_hw_desc { uint64_t rdback_addr; uint64_t pattern; uint64_t desc_list_addr; + uint64_t pattern_lower; }; union { uint64_t dst_addr; @@ -244,6 +245,9 @@ struct dsa_hw_desc { uint16_t dest_app_tag_seed; }; + /* Fill */ + uint64_t pattern_upper; + uint8_t op_specific[24]; }; } __attribute__((packed)); -- cgit v1.2.3 From 12bbc2c2605516e781cd86e3cde9fe1f889b72cc Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 3 Mar 2023 13:34:12 -0800 Subject: dmaengine: idxd: Add descriptor definitions for DIX generate operation The Data Integrity Extension (DIX) generate operation (0x17) computes the Data Integrity Field (DIF) on the source data and writes only the computed DIF for each source block to the PI destination address. Add descriptor definitions for this operation so that users can use DSA to accelerate the DIX generate operation. Signed-off-by: Fenghua Yu Reviewed-by: Dave Jiang Link: https://lore.kernel.org/r/20230303213413.3357431-3-fenghua.yu@intel.com Signed-off-by: Vinod Koul --- include/uapi/linux/idxd.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index c43d7df5fc15..4c12e93a6aa6 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -78,6 +78,7 @@ enum dsa_opcode { DSA_OPCODE_DIF_INS, DSA_OPCODE_DIF_STRP, DSA_OPCODE_DIF_UPDT, + DSA_OPCODE_DIX_GEN = 0x17, DSA_OPCODE_CFLUSH = 0x20, }; @@ -248,6 +249,17 @@ struct dsa_hw_desc { /* Fill */ uint64_t pattern_upper; + /* DIX generate */ + struct { + uint8_t dix_gen_res; + uint8_t dest_dif_flags; + uint8_t dif_flags; + uint8_t dix_gen_res2[13]; + uint32_t ref_tag_seed; + uint16_t app_tag_mask; + uint16_t app_tag_seed; + }; + uint8_t op_specific[24]; }; } __attribute__((packed)); @@ -326,6 +338,14 @@ struct dsa_completion_record { uint16_t dif_upd_dest_app_tag; }; + /* DIX generate */ + struct { + uint64_t dix_gen_res; + uint32_t dix_ref_tag; + uint16_t dix_app_tag_mask; + uint16_t dix_app_tag; + }; + uint8_t op_specific[16]; }; } __attribute__((packed)); -- cgit v1.2.3 From 6fec8938b7b4fe2b2c503fe87b2783a50bff0415 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 3 Mar 2023 13:34:13 -0800 Subject: dmaengine: idxd: Add descriptor definitions for translation fetch operation The translation fetch operation (0x0A) fetches address translations for the address range specified in the descriptor by issuing address translation (ATS) requests to the IOMMU. Add descriptor definitions for the operation so that users can use DSA to accelerate translation fetch.
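[Editor's note: by analogy with the fill sketch above (and assuming the same includes), a translation fetch descriptor might be populated as below; the field names follow this diff, while the stride semantics and submission path are assumptions.]

/* Prefetch IOMMU translations for a buffer (sketch, not submitted). */
static void prep_transl_fetch(struct dsa_hw_desc *desc, void *buf,
			      uint32_t size, uint32_t stride)
{
	memset(desc, 0, sizeof(*desc));
	desc->opcode = DSA_OPCODE_TRANSL_FETCH;
	desc->transl_fetch_addr = (uint64_t)(uintptr_t)buf;
	desc->region_size = size;	/* aliases xfer_size in the descriptor */
	desc->region_stride = stride;	/* spacing between fetched addresses */
}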
Signed-off-by: Fenghua Yu Reviewed-by: Dave Jiang Link: https://lore.kernel.org/r/20230303213413.3357431-4-fenghua.yu@intel.com Signed-off-by: Vinod Koul --- include/uapi/linux/idxd.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index 4c12e93a6aa6..fc47635b57dc 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -72,6 +72,7 @@ enum dsa_opcode { DSA_OPCODE_CR_DELTA, DSA_OPCODE_AP_DELTA, DSA_OPCODE_DUALCAST, + DSA_OPCODE_TRANSL_FETCH, DSA_OPCODE_CRCGEN = 0x10, DSA_OPCODE_COPY_CRC, DSA_OPCODE_DIF_CHECK, @@ -182,6 +183,7 @@ struct dsa_hw_desc { uint64_t pattern; uint64_t desc_list_addr; uint64_t pattern_lower; + uint64_t transl_fetch_addr; }; union { uint64_t dst_addr; @@ -192,6 +194,7 @@ struct dsa_hw_desc { union { uint32_t xfer_size; uint32_t desc_count; + uint32_t region_size; }; uint16_t int_handle; uint16_t rsvd1; @@ -249,6 +252,12 @@ struct dsa_hw_desc { /* Fill */ uint64_t pattern_upper; + /* Translation fetch */ + struct { + uint64_t transl_fetch_res; + uint32_t region_stride; + }; + /* DIX generate */ struct { uint8_t dix_gen_res; -- cgit v1.2.3 From e65733b5c59a1ea20324a03494364958bef3fc68 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 4 Apr 2023 15:40:38 +0000 Subject: KVM: x86: Redefine 'longmode' as a flag for KVM_EXIT_HYPERCALL The 'longmode' field is a bit annoying as it blows an entire __u32 to represent a boolean value. Since other architectures are looking to add support for KVM_EXIT_HYPERCALL, now is probably a good time to clean it up. Redefine the field (and the remaining padding) as a set of flags. Preserve the existing ABI by using bit 0 to indicate if the guest was in long mode and requiring that the remaining 31 bits must be zero. Cc: Paolo Bonzini Acked-by: Sean Christopherson Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230404154050.2270077-2-oliver.upton@linux.dev --- Documentation/virt/kvm/api.rst | 3 +-- arch/x86/include/asm/kvm_host.h | 7 +++++++ arch/x86/include/uapi/asm/kvm.h | 3 +++ arch/x86/kvm/x86.c | 6 +++++- include/uapi/linux/kvm.h | 9 +++++++-- 5 files changed, 23 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 62de0768d6aa..9b01e3d0e757 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6218,8 +6218,7 @@ to the byte array. __u64 nr; __u64 args[6]; __u64 ret; - __u32 longmode; - __u32 pad; + __u64 flags; } hypercall; Unused. This was once used for 'hypercall to userspace'. To implement diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 808c292ad3f4..15bda40517ff 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2204,4 +2204,11 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \ KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) +/* + * KVM previously used a u32 field in kvm_run to indicate the hypercall was + * initiated from long mode. KVM now sets bit 0 to indicate long mode, but the + * remaining 31 lower bits must be 0 to preserve ABI. 
+ */ +#define KVM_EXIT_HYPERCALL_MBZ GENMASK_ULL(31, 1) + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 7f467fe05d42..1a6a1f987949 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -559,4 +559,7 @@ struct kvm_pmu_event_filter { #define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */ #define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */ +/* x86-specific KVM_EXIT_HYPERCALL flags. */ +#define KVM_EXIT_HYPERCALL_LONG_MODE BIT(0) + #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7713420abab0..27a1d5c1a018 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9803,7 +9803,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) vcpu->run->hypercall.args[0] = gpa; vcpu->run->hypercall.args[1] = npages; vcpu->run->hypercall.args[2] = attrs; - vcpu->run->hypercall.longmode = op_64_bit; + vcpu->run->hypercall.flags = 0; + if (op_64_bit) + vcpu->run->hypercall.flags |= KVM_EXIT_HYPERCALL_LONG_MODE; + + WARN_ON_ONCE(vcpu->run->hypercall.flags & KVM_EXIT_HYPERCALL_MBZ); vcpu->arch.complete_userspace_io = complete_hypercall_exit; return 0; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index d77aef872a0a..dd42d7dfb86c 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -341,8 +341,13 @@ struct kvm_run { __u64 nr; __u64 args[6]; __u64 ret; - __u32 longmode; - __u32 pad; + + union { +#ifndef __KERNEL__ + __u32 longmode; +#endif + __u64 flags; + }; } hypercall; /* KVM_EXIT_TPR_ACCESS */ struct { -- cgit v1.2.3 From 2bad466cc9d9b4c3b4b16eb9c03c919b59561316 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 9 Mar 2023 17:37:10 -0500 Subject: mm/uffd: UFFD_FEATURE_WP_UNPOPULATED Patch series "mm/uffd: Add feature bit UFFD_FEATURE_WP_UNPOPULATED", v4. The new feature bit makes anonymous memory act the same as file memory on userfaultfd-wp in that it'll also wr-protect none ptes. It can be useful in two cases: (1) Uffd-wp apps that need to wr-protect none ptes, like QEMU snapshot, so the pre-fault can be replaced by enabling this flag, speeding up protections (2) It helps to implement the async uffd-wp mode that Muhammad is working on [1] It's debatable whether this is the ideal solution, because with the new feature bit set, wr-protecting a none pte needs to pre-populate the pgtables to the last level (PAGE_SIZE). But it seems fine so far to serve either purpose above, so we can leave optimizations for later. The series brings pte markers to anonymous memory too. There's some change in the common mm code path in the 1st patch; it would be great to have some eyes on it, but hopefully it's still relatively straightforward. This patch (of 2): This is a new feature that controls how uffd-wp handles none ptes. When it's set, the kernel will handle anonymous memory the same way as file memory, by allowing the user to wr-protect unpopulated ptes. File memory handles none ptes consistently by allowing them to be wr-protected, because there is no way to know whether a page cache page exists or not. For anonymous memory it was not as consistent, because we used to assume that we don't need protections on none ptes or known zero pages. One use case of such a feature bit was VM live snapshot, where without wr-protecting empty ptes the snapshot can contain random rubbish in the holes of the anonymous memory, which can cause the guest to misbehave when the guest OS assumes the pages should be all zeros.
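[Editor's note: opting in is a one-time feature handshake on the uffd; a minimal sketch, assuming a kernel that reports the bit. Only UFFD_FEATURE_WP_UNPOPULATED comes from this patch; the rest is standard userfaultfd setup.]

#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

static int uffd_open_wp_unpopulated(void)
{
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	struct uffdio_api api = {
		.api = UFFD_API,
		.features = UFFD_FEATURE_WP_UNPOPULATED,
	};

	if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api) < 0)
		return -1;

	/* From here, UFFDIO_WRITEPROTECT also wr-protects none ptes */
	return uffd;
}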
QEMU worked around it by pre-populating the section with reads to fill in zero page entries before starting the whole snapshot process [1]. Recently there's another need raised on using userfaultfd wr-protect for detecting dirty pages (to replace soft-dirty in some cases) [2]. In that case, without being able to wr-protect none ptes by default, the dirty info can get lost, since we cannot treat every none pte as dirty (the current design identifies a page as dirty based on the uffd-wp bit being cleared). In general, we want to be able to wr-protect empty ptes too, even for anonymous memory. This patch implements UFFD_FEATURE_WP_UNPOPULATED so that uffd-wp handling of none ptes is consistent no matter what the memory type is underneath. It doesn't have any impact on file memory so far because we already have pte markers taking care of that. So it only affects anonymous memory. The feature bit is off by default, so the old behavior will be maintained. Sometimes that may be wanted, because wr-protecting none ptes adds overhead not only during UFFDIO_WRITEPROTECT (by applying pte markers to anonymous memory), but also when creating the pgtables to store the pte markers. So there's potentially less chance of using thp on the first fault for a none pmd or larger than a pmd. The major implementation part is teaching the whole kernel to understand pte markers even for anonymously mapped ranges, meanwhile allowing the UFFDIO_WRITEPROTECT ioctl to apply pte markers to anonymous memory too when the new feature bit is set. Note that even if the patch subject starts with mm/uffd, there are a few small refactors to the major mm path of handling anonymous page faults. But they should be straightforward. With WP_UNPOPULATED, applications like QEMU can avoid pre-read faulting all the memory before wr-protecting when taking a live snapshot. Quoting from Muhammad's test result here [3], based on a simple program [4]:

(1) With huge page disabled
echo madvise > /sys/kernel/mm/transparent_hugepage/enabled
./uffd_wp_perf
Test DEFAULT: 4
Test PRE-READ: 1111453 (pre-fault 1101011)
Test MADVISE: 278276 (pre-fault 266378)
Test WP-UNPOPULATE: 11712

(2) With huge page enabled
echo always > /sys/kernel/mm/transparent_hugepage/enabled
./uffd_wp_perf
Test DEFAULT: 4
Test PRE-READ: 22521 (pre-fault 22348)
Test MADVISE: 4909 (pre-fault 4743)
Test WP-UNPOPULATE: 14448

There'll be a great perf boost for the no-thp case, while for the thp-enabled extreme case of all-thp-zero, WP_UNPOPULATED can be slower than MADVISE, but that's a low-probability case in reality; also, the overhead is not reduced but postponed until a follow-up write on any huge zero thp, so potentially it is faster overall by making the follow-up writes slower.
[1] https://lore.kernel.org/all/20210401092226.102804-4-andrey.gruzdev@virtuozzo.com/ [2] https://lore.kernel.org/all/Y+v2HJ8+3i%2FKzDBu@x1n/ [3] https://lore.kernel.org/all/d0eb0a13-16dc-1ac1-653a-78b7273781e3@collabora.com/ [4] https://github.com/xzpeter/clibs/blob/master/uffd-test/uffd-wp-perf.c [peterx@redhat.com: comment changes, oneliner fix to khugepaged] Link: https://lkml.kernel.org/r/ZB2/8jPhD3fpx5U8@x1n Link: https://lkml.kernel.org/r/20230309223711.823547-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20230309223711.823547-2-peterx@redhat.com Signed-off-by: Peter Xu Acked-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Mike Rapoport Cc: Muhammad Usama Anjum Cc: Nadav Amit Cc: Paul Gofman Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/userfaultfd.rst | 17 +++++++++ fs/userfaultfd.c | 16 ++++++++ include/linux/mm_inline.h | 6 +++ include/linux/userfaultfd_k.h | 23 ++++++++++++ include/uapi/linux/userfaultfd.h | 10 ++++- mm/khugepaged.c | 2 +- mm/memory.c | 56 +++++++++++++++++++++------- mm/mprotect.c | 51 ++++++++++++++++++++----- 8 files changed, 155 insertions(+), 26 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst index 7dc823b56ca4..bd2226299583 100644 --- a/Documentation/admin-guide/mm/userfaultfd.rst +++ b/Documentation/admin-guide/mm/userfaultfd.rst @@ -219,6 +219,23 @@ former will have ``UFFD_PAGEFAULT_FLAG_WP`` set, the latter you still need to supply a page when ``UFFDIO_REGISTER_MODE_MISSING`` was used. +Userfaultfd write-protect mode currently behave differently on none ptes +(when e.g. page is missing) over different types of memories. + +For anonymous memory, ``ioctl(UFFDIO_WRITEPROTECT)`` will ignore none ptes +(e.g. when pages are missing and not populated). For file-backed memories +like shmem and hugetlbfs, none ptes will be write protected just like a +present pte. In other words, there will be a userfaultfd write fault +message generated when writing to a missing page on file typed memories, +as long as the page range was write-protected before. Such a message will +not be generated on anonymous memories by default. + +If the application wants to be able to write protect none ptes on anonymous +memory, one can pre-populate the memory with e.g. MADV_POPULATE_READ. On +newer kernels, one can also detect the feature UFFD_FEATURE_WP_UNPOPULATED +and set the feature bit in advance to make sure none ptes will also be +write protected even upon anonymous memory. + QEMU/KVM ======== diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 44d1ee429eb0..881e9c82b9d1 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -108,6 +108,21 @@ static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx) return ctx->features & UFFD_FEATURE_INITIALIZED; } +/* + * Whether WP_UNPOPULATED is enabled on the uffd context. It is only + * meaningful when userfaultfd_wp()==true on the vma and when it's + * anonymous. 
+ */ +bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma) +{ + struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; + + if (!ctx) + return false; + + return ctx->features & UFFD_FEATURE_WP_UNPOPULATED; +} + static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, vm_flags_t flags) { @@ -1971,6 +1986,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, #endif #ifndef CONFIG_PTE_MARKER_UFFD_WP uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM; + uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED; #endif uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index de1e622dd366..0e1d239a882c 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -557,6 +557,12 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, /* The current status of the pte should be "cleared" before calling */ WARN_ON_ONCE(!pte_none(*pte)); + /* + * NOTE: userfaultfd_wp_unpopulated() doesn't need this whole + * thing, because when zapping either it means it's dropping the + * page, or in TTU where the present pte will be quickly replaced + * with a swap pte. There's no way of leaking the bit. + */ if (vma_is_anonymous(vma) || !userfaultfd_wp(vma)) return; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 3767f18114ef..0cf8880219da 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -179,6 +179,7 @@ extern int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start, unsigned long end, struct list_head *uf); extern void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf); +extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma); #else /* CONFIG_USERFAULTFD */ @@ -274,8 +275,30 @@ static inline bool uffd_disable_fault_around(struct vm_area_struct *vma) return false; } +static inline bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma) +{ + return false; +} + #endif /* CONFIG_USERFAULTFD */ +static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) +{ + /* Only wr-protect mode uses pte markers */ + if (!userfaultfd_wp(vma)) + return false; + + /* File-based uffd-wp always need markers */ + if (!vma_is_anonymous(vma)) + return true; + + /* + * Anonymous uffd-wp only needs the markers if WP_UNPOPULATED + * enabled (to apply markers on zero pages). + */ + return userfaultfd_wp_unpopulated(vma); +} + static inline bool pte_marker_entry_uffd_wp(swp_entry_t entry) { #ifdef CONFIG_PTE_MARKER_UFFD_WP diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 005e5e306266..90c958952bfc 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -38,7 +38,8 @@ UFFD_FEATURE_MINOR_HUGETLBFS | \ UFFD_FEATURE_MINOR_SHMEM | \ UFFD_FEATURE_EXACT_ADDRESS | \ - UFFD_FEATURE_WP_HUGETLBFS_SHMEM) + UFFD_FEATURE_WP_HUGETLBFS_SHMEM | \ + UFFD_FEATURE_WP_UNPOPULATED) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -203,6 +204,12 @@ struct uffdio_api { * * UFFD_FEATURE_WP_HUGETLBFS_SHMEM indicates that userfaultfd * write-protection mode is supported on both shmem and hugetlbfs. + * + * UFFD_FEATURE_WP_UNPOPULATED indicates that userfaultfd + * write-protection mode will always apply to unpopulated pages + * (i.e. empty ptes). 
This will be the default behavior for shmem + * & hugetlbfs, so this flag only affects anonymous memory behavior + * when userfault write-protection mode is registered. */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) @@ -217,6 +224,7 @@ struct uffdio_api { #define UFFD_FEATURE_MINOR_SHMEM (1<<10) #define UFFD_FEATURE_EXACT_ADDRESS (1<<11) #define UFFD_FEATURE_WP_HUGETLBFS_SHMEM (1<<12) +#define UFFD_FEATURE_WP_UNPOPULATED (1<<13) __u64 features; __u64 ioctls; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 074ea534f786..c7317678cb10 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1177,7 +1177,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, * enabled swap entries. Please see * comment below for pte_uffd_wp(). */ - if (pte_swp_uffd_wp(pteval)) { + if (pte_swp_uffd_wp_any(pteval)) { result = SCAN_PTE_UFFD_WP; goto out_unmap; } diff --git a/mm/memory.c b/mm/memory.c index 6285cad1f4fb..a890b2951b53 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -104,6 +104,20 @@ EXPORT_SYMBOL(mem_map); #endif static vm_fault_t do_fault(struct vm_fault *vmf); +static vm_fault_t do_anonymous_page(struct vm_fault *vmf); +static bool vmf_pte_changed(struct vm_fault *vmf); + +/* + * Return true if the original pte was a uffd-wp pte marker (so the pte was + * wr-protected). + */ +static bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf) +{ + if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)) + return false; + + return pte_marker_uffd_wp(vmf->orig_pte); +} /* * A number of key systems in x86 including ioremap() rely on the assumption @@ -1346,6 +1360,10 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, struct zap_details *details, pte_t pteval) { + /* Zap on anonymous always means dropping everything */ + if (vma_is_anonymous(vma)) + return; + if (zap_drop_file_uffd_wp(details)) return; @@ -1452,8 +1470,12 @@ again: continue; rss[mm_counter(page)]--; } else if (pte_marker_entry_uffd_wp(entry)) { - /* Only drop the uffd-wp marker if explicitly requested */ - if (!zap_drop_file_uffd_wp(details)) + /* + * For anon: always drop the marker; for file: only + * drop the marker if explicitly requested. + */ + if (!vma_is_anonymous(vma) && + !zap_drop_file_uffd_wp(details)) continue; } else if (is_hwpoison_entry(entry) || is_swapin_error_entry(entry)) { @@ -3620,6 +3642,14 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf) return 0; } +static vm_fault_t do_pte_missing(struct vm_fault *vmf) +{ + if (vma_is_anonymous(vmf->vma)) + return do_anonymous_page(vmf); + else + return do_fault(vmf); +} + /* * This is actually a page-missing access, but with uffd-wp special pte * installed. It means this pte was wr-protected before being unmapped. @@ -3630,11 +3660,10 @@ static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf) * Just in case there're leftover special ptes even after the region * got unregistered - we can simply clear them. 
*/ - if (unlikely(!userfaultfd_wp(vmf->vma) || vma_is_anonymous(vmf->vma))) + if (unlikely(!userfaultfd_wp(vmf->vma))) return pte_marker_clear(vmf); - /* do_fault() can handle pte markers too like none pte */ - return do_fault(vmf); + return do_pte_missing(vmf); } static vm_fault_t handle_pte_marker(struct vm_fault *vmf) @@ -3999,6 +4028,7 @@ out_release: */ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { + bool uffd_wp = vmf_orig_pte_uffd_wp(vmf); struct vm_area_struct *vma = vmf->vma; struct folio *folio; vm_fault_t ret = 0; @@ -4032,7 +4062,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) vma->vm_page_prot)); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (!pte_none(*vmf->pte)) { + if (vmf_pte_changed(vmf)) { update_mmu_tlb(vma, vmf->address, vmf->pte); goto unlock; } @@ -4072,7 +4102,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (!pte_none(*vmf->pte)) { + if (vmf_pte_changed(vmf)) { update_mmu_tlb(vma, vmf->address, vmf->pte); goto release; } @@ -4092,6 +4122,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) folio_add_new_anon_rmap(folio, vma, vmf->address); folio_add_lru_vma(folio, vma); setpte: + if (uffd_wp) + entry = pte_mkuffd_wp(entry); set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); /* No need to invalidate - it was non-present before */ @@ -4259,7 +4291,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) { struct vm_area_struct *vma = vmf->vma; - bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte); + bool uffd_wp = vmf_orig_pte_uffd_wp(vmf); bool write = vmf->flags & FAULT_FLAG_WRITE; bool prefault = vmf->address != addr; pte_t entry; @@ -4903,12 +4935,8 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) } } - if (!vmf->pte) { - if (vma_is_anonymous(vmf->vma)) - return do_anonymous_page(vmf); - else - return do_fault(vmf); - } + if (!vmf->pte) + return do_pte_missing(vmf); if (!pte_present(vmf->orig_pte)) return do_swap_page(vmf); diff --git a/mm/mprotect.c b/mm/mprotect.c index 13e84d8c0797..b9da9a5f87fe 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -276,7 +276,15 @@ static long change_pte_range(struct mmu_gather *tlb, } else { /* It must be an none page, or what else?.. */ WARN_ON_ONCE(!pte_none(oldpte)); - if (unlikely(uffd_wp && !vma_is_anonymous(vma))) { + + /* + * Nobody plays with any none ptes besides + * userfaultfd when applying the protections. + */ + if (likely(!uffd_wp)) + continue; + + if (userfaultfd_wp_use_markers(vma)) { /* * For file-backed mem, we need to be able to * wr-protect a none pte, because even if the @@ -320,23 +328,46 @@ static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd) return 0; } -/* Return true if we're uffd wr-protecting file-backed memory, or false */ +/* + * Return true if we want to split THPs into PTE mappings in change + * protection procedure, false otherwise. + */ static inline bool -uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags) +pgtable_split_needed(struct vm_area_struct *vma, unsigned long cp_flags) { + /* + * pte markers only resides in pte level, if we need pte markers, + * we need to split. We cannot wr-protect shmem thp because file + * thp is handled differently when split by erasing the pmd so far. 
+ */ return (cp_flags & MM_CP_UFFD_WP) && !vma_is_anonymous(vma); } /* - * If wr-protecting the range for file-backed, populate pgtable for the case - * when pgtable is empty but page cache exists. When {pte|pmd|...}_alloc() - * failed we treat it the same way as pgtable allocation failures during - * page faults by kicking OOM and returning error. + * Return true if we want to populate pgtables in change protection + * procedure, false otherwise + */ +static inline bool +pgtable_populate_needed(struct vm_area_struct *vma, unsigned long cp_flags) +{ + /* If not within ioctl(UFFDIO_WRITEPROTECT), then don't bother */ + if (!(cp_flags & MM_CP_UFFD_WP)) + return false; + + /* Populate if the userfaultfd mode requires pte markers */ + return userfaultfd_wp_use_markers(vma); +} + +/* + * Populate the pgtable underneath for whatever reason if requested. + * When {pte|pmd|...}_alloc() failed we treat it the same way as pgtable + * allocation failures during page faults by kicking OOM and returning + * error. */ #define change_pmd_prepare(vma, pmd, cp_flags) \ ({ \ long err = 0; \ - if (unlikely(uffd_wp_protect_file(vma, cp_flags))) { \ + if (unlikely(pgtable_populate_needed(vma, cp_flags))) { \ if (pte_alloc(vma->vm_mm, pmd)) \ err = -ENOMEM; \ } \ @@ -351,7 +382,7 @@ uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags) #define change_prepare(vma, high, low, addr, cp_flags) \ ({ \ long err = 0; \ - if (unlikely(uffd_wp_protect_file(vma, cp_flags))) { \ + if (unlikely(pgtable_populate_needed(vma, cp_flags))) { \ low##_t *p = low##_alloc(vma->vm_mm, high, addr); \ if (p == NULL) \ err = -ENOMEM; \ @@ -404,7 +435,7 @@ static inline long change_pmd_range(struct mmu_gather *tlb, if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if ((next - addr != HPAGE_PMD_SIZE) || - uffd_wp_protect_file(vma, cp_flags)) { + pgtable_split_needed(vma, cp_flags)) { __split_huge_pmd(vma, pmd, addr, false, NULL); /* * For file-backed, the pmd could have been -- cgit v1.2.3 From 0289184476c845968ad6ac9083c96cc0f75ca505 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 14 Mar 2023 15:12:50 -0700 Subject: mm: userfaultfd: add UFFDIO_CONTINUE_MODE_WP to install WP PTEs UFFDIO_COPY already has UFFDIO_COPY_MODE_WP, so when installing a new PTE to resolve a missing fault, one can install a write-protected one. This is useful when using UFFDIO_REGISTER_MODE_{MISSING,WP} in combination. This was motivated by testing HugeTLB HGM [1], and in particular its interaction with userfaultfd features. Existing userfaultfd code supports using WP and MINOR modes together (i.e. you can register an area with both enabled), but without this CONTINUE flag the combination is in practice unusable. So, add an analogous UFFDIO_CONTINUE_MODE_WP, which does the same thing as UFFDIO_COPY_MODE_WP, but for *minor* faults. Update the selftest to do some very basic exercising of the new flag. Update Documentation/ to describe how these flags are used (neither the COPY nor the new CONTINUE versions of this mode flag were described there before). [1]: https://patchwork.kernel.org/project/linux-mm/cover/20230218002819.1486479-1-jthoughton@google.com/ Link: https://lkml.kernel.org/r/20230314221250.682452-5-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Acked-by: Peter Xu Acked-by: Mike Rapoport (IBM) Cc: Al Viro Cc: Hugh Dickins Cc: Jan Kara Cc: Liam R. 
Howlett Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Nadav Amit Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/userfaultfd.rst | 8 ++++++++ fs/userfaultfd.c | 8 ++++++-- include/linux/userfaultfd_k.h | 3 ++- include/uapi/linux/userfaultfd.h | 7 +++++++ mm/userfaultfd.c | 5 +++-- tools/testing/selftests/mm/userfaultfd.c | 4 ++++ 6 files changed, 30 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst index bd2226299583..7c304e432205 100644 --- a/Documentation/admin-guide/mm/userfaultfd.rst +++ b/Documentation/admin-guide/mm/userfaultfd.rst @@ -236,6 +236,14 @@ newer kernels, one can also detect the feature UFFD_FEATURE_WP_UNPOPULATED and set the feature bit in advance to make sure none ptes will also be write protected even upon anonymous memory. +When using ``UFFDIO_REGISTER_MODE_WP`` in combination with either +``UFFDIO_REGISTER_MODE_MISSING`` or ``UFFDIO_REGISTER_MODE_MINOR``, when +resolving missing / minor faults with ``UFFDIO_COPY`` or ``UFFDIO_CONTINUE`` +respectively, it may be desirable for the new page / mapping to be +write-protected (so future writes will also result in a WP fault). These ioctls +support a mode flag (``UFFDIO_COPY_MODE_WP`` or ``UFFDIO_CONTINUE_MODE_WP`` +respectively) to configure the mapping this way. + QEMU/KVM ======== diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 8971c3613cc6..8395605790f6 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1893,6 +1893,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) struct uffdio_continue uffdio_continue; struct uffdio_continue __user *user_uffdio_continue; struct userfaultfd_wake_range range; + uffd_flags_t flags = 0; user_uffdio_continue = (struct uffdio_continue __user *)arg; @@ -1917,13 +1918,16 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) uffdio_continue.range.start) { goto out; } - if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE) + if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE | + UFFDIO_CONTINUE_MODE_WP)) goto out; + if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP) + flags |= MFILL_ATOMIC_WP; if (mmget_not_zero(ctx->mm)) { ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start, uffdio_continue.range.len, - &ctx->mmap_changing); + &ctx->mmap_changing, flags); mmput(ctx->mm); } else { return -ESRCH; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 4c477dece540..a2c53e98dfd6 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -83,7 +83,8 @@ extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long len, atomic_t *mmap_changing); extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long len, atomic_t *mmap_changing); + unsigned long len, atomic_t *mmap_changing, + uffd_flags_t flags); extern int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool enable_wp, atomic_t *mmap_changing); diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 90c958952bfc..66dd4cd277bd 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -305,6 +305,13 @@ struct uffdio_writeprotect { struct uffdio_continue { struct uffdio_range range; #define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0) + /* + * UFFDIO_CONTINUE_MODE_WP will map 
the page write protected on + * the fly. UFFDIO_CONTINUE_MODE_WP is available only if the + * write protected ioctl is implemented for the range + * according to the uffdio_register.ioctls. + */ +#define UFFDIO_CONTINUE_MODE_WP ((__u64)1<<1) __u64 mode; /* diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index a9b19b39413d..7f1b5f8b712c 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -693,10 +693,11 @@ ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start, } ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, atomic_t *mmap_changing) + unsigned long len, atomic_t *mmap_changing, + uffd_flags_t flags) { return mfill_atomic(dst_mm, start, 0, len, mmap_changing, - uffd_flags_set_mode(0, MFILL_ATOMIC_CONTINUE)); + uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE)); } long uffd_wp_range(struct vm_area_struct *dst_vma, diff --git a/tools/testing/selftests/mm/userfaultfd.c b/tools/testing/selftests/mm/userfaultfd.c index e030d63c031a..a96d126cb40e 100644 --- a/tools/testing/selftests/mm/userfaultfd.c +++ b/tools/testing/selftests/mm/userfaultfd.c @@ -585,6 +585,8 @@ static void continue_range(int ufd, __u64 start, __u64 len) req.range.start = start; req.range.len = len; req.mode = 0; + if (test_uffdio_wp) + req.mode |= UFFDIO_CONTINUE_MODE_WP; if (ioctl(ufd, UFFDIO_CONTINUE, &req)) err("UFFDIO_CONTINUE failed for address 0x%" PRIx64, @@ -1332,6 +1334,8 @@ static int userfaultfd_minor_test(void) uffdio_register.range.start = (unsigned long)area_dst_alias; uffdio_register.range.len = nr_pages * page_size; uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR; + if (test_uffdio_wp) + uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) err("register failure"); -- cgit v1.2.3 From 244da66cda359227d80ccb41dbcb99da40eae186 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 7 Apr 2023 13:31:30 -0700 Subject: dmaengine: idxd: setup event log configuration Add setup of the event log feature for supported devices. The event log addresses a gap in error reporting on gen 1 DSA devices, where a second error event does not get reported while a first event is pending software handling. The event log provides a circular buffer that the device can push error events to. It is up to the user to create a large enough event log ring in order to capture the expected events. The evl size can be set via the device sysfs attribute. By default, the minimum of 64 entries is supported when the event log is enabled.
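[Editor's note: the sizing arithmetic follows from the evl_ent_size()/evl_size() helpers added in this patch; the helper name below is illustrative.]

/* Entry size is derived from the GENCAP evl_support field; the total
 * ring size is the configured entry count times this value. */
static unsigned int evl_entry_bytes(unsigned int evl_support)
{
	return evl_support ? 32 * (1U << evl_support) : 0;
}

/* e.g. evl_support == 1 gives 64-byte entries; with the default
 * IDXD_EVL_SIZE_MIN of 64 entries the ring occupies 4 KiB. */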
Tested-by: Tony Zhu Signed-off-by: Dave Jiang Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Link: https://lore.kernel.org/r/20230407203143.2189681-4-fenghua.yu@intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/device.c | 89 +++++++++++++++++++++++++++++++++++++++++++- drivers/dma/idxd/idxd.h | 19 ++++++++++ drivers/dma/idxd/init.c | 1 + drivers/dma/idxd/registers.h | 72 ++++++++++++++++++++++++++++++++++- drivers/dma/idxd/sysfs.c | 3 +- include/uapi/linux/idxd.h | 1 + 6 files changed, 181 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 5f321f3b4242..230fe9bb56ae 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -752,6 +752,83 @@ void idxd_device_clear_state(struct idxd_device *idxd) spin_unlock(&idxd->dev_lock); } +static int idxd_device_evl_setup(struct idxd_device *idxd) +{ + union gencfg_reg gencfg; + union evlcfg_reg evlcfg; + union genctrl_reg genctrl; + struct device *dev = &idxd->pdev->dev; + void *addr; + dma_addr_t dma_addr; + int size; + struct idxd_evl *evl = idxd->evl; + + if (!evl) + return 0; + + size = evl_size(idxd); + /* + * Address needs to be page aligned. However, dma_alloc_coherent() provides + * at minimal page size aligned address. No manual alignment required. + */ + addr = dma_alloc_coherent(dev, size, &dma_addr, GFP_KERNEL); + if (!addr) + return -ENOMEM; + + memset(addr, 0, size); + + spin_lock(&evl->lock); + evl->log = addr; + evl->dma = dma_addr; + evl->log_size = size; + + memset(&evlcfg, 0, sizeof(evlcfg)); + evlcfg.bits[0] = dma_addr & GENMASK(63, 12); + evlcfg.size = evl->size; + + iowrite64(evlcfg.bits[0], idxd->reg_base + IDXD_EVLCFG_OFFSET); + iowrite64(evlcfg.bits[1], idxd->reg_base + IDXD_EVLCFG_OFFSET + 8); + + genctrl.bits = ioread32(idxd->reg_base + IDXD_GENCTRL_OFFSET); + genctrl.evl_int_en = 1; + iowrite32(genctrl.bits, idxd->reg_base + IDXD_GENCTRL_OFFSET); + + gencfg.bits = ioread32(idxd->reg_base + IDXD_GENCFG_OFFSET); + gencfg.evl_en = 1; + iowrite32(gencfg.bits, idxd->reg_base + IDXD_GENCFG_OFFSET); + + spin_unlock(&evl->lock); + return 0; +} + +static void idxd_device_evl_free(struct idxd_device *idxd) +{ + union gencfg_reg gencfg; + union genctrl_reg genctrl; + struct device *dev = &idxd->pdev->dev; + struct idxd_evl *evl = idxd->evl; + + gencfg.bits = ioread32(idxd->reg_base + IDXD_GENCFG_OFFSET); + if (!gencfg.evl_en) + return; + + spin_lock(&evl->lock); + gencfg.evl_en = 0; + iowrite32(gencfg.bits, idxd->reg_base + IDXD_GENCFG_OFFSET); + + genctrl.bits = ioread32(idxd->reg_base + IDXD_GENCTRL_OFFSET); + genctrl.evl_int_en = 0; + iowrite32(genctrl.bits, idxd->reg_base + IDXD_GENCTRL_OFFSET); + + iowrite64(0, idxd->reg_base + IDXD_EVLCFG_OFFSET); + iowrite64(0, idxd->reg_base + IDXD_EVLCFG_OFFSET + 8); + + dma_free_coherent(dev, evl->log_size, evl->log, evl->dma); + evl->log = NULL; + evl->size = IDXD_EVL_SIZE_MIN; + spin_unlock(&evl->lock); +} + static void idxd_group_config_write(struct idxd_group *group) { struct idxd_device *idxd = group->idxd; @@ -1451,15 +1528,24 @@ int idxd_device_drv_probe(struct idxd_dev *idxd_dev) if (rc < 0) return -ENXIO; + rc = idxd_device_evl_setup(idxd); + if (rc < 0) { + idxd->cmd_status = IDXD_SCMD_DEV_EVL_ERR; + return rc; + } + /* Start device */ rc = idxd_device_enable(idxd); - if (rc < 0) + if (rc < 0) { + idxd_device_evl_free(idxd); return rc; + } /* Setup DMA device without channels */ rc = idxd_register_dma_device(idxd); if (rc < 0) { idxd_device_disable(idxd); + 
idxd_device_evl_free(idxd); idxd->cmd_status = IDXD_SCMD_DEV_DMA_ERR; return rc; } @@ -1488,6 +1574,7 @@ void idxd_device_drv_remove(struct idxd_dev *idxd_dev) idxd_device_disable(idxd); if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) idxd_device_reset(idxd); + idxd_device_evl_free(idxd); } static enum idxd_dev_type dev_types[] = { diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 2a71273f1822..c74681f02b18 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -262,7 +262,15 @@ struct idxd_driver_data { }; struct idxd_evl { + /* Lock to protect event log access. */ + spinlock_t lock; + void *log; + dma_addr_t dma; + /* Total size of event log = number of entries * entry size. */ + unsigned int log_size; + /* The number of entries in the event log. */ u16 size; + u16 head; }; struct idxd_device { @@ -324,6 +332,17 @@ struct idxd_device { struct idxd_evl *evl; }; +static inline unsigned int evl_ent_size(struct idxd_device *idxd) +{ + return idxd->hw.gen_cap.evl_support ? + (32 * (1 << idxd->hw.gen_cap.evl_support)) : 0; +} + +static inline unsigned int evl_size(struct idxd_device *idxd) +{ + return idxd->evl->size * evl_ent_size(idxd); +} + /* IDXD software descriptor */ struct idxd_desc { union { diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 525de59df82a..f44719a11c95 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -343,6 +343,7 @@ static int idxd_init_evl(struct idxd_device *idxd) if (!evl) return -ENOMEM; + spin_lock_init(&evl->lock); evl->size = IDXD_EVL_SIZE_MIN; idxd->evl = evl; return 0; diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h index ea3a499a3c3c..11bb97cf7481 100644 --- a/drivers/dma/idxd/registers.h +++ b/drivers/dma/idxd/registers.h @@ -3,6 +3,8 @@ #ifndef _IDXD_REGISTERS_H_ #define _IDXD_REGISTERS_H_ +#include + /* PCI Config */ #define PCI_DEVICE_ID_INTEL_DSA_SPR0 0x0b25 #define PCI_DEVICE_ID_INTEL_IAX_SPR0 0x0cfe @@ -119,7 +121,8 @@ union gencfg_reg { u32 rdbuf_limit:8; u32 rsvd:4; u32 user_int_en:1; - u32 rsvd2:19; + u32 evl_en:1; + u32 rsvd2:18; }; u32 bits; } __packed; @@ -129,7 +132,8 @@ union genctrl_reg { struct { u32 softerr_int_en:1; u32 halt_int_en:1; - u32 rsvd:30; + u32 evl_int_en:1; + u32 rsvd:29; }; u32 bits; } __packed; @@ -299,6 +303,21 @@ union iaa_cap_reg { #define IDXD_IAACAP_OFFSET 0x180 +#define IDXD_EVLCFG_OFFSET 0xe0 +union evlcfg_reg { + struct { + u64 pasid_en:1; + u64 priv:1; + u64 rsvd:10; + u64 base_addr:52; + + u64 size:16; + u64 pasid:20; + u64 rsvd2:28; + }; + u64 bits[2]; +} __packed; + #define IDXD_EVL_SIZE_MIN 0x0040 #define IDXD_EVL_SIZE_MAX 0xffff @@ -539,4 +558,53 @@ union filter_cfg { u64 val; } __packed; +struct __evl_entry { + u64 rsvd:2; + u64 desc_valid:1; + u64 wq_idx_valid:1; + u64 batch:1; + u64 fault_rw:1; + u64 priv:1; + u64 err_info_valid:1; + u64 error:8; + u64 wq_idx:8; + u64 batch_id:8; + u64 operation:8; + u64 pasid:20; + u64 rsvd2:4; + + u16 batch_idx; + u16 rsvd3; + union { + /* Invalid Flags 0x11 */ + u32 invalid_flags; + /* Invalid Int Handle 0x19 */ + /* Page fault 0x1a */ + /* Page fault 0x06, 0x1f, only operand_id */ + /* Page fault before drain or in batch, 0x26, 0x27 */ + struct { + u16 int_handle; + u16 rci:1; + u16 ims:1; + u16 rcr:1; + u16 first_err_in_batch:1; + u16 rsvd4_2:9; + u16 operand_id:3; + }; + }; + u64 fault_addr; + u64 rsvd5; +} __packed; + +struct dsa_evl_entry { + struct __evl_entry e; + struct dsa_completion_record cr; +} __packed; + +struct iax_evl_entry { + struct __evl_entry e; + u64 
rsvd[4]; + struct iax_completion_record cr; +} __packed; + #endif diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 85644e5bde83..163fdfaa5022 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -1605,7 +1605,8 @@ static ssize_t event_log_size_store(struct device *dev, if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) return -EPERM; - if (val < IDXD_EVL_SIZE_MIN || val > IDXD_EVL_SIZE_MAX) + if (val < IDXD_EVL_SIZE_MIN || val > IDXD_EVL_SIZE_MAX || + (val * evl_ent_size(idxd) > ULONG_MAX - idxd->evl->dma)) return -EINVAL; idxd->evl->size = val; diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index fc47635b57dc..5d05bf12f2bd 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -30,6 +30,7 @@ enum idxd_scmd_stat { IDXD_SCMD_WQ_NO_PRIV = 0x800f0000, IDXD_SCMD_WQ_IRQ_ERR = 0x80100000, IDXD_SCMD_WQ_USER_NO_IOMMU = 0x80110000, + IDXD_SCMD_DEV_EVL_ERR = 0x80120000, }; #define IDXD_SCMD_SOFTERR_MASK 0x80000000 -- cgit v1.2.3 From 2f431ba908d2ef05da478d10925207728f1ff483 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 7 Apr 2023 13:31:31 -0700 Subject: dmaengine: idxd: add interrupt handling for event log An event log interrupt is raised in the misc interrupt INTCAUSE register when an event is written by the hardware. Add basic event log processing support to the interrupt handler. The event log is a ring where the hardware owns the tail and the software owns the head. The hardware will advance the tail index when an additional event has been pushed to memory. The software will process the log entry and then advance the head. The log is full when (tail + 1) % log_size == head. The hardware will stop writing when the log is full. The user is expected to create a log size large enough to handle all the expected events.
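For illustration, the head/tail handling described above reduces to the following standalone sketch (the names here are made up for the example; the driver's real state lives in struct idxd_evl and the EVLSTATUS register):

#include <stdbool.h>
#include <stdint.h>

struct evl_ring {
	uint16_t head;	/* owned by software */
	uint16_t tail;	/* owned by hardware */
	uint16_t size;	/* number of entries in the log */
};

/* Hardware stops writing when advancing the tail would reach the head. */
static bool evl_ring_full(const struct evl_ring *r)
{
	return (r->tail + 1) % r->size == r->head;
}

/* Software consumes entries from head up to, but not including, tail. */
static void evl_ring_drain(struct evl_ring *r)
{
	while (r->head != r->tail) {
		/* process the entry at index r->head here */
		r->head = (r->head + 1) % r->size;
	}
}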
Tested-by: Tony Zhu Signed-off-by: Dave Jiang Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Link: https://lore.kernel.org/r/20230407203143.2189681-5-fenghua.yu@intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/irq.c | 48 ++++++++++++++++++++++++++++++++++++++++++++ drivers/dma/idxd/registers.h | 19 ++++++++++++++++++ include/uapi/linux/idxd.h | 1 + 3 files changed, 68 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index 0d639303b515..52b8b7d9db22 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -217,6 +217,49 @@ static void idxd_int_handle_revoke(struct work_struct *work) kfree(revoke); } +static void process_evl_entry(struct idxd_device *idxd, struct __evl_entry *entry_head) +{ + struct device *dev = &idxd->pdev->dev; + u8 status; + + status = DSA_COMP_STATUS(entry_head->error); + dev_warn_ratelimited(dev, "Device error %#x operation: %#x fault addr: %#llx\n", + status, entry_head->operation, entry_head->fault_addr); +} + +static void process_evl_entries(struct idxd_device *idxd) +{ + union evl_status_reg evl_status; + unsigned int h, t; + struct idxd_evl *evl = idxd->evl; + struct __evl_entry *entry_head; + unsigned int ent_size = evl_ent_size(idxd); + u32 size; + + evl_status.bits = 0; + evl_status.int_pending = 1; + + spin_lock(&evl->lock); + /* Clear interrupt pending bit */ + iowrite32(evl_status.bits_upper32, + idxd->reg_base + IDXD_EVLSTATUS_OFFSET + sizeof(u32)); + h = evl->head; + evl_status.bits = ioread64(idxd->reg_base + IDXD_EVLSTATUS_OFFSET); + t = evl_status.tail; + size = idxd->evl->size; + + while (h != t) { + entry_head = (struct __evl_entry *)(evl->log + (h * ent_size)); + process_evl_entry(idxd, entry_head); + h = (h + 1) % size; + } + + evl->head = h; + evl_status.head = h; + iowrite32(evl_status.bits_lower32, idxd->reg_base + IDXD_EVLSTATUS_OFFSET); + spin_unlock(&evl->lock); +} + irqreturn_t idxd_misc_thread(int vec, void *data) { struct idxd_irq_entry *irq_entry = data; @@ -304,6 +347,11 @@ irqreturn_t idxd_misc_thread(int vec, void *data) perfmon_counter_overflow(idxd); } + if (cause & IDXD_INTC_EVL) { + val |= IDXD_INTC_EVL; + process_evl_entries(idxd); + } + val ^= cause; if (val) dev_warn_once(dev, "Unexpected interrupt cause bits set: %#x\n", diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h index 11bb97cf7481..148db94f9373 100644 --- a/drivers/dma/idxd/registers.h +++ b/drivers/dma/idxd/registers.h @@ -168,6 +168,7 @@ enum idxd_device_reset_type { #define IDXD_INTC_OCCUPY 0x04 #define IDXD_INTC_PERFMON_OVFL 0x08 #define IDXD_INTC_HALT_STATE 0x10 +#define IDXD_INTC_EVL 0x20 #define IDXD_INTC_INT_HANDLE_REVOKED 0x80000000 #define IDXD_CMD_OFFSET 0xa0 @@ -558,6 +559,24 @@ union filter_cfg { u64 val; } __packed; +#define IDXD_EVLSTATUS_OFFSET 0xf0 + +union evl_status_reg { + struct { + u32 head:16; + u32 rsvd:16; + u32 tail:16; + u32 rsvd2:14; + u32 int_pending:1; + u32 rsvd3:1; + }; + struct { + u32 bits_lower32; + u32 bits_upper32; + }; + u64 bits; +} __packed; + struct __evl_entry { u64 rsvd:2; u64 desc_valid:1; diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index 5d05bf12f2bd..0bc8eea18586 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -170,6 +170,7 @@ enum iax_completion_status { #define DSA_COMP_STATUS_MASK 0x7f #define DSA_COMP_STATUS_WRITE 0x80 +#define DSA_COMP_STATUS(status) ((status) & DSA_COMP_STATUS_MASK) struct dsa_hw_desc { uint32_t pasid:20; -- cgit v1.2.3 From 
5fbe6503b52f5665560584f62adab5db70ac910e Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 7 Apr 2023 13:31:32 -0700 Subject: dmaengine: idxd: add debugfs for event log dump Add debugfs entry to dump the content of the event log for debugging. The function will dump all non-zero entries in the event log. It will note which entries are processed and which entries are still pending processing at the time of the dump. The entries may not always be in chronological order because the log is a circular buffer. Tested-by: Tony Zhu Signed-off-by: Dave Jiang Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Link: https://lore.kernel.org/r/20230407203143.2189681-6-fenghua.yu@intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/Makefile | 2 +- drivers/dma/idxd/debugfs.c | 138 +++++++++++++++++++++++++++++++++++++++++++++ drivers/dma/idxd/idxd.h | 9 +++ drivers/dma/idxd/init.c | 12 ++++ include/uapi/linux/idxd.h | 6 +- 5 files changed, 164 insertions(+), 3 deletions(-) create mode 100644 drivers/dma/idxd/debugfs.c (limited to 'include/uapi/linux') diff --git a/drivers/dma/idxd/Makefile b/drivers/dma/idxd/Makefile index a1e9f2b3a37c..dc096839ac63 100644 --- a/drivers/dma/idxd/Makefile +++ b/drivers/dma/idxd/Makefile @@ -1,7 +1,7 @@ ccflags-y += -DDEFAULT_SYMBOL_NAMESPACE=IDXD obj-$(CONFIG_INTEL_IDXD) += idxd.o -idxd-y := init.o irq.o device.o sysfs.o submit.o dma.o cdev.o +idxd-y := init.o irq.o device.o sysfs.o submit.o dma.o cdev.o debugfs.o idxd-$(CONFIG_INTEL_IDXD_PERFMON) += perfmon.o diff --git a/drivers/dma/idxd/debugfs.c b/drivers/dma/idxd/debugfs.c new file mode 100644 index 000000000000..9cfbd9b14c4c --- /dev/null +++ b/drivers/dma/idxd/debugfs.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2021 Intel Corporation. All rights rsvd.
*/ +#include +#include +#include +#include +#include +#include +#include +#include "idxd.h" +#include "registers.h" + +static struct dentry *idxd_debugfs_dir; + +static void dump_event_entry(struct idxd_device *idxd, struct seq_file *s, + u16 index, int *count, bool processed) +{ + struct idxd_evl *evl = idxd->evl; + struct dsa_evl_entry *entry; + struct dsa_completion_record *cr; + u64 *raw; + int i; + int evl_strides = evl_ent_size(idxd) / sizeof(u64); + + entry = (struct dsa_evl_entry *)evl->log + index; + + if (!entry->e.desc_valid) + return; + + seq_printf(s, "Event Log entry %d (real index %u) processed: %u\n", + *count, index, processed); + + seq_printf(s, "desc valid %u wq idx valid %u\n" + "batch %u fault rw %u priv %u error 0x%x\n" + "wq idx %u op %#x pasid %u batch idx %u\n" + "fault addr %#llx\n", + entry->e.desc_valid, entry->e.wq_idx_valid, + entry->e.batch, entry->e.fault_rw, entry->e.priv, + entry->e.error, entry->e.wq_idx, entry->e.operation, + entry->e.pasid, entry->e.batch_idx, entry->e.fault_addr); + + cr = &entry->cr; + seq_printf(s, "status %#x result %#x fault_info %#x bytes_completed %u\n" + "fault addr %#llx inv flags %#x\n\n", + cr->status, cr->result, cr->fault_info, cr->bytes_completed, + cr->fault_addr, cr->invalid_flags); + + raw = (u64 *)entry; + + for (i = 0; i < evl_strides; i++) + seq_printf(s, "entry[%d] = %#llx\n", i, raw[i]); + + seq_puts(s, "\n"); + *count += 1; +} + +static int debugfs_evl_show(struct seq_file *s, void *d) +{ + struct idxd_device *idxd = s->private; + struct idxd_evl *evl = idxd->evl; + union evl_status_reg evl_status; + u16 h, t, evl_size, i; + int count = 0; + bool processed = true; + + if (!evl || !evl->log) + return 0; + + spin_lock(&evl->lock); + + h = evl->head; + evl_status.bits = ioread64(idxd->reg_base + IDXD_EVLSTATUS_OFFSET); + t = evl_status.tail; + evl_size = evl->size; + + seq_printf(s, "Event Log head %u tail %u interrupt pending %u\n\n", + evl_status.head, evl_status.tail, evl_status.int_pending); + + i = t; + while (1) { + i = (i + 1) % evl_size; + if (i == t) + break; + + if (processed && i == h) + processed = false; + dump_event_entry(idxd, s, i, &count, processed); + } + + spin_unlock(&evl->lock); + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(debugfs_evl); + +int idxd_device_init_debugfs(struct idxd_device *idxd) +{ + if (IS_ERR_OR_NULL(idxd_debugfs_dir)) + return 0; + + idxd->dbgfs_dir = debugfs_create_dir(dev_name(idxd_confdev(idxd)), idxd_debugfs_dir); + if (IS_ERR(idxd->dbgfs_dir)) + return PTR_ERR(idxd->dbgfs_dir); + + if (idxd->evl) { + idxd->dbgfs_evl_file = debugfs_create_file("event_log", 0400, + idxd->dbgfs_dir, idxd, + &debugfs_evl_fops); + if (IS_ERR(idxd->dbgfs_evl_file)) { + debugfs_remove_recursive(idxd->dbgfs_dir); + idxd->dbgfs_dir = NULL; + return PTR_ERR(idxd->dbgfs_evl_file); + } + } + + return 0; +} + +void idxd_device_remove_debugfs(struct idxd_device *idxd) +{ + debugfs_remove_recursive(idxd->dbgfs_dir); +} + +int idxd_init_debugfs(void) +{ + if (!debugfs_initialized()) + return 0; + + idxd_debugfs_dir = debugfs_create_dir(KBUILD_MODNAME, NULL); + if (IS_ERR(idxd_debugfs_dir)) + return PTR_ERR(idxd_debugfs_dir); + return 0; +} + +void idxd_remove_debugfs(void) +{ + debugfs_remove_recursive(idxd_debugfs_dir); +} diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index c74681f02b18..b923b90b7299 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -330,6 +330,9 @@ struct idxd_device { unsigned long *opcap_bmap; struct idxd_evl *evl; + + struct dentry *dbgfs_dir; + struct 
dentry *dbgfs_evl_file; }; static inline unsigned int evl_ent_size(struct idxd_device *idxd) @@ -704,4 +707,10 @@ static inline void perfmon_init(void) {} static inline void perfmon_exit(void) {} #endif +/* debugfs */ +int idxd_device_init_debugfs(struct idxd_device *idxd); +void idxd_device_remove_debugfs(struct idxd_device *idxd); +int idxd_init_debugfs(void); +void idxd_remove_debugfs(void); + #endif diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index f44719a11c95..e21c4b7a1d32 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -669,6 +669,10 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto err_dev_register; } + rc = idxd_device_init_debugfs(idxd); + if (rc) + dev_warn(dev, "IDXD debugfs failed to setup\n"); + dev_info(&pdev->dev, "Intel(R) Accelerator Device (v%x)\n", idxd->hw.version); @@ -731,6 +735,7 @@ static void idxd_remove(struct pci_dev *pdev) idxd_shutdown(pdev); if (device_pasid_enabled(idxd)) idxd_disable_system_pasid(idxd); + idxd_device_remove_debugfs(idxd); irq_entry = idxd_get_ie(idxd, 0); free_irq(irq_entry->vector, irq_entry); @@ -788,6 +793,10 @@ static int __init idxd_init_module(void) if (err) goto err_cdev_register; + err = idxd_init_debugfs(); + if (err) + goto err_debugfs; + err = pci_register_driver(&idxd_pci_driver); if (err) goto err_pci_register; @@ -795,6 +804,8 @@ static int __init idxd_init_module(void) return 0; err_pci_register: + idxd_remove_debugfs(); +err_debugfs: idxd_cdev_remove(); err_cdev_register: idxd_driver_unregister(&idxd_user_drv); @@ -815,5 +826,6 @@ static void __exit idxd_exit_module(void) pci_unregister_driver(&idxd_pci_driver); idxd_cdev_remove(); perfmon_exit(); + idxd_remove_debugfs(); } module_exit(idxd_exit_module); diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index 0bc8eea18586..e86199d09a91 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -311,7 +311,8 @@ struct dsa_completion_record { uint8_t result; uint8_t dif_status; }; - uint16_t rsvd; + uint8_t fault_info; + uint8_t rsvd; uint32_t bytes_completed; uint64_t fault_addr; union { @@ -368,7 +369,8 @@ struct dsa_raw_completion_record { struct iax_completion_record { volatile uint8_t status; uint8_t error_code; - uint16_t rsvd; + uint8_t fault_info; + uint8_t rsvd; uint32_t bytes_completed; uint64_t fault_addr; uint32_t invalid_flags; -- cgit v1.2.3 From c40bd7d9737bdcfb02d42765bc6c59b338151123 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 7 Apr 2023 13:31:36 -0700 Subject: dmaengine: idxd: process user page faults for completion record DSA supports page fault handling through PRS. However, the DMA engine that's processing the descriptor is blocked until the PRS response is received. Other workqueues sharing the engine are also blocked. Page fault handling by the driver with PRS disabled can be used to mitigate the stalling. With PRS disabled while ATS remains enabled, DSA handles page faults on a completion record by reporting an event in the event log. In this instance, the descriptor is completed and the event log contains the completion record address and the contents of the completion record. Add support to the event log handling code to fault in the completion record and copy the content of the completion record to user memory. A bitmap is introduced to keep track of discarded event log entries.
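In rough code terms, the mark-and-skip scheme described next is the following sketch (illustrative helpers only; the driver does the equivalent on evl->bmap while holding evl->lock):

#include <linux/bitops.h>

/* On ->release(), mark a log index whose owner has gone away. */
static void evl_mark_discarded(unsigned long *bmap, unsigned int idx)
{
	set_bit(idx, bmap);
}

/* At processing time: true means clear the mark and skip the entry. */
static bool evl_should_skip(unsigned long *bmap, unsigned int idx)
{
	if (!test_bit(idx, bmap))
		return false;
	clear_bit(idx, bmap);	/* consumed without being processed */
	return true;
}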
When the user process initiates ->release() of the char device, it no longer is interested in any remaining event log entries tied to the relevant wq and PASID. The driver will mark the event log entry index in the bitmap. Upon encountering the entries during processing, the event log handler will just clear the bitmap bit and skip the entry rather than attempt to process the event log entry. Tested-by: Tony Zhu Signed-off-by: Dave Jiang Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Link: https://lore.kernel.org/r/20230407203143.2189681-10-fenghua.yu@intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/cdev.c | 30 ++++++++++++++++ drivers/dma/idxd/device.c | 22 ++++++++++-- drivers/dma/idxd/idxd.h | 2 ++ drivers/dma/idxd/init.c | 2 ++ drivers/dma/idxd/irq.c | 87 ++++++++++++++++++++++++++++++++++++++++++++--- include/uapi/linux/idxd.h | 1 + 6 files changed, 137 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index 8b8a0a0fb054..0a51c33198f6 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -164,6 +164,35 @@ failed: return rc; } +static void idxd_cdev_evl_drain_pasid(struct idxd_wq *wq, u32 pasid) +{ + struct idxd_device *idxd = wq->idxd; + struct idxd_evl *evl = idxd->evl; + union evl_status_reg status; + u16 h, t, size; + int ent_size = evl_ent_size(idxd); + struct __evl_entry *entry_head; + + if (!evl) + return; + + spin_lock(&evl->lock); + status.bits = ioread64(idxd->reg_base + IDXD_EVLSTATUS_OFFSET); + t = status.tail; + h = evl->head; + size = evl->size; + + while (h != t) { + entry_head = (struct __evl_entry *)(evl->log + (h * ent_size)); + if (entry_head->pasid == pasid && entry_head->wq_idx == wq->id) + set_bit(h, evl->bmap); + h = (h + 1) % size; + } + spin_unlock(&evl->lock); + + drain_workqueue(wq->wq); +} + static int idxd_cdev_release(struct inode *node, struct file *filep) { struct idxd_user_context *ctx = filep->private_data; @@ -190,6 +219,7 @@ static int idxd_cdev_release(struct inode *node, struct file *filep) } if (ctx->sva) { + idxd_cdev_evl_drain_pasid(wq, ctx->pasid); iommu_sva_unbind_device(ctx->sva); idxd_xa_pasid_remove(ctx); } diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 230fe9bb56ae..fd97b2b58734 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -762,18 +762,29 @@ static int idxd_device_evl_setup(struct idxd_device *idxd) dma_addr_t dma_addr; int size; struct idxd_evl *evl = idxd->evl; + unsigned long *bmap; + int rc; if (!evl) return 0; size = evl_size(idxd); + + bmap = bitmap_zalloc(size, GFP_KERNEL); + if (!bmap) { + rc = -ENOMEM; + goto err_bmap; + } + /* * Address needs to be page aligned. However, dma_alloc_coherent() provides * at minimal page size aligned address. No manual alignment required. 
*/ addr = dma_alloc_coherent(dev, size, &dma_addr, GFP_KERNEL); - if (!addr) - return -ENOMEM; + if (!addr) { + rc = -ENOMEM; + goto err_alloc; + } memset(addr, 0, size); @@ -781,6 +792,7 @@ static int idxd_device_evl_setup(struct idxd_device *idxd) evl->log = addr; evl->dma = dma_addr; evl->log_size = size; + evl->bmap = bmap; memset(&evlcfg, 0, sizeof(evlcfg)); evlcfg.bits[0] = dma_addr & GENMASK(63, 12); @@ -799,6 +811,11 @@ static int idxd_device_evl_setup(struct idxd_device *idxd) spin_unlock(&evl->lock); return 0; + +err_alloc: + bitmap_free(bmap); +err_bmap: + return rc; } static void idxd_device_evl_free(struct idxd_device *idxd) @@ -824,6 +841,7 @@ static void idxd_device_evl_free(struct idxd_device *idxd) iowrite64(0, idxd->reg_base + IDXD_EVLCFG_OFFSET + 8); dma_free_coherent(dev, evl->log_size, evl->log, evl->dma); + bitmap_free(evl->bmap); evl->log = NULL; evl->size = IDXD_EVL_SIZE_MIN; spin_unlock(&evl->lock); diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index b3f9a12adce2..3963c83165a6 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -264,6 +264,7 @@ struct idxd_driver_data { struct device_type *dev_type; int compl_size; int align; + int evl_cr_off; }; struct idxd_evl { @@ -276,6 +277,7 @@ struct idxd_evl { /* The number of entries in the event log. */ u16 size; u16 head; + unsigned long *bmap; }; struct idxd_evl_fault { diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index e6faff58733d..18344593b83f 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -46,6 +46,7 @@ static struct idxd_driver_data idxd_driver_data[] = { .compl_size = sizeof(struct dsa_completion_record), .align = 32, .dev_type = &dsa_device_type, + .evl_cr_off = offsetof(struct dsa_evl_entry, cr), }, [IDXD_TYPE_IAX] = { .name_prefix = "iax", @@ -53,6 +54,7 @@ static struct idxd_driver_data idxd_driver_data[] = { .compl_size = sizeof(struct iax_completion_record), .align = 64, .dev_type = &iax_device_type, + .evl_cr_off = offsetof(struct iax_evl_entry, cr), }, }; diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index 52b8b7d9db22..96983975f974 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -7,6 +7,8 @@ #include #include #include +#include +#include #include #include "../dmaengine.h" #include "idxd.h" @@ -217,14 +219,89 @@ static void idxd_int_handle_revoke(struct work_struct *work) kfree(revoke); } -static void process_evl_entry(struct idxd_device *idxd, struct __evl_entry *entry_head) +static void idxd_evl_fault_work(struct work_struct *work) +{ + struct idxd_evl_fault *fault = container_of(work, struct idxd_evl_fault, work); + struct idxd_wq *wq = fault->wq; + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + struct __evl_entry *entry_head = fault->entry; + void *cr = (void *)entry_head + idxd->data->evl_cr_off; + int cr_size = idxd->data->compl_size, copied; + + switch (fault->status) { + case DSA_COMP_CRA_XLAT: + case DSA_COMP_DRAIN_EVL: + /* + * Copy completion record to fault_addr in user address space + * that is found by wq and PASID. + */ + copied = idxd_copy_cr(wq, entry_head->pasid, + entry_head->fault_addr, + cr, cr_size); + /* + * The task that triggered the page fault is unknown currently + * because multiple threads may share the user address + * space or the task exits already before this fault. + * So if the copy fails, SIGSEGV can not be sent to the task. + * Just print an error for the failure. 
The user application + * waiting for the completion record will time out on this + * failure. + */ + if (copied != cr_size) { + dev_dbg_ratelimited(dev, "Failed to write to completion record. (%d:%d)\n", + cr_size, copied); + } + break; + default: + dev_dbg_ratelimited(dev, "Unrecognized error code: %#x\n", + DSA_COMP_STATUS(entry_head->error)); + break; + } + + kmem_cache_free(idxd->evl_cache, fault); +} + +static void process_evl_entry(struct idxd_device *idxd, + struct __evl_entry *entry_head, unsigned int index) { struct device *dev = &idxd->pdev->dev; + struct idxd_evl *evl = idxd->evl; u8 status; - status = DSA_COMP_STATUS(entry_head->error); - dev_warn_ratelimited(dev, "Device error %#x operation: %#x fault addr: %#llx\n", - status, entry_head->operation, entry_head->fault_addr); + if (test_bit(index, evl->bmap)) { + clear_bit(index, evl->bmap); + } else { + status = DSA_COMP_STATUS(entry_head->error); + + if (status == DSA_COMP_CRA_XLAT || status == DSA_COMP_DRAIN_EVL) { + struct idxd_evl_fault *fault; + int ent_size = evl_ent_size(idxd); + + if (entry_head->rci) + dev_dbg(dev, "Completion Int Req set, ignoring!\n"); + + if (!entry_head->rcr && status == DSA_COMP_DRAIN_EVL) + return; + + fault = kmem_cache_alloc(idxd->evl_cache, GFP_ATOMIC); + if (fault) { + struct idxd_wq *wq = idxd->wqs[entry_head->wq_idx]; + + fault->wq = wq; + fault->status = status; + memcpy(&fault->entry, entry_head, ent_size); + INIT_WORK(&fault->work, idxd_evl_fault_work); + queue_work(wq->wq, &fault->work); + } else { + dev_warn(dev, "Failed to service fault work.\n"); + } + } else { + dev_warn_ratelimited(dev, "Device error %#x operation: %#x fault addr: %#llx\n", + status, entry_head->operation, + entry_head->fault_addr); + } + } } static void process_evl_entries(struct idxd_device *idxd) @@ -250,7 +327,7 @@ static void process_evl_entries(struct idxd_device *idxd) while (h != t) { entry_head = (struct __evl_entry *)(evl->log + (h * ent_size)); - process_evl_entry(idxd, entry_head); + process_evl_entry(idxd, entry_head, h); h = (h + 1) % size; } diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index e86199d09a91..4b584d5afd87 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -135,6 +135,7 @@ enum dsa_completion_status { DSA_COMP_HW_ERR1, DSA_COMP_HW_ERR_DRB, DSA_COMP_TRANSLATION_FAIL, + DSA_COMP_DRAIN_EVL = 0x26, }; enum iax_completion_status { -- cgit v1.2.3 From 6926987185a3ae92c31b99ce1bfdfb04e95057c0 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 7 Apr 2023 13:31:37 -0700 Subject: dmaengine: idxd: add descs_completed field for completion record The descs_completed field for a completion record is part of a batch descriptor completion record. It takes the same location as bytes_completed in a normal descriptor field. Add to expose to user. 
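From user space, which union member is meaningful depends on the type of descriptor that produced the completion record; a minimal sketch (the helper is hypothetical, only struct dsa_completion_record and its fields come from the uapi header):

#include <stdint.h>
#include <linux/idxd.h>	/* uapi: struct dsa_completion_record */

/* For a batch descriptor the field counts completed descriptors;
 * for any other descriptor it counts completed bytes.
 */
static uint32_t dsa_completed_units(const struct dsa_completion_record *cr,
				    int is_batch)
{
	return is_batch ? cr->descs_completed : cr->bytes_completed;
}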
Tested-by: Tony Zhu Signed-off-by: Dave Jiang Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Link: https://lore.kernel.org/r/20230407203143.2189681-11-fenghua.yu@intel.com Signed-off-by: Vinod Koul --- include/uapi/linux/idxd.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index 4b584d5afd87..76ad71bf751e 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -314,7 +314,10 @@ struct dsa_completion_record { }; uint8_t fault_info; uint8_t rsvd; - uint32_t bytes_completed; + union { + uint32_t bytes_completed; + uint32_t descs_completed; + }; uint64_t fault_addr; union { /* common record */ -- cgit v1.2.3 From 2442b7473ad03671378d2d95651bd6bbe09a0943 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 7 Apr 2023 13:31:38 -0700 Subject: dmaengine: idxd: process batch descriptor completion record faults Add event log processing for faults on user batch descriptor completion records. When encountering an event log entry for a page fault on a completion record, the driver is expected to do the following: 1. If the "first error in batch" bit in event log entry error info is set, discard any previously recorded errors associated with the "batch identifier". 2. Fix the page fault according to the fault address in the event log. If successful, write the completion record to the fault address in user space. 3. If an error is encountered while writing the completion record and it is associated to a descriptor in the batch, the driver associates the error with the batch identifier of the event log entry and tracks it until the event log entry for the corresponding batch descriptor is encountered. While processing an event log entry for a batch descriptor whose error indicates that one or more descriptors in the batch had event log entries, the driver will do the following before writing the batch completion record: 1. If the status field of the completion record is 0x1, the driver will change it to error code 0x5 (one or more operations in batch completed with status not successful) and change the result field to 1. 2. If the status is error code 0x6 (page fault on batch descriptor list address), change the result field to 1. 3. If the status is any other value, the completion record is not changed. 4. Clear the recorded error in preparation for the next batch with the same batch identifier. The result field is for user software to determine whether to set the "Batch Error" flag bit in the descriptor for continuation of partial batch descriptor completion. See DSA spec 2.0 for additional information. If no error has been recorded for the batch, the batch completion record is written to user space as is.
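Taken together, the rules above reduce to roughly this sketch (illustrative names with the status values 0x1/0x5/0x6 open-coded; the driver keys the recorded error off the batch identifier in evl->batch_fail[]):

#include <stdbool.h>
#include <stdint.h>

/* Fix up a batch completion record before writing it to user space. */
static void fixup_batch_cr(uint8_t *status, uint8_t *result, bool *recorded_err)
{
	if (!*recorded_err)
		return;			/* no error recorded: write as is */

	if (*status == 0x1)		/* success, but a desc in the batch failed */
		*status = 0x5;
	if (*status == 0x5 || *status == 0x6)
		*result = 1;		/* user sw may set the "Batch Error" flag */

	*recorded_err = false;		/* rule 4: reset for the next batch */
}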
Tested-by: Tony Zhu Signed-off-by: Dave Jiang Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Link: https://lore.kernel.org/r/20230407203143.2189681-12-fenghua.yu@intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/idxd.h | 3 ++ drivers/dma/idxd/init.c | 4 ++ drivers/dma/idxd/irq.c | 91 ++++++++++++++++++++++++++++++++------------ drivers/dma/idxd/registers.h | 4 +- include/uapi/linux/idxd.h | 1 + 5 files changed, 78 insertions(+), 25 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 3963c83165a6..4c4baa80c731 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -265,6 +265,8 @@ struct idxd_driver_data { int compl_size; int align; int evl_cr_off; + int cr_status_off; + int cr_result_off; }; struct idxd_evl { @@ -278,6 +280,7 @@ struct idxd_evl { u16 size; u16 head; unsigned long *bmap; + bool batch_fail[IDXD_MAX_BATCH_IDENT]; }; struct idxd_evl_fault { diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 18344593b83f..92f60d364392 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -47,6 +47,8 @@ static struct idxd_driver_data idxd_driver_data[] = { .align = 32, .dev_type = &dsa_device_type, .evl_cr_off = offsetof(struct dsa_evl_entry, cr), + .cr_status_off = offsetof(struct dsa_completion_record, status), + .cr_result_off = offsetof(struct dsa_completion_record, result), }, [IDXD_TYPE_IAX] = { .name_prefix = "iax", @@ -55,6 +57,8 @@ static struct idxd_driver_data idxd_driver_data[] = { .align = 64, .dev_type = &iax_device_type, .evl_cr_off = offsetof(struct iax_evl_entry, cr), + .cr_status_off = offsetof(struct iax_completion_record, status), + .cr_result_off = offsetof(struct iax_completion_record, error_code), }, }; diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index 96983975f974..c660d63a3eb8 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -225,37 +225,79 @@ static void idxd_evl_fault_work(struct work_struct *work) struct idxd_wq *wq = fault->wq; struct idxd_device *idxd = wq->idxd; struct device *dev = &idxd->pdev->dev; + struct idxd_evl *evl = idxd->evl; struct __evl_entry *entry_head = fault->entry; void *cr = (void *)entry_head + idxd->data->evl_cr_off; - int cr_size = idxd->data->compl_size, copied; + int cr_size = idxd->data->compl_size; + u8 *status = (u8 *)cr + idxd->data->cr_status_off; + u8 *result = (u8 *)cr + idxd->data->cr_result_off; + int copied, copy_size; + bool *bf; switch (fault->status) { case DSA_COMP_CRA_XLAT: - case DSA_COMP_DRAIN_EVL: - /* - * Copy completion record to fault_addr in user address space - * that is found by wq and PASID. - */ - copied = idxd_copy_cr(wq, entry_head->pasid, - entry_head->fault_addr, - cr, cr_size); - /* - * The task that triggered the page fault is unknown currently - * because multiple threads may share the user address - * space or the task exits already before this fault. - * So if the copy fails, SIGSEGV can not be sent to the task. - * Just print an error for the failure. The user application - * waiting for the completion record will time out on this - * failure. - */ - if (copied != cr_size) { - dev_dbg_ratelimited(dev, "Failed to write to completion record. (%d:%d)\n", - cr_size, copied); + if (entry_head->batch && entry_head->first_err_in_batch) + evl->batch_fail[entry_head->batch_id] = false; + + copy_size = cr_size; + break; + case DSA_COMP_BATCH_EVL_ERR: + bf = &evl->batch_fail[entry_head->batch_id]; + + copy_size = entry_head->rcr || *bf ? 
cr_size : 0; + if (*bf) { + if (*status == DSA_COMP_SUCCESS) + *status = DSA_COMP_BATCH_FAIL; + *result = 1; + *bf = false; } break; + case DSA_COMP_DRAIN_EVL: + copy_size = cr_size; + break; default: - dev_dbg_ratelimited(dev, "Unrecognized error code: %#x\n", - DSA_COMP_STATUS(entry_head->error)); + copy_size = 0; + dev_dbg_ratelimited(dev, "Unrecognized error code: %#x\n", fault->status); + break; + } + + if (copy_size == 0) + return; + + /* + * Copy completion record to fault_addr in user address space + * that is found by wq and PASID. + */ + copied = idxd_copy_cr(wq, entry_head->pasid, entry_head->fault_addr, + cr, copy_size); + /* + * The task that triggered the page fault is unknown currently + * because multiple threads may share the user address + * space or the task exits already before this fault. + * So if the copy fails, SIGSEGV can not be sent to the task. + * Just print an error for the failure. The user application + * waiting for the completion record will time out on this + * failure. + */ + switch (fault->status) { + case DSA_COMP_CRA_XLAT: + if (copied != copy_size) { + dev_dbg_ratelimited(dev, "Failed to write to completion record: (%d:%d)\n", + copy_size, copied); + if (entry_head->batch) + evl->batch_fail[entry_head->batch_id] = true; + } + break; + case DSA_COMP_BATCH_EVL_ERR: + if (copied != copy_size) { + dev_dbg_ratelimited(dev, "Failed to write to batch completion record: (%d:%d)\n", + copy_size, copied); + } + break; + case DSA_COMP_DRAIN_EVL: + if (copied != copy_size) + dev_dbg_ratelimited(dev, "Failed to write to drain completion record: (%d:%d)\n", + copy_size, copied); break; } @@ -274,7 +316,8 @@ static void process_evl_entry(struct idxd_device *idxd, } else { status = DSA_COMP_STATUS(entry_head->error); - if (status == DSA_COMP_CRA_XLAT || status == DSA_COMP_DRAIN_EVL) { + if (status == DSA_COMP_CRA_XLAT || status == DSA_COMP_DRAIN_EVL || + status == DSA_COMP_BATCH_EVL_ERR) { struct idxd_evl_fault *fault; int ent_size = evl_ent_size(idxd); diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h index 148db94f9373..9f3959d001b6 100644 --- a/drivers/dma/idxd/registers.h +++ b/drivers/dma/idxd/registers.h @@ -35,7 +35,7 @@ union gen_cap_reg { u64 drain_readback:1; u64 rsvd2:3; u64 evl_support:2; - u64 rsvd4:1; + u64 batch_continuation:1; u64 max_xfer_shift:5; u64 max_batch_shift:4; u64 max_ims_mult:6; @@ -577,6 +577,8 @@ union evl_status_reg { u64 bits; } __packed; +#define IDXD_MAX_BATCH_IDENT 256 + struct __evl_entry { u64 rsvd:2; u64 desc_valid:1; diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index 76ad71bf751e..606b52e88ce3 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -136,6 +136,7 @@ enum dsa_completion_status { DSA_COMP_HW_ERR_DRB, DSA_COMP_TRANSLATION_FAIL, DSA_COMP_DRAIN_EVL = 0x26, + DSA_COMP_BATCH_EVL_ERR, }; enum iax_completion_status { -- cgit v1.2.3 From ddc65971bb677aa9f6a4c21f76d3133e106f88eb Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Tue, 4 Apr 2023 21:31:48 +0900 Subject: prctl: add PR_GET_AUXV to copy auxv to userspace If a library wants to get information from auxv (for instance, AT_HWCAP/AT_HWCAP2), it has a few options, none of them perfectly reliable or ideal: - Be main or the pre-main startup code, and grub through the stack above main. Doesn't work for a library. - Call libc getauxval. Not ideal for libraries that are trying to be libc-independent and/or don't otherwise require anything from other libraries. - Open and read /proc/self/auxv. 
Doesn't work for libraries that may run in arbitrarily constrained environments that may not have /proc mounted (e.g. libraries that might be used by an init program or a container setup tool). - Assume you're on the main thread and still on the original stack, and try to walk the stack upwards, hoping to find auxv. Extremely bad idea. - Ask the caller to pass auxv in for you. Not ideal for a user-friendly library, and then your caller may have the same problem. Add a prctl that copies current->mm->saved_auxv to a userspace buffer. Link: https://lkml.kernel.org/r/d81864a7f7f43bca6afa2a09fc2e850e4050ab42.1680611394.git.josh@joshtriplett.org Signed-off-by: Josh Triplett Signed-off-by: Andrew Morton --- include/uapi/linux/prctl.h | 2 ++ kernel/sys.c | 15 +++++++++++++++ 2 files changed, 17 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 1312a137f7fb..b99c0be72577 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -290,4 +290,6 @@ struct prctl_mm_map { #define PR_SET_VMA 0x53564d41 # define PR_SET_VMA_ANON_NAME 0 +#define PR_GET_AUXV 0x41555856 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/sys.c b/kernel/sys.c index 351de7916302..26c1399e0654 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2388,6 +2388,16 @@ static inline int prctl_get_mdwe(unsigned long arg2, unsigned long arg3, PR_MDWE_REFUSE_EXEC_GAIN : 0; } +static int prctl_get_auxv(void __user *addr, unsigned long len) +{ + struct mm_struct *mm = current->mm; + unsigned long size = min_t(unsigned long, sizeof(mm->saved_auxv), len); + + if (size && copy_to_user(addr, mm->saved_auxv, size)) + return -EFAULT; + return sizeof(mm->saved_auxv); +} + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -2518,6 +2528,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, else return -EINVAL; break; + case PR_GET_AUXV: + if (arg4 || arg5) + return -EINVAL; + error = prctl_get_auxv((void __user *)arg2, arg3); + break; default: return -EINVAL; } -- cgit v1.2.3 From a3b2aeac9d154e5e15ddbf19de934c0c606b6acd Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Sat, 8 Apr 2023 17:28:35 +0800 Subject: delayacct: track delays from IRQ/SOFTIRQ Delay accounting does not track the delay of IRQ/SOFTIRQ, yet IRQ/SOFTIRQ can have an obvious impact on the productivity of some workloads, such as when workloads are running on a system that is busy handling network IRQ/SOFTIRQ. Knowing the IRQ/SOFTIRQ delay can help users reduce it, for example by setting interrupt affinity or task affinity, or by using a kernel thread for NAPI. This is inspired by "sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ pressure"[1]. Also fix some code indentation problems in older code.
And update tools/accounting/getdelays.c:

/ # ./getdelays -p 156 -di
print delayacct stats ON
printing IO accounting
PID 156

CPU         count     real total  virtual total    delay total  delay average
               15       15836008       16218149      275700790       18.380ms
IO          count    delay total  delay average
                0              0        0.000ms
SWAP        count    delay total  delay average
                0              0        0.000ms
RECLAIM     count    delay total  delay average
                0              0        0.000ms
THRASHING   count    delay total  delay average
                0              0        0.000ms
COMPACT     count    delay total  delay average
                0              0        0.000ms
WPCOPY      count    delay total  delay average
               36        7586118        0.211ms
IRQ         count    delay total  delay average
               42         929161        0.022ms

[1] commit 52b1364ba0b1("sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ pressure") Link: https://lkml.kernel.org/r/202304081728353557233@zte.com.cn Signed-off-by: Yang Yang Cc: Jiang Xuexin Cc: wangyong Cc: junhua huang Cc: Balbir Singh Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Juri Lelli Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- Documentation/accounting/delay-accounting.rst | 7 +++++-- include/linux/delayacct.h | 15 ++++++++++++ include/uapi/linux/taskstats.h | 6 +++++- kernel/delayacct.c | 14 +++++++++++ kernel/sched/core.c | 1 + tools/accounting/getdelays.c | 30 ++++++++++++++++----------- 6 files changed, 58 insertions(+), 15 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/accounting/delay-accounting.rst b/Documentation/accounting/delay-accounting.rst index 79f537c9f160..f61c01fc376e 100644 --- a/Documentation/accounting/delay-accounting.rst +++ b/Documentation/accounting/delay-accounting.rst @@ -16,6 +16,7 @@ d) memory reclaim e) thrashing f) direct compact g) write-protect copy +h) IRQ/SOFTIRQ and makes these statistics available to userspace through the taskstats interface. @@ -49,7 +50,7 @@ this structure. See for a description of the fields pertaining to delay accounting. It will generally be in the form of counters returning the cumulative delay seen for cpu, sync block I/O, swapin, memory reclaim, thrash page -cache, direct compact, write-protect copy etc. +cache, direct compact, write-protect copy, IRQ/SOFTIRQ etc.
Taking the difference of two successive readings of a given counter (say cpu_delay_total) for a task will give the delay @@ -118,7 +119,9 @@ Get sum of delays, since system boot, for all pids with tgid 5:: 0 0 0.000ms COMPACT count delay total delay average 0 0 0.000ms - WPCOPY count delay total delay average + WPCOPY count delay total delay average + 0 0 0.000ms + IRQ count delay total delay average 0 0 0.000ms Get IO accounting for pid 1, it works only with -p:: diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 0da97dba9ef8..6639f48dac36 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -48,10 +48,13 @@ struct task_delay_info { u64 wpcopy_start; u64 wpcopy_delay; /* wait for write-protect copy */ + u64 irq_delay; /* wait for IRQ/SOFTIRQ */ + u32 freepages_count; /* total count of memory reclaim */ u32 thrashing_count; /* total count of thrash waits */ u32 compact_count; /* total count of memory compact */ u32 wpcopy_count; /* total count of write-protect copy */ + u32 irq_count; /* total count of IRQ/SOFTIRQ */ }; #endif @@ -81,6 +84,7 @@ extern void __delayacct_compact_start(void); extern void __delayacct_compact_end(void); extern void __delayacct_wpcopy_start(void); extern void __delayacct_wpcopy_end(void); +extern void __delayacct_irq(struct task_struct *task, u32 delta); static inline void delayacct_tsk_init(struct task_struct *tsk) { @@ -215,6 +219,15 @@ static inline void delayacct_wpcopy_end(void) __delayacct_wpcopy_end(); } +static inline void delayacct_irq(struct task_struct *task, u32 delta) +{ + if (!static_branch_unlikely(&delayacct_key)) + return; + + if (task->delays) + __delayacct_irq(task, delta); +} + #else static inline void delayacct_init(void) {} @@ -253,6 +266,8 @@ static inline void delayacct_wpcopy_start(void) {} static inline void delayacct_wpcopy_end(void) {} +static inline void delayacct_irq(struct task_struct *task, u32 delta) +{} #endif /* CONFIG_TASK_DELAY_ACCT */ diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h index a7f5b11a8f1b..b50b2eb257a0 100644 --- a/include/uapi/linux/taskstats.h +++ b/include/uapi/linux/taskstats.h @@ -34,7 +34,7 @@ */ -#define TASKSTATS_VERSION 13 +#define TASKSTATS_VERSION 14 #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN * in linux/sched.h */ @@ -198,6 +198,10 @@ struct taskstats { /* v13: Delay waiting for write-protect copy */ __u64 wpcopy_count; __u64 wpcopy_delay_total; + + /* v14: Delay waiting for IRQ/SOFTIRQ */ + __u64 irq_count; + __u64 irq_delay_total; }; diff --git a/kernel/delayacct.c b/kernel/delayacct.c index e39cb696cfbd..6f0c358e73d8 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -179,12 +179,15 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp; tmp = d->wpcopy_delay_total + tsk->delays->wpcopy_delay; d->wpcopy_delay_total = (tmp < d->wpcopy_delay_total) ? 0 : tmp; + tmp = d->irq_delay_total + tsk->delays->irq_delay; + d->irq_delay_total = (tmp < d->irq_delay_total) ? 
0 : tmp; d->blkio_count += tsk->delays->blkio_count; d->swapin_count += tsk->delays->swapin_count; d->freepages_count += tsk->delays->freepages_count; d->thrashing_count += tsk->delays->thrashing_count; d->compact_count += tsk->delays->compact_count; d->wpcopy_count += tsk->delays->wpcopy_count; + d->irq_count += tsk->delays->irq_count; raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return 0; @@ -274,3 +277,14 @@ void __delayacct_wpcopy_end(void) ¤t->delays->wpcopy_delay, ¤t->delays->wpcopy_count); } + +void __delayacct_irq(struct task_struct *task, u32 delta) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&task->delays->lock, flags); + task->delays->irq_delay += delta; + task->delays->irq_count++; + raw_spin_unlock_irqrestore(&task->delays->lock, flags); +} + diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0d18c3969f90..5473e831daf3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -704,6 +704,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->prev_irq_time += irq_delta; delta -= irq_delta; psi_account_irqtime(rq->curr, irq_delta); + delayacct_irq(rq->curr, irq_delta); #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING if (static_key_false((¶virt_steal_rq_enabled))) { diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c index 23a15d8f2bf4..1334214546d7 100644 --- a/tools/accounting/getdelays.c +++ b/tools/accounting/getdelays.c @@ -198,17 +198,19 @@ static void print_delayacct(struct taskstats *t) printf("\n\nCPU %15s%15s%15s%15s%15s\n" " %15llu%15llu%15llu%15llu%15.3fms\n" "IO %15s%15s%15s\n" - " %15llu%15llu%15.3fms\n" + " %15llu%15llu%15.3fms\n" "SWAP %15s%15s%15s\n" - " %15llu%15llu%15.3fms\n" + " %15llu%15llu%15.3fms\n" "RECLAIM %12s%15s%15s\n" - " %15llu%15llu%15.3fms\n" + " %15llu%15llu%15.3fms\n" "THRASHING%12s%15s%15s\n" - " %15llu%15llu%15.3fms\n" + " %15llu%15llu%15.3fms\n" "COMPACT %12s%15s%15s\n" - " %15llu%15llu%15.3fms\n" + " %15llu%15llu%15.3fms\n" "WPCOPY %12s%15s%15s\n" - " %15llu%15llu%15.3fms\n", + " %15llu%15llu%15.3fms\n" + "IRQ %15s%15s%15s\n" + " %15llu%15llu%15.3fms\n", "count", "real total", "virtual total", "delay total", "delay average", (unsigned long long)t->cpu_count, @@ -219,27 +221,31 @@ static void print_delayacct(struct taskstats *t) "count", "delay total", "delay average", (unsigned long long)t->blkio_count, (unsigned long long)t->blkio_delay_total, - average_ms((double)t->blkio_delay_total, t->blkio_count), + average_ms((double)t->blkio_delay_total, t->blkio_count), "count", "delay total", "delay average", (unsigned long long)t->swapin_count, (unsigned long long)t->swapin_delay_total, - average_ms((double)t->swapin_delay_total, t->swapin_count), + average_ms((double)t->swapin_delay_total, t->swapin_count), "count", "delay total", "delay average", (unsigned long long)t->freepages_count, (unsigned long long)t->freepages_delay_total, - average_ms((double)t->freepages_delay_total, t->freepages_count), + average_ms((double)t->freepages_delay_total, t->freepages_count), "count", "delay total", "delay average", (unsigned long long)t->thrashing_count, (unsigned long long)t->thrashing_delay_total, - average_ms((double)t->thrashing_delay_total, t->thrashing_count), + average_ms((double)t->thrashing_delay_total, t->thrashing_count), "count", "delay total", "delay average", (unsigned long long)t->compact_count, (unsigned long long)t->compact_delay_total, - average_ms((double)t->compact_delay_total, t->compact_count), + average_ms((double)t->compact_delay_total, t->compact_count), "count", "delay 
total", "delay average", (unsigned long long)t->wpcopy_count, (unsigned long long)t->wpcopy_delay_total, - average_ms((double)t->wpcopy_delay_total, t->wpcopy_count)); + average_ms((double)t->wpcopy_delay_total, t->wpcopy_count), + "count", "delay total", "delay average", + (unsigned long long)t->irq_count, + (unsigned long long)t->irq_delay_total, + average_ms((double)t->irq_delay_total, t->irq_count)); } static void task_context_switch_counts(struct taskstats *t) -- cgit v1.2.3 From 31088f6f7906253ef4577f6a9b84e2d42447dba0 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Tue, 11 Apr 2023 10:27:47 +0100 Subject: uapi/linux/const.h: prefer ISO-friendly __typeof__ typeof is (still) a GNU extension, which means that it cannot be used when building ISO C (e.g. -std=c99). It should therefore be avoided in uapi headers in favour of the ISO-friendly __typeof__. Unfortunately this issue could not be detected by CONFIG_UAPI_HEADER_TEST=y as the __ALIGN_KERNEL() macro is not expanded in any uapi header. This matters from a userspace perspective, not a kernel one. uapi headers and their contents are expected to be usable in a variety of situations, and in particular when building ISO C applications (with -std=c99 or similar). This particular problem can be reproduced by trying to use the __ALIGN_KERNEL macro directly in application code, say: #include int align(int x, int a) { return __KERNEL_ALIGN(x, a); } and trying to build that with -std=c99. Link: https://lkml.kernel.org/r/20230411092747.3759032-1-kevin.brodsky@arm.com Fixes: a79ff731a1b2 ("netfilter: xtables: make XT_ALIGN() usable in exported headers by exporting __ALIGN_KERNEL()") Signed-off-by: Kevin Brodsky Reported-by: Ruben Ayrapetyan Tested-by: Ruben Ayrapetyan Reviewed-by: Petr Vorel Tested-by: Petr Vorel Reviewed-by: Masahiro Yamada Cc: Sam Ravnborg Signed-off-by: Andrew Morton --- include/uapi/linux/const.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/const.h b/include/uapi/linux/const.h index af2a44c08683..a429381e7ca5 100644 --- a/include/uapi/linux/const.h +++ b/include/uapi/linux/const.h @@ -28,7 +28,7 @@ #define _BITUL(x) (_UL(1) << (x)) #define _BITULL(x) (_ULL(1) << (x)) -#define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1) +#define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (__typeof__(x))(a) - 1) #define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) #define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) -- cgit v1.2.3 From af8ececda185078c096852edb4e1d7a2349e6856 Mon Sep 17 00:00:00 2001 From: Viktor Prutyanov Date: Thu, 13 Apr 2023 11:18:54 +0300 Subject: virtio: add VIRTIO_F_NOTIFICATION_DATA feature support According to VirtIO spec v1.2, VIRTIO_F_NOTIFICATION_DATA feature indicates that the driver passes extra data along with the queue notifications. In a split queue case, the extra data is 16-bit available index. In a packed queue case, the extra data is 1-bit wrap counter and 15-bit available index. Add support for this feature for MMIO, channel I/O and modern PCI transports. Signed-off-by: Viktor Prutyanov Acked-by: Jason Wang Reviewed-by: Xuan Zhuo Message-Id: <20230413081855.36643-2-alvaro.karsz@solid-run.com> Signed-off-by: Michael S. 
Tsirkin --- drivers/s390/virtio/virtio_ccw.c | 22 +++++++++++++++++++--- drivers/virtio/virtio_mmio.c | 18 +++++++++++++++++- drivers/virtio/virtio_pci_modern.c | 17 ++++++++++++++++- drivers/virtio/virtio_ring.c | 19 +++++++++++++++++++ include/linux/virtio_ring.h | 2 ++ include/uapi/linux/virtio_config.h | 6 ++++++ 6 files changed, 79 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c index 954fc31b4bc7..02922768b129 100644 --- a/drivers/s390/virtio/virtio_ccw.c +++ b/drivers/s390/virtio/virtio_ccw.c @@ -391,7 +391,7 @@ static void virtio_ccw_drop_indicator(struct virtio_ccw_device *vcdev, ccw_device_dma_free(vcdev->cdev, thinint_area, sizeof(*thinint_area)); } -static bool virtio_ccw_kvm_notify(struct virtqueue *vq) +static inline bool virtio_ccw_do_kvm_notify(struct virtqueue *vq, u32 data) { struct virtio_ccw_vq_info *info = vq->priv; struct virtio_ccw_device *vcdev; @@ -402,12 +402,22 @@ static bool virtio_ccw_kvm_notify(struct virtqueue *vq) BUILD_BUG_ON(sizeof(struct subchannel_id) != sizeof(unsigned int)); info->cookie = kvm_hypercall3(KVM_S390_VIRTIO_CCW_NOTIFY, *((unsigned int *)&schid), - vq->index, info->cookie); + data, info->cookie); if (info->cookie < 0) return false; return true; } +static bool virtio_ccw_kvm_notify(struct virtqueue *vq) +{ + return virtio_ccw_do_kvm_notify(vq, vq->index); +} + +static bool virtio_ccw_kvm_notify_with_data(struct virtqueue *vq) +{ + return virtio_ccw_do_kvm_notify(vq, vring_notification_data(vq)); +} + static int virtio_ccw_read_vq_conf(struct virtio_ccw_device *vcdev, struct ccw1 *ccw, int index) { @@ -495,6 +505,7 @@ static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev, struct ccw1 *ccw) { struct virtio_ccw_device *vcdev = to_vc_device(vdev); + bool (*notify)(struct virtqueue *vq); int err; struct virtqueue *vq = NULL; struct virtio_ccw_vq_info *info; @@ -502,6 +513,11 @@ static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev, unsigned long flags; bool may_reduce; + if (__virtio_test_bit(vdev, VIRTIO_F_NOTIFICATION_DATA)) + notify = virtio_ccw_kvm_notify_with_data; + else + notify = virtio_ccw_kvm_notify; + /* Allocate queue. */ info = kzalloc(sizeof(struct virtio_ccw_vq_info), GFP_KERNEL); if (!info) { @@ -524,7 +540,7 @@ static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev, may_reduce = vcdev->revision > 0; vq = vring_create_virtqueue(i, info->num, KVM_VIRTIO_CCW_RING_ALIGN, vdev, true, may_reduce, ctx, - virtio_ccw_kvm_notify, callback, name); + notify, callback, name); if (!vq) { /* For now, we fail if we can't get the requested size. */ diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c index 3ff746e3f24a..dd4424c14233 100644 --- a/drivers/virtio/virtio_mmio.c +++ b/drivers/virtio/virtio_mmio.c @@ -285,6 +285,16 @@ static bool vm_notify(struct virtqueue *vq) return true; } +static bool vm_notify_with_data(struct virtqueue *vq) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev); + u32 data = vring_notification_data(vq); + + writel(data, vm_dev->base + VIRTIO_MMIO_QUEUE_NOTIFY); + + return true; +} + /* Notify all virtqueues on an interrupt. 
*/ static irqreturn_t vm_interrupt(int irq, void *opaque) { @@ -363,12 +373,18 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned int in const char *name, bool ctx) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + bool (*notify)(struct virtqueue *vq); struct virtio_mmio_vq_info *info; struct virtqueue *vq; unsigned long flags; unsigned int num; int err; + if (__virtio_test_bit(vdev, VIRTIO_F_NOTIFICATION_DATA)) + notify = vm_notify_with_data; + else + notify = vm_notify; + if (!name) return NULL; @@ -397,7 +413,7 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned int in /* Create the vring */ vq = vring_create_virtqueue(index, num, VIRTIO_MMIO_VRING_ALIGN, vdev, - true, true, ctx, vm_notify, callback, name); + true, true, ctx, notify, callback, name); if (!vq) { err = -ENOMEM; goto error_new_virtqueue; diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 6e713904d8e8..d6bb68ba84e5 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -288,6 +288,15 @@ static u16 vp_config_vector(struct virtio_pci_device *vp_dev, u16 vector) return vp_modern_config_vector(&vp_dev->mdev, vector); } +static bool vp_notify_with_data(struct virtqueue *vq) +{ + u32 data = vring_notification_data(vq); + + iowrite32(data, (void __iomem *)vq->priv); + + return true; +} + static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, struct virtio_pci_vq_info *info, unsigned int index, @@ -298,10 +307,16 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, { struct virtio_pci_modern_device *mdev = &vp_dev->mdev; + bool (*notify)(struct virtqueue *vq); struct virtqueue *vq; u16 num; int err; + if (__virtio_test_bit(&vp_dev->vdev, VIRTIO_F_NOTIFICATION_DATA)) + notify = vp_notify_with_data; + else + notify = vp_notify; + if (index >= vp_modern_get_num_queues(mdev)) return ERR_PTR(-EINVAL); @@ -316,7 +331,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, vq = vring_create_virtqueue(index, num, SMP_CACHE_BYTES, &vp_dev->vdev, true, true, ctx, - vp_notify, callback, name); + notify, callback, name); if (!vq) return ERR_PTR(-ENOMEM); diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 34e3849629da..c5310eaf8b46 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -2762,6 +2762,23 @@ void vring_del_virtqueue(struct virtqueue *_vq) } EXPORT_SYMBOL_GPL(vring_del_virtqueue); +u32 vring_notification_data(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + u16 next; + + if (vq->packed_ring) + next = (vq->packed.next_avail_idx & + ~(-(1 << VRING_PACKED_EVENT_F_WRAP_CTR))) | + vq->packed.avail_wrap_counter << + VRING_PACKED_EVENT_F_WRAP_CTR; + else + next = vq->split.avail_idx_shadow; + + return next << 16 | _vq->index; +} +EXPORT_SYMBOL_GPL(vring_notification_data); + /* Manipulates transport-specific feature bits. */ void vring_transport_features(struct virtio_device *vdev) { @@ -2781,6 +2798,8 @@ void vring_transport_features(struct virtio_device *vdev) break; case VIRTIO_F_ORDER_PLATFORM: break; + case VIRTIO_F_NOTIFICATION_DATA: + break; default: /* We don't understand this bit. 
*/ __virtio_clear_bit(vdev, i); diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index 8b95b69ef694..2550c9170f4f 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -117,4 +117,6 @@ void vring_del_virtqueue(struct virtqueue *vq); void vring_transport_features(struct virtio_device *vdev); irqreturn_t vring_interrupt(int irq, void *_vq); + +u32 vring_notification_data(struct virtqueue *_vq); #endif /* _LINUX_VIRTIO_RING_H */ diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h index 3c05162bc988..2c712c654165 100644 --- a/include/uapi/linux/virtio_config.h +++ b/include/uapi/linux/virtio_config.h @@ -99,6 +99,12 @@ */ #define VIRTIO_F_SR_IOV 37 +/* + * This feature indicates that the driver passes extra data (besides + * identifying the virtqueue) in its device notifications. + */ +#define VIRTIO_F_NOTIFICATION_DATA 38 + /* * This feature indicates that the driver can reset a queue individually. */ -- cgit v1.2.3 From d7597f59d1d33e9efbffa7060deb9ee5bd119e62 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Mon, 17 Apr 2023 22:13:40 -0700 Subject: mm: add new api to enable ksm per process Patch series "mm: process/cgroup ksm support", v9. So far KSM can only be enabled by calling madvise for memory regions. To be able to use KSM for more workloads, KSM needs to have the ability to be enabled / disabled at the process / cgroup level. Use case 1: The madvise call is not available in the programming language. Examples of this are programs with forked workloads using a garbage collected language without pointers. In such a language madvise cannot be made available. In addition the addresses of objects get moved around as they are garbage collected. KSM sharing needs to be enabled "from the outside" for these types of workloads. Use case 2: The same interpreter can also be used for workloads where KSM brings no benefit or even has overhead. We'd like to be able to enable KSM on a workload-by-workload basis. Use case 3: With the madvise call sharing opportunities are only enabled for the current process: it is a workload-local decision. A considerable number of sharing opportunities may exist across multiple workloads or jobs (if they are part of the same security domain). Only a higher-level entity like a job scheduler or container can know for certain if it's running one or more instances of a job. That job scheduler however doesn't have the necessary internal workload knowledge to make targeted madvise calls. Security concerns: In previous discussions security concerns have been brought up. The problem is that an individual workload does not have the knowledge about what else is running on a machine. Therefore it has to be very conservative about which memory areas can be shared. However, if the system is dedicated to running multiple jobs within the same security domain, it's the job scheduler that has the knowledge that sharing can be safely enabled and is even desirable. Performance: Experiments with using UKSM have shown a capacity increase of around 20%.
From d7597f59d1d33e9efbffa7060deb9ee5bd119e62 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Mon, 17 Apr 2023 22:13:40 -0700 Subject: mm: add new api to enable ksm per process Patch series "mm: process/cgroup ksm support", v9. So far KSM can only be enabled by calling madvise for memory regions. To be able to use KSM for more workloads, KSM needs to have the ability to be enabled / disabled at the process / cgroup level. Use case 1: The madvise call is not available in the programming language. Examples of this are programs with forked workloads using a garbage-collected language without pointers. In such a language madvise cannot be made available. In addition, the addresses of objects get moved around as they are garbage collected. KSM sharing needs to be enabled "from the outside" for these types of workloads. Use case 2: The same interpreter can also be used for workloads where KSM brings no benefit or even has overhead. We'd like to be able to enable KSM on a workload-by-workload basis. Use case 3: With the madvise call, sharing opportunities are only enabled for the current process: it is a workload-local decision. A considerable number of sharing opportunities may exist across multiple workloads or jobs (if they are part of the same security domain). Only a higher-level entity like a job scheduler or container can know for certain if it is running one or more instances of a job. That job scheduler, however, doesn't have the necessary internal workload knowledge to make targeted madvise calls. Security concerns: In previous discussions security concerns have been brought up. The problem is that an individual workload does not have the knowledge about what else is running on a machine. Therefore it has to be very conservative about which memory areas can be shared. However, if the system is dedicated to running multiple jobs within the same security domain, it's the job scheduler that has the knowledge that sharing can be safely enabled and is even desirable. Performance: Experiments with using UKSM have shown a capacity increase of around 20%. Here are the metrics from an Instagram workload (taken from a machine with 64GB main memory): full_scans: 445 general_profit: 20158298048 max_page_sharing: 256 merge_across_nodes: 1 pages_shared: 129547 pages_sharing: 5119146 pages_to_scan: 4000 pages_unshared: 1760924 pages_volatile: 10761341 run: 1 sleep_millisecs: 20 stable_node_chains: 167 stable_node_chains_prune_millisecs: 2000 stable_node_dups: 2751 use_zero_pages: 0 zero_pages_sharing: 0 After the service has been running for 30 minutes to an hour, 4 to 5 million shared pages are common for this workload when using KSM. Detailed changes: 1. New options for prctl system command This patch series adds two new options to the prctl system call. The first one allows KSM to be enabled at the process level and the second one queries the setting. The setting will be inherited by child processes. With the above setting, KSM can be enabled for the seed process of a cgroup and all processes in the cgroup will inherit the setting. 2. Changes to KSM processing When KSM is enabled at the process level, the KSM code will iterate over all the VMAs and enable KSM for the eligible VMAs. When forking a process that has KSM enabled, the setting will be inherited by the new child process. 3. Add general_profit metric The general_profit metric of KSM is specified in the documentation, but not calculated. This adds the general profit metric to /sys/kernel/mm/ksm. 4. Add more metrics to ksm_stat This adds the process profit metric to /proc/<pid>/ksm_stat. 5. Add more tests to ksm_tests and ksm_functional_tests This adds an option to specify the merge type to the ksm_tests, which allows testing both madvise- and prctl-based KSM. It also adds two new tests to ksm_functional_tests: one to test the new prctl options, and the other a fork test to verify that the KSM process setting is inherited by child processes. This patch (of 3): So far KSM can only be enabled by calling madvise for memory regions. To be able to use KSM for more workloads, KSM needs to have the ability to be enabled / disabled at the process / cgroup level. 1. New options for prctl system command This patch series adds two new options to the prctl system call. The first one allows KSM to be enabled at the process level and the second one queries the setting. The setting will be inherited by child processes. With the above setting, KSM can be enabled for the seed process of a cgroup and all processes in the cgroup will inherit the setting. 2. Changes to KSM processing When KSM is enabled at the process level, the KSM code will iterate over all the VMAs and enable KSM for the eligible VMAs. When forking a process that has KSM enabled, the setting will be inherited by the new child process. 1) Introduce new MMF_VM_MERGE_ANY flag This introduces the new MMF_VM_MERGE_ANY flag. When this flag is set, kernel samepage merging (KSM) gets enabled for all VMAs of a process. 2) Setting VM_MERGEABLE on VMA creation When a VMA is created, if the MMF_VM_MERGE_ANY flag is set, the VM_MERGEABLE flag will be set for this VMA. 3) Support disabling of KSM for a process This adds the ability to disable KSM for a process if KSM has been enabled for the process with prctl. 4) Add new prctl options to get and set KSM for a process This adds two new options to the prctl system call (a usage sketch follows below): - enable KSM for all VMAs of a process (if the VMAs support it). - query if KSM has been enabled for a process.
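To make the new interface concrete, here is a minimal userspace sketch; it assumes a libc whose headers predate these constants, so the values are copied from the include/uapi/linux/prctl.h hunk in the patch below:

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_MEMORY_MERGE
#define PR_SET_MEMORY_MERGE 67	/* values from the prctl.h hunk below */
#define PR_GET_MEMORY_MERGE 68
#endif

int main(void)
{
	/* Opt the whole process into KSM; arg3..arg5 must be zero. */
	if (prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0))
		perror("PR_SET_MEMORY_MERGE");

	/* Returns 1 if MMF_VM_MERGE_ANY is set, 0 if not, -1 on error. */
	printf("merge-any: %d\n", prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0));
	return 0;
}

Children forked after the first call inherit the setting, which is what makes the "seed process of a cgroup" pattern described above work.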
3. Disabling MMF_VM_MERGE_ANY for storage keys in s390 In the s390 architecture, when storage keys are used, the MMF_VM_MERGE_ANY flag will be disabled. Link: https://lkml.kernel.org/r/20230418051342.1919757-1-shr@devkernel.io Link: https://lkml.kernel.org/r/20230418051342.1919757-2-shr@devkernel.io Signed-off-by: Stefan Roesch Acked-by: David Hildenbrand Cc: David Hildenbrand Cc: Johannes Weiner Cc: Michal Hocko Cc: Rik van Riel Cc: Bagas Sanjaya Signed-off-by: Andrew Morton --- arch/s390/mm/gmap.c | 7 +++ include/linux/ksm.h | 21 ++++++++- include/linux/sched/coredump.h | 1 + include/uapi/linux/prctl.h | 2 + kernel/sys.c | 27 +++++++++++ mm/ksm.c | 104 ++++++++++++++++++++++++++++++++++------- mm/mmap.c | 3 ++ 7 files changed, 146 insertions(+), 19 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 5a716bdcba05..0949811761e6 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2591,6 +2591,13 @@ int gmap_mark_unmergeable(void) int ret; VMA_ITERATOR(vmi, mm, 0); + /* + * Make sure to disable KSM (if enabled for the whole process or + * individual VMAs). Note that nothing currently hinders user space + * from re-enabling it. + */ + clear_bit(MMF_VM_MERGE_ANY, &mm->flags); + for_each_vma(vmi, vma) { /* Copy vm_flags to avoid partial modifications in ksm_madvise */ vm_flags = vma->vm_flags; diff --git a/include/linux/ksm.h b/include/linux/ksm.h index d9e701326a88..4647b0c70c12 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -18,13 +18,26 @@ #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags); + +void ksm_add_vma(struct vm_area_struct *vma); +int ksm_enable_merge_any(struct mm_struct *mm); + int __ksm_enter(struct mm_struct *mm); void __ksm_exit(struct mm_struct *mm); static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { - if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) - return __ksm_enter(mm); + int ret; + + if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) { + ret = __ksm_enter(mm); + if (ret) + return ret; + } + + if (test_bit(MMF_VM_MERGE_ANY, &oldmm->flags)) + set_bit(MMF_VM_MERGE_ANY, &mm->flags); + return 0; } @@ -57,6 +70,10 @@ void collect_procs_ksm(struct page *page, struct list_head *to_kill, #endif #else /* !CONFIG_KSM */ +static inline void ksm_add_vma(struct vm_area_struct *vma) +{ +} + static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { return 0; diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 0e17ae7fbfd3..0ee96ea7a0e9 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -90,4 +90,5 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK) +#define MMF_VM_MERGE_ANY 29 #endif /* _LINUX_SCHED_COREDUMP_H */ diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index b99c0be72577..f23d9a16507f 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -292,4 +292,6 @@ struct prctl_mm_map { #define PR_GET_AUXV 0x41555856 +#define PR_SET_MEMORY_MERGE 67 +#define PR_GET_MEMORY_MERGE 68 #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/sys.c b/kernel/sys.c index 26c1399e0654..72cdb16e2636 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -2687,6 +2688,32 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long,
arg2, unsigned long, arg3, case PR_SET_VMA: error = prctl_set_vma(arg2, arg3, arg4, arg5); break; +#ifdef CONFIG_KSM + case PR_SET_MEMORY_MERGE: + if (arg3 || arg4 || arg5) + return -EINVAL; + if (mmap_write_lock_killable(me->mm)) + return -EINTR; + + if (arg2) { + error = ksm_enable_merge_any(me->mm); + } else { + /* + * TODO: we might want disable KSM on all VMAs and + * trigger unsharing to completely disable KSM. + */ + clear_bit(MMF_VM_MERGE_ANY, &me->mm->flags); + error = 0; + } + mmap_write_unlock(me->mm); + break; + case PR_GET_MEMORY_MERGE: + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + + error = !!test_bit(MMF_VM_MERGE_ANY, &me->mm->flags); + break; +#endif default: error = -EINVAL; break; diff --git a/mm/ksm.c b/mm/ksm.c index 37c63310bc4e..35ac6c741572 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -515,6 +515,28 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; } +static bool vma_ksm_compatible(struct vm_area_struct *vma) +{ + if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE | VM_PFNMAP | + VM_IO | VM_DONTEXPAND | VM_HUGETLB | + VM_MIXEDMAP)) + return false; /* just ignore the advice */ + + if (vma_is_dax(vma)) + return false; + +#ifdef VM_SAO + if (vma->vm_flags & VM_SAO) + return false; +#endif +#ifdef VM_SPARC_ADI + if (vma->vm_flags & VM_SPARC_ADI) + return false; +#endif + + return true; +} + static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm, unsigned long addr) { @@ -1026,6 +1048,7 @@ mm_exiting: mm_slot_free(mm_slot_cache, mm_slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); + clear_bit(MMF_VM_MERGE_ANY, &mm->flags); mmdrop(mm); } else spin_unlock(&ksm_mmlist_lock); @@ -2408,6 +2431,7 @@ no_vmas: mm_slot_free(mm_slot_cache, mm_slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); + clear_bit(MMF_VM_MERGE_ANY, &mm->flags); mmap_read_unlock(mm); mmdrop(mm); } else { @@ -2485,6 +2509,66 @@ static int ksm_scan_thread(void *nothing) return 0; } +static void __ksm_add_vma(struct vm_area_struct *vma) +{ + unsigned long vm_flags = vma->vm_flags; + + if (vm_flags & VM_MERGEABLE) + return; + + if (vma_ksm_compatible(vma)) + vm_flags_set(vma, VM_MERGEABLE); +} + +/** + * ksm_add_vma - Mark vma as mergeable if compatible + * + * @vma: Pointer to vma + */ +void ksm_add_vma(struct vm_area_struct *vma) +{ + struct mm_struct *mm = vma->vm_mm; + + if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) + __ksm_add_vma(vma); +} + +static void ksm_add_vmas(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + + VMA_ITERATOR(vmi, mm, 0); + for_each_vma(vmi, vma) + __ksm_add_vma(vma); +} + +/** + * ksm_enable_merge_any - Add mm to mm ksm list and enable merging on all + * compatible VMA's + * + * @mm: Pointer to mm + * + * Returns 0 on success, otherwise error code + */ +int ksm_enable_merge_any(struct mm_struct *mm) +{ + int err; + + if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) + return 0; + + if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { + err = __ksm_enter(mm); + if (err) + return err; + } + + set_bit(MMF_VM_MERGE_ANY, &mm->flags); + ksm_add_vmas(mm); + + return 0; +} + int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags) { @@ -2493,25 +2577,10 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, switch (advice) { case MADV_MERGEABLE: - /* - * Be somewhat over-protective for now! 
- */ - if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | - VM_PFNMAP | VM_IO | VM_DONTEXPAND | - VM_HUGETLB | VM_MIXEDMAP)) - return 0; /* just ignore the advice */ - - if (vma_is_dax(vma)) + if (vma->vm_flags & VM_MERGEABLE) return 0; - -#ifdef VM_SAO - if (*vm_flags & VM_SAO) + if (!vma_ksm_compatible(vma)) return 0; -#endif -#ifdef VM_SPARC_ADI - if (*vm_flags & VM_SPARC_ADI) - return 0; -#endif if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { err = __ksm_enter(mm); @@ -2615,6 +2684,7 @@ void __ksm_exit(struct mm_struct *mm) if (easy_to_free) { mm_slot_free(mm_slot_cache, mm_slot); + clear_bit(MMF_VM_MERGE_ANY, &mm->flags); clear_bit(MMF_VM_MERGEABLE, &mm->flags); mmdrop(mm); } else if (mm_slot) { diff --git a/mm/mmap.c b/mm/mmap.c index 790cc62c0038..51b6976fd525 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -2729,6 +2730,7 @@ unmap_writable: if (file && vm_flags & VM_SHARED) mapping_unmap_writable(file->f_mapping); file = vma->vm_file; + ksm_add_vma(vma); expanded: perf_event_mmap(vma); @@ -3001,6 +3003,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, goto mas_store_fail; mm->map_count++; + ksm_add_vma(vma); out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; -- cgit v1.2.3 From 3db166d6cf0ea73dd2c887036aad2e95e0884d9b Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 18 Apr 2023 10:39:01 -0700 Subject: cxl/mbox: Deprecate poison commands The CXL subsystem is adding formal mechanisms for managing device poison. Minimize the maintenance burden going forward, and maximize the investment in common tooling by deprecating direct user access to poison commands outside of CXL_MEM_RAW_COMMANDS debug scenarios. A new cxl_deprecated_commands[] list is created for querying which command IDs defined in previous kernels are now deprecated. CXL Media and Poison Management commands, opcodes 0x43XX, defined in CXL 3.0 Spec, Table 8-93, are deprecated with one exception: Get Scan Media Capabilities. Keep Get Scan Media Capabilities as it simply provides information and has no impact on the device state. Effectively all of the commands defined in: commit 87815ee9d006 ("cxl/pci: Add media provisioning required commands") ...were defined prematurely and should have waited until the kernel implementation was decided. To my knowledge there are no shipping devices with poison support and no known tools that would regress with this change.
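Since cxl_mem.h is a uapi header, the new table is directly visible to tooling. A hedged sketch of how a consumer might use it; cxl_cmd_is_deprecated() is hypothetical, only cxl_deprecated_commands[] and CXL_MEM_COMMAND_ID_MAX come from the header changes below:

#include <linux/cxl_mem.h>

/* Return nonzero when a command id (e.g. from CXL_MEM_QUERY_COMMANDS
 * output) refers to one of the deprecated poison/scan-media commands. */
static int cxl_cmd_is_deprecated(unsigned int id)
{
	if (id >= CXL_MEM_COMMAND_ID_MAX)
		return 0;
	return cxl_deprecated_commands[id];
}

A tool listing commands could pair this with cxl_command_names[] (also in the header) to label deprecated entries instead of silently hiding them.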
Co-developed-by: Alison Schofield Signed-off-by: Alison Schofield Link: https://lore.kernel.org/r/652197e9bc8885e6448d989405b9e50ee9d6b0a6.1681838291.git.alison.schofield@intel.com Tested-by: Jonathan Cameron Signed-off-by: Dan Williams --- drivers/cxl/core/mbox.c | 5 ----- include/uapi/linux/cxl_mem.h | 35 ++++++++++++++++++++++++++++++----- 2 files changed, 30 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index f2addb457172..938cff2c948e 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -61,12 +61,7 @@ static struct cxl_mem_command cxl_mem_commands[CXL_MEM_COMMAND_ID_MAX] = { CXL_CMD(SET_ALERT_CONFIG, 0xc, 0, 0), CXL_CMD(GET_SHUTDOWN_STATE, 0, 0x1, 0), CXL_CMD(SET_SHUTDOWN_STATE, 0x1, 0, 0), - CXL_CMD(GET_POISON, 0x10, CXL_VARIABLE_PAYLOAD, 0), - CXL_CMD(INJECT_POISON, 0x8, 0, 0), - CXL_CMD(CLEAR_POISON, 0x48, 0, 0), CXL_CMD(GET_SCAN_MEDIA_CAPS, 0x10, 0x4, 0), - CXL_CMD(SCAN_MEDIA, 0x11, 0, 0), - CXL_CMD(GET_SCAN_MEDIA, 0, CXL_VARIABLE_PAYLOAD, 0), }; /* diff --git a/include/uapi/linux/cxl_mem.h b/include/uapi/linux/cxl_mem.h index 86bbacf2a315..14bc6e742148 100644 --- a/include/uapi/linux/cxl_mem.h +++ b/include/uapi/linux/cxl_mem.h @@ -40,19 +40,22 @@ ___C(SET_ALERT_CONFIG, "Set Alert Configuration"), \ ___C(GET_SHUTDOWN_STATE, "Get Shutdown State"), \ ___C(SET_SHUTDOWN_STATE, "Set Shutdown State"), \ - ___C(GET_POISON, "Get Poison List"), \ - ___C(INJECT_POISON, "Inject Poison"), \ - ___C(CLEAR_POISON, "Clear Poison"), \ + ___DEPRECATED(GET_POISON, "Get Poison List"), \ + ___DEPRECATED(INJECT_POISON, "Inject Poison"), \ + ___DEPRECATED(CLEAR_POISON, "Clear Poison"), \ ___C(GET_SCAN_MEDIA_CAPS, "Get Scan Media Capabilities"), \ - ___C(SCAN_MEDIA, "Scan Media"), \ - ___C(GET_SCAN_MEDIA, "Get Scan Media Results"), \ + ___DEPRECATED(SCAN_MEDIA, "Scan Media"), \ + ___DEPRECATED(GET_SCAN_MEDIA, "Get Scan Media Results"), \ ___C(MAX, "invalid / last command") #define ___C(a, b) CXL_MEM_COMMAND_ID_##a +#define ___DEPRECATED(a, b) CXL_MEM_DEPRECATED_ID_##a enum { CXL_CMDS }; #undef ___C +#undef ___DEPRECATED #define ___C(a, b) { b } +#define ___DEPRECATED(a, b) { "Deprecated " b } static const struct { const char *name; } cxl_command_names[] __attribute__((__unused__)) = { CXL_CMDS }; @@ -68,6 +71,28 @@ static const struct { */ #undef ___C +#undef ___DEPRECATED +#define ___C(a, b) (0) +#define ___DEPRECATED(a, b) (1) + +static const __u8 cxl_deprecated_commands[] + __attribute__((__unused__)) = { CXL_CMDS }; + +/* + * Here's how this actually breaks out: + * cxl_deprecated_commands[] = { + * [CXL_MEM_COMMAND_ID_INVALID] = 0, + * [CXL_MEM_COMMAND_ID_IDENTIFY] = 0, + * ... + * [CXL_MEM_DEPRECATED_ID_GET_POISON] = 1, + * [CXL_MEM_DEPRECATED_ID_INJECT_POISON] = 1, + * [CXL_MEM_DEPRECATED_ID_CLEAR_POISON] = 1, + * ... + * }; + */ + +#undef ___C +#undef ___DEPRECATED /** * struct cxl_command_info - Command information returned from a query. -- cgit v1.2.3 From 9280c577431401544e63dfb489a830a42bee25eb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 20 Apr 2023 13:56:31 -0400 Subject: NFSD: Handle new xprtsec= export option Enable administrators to require clients to use transport layer security when accessing particular exports. 
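For context, the administrator-facing knob would look something like the following /etc/exports entry. The option spelling is an assumption here: it is implemented by the exportfs tooling in nfs-utils rather than by this kernel patch, which only defines the three mode bits (none=0x1, tls=0x2, mtls=0x4) parsed by the new xprtsec_parse() helper below.

# Hypothetical export: permit only TLS-protected transports, with or
# without a client certificate; cleartext mounts are refused.
/export  *(rw,sync,xprtsec=tls:mtls)

When the kernel's export cache receives the parsed list, the modes are ORed into ex_xprtsec_modes, and check_nfsd_access() then matches them against the XPT_TLS_SESSION and XPT_PEER_AUTH transport bits.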
Signed-off-by: Chuck Lever --- fs/nfsd/export.c | 51 +++++++++++++++++++++++++++++++++++++--- fs/nfsd/export.h | 1 + include/uapi/linux/nfsd/export.h | 13 ++++++++++ 3 files changed, 62 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 6da74aebe1fb..ae85257b4238 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -439,7 +439,6 @@ static int check_export(struct path *path, int *flags, unsigned char *uuid) return -EINVAL; } return 0; - } #ifdef CONFIG_NFSD_V4 @@ -546,6 +545,29 @@ static inline int secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { return 0; } #endif +static int xprtsec_parse(char **mesg, char *buf, struct svc_export *exp) +{ + unsigned int i, mode, listsize; + int err; + + err = get_uint(mesg, &listsize); + if (err) + return err; + if (listsize > NFSEXP_XPRTSEC_NUM) + return -EINVAL; + + exp->ex_xprtsec_modes = 0; + for (i = 0; i < listsize; i++) { + err = get_uint(mesg, &mode); + if (err) + return err; + if (mode > NFSEXP_XPRTSEC_MTLS) + return -EINVAL; + exp->ex_xprtsec_modes |= mode; + } + return 0; +} + static inline int nfsd_uuid_parse(char **mesg, char *buf, unsigned char **puuid) { @@ -608,6 +630,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) exp.ex_client = dom; exp.cd = cd; exp.ex_devid_map = NULL; + exp.ex_xprtsec_modes = NFSEXP_XPRTSEC_ALL; /* expiry */ err = get_expiry(&mesg, &exp.h.expiry_time); @@ -649,6 +672,8 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) err = nfsd_uuid_parse(&mesg, buf, &exp.ex_uuid); else if (strcmp(buf, "secinfo") == 0) err = secinfo_parse(&mesg, buf, &exp); + else if (strcmp(buf, "xprtsec") == 0) + err = xprtsec_parse(&mesg, buf, &exp); else /* quietly ignore unknown words and anything * following. Newer user-space can try to set @@ -662,6 +687,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) err = check_export(&exp.ex_path, &exp.ex_flags, exp.ex_uuid); if (err) goto out4; + /* * No point caching this if it would immediately expire. * Also, this protects exportfs's dummy export from the @@ -823,6 +849,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem) for (i = 0; i < MAX_SECINFO_LIST; i++) { new->ex_flavors[i] = item->ex_flavors[i]; } + new->ex_xprtsec_modes = item->ex_xprtsec_modes; } static struct cache_head *svc_export_alloc(void) @@ -1034,9 +1061,26 @@ static struct svc_export *exp_find(struct cache_detail *cd, __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp) { - struct exp_flavor_info *f; - struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors; + struct exp_flavor_info *f, *end = exp->ex_flavors + exp->ex_nflavors; + struct svc_xprt *xprt = rqstp->rq_xprt; + + if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_NONE) { + if (!test_bit(XPT_TLS_SESSION, &xprt->xpt_flags)) + goto ok; + } + if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_TLS) { + if (test_bit(XPT_TLS_SESSION, &xprt->xpt_flags) && + !test_bit(XPT_PEER_AUTH, &xprt->xpt_flags)) + goto ok; + } + if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_MTLS) { + if (test_bit(XPT_TLS_SESSION, &xprt->xpt_flags) && + test_bit(XPT_PEER_AUTH, &xprt->xpt_flags)) + goto ok; + } + goto denied; +ok: /* legacy gss-only clients are always OK: */ if (exp->ex_client == rqstp->rq_gssclient) return 0; @@ -1061,6 +1105,7 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp) if (nfsd4_spo_must_allow(rqstp)) return 0; +denied: return rqstp->rq_vers < 4 ? 
nfserr_acces : nfserr_wrongsec; } diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index d03f7f6a8642..2df8ae25aad3 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -77,6 +77,7 @@ struct svc_export { struct cache_detail *cd; struct rcu_head ex_rcu; struct export_stats ex_stats; + unsigned long ex_xprtsec_modes; }; /* an "export key" (expkey) maps a filehandlefragement to an diff --git a/include/uapi/linux/nfsd/export.h b/include/uapi/linux/nfsd/export.h index 2124ba904779..a73ca3703abb 100644 --- a/include/uapi/linux/nfsd/export.h +++ b/include/uapi/linux/nfsd/export.h @@ -62,5 +62,18 @@ | NFSEXP_ALLSQUASH \ | NFSEXP_INSECURE_PORT) +/* + * Transport layer security policies that are permitted to access + * an export + */ +#define NFSEXP_XPRTSEC_NONE 0x0001 +#define NFSEXP_XPRTSEC_TLS 0x0002 +#define NFSEXP_XPRTSEC_MTLS 0x0004 + +#define NFSEXP_XPRTSEC_NUM (3) + +#define NFSEXP_XPRTSEC_ALL (NFSEXP_XPRTSEC_NONE | \ + NFSEXP_XPRTSEC_TLS | \ + NFSEXP_XPRTSEC_MTLS) #endif /* _UAPINFSD_EXPORT_H */ -- cgit v1.2.3 From daf376a366fd2d469d66ab83dfdc074777462bab Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Mon, 10 Apr 2023 13:06:08 -0500 Subject: uapi nbd: improve doc links to userspace spec The uapi header intentionally documents only the NBD server features that the kernel module will utilize as a client. But while it already had one mention of skipped bits due to userspace extensions, it did not actually direct the reader to the canonical source to learn about those extensions. While touching comments, fix an outdated reference that listed only READ and WRITE as commands. Signed-off-by: Eric Blake Reviewed-by: Ming Lei Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20230410180611.1051618-2-eblake@redhat.com Signed-off-by: Jens Axboe --- include/uapi/linux/nbd.h | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nbd.h b/include/uapi/linux/nbd.h index 20d6cc91435d..8797387caaf7 100644 --- a/include/uapi/linux/nbd.h +++ b/include/uapi/linux/nbd.h @@ -11,6 +11,8 @@ * Cleanup PARANOIA usage & code. * 2004/02/19 Paul Clements * Removed PARANOIA, plus various cleanup and comments + * 2023 Copyright Red Hat + * Link to userspace extensions. */ #ifndef _UAPILINUX_NBD_H @@ -30,12 +32,18 @@ #define NBD_SET_TIMEOUT _IO( 0xab, 9 ) #define NBD_SET_FLAGS _IO( 0xab, 10) +/* + * See also https://github.com/NetworkBlockDevice/nbd/blob/master/doc/proto.md + * for additional userspace extensions not yet utilized in the kernel module. + */ + enum { NBD_CMD_READ = 0, NBD_CMD_WRITE = 1, NBD_CMD_DISC = 2, NBD_CMD_FLUSH = 3, NBD_CMD_TRIM = 4 + /* userspace defines additional extension commands */ }; /* values for flags field, these are server interaction specific. */ @@ -64,14 +72,15 @@ enum { #define NBD_REQUEST_MAGIC 0x25609513 #define NBD_REPLY_MAGIC 0x67446698 /* Do *not* use magics: 0x12560953 0x96744668. */ +/* magic 0x668e33ef for structured reply not supported by kernel yet */ /* * This is the packet used for communication between client and * server. All data are in network byte order. */ struct nbd_request { - __be32 magic; - __be32 type; /* == READ || == WRITE */ + __be32 magic; /* NBD_REQUEST_MAGIC */ + __be32 type; /* See NBD_CMD_* */ char handle[8]; __be64 from; __be32 len; @@ -82,7 +91,7 @@ struct nbd_request { * it has completed an I/O request (or an error occurs). 
*/ struct nbd_reply { - __be32 magic; + __be32 magic; /* NBD_REPLY_MAGIC */ __be32 error; /* 0 = ok, else error */ char handle[8]; /* handle you got from request */ }; -- cgit v1.2.3 From 2686eb845da7762ee98b17e578b0c081aafb77b9 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Mon, 10 Apr 2023 13:06:09 -0500 Subject: uapi nbd: add cookie alias to handle The uapi header declares a 'char handle[8]' per request, a name which is overloaded in English (are you referring to "handle" the verb, such as handling a signal or writing a callback handler, or "handle" the noun, the value used in a lookup table to correlate a response back to the request). Many user-space NBD implementations (both servers and clients) have instead used 'uint64_t cookie' or similar, as it is easier to directly assign an integer than to futz around with memcpy. In fact, upstream documentation is now encouraging this shift in terminology: https://github.com/NetworkBlockDevice/nbd/commit/ca4392eb2b Accomplish this by use of an anonymous union to provide the alias for anyone getting the definition from the uapi; this does not break existing clients, while exposing the nicer name for those who prefer it. Note that block/nbd.c still uses the term handle (in fact, it actually combines a 32-bit cookie and a 32-bit tag into the 64-bit handle), but that internal usage is not changed by the public uapi, since no compliant NBD server has any reason to inspect or alter the 64 bits sent over the socket. Signed-off-by: Eric Blake Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20230410180611.1051618-3-eblake@redhat.com Signed-off-by: Jens Axboe --- include/uapi/linux/nbd.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nbd.h b/include/uapi/linux/nbd.h index 8797387caaf7..80ce0ef43afd 100644 --- a/include/uapi/linux/nbd.h +++ b/include/uapi/linux/nbd.h @@ -12,7 +12,7 @@ * 2004/02/19 Paul Clements * Removed PARANOIA, plus various cleanup and comments * 2023 Copyright Red Hat - * Link to userspace extensions. + * Link to userspace extensions, favor cookie over handle. */ #ifndef _UAPILINUX_NBD_H @@ -81,7 +81,10 @@ enum { struct nbd_request { __be32 magic; /* NBD_REQUEST_MAGIC */ __be32 type; /* See NBD_CMD_* */ - char handle[8]; + union { + __be64 cookie; /* Opaque identifier for request */ + char handle[8]; /* older spelling of cookie */ + }; __be64 from; __be32 len; } __attribute__((packed)); @@ -93,6 +96,9 @@ struct nbd_request { struct nbd_reply { __be32 magic; /* NBD_REPLY_MAGIC */ __be32 error; /* 0 = ok, else error */ - char handle[8]; /* handle you got from request */ + union { + __be64 cookie; /* Opaque identifier from request */ + char handle[8]; /* older spelling of cookie */ + }; }; #endif /* _UAPILINUX_NBD_H */ -- cgit v1.2.3 From 3632679d9e4f879f49949bb5b050e0de553e4739 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 22 May 2023 14:08:20 +0200 Subject: ipv{4,6}/raw: fix output xfrm lookup wrt protocol With a raw socket bound to IPPROTO_RAW (i.e. with hdrincl enabled), the protocol field of the flow structure, built by raw_sendmsg() / rawv6_sendmsg(), is set to IPPROTO_RAW. This breaks the IPsec policy lookup when some policies are defined with a protocol in the selector. For ipv6, the sin6_port field of 'struct sockaddr_in6' can already be used to specify the protocol; just accept all values on an IPPROTO_RAW socket. For ipv4, the sin_port field of 'struct sockaddr_in' could not be used without breaking backward compatibility (the value of this field was never checked). Let's add a new kind of control message, so that userland can specify which protocol is used.
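A minimal sending-side sketch of that control message; set_ip_protocol() is illustrative, while IP_PROTOCOL itself comes from the include/uapi/linux/in.h hunk below (the kernel insists on an int payload with a value between 1 and 255):

#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>

#ifndef IP_PROTOCOL
#define IP_PROTOCOL 52	/* from the in.h hunk below */
#endif

/* Attach an IP_PROTOCOL ancillary message before calling sendmsg().
 * cbuf must provide CMSG_SPACE(sizeof(int)) bytes of storage. */
static void set_ip_protocol(struct msghdr *msg, char *cbuf, int protocol)
{
	struct cmsghdr *cmsg;

	msg->msg_control = cbuf;
	msg->msg_controllen = CMSG_SPACE(sizeof(int));

	cmsg = CMSG_FIRSTHDR(msg);
	cmsg->cmsg_level = SOL_IP;
	cmsg->cmsg_type = IP_PROTOCOL;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &protocol, sizeof(protocol));
}

Note the backward-compatibility preset in the raw.c hunk: with hdrincl enabled, ipc.protocol still defaults to IPPROTO_RAW, so sockets that send no control message keep the old xfrm lookup behaviour.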
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") CC: stable@vger.kernel.org Signed-off-by: Nicolas Dichtel Link: https://lore.kernel.org/r/20230522120820.1319391-1-nicolas.dichtel@6wind.com Signed-off-by: Paolo Abeni --- include/net/ip.h | 2 ++ include/uapi/linux/in.h | 1 + net/ipv4/ip_sockglue.c | 12 +++++++++++- net/ipv4/raw.c | 5 ++++- net/ipv6/raw.c | 3 ++- 5 files changed, 20 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/ip.h b/include/net/ip.h index c3fffaa92d6e..acec504c469a 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -76,6 +76,7 @@ struct ipcm_cookie { __be32 addr; int oif; struct ip_options_rcu *opt; + __u8 protocol; __u8 ttl; __s16 tos; char priority; @@ -96,6 +97,7 @@ static inline void ipcm_init_sk(struct ipcm_cookie *ipcm, ipcm->sockc.tsflags = inet->sk.sk_tsflags; ipcm->oif = READ_ONCE(inet->sk.sk_bound_dev_if); ipcm->addr = inet->inet_saddr; + ipcm->protocol = inet->inet_num; } #define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb)) diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index 4b7f2df66b99..e682ab628dfa 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -163,6 +163,7 @@ struct in_addr { #define IP_MULTICAST_ALL 49 #define IP_UNICAST_IF 50 #define IP_LOCAL_PORT_RANGE 51 +#define IP_PROTOCOL 52 #define MCAST_EXCLUDE 0 #define MCAST_INCLUDE 1 diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index b511ff0adc0a..8e97d8d4cc9d 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -317,7 +317,14 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, ipc->tos = val; ipc->priority = rt_tos2priority(ipc->tos); break; - + case IP_PROTOCOL: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) + return -EINVAL; + val = *(int *)CMSG_DATA(cmsg); + if (val < 1 || val > 255) + return -EINVAL; + ipc->protocol = val; + break; default: return -EINVAL; } @@ -1761,6 +1768,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_LOCAL_PORT_RANGE: val = inet->local_port_range.hi << 16 | inet->local_port_range.lo; break; + case IP_PROTOCOL: + val = inet_sk(sk)->inet_num; + break; default: sockopt_release_sock(sk); return -ENOPROTOOPT; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index ff712bf2a98d..eadf1c9ef7e4 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -532,6 +532,9 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } ipcm_init_sk(&ipc, inet); + /* Keep backward compat */ + if (hdrincl) + ipc.protocol = IPPROTO_RAW; if (msg->msg_controllen) { err = ip_cmsg_send(sk, msg, &ipc, false); @@ -599,7 +602,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, RT_SCOPE_UNIVERSE, - hdrincl ? IPPROTO_RAW : sk->sk_protocol, + hdrincl ? ipc.protocol : sk->sk_protocol, inet_sk_flowi_flags(sk) | (hdrincl ?
FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0, sk->sk_uid); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 7d0adb612bdd..44ee7a2e72ac 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -793,7 +793,8 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (!proto) proto = inet->inet_num; - else if (proto != inet->inet_num) + else if (proto != inet->inet_num && + inet->inet_num != IPPROTO_RAW) return -EINVAL; if (proto > 255) -- cgit v1.2.3 From 26fb5480a27d34975cc2b680b77af189620dd740 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 May 2023 11:49:50 -0400 Subject: net/handshake: Enable the SNI extension to work properly Enable the upper layer protocol to specify the SNI peername. This avoids the need for tlshd to use a DNS lookup, which can return a hostname that doesn't match the incoming certificate's SubjectName. Fixes: 2fd5532044a8 ("net/handshake: Add a kernel API for requesting a TLSv1.3 handshake") Reviewed-by: Simon Horman Signed-off-by: Chuck Lever Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/handshake.yaml | 4 ++++ Documentation/networking/tls-handshake.rst | 5 +++++ include/net/handshake.h | 1 + include/uapi/linux/handshake.h | 1 + net/handshake/tlshd.c | 8 ++++++++ 5 files changed, 19 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/handshake.yaml b/Documentation/netlink/specs/handshake.yaml index 614f1a585511..6d89e30f5fd5 100644 --- a/Documentation/netlink/specs/handshake.yaml +++ b/Documentation/netlink/specs/handshake.yaml @@ -68,6 +68,9 @@ attribute-sets: type: nest nested-attributes: x509 multi-attr: true + - + name: peername + type: string - name: done attributes: @@ -105,6 +108,7 @@ operations: - auth-mode - peer-identity - certificate + - peername - name: done doc: Handler reports handshake completion diff --git a/Documentation/networking/tls-handshake.rst b/Documentation/networking/tls-handshake.rst index a2817a88e905..6f5ea1646a47 100644 --- a/Documentation/networking/tls-handshake.rst +++ b/Documentation/networking/tls-handshake.rst @@ -53,6 +53,7 @@ fills in a structure that contains the parameters of the request: struct socket *ta_sock; tls_done_func_t ta_done; void *ta_data; + const char *ta_peername; unsigned int ta_timeout_ms; key_serial_t ta_keyring; key_serial_t ta_my_cert; @@ -71,6 +72,10 @@ instantiated a struct file in sock->file. has completed. Further explanation of this function is in the "Handshake Completion" section below. +The consumer can provide a NUL-terminated hostname in the @ta_peername +field that is sent as part of ClientHello. If no peername is provided, +the DNS hostname associated with the server's IP address is used instead. + The consumer can fill in the @ta_timeout_ms field to force the servicing handshake agent to exit after a number of milliseconds. This enables the socket to be fully closed once both the kernel and the handshake agent have closed their endpoints.
This enables the socket to be fully closed once both the kernel and the handshake agent diff --git a/include/net/handshake.h b/include/net/handshake.h index 3352b1ab43b3..2e26e436e85f 100644 --- a/include/net/handshake.h +++ b/include/net/handshake.h @@ -24,6 +24,7 @@ struct tls_handshake_args { struct socket *ta_sock; tls_done_func_t ta_done; void *ta_data; + const char *ta_peername; unsigned int ta_timeout_ms; key_serial_t ta_keyring; key_serial_t ta_my_cert; diff --git a/include/uapi/linux/handshake.h b/include/uapi/linux/handshake.h index 1de4d0b95325..3d7ea58778c9 100644 --- a/include/uapi/linux/handshake.h +++ b/include/uapi/linux/handshake.h @@ -44,6 +44,7 @@ enum { HANDSHAKE_A_ACCEPT_AUTH_MODE, HANDSHAKE_A_ACCEPT_PEER_IDENTITY, HANDSHAKE_A_ACCEPT_CERTIFICATE, + HANDSHAKE_A_ACCEPT_PEERNAME, __HANDSHAKE_A_ACCEPT_MAX, HANDSHAKE_A_ACCEPT_MAX = (__HANDSHAKE_A_ACCEPT_MAX - 1) diff --git a/net/handshake/tlshd.c b/net/handshake/tlshd.c index fcbeb63b4eb1..b735f5cced2f 100644 --- a/net/handshake/tlshd.c +++ b/net/handshake/tlshd.c @@ -31,6 +31,7 @@ struct tls_handshake_req { int th_type; unsigned int th_timeout_ms; int th_auth_mode; + const char *th_peername; key_serial_t th_keyring; key_serial_t th_certificate; key_serial_t th_privkey; @@ -48,6 +49,7 @@ tls_handshake_req_init(struct handshake_req *req, treq->th_timeout_ms = args->ta_timeout_ms; treq->th_consumer_done = args->ta_done; treq->th_consumer_data = args->ta_data; + treq->th_peername = args->ta_peername; treq->th_keyring = args->ta_keyring; treq->th_num_peerids = 0; treq->th_certificate = TLS_NO_CERT; @@ -214,6 +216,12 @@ static int tls_handshake_accept(struct handshake_req *req, ret = nla_put_u32(msg, HANDSHAKE_A_ACCEPT_MESSAGE_TYPE, treq->th_type); if (ret < 0) goto out_cancel; + if (treq->th_peername) { + ret = nla_put_string(msg, HANDSHAKE_A_ACCEPT_PEERNAME, + treq->th_peername); + if (ret < 0) + goto out_cancel; + } if (treq->th_timeout_ms) { ret = nla_put_u32(msg, HANDSHAKE_A_ACCEPT_TIMEOUT, treq->th_timeout_ms); if (ret < 0) -- cgit v1.2.3