From 9b12f050c76f090cc6d0aebe0ef76fed79ec3f15 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 22 Feb 2023 10:23:02 +0100 Subject: char: pcmcia: remove all the drivers These char PCMCIA drivers are buggy[1] and receive only minimal care. It was concluded[2], that we should try to remove most pcmcia drivers completely. Let's start with these char broken one. Note that I also removed a UAPI header: include/uapi/linux/cm4000_cs.h. I found only coccinelle tests mentioning some ioctl constants from that file. But they are not actually used. Anyway, should someone complain, we may reintroduce the header (or its parts). [1] https://lore.kernel.org/all/f41c2765-80e0-48bc-b1e4-8cfd3230fd4a@www.fastmail.com/ [2] https://lore.kernel.org/all/c5b39544-a4fb-4796-a046-0b9be9853787@app.fastmail.com/ Signed-off-by: Jiri Slaby (SUSE) Cc: "Hyunwoo Kim" Cc: Harald Welte Cc: Lubomir Rintel Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Acked-by: Dominik Brodowski Reviewed-by: Arnd Bergmann Link: https://lore.kernel.org/r/20230222092302.6348-2-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/cm4000_cs.h | 64 ------------------------------------------ 1 file changed, 64 deletions(-) delete mode 100644 include/uapi/linux/cm4000_cs.h (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/cm4000_cs.h b/include/uapi/linux/cm4000_cs.h deleted file mode 100644 index c70a62ec8a49..000000000000 --- a/include/uapi/linux/cm4000_cs.h +++ /dev/null @@ -1,64 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _UAPI_CM4000_H_ -#define _UAPI_CM4000_H_ - -#include -#include - -#define MAX_ATR 33 - -#define CM4000_MAX_DEV 4 - -/* those two structures are passed via ioctl() from/to userspace. They are - * used by existing userspace programs, so I kepth the awkward "bIFSD" naming - * not to break compilation of userspace apps. -HW */ - -typedef struct atreq { - __s32 atr_len; - unsigned char atr[64]; - __s32 power_act; - unsigned char bIFSD; - unsigned char bIFSC; -} atreq_t; - - -/* what is particularly stupid in the original driver is the arch-dependent - * member sizes. This leads to CONFIG_COMPAT breakage, since 32bit userspace - * will lay out the structure members differently than the 64bit kernel. - * - * I've changed "ptsreq.protocol" from "unsigned long" to "__u32". - * On 32bit this will make no difference. With 64bit kernels, it will make - * 32bit apps work, too. 
- */ - -typedef struct ptsreq { - __u32 protocol; /*T=0: 2^0, T=1: 2^1*/ - unsigned char flags; - unsigned char pts1; - unsigned char pts2; - unsigned char pts3; -} ptsreq_t; - -#define CM_IOC_MAGIC 'c' -#define CM_IOC_MAXNR 255 - -#define CM_IOCGSTATUS _IOR (CM_IOC_MAGIC, 0, unsigned char *) -#define CM_IOCGATR _IOWR(CM_IOC_MAGIC, 1, atreq_t *) -#define CM_IOCSPTS _IOW (CM_IOC_MAGIC, 2, ptsreq_t *) -#define CM_IOCSRDR _IO (CM_IOC_MAGIC, 3) -#define CM_IOCARDOFF _IO (CM_IOC_MAGIC, 4) - -#define CM_IOSDBGLVL _IOW(CM_IOC_MAGIC, 250, int*) - -/* card and device states */ -#define CM_CARD_INSERTED 0x01 -#define CM_CARD_POWERED 0x02 -#define CM_ATR_PRESENT 0x04 -#define CM_ATR_VALID 0x08 -#define CM_STATE_VALID 0x0f -/* extra info only from CM4000 */ -#define CM_NO_READER 0x10 -#define CM_BAD_CARD 0x20 - - -#endif /* _UAPI_CM4000_H_ */ -- cgit v1.2.3 From c5edd753a0bd6243a597f5199c227a50457ee179 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Wed, 8 Feb 2023 15:01:02 +0100 Subject: KVM: x86: Remove the KVM_GET_NR_MMU_PAGES ioctl The KVM_GET_NR_MMU_PAGES ioctl is quite questionable on 64-bit hosts since it fails to return the full 64 bits of the value that can be set with the corresponding KVM_SET_NR_MMU_PAGES call. Its "long" return value is truncated into an "int" in the kvm_arch_vm_ioctl() function. Since this ioctl also never has been used by userspace applications (QEMU, Google's internal VMM, kvmtool and CrosVM have been checked), it's likely the best if we remove this badly designed ioctl before anybody really tries to use it. Signed-off-by: Thomas Huth Reviewed-by: Sean Christopherson Message-Id: <20230208140105.655814-4-thuth@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 8 -------- include/uapi/linux/kvm.h | 2 +- tools/include/uapi/linux/kvm.h | 2 +- 3 files changed, 2 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ff7f398a0c6a..4282daeaee84 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6021,11 +6021,6 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, return 0; } -static unsigned long kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) -{ - return kvm->arch.n_max_mmu_pages; -} - static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) { struct kvm_pic *pic = kvm->arch.vpic; @@ -6711,9 +6706,6 @@ set_identity_unlock: case KVM_SET_NR_MMU_PAGES: r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); break; - case KVM_GET_NR_MMU_PAGES: - r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); - break; case KVM_CREATE_IRQCHIP: { mutex_lock(&kvm->lock); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index d77aef872a0a..4003a166328c 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1451,7 +1451,7 @@ struct kvm_vfio_spapr_tce { #define KVM_CREATE_VCPU _IO(KVMIO, 0x41) #define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) #define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44) -#define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45) +#define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45) /* deprecated */ #define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46, \ struct kvm_userspace_memory_region) #define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index d77aef872a0a..4003a166328c 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1451,7 +1451,7 @@ struct kvm_vfio_spapr_tce { #define KVM_CREATE_VCPU _IO(KVMIO, 0x41) #define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, 
struct kvm_dirty_log) #define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44) -#define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45) +#define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45) /* deprecated */ #define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46, \ struct kvm_userspace_memory_region) #define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) -- cgit v1.2.3 From 1fb1ea0d9cb8359ae9d6f67f22666c74d5b9f47d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 10 Mar 2023 19:07:47 +0200 Subject: mei: Move uuid.h to the MEI namespace There is only a single user of the UUID uAPI, let's make it part of that user. The way it's done is to prevent compilation time breakage for the user space that does #include In the future MEI user space tools can switch over to use mei_uuid.h. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20230310170747.22782-1-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- MAINTAINERS | 1 + drivers/misc/mei/bus-fixup.c | 2 +- drivers/misc/mei/hdcp/mei_hdcp.c | 2 +- drivers/misc/mei/hw.h | 2 +- drivers/misc/mei/main.c | 1 - drivers/misc/mei/pxp/mei_pxp.c | 2 +- include/linux/mod_devicetable.h | 1 + include/linux/uuid.h | 3 --- include/uapi/linux/mei.h | 2 +- include/uapi/linux/mei_uuid.h | 29 +++++++++++++++++++++++++++++ include/uapi/linux/uuid.h | 31 +------------------------------ 11 files changed, 37 insertions(+), 39 deletions(-) create mode 100644 include/uapi/linux/mei_uuid.h (limited to 'include/uapi/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 7ee30a356e89..619c30ec36dc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10459,6 +10459,7 @@ F: drivers/watchdog/mei_wdt.c F: include/linux/mei_aux.h F: include/linux/mei_cl_bus.h F: include/uapi/linux/mei.h +F: include/uapi/linux/mei_uuid.h F: include/uapi/linux/uuid.h F: samples/mei/* diff --git a/drivers/misc/mei/bus-fixup.c b/drivers/misc/mei/bus-fixup.c index 211536109308..31e3c74ca1f1 100644 --- a/drivers/misc/mei/bus-fixup.c +++ b/drivers/misc/mei/bus-fixup.c @@ -9,8 +9,8 @@ #include #include #include -#include +#include #include #include "mei_dev.h" diff --git a/drivers/misc/mei/hdcp/mei_hdcp.c b/drivers/misc/mei/hdcp/mei_hdcp.c index e0dcd5c114db..45e3d4d27797 100644 --- a/drivers/misc/mei/hdcp/mei_hdcp.c +++ b/drivers/misc/mei/hdcp/mei_hdcp.c @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/misc/mei/hw.h b/drivers/misc/mei/hw.h index 319418ddf4fb..e910302fcd1f 100644 --- a/drivers/misc/mei/hw.h +++ b/drivers/misc/mei/hw.h @@ -7,7 +7,7 @@ #ifndef _MEI_HW_TYPES_H_ #define _MEI_HW_TYPES_H_ -#include +#include /* * Timeouts in Seconds diff --git a/drivers/misc/mei/main.c b/drivers/misc/mei/main.c index 632d4ae21e46..c64291741d73 100644 --- a/drivers/misc/mei/main.c +++ b/drivers/misc/mei/main.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/misc/mei/pxp/mei_pxp.c b/drivers/misc/mei/pxp/mei_pxp.c index 7ee1fa7b1cb3..3bf560bbdee0 100644 --- a/drivers/misc/mei/pxp/mei_pxp.c +++ b/drivers/misc/mei/pxp/mei_pxp.c @@ -13,7 +13,7 @@ #include #include -#include +#include #include #include #include diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 549590e9c644..35203ca3fc37 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -9,6 +9,7 @@ #define LINUX_MOD_DEVICETABLE_H #ifdef __KERNEL__ +#include #include #include typedef unsigned long kernel_ulong_t; diff --git a/include/linux/uuid.h b/include/linux/uuid.h index 6b1a3efa1e0b..43d4a79b273d 
100644 --- a/include/linux/uuid.h +++ b/include/linux/uuid.h @@ -107,7 +107,4 @@ extern const u8 uuid_index[16]; int guid_parse(const char *uuid, guid_t *u); int uuid_parse(const char *uuid, uuid_t *u); -/* MEI UUID type, don't use anywhere else */ -#include - #endif diff --git a/include/uapi/linux/mei.h b/include/uapi/linux/mei.h index 4f3638489d01..6e57743628c0 100644 --- a/include/uapi/linux/mei.h +++ b/include/uapi/linux/mei.h @@ -7,7 +7,7 @@ #ifndef _LINUX_MEI_H #define _LINUX_MEI_H -#include +#include /* * This IOCTL is used to associate the current file descriptor with a diff --git a/include/uapi/linux/mei_uuid.h b/include/uapi/linux/mei_uuid.h new file mode 100644 index 000000000000..676ebe12d623 --- /dev/null +++ b/include/uapi/linux/mei_uuid.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * MEI UUID definition + * + * Copyright (C) 2010, Intel Corp. + * Huang Ying + */ + +#ifndef _UAPI_LINUX_MEI_UUID_H_ +#define _UAPI_LINUX_MEI_UUID_H_ + +#include + +typedef struct { + __u8 b[16]; +} uuid_le; + +#define UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \ +((uuid_le) \ +{{ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \ + (b) & 0xff, ((b) >> 8) & 0xff, \ + (c) & 0xff, ((c) >> 8) & 0xff, \ + (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }}) + +#define NULL_UUID_LE \ + UUID_LE(0x00000000, 0x0000, 0x0000, 0x00, 0x00, 0x00, 0x00, \ + 0x00, 0x00, 0x00, 0x00) + +#endif /* _UAPI_LINUX_MEI_UUID_H_ */ diff --git a/include/uapi/linux/uuid.h b/include/uapi/linux/uuid.h index 96ac684a4b2f..8443738f4bb2 100644 --- a/include/uapi/linux/uuid.h +++ b/include/uapi/linux/uuid.h @@ -1,30 +1 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* DO NOT USE in new code! This is solely for MEI due to legacy reasons */ -/* - * MEI UUID definition - * - * Copyright (C) 2010, Intel Corp. - * Huang Ying - */ - -#ifndef _UAPI_LINUX_UUID_H_ -#define _UAPI_LINUX_UUID_H_ - -#include - -typedef struct { - __u8 b[16]; -} uuid_le; - -#define UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \ -((uuid_le) \ -{{ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \ - (b) & 0xff, ((b) >> 8) & 0xff, \ - (c) & 0xff, ((c) >> 8) & 0xff, \ - (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }}) - -#define NULL_UUID_LE \ - UUID_LE(0x00000000, 0x0000, 0x0000, 0x00, 0x00, 0x00, 0x00, \ - 0x00, 0x00, 0x00, 0x00) - -#endif /* _UAPI_LINUX_UUID_H_ */ +#include -- cgit v1.2.3 From 4ca589661d964840d0d5de4b3baabbef78f453e3 Mon Sep 17 00:00:00 2001 From: Daniel Starke Date: Wed, 15 Mar 2023 11:53:52 +0100 Subject: tty: n_gsm: add ioctl for DLC specific parameter configuration Parameter negotiation has been introduced with commit 92f1f0c3290d ("tty: n_gsm: add parameter negotiation support") However, means to set individual parameters per DLCI are not yet implemented. Furthermore, it is currently not possible to keep a DLCI half open until the user application sets the right parameters for it. This is required to allow a user application to set its specific parameters before the underlying link is established. Otherwise, the link is opened and re-established right afterwards if the user application sets incompatible parameters. This may be an unexpected behavior for the peer. Add parameter 'wait_config' to 'gsm_config' to support setups where the DLCI specific user application sets its specific parameters after open() and before the link gets fully established. 
Setting this to zero disables the user application specific DLCI configuration option. Add the ioctls 'GSMIOC_GETCONF_DLCI' and 'GSMIOC_SETCONF_DLCI' for the ldisc and virtual ttys. This gets/sets the DLCI specific parameters and may trigger a reconnect of the DLCI if incompatible values have been set. Only the parameters for the DLCI associated with the virtual tty can be set or retrieved if called on these. Add remark within the documentation to introduce the new ioctls. Link: https://lore.kernel.org/oe-kbuild-all/202302281856.S9Lz4gHB-lkp@intel.com/ Reported-by: kernel test robot Signed-off-by: Daniel Starke Link: https://lore.kernel.org/r/20230315105354.6234-1-daniel.starke@siemens.com Signed-off-by: Greg Kroah-Hartman --- Documentation/driver-api/tty/n_gsm.rst | 4 + drivers/tty/n_gsm.c | 192 ++++++++++++++++++++++++++++++++- include/uapi/linux/gsmmux.h | 17 ++- 3 files changed, 207 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/driver-api/tty/n_gsm.rst b/Documentation/driver-api/tty/n_gsm.rst index 9447b8a3b8e2..c2624546fb8f 100644 --- a/Documentation/driver-api/tty/n_gsm.rst +++ b/Documentation/driver-api/tty/n_gsm.rst @@ -29,6 +29,8 @@ Config Initiator #. Configure the mux using ``GSMIOC_GETCONF``/``GSMIOC_SETCONF`` ioctl. +#. Configure DLCs using ``GSMIOC_GETCONF_DLCI``/``GSMIOC_SETCONF_DLCI`` ioctl for non-defaults. + #. Obtain base gsmtty number for the used serial port. Major parts of the initialization program @@ -120,6 +122,8 @@ Config Requester #. Configure the mux using ``GSMIOC_GETCONF``/``GSMIOC_SETCONF`` ioctl. +#. Configure DLCs using ``GSMIOC_GETCONF_DLCI``/``GSMIOC_SETCONF_DLCI`` ioctl for non-defaults. + #. Obtain base gsmtty number for the used serial port:: #include diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c index 7aa05ebed8e1..9d0e7701ffd4 100644 --- a/drivers/tty/n_gsm.c +++ b/drivers/tty/n_gsm.c @@ -128,6 +128,7 @@ struct gsm_msg { enum gsm_dlci_state { DLCI_CLOSED, + DLCI_WAITING_CONFIG, /* Waiting for DLCI configuration from user */ DLCI_CONFIGURE, /* Sending PN (for adaption > 1) */ DLCI_OPENING, /* Sending SABM not seen UA */ DLCI_OPEN, /* SABM/UA complete */ @@ -330,6 +331,7 @@ struct gsm_mux { unsigned int t3; /* Power wake-up timer in seconds. 
*/ int n2; /* Retry count */ u8 k; /* Window size */ + bool wait_config; /* Wait for configuration by ioctl before DLCI open */ u32 keep_alive; /* Control channel keep-alive in 10ms */ /* Statistics (not currently exposed) */ @@ -451,6 +453,7 @@ static int gsm_modem_update(struct gsm_dlci *dlci, u8 brk); static struct gsm_msg *gsm_data_alloc(struct gsm_mux *gsm, u8 addr, int len, u8 ctrl); static int gsm_send_packet(struct gsm_mux *gsm, struct gsm_msg *msg); +static struct gsm_dlci *gsm_dlci_alloc(struct gsm_mux *gsm, int addr); static void gsmld_write_trigger(struct gsm_mux *gsm); static void gsmld_write_task(struct work_struct *work); @@ -2280,6 +2283,7 @@ static void gsm_dlci_begin_open(struct gsm_dlci *dlci) switch (dlci->state) { case DLCI_CLOSED: + case DLCI_WAITING_CONFIG: case DLCI_CLOSING: dlci->retries = gsm->n2; if (!need_pn) { @@ -2311,6 +2315,7 @@ static void gsm_dlci_set_opening(struct gsm_dlci *dlci) { switch (dlci->state) { case DLCI_CLOSED: + case DLCI_WAITING_CONFIG: case DLCI_CLOSING: dlci->state = DLCI_OPENING; break; @@ -2319,6 +2324,24 @@ static void gsm_dlci_set_opening(struct gsm_dlci *dlci) } } +/** + * gsm_dlci_set_wait_config - wait for channel configuration + * @dlci: DLCI to configure + * + * Wait for a DLCI configuration from the application. + */ +static void gsm_dlci_set_wait_config(struct gsm_dlci *dlci) +{ + switch (dlci->state) { + case DLCI_CLOSED: + case DLCI_CLOSING: + dlci->state = DLCI_WAITING_CONFIG; + break; + default: + break; + } +} + /** * gsm_dlci_begin_close - start channel open procedure * @dlci: DLCI to open @@ -2453,6 +2476,128 @@ static void gsm_kick_timer(struct timer_list *t) pr_info("%s TX queue stalled\n", __func__); } +/** + * gsm_dlci_copy_config_values - copy DLCI configuration + * @dlci: source DLCI + * @dc: configuration structure to fill + */ +static void gsm_dlci_copy_config_values(struct gsm_dlci *dlci, struct gsm_dlci_config *dc) +{ + memset(dc, 0, sizeof(*dc)); + dc->channel = (u32)dlci->addr; + dc->adaption = (u32)dlci->adaption; + dc->mtu = (u32)dlci->mtu; + dc->priority = (u32)dlci->prio; + if (dlci->ftype == UIH) + dc->i = 1; + else + dc->i = 2; + dc->k = (u32)dlci->k; +} + +/** + * gsm_dlci_config - configure DLCI from configuration + * @dlci: DLCI to configure + * @dc: DLCI configuration + * @open: open DLCI after configuration? + */ +static int gsm_dlci_config(struct gsm_dlci *dlci, struct gsm_dlci_config *dc, int open) +{ + struct gsm_mux *gsm; + bool need_restart = false; + bool need_open = false; + unsigned int i; + + /* + * Check that userspace doesn't put stuff in here to prevent breakages + * in the future. 
+ */ + for (i = 0; i < ARRAY_SIZE(dc->reserved); i++) + if (dc->reserved[i]) + return -EINVAL; + + if (!dlci) + return -EINVAL; + gsm = dlci->gsm; + + /* Stuff we don't support yet - I frame transport */ + if (dc->adaption != 1 && dc->adaption != 2) + return -EOPNOTSUPP; + if (dc->mtu > MAX_MTU || dc->mtu < MIN_MTU || dc->mtu > gsm->mru) + return -EINVAL; + if (dc->priority >= 64) + return -EINVAL; + if (dc->i == 0 || dc->i > 2) /* UIH and UI only */ + return -EINVAL; + if (dc->k > 7) + return -EINVAL; + + /* + * See what is needed for reconfiguration + */ + /* Framing fields */ + if (dc->adaption != dlci->adaption) + need_restart = true; + if (dc->mtu != dlci->mtu) + need_restart = true; + if (dc->i != dlci->ftype) + need_restart = true; + /* Requires care */ + if (dc->priority != dlci->prio) + need_restart = true; + + if ((open && gsm->wait_config) || need_restart) + need_open = true; + if (dlci->state == DLCI_WAITING_CONFIG) { + need_restart = false; + need_open = true; + } + + /* + * Close down what is needed, restart and initiate the new + * configuration. + */ + if (need_restart) { + gsm_dlci_begin_close(dlci); + wait_event_interruptible(gsm->event, dlci->state == DLCI_CLOSED); + if (signal_pending(current)) + return -EINTR; + } + /* + * Setup the new configuration values + */ + dlci->adaption = (int)dc->adaption; + + if (dc->mtu) + dlci->mtu = (unsigned int)dc->mtu; + else + dlci->mtu = gsm->mtu; + + if (dc->priority) + dlci->prio = (u8)dc->priority; + else + dlci->prio = roundup(dlci->addr + 1, 8) - 1; + + if (dc->i == 1) + dlci->ftype = UIH; + else if (dc->i == 2) + dlci->ftype = UI; + + if (dc->k) + dlci->k = (u8)dc->k; + else + dlci->k = gsm->k; + + if (need_open) { + if (gsm->initiator) + gsm_dlci_begin_open(dlci); + else + gsm_dlci_set_opening(dlci); + } + + return 0; +} + /* * Allocate/Free DLCI channels */ @@ -3078,6 +3223,7 @@ static struct gsm_mux *gsm_alloc_mux(void) gsm->mru = 64; /* Default to encoding 1 so these should be 64 */ gsm->mtu = 64; gsm->dead = true; /* Avoid early tty opens */ + gsm->wait_config = false; /* Disabled */ gsm->keep_alive = 0; /* Disabled */ /* Store the instance to the mux array or abort if no space is @@ -3218,6 +3364,7 @@ static void gsm_copy_config_ext_values(struct gsm_mux *gsm, struct gsm_config_ext *ce) { memset(ce, 0, sizeof(*ce)); + ce->wait_config = gsm->wait_config ? 1 : 0; ce->keep_alive = gsm->keep_alive; } @@ -3233,7 +3380,12 @@ static int gsm_config_ext(struct gsm_mux *gsm, struct gsm_config_ext *ce) if (ce->reserved[i]) return -EINVAL; + /* + * Setup the new configuration values + */ + gsm->wait_config = ce->wait_config ? true : false; gsm->keep_alive = ce->keep_alive; + return 0; } @@ -3436,6 +3588,14 @@ static int gsmld_open(struct tty_struct *tty) gsm->encoding = GSM_ADV_OPT; gsmld_attach_gsm(tty, gsm); + /* The mux will not be activated yet, we wait for correct + * configuration first. + */ + if (gsm->encoding == GSM_BASIC_OPT) + gsm->receive = gsm0_receive; + else + gsm->receive = gsm1_receive; + return 0; } @@ -4008,7 +4168,6 @@ static int gsmtty_open(struct tty_struct *tty, struct file *filp) { struct gsm_dlci *dlci = tty->driver_data; struct tty_port *port = &dlci->port; - struct gsm_mux *gsm = dlci->gsm; port->count++; tty_port_tty_set(port, tty); @@ -4018,10 +4177,15 @@ static int gsmtty_open(struct tty_struct *tty, struct file *filp) a DM straight back. 
This is ok as that will have caused a hangup */ tty_port_set_initialized(port, true); /* Start sending off SABM messages */ - if (gsm->initiator) - gsm_dlci_begin_open(dlci); - else - gsm_dlci_set_opening(dlci); + if (!dlci->gsm->wait_config) { + /* Start sending off SABM messages */ + if (dlci->gsm->initiator) + gsm_dlci_begin_open(dlci); + else + gsm_dlci_set_opening(dlci); + } else { + gsm_dlci_set_wait_config(dlci); + } /* And wait for virtual carrier */ return tty_port_block_til_ready(port, tty, filp); } @@ -4142,6 +4306,7 @@ static int gsmtty_ioctl(struct tty_struct *tty, { struct gsm_dlci *dlci = tty->driver_data; struct gsm_netconfig nc; + struct gsm_dlci_config dc; int index; if (dlci->state == DLCI_CLOSED) @@ -4165,6 +4330,23 @@ static int gsmtty_ioctl(struct tty_struct *tty, gsm_destroy_network(dlci); mutex_unlock(&dlci->mutex); return 0; + case GSMIOC_GETCONF_DLCI: + if (copy_from_user(&dc, (void __user *)arg, sizeof(dc))) + return -EFAULT; + if (dc.channel != dlci->addr) + return -EPERM; + gsm_dlci_copy_config_values(dlci, &dc); + if (copy_to_user((void __user *)arg, &dc, sizeof(dc))) + return -EFAULT; + return 0; + case GSMIOC_SETCONF_DLCI: + if (copy_from_user(&dc, (void __user *)arg, sizeof(dc))) + return -EFAULT; + if (dc.channel >= NUM_DLCI) + return -EINVAL; + if (dc.channel != 0 && dc.channel != dlci->addr) + return -EPERM; + return gsm_dlci_config(dlci, &dc, 1); case TIOCMIWAIT: return gsm_wait_modem_change(dlci, (u32)arg); default: diff --git a/include/uapi/linux/gsmmux.h b/include/uapi/linux/gsmmux.h index a703780aa095..eb67884e5f38 100644 --- a/include/uapi/linux/gsmmux.h +++ b/include/uapi/linux/gsmmux.h @@ -43,10 +43,25 @@ struct gsm_config_ext { __u32 keep_alive; /* Control channel keep-alive in 1/100th of a * second (0 to disable) */ - __u32 reserved[7]; /* For future use, must be initialized to zero */ + __u32 wait_config; /* Wait for DLCI config before opening virtual link? */ + __u32 reserved[6]; /* For future use, must be initialized to zero */ }; #define GSMIOC_GETCONF_EXT _IOR('G', 5, struct gsm_config_ext) #define GSMIOC_SETCONF_EXT _IOW('G', 6, struct gsm_config_ext) +/* Set channel accordingly before calling GSMIOC_GETCONF_DLCI. */ +struct gsm_dlci_config { + __u32 channel; /* DLCI (0 for the associated DLCI) */ + __u32 adaption; /* Convergence layer type */ + __u32 mtu; /* Maximum transfer unit */ + __u32 priority; /* Priority (0 for default value) */ + __u32 i; /* Frame type (1 = UIH, 2 = UI) */ + __u32 k; /* Window size (0 for default value) */ + __u32 reserved[8]; /* For future use, must be initialized to zero */ +}; + +#define GSMIOC_GETCONF_DLCI _IOWR('G', 7, struct gsm_dlci_config) +#define GSMIOC_SETCONF_DLCI _IOW('G', 8, struct gsm_dlci_config) + #endif -- cgit v1.2.3 From e5a26a4048eeb9558e5c84f340a989c78db4adf4 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Tue, 28 Mar 2023 16:52:08 -0700 Subject: tracing/user_events: Split header into uapi and kernel The UAPI parts need to be split out from the kernel parts of user_events now that other parts of the kernel will reference it. Do so by moving the existing include/linux/user_events.h into include/uapi/linux/user_events.h. 
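As an illustrative sketch (not part of this patch), a user program built against the relocated header keeps using the same installed path. A minimal registration against the ABI as it stands at this commit (status_bit still present), assuming tracefs is mounted at /sys/kernel/tracing and using the "test u32 count" event format from Documentation/trace/user_events.rst, might look like:

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/user_events.h>  /* now installed from include/uapi */

    int main(void)
    {
            struct user_reg reg;
            int fd = open("/sys/kernel/tracing/user_events_data", O_RDWR);

            if (fd < 0)
                    return 1;

            memset(&reg, 0, sizeof(reg));
            reg.size = sizeof(reg);
            reg.name_args = (uint64_t)(uintptr_t)"test u32 count";

            /* Register the event; the kernel fills in the output fields */
            if (ioctl(fd, DIAG_IOCSREG, &reg) < 0) {
                    perror("DIAG_IOCSREG");
                    return 1;
            }

            printf("write_index=%u status_bit=%u\n",
                   reg.write_index, reg.status_bit);
            return 0;
    }

The tracefs path and the event format string above are assumptions for illustration; the struct fields and ioctl are exactly those in the moved header.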
Link: https://lkml.kernel.org/r/20230328235219.203-2-beaub@linux.microsoft.com Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- include/linux/user_events.h | 52 +++++----------------------------------- include/uapi/linux/user_events.h | 48 +++++++++++++++++++++++++++++++++++++ kernel/trace/trace_events_user.c | 5 ---- 3 files changed, 54 insertions(+), 51 deletions(-) create mode 100644 include/uapi/linux/user_events.h (limited to 'include/uapi/linux') diff --git a/include/linux/user_events.h b/include/linux/user_events.h index 592a3fbed98e..13689589d36e 100644 --- a/include/linux/user_events.h +++ b/include/linux/user_events.h @@ -1,54 +1,14 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (c) 2021, Microsoft Corporation. + * Copyright (c) 2022, Microsoft Corporation. * * Authors: * Beau Belgrave */ -#ifndef _UAPI_LINUX_USER_EVENTS_H -#define _UAPI_LINUX_USER_EVENTS_H -#include -#include +#ifndef _LINUX_USER_EVENTS_H +#define _LINUX_USER_EVENTS_H -#ifdef __KERNEL__ -#include -#else -#include -#endif +#include -#define USER_EVENTS_SYSTEM "user_events" -#define USER_EVENTS_PREFIX "u:" - -/* Create dynamic location entry within a 32-bit value */ -#define DYN_LOC(offset, size) ((size) << 16 | (offset)) - -/* - * Describes an event registration and stores the results of the registration. - * This structure is passed to the DIAG_IOCSREG ioctl, callers at a minimum - * must set the size and name_args before invocation. - */ -struct user_reg { - - /* Input: Size of the user_reg structure being used */ - __u32 size; - - /* Input: Pointer to string with event name, description and flags */ - __u64 name_args; - - /* Output: Bitwise index of the event within the status page */ - __u32 status_bit; - - /* Output: Index of the event to use when writing data */ - __u32 write_index; -} __attribute__((__packed__)); - -#define DIAG_IOC_MAGIC '*' - -/* Requests to register a user_event */ -#define DIAG_IOCSREG _IOWR(DIAG_IOC_MAGIC, 0, struct user_reg*) - -/* Requests to delete a user_event */ -#define DIAG_IOCSDEL _IOW(DIAG_IOC_MAGIC, 1, char*) - -#endif /* _UAPI_LINUX_USER_EVENTS_H */ +#endif /* _LINUX_USER_EVENTS_H */ diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h new file mode 100644 index 000000000000..03f92366068d --- /dev/null +++ b/include/uapi/linux/user_events.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright (c) 2021-2022, Microsoft Corporation. + * + * Authors: + * Beau Belgrave + */ +#ifndef _UAPI_LINUX_USER_EVENTS_H +#define _UAPI_LINUX_USER_EVENTS_H + +#include +#include + +#define USER_EVENTS_SYSTEM "user_events" +#define USER_EVENTS_PREFIX "u:" + +/* Create dynamic location entry within a 32-bit value */ +#define DYN_LOC(offset, size) ((size) << 16 | (offset)) + +/* + * Describes an event registration and stores the results of the registration. + * This structure is passed to the DIAG_IOCSREG ioctl, callers at a minimum + * must set the size and name_args before invocation. 
+ */ +struct user_reg { + + /* Input: Size of the user_reg structure being used */ + __u32 size; + + /* Input: Pointer to string with event name, description and flags */ + __u64 name_args; + + /* Output: Bitwise index of the event within the status page */ + __u32 status_bit; + + /* Output: Index of the event to use when writing data */ + __u32 write_index; +} __attribute__((__packed__)); + +#define DIAG_IOC_MAGIC '*' + +/* Request to register a user_event */ +#define DIAG_IOCSREG _IOWR(DIAG_IOC_MAGIC, 0, struct user_reg *) + +/* Request to delete a user_event */ +#define DIAG_IOCSDEL _IOW(DIAG_IOC_MAGIC, 1, char *) + +#endif /* _UAPI_LINUX_USER_EVENTS_H */ diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 908e8a13c675..070551480747 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -19,12 +19,7 @@ #include #include #include -/* Reminder to move to uapi when everything works */ -#ifdef CONFIG_COMPILE_TEST #include -#else -#include -#endif #include "trace.h" #include "trace_dynevent.h" -- cgit v1.2.3 From 7235759084a4f8524a46bd2638885ff3b34ce279 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Tue, 28 Mar 2023 16:52:10 -0700 Subject: tracing/user_events: Use remote writes for event enablement As part of the discussions for user_events aligned with user space tracers, it was determined that user programs should register a aligned value to set or clear a bit when an event becomes enabled. Currently a shared page is being used that requires mmap(). Remove the shared page implementation and move to a user registered address implementation. In this new model during the event registration from user programs 3 new values are specified. The first is the address to update when the event is either enabled or disabled. The second is the bit to set/clear to reflect the event being enabled. The third is the size of the value at the specified address. This allows for a local 32/64-bit value in user programs to support both kernel and user tracers. As an example, setting bit 31 for kernel tracers when the event becomes enabled allows for user tracers to use the other bits for ref counts or other flags. The kernel side updates the bit atomically, user programs need to also update these values atomically. User provided addresses must be aligned on a natural boundary, this allows for single page checking and prevents odd behaviors such as a enable value straddling 2 pages instead of a single page. Currently page faults are only logged, future patches will handle these. 
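As an illustrative sketch (not part of this patch), a user program under this new ABI registers a naturally aligned enable word together with a bit for the kernel to update. The helper name register_event, the tracefs mount point, and the "test u32 count" format string are assumptions; the user_reg fields are those added by this patch:

    #include <stdint.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/user_events.h>

    /* 32-bit and naturally aligned, so it cannot straddle two pages */
    static uint32_t enabled;

    static int register_event(int data_fd)
    {
            struct user_reg reg;

            memset(&reg, 0, sizeof(reg));
            reg.size = sizeof(reg);
            reg.enable_bit = 31;                /* bit left to the kernel tracer */
            reg.enable_size = sizeof(enabled);  /* 4, or 8 on 64-bit kernels */
            reg.enable_addr = (uint64_t)(uintptr_t)&enabled;
            reg.name_args = (uint64_t)(uintptr_t)"test u32 count";

            if (ioctl(data_fd, DIAG_IOCSREG, &reg) < 0)
                    return -1;

            /* Output: index to use when writing event data */
            return (int)reg.write_index;
    }

Before emitting an event the program would then test something like (__atomic_load_n(&enabled, __ATOMIC_RELAXED) & (1U << 31)), leaving the remaining bits free for user-space tracers' ref counts or flags as described above.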
Link: https://lkml.kernel.org/r/20230328235219.203-4-beaub@linux.microsoft.com Suggested-by: Mathieu Desnoyers Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- include/linux/user_events.h | 53 +++- include/uapi/linux/user_events.h | 15 +- kernel/trace/Kconfig | 5 +- kernel/trace/trace_events_user.c | 586 ++++++++++++++++++++++++++++++--------- 4 files changed, 517 insertions(+), 142 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/user_events.h b/include/linux/user_events.h index 3d747c45d2fa..0120b3dd5b03 100644 --- a/include/linux/user_events.h +++ b/include/linux/user_events.h @@ -9,13 +9,63 @@ #ifndef _LINUX_USER_EVENTS_H #define _LINUX_USER_EVENTS_H +#include +#include +#include +#include #include #ifdef CONFIG_USER_EVENTS struct user_event_mm { + struct list_head link; + struct list_head enablers; + struct mm_struct *mm; + struct user_event_mm *next; + refcount_t refcnt; + refcount_t tasks; + struct rcu_work put_rwork; }; -#endif +extern void user_event_mm_dup(struct task_struct *t, + struct user_event_mm *old_mm); + +extern void user_event_mm_remove(struct task_struct *t); + +static inline void user_events_fork(struct task_struct *t, + unsigned long clone_flags) +{ + struct user_event_mm *old_mm; + + if (!t || !current->user_event_mm) + return; + + old_mm = current->user_event_mm; + + if (clone_flags & CLONE_VM) { + t->user_event_mm = old_mm; + refcount_inc(&old_mm->tasks); + return; + } + + user_event_mm_dup(t, old_mm); +} + +static inline void user_events_execve(struct task_struct *t) +{ + if (!t || !t->user_event_mm) + return; + + user_event_mm_remove(t); +} + +static inline void user_events_exit(struct task_struct *t) +{ + if (!t || !t->user_event_mm) + return; + + user_event_mm_remove(t); +} +#else static inline void user_events_fork(struct task_struct *t, unsigned long clone_flags) { @@ -28,5 +78,6 @@ static inline void user_events_execve(struct task_struct *t) static inline void user_events_exit(struct task_struct *t) { } +#endif /* CONFIG_USER_EVENTS */ #endif /* _LINUX_USER_EVENTS_H */ diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h index 03f92366068d..22521bc622db 100644 --- a/include/uapi/linux/user_events.h +++ b/include/uapi/linux/user_events.h @@ -27,12 +27,21 @@ struct user_reg { /* Input: Size of the user_reg structure being used */ __u32 size; + /* Input: Bit in enable address to use */ + __u8 enable_bit; + + /* Input: Enable size in bytes at address */ + __u8 enable_size; + + /* Input: Flags for future use, set to 0 */ + __u16 flags; + + /* Input: Address to update when enabled */ + __u64 enable_addr; + /* Input: Pointer to string with event name, description and flags */ __u64 name_args; - /* Output: Bitwise index of the event within the status page */ - __u32 status_bit; - /* Output: Index of the event to use when writing data */ __u32 write_index; } __attribute__((__packed__)); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5b1e7fa41ca8..c7020e071bf9 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -798,9 +798,10 @@ config USER_EVENTS can be used like an existing kernel trace event. User trace events are generated by writing to a tracefs file. User processes can determine if their tracing events should be - generated by memory mapping a tracefs file and checking for - an associated byte being non-zero. + generated by registering a value and bit with the kernel + that reflects when it is enabled or not. + See Documentation/trace/user_events.rst. 
If in doubt, say N. config HIST_TRIGGERS diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 070551480747..553a82ee7aeb 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include "trace.h" #include "trace_dynevent.h" @@ -29,34 +30,11 @@ #define FIELD_DEPTH_NAME 1 #define FIELD_DEPTH_SIZE 2 -/* - * Limits how many trace_event calls user processes can create: - * Must be a power of two of PAGE_SIZE. - */ -#define MAX_PAGE_ORDER 0 -#define MAX_PAGES (1 << MAX_PAGE_ORDER) -#define MAX_BYTES (MAX_PAGES * PAGE_SIZE) -#define MAX_EVENTS (MAX_BYTES * 8) - /* Limit how long of an event name plus args within the subsystem. */ #define MAX_EVENT_DESC 512 #define EVENT_NAME(user_event) ((user_event)->tracepoint.name) #define MAX_FIELD_ARRAY_SIZE 1024 -/* - * The MAP_STATUS_* macros are used for taking a index and determining the - * appropriate byte and the bit in the byte to set/reset for an event. - * - * The lower 3 bits of the index decide which bit to set. - * The remaining upper bits of the index decide which byte to use for the bit. - * - * This is used when an event has a probe attached/removed to reflect live - * status of the event wanting tracing or not to user-programs via shared - * memory maps. - */ -#define MAP_STATUS_BYTE(index) ((index) >> 3) -#define MAP_STATUS_MASK(index) BIT((index) & 7) - /* * Internal bits (kernel side only) to keep track of connected probes: * These are used when status is requested in text form about an event. These @@ -70,20 +48,14 @@ #define EVENT_STATUS_OTHER BIT(7) /* - * Stores the pages, tables, and locks for a group of events. - * Each logical grouping of events has its own group, with a - * matching page for status checks within user programs. This - * allows for isolation of events to user programs by various - * means. + * Stores the system name, tables, and locks for a group of events. This + * allows isolation for events by various means. */ struct user_event_group { - struct page *pages; - char *register_page_data; char *system_name; struct hlist_node node; struct mutex reg_mutex; DECLARE_HASHTABLE(register_table, 8); - DECLARE_BITMAP(page_bitmap, MAX_EVENTS); }; /* Group for init_user_ns mapping, top-most group */ @@ -106,12 +78,34 @@ struct user_event { struct list_head fields; struct list_head validators; refcount_t refcnt; - int index; - int flags; int min_size; char status; }; +/* + * Stores per-mm/event properties that enable an address to be + * updated properly for each task. As tasks are forked, we use + * these to track enablement sites that are tied to an event. + */ +struct user_event_enabler { + struct list_head link; + struct user_event *event; + unsigned long addr; + + /* Track enable bit, flags, etc. Aligned for bitops. */ + unsigned int values; +}; + +/* Bits 0-5 are for the bit to update upon enable/disable (0-63 allowed) */ +#define ENABLE_VAL_BIT_MASK 0x3F + +/* Only duplicate the bit value */ +#define ENABLE_VAL_DUP_MASK ENABLE_VAL_BIT_MASK + +/* Global list of memory descriptors using user_events */ +static LIST_HEAD(user_event_mms); +static DEFINE_SPINLOCK(user_event_mms_lock); + /* * Stores per-file events references, as users register events * within a file this structure is modified and freed via RCU. 
@@ -145,33 +139,17 @@ static int user_event_parse(struct user_event_group *group, char *name, char *args, char *flags, struct user_event **newuser); +static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm); +static struct user_event_mm *user_event_mm_get_all(struct user_event *user); +static void user_event_mm_put(struct user_event_mm *mm); + static u32 user_event_key(char *name) { return jhash(name, strlen(name), 0); } -static void set_page_reservations(char *pages, bool set) -{ - int page; - - for (page = 0; page < MAX_PAGES; ++page) { - void *addr = pages + (PAGE_SIZE * page); - - if (set) - SetPageReserved(virt_to_page(addr)); - else - ClearPageReserved(virt_to_page(addr)); - } -} - static void user_event_group_destroy(struct user_event_group *group) { - if (group->register_page_data) - set_page_reservations(group->register_page_data, false); - - if (group->pages) - __free_pages(group->pages, MAX_PAGE_ORDER); - kfree(group->system_name); kfree(group); } @@ -242,19 +220,6 @@ static struct user_event_group if (!group->system_name) goto error; - group->pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER); - - if (!group->pages) - goto error; - - group->register_page_data = page_address(group->pages); - - set_page_reservations(group->register_page_data, true); - - /* Zero all bits beside 0 (which is reserved for failures) */ - bitmap_zero(group->page_bitmap, MAX_EVENTS); - set_bit(0, group->page_bitmap); - mutex_init(&group->reg_mutex); hash_init(group->register_table); @@ -266,20 +231,367 @@ error: return NULL; }; -static __always_inline -void user_event_register_set(struct user_event *user) +static void user_event_enabler_destroy(struct user_event_enabler *enabler) +{ + list_del_rcu(&enabler->link); + + /* No longer tracking the event via the enabler */ + refcount_dec(&enabler->event->refcnt); + + kfree(enabler); +} + +static int user_event_mm_fault_in(struct user_event_mm *mm, unsigned long uaddr) +{ + bool unlocked; + int ret; + + mmap_read_lock(mm->mm); + + /* Ensure MM has tasks, cannot use after exit_mm() */ + if (refcount_read(&mm->tasks) == 0) { + ret = -ENOENT; + goto out; + } + + ret = fixup_user_fault(mm->mm, uaddr, FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE, + &unlocked); +out: + mmap_read_unlock(mm->mm); + + return ret; +} + +static int user_event_enabler_write(struct user_event_mm *mm, + struct user_event_enabler *enabler) +{ + unsigned long uaddr = enabler->addr; + unsigned long *ptr; + struct page *page; + void *kaddr; + int ret; + + lockdep_assert_held(&event_mutex); + mmap_assert_locked(mm->mm); + + /* Ensure MM has tasks, cannot use after exit_mm() */ + if (refcount_read(&mm->tasks) == 0) + return -ENOENT; + + ret = pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT, + &page, NULL, NULL); + + if (ret <= 0) { + pr_warn("user_events: Enable write failed\n"); + return -EFAULT; + } + + kaddr = kmap_local_page(page); + ptr = kaddr + (uaddr & ~PAGE_MASK); + + /* Update bit atomically, user tracers must be atomic as well */ + if (enabler->event && enabler->event->status) + set_bit(enabler->values & ENABLE_VAL_BIT_MASK, ptr); + else + clear_bit(enabler->values & ENABLE_VAL_BIT_MASK, ptr); + + kunmap_local(kaddr); + unpin_user_pages_dirty_lock(&page, 1, true); + + return 0; +} + +static void user_event_enabler_update(struct user_event *user) +{ + struct user_event_enabler *enabler; + struct user_event_mm *mm = user_event_mm_get_all(user); + struct user_event_mm *next; + + while (mm) { + next = mm->next; + mmap_read_lock(mm->mm); + 
rcu_read_lock(); + + list_for_each_entry_rcu(enabler, &mm->enablers, link) + if (enabler->event == user) + user_event_enabler_write(mm, enabler); + + rcu_read_unlock(); + mmap_read_unlock(mm->mm); + user_event_mm_put(mm); + mm = next; + } +} + +static bool user_event_enabler_dup(struct user_event_enabler *orig, + struct user_event_mm *mm) +{ + struct user_event_enabler *enabler; + + enabler = kzalloc(sizeof(*enabler), GFP_NOWAIT); + + if (!enabler) + return false; + + enabler->event = orig->event; + enabler->addr = orig->addr; + + /* Only dup part of value (ignore future flags, etc) */ + enabler->values = orig->values & ENABLE_VAL_DUP_MASK; + + refcount_inc(&enabler->event->refcnt); + list_add_rcu(&enabler->link, &mm->enablers); + + return true; +} + +static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm) +{ + refcount_inc(&mm->refcnt); + + return mm; +} + +static struct user_event_mm *user_event_mm_get_all(struct user_event *user) +{ + struct user_event_mm *found = NULL; + struct user_event_enabler *enabler; + struct user_event_mm *mm; + + /* + * We do not want to block fork/exec while enablements are being + * updated, so we use RCU to walk the current tasks that have used + * user_events ABI for 1 or more events. Each enabler found in each + * task that matches the event being updated has a write to reflect + * the kernel state back into the process. Waits/faults must not occur + * during this. So we scan the list under RCU for all the mm that have + * the event within it. This is needed because mm_read_lock() can wait. + * Each user mm returned has a ref inc to handle remove RCU races. + */ + rcu_read_lock(); + + list_for_each_entry_rcu(mm, &user_event_mms, link) + list_for_each_entry_rcu(enabler, &mm->enablers, link) + if (enabler->event == user) { + mm->next = found; + found = user_event_mm_get(mm); + break; + } + + rcu_read_unlock(); + + return found; +} + +static struct user_event_mm *user_event_mm_create(struct task_struct *t) +{ + struct user_event_mm *user_mm; + unsigned long flags; + + user_mm = kzalloc(sizeof(*user_mm), GFP_KERNEL); + + if (!user_mm) + return NULL; + + user_mm->mm = t->mm; + INIT_LIST_HEAD(&user_mm->enablers); + refcount_set(&user_mm->refcnt, 1); + refcount_set(&user_mm->tasks, 1); + + spin_lock_irqsave(&user_event_mms_lock, flags); + list_add_rcu(&user_mm->link, &user_event_mms); + spin_unlock_irqrestore(&user_event_mms_lock, flags); + + t->user_event_mm = user_mm; + + /* + * The lifetime of the memory descriptor can slightly outlast + * the task lifetime if a ref to the user_event_mm is taken + * between list_del_rcu() and call_rcu(). Therefore we need + * to take a reference to it to ensure it can live this long + * under this corner case. This can also occur in clones that + * outlast the parent. 
+ */ + mmgrab(user_mm->mm); + + return user_mm; +} + +static struct user_event_mm *current_user_event_mm(void) +{ + struct user_event_mm *user_mm = current->user_event_mm; + + if (user_mm) + goto inc; + + user_mm = user_event_mm_create(current); + + if (!user_mm) + goto error; +inc: + refcount_inc(&user_mm->refcnt); +error: + return user_mm; +} + +static void user_event_mm_destroy(struct user_event_mm *mm) +{ + struct user_event_enabler *enabler, *next; + + list_for_each_entry_safe(enabler, next, &mm->enablers, link) + user_event_enabler_destroy(enabler); + + mmdrop(mm->mm); + kfree(mm); +} + +static void user_event_mm_put(struct user_event_mm *mm) +{ + if (mm && refcount_dec_and_test(&mm->refcnt)) + user_event_mm_destroy(mm); +} + +static void delayed_user_event_mm_put(struct work_struct *work) +{ + struct user_event_mm *mm; + + mm = container_of(to_rcu_work(work), struct user_event_mm, put_rwork); + user_event_mm_put(mm); +} + +void user_event_mm_remove(struct task_struct *t) { - int i = user->index; + struct user_event_mm *mm; + unsigned long flags; + + might_sleep(); + + mm = t->user_event_mm; + t->user_event_mm = NULL; + + /* Clone will increment the tasks, only remove if last clone */ + if (!refcount_dec_and_test(&mm->tasks)) + return; + + /* Remove the mm from the list, so it can no longer be enabled */ + spin_lock_irqsave(&user_event_mms_lock, flags); + list_del_rcu(&mm->link); + spin_unlock_irqrestore(&user_event_mms_lock, flags); + + /* + * We need to wait for currently occurring writes to stop within + * the mm. This is required since exit_mm() snaps the current rss + * stats and clears them. On the final mmdrop(), check_mm() will + * report a bug if these increment. + * + * All writes/pins are done under mmap_read lock, take the write + * lock to ensure in-progress faults have completed. Faults that + * are pending but yet to run will check the task count and skip + * the fault since the mm is going away. + */ + mmap_write_lock(mm->mm); + mmap_write_unlock(mm->mm); - user->group->register_page_data[MAP_STATUS_BYTE(i)] |= MAP_STATUS_MASK(i); + /* + * Put for mm must be done after RCU delay to handle new refs in + * between the list_del_rcu() and now. This ensures any get refs + * during rcu_read_lock() are accounted for during list removal. + * + * CPU A | CPU B + * --------------------------------------------------------------- + * user_event_mm_remove() | rcu_read_lock(); + * list_del_rcu() | list_for_each_entry_rcu(); + * call_rcu() | refcount_inc(); + * . | rcu_read_unlock(); + * schedule_work() | . + * user_event_mm_put() | . + * + * mmdrop() cannot be called in the softirq context of call_rcu() + * so we use a work queue after call_rcu() to run within. 
+ */ + INIT_RCU_WORK(&mm->put_rwork, delayed_user_event_mm_put); + queue_rcu_work(system_wq, &mm->put_rwork); } -static __always_inline -void user_event_register_clear(struct user_event *user) +void user_event_mm_dup(struct task_struct *t, struct user_event_mm *old_mm) { - int i = user->index; + struct user_event_mm *mm = user_event_mm_create(t); + struct user_event_enabler *enabler; - user->group->register_page_data[MAP_STATUS_BYTE(i)] &= ~MAP_STATUS_MASK(i); + if (!mm) + return; + + rcu_read_lock(); + + list_for_each_entry_rcu(enabler, &old_mm->enablers, link) + if (!user_event_enabler_dup(enabler, mm)) + goto error; + + rcu_read_unlock(); + + return; +error: + rcu_read_unlock(); + user_event_mm_remove(t); +} + +static struct user_event_enabler +*user_event_enabler_create(struct user_reg *reg, struct user_event *user, + int *write_result) +{ + struct user_event_enabler *enabler; + struct user_event_mm *user_mm; + unsigned long uaddr = (unsigned long)reg->enable_addr; + + user_mm = current_user_event_mm(); + + if (!user_mm) + return NULL; + + enabler = kzalloc(sizeof(*enabler), GFP_KERNEL); + + if (!enabler) + goto out; + + enabler->event = user; + enabler->addr = uaddr; + enabler->values = reg->enable_bit; +retry: + /* Prevents state changes from racing with new enablers */ + mutex_lock(&event_mutex); + + /* Attempt to reflect the current state within the process */ + mmap_read_lock(user_mm->mm); + *write_result = user_event_enabler_write(user_mm, enabler); + mmap_read_unlock(user_mm->mm); + + /* + * If the write works, then we will track the enabler. A ref to the + * underlying user_event is held by the enabler to prevent it going + * away while the enabler is still in use by a process. The ref is + * removed when the enabler is destroyed. This means a event cannot + * be forcefully deleted from the system until all tasks using it + * exit or run exec(), which includes forks and clones. + */ + if (!*write_result) { + refcount_inc(&enabler->event->refcnt); + list_add_rcu(&enabler->link, &user_mm->enablers); + } + + mutex_unlock(&event_mutex); + + if (*write_result) { + /* Attempt to fault-in and retry if it worked */ + if (!user_event_mm_fault_in(user_mm, uaddr)) + goto retry; + + kfree(enabler); + enabler = NULL; + } +out: + user_event_mm_put(user_mm); + + return enabler; } static __always_inline __must_check @@ -824,9 +1136,6 @@ static int destroy_user_event(struct user_event *user) return ret; dyn_event_remove(&user->devent); - - user_event_register_clear(user); - clear_bit(user->index, user->group->page_bitmap); hash_del(&user->node); user_event_destroy_validators(user); @@ -972,9 +1281,9 @@ discard: #endif /* - * Update the register page that is shared between user processes. + * Update the enabled bit among all user processes. 
*/ -static void update_reg_page_for(struct user_event *user) +static void update_enable_bit_for(struct user_event *user) { struct tracepoint *tp = &user->tracepoint; char status = 0; @@ -1005,12 +1314,9 @@ static void update_reg_page_for(struct user_event *user) rcu_read_unlock_sched(); } - if (status) - user_event_register_set(user); - else - user_event_register_clear(user); - user->status = status; + + user_event_enabler_update(user); } /* @@ -1067,10 +1373,10 @@ static int user_event_reg(struct trace_event_call *call, return ret; inc: refcount_inc(&user->refcnt); - update_reg_page_for(user); + update_enable_bit_for(user); return 0; dec: - update_reg_page_for(user); + update_enable_bit_for(user); refcount_dec(&user->refcnt); return 0; } @@ -1266,7 +1572,6 @@ static int user_event_parse(struct user_event_group *group, char *name, struct user_event **newuser) { int ret; - int index; u32 key; struct user_event *user; @@ -1285,11 +1590,6 @@ static int user_event_parse(struct user_event_group *group, char *name, return 0; } - index = find_first_zero_bit(group->page_bitmap, MAX_EVENTS); - - if (index == MAX_EVENTS) - return -EMFILE; - user = kzalloc(sizeof(*user), GFP_KERNEL); if (!user) @@ -1335,14 +1635,11 @@ static int user_event_parse(struct user_event_group *group, char *name, if (ret) goto put_user_lock; - user->index = index; - /* Ensure we track self ref and caller ref (2) */ refcount_set(&user->refcnt, 2); dyn_event_init(&user->devent, &user_event_dops); dyn_event_add(&user->devent, &user->call); - set_bit(user->index, group->page_bitmap); hash_add(group->register_table, &user->node, key); mutex_unlock(&event_mutex); @@ -1559,6 +1856,37 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg) if (ret) return ret; + /* Ensure no flags, since we don't support any yet */ + if (kreg->flags != 0) + return -EINVAL; + + /* Ensure supported size */ + switch (kreg->enable_size) { + case 4: + /* 32-bit */ + break; +#if BITS_PER_LONG >= 64 + case 8: + /* 64-bit */ + break; +#endif + default: + return -EINVAL; + } + + /* Ensure natural alignment */ + if (kreg->enable_addr % kreg->enable_size) + return -EINVAL; + + /* Ensure bit range for size */ + if (kreg->enable_bit > (kreg->enable_size * BITS_PER_BYTE) - 1) + return -EINVAL; + + /* Ensure accessible */ + if (!access_ok((const void __user *)(uintptr_t)kreg->enable_addr, + kreg->enable_size)) + return -EFAULT; + kreg->size = size; return 0; @@ -1573,8 +1901,10 @@ static long user_events_ioctl_reg(struct user_event_file_info *info, struct user_reg __user *ureg = (struct user_reg __user *)uarg; struct user_reg reg; struct user_event *user; + struct user_event_enabler *enabler; char *name; long ret; + int write_result; ret = user_reg_get(ureg, ®); @@ -1605,8 +1935,28 @@ static long user_events_ioctl_reg(struct user_event_file_info *info, if (ret < 0) return ret; + /* + * user_events_ref_add succeeded: + * At this point we have a user_event, it's lifetime is bound by the + * reference count, not this file. If anything fails, the user_event + * still has a reference until the file is released. During release + * any remaining references (from user_events_ref_add) are decremented. + * + * Attempt to create an enabler, which too has a lifetime tied in the + * same way for the event. Once the task that caused the enabler to be + * created exits or issues exec() then the enablers it has created + * will be destroyed and the ref to the event will be decremented. 
+ */ + enabler = user_event_enabler_create(®, user, &write_result); + + if (!enabler) + return -ENOMEM; + + /* Write failed/faulted, give error back to caller */ + if (write_result) + return write_result; + put_user((u32)ret, &ureg->write_index); - put_user(user->index, &ureg->status_bit); return 0; } @@ -1720,38 +2070,6 @@ static const struct file_operations user_data_fops = { .release = user_events_release, }; -static struct user_event_group *user_status_group(struct file *file) -{ - struct seq_file *m = file->private_data; - - if (!m) - return NULL; - - return m->private; -} - -/* - * Maps the shared page into the user process for checking if event is enabled. - */ -static int user_status_mmap(struct file *file, struct vm_area_struct *vma) -{ - char *pages; - struct user_event_group *group = user_status_group(file); - unsigned long size = vma->vm_end - vma->vm_start; - - if (size != MAX_BYTES) - return -EINVAL; - - if (!group) - return -EINVAL; - - pages = group->register_page_data; - - return remap_pfn_range(vma, vma->vm_start, - virt_to_phys(pages) >> PAGE_SHIFT, - size, vm_get_page_prot(VM_READ)); -} - static void *user_seq_start(struct seq_file *m, loff_t *pos) { if (*pos) @@ -1775,7 +2093,7 @@ static int user_seq_show(struct seq_file *m, void *p) struct user_event_group *group = m->private; struct user_event *user; char status; - int i, active = 0, busy = 0, flags; + int i, active = 0, busy = 0; if (!group) return -EINVAL; @@ -1784,11 +2102,10 @@ static int user_seq_show(struct seq_file *m, void *p) hash_for_each(group->register_table, i, user, node) { status = user->status; - flags = user->flags; - seq_printf(m, "%d:%s", user->index, EVENT_NAME(user)); + seq_printf(m, "%s", EVENT_NAME(user)); - if (flags != 0 || status != 0) + if (status != 0) seq_puts(m, " #"); if (status != 0) { @@ -1811,7 +2128,6 @@ static int user_seq_show(struct seq_file *m, void *p) seq_puts(m, "\n"); seq_printf(m, "Active: %d\n", active); seq_printf(m, "Busy: %d\n", busy); - seq_printf(m, "Max: %ld\n", MAX_EVENTS); return 0; } @@ -1847,7 +2163,6 @@ static int user_status_open(struct inode *node, struct file *file) static const struct file_operations user_status_fops = { .open = user_status_open, - .mmap = user_status_mmap, .read = seq_read, .llseek = seq_lseek, .release = seq_release, @@ -1868,8 +2183,7 @@ static int create_user_tracefs(void) goto err; } - /* mmap with MAP_SHARED requires writable fd */ - emmap = tracefs_create_file("user_events_status", TRACE_MODE_WRITE, + emmap = tracefs_create_file("user_events_status", TRACE_MODE_READ, NULL, NULL, &user_status_fops); if (!emmap) { -- cgit v1.2.3 From dcb8177c13953872c9e5ce4a99b63a87a3c2f683 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Tue, 28 Mar 2023 16:52:12 -0700 Subject: tracing/user_events: Add ioctl for disabling addresses Enablements are now tracked by the lifetime of the task/mm. User processes need to be able to disable their addresses if tracing is requested to be turned off. Before unmapping the page would suffice. However, we now need a stronger contract. Add an ioctl to enable this. A new flag bit is added, freeing, to user_event_enabler to ensure that if the event is attempted to be removed while a fault is being handled that the remove is delayed until after the fault is reattempted. 
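As an illustrative sketch (not part of this patch), tearing down an enablement with the new ioctl might look like the following; the helper name unregister_event is an assumption, and disable_addr/disable_bit must match the values given at registration:

    #include <stdint.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/user_events.h>

    static int unregister_event(int data_fd, void *addr, unsigned char bit)
    {
            struct user_unreg unreg;

            /* memset() also zeroes the reserved fields, as required */
            memset(&unreg, 0, sizeof(unreg));
            unreg.size = sizeof(unreg);
            unreg.disable_bit = bit;  /* must match the registered enable_bit */
            unreg.disable_addr = (uint64_t)(uintptr_t)addr;

            return ioctl(data_fd, DIAG_IOCSUNREG, &unreg);
    }

After this call the kernel stops updating the bit at that address, which is the stronger contract the message above describes now that unmapping the old shared page no longer applies.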
Link: https://lkml.kernel.org/r/20230328235219.203-6-beaub@linux.microsoft.com Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- include/uapi/linux/user_events.h | 24 ++++++++++ kernel/trace/trace_events_user.c | 97 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 119 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h index 22521bc622db..3e7275e3234a 100644 --- a/include/uapi/linux/user_events.h +++ b/include/uapi/linux/user_events.h @@ -46,6 +46,27 @@ struct user_reg { __u32 write_index; } __attribute__((__packed__)); +/* + * Describes an event unregister, callers must set the size, address and bit. + * This structure is passed to the DIAG_IOCSUNREG ioctl to disable bit updates. + */ +struct user_unreg { + /* Input: Size of the user_unreg structure being used */ + __u32 size; + + /* Input: Bit to unregister */ + __u8 disable_bit; + + /* Input: Reserved, set to 0 */ + __u8 __reserved; + + /* Input: Reserved, set to 0 */ + __u16 __reserved2; + + /* Input: Address to unregister */ + __u64 disable_addr; +} __attribute__((__packed__)); + #define DIAG_IOC_MAGIC '*' /* Request to register a user_event */ @@ -54,4 +75,7 @@ struct user_reg { /* Request to delete a user_event */ #define DIAG_IOCSDEL _IOW(DIAG_IOC_MAGIC, 1, char *) +/* Requests to unregister a user_event */ +#define DIAG_IOCSUNREG _IOW(DIAG_IOC_MAGIC, 2, struct user_unreg*) + #endif /* _UAPI_LINUX_USER_EVENTS_H */ diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 86bda1660536..f88bab3f1fe1 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -102,6 +102,9 @@ struct user_event_enabler { /* Bit 6 is for faulting status of enablement */ #define ENABLE_VAL_FAULTING_BIT 6 +/* Bit 7 is for freeing status of enablement */ +#define ENABLE_VAL_FREEING_BIT 7 + /* Only duplicate the bit value */ #define ENABLE_VAL_DUP_MASK ENABLE_VAL_BIT_MASK @@ -301,6 +304,12 @@ static void user_event_enabler_fault_fixup(struct work_struct *work) /* Prevent state changes from racing */ mutex_lock(&event_mutex); + /* User asked for enabler to be removed during fault */ + if (test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler))) { + user_event_enabler_destroy(enabler); + goto out; + } + /* * If we managed to get the page, re-issue the write. 
We do not * want to get into a possible infinite loop, which is why we only @@ -315,7 +324,7 @@ static void user_event_enabler_fault_fixup(struct work_struct *work) user_event_enabler_write(mm, enabler, true); mmap_read_unlock(mm->mm); } - +out: mutex_unlock(&event_mutex); /* In all cases we no longer need the mm or fault */ @@ -370,7 +379,8 @@ static int user_event_enabler_write(struct user_event_mm *mm, if (refcount_read(&mm->tasks) == 0) return -ENOENT; - if (unlikely(test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler)))) + if (unlikely(test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler)) || + test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler)))) return -EBUSY; ret = pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT, @@ -428,6 +438,10 @@ static bool user_event_enabler_dup(struct user_event_enabler *orig, { struct user_event_enabler *enabler; + /* Skip pending frees */ + if (unlikely(test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(orig)))) + return true; + enabler = kzalloc(sizeof(*enabler), GFP_NOWAIT); if (!enabler) @@ -2086,6 +2100,79 @@ static long user_events_ioctl_del(struct user_event_file_info *info, return ret; } +static long user_unreg_get(struct user_unreg __user *ureg, + struct user_unreg *kreg) +{ + u32 size; + long ret; + + ret = get_user(size, &ureg->size); + + if (ret) + return ret; + + if (size > PAGE_SIZE) + return -E2BIG; + + if (size < offsetofend(struct user_unreg, disable_addr)) + return -EINVAL; + + ret = copy_struct_from_user(kreg, sizeof(*kreg), ureg, size); + + /* Ensure no reserved values, since we don't support any yet */ + if (kreg->__reserved || kreg->__reserved2) + return -EINVAL; + + return ret; +} + +/* + * Unregisters an enablement address/bit within a task/user mm. + */ +static long user_events_ioctl_unreg(unsigned long uarg) +{ + struct user_unreg __user *ureg = (struct user_unreg __user *)uarg; + struct user_event_mm *mm = current->user_event_mm; + struct user_event_enabler *enabler, *next; + struct user_unreg reg; + long ret; + + ret = user_unreg_get(ureg, &reg); + + if (ret) + return ret; + + if (!mm) + return -ENOENT; + + ret = -ENOENT; + + /* + * Flags freeing and faulting are used to indicate if the enabler is in + * use at all. When faulting is set, a page fault is occurring asynchronously. + * During an async fault, if freeing is set, the enabler will be destroyed. + * If no async fault is happening, we can destroy it now since we hold + * the event_mutex during these checks. + */ + mutex_lock(&event_mutex); + + list_for_each_entry_safe(enabler, next, &mm->enablers, link) + if (enabler->addr == reg.disable_addr && + (enabler->values & ENABLE_VAL_BIT_MASK) == reg.disable_bit) { + set_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler)); + + if (!test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler))) + user_event_enabler_destroy(enabler); + + /* Removed at least one */ + ret = 0; + } + + mutex_unlock(&event_mutex); + + return ret; +} + /* * Handles the ioctl from user mode to register or alter operations.
*/ @@ -2108,6 +2195,12 @@ static long user_events_ioctl(struct file *file, unsigned int cmd, ret = user_events_ioctl_del(info, uarg); mutex_unlock(&group->reg_mutex); break; + + case DIAG_IOCSUNREG: + mutex_lock(&group->reg_mutex); + ret = user_events_ioctl_unreg(uarg); + mutex_unlock(&group->reg_mutex); + break; } return ret; -- cgit v1.2.3 From a4c40c1349e32f9510707ed09e0961626980d8cb Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Tue, 28 Mar 2023 16:52:19 -0700 Subject: tracing/user_events: Align structs with tabs for readability Add tabs to make struct members easier to read and unify the style of the code. Link: https://lkml.kernel.org/r/20230328235219.203-13-beaub@linux.microsoft.com Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- include/linux/user_events.h | 14 +++---- include/uapi/linux/user_events.h | 24 ++++++------ kernel/trace/trace_events_user.c | 82 ++++++++++++++++++++-------------------- 3 files changed, 60 insertions(+), 60 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/user_events.h b/include/linux/user_events.h index 0120b3dd5b03..2847f5a18a86 100644 --- a/include/linux/user_events.h +++ b/include/linux/user_events.h @@ -17,13 +17,13 @@ #ifdef CONFIG_USER_EVENTS struct user_event_mm { - struct list_head link; - struct list_head enablers; - struct mm_struct *mm; - struct user_event_mm *next; - refcount_t refcnt; - refcount_t tasks; - struct rcu_work put_rwork; + struct list_head link; + struct list_head enablers; + struct mm_struct *mm; + struct user_event_mm *next; + refcount_t refcnt; + refcount_t tasks; + struct rcu_work put_rwork; }; extern void user_event_mm_dup(struct task_struct *t, diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h index 3e7275e3234a..2984aae4a2b4 100644 --- a/include/uapi/linux/user_events.h +++ b/include/uapi/linux/user_events.h @@ -25,25 +25,25 @@ struct user_reg { /* Input: Size of the user_reg structure being used */ - __u32 size; + __u32 size; /* Input: Bit in enable address to use */ - __u8 enable_bit; + __u8 enable_bit; /* Input: Enable size in bytes at address */ - __u8 enable_size; + __u8 enable_size; /* Input: Flags for future use, set to 0 */ - __u16 flags; + __u16 flags; /* Input: Address to update when enabled */ - __u64 enable_addr; + __u64 enable_addr; /* Input: Pointer to string with event name, description and flags */ - __u64 name_args; + __u64 name_args; /* Output: Index of the event to use when writing data */ - __u32 write_index; + __u32 write_index; } __attribute__((__packed__)); /* @@ -52,19 +52,19 @@ struct user_reg { */ struct user_unreg { /* Input: Size of the user_unreg structure being used */ - __u32 size; + __u32 size; /* Input: Bit to unregister */ - __u8 disable_bit; + __u8 disable_bit; /* Input: Reserved, set to 0 */ - __u8 __reserved; + __u8 __reserved; /* Input: Reserved, set to 0 */ - __u16 __reserved2; + __u16 __reserved2; /* Input: Address to unregister */ - __u64 disable_addr; + __u64 disable_addr; } __attribute__((__packed__)); #define DIAG_IOC_MAGIC '*' diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 9b43a02e1597..67cb7b53caf6 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -53,9 +53,9 @@ * allows isolation for events by various means. 
*/ struct user_event_group { - char *system_name; - struct hlist_node node; - struct mutex reg_mutex; + char *system_name; + struct hlist_node node; + struct mutex reg_mutex; DECLARE_HASHTABLE(register_table, 8); }; @@ -76,17 +76,17 @@ static unsigned int current_user_events; * refcnt reaches one. */ struct user_event { - struct user_event_group *group; - struct tracepoint tracepoint; - struct trace_event_call call; - struct trace_event_class class; - struct dyn_event devent; - struct hlist_node node; - struct list_head fields; - struct list_head validators; - refcount_t refcnt; - int min_size; - char status; + struct user_event_group *group; + struct tracepoint tracepoint; + struct trace_event_call call; + struct trace_event_class class; + struct dyn_event devent; + struct hlist_node node; + struct list_head fields; + struct list_head validators; + refcount_t refcnt; + int min_size; + char status; }; /* @@ -95,12 +95,12 @@ struct user_event { * these to track enablement sites that are tied to an event. */ struct user_event_enabler { - struct list_head link; - struct user_event *event; - unsigned long addr; + struct list_head link; + struct user_event *event; + unsigned long addr; /* Track enable bit, flags, etc. Aligned for bitops. */ - unsigned int values; + unsigned int values; }; /* Bits 0-5 are for the bit to update upon enable/disable (0-63 allowed) */ @@ -119,9 +119,9 @@ struct user_event_enabler { /* Used for asynchronous faulting in of pages */ struct user_event_enabler_fault { - struct work_struct work; - struct user_event_mm *mm; - struct user_event_enabler *enabler; + struct work_struct work; + struct user_event_mm *mm; + struct user_event_enabler *enabler; }; static struct kmem_cache *fault_cache; @@ -137,23 +137,23 @@ static DEFINE_SPINLOCK(user_event_mms_lock); * These are not shared and only accessible by the file that created it. 
struct user_event_refs { - struct rcu_head rcu; - int count; - struct user_event *events[]; + struct rcu_head rcu; + int count; + struct user_event *events[]; }; struct user_event_file_info { - struct user_event_group *group; - struct user_event_refs *refs; + struct user_event_group *group; + struct user_event_refs *refs; }; #define VALIDATOR_ENSURE_NULL (1 << 0) #define VALIDATOR_REL (1 << 1) struct user_event_validator { - struct list_head link; - int offset; - int flags; + struct list_head link; + int offset; + int flags; }; typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i, @@ -2276,11 +2276,11 @@ out: } static const struct file_operations user_data_fops = { - .open = user_events_open, - .write = user_events_write, - .write_iter = user_events_write_iter, + .open = user_events_open, + .write = user_events_write, + .write_iter = user_events_write_iter, .unlocked_ioctl = user_events_ioctl, - .release = user_events_release, + .release = user_events_release, }; static void *user_seq_start(struct seq_file *m, loff_t *pos) @@ -2346,10 +2346,10 @@ static int user_seq_show(struct seq_file *m, void *p) } static const struct seq_operations user_seq_ops = { - .start = user_seq_start, - .next = user_seq_next, - .stop = user_seq_stop, - .show = user_seq_show, + .start = user_seq_start, + .next = user_seq_next, + .stop = user_seq_stop, + .show = user_seq_show, }; static int user_status_open(struct inode *node, struct file *file) @@ -2375,10 +2375,10 @@ static int user_status_open(struct inode *node, struct file *file) } static const struct file_operations user_status_fops = { - .open = user_status_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, + .open = user_status_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, }; /* -- cgit v1.2.3 From 30ec7997d175cd689fc61bfc4059f4d35b11858c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Mar 2023 18:47:47 +0100 Subject: KVM: arm64: timers: Allow userspace to set the global counter offset And this is the moment you have all been waiting for: setting the counter offset from userspace. We expose a brand new capability that reports the ability to set the offset for both the virtual and physical sides. In keeping with the architecture, the offset is expressed as a delta that is subtracted from the physical counter value. Once this new API is used, there is no going back, and the counters cannot be written to set the offsets implicitly (the writes are instead ignored).
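[Editor's note: a rough sketch of the intended userspace flow, assuming a VM fd obtained via KVM_CREATE_VM; the capability and ioctl names come from the diff below, while the helper itself is illustrative.]

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int set_counter_offset(int vm_fd, __u64 delta)
{
	struct kvm_arm_counter_offset off = {
		.counter_offset = delta,	/* subtracted from the physical counter */
		.reserved = 0,			/* must be zero, else -EINVAL */
	};

	/* Only available when KVM_CAP_COUNTER_OFFSET is reported */
	if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_COUNTER_OFFSET) <= 0)
		return -1;

	/* VM-wide: the offset applies to both virtual and physical views */
	return ioctl(vm_fd, KVM_ARM_SET_COUNTER_OFFSET, &off);
}

Note that, per the arch_timer.c hunks below, once this flag is set, later writes to KVM_REG_ARM_TIMER_CNT / KVM_REG_ARM_PTIMER_CNT no longer adjust the offsets.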
Reviewed-by: Colton Lewis Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230330174800.2677007-8-maz@kernel.org --- arch/arm64/include/asm/kvm_host.h | 4 +++ arch/arm64/include/uapi/asm/kvm.h | 9 +++++++ arch/arm64/kvm/arch_timer.c | 54 +++++++++++++++++++++++++++++++++++---- arch/arm64/kvm/arm.c | 8 ++++++ include/uapi/linux/kvm.h | 3 +++ 5 files changed, 73 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 002a10cbade2..116233a390e9 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -221,6 +221,8 @@ struct kvm_arch { #define KVM_ARCH_FLAG_EL1_32BIT 4 /* PSCI SYSTEM_SUSPEND enabled for the guest */ #define KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED 5 + /* VM counter offset */ +#define KVM_ARCH_FLAG_VM_COUNTER_OFFSET 6 unsigned long flags; @@ -1010,6 +1012,8 @@ int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, struct kvm_arm_copy_mte_tags *copy_tags); +int kvm_vm_ioctl_set_counter_offset(struct kvm *kvm, + struct kvm_arm_counter_offset *offset); /* Guest/host FPSIMD coordination helpers */ int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index f8129c624b07..12fb0d8a760a 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -198,6 +198,15 @@ struct kvm_arm_copy_mte_tags { __u64 reserved[2]; }; +/* + * Counter/Timer offset structure. Describe the virtual/physical offset. + * To be used with KVM_ARM_SET_COUNTER_OFFSET. + */ +struct kvm_arm_counter_offset { + __u64 counter_offset; + __u64 reserved; +}; + #define KVM_ARM_TAGS_TO_GUEST 0 #define KVM_ARM_TAGS_FROM_GUEST 1 diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index bb64a71ae193..771504c79711 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -851,9 +851,11 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) ptimer->vcpu = vcpu; ptimer->offset.vm_offset = &vcpu->kvm->arch.timer_data.poffset; - /* Synchronize cntvoff across all vtimers of a VM. 
*/ - timer_set_offset(vtimer, kvm_phys_timer_read()); - timer_set_offset(ptimer, 0); + /* Synchronize offsets across timers of a VM if not already provided */ + if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &vcpu->kvm->arch.flags)) { + timer_set_offset(vtimer, kvm_phys_timer_read()); + timer_set_offset(ptimer, 0); + } hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); timer->bg_timer.function = kvm_bg_timer_expire; @@ -897,8 +899,11 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value); break; case KVM_REG_ARM_TIMER_CNT: - timer = vcpu_vtimer(vcpu); - timer_set_offset(timer, kvm_phys_timer_read() - value); + if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, + &vcpu->kvm->arch.flags)) { + timer = vcpu_vtimer(vcpu); + timer_set_offset(timer, kvm_phys_timer_read() - value); + } break; case KVM_REG_ARM_TIMER_CVAL: timer = vcpu_vtimer(vcpu); @@ -908,6 +913,13 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) timer = vcpu_ptimer(vcpu); kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value); break; + case KVM_REG_ARM_PTIMER_CNT: + if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, + &vcpu->kvm->arch.flags)) { + timer = vcpu_ptimer(vcpu); + timer_set_offset(timer, kvm_phys_timer_read() - value); + } + break; case KVM_REG_ARM_PTIMER_CVAL: timer = vcpu_ptimer(vcpu); kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value); @@ -1443,3 +1455,35 @@ int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) return -ENXIO; } + +int kvm_vm_ioctl_set_counter_offset(struct kvm *kvm, + struct kvm_arm_counter_offset *offset) +{ + int ret = 0; + + if (offset->reserved) + return -EINVAL; + + mutex_lock(&kvm->lock); + + if (lock_all_vcpus(kvm)) { + set_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &kvm->arch.flags); + + /* + * If userspace decides to set the offset using this + * API rather than merely restoring the counter + * values, the offset applies to both the virtual and + * physical views. 
+ */ + kvm->arch.timer_data.voffset = offset->counter_offset; + kvm->arch.timer_data.poffset = offset->counter_offset; + + unlock_all_vcpus(kvm); + } else { + ret = -EBUSY; + } + + mutex_unlock(&kvm->lock); + + return ret; +} diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index ae5110cc3bad..1c8a4bbae684 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -220,6 +220,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_VCPU_ATTRIBUTES: case KVM_CAP_PTP_KVM: case KVM_CAP_ARM_SYSTEM_SUSPEND: + case KVM_CAP_COUNTER_OFFSET: r = 1; break; case KVM_CAP_SET_GUEST_DEBUG2: @@ -1479,6 +1480,13 @@ long kvm_arch_vm_ioctl(struct file *filp, return -EFAULT; return kvm_vm_ioctl_mte_copy_tags(kvm, &copy_tags); } + case KVM_ARM_SET_COUNTER_OFFSET: { + struct kvm_arm_counter_offset offset; + + if (copy_from_user(&offset, argp, sizeof(offset))) + return -EFAULT; + return kvm_vm_ioctl_set_counter_offset(kvm, &offset); + } default: return -EINVAL; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index d77aef872a0a..6a7e1a0ecf04 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1184,6 +1184,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_S390_PROTECTED_ASYNC_DISABLE 224 #define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 225 #define KVM_CAP_PMU_EVENT_MASKED_EVENTS 226 +#define KVM_CAP_COUNTER_OFFSET 227 #ifdef KVM_CAP_IRQ_ROUTING @@ -1543,6 +1544,8 @@ struct kvm_s390_ucas_mapping { #define KVM_SET_PMU_EVENT_FILTER _IOW(KVMIO, 0xb2, struct kvm_pmu_event_filter) #define KVM_PPC_SVM_OFF _IO(KVMIO, 0xb3) #define KVM_ARM_MTE_COPY_TAGS _IOR(KVMIO, 0xb4, struct kvm_arm_copy_mte_tags) +/* Available with KVM_CAP_COUNTER_OFFSET */ +#define KVM_ARM_SET_COUNTER_OFFSET _IOW(KVMIO, 0xb5, struct kvm_arm_counter_offset) /* ioctl for vm fd */ #define KVM_CREATE_DEVICE _IOWR(KVMIO, 0xe0, struct kvm_create_device) -- cgit v1.2.3 From 9e410fe3dc9a938bc47f71dff254be7419bd40d2 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 3 Mar 2023 13:34:11 -0800 Subject: dmaengine: idxd: Add descriptor definitions for 16 bytes of pattern in memory fill operation The memory fill operation (0x04) can fill memory with either an 8-byte or a 16-byte pattern. To fill memory with a 16-byte pattern, the first 8 bytes are provided in pattern lower in bytes 16-23 of the descriptor and the next 8 bytes in pattern upper in bytes 40-47. Currently, only the 8-byte pattern is enabled. Add descriptor definitions for pattern lower and pattern upper so that users can fill memory with a 16-byte pattern.
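[Editor's note: a hedged sketch of how a submitter might populate such a descriptor. DSA_OPCODE_MEMFILL and the field names are from include/uapi/linux/idxd.h; flag selection, completion-record setup and the ENQCMD submission path are omitted and depend on the application's portal mapping.]

#include <stdint.h>
#include <string.h>
#include <linux/idxd.h>

/* Prepare (but do not submit) a 16-byte-pattern memory fill. */
static void prep_fill_16(struct dsa_hw_desc *desc, void *dst,
			 uint32_t len, uint64_t pat_lo, uint64_t pat_hi)
{
	memset(desc, 0, sizeof(*desc));
	desc->opcode = DSA_OPCODE_MEMFILL;
	desc->dst_addr = (uint64_t)(uintptr_t)dst;
	desc->xfer_size = len;
	desc->pattern_lower = pat_lo;	/* descriptor bytes 16-23 */
	desc->pattern_upper = pat_hi;	/* descriptor bytes 40-47 */
}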
Signed-off-by: Fenghua Yu Reviewed-by: Dave Jiang Link: https://lore.kernel.org/r/20230303213413.3357431-2-fenghua.yu@intel.com Signed-off-by: Vinod Koul --- include/uapi/linux/idxd.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index 1d553bedbdb5..c43d7df5fc15 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -180,6 +180,7 @@ struct dsa_hw_desc { uint64_t rdback_addr; uint64_t pattern; uint64_t desc_list_addr; + uint64_t pattern_lower; }; union { uint64_t dst_addr; @@ -244,6 +245,9 @@ struct dsa_hw_desc { uint16_t dest_app_tag_seed; }; + /* Fill */ + uint64_t pattern_upper; + uint8_t op_specific[24]; }; } __attribute__((packed)); -- cgit v1.2.3 From 12bbc2c2605516e781cd86e3cde9fe1f889b72cc Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 3 Mar 2023 13:34:12 -0800 Subject: dmaengine: idxd: Add descriptor definitions for DIX generate operation The Data Integrity Extension (DIX) generate operation (0x17) computes the Data Integrity Field (DIF) on the source data and writes only the computed DIF for each source block to the PI destination address. Add descriptor definitions for this operation so that users can use DSA to accelerate the DIX generate operation. Signed-off-by: Fenghua Yu Reviewed-by: Dave Jiang Link: https://lore.kernel.org/r/20230303213413.3357431-3-fenghua.yu@intel.com Signed-off-by: Vinod Koul --- include/uapi/linux/idxd.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index c43d7df5fc15..4c12e93a6aa6 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -78,6 +78,7 @@ enum dsa_opcode { DSA_OPCODE_DIF_INS, DSA_OPCODE_DIF_STRP, DSA_OPCODE_DIF_UPDT, + DSA_OPCODE_DIX_GEN = 0x17, DSA_OPCODE_CFLUSH = 0x20, }; @@ -248,6 +249,17 @@ struct dsa_hw_desc { /* Fill */ uint64_t pattern_upper; + /* DIX generate */ + struct { + uint8_t dix_gen_res; + uint8_t dest_dif_flags; + uint8_t dif_flags; + uint8_t dix_gen_res2[13]; + uint32_t ref_tag_seed; + uint16_t app_tag_mask; + uint16_t app_tag_seed; + }; + uint8_t op_specific[24]; }; } __attribute__((packed)); @@ -326,6 +338,14 @@ struct dsa_completion_record { uint16_t dif_upd_dest_app_tag; }; + /* DIX generate */ + struct { + uint64_t dix_gen_res; + uint32_t dix_ref_tag; + uint16_t dix_app_tag_mask; + uint16_t dix_app_tag; + }; + uint8_t op_specific[16]; }; } __attribute__((packed)); -- cgit v1.2.3 From 6fec8938b7b4fe2b2c503fe87b2783a50bff0415 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 3 Mar 2023 13:34:13 -0800 Subject: dmaengine: idxd: Add descriptor definitions for translation fetch operation The translation fetch operation (0x0A) fetches address translations for the address range specified in the descriptor by issuing address translation (ATS) requests to the IOMMU. Add descriptor definitions for the operation so that users can use DSA to accelerate translation fetch.
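[Editor's note: by analogy with the fill sketch above (and assuming the same includes), a translation fetch descriptor might be populated as below; the field names follow this diff, while the stride semantics and submission path are assumptions.]

/* Prefetch IOMMU translations for a buffer (sketch, not submitted). */
static void prep_transl_fetch(struct dsa_hw_desc *desc, void *buf,
			      uint32_t size, uint32_t stride)
{
	memset(desc, 0, sizeof(*desc));
	desc->opcode = DSA_OPCODE_TRANSL_FETCH;
	desc->transl_fetch_addr = (uint64_t)(uintptr_t)buf;
	desc->region_size = size;	/* aliases xfer_size in the descriptor */
	desc->region_stride = stride;	/* spacing between fetched addresses */
}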
Signed-off-by: Fenghua Yu Reviewed-by: Dave Jiang Link: https://lore.kernel.org/r/20230303213413.3357431-4-fenghua.yu@intel.com Signed-off-by: Vinod Koul --- include/uapi/linux/idxd.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index 4c12e93a6aa6..fc47635b57dc 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -72,6 +72,7 @@ enum dsa_opcode { DSA_OPCODE_CR_DELTA, DSA_OPCODE_AP_DELTA, DSA_OPCODE_DUALCAST, + DSA_OPCODE_TRANSL_FETCH, DSA_OPCODE_CRCGEN = 0x10, DSA_OPCODE_COPY_CRC, DSA_OPCODE_DIF_CHECK, @@ -182,6 +183,7 @@ struct dsa_hw_desc { uint64_t pattern; uint64_t desc_list_addr; uint64_t pattern_lower; + uint64_t transl_fetch_addr; }; union { uint64_t dst_addr; @@ -192,6 +194,7 @@ struct dsa_hw_desc { union { uint32_t xfer_size; uint32_t desc_count; + uint32_t region_size; }; uint16_t int_handle; uint16_t rsvd1; @@ -249,6 +252,12 @@ struct dsa_hw_desc { /* Fill */ uint64_t pattern_upper; + /* Translation fetch */ + struct { + uint64_t transl_fetch_res; + uint32_t region_stride; + }; + /* DIX generate */ struct { uint8_t dix_gen_res; -- cgit v1.2.3 From e65733b5c59a1ea20324a03494364958bef3fc68 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 4 Apr 2023 15:40:38 +0000 Subject: KVM: x86: Redefine 'longmode' as a flag for KVM_EXIT_HYPERCALL The 'longmode' field is a bit annoying as it blows an entire __u32 to represent a boolean value. Since other architectures are looking to add support for KVM_EXIT_HYPERCALL, now is probably a good time to clean it up. Redefine the field (and the remaining padding) as a set of flags. Preserve the existing ABI by using bit 0 to indicate if the guest was in long mode and requiring that the remaining 31 bits must be zero. Cc: Paolo Bonzini Acked-by: Sean Christopherson Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230404154050.2270077-2-oliver.upton@linux.dev --- Documentation/virt/kvm/api.rst | 3 +-- arch/x86/include/asm/kvm_host.h | 7 +++++++ arch/x86/include/uapi/asm/kvm.h | 3 +++ arch/x86/kvm/x86.c | 6 +++++- include/uapi/linux/kvm.h | 9 +++++++-- 5 files changed, 23 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 62de0768d6aa..9b01e3d0e757 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6218,8 +6218,7 @@ to the byte array. __u64 nr; __u64 args[6]; __u64 ret; - __u32 longmode; - __u32 pad; + __u64 flags; } hypercall; Unused. This was once used for 'hypercall to userspace'. To implement diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 808c292ad3f4..15bda40517ff 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2204,4 +2204,11 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \ KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) +/* + * KVM previously used a u32 field in kvm_run to indicate the hypercall was + * initiated from long mode. KVM now sets bit 0 to indicate long mode, but the + * remaining 31 lower bits must be 0 to preserve ABI. 
+ */ +#define KVM_EXIT_HYPERCALL_MBZ GENMASK_ULL(31, 1) + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 7f467fe05d42..1a6a1f987949 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -559,4 +559,7 @@ struct kvm_pmu_event_filter { #define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */ #define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */ +/* x86-specific KVM_EXIT_HYPERCALL flags. */ +#define KVM_EXIT_HYPERCALL_LONG_MODE BIT(0) + #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7713420abab0..27a1d5c1a018 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9803,7 +9803,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) vcpu->run->hypercall.args[0] = gpa; vcpu->run->hypercall.args[1] = npages; vcpu->run->hypercall.args[2] = attrs; - vcpu->run->hypercall.longmode = op_64_bit; + vcpu->run->hypercall.flags = 0; + if (op_64_bit) + vcpu->run->hypercall.flags |= KVM_EXIT_HYPERCALL_LONG_MODE; + + WARN_ON_ONCE(vcpu->run->hypercall.flags & KVM_EXIT_HYPERCALL_MBZ); vcpu->arch.complete_userspace_io = complete_hypercall_exit; return 0; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index d77aef872a0a..dd42d7dfb86c 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -341,8 +341,13 @@ struct kvm_run { __u64 nr; __u64 args[6]; __u64 ret; - __u32 longmode; - __u32 pad; + + union { +#ifndef __KERNEL__ + __u32 longmode; +#endif + __u64 flags; + }; } hypercall; /* KVM_EXIT_TPR_ACCESS */ struct { -- cgit v1.2.3 From 2bad466cc9d9b4c3b4b16eb9c03c919b59561316 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 9 Mar 2023 17:37:10 -0500 Subject: mm/uffd: UFFD_FEATURE_WP_UNPOPULATED Patch series "mm/uffd: Add feature bit UFFD_FEATURE_WP_UNPOPULATED", v4. The new feature bit makes anonymous memory act the same as file memory on userfaultfd-wp in that it'll also wr-protect none ptes. It can be useful in two cases: (1) Uffd-wp apps that need to wr-protect none ptes, like QEMU snapshot, so the pre-fault can be replaced by enabling this flag, speeding up protections (2) It helps to implement the async uffd-wp mode that Muhammad is working on [1] It's debatable whether this is the ideal solution, because with the new feature bit set, wr-protecting a none pte needs to pre-populate the pgtables to the last level (PAGE_SIZE). But it seems fine so far to serve either purpose above, so we can leave optimizations for later. The series brings pte markers to anonymous memory too. There's some change in the common mm code path in the 1st patch; it would be great to have some eyes on it, but hopefully it's still relatively straightforward. This patch (of 2): This is a new feature that controls how uffd-wp handles none ptes. When it's set, the kernel will handle anonymous memory the same way as file memory, by allowing the user to wr-protect unpopulated ptes. File memory handles none ptes consistently by allowing them to be wr-protected, because there is no way to know whether a page cache page exists or not. For anonymous memory it was not as consistent, because we used to assume that we don't need protections on none ptes or known zero pages. One use case of such a feature bit was VM live snapshot, where without wr-protecting empty ptes the snapshot can contain random rubbish in the holes of the anonymous memory, which can cause the guest to misbehave when the guest OS assumes the pages should be all zeros.
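[Editor's note: opting in is a one-time feature handshake on the uffd; a minimal sketch, assuming a kernel that reports the bit. Only UFFD_FEATURE_WP_UNPOPULATED comes from this patch; the rest is standard userfaultfd setup.]

#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

static int uffd_open_wp_unpopulated(void)
{
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	struct uffdio_api api = {
		.api = UFFD_API,
		.features = UFFD_FEATURE_WP_UNPOPULATED,
	};

	if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api) < 0)
		return -1;

	/* From here, UFFDIO_WRITEPROTECT also wr-protects none ptes */
	return uffd;
}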
QEMU worked around it by pre-populating the section with reads to fill in zero page entries before starting the whole snapshot process [1]. Recently there's another need raised on using userfaultfd wr-protect for detecting dirty pages (to replace soft-dirty in some cases) [2]. In that case, without being able to wr-protect none ptes by default, the dirty info can get lost, since we cannot treat every none pte as dirty (the current design identifies a page as dirty based on the uffd-wp bit being cleared). In general, we want to be able to wr-protect empty ptes too, even for anonymous memory. This patch implements UFFD_FEATURE_WP_UNPOPULATED so that uffd-wp handling of none ptes is consistent no matter what the memory type is underneath. It doesn't have any impact on file memory so far because we already have pte markers taking care of that. So it only affects anonymous memory. The feature bit is off by default, so the old behavior will be maintained. Sometimes that may be wanted, because wr-protecting none ptes adds overhead not only during UFFDIO_WRITEPROTECT (by applying pte markers to anonymous memory), but also when creating the pgtables to store the pte markers. So there's potentially less chance of using thp on the first fault for a none pmd or larger than a pmd. The major implementation part is teaching the whole kernel to understand pte markers even for anonymously mapped ranges, meanwhile allowing the UFFDIO_WRITEPROTECT ioctl to apply pte markers to anonymous memory too when the new feature bit is set. Note that even if the patch subject starts with mm/uffd, there are a few small refactors to the major mm path of handling anonymous page faults. But they should be straightforward. With WP_UNPOPULATED, applications like QEMU can avoid pre-read faulting all the memory before wr-protecting when taking a live snapshot. Quoting from Muhammad's test result here [3], based on a simple program [4]:

(1) With huge page disabled
echo madvise > /sys/kernel/mm/transparent_hugepage/enabled
./uffd_wp_perf
Test DEFAULT: 4
Test PRE-READ: 1111453 (pre-fault 1101011)
Test MADVISE: 278276 (pre-fault 266378)
Test WP-UNPOPULATE: 11712

(2) With huge page enabled
echo always > /sys/kernel/mm/transparent_hugepage/enabled
./uffd_wp_perf
Test DEFAULT: 4
Test PRE-READ: 22521 (pre-fault 22348)
Test MADVISE: 4909 (pre-fault 4743)
Test WP-UNPOPULATE: 14448

There'll be a great perf boost for the no-thp case, while for the thp-enabled extreme case of all-thp-zero, WP_UNPOPULATED can be slower than MADVISE, but that's a low-probability case in reality; also, the overhead is not reduced but postponed until a follow-up write on any huge zero thp, so potentially it is faster overall by making the follow-up writes slower.
[1] https://lore.kernel.org/all/20210401092226.102804-4-andrey.gruzdev@virtuozzo.com/ [2] https://lore.kernel.org/all/Y+v2HJ8+3i%2FKzDBu@x1n/ [3] https://lore.kernel.org/all/d0eb0a13-16dc-1ac1-653a-78b7273781e3@collabora.com/ [4] https://github.com/xzpeter/clibs/blob/master/uffd-test/uffd-wp-perf.c [peterx@redhat.com: comment changes, oneliner fix to khugepaged] Link: https://lkml.kernel.org/r/ZB2/8jPhD3fpx5U8@x1n Link: https://lkml.kernel.org/r/20230309223711.823547-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20230309223711.823547-2-peterx@redhat.com Signed-off-by: Peter Xu Acked-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Mike Rapoport Cc: Muhammad Usama Anjum Cc: Nadav Amit Cc: Paul Gofman Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/userfaultfd.rst | 17 +++++++++ fs/userfaultfd.c | 16 ++++++++ include/linux/mm_inline.h | 6 +++ include/linux/userfaultfd_k.h | 23 ++++++++++++ include/uapi/linux/userfaultfd.h | 10 ++++- mm/khugepaged.c | 2 +- mm/memory.c | 56 +++++++++++++++++++++------- mm/mprotect.c | 51 ++++++++++++++++++++----- 8 files changed, 155 insertions(+), 26 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst index 7dc823b56ca4..bd2226299583 100644 --- a/Documentation/admin-guide/mm/userfaultfd.rst +++ b/Documentation/admin-guide/mm/userfaultfd.rst @@ -219,6 +219,23 @@ former will have ``UFFD_PAGEFAULT_FLAG_WP`` set, the latter you still need to supply a page when ``UFFDIO_REGISTER_MODE_MISSING`` was used. +Userfaultfd write-protect mode currently behave differently on none ptes +(when e.g. page is missing) over different types of memories. + +For anonymous memory, ``ioctl(UFFDIO_WRITEPROTECT)`` will ignore none ptes +(e.g. when pages are missing and not populated). For file-backed memories +like shmem and hugetlbfs, none ptes will be write protected just like a +present pte. In other words, there will be a userfaultfd write fault +message generated when writing to a missing page on file typed memories, +as long as the page range was write-protected before. Such a message will +not be generated on anonymous memories by default. + +If the application wants to be able to write protect none ptes on anonymous +memory, one can pre-populate the memory with e.g. MADV_POPULATE_READ. On +newer kernels, one can also detect the feature UFFD_FEATURE_WP_UNPOPULATED +and set the feature bit in advance to make sure none ptes will also be +write protected even upon anonymous memory. + QEMU/KVM ======== diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 44d1ee429eb0..881e9c82b9d1 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -108,6 +108,21 @@ static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx) return ctx->features & UFFD_FEATURE_INITIALIZED; } +/* + * Whether WP_UNPOPULATED is enabled on the uffd context. It is only + * meaningful when userfaultfd_wp()==true on the vma and when it's + * anonymous. 
+ */ +bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma) +{ + struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; + + if (!ctx) + return false; + + return ctx->features & UFFD_FEATURE_WP_UNPOPULATED; +} + static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, vm_flags_t flags) { @@ -1971,6 +1986,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, #endif #ifndef CONFIG_PTE_MARKER_UFFD_WP uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM; + uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED; #endif uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index de1e622dd366..0e1d239a882c 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -557,6 +557,12 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, /* The current status of the pte should be "cleared" before calling */ WARN_ON_ONCE(!pte_none(*pte)); + /* + * NOTE: userfaultfd_wp_unpopulated() doesn't need this whole + * thing, because when zapping either it means it's dropping the + * page, or in TTU where the present pte will be quickly replaced + * with a swap pte. There's no way of leaking the bit. + */ if (vma_is_anonymous(vma) || !userfaultfd_wp(vma)) return; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 3767f18114ef..0cf8880219da 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -179,6 +179,7 @@ extern int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start, unsigned long end, struct list_head *uf); extern void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf); +extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma); #else /* CONFIG_USERFAULTFD */ @@ -274,8 +275,30 @@ static inline bool uffd_disable_fault_around(struct vm_area_struct *vma) return false; } +static inline bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma) +{ + return false; +} + #endif /* CONFIG_USERFAULTFD */ +static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) +{ + /* Only wr-protect mode uses pte markers */ + if (!userfaultfd_wp(vma)) + return false; + + /* File-based uffd-wp always need markers */ + if (!vma_is_anonymous(vma)) + return true; + + /* + * Anonymous uffd-wp only needs the markers if WP_UNPOPULATED + * enabled (to apply markers on zero pages). + */ + return userfaultfd_wp_unpopulated(vma); +} + static inline bool pte_marker_entry_uffd_wp(swp_entry_t entry) { #ifdef CONFIG_PTE_MARKER_UFFD_WP diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 005e5e306266..90c958952bfc 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -38,7 +38,8 @@ UFFD_FEATURE_MINOR_HUGETLBFS | \ UFFD_FEATURE_MINOR_SHMEM | \ UFFD_FEATURE_EXACT_ADDRESS | \ - UFFD_FEATURE_WP_HUGETLBFS_SHMEM) + UFFD_FEATURE_WP_HUGETLBFS_SHMEM | \ + UFFD_FEATURE_WP_UNPOPULATED) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -203,6 +204,12 @@ struct uffdio_api { * * UFFD_FEATURE_WP_HUGETLBFS_SHMEM indicates that userfaultfd * write-protection mode is supported on both shmem and hugetlbfs. + * + * UFFD_FEATURE_WP_UNPOPULATED indicates that userfaultfd + * write-protection mode will always apply to unpopulated pages + * (i.e. empty ptes). 
This will be the default behavior for shmem + * & hugetlbfs, so this flag only affects anonymous memory behavior + * when userfault write-protection mode is registered. */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) @@ -217,6 +224,7 @@ struct uffdio_api { #define UFFD_FEATURE_MINOR_SHMEM (1<<10) #define UFFD_FEATURE_EXACT_ADDRESS (1<<11) #define UFFD_FEATURE_WP_HUGETLBFS_SHMEM (1<<12) +#define UFFD_FEATURE_WP_UNPOPULATED (1<<13) __u64 features; __u64 ioctls; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 074ea534f786..c7317678cb10 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1177,7 +1177,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, * enabled swap entries. Please see * comment below for pte_uffd_wp(). */ - if (pte_swp_uffd_wp(pteval)) { + if (pte_swp_uffd_wp_any(pteval)) { result = SCAN_PTE_UFFD_WP; goto out_unmap; } diff --git a/mm/memory.c b/mm/memory.c index 6285cad1f4fb..a890b2951b53 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -104,6 +104,20 @@ EXPORT_SYMBOL(mem_map); #endif static vm_fault_t do_fault(struct vm_fault *vmf); +static vm_fault_t do_anonymous_page(struct vm_fault *vmf); +static bool vmf_pte_changed(struct vm_fault *vmf); + +/* + * Return true if the original pte was a uffd-wp pte marker (so the pte was + * wr-protected). + */ +static bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf) +{ + if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)) + return false; + + return pte_marker_uffd_wp(vmf->orig_pte); +} /* * A number of key systems in x86 including ioremap() rely on the assumption @@ -1346,6 +1360,10 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, struct zap_details *details, pte_t pteval) { + /* Zap on anonymous always means dropping everything */ + if (vma_is_anonymous(vma)) + return; + if (zap_drop_file_uffd_wp(details)) return; @@ -1452,8 +1470,12 @@ again: continue; rss[mm_counter(page)]--; } else if (pte_marker_entry_uffd_wp(entry)) { - /* Only drop the uffd-wp marker if explicitly requested */ - if (!zap_drop_file_uffd_wp(details)) + /* + * For anon: always drop the marker; for file: only + * drop the marker if explicitly requested. + */ + if (!vma_is_anonymous(vma) && + !zap_drop_file_uffd_wp(details)) continue; } else if (is_hwpoison_entry(entry) || is_swapin_error_entry(entry)) { @@ -3620,6 +3642,14 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf) return 0; } +static vm_fault_t do_pte_missing(struct vm_fault *vmf) +{ + if (vma_is_anonymous(vmf->vma)) + return do_anonymous_page(vmf); + else + return do_fault(vmf); +} + /* * This is actually a page-missing access, but with uffd-wp special pte * installed. It means this pte was wr-protected before being unmapped. @@ -3630,11 +3660,10 @@ static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf) * Just in case there're leftover special ptes even after the region * got unregistered - we can simply clear them. 
*/ - if (unlikely(!userfaultfd_wp(vmf->vma) || vma_is_anonymous(vmf->vma))) + if (unlikely(!userfaultfd_wp(vmf->vma))) return pte_marker_clear(vmf); - /* do_fault() can handle pte markers too like none pte */ - return do_fault(vmf); + return do_pte_missing(vmf); } static vm_fault_t handle_pte_marker(struct vm_fault *vmf) @@ -3999,6 +4028,7 @@ out_release: */ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { + bool uffd_wp = vmf_orig_pte_uffd_wp(vmf); struct vm_area_struct *vma = vmf->vma; struct folio *folio; vm_fault_t ret = 0; @@ -4032,7 +4062,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) vma->vm_page_prot)); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (!pte_none(*vmf->pte)) { + if (vmf_pte_changed(vmf)) { update_mmu_tlb(vma, vmf->address, vmf->pte); goto unlock; } @@ -4072,7 +4102,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (!pte_none(*vmf->pte)) { + if (vmf_pte_changed(vmf)) { update_mmu_tlb(vma, vmf->address, vmf->pte); goto release; } @@ -4092,6 +4122,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) folio_add_new_anon_rmap(folio, vma, vmf->address); folio_add_lru_vma(folio, vma); setpte: + if (uffd_wp) + entry = pte_mkuffd_wp(entry); set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); /* No need to invalidate - it was non-present before */ @@ -4259,7 +4291,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) { struct vm_area_struct *vma = vmf->vma; - bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte); + bool uffd_wp = vmf_orig_pte_uffd_wp(vmf); bool write = vmf->flags & FAULT_FLAG_WRITE; bool prefault = vmf->address != addr; pte_t entry; @@ -4903,12 +4935,8 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) } } - if (!vmf->pte) { - if (vma_is_anonymous(vmf->vma)) - return do_anonymous_page(vmf); - else - return do_fault(vmf); - } + if (!vmf->pte) + return do_pte_missing(vmf); if (!pte_present(vmf->orig_pte)) return do_swap_page(vmf); diff --git a/mm/mprotect.c b/mm/mprotect.c index 13e84d8c0797..b9da9a5f87fe 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -276,7 +276,15 @@ static long change_pte_range(struct mmu_gather *tlb, } else { /* It must be an none page, or what else?.. */ WARN_ON_ONCE(!pte_none(oldpte)); - if (unlikely(uffd_wp && !vma_is_anonymous(vma))) { + + /* + * Nobody plays with any none ptes besides + * userfaultfd when applying the protections. + */ + if (likely(!uffd_wp)) + continue; + + if (userfaultfd_wp_use_markers(vma)) { /* * For file-backed mem, we need to be able to * wr-protect a none pte, because even if the @@ -320,23 +328,46 @@ static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd) return 0; } -/* Return true if we're uffd wr-protecting file-backed memory, or false */ +/* + * Return true if we want to split THPs into PTE mappings in change + * protection procedure, false otherwise. + */ static inline bool -uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags) +pgtable_split_needed(struct vm_area_struct *vma, unsigned long cp_flags) { + /* + * pte markers only resides in pte level, if we need pte markers, + * we need to split. We cannot wr-protect shmem thp because file + * thp is handled differently when split by erasing the pmd so far. 
+ */ return (cp_flags & MM_CP_UFFD_WP) && !vma_is_anonymous(vma); } /* - * If wr-protecting the range for file-backed, populate pgtable for the case - * when pgtable is empty but page cache exists. When {pte|pmd|...}_alloc() - * failed we treat it the same way as pgtable allocation failures during - * page faults by kicking OOM and returning error. + * Return true if we want to populate pgtables in change protection + * procedure, false otherwise + */ +static inline bool +pgtable_populate_needed(struct vm_area_struct *vma, unsigned long cp_flags) +{ + /* If not within ioctl(UFFDIO_WRITEPROTECT), then don't bother */ + if (!(cp_flags & MM_CP_UFFD_WP)) + return false; + + /* Populate if the userfaultfd mode requires pte markers */ + return userfaultfd_wp_use_markers(vma); +} + +/* + * Populate the pgtable underneath for whatever reason if requested. + * When {pte|pmd|...}_alloc() failed we treat it the same way as pgtable + * allocation failures during page faults by kicking OOM and returning + * error. */ #define change_pmd_prepare(vma, pmd, cp_flags) \ ({ \ long err = 0; \ - if (unlikely(uffd_wp_protect_file(vma, cp_flags))) { \ + if (unlikely(pgtable_populate_needed(vma, cp_flags))) { \ if (pte_alloc(vma->vm_mm, pmd)) \ err = -ENOMEM; \ } \ @@ -351,7 +382,7 @@ uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags) #define change_prepare(vma, high, low, addr, cp_flags) \ ({ \ long err = 0; \ - if (unlikely(uffd_wp_protect_file(vma, cp_flags))) { \ + if (unlikely(pgtable_populate_needed(vma, cp_flags))) { \ low##_t *p = low##_alloc(vma->vm_mm, high, addr); \ if (p == NULL) \ err = -ENOMEM; \ @@ -404,7 +435,7 @@ static inline long change_pmd_range(struct mmu_gather *tlb, if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if ((next - addr != HPAGE_PMD_SIZE) || - uffd_wp_protect_file(vma, cp_flags)) { + pgtable_split_needed(vma, cp_flags)) { __split_huge_pmd(vma, pmd, addr, false, NULL); /* * For file-backed, the pmd could have been -- cgit v1.2.3 From 0289184476c845968ad6ac9083c96cc0f75ca505 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 14 Mar 2023 15:12:50 -0700 Subject: mm: userfaultfd: add UFFDIO_CONTINUE_MODE_WP to install WP PTEs UFFDIO_COPY already has UFFDIO_COPY_MODE_WP, so when installing a new PTE to resolve a missing fault, one can install a write-protected one. This is useful when using UFFDIO_REGISTER_MODE_{MISSING,WP} in combination. This was motivated by testing HugeTLB HGM [1], and in particular its interaction with userfaultfd features. Existing userfaultfd code supports using WP and MINOR modes together (i.e. you can register an area with both enabled), but without this CONTINUE flag the combination is in practice unusable. So, add an analogous UFFDIO_CONTINUE_MODE_WP, which does the same thing as UFFDIO_COPY_MODE_WP, but for *minor* faults. Update the selftest to do some very basic exercising of the new flag. Update Documentation/ to describe how these flags are used (neither the COPY nor the new CONTINUE versions of this mode flag were described there before). [1]: https://patchwork.kernel.org/project/linux-mm/cover/20230218002819.1486479-1-jthoughton@google.com/ Link: https://lkml.kernel.org/r/20230314221250.682452-5-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Acked-by: Peter Xu Acked-by: Mike Rapoport (IBM) Cc: Al Viro Cc: Hugh Dickins Cc: Jan Kara Cc: Liam R. 
Howlett Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Nadav Amit Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/userfaultfd.rst | 8 ++++++++ fs/userfaultfd.c | 8 ++++++-- include/linux/userfaultfd_k.h | 3 ++- include/uapi/linux/userfaultfd.h | 7 +++++++ mm/userfaultfd.c | 5 +++-- tools/testing/selftests/mm/userfaultfd.c | 4 ++++ 6 files changed, 30 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst index bd2226299583..7c304e432205 100644 --- a/Documentation/admin-guide/mm/userfaultfd.rst +++ b/Documentation/admin-guide/mm/userfaultfd.rst @@ -236,6 +236,14 @@ newer kernels, one can also detect the feature UFFD_FEATURE_WP_UNPOPULATED and set the feature bit in advance to make sure none ptes will also be write protected even upon anonymous memory. +When using ``UFFDIO_REGISTER_MODE_WP`` in combination with either +``UFFDIO_REGISTER_MODE_MISSING`` or ``UFFDIO_REGISTER_MODE_MINOR``, when +resolving missing / minor faults with ``UFFDIO_COPY`` or ``UFFDIO_CONTINUE`` +respectively, it may be desirable for the new page / mapping to be +write-protected (so future writes will also result in a WP fault). These ioctls +support a mode flag (``UFFDIO_COPY_MODE_WP`` or ``UFFDIO_CONTINUE_MODE_WP`` +respectively) to configure the mapping this way. + QEMU/KVM ======== diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 8971c3613cc6..8395605790f6 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1893,6 +1893,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) struct uffdio_continue uffdio_continue; struct uffdio_continue __user *user_uffdio_continue; struct userfaultfd_wake_range range; + uffd_flags_t flags = 0; user_uffdio_continue = (struct uffdio_continue __user *)arg; @@ -1917,13 +1918,16 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) uffdio_continue.range.start) { goto out; } - if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE) + if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE | + UFFDIO_CONTINUE_MODE_WP)) goto out; + if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP) + flags |= MFILL_ATOMIC_WP; if (mmget_not_zero(ctx->mm)) { ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start, uffdio_continue.range.len, - &ctx->mmap_changing); + &ctx->mmap_changing, flags); mmput(ctx->mm); } else { return -ESRCH; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 4c477dece540..a2c53e98dfd6 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -83,7 +83,8 @@ extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long len, atomic_t *mmap_changing); extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long len, atomic_t *mmap_changing); + unsigned long len, atomic_t *mmap_changing, + uffd_flags_t flags); extern int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool enable_wp, atomic_t *mmap_changing); diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 90c958952bfc..66dd4cd277bd 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -305,6 +305,13 @@ struct uffdio_writeprotect { struct uffdio_continue { struct uffdio_range range; #define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0) + /* + * UFFDIO_CONTINUE_MODE_WP will map 
the page write protected on + * the fly. UFFDIO_CONTINUE_MODE_WP is available only if the + * write protected ioctl is implemented for the range + * according to the uffdio_register.ioctls. + */ +#define UFFDIO_CONTINUE_MODE_WP ((__u64)1<<1) __u64 mode; /* diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index a9b19b39413d..7f1b5f8b712c 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -693,10 +693,11 @@ ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start, } ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, atomic_t *mmap_changing) + unsigned long len, atomic_t *mmap_changing, + uffd_flags_t flags) { return mfill_atomic(dst_mm, start, 0, len, mmap_changing, - uffd_flags_set_mode(0, MFILL_ATOMIC_CONTINUE)); + uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE)); } long uffd_wp_range(struct vm_area_struct *dst_vma, diff --git a/tools/testing/selftests/mm/userfaultfd.c b/tools/testing/selftests/mm/userfaultfd.c index e030d63c031a..a96d126cb40e 100644 --- a/tools/testing/selftests/mm/userfaultfd.c +++ b/tools/testing/selftests/mm/userfaultfd.c @@ -585,6 +585,8 @@ static void continue_range(int ufd, __u64 start, __u64 len) req.range.start = start; req.range.len = len; req.mode = 0; + if (test_uffdio_wp) + req.mode |= UFFDIO_CONTINUE_MODE_WP; if (ioctl(ufd, UFFDIO_CONTINUE, &req)) err("UFFDIO_CONTINUE failed for address 0x%" PRIx64, @@ -1332,6 +1334,8 @@ static int userfaultfd_minor_test(void) uffdio_register.range.start = (unsigned long)area_dst_alias; uffdio_register.range.len = nr_pages * page_size; uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR; + if (test_uffdio_wp) + uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) err("register failure"); -- cgit v1.2.3 From 244da66cda359227d80ccb41dbcb99da40eae186 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 7 Apr 2023 13:31:30 -0700 Subject: dmaengine: idxd: setup event log configuration Add setup of the event log feature for supported devices. The event log addresses a gap in error reporting on gen 1 DSA devices, where a second error event does not get reported while a first event is pending software handling. The event log provides a circular buffer that the device can push error events to. It is up to the user to create a large enough event log ring in order to capture the expected events. The evl size can be set via the device sysfs attribute. By default, the minimum of 64 entries is supported when the event log is enabled.
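[Editor's note: the sizing arithmetic follows from the evl_ent_size()/evl_size() helpers added in this patch; the helper name below is illustrative.]

/* Entry size is derived from the GENCAP evl_support field; the total
 * ring size is the configured entry count times this value. */
static unsigned int evl_entry_bytes(unsigned int evl_support)
{
	return evl_support ? 32 * (1U << evl_support) : 0;
}

/* e.g. evl_support == 1 gives 64-byte entries; with the default
 * IDXD_EVL_SIZE_MIN of 64 entries the ring occupies 4 KiB. */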
Tested-by: Tony Zhu Signed-off-by: Dave Jiang Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Link: https://lore.kernel.org/r/20230407203143.2189681-4-fenghua.yu@intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/device.c | 89 +++++++++++++++++++++++++++++++++++++++++++- drivers/dma/idxd/idxd.h | 19 ++++++++++ drivers/dma/idxd/init.c | 1 + drivers/dma/idxd/registers.h | 72 ++++++++++++++++++++++++++++++++++- drivers/dma/idxd/sysfs.c | 3 +- include/uapi/linux/idxd.h | 1 + 6 files changed, 181 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 5f321f3b4242..230fe9bb56ae 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -752,6 +752,83 @@ void idxd_device_clear_state(struct idxd_device *idxd) spin_unlock(&idxd->dev_lock); } +static int idxd_device_evl_setup(struct idxd_device *idxd) +{ + union gencfg_reg gencfg; + union evlcfg_reg evlcfg; + union genctrl_reg genctrl; + struct device *dev = &idxd->pdev->dev; + void *addr; + dma_addr_t dma_addr; + int size; + struct idxd_evl *evl = idxd->evl; + + if (!evl) + return 0; + + size = evl_size(idxd); + /* + * Address needs to be page aligned. However, dma_alloc_coherent() provides + * at minimal page size aligned address. No manual alignment required. + */ + addr = dma_alloc_coherent(dev, size, &dma_addr, GFP_KERNEL); + if (!addr) + return -ENOMEM; + + memset(addr, 0, size); + + spin_lock(&evl->lock); + evl->log = addr; + evl->dma = dma_addr; + evl->log_size = size; + + memset(&evlcfg, 0, sizeof(evlcfg)); + evlcfg.bits[0] = dma_addr & GENMASK(63, 12); + evlcfg.size = evl->size; + + iowrite64(evlcfg.bits[0], idxd->reg_base + IDXD_EVLCFG_OFFSET); + iowrite64(evlcfg.bits[1], idxd->reg_base + IDXD_EVLCFG_OFFSET + 8); + + genctrl.bits = ioread32(idxd->reg_base + IDXD_GENCTRL_OFFSET); + genctrl.evl_int_en = 1; + iowrite32(genctrl.bits, idxd->reg_base + IDXD_GENCTRL_OFFSET); + + gencfg.bits = ioread32(idxd->reg_base + IDXD_GENCFG_OFFSET); + gencfg.evl_en = 1; + iowrite32(gencfg.bits, idxd->reg_base + IDXD_GENCFG_OFFSET); + + spin_unlock(&evl->lock); + return 0; +} + +static void idxd_device_evl_free(struct idxd_device *idxd) +{ + union gencfg_reg gencfg; + union genctrl_reg genctrl; + struct device *dev = &idxd->pdev->dev; + struct idxd_evl *evl = idxd->evl; + + gencfg.bits = ioread32(idxd->reg_base + IDXD_GENCFG_OFFSET); + if (!gencfg.evl_en) + return; + + spin_lock(&evl->lock); + gencfg.evl_en = 0; + iowrite32(gencfg.bits, idxd->reg_base + IDXD_GENCFG_OFFSET); + + genctrl.bits = ioread32(idxd->reg_base + IDXD_GENCTRL_OFFSET); + genctrl.evl_int_en = 0; + iowrite32(genctrl.bits, idxd->reg_base + IDXD_GENCTRL_OFFSET); + + iowrite64(0, idxd->reg_base + IDXD_EVLCFG_OFFSET); + iowrite64(0, idxd->reg_base + IDXD_EVLCFG_OFFSET + 8); + + dma_free_coherent(dev, evl->log_size, evl->log, evl->dma); + evl->log = NULL; + evl->size = IDXD_EVL_SIZE_MIN; + spin_unlock(&evl->lock); +} + static void idxd_group_config_write(struct idxd_group *group) { struct idxd_device *idxd = group->idxd; @@ -1451,15 +1528,24 @@ int idxd_device_drv_probe(struct idxd_dev *idxd_dev) if (rc < 0) return -ENXIO; + rc = idxd_device_evl_setup(idxd); + if (rc < 0) { + idxd->cmd_status = IDXD_SCMD_DEV_EVL_ERR; + return rc; + } + /* Start device */ rc = idxd_device_enable(idxd); - if (rc < 0) + if (rc < 0) { + idxd_device_evl_free(idxd); return rc; + } /* Setup DMA device without channels */ rc = idxd_register_dma_device(idxd); if (rc < 0) { idxd_device_disable(idxd); + 
idxd_device_evl_free(idxd); idxd->cmd_status = IDXD_SCMD_DEV_DMA_ERR; return rc; } @@ -1488,6 +1574,7 @@ void idxd_device_drv_remove(struct idxd_dev *idxd_dev) idxd_device_disable(idxd); if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) idxd_device_reset(idxd); + idxd_device_evl_free(idxd); } static enum idxd_dev_type dev_types[] = { diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 2a71273f1822..c74681f02b18 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -262,7 +262,15 @@ struct idxd_driver_data { }; struct idxd_evl { + /* Lock to protect event log access. */ + spinlock_t lock; + void *log; + dma_addr_t dma; + /* Total size of event log = number of entries * entry size. */ + unsigned int log_size; + /* The number of entries in the event log. */ u16 size; + u16 head; }; struct idxd_device { @@ -324,6 +332,17 @@ struct idxd_device { struct idxd_evl *evl; }; +static inline unsigned int evl_ent_size(struct idxd_device *idxd) +{ + return idxd->hw.gen_cap.evl_support ? + (32 * (1 << idxd->hw.gen_cap.evl_support)) : 0; +} + +static inline unsigned int evl_size(struct idxd_device *idxd) +{ + return idxd->evl->size * evl_ent_size(idxd); +} + /* IDXD software descriptor */ struct idxd_desc { union { diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 525de59df82a..f44719a11c95 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -343,6 +343,7 @@ static int idxd_init_evl(struct idxd_device *idxd) if (!evl) return -ENOMEM; + spin_lock_init(&evl->lock); evl->size = IDXD_EVL_SIZE_MIN; idxd->evl = evl; return 0; diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h index ea3a499a3c3c..11bb97cf7481 100644 --- a/drivers/dma/idxd/registers.h +++ b/drivers/dma/idxd/registers.h @@ -3,6 +3,8 @@ #ifndef _IDXD_REGISTERS_H_ #define _IDXD_REGISTERS_H_ +#include + /* PCI Config */ #define PCI_DEVICE_ID_INTEL_DSA_SPR0 0x0b25 #define PCI_DEVICE_ID_INTEL_IAX_SPR0 0x0cfe @@ -119,7 +121,8 @@ union gencfg_reg { u32 rdbuf_limit:8; u32 rsvd:4; u32 user_int_en:1; - u32 rsvd2:19; + u32 evl_en:1; + u32 rsvd2:18; }; u32 bits; } __packed; @@ -129,7 +132,8 @@ union genctrl_reg { struct { u32 softerr_int_en:1; u32 halt_int_en:1; - u32 rsvd:30; + u32 evl_int_en:1; + u32 rsvd:29; }; u32 bits; } __packed; @@ -299,6 +303,21 @@ union iaa_cap_reg { #define IDXD_IAACAP_OFFSET 0x180 +#define IDXD_EVLCFG_OFFSET 0xe0 +union evlcfg_reg { + struct { + u64 pasid_en:1; + u64 priv:1; + u64 rsvd:10; + u64 base_addr:52; + + u64 size:16; + u64 pasid:20; + u64 rsvd2:28; + }; + u64 bits[2]; +} __packed; + #define IDXD_EVL_SIZE_MIN 0x0040 #define IDXD_EVL_SIZE_MAX 0xffff @@ -539,4 +558,53 @@ union filter_cfg { u64 val; } __packed; +struct __evl_entry { + u64 rsvd:2; + u64 desc_valid:1; + u64 wq_idx_valid:1; + u64 batch:1; + u64 fault_rw:1; + u64 priv:1; + u64 err_info_valid:1; + u64 error:8; + u64 wq_idx:8; + u64 batch_id:8; + u64 operation:8; + u64 pasid:20; + u64 rsvd2:4; + + u16 batch_idx; + u16 rsvd3; + union { + /* Invalid Flags 0x11 */ + u32 invalid_flags; + /* Invalid Int Handle 0x19 */ + /* Page fault 0x1a */ + /* Page fault 0x06, 0x1f, only operand_id */ + /* Page fault before drain or in batch, 0x26, 0x27 */ + struct { + u16 int_handle; + u16 rci:1; + u16 ims:1; + u16 rcr:1; + u16 first_err_in_batch:1; + u16 rsvd4_2:9; + u16 operand_id:3; + }; + }; + u64 fault_addr; + u64 rsvd5; +} __packed; + +struct dsa_evl_entry { + struct __evl_entry e; + struct dsa_completion_record cr; +} __packed; + +struct iax_evl_entry { + struct __evl_entry e; + u64 
rsvd[4]; + struct iax_completion_record cr; +} __packed; + #endif diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 85644e5bde83..163fdfaa5022 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -1605,7 +1605,8 @@ static ssize_t event_log_size_store(struct device *dev, if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) return -EPERM; - if (val < IDXD_EVL_SIZE_MIN || val > IDXD_EVL_SIZE_MAX) + if (val < IDXD_EVL_SIZE_MIN || val > IDXD_EVL_SIZE_MAX || + (val * evl_ent_size(idxd) > ULONG_MAX - idxd->evl->dma)) return -EINVAL; idxd->evl->size = val; diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index fc47635b57dc..5d05bf12f2bd 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -30,6 +30,7 @@ enum idxd_scmd_stat { IDXD_SCMD_WQ_NO_PRIV = 0x800f0000, IDXD_SCMD_WQ_IRQ_ERR = 0x80100000, IDXD_SCMD_WQ_USER_NO_IOMMU = 0x80110000, + IDXD_SCMD_DEV_EVL_ERR = 0x80120000, }; #define IDXD_SCMD_SOFTERR_MASK 0x80000000 -- cgit v1.2.3 From 2f431ba908d2ef05da478d10925207728f1ff483 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 7 Apr 2023 13:31:31 -0700 Subject: dmaengine: idxd: add interrupt handling for event log An event log interrupt is raised in the misc interrupt INTCAUSE register when an event is written by the hardware. Add basic event log processing support to the interrupt handler. The event log is a ring where the hardware owns the tail and the software owns the head. The hardware will advance the tail index when an additional event has been pushed to memory. The software will process the log entry and then advance the head. The log is full when (tail + 1) % log_size == head. The hardware will stop writing when the log is full. The user is expected to create a log size large enough to handle all the expected events.
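For illustration, the head/tail handling described above reduces to the following standalone sketch (the names here are made up for the example; the driver's real state lives in struct idxd_evl and the EVLSTATUS register):

#include <stdbool.h>
#include <stdint.h>

struct evl_ring {
	uint16_t head;	/* owned by software */
	uint16_t tail;	/* owned by hardware */
	uint16_t size;	/* number of entries in the log */
};

/* Hardware stops writing when advancing the tail would reach the head. */
static bool evl_ring_full(const struct evl_ring *r)
{
	return (r->tail + 1) % r->size == r->head;
}

/* Software consumes entries from head up to, but not including, tail. */
static void evl_ring_drain(struct evl_ring *r)
{
	while (r->head != r->tail) {
		/* process the entry at index r->head here */
		r->head = (r->head + 1) % r->size;
	}
}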
Tested-by: Tony Zhu Signed-off-by: Dave Jiang Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Link: https://lore.kernel.org/r/20230407203143.2189681-5-fenghua.yu@intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/irq.c | 48 ++++++++++++++++++++++++++++++++++++++++++++ drivers/dma/idxd/registers.h | 19 ++++++++++++++++++ include/uapi/linux/idxd.h | 1 + 3 files changed, 68 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index 0d639303b515..52b8b7d9db22 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -217,6 +217,49 @@ static void idxd_int_handle_revoke(struct work_struct *work) kfree(revoke); } +static void process_evl_entry(struct idxd_device *idxd, struct __evl_entry *entry_head) +{ + struct device *dev = &idxd->pdev->dev; + u8 status; + + status = DSA_COMP_STATUS(entry_head->error); + dev_warn_ratelimited(dev, "Device error %#x operation: %#x fault addr: %#llx\n", + status, entry_head->operation, entry_head->fault_addr); +} + +static void process_evl_entries(struct idxd_device *idxd) +{ + union evl_status_reg evl_status; + unsigned int h, t; + struct idxd_evl *evl = idxd->evl; + struct __evl_entry *entry_head; + unsigned int ent_size = evl_ent_size(idxd); + u32 size; + + evl_status.bits = 0; + evl_status.int_pending = 1; + + spin_lock(&evl->lock); + /* Clear interrupt pending bit */ + iowrite32(evl_status.bits_upper32, + idxd->reg_base + IDXD_EVLSTATUS_OFFSET + sizeof(u32)); + h = evl->head; + evl_status.bits = ioread64(idxd->reg_base + IDXD_EVLSTATUS_OFFSET); + t = evl_status.tail; + size = idxd->evl->size; + + while (h != t) { + entry_head = (struct __evl_entry *)(evl->log + (h * ent_size)); + process_evl_entry(idxd, entry_head); + h = (h + 1) % size; + } + + evl->head = h; + evl_status.head = h; + iowrite32(evl_status.bits_lower32, idxd->reg_base + IDXD_EVLSTATUS_OFFSET); + spin_unlock(&evl->lock); +} + irqreturn_t idxd_misc_thread(int vec, void *data) { struct idxd_irq_entry *irq_entry = data; @@ -304,6 +347,11 @@ irqreturn_t idxd_misc_thread(int vec, void *data) perfmon_counter_overflow(idxd); } + if (cause & IDXD_INTC_EVL) { + val |= IDXD_INTC_EVL; + process_evl_entries(idxd); + } + val ^= cause; if (val) dev_warn_once(dev, "Unexpected interrupt cause bits set: %#x\n", diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h index 11bb97cf7481..148db94f9373 100644 --- a/drivers/dma/idxd/registers.h +++ b/drivers/dma/idxd/registers.h @@ -168,6 +168,7 @@ enum idxd_device_reset_type { #define IDXD_INTC_OCCUPY 0x04 #define IDXD_INTC_PERFMON_OVFL 0x08 #define IDXD_INTC_HALT_STATE 0x10 +#define IDXD_INTC_EVL 0x20 #define IDXD_INTC_INT_HANDLE_REVOKED 0x80000000 #define IDXD_CMD_OFFSET 0xa0 @@ -558,6 +559,24 @@ union filter_cfg { u64 val; } __packed; +#define IDXD_EVLSTATUS_OFFSET 0xf0 + +union evl_status_reg { + struct { + u32 head:16; + u32 rsvd:16; + u32 tail:16; + u32 rsvd2:14; + u32 int_pending:1; + u32 rsvd3:1; + }; + struct { + u32 bits_lower32; + u32 bits_upper32; + }; + u64 bits; +} __packed; + struct __evl_entry { u64 rsvd:2; u64 desc_valid:1; diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index 5d05bf12f2bd..0bc8eea18586 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -170,6 +170,7 @@ enum iax_completion_status { #define DSA_COMP_STATUS_MASK 0x7f #define DSA_COMP_STATUS_WRITE 0x80 +#define DSA_COMP_STATUS(status) ((status) & DSA_COMP_STATUS_MASK) struct dsa_hw_desc { uint32_t pasid:20; -- cgit v1.2.3 From 
5fbe6503b52f5665560584f62adab5db70ac910e Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 7 Apr 2023 13:31:32 -0700 Subject: dmaengine: idxd: add debugfs for event log dump Add debugfs entry to dump the content of the event log for debugging. The function will dump all non-zero entries in the event log. It will note which entries are processed and which entries are still pending processing at the time of the dump. The entries may not always be in chronological order because the log is a circular buffer. Tested-by: Tony Zhu Signed-off-by: Dave Jiang Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Link: https://lore.kernel.org/r/20230407203143.2189681-6-fenghua.yu@intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/Makefile | 2 +- drivers/dma/idxd/debugfs.c | 138 +++++++++++++++++++++++++++++++++++++++++++++ drivers/dma/idxd/idxd.h | 9 +++ drivers/dma/idxd/init.c | 12 ++++ include/uapi/linux/idxd.h | 6 +- 5 files changed, 164 insertions(+), 3 deletions(-) create mode 100644 drivers/dma/idxd/debugfs.c (limited to 'include/uapi/linux') diff --git a/drivers/dma/idxd/Makefile b/drivers/dma/idxd/Makefile index a1e9f2b3a37c..dc096839ac63 100644 --- a/drivers/dma/idxd/Makefile +++ b/drivers/dma/idxd/Makefile @@ -1,7 +1,7 @@ ccflags-y += -DDEFAULT_SYMBOL_NAMESPACE=IDXD obj-$(CONFIG_INTEL_IDXD) += idxd.o -idxd-y := init.o irq.o device.o sysfs.o submit.o dma.o cdev.o +idxd-y := init.o irq.o device.o sysfs.o submit.o dma.o cdev.o debugfs.o idxd-$(CONFIG_INTEL_IDXD_PERFMON) += perfmon.o diff --git a/drivers/dma/idxd/debugfs.c b/drivers/dma/idxd/debugfs.c new file mode 100644 index 000000000000..9cfbd9b14c4c --- /dev/null +++ b/drivers/dma/idxd/debugfs.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2021 Intel Corporation. All rights rsvd.
*/ +#include +#include +#include +#include +#include +#include +#include +#include "idxd.h" +#include "registers.h" + +static struct dentry *idxd_debugfs_dir; + +static void dump_event_entry(struct idxd_device *idxd, struct seq_file *s, + u16 index, int *count, bool processed) +{ + struct idxd_evl *evl = idxd->evl; + struct dsa_evl_entry *entry; + struct dsa_completion_record *cr; + u64 *raw; + int i; + int evl_strides = evl_ent_size(idxd) / sizeof(u64); + + entry = (struct dsa_evl_entry *)evl->log + index; + + if (!entry->e.desc_valid) + return; + + seq_printf(s, "Event Log entry %d (real index %u) processed: %u\n", + *count, index, processed); + + seq_printf(s, "desc valid %u wq idx valid %u\n" + "batch %u fault rw %u priv %u error 0x%x\n" + "wq idx %u op %#x pasid %u batch idx %u\n" + "fault addr %#llx\n", + entry->e.desc_valid, entry->e.wq_idx_valid, + entry->e.batch, entry->e.fault_rw, entry->e.priv, + entry->e.error, entry->e.wq_idx, entry->e.operation, + entry->e.pasid, entry->e.batch_idx, entry->e.fault_addr); + + cr = &entry->cr; + seq_printf(s, "status %#x result %#x fault_info %#x bytes_completed %u\n" + "fault addr %#llx inv flags %#x\n\n", + cr->status, cr->result, cr->fault_info, cr->bytes_completed, + cr->fault_addr, cr->invalid_flags); + + raw = (u64 *)entry; + + for (i = 0; i < evl_strides; i++) + seq_printf(s, "entry[%d] = %#llx\n", i, raw[i]); + + seq_puts(s, "\n"); + *count += 1; +} + +static int debugfs_evl_show(struct seq_file *s, void *d) +{ + struct idxd_device *idxd = s->private; + struct idxd_evl *evl = idxd->evl; + union evl_status_reg evl_status; + u16 h, t, evl_size, i; + int count = 0; + bool processed = true; + + if (!evl || !evl->log) + return 0; + + spin_lock(&evl->lock); + + h = evl->head; + evl_status.bits = ioread64(idxd->reg_base + IDXD_EVLSTATUS_OFFSET); + t = evl_status.tail; + evl_size = evl->size; + + seq_printf(s, "Event Log head %u tail %u interrupt pending %u\n\n", + evl_status.head, evl_status.tail, evl_status.int_pending); + + i = t; + while (1) { + i = (i + 1) % evl_size; + if (i == t) + break; + + if (processed && i == h) + processed = false; + dump_event_entry(idxd, s, i, &count, processed); + } + + spin_unlock(&evl->lock); + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(debugfs_evl); + +int idxd_device_init_debugfs(struct idxd_device *idxd) +{ + if (IS_ERR_OR_NULL(idxd_debugfs_dir)) + return 0; + + idxd->dbgfs_dir = debugfs_create_dir(dev_name(idxd_confdev(idxd)), idxd_debugfs_dir); + if (IS_ERR(idxd->dbgfs_dir)) + return PTR_ERR(idxd->dbgfs_dir); + + if (idxd->evl) { + idxd->dbgfs_evl_file = debugfs_create_file("event_log", 0400, + idxd->dbgfs_dir, idxd, + &debugfs_evl_fops); + if (IS_ERR(idxd->dbgfs_evl_file)) { + debugfs_remove_recursive(idxd->dbgfs_dir); + idxd->dbgfs_dir = NULL; + return PTR_ERR(idxd->dbgfs_evl_file); + } + } + + return 0; +} + +void idxd_device_remove_debugfs(struct idxd_device *idxd) +{ + debugfs_remove_recursive(idxd->dbgfs_dir); +} + +int idxd_init_debugfs(void) +{ + if (!debugfs_initialized()) + return 0; + + idxd_debugfs_dir = debugfs_create_dir(KBUILD_MODNAME, NULL); + if (IS_ERR(idxd_debugfs_dir)) + return PTR_ERR(idxd_debugfs_dir); + return 0; +} + +void idxd_remove_debugfs(void) +{ + debugfs_remove_recursive(idxd_debugfs_dir); +} diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index c74681f02b18..b923b90b7299 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -330,6 +330,9 @@ struct idxd_device { unsigned long *opcap_bmap; struct idxd_evl *evl; + + struct dentry *dbgfs_dir; + struct 
dentry *dbgfs_evl_file; }; static inline unsigned int evl_ent_size(struct idxd_device *idxd) @@ -704,4 +707,10 @@ static inline void perfmon_init(void) {} static inline void perfmon_exit(void) {} #endif +/* debugfs */ +int idxd_device_init_debugfs(struct idxd_device *idxd); +void idxd_device_remove_debugfs(struct idxd_device *idxd); +int idxd_init_debugfs(void); +void idxd_remove_debugfs(void); + #endif diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index f44719a11c95..e21c4b7a1d32 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -669,6 +669,10 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto err_dev_register; } + rc = idxd_device_init_debugfs(idxd); + if (rc) + dev_warn(dev, "IDXD debugfs failed to setup\n"); + dev_info(&pdev->dev, "Intel(R) Accelerator Device (v%x)\n", idxd->hw.version); @@ -731,6 +735,7 @@ static void idxd_remove(struct pci_dev *pdev) idxd_shutdown(pdev); if (device_pasid_enabled(idxd)) idxd_disable_system_pasid(idxd); + idxd_device_remove_debugfs(idxd); irq_entry = idxd_get_ie(idxd, 0); free_irq(irq_entry->vector, irq_entry); @@ -788,6 +793,10 @@ static int __init idxd_init_module(void) if (err) goto err_cdev_register; + err = idxd_init_debugfs(); + if (err) + goto err_debugfs; + err = pci_register_driver(&idxd_pci_driver); if (err) goto err_pci_register; @@ -795,6 +804,8 @@ static int __init idxd_init_module(void) return 0; err_pci_register: + idxd_remove_debugfs(); +err_debugfs: idxd_cdev_remove(); err_cdev_register: idxd_driver_unregister(&idxd_user_drv); @@ -815,5 +826,6 @@ static void __exit idxd_exit_module(void) pci_unregister_driver(&idxd_pci_driver); idxd_cdev_remove(); perfmon_exit(); + idxd_remove_debugfs(); } module_exit(idxd_exit_module); diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index 0bc8eea18586..e86199d09a91 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -311,7 +311,8 @@ struct dsa_completion_record { uint8_t result; uint8_t dif_status; }; - uint16_t rsvd; + uint8_t fault_info; + uint8_t rsvd; uint32_t bytes_completed; uint64_t fault_addr; union { @@ -368,7 +369,8 @@ struct dsa_raw_completion_record { struct iax_completion_record { volatile uint8_t status; uint8_t error_code; - uint16_t rsvd; + uint8_t fault_info; + uint8_t rsvd; uint32_t bytes_completed; uint64_t fault_addr; uint32_t invalid_flags; -- cgit v1.2.3 From c40bd7d9737bdcfb02d42765bc6c59b338151123 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 7 Apr 2023 13:31:36 -0700 Subject: dmaengine: idxd: process user page faults for completion record DSA supports page fault handling through PRS. However, the DMA engine that's processing the descriptor is blocked until the PRS response is received. Other workqueues sharing the engine are also blocked. Page fault handling by the driver with PRS disabled can be used to mitigate the stalling. With PRS disabled while ATS remains enabled, DSA handles page faults on a completion record by reporting an event in the event log. In this instance, the descriptor is completed and the event log contains the completion record address and the contents of the completion record. Add support to the event log handling code to fault in the completion record and copy the content of the completion record to user memory. A bitmap is introduced to keep track of discarded event log entries.
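In rough code terms, the mark-and-skip scheme described next is the following sketch (illustrative helpers only; the driver does the equivalent on evl->bmap while holding evl->lock):

#include <linux/bitops.h>

/* On ->release(), mark a log index whose owner has gone away. */
static void evl_mark_discarded(unsigned long *bmap, unsigned int idx)
{
	set_bit(idx, bmap);
}

/* At processing time: true means clear the mark and skip the entry. */
static bool evl_should_skip(unsigned long *bmap, unsigned int idx)
{
	if (!test_bit(idx, bmap))
		return false;
	clear_bit(idx, bmap);	/* consumed without being processed */
	return true;
}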
When the user process initiates ->release() of the char device, it no longer is interested in any remaining event log entries tied to the relevant wq and PASID. The driver will mark the event log entry index in the bitmap. Upon encountering the entries during processing, the event log handler will just clear the bitmap bit and skip the entry rather than attempt to process the event log entry. Tested-by: Tony Zhu Signed-off-by: Dave Jiang Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Link: https://lore.kernel.org/r/20230407203143.2189681-10-fenghua.yu@intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/cdev.c | 30 ++++++++++++++++ drivers/dma/idxd/device.c | 22 ++++++++++-- drivers/dma/idxd/idxd.h | 2 ++ drivers/dma/idxd/init.c | 2 ++ drivers/dma/idxd/irq.c | 87 ++++++++++++++++++++++++++++++++++++++++++++--- include/uapi/linux/idxd.h | 1 + 6 files changed, 137 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index 8b8a0a0fb054..0a51c33198f6 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -164,6 +164,35 @@ failed: return rc; } +static void idxd_cdev_evl_drain_pasid(struct idxd_wq *wq, u32 pasid) +{ + struct idxd_device *idxd = wq->idxd; + struct idxd_evl *evl = idxd->evl; + union evl_status_reg status; + u16 h, t, size; + int ent_size = evl_ent_size(idxd); + struct __evl_entry *entry_head; + + if (!evl) + return; + + spin_lock(&evl->lock); + status.bits = ioread64(idxd->reg_base + IDXD_EVLSTATUS_OFFSET); + t = status.tail; + h = evl->head; + size = evl->size; + + while (h != t) { + entry_head = (struct __evl_entry *)(evl->log + (h * ent_size)); + if (entry_head->pasid == pasid && entry_head->wq_idx == wq->id) + set_bit(h, evl->bmap); + h = (h + 1) % size; + } + spin_unlock(&evl->lock); + + drain_workqueue(wq->wq); +} + static int idxd_cdev_release(struct inode *node, struct file *filep) { struct idxd_user_context *ctx = filep->private_data; @@ -190,6 +219,7 @@ static int idxd_cdev_release(struct inode *node, struct file *filep) } if (ctx->sva) { + idxd_cdev_evl_drain_pasid(wq, ctx->pasid); iommu_sva_unbind_device(ctx->sva); idxd_xa_pasid_remove(ctx); } diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 230fe9bb56ae..fd97b2b58734 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -762,18 +762,29 @@ static int idxd_device_evl_setup(struct idxd_device *idxd) dma_addr_t dma_addr; int size; struct idxd_evl *evl = idxd->evl; + unsigned long *bmap; + int rc; if (!evl) return 0; size = evl_size(idxd); + + bmap = bitmap_zalloc(size, GFP_KERNEL); + if (!bmap) { + rc = -ENOMEM; + goto err_bmap; + } + /* * Address needs to be page aligned. However, dma_alloc_coherent() provides * at minimal page size aligned address. No manual alignment required. 
*/ addr = dma_alloc_coherent(dev, size, &dma_addr, GFP_KERNEL); - if (!addr) - return -ENOMEM; + if (!addr) { + rc = -ENOMEM; + goto err_alloc; + } memset(addr, 0, size); @@ -781,6 +792,7 @@ static int idxd_device_evl_setup(struct idxd_device *idxd) evl->log = addr; evl->dma = dma_addr; evl->log_size = size; + evl->bmap = bmap; memset(&evlcfg, 0, sizeof(evlcfg)); evlcfg.bits[0] = dma_addr & GENMASK(63, 12); @@ -799,6 +811,11 @@ static int idxd_device_evl_setup(struct idxd_device *idxd) spin_unlock(&evl->lock); return 0; + +err_alloc: + bitmap_free(bmap); +err_bmap: + return rc; } static void idxd_device_evl_free(struct idxd_device *idxd) @@ -824,6 +841,7 @@ static void idxd_device_evl_free(struct idxd_device *idxd) iowrite64(0, idxd->reg_base + IDXD_EVLCFG_OFFSET + 8); dma_free_coherent(dev, evl->log_size, evl->log, evl->dma); + bitmap_free(evl->bmap); evl->log = NULL; evl->size = IDXD_EVL_SIZE_MIN; spin_unlock(&evl->lock); diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index b3f9a12adce2..3963c83165a6 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -264,6 +264,7 @@ struct idxd_driver_data { struct device_type *dev_type; int compl_size; int align; + int evl_cr_off; }; struct idxd_evl { @@ -276,6 +277,7 @@ struct idxd_evl { /* The number of entries in the event log. */ u16 size; u16 head; + unsigned long *bmap; }; struct idxd_evl_fault { diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index e6faff58733d..18344593b83f 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -46,6 +46,7 @@ static struct idxd_driver_data idxd_driver_data[] = { .compl_size = sizeof(struct dsa_completion_record), .align = 32, .dev_type = &dsa_device_type, + .evl_cr_off = offsetof(struct dsa_evl_entry, cr), }, [IDXD_TYPE_IAX] = { .name_prefix = "iax", @@ -53,6 +54,7 @@ static struct idxd_driver_data idxd_driver_data[] = { .compl_size = sizeof(struct iax_completion_record), .align = 64, .dev_type = &iax_device_type, + .evl_cr_off = offsetof(struct iax_evl_entry, cr), }, }; diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index 52b8b7d9db22..96983975f974 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -7,6 +7,8 @@ #include #include #include +#include +#include #include #include "../dmaengine.h" #include "idxd.h" @@ -217,14 +219,89 @@ static void idxd_int_handle_revoke(struct work_struct *work) kfree(revoke); } -static void process_evl_entry(struct idxd_device *idxd, struct __evl_entry *entry_head) +static void idxd_evl_fault_work(struct work_struct *work) +{ + struct idxd_evl_fault *fault = container_of(work, struct idxd_evl_fault, work); + struct idxd_wq *wq = fault->wq; + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + struct __evl_entry *entry_head = fault->entry; + void *cr = (void *)entry_head + idxd->data->evl_cr_off; + int cr_size = idxd->data->compl_size, copied; + + switch (fault->status) { + case DSA_COMP_CRA_XLAT: + case DSA_COMP_DRAIN_EVL: + /* + * Copy completion record to fault_addr in user address space + * that is found by wq and PASID. + */ + copied = idxd_copy_cr(wq, entry_head->pasid, + entry_head->fault_addr, + cr, cr_size); + /* + * The task that triggered the page fault is unknown currently + * because multiple threads may share the user address + * space or the task exits already before this fault. + * So if the copy fails, SIGSEGV can not be sent to the task. + * Just print an error for the failure. 
The user application + * waiting for the completion record will time out on this + * failure. + */ + if (copied != cr_size) { + dev_dbg_ratelimited(dev, "Failed to write to completion record. (%d:%d)\n", + cr_size, copied); + } + break; + default: + dev_dbg_ratelimited(dev, "Unrecognized error code: %#x\n", + DSA_COMP_STATUS(entry_head->error)); + break; + } + + kmem_cache_free(idxd->evl_cache, fault); +} + +static void process_evl_entry(struct idxd_device *idxd, + struct __evl_entry *entry_head, unsigned int index) { struct device *dev = &idxd->pdev->dev; + struct idxd_evl *evl = idxd->evl; u8 status; - status = DSA_COMP_STATUS(entry_head->error); - dev_warn_ratelimited(dev, "Device error %#x operation: %#x fault addr: %#llx\n", - status, entry_head->operation, entry_head->fault_addr); + if (test_bit(index, evl->bmap)) { + clear_bit(index, evl->bmap); + } else { + status = DSA_COMP_STATUS(entry_head->error); + + if (status == DSA_COMP_CRA_XLAT || status == DSA_COMP_DRAIN_EVL) { + struct idxd_evl_fault *fault; + int ent_size = evl_ent_size(idxd); + + if (entry_head->rci) + dev_dbg(dev, "Completion Int Req set, ignoring!\n"); + + if (!entry_head->rcr && status == DSA_COMP_DRAIN_EVL) + return; + + fault = kmem_cache_alloc(idxd->evl_cache, GFP_ATOMIC); + if (fault) { + struct idxd_wq *wq = idxd->wqs[entry_head->wq_idx]; + + fault->wq = wq; + fault->status = status; + memcpy(&fault->entry, entry_head, ent_size); + INIT_WORK(&fault->work, idxd_evl_fault_work); + queue_work(wq->wq, &fault->work); + } else { + dev_warn(dev, "Failed to service fault work.\n"); + } + } else { + dev_warn_ratelimited(dev, "Device error %#x operation: %#x fault addr: %#llx\n", + status, entry_head->operation, + entry_head->fault_addr); + } + } } static void process_evl_entries(struct idxd_device *idxd) @@ -250,7 +327,7 @@ static void process_evl_entries(struct idxd_device *idxd) while (h != t) { entry_head = (struct __evl_entry *)(evl->log + (h * ent_size)); - process_evl_entry(idxd, entry_head); + process_evl_entry(idxd, entry_head, h); h = (h + 1) % size; } diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index e86199d09a91..4b584d5afd87 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -135,6 +135,7 @@ enum dsa_completion_status { DSA_COMP_HW_ERR1, DSA_COMP_HW_ERR_DRB, DSA_COMP_TRANSLATION_FAIL, + DSA_COMP_DRAIN_EVL = 0x26, }; enum iax_completion_status { -- cgit v1.2.3 From 6926987185a3ae92c31b99ce1bfdfb04e95057c0 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 7 Apr 2023 13:31:37 -0700 Subject: dmaengine: idxd: add descs_completed field for completion record The descs_completed field for a completion record is part of a batch descriptor completion record. It takes the same location as bytes_completed in a normal descriptor field. Add to expose to user. 
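From user space, which union member is meaningful depends on the type of descriptor that produced the completion record; a minimal sketch (the helper is hypothetical, only struct dsa_completion_record and its fields come from the uapi header):

#include <stdint.h>
#include <linux/idxd.h>	/* uapi: struct dsa_completion_record */

/* For a batch descriptor the field counts completed descriptors;
 * for any other descriptor it counts completed bytes.
 */
static uint32_t dsa_completed_units(const struct dsa_completion_record *cr,
				    int is_batch)
{
	return is_batch ? cr->descs_completed : cr->bytes_completed;
}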
Tested-by: Tony Zhu Signed-off-by: Dave Jiang Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Link: https://lore.kernel.org/r/20230407203143.2189681-11-fenghua.yu@intel.com Signed-off-by: Vinod Koul --- include/uapi/linux/idxd.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index 4b584d5afd87..76ad71bf751e 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -314,7 +314,10 @@ struct dsa_completion_record { }; uint8_t fault_info; uint8_t rsvd; - uint32_t bytes_completed; + union { + uint32_t bytes_completed; + uint32_t descs_completed; + }; uint64_t fault_addr; union { /* common record */ -- cgit v1.2.3 From 2442b7473ad03671378d2d95651bd6bbe09a0943 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 7 Apr 2023 13:31:38 -0700 Subject: dmaengine: idxd: process batch descriptor completion record faults Add event log processing for faults on user batch descriptor completion records. When encountering an event log entry for a page fault on a completion record, the driver is expected to do the following: 1. If the "first error in batch" bit in event log entry error info is set, discard any previously recorded errors associated with the "batch identifier". 2. Fix the page fault according to the fault address in the event log. If successful, write the completion record to the fault address in user space. 3. If an error is encountered while writing the completion record and it is associated to a descriptor in the batch, the driver associates the error with the batch identifier of the event log entry and tracks it until the event log entry for the corresponding batch descriptor is encountered. While processing an event log entry for a batch descriptor whose error indicates that one or more descriptors in the batch had event log entries, the driver will do the following before writing the batch completion record: 1. If the status field of the completion record is 0x1, the driver will change it to error code 0x5 (one or more operations in batch completed with status not successful) and change the result field to 1. 2. If the status is error code 0x6 (page fault on batch descriptor list address), change the result field to 1. 3. If the status is any other value, the completion record is not changed. 4. Clear the recorded error in preparation for the next batch with the same batch identifier. The result field is for user software to determine whether to set the "Batch Error" flag bit in the descriptor for continuation of partial batch descriptor completion. See DSA spec 2.0 for additional information. If no error has been recorded for the batch, the batch completion record is written to user space as is.
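Taken together, the rules above reduce to roughly this sketch (illustrative names with the status values 0x1/0x5/0x6 open-coded; the driver keys the recorded error off the batch identifier in evl->batch_fail[]):

#include <stdbool.h>
#include <stdint.h>

/* Fix up a batch completion record before writing it to user space. */
static void fixup_batch_cr(uint8_t *status, uint8_t *result, bool *recorded_err)
{
	if (!*recorded_err)
		return;			/* no error recorded: write as is */

	if (*status == 0x1)		/* success, but a desc in the batch failed */
		*status = 0x5;
	if (*status == 0x5 || *status == 0x6)
		*result = 1;		/* user sw may set the "Batch Error" flag */

	*recorded_err = false;		/* rule 4: reset for the next batch */
}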
Tested-by: Tony Zhu Signed-off-by: Dave Jiang Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Link: https://lore.kernel.org/r/20230407203143.2189681-12-fenghua.yu@intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/idxd.h | 3 ++ drivers/dma/idxd/init.c | 4 ++ drivers/dma/idxd/irq.c | 91 ++++++++++++++++++++++++++++++++------------ drivers/dma/idxd/registers.h | 4 +- include/uapi/linux/idxd.h | 1 + 5 files changed, 78 insertions(+), 25 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 3963c83165a6..4c4baa80c731 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -265,6 +265,8 @@ struct idxd_driver_data { int compl_size; int align; int evl_cr_off; + int cr_status_off; + int cr_result_off; }; struct idxd_evl { @@ -278,6 +280,7 @@ struct idxd_evl { u16 size; u16 head; unsigned long *bmap; + bool batch_fail[IDXD_MAX_BATCH_IDENT]; }; struct idxd_evl_fault { diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 18344593b83f..92f60d364392 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -47,6 +47,8 @@ static struct idxd_driver_data idxd_driver_data[] = { .align = 32, .dev_type = &dsa_device_type, .evl_cr_off = offsetof(struct dsa_evl_entry, cr), + .cr_status_off = offsetof(struct dsa_completion_record, status), + .cr_result_off = offsetof(struct dsa_completion_record, result), }, [IDXD_TYPE_IAX] = { .name_prefix = "iax", @@ -55,6 +57,8 @@ static struct idxd_driver_data idxd_driver_data[] = { .align = 64, .dev_type = &iax_device_type, .evl_cr_off = offsetof(struct iax_evl_entry, cr), + .cr_status_off = offsetof(struct iax_completion_record, status), + .cr_result_off = offsetof(struct iax_completion_record, error_code), }, }; diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index 96983975f974..c660d63a3eb8 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -225,37 +225,79 @@ static void idxd_evl_fault_work(struct work_struct *work) struct idxd_wq *wq = fault->wq; struct idxd_device *idxd = wq->idxd; struct device *dev = &idxd->pdev->dev; + struct idxd_evl *evl = idxd->evl; struct __evl_entry *entry_head = fault->entry; void *cr = (void *)entry_head + idxd->data->evl_cr_off; - int cr_size = idxd->data->compl_size, copied; + int cr_size = idxd->data->compl_size; + u8 *status = (u8 *)cr + idxd->data->cr_status_off; + u8 *result = (u8 *)cr + idxd->data->cr_result_off; + int copied, copy_size; + bool *bf; switch (fault->status) { case DSA_COMP_CRA_XLAT: - case DSA_COMP_DRAIN_EVL: - /* - * Copy completion record to fault_addr in user address space - * that is found by wq and PASID. - */ - copied = idxd_copy_cr(wq, entry_head->pasid, - entry_head->fault_addr, - cr, cr_size); - /* - * The task that triggered the page fault is unknown currently - * because multiple threads may share the user address - * space or the task exits already before this fault. - * So if the copy fails, SIGSEGV can not be sent to the task. - * Just print an error for the failure. The user application - * waiting for the completion record will time out on this - * failure. - */ - if (copied != cr_size) { - dev_dbg_ratelimited(dev, "Failed to write to completion record. (%d:%d)\n", - cr_size, copied); + if (entry_head->batch && entry_head->first_err_in_batch) + evl->batch_fail[entry_head->batch_id] = false; + + copy_size = cr_size; + break; + case DSA_COMP_BATCH_EVL_ERR: + bf = &evl->batch_fail[entry_head->batch_id]; + + copy_size = entry_head->rcr || *bf ? 
cr_size : 0; + if (*bf) { + if (*status == DSA_COMP_SUCCESS) + *status = DSA_COMP_BATCH_FAIL; + *result = 1; + *bf = false; } break; + case DSA_COMP_DRAIN_EVL: + copy_size = cr_size; + break; default: - dev_dbg_ratelimited(dev, "Unrecognized error code: %#x\n", - DSA_COMP_STATUS(entry_head->error)); + copy_size = 0; + dev_dbg_ratelimited(dev, "Unrecognized error code: %#x\n", fault->status); + break; + } + + if (copy_size == 0) + return; + + /* + * Copy completion record to fault_addr in user address space + * that is found by wq and PASID. + */ + copied = idxd_copy_cr(wq, entry_head->pasid, entry_head->fault_addr, + cr, copy_size); + /* + * The task that triggered the page fault is unknown currently + * because multiple threads may share the user address + * space or the task exits already before this fault. + * So if the copy fails, SIGSEGV can not be sent to the task. + * Just print an error for the failure. The user application + * waiting for the completion record will time out on this + * failure. + */ + switch (fault->status) { + case DSA_COMP_CRA_XLAT: + if (copied != copy_size) { + dev_dbg_ratelimited(dev, "Failed to write to completion record: (%d:%d)\n", + copy_size, copied); + if (entry_head->batch) + evl->batch_fail[entry_head->batch_id] = true; + } + break; + case DSA_COMP_BATCH_EVL_ERR: + if (copied != copy_size) { + dev_dbg_ratelimited(dev, "Failed to write to batch completion record: (%d:%d)\n", + copy_size, copied); + } + break; + case DSA_COMP_DRAIN_EVL: + if (copied != copy_size) + dev_dbg_ratelimited(dev, "Failed to write to drain completion record: (%d:%d)\n", + copy_size, copied); break; } @@ -274,7 +316,8 @@ static void process_evl_entry(struct idxd_device *idxd, } else { status = DSA_COMP_STATUS(entry_head->error); - if (status == DSA_COMP_CRA_XLAT || status == DSA_COMP_DRAIN_EVL) { + if (status == DSA_COMP_CRA_XLAT || status == DSA_COMP_DRAIN_EVL || + status == DSA_COMP_BATCH_EVL_ERR) { struct idxd_evl_fault *fault; int ent_size = evl_ent_size(idxd); diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h index 148db94f9373..9f3959d001b6 100644 --- a/drivers/dma/idxd/registers.h +++ b/drivers/dma/idxd/registers.h @@ -35,7 +35,7 @@ union gen_cap_reg { u64 drain_readback:1; u64 rsvd2:3; u64 evl_support:2; - u64 rsvd4:1; + u64 batch_continuation:1; u64 max_xfer_shift:5; u64 max_batch_shift:4; u64 max_ims_mult:6; @@ -577,6 +577,8 @@ union evl_status_reg { u64 bits; } __packed; +#define IDXD_MAX_BATCH_IDENT 256 + struct __evl_entry { u64 rsvd:2; u64 desc_valid:1; diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index 76ad71bf751e..606b52e88ce3 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -136,6 +136,7 @@ enum dsa_completion_status { DSA_COMP_HW_ERR_DRB, DSA_COMP_TRANSLATION_FAIL, DSA_COMP_DRAIN_EVL = 0x26, + DSA_COMP_BATCH_EVL_ERR, }; enum iax_completion_status { -- cgit v1.2.3 From ddc65971bb677aa9f6a4c21f76d3133e106f88eb Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Tue, 4 Apr 2023 21:31:48 +0900 Subject: prctl: add PR_GET_AUXV to copy auxv to userspace If a library wants to get information from auxv (for instance, AT_HWCAP/AT_HWCAP2), it has a few options, none of them perfectly reliable or ideal: - Be main or the pre-main startup code, and grub through the stack above main. Doesn't work for a library. - Call libc getauxval. Not ideal for libraries that are trying to be libc-independent and/or don't otherwise require anything from other libraries. - Open and read /proc/self/auxv. 
Doesn't work for libraries that may run in arbitrarily constrained environments that may not have /proc mounted (e.g. libraries that might be used by an init program or a container setup tool). - Assume you're on the main thread and still on the original stack, and try to walk the stack upwards, hoping to find auxv. Extremely bad idea. - Ask the caller to pass auxv in for you. Not ideal for a user-friendly library, and then your caller may have the same problem. Add a prctl that copies current->mm->saved_auxv to a userspace buffer. Link: https://lkml.kernel.org/r/d81864a7f7f43bca6afa2a09fc2e850e4050ab42.1680611394.git.josh@joshtriplett.org Signed-off-by: Josh Triplett Signed-off-by: Andrew Morton --- include/uapi/linux/prctl.h | 2 ++ kernel/sys.c | 15 +++++++++++++++ 2 files changed, 17 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 1312a137f7fb..b99c0be72577 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -290,4 +290,6 @@ struct prctl_mm_map { #define PR_SET_VMA 0x53564d41 # define PR_SET_VMA_ANON_NAME 0 +#define PR_GET_AUXV 0x41555856 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/sys.c b/kernel/sys.c index 351de7916302..26c1399e0654 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2388,6 +2388,16 @@ static inline int prctl_get_mdwe(unsigned long arg2, unsigned long arg3, PR_MDWE_REFUSE_EXEC_GAIN : 0; } +static int prctl_get_auxv(void __user *addr, unsigned long len) +{ + struct mm_struct *mm = current->mm; + unsigned long size = min_t(unsigned long, sizeof(mm->saved_auxv), len); + + if (size && copy_to_user(addr, mm->saved_auxv, size)) + return -EFAULT; + return sizeof(mm->saved_auxv); +} + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -2518,6 +2528,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, else return -EINVAL; break; + case PR_GET_AUXV: + if (arg4 || arg5) + return -EINVAL; + error = prctl_get_auxv((void __user *)arg2, arg3); + break; default: return -EINVAL; } -- cgit v1.2.3 From a3b2aeac9d154e5e15ddbf19de934c0c606b6acd Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Sat, 8 Apr 2023 17:28:35 +0800 Subject: delayacct: track delays from IRQ/SOFTIRQ Delay accounting does not track the delay of IRQ/SOFTIRQ, yet IRQ/SOFTIRQ can have an obvious impact on the productivity of some workloads, such as when workloads are running on a system that is busy handling network IRQ/SOFTIRQ. Knowing the IRQ/SOFTIRQ delay can help users reduce it, for example by setting interrupt affinity or task affinity, or by using a kernel thread for NAPI. This is inspired by "sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ pressure"[1]. Also fix some code indentation problems in older code.
And update tools/accounting/getdelays.c:

/ # ./getdelays -p 156 -di
print delayacct stats ON
printing IO accounting
PID 156

CPU         count     real total  virtual total    delay total  delay average
               15       15836008       16218149      275700790       18.380ms
IO          count    delay total  delay average
                0              0        0.000ms
SWAP        count    delay total  delay average
                0              0        0.000ms
RECLAIM     count    delay total  delay average
                0              0        0.000ms
THRASHING   count    delay total  delay average
                0              0        0.000ms
COMPACT     count    delay total  delay average
                0              0        0.000ms
WPCOPY      count    delay total  delay average
               36        7586118        0.211ms
IRQ         count    delay total  delay average
               42         929161        0.022ms

[1] commit 52b1364ba0b1("sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ pressure") Link: https://lkml.kernel.org/r/202304081728353557233@zte.com.cn Signed-off-by: Yang Yang Cc: Jiang Xuexin Cc: wangyong Cc: junhua huang Cc: Balbir Singh Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Juri Lelli Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- Documentation/accounting/delay-accounting.rst | 7 +++++-- include/linux/delayacct.h | 15 ++++++++++++ include/uapi/linux/taskstats.h | 6 +++++- kernel/delayacct.c | 14 +++++++++++ kernel/sched/core.c | 1 + tools/accounting/getdelays.c | 30 ++++++++++++++++----------- 6 files changed, 58 insertions(+), 15 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/accounting/delay-accounting.rst b/Documentation/accounting/delay-accounting.rst index 79f537c9f160..f61c01fc376e 100644 --- a/Documentation/accounting/delay-accounting.rst +++ b/Documentation/accounting/delay-accounting.rst @@ -16,6 +16,7 @@ d) memory reclaim e) thrashing f) direct compact g) write-protect copy +h) IRQ/SOFTIRQ and makes these statistics available to userspace through the taskstats interface. @@ -49,7 +50,7 @@ this structure. See for a description of the fields pertaining to delay accounting. It will generally be in the form of counters returning the cumulative delay seen for cpu, sync block I/O, swapin, memory reclaim, thrash page -cache, direct compact, write-protect copy etc. +cache, direct compact, write-protect copy, IRQ/SOFTIRQ etc.
Taking the difference of two successive readings of a given counter (say cpu_delay_total) for a task will give the delay @@ -118,7 +119,9 @@ Get sum of delays, since system boot, for all pids with tgid 5:: 0 0 0.000ms COMPACT count delay total delay average 0 0 0.000ms - WPCOPY count delay total delay average + WPCOPY count delay total delay average + 0 0 0.000ms + IRQ count delay total delay average 0 0 0.000ms Get IO accounting for pid 1, it works only with -p:: diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 0da97dba9ef8..6639f48dac36 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -48,10 +48,13 @@ struct task_delay_info { u64 wpcopy_start; u64 wpcopy_delay; /* wait for write-protect copy */ + u64 irq_delay; /* wait for IRQ/SOFTIRQ */ + u32 freepages_count; /* total count of memory reclaim */ u32 thrashing_count; /* total count of thrash waits */ u32 compact_count; /* total count of memory compact */ u32 wpcopy_count; /* total count of write-protect copy */ + u32 irq_count; /* total count of IRQ/SOFTIRQ */ }; #endif @@ -81,6 +84,7 @@ extern void __delayacct_compact_start(void); extern void __delayacct_compact_end(void); extern void __delayacct_wpcopy_start(void); extern void __delayacct_wpcopy_end(void); +extern void __delayacct_irq(struct task_struct *task, u32 delta); static inline void delayacct_tsk_init(struct task_struct *tsk) { @@ -215,6 +219,15 @@ static inline void delayacct_wpcopy_end(void) __delayacct_wpcopy_end(); } +static inline void delayacct_irq(struct task_struct *task, u32 delta) +{ + if (!static_branch_unlikely(&delayacct_key)) + return; + + if (task->delays) + __delayacct_irq(task, delta); +} + #else static inline void delayacct_init(void) {} @@ -253,6 +266,8 @@ static inline void delayacct_wpcopy_start(void) {} static inline void delayacct_wpcopy_end(void) {} +static inline void delayacct_irq(struct task_struct *task, u32 delta) +{} #endif /* CONFIG_TASK_DELAY_ACCT */ diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h index a7f5b11a8f1b..b50b2eb257a0 100644 --- a/include/uapi/linux/taskstats.h +++ b/include/uapi/linux/taskstats.h @@ -34,7 +34,7 @@ */ -#define TASKSTATS_VERSION 13 +#define TASKSTATS_VERSION 14 #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN * in linux/sched.h */ @@ -198,6 +198,10 @@ struct taskstats { /* v13: Delay waiting for write-protect copy */ __u64 wpcopy_count; __u64 wpcopy_delay_total; + + /* v14: Delay waiting for IRQ/SOFTIRQ */ + __u64 irq_count; + __u64 irq_delay_total; }; diff --git a/kernel/delayacct.c b/kernel/delayacct.c index e39cb696cfbd..6f0c358e73d8 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -179,12 +179,15 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp; tmp = d->wpcopy_delay_total + tsk->delays->wpcopy_delay; d->wpcopy_delay_total = (tmp < d->wpcopy_delay_total) ? 0 : tmp; + tmp = d->irq_delay_total + tsk->delays->irq_delay; + d->irq_delay_total = (tmp < d->irq_delay_total) ? 
0 : tmp; d->blkio_count += tsk->delays->blkio_count; d->swapin_count += tsk->delays->swapin_count; d->freepages_count += tsk->delays->freepages_count; d->thrashing_count += tsk->delays->thrashing_count; d->compact_count += tsk->delays->compact_count; d->wpcopy_count += tsk->delays->wpcopy_count; + d->irq_count += tsk->delays->irq_count; raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return 0; @@ -274,3 +277,14 @@ void __delayacct_wpcopy_end(void) ¤t->delays->wpcopy_delay, ¤t->delays->wpcopy_count); } + +void __delayacct_irq(struct task_struct *task, u32 delta) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&task->delays->lock, flags); + task->delays->irq_delay += delta; + task->delays->irq_count++; + raw_spin_unlock_irqrestore(&task->delays->lock, flags); +} + diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0d18c3969f90..5473e831daf3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -704,6 +704,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->prev_irq_time += irq_delta; delta -= irq_delta; psi_account_irqtime(rq->curr, irq_delta); + delayacct_irq(rq->curr, irq_delta); #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING if (static_key_false((¶virt_steal_rq_enabled))) { diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c index 23a15d8f2bf4..1334214546d7 100644 --- a/tools/accounting/getdelays.c +++ b/tools/accounting/getdelays.c @@ -198,17 +198,19 @@ static void print_delayacct(struct taskstats *t) printf("\n\nCPU %15s%15s%15s%15s%15s\n" " %15llu%15llu%15llu%15llu%15.3fms\n" "IO %15s%15s%15s\n" - " %15llu%15llu%15.3fms\n" + " %15llu%15llu%15.3fms\n" "SWAP %15s%15s%15s\n" - " %15llu%15llu%15.3fms\n" + " %15llu%15llu%15.3fms\n" "RECLAIM %12s%15s%15s\n" - " %15llu%15llu%15.3fms\n" + " %15llu%15llu%15.3fms\n" "THRASHING%12s%15s%15s\n" - " %15llu%15llu%15.3fms\n" + " %15llu%15llu%15.3fms\n" "COMPACT %12s%15s%15s\n" - " %15llu%15llu%15.3fms\n" + " %15llu%15llu%15.3fms\n" "WPCOPY %12s%15s%15s\n" - " %15llu%15llu%15.3fms\n", + " %15llu%15llu%15.3fms\n" + "IRQ %15s%15s%15s\n" + " %15llu%15llu%15.3fms\n", "count", "real total", "virtual total", "delay total", "delay average", (unsigned long long)t->cpu_count, @@ -219,27 +221,31 @@ static void print_delayacct(struct taskstats *t) "count", "delay total", "delay average", (unsigned long long)t->blkio_count, (unsigned long long)t->blkio_delay_total, - average_ms((double)t->blkio_delay_total, t->blkio_count), + average_ms((double)t->blkio_delay_total, t->blkio_count), "count", "delay total", "delay average", (unsigned long long)t->swapin_count, (unsigned long long)t->swapin_delay_total, - average_ms((double)t->swapin_delay_total, t->swapin_count), + average_ms((double)t->swapin_delay_total, t->swapin_count), "count", "delay total", "delay average", (unsigned long long)t->freepages_count, (unsigned long long)t->freepages_delay_total, - average_ms((double)t->freepages_delay_total, t->freepages_count), + average_ms((double)t->freepages_delay_total, t->freepages_count), "count", "delay total", "delay average", (unsigned long long)t->thrashing_count, (unsigned long long)t->thrashing_delay_total, - average_ms((double)t->thrashing_delay_total, t->thrashing_count), + average_ms((double)t->thrashing_delay_total, t->thrashing_count), "count", "delay total", "delay average", (unsigned long long)t->compact_count, (unsigned long long)t->compact_delay_total, - average_ms((double)t->compact_delay_total, t->compact_count), + average_ms((double)t->compact_delay_total, t->compact_count), "count", "delay 
total", "delay average", (unsigned long long)t->wpcopy_count, (unsigned long long)t->wpcopy_delay_total, - average_ms((double)t->wpcopy_delay_total, t->wpcopy_count)); + average_ms((double)t->wpcopy_delay_total, t->wpcopy_count), + "count", "delay total", "delay average", + (unsigned long long)t->irq_count, + (unsigned long long)t->irq_delay_total, + average_ms((double)t->irq_delay_total, t->irq_count)); } static void task_context_switch_counts(struct taskstats *t) -- cgit v1.2.3 From 31088f6f7906253ef4577f6a9b84e2d42447dba0 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Tue, 11 Apr 2023 10:27:47 +0100 Subject: uapi/linux/const.h: prefer ISO-friendly __typeof__ typeof is (still) a GNU extension, which means that it cannot be used when building ISO C (e.g. -std=c99). It should therefore be avoided in uapi headers in favour of the ISO-friendly __typeof__. Unfortunately this issue could not be detected by CONFIG_UAPI_HEADER_TEST=y as the __ALIGN_KERNEL() macro is not expanded in any uapi header. This matters from a userspace perspective, not a kernel one. uapi headers and their contents are expected to be usable in a variety of situations, and in particular when building ISO C applications (with -std=c99 or similar). This particular problem can be reproduced by trying to use the __ALIGN_KERNEL macro directly in application code, say: #include int align(int x, int a) { return __KERNEL_ALIGN(x, a); } and trying to build that with -std=c99. Link: https://lkml.kernel.org/r/20230411092747.3759032-1-kevin.brodsky@arm.com Fixes: a79ff731a1b2 ("netfilter: xtables: make XT_ALIGN() usable in exported headers by exporting __ALIGN_KERNEL()") Signed-off-by: Kevin Brodsky Reported-by: Ruben Ayrapetyan Tested-by: Ruben Ayrapetyan Reviewed-by: Petr Vorel Tested-by: Petr Vorel Reviewed-by: Masahiro Yamada Cc: Sam Ravnborg Signed-off-by: Andrew Morton --- include/uapi/linux/const.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/const.h b/include/uapi/linux/const.h index af2a44c08683..a429381e7ca5 100644 --- a/include/uapi/linux/const.h +++ b/include/uapi/linux/const.h @@ -28,7 +28,7 @@ #define _BITUL(x) (_UL(1) << (x)) #define _BITULL(x) (_ULL(1) << (x)) -#define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1) +#define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (__typeof__(x))(a) - 1) #define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) #define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) -- cgit v1.2.3 From af8ececda185078c096852edb4e1d7a2349e6856 Mon Sep 17 00:00:00 2001 From: Viktor Prutyanov Date: Thu, 13 Apr 2023 11:18:54 +0300 Subject: virtio: add VIRTIO_F_NOTIFICATION_DATA feature support According to VirtIO spec v1.2, VIRTIO_F_NOTIFICATION_DATA feature indicates that the driver passes extra data along with the queue notifications. In a split queue case, the extra data is 16-bit available index. In a packed queue case, the extra data is 1-bit wrap counter and 15-bit available index. Add support for this feature for MMIO, channel I/O and modern PCI transports. Signed-off-by: Viktor Prutyanov Acked-by: Jason Wang Reviewed-by: Xuan Zhuo Message-Id: <20230413081855.36643-2-alvaro.karsz@solid-run.com> Signed-off-by: Michael S. 
Tsirkin --- drivers/s390/virtio/virtio_ccw.c | 22 +++++++++++++++++++--- drivers/virtio/virtio_mmio.c | 18 +++++++++++++++++- drivers/virtio/virtio_pci_modern.c | 17 ++++++++++++++++- drivers/virtio/virtio_ring.c | 19 +++++++++++++++++++ include/linux/virtio_ring.h | 2 ++ include/uapi/linux/virtio_config.h | 6 ++++++ 6 files changed, 79 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c index 954fc31b4bc7..02922768b129 100644 --- a/drivers/s390/virtio/virtio_ccw.c +++ b/drivers/s390/virtio/virtio_ccw.c @@ -391,7 +391,7 @@ static void virtio_ccw_drop_indicator(struct virtio_ccw_device *vcdev, ccw_device_dma_free(vcdev->cdev, thinint_area, sizeof(*thinint_area)); } -static bool virtio_ccw_kvm_notify(struct virtqueue *vq) +static inline bool virtio_ccw_do_kvm_notify(struct virtqueue *vq, u32 data) { struct virtio_ccw_vq_info *info = vq->priv; struct virtio_ccw_device *vcdev; @@ -402,12 +402,22 @@ static bool virtio_ccw_kvm_notify(struct virtqueue *vq) BUILD_BUG_ON(sizeof(struct subchannel_id) != sizeof(unsigned int)); info->cookie = kvm_hypercall3(KVM_S390_VIRTIO_CCW_NOTIFY, *((unsigned int *)&schid), - vq->index, info->cookie); + data, info->cookie); if (info->cookie < 0) return false; return true; } +static bool virtio_ccw_kvm_notify(struct virtqueue *vq) +{ + return virtio_ccw_do_kvm_notify(vq, vq->index); +} + +static bool virtio_ccw_kvm_notify_with_data(struct virtqueue *vq) +{ + return virtio_ccw_do_kvm_notify(vq, vring_notification_data(vq)); +} + static int virtio_ccw_read_vq_conf(struct virtio_ccw_device *vcdev, struct ccw1 *ccw, int index) { @@ -495,6 +505,7 @@ static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev, struct ccw1 *ccw) { struct virtio_ccw_device *vcdev = to_vc_device(vdev); + bool (*notify)(struct virtqueue *vq); int err; struct virtqueue *vq = NULL; struct virtio_ccw_vq_info *info; @@ -502,6 +513,11 @@ static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev, unsigned long flags; bool may_reduce; + if (__virtio_test_bit(vdev, VIRTIO_F_NOTIFICATION_DATA)) + notify = virtio_ccw_kvm_notify_with_data; + else + notify = virtio_ccw_kvm_notify; + /* Allocate queue. */ info = kzalloc(sizeof(struct virtio_ccw_vq_info), GFP_KERNEL); if (!info) { @@ -524,7 +540,7 @@ static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev, may_reduce = vcdev->revision > 0; vq = vring_create_virtqueue(i, info->num, KVM_VIRTIO_CCW_RING_ALIGN, vdev, true, may_reduce, ctx, - virtio_ccw_kvm_notify, callback, name); + notify, callback, name); if (!vq) { /* For now, we fail if we can't get the requested size. */ diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c index 3ff746e3f24a..dd4424c14233 100644 --- a/drivers/virtio/virtio_mmio.c +++ b/drivers/virtio/virtio_mmio.c @@ -285,6 +285,16 @@ static bool vm_notify(struct virtqueue *vq) return true; } +static bool vm_notify_with_data(struct virtqueue *vq) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev); + u32 data = vring_notification_data(vq); + + writel(data, vm_dev->base + VIRTIO_MMIO_QUEUE_NOTIFY); + + return true; +} + /* Notify all virtqueues on an interrupt. 
*/ static irqreturn_t vm_interrupt(int irq, void *opaque) { @@ -363,12 +373,18 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned int in const char *name, bool ctx) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + bool (*notify)(struct virtqueue *vq); struct virtio_mmio_vq_info *info; struct virtqueue *vq; unsigned long flags; unsigned int num; int err; + if (__virtio_test_bit(vdev, VIRTIO_F_NOTIFICATION_DATA)) + notify = vm_notify_with_data; + else + notify = vm_notify; + if (!name) return NULL; @@ -397,7 +413,7 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned int in /* Create the vring */ vq = vring_create_virtqueue(index, num, VIRTIO_MMIO_VRING_ALIGN, vdev, - true, true, ctx, vm_notify, callback, name); + true, true, ctx, notify, callback, name); if (!vq) { err = -ENOMEM; goto error_new_virtqueue; diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 6e713904d8e8..d6bb68ba84e5 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -288,6 +288,15 @@ static u16 vp_config_vector(struct virtio_pci_device *vp_dev, u16 vector) return vp_modern_config_vector(&vp_dev->mdev, vector); } +static bool vp_notify_with_data(struct virtqueue *vq) +{ + u32 data = vring_notification_data(vq); + + iowrite32(data, (void __iomem *)vq->priv); + + return true; +} + static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, struct virtio_pci_vq_info *info, unsigned int index, @@ -298,10 +307,16 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, { struct virtio_pci_modern_device *mdev = &vp_dev->mdev; + bool (*notify)(struct virtqueue *vq); struct virtqueue *vq; u16 num; int err; + if (__virtio_test_bit(&vp_dev->vdev, VIRTIO_F_NOTIFICATION_DATA)) + notify = vp_notify_with_data; + else + notify = vp_notify; + if (index >= vp_modern_get_num_queues(mdev)) return ERR_PTR(-EINVAL); @@ -316,7 +331,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, vq = vring_create_virtqueue(index, num, SMP_CACHE_BYTES, &vp_dev->vdev, true, true, ctx, - vp_notify, callback, name); + notify, callback, name); if (!vq) return ERR_PTR(-ENOMEM); diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 34e3849629da..c5310eaf8b46 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -2762,6 +2762,23 @@ void vring_del_virtqueue(struct virtqueue *_vq) } EXPORT_SYMBOL_GPL(vring_del_virtqueue); +u32 vring_notification_data(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + u16 next; + + if (vq->packed_ring) + next = (vq->packed.next_avail_idx & + ~(-(1 << VRING_PACKED_EVENT_F_WRAP_CTR))) | + vq->packed.avail_wrap_counter << + VRING_PACKED_EVENT_F_WRAP_CTR; + else + next = vq->split.avail_idx_shadow; + + return next << 16 | _vq->index; +} +EXPORT_SYMBOL_GPL(vring_notification_data); + /* Manipulates transport-specific feature bits. */ void vring_transport_features(struct virtio_device *vdev) { @@ -2781,6 +2798,8 @@ void vring_transport_features(struct virtio_device *vdev) break; case VIRTIO_F_ORDER_PLATFORM: break; + case VIRTIO_F_NOTIFICATION_DATA: + break; default: /* We don't understand this bit. 
*/ __virtio_clear_bit(vdev, i); diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index 8b95b69ef694..2550c9170f4f 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -117,4 +117,6 @@ void vring_del_virtqueue(struct virtqueue *vq); void vring_transport_features(struct virtio_device *vdev); irqreturn_t vring_interrupt(int irq, void *_vq); + +u32 vring_notification_data(struct virtqueue *_vq); #endif /* _LINUX_VIRTIO_RING_H */ diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h index 3c05162bc988..2c712c654165 100644 --- a/include/uapi/linux/virtio_config.h +++ b/include/uapi/linux/virtio_config.h @@ -99,6 +99,12 @@ */ #define VIRTIO_F_SR_IOV 37 +/* + * This feature indicates that the driver passes extra data (besides + * identifying the virtqueue) in its device notifications. + */ +#define VIRTIO_F_NOTIFICATION_DATA 38 + /* * This feature indicates that the driver can reset a queue individually. */ -- cgit v1.2.3 From d7597f59d1d33e9efbffa7060deb9ee5bd119e62 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Mon, 17 Apr 2023 22:13:40 -0700 Subject: mm: add new api to enable ksm per process Patch series "mm: process/cgroup ksm support", v9. So far KSM can only be enabled by calling madvise for memory regions. To be able to use KSM for more workloads, KSM needs to have the ability to be enabled / disabled at the process / cgroup level. Use case 1: The madvise call is not available in the programming language. Examples of this are programs with forked workloads using a garbage collected language without pointers. In such a language madvise cannot be made available. In addition the addresses of objects get moved around as they are garbage collected. KSM sharing needs to be enabled "from the outside" for these types of workloads. Use case 2: The same interpreter can also be used for workloads where KSM brings no benefit or even has overhead. We'd like to be able to enable KSM on a workload-by-workload basis. Use case 3: With the madvise call sharing opportunities are only enabled for the current process: it is a workload-local decision. A considerable number of sharing opportunities may exist across multiple workloads or jobs (if they are part of the same security domain). Only a higher-level entity like a job scheduler or container can know for certain if it's running one or more instances of a job. That job scheduler however doesn't have the necessary internal workload knowledge to make targeted madvise calls. Security concerns: In previous discussions security concerns have been brought up. The problem is that an individual workload does not have the knowledge about what else is running on a machine. Therefore it has to be very conservative about which memory areas can be shared. However, if the system is dedicated to running multiple jobs within the same security domain, it's the job scheduler that has the knowledge that sharing can be safely enabled and is even desirable. Performance: Experiments with using UKSM have shown a capacity increase of around 20%.
From d7597f59d1d33e9efbffa7060deb9ee5bd119e62 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Mon, 17 Apr 2023 22:13:40 -0700 Subject: mm: add new api to enable ksm per process Patch series "mm: process/cgroup ksm support", v9. So far KSM can only be enabled by calling madvise for memory regions. To be able to use KSM for more workloads, KSM needs to have the ability to be enabled / disabled at the process / cgroup level. Use case 1: The madvise call is not available in the programming language. Examples of this are programs with forked workloads using a garbage-collected language without pointers. In such a language madvise cannot be made available. In addition, the addresses of objects get moved around as they are garbage collected. KSM sharing needs to be enabled "from the outside" for these types of workloads. Use case 2: The same interpreter can also be used for workloads where KSM brings no benefit or even has overhead. We'd like to be able to enable KSM on a workload-by-workload basis. Use case 3: With the madvise call, sharing opportunities are only enabled for the current process: it is a workload-local decision. A considerable number of sharing opportunities may exist across multiple workloads or jobs (if they are part of the same security domain). Only a higher-level entity like a job scheduler or container can know for certain if it is running one or more instances of a job. That job scheduler, however, doesn't have the necessary internal workload knowledge to make targeted madvise calls. Security concerns: In previous discussions security concerns have been brought up. The problem is that an individual workload does not have the knowledge about what else is running on a machine. Therefore it has to be very conservative about which memory areas can be shared. However, if the system is dedicated to running multiple jobs within the same security domain, it's the job scheduler that has the knowledge that sharing can be safely enabled and is even desirable. Performance: Experiments with using UKSM have shown a capacity increase of around 20%. Here are the metrics from an Instagram workload (taken from a machine with 64GB main memory): full_scans: 445 general_profit: 20158298048 max_page_sharing: 256 merge_across_nodes: 1 pages_shared: 129547 pages_sharing: 5119146 pages_to_scan: 4000 pages_unshared: 1760924 pages_volatile: 10761341 run: 1 sleep_millisecs: 20 stable_node_chains: 167 stable_node_chains_prune_millisecs: 2000 stable_node_dups: 2751 use_zero_pages: 0 zero_pages_sharing: 0 After the service has been running for 30 minutes to an hour, 4 to 5 million shared pages are common for this workload when using KSM. Detailed changes: 1. New options for prctl system command This patch series adds two new options to the prctl system call. The first one allows KSM to be enabled at the process level and the second one queries the setting. The setting will be inherited by child processes. With the above setting, KSM can be enabled for the seed process of a cgroup and all processes in the cgroup will inherit the setting. 2. Changes to KSM processing When KSM is enabled at the process level, the KSM code will iterate over all the VMAs and enable KSM for the eligible VMAs. When forking a process that has KSM enabled, the setting will be inherited by the new child process. 3. Add general_profit metric The general_profit metric of KSM is specified in the documentation, but not calculated. This adds the general profit metric to /sys/kernel/mm/ksm. 4. Add more metrics to ksm_stat This adds the process profit metric to /proc/<pid>/ksm_stat. 5. Add more tests to ksm_tests and ksm_functional_tests This adds an option to specify the merge type to the ksm_tests, which allows testing both madvise- and prctl-based KSM. It also adds two new tests to ksm_functional_tests: one to test the new prctl options, and the other a fork test to verify that the KSM process setting is inherited by child processes. This patch (of 3): So far KSM can only be enabled by calling madvise for memory regions. To be able to use KSM for more workloads, KSM needs to have the ability to be enabled / disabled at the process / cgroup level. 1. New options for prctl system command This patch series adds two new options to the prctl system call. The first one allows KSM to be enabled at the process level and the second one queries the setting. The setting will be inherited by child processes. With the above setting, KSM can be enabled for the seed process of a cgroup and all processes in the cgroup will inherit the setting. 2. Changes to KSM processing When KSM is enabled at the process level, the KSM code will iterate over all the VMAs and enable KSM for the eligible VMAs. When forking a process that has KSM enabled, the setting will be inherited by the new child process. 1) Introduce new MMF_VM_MERGE_ANY flag This introduces the new MMF_VM_MERGE_ANY flag. When this flag is set, kernel samepage merging (KSM) gets enabled for all VMAs of a process. 2) Setting VM_MERGEABLE on VMA creation When a VMA is created, if the MMF_VM_MERGE_ANY flag is set, the VM_MERGEABLE flag will be set for this VMA. 3) Support disabling of KSM for a process This adds the ability to disable KSM for a process if KSM has been enabled for the process with prctl. 4) Add new prctl options to get and set KSM for a process This adds two new options to the prctl system call (a usage sketch follows below): - enable KSM for all VMAs of a process (if the VMAs support it). - query if KSM has been enabled for a process.
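To make the new interface concrete, here is a minimal userspace sketch; it assumes a libc whose headers predate these constants, so the values are copied from the include/uapi/linux/prctl.h hunk in the patch below:

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_MEMORY_MERGE
#define PR_SET_MEMORY_MERGE 67	/* values from the prctl.h hunk below */
#define PR_GET_MEMORY_MERGE 68
#endif

int main(void)
{
	/* Opt the whole process into KSM; arg3..arg5 must be zero. */
	if (prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0))
		perror("PR_SET_MEMORY_MERGE");

	/* Returns 1 if MMF_VM_MERGE_ANY is set, 0 if not, -1 on error. */
	printf("merge-any: %d\n", prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0));
	return 0;
}

Children forked after the first call inherit the setting, which is what makes the "seed process of a cgroup" pattern described above work.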
3. Disabling MMF_VM_MERGE_ANY for storage keys in s390 In the s390 architecture, when storage keys are used, the MMF_VM_MERGE_ANY flag will be disabled. Link: https://lkml.kernel.org/r/20230418051342.1919757-1-shr@devkernel.io Link: https://lkml.kernel.org/r/20230418051342.1919757-2-shr@devkernel.io Signed-off-by: Stefan Roesch Acked-by: David Hildenbrand Cc: David Hildenbrand Cc: Johannes Weiner Cc: Michal Hocko Cc: Rik van Riel Cc: Bagas Sanjaya Signed-off-by: Andrew Morton --- arch/s390/mm/gmap.c | 7 +++ include/linux/ksm.h | 21 ++++++++- include/linux/sched/coredump.h | 1 + include/uapi/linux/prctl.h | 2 + kernel/sys.c | 27 +++++++++++ mm/ksm.c | 104 ++++++++++++++++++++++++++++++++++------- mm/mmap.c | 3 ++ 7 files changed, 146 insertions(+), 19 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 5a716bdcba05..0949811761e6 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2591,6 +2591,13 @@ int gmap_mark_unmergeable(void) int ret; VMA_ITERATOR(vmi, mm, 0); + /* + * Make sure to disable KSM (if enabled for the whole process or + * individual VMAs). Note that nothing currently hinders user space + * from re-enabling it. + */ + clear_bit(MMF_VM_MERGE_ANY, &mm->flags); + for_each_vma(vmi, vma) { /* Copy vm_flags to avoid partial modifications in ksm_madvise */ vm_flags = vma->vm_flags; diff --git a/include/linux/ksm.h b/include/linux/ksm.h index d9e701326a88..4647b0c70c12 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -18,13 +18,26 @@ #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags); + +void ksm_add_vma(struct vm_area_struct *vma); +int ksm_enable_merge_any(struct mm_struct *mm); + int __ksm_enter(struct mm_struct *mm); void __ksm_exit(struct mm_struct *mm); static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { - if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) - return __ksm_enter(mm); + int ret; + + if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) { + ret = __ksm_enter(mm); + if (ret) + return ret; + } + + if (test_bit(MMF_VM_MERGE_ANY, &oldmm->flags)) + set_bit(MMF_VM_MERGE_ANY, &mm->flags); + return 0; } @@ -57,6 +70,10 @@ void collect_procs_ksm(struct page *page, struct list_head *to_kill, #endif #else /* !CONFIG_KSM */ +static inline void ksm_add_vma(struct vm_area_struct *vma) +{ +} + static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { return 0; diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 0e17ae7fbfd3..0ee96ea7a0e9 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -90,4 +90,5 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK) +#define MMF_VM_MERGE_ANY 29 #endif /* _LINUX_SCHED_COREDUMP_H */ diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index b99c0be72577..f23d9a16507f 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -292,4 +292,6 @@ struct prctl_mm_map { #define PR_GET_AUXV 0x41555856 +#define PR_SET_MEMORY_MERGE 67 +#define PR_GET_MEMORY_MERGE 68 #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/sys.c b/kernel/sys.c index 26c1399e0654..72cdb16e2636 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -2687,6 +2688,32 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long,
arg2, unsigned long, arg3, case PR_SET_VMA: error = prctl_set_vma(arg2, arg3, arg4, arg5); break; +#ifdef CONFIG_KSM + case PR_SET_MEMORY_MERGE: + if (arg3 || arg4 || arg5) + return -EINVAL; + if (mmap_write_lock_killable(me->mm)) + return -EINTR; + + if (arg2) { + error = ksm_enable_merge_any(me->mm); + } else { + /* + * TODO: we might want disable KSM on all VMAs and + * trigger unsharing to completely disable KSM. + */ + clear_bit(MMF_VM_MERGE_ANY, &me->mm->flags); + error = 0; + } + mmap_write_unlock(me->mm); + break; + case PR_GET_MEMORY_MERGE: + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + + error = !!test_bit(MMF_VM_MERGE_ANY, &me->mm->flags); + break; +#endif default: error = -EINVAL; break; diff --git a/mm/ksm.c b/mm/ksm.c index 37c63310bc4e..35ac6c741572 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -515,6 +515,28 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; } +static bool vma_ksm_compatible(struct vm_area_struct *vma) +{ + if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE | VM_PFNMAP | + VM_IO | VM_DONTEXPAND | VM_HUGETLB | + VM_MIXEDMAP)) + return false; /* just ignore the advice */ + + if (vma_is_dax(vma)) + return false; + +#ifdef VM_SAO + if (vma->vm_flags & VM_SAO) + return false; +#endif +#ifdef VM_SPARC_ADI + if (vma->vm_flags & VM_SPARC_ADI) + return false; +#endif + + return true; +} + static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm, unsigned long addr) { @@ -1026,6 +1048,7 @@ mm_exiting: mm_slot_free(mm_slot_cache, mm_slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); + clear_bit(MMF_VM_MERGE_ANY, &mm->flags); mmdrop(mm); } else spin_unlock(&ksm_mmlist_lock); @@ -2408,6 +2431,7 @@ no_vmas: mm_slot_free(mm_slot_cache, mm_slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); + clear_bit(MMF_VM_MERGE_ANY, &mm->flags); mmap_read_unlock(mm); mmdrop(mm); } else { @@ -2485,6 +2509,66 @@ static int ksm_scan_thread(void *nothing) return 0; } +static void __ksm_add_vma(struct vm_area_struct *vma) +{ + unsigned long vm_flags = vma->vm_flags; + + if (vm_flags & VM_MERGEABLE) + return; + + if (vma_ksm_compatible(vma)) + vm_flags_set(vma, VM_MERGEABLE); +} + +/** + * ksm_add_vma - Mark vma as mergeable if compatible + * + * @vma: Pointer to vma + */ +void ksm_add_vma(struct vm_area_struct *vma) +{ + struct mm_struct *mm = vma->vm_mm; + + if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) + __ksm_add_vma(vma); +} + +static void ksm_add_vmas(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + + VMA_ITERATOR(vmi, mm, 0); + for_each_vma(vmi, vma) + __ksm_add_vma(vma); +} + +/** + * ksm_enable_merge_any - Add mm to mm ksm list and enable merging on all + * compatible VMA's + * + * @mm: Pointer to mm + * + * Returns 0 on success, otherwise error code + */ +int ksm_enable_merge_any(struct mm_struct *mm) +{ + int err; + + if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) + return 0; + + if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { + err = __ksm_enter(mm); + if (err) + return err; + } + + set_bit(MMF_VM_MERGE_ANY, &mm->flags); + ksm_add_vmas(mm); + + return 0; +} + int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags) { @@ -2493,25 +2577,10 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, switch (advice) { case MADV_MERGEABLE: - /* - * Be somewhat over-protective for now! 
- */ - if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | - VM_PFNMAP | VM_IO | VM_DONTEXPAND | - VM_HUGETLB | VM_MIXEDMAP)) - return 0; /* just ignore the advice */ - - if (vma_is_dax(vma)) + if (vma->vm_flags & VM_MERGEABLE) return 0; - -#ifdef VM_SAO - if (*vm_flags & VM_SAO) + if (!vma_ksm_compatible(vma)) return 0; -#endif -#ifdef VM_SPARC_ADI - if (*vm_flags & VM_SPARC_ADI) - return 0; -#endif if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { err = __ksm_enter(mm); @@ -2615,6 +2684,7 @@ void __ksm_exit(struct mm_struct *mm) if (easy_to_free) { mm_slot_free(mm_slot_cache, mm_slot); + clear_bit(MMF_VM_MERGE_ANY, &mm->flags); clear_bit(MMF_VM_MERGEABLE, &mm->flags); mmdrop(mm); } else if (mm_slot) { diff --git a/mm/mmap.c b/mm/mmap.c index 790cc62c0038..51b6976fd525 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -2729,6 +2730,7 @@ unmap_writable: if (file && vm_flags & VM_SHARED) mapping_unmap_writable(file->f_mapping); file = vma->vm_file; + ksm_add_vma(vma); expanded: perf_event_mmap(vma); @@ -3001,6 +3003,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, goto mas_store_fail; mm->map_count++; + ksm_add_vma(vma); out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; -- cgit v1.2.3 From 3db166d6cf0ea73dd2c887036aad2e95e0884d9b Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 18 Apr 2023 10:39:01 -0700 Subject: cxl/mbox: Deprecate poison commands The CXL subsystem is adding formal mechanisms for managing device poison. Minimize the maintenance burden going forward, and maximize the investment in common tooling by deprecating direct user access to poison commands outside of CXL_MEM_RAW_COMMANDS debug scenarios. A new cxl_deprecated_commands[] list is created for querying which command IDs defined in previous kernels are now deprecated. CXL Media and Poison Management commands, opcodes 0x43XX, defined in CXL 3.0 Spec, Table 8-93, are deprecated with one exception: Get Scan Media Capabilities. Keep Get Scan Media Capabilities as it simply provides information and has no impact on the device state. Effectively all of the commands defined in: commit 87815ee9d006 ("cxl/pci: Add media provisioning required commands") ...were defined prematurely and should have waited until the kernel implementation was decided. To my knowledge there are no shipping devices with poison support and no known tools that would regress with this change.
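Since cxl_mem.h is a uapi header, the new table is directly visible to tooling. A hedged sketch of how a consumer might use it; cxl_cmd_is_deprecated() is hypothetical, only cxl_deprecated_commands[] and CXL_MEM_COMMAND_ID_MAX come from the header changes below:

#include <linux/cxl_mem.h>

/* Return nonzero when a command id (e.g. from CXL_MEM_QUERY_COMMANDS
 * output) refers to one of the deprecated poison/scan-media commands. */
static int cxl_cmd_is_deprecated(unsigned int id)
{
	if (id >= CXL_MEM_COMMAND_ID_MAX)
		return 0;
	return cxl_deprecated_commands[id];
}

A tool listing commands could pair this with cxl_command_names[] (also in the header) to label deprecated entries instead of silently hiding them.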
Co-developed-by: Alison Schofield Signed-off-by: Alison Schofield Link: https://lore.kernel.org/r/652197e9bc8885e6448d989405b9e50ee9d6b0a6.1681838291.git.alison.schofield@intel.com Tested-by: Jonathan Cameron Signed-off-by: Dan Williams --- drivers/cxl/core/mbox.c | 5 ----- include/uapi/linux/cxl_mem.h | 35 ++++++++++++++++++++++++++++++----- 2 files changed, 30 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index f2addb457172..938cff2c948e 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -61,12 +61,7 @@ static struct cxl_mem_command cxl_mem_commands[CXL_MEM_COMMAND_ID_MAX] = { CXL_CMD(SET_ALERT_CONFIG, 0xc, 0, 0), CXL_CMD(GET_SHUTDOWN_STATE, 0, 0x1, 0), CXL_CMD(SET_SHUTDOWN_STATE, 0x1, 0, 0), - CXL_CMD(GET_POISON, 0x10, CXL_VARIABLE_PAYLOAD, 0), - CXL_CMD(INJECT_POISON, 0x8, 0, 0), - CXL_CMD(CLEAR_POISON, 0x48, 0, 0), CXL_CMD(GET_SCAN_MEDIA_CAPS, 0x10, 0x4, 0), - CXL_CMD(SCAN_MEDIA, 0x11, 0, 0), - CXL_CMD(GET_SCAN_MEDIA, 0, CXL_VARIABLE_PAYLOAD, 0), }; /* diff --git a/include/uapi/linux/cxl_mem.h b/include/uapi/linux/cxl_mem.h index 86bbacf2a315..14bc6e742148 100644 --- a/include/uapi/linux/cxl_mem.h +++ b/include/uapi/linux/cxl_mem.h @@ -40,19 +40,22 @@ ___C(SET_ALERT_CONFIG, "Set Alert Configuration"), \ ___C(GET_SHUTDOWN_STATE, "Get Shutdown State"), \ ___C(SET_SHUTDOWN_STATE, "Set Shutdown State"), \ - ___C(GET_POISON, "Get Poison List"), \ - ___C(INJECT_POISON, "Inject Poison"), \ - ___C(CLEAR_POISON, "Clear Poison"), \ + ___DEPRECATED(GET_POISON, "Get Poison List"), \ + ___DEPRECATED(INJECT_POISON, "Inject Poison"), \ + ___DEPRECATED(CLEAR_POISON, "Clear Poison"), \ ___C(GET_SCAN_MEDIA_CAPS, "Get Scan Media Capabilities"), \ - ___C(SCAN_MEDIA, "Scan Media"), \ - ___C(GET_SCAN_MEDIA, "Get Scan Media Results"), \ + ___DEPRECATED(SCAN_MEDIA, "Scan Media"), \ + ___DEPRECATED(GET_SCAN_MEDIA, "Get Scan Media Results"), \ ___C(MAX, "invalid / last command") #define ___C(a, b) CXL_MEM_COMMAND_ID_##a +#define ___DEPRECATED(a, b) CXL_MEM_DEPRECATED_ID_##a enum { CXL_CMDS }; #undef ___C +#undef ___DEPRECATED #define ___C(a, b) { b } +#define ___DEPRECATED(a, b) { "Deprecated " b } static const struct { const char *name; } cxl_command_names[] __attribute__((__unused__)) = { CXL_CMDS }; @@ -68,6 +71,28 @@ static const struct { */ #undef ___C +#undef ___DEPRECATED +#define ___C(a, b) (0) +#define ___DEPRECATED(a, b) (1) + +static const __u8 cxl_deprecated_commands[] + __attribute__((__unused__)) = { CXL_CMDS }; + +/* + * Here's how this actually breaks out: + * cxl_deprecated_commands[] = { + * [CXL_MEM_COMMAND_ID_INVALID] = 0, + * [CXL_MEM_COMMAND_ID_IDENTIFY] = 0, + * ... + * [CXL_MEM_DEPRECATED_ID_GET_POISON] = 1, + * [CXL_MEM_DEPRECATED_ID_INJECT_POISON] = 1, + * [CXL_MEM_DEPRECATED_ID_CLEAR_POISON] = 1, + * ... + * }; + */ + +#undef ___C +#undef ___DEPRECATED /** * struct cxl_command_info - Command information returned from a query. -- cgit v1.2.3 From 9280c577431401544e63dfb489a830a42bee25eb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 20 Apr 2023 13:56:31 -0400 Subject: NFSD: Handle new xprtsec= export option Enable administrators to require clients to use transport layer security when accessing particular exports. 
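For context, the administrator-facing knob would look something like the following /etc/exports entry. The option spelling is an assumption here: it is implemented by the exportfs tooling in nfs-utils rather than by this kernel patch, which only defines the three mode bits (none=0x1, tls=0x2, mtls=0x4) parsed by the new xprtsec_parse() helper below.

# Hypothetical export: permit only TLS-protected transports, with or
# without a client certificate; cleartext mounts are refused.
/export  *(rw,sync,xprtsec=tls:mtls)

When the kernel's export cache receives the parsed list, the modes are ORed into ex_xprtsec_modes, and check_nfsd_access() then matches them against the XPT_TLS_SESSION and XPT_PEER_AUTH transport bits.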
Signed-off-by: Chuck Lever --- fs/nfsd/export.c | 51 +++++++++++++++++++++++++++++++++++++--- fs/nfsd/export.h | 1 + include/uapi/linux/nfsd/export.h | 13 ++++++++++ 3 files changed, 62 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 6da74aebe1fb..ae85257b4238 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -439,7 +439,6 @@ static int check_export(struct path *path, int *flags, unsigned char *uuid) return -EINVAL; } return 0; - } #ifdef CONFIG_NFSD_V4 @@ -546,6 +545,29 @@ static inline int secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { return 0; } #endif +static int xprtsec_parse(char **mesg, char *buf, struct svc_export *exp) +{ + unsigned int i, mode, listsize; + int err; + + err = get_uint(mesg, &listsize); + if (err) + return err; + if (listsize > NFSEXP_XPRTSEC_NUM) + return -EINVAL; + + exp->ex_xprtsec_modes = 0; + for (i = 0; i < listsize; i++) { + err = get_uint(mesg, &mode); + if (err) + return err; + if (mode > NFSEXP_XPRTSEC_MTLS) + return -EINVAL; + exp->ex_xprtsec_modes |= mode; + } + return 0; +} + static inline int nfsd_uuid_parse(char **mesg, char *buf, unsigned char **puuid) { @@ -608,6 +630,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) exp.ex_client = dom; exp.cd = cd; exp.ex_devid_map = NULL; + exp.ex_xprtsec_modes = NFSEXP_XPRTSEC_ALL; /* expiry */ err = get_expiry(&mesg, &exp.h.expiry_time); @@ -649,6 +672,8 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) err = nfsd_uuid_parse(&mesg, buf, &exp.ex_uuid); else if (strcmp(buf, "secinfo") == 0) err = secinfo_parse(&mesg, buf, &exp); + else if (strcmp(buf, "xprtsec") == 0) + err = xprtsec_parse(&mesg, buf, &exp); else /* quietly ignore unknown words and anything * following. Newer user-space can try to set @@ -662,6 +687,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) err = check_export(&exp.ex_path, &exp.ex_flags, exp.ex_uuid); if (err) goto out4; + /* * No point caching this if it would immediately expire. * Also, this protects exportfs's dummy export from the @@ -823,6 +849,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem) for (i = 0; i < MAX_SECINFO_LIST; i++) { new->ex_flavors[i] = item->ex_flavors[i]; } + new->ex_xprtsec_modes = item->ex_xprtsec_modes; } static struct cache_head *svc_export_alloc(void) @@ -1034,9 +1061,26 @@ static struct svc_export *exp_find(struct cache_detail *cd, __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp) { - struct exp_flavor_info *f; - struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors; + struct exp_flavor_info *f, *end = exp->ex_flavors + exp->ex_nflavors; + struct svc_xprt *xprt = rqstp->rq_xprt; + + if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_NONE) { + if (!test_bit(XPT_TLS_SESSION, &xprt->xpt_flags)) + goto ok; + } + if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_TLS) { + if (test_bit(XPT_TLS_SESSION, &xprt->xpt_flags) && + !test_bit(XPT_PEER_AUTH, &xprt->xpt_flags)) + goto ok; + } + if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_MTLS) { + if (test_bit(XPT_TLS_SESSION, &xprt->xpt_flags) && + test_bit(XPT_PEER_AUTH, &xprt->xpt_flags)) + goto ok; + } + goto denied; +ok: /* legacy gss-only clients are always OK: */ if (exp->ex_client == rqstp->rq_gssclient) return 0; @@ -1061,6 +1105,7 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp) if (nfsd4_spo_must_allow(rqstp)) return 0; +denied: return rqstp->rq_vers < 4 ? 
nfserr_acces : nfserr_wrongsec; } diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index d03f7f6a8642..2df8ae25aad3 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -77,6 +77,7 @@ struct svc_export { struct cache_detail *cd; struct rcu_head ex_rcu; struct export_stats ex_stats; + unsigned long ex_xprtsec_modes; }; /* an "export key" (expkey) maps a filehandlefragement to an diff --git a/include/uapi/linux/nfsd/export.h b/include/uapi/linux/nfsd/export.h index 2124ba904779..a73ca3703abb 100644 --- a/include/uapi/linux/nfsd/export.h +++ b/include/uapi/linux/nfsd/export.h @@ -62,5 +62,18 @@ | NFSEXP_ALLSQUASH \ | NFSEXP_INSECURE_PORT) +/* + * Transport layer security policies that are permitted to access + * an export + */ +#define NFSEXP_XPRTSEC_NONE 0x0001 +#define NFSEXP_XPRTSEC_TLS 0x0002 +#define NFSEXP_XPRTSEC_MTLS 0x0004 + +#define NFSEXP_XPRTSEC_NUM (3) + +#define NFSEXP_XPRTSEC_ALL (NFSEXP_XPRTSEC_NONE | \ + NFSEXP_XPRTSEC_TLS | \ + NFSEXP_XPRTSEC_MTLS) #endif /* _UAPINFSD_EXPORT_H */ -- cgit v1.2.3 From daf376a366fd2d469d66ab83dfdc074777462bab Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Mon, 10 Apr 2023 13:06:08 -0500 Subject: uapi nbd: improve doc links to userspace spec The uapi header intentionally documents only the NBD server features that the kernel module will utilize as a client. But while it already had one mention of skipped bits due to userspace extensions, it did not actually direct the reader to the canonical source to learn about those extensions. While touching comments, fix an outdated reference that listed only READ and WRITE as commands. Signed-off-by: Eric Blake Reviewed-by: Ming Lei Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20230410180611.1051618-2-eblake@redhat.com Signed-off-by: Jens Axboe --- include/uapi/linux/nbd.h | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nbd.h b/include/uapi/linux/nbd.h index 20d6cc91435d..8797387caaf7 100644 --- a/include/uapi/linux/nbd.h +++ b/include/uapi/linux/nbd.h @@ -11,6 +11,8 @@ * Cleanup PARANOIA usage & code. * 2004/02/19 Paul Clements * Removed PARANOIA, plus various cleanup and comments + * 2023 Copyright Red Hat + * Link to userspace extensions. */ #ifndef _UAPILINUX_NBD_H @@ -30,12 +32,18 @@ #define NBD_SET_TIMEOUT _IO( 0xab, 9 ) #define NBD_SET_FLAGS _IO( 0xab, 10) +/* + * See also https://github.com/NetworkBlockDevice/nbd/blob/master/doc/proto.md + * for additional userspace extensions not yet utilized in the kernel module. + */ + enum { NBD_CMD_READ = 0, NBD_CMD_WRITE = 1, NBD_CMD_DISC = 2, NBD_CMD_FLUSH = 3, NBD_CMD_TRIM = 4 + /* userspace defines additional extension commands */ }; /* values for flags field, these are server interaction specific. */ @@ -64,14 +72,15 @@ enum { #define NBD_REQUEST_MAGIC 0x25609513 #define NBD_REPLY_MAGIC 0x67446698 /* Do *not* use magics: 0x12560953 0x96744668. */ +/* magic 0x668e33ef for structured reply not supported by kernel yet */ /* * This is the packet used for communication between client and * server. All data are in network byte order. */ struct nbd_request { - __be32 magic; - __be32 type; /* == READ || == WRITE */ + __be32 magic; /* NBD_REQUEST_MAGIC */ + __be32 type; /* See NBD_CMD_* */ char handle[8]; __be64 from; __be32 len; @@ -82,7 +91,7 @@ struct nbd_request { * it has completed an I/O request (or an error occurs). 
*/ struct nbd_reply { - __be32 magic; + __be32 magic; /* NBD_REPLY_MAGIC */ __be32 error; /* 0 = ok, else error */ char handle[8]; /* handle you got from request */ }; -- cgit v1.2.3 From 2686eb845da7762ee98b17e578b0c081aafb77b9 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Mon, 10 Apr 2023 13:06:09 -0500 Subject: uapi nbd: add cookie alias to handle The uapi header declares a 'char handle[8]' per request, a name which is overloaded in English (are you referring to "handle" the verb, such as handling a signal or writing a callback handler, or "handle" the noun, the value used in a lookup table to correlate a response back to the request). Many user-space NBD implementations (both servers and clients) have instead used 'uint64_t cookie' or similar, as it is easier to directly assign an integer than to futz around with memcpy. In fact, upstream documentation is now encouraging this shift in terminology: https://github.com/NetworkBlockDevice/nbd/commit/ca4392eb2b Accomplish this by use of an anonymous union to provide the alias for anyone getting the definition from the uapi; this does not break existing clients, while exposing the nicer name for those who prefer it. Note that block/nbd.c still uses the term handle (in fact, it actually combines a 32-bit cookie and a 32-bit tag into the 64-bit handle), but that internal usage is not changed by the public uapi, since no compliant NBD server has any reason to inspect or alter the 64 bits sent over the socket. Signed-off-by: Eric Blake Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20230410180611.1051618-3-eblake@redhat.com Signed-off-by: Jens Axboe --- include/uapi/linux/nbd.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nbd.h b/include/uapi/linux/nbd.h index 8797387caaf7..80ce0ef43afd 100644 --- a/include/uapi/linux/nbd.h +++ b/include/uapi/linux/nbd.h @@ -12,7 +12,7 @@ * 2004/02/19 Paul Clements * Removed PARANOIA, plus various cleanup and comments * 2023 Copyright Red Hat - * Link to userspace extensions. + * Link to userspace extensions, favor cookie over handle. */ #ifndef _UAPILINUX_NBD_H @@ -81,7 +81,10 @@ enum { struct nbd_request { __be32 magic; /* NBD_REQUEST_MAGIC */ __be32 type; /* See NBD_CMD_* */ - char handle[8]; + union { + __be64 cookie; /* Opaque identifier for request */ + char handle[8]; /* older spelling of cookie */ + }; __be64 from; __be32 len; } __attribute__((packed)); @@ -93,6 +96,9 @@ struct nbd_request { struct nbd_reply { __be32 magic; /* NBD_REPLY_MAGIC */ __be32 error; /* 0 = ok, else error */ - char handle[8]; /* handle you got from request */ + union { + __be64 cookie; /* Opaque identifier from request */ + char handle[8]; /* older spelling of cookie */ + }; }; #endif /* _UAPILINUX_NBD_H */ -- cgit v1.2.3 From 3632679d9e4f879f49949bb5b050e0de553e4739 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 22 May 2023 14:08:20 +0200 Subject: ipv{4,6}/raw: fix output xfrm lookup wrt protocol With a raw socket bound to IPPROTO_RAW (i.e. with hdrincl enabled), the protocol field of the flow structure, built by raw_sendmsg() / rawv6_sendmsg(), is set to IPPROTO_RAW. This breaks the IPsec policy lookup when some policies are defined with a protocol in the selector. For ipv6, the sin6_port field of 'struct sockaddr_in6' can already be used to specify the protocol; just accept all values on an IPPROTO_RAW socket. For ipv4, the sin_port field of 'struct sockaddr_in' could not be used without breaking backward compatibility (the value of this field was never checked). Let's add a new kind of control message, so that userland can specify which protocol is used.
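A minimal sending-side sketch of that control message; set_ip_protocol() is illustrative, while IP_PROTOCOL itself comes from the include/uapi/linux/in.h hunk below (the kernel insists on an int payload with a value between 1 and 255):

#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>

#ifndef IP_PROTOCOL
#define IP_PROTOCOL 52	/* from the in.h hunk below */
#endif

/* Attach an IP_PROTOCOL ancillary message before calling sendmsg().
 * cbuf must provide CMSG_SPACE(sizeof(int)) bytes of storage. */
static void set_ip_protocol(struct msghdr *msg, char *cbuf, int protocol)
{
	struct cmsghdr *cmsg;

	msg->msg_control = cbuf;
	msg->msg_controllen = CMSG_SPACE(sizeof(int));

	cmsg = CMSG_FIRSTHDR(msg);
	cmsg->cmsg_level = SOL_IP;
	cmsg->cmsg_type = IP_PROTOCOL;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &protocol, sizeof(protocol));
}

Note the backward-compatibility preset in the raw.c hunk: with hdrincl enabled, ipc.protocol still defaults to IPPROTO_RAW, so sockets that send no control message keep the old xfrm lookup behaviour.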
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") CC: stable@vger.kernel.org Signed-off-by: Nicolas Dichtel Link: https://lore.kernel.org/r/20230522120820.1319391-1-nicolas.dichtel@6wind.com Signed-off-by: Paolo Abeni --- include/net/ip.h | 2 ++ include/uapi/linux/in.h | 1 + net/ipv4/ip_sockglue.c | 12 +++++++++++- net/ipv4/raw.c | 5 ++++- net/ipv6/raw.c | 3 ++- 5 files changed, 20 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/ip.h b/include/net/ip.h index c3fffaa92d6e..acec504c469a 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -76,6 +76,7 @@ struct ipcm_cookie { __be32 addr; int oif; struct ip_options_rcu *opt; + __u8 protocol; __u8 ttl; __s16 tos; char priority; @@ -96,6 +97,7 @@ static inline void ipcm_init_sk(struct ipcm_cookie *ipcm, ipcm->sockc.tsflags = inet->sk.sk_tsflags; ipcm->oif = READ_ONCE(inet->sk.sk_bound_dev_if); ipcm->addr = inet->inet_saddr; + ipcm->protocol = inet->inet_num; } #define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb)) diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index 4b7f2df66b99..e682ab628dfa 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -163,6 +163,7 @@ struct in_addr { #define IP_MULTICAST_ALL 49 #define IP_UNICAST_IF 50 #define IP_LOCAL_PORT_RANGE 51 +#define IP_PROTOCOL 52 #define MCAST_EXCLUDE 0 #define MCAST_INCLUDE 1 diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index b511ff0adc0a..8e97d8d4cc9d 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -317,7 +317,14 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, ipc->tos = val; ipc->priority = rt_tos2priority(ipc->tos); break; - + case IP_PROTOCOL: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) + return -EINVAL; + val = *(int *)CMSG_DATA(cmsg); + if (val < 1 || val > 255) + return -EINVAL; + ipc->protocol = val; + break; default: return -EINVAL; } @@ -1761,6 +1768,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_LOCAL_PORT_RANGE: val = inet->local_port_range.hi << 16 | inet->local_port_range.lo; break; + case IP_PROTOCOL: + val = inet_sk(sk)->inet_num; + break; default: sockopt_release_sock(sk); return -ENOPROTOOPT; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index ff712bf2a98d..eadf1c9ef7e4 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -532,6 +532,9 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } ipcm_init_sk(&ipc, inet); + /* Keep backward compat */ + if (hdrincl) + ipc.protocol = IPPROTO_RAW; if (msg->msg_controllen) { err = ip_cmsg_send(sk, msg, &ipc, false); @@ -599,7 +602,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, RT_SCOPE_UNIVERSE, - hdrincl ? IPPROTO_RAW : sk->sk_protocol, + hdrincl ? ipc.protocol : sk->sk_protocol, inet_sk_flowi_flags(sk) | (hdrincl ?
FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0, sk->sk_uid); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 7d0adb612bdd..44ee7a2e72ac 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -793,7 +793,8 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (!proto) proto = inet->inet_num; - else if (proto != inet->inet_num) + else if (proto != inet->inet_num && + inet->inet_num != IPPROTO_RAW) return -EINVAL; if (proto > 255) -- cgit v1.2.3 From 26fb5480a27d34975cc2b680b77af189620dd740 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 May 2023 11:49:50 -0400 Subject: net/handshake: Enable the SNI extension to work properly Enable the upper layer protocol to specify the SNI peername. This avoids the need for tlshd to use a DNS lookup, which can return a hostname that doesn't match the incoming certificate's SubjectName. Fixes: 2fd5532044a8 ("net/handshake: Add a kernel API for requesting a TLSv1.3 handshake") Reviewed-by: Simon Horman Signed-off-by: Chuck Lever Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/handshake.yaml | 4 ++++ Documentation/networking/tls-handshake.rst | 5 +++++ include/net/handshake.h | 1 + include/uapi/linux/handshake.h | 1 + net/handshake/tlshd.c | 8 ++++++++ 5 files changed, 19 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/handshake.yaml b/Documentation/netlink/specs/handshake.yaml index 614f1a585511..6d89e30f5fd5 100644 --- a/Documentation/netlink/specs/handshake.yaml +++ b/Documentation/netlink/specs/handshake.yaml @@ -68,6 +68,9 @@ attribute-sets: type: nest nested-attributes: x509 multi-attr: true + - + name: peername + type: string - name: done attributes: @@ -105,6 +108,7 @@ operations: - auth-mode - peer-identity - certificate + - peername - name: done doc: Handler reports handshake completion diff --git a/Documentation/networking/tls-handshake.rst b/Documentation/networking/tls-handshake.rst index a2817a88e905..6f5ea1646a47 100644 --- a/Documentation/networking/tls-handshake.rst +++ b/Documentation/networking/tls-handshake.rst @@ -53,6 +53,7 @@ fills in a structure that contains the parameters of the request: struct socket *ta_sock; tls_done_func_t ta_done; void *ta_data; + const char *ta_peername; unsigned int ta_timeout_ms; key_serial_t ta_keyring; key_serial_t ta_my_cert; @@ -71,6 +72,10 @@ instantiated a struct file in sock->file. has completed. Further explanation of this function is in the "Handshake Completion" section below. +The consumer can provide a NUL-terminated hostname in the @ta_peername +field that is sent as part of ClientHello. If no peername is provided, +the DNS hostname associated with the server's IP address is used instead. + The consumer can fill in the @ta_timeout_ms field to force the servicing handshake agent to exit after a number of milliseconds. This enables the socket to be fully closed once both the kernel and the handshake agent have closed their endpoints.
This enables the socket to be fully closed once both the kernel and the handshake agent diff --git a/include/net/handshake.h b/include/net/handshake.h index 3352b1ab43b3..2e26e436e85f 100644 --- a/include/net/handshake.h +++ b/include/net/handshake.h @@ -24,6 +24,7 @@ struct tls_handshake_args { struct socket *ta_sock; tls_done_func_t ta_done; void *ta_data; + const char *ta_peername; unsigned int ta_timeout_ms; key_serial_t ta_keyring; key_serial_t ta_my_cert; diff --git a/include/uapi/linux/handshake.h b/include/uapi/linux/handshake.h index 1de4d0b95325..3d7ea58778c9 100644 --- a/include/uapi/linux/handshake.h +++ b/include/uapi/linux/handshake.h @@ -44,6 +44,7 @@ enum { HANDSHAKE_A_ACCEPT_AUTH_MODE, HANDSHAKE_A_ACCEPT_PEER_IDENTITY, HANDSHAKE_A_ACCEPT_CERTIFICATE, + HANDSHAKE_A_ACCEPT_PEERNAME, __HANDSHAKE_A_ACCEPT_MAX, HANDSHAKE_A_ACCEPT_MAX = (__HANDSHAKE_A_ACCEPT_MAX - 1) diff --git a/net/handshake/tlshd.c b/net/handshake/tlshd.c index fcbeb63b4eb1..b735f5cced2f 100644 --- a/net/handshake/tlshd.c +++ b/net/handshake/tlshd.c @@ -31,6 +31,7 @@ struct tls_handshake_req { int th_type; unsigned int th_timeout_ms; int th_auth_mode; + const char *th_peername; key_serial_t th_keyring; key_serial_t th_certificate; key_serial_t th_privkey; @@ -48,6 +49,7 @@ tls_handshake_req_init(struct handshake_req *req, treq->th_timeout_ms = args->ta_timeout_ms; treq->th_consumer_done = args->ta_done; treq->th_consumer_data = args->ta_data; + treq->th_peername = args->ta_peername; treq->th_keyring = args->ta_keyring; treq->th_num_peerids = 0; treq->th_certificate = TLS_NO_CERT; @@ -214,6 +216,12 @@ static int tls_handshake_accept(struct handshake_req *req, ret = nla_put_u32(msg, HANDSHAKE_A_ACCEPT_MESSAGE_TYPE, treq->th_type); if (ret < 0) goto out_cancel; + if (treq->th_peername) { + ret = nla_put_string(msg, HANDSHAKE_A_ACCEPT_PEERNAME, + treq->th_peername); + if (ret < 0) + goto out_cancel; + } if (treq->th_timeout_ms) { ret = nla_put_u32(msg, HANDSHAKE_A_ACCEPT_TIMEOUT, treq->th_timeout_ms); if (ret < 0) -- cgit v1.2.3