From 9271c0ca573e02a360b636ecd8cb408852f4e9f6 Mon Sep 17 00:00:00 2001
From: Ville Syrjälä <ville.syrjala@linux.intel.com>
Date: Wed, 8 Nov 2017 17:25:04 +0200
Subject: drm/edid: Don't send non-zero YQ in AVI infoframe for HDMI 1.x sinks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Apparently some sinks look at the YQ bits even when receiving RGB,
and they get somehow confused when they see a non-zero YQ value.
So we can't just blindly follow CEA-861-F and set YQ to match the
RGB range.

Unfortunately there is no good way to tell whether the sink
designer claims to have read CEA-861-F. The CEA extension block
revision number has generally been stuck at 3 since forever,
and even a very recently manufactured sink might be based on
an old design so the manufacturing date doesn't seem like
something we can use. In lieu of better information let's
follow CEA-861-F only for HDMI 2.0 sinks, since HDMI 2.0 is
based on CEA-861-F. For HDMI 1.x sinks we'll always set YQ=0.

The alternative would of course be to always set YQ=0. And if
we ever encounter a HDMI 2.0+ sink with this bug that's what
we'll probably have to do.

Cc: stable@vger.kernel.org
Cc: Jani Nikula <jani.nikula@intel.com>
Cc: Eric Anholt <eric@anholt.net>
Cc: Neil Kownacki <njkkow@gmail.com>
Reported-by: Neil Kownacki <njkkow@gmail.com>
Tested-by: Neil Kownacki <njkkow@gmail.com>
Fixes: fcc8a22cc905 ("drm/edid: Set YQ bits in the AVI infoframe according to CEA-861-F")
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=101639
Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20171108152504.12596-1-ville.syrjala@linux.intel.com
Acked-by: Eric Anholt <eric@anholt.net>
---
 include/drm/drm_edid.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/drm/drm_edid.h b/include/drm/drm_edid.h
index 1e1908a6b1d6..a992434ded99 100644
--- a/include/drm/drm_edid.h
+++ b/include/drm/drm_edid.h
@@ -360,7 +360,8 @@ void
 drm_hdmi_avi_infoframe_quant_range(struct hdmi_avi_infoframe *frame,
 				   const struct drm_display_mode *mode,
 				   enum hdmi_quantization_range rgb_quant_range,
-				   bool rgb_quant_range_selectable);
+				   bool rgb_quant_range_selectable,
+				   bool is_hdmi2_sink);
 
 /**
  * drm_eld_mnl - Get ELD monitor name length in bytes.
-- 
cgit v1.2.3


From a64a62ce9a380213dc9e192f762266d70c9b40ec Mon Sep 17 00:00:00 2001
From: Lv Zheng <lv.zheng@intel.com>
Date: Tue, 26 Sep 2017 16:54:09 +0800
Subject: ACPI / EC: Fix regression related to PM ops support in ECDT device

On platforms (ASUS X550ZE and possibly all ASUS X series) with valid ECDT
EC but invalid DSDT EC, EC PM ops won't be invoked as ECDT EC is not an
ACPI device. Thus the following commit actually removed post-resume
acpi_ec_enable_event() invocation for such platforms, and triggered a
regression on them that after being resumed, EC (actually should be ECDT)
driver stops handling EC events:

 Commit: c2b46d679b30c5c0d7eb47a21085943242bdd8dc
 Subject: ACPI / EC: Add PM operations to improve event handling for resume process

Notice that the root cause actually is "ECDT is not an ACPI device" rather
than "the timing of acpi_ec_enable_event() invocation", this patch fixes
this issue by enumerating ECDT EC as an ACPI device. Due to the existence
of the noirq stage, the ability of tuning the timing of
acpi_ec_enable_event() invocation is still meaningful.

This patch is a little bit different from the posted fix by moving
acpi_config_boot_ec() from acpi_ec_ecdt_start() to acpi_ec_add() to make
sure that EC event handling won't be stopped as long as the ACPI EC driver
is bound. Thus the following sequence shouldn't disable EC event handling:
unbind,suspend,resume,bind.

Fixes: c2b46d679b30 (ACPI / EC: Add PM operations to improve event handling for resume process)
Link: https://bugzilla.kernel.org/show_bug.cgi?id=196847
Reported-by: Luya Tshimbalanga <luya@fedoraproject.org>
Tested-by: Luya Tshimbalanga <luya@fedoraproject.org>
Cc: 4.9+ <stable@vger.kernel.org> # 4.9+
Signed-off-by: Lv Zheng <lv.zheng@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/ec.c           | 69 +++++++++++++++++++++++++++++----------------
 drivers/acpi/internal.h     |  1 +
 drivers/acpi/scan.c         | 21 ++++++++++++++
 include/acpi/acpi_bus.h     |  1 +
 include/acpi/acpi_drivers.h |  1 +
 5 files changed, 69 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c
index 82b3ce5e937e..df842465634a 100644
--- a/drivers/acpi/ec.c
+++ b/drivers/acpi/ec.c
@@ -1597,32 +1597,41 @@ static int acpi_ec_add(struct acpi_device *device)
 {
 	struct acpi_ec *ec = NULL;
 	int ret;
+	bool is_ecdt = false;
+	acpi_status status;
 
 	strcpy(acpi_device_name(device), ACPI_EC_DEVICE_NAME);
 	strcpy(acpi_device_class(device), ACPI_EC_CLASS);
 
-	ec = acpi_ec_alloc();
-	if (!ec)
-		return -ENOMEM;
-	if (ec_parse_device(device->handle, 0, ec, NULL) !=
-		AE_CTRL_TERMINATE) {
+	if (!strcmp(acpi_device_hid(device), ACPI_ECDT_HID)) {
+		is_ecdt = true;
+		ec = boot_ec;
+	} else {
+		ec = acpi_ec_alloc();
+		if (!ec)
+			return -ENOMEM;
+		status = ec_parse_device(device->handle, 0, ec, NULL);
+		if (status != AE_CTRL_TERMINATE) {
 			ret = -EINVAL;
 			goto err_alloc;
+		}
 	}
 
 	if (acpi_is_boot_ec(ec)) {
-		boot_ec_is_ecdt = false;
-		/*
-		 * Trust PNP0C09 namespace location rather than ECDT ID.
-		 *
-		 * But trust ECDT GPE rather than _GPE because of ASUS quirks,
-		 * so do not change boot_ec->gpe to ec->gpe.
-		 */
-		boot_ec->handle = ec->handle;
-		acpi_handle_debug(ec->handle, "duplicated.\n");
-		acpi_ec_free(ec);
-		ec = boot_ec;
-		ret = acpi_config_boot_ec(ec, ec->handle, true, false);
+		boot_ec_is_ecdt = is_ecdt;
+		if (!is_ecdt) {
+			/*
+			 * Trust PNP0C09 namespace location rather than
+			 * ECDT ID. But trust ECDT GPE rather than _GPE
+			 * because of ASUS quirks, so do not change
+			 * boot_ec->gpe to ec->gpe.
+			 */
+			boot_ec->handle = ec->handle;
+			acpi_handle_debug(ec->handle, "duplicated.\n");
+			acpi_ec_free(ec);
+			ec = boot_ec;
+		}
+		ret = acpi_config_boot_ec(ec, ec->handle, true, is_ecdt);
 	} else
 		ret = acpi_ec_setup(ec, true);
 	if (ret)
@@ -1635,8 +1644,10 @@ static int acpi_ec_add(struct acpi_device *device)
 	ret = !!request_region(ec->command_addr, 1, "EC cmd");
 	WARN(!ret, "Could not request EC cmd io port 0x%lx", ec->command_addr);
 
-	/* Reprobe devices depending on the EC */
-	acpi_walk_dep_device_list(ec->handle);
+	if (!is_ecdt) {
+		/* Reprobe devices depending on the EC */
+		acpi_walk_dep_device_list(ec->handle);
+	}
 	acpi_handle_debug(ec->handle, "enumerated.\n");
 	return 0;
 
@@ -1692,6 +1703,7 @@ ec_parse_io_ports(struct acpi_resource *resource, void *context)
 
 static const struct acpi_device_id ec_device_ids[] = {
 	{"PNP0C09", 0},
+	{ACPI_ECDT_HID, 0},
 	{"", 0},
 };
 
@@ -1764,11 +1776,14 @@ static int __init acpi_ec_ecdt_start(void)
 	 * Note: ec->handle can be valid if this function is called after
 	 * acpi_ec_add(), hence the fast path.
 	 */
-	if (boot_ec->handle != ACPI_ROOT_OBJECT)
-		handle = boot_ec->handle;
-	else if (!acpi_ec_ecdt_get_handle(&handle))
-		return -ENODEV;
-	return acpi_config_boot_ec(boot_ec, handle, true, true);
+	if (boot_ec->handle == ACPI_ROOT_OBJECT) {
+		if (!acpi_ec_ecdt_get_handle(&handle))
+			return -ENODEV;
+		boot_ec->handle = handle;
+	}
+
+	/* Register to ACPI bus with PM ops attached */
+	return acpi_bus_register_early_device(ACPI_BUS_TYPE_ECDT_EC);
 }
 
 #if 0
@@ -2020,6 +2035,12 @@ int __init acpi_ec_init(void)
 
 	/* Drivers must be started after acpi_ec_query_init() */
 	dsdt_fail = acpi_bus_register_driver(&acpi_ec_driver);
+	/*
+	 * Register ECDT to ACPI bus only when PNP0C09 probe fails. This is
+	 * useful for platforms (confirmed on ASUS X550ZE) with valid ECDT
+	 * settings but invalid DSDT settings.
+	 * https://bugzilla.kernel.org/show_bug.cgi?id=196847
+	 */
 	ecdt_fail = acpi_ec_ecdt_start();
 	return ecdt_fail && dsdt_fail ? -ENODEV : 0;
 }
diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h
index 4361c4415b4f..ede83d38beed 100644
--- a/drivers/acpi/internal.h
+++ b/drivers/acpi/internal.h
@@ -115,6 +115,7 @@ bool acpi_device_is_present(const struct acpi_device *adev);
 bool acpi_device_is_battery(struct acpi_device *adev);
 bool acpi_device_is_first_physical_node(struct acpi_device *adev,
 					const struct device *dev);
+int acpi_bus_register_early_device(int type);
 
 /* --------------------------------------------------------------------------
                      Device Matching and Notification
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 602f8ff212f2..2f2f50322ffb 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -1024,6 +1024,9 @@ static void acpi_device_get_busid(struct acpi_device *device)
 	case ACPI_BUS_TYPE_SLEEP_BUTTON:
 		strcpy(device->pnp.bus_id, "SLPF");
 		break;
+	case ACPI_BUS_TYPE_ECDT_EC:
+		strcpy(device->pnp.bus_id, "ECDT");
+		break;
 	default:
 		acpi_get_name(device->handle, ACPI_SINGLE_NAME, &buffer);
 		/* Clean up trailing underscores (if any) */
@@ -1304,6 +1307,9 @@ static void acpi_set_pnp_ids(acpi_handle handle, struct acpi_device_pnp *pnp,
 	case ACPI_BUS_TYPE_SLEEP_BUTTON:
 		acpi_add_id(pnp, ACPI_BUTTON_HID_SLEEPF);
 		break;
+	case ACPI_BUS_TYPE_ECDT_EC:
+		acpi_add_id(pnp, ACPI_ECDT_HID);
+		break;
 	}
 }
 
@@ -2049,6 +2055,21 @@ void acpi_bus_trim(struct acpi_device *adev)
 }
 EXPORT_SYMBOL_GPL(acpi_bus_trim);
 
+int acpi_bus_register_early_device(int type)
+{
+	struct acpi_device *device = NULL;
+	int result;
+
+	result = acpi_add_single_object(&device, NULL,
+					type, ACPI_STA_DEFAULT);
+	if (result)
+		return result;
+
+	device->flags.match_driver = true;
+	return device_attach(&device->dev);
+}
+EXPORT_SYMBOL_GPL(acpi_bus_register_early_device);
+
 static int acpi_bus_scan_fixed(void)
 {
 	int result = 0;
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index fa1505292f6c..324a04df3785 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -105,6 +105,7 @@ enum acpi_bus_device_type {
 	ACPI_BUS_TYPE_THERMAL,
 	ACPI_BUS_TYPE_POWER_BUTTON,
 	ACPI_BUS_TYPE_SLEEP_BUTTON,
+	ACPI_BUS_TYPE_ECDT_EC,
 	ACPI_BUS_DEVICE_TYPE_COUNT
 };
 
diff --git a/include/acpi/acpi_drivers.h b/include/acpi/acpi_drivers.h
index 29c691265b49..14499757338f 100644
--- a/include/acpi/acpi_drivers.h
+++ b/include/acpi/acpi_drivers.h
@@ -58,6 +58,7 @@
 #define ACPI_VIDEO_HID			"LNXVIDEO"
 #define ACPI_BAY_HID			"LNXIOBAY"
 #define ACPI_DOCK_HID			"LNXDOCK"
+#define ACPI_ECDT_HID			"LNXEC"
 /* Quirk for broken IBM BIOSes */
 #define ACPI_SMBUS_IBM_HID		"SMBUSIBM"
 
-- 
cgit v1.2.3


From 860dd4424f344400b491b212ee4acb3a358ba9d9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 21 Nov 2017 14:23:37 +0100
Subject: scsi: dma-mapping: always provide dma_get_cache_alignment

Provide the dummy version of dma_get_cache_alignment that always returns
1 even if CONFIG_HAS_DMA is not set, so that drivers and subsystems can
use it without ifdefs.

Cc: stable@vger.kernel.org
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/linux/dma-mapping.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index eee1499db396..29cfd18360be 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -710,7 +710,6 @@ static inline void *dma_zalloc_coherent(struct device *dev, size_t size,
 	return ret;
 }
 
-#ifdef CONFIG_HAS_DMA
 static inline int dma_get_cache_alignment(void)
 {
 #ifdef ARCH_DMA_MINALIGN
@@ -718,7 +717,6 @@ static inline int dma_get_cache_alignment(void)
 #endif
 	return 1;
 }
-#endif
 
 /* flags for the coherent memory api */
 #define DMA_MEMORY_EXCLUSIVE		0x01
-- 
cgit v1.2.3


From c2e8fbf908afd81ad502b567a6639598f92c9b9d Mon Sep 17 00:00:00 2001
From: Huacai Chen <chenhc@lemote.com>
Date: Tue, 21 Nov 2017 14:23:39 +0100
Subject: scsi: libsas: align sata_device's rps_resp on a cacheline

The rps_resp buffer in ata_device is a DMA target, but it isn't
explicitly cacheline aligned. Due to this, adjacent fields can be
overwritten with stale data from memory on non-coherent architectures.
As a result, the kernel is sometimes unable to communicate with an SATA
device behind a SAS expander.

Fix this by ensuring that the rps_resp buffer is cacheline aligned.

This issue is similar to that fixed by Commit 84bda12af31f93 ("libata:
align ap->sector_buf") and Commit 4ee34ea3a12396f35b26 ("libata: Align
ata_device's id on a cacheline").

Cc: stable@vger.kernel.org
Signed-off-by: Huacai Chen <chenhc@lemote.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/scsi/libsas.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h
index 0f9cbf96c093..6df6fe0c2198 100644
--- a/include/scsi/libsas.h
+++ b/include/scsi/libsas.h
@@ -159,11 +159,11 @@ struct expander_device {
 
 struct sata_device {
 	unsigned int class;
-	struct smp_resp        rps_resp; /* report_phy_sata_resp */
 	u8     port_no;        /* port number, if this is a PM (Port) */
 
 	struct ata_port *ap;
 	struct ata_host ata_host;
+	struct smp_resp rps_resp ____cacheline_aligned; /* report_phy_sata_resp */
 	u8     fis[ATA_RESP_FIS_SIZE];
 };
 
-- 
cgit v1.2.3


From 7d2c3f54e6f646887d019faa45f35d6fe9fe82ce Mon Sep 17 00:00:00 2001
From: Stephan Mueller <smueller@chronox.de>
Date: Fri, 10 Nov 2017 13:20:55 +0100
Subject: crypto: af_alg - remove locking in async callback

The code paths protected by the socket-lock do not use or modify the
socket in a non-atomic fashion. The actions pertaining the socket do not
even need to be handled as an atomic operation. Thus, the socket-lock
can be safely ignored.

This fixes a bug regarding scheduling in atomic as the callback function
may be invoked in interrupt context.

In addition, the sock_hold is moved before the AIO encrypt/decrypt
operation to ensure that the socket is always present. This avoids a
tiny race window where the socket is unprotected and yet used by the AIO
operation.

Finally, the release of resources for a crypto operation is moved into a
common function of af_alg_free_resources.

Cc: <stable@vger.kernel.org>
Fixes: e870456d8e7c8 ("crypto: algif_skcipher - overhaul memory management")
Fixes: d887c52d6ae43 ("crypto: algif_aead - overhaul memory management")
Reported-by: Romain Izard <romain.izard.pro@gmail.com>
Signed-off-by: Stephan Mueller <smueller@chronox.de>
Tested-by: Romain Izard <romain.izard.pro@gmail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/af_alg.c         | 21 ++++++++++++++-------
 crypto/algif_aead.c     | 23 ++++++++++++-----------
 crypto/algif_skcipher.c | 23 ++++++++++++-----------
 include/crypto/if_alg.h |  1 +
 4 files changed, 39 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/crypto/af_alg.c b/crypto/af_alg.c
index 85cea9de324a..358749c38894 100644
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -1020,6 +1020,18 @@ unlock:
 }
 EXPORT_SYMBOL_GPL(af_alg_sendpage);
 
+/**
+ * af_alg_free_resources - release resources required for crypto request
+ */
+void af_alg_free_resources(struct af_alg_async_req *areq)
+{
+	struct sock *sk = areq->sk;
+
+	af_alg_free_areq_sgls(areq);
+	sock_kfree_s(sk, areq, areq->areqlen);
+}
+EXPORT_SYMBOL_GPL(af_alg_free_resources);
+
 /**
  * af_alg_async_cb - AIO callback handler
  *
@@ -1036,18 +1048,13 @@ void af_alg_async_cb(struct crypto_async_request *_req, int err)
 	struct kiocb *iocb = areq->iocb;
 	unsigned int resultlen;
 
-	lock_sock(sk);
-
 	/* Buffer size written by crypto operation. */
 	resultlen = areq->outlen;
 
-	af_alg_free_areq_sgls(areq);
-	sock_kfree_s(sk, areq, areq->areqlen);
-	__sock_put(sk);
+	af_alg_free_resources(areq);
+	sock_put(sk);
 
 	iocb->ki_complete(iocb, err ? err : resultlen, 0);
-
-	release_sock(sk);
 }
 EXPORT_SYMBOL_GPL(af_alg_async_cb);
 
diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c
index e2068b78993c..805f485ddf1b 100644
--- a/crypto/algif_aead.c
+++ b/crypto/algif_aead.c
@@ -283,12 +283,23 @@ static int _aead_recvmsg(struct socket *sock, struct msghdr *msg,
 
 	if (msg->msg_iocb && !is_sync_kiocb(msg->msg_iocb)) {
 		/* AIO operation */
+		sock_hold(sk);
 		areq->iocb = msg->msg_iocb;
 		aead_request_set_callback(&areq->cra_u.aead_req,
 					  CRYPTO_TFM_REQ_MAY_BACKLOG,
 					  af_alg_async_cb, areq);
 		err = ctx->enc ? crypto_aead_encrypt(&areq->cra_u.aead_req) :
 				 crypto_aead_decrypt(&areq->cra_u.aead_req);
+
+		/* AIO operation in progress */
+		if (err == -EINPROGRESS || err == -EBUSY) {
+			/* Remember output size that will be generated. */
+			areq->outlen = outlen;
+
+			return -EIOCBQUEUED;
+		}
+
+		sock_put(sk);
 	} else {
 		/* Synchronous operation */
 		aead_request_set_callback(&areq->cra_u.aead_req,
@@ -300,19 +311,9 @@ static int _aead_recvmsg(struct socket *sock, struct msghdr *msg,
 				&ctx->wait);
 	}
 
-	/* AIO operation in progress */
-	if (err == -EINPROGRESS) {
-		sock_hold(sk);
-
-		/* Remember output size that will be generated. */
-		areq->outlen = outlen;
-
-		return -EIOCBQUEUED;
-	}
 
 free:
-	af_alg_free_areq_sgls(areq);
-	sock_kfree_s(sk, areq, areq->areqlen);
+	af_alg_free_resources(areq);
 
 	return err ? err : outlen;
 }
diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c
index 9954b078f0b9..30cff827dd8f 100644
--- a/crypto/algif_skcipher.c
+++ b/crypto/algif_skcipher.c
@@ -117,6 +117,7 @@ static int _skcipher_recvmsg(struct socket *sock, struct msghdr *msg,
 
 	if (msg->msg_iocb && !is_sync_kiocb(msg->msg_iocb)) {
 		/* AIO operation */
+		sock_hold(sk);
 		areq->iocb = msg->msg_iocb;
 		skcipher_request_set_callback(&areq->cra_u.skcipher_req,
 					      CRYPTO_TFM_REQ_MAY_SLEEP,
@@ -124,6 +125,16 @@ static int _skcipher_recvmsg(struct socket *sock, struct msghdr *msg,
 		err = ctx->enc ?
 			crypto_skcipher_encrypt(&areq->cra_u.skcipher_req) :
 			crypto_skcipher_decrypt(&areq->cra_u.skcipher_req);
+
+		/* AIO operation in progress */
+		if (err == -EINPROGRESS || err == -EBUSY) {
+			/* Remember output size that will be generated. */
+			areq->outlen = len;
+
+			return -EIOCBQUEUED;
+		}
+
+		sock_put(sk);
 	} else {
 		/* Synchronous operation */
 		skcipher_request_set_callback(&areq->cra_u.skcipher_req,
@@ -136,19 +147,9 @@ static int _skcipher_recvmsg(struct socket *sock, struct msghdr *msg,
 						 &ctx->wait);
 	}
 
-	/* AIO operation in progress */
-	if (err == -EINPROGRESS) {
-		sock_hold(sk);
-
-		/* Remember output size that will be generated. */
-		areq->outlen = len;
-
-		return -EIOCBQUEUED;
-	}
 
 free:
-	af_alg_free_areq_sgls(areq);
-	sock_kfree_s(sk, areq, areq->areqlen);
+	af_alg_free_resources(areq);
 
 	return err ? err : len;
 }
diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h
index 6abf0a3604dc..38d9c5861ed8 100644
--- a/include/crypto/if_alg.h
+++ b/include/crypto/if_alg.h
@@ -242,6 +242,7 @@ int af_alg_sendmsg(struct socket *sock, struct msghdr *msg, size_t size,
 		   unsigned int ivsize);
 ssize_t af_alg_sendpage(struct socket *sock, struct page *page,
 			int offset, size_t size, int flags);
+void af_alg_free_resources(struct af_alg_async_req *areq);
 void af_alg_async_cb(struct crypto_async_request *_req, int err);
 unsigned int af_alg_poll(struct file *file, struct socket *sock,
 			 poll_table *wait);
-- 
cgit v1.2.3


From a158bdd3247b9656df36ba133235fff702e9fdc3 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 24 Nov 2017 10:18:41 +0000
Subject: rxrpc: Fix call timeouts

Fix the rxrpc call expiration timeouts and make them settable from
userspace.  By analogy with other rx implementations, there should be three
timeouts:

 (1) "Normal timeout"

     This is set for all calls and is triggered if we haven't received any
     packets from the peer in a while.  It is measured from the last time
     we received any packet on that call.  This is not reset by any
     connection packets (such as CHALLENGE/RESPONSE packets).

     If a service operation takes a long time, the server should generate
     PING ACKs at a duration that's substantially less than the normal
     timeout so is to keep both sides alive.  This is set at 1/6 of normal
     timeout.

 (2) "Idle timeout"

     This is set only for a service call and is triggered if we stop
     receiving the DATA packets that comprise the request data.  It is
     measured from the last time we received a DATA packet.

 (3) "Hard timeout"

     This can be set for a call and specified the maximum lifetime of that
     call.  It should not be specified by default.  Some operations (such
     as volume transfer) take a long time.

Allow userspace to set/change the timeouts on a call with sendmsg, using a
control message:

	RXRPC_SET_CALL_TIMEOUTS

The data to the message is a number of 32-bit words, not all of which need
be given:

	u32 hard_timeout;	/* sec from first packet */
	u32 idle_timeout;	/* msec from packet Rx */
	u32 normal_timeout;	/* msec from data Rx */

This can be set in combination with any other sendmsg() that affects a
call.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 include/trace/events/rxrpc.h |  69 +++++++++++------
 include/uapi/linux/rxrpc.h   |   1 +
 net/rxrpc/ar-internal.h      |  37 +++++----
 net/rxrpc/call_event.c       | 179 ++++++++++++++++++++-----------------------
 net/rxrpc/call_object.c      |  27 ++++---
 net/rxrpc/conn_client.c      |   4 +-
 net/rxrpc/input.c            |  34 +++++++-
 net/rxrpc/misc.c             |  19 ++---
 net/rxrpc/recvmsg.c          |   2 +-
 net/rxrpc/sendmsg.c          |  59 +++++++++++---
 net/rxrpc/sysctl.c           |  60 +++++++--------
 11 files changed, 290 insertions(+), 201 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
index ebe96796027a..01dcbc2164b5 100644
--- a/include/trace/events/rxrpc.h
+++ b/include/trace/events/rxrpc.h
@@ -138,10 +138,20 @@ enum rxrpc_rtt_rx_trace {
 
 enum rxrpc_timer_trace {
 	rxrpc_timer_begin,
+	rxrpc_timer_exp_ack,
+	rxrpc_timer_exp_hard,
+	rxrpc_timer_exp_idle,
+	rxrpc_timer_exp_normal,
+	rxrpc_timer_exp_ping,
+	rxrpc_timer_exp_resend,
 	rxrpc_timer_expired,
 	rxrpc_timer_init_for_reply,
 	rxrpc_timer_init_for_send_reply,
+	rxrpc_timer_restart,
 	rxrpc_timer_set_for_ack,
+	rxrpc_timer_set_for_hard,
+	rxrpc_timer_set_for_idle,
+	rxrpc_timer_set_for_normal,
 	rxrpc_timer_set_for_ping,
 	rxrpc_timer_set_for_resend,
 	rxrpc_timer_set_for_send,
@@ -296,12 +306,22 @@ enum rxrpc_congest_change {
 #define rxrpc_timer_traces \
 	EM(rxrpc_timer_begin,			"Begin ") \
 	EM(rxrpc_timer_expired,			"*EXPR*") \
+	EM(rxrpc_timer_exp_ack,			"ExpAck") \
+	EM(rxrpc_timer_exp_hard,		"ExpHrd") \
+	EM(rxrpc_timer_exp_idle,		"ExpIdl") \
+	EM(rxrpc_timer_exp_normal,		"ExpNml") \
+	EM(rxrpc_timer_exp_ping,		"ExpPng") \
+	EM(rxrpc_timer_exp_resend,		"ExpRsn") \
 	EM(rxrpc_timer_init_for_reply,		"IniRpl") \
 	EM(rxrpc_timer_init_for_send_reply,	"SndRpl") \
+	EM(rxrpc_timer_restart,			"Restrt") \
 	EM(rxrpc_timer_set_for_ack,		"SetAck") \
+	EM(rxrpc_timer_set_for_hard,		"SetHrd") \
+	EM(rxrpc_timer_set_for_idle,		"SetIdl") \
+	EM(rxrpc_timer_set_for_normal,		"SetNml") \
 	EM(rxrpc_timer_set_for_ping,		"SetPng") \
 	EM(rxrpc_timer_set_for_resend,		"SetRTx") \
-	E_(rxrpc_timer_set_for_send,		"SetTx ")
+	E_(rxrpc_timer_set_for_send,		"SetSnd")
 
 #define rxrpc_propose_ack_traces \
 	EM(rxrpc_propose_ack_client_tx_end,	"ClTxEnd") \
@@ -932,39 +952,44 @@ TRACE_EVENT(rxrpc_rtt_rx,
 
 TRACE_EVENT(rxrpc_timer,
 	    TP_PROTO(struct rxrpc_call *call, enum rxrpc_timer_trace why,
-		     ktime_t now, unsigned long now_j),
+		     unsigned long now),
 
-	    TP_ARGS(call, why, now, now_j),
+	    TP_ARGS(call, why, now),
 
 	    TP_STRUCT__entry(
 		    __field(struct rxrpc_call *,		call		)
 		    __field(enum rxrpc_timer_trace,		why		)
-		    __field_struct(ktime_t,			now		)
-		    __field_struct(ktime_t,			expire_at	)
-		    __field_struct(ktime_t,			ack_at		)
-		    __field_struct(ktime_t,			resend_at	)
-		    __field(unsigned long,			now_j		)
-		    __field(unsigned long,			timer		)
+		    __field(long,				now		)
+		    __field(long,				ack_at		)
+		    __field(long,				resend_at	)
+		    __field(long,				ping_at		)
+		    __field(long,				expect_rx_by	)
+		    __field(long,				expect_req_by	)
+		    __field(long,				expect_term_by	)
+		    __field(long,				timer		)
 			     ),
 
 	    TP_fast_assign(
-		    __entry->call	= call;
-		    __entry->why	= why;
-		    __entry->now	= now;
-		    __entry->expire_at	= call->expire_at;
-		    __entry->ack_at	= call->ack_at;
-		    __entry->resend_at	= call->resend_at;
-		    __entry->now_j	= now_j;
-		    __entry->timer	= call->timer.expires;
+		    __entry->call		= call;
+		    __entry->why		= why;
+		    __entry->now		= now;
+		    __entry->ack_at		= call->ack_at;
+		    __entry->resend_at		= call->resend_at;
+		    __entry->expect_rx_by	= call->expect_rx_by;
+		    __entry->expect_req_by	= call->expect_req_by;
+		    __entry->expect_term_by	= call->expect_term_by;
+		    __entry->timer		= call->timer.expires;
 			   ),
 
-	    TP_printk("c=%p %s x=%lld a=%lld r=%lld t=%ld",
+	    TP_printk("c=%p %s a=%ld r=%ld xr=%ld xq=%ld xt=%ld t=%ld",
 		      __entry->call,
 		      __print_symbolic(__entry->why, rxrpc_timer_traces),
-		      ktime_to_ns(ktime_sub(__entry->expire_at, __entry->now)),
-		      ktime_to_ns(ktime_sub(__entry->ack_at, __entry->now)),
-		      ktime_to_ns(ktime_sub(__entry->resend_at, __entry->now)),
-		      __entry->timer - __entry->now_j)
+		      __entry->ack_at - __entry->now,
+		      __entry->resend_at - __entry->now,
+		      __entry->expect_rx_by - __entry->now,
+		      __entry->expect_req_by - __entry->now,
+		      __entry->expect_term_by - __entry->now,
+		      __entry->timer - __entry->now)
 	    );
 
 TRACE_EVENT(rxrpc_rx_lose,
diff --git a/include/uapi/linux/rxrpc.h b/include/uapi/linux/rxrpc.h
index 9d4afea308a4..9335d92c14a4 100644
--- a/include/uapi/linux/rxrpc.h
+++ b/include/uapi/linux/rxrpc.h
@@ -59,6 +59,7 @@ enum rxrpc_cmsg_type {
 	RXRPC_EXCLUSIVE_CALL	= 10,	/* s-: Call should be on exclusive connection */
 	RXRPC_UPGRADE_SERVICE	= 11,	/* s-: Request service upgrade for client call */
 	RXRPC_TX_LENGTH		= 12,	/* s-: Total length of Tx data */
+	RXRPC_SET_CALL_TIMEOUT	= 13,	/* s-: Set one or more call timeouts */
 	RXRPC__SUPPORTED
 };
 
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index ba63f2231107..548411371048 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -468,9 +468,9 @@ enum rxrpc_call_flag {
 enum rxrpc_call_event {
 	RXRPC_CALL_EV_ACK,		/* need to generate ACK */
 	RXRPC_CALL_EV_ABORT,		/* need to generate abort */
-	RXRPC_CALL_EV_TIMER,		/* Timer expired */
 	RXRPC_CALL_EV_RESEND,		/* Tx resend required */
 	RXRPC_CALL_EV_PING,		/* Ping send required */
+	RXRPC_CALL_EV_EXPIRED,		/* Expiry occurred */
 };
 
 /*
@@ -514,10 +514,14 @@ struct rxrpc_call {
 	struct rxrpc_peer	*peer;		/* Peer record for remote address */
 	struct rxrpc_sock __rcu	*socket;	/* socket responsible */
 	struct mutex		user_mutex;	/* User access mutex */
-	ktime_t			ack_at;		/* When deferred ACK needs to happen */
-	ktime_t			resend_at;	/* When next resend needs to happen */
-	ktime_t			ping_at;	/* When next to send a ping */
-	ktime_t			expire_at;	/* When the call times out */
+	unsigned long		ack_at;		/* When deferred ACK needs to happen */
+	unsigned long		resend_at;	/* When next resend needs to happen */
+	unsigned long		ping_at;	/* When next to send a ping */
+	unsigned long		expect_rx_by;	/* When we expect to get a packet by */
+	unsigned long		expect_req_by;	/* When we expect to get a request DATA packet by */
+	unsigned long		expect_term_by;	/* When we expect call termination by */
+	u32			next_rx_timo;	/* Timeout for next Rx packet (jif) */
+	u32			next_req_timo;	/* Timeout for next Rx request packet (jif) */
 	struct timer_list	timer;		/* Combined event timer */
 	struct work_struct	processor;	/* Event processor */
 	rxrpc_notify_rx_t	notify_rx;	/* kernel service Rx notification function */
@@ -697,12 +701,19 @@ int rxrpc_reject_call(struct rxrpc_sock *);
 /*
  * call_event.c
  */
-void __rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace, ktime_t);
-void rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace, ktime_t);
 void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool,
 		       enum rxrpc_propose_ack_trace);
 void rxrpc_process_call(struct work_struct *);
 
+static inline void rxrpc_reduce_call_timer(struct rxrpc_call *call,
+					   unsigned long expire_at,
+					   unsigned long now,
+					   enum rxrpc_timer_trace why)
+{
+	trace_rxrpc_timer(call, why, now);
+	timer_reduce(&call->timer, expire_at);
+}
+
 /*
  * call_object.c
  */
@@ -843,8 +854,8 @@ static inline bool __rxrpc_abort_eproto(struct rxrpc_call *call,
  */
 extern unsigned int rxrpc_max_client_connections;
 extern unsigned int rxrpc_reap_client_connections;
-extern unsigned int rxrpc_conn_idle_client_expiry;
-extern unsigned int rxrpc_conn_idle_client_fast_expiry;
+extern unsigned long rxrpc_conn_idle_client_expiry;
+extern unsigned long rxrpc_conn_idle_client_fast_expiry;
 extern struct idr rxrpc_client_conn_ids;
 
 void rxrpc_destroy_client_conn_ids(void);
@@ -976,13 +987,13 @@ static inline void rxrpc_queue_local(struct rxrpc_local *local)
  * misc.c
  */
 extern unsigned int rxrpc_max_backlog __read_mostly;
-extern unsigned int rxrpc_requested_ack_delay;
-extern unsigned int rxrpc_soft_ack_delay;
-extern unsigned int rxrpc_idle_ack_delay;
+extern unsigned long rxrpc_requested_ack_delay;
+extern unsigned long rxrpc_soft_ack_delay;
+extern unsigned long rxrpc_idle_ack_delay;
 extern unsigned int rxrpc_rx_window_size;
 extern unsigned int rxrpc_rx_mtu;
 extern unsigned int rxrpc_rx_jumbo_max;
-extern unsigned int rxrpc_resend_timeout;
+extern unsigned long rxrpc_resend_timeout;
 
 extern const s8 rxrpc_ack_priority[];
 
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index 3574508baf9a..c14395d5ad8c 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -21,80 +21,6 @@
 #include <net/af_rxrpc.h>
 #include "ar-internal.h"
 
-/*
- * Set the timer
- */
-void __rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why,
-		       ktime_t now)
-{
-	unsigned long t_j, now_j = jiffies;
-	ktime_t t;
-	bool queue = false;
-
-	if (call->state < RXRPC_CALL_COMPLETE) {
-		t = call->expire_at;
-		if (!ktime_after(t, now)) {
-			trace_rxrpc_timer(call, why, now, now_j);
-			queue = true;
-			goto out;
-		}
-
-		if (!ktime_after(call->resend_at, now)) {
-			call->resend_at = call->expire_at;
-			if (!test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events))
-				queue = true;
-		} else if (ktime_before(call->resend_at, t)) {
-			t = call->resend_at;
-		}
-
-		if (!ktime_after(call->ack_at, now)) {
-			call->ack_at = call->expire_at;
-			if (!test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events))
-				queue = true;
-		} else if (ktime_before(call->ack_at, t)) {
-			t = call->ack_at;
-		}
-
-		if (!ktime_after(call->ping_at, now)) {
-			call->ping_at = call->expire_at;
-			if (!test_and_set_bit(RXRPC_CALL_EV_PING, &call->events))
-				queue = true;
-		} else if (ktime_before(call->ping_at, t)) {
-			t = call->ping_at;
-		}
-
-		t_j = nsecs_to_jiffies(ktime_to_ns(ktime_sub(t, now)));
-		t_j += jiffies;
-
-		/* We have to make sure that the calculated jiffies value falls
-		 * at or after the nsec value, or we may loop ceaselessly
-		 * because the timer times out, but we haven't reached the nsec
-		 * timeout yet.
-		 */
-		t_j++;
-
-		if (call->timer.expires != t_j || !timer_pending(&call->timer)) {
-			mod_timer(&call->timer, t_j);
-			trace_rxrpc_timer(call, why, now, now_j);
-		}
-	}
-
-out:
-	if (queue)
-		rxrpc_queue_call(call);
-}
-
-/*
- * Set the timer
- */
-void rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why,
-		     ktime_t now)
-{
-	read_lock_bh(&call->state_lock);
-	__rxrpc_set_timer(call, why, now);
-	read_unlock_bh(&call->state_lock);
-}
-
 /*
  * Propose a PING ACK be sent.
  */
@@ -106,12 +32,13 @@ static void rxrpc_propose_ping(struct rxrpc_call *call,
 		    !test_and_set_bit(RXRPC_CALL_EV_PING, &call->events))
 			rxrpc_queue_call(call);
 	} else {
-		ktime_t now = ktime_get_real();
-		ktime_t ping_at = ktime_add_ms(now, rxrpc_idle_ack_delay);
+		unsigned long now = jiffies;
+		unsigned long ping_at = now + rxrpc_idle_ack_delay;
 
-		if (ktime_before(ping_at, call->ping_at)) {
-			call->ping_at = ping_at;
-			rxrpc_set_timer(call, rxrpc_timer_set_for_ping, now);
+		if (time_before(ping_at, call->ping_at)) {
+			WRITE_ONCE(call->ping_at, ping_at);
+			rxrpc_reduce_call_timer(call, ping_at, now,
+						rxrpc_timer_set_for_ping);
 		}
 	}
 }
@@ -125,8 +52,7 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
 				enum rxrpc_propose_ack_trace why)
 {
 	enum rxrpc_propose_ack_outcome outcome = rxrpc_propose_ack_use;
-	unsigned int expiry = rxrpc_soft_ack_delay;
-	ktime_t now, ack_at;
+	unsigned long now, ack_at, expiry = rxrpc_soft_ack_delay;
 	s8 prior = rxrpc_ack_priority[ack_reason];
 
 	/* Pings are handled specially because we don't want to accidentally
@@ -190,11 +116,12 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
 		    background)
 			rxrpc_queue_call(call);
 	} else {
-		now = ktime_get_real();
-		ack_at = ktime_add_ms(now, expiry);
-		if (ktime_before(ack_at, call->ack_at)) {
-			call->ack_at = ack_at;
-			rxrpc_set_timer(call, rxrpc_timer_set_for_ack, now);
+		now = jiffies;
+		ack_at = jiffies + expiry;
+		if (time_before(ack_at, call->ack_at)) {
+			WRITE_ONCE(call->ack_at, ack_at);
+			rxrpc_reduce_call_timer(call, ack_at, now,
+						rxrpc_timer_set_for_ack);
 		}
 	}
 
@@ -227,18 +154,20 @@ static void rxrpc_congestion_timeout(struct rxrpc_call *call)
 /*
  * Perform retransmission of NAK'd and unack'd packets.
  */
-static void rxrpc_resend(struct rxrpc_call *call, ktime_t now)
+static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j)
 {
 	struct rxrpc_skb_priv *sp;
 	struct sk_buff *skb;
+	unsigned long resend_at;
 	rxrpc_seq_t cursor, seq, top;
-	ktime_t max_age, oldest, ack_ts;
+	ktime_t now, max_age, oldest, ack_ts;
 	int ix;
 	u8 annotation, anno_type, retrans = 0, unacked = 0;
 
 	_enter("{%d,%d}", call->tx_hard_ack, call->tx_top);
 
-	max_age = ktime_sub_ms(now, rxrpc_resend_timeout);
+	now = ktime_get_real();
+	max_age = ktime_sub_ms(now, rxrpc_resend_timeout * 1000 / HZ);
 
 	spin_lock_bh(&call->lock);
 
@@ -282,7 +211,9 @@ static void rxrpc_resend(struct rxrpc_call *call, ktime_t now)
 				       ktime_to_ns(ktime_sub(skb->tstamp, max_age)));
 	}
 
-	call->resend_at = ktime_add_ms(oldest, rxrpc_resend_timeout);
+	resend_at = nsecs_to_jiffies(ktime_to_ns(ktime_sub(oldest, now)));
+	resend_at += jiffies + rxrpc_resend_timeout;
+	WRITE_ONCE(call->resend_at, resend_at);
 
 	if (unacked)
 		rxrpc_congestion_timeout(call);
@@ -292,7 +223,8 @@ static void rxrpc_resend(struct rxrpc_call *call, ktime_t now)
 	 * retransmitting data.
 	 */
 	if (!retrans) {
-		rxrpc_set_timer(call, rxrpc_timer_set_for_resend, now);
+		rxrpc_reduce_call_timer(call, resend_at, now,
+					rxrpc_timer_set_for_resend);
 		spin_unlock_bh(&call->lock);
 		ack_ts = ktime_sub(now, call->acks_latest_ts);
 		if (ktime_to_ns(ack_ts) < call->peer->rtt)
@@ -364,7 +296,7 @@ void rxrpc_process_call(struct work_struct *work)
 {
 	struct rxrpc_call *call =
 		container_of(work, struct rxrpc_call, processor);
-	ktime_t now;
+	unsigned long now, next, t;
 
 	rxrpc_see_call(call);
 
@@ -384,8 +316,50 @@ recheck_state:
 		goto out_put;
 	}
 
-	now = ktime_get_real();
-	if (ktime_before(call->expire_at, now)) {
+	/* Work out if any timeouts tripped */
+	now = jiffies;
+	t = READ_ONCE(call->expect_rx_by);
+	if (time_after_eq(now, t)) {
+		trace_rxrpc_timer(call, rxrpc_timer_exp_normal, now);
+		set_bit(RXRPC_CALL_EV_EXPIRED, &call->events);
+	}
+
+	t = READ_ONCE(call->expect_req_by);
+	if (call->state == RXRPC_CALL_SERVER_RECV_REQUEST &&
+	    time_after_eq(now, t)) {
+		trace_rxrpc_timer(call, rxrpc_timer_exp_idle, now);
+		set_bit(RXRPC_CALL_EV_EXPIRED, &call->events);
+	}
+
+	t = READ_ONCE(call->expect_term_by);
+	if (time_after_eq(now, t)) {
+		trace_rxrpc_timer(call, rxrpc_timer_exp_hard, now);
+		set_bit(RXRPC_CALL_EV_EXPIRED, &call->events);
+	}
+
+	t = READ_ONCE(call->ack_at);
+	if (time_after_eq(now, t)) {
+		trace_rxrpc_timer(call, rxrpc_timer_exp_ack, now);
+		cmpxchg(&call->ack_at, t, now + MAX_JIFFY_OFFSET);
+		set_bit(RXRPC_CALL_EV_ACK, &call->events);
+	}
+
+	t = READ_ONCE(call->ping_at);
+	if (time_after_eq(now, t)) {
+		trace_rxrpc_timer(call, rxrpc_timer_exp_ping, now);
+		cmpxchg(&call->ping_at, t, now + MAX_JIFFY_OFFSET);
+		set_bit(RXRPC_CALL_EV_PING, &call->events);
+	}
+
+	t = READ_ONCE(call->resend_at);
+	if (time_after_eq(now, t)) {
+		trace_rxrpc_timer(call, rxrpc_timer_exp_resend, now);
+		cmpxchg(&call->resend_at, t, now + MAX_JIFFY_OFFSET);
+		set_bit(RXRPC_CALL_EV_RESEND, &call->events);
+	}
+
+	/* Process events */
+	if (test_and_clear_bit(RXRPC_CALL_EV_EXPIRED, &call->events)) {
 		rxrpc_abort_call("EXP", call, 0, RX_USER_ABORT, -ETIME);
 		set_bit(RXRPC_CALL_EV_ABORT, &call->events);
 		goto recheck_state;
@@ -408,7 +382,22 @@ recheck_state:
 		goto recheck_state;
 	}
 
-	rxrpc_set_timer(call, rxrpc_timer_set_for_resend, now);
+	/* Make sure the timer is restarted */
+	next = call->expect_rx_by;
+
+#define set(T) { t = READ_ONCE(T); if (time_before(t, next)) next = t; }
+	
+	set(call->expect_req_by);
+	set(call->expect_term_by);
+	set(call->ack_at);
+	set(call->resend_at);
+	set(call->ping_at);
+
+	now = jiffies;
+	if (time_after_eq(now, next))
+		goto recheck_state;
+
+	rxrpc_reduce_call_timer(call, next, now, rxrpc_timer_restart);
 
 	/* other events may have been raised since we started checking */
 	if (call->events && call->state < RXRPC_CALL_COMPLETE) {
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index c3e1fa854471..b305970a9b63 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -51,8 +51,10 @@ static void rxrpc_call_timer_expired(unsigned long _call)
 
 	_enter("%d", call->debug_id);
 
-	if (call->state < RXRPC_CALL_COMPLETE)
-		rxrpc_set_timer(call, rxrpc_timer_expired, ktime_get_real());
+	if (call->state < RXRPC_CALL_COMPLETE) {
+		trace_rxrpc_timer(call, rxrpc_timer_expired, jiffies);
+		rxrpc_queue_call(call);
+	}
 }
 
 static struct lock_class_key rxrpc_call_user_mutex_lock_class_key;
@@ -139,6 +141,8 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp)
 	atomic_set(&call->usage, 1);
 	call->debug_id = atomic_inc_return(&rxrpc_debug_id);
 	call->tx_total_len = -1;
+	call->next_rx_timo = 20 * HZ;
+	call->next_req_timo = 1 * HZ;
 
 	memset(&call->sock_node, 0xed, sizeof(call->sock_node));
 
@@ -189,15 +193,16 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx,
  */
 static void rxrpc_start_call_timer(struct rxrpc_call *call)
 {
-	ktime_t now = ktime_get_real(), expire_at;
-
-	expire_at = ktime_add_ms(now, rxrpc_max_call_lifetime);
-	call->expire_at = expire_at;
-	call->ack_at = expire_at;
-	call->ping_at = expire_at;
-	call->resend_at = expire_at;
-	call->timer.expires = jiffies + LONG_MAX / 2;
-	rxrpc_set_timer(call, rxrpc_timer_begin, now);
+	unsigned long now = jiffies;
+	unsigned long j = now + MAX_JIFFY_OFFSET;
+
+	call->ack_at = j;
+	call->resend_at = j;
+	call->ping_at = j;
+	call->expect_rx_by = j;
+	call->expect_req_by = j;
+	call->expect_term_by = j;
+	call->timer.expires = now;
 }
 
 /*
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index cfb997593da9..97f6a8de4845 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -85,8 +85,8 @@
 
 __read_mostly unsigned int rxrpc_max_client_connections = 1000;
 __read_mostly unsigned int rxrpc_reap_client_connections = 900;
-__read_mostly unsigned int rxrpc_conn_idle_client_expiry = 2 * 60 * HZ;
-__read_mostly unsigned int rxrpc_conn_idle_client_fast_expiry = 2 * HZ;
+__read_mostly unsigned long rxrpc_conn_idle_client_expiry = 2 * 60 * HZ;
+__read_mostly unsigned long rxrpc_conn_idle_client_fast_expiry = 2 * HZ;
 
 /*
  * We use machine-unique IDs for our client connections.
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 1b592073ec96..c89647eae86d 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -318,16 +318,18 @@ bad_state:
 static bool rxrpc_receiving_reply(struct rxrpc_call *call)
 {
 	struct rxrpc_ack_summary summary = { 0 };
+	unsigned long now, timo;
 	rxrpc_seq_t top = READ_ONCE(call->tx_top);
 
 	if (call->ackr_reason) {
 		spin_lock_bh(&call->lock);
 		call->ackr_reason = 0;
-		call->resend_at = call->expire_at;
-		call->ack_at = call->expire_at;
 		spin_unlock_bh(&call->lock);
-		rxrpc_set_timer(call, rxrpc_timer_init_for_reply,
-				ktime_get_real());
+		now = jiffies;
+		timo = now + MAX_JIFFY_OFFSET;
+		WRITE_ONCE(call->resend_at, timo);
+		WRITE_ONCE(call->ack_at, timo);
+		trace_rxrpc_timer(call, rxrpc_timer_init_for_reply, now);
 	}
 
 	if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags))
@@ -437,6 +439,19 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb,
 	if (state >= RXRPC_CALL_COMPLETE)
 		return;
 
+	if (call->state == RXRPC_CALL_SERVER_RECV_REQUEST) {
+		unsigned long timo = READ_ONCE(call->next_req_timo);
+		unsigned long now, expect_req_by;
+
+		if (timo) {
+			now = jiffies;
+			expect_req_by = now + timo;
+			WRITE_ONCE(call->expect_req_by, expect_req_by);
+			rxrpc_reduce_call_timer(call, expect_req_by, now,
+						rxrpc_timer_set_for_idle);
+		}
+	}
+
 	/* Received data implicitly ACKs all of the request packets we sent
 	 * when we're acting as a client.
 	 */
@@ -908,9 +923,20 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call,
 				    struct sk_buff *skb, u16 skew)
 {
 	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+	unsigned long timo;
 
 	_enter("%p,%p", call, skb);
 
+	timo = READ_ONCE(call->next_rx_timo);
+	if (timo) {
+		unsigned long now = jiffies, expect_rx_by;
+
+		expect_rx_by = jiffies + timo;
+		WRITE_ONCE(call->expect_rx_by, expect_rx_by);
+		rxrpc_reduce_call_timer(call, expect_rx_by, now,
+					rxrpc_timer_set_for_normal);
+	}
+	
 	switch (sp->hdr.type) {
 	case RXRPC_PACKET_TYPE_DATA:
 		rxrpc_input_data(call, skb, skew);
diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c
index 1a2d4b112064..c1d9e7fd7448 100644
--- a/net/rxrpc/misc.c
+++ b/net/rxrpc/misc.c
@@ -20,34 +20,29 @@
  */
 unsigned int rxrpc_max_backlog __read_mostly = 10;
 
-/*
- * Maximum lifetime of a call (in mx).
- */
-unsigned int rxrpc_max_call_lifetime = 60 * 1000;
-
 /*
  * How long to wait before scheduling ACK generation after seeing a
- * packet with RXRPC_REQUEST_ACK set (in ms).
+ * packet with RXRPC_REQUEST_ACK set (in jiffies).
  */
-unsigned int rxrpc_requested_ack_delay = 1;
+unsigned long rxrpc_requested_ack_delay = 1;
 
 /*
- * How long to wait before scheduling an ACK with subtype DELAY (in ms).
+ * How long to wait before scheduling an ACK with subtype DELAY (in jiffies).
  *
  * We use this when we've received new data packets.  If those packets aren't
  * all consumed within this time we will send a DELAY ACK if an ACK was not
  * requested to let the sender know it doesn't need to resend.
  */
-unsigned int rxrpc_soft_ack_delay = 1 * 1000;
+unsigned long rxrpc_soft_ack_delay = HZ;
 
 /*
- * How long to wait before scheduling an ACK with subtype IDLE (in ms).
+ * How long to wait before scheduling an ACK with subtype IDLE (in jiffies).
  *
  * We use this when we've consumed some previously soft-ACK'd packets when
  * further packets aren't immediately received to decide when to send an IDLE
  * ACK let the other end know that it can free up its Tx buffer space.
  */
-unsigned int rxrpc_idle_ack_delay = 0.5 * 1000;
+unsigned long rxrpc_idle_ack_delay = HZ / 2;
 
 /*
  * Receive window size in packets.  This indicates the maximum number of
@@ -75,7 +70,7 @@ unsigned int rxrpc_rx_jumbo_max = 4;
 /*
  * Time till packet resend (in milliseconds).
  */
-unsigned int rxrpc_resend_timeout = 4 * 1000;
+unsigned long rxrpc_resend_timeout = 4 * HZ;
 
 const s8 rxrpc_ack_priority[] = {
 	[0]				= 0,
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index be0b9ae13893..0b6609da80b7 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -163,7 +163,7 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial)
 	case RXRPC_CALL_SERVER_RECV_REQUEST:
 		call->tx_phase = true;
 		call->state = RXRPC_CALL_SERVER_ACK_REQUEST;
-		call->ack_at = call->expire_at;
+		call->expect_req_by = jiffies + MAX_JIFFY_OFFSET;
 		write_unlock_bh(&call->state_lock);
 		rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, 0, serial, false, true,
 				  rxrpc_propose_ack_processing_op);
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index de5ab327c18a..03e0676db28c 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -158,6 +158,7 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,
 			       rxrpc_notify_end_tx_t notify_end_tx)
 {
 	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+	unsigned long now;
 	rxrpc_seq_t seq = sp->hdr.seq;
 	int ret, ix;
 	u8 annotation = RXRPC_TX_ANNO_UNACK;
@@ -197,11 +198,11 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,
 			break;
 		case RXRPC_CALL_SERVER_ACK_REQUEST:
 			call->state = RXRPC_CALL_SERVER_SEND_REPLY;
-			call->ack_at = call->expire_at;
+			now = jiffies;
+			WRITE_ONCE(call->ack_at, now + MAX_JIFFY_OFFSET);
 			if (call->ackr_reason == RXRPC_ACK_DELAY)
 				call->ackr_reason = 0;
-			__rxrpc_set_timer(call, rxrpc_timer_init_for_send_reply,
-					  ktime_get_real());
+			trace_rxrpc_timer(call, rxrpc_timer_init_for_send_reply, now);
 			if (!last)
 				break;
 			/* Fall through */
@@ -223,14 +224,12 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,
 		_debug("need instant resend %d", ret);
 		rxrpc_instant_resend(call, ix);
 	} else {
-		ktime_t now = ktime_get_real(), resend_at;
+		unsigned long now = jiffies, resend_at;
 
-		resend_at = ktime_add_ms(now, rxrpc_resend_timeout);
-
-		if (ktime_before(resend_at, call->resend_at)) {
-			call->resend_at = resend_at;
-			rxrpc_set_timer(call, rxrpc_timer_set_for_send, now);
-		}
+		resend_at = now + rxrpc_resend_timeout;
+		WRITE_ONCE(call->resend_at, resend_at);
+		rxrpc_reduce_call_timer(call, resend_at, now,
+					rxrpc_timer_set_for_send);
 	}
 
 	rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
@@ -513,6 +512,19 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, struct rxrpc_send_params *p)
 				return -EINVAL;
 			break;
 
+		case RXRPC_SET_CALL_TIMEOUT:
+			if (len & 3 || len < 4 || len > 12)
+				return -EINVAL;
+			memcpy(&p->call.timeouts, CMSG_DATA(cmsg), len);
+			p->call.nr_timeouts = len / 4;
+			if (p->call.timeouts.hard > INT_MAX / HZ)
+				return -ERANGE;
+			if (p->call.nr_timeouts >= 2 && p->call.timeouts.idle > 60 * 60 * 1000)
+				return -ERANGE;
+			if (p->call.nr_timeouts >= 3 && p->call.timeouts.normal > 60 * 60 * 1000)
+				return -ERANGE;
+			break;
+
 		default:
 			return -EINVAL;
 		}
@@ -577,11 +589,13 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
 {
 	enum rxrpc_call_state state;
 	struct rxrpc_call *call;
+	unsigned long now, j;
 	int ret;
 
 	struct rxrpc_send_params p = {
 		.call.tx_total_len	= -1,
 		.call.user_call_ID	= 0,
+		.call.nr_timeouts	= 0,
 		.abort_code		= 0,
 		.command		= RXRPC_CMD_SEND_DATA,
 		.exclusive		= false,
@@ -646,6 +660,31 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
 		}
 	}
 
+	switch (p.call.nr_timeouts) {
+	case 3:
+		j = msecs_to_jiffies(p.call.timeouts.normal);
+		if (p.call.timeouts.normal > 0 && j == 0)
+			j = 1;
+		WRITE_ONCE(call->next_rx_timo, j);
+		/* Fall through */
+	case 2:
+		j = msecs_to_jiffies(p.call.timeouts.idle);
+		if (p.call.timeouts.idle > 0 && j == 0)
+			j = 1;
+		WRITE_ONCE(call->next_req_timo, j);
+		/* Fall through */
+	case 1:
+		if (p.call.timeouts.hard > 0) {
+			j = msecs_to_jiffies(p.call.timeouts.hard);
+			now = jiffies;
+			j += now;
+			WRITE_ONCE(call->expect_term_by, j);
+			rxrpc_reduce_call_timer(call, j, now,
+						rxrpc_timer_set_for_hard);
+		}
+		break;
+	}
+
 	state = READ_ONCE(call->state);
 	_debug("CALL %d USR %lx ST %d on CONN %p",
 	       call->debug_id, call->user_call_ID, state, call->conn);
diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c
index 34c706d2f79c..4a7af7aff37d 100644
--- a/net/rxrpc/sysctl.c
+++ b/net/rxrpc/sysctl.c
@@ -21,6 +21,8 @@ static const unsigned int four = 4;
 static const unsigned int thirtytwo = 32;
 static const unsigned int n_65535 = 65535;
 static const unsigned int n_max_acks = RXRPC_RXTX_BUFF_SIZE - 1;
+static const unsigned long one_jiffy = 1;
+static const unsigned long max_jiffies = MAX_JIFFY_OFFSET;
 
 /*
  * RxRPC operating parameters.
@@ -29,64 +31,60 @@ static const unsigned int n_max_acks = RXRPC_RXTX_BUFF_SIZE - 1;
  * information on the individual parameters.
  */
 static struct ctl_table rxrpc_sysctl_table[] = {
-	/* Values measured in milliseconds */
+	/* Values measured in milliseconds but used in jiffies */
 	{
 		.procname	= "req_ack_delay",
 		.data		= &rxrpc_requested_ack_delay,
-		.maxlen		= sizeof(unsigned int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-		.extra1		= (void *)&zero,
+		.proc_handler	= proc_doulongvec_ms_jiffies_minmax,
+		.extra1		= (void *)&one_jiffy,
+		.extra2		= (void *)&max_jiffies,
 	},
 	{
 		.procname	= "soft_ack_delay",
 		.data		= &rxrpc_soft_ack_delay,
-		.maxlen		= sizeof(unsigned int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-		.extra1		= (void *)&one,
+		.proc_handler	= proc_doulongvec_ms_jiffies_minmax,
+		.extra1		= (void *)&one_jiffy,
+		.extra2		= (void *)&max_jiffies,
 	},
 	{
 		.procname	= "idle_ack_delay",
 		.data		= &rxrpc_idle_ack_delay,
-		.maxlen		= sizeof(unsigned int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-		.extra1		= (void *)&one,
-	},
-	{
-		.procname	= "resend_timeout",
-		.data		= &rxrpc_resend_timeout,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-		.extra1		= (void *)&one,
+		.proc_handler	= proc_doulongvec_ms_jiffies_minmax,
+		.extra1		= (void *)&one_jiffy,
+		.extra2		= (void *)&max_jiffies,
 	},
 	{
 		.procname	= "idle_conn_expiry",
 		.data		= &rxrpc_conn_idle_client_expiry,
-		.maxlen		= sizeof(unsigned int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_ms_jiffies,
-		.extra1		= (void *)&one,
+		.proc_handler	= proc_doulongvec_ms_jiffies_minmax,
+		.extra1		= (void *)&one_jiffy,
+		.extra2		= (void *)&max_jiffies,
 	},
 	{
 		.procname	= "idle_conn_fast_expiry",
 		.data		= &rxrpc_conn_idle_client_fast_expiry,
-		.maxlen		= sizeof(unsigned int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_ms_jiffies,
-		.extra1		= (void *)&one,
+		.proc_handler	= proc_doulongvec_ms_jiffies_minmax,
+		.extra1		= (void *)&one_jiffy,
+		.extra2		= (void *)&max_jiffies,
 	},
-
-	/* Values measured in seconds but used in jiffies */
 	{
-		.procname	= "max_call_lifetime",
-		.data		= &rxrpc_max_call_lifetime,
-		.maxlen		= sizeof(unsigned int),
+		.procname	= "resend_timeout",
+		.data		= &rxrpc_resend_timeout,
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-		.extra1		= (void *)&one,
+		.proc_handler	= proc_doulongvec_ms_jiffies_minmax,
+		.extra1		= (void *)&one_jiffy,
+		.extra2		= (void *)&max_jiffies,
 	},
 
 	/* Non-time values */
-- 
cgit v1.2.3


From bd1fdf8cfdf3fdbccd2b21c33ec649ebd7429af7 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 24 Nov 2017 10:18:42 +0000
Subject: rxrpc: Add a timeout for detecting lost ACKs/lost DATA

Add an extra timeout that is set/updated when we send a DATA packet that
has the request-ack flag set.  This allows us to detect if we don't get an
ACK in response to the latest flagged packet.

The ACK packet is adjudged to have been lost if it doesn't turn up within
2*RTT of the transmission.

If the timeout occurs, we schedule the sending of a PING ACK to find out
the state of the other side.  If a new DATA packet is ready to go sooner,
we cancel the sending of the ping and set the request-ack flag on that
instead.

If we get back a PING-RESPONSE ACK that indicates a lower tx_top than what
we had at the time of the ping transmission, we adjudge all the DATA
packets sent between the response tx_top and the ping-time tx_top to have
been lost and retransmit immediately.

Rather than sending a PING ACK, we could just pick a DATA packet and
speculatively retransmit that with request-ack set.  It should result in
either a REQUESTED ACK or a DUPLICATE ACK which we can then use in lieu the
a PING-RESPONSE ACK mentioned above.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 include/trace/events/rxrpc.h | 11 +++++++++--
 net/rxrpc/ar-internal.h      |  6 +++++-
 net/rxrpc/call_event.c       | 26 ++++++++++++++++++++++----
 net/rxrpc/call_object.c      |  1 +
 net/rxrpc/input.c            | 40 ++++++++++++++++++++++++++++++++++++++++
 net/rxrpc/output.c           | 20 ++++++++++++++++++--
 net/rxrpc/recvmsg.c          |  4 ++--
 net/rxrpc/sendmsg.c          |  2 +-
 8 files changed, 98 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
index 01dcbc2164b5..84ade8b76a19 100644
--- a/include/trace/events/rxrpc.h
+++ b/include/trace/events/rxrpc.h
@@ -141,6 +141,7 @@ enum rxrpc_timer_trace {
 	rxrpc_timer_exp_ack,
 	rxrpc_timer_exp_hard,
 	rxrpc_timer_exp_idle,
+	rxrpc_timer_exp_lost_ack,
 	rxrpc_timer_exp_normal,
 	rxrpc_timer_exp_ping,
 	rxrpc_timer_exp_resend,
@@ -151,6 +152,7 @@ enum rxrpc_timer_trace {
 	rxrpc_timer_set_for_ack,
 	rxrpc_timer_set_for_hard,
 	rxrpc_timer_set_for_idle,
+	rxrpc_timer_set_for_lost_ack,
 	rxrpc_timer_set_for_normal,
 	rxrpc_timer_set_for_ping,
 	rxrpc_timer_set_for_resend,
@@ -309,6 +311,7 @@ enum rxrpc_congest_change {
 	EM(rxrpc_timer_exp_ack,			"ExpAck") \
 	EM(rxrpc_timer_exp_hard,		"ExpHrd") \
 	EM(rxrpc_timer_exp_idle,		"ExpIdl") \
+	EM(rxrpc_timer_exp_lost_ack,		"ExpLoA") \
 	EM(rxrpc_timer_exp_normal,		"ExpNml") \
 	EM(rxrpc_timer_exp_ping,		"ExpPng") \
 	EM(rxrpc_timer_exp_resend,		"ExpRsn") \
@@ -318,6 +321,7 @@ enum rxrpc_congest_change {
 	EM(rxrpc_timer_set_for_ack,		"SetAck") \
 	EM(rxrpc_timer_set_for_hard,		"SetHrd") \
 	EM(rxrpc_timer_set_for_idle,		"SetIdl") \
+	EM(rxrpc_timer_set_for_lost_ack,	"SetLoA") \
 	EM(rxrpc_timer_set_for_normal,		"SetNml") \
 	EM(rxrpc_timer_set_for_ping,		"SetPng") \
 	EM(rxrpc_timer_set_for_resend,		"SetRTx") \
@@ -961,6 +965,7 @@ TRACE_EVENT(rxrpc_timer,
 		    __field(enum rxrpc_timer_trace,		why		)
 		    __field(long,				now		)
 		    __field(long,				ack_at		)
+		    __field(long,				ack_lost_at	)
 		    __field(long,				resend_at	)
 		    __field(long,				ping_at		)
 		    __field(long,				expect_rx_by	)
@@ -974,6 +979,7 @@ TRACE_EVENT(rxrpc_timer,
 		    __entry->why		= why;
 		    __entry->now		= now;
 		    __entry->ack_at		= call->ack_at;
+		    __entry->ack_lost_at	= call->ack_lost_at;
 		    __entry->resend_at		= call->resend_at;
 		    __entry->expect_rx_by	= call->expect_rx_by;
 		    __entry->expect_req_by	= call->expect_req_by;
@@ -981,10 +987,11 @@ TRACE_EVENT(rxrpc_timer,
 		    __entry->timer		= call->timer.expires;
 			   ),
 
-	    TP_printk("c=%p %s a=%ld r=%ld xr=%ld xq=%ld xt=%ld t=%ld",
+	    TP_printk("c=%p %s a=%ld la=%ld r=%ld xr=%ld xq=%ld xt=%ld t=%ld",
 		      __entry->call,
 		      __print_symbolic(__entry->why, rxrpc_timer_traces),
 		      __entry->ack_at - __entry->now,
+		      __entry->ack_lost_at - __entry->now,
 		      __entry->resend_at - __entry->now,
 		      __entry->expect_rx_by - __entry->now,
 		      __entry->expect_req_by - __entry->now,
@@ -1105,7 +1112,7 @@ TRACE_EVENT(rxrpc_congest,
 		    memcpy(&__entry->sum, summary, sizeof(__entry->sum));
 			   ),
 
-	    TP_printk("c=%p %08x %s %08x %s cw=%u ss=%u nr=%u,%u nw=%u,%u r=%u b=%u u=%u d=%u l=%x%s%s%s",
+	    TP_printk("c=%p r=%08x %s q=%08x %s cw=%u ss=%u nr=%u,%u nw=%u,%u r=%u b=%u u=%u d=%u l=%x%s%s%s",
 		      __entry->call,
 		      __entry->ack_serial,
 		      __print_symbolic(__entry->sum.ack_reason, rxrpc_ack_names),
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 548411371048..7e7b817c69f0 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -471,6 +471,7 @@ enum rxrpc_call_event {
 	RXRPC_CALL_EV_RESEND,		/* Tx resend required */
 	RXRPC_CALL_EV_PING,		/* Ping send required */
 	RXRPC_CALL_EV_EXPIRED,		/* Expiry occurred */
+	RXRPC_CALL_EV_ACK_LOST,		/* ACK may be lost, send ping */
 };
 
 /*
@@ -515,6 +516,7 @@ struct rxrpc_call {
 	struct rxrpc_sock __rcu	*socket;	/* socket responsible */
 	struct mutex		user_mutex;	/* User access mutex */
 	unsigned long		ack_at;		/* When deferred ACK needs to happen */
+	unsigned long		ack_lost_at;	/* When ACK is figured as lost */
 	unsigned long		resend_at;	/* When next resend needs to happen */
 	unsigned long		ping_at;	/* When next to send a ping */
 	unsigned long		expect_rx_by;	/* When we expect to get a packet by */
@@ -624,6 +626,8 @@ struct rxrpc_call {
 	ktime_t			acks_latest_ts;	/* Timestamp of latest ACK received */
 	rxrpc_serial_t		acks_latest;	/* serial number of latest ACK received */
 	rxrpc_seq_t		acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */
+	rxrpc_seq_t		acks_lost_top;	/* tx_top at the time lost-ack ping sent */
+	rxrpc_serial_t		acks_lost_ping;	/* Serial number of probe ACK */
 };
 
 /*
@@ -1011,7 +1015,7 @@ static inline struct rxrpc_net *rxrpc_net(struct net *net)
 /*
  * output.c
  */
-int rxrpc_send_ack_packet(struct rxrpc_call *, bool);
+int rxrpc_send_ack_packet(struct rxrpc_call *, bool, rxrpc_serial_t *);
 int rxrpc_send_abort_packet(struct rxrpc_call *);
 int rxrpc_send_data_packet(struct rxrpc_call *, struct sk_buff *, bool);
 void rxrpc_reject_packets(struct rxrpc_local *);
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index da91f16ac77c..c65666b2f39e 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -245,7 +245,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j)
 			goto out;
 		rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, 0, true, false,
 				  rxrpc_propose_ack_ping_for_lost_ack);
-		rxrpc_send_ack_packet(call, true);
+		rxrpc_send_ack_packet(call, true, NULL);
 		goto out;
 	}
 
@@ -310,6 +310,7 @@ void rxrpc_process_call(struct work_struct *work)
 {
 	struct rxrpc_call *call =
 		container_of(work, struct rxrpc_call, processor);
+	rxrpc_serial_t *send_ack;
 	unsigned long now, next, t;
 
 	rxrpc_see_call(call);
@@ -358,6 +359,13 @@ recheck_state:
 		set_bit(RXRPC_CALL_EV_ACK, &call->events);
 	}
 
+	t = READ_ONCE(call->ack_lost_at);
+	if (time_after_eq(now, t)) {
+		trace_rxrpc_timer(call, rxrpc_timer_exp_lost_ack, now);
+		cmpxchg(&call->ack_lost_at, t, now + MAX_JIFFY_OFFSET);
+		set_bit(RXRPC_CALL_EV_ACK_LOST, &call->events);
+	}
+
 	t = READ_ONCE(call->ping_at);
 	if (time_after_eq(now, t)) {
 		trace_rxrpc_timer(call, rxrpc_timer_exp_ping, now);
@@ -379,15 +387,24 @@ recheck_state:
 		goto recheck_state;
 	}
 
-	if (test_and_clear_bit(RXRPC_CALL_EV_ACK, &call->events)) {
+	send_ack = NULL;
+	if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events)) {
+		call->acks_lost_top = call->tx_top;
+		rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, 0, true, false,
+				  rxrpc_propose_ack_ping_for_lost_ack);
+		send_ack = &call->acks_lost_ping;
+	}
+
+	if (test_and_clear_bit(RXRPC_CALL_EV_ACK, &call->events) ||
+	    send_ack) {
 		if (call->ackr_reason) {
-			rxrpc_send_ack_packet(call, false);
+			rxrpc_send_ack_packet(call, false, send_ack);
 			goto recheck_state;
 		}
 	}
 
 	if (test_and_clear_bit(RXRPC_CALL_EV_PING, &call->events)) {
-		rxrpc_send_ack_packet(call, true);
+		rxrpc_send_ack_packet(call, true, NULL);
 		goto recheck_state;
 	}
 
@@ -404,6 +421,7 @@ recheck_state:
 	set(call->expect_req_by);
 	set(call->expect_term_by);
 	set(call->ack_at);
+	set(call->ack_lost_at);
 	set(call->resend_at);
 	set(call->ping_at);
 
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index b305970a9b63..7ee3d6ce5aa2 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -197,6 +197,7 @@ static void rxrpc_start_call_timer(struct rxrpc_call *call)
 	unsigned long j = now + MAX_JIFFY_OFFSET;
 
 	call->ack_at = j;
+	call->ack_lost_at = j;
 	call->resend_at = j;
 	call->ping_at = j;
 	call->expect_rx_by = j;
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index c89647eae86d..23a5e61d8f79 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -630,6 +630,43 @@ found:
 			   orig_serial, ack_serial, sent_at, resp_time);
 }
 
+/*
+ * Process the response to a ping that we sent to find out if we lost an ACK.
+ *
+ * If we got back a ping response that indicates a lower tx_top than what we
+ * had at the time of the ping transmission, we adjudge all the DATA packets
+ * sent between the response tx_top and the ping-time tx_top to have been lost.
+ */
+static void rxrpc_input_check_for_lost_ack(struct rxrpc_call *call)
+{
+	rxrpc_seq_t top, bottom, seq;
+	bool resend = false;
+
+	spin_lock_bh(&call->lock);
+
+	bottom = call->tx_hard_ack + 1;
+	top = call->acks_lost_top;
+	if (before(bottom, top)) {
+		for (seq = bottom; before_eq(seq, top); seq++) {
+			int ix = seq & RXRPC_RXTX_BUFF_MASK;
+			u8 annotation = call->rxtx_annotations[ix];
+			u8 anno_type = annotation & RXRPC_TX_ANNO_MASK;
+
+			if (anno_type != RXRPC_TX_ANNO_UNACK)
+				continue;
+			annotation &= ~RXRPC_TX_ANNO_MASK;
+			annotation |= RXRPC_TX_ANNO_RETRANS;
+			call->rxtx_annotations[ix] = annotation;
+			resend = true;
+		}
+	}
+
+	spin_unlock_bh(&call->lock);
+
+	if (resend && !test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events))
+		rxrpc_queue_call(call);
+}
+
 /*
  * Process a ping response.
  */
@@ -645,6 +682,9 @@ static void rxrpc_input_ping_response(struct rxrpc_call *call,
 	smp_rmb();
 	ping_serial = call->ping_serial;
 
+	if (orig_serial == call->acks_lost_ping)
+		rxrpc_input_check_for_lost_ack(call);
+
 	if (!test_bit(RXRPC_CALL_PINGING, &call->flags) ||
 	    before(orig_serial, ping_serial))
 		return;
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index f47659c7b224..efe06edce189 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -95,7 +95,8 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn,
 /*
  * Send an ACK call packet.
  */
-int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping)
+int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping,
+			  rxrpc_serial_t *_serial)
 {
 	struct rxrpc_connection *conn = NULL;
 	struct rxrpc_ack_buffer *pkt;
@@ -165,6 +166,8 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping)
 			   ntohl(pkt->ack.firstPacket),
 			   ntohl(pkt->ack.serial),
 			   pkt->ack.reason, pkt->ack.nAcks);
+	if (_serial)
+		*_serial = serial;
 
 	if (ping) {
 		call->ping_serial = serial;
@@ -323,7 +326,8 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb,
 	 * ACKs if a DATA packet appears to have been lost.
 	 */
 	if (!(sp->hdr.flags & RXRPC_LAST_PACKET) &&
-	    (retrans ||
+	    (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events) ||
+	     retrans ||
 	     call->cong_mode == RXRPC_CALL_SLOW_START ||
 	     (call->peer->rtt_usage < 3 && sp->hdr.seq & 1) ||
 	     ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000),
@@ -370,6 +374,18 @@ done:
 		if (whdr.flags & RXRPC_REQUEST_ACK) {
 			call->peer->rtt_last_req = now;
 			trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_data, serial);
+			if (call->peer->rtt_usage > 1) {
+				unsigned long nowj = jiffies, ack_lost_at;
+
+				ack_lost_at = nsecs_to_jiffies(2 * call->peer->rtt);
+				if (ack_lost_at < 1)
+					ack_lost_at = 1;
+
+				ack_lost_at += nowj;
+				WRITE_ONCE(call->ack_lost_at, ack_lost_at);
+				rxrpc_reduce_call_timer(call, ack_lost_at, nowj,
+							rxrpc_timer_set_for_lost_ack);
+			}
 		}
 	}
 	_leave(" = %d [%u]", ret, call->peer->maxdata);
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index fad5f42a3abd..cc21e8db25b0 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -148,7 +148,7 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial)
 	if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) {
 		rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, serial, true, false,
 				  rxrpc_propose_ack_terminal_ack);
-		rxrpc_send_ack_packet(call, false);
+		rxrpc_send_ack_packet(call, false, NULL);
 	}
 #endif
 
@@ -222,7 +222,7 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call)
 					  true, true,
 					  rxrpc_propose_ack_rotate_rx);
 		if (call->ackr_reason && call->ackr_reason != RXRPC_ACK_DELAY)
-			rxrpc_send_ack_packet(call, false);
+			rxrpc_send_ack_packet(call, false, NULL);
 	}
 }
 
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index c56ee54fdd1f..a1c53ac066a1 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -285,7 +285,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
 	do {
 		/* Check to see if there's a ping ACK to reply to. */
 		if (call->ackr_reason == RXRPC_ACK_PING_RESPONSE)
-			rxrpc_send_ack_packet(call, false);
+			rxrpc_send_ack_packet(call, false, NULL);
 
 		if (!skb) {
 			size_t size, chunk, max, space;
-- 
cgit v1.2.3


From 415f44e43282a16ec0808c7ccfd401762e587437 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 24 Nov 2017 10:18:42 +0000
Subject: rxrpc: Add keepalive for a call

We need to transmit a packet every so often to act as a keepalive for the
peer (which has a timeout from the last time it received a packet) and also
to prevent any intervening firewalls from closing the route.

Do this by resetting a timer every time we transmit a packet.  If the timer
ever expires, we transmit a PING ACK packet and thereby also elicit a PING
RESPONSE ACK from the other side - which prevents our last-rx timeout from
expiring.

The timer is set to 1/6 of the last-rx timeout so that we can detect the
other side going away if it misses 6 replies in a row.

This is particularly necessary for servers where the processing of the
service function may take a significant amount of time.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 include/trace/events/rxrpc.h |  6 ++++++
 net/rxrpc/ar-internal.h      |  1 +
 net/rxrpc/call_event.c       | 10 ++++++++++
 net/rxrpc/output.c           | 23 +++++++++++++++++++++++
 4 files changed, 40 insertions(+)

(limited to 'include')

diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
index 84ade8b76a19..e98fed6de497 100644
--- a/include/trace/events/rxrpc.h
+++ b/include/trace/events/rxrpc.h
@@ -141,6 +141,7 @@ enum rxrpc_timer_trace {
 	rxrpc_timer_exp_ack,
 	rxrpc_timer_exp_hard,
 	rxrpc_timer_exp_idle,
+	rxrpc_timer_exp_keepalive,
 	rxrpc_timer_exp_lost_ack,
 	rxrpc_timer_exp_normal,
 	rxrpc_timer_exp_ping,
@@ -152,6 +153,7 @@ enum rxrpc_timer_trace {
 	rxrpc_timer_set_for_ack,
 	rxrpc_timer_set_for_hard,
 	rxrpc_timer_set_for_idle,
+	rxrpc_timer_set_for_keepalive,
 	rxrpc_timer_set_for_lost_ack,
 	rxrpc_timer_set_for_normal,
 	rxrpc_timer_set_for_ping,
@@ -162,6 +164,7 @@ enum rxrpc_timer_trace {
 enum rxrpc_propose_ack_trace {
 	rxrpc_propose_ack_client_tx_end,
 	rxrpc_propose_ack_input_data,
+	rxrpc_propose_ack_ping_for_keepalive,
 	rxrpc_propose_ack_ping_for_lost_ack,
 	rxrpc_propose_ack_ping_for_lost_reply,
 	rxrpc_propose_ack_ping_for_params,
@@ -311,6 +314,7 @@ enum rxrpc_congest_change {
 	EM(rxrpc_timer_exp_ack,			"ExpAck") \
 	EM(rxrpc_timer_exp_hard,		"ExpHrd") \
 	EM(rxrpc_timer_exp_idle,		"ExpIdl") \
+	EM(rxrpc_timer_exp_keepalive,		"ExpKA ") \
 	EM(rxrpc_timer_exp_lost_ack,		"ExpLoA") \
 	EM(rxrpc_timer_exp_normal,		"ExpNml") \
 	EM(rxrpc_timer_exp_ping,		"ExpPng") \
@@ -321,6 +325,7 @@ enum rxrpc_congest_change {
 	EM(rxrpc_timer_set_for_ack,		"SetAck") \
 	EM(rxrpc_timer_set_for_hard,		"SetHrd") \
 	EM(rxrpc_timer_set_for_idle,		"SetIdl") \
+	EM(rxrpc_timer_set_for_keepalive,	"KeepAl") \
 	EM(rxrpc_timer_set_for_lost_ack,	"SetLoA") \
 	EM(rxrpc_timer_set_for_normal,		"SetNml") \
 	EM(rxrpc_timer_set_for_ping,		"SetPng") \
@@ -330,6 +335,7 @@ enum rxrpc_congest_change {
 #define rxrpc_propose_ack_traces \
 	EM(rxrpc_propose_ack_client_tx_end,	"ClTxEnd") \
 	EM(rxrpc_propose_ack_input_data,	"DataIn ") \
+	EM(rxrpc_propose_ack_ping_for_keepalive, "KeepAlv") \
 	EM(rxrpc_propose_ack_ping_for_lost_ack,	"LostAck") \
 	EM(rxrpc_propose_ack_ping_for_lost_reply, "LostRpl") \
 	EM(rxrpc_propose_ack_ping_for_params,	"Params ") \
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 7e7b817c69f0..cdcbc798f921 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -519,6 +519,7 @@ struct rxrpc_call {
 	unsigned long		ack_lost_at;	/* When ACK is figured as lost */
 	unsigned long		resend_at;	/* When next resend needs to happen */
 	unsigned long		ping_at;	/* When next to send a ping */
+	unsigned long		keepalive_at;	/* When next to send a keepalive ping */
 	unsigned long		expect_rx_by;	/* When we expect to get a packet by */
 	unsigned long		expect_req_by;	/* When we expect to get a request DATA packet by */
 	unsigned long		expect_term_by;	/* When we expect call termination by */
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index c65666b2f39e..bda952ffe6a6 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -366,6 +366,15 @@ recheck_state:
 		set_bit(RXRPC_CALL_EV_ACK_LOST, &call->events);
 	}
 
+	t = READ_ONCE(call->keepalive_at);
+	if (time_after_eq(now, t)) {
+		trace_rxrpc_timer(call, rxrpc_timer_exp_keepalive, now);
+		cmpxchg(&call->keepalive_at, t, now + MAX_JIFFY_OFFSET);
+		rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, 0, true, true,
+				  rxrpc_propose_ack_ping_for_keepalive);
+		set_bit(RXRPC_CALL_EV_PING, &call->events);
+	}
+
 	t = READ_ONCE(call->ping_at);
 	if (time_after_eq(now, t)) {
 		trace_rxrpc_timer(call, rxrpc_timer_exp_ping, now);
@@ -423,6 +432,7 @@ recheck_state:
 	set(call->ack_at);
 	set(call->ack_lost_at);
 	set(call->resend_at);
+	set(call->keepalive_at);
 	set(call->ping_at);
 
 	now = jiffies;
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index efe06edce189..42410e910aff 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -32,6 +32,24 @@ struct rxrpc_abort_buffer {
 	__be32 abort_code;
 };
 
+/*
+ * Arrange for a keepalive ping a certain time after we last transmitted.  This
+ * lets the far side know we're still interested in this call and helps keep
+ * the route through any intervening firewall open.
+ *
+ * Receiving a response to the ping will prevent the ->expect_rx_by timer from
+ * expiring.
+ */
+static void rxrpc_set_keepalive(struct rxrpc_call *call)
+{
+	unsigned long now = jiffies, keepalive_at = call->next_rx_timo / 6;
+
+	keepalive_at += now;
+	WRITE_ONCE(call->keepalive_at, keepalive_at);
+	rxrpc_reduce_call_timer(call, keepalive_at, now,
+				rxrpc_timer_set_for_keepalive);
+}
+
 /*
  * Fill out an ACK packet.
  */
@@ -205,6 +223,8 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping,
 				call->ackr_seen = top;
 			spin_unlock_bh(&call->lock);
 		}
+
+		rxrpc_set_keepalive(call);
 	}
 
 out:
@@ -388,6 +408,9 @@ done:
 			}
 		}
 	}
+
+	rxrpc_set_keepalive(call);
+
 	_leave(" = %d [%u]", ret, call->peer->maxdata);
 	return ret;
 
-- 
cgit v1.2.3


From f859ab61875978eeaa539740ff7f7d91f5d60006 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 24 Nov 2017 10:18:42 +0000
Subject: rxrpc: Fix service endpoint expiry

RxRPC service endpoints expire like they're supposed to by the following
means:

 (1) Mark dead rxrpc_net structs (with ->live) rather than twiddling the
     global service conn timeout, otherwise the first rxrpc_net struct to
     die will cause connections on all others to expire immediately from
     then on.

 (2) Mark local service endpoints for which the socket has been closed
     (->service_closed) so that the expiration timeout can be much
     shortened for service and client connections going through that
     endpoint.

 (3) rxrpc_put_service_conn() needs to schedule the reaper when the usage
     count reaches 1, not 0, as idle conns have a 1 count.

 (4) The accumulator for the earliest time we might want to schedule for
     should be initialised to jiffies + MAX_JIFFY_OFFSET, not ULONG_MAX as
     the comparison functions use signed arithmetic.

 (5) Simplify the expiration handling, adding the expiration value to the
     idle timestamp each time rather than keeping track of the time in the
     past before which the idle timestamp must go to be expired.  This is
     much easier to read.

 (6) Ignore the timeouts if the net namespace is dead.

 (7) Restart the service reaper work item rather the client reaper.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 include/trace/events/rxrpc.h |  2 ++
 net/rxrpc/af_rxrpc.c         | 13 +++++++++++++
 net/rxrpc/ar-internal.h      |  3 +++
 net/rxrpc/conn_client.c      |  2 ++
 net/rxrpc/conn_object.c      | 42 ++++++++++++++++++++++++------------------
 net/rxrpc/net_ns.c           |  3 +++
 6 files changed, 47 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
index e98fed6de497..36cb50c111a6 100644
--- a/include/trace/events/rxrpc.h
+++ b/include/trace/events/rxrpc.h
@@ -49,6 +49,7 @@ enum rxrpc_conn_trace {
 	rxrpc_conn_put_client,
 	rxrpc_conn_put_service,
 	rxrpc_conn_queued,
+	rxrpc_conn_reap_service,
 	rxrpc_conn_seen,
 };
 
@@ -221,6 +222,7 @@ enum rxrpc_congest_change {
 	EM(rxrpc_conn_put_client,		"PTc") \
 	EM(rxrpc_conn_put_service,		"PTs") \
 	EM(rxrpc_conn_queued,			"QUE") \
+	EM(rxrpc_conn_reap_service,		"RPs") \
 	E_(rxrpc_conn_seen,			"SEE")
 
 #define rxrpc_client_traces \
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index c0cdcf980ffc..abb524c2b8f8 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -867,6 +867,19 @@ static int rxrpc_release_sock(struct sock *sk)
 	sock_orphan(sk);
 	sk->sk_shutdown = SHUTDOWN_MASK;
 
+	/* We want to kill off all connections from a service socket
+	 * as fast as possible because we can't share these; client
+	 * sockets, on the other hand, can share an endpoint.
+	 */
+	switch (sk->sk_state) {
+	case RXRPC_SERVER_BOUND:
+	case RXRPC_SERVER_BOUND2:
+	case RXRPC_SERVER_LISTENING:
+	case RXRPC_SERVER_LISTEN_DISABLED:
+		rx->local->service_closed = true;
+		break;
+	}
+
 	spin_lock_bh(&sk->sk_receive_queue.lock);
 	sk->sk_state = RXRPC_CLOSE;
 	spin_unlock_bh(&sk->sk_receive_queue.lock);
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index cdcbc798f921..a0082c407005 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -84,6 +84,7 @@ struct rxrpc_net {
 	unsigned int		nr_client_conns;
 	unsigned int		nr_active_client_conns;
 	bool			kill_all_client_conns;
+	bool			live;
 	spinlock_t		client_conn_cache_lock; /* Lock for ->*_client_conns */
 	spinlock_t		client_conn_discard_lock; /* Prevent multiple discarders */
 	struct list_head	waiting_client_conns;
@@ -265,6 +266,7 @@ struct rxrpc_local {
 	rwlock_t		services_lock;	/* lock for services list */
 	int			debug_id;	/* debug ID for printks */
 	bool			dead;
+	bool			service_closed;	/* Service socket closed */
 	struct sockaddr_rxrpc	srx;		/* local address */
 };
 
@@ -881,6 +883,7 @@ void rxrpc_process_connection(struct work_struct *);
  * conn_object.c
  */
 extern unsigned int rxrpc_connection_expiry;
+extern unsigned int rxrpc_closed_conn_expiry;
 
 struct rxrpc_connection *rxrpc_alloc_connection(gfp_t);
 struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *,
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index 97f6a8de4845..785dfdb9fef1 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -1079,6 +1079,8 @@ next:
 		expiry = rxrpc_conn_idle_client_expiry;
 		if (nr_conns > rxrpc_reap_client_connections)
 			expiry = rxrpc_conn_idle_client_fast_expiry;
+		if (conn->params.local->service_closed)
+			expiry = rxrpc_closed_conn_expiry * HZ;
 
 		conn_expires_at = conn->idle_timestamp + expiry;
 
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index a01a3791f06a..726eda6140ae 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -20,7 +20,8 @@
 /*
  * Time till a connection expires after last use (in seconds).
  */
-unsigned int rxrpc_connection_expiry = 10 * 60;
+unsigned int __read_mostly rxrpc_connection_expiry = 10 * 60;
+unsigned int __read_mostly rxrpc_closed_conn_expiry = 10;
 
 static void rxrpc_destroy_connection(struct rcu_head *);
 
@@ -321,7 +322,7 @@ void rxrpc_put_service_conn(struct rxrpc_connection *conn)
 	n = atomic_dec_return(&conn->usage);
 	trace_rxrpc_conn(conn, rxrpc_conn_put_service, n, here);
 	ASSERTCMP(n, >=, 0);
-	if (n == 0) {
+	if (n == 1) {
 		rxnet = conn->params.local->rxnet;
 		rxrpc_queue_delayed_work(&rxnet->service_conn_reaper, 0);
 	}
@@ -363,15 +364,14 @@ void rxrpc_service_connection_reaper(struct work_struct *work)
 	struct rxrpc_net *rxnet =
 		container_of(to_delayed_work(work),
 			     struct rxrpc_net, service_conn_reaper);
-	unsigned long reap_older_than, earliest, idle_timestamp, now;
+	unsigned long expire_at, earliest, idle_timestamp, now;
 
 	LIST_HEAD(graveyard);
 
 	_enter("");
 
 	now = jiffies;
-	reap_older_than = now - rxrpc_connection_expiry * HZ;
-	earliest = ULONG_MAX;
+	earliest = now + MAX_JIFFY_OFFSET;
 
 	write_lock(&rxnet->conn_lock);
 	list_for_each_entry_safe(conn, _p, &rxnet->service_conns, link) {
@@ -381,15 +381,21 @@ void rxrpc_service_connection_reaper(struct work_struct *work)
 		if (conn->state == RXRPC_CONN_SERVICE_PREALLOC)
 			continue;
 
-		idle_timestamp = READ_ONCE(conn->idle_timestamp);
-		_debug("reap CONN %d { u=%d,t=%ld }",
-		       conn->debug_id, atomic_read(&conn->usage),
-		       (long)reap_older_than - (long)idle_timestamp);
-
-		if (time_after(idle_timestamp, reap_older_than)) {
-			if (time_before(idle_timestamp, earliest))
-				earliest = idle_timestamp;
-			continue;
+		if (rxnet->live) {
+			idle_timestamp = READ_ONCE(conn->idle_timestamp);
+			expire_at = idle_timestamp + rxrpc_connection_expiry * HZ;
+			if (conn->params.local->service_closed)
+				expire_at = idle_timestamp + rxrpc_closed_conn_expiry * HZ;
+
+			_debug("reap CONN %d { u=%d,t=%ld }",
+			       conn->debug_id, atomic_read(&conn->usage),
+			       (long)expire_at - (long)now);
+
+			if (time_before(now, expire_at)) {
+				if (time_before(expire_at, earliest))
+					earliest = expire_at;
+				continue;
+			}
 		}
 
 		/* The usage count sits at 1 whilst the object is unused on the
@@ -397,6 +403,7 @@ void rxrpc_service_connection_reaper(struct work_struct *work)
 		 */
 		if (atomic_cmpxchg(&conn->usage, 1, 0) != 1)
 			continue;
+		trace_rxrpc_conn(conn, rxrpc_conn_reap_service, 0, 0);
 
 		if (rxrpc_conn_is_client(conn))
 			BUG();
@@ -407,10 +414,10 @@ void rxrpc_service_connection_reaper(struct work_struct *work)
 	}
 	write_unlock(&rxnet->conn_lock);
 
-	if (earliest != ULONG_MAX) {
-		_debug("reschedule reaper %ld", (long) earliest - now);
+	if (earliest != now + MAX_JIFFY_OFFSET) {
+		_debug("reschedule reaper %ld", (long)earliest - (long)now);
 		ASSERT(time_after(earliest, now));
-		rxrpc_queue_delayed_work(&rxnet->client_conn_reaper,
+		rxrpc_queue_delayed_work(&rxnet->service_conn_reaper,
 					 earliest - now);
 	}
 
@@ -439,7 +446,6 @@ void rxrpc_destroy_all_connections(struct rxrpc_net *rxnet)
 
 	rxrpc_destroy_all_client_connections(rxnet);
 
-	rxrpc_connection_expiry = 0;
 	cancel_delayed_work(&rxnet->client_conn_reaper);
 	rxrpc_queue_delayed_work(&rxnet->client_conn_reaper, 0);
 	flush_workqueue(rxrpc_workqueue);
diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c
index 7edceb8522f5..684c51d600c7 100644
--- a/net/rxrpc/net_ns.c
+++ b/net/rxrpc/net_ns.c
@@ -22,6 +22,7 @@ static __net_init int rxrpc_init_net(struct net *net)
 	struct rxrpc_net *rxnet = rxrpc_net(net);
 	int ret;
 
+	rxnet->live = true;
 	get_random_bytes(&rxnet->epoch, sizeof(rxnet->epoch));
 	rxnet->epoch |= RXRPC_RANDOM_EPOCH;
 
@@ -60,6 +61,7 @@ static __net_init int rxrpc_init_net(struct net *net)
 	return 0;
 
 err_proc:
+	rxnet->live = false;
 	return ret;
 }
 
@@ -70,6 +72,7 @@ static __net_exit void rxrpc_exit_net(struct net *net)
 {
 	struct rxrpc_net *rxnet = rxrpc_net(net);
 
+	rxnet->live = false;
 	rxrpc_destroy_all_calls(rxnet);
 	rxrpc_destroy_all_connections(rxnet);
 	rxrpc_destroy_all_locals(rxnet);
-- 
cgit v1.2.3


From cf33c1ee5254c6a430bc1538232b49c3ea13e613 Mon Sep 17 00:00:00 2001
From: Huacai Chen <chenhc@lemote.com>
Date: Fri, 24 Nov 2017 15:14:25 -0800
Subject: bcache: Fix building error on MIPS

This patch try to fix the building error on MIPS. The reason is MIPS
has already defined the PTR macro, which conflicts with the PTR macro
in include/uapi/linux/bcache.h.

[fixed by mlyle: corrected a line-length issue]

Cc: stable@vger.kernel.org
Signed-off-by: Huacai Chen <chenhc@lemote.com>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/alloc.c   | 2 +-
 drivers/md/bcache/extents.c | 2 +-
 drivers/md/bcache/journal.c | 2 +-
 include/uapi/linux/bcache.h | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index a27d85232ce1..a0cc1bc6d884 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -490,7 +490,7 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
 		if (b == -1)
 			goto err;
 
-		k->ptr[i] = PTR(ca->buckets[b].gen,
+		k->ptr[i] = MAKE_PTR(ca->buckets[b].gen,
 				bucket_to_sector(c, b),
 				ca->sb.nr_this_dev);
 
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
index 41c238fc3733..f9d391711595 100644
--- a/drivers/md/bcache/extents.c
+++ b/drivers/md/bcache/extents.c
@@ -585,7 +585,7 @@ static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey
 		return false;
 
 	for (i = 0; i < KEY_PTRS(l); i++)
-		if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
+		if (l->ptr[i] + MAKE_PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
 		    PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
 			return false;
 
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 5018c56ebb67..a87165c1d8e5 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -512,7 +512,7 @@ static void journal_reclaim(struct cache_set *c)
 			continue;
 
 		ja->cur_idx = next;
-		k->ptr[n++] = PTR(0,
+		k->ptr[n++] = MAKE_PTR(0,
 				  bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
 				  ca->sb.nr_this_dev);
 	}
diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h
index 90fc490f973f..821f71a2e48f 100644
--- a/include/uapi/linux/bcache.h
+++ b/include/uapi/linux/bcache.h
@@ -91,7 +91,7 @@ PTR_FIELD(PTR_GEN,			0,  8)
 
 #define PTR_CHECK_DEV			((1 << PTR_DEV_BITS) - 1)
 
-#define PTR(gen, offset, dev)						\
+#define MAKE_PTR(gen, offset, dev)					\
 	((((__u64) dev) << 51) | ((__u64) offset) << 8 | gen)
 
 /* Bkey utility code */
-- 
cgit v1.2.3


From 7bbefcfac1936c8d9082a828b09f42a3839cb06e Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Fri, 24 Nov 2017 12:08:40 -0800
Subject: uapi: add SPDX identifier to vm_sockets_diag.h

New file seems to have missed the SPDX license scan and update.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/vm_sockets_diag.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/vm_sockets_diag.h b/include/uapi/linux/vm_sockets_diag.h
index 14cd7dc5a187..0b4dd54f3d1e 100644
--- a/include/uapi/linux/vm_sockets_diag.h
+++ b/include/uapi/linux/vm_sockets_diag.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /* AF_VSOCK sock_diag(7) interface for querying open sockets */
 
 #ifndef _UAPI__VM_SOCKETS_DIAG_H__
-- 
cgit v1.2.3


From b4d085201d86af69cbda2214c6dafc0be240ef9f Mon Sep 17 00:00:00 2001
From: "Dmitry V. Levin" <ldv@altlinux.org>
Date: Mon, 13 Nov 2017 03:35:27 +0300
Subject: uapi: fix linux/kfd_ioctl.h userspace compilation errors

Consistently use types provided by <linux/types.h> via <drm/drm.h>
to fix the following linux/kfd_ioctl.h userspace compilation errors:

/usr/include/linux/kfd_ioctl.h:236:2: error: unknown type name 'uint64_t'
  uint64_t va_addr; /* to KFD */
/usr/include/linux/kfd_ioctl.h:237:2: error: unknown type name 'uint32_t'
  uint32_t gpu_id; /* to KFD */
/usr/include/linux/kfd_ioctl.h:238:2: error: unknown type name 'uint32_t'
  uint32_t pad;
/usr/include/linux/kfd_ioctl.h:243:2: error: unknown type name 'uint64_t'
  uint64_t tile_config_ptr;
/usr/include/linux/kfd_ioctl.h:245:2: error: unknown type name 'uint64_t'
  uint64_t macro_tile_config_ptr;
/usr/include/linux/kfd_ioctl.h:249:2: error: unknown type name 'uint32_t'
  uint32_t num_tile_configs;
/usr/include/linux/kfd_ioctl.h:253:2: error: unknown type name 'uint32_t'
  uint32_t num_macro_tile_configs;
/usr/include/linux/kfd_ioctl.h:255:2: error: unknown type name 'uint32_t'
  uint32_t gpu_id;  /* to KFD */
/usr/include/linux/kfd_ioctl.h:256:2: error: unknown type name 'uint32_t'
  uint32_t gb_addr_config; /* from KFD */
/usr/include/linux/kfd_ioctl.h:257:2: error: unknown type name 'uint32_t'
  uint32_t num_banks;  /* from KFD */
/usr/include/linux/kfd_ioctl.h:258:2: error: unknown type name 'uint32_t'
  uint32_t num_ranks;  /* from KFD */

Fixes: 6a1c9510694fe ("drm/amdkfd: Adding new IOCTL for scratch memory v2")
Fixes: 5d71dbc3a5886 ("drm/amdkfd: Implement image tiling mode support v2")
Signed-off-by: Dmitry V. Levin <ldv@altlinux.org>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 include/uapi/linux/kfd_ioctl.h | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 731d0df722e3..6e80501368ae 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -233,29 +233,29 @@ struct kfd_ioctl_wait_events_args {
 };
 
 struct kfd_ioctl_set_scratch_backing_va_args {
-	uint64_t va_addr;	/* to KFD */
-	uint32_t gpu_id;	/* to KFD */
-	uint32_t pad;
+	__u64 va_addr;	/* to KFD */
+	__u32 gpu_id;	/* to KFD */
+	__u32 pad;
 };
 
 struct kfd_ioctl_get_tile_config_args {
 	/* to KFD: pointer to tile array */
-	uint64_t tile_config_ptr;
+	__u64 tile_config_ptr;
 	/* to KFD: pointer to macro tile array */
-	uint64_t macro_tile_config_ptr;
+	__u64 macro_tile_config_ptr;
 	/* to KFD: array size allocated by user mode
 	 * from KFD: array size filled by kernel
 	 */
-	uint32_t num_tile_configs;
+	__u32 num_tile_configs;
 	/* to KFD: array size allocated by user mode
 	 * from KFD: array size filled by kernel
 	 */
-	uint32_t num_macro_tile_configs;
+	__u32 num_macro_tile_configs;
 
-	uint32_t gpu_id;		/* to KFD */
-	uint32_t gb_addr_config;	/* from KFD */
-	uint32_t num_banks;		/* from KFD */
-	uint32_t num_ranks;		/* from KFD */
+	__u32 gpu_id;		/* to KFD */
+	__u32 gb_addr_config;	/* from KFD */
+	__u32 num_banks;		/* from KFD */
+	__u32 num_ranks;		/* from KFD */
 	/* struct size can be extended later if needed
 	 * without breaking ABI compatibility
 	 */
-- 
cgit v1.2.3


From 7b6ddeaf27eca72795ceeae2f0f347db1b5f9a30 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 21 Nov 2017 14:46:08 +0100
Subject: mac80211: use QoS NDP for AP probing

When connected to a QoS/WMM AP, mac80211 should use a QoS NDP
for probing it, instead of a regular non-QoS one, fix this.

Change all the drivers to *not* allow QoS NDP for now, even
though it looks like most of them should be OK with that.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath9k/channel.c |  2 +-
 drivers/net/wireless/st/cw1200/sta.c     |  4 ++--
 drivers/net/wireless/ti/wl1251/main.c    |  2 +-
 drivers/net/wireless/ti/wlcore/cmd.c     |  5 +++--
 include/net/mac80211.h                   |  8 +++++++-
 net/mac80211/mlme.c                      |  2 +-
 net/mac80211/tx.c                        | 29 +++++++++++++++++++++++++++--
 7 files changed, 42 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/ath/ath9k/channel.c b/drivers/net/wireless/ath/ath9k/channel.c
index dfb26f03c1a2..1b05b5d7a038 100644
--- a/drivers/net/wireless/ath/ath9k/channel.c
+++ b/drivers/net/wireless/ath/ath9k/channel.c
@@ -1113,7 +1113,7 @@ ath_chanctx_send_vif_ps_frame(struct ath_softc *sc, struct ath_vif *avp,
 		if (!avp->assoc)
 			return false;
 
-		skb = ieee80211_nullfunc_get(sc->hw, vif);
+		skb = ieee80211_nullfunc_get(sc->hw, vif, false);
 		if (!skb)
 			return false;
 
diff --git a/drivers/net/wireless/st/cw1200/sta.c b/drivers/net/wireless/st/cw1200/sta.c
index 03687a80d6e9..38678e9a0562 100644
--- a/drivers/net/wireless/st/cw1200/sta.c
+++ b/drivers/net/wireless/st/cw1200/sta.c
@@ -198,7 +198,7 @@ void __cw1200_cqm_bssloss_sm(struct cw1200_common *priv,
 
 		priv->bss_loss_state++;
 
-		skb = ieee80211_nullfunc_get(priv->hw, priv->vif);
+		skb = ieee80211_nullfunc_get(priv->hw, priv->vif, false);
 		WARN_ON(!skb);
 		if (skb)
 			cw1200_tx(priv->hw, NULL, skb);
@@ -2265,7 +2265,7 @@ static int cw1200_upload_null(struct cw1200_common *priv)
 		.rate = 0xFF,
 	};
 
-	frame.skb = ieee80211_nullfunc_get(priv->hw, priv->vif);
+	frame.skb = ieee80211_nullfunc_get(priv->hw, priv->vif, false);
 	if (!frame.skb)
 		return -ENOMEM;
 
diff --git a/drivers/net/wireless/ti/wl1251/main.c b/drivers/net/wireless/ti/wl1251/main.c
index 9915d83a4a30..6d02c660b4ab 100644
--- a/drivers/net/wireless/ti/wl1251/main.c
+++ b/drivers/net/wireless/ti/wl1251/main.c
@@ -566,7 +566,7 @@ static int wl1251_build_null_data(struct wl1251 *wl)
 		size = sizeof(struct wl12xx_null_data_template);
 		ptr = NULL;
 	} else {
-		skb = ieee80211_nullfunc_get(wl->hw, wl->vif);
+		skb = ieee80211_nullfunc_get(wl->hw, wl->vif, false);
 		if (!skb)
 			goto out;
 		size = skb->len;
diff --git a/drivers/net/wireless/ti/wlcore/cmd.c b/drivers/net/wireless/ti/wlcore/cmd.c
index 2bfc12fdc929..761cf8573a80 100644
--- a/drivers/net/wireless/ti/wlcore/cmd.c
+++ b/drivers/net/wireless/ti/wlcore/cmd.c
@@ -1069,7 +1069,8 @@ int wl12xx_cmd_build_null_data(struct wl1271 *wl, struct wl12xx_vif *wlvif)
 		ptr = NULL;
 	} else {
 		skb = ieee80211_nullfunc_get(wl->hw,
-					     wl12xx_wlvif_to_vif(wlvif));
+					     wl12xx_wlvif_to_vif(wlvif),
+					     false);
 		if (!skb)
 			goto out;
 		size = skb->len;
@@ -1096,7 +1097,7 @@ int wl12xx_cmd_build_klv_null_data(struct wl1271 *wl,
 	struct sk_buff *skb = NULL;
 	int ret = -ENOMEM;
 
-	skb = ieee80211_nullfunc_get(wl->hw, vif);
+	skb = ieee80211_nullfunc_get(wl->hw, vif, false);
 	if (!skb)
 		goto out;
 
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index cc9073e45be9..eec143cca1c0 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -4470,18 +4470,24 @@ struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw,
  * ieee80211_nullfunc_get - retrieve a nullfunc template
  * @hw: pointer obtained from ieee80211_alloc_hw().
  * @vif: &struct ieee80211_vif pointer from the add_interface callback.
+ * @qos_ok: QoS NDP is acceptable to the caller, this should be set
+ *	if at all possible
  *
  * Creates a Nullfunc template which can, for example, uploaded to
  * hardware. The template must be updated after association so that correct
  * BSSID and address is used.
  *
+ * If @qos_ndp is set and the association is to an AP with QoS/WMM, the
+ * returned packet will be QoS NDP.
+ *
  * Note: Caller (or hardware) is responsible for setting the
  * &IEEE80211_FCTL_PM bit as well as Duration and Sequence Control fields.
  *
  * Return: The nullfunc template. %NULL on error.
  */
 struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw,
-				       struct ieee80211_vif *vif);
+				       struct ieee80211_vif *vif,
+				       bool qos_ok);
 
 /**
  * ieee80211_probereq_get - retrieve a Probe Request template
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 04460440d731..c244691deab9 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -895,7 +895,7 @@ void ieee80211_send_nullfunc(struct ieee80211_local *local,
 	struct ieee80211_hdr_3addr *nullfunc;
 	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
 
-	skb = ieee80211_nullfunc_get(&local->hw, &sdata->vif);
+	skb = ieee80211_nullfunc_get(&local->hw, &sdata->vif, true);
 	if (!skb)
 		return;
 
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 7b8154474b9e..3160954fc406 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -4438,13 +4438,15 @@ struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw,
 EXPORT_SYMBOL(ieee80211_pspoll_get);
 
 struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw,
-				       struct ieee80211_vif *vif)
+				       struct ieee80211_vif *vif,
+				       bool qos_ok)
 {
 	struct ieee80211_hdr_3addr *nullfunc;
 	struct ieee80211_sub_if_data *sdata;
 	struct ieee80211_if_managed *ifmgd;
 	struct ieee80211_local *local;
 	struct sk_buff *skb;
+	bool qos = false;
 
 	if (WARN_ON(vif->type != NL80211_IFTYPE_STATION))
 		return NULL;
@@ -4453,7 +4455,17 @@ struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw,
 	ifmgd = &sdata->u.mgd;
 	local = sdata->local;
 
-	skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*nullfunc));
+	if (qos_ok) {
+		struct sta_info *sta;
+
+		rcu_read_lock();
+		sta = sta_info_get(sdata, ifmgd->bssid);
+		qos = sta && sta->sta.wme;
+		rcu_read_unlock();
+	}
+
+	skb = dev_alloc_skb(local->hw.extra_tx_headroom +
+			    sizeof(*nullfunc) + 2);
 	if (!skb)
 		return NULL;
 
@@ -4463,6 +4475,19 @@ struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw,
 	nullfunc->frame_control = cpu_to_le16(IEEE80211_FTYPE_DATA |
 					      IEEE80211_STYPE_NULLFUNC |
 					      IEEE80211_FCTL_TODS);
+	if (qos) {
+		__le16 qos = cpu_to_le16(7);
+
+		BUILD_BUG_ON((IEEE80211_STYPE_QOS_NULLFUNC |
+			      IEEE80211_STYPE_NULLFUNC) !=
+			     IEEE80211_STYPE_QOS_NULLFUNC);
+		nullfunc->frame_control |=
+			cpu_to_le16(IEEE80211_STYPE_QOS_NULLFUNC);
+		skb->priority = 7;
+		skb_set_queue_mapping(skb, IEEE80211_AC_VO);
+		skb_put_data(skb, &qos, sizeof(qos));
+	}
+
 	memcpy(nullfunc->addr1, ifmgd->bssid, ETH_ALEN);
 	memcpy(nullfunc->addr2, vif->addr, ETH_ALEN);
 	memcpy(nullfunc->addr3, ifmgd->bssid, ETH_ALEN);
-- 
cgit v1.2.3


From 20b7035c66bacc909ae3ffe92c1a1ea7db99fe4f Mon Sep 17 00:00:00 2001
From: "Jan H. Schönherr" <jschoenh@amazon.de>
Date: Fri, 24 Nov 2017 22:39:01 +0100
Subject: KVM: Let KVM_SET_SIGNAL_MASK work as advertised
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

KVM API says for the signal mask you set via KVM_SET_SIGNAL_MASK, that
"any unblocked signal received [...] will cause KVM_RUN to return with
-EINTR" and that "the signal will only be delivered if not blocked by
the original signal mask".

This, however, is only true, when the calling task has a signal handler
registered for a signal. If not, signal evaluation is short-circuited for
SIG_IGN and SIG_DFL, and the signal is either ignored without KVM_RUN
returning or the whole process is terminated.

Make KVM_SET_SIGNAL_MASK behave as advertised by utilizing logic similar
to that in do_sigtimedwait() to avoid short-circuiting of signals.

Signed-off-by: Jan H. SchÃ¶nherr <jschoenh@amazon.de>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/mips.c       |  7 ++-----
 arch/powerpc/kvm/powerpc.c |  7 ++-----
 arch/s390/kvm/kvm-s390.c   |  7 ++-----
 arch/x86/kvm/x86.c         |  7 ++-----
 include/linux/kvm_host.h   |  3 +++
 virt/kvm/arm/arm.c         |  8 +++-----
 virt/kvm/kvm_main.c        | 23 +++++++++++++++++++++++
 7 files changed, 37 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index d535edc01434..75fdeaa8c62f 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -445,10 +445,8 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
 	int r = -EINTR;
-	sigset_t sigsaved;
 
-	if (vcpu->sigset_active)
-		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
+	kvm_sigset_activate(vcpu);
 
 	if (vcpu->mmio_needed) {
 		if (!vcpu->mmio_is_write)
@@ -480,8 +478,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 	local_irq_enable();
 
 out:
-	if (vcpu->sigset_active)
-		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+	kvm_sigset_deactivate(vcpu);
 
 	return r;
 }
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 6b6c53c42ac9..1915e86cef6f 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -1407,7 +1407,6 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
 	int r;
-	sigset_t sigsaved;
 
 	if (vcpu->mmio_needed) {
 		vcpu->mmio_needed = 0;
@@ -1448,16 +1447,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 #endif
 	}
 
-	if (vcpu->sigset_active)
-		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
+	kvm_sigset_activate(vcpu);
 
 	if (run->immediate_exit)
 		r = -EINTR;
 	else
 		r = kvmppc_vcpu_run(run, vcpu);
 
-	if (vcpu->sigset_active)
-		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+	kvm_sigset_deactivate(vcpu);
 
 	return r;
 }
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 98ad8b9e0360..9614aea5839b 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -3372,7 +3372,6 @@ static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	int rc;
-	sigset_t sigsaved;
 
 	if (kvm_run->immediate_exit)
 		return -EINTR;
@@ -3382,8 +3381,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		return 0;
 	}
 
-	if (vcpu->sigset_active)
-		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
+	kvm_sigset_activate(vcpu);
 
 	if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) {
 		kvm_s390_vcpu_start(vcpu);
@@ -3417,8 +3415,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	disable_cpu_timer_accounting(vcpu);
 	store_regs(vcpu, kvm_run);
 
-	if (vcpu->sigset_active)
-		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+	kvm_sigset_deactivate(vcpu);
 
 	vcpu->stat.exit_userspace++;
 	return rc;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f49fe514d1b2..eee8e7faf1af 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7267,12 +7267,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	struct fpu *fpu = &current->thread.fpu;
 	int r;
-	sigset_t sigsaved;
 
 	fpu__initialize(fpu);
 
-	if (vcpu->sigset_active)
-		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
+	kvm_sigset_activate(vcpu);
 
 	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
 		if (kvm_run->immediate_exit) {
@@ -7315,8 +7313,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 out:
 	post_kvm_run_save(vcpu);
-	if (vcpu->sigset_active)
-		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+	kvm_sigset_deactivate(vcpu);
 
 	return r;
 }
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 2e754b7c282c..893d6d606cd0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -715,6 +715,9 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
 			 unsigned long len);
 void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn);
 
+void kvm_sigset_activate(struct kvm_vcpu *vcpu);
+void kvm_sigset_deactivate(struct kvm_vcpu *vcpu);
+
 void kvm_vcpu_block(struct kvm_vcpu *vcpu);
 void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu);
 void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu);
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index a6524ff27de4..a67c106d73f5 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -615,7 +615,6 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu)
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
 	int ret;
-	sigset_t sigsaved;
 
 	if (unlikely(!kvm_vcpu_initialized(vcpu)))
 		return -ENOEXEC;
@@ -633,8 +632,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 	if (run->immediate_exit)
 		return -EINTR;
 
-	if (vcpu->sigset_active)
-		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
+	kvm_sigset_activate(vcpu);
 
 	ret = 1;
 	run->exit_reason = KVM_EXIT_UNKNOWN;
@@ -769,8 +767,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		kvm_pmu_update_run(vcpu);
 	}
 
-	if (vcpu->sigset_active)
-		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+	kvm_sigset_deactivate(vcpu);
+
 	return ret;
 }
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2dd1a9ca4599..c01cff064ec5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2065,6 +2065,29 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
 
+void kvm_sigset_activate(struct kvm_vcpu *vcpu)
+{
+	if (!vcpu->sigset_active)
+		return;
+
+	/*
+	 * This does a lockless modification of ->real_blocked, which is fine
+	 * because, only current can change ->real_blocked and all readers of
+	 * ->real_blocked don't care as long ->real_blocked is always a subset
+	 * of ->blocked.
+	 */
+	sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
+}
+
+void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
+{
+	if (!vcpu->sigset_active)
+		return;
+
+	sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
+	sigemptyset(&current->real_blocked);
+}
+
 static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
 {
 	unsigned int old, val, grow;
-- 
cgit v1.2.3


From d34971a65b72619508f49cd237283e92f1c329d5 Mon Sep 17 00:00:00 2001
From: Bhumika Goyal <bhumirks@gmail.com>
Date: Tue, 17 Oct 2017 18:14:23 +0200
Subject: sunrpc: make the function arg as const

Make the struct cache_detail *tmpl argument of the function
cache_create_net as const as it is only getting passed to kmemup having
the argument as const void *.
Add const to the prototype too.

Signed-off-by: Bhumika Goyal <bhumirks@gmail.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/cache.h | 2 +-
 net/sunrpc/cache.c           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index 270bad0e1bed..40d2822f0e2f 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -213,7 +213,7 @@ extern void __init cache_initialize(void);
 extern int cache_register_net(struct cache_detail *cd, struct net *net);
 extern void cache_unregister_net(struct cache_detail *cd, struct net *net);
 
-extern struct cache_detail *cache_create_net(struct cache_detail *tmpl, struct net *net);
+extern struct cache_detail *cache_create_net(const struct cache_detail *tmpl, struct net *net);
 extern void cache_destroy_net(struct cache_detail *cd, struct net *net);
 
 extern void sunrpc_init_cache_detail(struct cache_detail *cd);
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 79d55d949d9a..e68943895be4 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -1674,7 +1674,7 @@ void cache_unregister_net(struct cache_detail *cd, struct net *net)
 }
 EXPORT_SYMBOL_GPL(cache_unregister_net);
 
-struct cache_detail *cache_create_net(struct cache_detail *tmpl, struct net *net)
+struct cache_detail *cache_create_net(const struct cache_detail *tmpl, struct net *net)
 {
 	struct cache_detail *cd;
 	int i;
-- 
cgit v1.2.3


From 81cf4a45360f70528f1f64ba018d61cb5767249a Mon Sep 17 00:00:00 2001
From: Masakazu Mokuno <masakazu.mokuno@gmail.com>
Date: Fri, 10 Nov 2017 01:25:50 +0900
Subject: USB: core: Add type-specific length check of BOS descriptors

As most of BOS descriptors are longer in length than their header
'struct usb_dev_cap_header', comparing solely with it is not sufficient
to avoid out-of-bounds access to BOS descriptors.

This patch adds descriptor type specific length check in
usb_get_bos_descriptor() to fix the issue.

Signed-off-by: Masakazu Mokuno <masakazu.mokuno@gmail.com>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/config.c    | 28 ++++++++++++++++++++++++----
 include/uapi/linux/usb/ch9.h |  3 +++
 2 files changed, 27 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c
index da8acd980fc6..55b198ba629b 100644
--- a/drivers/usb/core/config.c
+++ b/drivers/usb/core/config.c
@@ -905,14 +905,25 @@ void usb_release_bos_descriptor(struct usb_device *dev)
 	}
 }
 
+static const __u8 bos_desc_len[256] = {
+	[USB_CAP_TYPE_WIRELESS_USB] = USB_DT_USB_WIRELESS_CAP_SIZE,
+	[USB_CAP_TYPE_EXT]          = USB_DT_USB_EXT_CAP_SIZE,
+	[USB_SS_CAP_TYPE]           = USB_DT_USB_SS_CAP_SIZE,
+	[USB_SSP_CAP_TYPE]          = USB_DT_USB_SSP_CAP_SIZE(1),
+	[CONTAINER_ID_TYPE]         = USB_DT_USB_SS_CONTN_ID_SIZE,
+	[USB_PTM_CAP_TYPE]          = USB_DT_USB_PTM_ID_SIZE,
+};
+
 /* Get BOS descriptor set */
 int usb_get_bos_descriptor(struct usb_device *dev)
 {
 	struct device *ddev = &dev->dev;
 	struct usb_bos_descriptor *bos;
 	struct usb_dev_cap_header *cap;
+	struct usb_ssp_cap_descriptor *ssp_cap;
 	unsigned char *buffer;
-	int length, total_len, num, i;
+	int length, total_len, num, i, ssac;
+	__u8 cap_type;
 	int ret;
 
 	bos = kzalloc(sizeof(struct usb_bos_descriptor), GFP_KERNEL);
@@ -965,7 +976,13 @@ int usb_get_bos_descriptor(struct usb_device *dev)
 			dev->bos->desc->bNumDeviceCaps = i;
 			break;
 		}
+		cap_type = cap->bDevCapabilityType;
 		length = cap->bLength;
+		if (bos_desc_len[cap_type] && length < bos_desc_len[cap_type]) {
+			dev->bos->desc->bNumDeviceCaps = i;
+			break;
+		}
+
 		total_len -= length;
 
 		if (cap->bDescriptorType != USB_DT_DEVICE_CAPABILITY) {
@@ -973,7 +990,7 @@ int usb_get_bos_descriptor(struct usb_device *dev)
 			continue;
 		}
 
-		switch (cap->bDevCapabilityType) {
+		switch (cap_type) {
 		case USB_CAP_TYPE_WIRELESS_USB:
 			/* Wireless USB cap descriptor is handled by wusb */
 			break;
@@ -986,8 +1003,11 @@ int usb_get_bos_descriptor(struct usb_device *dev)
 				(struct usb_ss_cap_descriptor *)buffer;
 			break;
 		case USB_SSP_CAP_TYPE:
-			dev->bos->ssp_cap =
-				(struct usb_ssp_cap_descriptor *)buffer;
+			ssp_cap = (struct usb_ssp_cap_descriptor *)buffer;
+			ssac = (le32_to_cpu(ssp_cap->bmAttributes) &
+				USB_SSP_SUBLINK_SPEED_ATTRIBS) + 1;
+			if (length >= USB_DT_USB_SSP_CAP_SIZE(ssac))
+				dev->bos->ssp_cap = ssp_cap;
 			break;
 		case CONTAINER_ID_TYPE:
 			dev->bos->ss_id =
diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h
index 41a0a81b01e6..c4c79aa331bd 100644
--- a/include/uapi/linux/usb/ch9.h
+++ b/include/uapi/linux/usb/ch9.h
@@ -880,6 +880,8 @@ struct usb_wireless_cap_descriptor {	/* Ultra Wide Band */
 	__u8  bReserved;
 } __attribute__((packed));
 
+#define USB_DT_USB_WIRELESS_CAP_SIZE	11
+
 /* USB 2.0 Extension descriptor */
 #define	USB_CAP_TYPE_EXT		2
 
@@ -1072,6 +1074,7 @@ struct usb_ptm_cap_descriptor {
 	__u8  bDevCapabilityType;
 } __attribute__((packed));
 
+#define USB_DT_USB_PTM_ID_SIZE		3
 /*
  * The size of the descriptor for the Sublink Speed Attribute Count
  * (SSAC) specified in bmAttributes[4:0].
-- 
cgit v1.2.3


From f50caa9b517a2542ae9769fc17ad84110ae07c8b Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 14 Nov 2017 12:40:31 +0100
Subject: debugfs: fix debugfs_real_fops() build error

Some drivers use debugfs_real_fops() even when CONFIG_DEBUG_FS is disabled,
which now leads to a build error:

In file included from include/linux/list.h:9:0,
                 from include/linux/wait.h:7,
                 from include/linux/wait_bit.h:8,
                 from include/linux/fs.h:6,
                 from drivers/net/wireless/broadcom/b43legacy/debugfs.c:26:
drivers/net/wireless/broadcom/b43legacy/debugfs.c: In function 'b43legacy_debugfs_read':
drivers/net/wireless/broadcom/b43legacy/debugfs.c:224:23: error: implicit declaration of function 'debugfs_real_fops'; did you mean 'debugfs_create_bool'? [-Werror=implicit-function-declaration]

My first impulse was to add another 'static inline' dummy function
returning NULL for it, which would work fine. However, most callers
feed the pointer into container_of(), so it seems a little dangerous
here. Since all the callers are inside of a read/write file operation
that gets eliminated in this configuration, so having an 'extern'
declaration seems better here. If it ever gets used in a dangerous
way, that will now result in a link error.

Fixes: 7c8d469877b1 ("debugfs: add support for more elaborate ->d_fsdata")
Cc: Jakub Kicinski <jakub.kicinski@netronome.com>
Cc: Simon Horman <simon.horman@netronome.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/debugfs.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index f36ecc2a5712..3b0ba54cc4d5 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -216,6 +216,8 @@ static inline void debugfs_remove(struct dentry *dentry)
 static inline void debugfs_remove_recursive(struct dentry *dentry)
 { }
 
+const struct file_operations *debugfs_real_fops(const struct file *filp);
+
 static inline int debugfs_file_get(struct dentry *dentry)
 {
 	return 0;
-- 
cgit v1.2.3


From fd00cf81a9a84776ba58e56bd042c726dcf75cf3 Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan@kernel.org>
Date: Fri, 3 Nov 2017 15:30:53 +0100
Subject: serdev: fix receive_buf return value when no callback

The receive_buf callback is supposed to return the number of bytes
processed and should specifically not return a negative errno.

Due to missing sanity checks in the serdev tty-port controller, a driver
not providing a receive_buf callback could cause the flush_to_ldisc()
worker to spin in a tight loop when the tty buffer pointers are
incremented with -EINVAL (-22).

The missing sanity checks have now been added to the tty-port
controller, but let's fix up the serdev-controller helper as well.

Signed-off-by: Johan Hovold <johan@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/serdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/serdev.h b/include/linux/serdev.h
index e69402d4a8ae..d609e6dc5bad 100644
--- a/include/linux/serdev.h
+++ b/include/linux/serdev.h
@@ -184,7 +184,7 @@ static inline int serdev_controller_receive_buf(struct serdev_controller *ctrl,
 	struct serdev_device *serdev = ctrl->serdev;
 
 	if (!serdev || !serdev->ops->receive_buf)
-		return -EINVAL;
+		return 0;
 
 	return serdev->ops->receive_buf(serdev, data, count);
 }
-- 
cgit v1.2.3


From 7fa32e5ec28b1609abc0b797b58267f725fc3964 Mon Sep 17 00:00:00 2001
From: "K. Y. Srinivasan" <kys@microsoft.com>
Date: Tue, 14 Nov 2017 06:53:33 -0700
Subject: Drivers: hv: vmbus: Fix a rescind issue

The current rescind processing code will not correctly handle
the case where the host immediately rescinds a channel that has
been offerred. In this case, we could be blocked in the open call and
since the channel is rescinded, the host will not respond and we could
be blocked forever in the vmbus open call.i Fix this problem.

Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Cc: stable@vger.kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hv/channel.c      | 10 ++++++++--
 drivers/hv/channel_mgmt.c |  7 ++++---
 include/linux/hyperv.h    |  1 +
 3 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index 19f0cf37e0ed..ba0a092ae085 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -659,22 +659,28 @@ void vmbus_close(struct vmbus_channel *channel)
 		 */
 		return;
 	}
-	mutex_lock(&vmbus_connection.channel_mutex);
 	/*
 	 * Close all the sub-channels first and then close the
 	 * primary channel.
 	 */
 	list_for_each_safe(cur, tmp, &channel->sc_list) {
 		cur_channel = list_entry(cur, struct vmbus_channel, sc_list);
-		vmbus_close_internal(cur_channel);
 		if (cur_channel->rescind) {
+			wait_for_completion(&cur_channel->rescind_event);
+			mutex_lock(&vmbus_connection.channel_mutex);
+			vmbus_close_internal(cur_channel);
 			hv_process_channel_removal(
 					   cur_channel->offermsg.child_relid);
+		} else {
+			mutex_lock(&vmbus_connection.channel_mutex);
+			vmbus_close_internal(cur_channel);
 		}
+		mutex_unlock(&vmbus_connection.channel_mutex);
 	}
 	/*
 	 * Now close the primary.
 	 */
+	mutex_lock(&vmbus_connection.channel_mutex);
 	vmbus_close_internal(channel);
 	mutex_unlock(&vmbus_connection.channel_mutex);
 }
diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index ec5454f3f4a6..c21020b69114 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -333,6 +333,7 @@ static struct vmbus_channel *alloc_channel(void)
 		return NULL;
 
 	spin_lock_init(&channel->lock);
+	init_completion(&channel->rescind_event);
 
 	INIT_LIST_HEAD(&channel->sc_list);
 	INIT_LIST_HEAD(&channel->percpu_list);
@@ -898,6 +899,7 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr)
 	/*
 	 * Now wait for offer handling to complete.
 	 */
+	vmbus_rescind_cleanup(channel);
 	while (READ_ONCE(channel->probe_done) == false) {
 		/*
 		 * We wait here until any channel offer is currently
@@ -913,7 +915,6 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr)
 	if (channel->device_obj) {
 		if (channel->chn_rescind_callback) {
 			channel->chn_rescind_callback(channel);
-			vmbus_rescind_cleanup(channel);
 			return;
 		}
 		/*
@@ -922,7 +923,6 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr)
 		 */
 		dev = get_device(&channel->device_obj->device);
 		if (dev) {
-			vmbus_rescind_cleanup(channel);
 			vmbus_device_unregister(channel->device_obj);
 			put_device(dev);
 		}
@@ -936,13 +936,14 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr)
 		 * 2. Then close the primary channel.
 		 */
 		mutex_lock(&vmbus_connection.channel_mutex);
-		vmbus_rescind_cleanup(channel);
 		if (channel->state == CHANNEL_OPEN_STATE) {
 			/*
 			 * The channel is currently not open;
 			 * it is safe for us to cleanup the channel.
 			 */
 			hv_process_channel_removal(rescind->child_relid);
+		} else {
+			complete(&channel->rescind_event);
 		}
 		mutex_unlock(&vmbus_connection.channel_mutex);
 	}
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index f3e97c5f94c9..6c9336626592 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -708,6 +708,7 @@ struct vmbus_channel {
 	u8 monitor_bit;
 
 	bool rescind; /* got rescind msg */
+	struct completion rescind_event;
 
 	u32 ringbuffer_gpadlhandle;
 
-- 
cgit v1.2.3


From af2697a0273f7665429c47d71ab26f2412af924e Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sun, 26 Nov 2017 20:16:07 +0800
Subject: sctp: force the params with right types for sctp csum apis

Now sctp_csum_xxx doesn't really match the param types of these common
csum apis. As sctp_csum_xxx is defined in sctp/checksum.h, many sparse
errors occur when make C=2 not only with M=net/sctp but also with other
modules that include this header file.

This patch is to force them fit in csum apis with the right types.

Fixes: e6d8b64b34aa ("net: sctp: fix and consolidate SCTP checksumming code")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/checksum.h | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/checksum.h b/include/net/sctp/checksum.h
index 4a5b9a306c69..32ee65a30aff 100644
--- a/include/net/sctp/checksum.h
+++ b/include/net/sctp/checksum.h
@@ -48,31 +48,32 @@ static inline __wsum sctp_csum_update(const void *buff, int len, __wsum sum)
 	/* This uses the crypto implementation of crc32c, which is either
 	 * implemented w/ hardware support or resolves to __crc32c_le().
 	 */
-	return crc32c(sum, buff, len);
+	return (__force __wsum)crc32c((__force __u32)sum, buff, len);
 }
 
 static inline __wsum sctp_csum_combine(__wsum csum, __wsum csum2,
 				       int offset, int len)
 {
-	return __crc32c_le_combine(csum, csum2, len);
+	return (__force __wsum)__crc32c_le_combine((__force __u32)csum,
+						   (__force __u32)csum2, len);
 }
 
 static inline __le32 sctp_compute_cksum(const struct sk_buff *skb,
 					unsigned int offset)
 {
 	struct sctphdr *sh = sctp_hdr(skb);
-        __le32 ret, old = sh->checksum;
 	const struct skb_checksum_ops ops = {
 		.update  = sctp_csum_update,
 		.combine = sctp_csum_combine,
 	};
+	__le32 old = sh->checksum;
+	__wsum new;
 
 	sh->checksum = 0;
-	ret = cpu_to_le32(~__skb_checksum(skb, offset, skb->len - offset,
-					  ~(__u32)0, &ops));
+	new = ~__skb_checksum(skb, offset, skb->len - offset, ~(__wsum)0, &ops);
 	sh->checksum = old;
 
-	return ret;
+	return cpu_to_le32((__force __u32)new);
 }
 
 #endif /* __sctp_checksum_h__ */
-- 
cgit v1.2.3


From 1ba896f6f52bfafac6dec4ca583cdd9a073858e8 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sun, 26 Nov 2017 20:16:08 +0800
Subject: sctp: remove extern from stream sched

Now each stream sched ops is defined in different .c file and
added into the global ops in another .c file, it uses extern
to make this work.

However extern is not good coding style to get them in and
even make C=2 reports errors for this.

This patch adds sctp_sched_ops_xxx_init for each stream sched
ops in their .c file, then get them into the global ops by
calling them when initializing sctp module.

Fixes: 637784ade221 ("sctp: introduce priority based stream scheduler")
Fixes: ac1ed8b82cd6 ("sctp: introduce round robin stream scheduler")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sctp.h         |  5 +++++
 include/net/sctp/stream_sched.h |  5 +++++
 net/sctp/protocol.c             |  1 +
 net/sctp/stream_sched.c         | 25 ++++++++++++++++++-------
 net/sctp/stream_sched_prio.c    |  7 ++++++-
 net/sctp/stream_sched_rr.c      |  7 ++++++-
 6 files changed, 41 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 749a42882437..906a9c0efa71 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -194,6 +194,11 @@ void sctp_remaddr_proc_exit(struct net *net);
  */
 int sctp_offload_init(void);
 
+/*
+ * sctp/stream_sched.c
+ */
+void sctp_sched_ops_init(void);
+
 /*
  * sctp/stream.c
  */
diff --git a/include/net/sctp/stream_sched.h b/include/net/sctp/stream_sched.h
index c676550a4c7d..5c5da48f65e7 100644
--- a/include/net/sctp/stream_sched.h
+++ b/include/net/sctp/stream_sched.h
@@ -69,4 +69,9 @@ void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch);
 int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp);
 struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream);
 
+void sctp_sched_ops_register(enum sctp_sched_type sched,
+			     struct sctp_sched_ops *sched_ops);
+void sctp_sched_ops_prio_init(void);
+void sctp_sched_ops_rr_init(void);
+
 #endif /* __sctp_stream_sched_h__ */
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index f5172c21349b..6a38c2503649 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1499,6 +1499,7 @@ static __init int sctp_init(void)
 	INIT_LIST_HEAD(&sctp_address_families);
 	sctp_v4_pf_init();
 	sctp_v6_pf_init();
+	sctp_sched_ops_init();
 
 	status = register_pernet_subsys(&sctp_defaults_ops);
 	if (status)
diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c
index 0b83ec51e43b..d8c162a4089c 100644
--- a/net/sctp/stream_sched.c
+++ b/net/sctp/stream_sched.c
@@ -119,16 +119,27 @@ static struct sctp_sched_ops sctp_sched_fcfs = {
 	.unsched_all = sctp_sched_fcfs_unsched_all,
 };
 
+static void sctp_sched_ops_fcfs_init(void)
+{
+	sctp_sched_ops_register(SCTP_SS_FCFS, &sctp_sched_fcfs);
+}
+
 /* API to other parts of the stack */
 
-extern struct sctp_sched_ops sctp_sched_prio;
-extern struct sctp_sched_ops sctp_sched_rr;
+static struct sctp_sched_ops *sctp_sched_ops[SCTP_SS_MAX + 1];
 
-static struct sctp_sched_ops *sctp_sched_ops[] = {
-	&sctp_sched_fcfs,
-	&sctp_sched_prio,
-	&sctp_sched_rr,
-};
+void sctp_sched_ops_register(enum sctp_sched_type sched,
+			     struct sctp_sched_ops *sched_ops)
+{
+	sctp_sched_ops[sched] = sched_ops;
+}
+
+void sctp_sched_ops_init(void)
+{
+	sctp_sched_ops_fcfs_init();
+	sctp_sched_ops_prio_init();
+	sctp_sched_ops_rr_init();
+}
 
 int sctp_sched_set_sched(struct sctp_association *asoc,
 			 enum sctp_sched_type sched)
diff --git a/net/sctp/stream_sched_prio.c b/net/sctp/stream_sched_prio.c
index 384dbf3c8760..7997d35dd0fd 100644
--- a/net/sctp/stream_sched_prio.c
+++ b/net/sctp/stream_sched_prio.c
@@ -333,7 +333,7 @@ static void sctp_sched_prio_unsched_all(struct sctp_stream *stream)
 			sctp_sched_prio_unsched(soute);
 }
 
-struct sctp_sched_ops sctp_sched_prio = {
+static struct sctp_sched_ops sctp_sched_prio = {
 	.set = sctp_sched_prio_set,
 	.get = sctp_sched_prio_get,
 	.init = sctp_sched_prio_init,
@@ -345,3 +345,8 @@ struct sctp_sched_ops sctp_sched_prio = {
 	.sched_all = sctp_sched_prio_sched_all,
 	.unsched_all = sctp_sched_prio_unsched_all,
 };
+
+void sctp_sched_ops_prio_init(void)
+{
+	sctp_sched_ops_register(SCTP_SS_PRIO, &sctp_sched_prio);
+}
diff --git a/net/sctp/stream_sched_rr.c b/net/sctp/stream_sched_rr.c
index 7612a438c5b9..1155692448f1 100644
--- a/net/sctp/stream_sched_rr.c
+++ b/net/sctp/stream_sched_rr.c
@@ -187,7 +187,7 @@ static void sctp_sched_rr_unsched_all(struct sctp_stream *stream)
 		sctp_sched_rr_unsched(stream, soute);
 }
 
-struct sctp_sched_ops sctp_sched_rr = {
+static struct sctp_sched_ops sctp_sched_rr = {
 	.set = sctp_sched_rr_set,
 	.get = sctp_sched_rr_get,
 	.init = sctp_sched_rr_init,
@@ -199,3 +199,8 @@ struct sctp_sched_ops sctp_sched_rr = {
 	.sched_all = sctp_sched_rr_sched_all,
 	.unsched_all = sctp_sched_rr_unsched_all,
 };
+
+void sctp_sched_ops_rr_init(void)
+{
+	sctp_sched_ops_register(SCTP_SS_RR, &sctp_sched_rr);
+}
-- 
cgit v1.2.3


From af3ff8045bbf3e32f1a448542e73abb4c8ceb6f1 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 28 Nov 2017 18:01:38 -0800
Subject: crypto: hmac - require that the underlying hash algorithm is unkeyed

Because the HMAC template didn't check that its underlying hash
algorithm is unkeyed, trying to use "hmac(hmac(sha3-512-generic))"
through AF_ALG or through KEYCTL_DH_COMPUTE resulted in the inner HMAC
being used without having been keyed, resulting in sha3_update() being
called without sha3_init(), causing a stack buffer overflow.

This is a very old bug, but it seems to have only started causing real
problems when SHA-3 support was added (requires CONFIG_CRYPTO_SHA3)
because the innermost hash's state is ->import()ed from a zeroed buffer,
and it just so happens that other hash algorithms are fine with that,
but SHA-3 is not.  However, there could be arch or hardware-dependent
hash algorithms also affected; I couldn't test everything.

Fix the bug by introducing a function crypto_shash_alg_has_setkey()
which tests whether a shash algorithm is keyed.  Then update the HMAC
template to require that its underlying hash algorithm is unkeyed.

Here is a reproducer:

    #include <linux/if_alg.h>
    #include <sys/socket.h>

    int main()
    {
        int algfd;
        struct sockaddr_alg addr = {
            .salg_type = "hash",
            .salg_name = "hmac(hmac(sha3-512-generic))",
        };
        char key[4096] = { 0 };

        algfd = socket(AF_ALG, SOCK_SEQPACKET, 0);
        bind(algfd, (const struct sockaddr *)&addr, sizeof(addr));
        setsockopt(algfd, SOL_ALG, ALG_SET_KEY, key, sizeof(key));
    }

Here was the KASAN report from syzbot:

    BUG: KASAN: stack-out-of-bounds in memcpy include/linux/string.h:341  [inline]
    BUG: KASAN: stack-out-of-bounds in sha3_update+0xdf/0x2e0  crypto/sha3_generic.c:161
    Write of size 4096 at addr ffff8801cca07c40 by task syzkaller076574/3044

    CPU: 1 PID: 3044 Comm: syzkaller076574 Not tainted 4.14.0-mm1+ #25
    Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  Google 01/01/2011
    Call Trace:
      __dump_stack lib/dump_stack.c:17 [inline]
      dump_stack+0x194/0x257 lib/dump_stack.c:53
      print_address_description+0x73/0x250 mm/kasan/report.c:252
      kasan_report_error mm/kasan/report.c:351 [inline]
      kasan_report+0x25b/0x340 mm/kasan/report.c:409
      check_memory_region_inline mm/kasan/kasan.c:260 [inline]
      check_memory_region+0x137/0x190 mm/kasan/kasan.c:267
      memcpy+0x37/0x50 mm/kasan/kasan.c:303
      memcpy include/linux/string.h:341 [inline]
      sha3_update+0xdf/0x2e0 crypto/sha3_generic.c:161
      crypto_shash_update+0xcb/0x220 crypto/shash.c:109
      shash_finup_unaligned+0x2a/0x60 crypto/shash.c:151
      crypto_shash_finup+0xc4/0x120 crypto/shash.c:165
      hmac_finup+0x182/0x330 crypto/hmac.c:152
      crypto_shash_finup+0xc4/0x120 crypto/shash.c:165
      shash_digest_unaligned+0x9e/0xd0 crypto/shash.c:172
      crypto_shash_digest+0xc4/0x120 crypto/shash.c:186
      hmac_setkey+0x36a/0x690 crypto/hmac.c:66
      crypto_shash_setkey+0xad/0x190 crypto/shash.c:64
      shash_async_setkey+0x47/0x60 crypto/shash.c:207
      crypto_ahash_setkey+0xaf/0x180 crypto/ahash.c:200
      hash_setkey+0x40/0x90 crypto/algif_hash.c:446
      alg_setkey crypto/af_alg.c:221 [inline]
      alg_setsockopt+0x2a1/0x350 crypto/af_alg.c:254
      SYSC_setsockopt net/socket.c:1851 [inline]
      SyS_setsockopt+0x189/0x360 net/socket.c:1830
      entry_SYSCALL_64_fastpath+0x1f/0x96

Reported-by: syzbot <syzkaller@googlegroups.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/hmac.c                  | 6 +++++-
 crypto/shash.c                 | 5 +++--
 include/crypto/internal/hash.h | 8 ++++++++
 3 files changed, 16 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/crypto/hmac.c b/crypto/hmac.c
index 92871dc2a63e..e74730224f0a 100644
--- a/crypto/hmac.c
+++ b/crypto/hmac.c
@@ -195,11 +195,15 @@ static int hmac_create(struct crypto_template *tmpl, struct rtattr **tb)
 	salg = shash_attr_alg(tb[1], 0, 0);
 	if (IS_ERR(salg))
 		return PTR_ERR(salg);
+	alg = &salg->base;
 
+	/* The underlying hash algorithm must be unkeyed */
 	err = -EINVAL;
+	if (crypto_shash_alg_has_setkey(salg))
+		goto out_put_alg;
+
 	ds = salg->digestsize;
 	ss = salg->statesize;
-	alg = &salg->base;
 	if (ds > alg->cra_blocksize ||
 	    ss < alg->cra_blocksize)
 		goto out_put_alg;
diff --git a/crypto/shash.c b/crypto/shash.c
index 325a14da5827..e849d3ee2e27 100644
--- a/crypto/shash.c
+++ b/crypto/shash.c
@@ -25,11 +25,12 @@
 
 static const struct crypto_type crypto_shash_type;
 
-static int shash_no_setkey(struct crypto_shash *tfm, const u8 *key,
-			   unsigned int keylen)
+int shash_no_setkey(struct crypto_shash *tfm, const u8 *key,
+		    unsigned int keylen)
 {
 	return -ENOSYS;
 }
+EXPORT_SYMBOL_GPL(shash_no_setkey);
 
 static int shash_setkey_unaligned(struct crypto_shash *tfm, const u8 *key,
 				  unsigned int keylen)
diff --git a/include/crypto/internal/hash.h b/include/crypto/internal/hash.h
index f0b44c16e88f..c2bae8da642c 100644
--- a/include/crypto/internal/hash.h
+++ b/include/crypto/internal/hash.h
@@ -82,6 +82,14 @@ int ahash_register_instance(struct crypto_template *tmpl,
 			    struct ahash_instance *inst);
 void ahash_free_instance(struct crypto_instance *inst);
 
+int shash_no_setkey(struct crypto_shash *tfm, const u8 *key,
+		    unsigned int keylen);
+
+static inline bool crypto_shash_alg_has_setkey(struct shash_alg *alg)
+{
+	return alg->setkey != shash_no_setkey;
+}
+
 int crypto_init_ahash_spawn(struct crypto_ahash_spawn *spawn,
 			    struct hash_alg_common *alg,
 			    struct crypto_instance *inst);
-- 
cgit v1.2.3


From ec6449a9c2296b1c04f6219f7473e0c2fedecfed Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Mon, 20 Nov 2017 12:10:15 +0100
Subject: KVM: arm/arm64: Don't enable/disable physical timer access on VHE

After the timer optimization rework we accidentally end up calling
physical timer enable/disable functions on VHE systems, which is neither
needed nor correct, since the CNTHCTL_EL2 register format is
different when HCR_EL2.E2H is set.

The CNTHCTL_EL2 is initialized when CPUs become online in
kvm_timer_init_vhe() and we don't have to call these functions on VHE
systems, which also allows us to inline the non-VHE functionality.

Reported-by: Jintack Lim <jintack@cs.columbia.edu>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 include/kvm/arm_arch_timer.h |  3 ---
 virt/kvm/arm/arch_timer.c    |  6 ------
 virt/kvm/arm/hyp/timer-sr.c  | 48 ++++++++++++++++++--------------------------
 3 files changed, 20 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h
index 01ee473517e2..6e45608b2399 100644
--- a/include/kvm/arm_arch_timer.h
+++ b/include/kvm/arm_arch_timer.h
@@ -93,7 +93,4 @@ void kvm_timer_init_vhe(void);
 #define vcpu_vtimer(v)	(&(v)->arch.timer_cpu.vtimer)
 #define vcpu_ptimer(v)	(&(v)->arch.timer_cpu.ptimer)
 
-void enable_el1_phys_timer_access(void);
-void disable_el1_phys_timer_access(void);
-
 #endif
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index 4151250ce8da..190c99ed1b73 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -479,9 +479,6 @@ void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu)
 
 	vtimer_restore_state(vcpu);
 
-	if (has_vhe())
-		disable_el1_phys_timer_access();
-
 	/* Set the background timer for the physical timer emulation. */
 	phys_timer_emulate(vcpu);
 }
@@ -510,9 +507,6 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
 	if (unlikely(!timer->enabled))
 		return;
 
-	if (has_vhe())
-		enable_el1_phys_timer_access();
-
 	vtimer_save_state(vcpu);
 
 	/*
diff --git a/virt/kvm/arm/hyp/timer-sr.c b/virt/kvm/arm/hyp/timer-sr.c
index f39861639f08..f24404b3c8df 100644
--- a/virt/kvm/arm/hyp/timer-sr.c
+++ b/virt/kvm/arm/hyp/timer-sr.c
@@ -27,42 +27,34 @@ void __hyp_text __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high)
 	write_sysreg(cntvoff, cntvoff_el2);
 }
 
-void __hyp_text enable_el1_phys_timer_access(void)
-{
-	u64 val;
-
-	/* Allow physical timer/counter access for the host */
-	val = read_sysreg(cnthctl_el2);
-	val |= CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN;
-	write_sysreg(val, cnthctl_el2);
-}
-
-void __hyp_text disable_el1_phys_timer_access(void)
-{
-	u64 val;
-
-	/*
-	 * Disallow physical timer access for the guest
-	 * Physical counter access is allowed
-	 */
-	val = read_sysreg(cnthctl_el2);
-	val &= ~CNTHCTL_EL1PCEN;
-	val |= CNTHCTL_EL1PCTEN;
-	write_sysreg(val, cnthctl_el2);
-}
-
 void __hyp_text __timer_disable_traps(struct kvm_vcpu *vcpu)
 {
 	/*
 	 * We don't need to do this for VHE since the host kernel runs in EL2
 	 * with HCR_EL2.TGE ==1, which makes those bits have no impact.
 	 */
-	if (!has_vhe())
-		enable_el1_phys_timer_access();
+	if (!has_vhe()) {
+		u64 val;
+
+		/* Allow physical timer/counter access for the host */
+		val = read_sysreg(cnthctl_el2);
+		val |= CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN;
+		write_sysreg(val, cnthctl_el2);
+	}
 }
 
 void __hyp_text __timer_enable_traps(struct kvm_vcpu *vcpu)
 {
-	if (!has_vhe())
-		disable_el1_phys_timer_access();
+	if (!has_vhe()) {
+		u64 val;
+
+		/*
+		 * Disallow physical timer access for the guest
+		 * Physical counter access is allowed
+		 */
+		val = read_sysreg(cnthctl_el2);
+		val &= ~CNTHCTL_EL1PCEN;
+		val |= CNTHCTL_EL1PCTEN;
+		write_sysreg(val, cnthctl_el2);
+	}
 }
-- 
cgit v1.2.3


From 668533dc0764b30c9dd2baf3ca800156f688326b Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 29 Nov 2017 10:30:13 -0800
Subject: kallsyms: take advantage of the new '%px' format

The conditional kallsym hex printing used a special fixed-width '%lx'
output (KALLSYM_FMT) in preparation for the hashing of %p, but that
series ended up adding a %px specifier to help with the conversions.

Use it, and avoid the "print pointer as an unsigned long" code.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kallsyms.h | 6 ------
 kernel/kallsyms.c        | 8 ++++----
 kernel/module.c          | 6 +++---
 3 files changed, 7 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h
index 708f337d780b..bd118a6c60cb 100644
--- a/include/linux/kallsyms.h
+++ b/include/linux/kallsyms.h
@@ -14,12 +14,6 @@
 #define KSYM_SYMBOL_LEN (sizeof("%s+%#lx/%#lx [%s]") + (KSYM_NAME_LEN - 1) + \
 			 2*(BITS_PER_LONG*3/10) + (MODULE_NAME_LEN - 1) + 1)
 
-#ifndef CONFIG_64BIT
-# define KALLSYM_FMT "%08lx"
-#else
-# define KALLSYM_FMT "%016lx"
-#endif
-
 struct module;
 
 #ifdef CONFIG_KALLSYMS
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 531ffa984bc2..d5fa4116688a 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -614,14 +614,14 @@ static void s_stop(struct seq_file *m, void *p)
 
 static int s_show(struct seq_file *m, void *p)
 {
-	unsigned long value;
+	void *value;
 	struct kallsym_iter *iter = m->private;
 
 	/* Some debugging symbols have no name.  Ignore them. */
 	if (!iter->name[0])
 		return 0;
 
-	value = iter->show_value ? iter->value : 0;
+	value = iter->show_value ? (void *)iter->value : NULL;
 
 	if (iter->module_name[0]) {
 		char type;
@@ -632,10 +632,10 @@ static int s_show(struct seq_file *m, void *p)
 		 */
 		type = iter->exported ? toupper(iter->type) :
 					tolower(iter->type);
-		seq_printf(m, KALLSYM_FMT " %c %s\t[%s]\n", value,
+		seq_printf(m, "%px %c %s\t[%s]\n", value,
 			   type, iter->name, iter->module_name);
 	} else
-		seq_printf(m, KALLSYM_FMT " %c %s\n", value,
+		seq_printf(m, "%px %c %s\n", value,
 			   iter->type, iter->name);
 	return 0;
 }
diff --git a/kernel/module.c b/kernel/module.c
index f0411a271765..dea01ac9cb74 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -4157,7 +4157,7 @@ static int m_show(struct seq_file *m, void *p)
 {
 	struct module *mod = list_entry(p, struct module, list);
 	char buf[MODULE_FLAGS_BUF_SIZE];
-	unsigned long value;
+	void *value;
 
 	/* We always ignore unformed modules. */
 	if (mod->state == MODULE_STATE_UNFORMED)
@@ -4173,8 +4173,8 @@ static int m_show(struct seq_file *m, void *p)
 		   mod->state == MODULE_STATE_COMING ? "Loading" :
 		   "Live");
 	/* Used by oprofile and other similar tools. */
-	value = m->private ? 0 : (unsigned long)mod->core_layout.base;
-	seq_printf(m, " 0x" KALLSYM_FMT, value);
+	value = m->private ? NULL : mod->core_layout.base;
+	seq_printf(m, " 0x%px", value);
 
 	/* Taints info */
 	if (mod->taints)
-- 
cgit v1.2.3


From 1569d651f152870663fabd8f1c80af353f967ad5 Mon Sep 17 00:00:00 2001
From: Christian König <christian.koenig@amd.com>
Date: Mon, 27 Nov 2017 13:12:35 +0100
Subject: drm/ttm: fix populate_and_map() functions once more
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts "drm/ttm: Fix configuration error around populate_and_map()
functions".

This fix has gone into the wrong direction. Those helpers should be
available even when neither CONFIG_INTEL_IOMMU nor CONFIG_SWIOTLB are
set.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/ttm/ttm_page_alloc.c |  2 --
 include/drm/ttm/ttm_page_alloc.h     | 32 ++++++++++----------------------
 2 files changed, 10 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc.c b/drivers/gpu/drm/ttm/ttm_page_alloc.c
index 316f831ad5f0..6d48ccca0b38 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc.c
@@ -1058,7 +1058,6 @@ void ttm_pool_unpopulate(struct ttm_tt *ttm)
 }
 EXPORT_SYMBOL(ttm_pool_unpopulate);
 
-#if defined(CONFIG_SWIOTLB) || defined(CONFIG_INTEL_IOMMU)
 int ttm_populate_and_map_pages(struct device *dev, struct ttm_dma_tt *tt)
 {
 	unsigned i, j;
@@ -1129,7 +1128,6 @@ void ttm_unmap_and_unpopulate_pages(struct device *dev, struct ttm_dma_tt *tt)
 	ttm_pool_unpopulate(&tt->ttm);
 }
 EXPORT_SYMBOL(ttm_unmap_and_unpopulate_pages);
-#endif
 
 int ttm_page_alloc_debugfs(struct seq_file *m, void *data)
 {
diff --git a/include/drm/ttm/ttm_page_alloc.h b/include/drm/ttm/ttm_page_alloc.h
index 38a2b4770c35..593811362a91 100644
--- a/include/drm/ttm/ttm_page_alloc.h
+++ b/include/drm/ttm/ttm_page_alloc.h
@@ -58,12 +58,21 @@ int ttm_pool_populate(struct ttm_tt *ttm);
  */
 void ttm_pool_unpopulate(struct ttm_tt *ttm);
 
+/**
+ * Populates and DMA maps pages to fullfil a ttm_dma_populate() request
+ */
+int ttm_populate_and_map_pages(struct device *dev, struct ttm_dma_tt *tt);
+
+/**
+ * Unpopulates and DMA unmaps pages as part of a
+ * ttm_dma_unpopulate() request */
+void ttm_unmap_and_unpopulate_pages(struct device *dev, struct ttm_dma_tt *tt);
+
 /**
  * Output the state of pools to debugfs file
  */
 int ttm_page_alloc_debugfs(struct seq_file *m, void *data);
 
-
 #if defined(CONFIG_SWIOTLB) || defined(CONFIG_INTEL_IOMMU)
 /**
  * Initialize pool allocator.
@@ -83,17 +92,6 @@ int ttm_dma_page_alloc_debugfs(struct seq_file *m, void *data);
 int ttm_dma_populate(struct ttm_dma_tt *ttm_dma, struct device *dev);
 void ttm_dma_unpopulate(struct ttm_dma_tt *ttm_dma, struct device *dev);
 
-
-/**
- * Populates and DMA maps pages to fullfil a ttm_dma_populate() request
- */
-int ttm_populate_and_map_pages(struct device *dev, struct ttm_dma_tt *tt);
-
-/**
- * Unpopulates and DMA unmaps pages as part of a
- * ttm_dma_unpopulate() request */
-void ttm_unmap_and_unpopulate_pages(struct device *dev, struct ttm_dma_tt *tt);
-
 #else
 static inline int ttm_dma_page_alloc_init(struct ttm_mem_global *glob,
 					  unsigned max_pages)
@@ -116,16 +114,6 @@ static inline void ttm_dma_unpopulate(struct ttm_dma_tt *ttm_dma,
 				      struct device *dev)
 {
 }
-
-static inline int ttm_populate_and_map_pages(struct device *dev, struct ttm_dma_tt *tt)
-{
-	return -ENOMEM;
-}
-
-static inline void ttm_unmap_and_unpopulate_pages(struct device *dev, struct ttm_dma_tt *tt)
-{
-}
-
 #endif
 
 #endif
-- 
cgit v1.2.3


From 23721a755f98ac846897a013c92cccb281c1bcc8 Mon Sep 17 00:00:00 2001
From: Xie XiuQi <xiexiuqi@huawei.com>
Date: Thu, 30 Nov 2017 09:41:29 +0800
Subject: trace/xdp: fix compile warning: 'struct bpf_map' declared inside
 parameter list
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We meet this compile warning, which caused by missing bpf.h in xdp.h.

In file included from ./include/trace/events/xdp.h:10:0,
                 from ./include/linux/bpf_trace.h:6,
                 from drivers/net/ethernet/intel/i40e/i40e_txrx.c:29:
./include/trace/events/xdp.h:93:17: warning: ‘struct bpf_map’ declared inside parameter list will not be visible outside of this definition or declaration
    const struct bpf_map *map, u32 map_index),
                 ^
./include/linux/tracepoint.h:187:34: note: in definition of macro ‘__DECLARE_TRACE’
  static inline void trace_##name(proto)    \
                                  ^~~~~
./include/linux/tracepoint.h:352:24: note: in expansion of macro ‘PARAMS’
  __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args),  \
                        ^~~~~~
./include/linux/tracepoint.h:477:2: note: in expansion of macro ‘DECLARE_TRACE’
  DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
  ^~~~~~~~~~~~~
./include/linux/tracepoint.h:477:22: note: in expansion of macro ‘PARAMS’
  DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
                      ^~~~~~
./include/trace/events/xdp.h:89:1: note: in expansion of macro ‘DEFINE_EVENT’
 DEFINE_EVENT(xdp_redirect_template, xdp_redirect,
 ^~~~~~~~~~~~
./include/trace/events/xdp.h:90:2: note: in expansion of macro ‘TP_PROTO’
  TP_PROTO(const struct net_device *dev,
  ^~~~~~~~
./include/trace/events/xdp.h:93:17: warning: ‘struct bpf_map’ declared inside parameter list will not be visible outside of this definition or declaration
    const struct bpf_map *map, u32 map_index),
                 ^
./include/linux/tracepoint.h:203:38: note: in definition of macro ‘__DECLARE_TRACE’
  register_trace_##name(void (*probe)(data_proto), void *data) \
                                      ^~~~~~~~~~
./include/linux/tracepoint.h:354:4: note: in expansion of macro ‘PARAMS’
    PARAMS(void *__data, proto),   \
    ^~~~~~

Reported-by: Huang Daode <huangdaode@hisilicon.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Fixes: 8d3b778ff544 ("xdp: tracepoint xdp_redirect also need a map argument")
Signed-off-by: Xie XiuQi <xiexiuqi@huawei.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/trace/events/xdp.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index 4cd0f05d0113..8989a92c571a 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -8,6 +8,7 @@
 #include <linux/netdevice.h>
 #include <linux/filter.h>
 #include <linux/tracepoint.h>
+#include <linux/bpf.h>
 
 #define __XDP_ACT_MAP(FN)	\
 	FN(ABORTED)		\
-- 
cgit v1.2.3


From 1501899a898dfb5477c55534bdfd734c046da06d Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 29 Nov 2017 16:10:06 -0800
Subject: mm: fix device-dax pud write-faults triggered by get_user_pages()

Currently only get_user_pages_fast() can safely handle the writable gup
case due to its use of pud_access_permitted() to check whether the pud
entry is writable.  In the gup slow path pud_write() is used instead of
pud_access_permitted() and to date it has been unimplemented, just calls
BUG_ON().

    kernel BUG at ./include/linux/hugetlb.h:244!
    [..]
    RIP: 0010:follow_devmap_pud+0x482/0x490
    [..]
    Call Trace:
     follow_page_mask+0x28c/0x6e0
     __get_user_pages+0xe4/0x6c0
     get_user_pages_unlocked+0x130/0x1b0
     get_user_pages_fast+0x89/0xb0
     iov_iter_get_pages_alloc+0x114/0x4a0
     nfs_direct_read_schedule_iovec+0xd2/0x350
     ? nfs_start_io_direct+0x63/0x70
     nfs_file_direct_read+0x1e0/0x250
     nfs_file_read+0x90/0xc0

For now this just implements a simple check for the _PAGE_RW bit similar
to pmd_write.  However, this implies that the gup-slow-path check is
missing the extra checks that the gup-fast-path performs with
pud_access_permitted.  Later patches will align all checks to use the
'access_permitted' helper if the architecture provides it.

Note that the generic 'access_permitted' helper fallback is the simple
_PAGE_RW check on architectures that do not define the
'access_permitted' helper(s).

[dan.j.williams@intel.com: fix powerpc compile error]
  Link: http://lkml.kernel.org/r/151129126165.37405.16031785266675461397.stgit@dwillia2-desk3.amr.corp.intel.com
Link: http://lkml.kernel.org/r/151043109938.2842.14834662818213616199.stgit@dwillia2-desk3.amr.corp.intel.com
Fixes: a00cc7d9dd93 ("mm, x86: add support for PUD-sized transparent hugepages")
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Acked-by: Thomas Gleixner <tglx@linutronix.de>	[x86]
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/pgtable.h | 6 ++++++
 include/asm-generic/pgtable.h  | 8 ++++++++
 include/linux/hugetlb.h        | 8 --------
 3 files changed, 14 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 09f9e1e00e3b..dcce76ee4aa7 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1088,6 +1088,12 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
 	clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);
 }
 
+#define pud_write pud_write
+static inline int pud_write(pud_t pud)
+{
+	return pud_flags(pud) & _PAGE_RW;
+}
+
 /*
  * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
  *
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 757dc6ffc7ba..1ac457511f4e 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -814,6 +814,14 @@ static inline int pmd_write(pmd_t pmd)
 #endif /* __HAVE_ARCH_PMD_WRITE */
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
+#ifndef pud_write
+static inline int pud_write(pud_t pud)
+{
+	BUG();
+	return 0;
+}
+#endif /* pud_write */
+
 #if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \
 	(defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
 	 !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD))
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index fbf5b31d47ee..82a25880714a 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -239,14 +239,6 @@ static inline int pgd_write(pgd_t pgd)
 }
 #endif
 
-#ifndef pud_write
-static inline int pud_write(pud_t pud)
-{
-	BUG();
-	return 0;
-}
-#endif
-
 #define HUGETLB_ANON_FILE "anon_hugepage"
 
 enum {
-- 
cgit v1.2.3


From e4e40e0263ea6a3bfefbfd15d1b6ff5c03f2b95e Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 29 Nov 2017 16:10:10 -0800
Subject: mm: switch to 'define pmd_write' instead of __HAVE_ARCH_PMD_WRITE

In response to compile breakage introduced by a series that added the
pud_write helper to x86, Stephen notes:

    did you consider using the other paradigm:

    In arch include files:
    #define pud_write       pud_write
    static inline int pud_write(pud_t pud)
     .....

    Then in include/asm-generic/pgtable.h:

    #ifndef pud_write
    tatic inline int pud_write(pud_t pud)
    {
            ....
    }
    #endif

    If you had, then the powerpc code would have worked ... ;-) and many
    of the other interfaces in include/asm-generic/pgtable.h are
    protected that way ...

Given that some architecture already define pmd_write() as a macro, it's
a net reduction to drop the definition of __HAVE_ARCH_PMD_WRITE.

Link: http://lkml.kernel.org/r/151129126721.37405.13339850900081557813.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Suggested-by: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/include/asm/pgtable-3level.h        | 1 -
 arch/arm64/include/asm/pgtable.h             | 1 -
 arch/mips/include/asm/pgtable.h              | 2 +-
 arch/powerpc/include/asm/book3s/64/pgtable.h | 1 -
 arch/s390/include/asm/pgtable.h              | 2 +-
 arch/sparc/include/asm/pgtable_64.h          | 2 +-
 arch/tile/include/asm/pgtable.h              | 1 -
 arch/x86/include/asm/pgtable.h               | 2 +-
 include/asm-generic/pgtable.h                | 4 ++--
 9 files changed, 6 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index 2a029bceaf2f..1a7a17b2a1ba 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -221,7 +221,6 @@ static inline pte_t pte_mkspecial(pte_t pte)
 }
 #define	__HAVE_ARCH_PTE_SPECIAL
 
-#define __HAVE_ARCH_PMD_WRITE
 #define pmd_write(pmd)		(pmd_isclear((pmd), L_PMD_SECT_RDONLY))
 #define pmd_dirty(pmd)		(pmd_isset((pmd), L_PMD_SECT_DIRTY))
 #define pud_page(pud)		pmd_page(__pmd(pud_val(pud)))
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index c9530b5b5ca8..149d05fb9421 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -345,7 +345,6 @@ static inline int pmd_protnone(pmd_t pmd)
 
 #define pmd_thp_or_huge(pmd)	(pmd_huge(pmd) || pmd_trans_huge(pmd))
 
-#define __HAVE_ARCH_PMD_WRITE
 #define pmd_write(pmd)		pte_write(pmd_pte(pmd))
 
 #define pmd_mkhuge(pmd)		(__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT))
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index 9e9e94415d08..1a508a74d48d 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -552,7 +552,7 @@ static inline pmd_t pmd_mkhuge(pmd_t pmd)
 extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
 		       pmd_t *pmdp, pmd_t pmd);
 
-#define __HAVE_ARCH_PMD_WRITE
+#define pmd_write pmd_write
 static inline int pmd_write(pmd_t pmd)
 {
 	return !!(pmd_val(pmd) & _PAGE_WRITE);
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 9a677cd5997f..44697817ccc6 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1005,7 +1005,6 @@ static inline int pmd_protnone(pmd_t pmd)
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
-#define __HAVE_ARCH_PMD_WRITE
 #define pmd_write(pmd)		pte_write(pmd_pte(pmd))
 #define __pmd_write(pmd)	__pte_write(pmd_pte(pmd))
 #define pmd_savedwrite(pmd)	pte_savedwrite(pmd_pte(pmd))
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index d7fe9838084d..0a6b0286c32e 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -709,7 +709,7 @@ static inline unsigned long pmd_pfn(pmd_t pmd)
 	return (pmd_val(pmd) & origin_mask) >> PAGE_SHIFT;
 }
 
-#define __HAVE_ARCH_PMD_WRITE
+#define pmd_write pmd_write
 static inline int pmd_write(pmd_t pmd)
 {
 	return (pmd_val(pmd) & _SEGMENT_ENTRY_WRITE) != 0;
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 5a9e96be1665..9937c5ff94a9 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -715,7 +715,7 @@ static inline unsigned long pmd_pfn(pmd_t pmd)
 	return pte_pfn(pte);
 }
 
-#define __HAVE_ARCH_PMD_WRITE
+#define pmd_write pmd_write
 static inline unsigned long pmd_write(pmd_t pmd)
 {
 	pte_t pte = __pte(pmd_val(pmd));
diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h
index 2a26cc4fefc2..adfa21b18488 100644
--- a/arch/tile/include/asm/pgtable.h
+++ b/arch/tile/include/asm/pgtable.h
@@ -475,7 +475,6 @@ static inline void pmd_clear(pmd_t *pmdp)
 #define pmd_mkdirty(pmd)	pte_pmd(pte_mkdirty(pmd_pte(pmd)))
 #define pmd_huge_page(pmd)	pte_huge(pmd_pte(pmd))
 #define pmd_mkhuge(pmd)		pte_pmd(pte_mkhuge(pmd_pte(pmd)))
-#define __HAVE_ARCH_PMD_WRITE
 
 #define pfn_pmd(pfn, pgprot)	pte_pmd(pfn_pte((pfn), (pgprot)))
 #define pmd_pfn(pmd)		pte_pfn(pmd_pte(pmd))
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index dcce76ee4aa7..95e2dfd75521 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1061,7 +1061,7 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
 				  unsigned long address, pmd_t *pmdp);
 
 
-#define __HAVE_ARCH_PMD_WRITE
+#define pmd_write pmd_write
 static inline int pmd_write(pmd_t pmd)
 {
 	return pmd_flags(pmd) & _PAGE_RW;
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 1ac457511f4e..b234d54f2cb6 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -805,13 +805,13 @@ static inline int pmd_trans_huge(pmd_t pmd)
 {
 	return 0;
 }
-#ifndef __HAVE_ARCH_PMD_WRITE
+#ifndef pmd_write
 static inline int pmd_write(pmd_t pmd)
 {
 	BUG();
 	return 0;
 }
-#endif /* __HAVE_ARCH_PMD_WRITE */
+#endif /* pmd_write */
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #ifndef pud_write
-- 
cgit v1.2.3


From 31383c6865a578834dd953d9dbc88e6b19fe3997 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 29 Nov 2017 16:10:28 -0800
Subject: mm, hugetlbfs: introduce ->split() to vm_operations_struct

Patch series "device-dax: fix unaligned munmap handling"

When device-dax is operating in huge-page mode we want it to behave like
hugetlbfs and fail attempts to split vmas into unaligned ranges.  It
would be messy to teach the munmap path about device-dax alignment
constraints in the same (hstate) way that hugetlbfs communicates this
constraint.  Instead, these patches introduce a new ->split() vm
operation.

This patch (of 2):

The device-dax interface has similar constraints as hugetlbfs in that it
requires the munmap path to unmap in huge page aligned units.  Rather
than add more custom vma handling code in __split_vma() introduce a new
vm operation to perform this vma specific check.

Link: http://lkml.kernel.org/r/151130418135.4029.6783191281930729710.stgit@dwillia2-desk3.amr.corp.intel.com
Fixes: dee410792419 ("/dev/dax, core: file operations and dax-mmap")
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 1 +
 mm/hugetlb.c       | 8 ++++++++
 mm/mmap.c          | 8 +++++---
 3 files changed, 14 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index ee073146aaa7..b3b6a7e313e9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -377,6 +377,7 @@ enum page_entry_size {
 struct vm_operations_struct {
 	void (*open)(struct vm_area_struct * area);
 	void (*close)(struct vm_area_struct * area);
+	int (*split)(struct vm_area_struct * area, unsigned long addr);
 	int (*mremap)(struct vm_area_struct * area);
 	int (*fault)(struct vm_fault *vmf);
 	int (*huge_fault)(struct vm_fault *vmf, enum page_entry_size pe_size);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 681b300185c0..698e8fb34031 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3125,6 +3125,13 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 	}
 }
 
+static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
+{
+	if (addr & ~(huge_page_mask(hstate_vma(vma))))
+		return -EINVAL;
+	return 0;
+}
+
 /*
  * We cannot handle pagefaults against hugetlb pages at all.  They cause
  * handle_mm_fault() to try to instantiate regular-sized pages in the
@@ -3141,6 +3148,7 @@ const struct vm_operations_struct hugetlb_vm_ops = {
 	.fault = hugetlb_vm_op_fault,
 	.open = hugetlb_vm_op_open,
 	.close = hugetlb_vm_op_close,
+	.split = hugetlb_vm_op_split,
 };
 
 static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
diff --git a/mm/mmap.c b/mm/mmap.c
index 924839fac0e6..a4d546821214 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2555,9 +2555,11 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct vm_area_struct *new;
 	int err;
 
-	if (is_vm_hugetlb_page(vma) && (addr &
-					~(huge_page_mask(hstate_vma(vma)))))
-		return -EINVAL;
+	if (vma->vm_ops && vma->vm_ops->split) {
+		err = vma->vm_ops->split(vma, addr);
+		if (err)
+			return err;
+	}
 
 	new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 	if (!new)
-- 
cgit v1.2.3


From 2bb6d2837083de722bfdc369cb0d76ce188dd9b4 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 29 Nov 2017 16:10:35 -0800
Subject: mm: introduce get_user_pages_longterm

Patch series "introduce get_user_pages_longterm()", v2.

Here is a new get_user_pages api for cases where a driver intends to
keep an elevated page count indefinitely.  This is distinct from usages
like iov_iter_get_pages where the elevated page counts are transient.
The iov_iter_get_pages cases immediately turn around and submit the
pages to a device driver which will put_page when the i/o operation
completes (under kernel control).

In the longterm case userspace is responsible for dropping the page
reference at some undefined point in the future.  This is untenable for
filesystem-dax case where the filesystem is in control of the lifetime
of the block / page and needs reasonable limits on how long it can wait
for pages in a mapping to become idle.

Fixing filesystems to actually wait for dax pages to be idle before
blocks from a truncate/hole-punch operation are repurposed is saved for
a later patch series.

Also, allowing longterm registration of dax mappings is a future patch
series that introduces a "map with lease" semantic where the kernel can
revoke a lease and force userspace to drop its page references.

I have also tagged these for -stable to purposely break cases that might
assume that longterm memory registrations for filesystem-dax mappings
were supported by the kernel.  The behavior regression this policy
change implies is one of the reasons we maintain the "dax enabled.
Warning: EXPERIMENTAL, use at your own risk" notification when mounting
a filesystem in dax mode.

It is worth noting the device-dax interface does not suffer the same
constraints since it does not support file space management operations
like hole-punch.

This patch (of 4):

Until there is a solution to the dma-to-dax vs truncate problem it is
not safe to allow long standing memory registrations against
filesytem-dax vmas.  Device-dax vmas do not have this problem and are
explicitly allowed.

This is temporary until a "memory registration with layout-lease"
mechanism can be implemented for the affected sub-systems (RDMA and
V4L2).

[akpm@linux-foundation.org: use kcalloc()]
Link: http://lkml.kernel.org/r/151068939435.7446.13560129395419350737.stgit@dwillia2-desk3.amr.corp.intel.com
Fixes: 3565fce3a659 ("mm, x86: get_user_pages() for dax mappings")
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Suggested-by: Christoph Hellwig <hch@lst.de>
Cc: Doug Ledford <dledford@redhat.com>
Cc: Hal Rosenstock <hal.rosenstock@gmail.com>
Cc: Inki Dae <inki.dae@samsung.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Gunthorpe <jgg@mellanox.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Joonyoung Shim <jy0922.shim@samsung.com>
Cc: Kyungmin Park <kyungmin.park@samsung.com>
Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Sean Hefty <sean.hefty@intel.com>
Cc: Seung-Woo Kim <sw0312.kim@samsung.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fs.h | 14 ++++++++++++
 include/linux/mm.h | 13 +++++++++++
 mm/gup.c           | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 91 insertions(+)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index bbd92da0946e..9dc498d16cc1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3194,6 +3194,20 @@ static inline bool vma_is_dax(struct vm_area_struct *vma)
 	return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host);
 }
 
+static inline bool vma_is_fsdax(struct vm_area_struct *vma)
+{
+	struct inode *inode;
+
+	if (!vma->vm_file)
+		return false;
+	if (!vma_is_dax(vma))
+		return false;
+	inode = file_inode(vma->vm_file);
+	if (inode->i_mode == S_IFCHR)
+		return false; /* device-dax */
+	return true;
+}
+
 static inline int iocb_flags(struct file *file)
 {
 	int res = 0;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b3b6a7e313e9..ea818ff739cd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1380,6 +1380,19 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
 		    unsigned int gup_flags, struct page **pages, int *locked);
 long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
 		    struct page **pages, unsigned int gup_flags);
+#ifdef CONFIG_FS_DAX
+long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
+			    unsigned int gup_flags, struct page **pages,
+			    struct vm_area_struct **vmas);
+#else
+static inline long get_user_pages_longterm(unsigned long start,
+		unsigned long nr_pages, unsigned int gup_flags,
+		struct page **pages, struct vm_area_struct **vmas)
+{
+	return get_user_pages(start, nr_pages, gup_flags, pages, vmas);
+}
+#endif /* CONFIG_FS_DAX */
+
 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 			struct page **pages);
 
diff --git a/mm/gup.c b/mm/gup.c
index 85cc822fd403..d3fb60e5bfac 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1095,6 +1095,70 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
 }
 EXPORT_SYMBOL(get_user_pages);
 
+#ifdef CONFIG_FS_DAX
+/*
+ * This is the same as get_user_pages() in that it assumes we are
+ * operating on the current task's mm, but it goes further to validate
+ * that the vmas associated with the address range are suitable for
+ * longterm elevated page reference counts. For example, filesystem-dax
+ * mappings are subject to the lifetime enforced by the filesystem and
+ * we need guarantees that longterm users like RDMA and V4L2 only
+ * establish mappings that have a kernel enforced revocation mechanism.
+ *
+ * "longterm" == userspace controlled elevated page count lifetime.
+ * Contrast this to iov_iter_get_pages() usages which are transient.
+ */
+long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
+		unsigned int gup_flags, struct page **pages,
+		struct vm_area_struct **vmas_arg)
+{
+	struct vm_area_struct **vmas = vmas_arg;
+	struct vm_area_struct *vma_prev = NULL;
+	long rc, i;
+
+	if (!pages)
+		return -EINVAL;
+
+	if (!vmas) {
+		vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *),
+			       GFP_KERNEL);
+		if (!vmas)
+			return -ENOMEM;
+	}
+
+	rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
+
+	for (i = 0; i < rc; i++) {
+		struct vm_area_struct *vma = vmas[i];
+
+		if (vma == vma_prev)
+			continue;
+
+		vma_prev = vma;
+
+		if (vma_is_fsdax(vma))
+			break;
+	}
+
+	/*
+	 * Either get_user_pages() failed, or the vma validation
+	 * succeeded, in either case we don't need to put_page() before
+	 * returning.
+	 */
+	if (i >= rc)
+		goto out;
+
+	for (i = 0; i < rc; i++)
+		put_page(pages[i]);
+	rc = -EOPNOTSUPP;
+out:
+	if (vmas != vmas_arg)
+		kfree(vmas);
+	return rc;
+}
+EXPORT_SYMBOL(get_user_pages_longterm);
+#endif /* CONFIG_FS_DAX */
+
 /**
  * populate_vma_page_range() -  populate a range of pages in the vma.
  * @vma:   target vma
-- 
cgit v1.2.3


From 40a899ed16486455f964e46d1af31fd4fded21c1 Mon Sep 17 00:00:00 2001
From: Zi Yan <zi.yan@cs.rutgers.edu>
Date: Wed, 29 Nov 2017 16:11:12 -0800
Subject: mm: migrate: fix an incorrect call of prep_transhuge_page()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In https://lkml.org/lkml/2017/11/20/411, Andrea reported that during
memory hotplug/hot remove prep_transhuge_page() is called incorrectly on
non-THP pages for migration, when THP is on but THP migration is not
enabled.  This leads to a bad state of target pages for migration.

By inspecting the code, if called on a non-THP, prep_transhuge_page()
will

 1) change the value of the mapping of (page + 2), since it is used for
    THP deferred list;

 2) change the lru value of (page + 1), since it is used for THP's dtor.

Both can lead to data corruption of these two pages.

Andrea said:
 "Pragmatically and from the point of view of the memory_hotplug subsys,
  the effect is a kernel crash when pages are being migrated during a
  memory hot remove offline and migration target pages are found in a
  bad state"

This patch fixes it by only calling prep_transhuge_page() when we are
certain that the target page is THP.

Link: http://lkml.kernel.org/r/20171121021855.50525-1-zi.yan@sent.com
Fixes: 8135d8926c08 ("mm: memory_hotplug: memory hotremove supports thp migration")
Signed-off-by: Zi Yan <zi.yan@cs.rutgers.edu>
Reported-by: Andrea Reale <ar@linux.vnet.ibm.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: "Jérôme Glisse" <jglisse@redhat.com>
Cc: <stable@vger.kernel.org>	[4.14]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/migrate.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 895ec0c4942e..a2246cf670ba 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -54,7 +54,7 @@ static inline struct page *new_page_nodemask(struct page *page,
 	new_page = __alloc_pages_nodemask(gfp_mask, order,
 				preferred_nid, nodemask);
 
-	if (new_page && PageTransHuge(page))
+	if (new_page && PageTransHuge(new_page))
 		prep_transhuge_page(new_page);
 
 	return new_page;
-- 
cgit v1.2.3


From 5d38f049cee1e1c4a7ac55aa79d37d01ddcc3860 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Wed, 29 Nov 2017 16:11:26 -0800
Subject: autofs: revert "autofs: fix AT_NO_AUTOMOUNT not being honored"

Commit 42f461482178 ("autofs: fix AT_NO_AUTOMOUNT not being honored")
allowed the fstatat(2) system call to properly honor the AT_NO_AUTOMOUNT
flag but introduced a semantic change.

In order to honor AT_NO_AUTOMOUNT a semantic change was made to the
negative dentry case for stat family system calls in follow_automount().

This changed the unconditional triggering of an automount in this case
to no longer be done and an error returned instead.

This has caused more problems than I expected so reverting the change is
needed.

In a discussion with Neil Brown it was concluded that the automount(8)
daemon can implement this change without kernel modifications.  So that
will be done instead and the autofs module documentation updated with a
description of the problem and what needs to be done by module users for
this specific case.

Link: http://lkml.kernel.org/r/151174730120.6162.3848002191530283984.stgit@pluto.themaw.net
Fixes: 42f4614821 ("autofs: fix AT_NO_AUTOMOUNT not being honored")
Signed-off-by: Ian Kent <raven@themaw.net>
Cc: Neil Brown <neilb@suse.com>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: David Howells <dhowells@redhat.com>
Cc: Colin Walters <walters@redhat.com>
Cc: Ondrej Holy <oholy@redhat.com>
Cc: <stable@vger.kernel.org>	[4.11+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namei.c         | 15 +++------------
 include/linux/fs.h |  3 ++-
 2 files changed, 5 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/fs/namei.c b/fs/namei.c
index f0c7a7b9b6ca..9cc91fb7f156 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1129,18 +1129,9 @@ static int follow_automount(struct path *path, struct nameidata *nd,
 	 * of the daemon to instantiate them before they can be used.
 	 */
 	if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
-			   LOOKUP_OPEN | LOOKUP_CREATE |
-			   LOOKUP_AUTOMOUNT))) {
-		/* Positive dentry that isn't meant to trigger an
-		 * automount, EISDIR will allow it to be used,
-		 * otherwise there's no mount here "now" so return
-		 * ENOENT.
-		 */
-		if (path->dentry->d_inode)
-			return -EISDIR;
-		else
-			return -ENOENT;
-	}
+			   LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
+	    path->dentry->d_inode)
+		return -EISDIR;
 
 	if (path->dentry->d_sb->s_user_ns != &init_user_ns)
 		return -EACCES;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9dc498d16cc1..511fbaabf624 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3088,7 +3088,8 @@ static inline int vfs_lstat(const char __user *name, struct kstat *stat)
 static inline int vfs_fstatat(int dfd, const char __user *filename,
 			      struct kstat *stat, int flags)
 {
-	return vfs_statx(dfd, filename, flags, stat, STATX_BASIC_STATS);
+	return vfs_statx(dfd, filename, flags | AT_NO_AUTOMOUNT,
+			 stat, STATX_BASIC_STATS);
 }
 static inline int vfs_fstat(int fd, struct kstat *stat)
 {
-- 
cgit v1.2.3


From f8821f96ae97260d68228fe53f81848b2ede44d7 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Thu, 30 Nov 2017 14:33:56 +0100
Subject: skbuff: Grammar s/are can/can/, s/change/changes/

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index bc486ef23f20..a38c80e9f91e 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1406,8 +1406,7 @@ static inline struct sk_buff *skb_get(struct sk_buff *skb)
 }
 
 /*
- * If users == 1, we are the only owner and are can avoid redundant
- * atomic change.
+ * If users == 1, we are the only owner and can avoid redundant atomic changes.
  */
 
 /**
-- 
cgit v1.2.3


From 90a6ec85351b31449c2c6b5406b5396ac96f191d Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Wed, 29 Nov 2017 16:07:51 -0800
Subject: act_sample: get rid of tcf_sample_cleanup_rcu()

Similar to commit d7fb60b9cafb ("net_sched: get rid of tcfa_rcu"),
TC actions don't need to respect RCU grace period, because it
is either just detached from tc filter (standalone case) or
it is removed together with tc filter (bound case) in which case
RCU grace period is already respected at filter layer.

Fixes: 5c5670fae430 ("net/sched: Introduce sample tc action")
Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: Jiri Pirko <jiri@resnulli.us>
Cc: Yotam Gigi <yotamg@mellanox.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_sample.h |  1 -
 net/sched/act_sample.c         | 14 +++-----------
 2 files changed, 3 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/net/tc_act/tc_sample.h b/include/net/tc_act/tc_sample.h
index 524cee4f4c81..01dbfea32672 100644
--- a/include/net/tc_act/tc_sample.h
+++ b/include/net/tc_act/tc_sample.h
@@ -14,7 +14,6 @@ struct tcf_sample {
 	struct psample_group __rcu *psample_group;
 	u32 psample_group_num;
 	struct list_head tcfm_list;
-	struct rcu_head rcu;
 };
 #define to_sample(a) ((struct tcf_sample *)a)
 
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 8b5abcd2f32f..9438969290a6 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -96,23 +96,16 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla,
 	return ret;
 }
 
-static void tcf_sample_cleanup_rcu(struct rcu_head *rcu)
+static void tcf_sample_cleanup(struct tc_action *a, int bind)
 {
-	struct tcf_sample *s = container_of(rcu, struct tcf_sample, rcu);
+	struct tcf_sample *s = to_sample(a);
 	struct psample_group *psample_group;
 
-	psample_group = rcu_dereference_protected(s->psample_group, 1);
+	psample_group = rtnl_dereference(s->psample_group);
 	RCU_INIT_POINTER(s->psample_group, NULL);
 	psample_group_put(psample_group);
 }
 
-static void tcf_sample_cleanup(struct tc_action *a, int bind)
-{
-	struct tcf_sample *s = to_sample(a);
-
-	call_rcu(&s->rcu, tcf_sample_cleanup_rcu);
-}
-
 static bool tcf_sample_dev_ok_push(struct net_device *dev)
 {
 	switch (dev->type) {
@@ -264,7 +257,6 @@ static int __init sample_init_module(void)
 
 static void __exit sample_cleanup_module(void)
 {
-	rcu_barrier();
 	tcf_unregister_action(&act_sample_ops, &sample_net_ops);
 }
 
-- 
cgit v1.2.3


From e5f612969c6f965e3bd1158598e0a3b1c4f389b9 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sat, 25 Nov 2017 21:18:35 +0800
Subject: sctp: abandon the whole msg if one part of a fragmented message is
 abandoned

As rfc3758#section-3.1 demands:

   A3) When a TSN is "abandoned", if it is part of a fragmented message,
       all other TSN's within that fragmented message MUST be abandoned
       at the same time.

Besides, if it couldn't handle this, the rest frags would never get
assembled in peer side.

This patch supports it by adding abandoned flag in sctp_datamsg, when
one chunk is being abandoned, set chunk->msg->abandoned as well. Next
time when checking for abandoned, go checking chunk->msg->abandoned
first.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h |  3 ++-
 net/sctp/chunk.c           |  7 +++++++
 net/sctp/outqueue.c        | 12 ++++++++----
 3 files changed, 17 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 16f949eef52f..2f8f93da5dc2 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -503,7 +503,8 @@ struct sctp_datamsg {
 	/* Did the messenge fail to send? */
 	int send_error;
 	u8 send_failed:1,
-	   can_delay;	    /* should this message be Nagle delayed */
+	   can_delay:1,	/* should this message be Nagle delayed */
+	   abandoned:1;	/* should this message be abandoned */
 };
 
 struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *,
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 7b261afc47b9..9213805b558d 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -53,6 +53,7 @@ static void sctp_datamsg_init(struct sctp_datamsg *msg)
 	msg->send_failed = 0;
 	msg->send_error = 0;
 	msg->can_delay = 1;
+	msg->abandoned = 0;
 	msg->expires_at = 0;
 	INIT_LIST_HEAD(&msg->chunks);
 }
@@ -304,6 +305,9 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk)
 	if (!chunk->asoc->peer.prsctp_capable)
 		return 0;
 
+	if (chunk->msg->abandoned)
+		return 1;
+
 	if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) &&
 	    time_after(jiffies, chunk->msg->expires_at)) {
 		struct sctp_stream_out *streamout =
@@ -316,6 +320,7 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk)
 			chunk->asoc->abandoned_unsent[SCTP_PR_INDEX(TTL)]++;
 			streamout->ext->abandoned_unsent[SCTP_PR_INDEX(TTL)]++;
 		}
+		chunk->msg->abandoned = 1;
 		return 1;
 	} else if (SCTP_PR_RTX_ENABLED(chunk->sinfo.sinfo_flags) &&
 		   chunk->sent_count > chunk->sinfo.sinfo_timetolive) {
@@ -324,10 +329,12 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk)
 
 		chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++;
 		streamout->ext->abandoned_sent[SCTP_PR_INDEX(RTX)]++;
+		chunk->msg->abandoned = 1;
 		return 1;
 	} else if (!SCTP_PR_POLICY(chunk->sinfo.sinfo_flags) &&
 		   chunk->msg->expires_at &&
 		   time_after(jiffies, chunk->msg->expires_at)) {
+		chunk->msg->abandoned = 1;
 		return 1;
 	}
 	/* PRIO policy is processed by sendmsg, not here */
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 7029f8b99063..4ab164b5aad0 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -364,10 +364,12 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc,
 	list_for_each_entry_safe(chk, temp, queue, transmitted_list) {
 		struct sctp_stream_out *streamout;
 
-		if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) ||
-		    chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive)
+		if (!chk->msg->abandoned &&
+		    (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) ||
+		     chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive))
 			continue;
 
+		chk->msg->abandoned = 1;
 		list_del_init(&chk->transmitted_list);
 		sctp_insert_list(&asoc->outqueue.abandoned,
 				 &chk->transmitted_list);
@@ -404,10 +406,12 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc,
 	q->sched->unsched_all(&asoc->stream);
 
 	list_for_each_entry_safe(chk, temp, &q->out_chunk_list, list) {
-		if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) ||
-		    chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive)
+		if (!chk->msg->abandoned &&
+		    (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) ||
+		     chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive))
 			continue;
 
+		chk->msg->abandoned = 1;
 		sctp_sched_dequeue_common(q, chk);
 		asoc->sent_cnt_removable--;
 		asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++;
-- 
cgit v1.2.3


From 4db2b604c05afc3d2678fe01d3136c015df313ec Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 22 Nov 2017 11:47:28 +0100
Subject: move libgcc.h to include/linux

Introducing a new include/lib directory just for this file totally
messes up tab completion for include/linux, which is highly annoying.

Move it to include/linux where we have headers for all kinds of other
lib/ code as well.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Palmer Dabbelt <palmer@sifive.com>
---
 include/lib/libgcc.h   | 43 -------------------------------------------
 include/linux/libgcc.h | 43 +++++++++++++++++++++++++++++++++++++++++++
 lib/ashldi3.c          |  2 +-
 lib/ashrdi3.c          |  2 +-
 lib/cmpdi2.c           |  2 +-
 lib/lshrdi3.c          |  2 +-
 lib/muldi3.c           |  2 +-
 lib/ucmpdi2.c          |  2 +-
 8 files changed, 49 insertions(+), 49 deletions(-)
 delete mode 100644 include/lib/libgcc.h
 create mode 100644 include/linux/libgcc.h

(limited to 'include')

diff --git a/include/lib/libgcc.h b/include/lib/libgcc.h
deleted file mode 100644
index 32e1e0f4b2d0..000000000000
--- a/include/lib/libgcc.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * include/lib/libgcc.h
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see the file COPYING, or write
- * to the Free Software Foundation, Inc.
- */
-
-#ifndef __LIB_LIBGCC_H
-#define __LIB_LIBGCC_H
-
-#include <asm/byteorder.h>
-
-typedef int word_type __attribute__ ((mode (__word__)));
-
-#ifdef __BIG_ENDIAN
-struct DWstruct {
-	int high, low;
-};
-#elif defined(__LITTLE_ENDIAN)
-struct DWstruct {
-	int low, high;
-};
-#else
-#error I feel sick.
-#endif
-
-typedef union {
-	struct DWstruct s;
-	long long ll;
-} DWunion;
-
-#endif /* __ASM_LIBGCC_H */
diff --git a/include/linux/libgcc.h b/include/linux/libgcc.h
new file mode 100644
index 000000000000..32e1e0f4b2d0
--- /dev/null
+++ b/include/linux/libgcc.h
@@ -0,0 +1,43 @@
+/*
+ * include/lib/libgcc.h
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see the file COPYING, or write
+ * to the Free Software Foundation, Inc.
+ */
+
+#ifndef __LIB_LIBGCC_H
+#define __LIB_LIBGCC_H
+
+#include <asm/byteorder.h>
+
+typedef int word_type __attribute__ ((mode (__word__)));
+
+#ifdef __BIG_ENDIAN
+struct DWstruct {
+	int high, low;
+};
+#elif defined(__LITTLE_ENDIAN)
+struct DWstruct {
+	int low, high;
+};
+#else
+#error I feel sick.
+#endif
+
+typedef union {
+	struct DWstruct s;
+	long long ll;
+} DWunion;
+
+#endif /* __ASM_LIBGCC_H */
diff --git a/lib/ashldi3.c b/lib/ashldi3.c
index 1b6087db95a5..3ffc46e3bb6c 100644
--- a/lib/ashldi3.c
+++ b/lib/ashldi3.c
@@ -16,7 +16,7 @@
 
 #include <linux/export.h>
 
-#include <lib/libgcc.h>
+#include <linux/libgcc.h>
 
 long long notrace __ashldi3(long long u, word_type b)
 {
diff --git a/lib/ashrdi3.c b/lib/ashrdi3.c
index 2e67c97ac65a..ea054550f0e8 100644
--- a/lib/ashrdi3.c
+++ b/lib/ashrdi3.c
@@ -16,7 +16,7 @@
 
 #include <linux/export.h>
 
-#include <lib/libgcc.h>
+#include <linux/libgcc.h>
 
 long long notrace __ashrdi3(long long u, word_type b)
 {
diff --git a/lib/cmpdi2.c b/lib/cmpdi2.c
index 6d7ebf6c2b86..2250da7e503e 100644
--- a/lib/cmpdi2.c
+++ b/lib/cmpdi2.c
@@ -16,7 +16,7 @@
 
 #include <linux/export.h>
 
-#include <lib/libgcc.h>
+#include <linux/libgcc.h>
 
 word_type notrace __cmpdi2(long long a, long long b)
 {
diff --git a/lib/lshrdi3.c b/lib/lshrdi3.c
index 8e845f4bb65f..99cfa5721f2d 100644
--- a/lib/lshrdi3.c
+++ b/lib/lshrdi3.c
@@ -17,7 +17,7 @@
  */
 
 #include <linux/module.h>
-#include <lib/libgcc.h>
+#include <linux/libgcc.h>
 
 long long notrace __lshrdi3(long long u, word_type b)
 {
diff --git a/lib/muldi3.c b/lib/muldi3.c
index 88938543e10a..54c8b3123376 100644
--- a/lib/muldi3.c
+++ b/lib/muldi3.c
@@ -15,7 +15,7 @@
  */
 
 #include <linux/export.h>
-#include <lib/libgcc.h>
+#include <linux/libgcc.h>
 
 #define W_TYPE_SIZE 32
 
diff --git a/lib/ucmpdi2.c b/lib/ucmpdi2.c
index 49a53505c8e3..25ca2d4c1e19 100644
--- a/lib/ucmpdi2.c
+++ b/lib/ucmpdi2.c
@@ -15,7 +15,7 @@
  */
 
 #include <linux/module.h>
-#include <lib/libgcc.h>
+#include <linux/libgcc.h>
 
 word_type __ucmpdi2(unsigned long long a, unsigned long long b)
 {
-- 
cgit v1.2.3


From 6d745ee8b5e81f3a33791e3c854fbbfd6f3e585e Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 6 Sep 2017 14:56:50 +0200
Subject: iio: stm32: fix adc/trigger link error

The ADC driver can trigger on either the timer or the lptim
trigger, but it only uses a Kconfig 'select' statement
to ensure that the first of the two is present. When the lptim
trigger is enabled as a loadable module, and the adc driver
is built-in, we now get a link error:

drivers/iio/adc/stm32-adc.o: In function `stm32_adc_get_trig_extsel':
stm32-adc.c:(.text+0x4e0): undefined reference to `is_stm32_lptim_trigger'

We could use a second 'select' statement and always have both
trigger drivers enabled when the adc driver is, but it seems that
the lptimer trigger was intentionally left optional, so it seems
better to keep it that way.

This adds a hack to use 'IS_REACHABLE()' rather than 'IS_ENABLED()',
which avoids the link error, but instead leads to the lptimer trigger
not being used in the broken configuration. I've added a runtime
warning for this case to help users figure out what they did wrong
if this should ever be done by accident.

Fixes: f0b638a7f6db ("iio: adc: stm32: add support for lptimer triggers")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: <Stable@vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/timer/stm32-lptim-trigger.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/iio/timer/stm32-lptim-trigger.h b/include/linux/iio/timer/stm32-lptim-trigger.h
index 34d59bfdce2d..464458d20b16 100644
--- a/include/linux/iio/timer/stm32-lptim-trigger.h
+++ b/include/linux/iio/timer/stm32-lptim-trigger.h
@@ -16,11 +16,14 @@
 #define LPTIM2_OUT	"lptim2_out"
 #define LPTIM3_OUT	"lptim3_out"
 
-#if IS_ENABLED(CONFIG_IIO_STM32_LPTIMER_TRIGGER)
+#if IS_REACHABLE(CONFIG_IIO_STM32_LPTIMER_TRIGGER)
 bool is_stm32_lptim_trigger(struct iio_trigger *trig);
 #else
 static inline bool is_stm32_lptim_trigger(struct iio_trigger *trig)
 {
+#if IS_ENABLED(CONFIG_IIO_STM32_LPTIMER_TRIGGER)
+	pr_warn_once("stm32 lptim_trigger not linked in\n");
+#endif
 	return false;
 }
 #endif
-- 
cgit v1.2.3


From b4d1605a8ea608fd7dc45b926a05d75d340bde4b Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sun, 3 Dec 2017 09:33:00 -0800
Subject: tcp: use IPCB instead of TCP_SKB_CB in inet_exact_dif_match()

After this fix : ("tcp: add tcp_v4_fill_cb()/tcp_v4_restore_cb()"),
socket lookups happen while skb->cb[] has not been mangled yet by TCP.

Fixes: a04a480d4392 ("net: Require exact match for TCP socket lookups if dif is l3mdev")
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 4e09398009c1..6998707e81f3 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -844,12 +844,11 @@ static inline int tcp_v6_sdif(const struct sk_buff *skb)
 }
 #endif
 
-/* TCP_SKB_CB reference means this can not be used from early demux */
 static inline bool inet_exact_dif_match(struct net *net, struct sk_buff *skb)
 {
 #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
 	if (!net->ipv4.sysctl_tcp_l3mdev_accept &&
-	    skb && ipv4_l3mdev_skb(TCP_SKB_CB(skb)->header.h4.flags))
+	    skb && ipv4_l3mdev_skb(IPCB(skb)->flags))
 		return true;
 #endif
 	return false;
-- 
cgit v1.2.3


From 250d0c7754aa37c6443f07f1f5f591e2806295d8 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 19 Oct 2017 10:32:13 +0200
Subject: tracing: always define trace_{irq,preempt}_{enable_disable}

We get a build error in the irqsoff tracer in some configurations:

kernel/trace/trace_irqsoff.c: In function 'trace_preempt_on':
kernel/trace/trace_irqsoff.c:855:2: error: implicit declaration of function 'trace_preempt_enable_rcuidle'; did you mean 'trace_irq_enable_rcuidle'? [-Werror=implicit-function-declaration]
  trace_preempt_enable_rcuidle(a0, a1);

The problem is that trace_preempt_enable_rcuidle() has different
definition based on multiple Kconfig symbols, but not all combinations
have a valid definition.

This changes the conditions so that we always get exactly one
definition of each of the four tracing macros. I have not tried
to verify that these definitions are sensible, but now we
can build all randconfig combinations again.

Link: http://lkml.kernel.org/r/20171019083230.2450779-1-arnd@arndb.de

Fixes: d59158162e03 ("tracing: Add support for preempt and irq enable/disable events")
Acked-by: Joel Fernandes <joelaf@google.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/trace/events/preemptirq.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/preemptirq.h b/include/trace/events/preemptirq.h
index f5024c560d8f..9c4eb33c5a1d 100644
--- a/include/trace/events/preemptirq.h
+++ b/include/trace/events/preemptirq.h
@@ -56,15 +56,18 @@ DEFINE_EVENT(preemptirq_template, preempt_enable,
 
 #include <trace/define_trace.h>
 
-#else /* !CONFIG_PREEMPTIRQ_EVENTS */
+#endif /* !CONFIG_PREEMPTIRQ_EVENTS */
 
+#if !defined(CONFIG_PREEMPTIRQ_EVENTS) || defined(CONFIG_PROVE_LOCKING)
 #define trace_irq_enable(...)
 #define trace_irq_disable(...)
-#define trace_preempt_enable(...)
-#define trace_preempt_disable(...)
 #define trace_irq_enable_rcuidle(...)
 #define trace_irq_disable_rcuidle(...)
+#endif
+
+#if !defined(CONFIG_PREEMPTIRQ_EVENTS) || !defined(CONFIG_DEBUG_PREEMPT)
+#define trace_preempt_enable(...)
+#define trace_preempt_disable(...)
 #define trace_preempt_enable_rcuidle(...)
 #define trace_preempt_disable_rcuidle(...)
-
 #endif
-- 
cgit v1.2.3


From a773d419275bf54854ca6cfda8f2594ed2790faa Mon Sep 17 00:00:00 2001
From: Felipe Balbi <felipe.balbi@linux.intel.com>
Date: Fri, 2 Jun 2017 13:20:25 +0300
Subject: tracing: Pass export pointer as argument to ->write()

By passing an export descriptor to the write function, users don't need to
keep a global static pointer and can rely on container_of() to fetch their
own structure.

Link: http://lkml.kernel.org/r/20170602102025.5140-1-felipe.balbi@linux.intel.com

Acked-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Reviewed-by: Chunyan Zhang <zhang.chunyan@linaro.org>
Signed-off-by: Felipe Balbi <felipe.balbi@linux.intel.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 drivers/hwtracing/stm/ftrace.c | 6 ++++--
 include/linux/trace.h          | 2 +-
 kernel/trace/trace.c           | 2 +-
 3 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/hwtracing/stm/ftrace.c b/drivers/hwtracing/stm/ftrace.c
index bd126a7c6da2..7da75644c750 100644
--- a/drivers/hwtracing/stm/ftrace.c
+++ b/drivers/hwtracing/stm/ftrace.c
@@ -42,9 +42,11 @@ static struct stm_ftrace {
  * @len:	length of the data packet
  */
 static void notrace
-stm_ftrace_write(const void *buf, unsigned int len)
+stm_ftrace_write(struct trace_export *export, const void *buf, unsigned int len)
 {
-	stm_source_write(&stm_ftrace.data, STM_FTRACE_CHAN, buf, len);
+	struct stm_ftrace *stm = container_of(export, struct stm_ftrace, ftrace);
+
+	stm_source_write(&stm->data, STM_FTRACE_CHAN, buf, len);
 }
 
 static int stm_ftrace_link(struct stm_source_data *data)
diff --git a/include/linux/trace.h b/include/linux/trace.h
index d24991c1fef3..b95ffb2188ab 100644
--- a/include/linux/trace.h
+++ b/include/linux/trace.h
@@ -18,7 +18,7 @@
  */
 struct trace_export {
 	struct trace_export __rcu	*next;
-	void (*write)(const void *, unsigned int);
+	void (*write)(struct trace_export *, const void *, unsigned int);
 };
 
 int register_ftrace_export(struct trace_export *export);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9f3f043ba3b7..59518b8126d0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2415,7 +2415,7 @@ trace_process_export(struct trace_export *export,
 
 	entry = ring_buffer_event_data(event);
 	size = ring_buffer_event_length(event);
-	export->write(entry, size);
+	export->write(export, entry, size);
 }
 
 static DEFINE_MUTEX(ftrace_export_lock);
-- 
cgit v1.2.3


From 4ce413d1840b25b101be3c0559161db8891f3360 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Fri, 1 Dec 2017 15:29:39 +0000
Subject: irqdesc: Use bool return type instead of int

The irq_balancing_disabled and irq_is_percpu{,_devid} functions are
clearly intended to return bool like the functions in
kernel/irq/settings.h, but actually return an int containing a masked
value of desc->status_use_accessors. This can lead to subtle breakage
if, for example, the return value is subsequently truncated when
assigned to a narrower type.

As Linus points out:

| In particular, what can (and _has_ happened) is that people end up
| using these functions that return true or false, and they assign the
| result to something like a bitfield (or a char) or whatever.
|
| And the code looks *obviously* correct, when you have things like
|
|      dev->percpu = irq_is_percpu_devid(dev->irq);
|
| and that "percpu" thing is just one status bit among many. It may even
| *work*, because maybe that "percpu" flag ends up not being all that
| important, or it just happens to never be set on the particular
| hardware that people end up testing.
|
| But while it looks obviously correct, and might even work, it's really
| fundamentally broken. Because that "true or false" function didn't
| actually return 0/1, it returned 0 or 0x20000.
|
| And 0x20000 may not fit in a bitmask or a "char" or whatever.

Fix the problem by consistently using bool as the return type for these
functions.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: marc.zyngier@arm.com
Link: https://lkml.kernel.org/r/1512142179-24616-1-git-send-email-will.deacon@arm.com
---
 include/linux/irqdesc.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index dd418955962b..39fb3700f7a9 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -230,7 +230,7 @@ irq_set_chip_handler_name_locked(struct irq_data *data, struct irq_chip *chip,
 	data->chip = chip;
 }
 
-static inline int irq_balancing_disabled(unsigned int irq)
+static inline bool irq_balancing_disabled(unsigned int irq)
 {
 	struct irq_desc *desc;
 
@@ -238,7 +238,7 @@ static inline int irq_balancing_disabled(unsigned int irq)
 	return desc->status_use_accessors & IRQ_NO_BALANCING_MASK;
 }
 
-static inline int irq_is_percpu(unsigned int irq)
+static inline bool irq_is_percpu(unsigned int irq)
 {
 	struct irq_desc *desc;
 
@@ -246,7 +246,7 @@ static inline int irq_is_percpu(unsigned int irq)
 	return desc->status_use_accessors & IRQ_PER_CPU;
 }
 
-static inline int irq_is_percpu_devid(unsigned int irq)
+static inline bool irq_is_percpu_devid(unsigned int irq)
 {
 	struct irq_desc *desc;
 
-- 
cgit v1.2.3


From c895f6f703ad7dd2f99e751d9884b0aa5d0eea25 Mon Sep 17 00:00:00 2001
From: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Date: Mon, 4 Dec 2017 10:56:44 +0100
Subject: bpf: correct broken uapi for BPF_PROG_TYPE_PERF_EVENT program type

Commit 0515e5999a466dfe ("bpf: introduce BPF_PROG_TYPE_PERF_EVENT
program type") introduced the bpf_perf_event_data structure which
exports the pt_regs structure.  This is OK for multiple architectures
but fail for s390 and arm64 which do not export pt_regs.  Programs
using them, for example, the bpf selftest fail to compile on these
architectures.

For s390, exporting the pt_regs is not an option because s390 wants
to allow changes to it.  For arm64, there is a user_pt_regs structure
that covers parts of the pt_regs structure for use by user space.

To solve the broken uapi for s390 and arm64, introduce an abstract
type for pt_regs and add an asm/bpf_perf_event.h file that concretes
the type.  An asm-generic header file covers the architectures that
export pt_regs today.

The arch-specific enablement for s390 and arm64 follows in separate
commits.

Reported-by: Thomas Richter <tmricht@linux.vnet.ibm.com>
Fixes: 0515e5999a466dfe ("bpf: introduce BPF_PROG_TYPE_PERF_EVENT program type")
Signed-off-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Reviewed-and-tested-by: Thomas Richter <tmricht@linux.vnet.ibm.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 arch/alpha/include/uapi/asm/Kbuild        | 2 ++
 arch/arc/include/uapi/asm/Kbuild          | 1 +
 arch/arm/include/uapi/asm/Kbuild          | 1 +
 arch/blackfin/include/uapi/asm/Kbuild     | 1 +
 arch/c6x/include/uapi/asm/Kbuild          | 1 +
 arch/cris/include/uapi/asm/Kbuild         | 1 +
 arch/frv/include/uapi/asm/Kbuild          | 2 ++
 arch/h8300/include/uapi/asm/Kbuild        | 1 +
 arch/hexagon/include/uapi/asm/Kbuild      | 1 +
 arch/ia64/include/uapi/asm/Kbuild         | 1 +
 arch/m32r/include/uapi/asm/Kbuild         | 1 +
 arch/m68k/include/uapi/asm/Kbuild         | 1 +
 arch/metag/include/uapi/asm/Kbuild        | 1 +
 arch/microblaze/include/uapi/asm/Kbuild   | 1 +
 arch/mips/include/uapi/asm/Kbuild         | 1 +
 arch/mn10300/include/uapi/asm/Kbuild      | 1 +
 arch/nios2/include/uapi/asm/Kbuild        | 1 +
 arch/openrisc/include/uapi/asm/Kbuild     | 1 +
 arch/parisc/include/uapi/asm/Kbuild       | 1 +
 arch/powerpc/include/uapi/asm/Kbuild      | 1 +
 arch/riscv/include/uapi/asm/Kbuild        | 1 +
 arch/score/include/uapi/asm/Kbuild        | 1 +
 arch/sh/include/uapi/asm/Kbuild           | 1 +
 arch/sparc/include/uapi/asm/Kbuild        | 1 +
 arch/tile/include/uapi/asm/Kbuild         | 1 +
 arch/unicore32/include/uapi/asm/Kbuild    | 1 +
 arch/x86/include/uapi/asm/Kbuild          | 1 +
 arch/xtensa/include/uapi/asm/Kbuild       | 1 +
 include/linux/perf_event.h                | 6 +++++-
 include/uapi/asm-generic/bpf_perf_event.h | 9 +++++++++
 include/uapi/linux/bpf_perf_event.h       | 5 ++---
 kernel/events/core.c                      | 2 +-
 32 files changed, 47 insertions(+), 5 deletions(-)
 create mode 100644 include/uapi/asm-generic/bpf_perf_event.h

(limited to 'include')

diff --git a/arch/alpha/include/uapi/asm/Kbuild b/arch/alpha/include/uapi/asm/Kbuild
index b15bf6bc0e94..14a2e9af97e9 100644
--- a/arch/alpha/include/uapi/asm/Kbuild
+++ b/arch/alpha/include/uapi/asm/Kbuild
@@ -1,2 +1,4 @@
 # UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
+
+generic-y += bpf_perf_event.h
diff --git a/arch/arc/include/uapi/asm/Kbuild b/arch/arc/include/uapi/asm/Kbuild
index fa6d0ff4ff89..170b5db64afe 100644
--- a/arch/arc/include/uapi/asm/Kbuild
+++ b/arch/arc/include/uapi/asm/Kbuild
@@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += auxvec.h
 generic-y += bitsperlong.h
+generic-y += bpf_perf_event.h
 generic-y += errno.h
 generic-y += fcntl.h
 generic-y += ioctl.h
diff --git a/arch/arm/include/uapi/asm/Kbuild b/arch/arm/include/uapi/asm/Kbuild
index 4d53de308ee0..4d1cc1847edf 100644
--- a/arch/arm/include/uapi/asm/Kbuild
+++ b/arch/arm/include/uapi/asm/Kbuild
@@ -7,6 +7,7 @@ generated-y += unistd-oabi.h
 generated-y += unistd-eabi.h
 
 generic-y += bitsperlong.h
+generic-y += bpf_perf_event.h
 generic-y += errno.h
 generic-y += ioctl.h
 generic-y += ipcbuf.h
diff --git a/arch/blackfin/include/uapi/asm/Kbuild b/arch/blackfin/include/uapi/asm/Kbuild
index aa624b4ab655..2240b38c2915 100644
--- a/arch/blackfin/include/uapi/asm/Kbuild
+++ b/arch/blackfin/include/uapi/asm/Kbuild
@@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += auxvec.h
 generic-y += bitsperlong.h
+generic-y += bpf_perf_event.h
 generic-y += errno.h
 generic-y += ioctl.h
 generic-y += ipcbuf.h
diff --git a/arch/c6x/include/uapi/asm/Kbuild b/arch/c6x/include/uapi/asm/Kbuild
index 67ee896a76a7..26644e15d854 100644
--- a/arch/c6x/include/uapi/asm/Kbuild
+++ b/arch/c6x/include/uapi/asm/Kbuild
@@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += auxvec.h
 generic-y += bitsperlong.h
+generic-y += bpf_perf_event.h
 generic-y += errno.h
 generic-y += fcntl.h
 generic-y += ioctl.h
diff --git a/arch/cris/include/uapi/asm/Kbuild b/arch/cris/include/uapi/asm/Kbuild
index 3687b54bb18e..3470c6e9c7b9 100644
--- a/arch/cris/include/uapi/asm/Kbuild
+++ b/arch/cris/include/uapi/asm/Kbuild
@@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += auxvec.h
 generic-y += bitsperlong.h
+generic-y += bpf_perf_event.h
 generic-y += errno.h
 generic-y += fcntl.h
 generic-y += ioctl.h
diff --git a/arch/frv/include/uapi/asm/Kbuild b/arch/frv/include/uapi/asm/Kbuild
index b15bf6bc0e94..14a2e9af97e9 100644
--- a/arch/frv/include/uapi/asm/Kbuild
+++ b/arch/frv/include/uapi/asm/Kbuild
@@ -1,2 +1,4 @@
 # UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
+
+generic-y += bpf_perf_event.h
diff --git a/arch/h8300/include/uapi/asm/Kbuild b/arch/h8300/include/uapi/asm/Kbuild
index 187aed820e71..2f65f78792cb 100644
--- a/arch/h8300/include/uapi/asm/Kbuild
+++ b/arch/h8300/include/uapi/asm/Kbuild
@@ -2,6 +2,7 @@
 include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += auxvec.h
+generic-y += bpf_perf_event.h
 generic-y += errno.h
 generic-y += fcntl.h
 generic-y += ioctl.h
diff --git a/arch/hexagon/include/uapi/asm/Kbuild b/arch/hexagon/include/uapi/asm/Kbuild
index cb5df3aad3a8..41a176dbb53e 100644
--- a/arch/hexagon/include/uapi/asm/Kbuild
+++ b/arch/hexagon/include/uapi/asm/Kbuild
@@ -2,6 +2,7 @@
 include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += auxvec.h
+generic-y += bpf_perf_event.h
 generic-y += errno.h
 generic-y += fcntl.h
 generic-y += ioctl.h
diff --git a/arch/ia64/include/uapi/asm/Kbuild b/arch/ia64/include/uapi/asm/Kbuild
index 13a97aa2285f..f5c6967a93bb 100644
--- a/arch/ia64/include/uapi/asm/Kbuild
+++ b/arch/ia64/include/uapi/asm/Kbuild
@@ -1,4 +1,5 @@
 # UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
+generic-y += bpf_perf_event.h
 generic-y += kvm_para.h
diff --git a/arch/m32r/include/uapi/asm/Kbuild b/arch/m32r/include/uapi/asm/Kbuild
index 1c44d3b3eba0..451bf6071c6e 100644
--- a/arch/m32r/include/uapi/asm/Kbuild
+++ b/arch/m32r/include/uapi/asm/Kbuild
@@ -1,5 +1,6 @@
 # UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
+generic-y += bpf_perf_event.h
 generic-y += kvm_para.h
 generic-y += siginfo.h
diff --git a/arch/m68k/include/uapi/asm/Kbuild b/arch/m68k/include/uapi/asm/Kbuild
index 3717b64a620d..c2e26a44c482 100644
--- a/arch/m68k/include/uapi/asm/Kbuild
+++ b/arch/m68k/include/uapi/asm/Kbuild
@@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += auxvec.h
 generic-y += bitsperlong.h
+generic-y += bpf_perf_event.h
 generic-y += errno.h
 generic-y += ioctl.h
 generic-y += ipcbuf.h
diff --git a/arch/metag/include/uapi/asm/Kbuild b/arch/metag/include/uapi/asm/Kbuild
index 6ac763d9a3e3..f9eaf07d29f8 100644
--- a/arch/metag/include/uapi/asm/Kbuild
+++ b/arch/metag/include/uapi/asm/Kbuild
@@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += auxvec.h
 generic-y += bitsperlong.h
+generic-y += bpf_perf_event.h
 generic-y += errno.h
 generic-y += fcntl.h
 generic-y += ioctl.h
diff --git a/arch/microblaze/include/uapi/asm/Kbuild b/arch/microblaze/include/uapi/asm/Kbuild
index 06609ca36115..2c6a6bffea32 100644
--- a/arch/microblaze/include/uapi/asm/Kbuild
+++ b/arch/microblaze/include/uapi/asm/Kbuild
@@ -2,6 +2,7 @@
 include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += bitsperlong.h
+generic-y += bpf_perf_event.h
 generic-y += errno.h
 generic-y += fcntl.h
 generic-y += ioctl.h
diff --git a/arch/mips/include/uapi/asm/Kbuild b/arch/mips/include/uapi/asm/Kbuild
index a0266feba9e6..7a4becd8963a 100644
--- a/arch/mips/include/uapi/asm/Kbuild
+++ b/arch/mips/include/uapi/asm/Kbuild
@@ -1,4 +1,5 @@
 # UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
+generic-y += bpf_perf_event.h
 generic-y += ipcbuf.h
diff --git a/arch/mn10300/include/uapi/asm/Kbuild b/arch/mn10300/include/uapi/asm/Kbuild
index c94ee54210bc..81271d3af47c 100644
--- a/arch/mn10300/include/uapi/asm/Kbuild
+++ b/arch/mn10300/include/uapi/asm/Kbuild
@@ -1,4 +1,5 @@
 # UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
+generic-y	+= bpf_perf_event.h
 generic-y	+= siginfo.h
diff --git a/arch/nios2/include/uapi/asm/Kbuild b/arch/nios2/include/uapi/asm/Kbuild
index ffca24da7647..13a3d77b4d7b 100644
--- a/arch/nios2/include/uapi/asm/Kbuild
+++ b/arch/nios2/include/uapi/asm/Kbuild
@@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += auxvec.h
 generic-y += bitsperlong.h
+generic-y += bpf_perf_event.h
 generic-y += errno.h
 generic-y += fcntl.h
 generic-y += ioctl.h
diff --git a/arch/openrisc/include/uapi/asm/Kbuild b/arch/openrisc/include/uapi/asm/Kbuild
index 62286dbeb904..130c16ccba0a 100644
--- a/arch/openrisc/include/uapi/asm/Kbuild
+++ b/arch/openrisc/include/uapi/asm/Kbuild
@@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += auxvec.h
 generic-y += bitsperlong.h
+generic-y += bpf_perf_event.h
 generic-y += errno.h
 generic-y += fcntl.h
 generic-y += ioctl.h
diff --git a/arch/parisc/include/uapi/asm/Kbuild b/arch/parisc/include/uapi/asm/Kbuild
index 196d2a4efb31..286ef5a5904b 100644
--- a/arch/parisc/include/uapi/asm/Kbuild
+++ b/arch/parisc/include/uapi/asm/Kbuild
@@ -2,6 +2,7 @@
 include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += auxvec.h
+generic-y += bpf_perf_event.h
 generic-y += kvm_para.h
 generic-y += param.h
 generic-y += poll.h
diff --git a/arch/powerpc/include/uapi/asm/Kbuild b/arch/powerpc/include/uapi/asm/Kbuild
index 0d960ef78a9a..1a6ed5919ffd 100644
--- a/arch/powerpc/include/uapi/asm/Kbuild
+++ b/arch/powerpc/include/uapi/asm/Kbuild
@@ -1,6 +1,7 @@
 # UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
+generic-y += bpf_perf_event.h
 generic-y += param.h
 generic-y += poll.h
 generic-y += resource.h
diff --git a/arch/riscv/include/uapi/asm/Kbuild b/arch/riscv/include/uapi/asm/Kbuild
index 5ded96b06352..7e91f4850475 100644
--- a/arch/riscv/include/uapi/asm/Kbuild
+++ b/arch/riscv/include/uapi/asm/Kbuild
@@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += setup.h
 generic-y += unistd.h
+generic-y += bpf_perf_event.h
 generic-y += errno.h
 generic-y += fcntl.h
 generic-y += ioctl.h
diff --git a/arch/score/include/uapi/asm/Kbuild b/arch/score/include/uapi/asm/Kbuild
index c94ee54210bc..81271d3af47c 100644
--- a/arch/score/include/uapi/asm/Kbuild
+++ b/arch/score/include/uapi/asm/Kbuild
@@ -1,4 +1,5 @@
 # UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
+generic-y	+= bpf_perf_event.h
 generic-y	+= siginfo.h
diff --git a/arch/sh/include/uapi/asm/Kbuild b/arch/sh/include/uapi/asm/Kbuild
index e28531333efa..ba4d39cb321d 100644
--- a/arch/sh/include/uapi/asm/Kbuild
+++ b/arch/sh/include/uapi/asm/Kbuild
@@ -2,6 +2,7 @@
 include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += bitsperlong.h
+generic-y += bpf_perf_event.h
 generic-y += errno.h
 generic-y += fcntl.h
 generic-y += ioctl.h
diff --git a/arch/sparc/include/uapi/asm/Kbuild b/arch/sparc/include/uapi/asm/Kbuild
index 2178c78c7c1a..4680ba246b55 100644
--- a/arch/sparc/include/uapi/asm/Kbuild
+++ b/arch/sparc/include/uapi/asm/Kbuild
@@ -1,4 +1,5 @@
 # UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
+generic-y += bpf_perf_event.h
 generic-y += types.h
diff --git a/arch/tile/include/uapi/asm/Kbuild b/arch/tile/include/uapi/asm/Kbuild
index 5711de0a1b5e..cc439612bcd5 100644
--- a/arch/tile/include/uapi/asm/Kbuild
+++ b/arch/tile/include/uapi/asm/Kbuild
@@ -1,6 +1,7 @@
 # UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
+generic-y += bpf_perf_event.h
 generic-y += errno.h
 generic-y += fcntl.h
 generic-y += ioctl.h
diff --git a/arch/unicore32/include/uapi/asm/Kbuild b/arch/unicore32/include/uapi/asm/Kbuild
index 759a71411169..8611ef980554 100644
--- a/arch/unicore32/include/uapi/asm/Kbuild
+++ b/arch/unicore32/include/uapi/asm/Kbuild
@@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += auxvec.h
 generic-y += bitsperlong.h
+generic-y += bpf_perf_event.h
 generic-y += errno.h
 generic-y += fcntl.h
 generic-y += ioctl.h
diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild
index da1489cb64dc..1e901e421f2d 100644
--- a/arch/x86/include/uapi/asm/Kbuild
+++ b/arch/x86/include/uapi/asm/Kbuild
@@ -1,6 +1,7 @@
 # UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
+generic-y += bpf_perf_event.h
 generated-y += unistd_32.h
 generated-y += unistd_64.h
 generated-y += unistd_x32.h
diff --git a/arch/xtensa/include/uapi/asm/Kbuild b/arch/xtensa/include/uapi/asm/Kbuild
index a5bcdfb890f1..837d4dd76785 100644
--- a/arch/xtensa/include/uapi/asm/Kbuild
+++ b/arch/xtensa/include/uapi/asm/Kbuild
@@ -2,6 +2,7 @@
 include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += bitsperlong.h
+generic-y += bpf_perf_event.h
 generic-y += errno.h
 generic-y += fcntl.h
 generic-y += ioctl.h
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2c9c87d8a0c1..7546822a1d74 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -15,6 +15,7 @@
 #define _LINUX_PERF_EVENT_H
 
 #include <uapi/linux/perf_event.h>
+#include <uapi/linux/bpf_perf_event.h>
 
 /*
  * Kernel-internal data types and definitions:
@@ -787,7 +788,7 @@ struct perf_output_handle {
 };
 
 struct bpf_perf_event_data_kern {
-	struct pt_regs *regs;
+	bpf_user_pt_regs_t *regs;
 	struct perf_sample_data *data;
 	struct perf_event *event;
 };
@@ -1177,6 +1178,9 @@ extern void perf_bp_event(struct perf_event *event, void *data);
 		(user_mode(regs) ? PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL)
 # define perf_instruction_pointer(regs)	instruction_pointer(regs)
 #endif
+#ifndef perf_arch_bpf_user_pt_regs
+# define perf_arch_bpf_user_pt_regs(regs) regs
+#endif
 
 static inline bool has_branch_stack(struct perf_event *event)
 {
diff --git a/include/uapi/asm-generic/bpf_perf_event.h b/include/uapi/asm-generic/bpf_perf_event.h
new file mode 100644
index 000000000000..53815d2cd047
--- /dev/null
+++ b/include/uapi/asm-generic/bpf_perf_event.h
@@ -0,0 +1,9 @@
+#ifndef _UAPI__ASM_GENERIC_BPF_PERF_EVENT_H__
+#define _UAPI__ASM_GENERIC_BPF_PERF_EVENT_H__
+
+#include <linux/ptrace.h>
+
+/* Export kernel pt_regs structure */
+typedef struct pt_regs bpf_user_pt_regs_t;
+
+#endif /* _UAPI__ASM_GENERIC_BPF_PERF_EVENT_H__ */
diff --git a/include/uapi/linux/bpf_perf_event.h b/include/uapi/linux/bpf_perf_event.h
index af549d4ecf1b..8f95303f9d80 100644
--- a/include/uapi/linux/bpf_perf_event.h
+++ b/include/uapi/linux/bpf_perf_event.h
@@ -8,11 +8,10 @@
 #ifndef _UAPI__LINUX_BPF_PERF_EVENT_H__
 #define _UAPI__LINUX_BPF_PERF_EVENT_H__
 
-#include <linux/types.h>
-#include <linux/ptrace.h>
+#include <asm/bpf_perf_event.h>
 
 struct bpf_perf_event_data {
-	struct pt_regs regs;
+	bpf_user_pt_regs_t regs;
 	__u64 sample_period;
 };
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 16beab4767e1..ba957b9812b3 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7987,11 +7987,11 @@ static void bpf_overflow_handler(struct perf_event *event,
 {
 	struct bpf_perf_event_data_kern ctx = {
 		.data = data,
-		.regs = regs,
 		.event = event,
 	};
 	int ret = 0;
 
+	ctx.regs = perf_arch_bpf_user_pt_regs(regs);
 	preempt_disable();
 	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
 		goto out;
-- 
cgit v1.2.3


From 5c472203421ab4f928aa1ae9e1dbcfdd80324148 Mon Sep 17 00:00:00 2001
From: Nogah Frankel <nogahf@mellanox.com>
Date: Mon, 4 Dec 2017 13:31:10 +0200
Subject: net_sched: red: Avoid devision by zero

Do not allow delta value to be zero since it is used as a divisor.

Fixes: 8af2a218de38 ("sch_red: Adaptative RED AQM")
Signed-off-by: Nogah Frankel <nogahf@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/red.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/red.h b/include/net/red.h
index 9a9347710701..5918f78d36a0 100644
--- a/include/net/red.h
+++ b/include/net/red.h
@@ -179,7 +179,7 @@ static inline void red_set_parms(struct red_parms *p,
 	p->qth_max	= qth_max << Wlog;
 	p->Wlog		= Wlog;
 	p->Plog		= Plog;
-	if (delta < 0)
+	if (delta <= 0)
 		delta = 1;
 	p->qth_delta	= delta;
 	if (!max_P) {
-- 
cgit v1.2.3


From 8afa10cbe281b10371fee5a87ab266e48d71a7f9 Mon Sep 17 00:00:00 2001
From: Nogah Frankel <nogahf@mellanox.com>
Date: Mon, 4 Dec 2017 13:31:11 +0200
Subject: net_sched: red: Avoid illegal values

Check the qmin & qmax values doesn't overflow for the given Wlog value.
Check that qmin <= qmax.

Fixes: a783474591f2 ("[PKT_SCHED]: Generic RED layer")
Signed-off-by: Nogah Frankel <nogahf@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/red.h     | 11 +++++++++++
 net/sched/sch_choke.c |  3 +++
 net/sched/sch_gred.c  |  3 +++
 net/sched/sch_red.c   |  2 ++
 net/sched/sch_sfq.c   |  3 +++
 5 files changed, 22 insertions(+)

(limited to 'include')

diff --git a/include/net/red.h b/include/net/red.h
index 5918f78d36a0..9665582c4687 100644
--- a/include/net/red.h
+++ b/include/net/red.h
@@ -168,6 +168,17 @@ static inline void red_set_vars(struct red_vars *v)
 	v->qcount	= -1;
 }
 
+static inline bool red_check_params(u32 qth_min, u32 qth_max, u8 Wlog)
+{
+	if (fls(qth_min) + Wlog > 32)
+		return false;
+	if (fls(qth_max) + Wlog > 32)
+		return false;
+	if (qth_max < qth_min)
+		return false;
+	return true;
+}
+
 static inline void red_set_parms(struct red_parms *p,
 				 u32 qth_min, u32 qth_max, u8 Wlog, u8 Plog,
 				 u8 Scell_log, u8 *stab, u32 max_P)
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index b30a2c70bd48..531250fceb9e 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -369,6 +369,9 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt)
 
 	ctl = nla_data(tb[TCA_CHOKE_PARMS]);
 
+	if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog))
+		return -EINVAL;
+
 	if (ctl->limit > CHOKE_MAX_QUEUE)
 		return -EINVAL;
 
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index 17c7130454bd..bc30f9186ac6 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -356,6 +356,9 @@ static inline int gred_change_vq(struct Qdisc *sch, int dp,
 	struct gred_sched *table = qdisc_priv(sch);
 	struct gred_sched_data *q = table->tab[dp];
 
+	if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog))
+		return -EINVAL;
+
 	if (!q) {
 		table->tab[dp] = q = *prealloc;
 		*prealloc = NULL;
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 7f8ea9e297c3..9d874e60e032 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -212,6 +212,8 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt)
 	max_P = tb[TCA_RED_MAX_P] ? nla_get_u32(tb[TCA_RED_MAX_P]) : 0;
 
 	ctl = nla_data(tb[TCA_RED_PARMS]);
+	if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog))
+		return -EINVAL;
 
 	if (ctl->limit > 0) {
 		child = fifo_create_dflt(sch, &bfifo_qdisc_ops, ctl->limit);
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 09c1203c1711..930e5bd26d3d 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -639,6 +639,9 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
 	if (ctl->divisor &&
 	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
 		return -EINVAL;
+	if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max,
+					ctl_v1->Wlog))
+		return -EINVAL;
 	if (ctl_v1 && ctl_v1->qth_min) {
 		p = kmalloc(sizeof(*p), GFP_KERNEL);
 		if (!p)
-- 
cgit v1.2.3


From f775b13eedee2f7f3c6fdd4e90fb79090ce5d339 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Tue, 14 Nov 2017 16:54:23 -0500
Subject: x86,kvm: move qemu/guest FPU switching out to vcpu_run
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, every time a VCPU is scheduled out, the host kernel will
first save the guest FPU/xstate context, then load the qemu userspace
FPU context, only to then immediately save the qemu userspace FPU
context back to memory. When scheduling in a VCPU, the same extraneous
FPU loads and saves are done.

This could be avoided by moving from a model where the guest FPU is
loaded and stored with preemption disabled, to a model where the
qemu userspace FPU is swapped out for the guest FPU context for
the duration of the KVM_RUN ioctl.

This is done under the VCPU mutex, which is also taken when other
tasks inspect the VCPU FPU context, so the code should already be
safe for this change. That should come as no surprise, given that
s390 already has this optimization.

This can fix a bug where KVM calls get_user_pages while owning the
FPU, and the file system ends up requesting the FPU again:

    [258270.527947]  __warn+0xcb/0xf0
    [258270.527948]  warn_slowpath_null+0x1d/0x20
    [258270.527951]  kernel_fpu_disable+0x3f/0x50
    [258270.527953]  __kernel_fpu_begin+0x49/0x100
    [258270.527955]  kernel_fpu_begin+0xe/0x10
    [258270.527958]  crc32c_pcl_intel_update+0x84/0xb0
    [258270.527961]  crypto_shash_update+0x3f/0x110
    [258270.527968]  crc32c+0x63/0x8a [libcrc32c]
    [258270.527975]  dm_bm_checksum+0x1b/0x20 [dm_persistent_data]
    [258270.527978]  node_prepare_for_write+0x44/0x70 [dm_persistent_data]
    [258270.527985]  dm_block_manager_write_callback+0x41/0x50 [dm_persistent_data]
    [258270.527988]  submit_io+0x170/0x1b0 [dm_bufio]
    [258270.527992]  __write_dirty_buffer+0x89/0x90 [dm_bufio]
    [258270.527994]  __make_buffer_clean+0x4f/0x80 [dm_bufio]
    [258270.527996]  __try_evict_buffer+0x42/0x60 [dm_bufio]
    [258270.527998]  dm_bufio_shrink_scan+0xc0/0x130 [dm_bufio]
    [258270.528002]  shrink_slab.part.40+0x1f5/0x420
    [258270.528004]  shrink_node+0x22c/0x320
    [258270.528006]  do_try_to_free_pages+0xf5/0x330
    [258270.528008]  try_to_free_pages+0xe9/0x190
    [258270.528009]  __alloc_pages_slowpath+0x40f/0xba0
    [258270.528011]  __alloc_pages_nodemask+0x209/0x260
    [258270.528014]  alloc_pages_vma+0x1f1/0x250
    [258270.528017]  do_huge_pmd_anonymous_page+0x123/0x660
    [258270.528021]  handle_mm_fault+0xfd3/0x1330
    [258270.528025]  __get_user_pages+0x113/0x640
    [258270.528027]  get_user_pages+0x4f/0x60
    [258270.528063]  __gfn_to_pfn_memslot+0x120/0x3f0 [kvm]
    [258270.528108]  try_async_pf+0x66/0x230 [kvm]
    [258270.528135]  tdp_page_fault+0x130/0x280 [kvm]
    [258270.528149]  kvm_mmu_page_fault+0x60/0x120 [kvm]
    [258270.528158]  handle_ept_violation+0x91/0x170 [kvm_intel]
    [258270.528162]  vmx_handle_exit+0x1ca/0x1400 [kvm_intel]

No performance changes were detected in quick ping-pong tests on
my 4 socket system, which is expected since an FPU+xstate load is
on the order of 0.1us, while ping-ponging between CPUs is on the
order of 20us, and somewhat noisy.

Cc: stable@vger.kernel.org
Signed-off-by: Rik van Riel <riel@redhat.com>
Suggested-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
[Fixed a bug where reset_vcpu called put_fpu without preceding load_fpu,
 which happened inside from KVM_CREATE_VCPU ioctl. - Radim]
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 13 +++++++++++++
 arch/x86/kvm/x86.c              | 39 +++++++++++++++++----------------------
 include/linux/kvm_host.h        |  2 +-
 3 files changed, 31 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 977de5fb968b..62527e053ee4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -536,7 +536,20 @@ struct kvm_vcpu_arch {
 	struct kvm_mmu_memory_cache mmu_page_cache;
 	struct kvm_mmu_memory_cache mmu_page_header_cache;
 
+	/*
+	 * QEMU userspace and the guest each have their own FPU state.
+	 * In vcpu_run, we switch between the user and guest FPU contexts.
+	 * While running a VCPU, the VCPU thread will have the guest FPU
+	 * context.
+	 *
+	 * Note that while the PKRU state lives inside the fpu registers,
+	 * it is switched out separately at VMENTER and VMEXIT time. The
+	 * "guest_fpu" state here contains the guest FPU context, with the
+	 * host PRKU bits.
+	 */
+	struct fpu user_fpu;
 	struct fpu guest_fpu;
+
 	u64 xcr0;
 	u64 guest_supported_xcr0;
 	u32 guest_xstate_size;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index eee8e7faf1af..c8da1680a7d6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2937,7 +2937,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
 	pagefault_enable();
 	kvm_x86_ops->vcpu_put(vcpu);
-	kvm_put_guest_fpu(vcpu);
 	vcpu->arch.last_host_tsc = rdtsc();
 }
 
@@ -5254,13 +5253,10 @@ static void emulator_halt(struct x86_emulate_ctxt *ctxt)
 
 static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt)
 {
-	preempt_disable();
-	kvm_load_guest_fpu(emul_to_vcpu(ctxt));
 }
 
 static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt)
 {
-	preempt_enable();
 }
 
 static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
@@ -6952,7 +6948,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	preempt_disable();
 
 	kvm_x86_ops->prepare_guest_switch(vcpu);
-	kvm_load_guest_fpu(vcpu);
 
 	/*
 	 * Disable IRQs before setting IN_GUEST_MODE.  Posted interrupt
@@ -7297,12 +7292,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		}
 	}
 
+	kvm_load_guest_fpu(vcpu);
+
 	if (unlikely(vcpu->arch.complete_userspace_io)) {
 		int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
 		vcpu->arch.complete_userspace_io = NULL;
 		r = cui(vcpu);
 		if (r <= 0)
-			goto out;
+			goto out_fpu;
 	} else
 		WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
 
@@ -7311,6 +7308,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	else
 		r = vcpu_run(vcpu);
 
+out_fpu:
+	kvm_put_guest_fpu(vcpu);
 out:
 	post_kvm_run_save(vcpu);
 	kvm_sigset_deactivate(vcpu);
@@ -7704,32 +7703,25 @@ static void fx_init(struct kvm_vcpu *vcpu)
 	vcpu->arch.cr0 |= X86_CR0_ET;
 }
 
+/* Swap (qemu) user FPU context for the guest FPU context. */
 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 {
-	if (vcpu->guest_fpu_loaded)
-		return;
-
-	/*
-	 * Restore all possible states in the guest,
-	 * and assume host would use all available bits.
-	 * Guest xcr0 would be loaded later.
-	 */
-	vcpu->guest_fpu_loaded = 1;
-	__kernel_fpu_begin();
+	preempt_disable();
+	copy_fpregs_to_fpstate(&vcpu->arch.user_fpu);
 	/* PKRU is separately restored in kvm_x86_ops->run.  */
 	__copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state,
 				~XFEATURE_MASK_PKRU);
+	preempt_enable();
 	trace_kvm_fpu(1);
 }
 
+/* When vcpu_run ends, restore user space FPU context. */
 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 {
-	if (!vcpu->guest_fpu_loaded)
-		return;
-
-	vcpu->guest_fpu_loaded = 0;
+	preempt_disable();
 	copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu);
-	__kernel_fpu_end();
+	copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state);
+	preempt_enable();
 	++vcpu->stat.fpu_reload;
 	trace_kvm_fpu(0);
 }
@@ -7846,7 +7838,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 		 * To avoid have the INIT path from kvm_apic_has_events() that be
 		 * called with loaded FPU and does not let userspace fix the state.
 		 */
-		kvm_put_guest_fpu(vcpu);
+		if (init_event)
+			kvm_put_guest_fpu(vcpu);
 		mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave,
 					XFEATURE_MASK_BNDREGS);
 		if (mpx_state_buffer)
@@ -7855,6 +7848,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 					XFEATURE_MASK_BNDCSR);
 		if (mpx_state_buffer)
 			memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr));
+		if (init_event)
+			kvm_load_guest_fpu(vcpu);
 	}
 
 	if (!init_event) {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 893d6d606cd0..6bdd4b9f6611 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -232,7 +232,7 @@ struct kvm_vcpu {
 	struct mutex mutex;
 	struct kvm_run *run;
 
-	int guest_fpu_loaded, guest_xcr0_loaded;
+	int guest_xcr0_loaded;
 	struct swait_queue_head wq;
 	struct pid __rcu *pid;
 	int sigset_active;
-- 
cgit v1.2.3


From d7efc6c11b277d9d80b99b1334a78bfe7d7edf10 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 5 Dec 2017 12:45:56 -0800
Subject: net: remove hlist_nulls_add_tail_rcu()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Alexander Potapenko reported use of uninitialized memory [1]

This happens when inserting a request socket into TCP ehash,
in __sk_nulls_add_node_rcu(), since sk_reuseport is not initialized.

Bug was added by commit d894ba18d4e4 ("soreuseport: fix ordering for
mixed v4/v6 sockets")

Note that d296ba60d8e2 ("soreuseport: Resolve merge conflict for v4/v6
ordering fix") missed the opportunity to get rid of
hlist_nulls_add_tail_rcu() :

Both UDP sockets and TCP/DCCP listeners no longer use
__sk_nulls_add_node_rcu() for their hash insertion.

Since all other sockets have unique 4-tuple, the reuseport status
has no special meaning, so we can always use hlist_nulls_add_head_rcu()
for them and save few cycles/instructions.

[1]

==================================================================
BUG: KMSAN: use of uninitialized memory in inet_ehash_insert+0xd40/0x1050
CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.13.0+ #3288
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
Call Trace:
 <IRQ>
 __dump_stack lib/dump_stack.c:16
 dump_stack+0x185/0x1d0 lib/dump_stack.c:52
 kmsan_report+0x13f/0x1c0 mm/kmsan/kmsan.c:1016
 __msan_warning_32+0x69/0xb0 mm/kmsan/kmsan_instr.c:766
 __sk_nulls_add_node_rcu ./include/net/sock.h:684
 inet_ehash_insert+0xd40/0x1050 net/ipv4/inet_hashtables.c:413
 reqsk_queue_hash_req net/ipv4/inet_connection_sock.c:754
 inet_csk_reqsk_queue_hash_add+0x1cc/0x300 net/ipv4/inet_connection_sock.c:765
 tcp_conn_request+0x31e7/0x36f0 net/ipv4/tcp_input.c:6414
 tcp_v4_conn_request+0x16d/0x220 net/ipv4/tcp_ipv4.c:1314
 tcp_rcv_state_process+0x42a/0x7210 net/ipv4/tcp_input.c:5917
 tcp_v4_do_rcv+0xa6a/0xcd0 net/ipv4/tcp_ipv4.c:1483
 tcp_v4_rcv+0x3de0/0x4ab0 net/ipv4/tcp_ipv4.c:1763
 ip_local_deliver_finish+0x6bb/0xcb0 net/ipv4/ip_input.c:216
 NF_HOOK ./include/linux/netfilter.h:248
 ip_local_deliver+0x3fa/0x480 net/ipv4/ip_input.c:257
 dst_input ./include/net/dst.h:477
 ip_rcv_finish+0x6fb/0x1540 net/ipv4/ip_input.c:397
 NF_HOOK ./include/linux/netfilter.h:248
 ip_rcv+0x10f6/0x15c0 net/ipv4/ip_input.c:488
 __netif_receive_skb_core+0x36f6/0x3f60 net/core/dev.c:4298
 __netif_receive_skb net/core/dev.c:4336
 netif_receive_skb_internal+0x63c/0x19c0 net/core/dev.c:4497
 napi_skb_finish net/core/dev.c:4858
 napi_gro_receive+0x629/0xa50 net/core/dev.c:4889
 e1000_receive_skb drivers/net/ethernet/intel/e1000/e1000_main.c:4018
 e1000_clean_rx_irq+0x1492/0x1d30
drivers/net/ethernet/intel/e1000/e1000_main.c:4474
 e1000_clean+0x43aa/0x5970 drivers/net/ethernet/intel/e1000/e1000_main.c:3819
 napi_poll net/core/dev.c:5500
 net_rx_action+0x73c/0x1820 net/core/dev.c:5566
 __do_softirq+0x4b4/0x8dd kernel/softirq.c:284
 invoke_softirq kernel/softirq.c:364
 irq_exit+0x203/0x240 kernel/softirq.c:405
 exiting_irq+0xe/0x10 ./arch/x86/include/asm/apic.h:638
 do_IRQ+0x15e/0x1a0 arch/x86/kernel/irq.c:263
 common_interrupt+0x86/0x86

Fixes: d894ba18d4e4 ("soreuseport: fix ordering for mixed v4/v6 sockets")
Fixes: d296ba60d8e2 ("soreuseport: Resolve merge conflict for v4/v6 ordering fix")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Alexander Potapenko <glider@google.com>
Acked-by: Craig Gallek <kraig@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rculist_nulls.h | 38 --------------------------------------
 include/net/sock.h            |  6 +-----
 2 files changed, 1 insertion(+), 43 deletions(-)

(limited to 'include')

diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
index a328e8181e49..e4b257ff881b 100644
--- a/include/linux/rculist_nulls.h
+++ b/include/linux/rculist_nulls.h
@@ -100,44 +100,6 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
 		first->pprev = &n->next;
 }
 
-/**
- * hlist_nulls_add_tail_rcu
- * @n: the element to add to the hash list.
- * @h: the list to add to.
- *
- * Description:
- * Adds the specified element to the end of the specified hlist_nulls,
- * while permitting racing traversals.  NOTE: tail insertion requires
- * list traversal.
- *
- * The caller must take whatever precautions are necessary
- * (such as holding appropriate locks) to avoid racing
- * with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
- * or hlist_nulls_del_rcu(), running on this same list.
- * However, it is perfectly legal to run concurrently with
- * the _rcu list-traversal primitives, such as
- * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency
- * problems on Alpha CPUs.  Regardless of the type of CPU, the
- * list-traversal primitive must be guarded by rcu_read_lock().
- */
-static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n,
-					struct hlist_nulls_head *h)
-{
-	struct hlist_nulls_node *i, *last = NULL;
-
-	for (i = hlist_nulls_first_rcu(h); !is_a_nulls(i);
-	     i = hlist_nulls_next_rcu(i))
-		last = i;
-
-	if (last) {
-		n->next = last->next;
-		n->pprev = &last->next;
-		rcu_assign_pointer(hlist_nulls_next_rcu(last), n);
-	} else {
-		hlist_nulls_add_head_rcu(n, h);
-	}
-}
-
 /**
  * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type
  * @tpos:	the type * to use as a loop cursor.
diff --git a/include/net/sock.h b/include/net/sock.h
index 79e1a2c7912c..9155da422692 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -685,11 +685,7 @@ static inline void sk_add_node_rcu(struct sock *sk, struct hlist_head *list)
 
 static inline void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
 {
-	if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
-	    sk->sk_family == AF_INET6)
-		hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list);
-	else
-		hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list);
+	hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list);
 }
 
 static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
-- 
cgit v1.2.3


From bb64da9aba89765fee74b395967b18a7d6c364e9 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Tue, 21 Nov 2017 16:02:52 +0100
Subject: KVM: s390: mark irq_state.flags as non-usable

Old kernels did not check for zero in the irq_state.flags field and old
QEMUs did not zero the flag/reserved fields when calling
KVM_S390_*_IRQ_STATE.  Let's add comments to prevent future uses of
these fields.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 Documentation/virtual/kvm/api.txt | 15 ++++++++++++---
 arch/s390/kvm/kvm-s390.c          |  6 ++++--
 include/uapi/linux/kvm.h          |  4 ++--
 3 files changed, 18 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index f670e4b9e7f3..57d3ee9e4bde 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2901,14 +2901,19 @@ userspace buffer and its length:
 
 struct kvm_s390_irq_state {
 	__u64 buf;
-	__u32 flags;
+	__u32 flags;        /* will stay unused for compatibility reasons */
 	__u32 len;
-	__u32 reserved[4];
+	__u32 reserved[4];  /* will stay unused for compatibility reasons */
 };
 
 Userspace passes in the above struct and for each pending interrupt a
 struct kvm_s390_irq is copied to the provided buffer.
 
+The structure contains a flags and a reserved field for future extensions. As
+the kernel never checked for flags == 0 and QEMU never pre-zeroed flags and
+reserved, these fields can not be used in the future without breaking
+compatibility.
+
 If -ENOBUFS is returned the buffer provided was too small and userspace
 may retry with a bigger buffer.
 
@@ -2932,10 +2937,14 @@ containing a struct kvm_s390_irq_state:
 
 struct kvm_s390_irq_state {
 	__u64 buf;
+	__u32 flags;        /* will stay unused for compatibility reasons */
 	__u32 len;
-	__u32 pad;
+	__u32 reserved[4];  /* will stay unused for compatibility reasons */
 };
 
+The restrictions for flags and reserved apply as well.
+(see KVM_S390_GET_IRQ_STATE)
+
 The userspace memory referenced by buf contains a struct kvm_s390_irq
 for each interrupt to be injected into the guest.
 If one of the interrupts could not be injected for some reason the
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 34375eed93ee..efa439f6ffb3 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1,8 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * hosting zSeries kernel virtual machines
+ * hosting IBM Z kernel virtual machines (s390x)
  *
- * Copyright IBM Corp. 2008, 2009
+ * Copyright IBM Corp. 2008, 2017
  *
  *    Author(s): Carsten Otte <cotte@de.ibm.com>
  *               Christian Borntraeger <borntraeger@de.ibm.com>
@@ -3808,6 +3808,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 			r = -EINVAL;
 			break;
 		}
+		/* do not use irq_state.flags, it will break old QEMUs */
 		r = kvm_s390_set_irq_state(vcpu,
 					   (void __user *) irq_state.buf,
 					   irq_state.len);
@@ -3823,6 +3824,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 			r = -EINVAL;
 			break;
 		}
+		/* do not use irq_state.flags, it will break old QEMUs */
 		r = kvm_s390_get_irq_state(vcpu,
 					   (__u8 __user *)  irq_state.buf,
 					   irq_state.len);
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 282d7613fce8..496e59a2738b 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -630,9 +630,9 @@ struct kvm_s390_irq {
 
 struct kvm_s390_irq_state {
 	__u64 buf;
-	__u32 flags;
+	__u32 flags;        /* will stay unused for compatibility reasons */
 	__u32 len;
-	__u32 reserved[4];
+	__u32 reserved[4];  /* will stay unused for compatibility reasons */
 };
 
 /* for KVM_SET_GUEST_DEBUG */
-- 
cgit v1.2.3


From a703c55004e1c5076d57e43771b3e11117796ea0 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Mon, 4 Dec 2017 21:48:18 +0100
Subject: drm: safely free connectors from connector_iter

In

commit 613051dac40da1751ab269572766d3348d45a197
Author: Daniel Vetter <daniel.vetter@ffwll.ch>
Date:   Wed Dec 14 00:08:06 2016 +0100

    drm: locking&new iterators for connector_list

we've went to extreme lengths to make sure connector iterations works
in any context, without introducing any additional locking context.
This worked, except for a small fumble in the implementation:

When we actually race with a concurrent connector unplug event, and
our temporary connector reference turns out to be the final one, then
everything breaks: We call the connector release function from
whatever context we happen to be in, which can be an irq/atomic
context. And connector freeing grabs all kinds of locks and stuff.

Fix this by creating a specially safe put function for connetor_iter,
which (in this rare case) punts the cleanup to a worker.

Reported-by: Ben Widawsky <ben@bwidawsk.net>
Cc: Ben Widawsky <ben@bwidawsk.net>
Fixes: 613051dac40d ("drm: locking&new iterators for connector_list")
Cc: Dave Airlie <airlied@gmail.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Sean Paul <seanpaul@chromium.org>
Cc: <stable@vger.kernel.org> # v4.11+
Reviewed-by: Dave Airlie <airlied@gmail.com>
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20171204204818.24745-1-daniel.vetter@ffwll.ch
---
 drivers/gpu/drm/drm_connector.c   | 28 ++++++++++++++++++++++++++--
 drivers/gpu/drm/drm_mode_config.c |  2 ++
 include/drm/drm_connector.h       |  8 ++++++++
 3 files changed, 36 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c
index 25f4b2e9a44f..482014137953 100644
--- a/drivers/gpu/drm/drm_connector.c
+++ b/drivers/gpu/drm/drm_connector.c
@@ -152,6 +152,16 @@ static void drm_connector_free(struct kref *kref)
 	connector->funcs->destroy(connector);
 }
 
+static void drm_connector_free_work_fn(struct work_struct *work)
+{
+	struct drm_connector *connector =
+		container_of(work, struct drm_connector, free_work);
+	struct drm_device *dev = connector->dev;
+
+	drm_mode_object_unregister(dev, &connector->base);
+	connector->funcs->destroy(connector);
+}
+
 /**
  * drm_connector_init - Init a preallocated connector
  * @dev: DRM device
@@ -181,6 +191,8 @@ int drm_connector_init(struct drm_device *dev,
 	if (ret)
 		return ret;
 
+	INIT_WORK(&connector->free_work, drm_connector_free_work_fn);
+
 	connector->base.properties = &connector->properties;
 	connector->dev = dev;
 	connector->funcs = funcs;
@@ -529,6 +541,18 @@ void drm_connector_list_iter_begin(struct drm_device *dev,
 }
 EXPORT_SYMBOL(drm_connector_list_iter_begin);
 
+/*
+ * Extra-safe connector put function that works in any context. Should only be
+ * used from the connector_iter functions, where we never really expect to
+ * actually release the connector when dropping our final reference.
+ */
+static void
+drm_connector_put_safe(struct drm_connector *conn)
+{
+	if (refcount_dec_and_test(&conn->base.refcount.refcount))
+		schedule_work(&conn->free_work);
+}
+
 /**
  * drm_connector_list_iter_next - return next connector
  * @iter: connectr_list iterator
@@ -561,7 +585,7 @@ drm_connector_list_iter_next(struct drm_connector_list_iter *iter)
 	spin_unlock_irqrestore(&config->connector_list_lock, flags);
 
 	if (old_conn)
-		drm_connector_put(old_conn);
+		drm_connector_put_safe(old_conn);
 
 	return iter->conn;
 }
@@ -580,7 +604,7 @@ void drm_connector_list_iter_end(struct drm_connector_list_iter *iter)
 {
 	iter->dev = NULL;
 	if (iter->conn)
-		drm_connector_put(iter->conn);
+		drm_connector_put_safe(iter->conn);
 	lock_release(&connector_list_iter_dep_map, 0, _RET_IP_);
 }
 EXPORT_SYMBOL(drm_connector_list_iter_end);
diff --git a/drivers/gpu/drm/drm_mode_config.c b/drivers/gpu/drm/drm_mode_config.c
index cda8bfab6d3b..cc78b3d9e5e4 100644
--- a/drivers/gpu/drm/drm_mode_config.c
+++ b/drivers/gpu/drm/drm_mode_config.c
@@ -431,6 +431,8 @@ void drm_mode_config_cleanup(struct drm_device *dev)
 		drm_connector_put(connector);
 	}
 	drm_connector_list_iter_end(&conn_iter);
+	/* connector_iter drops references in a work item. */
+	flush_scheduled_work();
 	if (WARN_ON(!list_empty(&dev->mode_config.connector_list))) {
 		drm_connector_list_iter_begin(dev, &conn_iter);
 		drm_for_each_connector_iter(connector, &conn_iter)
diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h
index df9807a3caae..a4649c56ca2f 100644
--- a/include/drm/drm_connector.h
+++ b/include/drm/drm_connector.h
@@ -916,6 +916,14 @@ struct drm_connector {
 	uint8_t num_h_tile, num_v_tile;
 	uint8_t tile_h_loc, tile_v_loc;
 	uint16_t tile_h_size, tile_v_size;
+
+	/**
+	 * @free_work:
+	 *
+	 * Work used only by &drm_connector_iter to be able to clean up a
+	 * connector from any context.
+	 */
+	struct work_struct free_work;
 };
 
 #define obj_to_connector(x) container_of(x, struct drm_connector, base)
-- 
cgit v1.2.3


From af97a77bc01ce49a466f9d4c0125479e2e2230b6 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Wed, 6 Dec 2017 09:50:08 +0000
Subject: efi: Move some sysfs files to be read-only by root

Thanks to the scripts/leaking_addresses.pl script, it was found that
some EFI values should not be readable by non-root users.

So make them root-only, and to do that, add a __ATTR_RO_MODE() macro to
make this easier, and use it in other places at the same time.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Tested-by: Dave Young <dyoung@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-efi@vger.kernel.org
Cc: stable <stable@vger.kernel.org>
Link: http://lkml.kernel.org/r/20171206095010.24170-2-ard.biesheuvel@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/firmware/efi/efi.c         |  3 +--
 drivers/firmware/efi/esrt.c        | 15 ++++++---------
 drivers/firmware/efi/runtime-map.c | 10 +++++-----
 include/linux/sysfs.h              |  6 ++++++
 4 files changed, 18 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index f70febf680c3..c3eefa126e3b 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -143,8 +143,7 @@ static ssize_t systab_show(struct kobject *kobj,
 	return str - buf;
 }
 
-static struct kobj_attribute efi_attr_systab =
-			__ATTR(systab, 0400, systab_show, NULL);
+static struct kobj_attribute efi_attr_systab = __ATTR_RO_MODE(systab, 0400);
 
 #define EFI_FIELD(var) efi.var
 
diff --git a/drivers/firmware/efi/esrt.c b/drivers/firmware/efi/esrt.c
index bd7ed3c1148a..7aae2483fcb9 100644
--- a/drivers/firmware/efi/esrt.c
+++ b/drivers/firmware/efi/esrt.c
@@ -106,7 +106,7 @@ static const struct sysfs_ops esre_attr_ops = {
 };
 
 /* Generic ESRT Entry ("ESRE") support. */
-static ssize_t esre_fw_class_show(struct esre_entry *entry, char *buf)
+static ssize_t fw_class_show(struct esre_entry *entry, char *buf)
 {
 	char *str = buf;
 
@@ -117,18 +117,16 @@ static ssize_t esre_fw_class_show(struct esre_entry *entry, char *buf)
 	return str - buf;
 }
 
-static struct esre_attribute esre_fw_class = __ATTR(fw_class, 0400,
-	esre_fw_class_show, NULL);
+static struct esre_attribute esre_fw_class = __ATTR_RO_MODE(fw_class, 0400);
 
 #define esre_attr_decl(name, size, fmt) \
-static ssize_t esre_##name##_show(struct esre_entry *entry, char *buf) \
+static ssize_t name##_show(struct esre_entry *entry, char *buf) \
 { \
 	return sprintf(buf, fmt "\n", \
 		       le##size##_to_cpu(entry->esre.esre1->name)); \
 } \
 \
-static struct esre_attribute esre_##name = __ATTR(name, 0400, \
-	esre_##name##_show, NULL)
+static struct esre_attribute esre_##name = __ATTR_RO_MODE(name, 0400)
 
 esre_attr_decl(fw_type, 32, "%u");
 esre_attr_decl(fw_version, 32, "%u");
@@ -193,14 +191,13 @@ static int esre_create_sysfs_entry(void *esre, int entry_num)
 
 /* support for displaying ESRT fields at the top level */
 #define esrt_attr_decl(name, size, fmt) \
-static ssize_t esrt_##name##_show(struct kobject *kobj, \
+static ssize_t name##_show(struct kobject *kobj, \
 				  struct kobj_attribute *attr, char *buf)\
 { \
 	return sprintf(buf, fmt "\n", le##size##_to_cpu(esrt->name)); \
 } \
 \
-static struct kobj_attribute esrt_##name = __ATTR(name, 0400, \
-	esrt_##name##_show, NULL)
+static struct kobj_attribute esrt_##name = __ATTR_RO_MODE(name, 0400)
 
 esrt_attr_decl(fw_resource_count, 32, "%u");
 esrt_attr_decl(fw_resource_count_max, 32, "%u");
diff --git a/drivers/firmware/efi/runtime-map.c b/drivers/firmware/efi/runtime-map.c
index 8e64b77aeac9..f377609ff141 100644
--- a/drivers/firmware/efi/runtime-map.c
+++ b/drivers/firmware/efi/runtime-map.c
@@ -63,11 +63,11 @@ static ssize_t map_attr_show(struct kobject *kobj, struct attribute *attr,
 	return map_attr->show(entry, buf);
 }
 
-static struct map_attribute map_type_attr = __ATTR_RO(type);
-static struct map_attribute map_phys_addr_attr   = __ATTR_RO(phys_addr);
-static struct map_attribute map_virt_addr_attr  = __ATTR_RO(virt_addr);
-static struct map_attribute map_num_pages_attr  = __ATTR_RO(num_pages);
-static struct map_attribute map_attribute_attr  = __ATTR_RO(attribute);
+static struct map_attribute map_type_attr = __ATTR_RO_MODE(type, 0400);
+static struct map_attribute map_phys_addr_attr = __ATTR_RO_MODE(phys_addr, 0400);
+static struct map_attribute map_virt_addr_attr = __ATTR_RO_MODE(virt_addr, 0400);
+static struct map_attribute map_num_pages_attr = __ATTR_RO_MODE(num_pages, 0400);
+static struct map_attribute map_attribute_attr = __ATTR_RO_MODE(attribute, 0400);
 
 /*
  * These are default attributes that are added for every memmap entry.
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index e32dfe098e82..40839c02d28c 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -117,6 +117,12 @@ struct attribute_group {
 	.show	= _name##_show,						\
 }
 
+#define __ATTR_RO_MODE(_name, _mode) {					\
+	.attr	= { .name = __stringify(_name),				\
+		    .mode = VERIFY_OCTAL_PERMISSIONS(_mode) },		\
+	.show	= _name##_show,						\
+}
+
 #define __ATTR_WO(_name) {						\
 	.attr	= { .name = __stringify(_name), .mode = S_IWUSR },	\
 	.store	= _name##_store,					\
-- 
cgit v1.2.3


From 7912af5c835bd86f2b0347a480e0f40e2fab30d0 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Wed, 6 Dec 2017 14:55:05 -0600
Subject: PCI: Add pci_get_domain_bus_and_slot() stub

The coretemp driver build fails when CONFIG_PCI is not enabled because it
uses a function that does not have a stub for that config case, so add the
function stub.

  ../drivers/hwmon/coretemp.c: In function 'adjust_tjmax':
  ../drivers/hwmon/coretemp.c:250:9: error: implicit declaration of function 'pci_get_domain_bus_and_slot' [-Werror=implicit-function-declaration]
    struct pci_dev *host_bridge = pci_get_domain_bus_and_slot(0, 0, devfn);
  ../drivers/hwmon/coretemp.c:250:32: warning: initialization makes pointer from integer without a cast [enabled by default]
    struct pci_dev *host_bridge = pci_get_domain_bus_and_slot(0, 0, devfn);

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
[bhelgaas: identical patch also by Arnd Bergmann <arnd@arndb.de>]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Guenter Roeck <linux@roeck-us.net>
---
 include/linux/pci.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 0403894147a3..c170c9250c8b 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1674,6 +1674,9 @@ static inline struct pci_dev *pci_get_slot(struct pci_bus *bus,
 static inline struct pci_dev *pci_get_bus_and_slot(unsigned int bus,
 						unsigned int devfn)
 { return NULL; }
+static inline struct pci_dev *pci_get_domain_bus_and_slot(int domain,
+					unsigned int bus, unsigned int devfn)
+{ return NULL; }
 
 static inline int pci_domain_nr(struct pci_bus *bus) { return 0; }
 static inline struct pci_dev *pci_dev_get(struct pci_dev *dev) { return NULL; }
-- 
cgit v1.2.3


From a4abd7a80addb4a9547f7dfc7812566b60ec505c Mon Sep 17 00:00:00 2001
From: Bjørn Mork <bjorn@mork.no>
Date: Wed, 6 Dec 2017 20:21:24 +0100
Subject: usbnet: fix alignment for frames with no ethernet header
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The qmi_wwan minidriver support a 'raw-ip' mode where frames are
received without any ethernet header. This causes alignment issues
because the skbs allocated by usbnet are "IP aligned".

Fix by allowing minidrivers to disable the additional alignment
offset. This is implemented using a per-device flag, since the same
minidriver also supports 'ethernet' mode.

Fixes: 32f7adf633b9 ("net: qmi_wwan: support "raw IP" mode")
Reported-and-tested-by: Jay Foster <jay@systech.com>
Signed-off-by: Bjørn Mork <bjorn@mork.no>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/qmi_wwan.c | 2 ++
 drivers/net/usb/usbnet.c   | 5 ++++-
 include/linux/usb/usbnet.h | 1 +
 3 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
index c750cf7c042b..304ec6555cd8 100644
--- a/drivers/net/usb/qmi_wwan.c
+++ b/drivers/net/usb/qmi_wwan.c
@@ -261,9 +261,11 @@ static void qmi_wwan_netdev_setup(struct net_device *net)
 		net->hard_header_len = 0;
 		net->addr_len        = 0;
 		net->flags           = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
+		set_bit(EVENT_NO_IP_ALIGN, &dev->flags);
 		netdev_dbg(net, "mode: raw IP\n");
 	} else if (!net->header_ops) { /* don't bother if already set */
 		ether_setup(net);
+		clear_bit(EVENT_NO_IP_ALIGN, &dev->flags);
 		netdev_dbg(net, "mode: Ethernet\n");
 	}
 
diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index 80348b6a8646..d56fe32bf48d 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -484,7 +484,10 @@ static int rx_submit (struct usbnet *dev, struct urb *urb, gfp_t flags)
 		return -ENOLINK;
 	}
 
-	skb = __netdev_alloc_skb_ip_align(dev->net, size, flags);
+	if (test_bit(EVENT_NO_IP_ALIGN, &dev->flags))
+		skb = __netdev_alloc_skb(dev->net, size, flags);
+	else
+		skb = __netdev_alloc_skb_ip_align(dev->net, size, flags);
 	if (!skb) {
 		netif_dbg(dev, rx_err, dev->net, "no rx skb\n");
 		usbnet_defer_kevent (dev, EVENT_RX_MEMORY);
diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h
index a69877734c4e..e2ec3582e549 100644
--- a/include/linux/usb/usbnet.h
+++ b/include/linux/usb/usbnet.h
@@ -82,6 +82,7 @@ struct usbnet {
 #		define EVENT_RX_KILL	10
 #		define EVENT_LINK_CHANGE	11
 #		define EVENT_SET_RX_MODE	12
+#		define EVENT_NO_IP_ALIGN	13
 };
 
 static inline struct usb_driver *driver_of(struct usb_interface *intf)
-- 
cgit v1.2.3


From d4761754b4fb2ef8d9a1e9d121c4bec84e1fe292 Mon Sep 17 00:00:00 2001
From: Yousuk Seung <ysseung@google.com>
Date: Thu, 7 Dec 2017 13:41:34 -0800
Subject: tcp: invalidate rate samples during SACK reneging

Mark tcp_sock during a SACK reneging event and invalidate rate samples
while marked. Such rate samples may overestimate bw by including packets
that were SACKed before reneging.

< ack 6001 win 10000 sack 7001:38001
< ack 7001 win 0 sack 8001:38001 // Reneg detected
> seq 7001:8001 // RTO, SACK cleared.
< ack 38001 win 10000

In above example the rate sample taken after the last ack will count
7001-38001 as delivered while the actual delivery rate likely could
be much lower i.e. 7001-8001.

This patch adds a new field tcp_sock.sack_reneg and marks it when we
declare SACK reneging and entering TCP_CA_Loss, and unmarks it after
the last rate sample was taken before moving back to TCP_CA_Open. This
patch also invalidates rate samples taken while tcp_sock.is_sack_reneg
is set.

Fixes: b9f64820fb22 ("tcp: track data delivery rate for a TCP connection")
Signed-off-by: Yousuk Seung <ysseung@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Priyaranjan Jha <priyarjha@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h  |  3 ++-
 include/net/tcp.h    |  2 +-
 net/ipv4/tcp.c       |  1 +
 net/ipv4/tcp_input.c | 10 ++++++++--
 net/ipv4/tcp_rate.c  | 10 +++++++---
 5 files changed, 19 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index df5d97a85e1a..ca4a6361389b 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -224,7 +224,8 @@ struct tcp_sock {
 		rate_app_limited:1,  /* rate_{delivered,interval_us} limited? */
 		fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
 		fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */
-		unused:3;
+		is_sack_reneg:1,    /* in recovery from loss with SACK reneg? */
+		unused:2;
 	u8	nonagle     : 4,/* Disable Nagle algorithm?             */
 		thin_lto    : 1,/* Use linear timeouts for thin streams */
 		unused1	    : 1,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6998707e81f3..6da880d2f022 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1055,7 +1055,7 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb);
 void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
 			    struct rate_sample *rs);
 void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
-		  struct rate_sample *rs);
+		  bool is_sack_reneg, struct rate_sample *rs);
 void tcp_rate_check_app_limited(struct sock *sk);
 
 /* These functions determine how the current flow behaves in respect of SACK
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index bf97317e6c97..f08eebe60446 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2412,6 +2412,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	tp->snd_cwnd_cnt = 0;
 	tp->window_clamp = 0;
 	tcp_set_ca_state(sk, TCP_CA_Open);
+	tp->is_sack_reneg = 0;
 	tcp_clear_retrans(tp);
 	inet_csk_delack_init(sk);
 	/* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 514c00732988..075c559570e6 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1942,6 +1942,8 @@ void tcp_enter_loss(struct sock *sk)
 	if (is_reneg) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
 		tp->sacked_out = 0;
+		/* Mark SACK reneging until we recover from this loss event. */
+		tp->is_sack_reneg = 1;
 	}
 	tcp_clear_all_retrans_hints(tp);
 
@@ -2365,6 +2367,7 @@ static bool tcp_try_undo_recovery(struct sock *sk)
 		return true;
 	}
 	tcp_set_ca_state(sk, TCP_CA_Open);
+	tp->is_sack_reneg = 0;
 	return false;
 }
 
@@ -2398,8 +2401,10 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
 			NET_INC_STATS(sock_net(sk),
 					LINUX_MIB_TCPSPURIOUSRTOS);
 		inet_csk(sk)->icsk_retransmits = 0;
-		if (frto_undo || tcp_is_sack(tp))
+		if (frto_undo || tcp_is_sack(tp)) {
 			tcp_set_ca_state(sk, TCP_CA_Open);
+			tp->is_sack_reneg = 0;
+		}
 		return true;
 	}
 	return false;
@@ -3496,6 +3501,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	struct tcp_sacktag_state sack_state;
 	struct rate_sample rs = { .prior_delivered = 0 };
 	u32 prior_snd_una = tp->snd_una;
+	bool is_sack_reneg = tp->is_sack_reneg;
 	u32 ack_seq = TCP_SKB_CB(skb)->seq;
 	u32 ack = TCP_SKB_CB(skb)->ack_seq;
 	bool is_dupack = false;
@@ -3612,7 +3618,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 	delivered = tp->delivered - delivered;	/* freshly ACKed or SACKed */
 	lost = tp->lost - lost;			/* freshly marked lost */
-	tcp_rate_gen(sk, delivered, lost, sack_state.rate);
+	tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
 	tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
 	tcp_xmit_recovery(sk, rexmit);
 	return 1;
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
index 3330a370d306..c61240e43923 100644
--- a/net/ipv4/tcp_rate.c
+++ b/net/ipv4/tcp_rate.c
@@ -106,7 +106,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
 
 /* Update the connection delivery information and generate a rate sample. */
 void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
-		  struct rate_sample *rs)
+		  bool is_sack_reneg, struct rate_sample *rs)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	u32 snd_us, ack_us;
@@ -124,8 +124,12 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
 
 	rs->acked_sacked = delivered;	/* freshly ACKed or SACKed */
 	rs->losses = lost;		/* freshly marked lost */
-	/* Return an invalid sample if no timing information is available. */
-	if (!rs->prior_mstamp) {
+	/* Return an invalid sample if no timing information is available or
+	 * in recovery from loss with SACK reneging. Rate samples taken during
+	 * a SACK reneging event may overestimate bw by including packets that
+	 * were SACKed before the reneg.
+	 */
+	if (!rs->prior_mstamp || is_sack_reneg) {
 		rs->delivered = -1;
 		rs->interval_us = -1;
 		return;
-- 
cgit v1.2.3


From f335195adf043168ee69d78ea72ac3e30f0c57ce Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 6 Dec 2017 11:27:57 +0100
Subject: kmemcheck: rip it out for real

Commit 4675ff05de2d ("kmemcheck: rip it out") has removed the code but
for some reason SPDX header stayed in place.  This looks like a rebase
mistake in the mmotm tree or the merge mistake.  Let's drop those
leftovers as well.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/kmemcheck.h | 1 -
 arch/x86/mm/kmemcheck/error.c    | 1 -
 arch/x86/mm/kmemcheck/error.h    | 1 -
 arch/x86/mm/kmemcheck/opcode.c   | 1 -
 arch/x86/mm/kmemcheck/opcode.h   | 1 -
 arch/x86/mm/kmemcheck/pte.c      | 1 -
 arch/x86/mm/kmemcheck/pte.h      | 1 -
 arch/x86/mm/kmemcheck/selftest.c | 1 -
 arch/x86/mm/kmemcheck/selftest.h | 1 -
 arch/x86/mm/kmemcheck/shadow.h   | 1 -
 include/linux/kmemcheck.h        | 1 -
 mm/kmemcheck.c                   | 1 -
 tools/include/linux/kmemcheck.h  | 1 -
 13 files changed, 13 deletions(-)
 delete mode 100644 arch/x86/include/asm/kmemcheck.h
 delete mode 100644 arch/x86/mm/kmemcheck/error.c
 delete mode 100644 arch/x86/mm/kmemcheck/error.h
 delete mode 100644 arch/x86/mm/kmemcheck/opcode.c
 delete mode 100644 arch/x86/mm/kmemcheck/opcode.h
 delete mode 100644 arch/x86/mm/kmemcheck/pte.c
 delete mode 100644 arch/x86/mm/kmemcheck/pte.h
 delete mode 100644 arch/x86/mm/kmemcheck/selftest.c
 delete mode 100644 arch/x86/mm/kmemcheck/selftest.h
 delete mode 100644 arch/x86/mm/kmemcheck/shadow.h
 delete mode 100644 include/linux/kmemcheck.h
 delete mode 100644 mm/kmemcheck.c
 delete mode 100644 tools/include/linux/kmemcheck.h

(limited to 'include')

diff --git a/arch/x86/include/asm/kmemcheck.h b/arch/x86/include/asm/kmemcheck.h
deleted file mode 100644
index ea32a7d3cf1b..000000000000
--- a/arch/x86/include/asm/kmemcheck.h
+++ /dev/null
@@ -1 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
deleted file mode 100644
index cec594032515..000000000000
--- a/arch/x86/mm/kmemcheck/error.c
+++ /dev/null
@@ -1 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h
deleted file mode 100644
index ea32a7d3cf1b..000000000000
--- a/arch/x86/mm/kmemcheck/error.h
+++ /dev/null
@@ -1 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c
deleted file mode 100644
index cec594032515..000000000000
--- a/arch/x86/mm/kmemcheck/opcode.c
+++ /dev/null
@@ -1 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h
deleted file mode 100644
index ea32a7d3cf1b..000000000000
--- a/arch/x86/mm/kmemcheck/opcode.h
+++ /dev/null
@@ -1 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c
deleted file mode 100644
index cec594032515..000000000000
--- a/arch/x86/mm/kmemcheck/pte.c
+++ /dev/null
@@ -1 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h
deleted file mode 100644
index ea32a7d3cf1b..000000000000
--- a/arch/x86/mm/kmemcheck/pte.h
+++ /dev/null
@@ -1 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c
deleted file mode 100644
index cec594032515..000000000000
--- a/arch/x86/mm/kmemcheck/selftest.c
+++ /dev/null
@@ -1 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h
deleted file mode 100644
index ea32a7d3cf1b..000000000000
--- a/arch/x86/mm/kmemcheck/selftest.h
+++ /dev/null
@@ -1 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h
deleted file mode 100644
index ea32a7d3cf1b..000000000000
--- a/arch/x86/mm/kmemcheck/shadow.h
+++ /dev/null
@@ -1 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
diff --git a/include/linux/kmemcheck.h b/include/linux/kmemcheck.h
deleted file mode 100644
index ea32a7d3cf1b..000000000000
--- a/include/linux/kmemcheck.h
+++ /dev/null
@@ -1 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
deleted file mode 100644
index cec594032515..000000000000
--- a/mm/kmemcheck.c
+++ /dev/null
@@ -1 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
diff --git a/tools/include/linux/kmemcheck.h b/tools/include/linux/kmemcheck.h
deleted file mode 100644
index ea32a7d3cf1b..000000000000
--- a/tools/include/linux/kmemcheck.h
+++ /dev/null
@@ -1 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-- 
cgit v1.2.3


From 3487972d7fa6c5143951436ada5933dcf0ec659d Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 7 Dec 2017 02:41:18 +0100
Subject: PM / sleep: Avoid excess pm_runtime_enable() calls in device_resume()

Middle-layer code doing suspend-time optimizations for devices with
the DPM_FLAG_SMART_SUSPEND flag set (currently, the PCI bus type and
the ACPI PM domain) needs to make the core skip ->thaw_early and
->thaw callbacks for those devices in some cases and it sets the
power.direct_complete flag for them for this purpose.

However, it turns out that setting power.direct_complete outside of
the PM core is a bad idea as it triggers an excess invocation of
pm_runtime_enable() in device_resume().

For this reason, provide a helper to clear power.is_late_suspended
and power.is_suspended to be invoked by the middle-layer code in
question instead of setting power.direct_complete and make that code
call the new helper.

Fixes: c4b65157aeef (PCI / PM: Take SMART_SUSPEND driver flag into account)
Fixes: 05087360fd7a (ACPI / PM: Take SMART_SUSPEND driver flag into account)
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/acpi/device_pm.c  |  2 +-
 drivers/base/power/main.c | 15 +++++++++++++++
 drivers/pci/pci-driver.c  |  2 +-
 include/linux/pm.h        |  1 +
 4 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c
index e4ffaeec9ec2..a4c8ad98560d 100644
--- a/drivers/acpi/device_pm.c
+++ b/drivers/acpi/device_pm.c
@@ -1138,7 +1138,7 @@ int acpi_subsys_thaw_noirq(struct device *dev)
 	 * skip all of the subsequent "thaw" callbacks for the device.
 	 */
 	if (dev_pm_smart_suspend_and_suspended(dev)) {
-		dev->power.direct_complete = true;
+		dev_pm_skip_next_resume_phases(dev);
 		return 0;
 	}
 
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index db2f04415927..08744b572af6 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -525,6 +525,21 @@ static void dpm_watchdog_clear(struct dpm_watchdog *wd)
 
 /*------------------------- Resume routines -------------------------*/
 
+/**
+ * dev_pm_skip_next_resume_phases - Skip next system resume phases for device.
+ * @dev: Target device.
+ *
+ * Make the core skip the "early resume" and "resume" phases for @dev.
+ *
+ * This function can be called by middle-layer code during the "noirq" phase of
+ * system resume if necessary, but not by device drivers.
+ */
+void dev_pm_skip_next_resume_phases(struct device *dev)
+{
+	dev->power.is_late_suspended = false;
+	dev->power.is_suspended = false;
+}
+
 /**
  * device_resume_noirq - Execute a "noirq resume" callback for given device.
  * @dev: Device to handle.
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 7f47bb72bf30..945099d49f8f 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -999,7 +999,7 @@ static int pci_pm_thaw_noirq(struct device *dev)
 	 * the subsequent "thaw" callbacks for the device.
 	 */
 	if (dev_pm_smart_suspend_and_suspended(dev)) {
-		dev->power.direct_complete = true;
+		dev_pm_skip_next_resume_phases(dev);
 		return 0;
 	}
 
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 65d39115f06d..492ed473ba7e 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -765,6 +765,7 @@ extern int pm_generic_poweroff_late(struct device *dev);
 extern int pm_generic_poweroff(struct device *dev);
 extern void pm_generic_complete(struct device *dev);
 
+extern void dev_pm_skip_next_resume_phases(struct device *dev);
 extern bool dev_pm_smart_suspend_and_suspended(struct device *dev);
 
 #else /* !CONFIG_PM_SLEEP */
-- 
cgit v1.2.3


From a8ceb5dbfde1092b466936bca0ff3be127ecf38e Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Tue, 5 Dec 2017 21:29:37 +0200
Subject: ptr_ring: add barriers

Users of ptr_ring expect that it's safe to give the
data structure a pointer and have it be available
to consumers, but that actually requires an smb_wmb
or a stronger barrier.

In absence of such barriers and on architectures that reorder writes,
consumer might read an un=initialized value from an skb pointer stored
in the skb array.  This was observed causing crashes.

To fix, add memory barriers.  The barrier we use is a wmb, the
assumption being that producers do not need to read the value so we do
not need to order these reads.

Reported-by: George Cherian <george.cherian@cavium.com>
Suggested-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ptr_ring.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index 37b4bb2545b3..6866df4f31b5 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -101,12 +101,18 @@ static inline bool ptr_ring_full_bh(struct ptr_ring *r)
 
 /* Note: callers invoking this in a loop must use a compiler barrier,
  * for example cpu_relax(). Callers must hold producer_lock.
+ * Callers are responsible for making sure pointer that is being queued
+ * points to a valid data.
  */
 static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr)
 {
 	if (unlikely(!r->size) || r->queue[r->producer])
 		return -ENOSPC;
 
+	/* Make sure the pointer we are storing points to a valid data. */
+	/* Pairs with smp_read_barrier_depends in __ptr_ring_consume. */
+	smp_wmb();
+
 	r->queue[r->producer++] = ptr;
 	if (unlikely(r->producer >= r->size))
 		r->producer = 0;
@@ -275,6 +281,9 @@ static inline void *__ptr_ring_consume(struct ptr_ring *r)
 	if (ptr)
 		__ptr_ring_discard_one(r);
 
+	/* Make sure anyone accessing data through the pointer is up to date. */
+	/* Pairs with smp_wmb in __ptr_ring_produce. */
+	smp_read_barrier_depends();
 	return ptr;
 }
 
-- 
cgit v1.2.3


From 200809716aed1cac586fcac4c0551a688439be1f Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sun, 10 Dec 2017 16:56:00 +0800
Subject: fou: fix some member types in guehdr

guehdr struct is used to build or parse gue packets, which
are always in big endian. It's better to define all guehdr
members as __beXX types.

Also, in validate_gue_flags it's not good to use a __be32
variable for both Standard flags(__be16) and Private flags
(__be32), and pass it to other funcions.

This patch could fix a bunch of sparse warnings from fou.

Fixes: 5024c33ac354 ("gue: Add infrastructure for flags and options")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/gue.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/gue.h b/include/net/gue.h
index 2fdb29ca74c2..fdad41469b65 100644
--- a/include/net/gue.h
+++ b/include/net/gue.h
@@ -44,10 +44,10 @@ struct guehdr {
 #else
 #error  "Please fix <asm/byteorder.h>"
 #endif
-			__u8    proto_ctype;
-			__u16   flags;
+			__u8	proto_ctype;
+			__be16	flags;
 		};
-		__u32 word;
+		__be32	word;
 	};
 };
 
@@ -84,11 +84,10 @@ static inline size_t guehdr_priv_flags_len(__be32 flags)
  * if there is an unknown standard or private flags, or the options length for
  * the flags exceeds the options length specific in hlen of the GUE header.
  */
-static inline int validate_gue_flags(struct guehdr *guehdr,
-				     size_t optlen)
+static inline int validate_gue_flags(struct guehdr *guehdr, size_t optlen)
 {
+	__be16 flags = guehdr->flags;
 	size_t len;
-	__be32 flags = guehdr->flags;
 
 	if (flags & ~GUE_FLAGS_ALL)
 		return 1;
@@ -101,12 +100,13 @@ static inline int validate_gue_flags(struct guehdr *guehdr,
 		/* Private flags are last four bytes accounted in
 		 * guehdr_flags_len
 		 */
-		flags = *(__be32 *)((void *)&guehdr[1] + len - GUE_LEN_PRIV);
+		__be32 pflags = *(__be32 *)((void *)&guehdr[1] +
+					    len - GUE_LEN_PRIV);
 
-		if (flags & ~GUE_PFLAGS_ALL)
+		if (pflags & ~GUE_PFLAGS_ALL)
 			return 1;
 
-		len += guehdr_priv_flags_len(flags);
+		len += guehdr_priv_flags_len(pflags);
 		if (len > optlen)
 			return 1;
 	}
-- 
cgit v1.2.3


From d89c70356acf11b7cf47ca5cfcafae5062a85451 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Tue, 28 Nov 2017 18:42:19 +0000
Subject: locking/core: Remove break_lock field when CONFIG_GENERIC_LOCKBREAK=y

When CONFIG_GENERIC_LOCKBEAK=y, locking structures grow an extra int ->break_lock
field which is used to implement raw_spin_is_contended() by setting the field
to 1 when waiting on a lock and clearing it to zero when holding a lock.
However, there are a few problems with this approach:

  - There is a write-write race between a CPU successfully taking the lock
    (and subsequently writing break_lock = 0) and a waiter waiting on
    the lock (and subsequently writing break_lock = 1). This could result
    in a contended lock being reported as uncontended and vice-versa.

  - On machines with store buffers, nothing guarantees that the writes
    to break_lock are visible to other CPUs at any particular time.

  - READ_ONCE/WRITE_ONCE are not used, so the field is potentially
    susceptible to harmful compiler optimisations,

Consequently, the usefulness of this field is unclear and we'd be better off
removing it and allowing architectures to implement raw_spin_is_contended() by
providing a definition of arch_spin_is_contended(), as they can when
CONFIG_GENERIC_LOCKBREAK=n.

Signed-off-by: Will Deacon <will.deacon@arm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Sebastian Ott <sebott@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1511894539-7988-3-git-send-email-will.deacon@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/rwlock_types.h   | 3 ---
 include/linux/spinlock.h       | 5 -----
 include/linux/spinlock_types.h | 3 ---
 kernel/locking/spinlock.c      | 9 +--------
 4 files changed, 1 insertion(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h
index cc0072e93e36..857a72ceb794 100644
--- a/include/linux/rwlock_types.h
+++ b/include/linux/rwlock_types.h
@@ -10,9 +10,6 @@
  */
 typedef struct {
 	arch_rwlock_t raw_lock;
-#ifdef CONFIG_GENERIC_LOCKBREAK
-	unsigned int break_lock;
-#endif
 #ifdef CONFIG_DEBUG_SPINLOCK
 	unsigned int magic, owner_cpu;
 	void *owner;
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index a39186194cd6..3bf273538840 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -107,16 +107,11 @@ do {								\
 
 #define raw_spin_is_locked(lock)	arch_spin_is_locked(&(lock)->raw_lock)
 
-#ifdef CONFIG_GENERIC_LOCKBREAK
-#define raw_spin_is_contended(lock) ((lock)->break_lock)
-#else
-
 #ifdef arch_spin_is_contended
 #define raw_spin_is_contended(lock)	arch_spin_is_contended(&(lock)->raw_lock)
 #else
 #define raw_spin_is_contended(lock)	(((void)(lock), 0))
 #endif /*arch_spin_is_contended*/
-#endif
 
 /*
  * This barrier must provide two things:
diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h
index 73548eb13a5d..24b4e6f2c1a2 100644
--- a/include/linux/spinlock_types.h
+++ b/include/linux/spinlock_types.h
@@ -19,9 +19,6 @@
 
 typedef struct raw_spinlock {
 	arch_spinlock_t raw_lock;
-#ifdef CONFIG_GENERIC_LOCKBREAK
-	unsigned int break_lock;
-#endif
 #ifdef CONFIG_DEBUG_SPINLOCK
 	unsigned int magic, owner_cpu;
 	void *owner;
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 0ebb253e2199..936f3d14dd6b 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -66,12 +66,8 @@ void __lockfunc __raw_##op##_lock(locktype##_t *lock)			\
 			break;						\
 		preempt_enable();					\
 									\
-		if (!(lock)->break_lock)				\
-			(lock)->break_lock = 1;				\
-									\
 		arch_##op##_relax(&lock->raw_lock);			\
 	}								\
-	(lock)->break_lock = 0;						\
 }									\
 									\
 unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock)	\
@@ -86,12 +82,9 @@ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock)	\
 		local_irq_restore(flags);				\
 		preempt_enable();					\
 									\
-		if (!(lock)->break_lock)				\
-			(lock)->break_lock = 1;				\
-									\
 		arch_##op##_relax(&lock->raw_lock);			\
 	}								\
-	(lock)->break_lock = 0;						\
+									\
 	return flags;							\
 }									\
 									\
-- 
cgit v1.2.3


From e966eaeeb623f09975ef362c2866fae6f86844f9 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Tue, 12 Dec 2017 12:31:16 +0100
Subject: locking/lockdep: Remove the cross-release locking checks

This code (CONFIG_LOCKDEP_CROSSRELEASE=y and CONFIG_LOCKDEP_COMPLETIONS=y),
while it found a number of old bugs initially, was also causing too many
false positives that caused people to disable lockdep - which is arguably
a worse overall outcome.

If we disable cross-release by default but keep the code upstream then
in practice the most likely outcome is that we'll allow the situation
to degrade gradually, by allowing entropy to introduce more and more
false positives, until it overwhelms maintenance capacity.

Another bad side effect was that people were trying to work around
the false positives by uglifying/complicating unrelated code. There's
a marked difference between annotating locking operations and
uglifying good code just due to bad lock debugging code ...

This gradual decrease in quality happened to a number of debugging
facilities in the kernel, and lockdep is pretty complex already,
so we cannot risk this outcome.

Either cross-release checking can be done right with no false positives,
or it should not be included in the upstream kernel.

( Note that it might make sense to maintain it out of tree and go through
  the false positives every now and then and see whether new bugs were
  introduced. )

Cc: Byungchul Park <byungchul.park@lge.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/locking/crossrelease.txt | 874 ---------------------------------
 include/linux/completion.h             |  45 --
 include/linux/lockdep.h                | 125 -----
 include/linux/sched.h                  |  11 -
 kernel/locking/lockdep.c               | 652 ++----------------------
 lib/Kconfig.debug                      |  33 --
 6 files changed, 35 insertions(+), 1705 deletions(-)
 delete mode 100644 Documentation/locking/crossrelease.txt

(limited to 'include')

diff --git a/Documentation/locking/crossrelease.txt b/Documentation/locking/crossrelease.txt
deleted file mode 100644
index bdf1423d5f99..000000000000
--- a/Documentation/locking/crossrelease.txt
+++ /dev/null
@@ -1,874 +0,0 @@
-Crossrelease
-============
-
-Started by Byungchul Park <byungchul.park@lge.com>
-
-Contents:
-
- (*) Background
-
-     - What causes deadlock
-     - How lockdep works
-
- (*) Limitation
-
-     - Limit lockdep
-     - Pros from the limitation
-     - Cons from the limitation
-     - Relax the limitation
-
- (*) Crossrelease
-
-     - Introduce crossrelease
-     - Introduce commit
-
- (*) Implementation
-
-     - Data structures
-     - How crossrelease works
-
- (*) Optimizations
-
-     - Avoid duplication
-     - Lockless for hot paths
-
- (*) APPENDIX A: What lockdep does to work aggresively
-
- (*) APPENDIX B: How to avoid adding false dependencies
-
-
-==========
-Background
-==========
-
-What causes deadlock
---------------------
-
-A deadlock occurs when a context is waiting for an event to happen,
-which is impossible because another (or the) context who can trigger the
-event is also waiting for another (or the) event to happen, which is
-also impossible due to the same reason.
-
-For example:
-
-   A context going to trigger event C is waiting for event A to happen.
-   A context going to trigger event A is waiting for event B to happen.
-   A context going to trigger event B is waiting for event C to happen.
-
-A deadlock occurs when these three wait operations run at the same time,
-because event C cannot be triggered if event A does not happen, which in
-turn cannot be triggered if event B does not happen, which in turn
-cannot be triggered if event C does not happen. After all, no event can
-be triggered since any of them never meets its condition to wake up.
-
-A dependency might exist between two waiters and a deadlock might happen
-due to an incorrect releationship between dependencies. Thus, we must
-define what a dependency is first. A dependency exists between them if:
-
-   1. There are two waiters waiting for each event at a given time.
-   2. The only way to wake up each waiter is to trigger its event.
-   3. Whether one can be woken up depends on whether the other can.
-
-Each wait in the example creates its dependency like:
-
-   Event C depends on event A.
-   Event A depends on event B.
-   Event B depends on event C.
-
-   NOTE: Precisely speaking, a dependency is one between whether a
-   waiter for an event can be woken up and whether another waiter for
-   another event can be woken up. However from now on, we will describe
-   a dependency as if it's one between an event and another event for
-   simplicity.
-
-And they form circular dependencies like:
-
-    -> C -> A -> B -
-   /                \
-   \                /
-    ----------------
-
-   where 'A -> B' means that event A depends on event B.
-
-Such circular dependencies lead to a deadlock since no waiter can meet
-its condition to wake up as described.
-
-CONCLUSION
-
-Circular dependencies cause a deadlock.
-
-
-How lockdep works
------------------
-
-Lockdep tries to detect a deadlock by checking dependencies created by
-lock operations, acquire and release. Waiting for a lock corresponds to
-waiting for an event, and releasing a lock corresponds to triggering an
-event in the previous section.
-
-In short, lockdep does:
-
-   1. Detect a new dependency.
-   2. Add the dependency into a global graph.
-   3. Check if that makes dependencies circular.
-   4. Report a deadlock or its possibility if so.
-
-For example, consider a graph built by lockdep that looks like:
-
-   A -> B -
-           \
-            -> E
-           /
-   C -> D -
-
-   where A, B,..., E are different lock classes.
-
-Lockdep will add a dependency into the graph on detection of a new
-dependency. For example, it will add a dependency 'E -> C' when a new
-dependency between lock E and lock C is detected. Then the graph will be:
-
-       A -> B -
-               \
-                -> E -
-               /      \
-    -> C -> D -        \
-   /                   /
-   \                  /
-    ------------------
-
-   where A, B,..., E are different lock classes.
-
-This graph contains a subgraph which demonstrates circular dependencies:
-
-                -> E -
-               /      \
-    -> C -> D -        \
-   /                   /
-   \                  /
-    ------------------
-
-   where C, D and E are different lock classes.
-
-This is the condition under which a deadlock might occur. Lockdep
-reports it on detection after adding a new dependency. This is the way
-how lockdep works.
-
-CONCLUSION
-
-Lockdep detects a deadlock or its possibility by checking if circular
-dependencies were created after adding each new dependency.
-
-
-==========
-Limitation
-==========
-
-Limit lockdep
--------------
-
-Limiting lockdep to work on only typical locks e.g. spin locks and
-mutexes, which are released within the acquire context, the
-implementation becomes simple but its capacity for detection becomes
-limited. Let's check pros and cons in next section.
-
-
-Pros from the limitation
-------------------------
-
-Given the limitation, when acquiring a lock, locks in a held_locks
-cannot be released if the context cannot acquire it so has to wait to
-acquire it, which means all waiters for the locks in the held_locks are
-stuck. It's an exact case to create dependencies between each lock in
-the held_locks and the lock to acquire.
-
-For example:
-
-   CONTEXT X
-   ---------
-   acquire A
-   acquire B /* Add a dependency 'A -> B' */
-   release B
-   release A
-
-   where A and B are different lock classes.
-
-When acquiring lock A, the held_locks of CONTEXT X is empty thus no
-dependency is added. But when acquiring lock B, lockdep detects and adds
-a new dependency 'A -> B' between lock A in the held_locks and lock B.
-They can be simply added whenever acquiring each lock.
-
-And data required by lockdep exists in a local structure, held_locks
-embedded in task_struct. Forcing to access the data within the context,
-lockdep can avoid racy problems without explicit locks while handling
-the local data.
-
-Lastly, lockdep only needs to keep locks currently being held, to build
-a dependency graph. However, relaxing the limitation, it needs to keep
-even locks already released, because a decision whether they created
-dependencies might be long-deferred.
-
-To sum up, we can expect several advantages from the limitation:
-
-   1. Lockdep can easily identify a dependency when acquiring a lock.
-   2. Races are avoidable while accessing local locks in a held_locks.
-   3. Lockdep only needs to keep locks currently being held.
-
-CONCLUSION
-
-Given the limitation, the implementation becomes simple and efficient.
-
-
-Cons from the limitation
-------------------------
-
-Given the limitation, lockdep is applicable only to typical locks. For
-example, page locks for page access or completions for synchronization
-cannot work with lockdep.
-
-Can we detect deadlocks below, under the limitation?
-
-Example 1:
-
-   CONTEXT X	   CONTEXT Y	   CONTEXT Z
-   ---------	   ---------	   ----------
-		   mutex_lock A
-   lock_page B
-		   lock_page B
-				   mutex_lock A /* DEADLOCK */
-				   unlock_page B held by X
-		   unlock_page B
-		   mutex_unlock A
-				   mutex_unlock A
-
-   where A and B are different lock classes.
-
-No, we cannot.
-
-Example 2:
-
-   CONTEXT X		   CONTEXT Y
-   ---------		   ---------
-			   mutex_lock A
-   mutex_lock A
-			   wait_for_complete B /* DEADLOCK */
-   complete B
-			   mutex_unlock A
-   mutex_unlock A
-
-   where A is a lock class and B is a completion variable.
-
-No, we cannot.
-
-CONCLUSION
-
-Given the limitation, lockdep cannot detect a deadlock or its
-possibility caused by page locks or completions.
-
-
-Relax the limitation
---------------------
-
-Under the limitation, things to create dependencies are limited to
-typical locks. However, synchronization primitives like page locks and
-completions, which are allowed to be released in any context, also
-create dependencies and can cause a deadlock. So lockdep should track
-these locks to do a better job. We have to relax the limitation for
-these locks to work with lockdep.
-
-Detecting dependencies is very important for lockdep to work because
-adding a dependency means adding an opportunity to check whether it
-causes a deadlock. The more lockdep adds dependencies, the more it
-thoroughly works. Thus Lockdep has to do its best to detect and add as
-many true dependencies into a graph as possible.
-
-For example, considering only typical locks, lockdep builds a graph like:
-
-   A -> B -
-           \
-            -> E
-           /
-   C -> D -
-
-   where A, B,..., E are different lock classes.
-
-On the other hand, under the relaxation, additional dependencies might
-be created and added. Assuming additional 'FX -> C' and 'E -> GX' are
-added thanks to the relaxation, the graph will be:
-
-         A -> B -
-                 \
-                  -> E -> GX
-                 /
-   FX -> C -> D -
-
-   where A, B,..., E, FX and GX are different lock classes, and a suffix
-   'X' is added on non-typical locks.
-
-The latter graph gives us more chances to check circular dependencies
-than the former. However, it might suffer performance degradation since
-relaxing the limitation, with which design and implementation of lockdep
-can be efficient, might introduce inefficiency inevitably. So lockdep
-should provide two options, strong detection and efficient detection.
-
-Choosing efficient detection:
-
-   Lockdep works with only locks restricted to be released within the
-   acquire context. However, lockdep works efficiently.
-
-Choosing strong detection:
-
-   Lockdep works with all synchronization primitives. However, lockdep
-   suffers performance degradation.
-
-CONCLUSION
-
-Relaxing the limitation, lockdep can add additional dependencies giving
-additional opportunities to check circular dependencies.
-
-
-============
-Crossrelease
-============
-
-Introduce crossrelease
-----------------------
-
-In order to allow lockdep to handle additional dependencies by what
-might be released in any context, namely 'crosslock', we have to be able
-to identify those created by crosslocks. The proposed 'crossrelease'
-feature provoides a way to do that.
-
-Crossrelease feature has to do:
-
-   1. Identify dependencies created by crosslocks.
-   2. Add the dependencies into a dependency graph.
-
-That's all. Once a meaningful dependency is added into graph, then
-lockdep would work with the graph as it did. The most important thing
-crossrelease feature has to do is to correctly identify and add true
-dependencies into the global graph.
-
-A dependency e.g. 'A -> B' can be identified only in the A's release
-context because a decision required to identify the dependency can be
-made only in the release context. That is to decide whether A can be
-released so that a waiter for A can be woken up. It cannot be made in
-other than the A's release context.
-
-It's no matter for typical locks because each acquire context is same as
-its release context, thus lockdep can decide whether a lock can be
-released in the acquire context. However for crosslocks, lockdep cannot
-make the decision in the acquire context but has to wait until the
-release context is identified.
-
-Therefore, deadlocks by crosslocks cannot be detected just when it
-happens, because those cannot be identified until the crosslocks are
-released. However, deadlock possibilities can be detected and it's very
-worth. See 'APPENDIX A' section to check why.
-
-CONCLUSION
-
-Using crossrelease feature, lockdep can work with what might be released
-in any context, namely crosslock.
-
-
-Introduce commit
-----------------
-
-Since crossrelease defers the work adding true dependencies of
-crosslocks until they are actually released, crossrelease has to queue
-all acquisitions which might create dependencies with the crosslocks.
-Then it identifies dependencies using the queued data in batches at a
-proper time. We call it 'commit'.
-
-There are four types of dependencies:
-
-1. TT type: 'typical lock A -> typical lock B'
-
-   Just when acquiring B, lockdep can see it's in the A's release
-   context. So the dependency between A and B can be identified
-   immediately. Commit is unnecessary.
-
-2. TC type: 'typical lock A -> crosslock BX'
-
-   Just when acquiring BX, lockdep can see it's in the A's release
-   context. So the dependency between A and BX can be identified
-   immediately. Commit is unnecessary, too.
-
-3. CT type: 'crosslock AX -> typical lock B'
-
-   When acquiring B, lockdep cannot identify the dependency because
-   there's no way to know if it's in the AX's release context. It has
-   to wait until the decision can be made. Commit is necessary.
-
-4. CC type: 'crosslock AX -> crosslock BX'
-
-   When acquiring BX, lockdep cannot identify the dependency because
-   there's no way to know if it's in the AX's release context. It has
-   to wait until the decision can be made. Commit is necessary.
-   But, handling CC type is not implemented yet. It's a future work.
-
-Lockdep can work without commit for typical locks, but commit step is
-necessary once crosslocks are involved. Introducing commit, lockdep
-performs three steps. What lockdep does in each step is:
-
-1. Acquisition: For typical locks, lockdep does what it originally did
-   and queues the lock so that CT type dependencies can be checked using
-   it at the commit step. For crosslocks, it saves data which will be
-   used at the commit step and increases a reference count for it.
-
-2. Commit: No action is reauired for typical locks. For crosslocks,
-   lockdep adds CT type dependencies using the data saved at the
-   acquisition step.
-
-3. Release: No changes are required for typical locks. When a crosslock
-   is released, it decreases a reference count for it.
-
-CONCLUSION
-
-Crossrelease introduces commit step to handle dependencies of crosslocks
-in batches at a proper time.
-
-
-==============
-Implementation
-==============
-
-Data structures
----------------
-
-Crossrelease introduces two main data structures.
-
-1. hist_lock
-
-   This is an array embedded in task_struct, for keeping lock history so
-   that dependencies can be added using them at the commit step. Since
-   it's local data, it can be accessed locklessly in the owner context.
-   The array is filled at the acquisition step and consumed at the
-   commit step. And it's managed in circular manner.
-
-2. cross_lock
-
-   One per lockdep_map exists. This is for keeping data of crosslocks
-   and used at the commit step.
-
-
-How crossrelease works
-----------------------
-
-It's the key of how crossrelease works, to defer necessary works to an
-appropriate point in time and perform in at once at the commit step.
-Let's take a look with examples step by step, starting from how lockdep
-works without crossrelease for typical locks.
-
-   acquire A /* Push A onto held_locks */
-   acquire B /* Push B onto held_locks and add 'A -> B' */
-   acquire C /* Push C onto held_locks and add 'B -> C' */
-   release C /* Pop C from held_locks */
-   release B /* Pop B from held_locks */
-   release A /* Pop A from held_locks */
-
-   where A, B and C are different lock classes.
-
-   NOTE: This document assumes that readers already understand how
-   lockdep works without crossrelease thus omits details. But there's
-   one thing to note. Lockdep pretends to pop a lock from held_locks
-   when releasing it. But it's subtly different from the original pop
-   operation because lockdep allows other than the top to be poped.
-
-In this case, lockdep adds 'the top of held_locks -> the lock to acquire'
-dependency every time acquiring a lock.
-
-After adding 'A -> B', a dependency graph will be:
-
-   A -> B
-
-   where A and B are different lock classes.
-
-And after adding 'B -> C', the graph will be:
-
-   A -> B -> C
-
-   where A, B and C are different lock classes.
-
-Let's performs commit step even for typical locks to add dependencies.
-Of course, commit step is not necessary for them, however, it would work
-well because this is a more general way.
-
-   acquire A
-   /*
-    * Queue A into hist_locks
-    *
-    * In hist_locks: A
-    * In graph: Empty
-    */
-
-   acquire B
-   /*
-    * Queue B into hist_locks
-    *
-    * In hist_locks: A, B
-    * In graph: Empty
-    */
-
-   acquire C
-   /*
-    * Queue C into hist_locks
-    *
-    * In hist_locks: A, B, C
-    * In graph: Empty
-    */
-
-   commit C
-   /*
-    * Add 'C -> ?'
-    * Answer the following to decide '?'
-    * What has been queued since acquire C: Nothing
-    *
-    * In hist_locks: A, B, C
-    * In graph: Empty
-    */
-
-   release C
-
-   commit B
-   /*
-    * Add 'B -> ?'
-    * Answer the following to decide '?'
-    * What has been queued since acquire B: C
-    *
-    * In hist_locks: A, B, C
-    * In graph: 'B -> C'
-    */
-
-   release B
-
-   commit A
-   /*
-    * Add 'A -> ?'
-    * Answer the following to decide '?'
-    * What has been queued since acquire A: B, C
-    *
-    * In hist_locks: A, B, C
-    * In graph: 'B -> C', 'A -> B', 'A -> C'
-    */
-
-   release A
-
-   where A, B and C are different lock classes.
-
-In this case, dependencies are added at the commit step as described.
-
-After commits for A, B and C, the graph will be:
-
-   A -> B -> C
-
-   where A, B and C are different lock classes.
-
-   NOTE: A dependency 'A -> C' is optimized out.
-
-We can see the former graph built without commit step is same as the
-latter graph built using commit steps. Of course the former way leads to
-earlier finish for building the graph, which means we can detect a
-deadlock or its possibility sooner. So the former way would be prefered
-when possible. But we cannot avoid using the latter way for crosslocks.
-
-Let's look at how commit steps work for crosslocks. In this case, the
-commit step is performed only on crosslock AX as real. And it assumes
-that the AX release context is different from the AX acquire context.
-
-   BX RELEASE CONTEXT		   BX ACQUIRE CONTEXT
-   ------------------		   ------------------
-				   acquire A
-				   /*
-				    * Push A onto held_locks
-				    * Queue A into hist_locks
-				    *
-				    * In held_locks: A
-				    * In hist_locks: A
-				    * In graph: Empty
-				    */
-
-				   acquire BX
-				   /*
-				    * Add 'the top of held_locks -> BX'
-				    *
-				    * In held_locks: A
-				    * In hist_locks: A
-				    * In graph: 'A -> BX'
-				    */
-
-   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-   It must be guaranteed that the following operations are seen after
-   acquiring BX globally. It can be done by things like barrier.
-   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-   acquire C
-   /*
-    * Push C onto held_locks
-    * Queue C into hist_locks
-    *
-    * In held_locks: C
-    * In hist_locks: C
-    * In graph: 'A -> BX'
-    */
-
-   release C
-   /*
-    * Pop C from held_locks
-    *
-    * In held_locks: Empty
-    * In hist_locks: C
-    * In graph: 'A -> BX'
-    */
-				   acquire D
-				   /*
-				    * Push D onto held_locks
-				    * Queue D into hist_locks
-				    * Add 'the top of held_locks -> D'
-				    *
-				    * In held_locks: A, D
-				    * In hist_locks: A, D
-				    * In graph: 'A -> BX', 'A -> D'
-				    */
-   acquire E
-   /*
-    * Push E onto held_locks
-    * Queue E into hist_locks
-    *
-    * In held_locks: E
-    * In hist_locks: C, E
-    * In graph: 'A -> BX', 'A -> D'
-    */
-
-   release E
-   /*
-    * Pop E from held_locks
-    *
-    * In held_locks: Empty
-    * In hist_locks: D, E
-    * In graph: 'A -> BX', 'A -> D'
-    */
-				   release D
-				   /*
-				    * Pop D from held_locks
-				    *
-				    * In held_locks: A
-				    * In hist_locks: A, D
-				    * In graph: 'A -> BX', 'A -> D'
-				    */
-   commit BX
-   /*
-    * Add 'BX -> ?'
-    * What has been queued since acquire BX: C, E
-    *
-    * In held_locks: Empty
-    * In hist_locks: D, E
-    * In graph: 'A -> BX', 'A -> D',
-    *           'BX -> C', 'BX -> E'
-    */
-
-   release BX
-   /*
-    * In held_locks: Empty
-    * In hist_locks: D, E
-    * In graph: 'A -> BX', 'A -> D',
-    *           'BX -> C', 'BX -> E'
-    */
-				   release A
-				   /*
-				    * Pop A from held_locks
-				    *
-				    * In held_locks: Empty
-				    * In hist_locks: A, D
-				    * In graph: 'A -> BX', 'A -> D',
-				    *           'BX -> C', 'BX -> E'
-				    */
-
-   where A, BX, C,..., E are different lock classes, and a suffix 'X' is
-   added on crosslocks.
-
-Crossrelease considers all acquisitions after acqiuring BX are
-candidates which might create dependencies with BX. True dependencies
-will be determined when identifying the release context of BX. Meanwhile,
-all typical locks are queued so that they can be used at the commit step.
-And then two dependencies 'BX -> C' and 'BX -> E' are added at the
-commit step when identifying the release context.
-
-The final graph will be, with crossrelease:
-
-               -> C
-              /
-       -> BX -
-      /       \
-   A -         -> E
-      \
-       -> D
-
-   where A, BX, C,..., E are different lock classes, and a suffix 'X' is
-   added on crosslocks.
-
-However, the final graph will be, without crossrelease:
-
-   A -> D
-
-   where A and D are different lock classes.
-
-The former graph has three more dependencies, 'A -> BX', 'BX -> C' and
-'BX -> E' giving additional opportunities to check if they cause
-deadlocks. This way lockdep can detect a deadlock or its possibility
-caused by crosslocks.
-
-CONCLUSION
-
-We checked how crossrelease works with several examples.
-
-
-=============
-Optimizations
-=============
-
-Avoid duplication
------------------
-
-Crossrelease feature uses a cache like what lockdep already uses for
-dependency chains, but this time it's for caching CT type dependencies.
-Once that dependency is cached, the same will never be added again.
-
-
-Lockless for hot paths
-----------------------
-
-To keep all locks for later use at the commit step, crossrelease adopts
-a local array embedded in task_struct, which makes access to the data
-lockless by forcing it to happen only within the owner context. It's
-like how lockdep handles held_locks. Lockless implmentation is important
-since typical locks are very frequently acquired and released.
-
-
-=================================================
-APPENDIX A: What lockdep does to work aggresively
-=================================================
-
-A deadlock actually occurs when all wait operations creating circular
-dependencies run at the same time. Even though they don't, a potential
-deadlock exists if the problematic dependencies exist. Thus it's
-meaningful to detect not only an actual deadlock but also its potential
-possibility. The latter is rather valuable. When a deadlock occurs
-actually, we can identify what happens in the system by some means or
-other even without lockdep. However, there's no way to detect possiblity
-without lockdep unless the whole code is parsed in head. It's terrible.
-Lockdep does the both, and crossrelease only focuses on the latter.
-
-Whether or not a deadlock actually occurs depends on several factors.
-For example, what order contexts are switched in is a factor. Assuming
-circular dependencies exist, a deadlock would occur when contexts are
-switched so that all wait operations creating the dependencies run
-simultaneously. Thus to detect a deadlock possibility even in the case
-that it has not occured yet, lockdep should consider all possible
-combinations of dependencies, trying to:
-
-1. Use a global dependency graph.
-
-   Lockdep combines all dependencies into one global graph and uses them,
-   regardless of which context generates them or what order contexts are
-   switched in. Aggregated dependencies are only considered so they are
-   prone to be circular if a problem exists.
-
-2. Check dependencies between classes instead of instances.
-
-   What actually causes a deadlock are instances of lock. However,
-   lockdep checks dependencies between classes instead of instances.
-   This way lockdep can detect a deadlock which has not happened but
-   might happen in future by others but the same class.
-
-3. Assume all acquisitions lead to waiting.
-
-   Although locks might be acquired without waiting which is essential
-   to create dependencies, lockdep assumes all acquisitions lead to
-   waiting since it might be true some time or another.
-
-CONCLUSION
-
-Lockdep detects not only an actual deadlock but also its possibility,
-and the latter is more valuable.
-
-
-==================================================
-APPENDIX B: How to avoid adding false dependencies
-==================================================
-
-Remind what a dependency is. A dependency exists if:
-
-   1. There are two waiters waiting for each event at a given time.
-   2. The only way to wake up each waiter is to trigger its event.
-   3. Whether one can be woken up depends on whether the other can.
-
-For example:
-
-   acquire A
-   acquire B /* A dependency 'A -> B' exists */
-   release B
-   release A
-
-   where A and B are different lock classes.
-
-A depedency 'A -> B' exists since:
-
-   1. A waiter for A and a waiter for B might exist when acquiring B.
-   2. Only way to wake up each is to release what it waits for.
-   3. Whether the waiter for A can be woken up depends on whether the
-      other can. IOW, TASK X cannot release A if it fails to acquire B.
-
-For another example:
-
-   TASK X			   TASK Y
-   ------			   ------
-				   acquire AX
-   acquire B /* A dependency 'AX -> B' exists */
-   release B
-   release AX held by Y
-
-   where AX and B are different lock classes, and a suffix 'X' is added
-   on crosslocks.
-
-Even in this case involving crosslocks, the same rule can be applied. A
-depedency 'AX -> B' exists since:
-
-   1. A waiter for AX and a waiter for B might exist when acquiring B.
-   2. Only way to wake up each is to release what it waits for.
-   3. Whether the waiter for AX can be woken up depends on whether the
-      other can. IOW, TASK X cannot release AX if it fails to acquire B.
-
-Let's take a look at more complicated example:
-
-   TASK X			   TASK Y
-   ------			   ------
-   acquire B
-   release B
-   fork Y
-				   acquire AX
-   acquire C /* A dependency 'AX -> C' exists */
-   release C
-   release AX held by Y
-
-   where AX, B and C are different lock classes, and a suffix 'X' is
-   added on crosslocks.
-
-Does a dependency 'AX -> B' exist? Nope.
-
-Two waiters are essential to create a dependency. However, waiters for
-AX and B to create 'AX -> B' cannot exist at the same time in this
-example. Thus the dependency 'AX -> B' cannot be created.
-
-It would be ideal if the full set of true ones can be considered. But
-we can ensure nothing but what actually happened. Relying on what
-actually happens at runtime, we can anyway add only true ones, though
-they might be a subset of true ones. It's similar to how lockdep works
-for typical locks. There might be more true dependencies than what
-lockdep has detected in runtime. Lockdep has no choice but to rely on
-what actually happens. Crossrelease also relies on it.
-
-CONCLUSION
-
-Relying on what actually happens, lockdep can avoid adding false
-dependencies.
diff --git a/include/linux/completion.h b/include/linux/completion.h
index 0662a417febe..94a59ba7d422 100644
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -10,9 +10,6 @@
  */
 
 #include <linux/wait.h>
-#ifdef CONFIG_LOCKDEP_COMPLETIONS
-#include <linux/lockdep.h>
-#endif
 
 /*
  * struct completion - structure used to maintain state for a "completion"
@@ -29,58 +26,16 @@
 struct completion {
 	unsigned int done;
 	wait_queue_head_t wait;
-#ifdef CONFIG_LOCKDEP_COMPLETIONS
-	struct lockdep_map_cross map;
-#endif
 };
 
-#ifdef CONFIG_LOCKDEP_COMPLETIONS
-static inline void complete_acquire(struct completion *x)
-{
-	lock_acquire_exclusive((struct lockdep_map *)&x->map, 0, 0, NULL, _RET_IP_);
-}
-
-static inline void complete_release(struct completion *x)
-{
-	lock_release((struct lockdep_map *)&x->map, 0, _RET_IP_);
-}
-
-static inline void complete_release_commit(struct completion *x)
-{
-	lock_commit_crosslock((struct lockdep_map *)&x->map);
-}
-
-#define init_completion_map(x, m)					\
-do {									\
-	lockdep_init_map_crosslock((struct lockdep_map *)&(x)->map,	\
-			(m)->name, (m)->key, 0);				\
-	__init_completion(x);						\
-} while (0)
-
-#define init_completion(x)						\
-do {									\
-	static struct lock_class_key __key;				\
-	lockdep_init_map_crosslock((struct lockdep_map *)&(x)->map,	\
-			"(completion)" #x,				\
-			&__key, 0);					\
-	__init_completion(x);						\
-} while (0)
-#else
 #define init_completion_map(x, m) __init_completion(x)
 #define init_completion(x) __init_completion(x)
 static inline void complete_acquire(struct completion *x) {}
 static inline void complete_release(struct completion *x) {}
 static inline void complete_release_commit(struct completion *x) {}
-#endif
 
-#ifdef CONFIG_LOCKDEP_COMPLETIONS
-#define COMPLETION_INITIALIZER(work) \
-	{ 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait), \
-	STATIC_CROSS_LOCKDEP_MAP_INIT("(completion)" #work, &(work)) }
-#else
 #define COMPLETION_INITIALIZER(work) \
 	{ 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
-#endif
 
 #define COMPLETION_INITIALIZER_ONSTACK_MAP(work, map) \
 	(*({ init_completion_map(&(work), &(map)); &(work); }))
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index a842551fe044..2e75dc34bff5 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -158,12 +158,6 @@ struct lockdep_map {
 	int				cpu;
 	unsigned long			ip;
 #endif
-#ifdef CONFIG_LOCKDEP_CROSSRELEASE
-	/*
-	 * Whether it's a crosslock.
-	 */
-	int				cross;
-#endif
 };
 
 static inline void lockdep_copy_map(struct lockdep_map *to,
@@ -267,95 +261,8 @@ struct held_lock {
 	unsigned int hardirqs_off:1;
 	unsigned int references:12;					/* 32 bits */
 	unsigned int pin_count;
-#ifdef CONFIG_LOCKDEP_CROSSRELEASE
-	/*
-	 * Generation id.
-	 *
-	 * A value of cross_gen_id will be stored when holding this,
-	 * which is globally increased whenever each crosslock is held.
-	 */
-	unsigned int gen_id;
-#endif
-};
-
-#ifdef CONFIG_LOCKDEP_CROSSRELEASE
-#define MAX_XHLOCK_TRACE_ENTRIES 5
-
-/*
- * This is for keeping locks waiting for commit so that true dependencies
- * can be added at commit step.
- */
-struct hist_lock {
-	/*
-	 * Id for each entry in the ring buffer. This is used to
-	 * decide whether the ring buffer was overwritten or not.
-	 *
-	 * For example,
-	 *
-	 *           |<----------- hist_lock ring buffer size ------->|
-	 *           pppppppppppppppppppppiiiiiiiiiiiiiiiiiiiiiiiiiiiii
-	 * wrapped > iiiiiiiiiiiiiiiiiiiiiiiiiii.......................
-	 *
-	 *           where 'p' represents an acquisition in process
-	 *           context, 'i' represents an acquisition in irq
-	 *           context.
-	 *
-	 * In this example, the ring buffer was overwritten by
-	 * acquisitions in irq context, that should be detected on
-	 * rollback or commit.
-	 */
-	unsigned int hist_id;
-
-	/*
-	 * Seperate stack_trace data. This will be used at commit step.
-	 */
-	struct stack_trace	trace;
-	unsigned long		trace_entries[MAX_XHLOCK_TRACE_ENTRIES];
-
-	/*
-	 * Seperate hlock instance. This will be used at commit step.
-	 *
-	 * TODO: Use a smaller data structure containing only necessary
-	 * data. However, we should make lockdep code able to handle the
-	 * smaller one first.
-	 */
-	struct held_lock	hlock;
 };
 
-/*
- * To initialize a lock as crosslock, lockdep_init_map_crosslock() should
- * be called instead of lockdep_init_map().
- */
-struct cross_lock {
-	/*
-	 * When more than one acquisition of crosslocks are overlapped,
-	 * we have to perform commit for them based on cross_gen_id of
-	 * the first acquisition, which allows us to add more true
-	 * dependencies.
-	 *
-	 * Moreover, when no acquisition of a crosslock is in progress,
-	 * we should not perform commit because the lock might not exist
-	 * any more, which might cause incorrect memory access. So we
-	 * have to track the number of acquisitions of a crosslock.
-	 */
-	int nr_acquire;
-
-	/*
-	 * Seperate hlock instance. This will be used at commit step.
-	 *
-	 * TODO: Use a smaller data structure containing only necessary
-	 * data. However, we should make lockdep code able to handle the
-	 * smaller one first.
-	 */
-	struct held_lock	hlock;
-};
-
-struct lockdep_map_cross {
-	struct lockdep_map map;
-	struct cross_lock xlock;
-};
-#endif
-
 /*
  * Initialization, self-test and debugging-output methods:
  */
@@ -560,37 +467,6 @@ enum xhlock_context_t {
 	XHLOCK_CTX_NR,
 };
 
-#ifdef CONFIG_LOCKDEP_CROSSRELEASE
-extern void lockdep_init_map_crosslock(struct lockdep_map *lock,
-				       const char *name,
-				       struct lock_class_key *key,
-				       int subclass);
-extern void lock_commit_crosslock(struct lockdep_map *lock);
-
-/*
- * What we essencially have to initialize is 'nr_acquire'. Other members
- * will be initialized in add_xlock().
- */
-#define STATIC_CROSS_LOCK_INIT() \
-	{ .nr_acquire = 0,}
-
-#define STATIC_CROSS_LOCKDEP_MAP_INIT(_name, _key) \
-	{ .map.name = (_name), .map.key = (void *)(_key), \
-	  .map.cross = 1, .xlock = STATIC_CROSS_LOCK_INIT(), }
-
-/*
- * To initialize a lockdep_map statically use this macro.
- * Note that _name must not be NULL.
- */
-#define STATIC_LOCKDEP_MAP_INIT(_name, _key) \
-	{ .name = (_name), .key = (void *)(_key), .cross = 0, }
-
-extern void crossrelease_hist_start(enum xhlock_context_t c);
-extern void crossrelease_hist_end(enum xhlock_context_t c);
-extern void lockdep_invariant_state(bool force);
-extern void lockdep_init_task(struct task_struct *task);
-extern void lockdep_free_task(struct task_struct *task);
-#else /* !CROSSRELEASE */
 #define lockdep_init_map_crosslock(m, n, k, s) do {} while (0)
 /*
  * To initialize a lockdep_map statically use this macro.
@@ -604,7 +480,6 @@ static inline void crossrelease_hist_end(enum xhlock_context_t c) {}
 static inline void lockdep_invariant_state(bool force) {}
 static inline void lockdep_init_task(struct task_struct *task) {}
 static inline void lockdep_free_task(struct task_struct *task) {}
-#endif /* CROSSRELEASE */
 
 #ifdef CONFIG_LOCK_STAT
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 21991d668d35..9ce6c3001e9f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -849,17 +849,6 @@ struct task_struct {
 	struct held_lock		held_locks[MAX_LOCK_DEPTH];
 #endif
 
-#ifdef CONFIG_LOCKDEP_CROSSRELEASE
-#define MAX_XHLOCKS_NR 64UL
-	struct hist_lock *xhlocks; /* Crossrelease history locks */
-	unsigned int xhlock_idx;
-	/* For restoring at history boundaries */
-	unsigned int xhlock_idx_hist[XHLOCK_CTX_NR];
-	unsigned int hist_id;
-	/* For overwrite check at each context exit */
-	unsigned int hist_id_save[XHLOCK_CTX_NR];
-#endif
-
 #ifdef CONFIG_UBSAN
 	unsigned int			in_ubsan;
 #endif
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 670d8d7d8087..5fa1324a4f29 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -57,10 +57,6 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/lock.h>
 
-#ifdef CONFIG_LOCKDEP_CROSSRELEASE
-#include <linux/slab.h>
-#endif
-
 #ifdef CONFIG_PROVE_LOCKING
 int prove_locking = 1;
 module_param(prove_locking, int, 0644);
@@ -75,19 +71,6 @@ module_param(lock_stat, int, 0644);
 #define lock_stat 0
 #endif
 
-#ifdef CONFIG_BOOTPARAM_LOCKDEP_CROSSRELEASE_FULLSTACK
-static int crossrelease_fullstack = 1;
-#else
-static int crossrelease_fullstack;
-#endif
-static int __init allow_crossrelease_fullstack(char *str)
-{
-	crossrelease_fullstack = 1;
-	return 0;
-}
-
-early_param("crossrelease_fullstack", allow_crossrelease_fullstack);
-
 /*
  * lockdep_lock: protects the lockdep graph, the hashes and the
  *               class/list/hash allocators.
@@ -740,18 +723,6 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
 	return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL);
 }
 
-#ifdef CONFIG_LOCKDEP_CROSSRELEASE
-static void cross_init(struct lockdep_map *lock, int cross);
-static int cross_lock(struct lockdep_map *lock);
-static int lock_acquire_crosslock(struct held_lock *hlock);
-static int lock_release_crosslock(struct lockdep_map *lock);
-#else
-static inline void cross_init(struct lockdep_map *lock, int cross) {}
-static inline int cross_lock(struct lockdep_map *lock) { return 0; }
-static inline int lock_acquire_crosslock(struct held_lock *hlock) { return 2; }
-static inline int lock_release_crosslock(struct lockdep_map *lock) { return 2; }
-#endif
-
 /*
  * Register a lock's class in the hash-table, if the class is not present
  * yet. Otherwise we look it up. We cache the result in the lock object
@@ -1151,41 +1122,22 @@ print_circular_lock_scenario(struct held_lock *src,
 		printk(KERN_CONT "\n\n");
 	}
 
-	if (cross_lock(tgt->instance)) {
-		printk(" Possible unsafe locking scenario by crosslock:\n\n");
-		printk("       CPU0                    CPU1\n");
-		printk("       ----                    ----\n");
-		printk("  lock(");
-		__print_lock_name(parent);
-		printk(KERN_CONT ");\n");
-		printk("  lock(");
-		__print_lock_name(target);
-		printk(KERN_CONT ");\n");
-		printk("                               lock(");
-		__print_lock_name(source);
-		printk(KERN_CONT ");\n");
-		printk("                               unlock(");
-		__print_lock_name(target);
-		printk(KERN_CONT ");\n");
-		printk("\n *** DEADLOCK ***\n\n");
-	} else {
-		printk(" Possible unsafe locking scenario:\n\n");
-		printk("       CPU0                    CPU1\n");
-		printk("       ----                    ----\n");
-		printk("  lock(");
-		__print_lock_name(target);
-		printk(KERN_CONT ");\n");
-		printk("                               lock(");
-		__print_lock_name(parent);
-		printk(KERN_CONT ");\n");
-		printk("                               lock(");
-		__print_lock_name(target);
-		printk(KERN_CONT ");\n");
-		printk("  lock(");
-		__print_lock_name(source);
-		printk(KERN_CONT ");\n");
-		printk("\n *** DEADLOCK ***\n\n");
-	}
+	printk(" Possible unsafe locking scenario:\n\n");
+	printk("       CPU0                    CPU1\n");
+	printk("       ----                    ----\n");
+	printk("  lock(");
+	__print_lock_name(target);
+	printk(KERN_CONT ");\n");
+	printk("                               lock(");
+	__print_lock_name(parent);
+	printk(KERN_CONT ");\n");
+	printk("                               lock(");
+	__print_lock_name(target);
+	printk(KERN_CONT ");\n");
+	printk("  lock(");
+	__print_lock_name(source);
+	printk(KERN_CONT ");\n");
+	printk("\n *** DEADLOCK ***\n\n");
 }
 
 /*
@@ -1211,10 +1163,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
 		curr->comm, task_pid_nr(curr));
 	print_lock(check_src);
 
-	if (cross_lock(check_tgt->instance))
-		pr_warn("\nbut now in release context of a crosslock acquired at the following:\n");
-	else
-		pr_warn("\nbut task is already holding lock:\n");
+	pr_warn("\nbut task is already holding lock:\n");
 
 	print_lock(check_tgt);
 	pr_warn("\nwhich lock already depends on the new lock.\n\n");
@@ -1244,9 +1193,7 @@ static noinline int print_circular_bug(struct lock_list *this,
 	if (!debug_locks_off_graph_unlock() || debug_locks_silent)
 		return 0;
 
-	if (cross_lock(check_tgt->instance))
-		this->trace = *trace;
-	else if (!save_trace(&this->trace))
+	if (!save_trace(&this->trace))
 		return 0;
 
 	depth = get_lock_depth(target);
@@ -1850,9 +1797,6 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
 		if (nest)
 			return 2;
 
-		if (cross_lock(prev->instance))
-			continue;
-
 		return print_deadlock_bug(curr, prev, next);
 	}
 	return 1;
@@ -2018,31 +1962,26 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
 	for (;;) {
 		int distance = curr->lockdep_depth - depth + 1;
 		hlock = curr->held_locks + depth - 1;
+
 		/*
-		 * Only non-crosslock entries get new dependencies added.
-		 * Crosslock entries will be added by commit later:
+		 * Only non-recursive-read entries get new dependencies
+		 * added:
 		 */
-		if (!cross_lock(hlock->instance)) {
+		if (hlock->read != 2 && hlock->check) {
+			int ret = check_prev_add(curr, hlock, next, distance, &trace, save_trace);
+			if (!ret)
+				return 0;
+
 			/*
-			 * Only non-recursive-read entries get new dependencies
-			 * added:
+			 * Stop after the first non-trylock entry,
+			 * as non-trylock entries have added their
+			 * own direct dependencies already, so this
+			 * lock is connected to them indirectly:
 			 */
-			if (hlock->read != 2 && hlock->check) {
-				int ret = check_prev_add(curr, hlock, next,
-							 distance, &trace, save_trace);
-				if (!ret)
-					return 0;
-
-				/*
-				 * Stop after the first non-trylock entry,
-				 * as non-trylock entries have added their
-				 * own direct dependencies already, so this
-				 * lock is connected to them indirectly:
-				 */
-				if (!hlock->trylock)
-					break;
-			}
+			if (!hlock->trylock)
+				break;
 		}
+
 		depth--;
 		/*
 		 * End of lock-stack?
@@ -3292,21 +3231,10 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name,
 void lockdep_init_map(struct lockdep_map *lock, const char *name,
 		      struct lock_class_key *key, int subclass)
 {
-	cross_init(lock, 0);
 	__lockdep_init_map(lock, name, key, subclass);
 }
 EXPORT_SYMBOL_GPL(lockdep_init_map);
 
-#ifdef CONFIG_LOCKDEP_CROSSRELEASE
-void lockdep_init_map_crosslock(struct lockdep_map *lock, const char *name,
-		      struct lock_class_key *key, int subclass)
-{
-	cross_init(lock, 1);
-	__lockdep_init_map(lock, name, key, subclass);
-}
-EXPORT_SYMBOL_GPL(lockdep_init_map_crosslock);
-#endif
-
 struct lock_class_key __lockdep_no_validate__;
 EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
 
@@ -3362,7 +3290,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	int chain_head = 0;
 	int class_idx;
 	u64 chain_key;
-	int ret;
 
 	if (unlikely(!debug_locks))
 		return 0;
@@ -3411,8 +3338,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 
 	class_idx = class - lock_classes + 1;
 
-	/* TODO: nest_lock is not implemented for crosslock yet. */
-	if (depth && !cross_lock(lock)) {
+	if (depth) {
 		hlock = curr->held_locks + depth - 1;
 		if (hlock->class_idx == class_idx && nest_lock) {
 			if (hlock->references) {
@@ -3500,14 +3426,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
 		return 0;
 
-	ret = lock_acquire_crosslock(hlock);
-	/*
-	 * 2 means normal acquire operations are needed. Otherwise, it's
-	 * ok just to return with '0:fail, 1:success'.
-	 */
-	if (ret != 2)
-		return ret;
-
 	curr->curr_chain_key = chain_key;
 	curr->lockdep_depth++;
 	check_chain_key(curr);
@@ -3745,19 +3663,11 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
 	struct task_struct *curr = current;
 	struct held_lock *hlock;
 	unsigned int depth;
-	int ret, i;
+	int i;
 
 	if (unlikely(!debug_locks))
 		return 0;
 
-	ret = lock_release_crosslock(lock);
-	/*
-	 * 2 means normal release operations are needed. Otherwise, it's
-	 * ok just to return with '0:fail, 1:success'.
-	 */
-	if (ret != 2)
-		return ret;
-
 	depth = curr->lockdep_depth;
 	/*
 	 * So we're all set to release this lock.. wait what lock? We don't
@@ -4675,495 +4585,3 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
 	dump_stack();
 }
 EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
-
-#ifdef CONFIG_LOCKDEP_CROSSRELEASE
-
-/*
- * Crossrelease works by recording a lock history for each thread and
- * connecting those historic locks that were taken after the
- * wait_for_completion() in the complete() context.
- *
- * Task-A				Task-B
- *
- *					mutex_lock(&A);
- *					mutex_unlock(&A);
- *
- * wait_for_completion(&C);
- *   lock_acquire_crosslock();
- *     atomic_inc_return(&cross_gen_id);
- *                                |
- *				  |	mutex_lock(&B);
- *				  |	mutex_unlock(&B);
- *                                |
- *				  |	complete(&C);
- *				  `--	  lock_commit_crosslock();
- *
- * Which will then add a dependency between B and C.
- */
-
-#define xhlock(i)         (current->xhlocks[(i) % MAX_XHLOCKS_NR])
-
-/*
- * Whenever a crosslock is held, cross_gen_id will be increased.
- */
-static atomic_t cross_gen_id; /* Can be wrapped */
-
-/*
- * Make an entry of the ring buffer invalid.
- */
-static inline void invalidate_xhlock(struct hist_lock *xhlock)
-{
-	/*
-	 * Normally, xhlock->hlock.instance must be !NULL.
-	 */
-	xhlock->hlock.instance = NULL;
-}
-
-/*
- * Lock history stacks; we have 2 nested lock history stacks:
- *
- *   HARD(IRQ)
- *   SOFT(IRQ)
- *
- * The thing is that once we complete a HARD/SOFT IRQ the future task locks
- * should not depend on any of the locks observed while running the IRQ.  So
- * what we do is rewind the history buffer and erase all our knowledge of that
- * temporal event.
- */
-
-void crossrelease_hist_start(enum xhlock_context_t c)
-{
-	struct task_struct *cur = current;
-
-	if (!cur->xhlocks)
-		return;
-
-	cur->xhlock_idx_hist[c] = cur->xhlock_idx;
-	cur->hist_id_save[c]    = cur->hist_id;
-}
-
-void crossrelease_hist_end(enum xhlock_context_t c)
-{
-	struct task_struct *cur = current;
-
-	if (cur->xhlocks) {
-		unsigned int idx = cur->xhlock_idx_hist[c];
-		struct hist_lock *h = &xhlock(idx);
-
-		cur->xhlock_idx = idx;
-
-		/* Check if the ring was overwritten. */
-		if (h->hist_id != cur->hist_id_save[c])
-			invalidate_xhlock(h);
-	}
-}
-
-/*
- * lockdep_invariant_state() is used to annotate independence inside a task, to
- * make one task look like multiple independent 'tasks'.
- *
- * Take for instance workqueues; each work is independent of the last. The
- * completion of a future work does not depend on the completion of a past work
- * (in general). Therefore we must not carry that (lock) dependency across
- * works.
- *
- * This is true for many things; pretty much all kthreads fall into this
- * pattern, where they have an invariant state and future completions do not
- * depend on past completions. Its just that since they all have the 'same'
- * form -- the kthread does the same over and over -- it doesn't typically
- * matter.
- *
- * The same is true for system-calls, once a system call is completed (we've
- * returned to userspace) the next system call does not depend on the lock
- * history of the previous system call.
- *
- * They key property for independence, this invariant state, is that it must be
- * a point where we hold no locks and have no history. Because if we were to
- * hold locks, the restore at _end() would not necessarily recover it's history
- * entry. Similarly, independence per-definition means it does not depend on
- * prior state.
- */
-void lockdep_invariant_state(bool force)
-{
-	/*
-	 * We call this at an invariant point, no current state, no history.
-	 * Verify the former, enforce the latter.
-	 */
-	WARN_ON_ONCE(!force && current->lockdep_depth);
-	if (current->xhlocks)
-		invalidate_xhlock(&xhlock(current->xhlock_idx));
-}
-
-static int cross_lock(struct lockdep_map *lock)
-{
-	return lock ? lock->cross : 0;
-}
-
-/*
- * This is needed to decide the relationship between wrapable variables.
- */
-static inline int before(unsigned int a, unsigned int b)
-{
-	return (int)(a - b) < 0;
-}
-
-static inline struct lock_class *xhlock_class(struct hist_lock *xhlock)
-{
-	return hlock_class(&xhlock->hlock);
-}
-
-static inline struct lock_class *xlock_class(struct cross_lock *xlock)
-{
-	return hlock_class(&xlock->hlock);
-}
-
-/*
- * Should we check a dependency with previous one?
- */
-static inline int depend_before(struct held_lock *hlock)
-{
-	return hlock->read != 2 && hlock->check && !hlock->trylock;
-}
-
-/*
- * Should we check a dependency with next one?
- */
-static inline int depend_after(struct held_lock *hlock)
-{
-	return hlock->read != 2 && hlock->check;
-}
-
-/*
- * Check if the xhlock is valid, which would be false if,
- *
- *    1. Has not used after initializaion yet.
- *    2. Got invalidated.
- *
- * Remind hist_lock is implemented as a ring buffer.
- */
-static inline int xhlock_valid(struct hist_lock *xhlock)
-{
-	/*
-	 * xhlock->hlock.instance must be !NULL.
-	 */
-	return !!xhlock->hlock.instance;
-}
-
-/*
- * Record a hist_lock entry.
- *
- * Irq disable is only required.
- */
-static void add_xhlock(struct held_lock *hlock)
-{
-	unsigned int idx = ++current->xhlock_idx;
-	struct hist_lock *xhlock = &xhlock(idx);
-
-#ifdef CONFIG_DEBUG_LOCKDEP
-	/*
-	 * This can be done locklessly because they are all task-local
-	 * state, we must however ensure IRQs are disabled.
-	 */
-	WARN_ON_ONCE(!irqs_disabled());
-#endif
-
-	/* Initialize hist_lock's members */
-	xhlock->hlock = *hlock;
-	xhlock->hist_id = ++current->hist_id;
-
-	xhlock->trace.nr_entries = 0;
-	xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES;
-	xhlock->trace.entries = xhlock->trace_entries;
-
-	if (crossrelease_fullstack) {
-		xhlock->trace.skip = 3;
-		save_stack_trace(&xhlock->trace);
-	} else {
-		xhlock->trace.nr_entries = 1;
-		xhlock->trace.entries[0] = hlock->acquire_ip;
-	}
-}
-
-static inline int same_context_xhlock(struct hist_lock *xhlock)
-{
-	return xhlock->hlock.irq_context == task_irq_context(current);
-}
-
-/*
- * This should be lockless as far as possible because this would be
- * called very frequently.
- */
-static void check_add_xhlock(struct held_lock *hlock)
-{
-	/*
-	 * Record a hist_lock, only in case that acquisitions ahead
-	 * could depend on the held_lock. For example, if the held_lock
-	 * is trylock then acquisitions ahead never depends on that.
-	 * In that case, we don't need to record it. Just return.
-	 */
-	if (!current->xhlocks || !depend_before(hlock))
-		return;
-
-	add_xhlock(hlock);
-}
-
-/*
- * For crosslock.
- */
-static int add_xlock(struct held_lock *hlock)
-{
-	struct cross_lock *xlock;
-	unsigned int gen_id;
-
-	if (!graph_lock())
-		return 0;
-
-	xlock = &((struct lockdep_map_cross *)hlock->instance)->xlock;
-
-	/*
-	 * When acquisitions for a crosslock are overlapped, we use
-	 * nr_acquire to perform commit for them, based on cross_gen_id
-	 * of the first acquisition, which allows to add additional
-	 * dependencies.
-	 *
-	 * Moreover, when no acquisition of a crosslock is in progress,
-	 * we should not perform commit because the lock might not exist
-	 * any more, which might cause incorrect memory access. So we
-	 * have to track the number of acquisitions of a crosslock.
-	 *
-	 * depend_after() is necessary to initialize only the first
-	 * valid xlock so that the xlock can be used on its commit.
-	 */
-	if (xlock->nr_acquire++ && depend_after(&xlock->hlock))
-		goto unlock;
-
-	gen_id = (unsigned int)atomic_inc_return(&cross_gen_id);
-	xlock->hlock = *hlock;
-	xlock->hlock.gen_id = gen_id;
-unlock:
-	graph_unlock();
-	return 1;
-}
-
-/*
- * Called for both normal and crosslock acquires. Normal locks will be
- * pushed on the hist_lock queue. Cross locks will record state and
- * stop regular lock_acquire() to avoid being placed on the held_lock
- * stack.
- *
- * Return: 0 - failure;
- *         1 - crosslock, done;
- *         2 - normal lock, continue to held_lock[] ops.
- */
-static int lock_acquire_crosslock(struct held_lock *hlock)
-{
-	/*
-	 *	CONTEXT 1		CONTEXT 2
-	 *	---------		---------
-	 *	lock A (cross)
-	 *	X = atomic_inc_return(&cross_gen_id)
-	 *	~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-	 *				Y = atomic_read_acquire(&cross_gen_id)
-	 *				lock B
-	 *
-	 * atomic_read_acquire() is for ordering between A and B,
-	 * IOW, A happens before B, when CONTEXT 2 see Y >= X.
-	 *
-	 * Pairs with atomic_inc_return() in add_xlock().
-	 */
-	hlock->gen_id = (unsigned int)atomic_read_acquire(&cross_gen_id);
-
-	if (cross_lock(hlock->instance))
-		return add_xlock(hlock);
-
-	check_add_xhlock(hlock);
-	return 2;
-}
-
-static int copy_trace(struct stack_trace *trace)
-{
-	unsigned long *buf = stack_trace + nr_stack_trace_entries;
-	unsigned int max_nr = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
-	unsigned int nr = min(max_nr, trace->nr_entries);
-
-	trace->nr_entries = nr;
-	memcpy(buf, trace->entries, nr * sizeof(trace->entries[0]));
-	trace->entries = buf;
-	nr_stack_trace_entries += nr;
-
-	if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
-		if (!debug_locks_off_graph_unlock())
-			return 0;
-
-		print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");
-		dump_stack();
-
-		return 0;
-	}
-
-	return 1;
-}
-
-static int commit_xhlock(struct cross_lock *xlock, struct hist_lock *xhlock)
-{
-	unsigned int xid, pid;
-	u64 chain_key;
-
-	xid = xlock_class(xlock) - lock_classes;
-	chain_key = iterate_chain_key((u64)0, xid);
-	pid = xhlock_class(xhlock) - lock_classes;
-	chain_key = iterate_chain_key(chain_key, pid);
-
-	if (lookup_chain_cache(chain_key))
-		return 1;
-
-	if (!add_chain_cache_classes(xid, pid, xhlock->hlock.irq_context,
-				chain_key))
-		return 0;
-
-	if (!check_prev_add(current, &xlock->hlock, &xhlock->hlock, 1,
-			    &xhlock->trace, copy_trace))
-		return 0;
-
-	return 1;
-}
-
-static void commit_xhlocks(struct cross_lock *xlock)
-{
-	unsigned int cur = current->xhlock_idx;
-	unsigned int prev_hist_id = xhlock(cur).hist_id;
-	unsigned int i;
-
-	if (!graph_lock())
-		return;
-
-	if (xlock->nr_acquire) {
-		for (i = 0; i < MAX_XHLOCKS_NR; i++) {
-			struct hist_lock *xhlock = &xhlock(cur - i);
-
-			if (!xhlock_valid(xhlock))
-				break;
-
-			if (before(xhlock->hlock.gen_id, xlock->hlock.gen_id))
-				break;
-
-			if (!same_context_xhlock(xhlock))
-				break;
-
-			/*
-			 * Filter out the cases where the ring buffer was
-			 * overwritten and the current entry has a bigger
-			 * hist_id than the previous one, which is impossible
-			 * otherwise:
-			 */
-			if (unlikely(before(prev_hist_id, xhlock->hist_id)))
-				break;
-
-			prev_hist_id = xhlock->hist_id;
-
-			/*
-			 * commit_xhlock() returns 0 with graph_lock already
-			 * released if fail.
-			 */
-			if (!commit_xhlock(xlock, xhlock))
-				return;
-		}
-	}
-
-	graph_unlock();
-}
-
-void lock_commit_crosslock(struct lockdep_map *lock)
-{
-	struct cross_lock *xlock;
-	unsigned long flags;
-
-	if (unlikely(!debug_locks || current->lockdep_recursion))
-		return;
-
-	if (!current->xhlocks)
-		return;
-
-	/*
-	 * Do commit hist_locks with the cross_lock, only in case that
-	 * the cross_lock could depend on acquisitions after that.
-	 *
-	 * For example, if the cross_lock does not have the 'check' flag
-	 * then we don't need to check dependencies and commit for that.
-	 * Just skip it. In that case, of course, the cross_lock does
-	 * not depend on acquisitions ahead, either.
-	 *
-	 * WARNING: Don't do that in add_xlock() in advance. When an
-	 * acquisition context is different from the commit context,
-	 * invalid(skipped) cross_lock might be accessed.
-	 */
-	if (!depend_after(&((struct lockdep_map_cross *)lock)->xlock.hlock))
-		return;
-
-	raw_local_irq_save(flags);
-	check_flags(flags);
-	current->lockdep_recursion = 1;
-	xlock = &((struct lockdep_map_cross *)lock)->xlock;
-	commit_xhlocks(xlock);
-	current->lockdep_recursion = 0;
-	raw_local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(lock_commit_crosslock);
-
-/*
- * Return: 0 - failure;
- *         1 - crosslock, done;
- *         2 - normal lock, continue to held_lock[] ops.
- */
-static int lock_release_crosslock(struct lockdep_map *lock)
-{
-	if (cross_lock(lock)) {
-		if (!graph_lock())
-			return 0;
-		((struct lockdep_map_cross *)lock)->xlock.nr_acquire--;
-		graph_unlock();
-		return 1;
-	}
-	return 2;
-}
-
-static void cross_init(struct lockdep_map *lock, int cross)
-{
-	if (cross)
-		((struct lockdep_map_cross *)lock)->xlock.nr_acquire = 0;
-
-	lock->cross = cross;
-
-	/*
-	 * Crossrelease assumes that the ring buffer size of xhlocks
-	 * is aligned with power of 2. So force it on build.
-	 */
-	BUILD_BUG_ON(MAX_XHLOCKS_NR & (MAX_XHLOCKS_NR - 1));
-}
-
-void lockdep_init_task(struct task_struct *task)
-{
-	int i;
-
-	task->xhlock_idx = UINT_MAX;
-	task->hist_id = 0;
-
-	for (i = 0; i < XHLOCK_CTX_NR; i++) {
-		task->xhlock_idx_hist[i] = UINT_MAX;
-		task->hist_id_save[i] = 0;
-	}
-
-	task->xhlocks = kzalloc(sizeof(struct hist_lock) * MAX_XHLOCKS_NR,
-				GFP_KERNEL);
-}
-
-void lockdep_free_task(struct task_struct *task)
-{
-	if (task->xhlocks) {
-		void *tmp = task->xhlocks;
-		/* Diable crossrelease for current */
-		task->xhlocks = NULL;
-		kfree(tmp);
-	}
-}
-#endif
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 947d3e2ed5c2..9d5b78aad4c5 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1099,8 +1099,6 @@ config PROVE_LOCKING
 	select DEBUG_MUTEXES
 	select DEBUG_RT_MUTEXES if RT_MUTEXES
 	select DEBUG_LOCK_ALLOC
-	select LOCKDEP_CROSSRELEASE
-	select LOCKDEP_COMPLETIONS
 	select TRACE_IRQFLAGS
 	default n
 	help
@@ -1170,37 +1168,6 @@ config LOCK_STAT
 	 CONFIG_LOCK_STAT defines "contended" and "acquired" lock events.
 	 (CONFIG_LOCKDEP defines "acquire" and "release" events.)
 
-config LOCKDEP_CROSSRELEASE
-	bool
-	help
-	 This makes lockdep work for crosslock which is a lock allowed to
-	 be released in a different context from the acquisition context.
-	 Normally a lock must be released in the context acquiring the lock.
-	 However, relexing this constraint helps synchronization primitives
-	 such as page locks or completions can use the lock correctness
-	 detector, lockdep.
-
-config LOCKDEP_COMPLETIONS
-	bool
-	help
-	 A deadlock caused by wait_for_completion() and complete() can be
-	 detected by lockdep using crossrelease feature.
-
-config BOOTPARAM_LOCKDEP_CROSSRELEASE_FULLSTACK
-	bool "Enable the boot parameter, crossrelease_fullstack"
-	depends on LOCKDEP_CROSSRELEASE
-	default n
-	help
-	 The lockdep "cross-release" feature needs to record stack traces
-	 (of calling functions) for all acquisitions, for eventual later
-	 use during analysis. By default only a single caller is recorded,
-	 because the unwind operation can be very expensive with deeper
-	 stack chains.
-
-	 However a boot parameter, crossrelease_fullstack, was
-	 introduced since sometimes deeper traces are required for full
-	 analysis. This option turns on the boot parameter.
-
 config DEBUG_LOCKDEP
 	bool "Lock dependency engine debugging"
 	depends on DEBUG_KERNEL && LOCKDEP
-- 
cgit v1.2.3


From b899a850431e2dd0943205a63a68573f3e312d0d Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 27 Nov 2017 10:38:23 +0000
Subject: compiler.h: Remove ACCESS_ONCE()

There are no longer any kernelspace uses of ACCESS_ONCE(), so we can
remove the definition from <linux/compiler.h>.

This patch removes the ACCESS_ONCE() definition, and updates comments
which referred to it. At the same time, some inconsistent and redundant
whitespace is removed from comments.

Tested-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Joe Perches <joe@perches.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: apw@canonical.com
Link: http://lkml.kernel.org/r/20171127103824.36526-4-mark.rutland@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/compiler.h | 47 +++++++++++------------------------------------
 1 file changed, 11 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 188ed9f65517..52e611ab9a6c 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -220,21 +220,21 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
 /*
  * Prevent the compiler from merging or refetching reads or writes. The
  * compiler is also forbidden from reordering successive instances of
- * READ_ONCE, WRITE_ONCE and ACCESS_ONCE (see below), but only when the
- * compiler is aware of some particular ordering.  One way to make the
- * compiler aware of ordering is to put the two invocations of READ_ONCE,
- * WRITE_ONCE or ACCESS_ONCE() in different C statements.
+ * READ_ONCE and WRITE_ONCE, but only when the compiler is aware of some
+ * particular ordering. One way to make the compiler aware of ordering is to
+ * put the two invocations of READ_ONCE or WRITE_ONCE in different C
+ * statements.
  *
- * In contrast to ACCESS_ONCE these two macros will also work on aggregate
- * data types like structs or unions. If the size of the accessed data
- * type exceeds the word size of the machine (e.g., 32 bits or 64 bits)
- * READ_ONCE() and WRITE_ONCE() will fall back to memcpy(). There's at
- * least two memcpy()s: one for the __builtin_memcpy() and then one for
- * the macro doing the copy of variable - '__u' allocated on the stack.
+ * These two macros will also work on aggregate data types like structs or
+ * unions. If the size of the accessed data type exceeds the word size of
+ * the machine (e.g., 32 bits or 64 bits) READ_ONCE() and WRITE_ONCE() will
+ * fall back to memcpy(). There's at least two memcpy()s: one for the
+ * __builtin_memcpy() and then one for the macro doing the copy of variable
+ * - '__u' allocated on the stack.
  *
  * Their two major use cases are: (1) Mediating communication between
  * process-level code and irq/NMI handlers, all running on the same CPU,
- * and (2) Ensuring that the compiler does not  fold, spindle, or otherwise
+ * and (2) Ensuring that the compiler does not fold, spindle, or otherwise
  * mutilate accesses that either do not require ordering or that interact
  * with an explicit memory barrier or atomic instruction that provides the
  * required ordering.
@@ -327,29 +327,4 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
 	compiletime_assert(__native_word(t),				\
 		"Need native word sized stores/loads for atomicity.")
 
-/*
- * Prevent the compiler from merging or refetching accesses.  The compiler
- * is also forbidden from reordering successive instances of ACCESS_ONCE(),
- * but only when the compiler is aware of some particular ordering.  One way
- * to make the compiler aware of ordering is to put the two invocations of
- * ACCESS_ONCE() in different C statements.
- *
- * ACCESS_ONCE will only work on scalar types. For union types, ACCESS_ONCE
- * on a union member will work as long as the size of the member matches the
- * size of the union and the size is smaller than word size.
- *
- * The major use cases of ACCESS_ONCE used to be (1) Mediating communication
- * between process-level code and irq/NMI handlers, all running on the same CPU,
- * and (2) Ensuring that the compiler does not  fold, spindle, or otherwise
- * mutilate accesses that either do not require ordering or that interact
- * with an explicit memory barrier or atomic instruction that provides the
- * required ordering.
- *
- * If possible use READ_ONCE()/WRITE_ONCE() instead.
- */
-#define __ACCESS_ONCE(x) ({ \
-	 __maybe_unused typeof(x) __var = (__force typeof(x)) 0; \
-	(volatile typeof(x) *)&(x); })
-#define ACCESS_ONCE(x) (*__ACCESS_ONCE(x))
-
 #endif /* __LINUX_COMPILER_H */
-- 
cgit v1.2.3


From 4b4df570b41dbb421f52605357d5d56c872df6d9 Mon Sep 17 00:00:00 2001
From: Keith Packard <keithp@keithp.com>
Date: Wed, 13 Dec 2017 00:44:26 -0800
Subject: drm: Update edid-derived drm_display_info fields at edid property set
 [v2]

There are a set of values in the drm_display_info structure for each
connector which hold information derived from EDID. These are computed
in drm_add_display_info. Before this patch, that was only called in
drm_add_edid_modes. This meant that they were only set when EDID was
present and never reset when EDID was not, as happened when the
display was disconnected.

One of these fields, non_desktop, is used from
drm_mode_connector_update_edid_property, the function responsible for
assigning the new edid value to the application-visible property.

Various drivers call these two functions (drm_add_edid_modes and
drm_mode_connector_update_edid_property) in different orders. This
means that even when EDID is present, the drm_display_info fields may
not have been computed at the time that
drm_mode_connector_update_edid_property used the non_desktop value to
set the non_desktop property.

I've added a public function (drm_reset_display_info) that resets the
drm_display_info field values to default values and then made the
drm_add_display_info function public. These two functions are now
called directly from drm_mode_connector_update_edid_property so that
the drm_display_info fields are always computed from the current EDID
information before being used in that function.

This means that the drm_display_info values are often computed twice,
once when the EDID property it set and a second time when EDID is used
to compute modes for the device. The alternative would be to uniformly
ensure that the values were computed once before being used, which
would require that all drivers reliably invoke the two paths in the
same order. The computation is inexpensive enough that it seems more
maintainable in the long term to simply compute them in both paths.

The API to drm_add_display_info has been changed so that it no longer
takes the set of edid-based quirks as a parameter. Rather, it now
computes those quirks itself and returns them for further use by
drm_add_edid_modes.

This patch also includes a number of 'const' additions caused by
drm_mode_connector_update_edid_property taking a 'const struct edid *'
parameter and wanting to pass that along to drm_add_display_info.

v2: after review by Daniel Vetter <daniel.vetter@ffwll.ch>

	Removed EXPORT_SYMBOL_GPL for drm_reset_display_info and
	drm_add_display_info.

	Added FIXME in drm_mode_connector_update_edid_property about
	potentially merging that with drm_add_edid_modes to avoid
	the need for two driver calls.

Signed-off-by: Keith Packard <keithp@keithp.com>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20171213084427.31199-1-keithp@keithp.com
(danvet: cherry picked from commit 12a889bf4bca ("drm: rework delayed
connector cleanup in connector_iter") from drm-misc-next since
functional conflict with changes in -next and we need to make sure
both have the right version and nothing gets lost.)
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/drm_connector.c | 13 +++++++++++
 drivers/gpu/drm/drm_edid.c      | 52 ++++++++++++++++++++++++++++++-----------
 include/drm/drm_edid.h          |  2 ++
 3 files changed, 53 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c
index 482014137953..c4dfcbc861a1 100644
--- a/drivers/gpu/drm/drm_connector.c
+++ b/drivers/gpu/drm/drm_connector.c
@@ -1231,6 +1231,19 @@ int drm_mode_connector_update_edid_property(struct drm_connector *connector,
 	if (edid)
 		size = EDID_LENGTH * (1 + edid->extensions);
 
+	/* Set the display info, using edid if available, otherwise
+	 * reseting the values to defaults. This duplicates the work
+	 * done in drm_add_edid_modes, but that function is not
+	 * consistently called before this one in all drivers and the
+	 * computation is cheap enough that it seems better to
+	 * duplicate it rather than attempt to ensure some arbitrary
+	 * ordering of calls.
+	 */
+	if (edid)
+		drm_add_display_info(connector, edid);
+	else
+		drm_reset_display_info(connector);
+
 	drm_object_property_set_value(&connector->base,
 				      dev->mode_config.non_desktop_property,
 				      connector->display_info.non_desktop);
diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index 5dfe14763871..cb487148359a 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -1731,7 +1731,7 @@ EXPORT_SYMBOL(drm_edid_duplicate);
  *
  * Returns true if @vendor is in @edid, false otherwise
  */
-static bool edid_vendor(struct edid *edid, const char *vendor)
+static bool edid_vendor(const struct edid *edid, const char *vendor)
 {
 	char edid_vendor[3];
 
@@ -1749,7 +1749,7 @@ static bool edid_vendor(struct edid *edid, const char *vendor)
  *
  * This tells subsequent routines what fixes they need to apply.
  */
-static u32 edid_get_quirks(struct edid *edid)
+static u32 edid_get_quirks(const struct edid *edid)
 {
 	const struct edid_quirk *quirk;
 	int i;
@@ -2813,7 +2813,7 @@ add_detailed_modes(struct drm_connector *connector, struct edid *edid,
 /*
  * Search EDID for CEA extension block.
  */
-static u8 *drm_find_edid_extension(struct edid *edid, int ext_id)
+static u8 *drm_find_edid_extension(const struct edid *edid, int ext_id)
 {
 	u8 *edid_ext = NULL;
 	int i;
@@ -2835,12 +2835,12 @@ static u8 *drm_find_edid_extension(struct edid *edid, int ext_id)
 	return edid_ext;
 }
 
-static u8 *drm_find_cea_extension(struct edid *edid)
+static u8 *drm_find_cea_extension(const struct edid *edid)
 {
 	return drm_find_edid_extension(edid, CEA_EXT);
 }
 
-static u8 *drm_find_displayid_extension(struct edid *edid)
+static u8 *drm_find_displayid_extension(const struct edid *edid)
 {
 	return drm_find_edid_extension(edid, DISPLAYID_EXT);
 }
@@ -4363,7 +4363,7 @@ drm_parse_hdmi_vsdb_video(struct drm_connector *connector, const u8 *db)
 }
 
 static void drm_parse_cea_ext(struct drm_connector *connector,
-			      struct edid *edid)
+			      const struct edid *edid)
 {
 	struct drm_display_info *info = &connector->display_info;
 	const u8 *edid_ext;
@@ -4397,11 +4397,33 @@ static void drm_parse_cea_ext(struct drm_connector *connector,
 	}
 }
 
-static void drm_add_display_info(struct drm_connector *connector,
-				 struct edid *edid, u32 quirks)
+/* A connector has no EDID information, so we've got no EDID to compute quirks from. Reset
+ * all of the values which would have been set from EDID
+ */
+void
+drm_reset_display_info(struct drm_connector *connector)
 {
 	struct drm_display_info *info = &connector->display_info;
 
+	info->width_mm = 0;
+	info->height_mm = 0;
+
+	info->bpc = 0;
+	info->color_formats = 0;
+	info->cea_rev = 0;
+	info->max_tmds_clock = 0;
+	info->dvi_dual = false;
+
+	info->non_desktop = 0;
+}
+EXPORT_SYMBOL_GPL(drm_reset_display_info);
+
+u32 drm_add_display_info(struct drm_connector *connector, const struct edid *edid)
+{
+	struct drm_display_info *info = &connector->display_info;
+
+	u32 quirks = edid_get_quirks(edid);
+
 	info->width_mm = edid->width_cm * 10;
 	info->height_mm = edid->height_cm * 10;
 
@@ -4414,11 +4436,13 @@ static void drm_add_display_info(struct drm_connector *connector,
 
 	info->non_desktop = !!(quirks & EDID_QUIRK_NON_DESKTOP);
 
+	DRM_DEBUG_KMS("non_desktop set to %d\n", info->non_desktop);
+
 	if (edid->revision < 3)
-		return;
+		return quirks;
 
 	if (!(edid->input & DRM_EDID_INPUT_DIGITAL))
-		return;
+		return quirks;
 
 	drm_parse_cea_ext(connector, edid);
 
@@ -4438,7 +4462,7 @@ static void drm_add_display_info(struct drm_connector *connector,
 
 	/* Only defined for 1.4 with digital displays */
 	if (edid->revision < 4)
-		return;
+		return quirks;
 
 	switch (edid->input & DRM_EDID_DIGITAL_DEPTH_MASK) {
 	case DRM_EDID_DIGITAL_DEPTH_6:
@@ -4473,7 +4497,9 @@ static void drm_add_display_info(struct drm_connector *connector,
 		info->color_formats |= DRM_COLOR_FORMAT_YCRCB444;
 	if (edid->features & DRM_EDID_FEATURE_RGB_YCRCB422)
 		info->color_formats |= DRM_COLOR_FORMAT_YCRCB422;
+	return quirks;
 }
+EXPORT_SYMBOL_GPL(drm_add_display_info);
 
 static int validate_displayid(u8 *displayid, int length, int idx)
 {
@@ -4627,14 +4653,12 @@ int drm_add_edid_modes(struct drm_connector *connector, struct edid *edid)
 		return 0;
 	}
 
-	quirks = edid_get_quirks(edid);
-
 	/*
 	 * CEA-861-F adds ycbcr capability map block, for HDMI 2.0 sinks.
 	 * To avoid multiple parsing of same block, lets parse that map
 	 * from sink info, before parsing CEA modes.
 	 */
-	drm_add_display_info(connector, edid, quirks);
+	quirks = drm_add_display_info(connector, edid);
 
 	/*
 	 * EDID spec says modes should be preferred in this order:
diff --git a/include/drm/drm_edid.h b/include/drm/drm_edid.h
index 2ec41d032e56..efe6d5a8e834 100644
--- a/include/drm/drm_edid.h
+++ b/include/drm/drm_edid.h
@@ -465,6 +465,8 @@ struct edid *drm_get_edid(struct drm_connector *connector,
 struct edid *drm_get_edid_switcheroo(struct drm_connector *connector,
 				     struct i2c_adapter *adapter);
 struct edid *drm_edid_duplicate(const struct edid *edid);
+void drm_reset_display_info(struct drm_connector *connector);
+u32 drm_add_display_info(struct drm_connector *connector, const struct edid *edid);
 int drm_add_edid_modes(struct drm_connector *connector, struct edid *edid);
 
 u8 drm_match_cea_mode(const struct drm_display_mode *to_match);
-- 
cgit v1.2.3


From b5476022bbada3764609368f03329ca287528dc8 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 11 Dec 2017 07:17:39 -0800
Subject: ipv4: igmp: guard against silly MTU values
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

IPv4 stack reacts to changes to small MTU, by disabling itself under
RTNL.

But there is a window where threads not using RTNL can see a wrong
device mtu. This can lead to surprises, in igmp code where it is
assumed the mtu is suitable.

Fix this by reading device mtu once and checking IPv4 minimal MTU.

This patch adds missing IPV4_MIN_MTU define, to not abuse
ETH_MIN_MTU anymore.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h     |  1 +
 net/ipv4/devinet.c   |  2 +-
 net/ipv4/igmp.c      | 24 +++++++++++++++---------
 net/ipv4/ip_tunnel.c |  4 ++--
 4 files changed, 19 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/net/ip.h b/include/net/ip.h
index 9896f46cbbf1..af8addbaa3c1 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -34,6 +34,7 @@
 #include <net/flow_dissector.h>
 
 #define IPV4_MAX_PMTU		65535U		/* RFC 2675, Section 5.1 */
+#define IPV4_MIN_MTU		68			/* RFC 791 */
 
 struct sock;
 
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index a4573bccd6da..7a93359fbc72 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1428,7 +1428,7 @@ skip:
 
 static bool inetdev_valid_mtu(unsigned int mtu)
 {
-	return mtu >= 68;
+	return mtu >= IPV4_MIN_MTU;
 }
 
 static void inetdev_send_gratuitous_arp(struct net_device *dev,
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index d1f8f302dbf3..50448a220a1f 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -404,16 +404,17 @@ static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel)
 }
 
 static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc,
-	int type, struct igmpv3_grec **ppgr)
+	int type, struct igmpv3_grec **ppgr, unsigned int mtu)
 {
 	struct net_device *dev = pmc->interface->dev;
 	struct igmpv3_report *pih;
 	struct igmpv3_grec *pgr;
 
-	if (!skb)
-		skb = igmpv3_newpack(dev, dev->mtu);
-	if (!skb)
-		return NULL;
+	if (!skb) {
+		skb = igmpv3_newpack(dev, mtu);
+		if (!skb)
+			return NULL;
+	}
 	pgr = skb_put(skb, sizeof(struct igmpv3_grec));
 	pgr->grec_type = type;
 	pgr->grec_auxwords = 0;
@@ -436,12 +437,17 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
 	struct igmpv3_grec *pgr = NULL;
 	struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list;
 	int scount, stotal, first, isquery, truncate;
+	unsigned int mtu;
 
 	if (pmc->multiaddr == IGMP_ALL_HOSTS)
 		return skb;
 	if (ipv4_is_local_multicast(pmc->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports)
 		return skb;
 
+	mtu = READ_ONCE(dev->mtu);
+	if (mtu < IPV4_MIN_MTU)
+		return skb;
+
 	isquery = type == IGMPV3_MODE_IS_INCLUDE ||
 		  type == IGMPV3_MODE_IS_EXCLUDE;
 	truncate = type == IGMPV3_MODE_IS_EXCLUDE ||
@@ -462,7 +468,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
 		    AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) {
 			if (skb)
 				igmpv3_sendpack(skb);
-			skb = igmpv3_newpack(dev, dev->mtu);
+			skb = igmpv3_newpack(dev, mtu);
 		}
 	}
 	first = 1;
@@ -498,12 +504,12 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
 				pgr->grec_nsrcs = htons(scount);
 			if (skb)
 				igmpv3_sendpack(skb);
-			skb = igmpv3_newpack(dev, dev->mtu);
+			skb = igmpv3_newpack(dev, mtu);
 			first = 1;
 			scount = 0;
 		}
 		if (first) {
-			skb = add_grhead(skb, pmc, type, &pgr);
+			skb = add_grhead(skb, pmc, type, &pgr, mtu);
 			first = 0;
 		}
 		if (!skb)
@@ -538,7 +544,7 @@ empty_source:
 				igmpv3_sendpack(skb);
 				skb = NULL; /* add_grhead will get a new one */
 			}
-			skb = add_grhead(skb, pmc, type, &pgr);
+			skb = add_grhead(skb, pmc, type, &pgr, mtu);
 		}
 	}
 	if (pgr)
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index fe6fee728ce4..5ddb1cb52bd4 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -349,8 +349,8 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
 	dev->needed_headroom = t_hlen + hlen;
 	mtu -= (dev->hard_header_len + t_hlen);
 
-	if (mtu < 68)
-		mtu = 68;
+	if (mtu < IPV4_MIN_MTU)
+		mtu = IPV4_MIN_MTU;
 
 	return mtu;
 }
-- 
cgit v1.2.3


From ea497bb92064875497554ee7cdf10df7fb7393fc Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Wed, 13 Dec 2017 13:49:36 +0100
Subject: drm: rework delayed connector cleanup in connector_iter

PROBE_DEFER also uses system_wq to reprobe drivers, which means when
that again fails, and we try to flush the overall system_wq (to get
all the delayed connectore cleanup work_struct completed), we
deadlock.

Fix this by using just a single cleanup work, so that we can only
flush that one and don't block on anything else. That means a free
list plus locking, a standard pattern.

v2:
- Correctly free connectors only on last ref. Oops (Chris).
- use llist_head/node (Chris).

v3
- Add init_llist_head (Chris).

Fixes: a703c55004e1 ("drm: safely free connectors from connector_iter")
Fixes: 613051dac40d ("drm: locking&new iterators for connector_list")
Cc: Ben Widawsky <ben@bwidawsk.net>
Cc: Dave Airlie <airlied@gmail.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Sean Paul <seanpaul@chromium.org>
Cc: <stable@vger.kernel.org> # v4.11+: 613051dac40d ("drm: locking&new iterators for connector_list"
Cc: <stable@vger.kernel.org> # v4.11+
Cc: Daniel Vetter <daniel.vetter@intel.com>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Gustavo Padovan <gustavo@padovan.org>
Cc: David Airlie <airlied@linux.ie>
Cc: Javier Martinez Canillas <javier@dowhile0.org>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Guillaume Tucker <guillaume.tucker@collabora.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Kevin Hilman <khilman@baylibre.com>
Cc: Matt Hart <matthew.hart@linaro.org>
Cc: Thierry Escande <thierry.escande@collabora.co.uk>
Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Cc: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20171213124936.17914-1-daniel.vetter@ffwll.ch
---
 drivers/gpu/drm/drm_connector.c     | 50 ++++++++++++++++++++++++++-----------
 drivers/gpu/drm/drm_crtc_internal.h |  1 +
 drivers/gpu/drm/drm_mode_config.c   |  5 +++-
 include/drm/drm_connector.h         | 10 +++++---
 include/drm/drm_mode_config.h       | 18 ++++++++++++-
 5 files changed, 63 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c
index c4dfcbc861a1..9ae236036e32 100644
--- a/drivers/gpu/drm/drm_connector.c
+++ b/drivers/gpu/drm/drm_connector.c
@@ -152,14 +152,23 @@ static void drm_connector_free(struct kref *kref)
 	connector->funcs->destroy(connector);
 }
 
-static void drm_connector_free_work_fn(struct work_struct *work)
+void drm_connector_free_work_fn(struct work_struct *work)
 {
-	struct drm_connector *connector =
-		container_of(work, struct drm_connector, free_work);
-	struct drm_device *dev = connector->dev;
+	struct drm_connector *connector, *n;
+	struct drm_device *dev =
+		container_of(work, struct drm_device, mode_config.connector_free_work);
+	struct drm_mode_config *config = &dev->mode_config;
+	unsigned long flags;
+	struct llist_node *freed;
 
-	drm_mode_object_unregister(dev, &connector->base);
-	connector->funcs->destroy(connector);
+	spin_lock_irqsave(&config->connector_list_lock, flags);
+	freed = llist_del_all(&config->connector_free_list);
+	spin_unlock_irqrestore(&config->connector_list_lock, flags);
+
+	llist_for_each_entry_safe(connector, n, freed, free_node) {
+		drm_mode_object_unregister(dev, &connector->base);
+		connector->funcs->destroy(connector);
+	}
 }
 
 /**
@@ -191,8 +200,6 @@ int drm_connector_init(struct drm_device *dev,
 	if (ret)
 		return ret;
 
-	INIT_WORK(&connector->free_work, drm_connector_free_work_fn);
-
 	connector->base.properties = &connector->properties;
 	connector->dev = dev;
 	connector->funcs = funcs;
@@ -547,10 +554,17 @@ EXPORT_SYMBOL(drm_connector_list_iter_begin);
  * actually release the connector when dropping our final reference.
  */
 static void
-drm_connector_put_safe(struct drm_connector *conn)
+__drm_connector_put_safe(struct drm_connector *conn)
 {
-	if (refcount_dec_and_test(&conn->base.refcount.refcount))
-		schedule_work(&conn->free_work);
+	struct drm_mode_config *config = &conn->dev->mode_config;
+
+	lockdep_assert_held(&config->connector_list_lock);
+
+	if (!refcount_dec_and_test(&conn->base.refcount.refcount))
+		return;
+
+	llist_add(&conn->free_node, &config->connector_free_list);
+	schedule_work(&config->connector_free_work);
 }
 
 /**
@@ -582,10 +596,10 @@ drm_connector_list_iter_next(struct drm_connector_list_iter *iter)
 
 		/* loop until it's not a zombie connector */
 	} while (!kref_get_unless_zero(&iter->conn->base.refcount));
-	spin_unlock_irqrestore(&config->connector_list_lock, flags);
 
 	if (old_conn)
-		drm_connector_put_safe(old_conn);
+		__drm_connector_put_safe(old_conn);
+	spin_unlock_irqrestore(&config->connector_list_lock, flags);
 
 	return iter->conn;
 }
@@ -602,9 +616,15 @@ EXPORT_SYMBOL(drm_connector_list_iter_next);
  */
 void drm_connector_list_iter_end(struct drm_connector_list_iter *iter)
 {
+	struct drm_mode_config *config = &iter->dev->mode_config;
+	unsigned long flags;
+
 	iter->dev = NULL;
-	if (iter->conn)
-		drm_connector_put_safe(iter->conn);
+	if (iter->conn) {
+		spin_lock_irqsave(&config->connector_list_lock, flags);
+		__drm_connector_put_safe(iter->conn);
+		spin_unlock_irqrestore(&config->connector_list_lock, flags);
+	}
 	lock_release(&connector_list_iter_dep_map, 0, _RET_IP_);
 }
 EXPORT_SYMBOL(drm_connector_list_iter_end);
diff --git a/drivers/gpu/drm/drm_crtc_internal.h b/drivers/gpu/drm/drm_crtc_internal.h
index 9ebb8841778c..af00f42ba269 100644
--- a/drivers/gpu/drm/drm_crtc_internal.h
+++ b/drivers/gpu/drm/drm_crtc_internal.h
@@ -142,6 +142,7 @@ int drm_mode_connector_set_obj_prop(struct drm_mode_object *obj,
 				    uint64_t value);
 int drm_connector_create_standard_properties(struct drm_device *dev);
 const char *drm_get_connector_force_name(enum drm_connector_force force);
+void drm_connector_free_work_fn(struct work_struct *work);
 
 /* IOCTL */
 int drm_mode_connector_property_set_ioctl(struct drm_device *dev,
diff --git a/drivers/gpu/drm/drm_mode_config.c b/drivers/gpu/drm/drm_mode_config.c
index cc78b3d9e5e4..256de7313612 100644
--- a/drivers/gpu/drm/drm_mode_config.c
+++ b/drivers/gpu/drm/drm_mode_config.c
@@ -382,6 +382,9 @@ void drm_mode_config_init(struct drm_device *dev)
 	ida_init(&dev->mode_config.connector_ida);
 	spin_lock_init(&dev->mode_config.connector_list_lock);
 
+	init_llist_head(&dev->mode_config.connector_free_list);
+	INIT_WORK(&dev->mode_config.connector_free_work, drm_connector_free_work_fn);
+
 	drm_mode_create_standard_properties(dev);
 
 	/* Just to be sure */
@@ -432,7 +435,7 @@ void drm_mode_config_cleanup(struct drm_device *dev)
 	}
 	drm_connector_list_iter_end(&conn_iter);
 	/* connector_iter drops references in a work item. */
-	flush_scheduled_work();
+	flush_work(&dev->mode_config.connector_free_work);
 	if (WARN_ON(!list_empty(&dev->mode_config.connector_list))) {
 		drm_connector_list_iter_begin(dev, &conn_iter);
 		drm_for_each_connector_iter(connector, &conn_iter)
diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h
index a4649c56ca2f..5971577016a2 100644
--- a/include/drm/drm_connector.h
+++ b/include/drm/drm_connector.h
@@ -24,6 +24,7 @@
 #define __DRM_CONNECTOR_H__
 
 #include <linux/list.h>
+#include <linux/llist.h>
 #include <linux/ctype.h>
 #include <linux/hdmi.h>
 #include <drm/drm_mode_object.h>
@@ -918,12 +919,13 @@ struct drm_connector {
 	uint16_t tile_h_size, tile_v_size;
 
 	/**
-	 * @free_work:
+	 * @free_node:
 	 *
-	 * Work used only by &drm_connector_iter to be able to clean up a
-	 * connector from any context.
+	 * List used only by &drm_connector_iter to be able to clean up a
+	 * connector from any context, in conjunction with
+	 * &drm_mode_config.connector_free_work.
 	 */
-	struct work_struct free_work;
+	struct llist_node free_node;
 };
 
 #define obj_to_connector(x) container_of(x, struct drm_connector, base)
diff --git a/include/drm/drm_mode_config.h b/include/drm/drm_mode_config.h
index b21e827c5c78..b0ce26d71296 100644
--- a/include/drm/drm_mode_config.h
+++ b/include/drm/drm_mode_config.h
@@ -27,6 +27,7 @@
 #include <linux/types.h>
 #include <linux/idr.h>
 #include <linux/workqueue.h>
+#include <linux/llist.h>
 
 #include <drm/drm_modeset_lock.h>
 
@@ -393,7 +394,7 @@ struct drm_mode_config {
 
 	/**
 	 * @connector_list_lock: Protects @num_connector and
-	 * @connector_list.
+	 * @connector_list and @connector_free_list.
 	 */
 	spinlock_t connector_list_lock;
 	/**
@@ -413,6 +414,21 @@ struct drm_mode_config {
 	 * &struct drm_connector_list_iter to walk this list.
 	 */
 	struct list_head connector_list;
+	/**
+	 * @connector_free_list:
+	 *
+	 * List of connector objects linked with &drm_connector.free_head.
+	 * Protected by @connector_list_lock. Used by
+	 * drm_for_each_connector_iter() and
+	 * &struct drm_connector_list_iter to savely free connectors using
+	 * @connector_free_work.
+	 */
+	struct llist_head connector_free_list;
+	/**
+	 * @connector_free_work: Work to clean up @connector_free_list.
+	 */
+	struct work_struct connector_free_work;
+
 	/**
 	 * @num_encoder:
 	 *
-- 
cgit v1.2.3


From c47d7f56e914900410f65835933f9fc4374d0a2b Mon Sep 17 00:00:00 2001
From: Wei Wang <wei.w.wang@intel.com>
Date: Thu, 14 Dec 2017 15:32:24 -0800
Subject: include/linux/idr.h: add #include <linux/bug.h>

The <linux/bug.h> was removed from radix-tree.h by commit f5bba9d11a25
("include/linux/radix-tree.h: remove unneeded #include <linux/bug.h>").

Since that commit, tools/testing/radix-tree/ couldn't pass compilation
due to tools/testing/radix-tree/idr.c:17: undefined reference to
WARN_ON_ONCE.  This patch adds the bug.h header to idr.h to solve the
issue.

Link: http://lkml.kernel.org/r/1511963726-34070-2-git-send-email-wei.w.wang@intel.com
Fixes: f5bba9d11a2 ("include/linux/radix-tree.h: remove unneeded #include <linux/bug.h>")
Signed-off-by: Wei Wang <wei.w.wang@intel.com>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/idr.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/idr.h b/include/linux/idr.h
index 7c3a365f7e12..fa14f834e4ed 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -15,6 +15,7 @@
 #include <linux/radix-tree.h>
 #include <linux/gfp.h>
 #include <linux/percpu.h>
+#include <linux/bug.h>
 
 struct idr {
 	struct radix_tree_root	idr_rt;
-- 
cgit v1.2.3


From 338f1d9d1b829fec494d053f62820a2ee625b1ec Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Thu, 14 Dec 2017 15:32:28 -0800
Subject: lib/rbtree,drm/mm: add rbtree_replace_node_cached()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a variant of rbtree_replace_node() that maintains the leftmost cache
of struct rbtree_root_cached when replacing nodes within the rbtree.

As drm_mm is the only rb_replace_node() being used on an interval tree,
the mistake looks fairly self-contained.  Furthermore the only user of
drm_mm_replace_node() is its testsuite...

Testcase: igt/drm_mm/replace

Link: http://lkml.kernel.org/r/20171122100729.3742-1-chris@chris-wilson.co.uk
Link: https://patchwork.freedesktop.org/patch/msgid/20171109212435.9265-1-chris@chris-wilson.co.uk
Fixes: f808c13fd373 ("lib/interval_tree: fast overlap detection")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Acked-by: Davidlohr Bueso <dbueso@suse.de>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/gpu/drm/drm_mm.c |  8 +++++---
 include/linux/rbtree.h   |  2 ++
 lib/rbtree.c             | 10 ++++++++++
 3 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_mm.c b/drivers/gpu/drm/drm_mm.c
index 61a1c8ea74bc..c3c79ee6119e 100644
--- a/drivers/gpu/drm/drm_mm.c
+++ b/drivers/gpu/drm/drm_mm.c
@@ -575,21 +575,23 @@ EXPORT_SYMBOL(drm_mm_remove_node);
  */
 void drm_mm_replace_node(struct drm_mm_node *old, struct drm_mm_node *new)
 {
+	struct drm_mm *mm = old->mm;
+
 	DRM_MM_BUG_ON(!old->allocated);
 
 	*new = *old;
 
 	list_replace(&old->node_list, &new->node_list);
-	rb_replace_node(&old->rb, &new->rb, &old->mm->interval_tree.rb_root);
+	rb_replace_node_cached(&old->rb, &new->rb, &mm->interval_tree);
 
 	if (drm_mm_hole_follows(old)) {
 		list_replace(&old->hole_stack, &new->hole_stack);
 		rb_replace_node(&old->rb_hole_size,
 				&new->rb_hole_size,
-				&old->mm->holes_size);
+				&mm->holes_size);
 		rb_replace_node(&old->rb_hole_addr,
 				&new->rb_hole_addr,
-				&old->mm->holes_addr);
+				&mm->holes_addr);
 	}
 
 	old->allocated = false;
diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index d574361943ea..fcbeed4053ef 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -99,6 +99,8 @@ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
 			    struct rb_root *root);
 extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new,
 				struct rb_root *root);
+extern void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new,
+				   struct rb_root_cached *root);
 
 static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
 				struct rb_node **rb_link)
diff --git a/lib/rbtree.c b/lib/rbtree.c
index ba4a9d165f1b..d3ff682fd4b8 100644
--- a/lib/rbtree.c
+++ b/lib/rbtree.c
@@ -603,6 +603,16 @@ void rb_replace_node(struct rb_node *victim, struct rb_node *new,
 }
 EXPORT_SYMBOL(rb_replace_node);
 
+void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new,
+			    struct rb_root_cached *root)
+{
+	rb_replace_node(victim, new, &root->rb_root);
+
+	if (root->rb_leftmost == victim)
+		root->rb_leftmost = new;
+}
+EXPORT_SYMBOL(rb_replace_node_cached);
+
 void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new,
 			 struct rb_root *root)
 {
-- 
cgit v1.2.3


From 146734b091430c80d80bb96b1139a96fb4bc830e Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 14 Dec 2017 15:32:34 -0800
Subject: string.h: workaround for increased stack usage

The hardened strlen() function causes rather large stack usage in at
least one file in the kernel, in particular when CONFIG_KASAN is
enabled:

  drivers/media/usb/em28xx/em28xx-dvb.c: In function 'em28xx_dvb_init':
  drivers/media/usb/em28xx/em28xx-dvb.c:2062:1: error: the frame size of 3256 bytes is larger than 204 bytes [-Werror=frame-larger-than=]

Analyzing this problem led to the discovery that gcc fails to merge the
stack slots for the i2c_board_info[] structures after we strlcpy() into
them, due to the 'noreturn' attribute on the source string length check.

I reported this as a gcc bug, but it is unlikely to get fixed for gcc-8,
since it is relatively easy to work around, and it gets triggered
rarely.  An earlier workaround I did added an empty inline assembly
statement before the call to fortify_panic(), which works surprisingly
well, but is really ugly and unintuitive.

This is a new approach to the same problem, this time addressing it by
not calling the 'extern __real_strnlen()' function for string constants
where __builtin_strlen() is a compile-time constant and therefore known
to be safe.

We do this by checking if the last character in the string is a
compile-time constant '\0'.  If it is, we can assume that strlen() of
the string is also constant.

As a side-effect, this should also improve the object code output for
any other call of strlen() on a string constant.

[akpm@linux-foundation.org: add comment]
Link: http://lkml.kernel.org/r/20171205215143.3085755-1-arnd@arndb.de
Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82365
Link: https://patchwork.kernel.org/patch/9980413/
Link: https://patchwork.kernel.org/patch/9974047/
Fixes: 6974f0c4555 ("include/linux/string.h: add the option of fortified string.h functions")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Daniel Micay <danielmicay@gmail.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Martin Wilck <mwilck@suse.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/string.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/string.h b/include/linux/string.h
index 410ecf17de3c..cfd83eb2f926 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -259,7 +259,10 @@ __FORTIFY_INLINE __kernel_size_t strlen(const char *p)
 {
 	__kernel_size_t ret;
 	size_t p_size = __builtin_object_size(p, 0);
-	if (p_size == (size_t)-1)
+
+	/* Work around gcc excess stack consumption issue */
+	if (p_size == (size_t)-1 ||
+	    (__builtin_constant_p(p[p_size - 1]) && p[p_size - 1] == '\0'))
 		return __builtin_strlen(p);
 	ret = strnlen(p, p_size);
 	if (p_size <= ret)
-- 
cgit v1.2.3


From 3756f6401c302617c5e091081ca4d26ab604bec5 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 14 Dec 2017 15:32:41 -0800
Subject: exec: avoid gcc-8 warning for get_task_comm

gcc-8 warns about using strncpy() with the source size as the limit:

  fs/exec.c:1223:32: error: argument to 'sizeof' in 'strncpy' call is the same expression as the source; did you mean to use the size of the destination? [-Werror=sizeof-pointer-memaccess]

This is indeed slightly suspicious, as it protects us from source
arguments without NUL-termination, but does not guarantee that the
destination is terminated.

This keeps the strncpy() to ensure we have properly padded target
buffer, but ensures that we use the correct length, by passing the
actual length of the destination buffer as well as adding a build-time
check to ensure it is exactly TASK_COMM_LEN.

There are only 23 callsites which I all reviewed to ensure this is
currently the case.  We could get away with doing only the check or
passing the right length, but it doesn't hurt to do both.

Link: http://lkml.kernel.org/r/20171205151724.1764896-1-arnd@arndb.de
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Suggested-by: Kees Cook <keescook@chromium.org>
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Serge Hallyn <serge@hallyn.com>
Cc: James Morris <james.l.morris@oracle.com>
Cc: Aleksa Sarai <asarai@suse.de>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c             | 7 +++----
 include/linux/sched.h | 6 +++++-
 2 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/fs/exec.c b/fs/exec.c
index 6be2aa0ab26f..156f56acfe8e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1216,15 +1216,14 @@ killed:
 	return -EAGAIN;
 }
 
-char *get_task_comm(char *buf, struct task_struct *tsk)
+char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk)
 {
-	/* buf must be at least sizeof(tsk->comm) in size */
 	task_lock(tsk);
-	strncpy(buf, tsk->comm, sizeof(tsk->comm));
+	strncpy(buf, tsk->comm, buf_size);
 	task_unlock(tsk);
 	return buf;
 }
-EXPORT_SYMBOL_GPL(get_task_comm);
+EXPORT_SYMBOL_GPL(__get_task_comm);
 
 /*
  * These functions flushes out all traces of the currently running executable
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 21991d668d35..5124ba709830 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1503,7 +1503,11 @@ static inline void set_task_comm(struct task_struct *tsk, const char *from)
 	__set_task_comm(tsk, from, false);
 }
 
-extern char *get_task_comm(char *to, struct task_struct *tsk);
+extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk);
+#define get_task_comm(buf, tsk) ({			\
+	BUILD_BUG_ON(sizeof(buf) != TASK_COMM_LEN);	\
+	__get_task_comm(buf, sizeof(buf), tsk);		\
+})
 
 #ifdef CONFIG_SMP
 void scheduler_ipi(void);
-- 
cgit v1.2.3


From bdcf0a423ea1c40bbb40e7ee483b50fc8aa3d758 Mon Sep 17 00:00:00 2001
From: Thiago Rafael Becker <thiago.becker@gmail.com>
Date: Thu, 14 Dec 2017 15:33:12 -0800
Subject: kernel: make groups_sort calling a responsibility group_info
 allocators

In testing, we found that nfsd threads may call set_groups in parallel
for the same entry cached in auth.unix.gid, racing in the call of
groups_sort, corrupting the groups for that entry and leading to
permission denials for the client.

This patch:
 - Make groups_sort globally visible.
 - Move the call to groups_sort to the modifiers of group_info
 - Remove the call to groups_sort from set_groups

Link: http://lkml.kernel.org/r/20171211151420.18655-1-thiago.becker@gmail.com
Signed-off-by: Thiago Rafael Becker <thiago.becker@gmail.com>
Reviewed-by: Matthew Wilcox <mawilcox@microsoft.com>
Reviewed-by: NeilBrown <neilb@suse.com>
Acked-by: "J. Bruce Fields" <bfields@fieldses.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/s390/kernel/compat_linux.c   | 1 +
 fs/nfsd/auth.c                    | 3 +++
 include/linux/cred.h              | 1 +
 kernel/groups.c                   | 5 +++--
 kernel/uid16.c                    | 1 +
 net/sunrpc/auth_gss/gss_rpc_xdr.c | 1 +
 net/sunrpc/auth_gss/svcauth_gss.c | 1 +
 net/sunrpc/svcauth_unix.c         | 2 ++
 8 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c
index f04db3779b34..59eea9c65d3e 100644
--- a/arch/s390/kernel/compat_linux.c
+++ b/arch/s390/kernel/compat_linux.c
@@ -263,6 +263,7 @@ COMPAT_SYSCALL_DEFINE2(s390_setgroups16, int, gidsetsize, u16 __user *, grouplis
 		return retval;
 	}
 
+	groups_sort(group_info);
 	retval = set_current_groups(group_info);
 	put_group_info(group_info);
 
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 697f8ae7792d..f650e475d8f0 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -60,6 +60,9 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
 				gi->gid[i] = exp->ex_anon_gid;
 			else
 				gi->gid[i] = rqgi->gid[i];
+
+			/* Each thread allocates its own gi, no race */
+			groups_sort(gi);
 		}
 	} else {
 		gi = get_group_info(rqgi);
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 099058e1178b..631286535d0f 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -83,6 +83,7 @@ extern int set_current_groups(struct group_info *);
 extern void set_groups(struct cred *, struct group_info *);
 extern int groups_search(const struct group_info *, kgid_t);
 extern bool may_setgroups(void);
+extern void groups_sort(struct group_info *);
 
 /*
  * The security context of a task
diff --git a/kernel/groups.c b/kernel/groups.c
index e357bc800111..daae2f2dc6d4 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -86,11 +86,12 @@ static int gid_cmp(const void *_a, const void *_b)
 	return gid_gt(a, b) - gid_lt(a, b);
 }
 
-static void groups_sort(struct group_info *group_info)
+void groups_sort(struct group_info *group_info)
 {
 	sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid),
 	     gid_cmp, NULL);
 }
+EXPORT_SYMBOL(groups_sort);
 
 /* a simple bsearch */
 int groups_search(const struct group_info *group_info, kgid_t grp)
@@ -122,7 +123,6 @@ int groups_search(const struct group_info *group_info, kgid_t grp)
 void set_groups(struct cred *new, struct group_info *group_info)
 {
 	put_group_info(new->group_info);
-	groups_sort(group_info);
 	get_group_info(group_info);
 	new->group_info = group_info;
 }
@@ -206,6 +206,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
 		return retval;
 	}
 
+	groups_sort(group_info);
 	retval = set_current_groups(group_info);
 	put_group_info(group_info);
 
diff --git a/kernel/uid16.c b/kernel/uid16.c
index ce74a4901d2b..ef1da2a5f9bd 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -192,6 +192,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
 		return retval;
 	}
 
+	groups_sort(group_info);
 	retval = set_current_groups(group_info);
 	put_group_info(group_info);
 
diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c
index c4778cae58ef..444380f968f1 100644
--- a/net/sunrpc/auth_gss/gss_rpc_xdr.c
+++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c
@@ -231,6 +231,7 @@ static int gssx_dec_linux_creds(struct xdr_stream *xdr,
 			goto out_free_groups;
 		creds->cr_group_info->gid[i] = kgid;
 	}
+	groups_sort(creds->cr_group_info);
 
 	return 0;
 out_free_groups:
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 5dd4e6c9fef2..26531193fce4 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -481,6 +481,7 @@ static int rsc_parse(struct cache_detail *cd,
 				goto out;
 			rsci.cred.cr_group_info->gid[i] = kgid;
 		}
+		groups_sort(rsci.cred.cr_group_info);
 
 		/* mech name */
 		len = qword_get(&mesg, buf, mlen);
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 740b67d5a733..af7f28fb8102 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -520,6 +520,7 @@ static int unix_gid_parse(struct cache_detail *cd,
 		ug.gi->gid[i] = kgid;
 	}
 
+	groups_sort(ug.gi);
 	ugp = unix_gid_lookup(cd, uid);
 	if (ugp) {
 		struct cache_head *ch;
@@ -819,6 +820,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp)
 		kgid_t kgid = make_kgid(&init_user_ns, svc_getnl(argv));
 		cred->cr_group_info->gid[i] = kgid;
 	}
+	groups_sort(cred->cr_group_info);
 	if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) {
 		*authp = rpc_autherr_badverf;
 		return SVC_DENIED;
-- 
cgit v1.2.3


From 4837fe37adff1d159904f0c013471b1ecbcb455e Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Thu, 14 Dec 2017 15:33:15 -0800
Subject: mm, oom_reaper: fix memory corruption

David Rientjes has reported the following memory corruption while the
oom reaper tries to unmap the victims address space

  BUG: Bad page map in process oom_reaper  pte:6353826300000000 pmd:00000000
  addr:00007f50cab1d000 vm_flags:08100073 anon_vma:ffff9eea335603f0 mapping:          (null) index:7f50cab1d
  file:          (null) fault:          (null) mmap:          (null) readpage:          (null)
  CPU: 2 PID: 1001 Comm: oom_reaper
  Call Trace:
     unmap_page_range+0x1068/0x1130
     __oom_reap_task_mm+0xd5/0x16b
     oom_reaper+0xff/0x14c
     kthread+0xc1/0xe0

Tetsuo Handa has noticed that the synchronization inside exit_mmap is
insufficient.  We only synchronize with the oom reaper if
tsk_is_oom_victim which is not true if the final __mmput is called from
a different context than the oom victim exit path.  This can trivially
happen from context of any task which has grabbed mm reference (e.g.  to
read /proc/<pid>/ file which requires mm etc.).

The race would look like this

  oom_reaper		oom_victim		task
						mmget_not_zero
			do_exit
			  mmput
  __oom_reap_task_mm				mmput
  						  __mmput
						    exit_mmap
						      remove_vma
    unmap_page_range

Fix this issue by providing a new mm_is_oom_victim() helper which
operates on the mm struct rather than a task.  Any context which
operates on a remote mm struct should use this helper in place of
tsk_is_oom_victim.  The flag is set in mark_oom_victim and never cleared
so it is stable in the exit_mmap path.

Debugged by Tetsuo Handa.

Link: http://lkml.kernel.org/r/20171210095130.17110-1-mhocko@kernel.org
Fixes: 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently")
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: David Rientjes <rientjes@google.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Andrea Argangeli <andrea@kernel.org>
Cc: <stable@vger.kernel.org>	[4.14]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/oom.h            |  9 +++++++++
 include/linux/sched/coredump.h |  1 +
 mm/mmap.c                      | 10 +++++-----
 mm/oom_kill.c                  |  4 +++-
 4 files changed, 18 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 01c91d874a57..5bad038ac012 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -66,6 +66,15 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk)
 	return tsk->signal->oom_mm;
 }
 
+/*
+ * Use this helper if tsk->mm != mm and the victim mm needs a special
+ * handling. This is guaranteed to stay true after once set.
+ */
+static inline bool mm_is_oom_victim(struct mm_struct *mm)
+{
+	return test_bit(MMF_OOM_VICTIM, &mm->flags);
+}
+
 /*
  * Checks whether a page fault on the given mm is still reliable.
  * This is no longer true if the oom reaper started to reap the
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index 9c8847395b5e..ec912d01126f 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -70,6 +70,7 @@ static inline int get_dumpable(struct mm_struct *mm)
 #define MMF_UNSTABLE		22	/* mm is unstable for copy_from_user */
 #define MMF_HUGE_ZERO_PAGE	23      /* mm has ever used the global huge zero page */
 #define MMF_DISABLE_THP		24	/* disable THP for all VMAs */
+#define MMF_OOM_VICTIM		25	/* mm is the oom victim */
 #define MMF_DISABLE_THP_MASK	(1 << MMF_DISABLE_THP)
 
 #define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
diff --git a/mm/mmap.c b/mm/mmap.c
index a4d546821214..9efdc021ad22 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3019,20 +3019,20 @@ void exit_mmap(struct mm_struct *mm)
 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
 	unmap_vmas(&tlb, vma, 0, -1);
 
-	set_bit(MMF_OOM_SKIP, &mm->flags);
-	if (unlikely(tsk_is_oom_victim(current))) {
+	if (unlikely(mm_is_oom_victim(mm))) {
 		/*
 		 * Wait for oom_reap_task() to stop working on this
 		 * mm. Because MMF_OOM_SKIP is already set before
 		 * calling down_read(), oom_reap_task() will not run
 		 * on this "mm" post up_write().
 		 *
-		 * tsk_is_oom_victim() cannot be set from under us
-		 * either because current->mm is already set to NULL
+		 * mm_is_oom_victim() cannot be set from under us
+		 * either because victim->mm is already set to NULL
 		 * under task_lock before calling mmput and oom_mm is
-		 * set not NULL by the OOM killer only if current->mm
+		 * set not NULL by the OOM killer only if victim->mm
 		 * is found not NULL while holding the task_lock.
 		 */
+		set_bit(MMF_OOM_SKIP, &mm->flags);
 		down_write(&mm->mmap_sem);
 		up_write(&mm->mmap_sem);
 	}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index c957be32b27a..29f855551efe 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -683,8 +683,10 @@ static void mark_oom_victim(struct task_struct *tsk)
 		return;
 
 	/* oom_mm is bound to the signal struct life time. */
-	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
+	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
 		mmgrab(tsk->signal->oom_mm);
+		set_bit(MMF_OOM_VICTIM, &mm->flags);
+	}
 
 	/*
 	 * Make sure that the task is woken up from uninterruptible sleep
-- 
cgit v1.2.3


From 7a4fa29106d9a38ef005f5ab15d493c259f269c0 Mon Sep 17 00:00:00 2001
From: Yuval Mintz <yuvalm@mellanox.com>
Date: Thu, 14 Dec 2017 15:54:29 +0200
Subject: net: sched: Add TCA_HW_OFFLOAD

Qdiscs can be offloaded to HW, but current implementation isn't uniform.
Instead, qdiscs either pass information about offload status via their
TCA_OPTIONS or omit it altogether.

Introduce a new attribute - TCA_HW_OFFLOAD that would form a uniform
uAPI for the offloading status of qdiscs.

Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h      | 1 +
 include/uapi/linux/rtnetlink.h | 1 +
 net/sched/sch_api.c            | 2 ++
 3 files changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 65d0d25f2648..83a3e47d5845 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -71,6 +71,7 @@ struct Qdisc {
 				      * qdisc_tree_decrease_qlen() should stop.
 				      */
 #define TCQ_F_INVISIBLE		0x80 /* invisible by default in dump */
+#define TCQ_F_OFFLOADED		0x200 /* qdisc is offloaded to HW */
 	u32			limit;
 	const struct Qdisc_ops	*ops;
 	struct qdisc_size_table	__rcu *stab;
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index d8b5f80c2ea6..843e29aa3cac 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -557,6 +557,7 @@ enum {
 	TCA_PAD,
 	TCA_DUMP_INVISIBLE,
 	TCA_CHAIN,
+	TCA_HW_OFFLOAD,
 	__TCA_MAX
 };
 
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index b6c4f536876b..0f1eab99ff4e 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -795,6 +795,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 	tcm->tcm_info = refcount_read(&q->refcnt);
 	if (nla_put_string(skb, TCA_KIND, q->ops->id))
 		goto nla_put_failure;
+	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
+		goto nla_put_failure;
 	if (q->ops->dump && q->ops->dump(q, skb) < 0)
 		goto nla_put_failure;
 	qlen = q->q.qlen;
-- 
cgit v1.2.3


From 4a98795bc8ea148b1ebbbf001283e06430cffe36 Mon Sep 17 00:00:00 2001
From: Yuval Mintz <yuvalm@mellanox.com>
Date: Thu, 14 Dec 2017 15:54:31 +0200
Subject: pkt_sched: Remove TC_RED_OFFLOADED from uapi

Following the previous patch, RED is now using the new uniform uapi
for indicating it's offloaded. As a result, TC_RED_OFFLOADED is no
longer utilized by kernel and can be removed [as it's still not
part of any stable release].

Fixes: 602f3baf2218 ("net_sch: red: Add offload ability to RED qdisc")
Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index af3cc2f4e1ad..37b5096ae97b 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -256,7 +256,6 @@ struct tc_red_qopt {
 #define TC_RED_ECN		1
 #define TC_RED_HARDDROP		2
 #define TC_RED_ADAPTATIVE	4
-#define TC_RED_OFFLOADED	8
 };
 
 struct tc_red_xstats {
-- 
cgit v1.2.3


From 1784f9144b143a1e8b19fe94083b040aa559182b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Tue, 5 Dec 2017 14:14:47 +0100
Subject: drivers/misc/intel/pti: Rename the header file to free up the
 namespace

We'd like to use the 'PTI' acronym for 'Page Table Isolation' - free up the
namespace by renaming the <linux/pti.h> driver header to <linux/intel-pti.h>.

(Also standardize the header guard name while at it.)

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: J Freyensee <james_p_freyensee@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/misc/pti.c        |  2 +-
 include/linux/intel-pti.h | 43 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/pti.h       | 43 -------------------------------------------
 3 files changed, 44 insertions(+), 44 deletions(-)
 create mode 100644 include/linux/intel-pti.h
 delete mode 100644 include/linux/pti.h

(limited to 'include')

diff --git a/drivers/misc/pti.c b/drivers/misc/pti.c
index eda38cbe8530..41f2a9f6851d 100644
--- a/drivers/misc/pti.c
+++ b/drivers/misc/pti.c
@@ -32,7 +32,7 @@
 #include <linux/pci.h>
 #include <linux/mutex.h>
 #include <linux/miscdevice.h>
-#include <linux/pti.h>
+#include <linux/intel-pti.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
 
diff --git a/include/linux/intel-pti.h b/include/linux/intel-pti.h
new file mode 100644
index 000000000000..2710d72de3c9
--- /dev/null
+++ b/include/linux/intel-pti.h
@@ -0,0 +1,43 @@
+/*
+ *  Copyright (C) Intel 2011
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * The PTI (Parallel Trace Interface) driver directs trace data routed from
+ * various parts in the system out through the Intel Penwell PTI port and
+ * out of the mobile device for analysis with a debugging tool
+ * (Lauterbach, Fido). This is part of a solution for the MIPI P1149.7,
+ * compact JTAG, standard.
+ *
+ * This header file will allow other parts of the OS to use the
+ * interface to write out it's contents for debugging a mobile system.
+ */
+
+#ifndef LINUX_INTEL_PTI_H_
+#define LINUX_INTEL_PTI_H_
+
+/* offset for last dword of any PTI message. Part of MIPI P1149.7 */
+#define PTI_LASTDWORD_DTS	0x30
+
+/* basic structure used as a write address to the PTI HW */
+struct pti_masterchannel {
+	u8 master;
+	u8 channel;
+};
+
+/* the following functions are defined in misc/pti.c */
+void pti_writedata(struct pti_masterchannel *mc, u8 *buf, int count);
+struct pti_masterchannel *pti_request_masterchannel(u8 type,
+						    const char *thread_name);
+void pti_release_masterchannel(struct pti_masterchannel *mc);
+
+#endif /* LINUX_INTEL_PTI_H_ */
diff --git a/include/linux/pti.h b/include/linux/pti.h
deleted file mode 100644
index b3ea01a3197e..000000000000
--- a/include/linux/pti.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- *  Copyright (C) Intel 2011
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- *
- * The PTI (Parallel Trace Interface) driver directs trace data routed from
- * various parts in the system out through the Intel Penwell PTI port and
- * out of the mobile device for analysis with a debugging tool
- * (Lauterbach, Fido). This is part of a solution for the MIPI P1149.7,
- * compact JTAG, standard.
- *
- * This header file will allow other parts of the OS to use the
- * interface to write out it's contents for debugging a mobile system.
- */
-
-#ifndef PTI_H_
-#define PTI_H_
-
-/* offset for last dword of any PTI message. Part of MIPI P1149.7 */
-#define PTI_LASTDWORD_DTS	0x30
-
-/* basic structure used as a write address to the PTI HW */
-struct pti_masterchannel {
-	u8 master;
-	u8 channel;
-};
-
-/* the following functions are defined in misc/pti.c */
-void pti_writedata(struct pti_masterchannel *mc, u8 *buf, int count);
-struct pti_masterchannel *pti_request_masterchannel(u8 type,
-						    const char *thread_name);
-void pti_release_masterchannel(struct pti_masterchannel *mc);
-
-#endif /*PTI_H_*/
-- 
cgit v1.2.3