From b1b38278e12b04cf9a227f6af2c24651cf6e8a85 Mon Sep 17 00:00:00 2001
From: David Weinehall
Date: Wed, 20 May 2015 17:00:13 +0300
Subject: drm/i915: add a context parameter to {en, dis}able zero address mapping

Export a new context parameter that can be set/queried through the
context_{get,set}param ioctls. This parameter is passed as a context
flag and decides whether or not a GPU address mapping is allowed to be
made at address zero. The default is to allow such mappings.

Signed-off-by: David Weinehall
Acked-by: "Zou, Nanhai"
Signed-off-by: Daniel Vetter
---
 include/uapi/drm/i915_drm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 6e1a2ed116cb..92d61a7c942a 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1106,6 +1106,7 @@ struct drm_i915_gem_context_param {
 	__u32 size;
 	__u64 param;
 #define I915_CONTEXT_PARAM_BAN_PERIOD 0x1
+#define I915_CONTEXT_PARAM_NO_ZEROMAP 0x2
 	__u64 value;
 };
-- cgit v1.2.3

From 49e4d842f0d0892c3d26c93a81b9f22c1467030e Mon Sep 17 00:00:00 2001
From: Chris Wilson
Date: Mon, 15 Jun 2015 12:23:48 +0100
Subject: drm/i915: Report to userspace if we have a (presumed) working GPU reset

In igt, we want to test handling of GPU hangs, both for recovery
purposes and for reporting. However, we don't want to inject a genuine
GPU hang onto a machine that cannot recover and so be permanently
wedged. Rather than embed heuristics into igt, have the kernel report
exactly when it expects the GPU reset to work.

This can also be usefully extended in future to indicate different
levels of fine-grained resets.

Signed-off-by: Chris Wilson
Cc: Daniel Vetter
Cc: Tim Gore
Cc: Tomas Elf
Signed-off-by: Daniel Vetter
---
 drivers/gpu/drm/i915/i915_dma.c     |  5 +++++
 drivers/gpu/drm/i915/i915_drv.h     |  1 +
 drivers/gpu/drm/i915/intel_uncore.c | 28 ++++++++++++++++++++++------
 include/uapi/drm/i915_drm.h         |  1 +
 4 files changed, 29 insertions(+), 6 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 34248635c36c..88795d2f1819 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -163,6 +163,11 @@ static int i915_getparam(struct drm_device *dev, void *data,
 		if (!value)
 			return -ENODEV;
 		break;
+	case I915_PARAM_HAS_GPU_RESET:
+		value = i915.enable_hangcheck &&
+			i915.reset &&
+			intel_has_gpu_reset(dev);
+		break;
 	default:
 		DRM_DEBUG("Unknown parameter %d\n", param->param);
 		return -EINVAL;
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 971fc330cc35..44efb4deccb5 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2594,6 +2594,7 @@ extern long i915_compat_ioctl(struct file *filp, unsigned int cmd,
 			      unsigned long arg);
 #endif
 extern int intel_gpu_reset(struct drm_device *dev);
+extern bool intel_has_gpu_reset(struct drm_device *dev);
 extern int i915_reset(struct drm_device *dev);
 extern unsigned long i915_chipset_val(struct drm_i915_private *dev_priv);
 extern unsigned long i915_mch_val(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index a6d8a3ee7750..4a86cf007aa0 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -1455,20 +1455,36 @@ static int gen6_do_reset(struct drm_device *dev)
 	return ret;
 }
 
-int intel_gpu_reset(struct drm_device *dev)
+static int (*intel_get_gpu_reset(struct drm_device *dev))(struct drm_device *)
 {
 	if (INTEL_INFO(dev)->gen >= 6)
-		return gen6_do_reset(dev);
+		return gen6_do_reset;
 	else if (IS_GEN5(dev))
-		return ironlake_do_reset(dev);
+		return ironlake_do_reset;
 	else if (IS_G4X(dev))
-		return g4x_do_reset(dev);
+		return g4x_do_reset;
 	else if (IS_G33(dev))
-		return g33_do_reset(dev);
+		return g33_do_reset;
 	else if (INTEL_INFO(dev)->gen >= 3)
-		return i915_do_reset(dev);
+		return i915_do_reset;
 	else
+		return NULL;
+}
+
+int intel_gpu_reset(struct drm_device *dev)
+{
+	int (*reset)(struct drm_device *);
+
+	reset = intel_get_gpu_reset(dev);
+	if (reset == NULL)
 		return -ENODEV;
+
+	return reset(dev);
+}
+
+bool intel_has_gpu_reset(struct drm_device *dev)
+{
+	return intel_get_gpu_reset(dev) != NULL;
 }
 
 void intel_uncore_check_errors(struct drm_device *dev)
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 92d61a7c942a..f88cc1cac5d9 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -354,6 +354,7 @@ typedef struct drm_i915_irq_wait {
 #define I915_PARAM_REVISION		 32
 #define I915_PARAM_SUBSLICE_TOTAL	 33
 #define I915_PARAM_EU_TOTAL		 34
+#define I915_PARAM_HAS_GPU_RESET	 35
 
 typedef struct drm_i915_getparam {
 	int param;
-- cgit v1.2.3

From a9ed33ca075f712cc7fd96eb84e3d322012fcaaf Mon Sep 17 00:00:00 2001
From: Abdiel Janulgue
Date: Wed, 1 Jul 2015 10:12:23 +0300
Subject: drm/i915: Expose I915_EXEC_RESOURCE_STREAMER flag and getparam

Ensures that the batch buffer is executed by the resource streamer,
and lets userspace know whether the resource streamer is supported by
the kernel.

v2: Don't skip 1<<15 for the exec flags (Jani Nikula)
v3: Use HAS_RESOURCE_STREAMER macro for execbuf validation (Chris Wilson)

(from getparam patch)

v2: Update I915_PARAM_HAS_RESOURCE_STREAMER so it's after
    I915_PARAM_HAS_GPU_RESET.
v3: Only advertise RS support for hardware that supports it.
v4: Add HAS_RESOURCE_STREAMER() macro (Chris)

Testcase: igt/gem_exec_params
Cc: Jani Nikula
Cc: Kenneth Graunke
Cc: Chris Wilson
Reviewed-by: Chris Wilson
Signed-off-by: Abdiel Janulgue
[danvet: squash in getparam patch since it'd break bisect, suggested by
Chris.]
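A minimal userspace sketch of how the two new pieces fit together
(assuming libdrm's drmIoctl() and an open i915 DRM fd; the helper
names are illustrative, and error handling is trimmed):

#include <stdbool.h>
#include <xf86drm.h>
#include <i915_drm.h>

static bool has_resource_streamer(int fd)
{
	int val = 0;
	drm_i915_getparam_t gp = {
		.param = I915_PARAM_HAS_RESOURCE_STREAMER,
		.value = &val,
	};

	/* Older kernels reject the unknown param, leaving val at 0. */
	return drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp) == 0 && val;
}

static void request_rs(struct drm_i915_gem_execbuffer2 *execbuf, int fd)
{
	/* The kernel only accepts the flag on the render ring (RCS). */
	if (has_resource_streamer(fd))
		execbuf->flags |= I915_EXEC_RENDER | I915_EXEC_RESOURCE_STREAMER;
}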
Signed-off-by: Daniel Vetter
---
 drivers/gpu/drm/i915/i915_dma.c            |  3 +++
 drivers/gpu/drm/i915/i915_drv.h            |  3 +++
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 14 ++++++++++++++
 include/uapi/drm/i915_drm.h                |  8 +++++++-
 4 files changed, 27 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index c5349fa3fcce..a42f16592433 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -167,6 +167,9 @@ static int i915_getparam(struct drm_device *dev, void *data,
 		value = i915.enable_hangcheck &&
 			intel_has_gpu_reset(dev);
 		break;
+	case I915_PARAM_HAS_RESOURCE_STREAMER:
+		value = HAS_RESOURCE_STREAMER(dev);
+		break;
 	default:
 		DRM_DEBUG("Unknown parameter %d\n", param->param);
 		return -EINVAL;
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 950a9811a16f..63daf4cd2477 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2539,6 +2539,9 @@ struct drm_i915_cmd_table {
 
 #define HAS_CSR(dev)	(IS_SKYLAKE(dev))
 
+#define HAS_RESOURCE_STREAMER(dev) (IS_HASWELL(dev) || \
+				    INTEL_INFO(dev)->gen >= 8)
+
 #define INTEL_PCH_DEVICE_ID_MASK		0xff00
 #define INTEL_PCH_IBX_DEVICE_ID_TYPE		0x3b00
 #define INTEL_PCH_CPT_DEVICE_ID_TYPE		0x1c00
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 600db7441847..83577c615962 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1490,6 +1490,20 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 		return -EINVAL;
 	}
 
+	if (args->flags & I915_EXEC_RESOURCE_STREAMER) {
+		if (!HAS_RESOURCE_STREAMER(dev)) {
+			DRM_DEBUG("RS is only allowed for Haswell, Gen8 and above\n");
+			return -EINVAL;
+		}
+		if (ring->id != RCS) {
+			DRM_DEBUG("RS is not available on %s\n",
+				  ring->name);
+			return -EINVAL;
+		}
+
+		dispatch_flags |= I915_DISPATCH_RS;
+	}
+
 	intel_runtime_pm_get(dev_priv);
 
 	ret = i915_mutex_lock_interruptible(dev);
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index f88cc1cac5d9..e7c29f1659ad 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -355,6 +355,7 @@ typedef struct drm_i915_irq_wait {
 #define I915_PARAM_SUBSLICE_TOTAL	 33
 #define I915_PARAM_EU_TOTAL		 34
 #define I915_PARAM_HAS_GPU_RESET	 35
+#define I915_PARAM_HAS_RESOURCE_STREAMER 36
 
 typedef struct drm_i915_getparam {
 	int param;
@@ -765,7 +766,12 @@ struct drm_i915_gem_execbuffer2 {
 #define I915_EXEC_BSD_RING1		(1<<13)
 #define I915_EXEC_BSD_RING2		(2<<13)
 
-#define __I915_EXEC_UNKNOWN_FLAGS -(1<<15)
+/** Tell the kernel that the batchbuffer is processed by
+ *  the resource streamer.
+ */
+#define I915_EXEC_RESOURCE_STREAMER	(1<<15)
+
+#define __I915_EXEC_UNKNOWN_FLAGS -(I915_EXEC_RESOURCE_STREAMER<<1)
 
 #define I915_EXEC_CONTEXT_ID_MASK	(0xffffffff)
 #define i915_execbuffer2_set_context_id(eb2, context) \
-- cgit v1.2.3

From 9aee1ae3312daf0de4c9c614680d06d557133317 Mon Sep 17 00:00:00 2001
From: Joe Perches
Date: Sat, 16 May 2015 15:11:40 -0300
Subject: [media] media: uapi: vsp1: Use __u32 instead of u32

Don't use the kernel types in uapi headers.
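A quick sketch of why this matters: userspace compiles uapi headers
against <linux/types.h>, which only provides the double-underscore
types, so the kernel-internal u32 breaks the build outside the kernel
(the struct below is the one from the patch):

#include <linux/types.h>	/* defines __u32 for userspace builds */

/* the exported struct as userspace sees it after the fix */
struct vsp1_lut_config {
	__u32 lut[256];		/* 'u32 lut[256]' would not compile here */
};

int main(void)
{
	struct vsp1_lut_config config = { .lut = { 0 } };

	return (int)(config.lut[0] & 0xffu);
}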
Signed-off-by: Joe Perches
Signed-off-by: Laurent Pinchart
Signed-off-by: Mauro Carvalho Chehab
---
 include/uapi/linux/vsp1.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/vsp1.h b/include/uapi/linux/vsp1.h
index e18858f6e865..9a823696d816 100644
--- a/include/uapi/linux/vsp1.h
+++ b/include/uapi/linux/vsp1.h
@@ -28,7 +28,7 @@
 	_IOWR('V', BASE_VIDIOC_PRIVATE + 1, struct vsp1_lut_config)
 
 struct vsp1_lut_config {
-	u32 lut[256];
+	__u32 lut[256];
 };
 
 #endif	/* __VSP1_USER_H__ */
-- cgit v1.2.3

From 1055e0687f581cf988bb2239a00d1396e18ef114 Mon Sep 17 00:00:00 2001
From: Chad Versace
Date: Thu, 9 Jul 2015 01:38:42 -0700
Subject: drm/fourcc: Add formats R8, RG88, GR88
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Kodi/XBMC developers want to transcode NV12 to RGB with OpenGL
shaders, importing the two source planes through
EGL_EXT_image_dma_buf_import. That requires importing the Y plane as an
R8 EGLImage and the UV plane as either an RG88 or GR88 EGLImage.

CC: Peter Frühberger
Cc: Rainer Hochecker
Cc: Benjamin Widawsky
Reviewed-by: Pekka Paalanen
Signed-off-by: Chad Versace
---
 include/uapi/drm/drm_fourcc.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h
index 2f295cde657e..8c5e8b91a3cb 100644
--- a/include/uapi/drm/drm_fourcc.h
+++ b/include/uapi/drm/drm_fourcc.h
@@ -34,6 +34,13 @@
 /* color index */
 #define DRM_FORMAT_C8		fourcc_code('C', '8', ' ', ' ') /* [7:0] C */
 
+/* 8 bpp Red */
+#define DRM_FORMAT_R8		fourcc_code('R', '8', ' ', ' ') /* [7:0] R */
+
+/* 16 bpp RG */
+#define DRM_FORMAT_RG88		fourcc_code('R', 'G', '8', '8') /* [15:0] R:G 8:8 little endian */
+#define DRM_FORMAT_GR88		fourcc_code('G', 'R', '8', '8') /* [15:0] G:R 8:8 little endian */
+
 /* 8 bpp RGB */
 #define DRM_FORMAT_RGB332	fourcc_code('R', 'G', 'B', '8') /* [7:0] R:G:B 3:3:2 */
 #define DRM_FORMAT_BGR233	fourcc_code('B', 'G', 'R', '8') /* [7:0] B:G:R 2:3:3 */
-- cgit v1.2.3

From 74fe61f17e999a458d5f64ca2aa9a0282ca32198 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov
Date: Fri, 10 Jul 2015 08:02:08 -0700
Subject: bridge: mdb: add vlan support for user entries

Until now all user mdb entries were added in vlan 0; this patch adds
support for specifying the vlan of the entry. As for the uapi change,
a hole in struct br_mdb_entry is used, so the size and offsets are
kept the same (verified with pahole and tested with older iproute2).

Example:
$ bridge mdb
dev br0 port eth1 grp 239.0.0.1 permanent vlan 2000
dev br0 port eth1 grp 239.0.0.1 permanent vlan 200
dev br0 port eth1 grp 239.0.0.1 permanent

Signed-off-by: Nikolay Aleksandrov
Signed-off-by: David S. Miller
---
 include/uapi/linux/if_bridge.h | 1 +
 net/bridge/br_mdb.c            | 6 ++++++
 2 files changed, 7 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index eaaea6208b42..3635b7797508 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -182,6 +182,7 @@ struct br_mdb_entry {
 #define MDB_TEMPORARY 0
 #define MDB_PERMANENT 1
 	__u8 state;
+	__u16 vid;
 	struct {
 		union {
 			__be32	ip4;
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 1fb7d076f15c..a8d0e93d43f2 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -85,6 +85,7 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
 			memset(&e, 0, sizeof(e));
 			e.ifindex = port->dev->ifindex;
 			e.state = p->state;
+			e.vid = p->addr.vid;
 			if (p->addr.proto == htons(ETH_P_IP))
 				e.addr.u.ip4 = p->addr.u.ip4;
 #if IS_ENABLED(CONFIG_IPV6)
@@ -242,6 +243,7 @@ void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
 		entry.addr.u.ip6 = group->u.ip6;
 #endif
 	entry.state = state;
+	entry.vid = group->vid;
 	__br_mdb_notify(dev, &entry, type);
 }
 
@@ -264,6 +266,8 @@ static bool is_valid_mdb_entry(struct br_mdb_entry *entry)
 		return false;
 	if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY)
 		return false;
+	if (entry->vid >= VLAN_VID_MASK)
+		return false;
 
 	return true;
 }
@@ -372,6 +376,7 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br,
 	if (!p || p->br != br || p->state == BR_STATE_DISABLED)
 		return -EINVAL;
 
+	ip.vid = entry->vid;
 	ip.proto = entry->addr.proto;
 	if (ip.proto == htons(ETH_P_IP))
 		ip.u.ip4 = entry->addr.u.ip4;
@@ -418,6 +423,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry)
 	if (!netif_running(br->dev) || br->multicast_disabled)
 		return -EINVAL;
 
+	ip.vid = entry->vid;
 	ip.proto = entry->addr.proto;
 	if (ip.proto == htons(ETH_P_IP)) {
 		if (timer_pending(&br->ip4_other_query.timer))
-- cgit v1.2.3

From 346add7834557b5b9628b9bf2387106d42e631d4 Mon Sep 17 00:00:00 2001
From: Daniel Vetter
Date: Tue, 14 Jul 2015 18:07:30 +0200
Subject: drm/i915: Use explicitly fixed type in compat32 structs

I was briefly confused about whether the compat code was needed for
the int, until I noticed the pointer in the original. Also remove the
typedef.

v2: Review from Chris.
- Add comments.
- Also change the int param in the original structure.

Cc: Chris Wilson
Signed-off-by: Daniel Vetter
Reviewed-by: Chris Wilson
Signed-off-by: Daniel Vetter
---
 drivers/gpu/drm/i915/i915_ioc32.c | 13 +++++++++----
 include/uapi/drm/i915_drm.h       |  6 +++++-
 2 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/gpu/drm/i915/i915_ioc32.c b/drivers/gpu/drm/i915/i915_ioc32.c
index 6eec2221b44e..a5a2d5b3f44b 100644
--- a/drivers/gpu/drm/i915/i915_ioc32.c
+++ b/drivers/gpu/drm/i915/i915_ioc32.c
@@ -35,15 +35,20 @@
 #include
 #include "i915_drv.h"
 
-typedef struct drm_i915_getparam32 {
-	int param;
+struct drm_i915_getparam32 {
+	s32 param;
+	/*
+	 * We screwed up the generic ioctl struct here and used a variable-sized
+	 * pointer. Use u32 in the compat struct to match the 32bit pointer
+	 * userspace expects.
+	 */
 	u32 value;
-} drm_i915_getparam32_t;
+};
 
 static int compat_i915_getparam(struct file *file, unsigned int cmd,
 				unsigned long arg)
 {
-	drm_i915_getparam32_t req32;
+	struct drm_i915_getparam32 req32;
 	drm_i915_getparam_t __user *request;
 
 	if (copy_from_user(&req32, (void __user *)arg, sizeof(req32)))
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index e7c29f1659ad..192027b4f031 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -358,7 +358,11 @@ typedef struct drm_i915_irq_wait {
 #define I915_PARAM_HAS_RESOURCE_STREAMER 36
 
 typedef struct drm_i915_getparam {
-	int param;
+	s32 param;
+	/*
+	 * WARNING: Using pointers instead of fixed-size u64 means we need to write
+	 * compat32 code. Don't repeat this mistake.
+	 */
 	int __user *value;
 } drm_i915_getparam_t;
-- cgit v1.2.3

From 13c4a90119d28cfcb6b5bdd820c233b86c2b0237 Mon Sep 17 00:00:00 2001
From: Tycho Andersen
Date: Sat, 13 Jun 2015 09:02:48 -0600
Subject: seccomp: add ptrace options for suspend/resume

This patch is the first step in enabling checkpoint/restore of processes
with seccomp enabled.

One of the things CRIU does while dumping tasks is inject code into them
via ptrace to collect information that is only available to the process
itself. However, if we are in a seccomp mode where these processes are
prohibited from making these syscalls, then what CRIU does kills the task.

This patch adds a new ptrace option, PTRACE_O_SUSPEND_SECCOMP, that enables
a task from the init user namespace which has CAP_SYS_ADMIN and no seccomp
filters to disable (and re-enable) seccomp filters for another task so that
they can be successfully dumped (and restored). We restrict the set of
processes that can disable seccomp through ptrace because although today
ptrace can be used to bypass seccomp, there is some discussion of closing
this loophole in the future and we would like this patch to not depend on
that behavior and be future-proofed for when it is removed.

Note that seccomp can be suspended before any filters are actually
installed; this behavior is useful on criu restore, so that we can suspend
seccomp, restore the filters, unmap our restore code from the restored
process' address space, and then resume the task by detaching and have the
filters resumed as well.

v2 changes:

* require that the tracer have no seccomp filters installed
* drop TIF_NOTSC manipulation from the patch
* change from ptrace command to a ptrace option and use this ptrace option
  as the flag to check. This means that as soon as the tracer
  detaches/dies, seccomp is re-enabled and, as a corollary, one cannot
  disable seccomp across PTRACE_ATTACHs.

v3 changes:

* get rid of various #ifdefs everywhere
* report more sensible errors when PTRACE_O_SUSPEND_SECCOMP is incorrectly
  used

v4 changes:

* get rid of may_suspend_seccomp() in favor of a capable() check in ptrace
  directly

v5 changes:

* check that seccomp is not enabled (or suspended) on the tracer

Signed-off-by: Tycho Andersen
CC: Will Drewry
CC: Roland McGrath
CC: Pavel Emelyanov
CC: Serge E. Hallyn
Acked-by: Oleg Nesterov
Acked-by: Andy Lutomirski
[kees: access seccomp.mode through seccomp_mode() instead]
Signed-off-by: Kees Cook
---
 include/linux/ptrace.h      |  1 +
 include/uapi/linux/ptrace.h |  6 ++++--
 kernel/ptrace.c             | 13 +++++++++++++
 kernel/seccomp.c            |  8 ++++++++
 4 files changed, 26 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 987a73a40ef8..061265f92876 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -34,6 +34,7 @@
 #define PT_TRACE_SECCOMP	PT_EVENT_FLAG(PTRACE_EVENT_SECCOMP)
 
 #define PT_EXITKILL		(PTRACE_O_EXITKILL << PT_OPT_FLAG_SHIFT)
+#define PT_SUSPEND_SECCOMP	(PTRACE_O_SUSPEND_SECCOMP << PT_OPT_FLAG_SHIFT)
 
 /* single stepping state bits (used on ARM and PA-RISC) */
 #define PT_SINGLESTEP_BIT	31
diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
index cf1019e15f5b..a7a697986614 100644
--- a/include/uapi/linux/ptrace.h
+++ b/include/uapi/linux/ptrace.h
@@ -89,9 +89,11 @@ struct ptrace_peeksiginfo_args {
 #define PTRACE_O_TRACESECCOMP	(1 << PTRACE_EVENT_SECCOMP)
 
 /* eventless options */
-#define PTRACE_O_EXITKILL	(1 << 20)
+#define PTRACE_O_EXITKILL		(1 << 20)
+#define PTRACE_O_SUSPEND_SECCOMP	(1 << 21)
 
-#define PTRACE_O_MASK		(0x000000ff | PTRACE_O_EXITKILL)
+#define PTRACE_O_MASK		(\
+	0x000000ff | PTRACE_O_EXITKILL | PTRACE_O_SUSPEND_SECCOMP)
 
 #include
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index c8e0e050a36a..787320de68e0 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -556,6 +556,19 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data)
 	if (data & ~(unsigned long)PTRACE_O_MASK)
 		return -EINVAL;
 
+	if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) {
+		if (!config_enabled(CONFIG_CHECKPOINT_RESTORE) ||
+		    !config_enabled(CONFIG_SECCOMP))
+			return -EINVAL;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (seccomp_mode(&current->seccomp) != SECCOMP_MODE_DISABLED ||
+		    current->ptrace & PT_SUSPEND_SECCOMP)
+			return -EPERM;
+	}
+
 	/* Avoid intermediate state when all opts are cleared */
 	flags = child->ptrace;
 	flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 980fd26da22e..645e42d6fa4d 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -590,6 +590,10 @@ void secure_computing_strict(int this_syscall)
 {
 	int mode = current->seccomp.mode;
 
+	if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
+	    unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
+		return;
+
 	if (mode == 0)
 		return;
 	else if (mode == SECCOMP_MODE_STRICT)
@@ -691,6 +695,10 @@ u32 seccomp_phase1(struct seccomp_data *sd)
 	int this_syscall = sd ? sd->nr :
 		syscall_get_nr(current, task_pt_regs(current));
 
+	if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
+	    unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
+		return SECCOMP_PHASE1_OK;
+
 	switch (mode) {
 	case SECCOMP_MODE_STRICT:
 		__secure_computing_strict(this_syscall);  /* may call do_exit */
-- cgit v1.2.3

From 88d6378bd6c096cb8440face3ae3f33d55a2e6e4 Mon Sep 17 00:00:00 2001
From: Anuradha Karuppiah
Date: Tue, 14 Jul 2015 13:43:20 -0700
Subject: netlink: changes for setting and clearing protodown via netlink.

Signed-off-by: Anuradha Karuppiah
Signed-off-by: Andy Gospodarek
Signed-off-by: Roopa Prabhu
Signed-off-by: Wilson Kok
Signed-off-by: David S. Miller
---
 include/uapi/linux/if_link.h |  1 +
 net/core/rtnetlink.c         | 16 ++++++++++++++--
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 2c7e8e3d3981..24d68b797c59 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -148,6 +148,7 @@ enum {
 	IFLA_PHYS_SWITCH_ID,
 	IFLA_LINK_NETNSID,
 	IFLA_PHYS_PORT_NAME,
+	IFLA_PROTO_DOWN,
 	__IFLA_MAX
 };
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 9e433d58d265..03d61b54aac0 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -896,7 +896,9 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
 	       + rtnl_link_get_size(dev) /* IFLA_LINKINFO */
 	       + rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */
 	       + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */
-	       + nla_total_size(MAX_PHYS_ITEM_ID_LEN); /* IFLA_PHYS_SWITCH_ID */
+	       + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */
+	       + nla_total_size(1); /* IFLA_PROTO_DOWN */
+
 }
 
 static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev)
@@ -1082,7 +1084,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 	    (dev->ifalias &&
 	     nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)) ||
 	    nla_put_u32(skb, IFLA_CARRIER_CHANGES,
-			atomic_read(&dev->carrier_changes)))
+			atomic_read(&dev->carrier_changes)) ||
+	    nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down))
 		goto nla_put_failure;
 
 	if (1) {
@@ -1319,6 +1322,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_CARRIER_CHANGES]	= { .type = NLA_U32 },  /* ignored */
 	[IFLA_PHYS_SWITCH_ID]	= { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
 	[IFLA_LINK_NETNSID]	= { .type = NLA_S32 },
+	[IFLA_PROTO_DOWN]	= { .type = NLA_U8 },
 };
 
 static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -1858,6 +1862,14 @@ static int do_setlink(const struct sk_buff *skb,
 	}
 	err = 0;
 
+	if (tb[IFLA_PROTO_DOWN]) {
+		err = dev_change_proto_down(dev,
+					    nla_get_u8(tb[IFLA_PROTO_DOWN]));
+		if (err)
+			goto errout;
+		status |= DO_SETLINK_NOTIFY;
+	}
+
 errout:
 	if (status & DO_SETLINK_MODIFIED) {
 		if (status & DO_SETLINK_NOTIFY)
-- cgit v1.2.3

From d32d98642de66048f9534a05f3641558e811bbc9 Mon Sep 17 00:00:00 2001
From: Mats Randgaard
Date: Thu, 9 Jul 2015 05:45:47 -0300
Subject: [media] Driver for Toshiba TC358743 HDMI to CSI-2 bridge

The driver is tested on our hardware and all the implemented features
work as expected.
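For context, a hypothetical board-file sketch of how the bridge might
be wired up: the platform-data field names follow
include/media/tc358743.h as used by the driver below, while the I2C
address and the clock/PLL values are made-up placeholders.

#include <linux/i2c.h>
#include <media/tc358743.h>

static struct tc358743_platform_data tc358743_pdata = {
	.refclk_hz = 27000000,	/* the driver accepts 26, 27 or 42 MHz */
	.enable_hdcp = false,
	/* CSI PLL/timing fields are board-specific; placeholder values */
	.pll_prd = 4,
	.pll_fbd = 132,
};

static struct i2c_board_info tc358743_board_info = {
	I2C_BOARD_INFO("tc358743", 0x0f),	/* address depends on wiring */
	.platform_data = &tc358743_pdata,
};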
Missing features: - CEC support - HDCP repeater support - IR support Signed-off-by: Mats Randgaard [hans.verkuil@cisco.com: updated copyright year to 2015] [hans.verkuil@cisco.com: update confusing confctl_mutex comment] Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- MAINTAINERS | 7 + drivers/media/i2c/Kconfig | 9 + drivers/media/i2c/Makefile | 1 + drivers/media/i2c/tc358743.c | 1778 ++++++++++++++++++++++++++++++++++++ drivers/media/i2c/tc358743_regs.h | 681 ++++++++++++++ include/media/tc358743.h | 131 +++ include/uapi/linux/v4l2-controls.h | 4 + 7 files changed, 2611 insertions(+) create mode 100644 drivers/media/i2c/tc358743.c create mode 100644 drivers/media/i2c/tc358743_regs.h create mode 100644 include/media/tc358743.h (limited to 'include/uapi') diff --git a/MAINTAINERS b/MAINTAINERS index fd6078443083..2bb989be111d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10319,6 +10319,13 @@ F: drivers/char/toshiba.c F: include/linux/toshiba.h F: include/uapi/linux/toshiba.h +TOSHIBA TC358743 DRIVER +M: Mats Randgaard +L: linux-media@vger.kernel.org +S: Maintained +F: drivers/media/i2c/tc358743* +F: include/media/tc358743.h + TMIO MMC DRIVER M: Ian Molton L: linux-mmc@vger.kernel.org diff --git a/drivers/media/i2c/Kconfig b/drivers/media/i2c/Kconfig index 8d1268648fe0..0e0490d60e7e 100644 --- a/drivers/media/i2c/Kconfig +++ b/drivers/media/i2c/Kconfig @@ -287,6 +287,15 @@ config VIDEO_SAA711X To compile this driver as a module, choose M here: the module will be called saa7115. +config VIDEO_TC358743 + tristate "Toshiba TC358743 decoder" + depends on VIDEO_V4L2 && I2C + ---help--- + Support for the Toshiba TC358743 HDMI to MIPI CSI-2 bridge. + + To compile this driver as a module, choose M here: the + module will be called tc358743. + config VIDEO_TVP514X tristate "Texas Instruments TVP514x video decoder" depends on VIDEO_V4L2 && I2C diff --git a/drivers/media/i2c/Makefile b/drivers/media/i2c/Makefile index f165faea5b3f..07db257abfc1 100644 --- a/drivers/media/i2c/Makefile +++ b/drivers/media/i2c/Makefile @@ -78,3 +78,4 @@ obj-$(CONFIG_VIDEO_AK881X) += ak881x.o obj-$(CONFIG_VIDEO_IR_I2C) += ir-kbd-i2c.o obj-$(CONFIG_VIDEO_ML86V7667) += ml86v7667.o obj-$(CONFIG_VIDEO_OV2659) += ov2659.o +obj-$(CONFIG_VIDEO_TC358743) += tc358743.o diff --git a/drivers/media/i2c/tc358743.c b/drivers/media/i2c/tc358743.c new file mode 100644 index 000000000000..4e8811c3e771 --- /dev/null +++ b/drivers/media/i2c/tc358743.c @@ -0,0 +1,1778 @@ +/* + * tc358743 - Toshiba HDMI to CSI-2 bridge + * + * Copyright 2015 Cisco Systems, Inc. and/or its affiliates. All rights + * reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +/* + * References (c = chapter, p = page): + * REF_01 - Toshiba, TC358743XBG (H2C), Functional Specification, Rev 0.60 + * REF_02 - Toshiba, TC358743XBG_HDMI-CSI_Tv11p_nm.xls + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "tc358743_regs.h" + +static int debug; +module_param(debug, int, 0644); +MODULE_PARM_DESC(debug, "debug level (0-3)"); + +MODULE_DESCRIPTION("Toshiba TC358743 HDMI to CSI-2 bridge driver"); +MODULE_AUTHOR("Ramakrishnan Muthukrishnan "); +MODULE_AUTHOR("Mikhail Khelik "); +MODULE_AUTHOR("Mats Randgaard "); +MODULE_LICENSE("GPL"); + +#define EDID_NUM_BLOCKS_MAX 8 +#define EDID_BLOCK_SIZE 128 + +static const struct v4l2_dv_timings_cap tc358743_timings_cap = { + .type = V4L2_DV_BT_656_1120, + /* keep this initialization for compatibility with GCC < 4.4.6 */ + .reserved = { 0 }, + /* Pixel clock from REF_01 p. 20. Min/max height/width are unknown */ + V4L2_INIT_BT_TIMINGS(1, 10000, 1, 10000, 0, 165000000, + V4L2_DV_BT_STD_CEA861 | V4L2_DV_BT_STD_DMT | + V4L2_DV_BT_STD_GTF | V4L2_DV_BT_STD_CVT, + V4L2_DV_BT_CAP_PROGRESSIVE | + V4L2_DV_BT_CAP_REDUCED_BLANKING | + V4L2_DV_BT_CAP_CUSTOM) +}; + +struct tc358743_state { + struct tc358743_platform_data pdata; + struct v4l2_subdev sd; + struct media_pad pad; + struct v4l2_ctrl_handler hdl; + struct i2c_client *i2c_client; + /* CONFCTL is modified in ops and tc358743_hdmi_sys_int_handler */ + struct mutex confctl_mutex; + + /* controls */ + struct v4l2_ctrl *detect_tx_5v_ctrl; + struct v4l2_ctrl *audio_sampling_rate_ctrl; + struct v4l2_ctrl *audio_present_ctrl; + + /* work queues */ + struct workqueue_struct *work_queues; + struct delayed_work delayed_work_enable_hotplug; + + /* edid */ + u8 edid_blocks_written; + + struct v4l2_dv_timings timings; + u32 mbus_fmt_code; +}; + +static void tc358743_enable_interrupts(struct v4l2_subdev *sd, + bool cable_connected); +static int tc358743_s_ctrl_detect_tx_5v(struct v4l2_subdev *sd); + +static inline struct tc358743_state *to_state(struct v4l2_subdev *sd) +{ + return container_of(sd, struct tc358743_state, sd); +} + +/* --------------- I2C --------------- */ + +static void i2c_rd(struct v4l2_subdev *sd, u16 reg, u8 *values, u32 n) +{ + struct tc358743_state *state = to_state(sd); + struct i2c_client *client = state->i2c_client; + int err; + u8 buf[2] = { reg >> 8, reg & 0xff }; + struct i2c_msg msgs[] = { + { + .addr = client->addr, + .flags = 0, + .len = 2, + .buf = buf, + }, + { + .addr = client->addr, + .flags = I2C_M_RD, + .len = n, + .buf = values, + }, + }; + + err = i2c_transfer(client->adapter, msgs, ARRAY_SIZE(msgs)); + if (err != ARRAY_SIZE(msgs)) { + v4l2_err(sd, "%s: reading register 0x%x from 0x%x failed\n", + __func__, reg, client->addr); + } +} + +static void i2c_wr(struct v4l2_subdev *sd, u16 reg, u8 *values, u32 n) +{ + struct tc358743_state *state = to_state(sd); + struct i2c_client *client = state->i2c_client; + int err, i; + struct i2c_msg msg; + u8 data[2 + n]; + + msg.addr = client->addr; + msg.buf = data; + msg.len = 2 + n; + msg.flags = 0; + + data[0] = reg >> 8; + data[1] = reg & 0xff; + + for (i = 0; i < n; i++) + data[2 + i] = values[i]; + + err = i2c_transfer(client->adapter, &msg, 1); + if (err != 1) { + v4l2_err(sd, "%s: writing register 0x%x from 0x%x failed\n", + __func__, reg, client->addr); + return; + } + + if (debug < 3) + return; + + switch (n) { + case 1: + v4l2_info(sd, "I2C write 0x%04x = 0x%02x", + reg, data[2]); + break; + case 2: + 
v4l2_info(sd, "I2C write 0x%04x = 0x%02x%02x", + reg, data[3], data[2]); + break; + case 4: + v4l2_info(sd, "I2C write 0x%04x = 0x%02x%02x%02x%02x", + reg, data[5], data[4], data[3], data[2]); + break; + default: + v4l2_info(sd, "I2C write %d bytes from address 0x%04x\n", + n, reg); + } +} + +static u8 i2c_rd8(struct v4l2_subdev *sd, u16 reg) +{ + u8 val; + + i2c_rd(sd, reg, &val, 1); + + return val; +} + +static void i2c_wr8(struct v4l2_subdev *sd, u16 reg, u8 val) +{ + i2c_wr(sd, reg, &val, 1); +} + +static void i2c_wr8_and_or(struct v4l2_subdev *sd, u16 reg, + u8 mask, u8 val) +{ + i2c_wr8(sd, reg, (i2c_rd8(sd, reg) & mask) | val); +} + +static u16 i2c_rd16(struct v4l2_subdev *sd, u16 reg) +{ + u16 val; + + i2c_rd(sd, reg, (u8 *)&val, 2); + + return val; +} + +static void i2c_wr16(struct v4l2_subdev *sd, u16 reg, u16 val) +{ + i2c_wr(sd, reg, (u8 *)&val, 2); +} + +static void i2c_wr16_and_or(struct v4l2_subdev *sd, u16 reg, u16 mask, u16 val) +{ + i2c_wr16(sd, reg, (i2c_rd16(sd, reg) & mask) | val); +} + +static u32 i2c_rd32(struct v4l2_subdev *sd, u16 reg) +{ + u32 val; + + i2c_rd(sd, reg, (u8 *)&val, 4); + + return val; +} + +static void i2c_wr32(struct v4l2_subdev *sd, u16 reg, u32 val) +{ + i2c_wr(sd, reg, (u8 *)&val, 4); +} + +/* --------------- STATUS --------------- */ + +static inline bool is_hdmi(struct v4l2_subdev *sd) +{ + return i2c_rd8(sd, SYS_STATUS) & MASK_S_HDMI; +} + +static inline bool tx_5v_power_present(struct v4l2_subdev *sd) +{ + return i2c_rd8(sd, SYS_STATUS) & MASK_S_DDC5V; +} + +static inline bool no_signal(struct v4l2_subdev *sd) +{ + return !(i2c_rd8(sd, SYS_STATUS) & MASK_S_TMDS); +} + +static inline bool no_sync(struct v4l2_subdev *sd) +{ + return !(i2c_rd8(sd, SYS_STATUS) & MASK_S_SYNC); +} + +static inline bool audio_present(struct v4l2_subdev *sd) +{ + return i2c_rd8(sd, AU_STATUS0) & MASK_S_A_SAMPLE; +} + +static int get_audio_sampling_rate(struct v4l2_subdev *sd) +{ + static const int code_to_rate[] = { + 44100, 0, 48000, 32000, 22050, 384000, 24000, 352800, + 88200, 768000, 96000, 705600, 176400, 0, 192000, 0 + }; + + /* Register FS_SET is not cleared when the cable is disconnected */ + if (no_signal(sd)) + return 0; + + return code_to_rate[i2c_rd8(sd, FS_SET) & MASK_FS]; +} + +static unsigned tc358743_num_csi_lanes_in_use(struct v4l2_subdev *sd) +{ + return ((i2c_rd32(sd, CSI_CONTROL) & MASK_NOL) >> 1) + 1; +} + +/* --------------- TIMINGS --------------- */ + +static inline unsigned fps(const struct v4l2_bt_timings *t) +{ + if (!V4L2_DV_BT_FRAME_HEIGHT(t) || !V4L2_DV_BT_FRAME_WIDTH(t)) + return 0; + + return DIV_ROUND_CLOSEST((unsigned)t->pixelclock, + V4L2_DV_BT_FRAME_HEIGHT(t) * V4L2_DV_BT_FRAME_WIDTH(t)); +} + +static int tc358743_get_detected_timings(struct v4l2_subdev *sd, + struct v4l2_dv_timings *timings) +{ + struct v4l2_bt_timings *bt = &timings->bt; + unsigned width, height, frame_width, frame_height, frame_interval, fps; + + memset(timings, 0, sizeof(struct v4l2_dv_timings)); + + if (no_signal(sd)) { + v4l2_dbg(1, debug, sd, "%s: no valid signal\n", __func__); + return -ENOLINK; + } + if (no_sync(sd)) { + v4l2_dbg(1, debug, sd, "%s: no sync on signal\n", __func__); + return -ENOLCK; + } + + timings->type = V4L2_DV_BT_656_1120; + bt->interlaced = i2c_rd8(sd, VI_STATUS1) & MASK_S_V_INTERLACE ? 
+ V4L2_DV_INTERLACED : V4L2_DV_PROGRESSIVE; + + width = ((i2c_rd8(sd, DE_WIDTH_H_HI) & 0x1f) << 8) + + i2c_rd8(sd, DE_WIDTH_H_LO); + height = ((i2c_rd8(sd, DE_WIDTH_V_HI) & 0x1f) << 8) + + i2c_rd8(sd, DE_WIDTH_V_LO); + frame_width = ((i2c_rd8(sd, H_SIZE_HI) & 0x1f) << 8) + + i2c_rd8(sd, H_SIZE_LO); + frame_height = (((i2c_rd8(sd, V_SIZE_HI) & 0x3f) << 8) + + i2c_rd8(sd, V_SIZE_LO)) / 2; + /* frame interval in milliseconds * 10 + * Require SYS_FREQ0 and SYS_FREQ1 are precisely set */ + frame_interval = ((i2c_rd8(sd, FV_CNT_HI) & 0x3) << 8) + + i2c_rd8(sd, FV_CNT_LO); + fps = (frame_interval > 0) ? + DIV_ROUND_CLOSEST(10000, frame_interval) : 0; + + bt->width = width; + bt->height = height; + bt->vsync = frame_height - height; + bt->hsync = frame_width - width; + bt->pixelclock = frame_width * frame_height * fps; + if (bt->interlaced == V4L2_DV_INTERLACED) { + bt->height *= 2; + bt->il_vsync = bt->vsync + 1; + bt->pixelclock /= 2; + } + + return 0; +} + +/* --------------- HOTPLUG / HDCP / EDID --------------- */ + +static void tc358743_delayed_work_enable_hotplug(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct tc358743_state *state = container_of(dwork, + struct tc358743_state, delayed_work_enable_hotplug); + struct v4l2_subdev *sd = &state->sd; + + v4l2_dbg(2, debug, sd, "%s:\n", __func__); + + i2c_wr8_and_or(sd, HPD_CTL, ~MASK_HPD_OUT0, MASK_HPD_OUT0); +} + +static void tc358743_set_hdmi_hdcp(struct v4l2_subdev *sd, bool enable) +{ + v4l2_dbg(2, debug, sd, "%s: %s\n", __func__, enable ? + "enable" : "disable"); + + i2c_wr8_and_or(sd, HDCP_REG1, + ~(MASK_AUTH_UNAUTH_SEL | MASK_AUTH_UNAUTH), + MASK_AUTH_UNAUTH_SEL_16_FRAMES | MASK_AUTH_UNAUTH_AUTO); + + i2c_wr8_and_or(sd, HDCP_REG2, ~MASK_AUTO_P3_RESET, + SET_AUTO_P3_RESET_FRAMES(0x0f)); + + /* HDCP is disabled by configuring the receiver as HDCP repeater. The + * repeater mode require software support to work, so HDCP + * authentication will fail. + */ + i2c_wr8_and_or(sd, HDCP_REG3, ~KEY_RD_CMD, enable ? KEY_RD_CMD : 0); + i2c_wr8_and_or(sd, HDCP_MODE, ~(MASK_AUTO_CLR | MASK_MODE_RST_TN), + enable ? (MASK_AUTO_CLR | MASK_MODE_RST_TN) : 0); + + /* Apple MacBook Pro gen.8 has a bug that makes it freeze every fifth + * second when HDCP is disabled, but the MAX_EXCED bit is handled + * correctly and HDCP is disabled on the HDMI output. + */ + i2c_wr8_and_or(sd, BSTATUS1, ~MASK_MAX_EXCED, + enable ? 0 : MASK_MAX_EXCED); + i2c_wr8_and_or(sd, BCAPS, ~(MASK_REPEATER | MASK_READY), + enable ? 0 : MASK_REPEATER | MASK_READY); +} + +static void tc358743_disable_edid(struct v4l2_subdev *sd) +{ + struct tc358743_state *state = to_state(sd); + + v4l2_dbg(2, debug, sd, "%s:\n", __func__); + + cancel_delayed_work_sync(&state->delayed_work_enable_hotplug); + + /* DDC access to EDID is also disabled when hotplug is disabled. See + * register DDC_CTL */ + i2c_wr8_and_or(sd, HPD_CTL, ~MASK_HPD_OUT0, 0x0); +} + +static void tc358743_enable_edid(struct v4l2_subdev *sd) +{ + struct tc358743_state *state = to_state(sd); + + if (state->edid_blocks_written == 0) { + v4l2_dbg(2, debug, sd, "%s: no EDID -> no hotplug\n", __func__); + return; + } + + v4l2_dbg(2, debug, sd, "%s:\n", __func__); + + /* Enable hotplug after 100 ms. DDC access to EDID is also enabled when + * hotplug is enabled. 
See register DDC_CTL */ + queue_delayed_work(state->work_queues, + &state->delayed_work_enable_hotplug, HZ / 10); + + tc358743_enable_interrupts(sd, true); + tc358743_s_ctrl_detect_tx_5v(sd); +} + +static void tc358743_erase_bksv(struct v4l2_subdev *sd) +{ + int i; + + for (i = 0; i < 5; i++) + i2c_wr8(sd, BKSV + i, 0); +} + +/* --------------- AVI infoframe --------------- */ + +static void print_avi_infoframe(struct v4l2_subdev *sd) +{ + struct i2c_client *client = v4l2_get_subdevdata(sd); + struct device *dev = &client->dev; + union hdmi_infoframe frame; + u8 buffer[HDMI_INFOFRAME_SIZE(AVI)]; + + if (!is_hdmi(sd)) { + v4l2_info(sd, "DVI-D signal - AVI infoframe not supported\n"); + return; + } + + i2c_rd(sd, PK_AVI_0HEAD, buffer, HDMI_INFOFRAME_SIZE(AVI)); + + if (hdmi_infoframe_unpack(&frame, buffer) < 0) { + v4l2_err(sd, "%s: unpack of AVI infoframe failed\n", __func__); + return; + } + + hdmi_infoframe_log(KERN_INFO, dev, &frame); +} + +/* --------------- CTRLS --------------- */ + +static int tc358743_s_ctrl_detect_tx_5v(struct v4l2_subdev *sd) +{ + struct tc358743_state *state = to_state(sd); + + return v4l2_ctrl_s_ctrl(state->detect_tx_5v_ctrl, + tx_5v_power_present(sd)); +} + +static int tc358743_s_ctrl_audio_sampling_rate(struct v4l2_subdev *sd) +{ + struct tc358743_state *state = to_state(sd); + + return v4l2_ctrl_s_ctrl(state->audio_sampling_rate_ctrl, + get_audio_sampling_rate(sd)); +} + +static int tc358743_s_ctrl_audio_present(struct v4l2_subdev *sd) +{ + struct tc358743_state *state = to_state(sd); + + return v4l2_ctrl_s_ctrl(state->audio_present_ctrl, + audio_present(sd)); +} + +static int tc358743_update_controls(struct v4l2_subdev *sd) +{ + int ret = 0; + + ret |= tc358743_s_ctrl_detect_tx_5v(sd); + ret |= tc358743_s_ctrl_audio_sampling_rate(sd); + ret |= tc358743_s_ctrl_audio_present(sd); + + return ret; +} + +/* --------------- INIT --------------- */ + +static void tc358743_reset_phy(struct v4l2_subdev *sd) +{ + v4l2_dbg(1, debug, sd, "%s:\n", __func__); + + i2c_wr8_and_or(sd, PHY_RST, ~MASK_RESET_CTRL, 0); + i2c_wr8_and_or(sd, PHY_RST, ~MASK_RESET_CTRL, MASK_RESET_CTRL); +} + +static void tc358743_reset(struct v4l2_subdev *sd, uint16_t mask) +{ + u16 sysctl = i2c_rd16(sd, SYSCTL); + + i2c_wr16(sd, SYSCTL, sysctl | mask); + i2c_wr16(sd, SYSCTL, sysctl & ~mask); +} + +static inline void tc358743_sleep_mode(struct v4l2_subdev *sd, bool enable) +{ + i2c_wr16_and_or(sd, SYSCTL, ~MASK_SLEEP, + enable ? MASK_SLEEP : 0); +} + +static inline void enable_stream(struct v4l2_subdev *sd, bool enable) +{ + struct tc358743_state *state = to_state(sd); + + v4l2_dbg(3, debug, sd, "%s: %sable\n", + __func__, enable ? "en" : "dis"); + + if (enable) { + /* It is critical for CSI receiver to see lane transition + * LP11->HS. Set to non-continuous mode to enable clock lane + * LP11 state. */ + i2c_wr32(sd, TXOPTIONCNTRL, 0); + /* Set to continuous mode to trigger LP11->HS transition */ + i2c_wr32(sd, TXOPTIONCNTRL, MASK_CONTCLKMODE); + /* Unmute video */ + i2c_wr8(sd, VI_MUTE, MASK_AUTO_MUTE); + } else { + /* Mute video so that all data lanes go to LSP11 state. + * No data is output to CSI Tx block. */ + i2c_wr8(sd, VI_MUTE, MASK_AUTO_MUTE | MASK_VI_MUTE); + } + + mutex_lock(&state->confctl_mutex); + i2c_wr16_and_or(sd, CONFCTL, ~(MASK_VBUFEN | MASK_ABUFEN), + enable ? 
(MASK_VBUFEN | MASK_ABUFEN) : 0x0); + mutex_unlock(&state->confctl_mutex); +} + +static void tc358743_set_pll(struct v4l2_subdev *sd) +{ + struct tc358743_state *state = to_state(sd); + struct tc358743_platform_data *pdata = &state->pdata; + u16 pllctl0 = i2c_rd16(sd, PLLCTL0); + u16 pllctl1 = i2c_rd16(sd, PLLCTL1); + u16 pllctl0_new = SET_PLL_PRD(pdata->pll_prd) | + SET_PLL_FBD(pdata->pll_fbd); + u32 hsck = (pdata->refclk_hz / pdata->pll_prd) * pdata->pll_fbd; + + v4l2_dbg(2, debug, sd, "%s:\n", __func__); + + /* Only rewrite when needed (new value or disabled), since rewriting + * triggers another format change event. */ + if ((pllctl0 != pllctl0_new) || ((pllctl1 & MASK_PLL_EN) == 0)) { + u16 pll_frs; + + if (hsck > 500000000) + pll_frs = 0x0; + else if (hsck > 250000000) + pll_frs = 0x1; + else if (hsck > 125000000) + pll_frs = 0x2; + else + pll_frs = 0x3; + + v4l2_dbg(1, debug, sd, "%s: updating PLL clock\n", __func__); + tc358743_sleep_mode(sd, true); + i2c_wr16(sd, PLLCTL0, pllctl0_new); + i2c_wr16_and_or(sd, PLLCTL1, + ~(MASK_PLL_FRS | MASK_RESETB | MASK_PLL_EN), + (SET_PLL_FRS(pll_frs) | MASK_RESETB | + MASK_PLL_EN)); + udelay(10); /* REF_02, Sheet "Source HDMI" */ + i2c_wr16_and_or(sd, PLLCTL1, ~MASK_CKEN, MASK_CKEN); + tc358743_sleep_mode(sd, false); + } +} + +static void tc358743_set_ref_clk(struct v4l2_subdev *sd) +{ + struct tc358743_state *state = to_state(sd); + struct tc358743_platform_data *pdata = &state->pdata; + u32 sys_freq; + u32 lockdet_ref; + u16 fh_min; + u16 fh_max; + + BUG_ON(!(pdata->refclk_hz == 26000000 || + pdata->refclk_hz == 27000000 || + pdata->refclk_hz == 42000000)); + + sys_freq = pdata->refclk_hz / 10000; + i2c_wr8(sd, SYS_FREQ0, sys_freq & 0x00ff); + i2c_wr8(sd, SYS_FREQ1, (sys_freq & 0xff00) >> 8); + + i2c_wr8_and_or(sd, PHY_CTL0, ~MASK_PHY_SYSCLK_IND, + (pdata->refclk_hz == 42000000) ? + MASK_PHY_SYSCLK_IND : 0x0); + + fh_min = pdata->refclk_hz / 100000; + i2c_wr8(sd, FH_MIN0, fh_min & 0x00ff); + i2c_wr8(sd, FH_MIN1, (fh_min & 0xff00) >> 8); + + fh_max = (fh_min * 66) / 10; + i2c_wr8(sd, FH_MAX0, fh_max & 0x00ff); + i2c_wr8(sd, FH_MAX1, (fh_max & 0xff00) >> 8); + + lockdet_ref = pdata->refclk_hz / 100; + i2c_wr8(sd, LOCKDET_REF0, lockdet_ref & 0x0000ff); + i2c_wr8(sd, LOCKDET_REF1, (lockdet_ref & 0x00ff00) >> 8); + i2c_wr8(sd, LOCKDET_REF2, (lockdet_ref & 0x0f0000) >> 16); + + i2c_wr8_and_or(sd, NCO_F0_MOD, ~MASK_NCO_F0_MOD, + (pdata->refclk_hz == 27000000) ? 
+ MASK_NCO_F0_MOD_27MHZ : 0x0); +} + +static void tc358743_set_csi_color_space(struct v4l2_subdev *sd) +{ + struct tc358743_state *state = to_state(sd); + + switch (state->mbus_fmt_code) { + case MEDIA_BUS_FMT_UYVY8_1X16: + v4l2_dbg(2, debug, sd, "%s: YCbCr 422 16-bit\n", __func__); + i2c_wr8_and_or(sd, VOUT_SET2, + ~(MASK_SEL422 | MASK_VOUT_422FIL_100) & 0xff, + MASK_SEL422 | MASK_VOUT_422FIL_100); + i2c_wr8_and_or(sd, VI_REP, ~MASK_VOUT_COLOR_SEL & 0xff, + MASK_VOUT_COLOR_601_YCBCR_LIMITED); + mutex_lock(&state->confctl_mutex); + i2c_wr16_and_or(sd, CONFCTL, ~MASK_YCBCRFMT, + MASK_YCBCRFMT_422_8_BIT); + mutex_unlock(&state->confctl_mutex); + break; + case MEDIA_BUS_FMT_RGB888_1X24: + v4l2_dbg(2, debug, sd, "%s: RGB 888 24-bit\n", __func__); + i2c_wr8_and_or(sd, VOUT_SET2, + ~(MASK_SEL422 | MASK_VOUT_422FIL_100) & 0xff, + 0x00); + i2c_wr8_and_or(sd, VI_REP, ~MASK_VOUT_COLOR_SEL & 0xff, + MASK_VOUT_COLOR_RGB_FULL); + mutex_lock(&state->confctl_mutex); + i2c_wr16_and_or(sd, CONFCTL, ~MASK_YCBCRFMT, 0); + mutex_unlock(&state->confctl_mutex); + break; + default: + v4l2_dbg(2, debug, sd, "%s: Unsupported format code 0x%x\n", + __func__, state->mbus_fmt_code); + } +} + +static unsigned tc358743_num_csi_lanes_needed(struct v4l2_subdev *sd) +{ + struct tc358743_state *state = to_state(sd); + struct v4l2_bt_timings *bt = &state->timings.bt; + struct tc358743_platform_data *pdata = &state->pdata; + u32 bits_pr_pixel = + (state->mbus_fmt_code == MEDIA_BUS_FMT_UYVY8_1X16) ? 16 : 24; + u32 bps = bt->width * bt->height * fps(bt) * bits_pr_pixel; + u32 bps_pr_lane = (pdata->refclk_hz / pdata->pll_prd) * pdata->pll_fbd; + + return DIV_ROUND_UP(bps, bps_pr_lane); +} + +static void tc358743_set_csi(struct v4l2_subdev *sd) +{ + struct tc358743_state *state = to_state(sd); + struct tc358743_platform_data *pdata = &state->pdata; + unsigned lanes = tc358743_num_csi_lanes_needed(sd); + + v4l2_dbg(3, debug, sd, "%s:\n", __func__); + + tc358743_reset(sd, MASK_CTXRST); + + if (lanes < 1) + i2c_wr32(sd, CLW_CNTRL, MASK_CLW_LANEDISABLE); + if (lanes < 1) + i2c_wr32(sd, D0W_CNTRL, MASK_D0W_LANEDISABLE); + if (lanes < 2) + i2c_wr32(sd, D1W_CNTRL, MASK_D1W_LANEDISABLE); + if (lanes < 3) + i2c_wr32(sd, D2W_CNTRL, MASK_D2W_LANEDISABLE); + if (lanes < 4) + i2c_wr32(sd, D3W_CNTRL, MASK_D3W_LANEDISABLE); + + i2c_wr32(sd, LINEINITCNT, pdata->lineinitcnt); + i2c_wr32(sd, LPTXTIMECNT, pdata->lptxtimecnt); + i2c_wr32(sd, TCLK_HEADERCNT, pdata->tclk_headercnt); + i2c_wr32(sd, TCLK_TRAILCNT, pdata->tclk_trailcnt); + i2c_wr32(sd, THS_HEADERCNT, pdata->ths_headercnt); + i2c_wr32(sd, TWAKEUP, pdata->twakeup); + i2c_wr32(sd, TCLK_POSTCNT, pdata->tclk_postcnt); + i2c_wr32(sd, THS_TRAILCNT, pdata->ths_trailcnt); + i2c_wr32(sd, HSTXVREGCNT, pdata->hstxvregcnt); + + i2c_wr32(sd, HSTXVREGEN, + ((lanes > 0) ? MASK_CLM_HSTXVREGEN : 0x0) | + ((lanes > 0) ? MASK_D0M_HSTXVREGEN : 0x0) | + ((lanes > 1) ? MASK_D1M_HSTXVREGEN : 0x0) | + ((lanes > 2) ? MASK_D2M_HSTXVREGEN : 0x0) | + ((lanes > 3) ? MASK_D3M_HSTXVREGEN : 0x0)); + + i2c_wr32(sd, TXOPTIONCNTRL, MASK_CONTCLKMODE); + i2c_wr32(sd, STARTCNTRL, MASK_START); + i2c_wr32(sd, CSI_START, MASK_STRT); + + i2c_wr32(sd, CSI_CONFW, MASK_MODE_SET | + MASK_ADDRESS_CSI_CONTROL | + MASK_CSI_MODE | + MASK_TXHSMD | + ((lanes == 4) ? MASK_NOL_4 : + (lanes == 3) ? MASK_NOL_3 : + (lanes == 2) ? 
MASK_NOL_2 : MASK_NOL_1)); + + i2c_wr32(sd, CSI_CONFW, MASK_MODE_SET | + MASK_ADDRESS_CSI_ERR_INTENA | MASK_TXBRK | MASK_QUNK | + MASK_WCER | MASK_INER); + + i2c_wr32(sd, CSI_CONFW, MASK_MODE_CLEAR | + MASK_ADDRESS_CSI_ERR_HALT | MASK_TXBRK | MASK_QUNK); + + i2c_wr32(sd, CSI_CONFW, MASK_MODE_SET | + MASK_ADDRESS_CSI_INT_ENA | MASK_INTER); +} + +static void tc358743_set_hdmi_phy(struct v4l2_subdev *sd) +{ + struct tc358743_state *state = to_state(sd); + struct tc358743_platform_data *pdata = &state->pdata; + + /* Default settings from REF_02, sheet "Source HDMI" + * and custom settings as platform data */ + i2c_wr8_and_or(sd, PHY_EN, ~MASK_ENABLE_PHY, 0x0); + i2c_wr8(sd, PHY_CTL1, SET_PHY_AUTO_RST1_US(1600) | + SET_FREQ_RANGE_MODE_CYCLES(1)); + i2c_wr8_and_or(sd, PHY_CTL2, ~MASK_PHY_AUTO_RSTn, + (pdata->hdmi_phy_auto_reset_tmds_detected ? + MASK_PHY_AUTO_RST2 : 0) | + (pdata->hdmi_phy_auto_reset_tmds_in_range ? + MASK_PHY_AUTO_RST3 : 0) | + (pdata->hdmi_phy_auto_reset_tmds_valid ? + MASK_PHY_AUTO_RST4 : 0)); + i2c_wr8(sd, PHY_BIAS, 0x40); + i2c_wr8(sd, PHY_CSQ, SET_CSQ_CNT_LEVEL(0x0a)); + i2c_wr8(sd, AVM_CTL, 45); + i2c_wr8_and_or(sd, HDMI_DET, ~MASK_HDMI_DET_V, + pdata->hdmi_detection_delay << 4); + i2c_wr8_and_or(sd, HV_RST, ~(MASK_H_PI_RST | MASK_V_PI_RST), + (pdata->hdmi_phy_auto_reset_hsync_out_of_range ? + MASK_H_PI_RST : 0) | + (pdata->hdmi_phy_auto_reset_vsync_out_of_range ? + MASK_V_PI_RST : 0)); + i2c_wr8_and_or(sd, PHY_EN, ~MASK_ENABLE_PHY, MASK_ENABLE_PHY); +} + +static void tc358743_set_hdmi_audio(struct v4l2_subdev *sd) +{ + struct tc358743_state *state = to_state(sd); + + /* Default settings from REF_02, sheet "Source HDMI" */ + i2c_wr8(sd, FORCE_MUTE, 0x00); + i2c_wr8(sd, AUTO_CMD0, MASK_AUTO_MUTE7 | MASK_AUTO_MUTE6 | + MASK_AUTO_MUTE5 | MASK_AUTO_MUTE4 | + MASK_AUTO_MUTE1 | MASK_AUTO_MUTE0); + i2c_wr8(sd, AUTO_CMD1, MASK_AUTO_MUTE9); + i2c_wr8(sd, AUTO_CMD2, MASK_AUTO_PLAY3 | MASK_AUTO_PLAY2); + i2c_wr8(sd, BUFINIT_START, SET_BUFINIT_START_MS(500)); + i2c_wr8(sd, FS_MUTE, 0x00); + i2c_wr8(sd, FS_IMODE, MASK_NLPCM_SMODE | MASK_FS_SMODE); + i2c_wr8(sd, ACR_MODE, MASK_CTS_MODE); + i2c_wr8(sd, ACR_MDF0, MASK_ACR_L2MDF_1976_PPM | MASK_ACR_L1MDF_976_PPM); + i2c_wr8(sd, ACR_MDF1, MASK_ACR_L3MDF_3906_PPM); + i2c_wr8(sd, SDO_MODE1, MASK_SDO_FMT_I2S); + i2c_wr8(sd, DIV_MODE, SET_DIV_DLY_MS(100)); + + mutex_lock(&state->confctl_mutex); + i2c_wr16_and_or(sd, CONFCTL, 0xffff, MASK_AUDCHNUM_2 | + MASK_AUDOUTSEL_I2S | MASK_AUTOINDEX); + mutex_unlock(&state->confctl_mutex); +} + +static void tc358743_set_hdmi_info_frame_mode(struct v4l2_subdev *sd) +{ + /* Default settings from REF_02, sheet "Source HDMI" */ + i2c_wr8(sd, PK_INT_MODE, MASK_ISRC2_INT_MODE | MASK_ISRC_INT_MODE | + MASK_ACP_INT_MODE | MASK_VS_INT_MODE | + MASK_SPD_INT_MODE | MASK_MS_INT_MODE | + MASK_AUD_INT_MODE | MASK_AVI_INT_MODE); + i2c_wr8(sd, NO_PKT_LIMIT, 0x2c); + i2c_wr8(sd, NO_PKT_CLR, 0x53); + i2c_wr8(sd, ERR_PK_LIMIT, 0x01); + i2c_wr8(sd, NO_PKT_LIMIT2, 0x30); + i2c_wr8(sd, NO_GDB_LIMIT, 0x10); +} + +static void tc358743_initial_setup(struct v4l2_subdev *sd) +{ + struct tc358743_state *state = to_state(sd); + struct tc358743_platform_data *pdata = &state->pdata; + + /* CEC and IR are not supported by this driver */ + i2c_wr16_and_or(sd, SYSCTL, ~(MASK_CECRST | MASK_IRRST), + (MASK_CECRST | MASK_IRRST)); + + tc358743_reset(sd, MASK_CTXRST | MASK_HDMIRST); + tc358743_sleep_mode(sd, false); + + i2c_wr16(sd, FIFOCTL, pdata->fifo_level); + + tc358743_set_ref_clk(sd); + + i2c_wr8_and_or(sd, DDC_CTL, ~MASK_DDC5V_MODE, + 
pdata->ddc5v_delay & MASK_DDC5V_MODE); + i2c_wr8_and_or(sd, EDID_MODE, ~MASK_EDID_MODE, MASK_EDID_MODE_E_DDC); + + tc358743_set_hdmi_phy(sd); + tc358743_set_hdmi_hdcp(sd, pdata->enable_hdcp); + tc358743_set_hdmi_audio(sd); + tc358743_set_hdmi_info_frame_mode(sd); + + /* All CE and IT formats are detected as RGB full range in DVI mode */ + i2c_wr8_and_or(sd, VI_MODE, ~MASK_RGB_DVI, 0); + + i2c_wr8_and_or(sd, VOUT_SET2, ~MASK_VOUTCOLORMODE, + MASK_VOUTCOLORMODE_AUTO); + i2c_wr8(sd, VOUT_SET3, MASK_VOUT_EXTCNT); +} + +/* --------------- IRQ --------------- */ + +static void tc358743_format_change(struct v4l2_subdev *sd) +{ + struct tc358743_state *state = to_state(sd); + struct v4l2_dv_timings timings; + const struct v4l2_event tc358743_ev_fmt = { + .type = V4L2_EVENT_SOURCE_CHANGE, + .u.src_change.changes = V4L2_EVENT_SRC_CH_RESOLUTION, + }; + + if (tc358743_get_detected_timings(sd, &timings)) { + enable_stream(sd, false); + + v4l2_dbg(1, debug, sd, "%s: Format changed. No signal\n", + __func__); + } else { + if (!v4l2_match_dv_timings(&state->timings, &timings, 0)) + enable_stream(sd, false); + + v4l2_print_dv_timings(sd->name, + "tc358743_format_change: Format changed. New format: ", + &timings, false); + } + + v4l2_subdev_notify(sd, V4L2_DEVICE_NOTIFY_EVENT, + (void *)&tc358743_ev_fmt); +} + +static void tc358743_init_interrupts(struct v4l2_subdev *sd) +{ + u16 i; + + /* clear interrupt status registers */ + for (i = SYS_INT; i <= KEY_INT; i++) + i2c_wr8(sd, i, 0xff); + + i2c_wr16(sd, INTSTATUS, 0xffff); +} + +static void tc358743_enable_interrupts(struct v4l2_subdev *sd, + bool cable_connected) +{ + v4l2_dbg(2, debug, sd, "%s: cable connected = %d\n", __func__, + cable_connected); + + if (cable_connected) { + i2c_wr8(sd, SYS_INTM, ~(MASK_M_DDC | MASK_M_DVI_DET | + MASK_M_HDMI_DET) & 0xff); + i2c_wr8(sd, CLK_INTM, ~MASK_M_IN_DE_CHG); + i2c_wr8(sd, CBIT_INTM, ~(MASK_M_CBIT_FS | MASK_M_AF_LOCK | + MASK_M_AF_UNLOCK) & 0xff); + i2c_wr8(sd, AUDIO_INTM, ~MASK_M_BUFINIT_END); + i2c_wr8(sd, MISC_INTM, ~MASK_M_SYNC_CHG); + } else { + i2c_wr8(sd, SYS_INTM, ~MASK_M_DDC & 0xff); + i2c_wr8(sd, CLK_INTM, 0xff); + i2c_wr8(sd, CBIT_INTM, 0xff); + i2c_wr8(sd, AUDIO_INTM, 0xff); + i2c_wr8(sd, MISC_INTM, 0xff); + } +} + +static void tc358743_hdmi_audio_int_handler(struct v4l2_subdev *sd, + bool *handled) +{ + u8 audio_int_mask = i2c_rd8(sd, AUDIO_INTM); + u8 audio_int = i2c_rd8(sd, AUDIO_INT) & ~audio_int_mask; + + i2c_wr8(sd, AUDIO_INT, audio_int); + + v4l2_dbg(3, debug, sd, "%s: AUDIO_INT = 0x%02x\n", __func__, audio_int); + + tc358743_s_ctrl_audio_sampling_rate(sd); + tc358743_s_ctrl_audio_present(sd); +} + +static void tc358743_csi_err_int_handler(struct v4l2_subdev *sd, bool *handled) +{ + v4l2_err(sd, "%s: CSI_ERR = 0x%x\n", __func__, i2c_rd32(sd, CSI_ERR)); + + i2c_wr32(sd, CSI_INT_CLR, MASK_ICRER); +} + +static void tc358743_hdmi_misc_int_handler(struct v4l2_subdev *sd, + bool *handled) +{ + u8 misc_int_mask = i2c_rd8(sd, MISC_INTM); + u8 misc_int = i2c_rd8(sd, MISC_INT) & ~misc_int_mask; + + i2c_wr8(sd, MISC_INT, misc_int); + + v4l2_dbg(3, debug, sd, "%s: MISC_INT = 0x%02x\n", __func__, misc_int); + + if (misc_int & MASK_I_SYNC_CHG) { + /* Reset the HDMI PHY to try to trigger proper lock on the + * incoming video format. Erase BKSV to prevent that old keys + * are used when a new source is connected. 
*/ + if (no_sync(sd) || no_signal(sd)) { + tc358743_reset_phy(sd); + tc358743_erase_bksv(sd); + } + + tc358743_format_change(sd); + + misc_int &= ~MASK_I_SYNC_CHG; + if (handled) + *handled = true; + } + + if (misc_int) { + v4l2_err(sd, "%s: Unhandled MISC_INT interrupts: 0x%02x\n", + __func__, misc_int); + } +} + +static void tc358743_hdmi_cbit_int_handler(struct v4l2_subdev *sd, + bool *handled) +{ + u8 cbit_int_mask = i2c_rd8(sd, CBIT_INTM); + u8 cbit_int = i2c_rd8(sd, CBIT_INT) & ~cbit_int_mask; + + i2c_wr8(sd, CBIT_INT, cbit_int); + + v4l2_dbg(3, debug, sd, "%s: CBIT_INT = 0x%02x\n", __func__, cbit_int); + + if (cbit_int & MASK_I_CBIT_FS) { + + v4l2_dbg(1, debug, sd, "%s: Audio sample rate changed\n", + __func__); + tc358743_s_ctrl_audio_sampling_rate(sd); + + cbit_int &= ~MASK_I_CBIT_FS; + if (handled) + *handled = true; + } + + if (cbit_int & (MASK_I_AF_LOCK | MASK_I_AF_UNLOCK)) { + + v4l2_dbg(1, debug, sd, "%s: Audio present changed\n", + __func__); + tc358743_s_ctrl_audio_present(sd); + + cbit_int &= ~(MASK_I_AF_LOCK | MASK_I_AF_UNLOCK); + if (handled) + *handled = true; + } + + if (cbit_int) { + v4l2_err(sd, "%s: Unhandled CBIT_INT interrupts: 0x%02x\n", + __func__, cbit_int); + } +} + +static void tc358743_hdmi_clk_int_handler(struct v4l2_subdev *sd, bool *handled) +{ + u8 clk_int_mask = i2c_rd8(sd, CLK_INTM); + u8 clk_int = i2c_rd8(sd, CLK_INT) & ~clk_int_mask; + + /* Bit 7 and bit 6 are set even when they are masked */ + i2c_wr8(sd, CLK_INT, clk_int | 0x80 | MASK_I_OUT_H_CHG); + + v4l2_dbg(3, debug, sd, "%s: CLK_INT = 0x%02x\n", __func__, clk_int); + + if (clk_int & (MASK_I_IN_DE_CHG)) { + + v4l2_dbg(1, debug, sd, "%s: DE size or position has changed\n", + __func__); + + /* If the source switch to a new resolution with the same pixel + * frequency as the existing (e.g. 1080p25 -> 720p50), the + * I_SYNC_CHG interrupt is not always triggered, while the + * I_IN_DE_CHG interrupt seems to work fine. Format change + * notifications are only sent when the signal is stable to + * reduce the number of notifications. */ + if (!no_signal(sd) && !no_sync(sd)) + tc358743_format_change(sd); + + clk_int &= ~(MASK_I_IN_DE_CHG); + if (handled) + *handled = true; + } + + if (clk_int) { + v4l2_err(sd, "%s: Unhandled CLK_INT interrupts: 0x%02x\n", + __func__, clk_int); + } +} + +static void tc358743_hdmi_sys_int_handler(struct v4l2_subdev *sd, bool *handled) +{ + struct tc358743_state *state = to_state(sd); + u8 sys_int_mask = i2c_rd8(sd, SYS_INTM); + u8 sys_int = i2c_rd8(sd, SYS_INT) & ~sys_int_mask; + + i2c_wr8(sd, SYS_INT, sys_int); + + v4l2_dbg(3, debug, sd, "%s: SYS_INT = 0x%02x\n", __func__, sys_int); + + if (sys_int & MASK_I_DDC) { + bool tx_5v = tx_5v_power_present(sd); + + v4l2_dbg(1, debug, sd, "%s: Tx 5V power present: %s\n", + __func__, tx_5v ? "yes" : "no"); + + if (tx_5v) { + tc358743_enable_edid(sd); + } else { + tc358743_enable_interrupts(sd, false); + tc358743_disable_edid(sd); + memset(&state->timings, 0, sizeof(state->timings)); + tc358743_erase_bksv(sd); + tc358743_update_controls(sd); + } + + sys_int &= ~MASK_I_DDC; + if (handled) + *handled = true; + } + + if (sys_int & MASK_I_DVI) { + v4l2_dbg(1, debug, sd, "%s: HDMI->DVI change detected\n", + __func__); + + /* Reset the HDMI PHY to try to trigger proper lock on the + * incoming video format. Erase BKSV to prevent that old keys + * are used when a new source is connected. 
*/ + if (no_sync(sd) || no_signal(sd)) { + tc358743_reset_phy(sd); + tc358743_erase_bksv(sd); + } + + sys_int &= ~MASK_I_DVI; + if (handled) + *handled = true; + } + + if (sys_int & MASK_I_HDMI) { + v4l2_dbg(1, debug, sd, "%s: DVI->HDMI change detected\n", + __func__); + + /* Register is reset in DVI mode (REF_01, c. 6.6.41) */ + i2c_wr8(sd, ANA_CTL, MASK_APPL_PCSX_NORMAL | MASK_ANALOG_ON); + + sys_int &= ~MASK_I_HDMI; + if (handled) + *handled = true; + } + + if (sys_int) { + v4l2_err(sd, "%s: Unhandled SYS_INT interrupts: 0x%02x\n", + __func__, sys_int); + } +} + +/* --------------- CORE OPS --------------- */ + +static int tc358743_log_status(struct v4l2_subdev *sd) +{ + struct tc358743_state *state = to_state(sd); + struct v4l2_dv_timings timings; + uint8_t hdmi_sys_status = i2c_rd8(sd, SYS_STATUS); + uint16_t sysctl = i2c_rd16(sd, SYSCTL); + u8 vi_status3 = i2c_rd8(sd, VI_STATUS3); + const int deep_color_mode[4] = { 8, 10, 12, 16 }; + static const char * const input_color_space[] = { + "RGB", "YCbCr 601", "Adobe RGB", "YCbCr 709", "NA (4)", + "xvYCC 601", "NA(6)", "xvYCC 709", "NA(8)", "sYCC601", + "NA(10)", "NA(11)", "NA(12)", "Adobe YCC 601"}; + + v4l2_info(sd, "-----Chip status-----\n"); + v4l2_info(sd, "Chip ID: 0x%02x\n", + (i2c_rd16(sd, CHIPID) & MASK_CHIPID) >> 8); + v4l2_info(sd, "Chip revision: 0x%02x\n", + i2c_rd16(sd, CHIPID) & MASK_REVID); + v4l2_info(sd, "Reset: IR: %d, CEC: %d, CSI TX: %d, HDMI: %d\n", + !!(sysctl & MASK_IRRST), + !!(sysctl & MASK_CECRST), + !!(sysctl & MASK_CTXRST), + !!(sysctl & MASK_HDMIRST)); + v4l2_info(sd, "Sleep mode: %s\n", sysctl & MASK_SLEEP ? "on" : "off"); + v4l2_info(sd, "Cable detected (+5V power): %s\n", + hdmi_sys_status & MASK_S_DDC5V ? "yes" : "no"); + v4l2_info(sd, "DDC lines enabled: %s\n", + (i2c_rd8(sd, EDID_MODE) & MASK_EDID_MODE_E_DDC) ? + "yes" : "no"); + v4l2_info(sd, "Hotplug enabled: %s\n", + (i2c_rd8(sd, HPD_CTL) & MASK_HPD_OUT0) ? + "yes" : "no"); + v4l2_info(sd, "CEC enabled: %s\n", + (i2c_rd16(sd, CECEN) & MASK_CECEN) ? "yes" : "no"); + v4l2_info(sd, "-----Signal status-----\n"); + v4l2_info(sd, "TMDS signal detected: %s\n", + hdmi_sys_status & MASK_S_TMDS ? "yes" : "no"); + v4l2_info(sd, "Stable sync signal: %s\n", + hdmi_sys_status & MASK_S_SYNC ? "yes" : "no"); + v4l2_info(sd, "PHY PLL locked: %s\n", + hdmi_sys_status & MASK_S_PHY_PLL ? "yes" : "no"); + v4l2_info(sd, "PHY DE detected: %s\n", + hdmi_sys_status & MASK_S_PHY_SCDT ? "yes" : "no"); + + if (tc358743_get_detected_timings(sd, &timings)) { + v4l2_info(sd, "No video detected\n"); + } else { + v4l2_print_dv_timings(sd->name, "Detected format: ", &timings, + true); + } + v4l2_print_dv_timings(sd->name, "Configured format: ", &state->timings, + true); + + v4l2_info(sd, "-----CSI-TX status-----\n"); + v4l2_info(sd, "Lanes needed: %d\n", + tc358743_num_csi_lanes_needed(sd)); + v4l2_info(sd, "Lanes in use: %d\n", + tc358743_num_csi_lanes_in_use(sd)); + v4l2_info(sd, "Waiting for particular sync signal: %s\n", + (i2c_rd16(sd, CSI_STATUS) & MASK_S_WSYNC) ? + "yes" : "no"); + v4l2_info(sd, "Transmit mode: %s\n", + (i2c_rd16(sd, CSI_STATUS) & MASK_S_TXACT) ? + "yes" : "no"); + v4l2_info(sd, "Receive mode: %s\n", + (i2c_rd16(sd, CSI_STATUS) & MASK_S_RXACT) ? + "yes" : "no"); + v4l2_info(sd, "Stopped: %s\n", + (i2c_rd16(sd, CSI_STATUS) & MASK_S_HLT) ? + "yes" : "no"); + v4l2_info(sd, "Color space: %s\n", + state->mbus_fmt_code == MEDIA_BUS_FMT_UYVY8_1X16 ? + "YCbCr 422 16-bit" : + state->mbus_fmt_code == MEDIA_BUS_FMT_RGB888_1X24 ? 
+ "RGB 888 24-bit" : "Unsupported"); + + v4l2_info(sd, "-----%s status-----\n", is_hdmi(sd) ? "HDMI" : "DVI-D"); + v4l2_info(sd, "HDCP encrypted content: %s\n", + hdmi_sys_status & MASK_S_HDCP ? "yes" : "no"); + v4l2_info(sd, "Input color space: %s %s range\n", + input_color_space[(vi_status3 & MASK_S_V_COLOR) >> 1], + (vi_status3 & MASK_LIMITED) ? "limited" : "full"); + if (!is_hdmi(sd)) + return 0; + v4l2_info(sd, "AV Mute: %s\n", hdmi_sys_status & MASK_S_AVMUTE ? "on" : + "off"); + v4l2_info(sd, "Deep color mode: %d-bits per channel\n", + deep_color_mode[(i2c_rd8(sd, VI_STATUS1) & + MASK_S_DEEPCOLOR) >> 2]); + print_avi_infoframe(sd); + + return 0; +} + +#ifdef CONFIG_VIDEO_ADV_DEBUG +static void tc358743_print_register_map(struct v4l2_subdev *sd) +{ + v4l2_info(sd, "0x0000–0x00FF: Global Control Register\n"); + v4l2_info(sd, "0x0100–0x01FF: CSI2-TX PHY Register\n"); + v4l2_info(sd, "0x0200–0x03FF: CSI2-TX PPI Register\n"); + v4l2_info(sd, "0x0400–0x05FF: Reserved\n"); + v4l2_info(sd, "0x0600–0x06FF: CEC Register\n"); + v4l2_info(sd, "0x0700–0x84FF: Reserved\n"); + v4l2_info(sd, "0x8500–0x85FF: HDMIRX System Control Register\n"); + v4l2_info(sd, "0x8600–0x86FF: HDMIRX Audio Control Register\n"); + v4l2_info(sd, "0x8700–0x87FF: HDMIRX InfoFrame packet data Register\n"); + v4l2_info(sd, "0x8800–0x88FF: HDMIRX HDCP Port Register\n"); + v4l2_info(sd, "0x8900–0x89FF: HDMIRX Video Output Port & 3D Register\n"); + v4l2_info(sd, "0x8A00–0x8BFF: Reserved\n"); + v4l2_info(sd, "0x8C00–0x8FFF: HDMIRX EDID-RAM (1024bytes)\n"); + v4l2_info(sd, "0x9000–0x90FF: HDMIRX GBD Extraction Control\n"); + v4l2_info(sd, "0x9100–0x92FF: HDMIRX GBD RAM read\n"); + v4l2_info(sd, "0x9300- : Reserved\n"); +} + +static int tc358743_get_reg_size(u16 address) +{ + /* REF_01 p. 66-72 */ + if (address <= 0x00ff) + return 2; + else if ((address >= 0x0100) && (address <= 0x06FF)) + return 4; + else if ((address >= 0x0700) && (address <= 0x84ff)) + return 2; + else + return 1; +} + +static int tc358743_g_register(struct v4l2_subdev *sd, + struct v4l2_dbg_register *reg) +{ + if (reg->reg > 0xffff) { + tc358743_print_register_map(sd); + return -EINVAL; + } + + reg->size = tc358743_get_reg_size(reg->reg); + + i2c_rd(sd, reg->reg, (u8 *)®->val, reg->size); + + return 0; +} + +static int tc358743_s_register(struct v4l2_subdev *sd, + const struct v4l2_dbg_register *reg) +{ + if (reg->reg > 0xffff) { + tc358743_print_register_map(sd); + return -EINVAL; + } + + /* It should not be possible for the user to enable HDCP with a simple + * v4l2-dbg command. + * + * DO NOT REMOVE THIS unless all other issues with HDCP have been + * resolved. 
+ */ + if (reg->reg == HDCP_MODE || + reg->reg == HDCP_REG1 || + reg->reg == HDCP_REG2 || + reg->reg == HDCP_REG3 || + reg->reg == BCAPS) + return 0; + + i2c_wr(sd, (u16)reg->reg, (u8 *)®->val, + tc358743_get_reg_size(reg->reg)); + + return 0; +} +#endif + +static int tc358743_isr(struct v4l2_subdev *sd, u32 status, bool *handled) +{ + u16 intstatus = i2c_rd16(sd, INTSTATUS); + + v4l2_dbg(1, debug, sd, "%s: IntStatus = 0x%04x\n", __func__, intstatus); + + if (intstatus & MASK_HDMI_INT) { + u8 hdmi_int0 = i2c_rd8(sd, HDMI_INT0); + u8 hdmi_int1 = i2c_rd8(sd, HDMI_INT1); + + if (hdmi_int0 & MASK_I_MISC) + tc358743_hdmi_misc_int_handler(sd, handled); + if (hdmi_int1 & MASK_I_CBIT) + tc358743_hdmi_cbit_int_handler(sd, handled); + if (hdmi_int1 & MASK_I_CLK) + tc358743_hdmi_clk_int_handler(sd, handled); + if (hdmi_int1 & MASK_I_SYS) + tc358743_hdmi_sys_int_handler(sd, handled); + if (hdmi_int1 & MASK_I_AUD) + tc358743_hdmi_audio_int_handler(sd, handled); + + i2c_wr16(sd, INTSTATUS, MASK_HDMI_INT); + intstatus &= ~MASK_HDMI_INT; + } + + if (intstatus & MASK_CSI_INT) { + u32 csi_int = i2c_rd32(sd, CSI_INT); + + if (csi_int & MASK_INTER) + tc358743_csi_err_int_handler(sd, handled); + + i2c_wr16(sd, INTSTATUS, MASK_CSI_INT); + intstatus &= ~MASK_CSI_INT; + } + + intstatus = i2c_rd16(sd, INTSTATUS); + if (intstatus) { + v4l2_dbg(1, debug, sd, + "%s: Unhandled IntStatus interrupts: 0x%02x\n", + __func__, intstatus); + } + + return 0; +} + +/* --------------- VIDEO OPS --------------- */ + +static int tc358743_g_input_status(struct v4l2_subdev *sd, u32 *status) +{ + *status = 0; + *status |= no_signal(sd) ? V4L2_IN_ST_NO_SIGNAL : 0; + *status |= no_sync(sd) ? V4L2_IN_ST_NO_SYNC : 0; + + v4l2_dbg(1, debug, sd, "%s: status = 0x%x\n", __func__, *status); + + return 0; +} + +static int tc358743_s_dv_timings(struct v4l2_subdev *sd, + struct v4l2_dv_timings *timings) +{ + struct tc358743_state *state = to_state(sd); + struct v4l2_bt_timings *bt; + + if (!timings) + return -EINVAL; + + if (debug) + v4l2_print_dv_timings(sd->name, "tc358743_s_dv_timings: ", + timings, false); + + if (v4l2_match_dv_timings(&state->timings, timings, 0)) { + v4l2_dbg(1, debug, sd, "%s: no change\n", __func__); + return 0; + } + + bt = &timings->bt; + + if (!v4l2_valid_dv_timings(timings, + &tc358743_timings_cap, NULL, NULL)) { + v4l2_dbg(1, debug, sd, "%s: timings out of range\n", __func__); + return -ERANGE; + } + + state->timings = *timings; + + enable_stream(sd, false); + tc358743_set_pll(sd); + tc358743_set_csi(sd); + + return 0; +} + +static int tc358743_g_dv_timings(struct v4l2_subdev *sd, + struct v4l2_dv_timings *timings) +{ + struct tc358743_state *state = to_state(sd); + + *timings = state->timings; + + return 0; +} + +static int tc358743_enum_dv_timings(struct v4l2_subdev *sd, + struct v4l2_enum_dv_timings *timings) +{ + if (timings->pad != 0) + return -EINVAL; + + return v4l2_enum_dv_timings_cap(timings, + &tc358743_timings_cap, NULL, NULL); +} + +static int tc358743_query_dv_timings(struct v4l2_subdev *sd, + struct v4l2_dv_timings *timings) +{ + int ret; + + ret = tc358743_get_detected_timings(sd, timings); + if (ret) + return ret; + + if (debug) + v4l2_print_dv_timings(sd->name, "tc358743_query_dv_timings: ", + timings, false); + + if (!v4l2_valid_dv_timings(timings, + &tc358743_timings_cap, NULL, NULL)) { + v4l2_dbg(1, debug, sd, "%s: timings out of range\n", __func__); + return -ERANGE; + } + + return 0; +} + +static int tc358743_dv_timings_cap(struct v4l2_subdev *sd, + struct v4l2_dv_timings_cap *cap) +{ + if 
(cap->pad != 0) + return -EINVAL; + + *cap = tc358743_timings_cap; + + return 0; +} + +static int tc358743_g_mbus_config(struct v4l2_subdev *sd, + struct v4l2_mbus_config *cfg) +{ + cfg->type = V4L2_MBUS_CSI2; + + /* Support for non-continuous CSI-2 clock is missing in the driver */ + cfg->flags = V4L2_MBUS_CSI2_CONTINUOUS_CLOCK; + + switch (tc358743_num_csi_lanes_in_use(sd)) { + case 1: + cfg->flags |= V4L2_MBUS_CSI2_1_LANE; + break; + case 2: + cfg->flags |= V4L2_MBUS_CSI2_2_LANE; + break; + case 3: + cfg->flags |= V4L2_MBUS_CSI2_3_LANE; + break; + case 4: + cfg->flags |= V4L2_MBUS_CSI2_4_LANE; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int tc358743_s_stream(struct v4l2_subdev *sd, int enable) +{ + enable_stream(sd, enable); + + return 0; +} + +/* --------------- PAD OPS --------------- */ + +static int tc358743_get_fmt(struct v4l2_subdev *sd, + struct v4l2_subdev_pad_config *cfg, + struct v4l2_subdev_format *format) +{ + struct tc358743_state *state = to_state(sd); + u8 vi_rep = i2c_rd8(sd, VI_REP); + + if (format->pad != 0) + return -EINVAL; + + format->format.code = state->mbus_fmt_code; + format->format.width = state->timings.bt.width; + format->format.height = state->timings.bt.height; + format->format.field = V4L2_FIELD_NONE; + + switch (vi_rep & MASK_VOUT_COLOR_SEL) { + case MASK_VOUT_COLOR_RGB_FULL: + case MASK_VOUT_COLOR_RGB_LIMITED: + format->format.colorspace = V4L2_COLORSPACE_SRGB; + break; + case MASK_VOUT_COLOR_601_YCBCR_LIMITED: + case MASK_VOUT_COLOR_601_YCBCR_FULL: + format->format.colorspace = V4L2_COLORSPACE_SMPTE170M; + break; + case MASK_VOUT_COLOR_709_YCBCR_FULL: + case MASK_VOUT_COLOR_709_YCBCR_LIMITED: + format->format.colorspace = V4L2_COLORSPACE_REC709; + break; + default: + format->format.colorspace = 0; + break; + } + + return 0; +} + +static int tc358743_set_fmt(struct v4l2_subdev *sd, + struct v4l2_subdev_pad_config *cfg, + struct v4l2_subdev_format *format) +{ + struct tc358743_state *state = to_state(sd); + + u32 code = format->format.code; /* is overwritten by get_fmt */ + int ret = tc358743_get_fmt(sd, cfg, format); + + format->format.code = code; + + if (ret) + return ret; + + switch (code) { + case MEDIA_BUS_FMT_RGB888_1X24: + case MEDIA_BUS_FMT_UYVY8_1X16: + break; + default: + return -EINVAL; + } + + if (format->which == V4L2_SUBDEV_FORMAT_TRY) + return 0; + + state->mbus_fmt_code = format->format.code; + + enable_stream(sd, false); + tc358743_set_pll(sd); + tc358743_set_csi(sd); + tc358743_set_csi_color_space(sd); + + return 0; +} + +static int tc358743_g_edid(struct v4l2_subdev *sd, + struct v4l2_subdev_edid *edid) +{ + struct tc358743_state *state = to_state(sd); + + if (edid->pad != 0) + return -EINVAL; + + if (edid->start_block == 0 && edid->blocks == 0) { + edid->blocks = state->edid_blocks_written; + return 0; + } + + if (state->edid_blocks_written == 0) + return -ENODATA; + + if (edid->start_block >= state->edid_blocks_written || + edid->blocks == 0) + return -EINVAL; + + if (edid->start_block + edid->blocks > state->edid_blocks_written) + edid->blocks = state->edid_blocks_written - edid->start_block; + + i2c_rd(sd, EDID_RAM + (edid->start_block * EDID_BLOCK_SIZE), edid->edid, + edid->blocks * EDID_BLOCK_SIZE); + + return 0; +} + +static int tc358743_s_edid(struct v4l2_subdev *sd, + struct v4l2_subdev_edid *edid) +{ + struct tc358743_state *state = to_state(sd); + u16 edid_len = edid->blocks * EDID_BLOCK_SIZE; + + v4l2_dbg(2, debug, sd, "%s, pad %d, start block %d, blocks %d\n", + __func__, edid->pad, 
edid->start_block, edid->blocks); + + if (edid->pad != 0) + return -EINVAL; + + if (edid->start_block != 0) + return -EINVAL; + + if (edid->blocks > EDID_NUM_BLOCKS_MAX) { + edid->blocks = EDID_NUM_BLOCKS_MAX; + return -E2BIG; + } + + tc358743_disable_edid(sd); + + i2c_wr8(sd, EDID_LEN1, edid_len & 0xff); + i2c_wr8(sd, EDID_LEN2, edid_len >> 8); + + if (edid->blocks == 0) { + state->edid_blocks_written = 0; + return 0; + } + + i2c_wr(sd, EDID_RAM, edid->edid, edid_len); + + state->edid_blocks_written = edid->blocks; + + if (tx_5v_power_present(sd)) + tc358743_enable_edid(sd); + + return 0; +} + +/* -------------------------------------------------------------------------- */ + +static const struct v4l2_subdev_core_ops tc358743_core_ops = { + .log_status = tc358743_log_status, +#ifdef CONFIG_VIDEO_ADV_DEBUG + .g_register = tc358743_g_register, + .s_register = tc358743_s_register, +#endif + .interrupt_service_routine = tc358743_isr, +}; + +static const struct v4l2_subdev_video_ops tc358743_video_ops = { + .g_input_status = tc358743_g_input_status, + .s_dv_timings = tc358743_s_dv_timings, + .g_dv_timings = tc358743_g_dv_timings, + .query_dv_timings = tc358743_query_dv_timings, + .g_mbus_config = tc358743_g_mbus_config, + .s_stream = tc358743_s_stream, +}; + +static const struct v4l2_subdev_pad_ops tc358743_pad_ops = { + .set_fmt = tc358743_set_fmt, + .get_fmt = tc358743_get_fmt, + .get_edid = tc358743_g_edid, + .set_edid = tc358743_s_edid, + .enum_dv_timings = tc358743_enum_dv_timings, + .dv_timings_cap = tc358743_dv_timings_cap, +}; + +static const struct v4l2_subdev_ops tc358743_ops = { + .core = &tc358743_core_ops, + .video = &tc358743_video_ops, + .pad = &tc358743_pad_ops, +}; + +/* --------------- CUSTOM CTRLS --------------- */ + +static const struct v4l2_ctrl_config tc358743_ctrl_audio_sampling_rate = { + .id = TC358743_CID_AUDIO_SAMPLING_RATE, + .name = "Audio sampling rate", + .type = V4L2_CTRL_TYPE_INTEGER, + .min = 0, + .max = 768000, + .step = 1, + .def = 0, + .flags = V4L2_CTRL_FLAG_READ_ONLY, +}; + +static const struct v4l2_ctrl_config tc358743_ctrl_audio_present = { + .id = TC358743_CID_AUDIO_PRESENT, + .name = "Audio present", + .type = V4L2_CTRL_TYPE_BOOLEAN, + .min = 0, + .max = 1, + .step = 1, + .def = 0, + .flags = V4L2_CTRL_FLAG_READ_ONLY, +}; + +/* --------------- PROBE / REMOVE --------------- */ + +static int tc358743_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + static struct v4l2_dv_timings default_timing = + V4L2_DV_BT_CEA_640X480P59_94; + struct tc358743_state *state; + struct tc358743_platform_data *pdata = client->dev.platform_data; + struct v4l2_subdev *sd; + int err; + + if (!i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_BYTE_DATA)) + return -EIO; + v4l_dbg(1, debug, client, "chip found @ 0x%x (%s)\n", + client->addr << 1, client->adapter->name); + + state = devm_kzalloc(&client->dev, sizeof(struct tc358743_state), + GFP_KERNEL); + if (!state) + return -ENOMEM; + + /* platform data */ + if (!pdata) { + v4l_err(client, "No platform data!\n"); + return -ENODEV; + } + state->pdata = *pdata; + + state->i2c_client = client; + sd = &state->sd; + v4l2_i2c_subdev_init(sd, client, &tc358743_ops); + sd->flags |= V4L2_SUBDEV_FL_HAS_EVENTS; + + /* i2c access */ + if ((i2c_rd16(sd, CHIPID) & MASK_CHIPID) != 0) { + v4l2_info(sd, "not a TC358743 on address 0x%x\n", + client->addr << 1); + return -ENODEV; + } + + /* control handlers */ + v4l2_ctrl_handler_init(&state->hdl, 3); + + /* private controls */ + state->detect_tx_5v_ctrl = 
v4l2_ctrl_new_std(&state->hdl, NULL, + V4L2_CID_DV_RX_POWER_PRESENT, 0, 1, 0, 0); + + /* custom controls */ + state->audio_sampling_rate_ctrl = v4l2_ctrl_new_custom(&state->hdl, + &tc358743_ctrl_audio_sampling_rate, NULL); + + state->audio_present_ctrl = v4l2_ctrl_new_custom(&state->hdl, + &tc358743_ctrl_audio_present, NULL); + + sd->ctrl_handler = &state->hdl; + if (state->hdl.error) { + err = state->hdl.error; + goto err_hdl; + } + + if (tc358743_update_controls(sd)) { + err = -ENODEV; + goto err_hdl; + } + + /* work queues */ + state->work_queues = create_singlethread_workqueue(client->name); + if (!state->work_queues) { + v4l2_err(sd, "Could not create work queue\n"); + err = -ENOMEM; + goto err_hdl; + } + + mutex_init(&state->confctl_mutex); + + INIT_DELAYED_WORK(&state->delayed_work_enable_hotplug, + tc358743_delayed_work_enable_hotplug); + + tc358743_initial_setup(sd); + + tc358743_s_dv_timings(sd, &default_timing); + + state->mbus_fmt_code = MEDIA_BUS_FMT_RGB888_1X24; + tc358743_set_csi_color_space(sd); + + tc358743_init_interrupts(sd); + tc358743_enable_interrupts(sd, tx_5v_power_present(sd)); + i2c_wr16(sd, INTMASK, ~(MASK_HDMI_MSK | MASK_CSI_MSK) & 0xffff); + + err = v4l2_ctrl_handler_setup(sd->ctrl_handler); + if (err) + goto err_work_queues; + + v4l2_info(sd, "%s found @ 0x%x (%s)\n", client->name, + client->addr << 1, client->adapter->name); + + return 0; + +err_work_queues: + cancel_delayed_work(&state->delayed_work_enable_hotplug); + destroy_workqueue(state->work_queues); + mutex_destroy(&state->confctl_mutex); +err_hdl: + v4l2_ctrl_handler_free(&state->hdl); + return err; +} + +static int tc358743_remove(struct i2c_client *client) +{ + struct v4l2_subdev *sd = i2c_get_clientdata(client); + struct tc358743_state *state = to_state(sd); + + cancel_delayed_work(&state->delayed_work_enable_hotplug); + destroy_workqueue(state->work_queues); + v4l2_device_unregister_subdev(sd); + mutex_destroy(&state->confctl_mutex); + v4l2_ctrl_handler_free(&state->hdl); + + return 0; +} + +static struct i2c_device_id tc358743_id[] = { + {"tc358743", 0}, + {} +}; + +MODULE_DEVICE_TABLE(i2c, tc358743_id); + +static struct i2c_driver tc358743_driver = { + .driver = { + .owner = THIS_MODULE, + .name = "tc358743", + }, + .probe = tc358743_probe, + .remove = tc358743_remove, + .id_table = tc358743_id, +}; + +module_i2c_driver(tc358743_driver); diff --git a/drivers/media/i2c/tc358743_regs.h b/drivers/media/i2c/tc358743_regs.h new file mode 100644 index 000000000000..81f1db558e7c --- /dev/null +++ b/drivers/media/i2c/tc358743_regs.h @@ -0,0 +1,681 @@ +/* + * tc358743 - Toshiba HDMI to CSI-2 bridge - register names and bit masks + * + * Copyright 2015 Cisco Systems, Inc. and/or its affiliates. All rights + * reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +/* + * References (c = chapter, p = page): + * REF_01 - Toshiba, TC358743XBG (H2C), Functional Specification, Rev 0.60 + */ + +/* Bit masks have the prefix 'MASK_' and options after '_'. */ + +#ifndef __TC358743_REGS_H +#define __TC358743_REGS_H + +#define CHIPID 0x0000 +#define MASK_CHIPID 0xff00 +#define MASK_REVID 0x00ff + +#define SYSCTL 0x0002 +#define MASK_IRRST 0x0800 +#define MASK_CECRST 0x0400 +#define MASK_CTXRST 0x0200 +#define MASK_HDMIRST 0x0100 +#define MASK_SLEEP 0x0001 + +#define CONFCTL 0x0004 +#define MASK_PWRISO 0x8000 +#define MASK_ACLKOPT 0x1000 +#define MASK_AUDCHNUM 0x0c00 +#define MASK_AUDCHNUM_8 0x0000 +#define MASK_AUDCHNUM_6 0x0400 +#define MASK_AUDCHNUM_4 0x0800 +#define MASK_AUDCHNUM_2 0x0c00 +#define MASK_AUDCHSEL 0x0200 +#define MASK_I2SDLYOPT 0x0100 +#define MASK_YCBCRFMT 0x00c0 +#define MASK_YCBCRFMT_444 0x0000 +#define MASK_YCBCRFMT_422_12_BIT 0x0040 +#define MASK_YCBCRFMT_COLORBAR 0x0080 +#define MASK_YCBCRFMT_422_8_BIT 0x00c0 +#define MASK_INFRMEN 0x0020 +#define MASK_AUDOUTSEL 0x0018 +#define MASK_AUDOUTSEL_CSI 0x0000 +#define MASK_AUDOUTSEL_I2S 0x0010 +#define MASK_AUDOUTSEL_TDM 0x0018 +#define MASK_AUTOINDEX 0x0004 +#define MASK_ABUFEN 0x0002 +#define MASK_VBUFEN 0x0001 + +#define FIFOCTL 0x0006 + +#define INTSTATUS 0x0014 +#define MASK_AMUTE_INT 0x0400 +#define MASK_HDMI_INT 0x0200 +#define MASK_CSI_INT 0x0100 +#define MASK_SYS_INT 0x0020 +#define MASK_CEC_EINT 0x0010 +#define MASK_CEC_TINT 0x0008 +#define MASK_CEC_RINT 0x0004 +#define MASK_IR_EINT 0x0002 +#define MASK_IR_DINT 0x0001 + +#define INTMASK 0x0016 +#define MASK_AMUTE_MSK 0x0400 +#define MASK_HDMI_MSK 0x0200 +#define MASK_CSI_MSK 0x0100 +#define MASK_SYS_MSK 0x0020 +#define MASK_CEC_EMSK 0x0010 +#define MASK_CEC_TMSK 0x0008 +#define MASK_CEC_RMSK 0x0004 +#define MASK_IR_EMSK 0x0002 +#define MASK_IR_DMSK 0x0001 + +#define INTFLAG 0x0018 +#define INTSYSSTATUS 0x001A + +#define PLLCTL0 0x0020 +#define MASK_PLL_PRD 0xf000 +#define SET_PLL_PRD(prd) ((((prd) - 1) << 12) &\ + MASK_PLL_PRD) +#define MASK_PLL_FBD 0x01ff +#define SET_PLL_FBD(fbd) (((fbd) - 1) & MASK_PLL_FBD) + +#define PLLCTL1 0x0022 +#define MASK_PLL_FRS 0x0c00 +#define SET_PLL_FRS(frs) (((frs) << 10) & MASK_PLL_FRS) +#define MASK_PLL_LBWS 0x0300 +#define MASK_LFBREN 0x0040 +#define MASK_BYPCKEN 0x0020 +#define MASK_CKEN 0x0010 +#define MASK_RESETB 0x0002 +#define MASK_PLL_EN 0x0001 + +#define CLW_CNTRL 0x0140 +#define MASK_CLW_LANEDISABLE 0x0001 + +#define D0W_CNTRL 0x0144 +#define MASK_D0W_LANEDISABLE 0x0001 + +#define D1W_CNTRL 0x0148 +#define MASK_D1W_LANEDISABLE 0x0001 + +#define D2W_CNTRL 0x014C +#define MASK_D2W_LANEDISABLE 0x0001 + +#define D3W_CNTRL 0x0150 +#define MASK_D3W_LANEDISABLE 0x0001 + +#define STARTCNTRL 0x0204 +#define MASK_START 0x00000001 + +#define LINEINITCNT 0x0210 +#define LPTXTIMECNT 0x0214 +#define TCLK_HEADERCNT 0x0218 +#define TCLK_TRAILCNT 0x021C +#define THS_HEADERCNT 0x0220 +#define TWAKEUP 0x0224 +#define TCLK_POSTCNT 0x0228 +#define THS_TRAILCNT 0x022C +#define HSTXVREGCNT 0x0230 + +#define HSTXVREGEN 0x0234 +#define MASK_D3M_HSTXVREGEN 0x0010 +#define MASK_D2M_HSTXVREGEN 0x0008 +#define MASK_D1M_HSTXVREGEN 0x0004 +#define MASK_D0M_HSTXVREGEN 0x0002 +#define MASK_CLM_HSTXVREGEN 0x0001 + + +#define TXOPTIONCNTRL 0x0238 +#define MASK_CONTCLKMODE 0x00000001 + +#define CSI_CONTROL 0x040C +#define MASK_CSI_MODE 0x8000 +#define MASK_HTXTOEN 0x0400 +#define MASK_TXHSMD 0x0080 +#define MASK_HSCKMD 0x0020 +#define MASK_NOL 0x0006 +#define MASK_NOL_1 0x0000 +#define MASK_NOL_2 0x0002 +#define
MASK_NOL_3 0x0004 +#define MASK_NOL_4 0x0006 +#define MASK_EOTDIS 0x0001 + +#define CSI_INT 0x0414 +#define MASK_INTHLT 0x00000008 +#define MASK_INTER 0x00000004 + +#define CSI_INT_ENA 0x0418 +#define MASK_IENHLT 0x00000008 +#define MASK_IENER 0x00000004 + +#define CSI_ERR 0x044C +#define MASK_INER 0x00000200 +#define MASK_WCER 0x00000100 +#define MASK_QUNK 0x00000010 +#define MASK_TXBRK 0x00000002 + +#define CSI_ERR_INTENA 0x0450 +#define CSI_ERR_HALT 0x0454 + +#define CSI_CONFW 0x0500 +#define MASK_MODE 0xe0000000 +#define MASK_MODE_SET 0xa0000000 +#define MASK_MODE_CLEAR 0xc0000000 +#define MASK_ADDRESS 0x1f000000 +#define MASK_ADDRESS_CSI_CONTROL 0x03000000 +#define MASK_ADDRESS_CSI_INT_ENA 0x06000000 +#define MASK_ADDRESS_CSI_ERR_INTENA 0x14000000 +#define MASK_ADDRESS_CSI_ERR_HALT 0x15000000 +#define MASK_DATA 0x0000ffff + +#define CSI_INT_CLR 0x050C +#define MASK_ICRER 0x00000004 + +#define CSI_START 0x0518 +#define MASK_STRT 0x00000001 + +#define CECEN 0x0600 +#define MASK_CECEN 0x0001 + +#define HDMI_INT0 0x8500 +#define MASK_I_KEY 0x80 +#define MASK_I_MISC 0x02 +#define MASK_I_PHYERR 0x01 + +#define HDMI_INT1 0x8501 +#define MASK_I_GBD 0x80 +#define MASK_I_HDCP 0x40 +#define MASK_I_ERR 0x20 +#define MASK_I_AUD 0x10 +#define MASK_I_CBIT 0x08 +#define MASK_I_PACKET 0x04 +#define MASK_I_CLK 0x02 +#define MASK_I_SYS 0x01 + +#define SYS_INT 0x8502 +#define MASK_I_ACR_CTS 0x80 +#define MASK_I_ACRN 0x40 +#define MASK_I_DVI 0x20 +#define MASK_I_HDMI 0x10 +#define MASK_I_NOPMBDET 0x08 +#define MASK_I_DPMBDET 0x04 +#define MASK_I_TMDS 0x02 +#define MASK_I_DDC 0x01 + +#define CLK_INT 0x8503 +#define MASK_I_OUT_H_CHG 0x40 +#define MASK_I_IN_DE_CHG 0x20 +#define MASK_I_IN_HV_CHG 0x10 +#define MASK_I_DC_CHG 0x08 +#define MASK_I_PXCLK_CHG 0x04 +#define MASK_I_PHYCLK_CHG 0x02 +#define MASK_I_TMDSCLK_CHG 0x01 + +#define CBIT_INT 0x8505 +#define MASK_I_AF_LOCK 0x80 +#define MASK_I_AF_UNLOCK 0x40 +#define MASK_I_CBIT_FS 0x02 + +#define AUDIO_INT 0x8506 + +#define ERR_INT 0x8507 +#define MASK_I_EESS_ERR 0x80 + +#define HDCP_INT 0x8508 +#define MASK_I_AVM_SET 0x80 +#define MASK_I_AVM_CLR 0x40 +#define MASK_I_LINKERR 0x20 +#define MASK_I_SHA_END 0x10 +#define MASK_I_R0_END 0x08 +#define MASK_I_KM_END 0x04 +#define MASK_I_AKSV_END 0x02 +#define MASK_I_AN_END 0x01 + +#define MISC_INT 0x850B +#define MASK_I_AS_LAYOUT 0x10 +#define MASK_I_NO_SPD 0x08 +#define MASK_I_NO_VS 0x03 +#define MASK_I_SYNC_CHG 0x02 +#define MASK_I_AUDIO_MUTE 0x01 + +#define KEY_INT 0x850F + +#define SYS_INTM 0x8512 +#define MASK_M_ACR_CTS 0x80 +#define MASK_M_ACR_N 0x40 +#define MASK_M_DVI_DET 0x20 +#define MASK_M_HDMI_DET 0x10 +#define MASK_M_NOPMBDET 0x08 +#define MASK_M_BPMBDET 0x04 +#define MASK_M_TMDS 0x02 +#define MASK_M_DDC 0x01 + +#define CLK_INTM 0x8513 +#define MASK_M_OUT_H_CHG 0x40 +#define MASK_M_IN_DE_CHG 0x20 +#define MASK_M_IN_HV_CHG 0x10 +#define MASK_M_DC_CHG 0x08 +#define MASK_M_PXCLK_CHG 0x04 +#define MASK_M_PHYCLK_CHG 0x02 +#define MASK_M_TMDS_CHG 0x01 + +#define PACKET_INTM 0x8514 + +#define CBIT_INTM 0x8515 +#define MASK_M_AF_LOCK 0x80 +#define MASK_M_AF_UNLOCK 0x40 +#define MASK_M_CBIT_FS 0x02 + +#define AUDIO_INTM 0x8516 +#define MASK_M_BUFINIT_END 0x01 + +#define ERR_INTM 0x8517 +#define MASK_M_EESS_ERR 0x80 + +#define HDCP_INTM 0x8518 +#define MASK_M_AVM_SET 0x80 +#define MASK_M_AVM_CLR 0x40 +#define MASK_M_LINKERR 0x20 +#define MASK_M_SHA_END 0x10 +#define MASK_M_R0_END 0x08 +#define MASK_M_KM_END 0x04 +#define MASK_M_AKSV_END 0x02 +#define MASK_M_AN_END 0x01 + +#define MISC_INTM 0x851B +#define 
MASK_M_AS_LAYOUT 0x10 +#define MASK_M_NO_SPD 0x08 +#define MASK_M_NO_VS 0x03 +#define MASK_M_SYNC_CHG 0x02 +#define MASK_M_AUDIO_MUTE 0x01 + +#define KEY_INTM 0x851F + +#define SYS_STATUS 0x8520 +#define MASK_S_SYNC 0x80 +#define MASK_S_AVMUTE 0x40 +#define MASK_S_HDCP 0x20 +#define MASK_S_HDMI 0x10 +#define MASK_S_PHY_SCDT 0x08 +#define MASK_S_PHY_PLL 0x04 +#define MASK_S_TMDS 0x02 +#define MASK_S_DDC5V 0x01 + +#define CSI_STATUS 0x0410 +#define MASK_S_WSYNC 0x0400 +#define MASK_S_TXACT 0x0200 +#define MASK_S_RXACT 0x0100 +#define MASK_S_HLT 0x0001 + +#define VI_STATUS1 0x8522 +#define MASK_S_V_GBD 0x08 +#define MASK_S_DEEPCOLOR 0x0c +#define MASK_S_V_422 0x02 +#define MASK_S_V_INTERLACE 0x01 + +#define AU_STATUS0 0x8523 +#define MASK_S_A_SAMPLE 0x01 + +#define VI_STATUS3 0x8528 +#define MASK_S_V_COLOR 0x1e +#define MASK_LIMITED 0x01 + +#define PHY_CTL0 0x8531 +#define MASK_PHY_SYSCLK_IND 0x02 +#define MASK_PHY_CTL 0x01 + + +#define PHY_CTL1 0x8532 /* Not in REF_01 */ +#define MASK_PHY_AUTO_RST1 0xf0 +#define MASK_PHY_AUTO_RST1_OFF 0x00 +#define SET_PHY_AUTO_RST1_US(us) ((((us) / 200) << 4) & \ + MASK_PHY_AUTO_RST1) +#define MASK_FREQ_RANGE_MODE 0x0f +#define SET_FREQ_RANGE_MODE_CYCLES(cycles) (((cycles) - 1) & \ + MASK_FREQ_RANGE_MODE) + +#define PHY_CTL2 0x8533 /* Not in REF_01 */ +#define MASK_PHY_AUTO_RST4 0x04 +#define MASK_PHY_AUTO_RST3 0x02 +#define MASK_PHY_AUTO_RST2 0x01 +#define MASK_PHY_AUTO_RSTn (MASK_PHY_AUTO_RST4 | \ + MASK_PHY_AUTO_RST3 | \ + MASK_PHY_AUTO_RST2) + +#define PHY_EN 0x8534 +#define MASK_ENABLE_PHY 0x01 + +#define PHY_RST 0x8535 +#define MASK_RESET_CTRL 0x01 /* Reset active low */ + +#define PHY_BIAS 0x8536 /* Not in REF_01 */ + +#define PHY_CSQ 0x853F /* Not in REF_01 */ +#define MASK_CSQ_CNT 0x0f +#define SET_CSQ_CNT_LEVEL(n) (n & MASK_CSQ_CNT) + +#define SYS_FREQ0 0x8540 +#define SYS_FREQ1 0x8541 + +#define SYS_CLK 0x8542 /* Not in REF_01 */ +#define MASK_CLK_DIFF 0x0C +#define MASK_CLK_DIV 0x03 + +#define DDC_CTL 0x8543 +#define MASK_DDC_ACK_POL 0x08 +#define MASK_DDC_ACTION 0x04 +#define MASK_DDC5V_MODE 0x03 +#define MASK_DDC5V_MODE_0MS 0x00 +#define MASK_DDC5V_MODE_50MS 0x01 +#define MASK_DDC5V_MODE_100MS 0x02 +#define MASK_DDC5V_MODE_200MS 0x03 + +#define HPD_CTL 0x8544 +#define MASK_HPD_CTL0 0x10 +#define MASK_HPD_OUT0 0x01 + +#define ANA_CTL 0x8545 +#define MASK_APPL_PCSX 0x30 +#define MASK_APPL_PCSX_HIZ 0x00 +#define MASK_APPL_PCSX_L_FIX 0x10 +#define MASK_APPL_PCSX_H_FIX 0x20 +#define MASK_APPL_PCSX_NORMAL 0x30 +#define MASK_ANALOG_ON 0x01 + +#define AVM_CTL 0x8546 + +#define INIT_END 0x854A +#define MASK_INIT_END 0x01 + +#define HDMI_DET 0x8552 /* Not in REF_01 */ +#define MASK_HDMI_DET_MOD1 0x80 +#define MASK_HDMI_DET_MOD0 0x40 +#define MASK_HDMI_DET_V 0x30 +#define MASK_HDMI_DET_V_SYNC 0x00 +#define MASK_HDMI_DET_V_ASYNC_25MS 0x10 +#define MASK_HDMI_DET_V_ASYNC_50MS 0x20 +#define MASK_HDMI_DET_V_ASYNC_100MS 0x30 +#define MASK_HDMI_DET_NUM 0x0f + +#define HDCP_MODE 0x8560 +#define MASK_MODE_RST_TN 0x20 +#define MASK_LINE_REKEY 0x10 +#define MASK_AUTO_CLR 0x04 + +#define HDCP_REG1 0x8563 /* Not in REF_01 */ +#define MASK_AUTH_UNAUTH_SEL 0x70 +#define MASK_AUTH_UNAUTH_SEL_12_FRAMES 0x70 +#define MASK_AUTH_UNAUTH_SEL_8_FRAMES 0x60 +#define MASK_AUTH_UNAUTH_SEL_4_FRAMES 0x50 +#define MASK_AUTH_UNAUTH_SEL_2_FRAMES 0x40 +#define MASK_AUTH_UNAUTH_SEL_64_FRAMES 0x30 +#define MASK_AUTH_UNAUTH_SEL_32_FRAMES 0x20 +#define MASK_AUTH_UNAUTH_SEL_16_FRAMES 0x10 +#define MASK_AUTH_UNAUTH_SEL_ONCE 0x00 +#define MASK_AUTH_UNAUTH 0x01 +#define MASK_AUTH_UNAUTH_AUTO 
0x01 + +#define HDCP_REG2 0x8564 /* Not in REF_01 */ +#define MASK_AUTO_P3_RESET 0x0F +#define SET_AUTO_P3_RESET_FRAMES(n) (n & MASK_AUTO_P3_RESET) +#define MASK_AUTO_P3_RESET_OFF 0x00 + +#define VI_MODE 0x8570 +#define MASK_RGB_DVI 0x08 /* Not in REF_01 */ + +#define VOUT_SET2 0x8573 +#define MASK_SEL422 0x80 +#define MASK_VOUT_422FIL_100 0x40 +#define MASK_VOUTCOLORMODE 0x03 +#define MASK_VOUTCOLORMODE_THROUGH 0x00 +#define MASK_VOUTCOLORMODE_AUTO 0x01 +#define MASK_VOUTCOLORMODE_MANUAL 0x03 + +#define VOUT_SET3 0x8574 +#define MASK_VOUT_EXTCNT 0x08 + +#define VI_REP 0x8576 +#define MASK_VOUT_COLOR_SEL 0xe0 +#define MASK_VOUT_COLOR_RGB_FULL 0x00 +#define MASK_VOUT_COLOR_RGB_LIMITED 0x20 +#define MASK_VOUT_COLOR_601_YCBCR_FULL 0x40 +#define MASK_VOUT_COLOR_601_YCBCR_LIMITED 0x60 +#define MASK_VOUT_COLOR_709_YCBCR_FULL 0x80 +#define MASK_VOUT_COLOR_709_YCBCR_LIMITED 0xa0 +#define MASK_VOUT_COLOR_FULL_TO_LIMITED 0xc0 +#define MASK_VOUT_COLOR_LIMITED_TO_FULL 0xe0 +#define MASK_IN_REP_HEN 0x10 +#define MASK_IN_REP 0x0f + +#define VI_MUTE 0x857F +#define MASK_AUTO_MUTE 0xc0 +#define MASK_VI_MUTE 0x10 + +#define DE_WIDTH_H_LO 0x8582 /* Not in REF_01 */ +#define DE_WIDTH_H_HI 0x8583 /* Not in REF_01 */ +#define DE_WIDTH_V_LO 0x8588 /* Not in REF_01 */ +#define DE_WIDTH_V_HI 0x8589 /* Not in REF_01 */ +#define H_SIZE_LO 0x858A /* Not in REF_01 */ +#define H_SIZE_HI 0x858B /* Not in REF_01 */ +#define V_SIZE_LO 0x858C /* Not in REF_01 */ +#define V_SIZE_HI 0x858D /* Not in REF_01 */ +#define FV_CNT_LO 0x85A1 /* Not in REF_01 */ +#define FV_CNT_HI 0x85A2 /* Not in REF_01 */ + +#define FH_MIN0 0x85AA /* Not in REF_01 */ +#define FH_MIN1 0x85AB /* Not in REF_01 */ +#define FH_MAX0 0x85AC /* Not in REF_01 */ +#define FH_MAX1 0x85AD /* Not in REF_01 */ + +#define HV_RST 0x85AF /* Not in REF_01 */ +#define MASK_H_PI_RST 0x20 +#define MASK_V_PI_RST 0x10 + +#define EDID_MODE 0x85C7 +#define MASK_EDID_SPEED 0x40 +#define MASK_EDID_MODE 0x03 +#define MASK_EDID_MODE_DISABLE 0x00 +#define MASK_EDID_MODE_DDC2B 0x01 +#define MASK_EDID_MODE_E_DDC 0x02 + +#define EDID_LEN1 0x85CA +#define EDID_LEN2 0x85CB + +#define HDCP_REG3 0x85D1 /* Not in REF_01 */ +#define KEY_RD_CMD 0x01 + +#define FORCE_MUTE 0x8600 +#define MASK_FORCE_AMUTE 0x10 +#define MASK_FORCE_DMUTE 0x01 + +#define CMD_AUD 0x8601 +#define MASK_CMD_BUFINIT 0x04 +#define MASK_CMD_LOCKDET 0x02 +#define MASK_CMD_MUTE 0x01 + +#define AUTO_CMD0 0x8602 +#define MASK_AUTO_MUTE7 0x80 +#define MASK_AUTO_MUTE6 0x40 +#define MASK_AUTO_MUTE5 0x20 +#define MASK_AUTO_MUTE4 0x10 +#define MASK_AUTO_MUTE3 0x08 +#define MASK_AUTO_MUTE2 0x04 +#define MASK_AUTO_MUTE1 0x02 +#define MASK_AUTO_MUTE0 0x01 + +#define AUTO_CMD1 0x8603 +#define MASK_AUTO_MUTE10 0x04 +#define MASK_AUTO_MUTE9 0x02 +#define MASK_AUTO_MUTE8 0x01 + +#define AUTO_CMD2 0x8604 +#define MASK_AUTO_PLAY3 0x08 +#define MASK_AUTO_PLAY2 0x04 + +#define BUFINIT_START 0x8606 +#define SET_BUFINIT_START_MS(milliseconds) ((milliseconds) / 100) + +#define FS_MUTE 0x8607 +#define MASK_FS_ELSE_MUTE 0x80 +#define MASK_FS22_MUTE 0x40 +#define MASK_FS24_MUTE 0x20 +#define MASK_FS88_MUTE 0x10 +#define MASK_FS96_MUTE 0x08 +#define MASK_FS176_MUTE 0x04 +#define MASK_FS192_MUTE 0x02 +#define MASK_FS_NO_MUTE 0x01 + +#define FS_IMODE 0x8620 +#define MASK_NLPCM_HMODE 0x40 +#define MASK_NLPCM_SMODE 0x20 +#define MASK_NLPCM_IMODE 0x10 +#define MASK_FS_HMODE 0x08 +#define MASK_FS_AMODE 0x04 +#define MASK_FS_SMODE 0x02 +#define MASK_FS_IMODE 0x01 + +#define FS_SET 0x8621 +#define MASK_FS 0x0f + +#define LOCKDET_REF0 0x8630 
+#define LOCKDET_REF1 0x8631 +#define LOCKDET_REF2 0x8632 + +#define ACR_MODE 0x8640 +#define MASK_ACR_LOAD 0x10 +#define MASK_N_MODE 0x04 +#define MASK_CTS_MODE 0x01 + +#define ACR_MDF0 0x8641 +#define MASK_ACR_L2MDF 0x70 +#define MASK_ACR_L2MDF_0_PPM 0x00 +#define MASK_ACR_L2MDF_61_PPM 0x10 +#define MASK_ACR_L2MDF_122_PPM 0x20 +#define MASK_ACR_L2MDF_244_PPM 0x30 +#define MASK_ACR_L2MDF_488_PPM 0x40 +#define MASK_ACR_L2MDF_976_PPM 0x50 +#define MASK_ACR_L2MDF_1976_PPM 0x60 +#define MASK_ACR_L2MDF_3906_PPM 0x70 +#define MASK_ACR_L1MDF 0x07 +#define MASK_ACR_L1MDF_0_PPM 0x00 +#define MASK_ACR_L1MDF_61_PPM 0x01 +#define MASK_ACR_L1MDF_122_PPM 0x02 +#define MASK_ACR_L1MDF_244_PPM 0x03 +#define MASK_ACR_L1MDF_488_PPM 0x04 +#define MASK_ACR_L1MDF_976_PPM 0x05 +#define MASK_ACR_L1MDF_1976_PPM 0x06 +#define MASK_ACR_L1MDF_3906_PPM 0x07 + +#define ACR_MDF1 0x8642 +#define MASK_ACR_L3MDF 0x07 +#define MASK_ACR_L3MDF_0_PPM 0x00 +#define MASK_ACR_L3MDF_61_PPM 0x01 +#define MASK_ACR_L3MDF_122_PPM 0x02 +#define MASK_ACR_L3MDF_244_PPM 0x03 +#define MASK_ACR_L3MDF_488_PPM 0x04 +#define MASK_ACR_L3MDF_976_PPM 0x05 +#define MASK_ACR_L3MDF_1976_PPM 0x06 +#define MASK_ACR_L3MDF_3906_PPM 0x07 + +#define SDO_MODE1 0x8652 +#define MASK_SDO_BIT_LENG 0x70 +#define MASK_SDO_FMT 0x03 +#define MASK_SDO_FMT_RIGHT 0x00 +#define MASK_SDO_FMT_LEFT 0x01 +#define MASK_SDO_FMT_I2S 0x02 + +#define DIV_MODE 0x8665 /* Not in REF_01 */ +#define MASK_DIV_DLY 0xf0 +#define SET_DIV_DLY_MS(milliseconds) ((((milliseconds) / 100) << 4) & \ + MASK_DIV_DLY) +#define MASK_DIV_MODE 0x01 + +#define NCO_F0_MOD 0x8670 +#define MASK_NCO_F0_MOD 0x03 +#define MASK_NCO_F0_MOD_42MHZ 0x00 +#define MASK_NCO_F0_MOD_27MHZ 0x01 + +#define PK_INT_MODE 0x8709 +#define MASK_ISRC2_INT_MODE 0x80 +#define MASK_ISRC_INT_MODE 0x40 +#define MASK_ACP_INT_MODE 0x20 +#define MASK_VS_INT_MODE 0x10 +#define MASK_SPD_INT_MODE 0x08 +#define MASK_MS_INT_MODE 0x04 +#define MASK_AUD_INT_MODE 0x02 +#define MASK_AVI_INT_MODE 0x01 + +#define NO_PKT_LIMIT 0x870B +#define MASK_NO_ACP_LIMIT 0xf0 +#define SET_NO_ACP_LIMIT_MS(milliseconds) ((((milliseconds) / 80) << 4) & \ + MASK_NO_ACP_LIMIT) +#define MASK_NO_AVI_LIMIT 0x0f +#define SET_NO_AVI_LIMIT_MS(milliseconds) (((milliseconds) / 80) & \ + MASK_NO_AVI_LIMIT) + +#define NO_PKT_CLR 0x870C +#define MASK_NO_VS_CLR 0x40 +#define MASK_NO_SPD_CLR 0x20 +#define MASK_NO_ACP_CLR 0x10 +#define MASK_NO_AVI_CLR1 0x02 +#define MASK_NO_AVI_CLR0 0x01 + +#define ERR_PK_LIMIT 0x870D +#define NO_PKT_LIMIT2 0x870E +#define PK_AVI_0HEAD 0x8710 +#define PK_AVI_1HEAD 0x8711 +#define PK_AVI_2HEAD 0x8712 +#define PK_AVI_0BYTE 0x8713 +#define PK_AVI_1BYTE 0x8714 +#define PK_AVI_2BYTE 0x8715 +#define PK_AVI_3BYTE 0x8716 +#define PK_AVI_4BYTE 0x8717 +#define PK_AVI_5BYTE 0x8718 +#define PK_AVI_6BYTE 0x8719 +#define PK_AVI_7BYTE 0x871A +#define PK_AVI_8BYTE 0x871B +#define PK_AVI_9BYTE 0x871C +#define PK_AVI_10BYTE 0x871D +#define PK_AVI_11BYTE 0x871E +#define PK_AVI_12BYTE 0x871F +#define PK_AVI_13BYTE 0x8720 +#define PK_AVI_14BYTE 0x8721 +#define PK_AVI_15BYTE 0x8722 +#define PK_AVI_16BYTE 0x8723 + +#define BKSV 0x8800 + +#define BCAPS 0x8840 +#define MASK_HDMI_RSVD 0x80 +#define MASK_REPEATER 0x40 +#define MASK_READY 0x20 +#define MASK_FASTI2C 0x10 +#define MASK_1_1_FEA 0x02 +#define MASK_FAST_REAU 0x01 + +#define BSTATUS1 0x8842 +#define MASK_MAX_EXCED 0x08 + +#define EDID_RAM 0x8C00 +#define NO_GDB_LIMIT 0x9007 + +#endif diff --git a/include/media/tc358743.h b/include/media/tc358743.h new file mode 100644 index 000000000000..4513f2f9cfbc --- 
/dev/null +++ b/include/media/tc358743.h @@ -0,0 +1,131 @@ +/* + * tc358743 - Toshiba HDMI to CSI-2 bridge + * + * Copyright 2015 Cisco Systems, Inc. and/or its affiliates. All rights + * reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +/* + * References (c = chapter, p = page): + * REF_01 - Toshiba, TC358743XBG (H2C), Functional Specification, Rev 0.60 + * REF_02 - Toshiba, TC358743XBG_HDMI-CSI_Tv11p_nm.xls + */ + +#ifndef _TC358743_ +#define _TC358743_ + +enum tc358743_ddc5v_delays { + DDC5V_DELAY_0_MS, + DDC5V_DELAY_50_MS, + DDC5V_DELAY_100_MS, + DDC5V_DELAY_200_MS, +}; + +enum tc358743_hdmi_detection_delay { + HDMI_MODE_DELAY_0_MS, + HDMI_MODE_DELAY_25_MS, + HDMI_MODE_DELAY_50_MS, + HDMI_MODE_DELAY_100_MS, +}; + +struct tc358743_platform_data { + /* System clock connected to REFCLK (pin H5) */ + u32 refclk_hz; /* 26 MHz, 27 MHz or 42 MHz */ + + /* DDC +5V debounce delay to avoid spurious interrupts when the cable + * is connected. + * Sets DDC5V_MODE in register DDC_CTL. + * Default: DDC5V_DELAY_0_MS + */ + enum tc358743_ddc5v_delays ddc5v_delay; + + bool enable_hdcp; + + /* + * The FIFO size is 512x32, so Toshiba recommends setting the default FIFO + * level somewhere in the middle (e.g. 300), so it can cover speed + * mismatches in input and output ports. + */ + u16 fifo_level; + + /* Bps per lane is (refclk_hz / pll_prd) * pll_fbd */ + u16 pll_prd; + u16 pll_fbd; + + /* CSI + * Calculate CSI parameters with REF_02 for the highest resolution your + * CSI interface can handle. The driver will adjust the number of CSI + * lanes in use according to the pixel clock. + * + * The values in brackets are calculated with REF_02 when the number of + * bps per lane is 823.5 MHz, and can serve as a starting point. + */ + u32 lineinitcnt; /* (0x00001770) */ + u32 lptxtimecnt; /* (0x00000005) */ + u32 tclk_headercnt; /* (0x00001d04) */ + u32 tclk_trailcnt; /* (0x00000000) */ + u32 ths_headercnt; /* (0x00000505) */ + u32 twakeup; /* (0x00004650) */ + u32 tclk_postcnt; /* (0x00000000) */ + u32 ths_trailcnt; /* (0x00000004) */ + u32 hstxvregcnt; /* (0x00000005) */ + + /* DVI->HDMI detection delay to avoid unnecessary switching between DVI + * and HDMI mode. + * Sets HDMI_DET_V in register HDMI_DET. + * Default: HDMI_MODE_DELAY_0_MS + */ + enum tc358743_hdmi_detection_delay hdmi_detection_delay; + + /* Reset PHY automatically when TMDS clock goes from DC to AC. + * Sets PHY_AUTO_RST2 in register PHY_CTL2. + * Default: false + */ + bool hdmi_phy_auto_reset_tmds_detected; + + /* Reset PHY automatically when TMDS clock passes 21 MHz. + * Sets PHY_AUTO_RST3 in register PHY_CTL2. + * Default: false + */ + bool hdmi_phy_auto_reset_tmds_in_range; + + /* Reset PHY automatically when TMDS clock is detected. + * Sets PHY_AUTO_RST4 in register PHY_CTL2.
+ * Default: false + */ + bool hdmi_phy_auto_reset_tmds_valid; + + /* Reset HDMI PHY automatically when hsync period is out of range. + * Sets H_PI_RST in register HV_RST. + * Default: false + */ + bool hdmi_phy_auto_reset_hsync_out_of_range; + + /* Reset HDMI PHY automatically when vsync period is out of range. + * Sets V_PI_RST in register HV_RST. + * Default: false + */ + bool hdmi_phy_auto_reset_vsync_out_of_range; +}; + +/* custom controls */ +/* Audio sample rate in Hz */ +#define TC358743_CID_AUDIO_SAMPLING_RATE (V4L2_CID_USER_TC358743_BASE + 0) +/* Audio present status */ +#define TC358743_CID_AUDIO_PRESENT (V4L2_CID_USER_TC358743_BASE + 1) + +#endif diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index 9f6e108ff4a0..d448c536b49d 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -174,6 +174,10 @@ enum v4l2_colorfx { * We reserve 16 controls for this driver. */ #define V4L2_CID_USER_ADV7180_BASE (V4L2_CID_USER_BASE + 0x1070) +/* The base for the tc358743 driver controls. + * We reserve 16 controls for this driver. */ +#define V4L2_CID_USER_TC358743_BASE (V4L2_CID_USER_BASE + 0x1080) + /* MPEG-class control IDs */ /* The MPEG controls are applicable to all codec controls * and the 'MPEG' part of the define is historical */ -- cgit v1.2.3 From eff3cddc222c88943ff515ae9335687c9e2cbaf6 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Wed, 22 Apr 2015 14:40:30 -0700 Subject: clarify implementation of ethtool's get_ts_info op This patch adds some clarification about the intended way to implement both SIOCSHWTSTAMP and ethtool's get_ts_info. The HWTSTAMP API has several Rx filters which are very specific, as well as more general filters. The specific filters really only exist to support some broken hardware which can't fully implement the generic filters. This patch adds clarification that it is okay to support the specific filters in SIOCSHWTSTAMP by upscaling them to the generic filters. In addition, update the header for ethtool_ts_info to specify that drivers ought to only report the filters they support without upscaling in this manner. Signed-off-by: Jacob Keller Acked-by: Richard Cochran Tested-by: Phil Schmitt Reviewed-by: Aaron Brown Signed-off-by: Jeff Kirsher --- Documentation/networking/timestamping.txt | 7 +++++++ include/uapi/linux/ethtool.h | 5 +++++ 2 files changed, 12 insertions(+) (limited to 'include/uapi') diff --git a/Documentation/networking/timestamping.txt b/Documentation/networking/timestamping.txt index 5f0922613f1a..a977339fbe0a 100644 --- a/Documentation/networking/timestamping.txt +++ b/Documentation/networking/timestamping.txt @@ -359,6 +359,13 @@ the requested fine-grained filtering for incoming packets is not supported, the driver may time stamp more than just the requested types of packets. +Drivers are free to use a more permissive configuration than the requested +configuration. It is expected that drivers should only implement directly the +most generic mode that can be supported. For example if the hardware can +support HWTSTAMP_FILTER_V2_EVENT, then it should generally always upscale +HWTSTAMP_FILTER_V2_L2_SYNC_MESSAGE, and so forth, as HWTSTAMP_FILTER_V2_EVENT +is more generic (and more useful to applications). + A driver which supports hardware time stamping shall update the struct with the actual, possibly more permissive configuration. 
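As an illustration of this rule, here is a minimal sketch (not part of this patch) of a SIOCSHWTSTAMP rx-filter handler for hypothetical hardware that implements only the generic PTPv2 event filter; the driver name "foo" and the function itself are invented for the example:

#include <linux/net_tstamp.h>
#include <linux/errno.h>

/* "foo" is a placeholder driver name for this sketch */
static int foo_set_hwtstamp_rx_filter(struct hwtstamp_config *config)
{
	switch (config->rx_filter) {
	case HWTSTAMP_FILTER_NONE:
		break;	/* leave hardware timestamping disabled */
	case HWTSTAMP_FILTER_PTP_V2_EVENT:
	case HWTSTAMP_FILTER_PTP_V2_SYNC:
	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
		/* upscale the specific requests to the generic mode the
		 * hardware actually implements, and report that back so
		 * the application knows more packets may be stamped
		 */
		config->rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT;
		break;
	default:
		return -ERANGE;
	}

	return 0;
}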
If the requested packets cannot be time stamped, then nothing should be diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index cd67aec187d9..cd1629170103 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1093,6 +1093,11 @@ struct ethtool_sfeatures { * the 'hwtstamp_tx_types' and 'hwtstamp_rx_filters' enumeration values, * respectively. For example, if the device supports HWTSTAMP_TX_ON, * then (1 << HWTSTAMP_TX_ON) in 'tx_types' will be set. + * + * Drivers should only report the filters they actually support without + * upscaling in the SIOCSHWTSTAMP ioctl. If the SIOCSHWTSTAMP request for + * HWTSTAMP_FILTER_V1_SYNC is supported by HWTSTAMP_FILTER_V1_EVENT, then the + * driver should only report HWTSTAMP_FILTER_V1_EVENT in this op. */ struct ethtool_ts_info { __u32 cmd; -- cgit v1.2.3 From 7b8f4586532f36c5541a15d072576e7e89a5df75 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Fri, 3 Jul 2015 19:39:02 +0800 Subject: nfsd: Add macro NFS_ACL_MASK for ACL Signed-off-by: Kinglong Mee Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs2acl.c | 10 ++++------ fs/nfsd/nfs3acl.c | 4 ++-- include/uapi/linux/nfsacl.h | 1 + 3 files changed, 7 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index d54701f6dc78..1580ea6fd64d 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -44,13 +44,13 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp, inode = d_inode(fh->fh_dentry); - if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) + if (argp->mask & ~NFS_ACL_MASK) RETURN_STATUS(nfserr_inval); resp->mask = argp->mask; nfserr = fh_getattr(fh, &resp->stat); if (nfserr) - goto fail; + RETURN_STATUS(nfserr); if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { acl = get_acl(inode, ACL_TYPE_ACCESS); @@ -202,7 +202,7 @@ static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p, if (!p) return 0; argp->mask = ntohl(*p++); - if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT) || + if (argp->mask & ~NFS_ACL_MASK || !xdr_argsize_check(rqstp, p)) return 0; @@ -293,9 +293,7 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p, resp->acl_default, resp->mask & NFS_DFACL, NFS_ACL_DEFAULT); - if (n <= 0) - return 0; - return 1; + return (n > 0); } static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p, diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 882b1a14bc3e..01df4cd7c753 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -41,7 +41,7 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp, inode = d_inode(fh->fh_dentry); - if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) + if (argp->mask & ~NFS_ACL_MASK) RETURN_STATUS(nfserr_inval); resp->mask = argp->mask; @@ -148,7 +148,7 @@ static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p, if (!p) return 0; args->mask = ntohl(*p++); - if (args->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT) || + if (args->mask & ~NFS_ACL_MASK || !xdr_argsize_check(rqstp, p)) return 0; diff --git a/include/uapi/linux/nfsacl.h b/include/uapi/linux/nfsacl.h index 9bb9771a107f..552726631162 100644 --- a/include/uapi/linux/nfsacl.h +++ b/include/uapi/linux/nfsacl.h @@ -22,6 +22,7 @@ #define NFS_ACLCNT 0x0002 #define NFS_DFACL 0x0004 #define NFS_DFACLCNT 0x0008 +#define NFS_ACL_MASK 0x000f /* Flag for Default ACL entries */ #define NFS_ACL_DEFAULT 0x1000 -- cgit v1.2.3 From 8d20aabe1c76cccac544d9fcc3ad7823d9e98a2d Mon Sep 17 00:00:00 2001 From: Daniel
Borkmann Date: Wed, 15 Jul 2015 14:21:42 +0200 Subject: ebpf: add helper to retrieve net_cls's classid cookie It would be very useful to retrieve the net_cls's classid from an eBPF program to allow for more fine-grained classification; it could be used directly or in conjunction with additional policies. Docker, for example, but also tooling such as cgexec, can easily run applications via net_cls cgroups: cgcreate -g net_cls:/foo echo 42 > foo/net_cls.classid cgexec -g net_cls:foo Thus, the respective classid cookie of foo can then be looked up on the egress path to apply further policies. The helper is designed such that a non-zero return value is the cgroup id. Signed-off-by: Daniel Borkmann Cc: Thomas Graf Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 7 +++++++ net/core/filter.c | 15 +++++++++++++++ 2 files changed, 22 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 29ef6f99e43d..2de87e58b12b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -249,6 +249,13 @@ enum bpf_func_id { * Return: 0 on success */ BPF_FUNC_get_current_comm, + + /** + * bpf_get_cgroup_classid(skb) - retrieve a proc's classid + * @skb: pointer to skb + * Return: classid if != 0 + */ + BPF_FUNC_get_cgroup_classid, __BPF_FUNC_MAX_ID, }; diff --git a/net/core/filter.c b/net/core/filter.c index be3098fb65e4..247450a5e387 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -47,6 +47,7 @@ #include #include #include +#include <net/cls_cgroup.h> /** * sk_filter - run a packet through a socket filter @@ -1424,6 +1425,18 @@ const struct bpf_func_proto bpf_clone_redirect_proto = { .arg3_type = ARG_ANYTHING, }; +static u64 bpf_get_cgroup_classid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return task_get_classid((struct sk_buff *) (unsigned long) r1); +} + +static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { + .func = bpf_get_cgroup_classid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id) { @@ -1461,6 +1474,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_l4_csum_replace_proto; case BPF_FUNC_clone_redirect: return &bpf_clone_redirect_proto; + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_proto; default: return sk_filter_func_proto(func_id); } -- cgit v1.2.3 From 4e10df9a60d96ced321dd2af71da558c6b750078 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 20 Jul 2015 20:34:18 -0700 Subject: bpf: introduce bpf_skb_vlan_push/pop() helpers Allow eBPF programs attached to TC qdiscs to call skb_vlan_push/pop via helper functions. These functions may change skb->data/hlen which are cached by some JITs to improve performance of ld_abs/ld_ind instructions. Therefore JITs need to recognize bpf_skb_vlan_push/pop() calls, re-compute header len and re-cache skb->data/hlen back into cpu registers. Note, skb->data/hlen are not directly accessible from the programs, so any changes to skb->data done either by these helpers or by other TC actions are safe. eBPF JIT supported by three architectures: - arm64 JIT is using bpf_load_pointer() without caching, so it's ok as-is. - x64 JIT re-caches skb->data/hlen unconditionally after vlan_push/pop calls (experiments showed that conditional re-caching is slower). - s390 JIT falls back to interpreter for now when bpf_skb_vlan_push() is present in the program (re-caching is tbd).
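For reference, a sketch of how a TC classifier written in restricted C might invoke the new helpers; the hand-rolled function-pointer declarations mirror the samples/bpf convention of the time, and the section and program names here are assumptions for the example, not part of the patch:

#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <asm/byteorder.h>

/* helper stubs fixed up by the eBPF loader, samples/bpf style */
static int (*bpf_skb_vlan_push)(void *ctx, __u16 vlan_proto, __u16 vlan_tci) =
	(void *) BPF_FUNC_skb_vlan_push;
static int (*bpf_skb_vlan_pop)(void *ctx) =
	(void *) BPF_FUNC_skb_vlan_pop;

__attribute__((section("classifier"), used))
int vlan_retag(struct __sk_buff *skb)
{
	/* strip an existing outer tag, if any ... */
	bpf_skb_vlan_pop(skb);
	/* ... and retag the packet as 802.1Q VLAN 42 */
	bpf_skb_vlan_push(skb, __constant_htons(ETH_P_8021Q), 42);

	return 0;
}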
These helpers allow more scalable handling of vlan from the programs. Instead of creating thousands of vlan netdevs on top of eth0 and attaching TC+ingress+bpf to all of them, the program can be attached to eth0 directly and manipulate vlans as necessary. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- arch/s390/net/bpf_jit_comp.c | 4 +++ arch/x86/net/bpf_jit_comp.c | 80 +++++++++++++++++++++++--------------------- include/linux/bpf.h | 2 ++ include/linux/filter.h | 1 + include/uapi/linux/bpf.h | 2 ++ net/core/filter.c | 48 ++++++++++++++++++++++++++ 6 files changed, 99 insertions(+), 38 deletions(-) (limited to 'include/uapi') diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index fee782acc2ee..79c731e8d178 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -973,6 +973,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i */ const u64 func = (u64)__bpf_call_base + imm; + if (bpf_helper_changes_skb_data((void *)func)) + /* TODO reload skb->data, hlen */ + return -1; + REG_SET_SEEN(BPF_REG_5); jit->seen |= SEEN_FUNC; /* lg %w1,(%l) */ diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 579a8fd74be0..6c335a8fc086 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -315,6 +315,26 @@ static void emit_bpf_tail_call(u8 **pprog) *pprog = prog; } + +static void emit_load_skb_data_hlen(u8 **pprog) +{ + u8 *prog = *pprog; + int cnt = 0; + + /* r9d = skb->len - skb->data_len (headlen) + * r10 = skb->data + */ + /* mov %r9d, off32(%rdi) */ + EMIT3_off32(0x44, 0x8b, 0x8f, offsetof(struct sk_buff, len)); + + /* sub %r9d, off32(%rdi) */ + EMIT3_off32(0x44, 0x2b, 0x8f, offsetof(struct sk_buff, data_len)); + + /* mov %r10, off32(%rdi) */ + EMIT3_off32(0x4c, 0x8b, 0x97, offsetof(struct sk_buff, data)); + *pprog = prog; +} + static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int oldproglen, struct jit_context *ctx) { @@ -329,36 +349,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, emit_prologue(&prog); - if (seen_ld_abs) { - /* r9d : skb->len - skb->data_len (headlen) - * r10 : skb->data - */ - if (is_imm8(offsetof(struct sk_buff, len))) - /* mov %r9d, off8(%rdi) */ - EMIT4(0x44, 0x8b, 0x4f, - offsetof(struct sk_buff, len)); - else - /* mov %r9d, off32(%rdi) */ - EMIT3_off32(0x44, 0x8b, 0x8f, - offsetof(struct sk_buff, len)); - - if (is_imm8(offsetof(struct sk_buff, data_len))) - /* sub %r9d, off8(%rdi) */ - EMIT4(0x44, 0x2b, 0x4f, - offsetof(struct sk_buff, data_len)); - else - EMIT3_off32(0x44, 0x2b, 0x8f, - offsetof(struct sk_buff, data_len)); - - if (is_imm8(offsetof(struct sk_buff, data))) - /* mov %r10, off8(%rdi) */ - EMIT4(0x4c, 0x8b, 0x57, - offsetof(struct sk_buff, data)); - else - /* mov %r10, off32(%rdi) */ - EMIT3_off32(0x4c, 0x8b, 0x97, - offsetof(struct sk_buff, data)); - } + if (seen_ld_abs) + emit_load_skb_data_hlen(&prog); for (i = 0; i < insn_cnt; i++, insn++) { const s32 imm32 = insn->imm; @@ -367,6 +359,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 b1 = 0, b2 = 0, b3 = 0; s64 jmp_offset; u8 jmp_cond; + bool reload_skb_data; int ilen; u8 *func; @@ -818,12 +811,18 @@ xadd: if (is_imm8(insn->off)) func = (u8 *) __bpf_call_base + imm32; jmp_offset = func - (image + addrs[i]); if (seen_ld_abs) { - EMIT2(0x41, 0x52); /* push %r10 */ - EMIT2(0x41, 0x51); /* push %r9 */ - /* need to adjust jmp offset, since - * pop %r9, pop %r10 take 4 bytes after call insn - */ - 
jmp_offset += 4; + reload_skb_data = bpf_helper_changes_skb_data(func); + if (reload_skb_data) { + EMIT1(0x57); /* push %rdi */ + jmp_offset += 22; /* pop, mov, sub, mov */ + } else { + EMIT2(0x41, 0x52); /* push %r10 */ + EMIT2(0x41, 0x51); /* push %r9 */ + /* need to adjust jmp offset, since + * pop %r9, pop %r10 take 4 bytes after call insn + */ + jmp_offset += 4; + } } if (!imm32 || !is_simm32(jmp_offset)) { pr_err("unsupported bpf func %d addr %p image %p\n", @@ -832,8 +831,13 @@ xadd: if (is_imm8(insn->off)) } EMIT1_off32(0xE8, jmp_offset); if (seen_ld_abs) { - EMIT2(0x41, 0x59); /* pop %r9 */ - EMIT2(0x41, 0x5A); /* pop %r10 */ + if (reload_skb_data) { + EMIT1(0x5F); /* pop %rdi */ + emit_load_skb_data_hlen(&prog); + } else { + EMIT2(0x41, 0x59); /* pop %r9 */ + EMIT2(0x41, 0x5A); /* pop %r10 */ + } } break; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4383476a0d48..139d6d2e123f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -192,5 +192,7 @@ extern const struct bpf_func_proto bpf_ktime_get_ns_proto; extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto; extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; +extern const struct bpf_func_proto bpf_skb_vlan_push_proto; +extern const struct bpf_func_proto bpf_skb_vlan_pop_proto; #endif /* _LINUX_BPF_H */ diff --git a/include/linux/filter.h b/include/linux/filter.h index 17724f6ea983..69d00555ce35 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -411,6 +411,7 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); void bpf_int_jit_compile(struct bpf_prog *fp); +bool bpf_helper_changes_skb_data(void *func); #ifdef CONFIG_BPF_JIT typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2de87e58b12b..2f6c83d714e9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -256,6 +256,8 @@ enum bpf_func_id { * Return: classid if != 0 */ BPF_FUNC_get_cgroup_classid, + BPF_FUNC_skb_vlan_push, /* bpf_skb_vlan_push(skb, vlan_proto, vlan_tci) */ + BPF_FUNC_skb_vlan_pop, /* bpf_skb_vlan_pop(skb) */ __BPF_FUNC_MAX_ID, }; diff --git a/net/core/filter.c b/net/core/filter.c index 247450a5e387..50338071fac4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1437,6 +1437,50 @@ static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + __be16 vlan_proto = (__force __be16) r2; + + if (unlikely(vlan_proto != htons(ETH_P_8021Q) && + vlan_proto != htons(ETH_P_8021AD))) + vlan_proto = htons(ETH_P_8021Q); + + return skb_vlan_push(skb, vlan_proto, vlan_tci); +} + +const struct bpf_func_proto bpf_skb_vlan_push_proto = { + .func = bpf_skb_vlan_push, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + + return skb_vlan_pop(skb); +} + +const struct bpf_func_proto bpf_skb_vlan_pop_proto = { + .func = bpf_skb_vlan_pop, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +bool bpf_helper_changes_skb_data(void *func) +{ + if (func == 
bpf_skb_vlan_push) + return true; + if (func == bpf_skb_vlan_pop) + return true; + return false; +} + static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id) { @@ -1476,6 +1520,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_clone_redirect_proto; case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_proto; + case BPF_FUNC_skb_vlan_push: + return &bpf_skb_vlan_push_proto; + case BPF_FUNC_skb_vlan_pop: + return &bpf_skb_vlan_pop_proto; default: return sk_filter_func_proto(func_id); } -- cgit v1.2.3 From 8ab30c1538b14424015e45063c41d509b24c1dea Mon Sep 17 00:00:00 2001 From: Alex Bennée Date: Tue, 7 Jul 2015 17:29:53 +0100 Subject: KVM: add comments for kvm_debug_exit_arch struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bring into line with the comments for the other structures and their KVM_EXIT_* cases. Also update api.txt to reflect use in kvm_run documentation. Signed-off-by: Alex Bennée Reviewed-by: David Hildenbrand Reviewed-by: Andrew Jones Acked-by: Christoffer Dall Signed-off-by: Marc Zyngier --- Documentation/virtual/kvm/api.txt | 4 +++- include/uapi/linux/kvm.h | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index a7926a90156f..9f746eab333d 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3111,11 +3111,13 @@ data_offset describes where the data is located (KVM_EXIT_IO_OUT) or where kvm expects application code to place the data for the next KVM_RUN invocation (KVM_EXIT_IO_IN). Data format is a packed array. + /* KVM_EXIT_DEBUG */ struct { struct kvm_debug_exit_arch arch; } debug; -Unused. +If the exit_reason is KVM_EXIT_DEBUG, then a vcpu is processing a debug event +for which architecture specific information is returned. /* KVM_EXIT_MMIO */ struct { diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 716ad4ae4d4b..4ab3c6a8d563 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -237,6 +237,7 @@ struct kvm_run { __u32 count; __u64 data_offset; /* relative to kvm_run start */ } io; + /* KVM_EXIT_DEBUG */ struct { struct kvm_debug_exit_arch arch; } debug; @@ -285,6 +286,7 @@ struct kvm_run { __u32 data; __u8 is_write; } dcr; + /* KVM_EXIT_INTERNAL_ERROR */ struct { __u32 suberror; /* Available with KVM_CAP_INTERNAL_ERROR_DATA: */ @@ -295,6 +297,7 @@ struct kvm_run { struct { __u64 gprs[32]; } osi; + /* KVM_EXIT_PAPR_HCALL */ struct { __u64 nr; __u64 ret; -- cgit v1.2.3 From 5540546bc93b49f98a0466fe3f96615286c76574 Mon Sep 17 00:00:00 2001 From: Alex Bennée Date: Tue, 7 Jul 2015 17:30:01 +0100 Subject: KVM: arm64: guest debug, HW assisted debug support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds support for userspace to control the HW debug registers for guest debug. In the debug ioctl we copy the IMPDEF registers into a new register set called host_debug_state. We use the recently introduced vcpu parameter debug_ptr to select which register set is copied into the real registers when a world switch occurs. I've made some helper functions from hw_breakpoint.c more widely available for re-use. As with single step, we need to tweak the guest registers to enable the exceptions, so we need to save and restore those bits.
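To sketch the intended userspace flow (illustrative only, not from this patch): the dbg_bvr/dbg_bcr array names below follow the arm64 kvm_guest_debug_arch layout from this series, and the breakpoint control encoding is an assumed example value rather than a definitive one:

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Arm one hardware breakpoint on an open vcpu file descriptor. */
static int set_hw_breakpoint(int vcpu_fd, __u64 addr)
{
	struct kvm_guest_debug dbg = {
		.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW,
	};

	dbg.arch.dbg_bvr[0] = addr;	/* breakpoint value register: address */
	dbg.arch.dbg_bcr[0] = 0x1e5;	/* assumed encoding: enabled, EL1/EL0 */

	return ioctl(vcpu_fd, KVM_SET_GUEST_DEBUG, &dbg);
}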
Two new capabilities have been added to the KVM_CHECK_EXTENSION ioctl to allow userspace to query the number of hardware breakpoints and watchpoints available on the host hardware. Signed-off-by: Alex Bennée Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- include/uapi/linux/kvm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 4ab3c6a8d563..a1e08e7bbf20 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -820,6 +820,8 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_DISABLE_QUIRKS 116 #define KVM_CAP_X86_SMM 117 #define KVM_CAP_MULTI_ADDRESS_SPACE 118 +#define KVM_CAP_GUEST_DEBUG_HW_BPS 119 +#define KVM_CAP_GUEST_DEBUG_HW_WPS 120 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From a0d9a8604f29ee3340126ec3f90c9421f930aa50 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 21 Jul 2015 10:43:45 +0200 Subject: rtnetlink: introduce new RTA_ENCAP_TYPE and RTA_ENCAP attributes This patch introduces two new RTA attributes to attach encap data to fib routes. Example iproute2 command to attach mpls encap data to ipv4 routes: $ ip route add 10.1.1.0/30 encap mpls 200 via inet 10.1.1.1 dev swp1 Signed-off-by: Roopa Prabhu Suggested-by: Eric W. Biederman Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index fdd8f07f1d34..0d3d3cc43356 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -308,6 +308,8 @@ enum rtattr_type_t { RTA_VIA, RTA_NEWDST, RTA_PREF, + RTA_ENCAP_TYPE, + RTA_ENCAP, __RTA_MAX }; -- cgit v1.2.3 From 499a24256862714539e902c0499b67da2bb3ab72 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 21 Jul 2015 10:43:46 +0200 Subject: lwtunnel: infrastructure for handling light weight tunnels like mpls Provides infrastructure to parse/dump/store encap information for lightweight tunnels such as MPLS. Encap information for such tunnels is associated with fib routes. This infrastructure is based on previous suggestions from Eric Biederman to follow the xfrm infrastructure. Signed-off-by: Roopa Prabhu Signed-off-by: David S.
Miller --- include/linux/lwtunnel.h | 6 ++ include/net/lwtunnel.h | 132 +++++++++++++++++++++++++++++++ include/uapi/linux/lwtunnel.h | 15 ++++ net/Kconfig | 7 ++ net/core/Makefile | 1 + net/core/lwtunnel.c | 179 ++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 340 insertions(+) create mode 100644 include/linux/lwtunnel.h create mode 100644 include/net/lwtunnel.h create mode 100644 include/uapi/linux/lwtunnel.h create mode 100644 net/core/lwtunnel.c (limited to 'include/uapi') diff --git a/include/linux/lwtunnel.h b/include/linux/lwtunnel.h new file mode 100644 index 000000000000..97f32f8b4ae1 --- /dev/null +++ b/include/linux/lwtunnel.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_LWTUNNEL_H_ +#define _LINUX_LWTUNNEL_H_ + +#include + +#endif /* _LINUX_LWTUNNEL_H_ */ diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h new file mode 100644 index 000000000000..df24b3611ff4 --- /dev/null +++ b/include/net/lwtunnel.h @@ -0,0 +1,132 @@ +#ifndef __NET_LWTUNNEL_H +#define __NET_LWTUNNEL_H 1 + +#include +#include +#include +#include +#include + +#define LWTUNNEL_HASH_BITS 7 +#define LWTUNNEL_HASH_SIZE (1 << LWTUNNEL_HASH_BITS) + +/* lw tunnel state flags */ +#define LWTUNNEL_STATE_OUTPUT_REDIRECT 0x1 + +struct lwtunnel_state { + __u16 type; + __u16 flags; + atomic_t refcnt; + int len; + __u8 data[0]; +}; + +struct lwtunnel_encap_ops { + int (*build_state)(struct net_device *dev, struct nlattr *encap, + struct lwtunnel_state **ts); + int (*output)(struct sock *sk, struct sk_buff *skb); + int (*fill_encap)(struct sk_buff *skb, + struct lwtunnel_state *lwtstate); + int (*get_encap_size)(struct lwtunnel_state *lwtstate); + int (*cmp_encap)(struct lwtunnel_state *a, struct lwtunnel_state *b); +}; + +extern const struct lwtunnel_encap_ops __rcu * + lwtun_encaps[LWTUNNEL_ENCAP_MAX+1]; + +#ifdef CONFIG_LWTUNNEL +static inline void lwtunnel_state_get(struct lwtunnel_state *lws) +{ + atomic_inc(&lws->refcnt); +} + +static inline void lwtunnel_state_put(struct lwtunnel_state *lws) +{ + if (!lws) + return; + + if (atomic_dec_and_test(&lws->refcnt)) + kfree(lws); +} + +static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate) +{ + if (lwtstate && (lwtstate->flags & LWTUNNEL_STATE_OUTPUT_REDIRECT)) + return true; + + return false; +} + +int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, + unsigned int num); +int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, + unsigned int num); +int lwtunnel_build_state(struct net_device *dev, u16 encap_type, + struct nlattr *encap, + struct lwtunnel_state **lws); +int lwtunnel_fill_encap(struct sk_buff *skb, + struct lwtunnel_state *lwtstate); +int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate); +struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len); +int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b); + +#else + +static inline void lwtunnel_state_get(struct lwtunnel_state *lws) +{ +} + +static inline void lwtunnel_state_put(struct lwtunnel_state *lws) +{ +} + +static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate) +{ + return false; +} + +static inline int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, + unsigned int num) +{ + return -EOPNOTSUPP; + +} + +static inline int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, + unsigned int num) +{ + return -EOPNOTSUPP; +} + +static inline int lwtunnel_build_state(struct net_device *dev, u16 encap_type, + struct nlattr *encap, + struct lwtunnel_state **lws) +{ + return -EOPNOTSUPP; +} + +static 
inline int lwtunnel_fill_encap(struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + return 0; +} + +static inline int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate) +{ + return 0; +} + +static inline struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len) +{ + return NULL; +} + +static inline int lwtunnel_cmp_encap(struct lwtunnel_state *a, + struct lwtunnel_state *b) +{ + return 0; +} + +#endif + +#endif /* __NET_LWTUNNEL_H */ diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h new file mode 100644 index 000000000000..aa611d931a31 --- /dev/null +++ b/include/uapi/linux/lwtunnel.h @@ -0,0 +1,15 @@ +#ifndef _UAPI_LWTUNNEL_H_ +#define _UAPI_LWTUNNEL_H_ + +#include + +enum lwtunnel_encap_types { + LWTUNNEL_ENCAP_NONE, + LWTUNNEL_ENCAP_MPLS, + __LWTUNNEL_ENCAP_MAX, +}; + +#define LWTUNNEL_ENCAP_MAX (__LWTUNNEL_ENCAP_MAX - 1) + + +#endif /* _UAPI_LWTUNNEL_H_ */ diff --git a/net/Kconfig b/net/Kconfig index 57a7c5af3175..7021c1bf44d6 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -374,6 +374,13 @@ source "net/caif/Kconfig" source "net/ceph/Kconfig" source "net/nfc/Kconfig" +config LWTUNNEL + bool "Network light weight tunnels" + ---help--- + This feature provides an infrastructure to support light weight + tunnels like mpls. There is no netdevice associated with a light + weight tunnel endpoint. Tunnel encapsulation parameters are stored + with light weight tunnel state associated with fib routes. endif # if NET diff --git a/net/core/Makefile b/net/core/Makefile index fec0856dd6c0..086b01fbe1bd 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -23,3 +23,4 @@ obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o +obj-$(CONFIG_LWTUNNEL) += lwtunnel.o diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c new file mode 100644 index 000000000000..d7ae3a235b4b --- /dev/null +++ b/net/core/lwtunnel.c @@ -0,0 +1,179 @@ +/* + * lwtunnel Infrastructure for light weight tunnels like mpls + * + * Authors: Roopa Prabhu, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +struct lwtunnel_state *lwtunnel_state_alloc(int encap_len) +{ + struct lwtunnel_state *lws; + + lws = kzalloc(sizeof(*lws) + encap_len, GFP_ATOMIC); + + return lws; +} +EXPORT_SYMBOL(lwtunnel_state_alloc); + +const struct lwtunnel_encap_ops __rcu * + lwtun_encaps[LWTUNNEL_ENCAP_MAX + 1] __read_mostly; + +int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *ops, + unsigned int num) +{ + if (num > LWTUNNEL_ENCAP_MAX) + return -ERANGE; + + return !cmpxchg((const struct lwtunnel_encap_ops **) + &lwtun_encaps[num], + NULL, ops) ? 0 : -1; +} +EXPORT_SYMBOL(lwtunnel_encap_add_ops); + +int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops, + unsigned int encap_type) +{ + int ret; + + if (encap_type == LWTUNNEL_ENCAP_NONE || + encap_type > LWTUNNEL_ENCAP_MAX) + return -ERANGE; + + ret = (cmpxchg((const struct lwtunnel_encap_ops **) + &lwtun_encaps[encap_type], + ops, NULL) == ops) ? 
0 : -1; + + synchronize_net(); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_encap_del_ops); + +int lwtunnel_build_state(struct net_device *dev, u16 encap_type, + struct nlattr *encap, struct lwtunnel_state **lws) +{ + const struct lwtunnel_encap_ops *ops; + int ret = -EINVAL; + + if (encap_type == LWTUNNEL_ENCAP_NONE || + encap_type > LWTUNNEL_ENCAP_MAX) + return ret; + + ret = -EOPNOTSUPP; + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[encap_type]); + if (likely(ops && ops->build_state)) + ret = ops->build_state(dev, encap, lws); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_build_state); + +int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate) +{ + const struct lwtunnel_encap_ops *ops; + struct nlattr *nest; + int ret = -EINVAL; + + if (!lwtstate) + return 0; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) + return 0; + + ret = -EOPNOTSUPP; + nest = nla_nest_start(skb, RTA_ENCAP); + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->fill_encap)) + ret = ops->fill_encap(skb, lwtstate); + rcu_read_unlock(); + + if (ret) + goto nla_put_failure; + nla_nest_end(skb, nest); + ret = nla_put_u16(skb, RTA_ENCAP_TYPE, lwtstate->type); + if (ret) + goto nla_put_failure; + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + + return (ret == -EOPNOTSUPP ? 0 : ret); +} +EXPORT_SYMBOL(lwtunnel_fill_encap); + +int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate) +{ + const struct lwtunnel_encap_ops *ops; + int ret = 0; + + if (!lwtstate) + return 0; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) + return 0; + + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->get_encap_size)) + ret = nla_total_size(ops->get_encap_size(lwtstate)); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_get_encap_size); + +int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + const struct lwtunnel_encap_ops *ops; + int ret = 0; + + if (!a && !b) + return 0; + + if (!a || !b) + return 1; + + if (a->type != b->type) + return 1; + + if (a->type == LWTUNNEL_ENCAP_NONE || + a->type > LWTUNNEL_ENCAP_MAX) + return 0; + + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[a->type]); + if (likely(ops && ops->cmp_encap)) + ret = ops->cmp_encap(a, b); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_cmp_encap); -- cgit v1.2.3 From e3e4712ec0961ed586a8db340bd994c4ad7f5dba Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 21 Jul 2015 10:43:53 +0200 Subject: mpls: ip tunnel support This implementation uses lwtunnel infrastructure to register hooks for mpls tunnel encaps. It picks cues from iptunnel_encaps infrastructure and previous mpls iptunnel RFC patches from Eric W. Biederman and Robert Shearman Signed-off-by: Roopa Prabhu Signed-off-by: David S. 
Miller --- include/linux/mpls_iptunnel.h | 6 + include/net/mpls_iptunnel.h | 29 +++++ include/uapi/linux/mpls_iptunnel.h | 28 +++++ net/mpls/Kconfig | 8 +- net/mpls/Makefile | 1 + net/mpls/mpls_iptunnel.c | 233 +++++++++++++++++++++++++++++++++++++ 6 files changed, 304 insertions(+), 1 deletion(-) create mode 100644 include/linux/mpls_iptunnel.h create mode 100644 include/net/mpls_iptunnel.h create mode 100644 include/uapi/linux/mpls_iptunnel.h create mode 100644 net/mpls/mpls_iptunnel.c (limited to 'include/uapi') diff --git a/include/linux/mpls_iptunnel.h b/include/linux/mpls_iptunnel.h new file mode 100644 index 000000000000..ef29eb2d6dfd --- /dev/null +++ b/include/linux/mpls_iptunnel.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_MPLS_IPTUNNEL_H +#define _LINUX_MPLS_IPTUNNEL_H + +#include + +#endif /* _LINUX_MPLS_IPTUNNEL_H */ diff --git a/include/net/mpls_iptunnel.h b/include/net/mpls_iptunnel.h new file mode 100644 index 000000000000..4757997f76ed --- /dev/null +++ b/include/net/mpls_iptunnel.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2015 Cumulus Networks, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef _NET_MPLS_IPTUNNEL_H +#define _NET_MPLS_IPTUNNEL_H 1 + +#define MAX_NEW_LABELS 2 + +struct mpls_iptunnel_encap { + u32 label[MAX_NEW_LABELS]; + u32 labels; +}; + +static inline struct mpls_iptunnel_encap *mpls_lwtunnel_encap(struct lwtunnel_state *lwtstate) +{ + return (struct mpls_iptunnel_encap *)lwtstate->data; +} + +#endif diff --git a/include/uapi/linux/mpls_iptunnel.h b/include/uapi/linux/mpls_iptunnel.h new file mode 100644 index 000000000000..d80a0498f77e --- /dev/null +++ b/include/uapi/linux/mpls_iptunnel.h @@ -0,0 +1,28 @@ +/* + * mpls tunnel api + * + * Authors: + * Roopa Prabhu + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _UAPI_LINUX_MPLS_IPTUNNEL_H +#define _UAPI_LINUX_MPLS_IPTUNNEL_H + +/* MPLS tunnel attributes + * [RTA_ENCAP] = { + * [MPLS_IPTUNNEL_DST] + * } + */ +enum { + MPLS_IPTUNNEL_UNSPEC, + MPLS_IPTUNNEL_DST, + __MPLS_IPTUNNEL_MAX, +}; +#define MPLS_IPTUNNEL_MAX (__MPLS_IPTUNNEL_MAX - 1) + +#endif /* _UAPI_LINUX_MPLS_IPTUNNEL_H */ diff --git a/net/mpls/Kconfig b/net/mpls/Kconfig index 17bde799c854..5c467ef97311 100644 --- a/net/mpls/Kconfig +++ b/net/mpls/Kconfig @@ -24,7 +24,13 @@ config NET_MPLS_GSO config MPLS_ROUTING tristate "MPLS: routing support" - help + ---help--- Add support for forwarding of mpls packets. +config MPLS_IPTUNNEL + tristate "MPLS: IP over MPLS tunnel support" + depends on LWTUNNEL && MPLS_ROUTING + ---help--- + mpls ip tunnel support. 
+ endif # MPLS diff --git a/net/mpls/Makefile b/net/mpls/Makefile index 65bbe68c72e6..9ca923625016 100644 --- a/net/mpls/Makefile +++ b/net/mpls/Makefile @@ -3,5 +3,6 @@ # obj-$(CONFIG_NET_MPLS_GSO) += mpls_gso.o obj-$(CONFIG_MPLS_ROUTING) += mpls_router.o +obj-$(CONFIG_MPLS_IPTUNNEL) += mpls_iptunnel.o mpls_router-y := af_mpls.o diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c new file mode 100644 index 000000000000..eea096f21ba5 --- /dev/null +++ b/net/mpls/mpls_iptunnel.c @@ -0,0 +1,233 @@ +/* + * mpls tunnels An implementation mpls tunnels using the light weight tunnel + * infrastructure + * + * Authors: Roopa Prabhu, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static const struct nla_policy mpls_iptunnel_policy[MPLS_IPTUNNEL_MAX + 1] = { + [MPLS_IPTUNNEL_DST] = { .type = NLA_U32 }, +}; + +static unsigned int mpls_encap_size(struct mpls_iptunnel_encap *en) +{ + /* The size of the layer 2.5 labels to be added for this route */ + return en->labels * sizeof(struct mpls_shim_hdr); +} + +int mpls_output(struct sock *sk, struct sk_buff *skb) +{ + struct mpls_iptunnel_encap *tun_encap_info; + struct mpls_shim_hdr *hdr; + struct net_device *out_dev; + unsigned int hh_len; + unsigned int new_header_size; + unsigned int mtu; + struct dst_entry *dst = skb_dst(skb); + struct rtable *rt = NULL; + struct rt6_info *rt6 = NULL; + struct lwtunnel_state *lwtstate = NULL; + int err = 0; + bool bos; + int i; + unsigned int ttl; + + /* Obtain the ttl */ + if (skb->protocol == htons(ETH_P_IP)) { + ttl = ip_hdr(skb)->ttl; + rt = (struct rtable *)dst; + lwtstate = rt->rt_lwtstate; + } else if (skb->protocol == htons(ETH_P_IPV6)) { + ttl = ipv6_hdr(skb)->hop_limit; + rt6 = (struct rt6_info *)dst; + lwtstate = rt6->rt6i_lwtstate; + } else { + goto drop; + } + + skb_orphan(skb); + + /* Find the output device */ + out_dev = rcu_dereference(dst->dev); + if (!mpls_output_possible(out_dev) || + !lwtstate || skb_warn_if_lro(skb)) + goto drop; + + skb_forward_csum(skb); + + tun_encap_info = mpls_lwtunnel_encap(lwtstate); + + /* Verify the destination can hold the packet */ + new_header_size = mpls_encap_size(tun_encap_info); + mtu = mpls_dev_mtu(out_dev); + if (mpls_pkt_too_big(skb, mtu - new_header_size)) + goto drop; + + hh_len = LL_RESERVED_SPACE(out_dev); + if (!out_dev->header_ops) + hh_len = 0; + + /* Ensure there is enough space for the headers in the skb */ + if (skb_cow(skb, hh_len + new_header_size)) + goto drop; + + skb_push(skb, new_header_size); + skb_reset_network_header(skb); + + skb->dev = out_dev; + skb->protocol = htons(ETH_P_MPLS_UC); + + /* Push the new labels */ + hdr = mpls_hdr(skb); + bos = true; + for (i = tun_encap_info->labels - 1; i >= 0; i--) { + hdr[i] = mpls_entry_encode(tun_encap_info->label[i], + ttl, 0, bos); + bos = false; + } + + if (rt) + err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gateway, + skb); + else if (rt6) + err = neigh_xmit(NEIGH_ND_TABLE, out_dev, &rt6->rt6i_gateway, + skb); + if (err) + net_dbg_ratelimited("%s: packet transmission failed: %d\n", + __func__, err); + + return 0; + +drop: + kfree_skb(skb); + return -EINVAL; +} + +static int 
mpls_build_state(struct net_device *dev, struct nlattr *nla, + struct lwtunnel_state **ts) +{ + struct mpls_iptunnel_encap *tun_encap_info; + struct nlattr *tb[MPLS_IPTUNNEL_MAX + 1]; + struct lwtunnel_state *newts; + int tun_encap_info_len; + int ret; + + ret = nla_parse_nested(tb, MPLS_IPTUNNEL_MAX, nla, + mpls_iptunnel_policy); + if (ret < 0) + return ret; + + if (!tb[MPLS_IPTUNNEL_DST]) + return -EINVAL; + + tun_encap_info_len = sizeof(*tun_encap_info); + + newts = lwtunnel_state_alloc(tun_encap_info_len); + if (!newts) + return -ENOMEM; + + newts->len = tun_encap_info_len; + tun_encap_info = mpls_lwtunnel_encap(newts); + ret = nla_get_labels(tb[MPLS_IPTUNNEL_DST], MAX_NEW_LABELS, + &tun_encap_info->labels, tun_encap_info->label); + if (ret) + goto errout; + newts->type = LWTUNNEL_ENCAP_MPLS; + newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; + + *ts = newts; + + return 0; + +errout: + kfree(newts); + *ts = NULL; + + return ret; +} + +static int mpls_fill_encap_info(struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + struct mpls_iptunnel_encap *tun_encap_info; + + tun_encap_info = mpls_lwtunnel_encap(lwtstate); + + if (nla_put_labels(skb, MPLS_IPTUNNEL_DST, tun_encap_info->labels, + tun_encap_info->label)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static int mpls_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + struct mpls_iptunnel_encap *tun_encap_info; + + tun_encap_info = mpls_lwtunnel_encap(lwtstate); + + return nla_total_size(tun_encap_info->labels * 4); +} + +static int mpls_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + struct mpls_iptunnel_encap *a_hdr = mpls_lwtunnel_encap(a); + struct mpls_iptunnel_encap *b_hdr = mpls_lwtunnel_encap(b); + int l; + + if (a_hdr->labels != b_hdr->labels) + return 1; + + for (l = 0; l < MAX_NEW_LABELS; l++) + if (a_hdr->label[l] != b_hdr->label[l]) + return 1; + return 0; +} + +static const struct lwtunnel_encap_ops mpls_iptun_ops = { + .build_state = mpls_build_state, + .output = mpls_output, + .fill_encap = mpls_fill_encap_info, + .get_encap_size = mpls_encap_nlsize, + .cmp_encap = mpls_encap_cmp, +}; + +static int __init mpls_iptunnel_init(void) +{ + return lwtunnel_encap_add_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS); +} +module_init(mpls_iptunnel_init); + +static void __exit mpls_iptunnel_exit(void) +{ + lwtunnel_encap_del_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS); +} +module_exit(mpls_iptunnel_exit); + +MODULE_DESCRIPTION("MultiProtocol Label Switching IP Tunnels"); +MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From 1d8fff907342d2339796dbd27ea47d0e76a6a2d0 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 21 Jul 2015 10:43:54 +0200 Subject: ip_tunnel: Make ovs_tunnel_info and ovs_key_ipv4_tunnel generic Rename the tunnel metadata data structures currently internal to OVS and make them generic for use by all IP tunnels. Both structures are kernel internal and will stay that way. Their members are exposed to user space through individual Netlink attributes by OVS. It will therefore be possible to extend/modify these structures without affecting user ABI. Signed-off-by: Thomas Graf Signed-off-by: David S. 
Miller --- include/net/ip_tunnels.h | 63 +++++++++++++++++++++++++++++++++ include/uapi/linux/openvswitch.h | 2 +- net/openvswitch/actions.c | 2 +- net/openvswitch/datapath.h | 5 +-- net/openvswitch/flow.c | 4 +-- net/openvswitch/flow.h | 76 ++-------------------------------------- net/openvswitch/flow_netlink.c | 16 ++++----- net/openvswitch/flow_netlink.h | 2 +- net/openvswitch/vport-geneve.c | 17 +++++---- net/openvswitch/vport-gre.c | 16 ++++----- net/openvswitch/vport-vxlan.c | 18 +++++----- net/openvswitch/vport.c | 30 ++++++++-------- net/openvswitch/vport.h | 12 +++---- 13 files changed, 128 insertions(+), 135 deletions(-) (limited to 'include/uapi') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index d8214cb88bbc..6b9d559ce5f5 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -22,6 +22,28 @@ /* Keep error state on tunnel for 30 sec */ #define IPTUNNEL_ERR_TIMEO (30*HZ) +/* Used to memset ip_tunnel padding. */ +#define IP_TUNNEL_KEY_SIZE \ + (offsetof(struct ip_tunnel_key, tp_dst) + \ + FIELD_SIZEOF(struct ip_tunnel_key, tp_dst)) + +struct ip_tunnel_key { + __be64 tun_id; + __be32 ipv4_src; + __be32 ipv4_dst; + __be16 tun_flags; + __u8 ipv4_tos; + __u8 ipv4_ttl; + __be16 tp_src; + __be16 tp_dst; +} __packed __aligned(4); /* Minimize padding. */ + +struct ip_tunnel_info { + struct ip_tunnel_key key; + const void *options; + u8 options_len; +}; + /* 6rd prefix/relay information */ #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd_parm { @@ -136,6 +158,47 @@ int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *op, int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op, unsigned int num); +static inline void __ip_tunnel_info_init(struct ip_tunnel_info *tun_info, + __be32 saddr, __be32 daddr, + u8 tos, u8 ttl, + __be16 tp_src, __be16 tp_dst, + __be64 tun_id, __be16 tun_flags, + const void *opts, u8 opts_len) +{ + tun_info->key.tun_id = tun_id; + tun_info->key.ipv4_src = saddr; + tun_info->key.ipv4_dst = daddr; + tun_info->key.ipv4_tos = tos; + tun_info->key.ipv4_ttl = ttl; + tun_info->key.tun_flags = tun_flags; + + /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of + * the upper tunnel are used. + * E.g: GRE over IPSEC, the tp_src and tp_port are zero. + */ + tun_info->key.tp_src = tp_src; + tun_info->key.tp_dst = tp_dst; + + /* Clear struct padding. */ + if (sizeof(tun_info->key) != IP_TUNNEL_KEY_SIZE) + memset((unsigned char *)&tun_info->key + IP_TUNNEL_KEY_SIZE, + 0, sizeof(tun_info->key) - IP_TUNNEL_KEY_SIZE); + + tun_info->options = opts; + tun_info->options_len = opts_len; +} + +static inline void ip_tunnel_info_init(struct ip_tunnel_info *tun_info, + const struct iphdr *iph, + __be16 tp_src, __be16 tp_dst, + __be64 tun_id, __be16 tun_flags, + const void *opts, u8 opts_len) +{ + __ip_tunnel_info_init(tun_info, iph->saddr, iph->daddr, + iph->tos, iph->ttl, tp_src, tp_dst, + tun_id, tun_flags, opts, opts_len); +} + #ifdef CONFIG_INET int ip_tunnel_init(struct net_device *dev); diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 1dab77601c21..d6b885460187 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -321,7 +321,7 @@ enum ovs_key_attr { * the accepted length of the array. 
*/ #ifdef __KERNEL__ - OVS_KEY_ATTR_TUNNEL_INFO, /* struct ovs_tunnel_info */ + OVS_KEY_ATTR_TUNNEL_INFO, /* struct ip_tunnel_info */ #endif __OVS_KEY_ATTR_MAX }; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 8a8c0b8b4f63..27c1687cfd92 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -611,7 +611,7 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, const struct nlattr *attr, const struct nlattr *actions, int actions_len) { - struct ovs_tunnel_info info; + struct ip_tunnel_info info; struct dp_upcall_info upcall; const struct nlattr *a; int rem; diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index cd691e935e08..6b28c5cedb23 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -25,6 +25,7 @@ #include #include #include +#include #include "flow.h" #include "flow_table.h" @@ -98,7 +99,7 @@ struct datapath { * when a packet is received by OVS. */ struct ovs_skb_cb { - struct ovs_tunnel_info *egress_tun_info; + struct ip_tunnel_info *egress_tun_info; struct vport *input_vport; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) @@ -114,7 +115,7 @@ struct ovs_skb_cb { * @egress_tun_info: If nonnull, becomes %OVS_PACKET_ATTR_EGRESS_TUN_KEY. */ struct dp_upcall_info { - const struct ovs_tunnel_info *egress_tun_info; + const struct ip_tunnel_info *egress_tun_info; const struct nlattr *userdata; const struct nlattr *actions; int actions_len; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index bc7b0aba994a..8db22ef73626 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -682,12 +682,12 @@ int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key) return key_extract(skb, key); } -int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info, +int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key) { /* Extract metadata from packet. */ if (tun_info) { - memcpy(&key->tun_key, &tun_info->tunnel, sizeof(key->tun_key)); + memcpy(&key->tun_key, &tun_info->key, sizeof(key->tun_key)); if (tun_info->options) { BUILD_BUG_ON((1 << (sizeof(tun_info->options_len) * diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index a076e445ccc2..cadc6c5c3545 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -32,31 +32,10 @@ #include #include #include +#include struct sk_buff; -/* Used to memset ovs_key_ipv4_tunnel padding. */ -#define OVS_TUNNEL_KEY_SIZE \ - (offsetof(struct ovs_key_ipv4_tunnel, tp_dst) + \ - FIELD_SIZEOF(struct ovs_key_ipv4_tunnel, tp_dst)) - -struct ovs_key_ipv4_tunnel { - __be64 tun_id; - __be32 ipv4_src; - __be32 ipv4_dst; - __be16 tun_flags; - u8 ipv4_tos; - u8 ipv4_ttl; - __be16 tp_src; - __be16 tp_dst; -} __packed __aligned(4); /* Minimize padding. */ - -struct ovs_tunnel_info { - struct ovs_key_ipv4_tunnel tunnel; - const void *options; - u8 options_len; -}; - /* Store options at the end of the array if they are less than the * maximum size. This allows us to get the benefits of variable length * matching for small options. 
@@ -66,55 +45,6 @@ struct ovs_tunnel_info { #define TUN_METADATA_OPTS(flow_key, opt_len) \ ((void *)((flow_key)->tun_opts + TUN_METADATA_OFFSET(opt_len))) -static inline void __ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, - __be32 saddr, __be32 daddr, - u8 tos, u8 ttl, - __be16 tp_src, - __be16 tp_dst, - __be64 tun_id, - __be16 tun_flags, - const void *opts, - u8 opts_len) -{ - tun_info->tunnel.tun_id = tun_id; - tun_info->tunnel.ipv4_src = saddr; - tun_info->tunnel.ipv4_dst = daddr; - tun_info->tunnel.ipv4_tos = tos; - tun_info->tunnel.ipv4_ttl = ttl; - tun_info->tunnel.tun_flags = tun_flags; - - /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of - * the upper tunnel are used. - * E.g: GRE over IPSEC, the tp_src and tp_port are zero. - */ - tun_info->tunnel.tp_src = tp_src; - tun_info->tunnel.tp_dst = tp_dst; - - /* Clear struct padding. */ - if (sizeof(tun_info->tunnel) != OVS_TUNNEL_KEY_SIZE) - memset((unsigned char *)&tun_info->tunnel + OVS_TUNNEL_KEY_SIZE, - 0, sizeof(tun_info->tunnel) - OVS_TUNNEL_KEY_SIZE); - - tun_info->options = opts; - tun_info->options_len = opts_len; -} - -static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, - const struct iphdr *iph, - __be16 tp_src, - __be16 tp_dst, - __be64 tun_id, - __be16 tun_flags, - const void *opts, - u8 opts_len) -{ - __ovs_flow_tun_info_init(tun_info, iph->saddr, iph->daddr, - iph->tos, iph->ttl, - tp_src, tp_dst, - tun_id, tun_flags, - opts, opts_len); -} - #define OVS_SW_FLOW_KEY_METADATA_SIZE \ (offsetof(struct sw_flow_key, recirc_id) + \ FIELD_SIZEOF(struct sw_flow_key, recirc_id)) @@ -122,7 +52,7 @@ static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, struct sw_flow_key { u8 tun_opts[255]; u8 tun_opts_len; - struct ovs_key_ipv4_tunnel tun_key; /* Encapsulating tunnel key. */ + struct ip_tunnel_key tun_key; /* Encapsulating tunnel key. */ struct { u32 priority; /* Packet QoS priority. */ u32 skb_mark; /* SKB mark. */ @@ -273,7 +203,7 @@ void ovs_flow_stats_clear(struct sw_flow *); u64 ovs_flow_used_time(unsigned long flow_jiffies); int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key); -int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info, +int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key); /* Extract key from packet coming from userspace. 
*/ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 624e41c4267f..ecfa530d3461 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -641,7 +641,7 @@ static int vxlan_opt_to_nlattr(struct sk_buff *skb, } static int __ipv4_tun_to_nlattr(struct sk_buff *skb, - const struct ovs_key_ipv4_tunnel *output, + const struct ip_tunnel_key *output, const void *tun_opts, int swkey_tun_opts_len) { if (output->tun_flags & TUNNEL_KEY && @@ -689,7 +689,7 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb, } static int ipv4_tun_to_nlattr(struct sk_buff *skb, - const struct ovs_key_ipv4_tunnel *output, + const struct ip_tunnel_key *output, const void *tun_opts, int swkey_tun_opts_len) { struct nlattr *nla; @@ -708,9 +708,9 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb, } int ovs_nla_put_egress_tunnel_key(struct sk_buff *skb, - const struct ovs_tunnel_info *egress_tun_info) + const struct ip_tunnel_info *egress_tun_info) { - return __ipv4_tun_to_nlattr(skb, &egress_tun_info->tunnel, + return __ipv4_tun_to_nlattr(skb, &egress_tun_info->key, egress_tun_info->options, egress_tun_info->options_len); } @@ -1746,7 +1746,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, { struct sw_flow_match match; struct sw_flow_key key; - struct ovs_tunnel_info *tun_info; + struct ip_tunnel_info *tun_info; struct nlattr *a; int err = 0, start, opts_type; @@ -1777,7 +1777,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, return PTR_ERR(a); tun_info = nla_data(a); - tun_info->tunnel = key.tun_key; + tun_info->key = key.tun_key; tun_info->options_len = key.tun_opts_len; if (tun_info->options_len) { @@ -2227,13 +2227,13 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) switch (key_type) { case OVS_KEY_ATTR_TUNNEL_INFO: { - struct ovs_tunnel_info *tun_info = nla_data(ovs_key); + struct ip_tunnel_info *tun_info = nla_data(ovs_key); start = nla_nest_start(skb, OVS_ACTION_ATTR_SET); if (!start) return -EMSGSIZE; - err = ipv4_tun_to_nlattr(skb, &tun_info->tunnel, + err = ipv4_tun_to_nlattr(skb, &tun_info->key, tun_info->options_len ? 
tun_info->options : NULL, tun_info->options_len); diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index 5c3d75bff310..ec53eb6e632b 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -55,7 +55,7 @@ int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb); int ovs_nla_get_match(struct sw_flow_match *, const struct nlattr *key, const struct nlattr *mask, bool log); int ovs_nla_put_egress_tunnel_key(struct sk_buff *, - const struct ovs_tunnel_info *); + const struct ip_tunnel_info *); bool ovs_nla_get_ufid(struct sw_flow_id *, const struct nlattr *, bool log); int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid, diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index 208c576bd1b6..1da3a14d1010 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -77,7 +77,7 @@ static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb) struct vport *vport = gs->rcv_data; struct genevehdr *geneveh = geneve_hdr(skb); int opts_len; - struct ovs_tunnel_info tun_info; + struct ip_tunnel_info tun_info; __be64 key; __be16 flags; @@ -90,10 +90,9 @@ static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb) key = vni_to_tunnel_id(geneveh->vni); - ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), - udp_hdr(skb)->source, udp_hdr(skb)->dest, - key, flags, - geneveh->options, opts_len); + ip_tunnel_info_init(&tun_info, ip_hdr(skb), + udp_hdr(skb)->source, udp_hdr(skb)->dest, + key, flags, geneveh->options, opts_len); ovs_vport_receive(vport, skb, &tun_info); } @@ -165,8 +164,8 @@ error: static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb) { - const struct ovs_key_ipv4_tunnel *tun_key; - struct ovs_tunnel_info *tun_info; + const struct ip_tunnel_key *tun_key; + struct ip_tunnel_info *tun_info; struct net *net = ovs_dp_get_net(vport->dp); struct geneve_port *geneve_port = geneve_vport(vport); __be16 dport = inet_sk(geneve_port->gs->sock->sk)->inet_sport; @@ -183,7 +182,7 @@ static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb) goto error; } - tun_key = &tun_info->tunnel; + tun_key = &tun_info->key; rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_UDP); if (IS_ERR(rt)) { err = PTR_ERR(rt); @@ -225,7 +224,7 @@ static const char *geneve_get_name(const struct vport *vport) } static int geneve_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *egress_tun_info) + struct ip_tunnel_info *egress_tun_info) { struct geneve_port *geneve_port = geneve_vport(vport); struct net *net = ovs_dp_get_net(vport->dp); diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index f17ac9642f4e..b87656c66aaf 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -67,9 +67,9 @@ static struct sk_buff *__build_header(struct sk_buff *skb, int tunnel_hlen) { struct tnl_ptk_info tpi; - const struct ovs_key_ipv4_tunnel *tun_key; + const struct ip_tunnel_key *tun_key; - tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; + tun_key = &OVS_CB(skb)->egress_tun_info->key; skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM)); if (IS_ERR(skb)) @@ -97,7 +97,7 @@ static __be64 key_to_tunnel_id(__be32 key, __be32 seq) static int gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) { - struct ovs_tunnel_info tun_info; + struct ip_tunnel_info tun_info; struct ovs_net *ovs_net; struct vport *vport; __be64 key; @@ -108,8 +108,8 @@ static int gre_rcv(struct 
sk_buff *skb, return PACKET_REJECT; key = key_to_tunnel_id(tpi->key, tpi->seq); - ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), 0, 0, key, - filter_tnl_flags(tpi->flags), NULL, 0); + ip_tunnel_info_init(&tun_info, ip_hdr(skb), 0, 0, key, + filter_tnl_flags(tpi->flags), NULL, 0); ovs_vport_receive(vport, skb, &tun_info); return PACKET_RCVD; @@ -134,7 +134,7 @@ static int gre_err(struct sk_buff *skb, u32 info, static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) { struct net *net = ovs_dp_get_net(vport->dp); - const struct ovs_key_ipv4_tunnel *tun_key; + const struct ip_tunnel_key *tun_key; struct flowi4 fl; struct rtable *rt; int min_headroom; @@ -147,7 +147,7 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) goto err_free_skb; } - tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; + tun_key = &OVS_CB(skb)->egress_tun_info->key; rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_GRE); if (IS_ERR(rt)) { err = PTR_ERR(rt); @@ -277,7 +277,7 @@ static void gre_tnl_destroy(struct vport *vport) } static int gre_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *egress_tun_info) + struct ip_tunnel_info *egress_tun_info) { return ovs_tunnel_get_egress_info(egress_tun_info, ovs_dp_get_net(vport->dp), diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 6d39766e7828..6f7986fabb70 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -64,7 +64,7 @@ static inline struct vxlan_port *vxlan_vport(const struct vport *vport) static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, struct vxlan_metadata *md) { - struct ovs_tunnel_info tun_info; + struct ip_tunnel_info tun_info; struct vxlan_port *vxlan_port; struct vport *vport = vs->data; struct iphdr *iph; @@ -82,9 +82,9 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, /* Save outer tunnel values */ iph = ip_hdr(skb); key = cpu_to_be64(ntohl(md->vni) >> 8); - ovs_flow_tun_info_init(&tun_info, iph, - udp_hdr(skb)->source, udp_hdr(skb)->dest, - key, flags, &opts, sizeof(opts)); + ip_tunnel_info_init(&tun_info, iph, + udp_hdr(skb)->source, udp_hdr(skb)->dest, + key, flags, &opts, sizeof(opts)); ovs_vport_receive(vport, skb, &tun_info); } @@ -205,13 +205,13 @@ error: static int vxlan_ext_gbp(struct sk_buff *skb) { - const struct ovs_tunnel_info *tun_info; + const struct ip_tunnel_info *tun_info; const struct ovs_vxlan_opts *opts; tun_info = OVS_CB(skb)->egress_tun_info; opts = tun_info->options; - if (tun_info->tunnel.tun_flags & TUNNEL_VXLAN_OPT && + if (tun_info->key.tun_flags & TUNNEL_VXLAN_OPT && tun_info->options_len >= sizeof(*opts)) return opts->gbp; else @@ -224,7 +224,7 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) struct vxlan_port *vxlan_port = vxlan_vport(vport); struct sock *sk = vxlan_port->vs->sock->sk; __be16 dst_port = inet_sk(sk)->inet_sport; - const struct ovs_key_ipv4_tunnel *tun_key; + const struct ip_tunnel_key *tun_key; struct vxlan_metadata md = {0}; struct rtable *rt; struct flowi4 fl; @@ -238,7 +238,7 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) goto error; } - tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; + tun_key = &OVS_CB(skb)->egress_tun_info->key; rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_UDP); if (IS_ERR(rt)) { err = PTR_ERR(rt); @@ -269,7 +269,7 @@ error: } static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *egress_tun_info) + 
struct ip_tunnel_info *egress_tun_info) { struct net *net = ovs_dp_get_net(vport->dp); struct vxlan_port *vxlan_port = vxlan_vport(vport); diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 067a3fff1d2c..af23ba077836 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -469,7 +469,7 @@ u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb) * skb->data should point to the Ethernet header. */ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, - const struct ovs_tunnel_info *tun_info) + const struct ip_tunnel_info *tun_info) { struct pcpu_sw_netstats *stats; struct sw_flow_key key; @@ -572,22 +572,22 @@ void ovs_vport_deferred_free(struct vport *vport) } EXPORT_SYMBOL_GPL(ovs_vport_deferred_free); -int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info, +int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info, struct net *net, - const struct ovs_tunnel_info *tun_info, + const struct ip_tunnel_info *tun_info, u8 ipproto, u32 skb_mark, __be16 tp_src, __be16 tp_dst) { - const struct ovs_key_ipv4_tunnel *tun_key; + const struct ip_tunnel_key *tun_key; struct rtable *rt; struct flowi4 fl; if (unlikely(!tun_info)) return -EINVAL; - tun_key = &tun_info->tunnel; + tun_key = &tun_info->key; /* Route lookup to get srouce IP address. * The process may need to be changed if the corresponding process @@ -602,22 +602,22 @@ int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info, /* Generate egress_tun_info based on tun_info, * saddr, tp_src and tp_dst */ - __ovs_flow_tun_info_init(egress_tun_info, - fl.saddr, tun_key->ipv4_dst, - tun_key->ipv4_tos, - tun_key->ipv4_ttl, - tp_src, tp_dst, - tun_key->tun_id, - tun_key->tun_flags, - tun_info->options, - tun_info->options_len); + __ip_tunnel_info_init(egress_tun_info, + fl.saddr, tun_key->ipv4_dst, + tun_key->ipv4_tos, + tun_key->ipv4_ttl, + tp_src, tp_dst, + tun_key->tun_id, + tun_key->tun_flags, + tun_info->options, + tun_info->options_len); return 0; } EXPORT_SYMBOL_GPL(ovs_tunnel_get_egress_info); int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *info) + struct ip_tunnel_info *info) { /* get_egress_tun_info() is only implemented on tunnel ports. 
*/ if (unlikely(!vport->ops->get_egress_tun_info)) diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index bc85331a6c60..4750fb673a9f 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -58,15 +58,15 @@ u32 ovs_vport_find_upcall_portid(const struct vport *, struct sk_buff *); int ovs_vport_send(struct vport *, struct sk_buff *); -int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info, +int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info, struct net *net, - const struct ovs_tunnel_info *tun_info, + const struct ip_tunnel_info *tun_info, u8 ipproto, u32 skb_mark, __be16 tp_src, __be16 tp_dst); int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *info); + struct ip_tunnel_info *info); /* The following definitions are for implementers of vport devices: */ @@ -176,7 +176,7 @@ struct vport_ops { int (*send)(struct vport *, struct sk_buff *); int (*get_egress_tun_info)(struct vport *, struct sk_buff *, - struct ovs_tunnel_info *); + struct ip_tunnel_info *); struct module *owner; struct list_head list; @@ -226,7 +226,7 @@ static inline struct vport *vport_from_priv(void *priv) } void ovs_vport_receive(struct vport *, struct sk_buff *, - const struct ovs_tunnel_info *); + const struct ip_tunnel_info *); static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len) @@ -239,7 +239,7 @@ int ovs_vport_ops_register(struct vport_ops *ops); void ovs_vport_ops_unregister(struct vport_ops *ops); static inline struct rtable *ovs_tunnel_route_lookup(struct net *net, - const struct ovs_key_ipv4_tunnel *key, + const struct ip_tunnel_key *key, u32 mark, struct flowi4 *fl, u8 protocol) -- cgit v1.2.3 From ee122c79d4227f6ec642157834b6a90fcffa4382 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 21 Jul 2015 10:43:58 +0200 Subject: vxlan: Flow based tunneling Allows putting a VXLAN device into a new flow-based mode in which skbs with an ip_tunnel_info dst metadata attached will be encapsulated according to the instructions stored there, with the VXLAN device defaults taken into consideration. Similarly, on the receive side, if the VXLAN_F_COLLECT_METADATA flag is set, the packet processing will populate an ip_tunnel_info struct for each packet received and attach it to the skb using the new metadata dst. The metadata structure will contain the outer header and tunnel header fields which have been stripped off. Layers further up in the stack such as routing, tc or netfilter can later match on these fields and perform forwarding. It is the responsibility of upper layers to ensure that the flag is set if the metadata is needed. The flag limits the additional cost of metadata collection to the cases that demand it. This prepares the VXLAN device to be steered by the routing and other subsystems, which makes it possible to support encapsulation for a large number of tunnel endpoints and tunnel ids through a single net_device and improves scalability. It also allows OVS to leverage this mode, which in turn allows for the removal of the OVS-specific VXLAN code. Because the skb is currently scrubbed in vxlan_rcv(), the attachment of the new dst metadata is postponed until after scrubbing, which requires the temporary addition of a new member to vxlan_metadata. This member is removed again in a later commit after the indirect VXLAN receive API has been removed. Signed-off-by: Thomas Graf Signed-off-by: Pravin B Shelar Signed-off-by: David S.
Miller --- drivers/net/vxlan.c | 149 ++++++++++++++++++++++++++++++++++++------- include/linux/skbuff.h | 1 + include/net/dst_metadata.h | 13 ++++ include/net/ip_tunnels.h | 14 ++++ include/net/vxlan.h | 10 ++- include/uapi/linux/if_link.h | 1 + 6 files changed, 165 insertions(+), 23 deletions(-) (limited to 'include/uapi') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index ec86a11743fd..06c092b05a51 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -49,6 +49,7 @@ #include #include #endif +#include #define VXLAN_VERSION "0.1" @@ -140,6 +141,11 @@ struct vxlan_dev { static u32 vxlan_salt __read_mostly; static struct workqueue_struct *vxlan_wq; +static inline bool vxlan_collect_metadata(struct vxlan_sock *vs) +{ + return vs->flags & VXLAN_F_COLLECT_METADATA; +} + #if IS_ENABLED(CONFIG_IPV6) static inline bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b) @@ -1164,10 +1170,13 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh, /* Callback from net/ipv4/udp.c to receive packets */ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { + struct metadata_dst *tun_dst = NULL; + struct ip_tunnel_info *info; struct vxlan_sock *vs; struct vxlanhdr *vxh; u32 flags, vni; - struct vxlan_metadata md = {0}; + struct vxlan_metadata _md; + struct vxlan_metadata *md = &_md; /* Need Vxlan and inner Ethernet header to be present */ if (!pskb_may_pull(skb, VXLAN_HLEN)) @@ -1202,6 +1211,33 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) vni &= VXLAN_VNI_MASK; } + if (vxlan_collect_metadata(vs)) { + const struct iphdr *iph = ip_hdr(skb); + + tun_dst = metadata_dst_alloc(sizeof(*md), GFP_ATOMIC); + if (!tun_dst) + goto drop; + + info = &tun_dst->u.tun_info; + info->key.ipv4_src = iph->saddr; + info->key.ipv4_dst = iph->daddr; + info->key.ipv4_tos = iph->tos; + info->key.ipv4_ttl = iph->ttl; + info->key.tp_src = udp_hdr(skb)->source; + info->key.tp_dst = udp_hdr(skb)->dest; + + info->mode = IP_TUNNEL_INFO_RX; + info->key.tun_flags = TUNNEL_KEY; + info->key.tun_id = cpu_to_be64(vni >> 8); + if (udp_hdr(skb)->check != 0) + info->key.tun_flags |= TUNNEL_CSUM; + + md = ip_tunnel_info_opts(info, sizeof(*md)); + md->tun_dst = tun_dst; + } else { + memset(md, 0, sizeof(*md)); + } + /* For backwards compatibility, only allow reserved fields to be * used by VXLAN extensions if explicitly requested. 
*/ @@ -1209,13 +1245,16 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) struct vxlanhdr_gbp *gbp; gbp = (struct vxlanhdr_gbp *)vxh; - md.gbp = ntohs(gbp->policy_id); + md->gbp = ntohs(gbp->policy_id); + + if (tun_dst) + info->key.tun_flags |= TUNNEL_VXLAN_OPT; if (gbp->dont_learn) - md.gbp |= VXLAN_GBP_DONT_LEARN; + md->gbp |= VXLAN_GBP_DONT_LEARN; if (gbp->policy_applied) - md.gbp |= VXLAN_GBP_POLICY_APPLIED; + md->gbp |= VXLAN_GBP_POLICY_APPLIED; flags &= ~VXLAN_GBP_USED_BITS; } @@ -1233,8 +1272,8 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) goto bad_flags; } - md.vni = vxh->vx_vni; - vs->rcv(vs, skb, &md); + md->vni = vxh->vx_vni; + vs->rcv(vs, skb, md); return 0; drop: @@ -1247,6 +1286,9 @@ bad_flags: ntohl(vxh->vx_flags), ntohl(vxh->vx_vni)); error: + if (tun_dst) + dst_release((struct dst_entry *)tun_dst); + /* Return non vxlan pkt */ return 1; } @@ -1263,7 +1305,12 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, int err = 0; union vxlan_addr *remote_ip; - vni = ntohl(md->vni) >> 8; + /* For flow based devices, map all packets to VNI 0 */ + if (vs->flags & VXLAN_F_FLOW_BASED) + vni = 0; + else + vni = ntohl(md->vni) >> 8; + /* Is this VNI defined? */ vxlan = vxlan_vs_find_vni(vs, vni); if (!vxlan) @@ -1292,12 +1339,19 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, #endif } + if (md->tun_dst) { + skb_dst_set(skb, (struct dst_entry *)md->tun_dst); + md->tun_dst = NULL; + } + if ((vxlan->flags & VXLAN_F_LEARN) && vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source)) goto drop; skb_reset_network_header(skb); - skb->mark = md->gbp; + /* In flow-based mode, GBP is carried in dst_metadata */ + if (!(vs->flags & VXLAN_F_FLOW_BASED)) + skb->mark = md->gbp; if (oip6) err = IP6_ECN_decapsulate(oip6, skb); @@ -1330,6 +1384,9 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, return; drop: + if (md->tun_dst) + dst_release((struct dst_entry *)md->tun_dst); + /* Consume bad packet */ kfree_skb(skb); } @@ -1878,22 +1935,40 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, struct vxlan_rdst *rdst, bool did_rsc) { + struct ip_tunnel_info *info = skb_tunnel_info(skb); struct vxlan_dev *vxlan = netdev_priv(dev); struct sock *sk = vxlan->vn_sock->sock->sk; struct rtable *rt = NULL; const struct iphdr *old_iph; struct flowi4 fl4; union vxlan_addr *dst; - struct vxlan_metadata md; + union vxlan_addr remote_ip; + struct vxlan_metadata _md; + struct vxlan_metadata *md = &_md; __be16 src_port = 0, dst_port; u32 vni; __be16 df = 0; __u8 tos, ttl; int err; + u32 flags = vxlan->flags; - dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port; - vni = rdst->remote_vni; - dst = &rdst->remote_ip; + if (rdst) { + dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port; + vni = rdst->remote_vni; + dst = &rdst->remote_ip; + } else { + if (!info) { + WARN_ONCE(1, "%s: Missing encapsulation instructions\n", + dev->name); + goto drop; + } + + dst_port = info->key.tp_dst ? 
: vxlan->dst_port; + vni = be64_to_cpu(info->key.tun_id); + remote_ip.sin.sin_family = AF_INET; + remote_ip.sin.sin_addr.s_addr = info->key.ipv4_dst; + dst = &remote_ip; + } if (vxlan_addr_any(dst)) { if (did_rsc) { @@ -1918,8 +1993,25 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, vxlan->port_max, true); if (dst->sa.sa_family == AF_INET) { + if (info) { + if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) + df = htons(IP_DF); + if (info->key.tun_flags & TUNNEL_CSUM) + flags |= VXLAN_F_UDP_CSUM; + else + flags &= ~VXLAN_F_UDP_CSUM; + + ttl = info->key.ipv4_ttl; + tos = info->key.ipv4_tos; + + if (info->options_len) + md = ip_tunnel_info_opts(info, sizeof(*md)); + } else { + md->gbp = skb->mark; + } + memset(&fl4, 0, sizeof(fl4)); - fl4.flowi4_oif = rdst->remote_ifindex; + fl4.flowi4_oif = rdst ? rdst->remote_ifindex : 0; fl4.flowi4_tos = RT_TOS(tos); fl4.flowi4_mark = skb->mark; fl4.flowi4_proto = IPPROTO_UDP; @@ -1958,14 +2050,12 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, tos = ip_tunnel_ecn_encap(tos, old_iph, skb); ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); - md.vni = htonl(vni << 8); - md.gbp = skb->mark; - + md->vni = htonl(vni << 8); err = vxlan_xmit_skb(rt, sk, skb, fl4.saddr, dst->sin.sin_addr.s_addr, tos, ttl, df, - src_port, dst_port, &md, + src_port, dst_port, md, !net_eq(vxlan->net, dev_net(vxlan->dev)), - vxlan->flags); + flags); if (err < 0) { /* skb is already freed. */ skb = NULL; @@ -1980,7 +2070,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, u32 flags; memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_oif = rdst->remote_ifindex; + fl6.flowi6_oif = rdst ? rdst->remote_ifindex : 0; fl6.daddr = dst->sin6.sin6_addr; fl6.saddr = vxlan->saddr.sin6.sin6_addr; fl6.flowi6_mark = skb->mark; @@ -2018,11 +2108,11 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, } ttl = ttl ? 
: ip6_dst_hoplimit(ndst); - md.vni = htonl(vni << 8); - md.gbp = skb->mark; + md->vni = htonl(vni << 8); + md->gbp = skb->mark; err = vxlan6_xmit_skb(ndst, sk, skb, dev, &fl6.saddr, &fl6.daddr, - 0, ttl, src_port, dst_port, &md, + 0, ttl, src_port, dst_port, md, !net_eq(vxlan->net, dev_net(vxlan->dev)), vxlan->flags); #endif @@ -2051,6 +2141,7 @@ tx_free: static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); + const struct ip_tunnel_info *info = skb_tunnel_info(skb); struct ethhdr *eth; bool did_rsc = false; struct vxlan_rdst *rdst, *fdst = NULL; @@ -2078,6 +2169,12 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) #endif } + if (vxlan->flags & VXLAN_F_FLOW_BASED && + info && info->mode == IP_TUNNEL_INFO_TX) { + vxlan_xmit_one(skb, dev, NULL, false); + return NETDEV_TX_OK; + } + f = vxlan_find_mac(vxlan, eth->h_dest); did_rsc = false; @@ -2405,6 +2502,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_RSC] = { .type = NLA_U8 }, [IFLA_VXLAN_L2MISS] = { .type = NLA_U8 }, [IFLA_VXLAN_L3MISS] = { .type = NLA_U8 }, + [IFLA_VXLAN_FLOWBASED] = { .type = NLA_U8 }, [IFLA_VXLAN_PORT] = { .type = NLA_U16 }, [IFLA_VXLAN_UDP_CSUM] = { .type = NLA_U8 }, [IFLA_VXLAN_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 }, @@ -2681,6 +2779,10 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev, if (data[IFLA_VXLAN_LIMIT]) vxlan->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]); + if (data[IFLA_VXLAN_FLOWBASED] && + nla_get_u8(data[IFLA_VXLAN_FLOWBASED])) + vxlan->flags |= VXLAN_F_FLOW_BASED; + if (data[IFLA_VXLAN_PORT_RANGE]) { const struct ifla_vxlan_port_range *p = nla_data(data[IFLA_VXLAN_PORT_RANGE]); @@ -2777,6 +2879,7 @@ static size_t vxlan_get_size(const struct net_device *dev) nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_RSC */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L2MISS */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L3MISS */ + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_FLOWBASED */ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */ nla_total_size(sizeof(struct ifla_vxlan_port_range)) + @@ -2843,6 +2946,8 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) !!(vxlan->flags & VXLAN_F_L2MISS)) || nla_put_u8(skb, IFLA_VXLAN_L3MISS, !!(vxlan->flags & VXLAN_F_L3MISS)) || + nla_put_u8(skb, IFLA_VXLAN_FLOWBASED, + !!(vxlan->flags & VXLAN_F_FLOW_BASED)) || nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->age_interval) || nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->addrmax) || nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->dst_port) || diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6bd96fe9416a..648a2c241993 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3469,5 +3469,6 @@ static inline unsigned int skb_gso_network_seglen(const struct sk_buff *skb) skb_network_header(skb); return hdr_len + skb_gso_transport_seglen(skb); } + #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 4f7694f3c7d0..e843937fb30a 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -8,6 +8,9 @@ struct metadata_dst { struct dst_entry dst; size_t opts_len; + union { + struct ip_tunnel_info tun_info; + } u; }; static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb) @@ -20,6 +23,16 @@ static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb) return NULL; } 
+static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb) +{ + struct metadata_dst *md_dst = skb_metadata_dst(skb); + + if (md_dst) + return &md_dst->u.tun_info; + + return NULL; +} + static inline bool skb_valid_dst(const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 6b9d559ce5f5..d11530f1c1e2 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -38,10 +38,19 @@ struct ip_tunnel_key { __be16 tp_dst; } __packed __aligned(4); /* Minimize padding. */ +/* Indicates whether the tunnel info structure represents receive + * or transmit tunnel parameters. + */ +enum { + IP_TUNNEL_INFO_RX, + IP_TUNNEL_INFO_TX, +}; + struct ip_tunnel_info { struct ip_tunnel_key key; const void *options; u8 options_len; + u8 mode; }; /* 6rd prefix/relay information */ @@ -284,6 +293,11 @@ static inline void iptunnel_xmit_stats(int err, } } +static inline void *ip_tunnel_info_opts(struct ip_tunnel_info *info, size_t n) +{ + return info + 1; +} + #endif /* CONFIG_INET */ #endif /* __NET_IP_TUNNELS_H */ diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 0082b5d33d7d..80a2da29e088 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -7,6 +7,7 @@ #include #include #include +#include #define VNI_HASH_BITS 10 #define VNI_HASH_SIZE (1<<VNI_HASH_BITS) -- cgit v1.2.3 From Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 21 Jul 2015 10:44:00 +0200 Subject: route: Per route IP tunnel metadata via lightweight tunnel This introduces a new IP tunnel lightweight tunnel type which allows IP tunnel instructions to be specified per route. Only IPv4 is supported at this point. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 10 +++- include/net/dst_metadata.h | 12 ++++- include/net/ip_tunnels.h | 7 ++- include/uapi/linux/lwtunnel.h | 1 + include/uapi/linux/rtnetlink.h | 15 ++++++ net/ipv4/ip_tunnel_core.c | 114 +++++++++++++++++++++++++++++++++++++++++ net/ipv4/route.c | 2 +- net/openvswitch/vport.h | 1 + 8 files changed, 157 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 06c092b05a51..9486d7ec128c 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1935,7 +1935,7 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, struct vxlan_rdst *rdst, bool did_rsc) { - struct ip_tunnel_info *info = skb_tunnel_info(skb); + struct ip_tunnel_info *info; struct vxlan_dev *vxlan = netdev_priv(dev); struct sock *sk = vxlan->vn_sock->sock->sk; struct rtable *rt = NULL; @@ -1952,6 +1952,9 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, int err; u32 flags = vxlan->flags; + /* FIXME: Support IPv6 */ + info = skb_tunnel_info(skb, AF_INET); + if (rdst) { dst_port = rdst->remote_port ?
rdst->remote_port : vxlan->dst_port; vni = rdst->remote_vni; @@ -2141,12 +2144,15 @@ tx_free: static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); - const struct ip_tunnel_info *info = skb_tunnel_info(skb); + const struct ip_tunnel_info *info; struct ethhdr *eth; bool did_rsc = false; struct vxlan_rdst *rdst, *fdst = NULL; struct vxlan_fdb *f; + /* FIXME: Support IPv6 */ + info = skb_tunnel_info(skb, AF_INET); + skb_reset_mac_header(skb); eth = eth_hdr(skb); diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index e843937fb30a..7b0306894663 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -23,13 +23,23 @@ static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb) return NULL; } -static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb) +static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb, + int family) { struct metadata_dst *md_dst = skb_metadata_dst(skb); + struct rtable *rt; if (md_dst) return &md_dst->u.tun_info; + switch (family) { + case AF_INET: + rt = (struct rtable *)skb_dst(skb); + if (rt && rt->rt_lwtstate) + return lwt_tun_info(rt->rt_lwtstate); + break; + } + return NULL; } diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index d11530f1c1e2..0b7e18cfa0b4 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -9,9 +9,9 @@ #include #include #include -#include #include #include +#include #if IS_ENABLED(CONFIG_IPV6) #include @@ -298,6 +298,11 @@ static inline void *ip_tunnel_info_opts(struct ip_tunnel_info *info, size_t n) return info + 1; } +static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstate) +{ + return (struct ip_tunnel_info *)lwtstate->data; +} + #endif /* CONFIG_INET */ #endif /* __NET_IP_TUNNELS_H */ diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index aa611d931a31..31377bbea3f8 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -6,6 +6,7 @@ enum lwtunnel_encap_types { LWTUNNEL_ENCAP_NONE, LWTUNNEL_ENCAP_MPLS, + LWTUNNEL_ENCAP_IP, __LWTUNNEL_ENCAP_MAX, }; diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 0d3d3cc43356..47d24cb3fbc1 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -286,6 +286,21 @@ enum rt_class_t { /* Routing message attributes */ +enum ip_tunnel_t { + IP_TUN_UNSPEC, + IP_TUN_ID, + IP_TUN_DST, + IP_TUN_SRC, + IP_TUN_TTL, + IP_TUN_TOS, + IP_TUN_SPORT, + IP_TUN_DPORT, + IP_TUN_FLAGS, + __IP_TUN_MAX, +}; + +#define IP_TUN_MAX (__IP_TUN_MAX - 1) + enum rtattr_type_t { RTA_UNSPEC, RTA_DST, diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 6a51a71a6c67..025b76e803fd 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -190,3 +190,117 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, return tot; } EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64); + +static const struct nla_policy ip_tun_policy[IP_TUN_MAX + 1] = { + [IP_TUN_ID] = { .type = NLA_U64 }, + [IP_TUN_DST] = { .type = NLA_U32 }, + [IP_TUN_SRC] = { .type = NLA_U32 }, + [IP_TUN_TTL] = { .type = NLA_U8 }, + [IP_TUN_TOS] = { .type = NLA_U8 }, + [IP_TUN_SPORT] = { .type = NLA_U16 }, + [IP_TUN_DPORT] = { .type = NLA_U16 }, + [IP_TUN_FLAGS] = { .type = NLA_U16 }, +}; + +static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr, + struct lwtunnel_state **ts) +{ + struct ip_tunnel_info 
*tun_info; + struct lwtunnel_state *new_state; + struct nlattr *tb[IP_TUN_MAX + 1]; + int err; + + err = nla_parse_nested(tb, IP_TUN_MAX, attr, ip_tun_policy); + if (err < 0) + return err; + + new_state = lwtunnel_state_alloc(sizeof(*tun_info)); + if (!new_state) + return -ENOMEM; + + new_state->type = LWTUNNEL_ENCAP_IP; + + tun_info = lwt_tun_info(new_state); + + if (tb[IP_TUN_ID]) + tun_info->key.tun_id = nla_get_u64(tb[IP_TUN_ID]); + + if (tb[IP_TUN_DST]) + tun_info->key.ipv4_dst = nla_get_be32(tb[IP_TUN_DST]); + + if (tb[IP_TUN_SRC]) + tun_info->key.ipv4_src = nla_get_be32(tb[IP_TUN_SRC]); + + if (tb[IP_TUN_TTL]) + tun_info->key.ipv4_ttl = nla_get_u8(tb[IP_TUN_TTL]); + + if (tb[IP_TUN_TOS]) + tun_info->key.ipv4_tos = nla_get_u8(tb[IP_TUN_TOS]); + + if (tb[IP_TUN_SPORT]) + tun_info->key.tp_src = nla_get_be16(tb[IP_TUN_SPORT]); + + if (tb[IP_TUN_DPORT]) + tun_info->key.tp_dst = nla_get_be16(tb[IP_TUN_DPORT]); + + if (tb[IP_TUN_FLAGS]) + tun_info->key.tun_flags = nla_get_u16(tb[IP_TUN_FLAGS]); + + tun_info->mode = IP_TUNNEL_INFO_TX; + tun_info->options = NULL; + tun_info->options_len = 0; + + *ts = new_state; + + return 0; +} + +static int ip_tun_fill_encap_info(struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate); + + if (nla_put_u64(skb, IP_TUN_ID, tun_info->key.tun_id) || + nla_put_be32(skb, IP_TUN_DST, tun_info->key.ipv4_dst) || + nla_put_be32(skb, IP_TUN_SRC, tun_info->key.ipv4_src) || + nla_put_u8(skb, IP_TUN_TOS, tun_info->key.ipv4_tos) || + nla_put_u8(skb, IP_TUN_TTL, tun_info->key.ipv4_ttl) || + nla_put_u16(skb, IP_TUN_SPORT, tun_info->key.tp_src) || + nla_put_u16(skb, IP_TUN_DPORT, tun_info->key.tp_dst) || + nla_put_u16(skb, IP_TUN_FLAGS, tun_info->key.tun_flags)) + return -ENOMEM; + + return 0; +} + +static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + return nla_total_size(8) /* IP_TUN_ID */ + + nla_total_size(4) /* IP_TUN_DST */ + + nla_total_size(4) /* IP_TUN_SRC */ + + nla_total_size(1) /* IP_TUN_TOS */ + + nla_total_size(1) /* IP_TUN_TTL */ + + nla_total_size(2) /* IP_TUN_SPORT */ + + nla_total_size(2) /* IP_TUN_DPORT */ + + nla_total_size(2); /* IP_TUN_FLAGS */ +} + +static const struct lwtunnel_encap_ops ip_tun_lwt_ops = { + .build_state = ip_tun_build_state, + .fill_encap = ip_tun_fill_encap_info, + .get_encap_size = ip_tun_encap_nlsize, +}; + +static int __init ip_tunnel_core_init(void) +{ + lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP); + + return 0; +} +module_init(ip_tunnel_core_init); + +static void __exit ip_tunnel_core_exit(void) +{ + lwtunnel_encap_del_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP); +} +module_exit(ip_tunnel_core_exit); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 91da18be0a71..519ec232818d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1693,7 +1693,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, by fib_lookup. 
*/ - tun_info = skb_tunnel_info(skb); + tun_info = skb_tunnel_info(skb, AF_INET); if (tun_info && tun_info->mode == IP_TUNNEL_INFO_RX) fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id; else diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 4750fb673a9f..75d68248ba69 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -27,6 +27,7 @@ #include #include #include +#include #include "datapath.h" -- cgit v1.2.3 From e7030878fc8448492b6e5cecd574043f63271298 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 21 Jul 2015 10:44:01 +0200 Subject: fib: Add fib rule match on tunnel id This adds the ability to select a routing table based on the tunnel id, which makes it possible to maintain separate routing tables for each virtual tunnel network. ip rule add from all tunnel-id 100 lookup 100 ip rule add from all tunnel-id 200 lookup 200 A new static key controls the collection of metadata at tunnel level upon demand. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 3 ++- include/net/fib_rules.h | 1 + include/net/ip_tunnels.h | 11 +++++++++++ include/uapi/linux/fib_rules.h | 2 +- net/core/fib_rules.c | 24 ++++++++++++++++++++++-- net/ipv4/ip_tunnel_core.c | 16 ++++++++++++++++ 6 files changed, 53 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 9486d7ec128c..2587ac84f71a 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -143,7 +143,8 @@ static struct workqueue_struct *vxlan_wq; static inline bool vxlan_collect_metadata(struct vxlan_sock *vs) { - return vs->flags & VXLAN_F_COLLECT_METADATA; + return vs->flags & VXLAN_F_COLLECT_METADATA || + ip_tunnel_collect_metadata(); } #if IS_ENABLED(CONFIG_IPV6) diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 903a55efbffe..4e8f804f4589 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -19,6 +19,7 @@ struct fib_rule { u8 action; /* 3 bytes hole, try to use */ u32 target; + __be64 tun_id; struct fib_rule __rcu *ctarget; struct net *fr_net; diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 0b7e18cfa0b4..0a5a7763eec2 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -303,6 +303,17 @@ static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstat return (struct ip_tunnel_info *)lwtstate->data; } +extern struct static_key ip_tunnel_metadata_cnt; + +/* Returns > 0 if metadata should be collected */ +static inline int ip_tunnel_collect_metadata(void) +{ + return static_key_false(&ip_tunnel_metadata_cnt); +} + +void ip_tunnel_need_metadata(void); +void ip_tunnel_unneed_metadata(void); + #endif /* CONFIG_INET */ #endif /* __NET_IP_TUNNELS_H */ diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h index 2b82d7e30974..96161b8202b5 100644 --- a/include/uapi/linux/fib_rules.h +++ b/include/uapi/linux/fib_rules.h @@ -43,7 +43,7 @@ enum { FRA_UNUSED5, FRA_FWMARK, /* mark */ FRA_FLOW, /* flow/class id */ - FRA_UNUSED6, + FRA_TUN_ID, FRA_SUPPRESS_IFGROUP, FRA_SUPPRESS_PREFIXLEN, FRA_TABLE, /* Extended table id */ diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 9a12668f7d62..ae8306e7c56f 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -16,6 +16,7 @@ #include #include #include +#include int fib_default_rule_add(struct fib_rules_ops *ops, u32 pref, u32 table, u32 flags) @@ -186,6 +187,9 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, if ((rule->mark ^
fl->flowi_mark) & rule->mark_mask) goto out; + if (rule->tun_id && (rule->tun_id != fl->flowi_tun_key.tun_id)) + goto out; + ret = ops->match(rule, fl, flags); out: return (rule->flags & FIB_RULE_INVERT) ? !ret : ret; @@ -330,6 +334,9 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) if (tb[FRA_FWMASK]) rule->mark_mask = nla_get_u32(tb[FRA_FWMASK]); + if (tb[FRA_TUN_ID]) + rule->tun_id = nla_get_be64(tb[FRA_TUN_ID]); + rule->action = frh->action; rule->flags = frh->flags; rule->table = frh_get_table(frh, tb); @@ -407,6 +414,9 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) if (unresolved) ops->unresolved_rules++; + if (rule->tun_id) + ip_tunnel_need_metadata(); + notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid); flush_route_cache(ops); rules_ops_put(ops); @@ -473,6 +483,10 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh) (rule->mark_mask != nla_get_u32(tb[FRA_FWMASK]))) continue; + if (tb[FRA_TUN_ID] && + (rule->tun_id != nla_get_be64(tb[FRA_TUN_ID]))) + continue; + if (!ops->compare(rule, frh, tb)) continue; @@ -487,6 +501,9 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh) goto errout; } + if (rule->tun_id) + ip_tunnel_unneed_metadata(); + list_del_rcu(&rule->list); if (rule->action == FR_ACT_GOTO) { @@ -535,7 +552,8 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */ + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */ + nla_total_size(4) /* FRA_FWMARK */ - + nla_total_size(4); /* FRA_FWMASK */ + + nla_total_size(4) /* FRA_FWMASK */ + + nla_total_size(8); /* FRA_TUN_ID */ if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); @@ -591,7 +609,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, ((rule->mark_mask || rule->mark) && nla_put_u32(skb, FRA_FWMASK, rule->mark_mask)) || (rule->target && - nla_put_u32(skb, FRA_GOTO, rule->target))) + nla_put_u32(skb, FRA_GOTO, rule->target)) || + (rule->tun_id && + nla_put_be64(skb, FRA_TUN_ID, rule->tun_id))) goto nla_put_failure; if (rule->suppress_ifgroup != -1) { diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 025b76e803fd..630e6d5712e8 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -304,3 +305,18 @@ static void __exit ip_tunnel_core_exit(void) lwtunnel_encap_del_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP); } module_exit(ip_tunnel_core_exit); + +struct static_key ip_tunnel_metadata_cnt = STATIC_KEY_INIT_FALSE; +EXPORT_SYMBOL(ip_tunnel_metadata_cnt); + +void ip_tunnel_need_metadata(void) +{ + static_key_slow_inc(&ip_tunnel_metadata_cnt); +} +EXPORT_SYMBOL_GPL(ip_tunnel_need_metadata); + +void ip_tunnel_unneed_metadata(void) +{ + static_key_slow_dec(&ip_tunnel_metadata_cnt); +} +EXPORT_SYMBOL_GPL(ip_tunnel_unneed_metadata); -- cgit v1.2.3 From b56ea2985d389a3676638203323ebe22c261b7fe Mon Sep 17 00:00:00 2001 From: Rick Jones Date: Tue, 21 Jul 2015 16:14:13 -0700 Subject: net: track success and failure of TCP PMTU probing Track success and failure of TCP PMTU probing. Signed-off-by: Rick Jones Signed-off-by: David S. 
Miller --- include/uapi/linux/snmp.h | 2 ++ net/ipv4/proc.c | 2 ++ net/ipv4/tcp_input.c | 2 ++ 3 files changed, 6 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index eee8968407f0..25a9ad8bcef1 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -278,6 +278,8 @@ enum LINUX_MIB_TCPACKSKIPPEDCHALLENGE, /* TCPACKSkippedChallenge */ LINUX_MIB_TCPWINPROBE, /* TCPWinProbe */ LINUX_MIB_TCPKEEPALIVE, /* TCPKeepAlive */ + LINUX_MIB_TCPMTUPFAIL, /* TCPMTUPFail */ + LINUX_MIB_TCPMTUPSUCCESS, /* TCPMTUPSuccess */ __LINUX_MIB_MAX }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index da5d483e236a..3abd9d7a3adf 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -300,6 +300,8 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPACKSkippedChallenge", LINUX_MIB_TCPACKSKIPPEDCHALLENGE), SNMP_MIB_ITEM("TCPWinProbe", LINUX_MIB_TCPWINPROBE), SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE), + SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL), + SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 1578fc2a6f39..cda3ffedadb6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2593,6 +2593,7 @@ static void tcp_mtup_probe_failed(struct sock *sk) icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1; icsk->icsk_mtup.probe_size = 0; + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPFAIL); } static void tcp_mtup_probe_success(struct sock *sk) @@ -2612,6 +2613,7 @@ static void tcp_mtup_probe_success(struct sock *sk) icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size; icsk->icsk_mtup.probe_size = 0; tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS); } /* Do a simple retransmit without using the backoff mechanisms in -- cgit v1.2.3 From 3985e8a3611a93bb36789f65db862e5700aab65e Mon Sep 17 00:00:00 2001 From: Erik Kline Date: Wed, 22 Jul 2015 16:38:25 +0900 Subject: ipv6: sysctl to restrict candidate source addresses Per RFC 6724, section 4, "Candidate Source Addresses": It is RECOMMENDED that the candidate source addresses be the set of unicast addresses assigned to the interface that will be used to send to the destination (the "outgoing" interface). Add a sysctl to enable this behaviour. Signed-off-by: Erik Kline Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 7 +++++++ include/linux/ipv6.h | 1 + include/uapi/linux/ipv6.h | 1 + net/ipv6/addrconf.c | 22 +++++++++++++++++++--- 4 files changed, 28 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index f63aeefd2c24..1a5ab21bcca5 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1460,6 +1460,13 @@ router_solicitations - INTEGER routers are present. Default: 3 +use_oif_addrs_only - BOOLEAN + When enabled, the candidate source addresses for destinations + routed via this interface are restricted to the set of addresses + configured on this interface (vis. RFC 6724, section 4). + + Default: false + use_tempaddr - INTEGER Preference for Privacy Extensions (RFC3041). 
<= 0 : disable Privacy Extensions diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 1319a6bb6b82..06ed637225b8 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -57,6 +57,7 @@ struct ipv6_devconf { bool initialized; struct in6_addr secret; } stable_secret; + __s32 use_oif_addrs_only; void *sysctl; }; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 5efa54ae567c..641a146ead7d 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -171,6 +171,7 @@ enum { DEVCONF_USE_OPTIMISTIC, DEVCONF_ACCEPT_RA_MTU, DEVCONF_STABLE_SECRET, + DEVCONF_USE_OIF_ADDRS_ONLY, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 32153c248959..eb0c6a3a8a00 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -211,7 +211,8 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .accept_ra_mtu = 1, .stable_secret = { .initialized = false, - } + }, + .use_oif_addrs_only = 0, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -253,6 +254,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .stable_secret = { .initialized = false, }, + .use_oif_addrs_only = 0, }; /* Check if a valid qdisc is available */ @@ -1472,11 +1474,16 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, * include addresses assigned to interfaces * belonging to the same site as the outgoing * interface.) + * - "It is RECOMMENDED that the candidate source addresses + * be the set of unicast addresses assigned to the + * interface that will be used to send to the destination + * (the 'outgoing' interface)." (RFC 6724) */ if (dst_dev) { + idev = __in6_dev_get(dst_dev); if ((dst_type & IPV6_ADDR_MULTICAST) || - dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL) { - idev = __in6_dev_get(dst_dev); + dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL || + (idev && idev->cnf.use_oif_addrs_only)) { use_oif_addr = true; } } @@ -4607,6 +4614,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_ACCEPT_RA_FROM_LOCAL] = cnf->accept_ra_from_local; array[DEVCONF_ACCEPT_RA_MTU] = cnf->accept_ra_mtu; /* we omit DEVCONF_STABLE_SECRET for now */ + array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only; } static inline size_t inet6_ifla6_size(void) @@ -5605,6 +5613,14 @@ static struct addrconf_sysctl_table .mode = 0600, .proc_handler = addrconf_sysctl_stable_secret, }, + { + .procname = "use_oif_addrs_only", + .data = &ipv6_devconf.use_oif_addrs_only, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + + }, { /* sentinel */ } -- cgit v1.2.3 From 2ce7918990641b07e70e1b25752d666369e2016e Mon Sep 17 00:00:00 2001 From: Andrey Smetanin Date: Fri, 3 Jul 2015 15:01:41 +0300 Subject: kvm/x86: add sending hyper-v crash notification to user space The notification is sent by exiting the vcpu to user space if KVM_REQ_HV_CRASH is set for the vcpu. At exit to user space, the kvm_run structure contains a system_event with type KVM_SYSTEM_EVENT_CRASH to signal that a guest crash has occurred. Signed-off-by: Andrey Smetanin Signed-off-by: Denis V.
Lunev Reviewed-by: Peter Hornyack CC: Paolo Bonzini CC: Gleb Natapov Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 5 +++++ arch/x86/kvm/x86.c | 6 ++++++ include/linux/kvm_host.h | 1 + include/uapi/linux/kvm.h | 1 + 4 files changed, 13 insertions(+) (limited to 'include/uapi') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index a7926a90156f..a4ebcb712375 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3277,6 +3277,7 @@ should put the acknowledged interrupt vector into the 'epr' field. struct { #define KVM_SYSTEM_EVENT_SHUTDOWN 1 #define KVM_SYSTEM_EVENT_RESET 2 +#define KVM_SYSTEM_EVENT_CRASH 3 __u32 type; __u64 flags; } system_event; @@ -3296,6 +3297,10 @@ Valid values for 'type' are: KVM_SYSTEM_EVENT_RESET -- the guest has requested a reset of the VM. As with SHUTDOWN, userspace can choose to ignore the request, or to schedule the reset to occur in the future and may call KVM_RUN again. + KVM_SYSTEM_EVENT_CRASH -- the guest crash occurred and the guest + has requested a crash condition maintenance. Userspace can choose + to ignore the request, or to gather VM memory core dump and/or + reset/shutdown of the VM. /* Fix the size of the union. */ char padding[256]; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cfa3e5a7d6be..28076c266a9a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6263,6 +6263,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu_scan_ioapic(vcpu); if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu)) kvm_vcpu_reload_apic_access_page(vcpu); + if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) { + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH; + r = 0; + goto out; + } } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1d917d9b7f12..51103f0feb7e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -139,6 +139,7 @@ static inline bool is_error_page(struct page *page) #define KVM_REQ_DISABLE_IBS 24 #define KVM_REQ_APIC_PAGE_RELOAD 25 #define KVM_REQ_SMI 26 +#define KVM_REQ_HV_CRASH 27 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 716ad4ae4d4b..9ef19ebd9df4 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -317,6 +317,7 @@ struct kvm_run { struct { #define KVM_SYSTEM_EVENT_SHUTDOWN 1 #define KVM_SYSTEM_EVENT_RESET 2 +#define KVM_SYSTEM_EVENT_CRASH 3 __u32 type; __u64 flags; } system_event; -- cgit v1.2.3 From 45c6df4471edf5404463756df1014fe7e36db3d3 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Tue, 23 Jun 2015 19:09:59 +0200 Subject: tty: linux/gsmmux.h needs linux/types.h We use __u8 in linux/gsmmux.h, so include linux/types.h to have that defined. 
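For illustration only, a minimal userspace build test (hypothetical, not part of the patch) that should now compile with just this header included, since linux/types.h supplies the __u8 and friends the header relies on:

/* Hypothetical build-test sketch for the now self-contained header. */
#include <linux/gsmmux.h>

int main(void)
{
	struct gsm_config cfg = { 0 };	/* struct declared in linux/gsmmux.h */

	(void)cfg;	/* only checking that the header compiles standalone */
	return 0;
}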
Signed-off-by: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/gsmmux.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/gsmmux.h b/include/uapi/linux/gsmmux.h index c06742d52856..ab055d8cddef 100644 --- a/include/uapi/linux/gsmmux.h +++ b/include/uapi/linux/gsmmux.h @@ -3,6 +3,7 @@ #include #include +#include struct gsm_config { -- cgit v1.2.3 From 45ac1403f564f411c6a383a2448688ba8dd705a4 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 21 Jul 2015 12:44:02 +0300 Subject: perf: Add PERF_RECORD_SWITCH to indicate context switches There are already two events for context switches, namely the tracepoint sched:sched_switch and the software event context_switches. Unfortunately neither are suitable for use by non-privileged users for the purpose of synchronizing hardware trace data (e.g. Intel PT) to the context switch. Tracepoints are no good at all for non-privileged users because they need either CAP_SYS_ADMIN or /proc/sys/kernel/perf_event_paranoid <= -1. On the other hand, kernel software events need either CAP_SYS_ADMIN or /proc/sys/kernel/perf_event_paranoid <= 1. Now many distributions do default perf_event_paranoid to 1 making context_switches a contender, except it has another problem (which is also shared with sched:sched_switch) which is that it happens before perf schedules events out instead of after perf schedules events in. Whereas a privileged user can see all the events anyway, a non-privileged user only sees events for their own processes, in other words they see when their process was scheduled out not when it was scheduled in. That presents two problems to use the event: 1. the information comes too late, so tools have to look ahead in the event stream to find out what the current state is 2. if they are unlucky tracing might have stopped before the context-switches event is recorded. This new PERF_RECORD_SWITCH event does not have those problems and it also has a couple of other small advantages. It is easier to use because it is an auxiliary event (like mmap, comm and task events) which can be enabled by setting a single bit. It is smaller than sched:sched_switch and easier to parse. To make the event useful for privileged users also, if the context is cpu-wide then the event record will be PERF_RECORD_SWITCH_CPU_WIDE which is the same as PERF_RECORD_SWITCH except it also provides the next or previous pid/tid. 
Signed-off-by: Adrian Hunter Acked-by: Peter Zijlstra (Intel) Tested-by: Jiri Olsa Cc: Andi Kleen Cc: Mathieu Poirier Cc: Pawel Moll Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1437471846-26995-2-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- include/uapi/linux/perf_event.h | 31 +++++++++++- kernel/events/core.c | 103 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 133 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index d97f84c080da..022d0acf7df0 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -330,7 +330,8 @@ struct perf_event_attr { mmap2 : 1, /* include mmap with inode data */ comm_exec : 1, /* flag comm events that are due to an exec */ use_clockid : 1, /* use @clockid for time fields */ - __reserved_1 : 38; + context_switch : 1, /* context switch data */ + __reserved_1 : 37; union { __u32 wakeup_events; /* wakeup every n events */ @@ -572,9 +573,11 @@ struct perf_event_mmap_page { /* * PERF_RECORD_MISC_MMAP_DATA and PERF_RECORD_MISC_COMM_EXEC are used on * different events so can reuse the same bit position. + * Ditto PERF_RECORD_MISC_SWITCH_OUT. */ #define PERF_RECORD_MISC_MMAP_DATA (1 << 13) #define PERF_RECORD_MISC_COMM_EXEC (1 << 13) +#define PERF_RECORD_MISC_SWITCH_OUT (1 << 13) /* * Indicates that the content of PERF_SAMPLE_IP points to * the actual instruction that triggered the event. See also @@ -818,6 +821,32 @@ enum perf_event_type { */ PERF_RECORD_LOST_SAMPLES = 13, + /* + * Records a context switch in or out (flagged by + * PERF_RECORD_MISC_SWITCH_OUT). See also + * PERF_RECORD_SWITCH_CPU_WIDE. + * + * struct { + * struct perf_event_header header; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_SWITCH = 14, + + /* + * CPU-wide version of PERF_RECORD_SWITCH with next_prev_pid and + * next_prev_tid that are the next (switching out) or previous + * (switching in) pid/tid. 
+ * + * struct { + * struct perf_event_header header; + * u32 next_prev_pid; + * u32 next_prev_tid; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_SWITCH_CPU_WIDE = 15, + PERF_RECORD_MAX, /* non-ABI */ }; diff --git a/kernel/events/core.c b/kernel/events/core.c index d3dae3419b99..ce21143c0d9e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -163,6 +163,7 @@ static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; static atomic_t nr_task_events __read_mostly; static atomic_t nr_freq_events __read_mostly; +static atomic_t nr_switch_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -2619,6 +2620,9 @@ static void perf_pmu_sched_task(struct task_struct *prev, local_irq_restore(flags); } +static void perf_event_switch(struct task_struct *task, + struct task_struct *next_prev, bool sched_in); + #define for_each_task_context_nr(ctxn) \ for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) @@ -2641,6 +2645,9 @@ void __perf_event_task_sched_out(struct task_struct *task, if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(task, next, false); + if (atomic_read(&nr_switch_events)) + perf_event_switch(task, next, false); + for_each_task_context_nr(ctxn) perf_event_context_sched_out(task, ctxn, next); @@ -2831,6 +2838,9 @@ void __perf_event_task_sched_in(struct task_struct *prev, if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) perf_cgroup_sched_in(prev, task); + if (atomic_read(&nr_switch_events)) + perf_event_switch(task, prev, true); + if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(prev, task, true); } @@ -3454,6 +3464,10 @@ static void unaccount_event(struct perf_event *event) atomic_dec(&nr_task_events); if (event->attr.freq) atomic_dec(&nr_freq_events); + if (event->attr.context_switch) { + static_key_slow_dec_deferred(&perf_sched_events); + atomic_dec(&nr_switch_events); + } if (is_cgroup_event(event)) static_key_slow_dec_deferred(&perf_sched_events); if (has_branch_stack(event)) @@ -5981,6 +5995,91 @@ void perf_log_lost_samples(struct perf_event *event, u64 lost) perf_output_end(&handle); } +/* + * context_switch tracking + */ + +struct perf_switch_event { + struct task_struct *task; + struct task_struct *next_prev; + + struct { + struct perf_event_header header; + u32 next_prev_pid; + u32 next_prev_tid; + } event_id; +}; + +static int perf_event_switch_match(struct perf_event *event) +{ + return event->attr.context_switch; +} + +static void perf_event_switch_output(struct perf_event *event, void *data) +{ + struct perf_switch_event *se = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret; + + if (!perf_event_switch_match(event)) + return; + + /* Only CPU-wide events are allowed to see next/prev pid/tid */ + if (event->ctx->task) { + se->event_id.header.type = PERF_RECORD_SWITCH; + se->event_id.header.size = sizeof(se->event_id.header); + } else { + se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE; + se->event_id.header.size = sizeof(se->event_id); + se->event_id.next_prev_pid = + perf_event_pid(event, se->next_prev); + se->event_id.next_prev_tid = + perf_event_tid(event, se->next_prev); + } + + perf_event_header__init_id(&se->event_id.header, &sample, event); + + ret = perf_output_begin(&handle, event, se->event_id.header.size); + if (ret) + return; + + if (event->ctx->task) + perf_output_put(&handle, se->event_id.header); + else + perf_output_put(&handle, se->event_id); + + perf_event__output_id_sample(event, &handle, &sample); 
+ + perf_output_end(&handle); +} + +static void perf_event_switch(struct task_struct *task, + struct task_struct *next_prev, bool sched_in) +{ + struct perf_switch_event switch_event; + + /* N.B. caller checks nr_switch_events != 0 */ + + switch_event = (struct perf_switch_event){ + .task = task, + .next_prev = next_prev, + .event_id = { + .header = { + /* .type */ + .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT, + /* .size */ + }, + /* .next_prev_pid */ + /* .next_prev_tid */ + }, + }; + + perf_event_aux(perf_event_switch_output, + &switch_event, + NULL); +} + /* * IRQ throttle logging */ @@ -7479,6 +7578,10 @@ static void account_event(struct perf_event *event) if (atomic_inc_return(&nr_freq_events) == 1) tick_nohz_full_kick_all(); } + if (event->attr.context_switch) { + atomic_inc(&nr_switch_events); + static_key_slow_inc(&perf_sched_events.key); + } if (has_branch_stack(event)) static_key_slow_inc(&perf_sched_events.key); if (is_cgroup_event(event)) -- cgit v1.2.3 From fc5462f8525b47fa219452289ecb22c921c16823 Mon Sep 17 00:00:00 2001 From: Azael Avalos Date: Wed, 22 Jul 2015 18:09:11 -0600 Subject: toshiba_acpi: Add /dev/toshiba_acpi device There were previous attempts to "merge" the toshiba SMM module into the toshiba_acpi one; they tried to imitate what the old toshiba module does, but some models (TOS1900 devices) come with a "crippled" implementation and do not provide all the "features" a "genuine" Toshiba BIOS does. This patch adds a new device called toshiba_acpi, whose aim is to enable userspace to access the SMM on Toshiba laptops via ACPI calls. It creates a new convenience _IOWR command that accesses the SCI functions by opening/closing the SCI internally, working around buggy BIOSes while at the same time providing backwards compatibility. Older programs (and new ones) that wish to access the SMM on newer models can do so by pointing their path to /dev/toshiba_acpi (instead of /dev/toshiba), as the toshiba.h header was modified to reflect these changes and to add all the toshiba_acpi paths and commands; however, it is strongly recommended to use the new IOCTL for any SCI command to avoid any buggy BIOS. Signed-off-by: Azael Avalos Signed-off-by: Darren Hart --- Documentation/ioctl/ioctl-number.txt | 2 +- drivers/platform/x86/toshiba_acpi.c | 91 ++++++++++++++++++++++++++++++++++++ include/uapi/linux/toshiba.h | 32 +++++++++++-- 3 files changed, 121 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index 611c52267d24..21d2f27c886b 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -263,7 +263,7 @@ Code Seq#(hex) Include File Comments 's' all linux/cdk.h 't' 00-7F linux/ppp-ioctl.h 't' 80-8F linux/isdn_ppp.h -'t' 90 linux/toshiba.h +'t' 90-91 linux/toshiba.h toshiba and toshiba_acpi SMM 'u' 00-1F linux/smb_fs.h gone 'u' 20-3F linux/uvcvideo.h USB video class host driver 'v' 00-1F linux/ext2_fs.h conflict!
diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c index c3a0c4d0c1dc..802577f43a23 100644 --- a/drivers/platform/x86/toshiba_acpi.c +++ b/drivers/platform/x86/toshiba_acpi.c @@ -50,6 +50,8 @@ #include #include #include +#include +#include #include MODULE_AUTHOR("John Belmonte"); @@ -170,6 +172,7 @@ struct toshiba_acpi_dev { struct led_classdev led_dev; struct led_classdev kbd_led; struct led_classdev eco_led; + struct miscdevice miscdev; int force_fan; int last_key_event; @@ -2239,6 +2242,81 @@ static struct attribute_group toshiba_attr_group = { .attrs = toshiba_attributes, }; +/* + * Misc device + */ +static int toshiba_acpi_smm_bridge(SMMRegisters *regs) +{ + u32 in[TCI_WORDS] = { regs->eax, regs->ebx, regs->ecx, + regs->edx, regs->esi, regs->edi }; + u32 out[TCI_WORDS]; + acpi_status status; + + status = tci_raw(toshiba_acpi, in, out); + if (ACPI_FAILURE(status)) { + pr_err("ACPI call to query SMM registers failed\n"); + return -EIO; + } + + /* Fillout the SMM struct with the TCI call results */ + regs->eax = out[0]; + regs->ebx = out[1]; + regs->ecx = out[2]; + regs->edx = out[3]; + regs->esi = out[4]; + regs->edi = out[5]; + + return 0; +} + +static long toshiba_acpi_ioctl(struct file *fp, unsigned int cmd, + unsigned long arg) +{ + SMMRegisters __user *argp = (SMMRegisters __user *)arg; + SMMRegisters regs; + int ret; + + if (!argp) + return -EINVAL; + + switch (cmd) { + case TOSH_SMM: + if (copy_from_user(®s, argp, sizeof(SMMRegisters))) + return -EFAULT; + ret = toshiba_acpi_smm_bridge(®s); + if (ret) + return ret; + if (copy_to_user(argp, ®s, sizeof(SMMRegisters))) + return -EFAULT; + break; + case TOSHIBA_ACPI_SCI: + if (copy_from_user(®s, argp, sizeof(SMMRegisters))) + return -EFAULT; + /* Ensure we are being called with a SCI_{GET, SET} register */ + if (regs.eax != SCI_GET && regs.eax != SCI_SET) + return -EINVAL; + if (!sci_open(toshiba_acpi)) + return -EIO; + ret = toshiba_acpi_smm_bridge(®s); + sci_close(toshiba_acpi); + if (ret) + return ret; + if (copy_to_user(argp, ®s, sizeof(SMMRegisters))) + return -EFAULT; + break; + default: + return -EINVAL; + } + + return 0; +} + +static const struct file_operations toshiba_acpi_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = toshiba_acpi_ioctl, + .llseek = noop_llseek, +}; + /* * Hotkeys */ @@ -2540,6 +2618,8 @@ static int toshiba_acpi_remove(struct acpi_device *acpi_dev) { struct toshiba_acpi_dev *dev = acpi_driver_data(acpi_dev); + misc_deregister(&dev->miscdev); + remove_toshiba_proc_entries(dev); if (dev->sysfs_created) @@ -2611,6 +2691,17 @@ static int toshiba_acpi_add(struct acpi_device *acpi_dev) return -ENOMEM; dev->acpi_dev = acpi_dev; dev->method_hci = hci_method; + dev->miscdev.minor = MISC_DYNAMIC_MINOR; + dev->miscdev.name = "toshiba_acpi"; + dev->miscdev.fops = &toshiba_acpi_fops; + + ret = misc_register(&dev->miscdev); + if (ret) { + pr_err("Failed to register miscdevice\n"); + kfree(dev); + return ret; + } + acpi_dev->driver_data = dev; dev_set_drvdata(&acpi_dev->dev, dev); diff --git a/include/uapi/linux/toshiba.h b/include/uapi/linux/toshiba.h index e9bef5b2f91e..c58bf4b5bb26 100644 --- a/include/uapi/linux/toshiba.h +++ b/include/uapi/linux/toshiba.h @@ -1,6 +1,7 @@ /* toshiba.h -- Linux driver for accessing the SMM on Toshiba laptops * * Copyright (c) 1996-2000 Jonathan A. Buzzard (jonathan@buzzard.org.uk) + * Copyright (c) 2015 Azael Avalos * * Thanks to Juergen Heinzl for the pointers * on making sure the structure is aligned and packed. 
@@ -20,9 +21,18 @@ #ifndef _UAPI_LINUX_TOSHIBA_H #define _UAPI_LINUX_TOSHIBA_H -#define TOSH_PROC "/proc/toshiba" -#define TOSH_DEVICE "/dev/toshiba" -#define TOSH_SMM _IOWR('t', 0x90, int) /* broken: meant 24 bytes */ +/* + * Toshiba modules paths + */ + +#define TOSH_PROC "/proc/toshiba" +#define TOSH_DEVICE "/dev/toshiba" +#define TOSHIBA_ACPI_PROC "/proc/acpi/toshiba" +#define TOSHIBA_ACPI_DEVICE "/dev/toshiba_acpi" + +/* + * Toshiba SMM structure + */ typedef struct { unsigned int eax; @@ -33,5 +43,21 @@ typedef struct { unsigned int edi __attribute__ ((packed)); } SMMRegisters; +/* + * IOCTLs (0x90 - 0x91) + */ + +#define TOSH_SMM _IOWR('t', 0x90, SMMRegisters) +/* + * Convenience toshiba_acpi command. + * + * The System Configuration Interface (SCI) is opened/closed internally + * to avoid userspace of buggy BIOSes. + * + * The toshiba_acpi module checks whether the eax register is set with + * SCI_GET (0xf300) or SCI_SET (0xf400), returning -EINVAL if not. + */ +#define TOSHIBA_ACPI_SCI _IOWR('t', 0x91, SMMRegisters) + #endif /* _UAPI_LINUX_TOSHIBA_H */ -- cgit v1.2.3 From e0910bace663b78c026b73bbd711a24ccf410531 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 23 Jul 2015 15:43:56 +0200 Subject: lwtunnel: export linux/lwtunnel.h to userspace Note also that include/linux/lwtunnel.h is not needed. CC: Thomas Graf CC: Roopa Prabhu Fixes: 499a24256862 ("lwtunnel: infrastructure for handling light weight tunnels like mpls") Signed-off-by: Nicolas Dichtel Acked-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/linux/lwtunnel.h | 6 ------ include/uapi/linux/Kbuild | 1 + 2 files changed, 1 insertion(+), 6 deletions(-) delete mode 100644 include/linux/lwtunnel.h (limited to 'include/uapi') diff --git a/include/linux/lwtunnel.h b/include/linux/lwtunnel.h deleted file mode 100644 index 97f32f8b4ae1..000000000000 --- a/include/linux/lwtunnel.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _LINUX_LWTUNNEL_H_ -#define _LINUX_LWTUNNEL_H_ - -#include - -#endif /* _LINUX_LWTUNNEL_H_ */ diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 1ff9942718fe..aafb9937b162 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -243,6 +243,7 @@ header-y += limits.h header-y += llc.h header-y += loop.h header-y += lp.h +header-y += lwtunnel.h header-y += magic.h header-y += major.h header-y += map_to_7segment.h -- cgit v1.2.3 From ec92777f2ba93c00387b8fe53780c25adc57c744 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Thu, 9 Jul 2015 13:25:35 -0600 Subject: libnvdimm: Update name of the ars_status_record mask field The spec suggests that this is a simple 'length' field, not a mask. Update the name accordingly. 
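To make the new meaning concrete, a hypothetical userspace sketch that walks the status records; the struct here is a standalone mirror of the uapi record layout, for illustration only:

/* Sketch: each record now reports the length of the bad range starting
 * at err_address, rather than a bitmask. */
#include <stdint.h>
#include <stdio.h>

struct ars_status_record {		/* mirrors the __packed records[] */
	uint32_t handle;
	uint32_t flags;
	uint64_t err_address;
	uint64_t length;		/* was 'mask' */
};

static void dump_ars_records(const struct ars_status_record *rec,
			     uint32_t num_records)
{
	uint32_t i;

	for (i = 0; i < num_records; i++)
		printf("dimm handle %u: error at %#llx, %llu bytes\n",
		       rec[i].handle,
		       (unsigned long long)rec[i].err_address,
		       (unsigned long long)rec[i].length);
}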
Signed-off-by: Vishal Verma Signed-off-by: Dan Williams --- include/uapi/linux/ndctl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h index 2b94ea2287bb..e94bc20016b2 100644 --- a/include/uapi/linux/ndctl.h +++ b/include/uapi/linux/ndctl.h @@ -87,7 +87,7 @@ struct nd_cmd_ars_status { __u32 handle; __u32 flags; __u64 err_address; - __u64 mask; + __u64 length; } __packed records[0]; } __packed; -- cgit v1.2.3 From 39c686b862cdb2049b90e095b6c6c727b2a7ab60 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Thu, 9 Jul 2015 13:25:36 -0600 Subject: libnvdimm: Add DSM support for Address Range Scrub commands Add support for the three ARS DSM commands: - Query ARS Capabilities - Queries the firmware to check if a given range supports scrub, and if so, which type (persistent vs. volatile) - Start ARS - Starts a scrub for a given range/type - Query ARS Status - Checks status of a previously started scrub, and provides the error logs if any. The commands are described by the example DSM spec at: http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf Also add these commands to the nfit_test test framework, and return canned data. Signed-off-by: Vishal Verma Signed-off-by: Dan Williams --- drivers/acpi/nfit.c | 1 + drivers/acpi/nfit.h | 1 + include/uapi/linux/ndctl.h | 10 ++ tools/testing/nvdimm/test/nfit.c | 199 +++++++++++++++++++++++++++------------ 4 files changed, 152 insertions(+), 59 deletions(-) (limited to 'include/uapi') diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c index 628a42c41ab1..ef8a664db254 100644 --- a/drivers/acpi/nfit.c +++ b/drivers/acpi/nfit.c @@ -868,6 +868,7 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc) struct acpi_device *adev; int i; + nd_desc->dsm_mask = acpi_desc->bus_dsm_force_en; adev = to_acpi_dev(acpi_desc); if (!adev) return; diff --git a/drivers/acpi/nfit.h b/drivers/acpi/nfit.h index 79b6d83875c1..f2c2bb751882 100644 --- a/drivers/acpi/nfit.h +++ b/drivers/acpi/nfit.h @@ -107,6 +107,7 @@ struct acpi_nfit_desc { struct nvdimm_bus *nvdimm_bus; struct device *dev; unsigned long dimm_dsm_force_en; + unsigned long bus_dsm_force_en; int (*blk_do_io)(struct nd_blk_region *ndbr, resource_size_t dpa, void *iobuf, u64 len, int rw); }; diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h index e94bc20016b2..5b4a4be06e2b 100644 --- a/include/uapi/linux/ndctl.h +++ b/include/uapi/linux/ndctl.h @@ -111,6 +111,11 @@ enum { ND_CMD_VENDOR = 9, }; +enum { + ND_ARS_VOLATILE = 1, + ND_ARS_PERSISTENT = 2, +}; + static inline const char *nvdimm_bus_cmd_name(unsigned cmd) { static const char * const names[] = { @@ -194,4 +199,9 @@ enum nd_driver_flags { enum { ND_MIN_NAMESPACE_SIZE = 0x00400000, }; + +enum ars_masks { + ARS_STATUS_MASK = 0x0000FFFF, + ARS_EXT_STATUS_SHIFT = 16, +}; #endif /* __NDCTL_H__ */ diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c index d0bdae40ccc9..28dba918524e 100644 --- a/tools/testing/nvdimm/test/nfit.c +++ b/tools/testing/nvdimm/test/nfit.c @@ -147,75 +147,153 @@ static struct nfit_test *to_nfit_test(struct device *dev) return container_of(pdev, struct nfit_test, pdev); } +static int nfit_test_cmd_get_config_size(struct nd_cmd_get_config_size *nd_cmd, + unsigned int buf_len) +{ + if (buf_len < sizeof(*nd_cmd)) + return -EINVAL; + + nd_cmd->status = 0; + nd_cmd->config_size = LABEL_SIZE; + nd_cmd->max_xfer = SZ_4K; + + return 0; +} + +static int 
nfit_test_cmd_get_config_data(struct nd_cmd_get_config_data_hdr + *nd_cmd, unsigned int buf_len, void *label) +{ + unsigned int len, offset = nd_cmd->in_offset; + int rc; + + if (buf_len < sizeof(*nd_cmd)) + return -EINVAL; + if (offset >= LABEL_SIZE) + return -EINVAL; + if (nd_cmd->in_length + sizeof(*nd_cmd) > buf_len) + return -EINVAL; + + nd_cmd->status = 0; + len = min(nd_cmd->in_length, LABEL_SIZE - offset); + memcpy(nd_cmd->out_buf, label + offset, len); + rc = buf_len - sizeof(*nd_cmd) - len; + + return rc; +} + +static int nfit_test_cmd_set_config_data(struct nd_cmd_set_config_hdr *nd_cmd, + unsigned int buf_len, void *label) +{ + unsigned int len, offset = nd_cmd->in_offset; + u32 *status; + int rc; + + if (buf_len < sizeof(*nd_cmd)) + return -EINVAL; + if (offset >= LABEL_SIZE) + return -EINVAL; + if (nd_cmd->in_length + sizeof(*nd_cmd) + 4 > buf_len) + return -EINVAL; + + status = (void *)nd_cmd + nd_cmd->in_length + sizeof(*nd_cmd); + *status = 0; + len = min(nd_cmd->in_length, LABEL_SIZE - offset); + memcpy(label + offset, nd_cmd->in_buf, len); + rc = buf_len - sizeof(*nd_cmd) - (len + 4); + + return rc; +} + +static int nfit_test_cmd_ars_cap(struct nd_cmd_ars_cap *nd_cmd, + unsigned int buf_len) +{ + if (buf_len < sizeof(*nd_cmd)) + return -EINVAL; + + nd_cmd->max_ars_out = 256; + nd_cmd->status = (ND_ARS_PERSISTENT | ND_ARS_VOLATILE) << 16; + + return 0; +} + +static int nfit_test_cmd_ars_start(struct nd_cmd_ars_start *nd_cmd, + unsigned int buf_len) +{ + if (buf_len < sizeof(*nd_cmd)) + return -EINVAL; + + nd_cmd->status = 0; + + return 0; +} + +static int nfit_test_cmd_ars_status(struct nd_cmd_ars_status *nd_cmd, + unsigned int buf_len) +{ + if (buf_len < sizeof(*nd_cmd)) + return -EINVAL; + + nd_cmd->out_length = 256; + nd_cmd->num_records = 0; + nd_cmd->status = 0; + + return 0; +} + static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, unsigned int cmd, void *buf, unsigned int buf_len) { struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); struct nfit_test *t = container_of(acpi_desc, typeof(*t), acpi_desc); - struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); - int i, rc; + int i, rc = 0; + + if (nvdimm) { + struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); - if (!nfit_mem || !test_bit(cmd, &nfit_mem->dsm_mask)) - return -ENOTTY; + if (!nfit_mem || !test_bit(cmd, &nfit_mem->dsm_mask)) + return -ENOTTY; - /* lookup label space for the given dimm */ - for (i = 0; i < ARRAY_SIZE(handle); i++) - if (__to_nfit_memdev(nfit_mem)->device_handle == handle[i]) + /* lookup label space for the given dimm */ + for (i = 0; i < ARRAY_SIZE(handle); i++) + if (__to_nfit_memdev(nfit_mem)->device_handle == + handle[i]) + break; + if (i >= ARRAY_SIZE(handle)) + return -ENXIO; + + switch (cmd) { + case ND_CMD_GET_CONFIG_SIZE: + rc = nfit_test_cmd_get_config_size(buf, buf_len); break; - if (i >= ARRAY_SIZE(handle)) - return -ENXIO; + case ND_CMD_GET_CONFIG_DATA: + rc = nfit_test_cmd_get_config_data(buf, buf_len, + t->label[i]); + break; + case ND_CMD_SET_CONFIG_DATA: + rc = nfit_test_cmd_set_config_data(buf, buf_len, + t->label[i]); + break; + default: + return -ENOTTY; + } + } else { + if (!nd_desc || !test_bit(cmd, &nd_desc->dsm_mask)) + return -ENOTTY; - switch (cmd) { - case ND_CMD_GET_CONFIG_SIZE: { - struct nd_cmd_get_config_size *nd_cmd = buf; - - if (buf_len < sizeof(*nd_cmd)) - return -EINVAL; - nd_cmd->status = 0; - nd_cmd->config_size = LABEL_SIZE; - nd_cmd->max_xfer = SZ_4K; - rc = 0; - break; - } - case 
ND_CMD_GET_CONFIG_DATA: { - struct nd_cmd_get_config_data_hdr *nd_cmd = buf; - unsigned int len, offset = nd_cmd->in_offset; - - if (buf_len < sizeof(*nd_cmd)) - return -EINVAL; - if (offset >= LABEL_SIZE) - return -EINVAL; - if (nd_cmd->in_length + sizeof(*nd_cmd) > buf_len) - return -EINVAL; - - nd_cmd->status = 0; - len = min(nd_cmd->in_length, LABEL_SIZE - offset); - memcpy(nd_cmd->out_buf, t->label[i] + offset, len); - rc = buf_len - sizeof(*nd_cmd) - len; - break; - } - case ND_CMD_SET_CONFIG_DATA: { - struct nd_cmd_set_config_hdr *nd_cmd = buf; - unsigned int len, offset = nd_cmd->in_offset; - u32 *status; - - if (buf_len < sizeof(*nd_cmd)) - return -EINVAL; - if (offset >= LABEL_SIZE) - return -EINVAL; - if (nd_cmd->in_length + sizeof(*nd_cmd) + 4 > buf_len) - return -EINVAL; - - status = buf + nd_cmd->in_length + sizeof(*nd_cmd); - *status = 0; - len = min(nd_cmd->in_length, LABEL_SIZE - offset); - memcpy(t->label[i] + offset, nd_cmd->in_buf, len); - rc = buf_len - sizeof(*nd_cmd) - (len + 4); - break; - } - default: - return -ENOTTY; + switch (cmd) { + case ND_CMD_ARS_CAP: + rc = nfit_test_cmd_ars_cap(buf, buf_len); + break; + case ND_CMD_ARS_START: + rc = nfit_test_cmd_ars_start(buf, buf_len); + break; + case ND_CMD_ARS_STATUS: + rc = nfit_test_cmd_ars_status(buf, buf_len); + break; + default: + return -ENOTTY; + } } return rc; @@ -876,6 +954,9 @@ static void nfit_test0_setup(struct nfit_test *t) set_bit(ND_CMD_GET_CONFIG_SIZE, &acpi_desc->dimm_dsm_force_en); set_bit(ND_CMD_GET_CONFIG_DATA, &acpi_desc->dimm_dsm_force_en); set_bit(ND_CMD_SET_CONFIG_DATA, &acpi_desc->dimm_dsm_force_en); + set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_dsm_force_en); + set_bit(ND_CMD_ARS_START, &acpi_desc->bus_dsm_force_en); + set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_dsm_force_en); nd_desc = &acpi_desc->nd_desc; nd_desc->ndctl = nfit_test_ctl; } -- cgit v1.2.3 From a37281b63681015b12c3b7322e6bd681c0ea1ef4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 21 Nov 2014 13:45:08 +0100 Subject: KVM: s390: more irq names for trace events This patch adds names for missing irq types to the trace events. In order to identify adapter irqs, the define is moved from interrupt.c to the other basic irq defines in uapi/linux/kvm.h. 
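The naming logic added to the trace events can be sketched standalone (constants copied from the uapi values in this patch; illustration only, not the kernel macro itself):

/* Sketch: types above KVM_S390_INT_IO_MAX fall back to the symbolic
 * table, and the AI bit distinguishes adapter from subchannel I/O
 * interrupts. */
#include <stdint.h>

#define KVM_S390_INT_IO_MAX	0xfffdffffu
#define KVM_S390_INT_IO_AI_MASK	0x04000000u

static const char *io_irq_name(uint32_t type)
{
	if (type > KVM_S390_INT_IO_MAX)
		return "not an I/O interrupt";	/* symbolic table handles it */

	return (type & KVM_S390_INT_IO_AI_MASK) ?
	       "adapter I/O interrupt" : "subchannel I/O interrupt";
}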
Acked-by: Cornelia Huck Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 3 +-- arch/s390/kvm/trace-s390.h | 22 ++++++++++++++++------ include/uapi/linux/kvm.h | 1 + 3 files changed, 18 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 6c98fb61d154..a5781404b83f 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -30,7 +30,6 @@ #define IOINT_SCHID_MASK 0x0000ffff #define IOINT_SSID_MASK 0x00030000 #define IOINT_CSSID_MASK 0x03fc0000 -#define IOINT_AI_MASK 0x04000000 #define PFAULT_INIT 0x0600 #define PFAULT_DONE 0x0680 #define VIRTIO_PARAM 0x0d00 @@ -1447,7 +1446,7 @@ int kvm_s390_inject_vm(struct kvm *kvm, inti->mchk.mcic = s390int->parm64; break; case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: - if (inti->type & IOINT_AI_MASK) + if (inti->type & KVM_S390_INT_IO_AI_MASK) VM_EVENT(kvm, 5, "%s", "inject: I/O (AI)"); else VM_EVENT(kvm, 5, "inject: I/O css %x ss %x schid %04x", diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h index 4f373c50c264..cc1d6c68356f 100644 --- a/arch/s390/kvm/trace-s390.h +++ b/arch/s390/kvm/trace-s390.h @@ -105,11 +105,22 @@ TRACE_EVENT(kvm_s390_vcpu_start_stop, {KVM_S390_PROGRAM_INT, "program interrupt"}, \ {KVM_S390_SIGP_SET_PREFIX, "sigp set prefix"}, \ {KVM_S390_RESTART, "sigp restart"}, \ + {KVM_S390_INT_PFAULT_INIT, "pfault init"}, \ + {KVM_S390_INT_PFAULT_DONE, "pfault done"}, \ + {KVM_S390_MCHK, "machine check"}, \ + {KVM_S390_INT_CLOCK_COMP, "clock comparator"}, \ + {KVM_S390_INT_CPU_TIMER, "cpu timer"}, \ {KVM_S390_INT_VIRTIO, "virtio interrupt"}, \ {KVM_S390_INT_SERVICE, "sclp interrupt"}, \ {KVM_S390_INT_EMERGENCY, "sigp emergency"}, \ {KVM_S390_INT_EXTERNAL_CALL, "sigp ext call"} +#define get_irq_name(__type) \ + (__type > KVM_S390_INT_IO_MAX ? \ + __print_symbolic(__type, kvm_s390_int_type) : \ + (__type & KVM_S390_INT_IO_AI_MASK ? \ + "adapter I/O interrupt" : "subchannel I/O interrupt")) + TRACE_EVENT(kvm_s390_inject_vm, TP_PROTO(__u64 type, __u32 parm, __u64 parm64, int who), TP_ARGS(type, parm, parm64, who), @@ -131,8 +142,7 @@ TRACE_EVENT(kvm_s390_inject_vm, TP_printk("inject%s: type:%x (%s) parm:%x parm64:%llx", (__entry->who == 1) ? " (from kernel)" : (__entry->who == 2) ? 
" (from user)" : "", - __entry->inttype, - __print_symbolic(__entry->inttype, kvm_s390_int_type), + __entry->inttype, get_irq_name(__entry->inttype), __entry->parm, __entry->parm64) ); @@ -156,8 +166,8 @@ TRACE_EVENT(kvm_s390_inject_vcpu, TP_printk("inject (vcpu %d): type:%x (%s) parm:%x parm64:%llx", __entry->id, __entry->inttype, - __print_symbolic(__entry->inttype, kvm_s390_int_type), - __entry->parm, __entry->parm64) + get_irq_name(__entry->inttype), __entry->parm, + __entry->parm64) ); /* @@ -184,8 +194,8 @@ TRACE_EVENT(kvm_s390_deliver_interrupt, TP_printk("deliver interrupt (vcpu %d): type:%x (%s) " \ "data:%08llx %016llx", __entry->id, __entry->inttype, - __print_symbolic(__entry->inttype, kvm_s390_int_type), - __entry->data0, __entry->data1) + get_irq_name(__entry->inttype), __entry->data0, + __entry->data1) ); /* diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 9ef19ebd9df4..0d831f94f8a8 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -482,6 +482,7 @@ struct kvm_s390_psw { ((ai) << 26)) #define KVM_S390_INT_IO_MIN 0x00000000u #define KVM_S390_INT_IO_MAX 0xfffdffffu +#define KVM_S390_INT_IO_AI_MASK 0x04000000u struct kvm_s390_interrupt { -- cgit v1.2.3 From 8486a0bba6fc13ae60a8e402cf77c60a7d4e7a04 Mon Sep 17 00:00:00 2001 From: Macpaul Lin Date: Thu, 9 Jul 2015 15:18:38 +0800 Subject: usb: add usb_otg20_descriptor for OTG 2.0 and above OTG 2.0 introduces bcdOTG in otg descriptor to identify the OTG and EH supplement release number with which the OTG device is compliant, this patch adds structure usb_otg20_descriptor for OTG 2.0 and above. Signed-off-by: Macpaul Lin Signed-off-by: Li Jun Reviewed-by: Roger Quadros Signed-off-by: Felipe Balbi --- include/uapi/linux/usb/ch9.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h index aa33fd1b2d4f..aec689941c18 100644 --- a/include/uapi/linux/usb/ch9.h +++ b/include/uapi/linux/usb/ch9.h @@ -674,6 +674,17 @@ struct usb_otg_descriptor { __u8 bmAttributes; /* support for HNP, SRP, etc */ } __attribute__ ((packed)); +/* USB_DT_OTG (from OTG 2.0 supplement) */ +struct usb_otg20_descriptor { + __u8 bLength; + __u8 bDescriptorType; + + __u8 bmAttributes; /* support for HNP, SRP and ADP, etc */ + __le16 bcdOTG; /* OTG and EH supplement release number + * in binary-coded decimal(i.e. 2.0 is 0200H) + */ +} __attribute__ ((packed)); + /* from usb_otg_descriptor.bmAttributes */ #define USB_OTG_SRP (1 << 0) #define USB_OTG_HNP (1 << 1) /* swap host/device roles */ -- cgit v1.2.3 From 5d701cef9b40188c11c39be3a401bc4feeb154b8 Mon Sep 17 00:00:00 2001 From: Macpaul Lin Date: Thu, 9 Jul 2015 15:18:39 +0800 Subject: usb: add USB_OTG_ADP definition Add USB_OTG_ADP definition for usb_otg_descriptor.bmAttributes. 
Signed-off-by: Macpaul Lin Signed-off-by: Li Jun Acked-by: Peter Chen Signed-off-by: Felipe Balbi --- include/uapi/linux/usb/ch9.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h index aec689941c18..f7adc6e01f9e 100644 --- a/include/uapi/linux/usb/ch9.h +++ b/include/uapi/linux/usb/ch9.h @@ -688,6 +688,7 @@ struct usb_otg20_descriptor { /* from usb_otg_descriptor.bmAttributes */ #define USB_OTG_SRP (1 << 0) #define USB_OTG_HNP (1 << 1) /* swap host/device roles */ +#define USB_OTG_ADP (1 << 2) /* support ADP */ /*-------------------------------------------------------------------------*/ -- cgit v1.2.3 From d7ee3519042798be6224e97f259ed47a63da4620 Mon Sep 17 00:00:00 2001 From: Michal Kubeček Date: Fri, 17 Jul 2015 16:17:56 +0200 Subject: netfilter: nf_ct_sctp: minimal multihoming support Currently nf_conntrack_proto_sctp module handles only packets between primary addresses used to establish the connection. Any packets between secondary addresses are classified as invalid so that usual firewall configurations drop them. Allowing HEARTBEAT and HEARTBEAT-ACK chunks to establish a new conntrack would allow traffic between secondary addresses to pass through. A more sophisticated solution based on the addresses advertised in the initial handshake (and possibly also later dynamic address addition and removal) would be much harder to implement. Moreover, in general we cannot assume to always see the initial handshake as it can be routed through a different path. The patch adds two new conntrack states: SCTP_CONNTRACK_HEARTBEAT_SENT - a HEARTBEAT chunk seen but not acked SCTP_CONNTRACK_HEARTBEAT_ACKED - a HEARTBEAT acked by HEARTBEAT-ACK State transition rules: - HEARTBEAT_SENT responds to usual chunks the same way as NONE (so that the behaviour changes as little as possible) - HEARTBEAT_ACKED responds to usual chunks the same way as ESTABLISHED does, except the resulting state is HEARTBEAT_ACKED rather than ESTABLISHED - previously existing states except NONE are preserved when HEARTBEAT or HEARTBEAT-ACK is seen - NONE (in the initial direction) changes to HEARTBEAT_SENT on HEARTBEAT and to CLOSED on HEARTBEAT-ACK - HEARTBEAT_SENT changes to HEARTBEAT_ACKED on HEARTBEAT-ACK in the reply direction - HEARTBEAT_SENT and HEARTBEAT_ACKED are preserved on HEARTBEAT and HEARTBEAT-ACK otherwise Normally, vtag is set from the INIT chunk for the reply direction and from the INIT-ACK chunk for the originating direction (i.e. each of these defines vtag value for the opposite direction). For secondary conntracks, we can't rely on seeing INIT/INIT-ACK and even if we have seen them, we would need to connect two different conntracks. Therefore simplified logic is applied: vtag of first packet in each direction (HEARTBEAT in the originating and HEARTBEAT-ACK in reply direction) is saved and all following packets in that direction are compared with this saved value. While INIT and INIT-ACK define vtag for the opposite direction, vtags extracted from HEARTBEAT and HEARTBEAT-ACK are always for their direction. Default timeout values for new states are HEARTBEAT_SENT: 30 seconds (default hb_interval) HEARTBEAT_ACKED: 210 seconds (hb_interval * path_max_retry + max_rto) (We cannot expect to see the shutdown sequence so that, unlike ESTABLISHED, the HEARTBEAT_ACKED timeout shouldn't be too long.) 
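With the SCTP defaults hb_interval = 30s, path_max_retrans = 5 and rto_max = 60s, the HEARTBEAT_ACKED value works out to 30 * 5 + 60 = 210 seconds. As a rough sketch of how the transition table below is consulted (conceptual only, simplified from sctp_new_state() and sctp_conntracks[]; rows 9 and 10 are the new heartbeat and heartbeat_ack rows):

	/* next state = table[direction][chunk row][current state] */
	new_state = sctp_conntracks[dir][chunk_row][old_state];

	/* e.g. a HEARTBEAT (row 9) in the original direction moves
	 * sNO -> sHS, and a HEARTBEAT-ACK (row 10) arriving in the
	 * reply direction then moves sHS -> sHA (HEARTBEAT_ACKED). */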
Signed-off-by: Michal Kubecek Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_conntrack_sctp.h | 2 + include/uapi/linux/netfilter/nfnetlink_cttimeout.h | 2 + net/netfilter/nf_conntrack_proto_sctp.c | 101 ++++++++++++++++----- 3 files changed, 81 insertions(+), 24 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_conntrack_sctp.h b/include/uapi/linux/netfilter/nf_conntrack_sctp.h index ceeefe6681b5..ed4e776e1242 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_sctp.h +++ b/include/uapi/linux/netfilter/nf_conntrack_sctp.h @@ -13,6 +13,8 @@ enum sctp_conntrack { SCTP_CONNTRACK_SHUTDOWN_SENT, SCTP_CONNTRACK_SHUTDOWN_RECD, SCTP_CONNTRACK_SHUTDOWN_ACK_SENT, + SCTP_CONNTRACK_HEARTBEAT_SENT, + SCTP_CONNTRACK_HEARTBEAT_ACKED, SCTP_CONNTRACK_MAX }; diff --git a/include/uapi/linux/netfilter/nfnetlink_cttimeout.h b/include/uapi/linux/netfilter/nfnetlink_cttimeout.h index 1ab0b97b3a1e..f2c10dc140d6 100644 --- a/include/uapi/linux/netfilter/nfnetlink_cttimeout.h +++ b/include/uapi/linux/netfilter/nfnetlink_cttimeout.h @@ -92,6 +92,8 @@ enum ctattr_timeout_sctp { CTA_TIMEOUT_SCTP_SHUTDOWN_SENT, CTA_TIMEOUT_SCTP_SHUTDOWN_RECD, CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT, + CTA_TIMEOUT_SCTP_HEARTBEAT_SENT, + CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED, __CTA_TIMEOUT_SCTP_MAX }; #define CTA_TIMEOUT_SCTP_MAX (__CTA_TIMEOUT_SCTP_MAX - 1) diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index b45da90fad32..67197731eb68 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -42,6 +42,8 @@ static const char *const sctp_conntrack_names[] = { "SHUTDOWN_SENT", "SHUTDOWN_RECD", "SHUTDOWN_ACK_SENT", + "HEARTBEAT_SENT", + "HEARTBEAT_ACKED", }; #define SECS * HZ @@ -57,6 +59,8 @@ static unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] __read_mostly = { [SCTP_CONNTRACK_SHUTDOWN_SENT] = 300 SECS / 1000, [SCTP_CONNTRACK_SHUTDOWN_RECD] = 300 SECS / 1000, [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS, + [SCTP_CONNTRACK_HEARTBEAT_SENT] = 30 SECS, + [SCTP_CONNTRACK_HEARTBEAT_ACKED] = 210 SECS, }; #define sNO SCTP_CONNTRACK_NONE @@ -67,6 +71,8 @@ static unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] __read_mostly = { #define sSS SCTP_CONNTRACK_SHUTDOWN_SENT #define sSR SCTP_CONNTRACK_SHUTDOWN_RECD #define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT +#define sHS SCTP_CONNTRACK_HEARTBEAT_SENT +#define sHA SCTP_CONNTRACK_HEARTBEAT_ACKED #define sIV SCTP_CONNTRACK_MAX /* @@ -88,6 +94,10 @@ SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite to that of the SHUTDOWN chunk. CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of the SHUTDOWN chunk. Connection is closed. +HEARTBEAT_SENT - We have seen a HEARTBEAT in a new flow. +HEARTBEAT_ACKED - We have seen a HEARTBEAT-ACK in the direction opposite to + that of the HEARTBEAT chunk. Secondary connection is + established. */ /* TODO @@ -97,36 +107,40 @@ CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of - Check the error type in the reply dir before transitioning from cookie echoed to closed. - Sec 5.2.4 of RFC 2960 - - Multi Homing support. + - Full Multi Homing support. 
*/ /* SCTP conntrack state transitions */ -static const u8 sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = { +static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { { /* ORIGINAL */ -/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ -/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA}, -/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA}, -/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, -/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA}, -/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA}, -/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't have Stale cookie*/ -/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA},/* 5.2.4 - Big TODO */ -/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in orig dir */ -/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL} +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */ +/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA}, +/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA}, +/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, +/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL, sSS}, +/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA, sHA}, +/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't have Stale cookie*/ +/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* 5.2.4 - Big TODO */ +/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't come in orig dir */ +/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL, sHA}, +/* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA}, +/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA} }, { /* REPLY */ -/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ -/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* INIT in sCL Big TODO */ -/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA}, -/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, -/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA}, -/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA}, -/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA}, -/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in reply dir */ -/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA}, -/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL} +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */ +/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* INIT in sCL Big TODO */ +/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA}, +/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV, sCL}, +/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV, sSR}, +/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV, sHA}, +/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV, sHA}, +/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* Can't come in reply dir */ +/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV, sHA}, +/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV, sHA}, +/* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA}, +/* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHA, sHA} } }; @@ -278,9 +292,16 @@ static int sctp_new_state(enum ip_conntrack_dir dir, pr_debug("SCTP_CID_SHUTDOWN_COMPLETE\n"); i = 8; break; + case SCTP_CID_HEARTBEAT: + pr_debug("SCTP_CID_HEARTBEAT"); + i = 9; + break; + case SCTP_CID_HEARTBEAT_ACK: + pr_debug("SCTP_CID_HEARTBEAT_ACK"); + i = 10; + break; default: - /* Other 
chunks like DATA, SACK, HEARTBEAT and - its ACK do not cause a change in state */ + /* Other chunks like DATA or SACK do not change the state */ pr_debug("Unknown chunk type, Will stay in %s\n", sctp_conntrack_names[cur_state]); return cur_state; @@ -329,6 +350,8 @@ static int sctp_packet(struct nf_conn *ct, !test_bit(SCTP_CID_COOKIE_ECHO, map) && !test_bit(SCTP_CID_ABORT, map) && !test_bit(SCTP_CID_SHUTDOWN_ACK, map) && + !test_bit(SCTP_CID_HEARTBEAT, map) && + !test_bit(SCTP_CID_HEARTBEAT_ACK, map) && sh->vtag != ct->proto.sctp.vtag[dir]) { pr_debug("Verification tag check failed\n"); goto out; @@ -357,6 +380,16 @@ static int sctp_packet(struct nf_conn *ct, /* Sec 8.5.1 (D) */ if (sh->vtag != ct->proto.sctp.vtag[dir]) goto out_unlock; + } else if (sch->type == SCTP_CID_HEARTBEAT || + sch->type == SCTP_CID_HEARTBEAT_ACK) { + if (ct->proto.sctp.vtag[dir] == 0) { + pr_debug("Setting vtag %x for dir %d\n", + sh->vtag, dir); + ct->proto.sctp.vtag[dir] = sh->vtag; + } else if (sh->vtag != ct->proto.sctp.vtag[dir]) { + pr_debug("Verification tag check failed\n"); + goto out_unlock; + } } old_state = ct->proto.sctp.state; @@ -466,6 +499,10 @@ static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb, /* Sec 8.5.1 (A) */ return false; } + } else if (sch->type == SCTP_CID_HEARTBEAT) { + pr_debug("Setting vtag %x for secondary conntrack\n", + sh->vtag); + ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag; } /* If it is a shutdown ack OOTB packet, we expect a return shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */ @@ -610,6 +647,8 @@ sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = { [CTA_TIMEOUT_SCTP_SHUTDOWN_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_SHUTDOWN_RECD] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = { .type = NLA_U32 }, + [CTA_TIMEOUT_SCTP_HEARTBEAT_SENT] = { .type = NLA_U32 }, + [CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ @@ -658,6 +697,18 @@ static struct ctl_table sctp_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "nf_conntrack_sctp_timeout_heartbeat_sent", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_sctp_timeout_heartbeat_acked", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, { } }; @@ -730,6 +781,8 @@ static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn, pn->ctl_table[4].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT]; pn->ctl_table[5].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD]; pn->ctl_table[6].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT]; + pn->ctl_table[7].data = &sn->timeouts[SCTP_CONNTRACK_HEARTBEAT_SENT]; + pn->ctl_table[8].data = &sn->timeouts[SCTP_CONNTRACK_HEARTBEAT_ACKED]; #endif return 0; } -- cgit v1.2.3 From a0ddef81f4aeeeec3326f6b6a255d8ea13b41908 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Wed, 22 Jul 2015 14:30:14 -0400 Subject: tile: enable full SECCOMP support Signed-off-by: Chris Metcalf --- .../seccomp/seccomp-filter/arch-support.txt | 2 +- arch/tile/Kconfig | 17 +++++++++++++ arch/tile/include/asm/Kbuild | 1 + arch/tile/include/asm/elf.h | 4 +--- arch/tile/include/asm/syscall.h | 28 +++++++++++++++++++++- arch/tile/kernel/intvec_32.S | 1 + arch/tile/kernel/intvec_64.S | 1 + arch/tile/kernel/ptrace.c | 3 +++ include/uapi/linux/audit.h | 3 +++ include/uapi/linux/elf-em.h | 2 ++ 10 files changed, 57 insertions(+), 5 deletions(-) (limited to 
'include/uapi') diff --git a/Documentation/features/seccomp/seccomp-filter/arch-support.txt b/Documentation/features/seccomp/seccomp-filter/arch-support.txt index bea800910342..76d39d66a5d7 100644 --- a/Documentation/features/seccomp/seccomp-filter/arch-support.txt +++ b/Documentation/features/seccomp/seccomp-filter/arch-support.txt @@ -32,7 +32,7 @@ | score: | TODO | | sh: | TODO | | sparc: | TODO | - | tile: | TODO | + | tile: | ok | | um: | TODO | | unicore32: | TODO | | x86: | ok | diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 9def1f52d03a..2ba12d761723 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -32,6 +32,7 @@ config TILE select EDAC_SUPPORT select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER + select HAVE_ARCH_SECCOMP_FILTER # FIXME: investigate whether we need/want these options. # select HAVE_IOREMAP_PROT @@ -221,6 +222,22 @@ config COMPAT If enabled, the kernel will support running TILE-Gx binaries that were built with the -m32 option. +config SECCOMP + bool "Enable seccomp to safely compute untrusted bytecode" + depends on PROC_FS + help + This kernel feature is useful for number crunching applications + that may need to compute untrusted bytecode during their + execution. By using pipes or other transports made available to + the process as file descriptors supporting the read/write + syscalls, it's possible to isolate those applications in + their own address space using seccomp. Once seccomp is + enabled via prctl, it cannot be disabled and the task is only + allowed to execute a few safe syscalls defined by each seccomp + mode. + + If unsure, say N. + config SYSVIPC_COMPAT def_bool y depends on COMPAT && SYSVIPC diff --git a/arch/tile/include/asm/Kbuild b/arch/tile/include/asm/Kbuild index d8a843163471..ba35c41c71ff 100644 --- a/arch/tile/include/asm/Kbuild +++ b/arch/tile/include/asm/Kbuild @@ -28,6 +28,7 @@ generic-y += poll.h generic-y += posix_types.h generic-y += preempt.h generic-y += resource.h +generic-y += seccomp.h generic-y += sembuf.h generic-y += serial.h generic-y += shmbuf.h diff --git a/arch/tile/include/asm/elf.h b/arch/tile/include/asm/elf.h index 41d9878a9686..c505d77e4d06 100644 --- a/arch/tile/include/asm/elf.h +++ b/arch/tile/include/asm/elf.h @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -30,9 +31,6 @@ typedef unsigned long elf_greg_t; #define ELF_NGREG (sizeof(struct pt_regs) / sizeof(elf_greg_t)) typedef elf_greg_t elf_gregset_t[ELF_NGREG]; -#define EM_TILEPRO 188 -#define EM_TILEGX 191 - /* Provide a nominal data structure. */ #define ELF_NFPREG 0 typedef double elf_fpreg_t; diff --git a/arch/tile/include/asm/syscall.h b/arch/tile/include/asm/syscall.h index 9644b88f133d..373d73064ea1 100644 --- a/arch/tile/include/asm/syscall.h +++ b/arch/tile/include/asm/syscall.h @@ -20,6 +20,8 @@ #include #include +#include +#include #include /* The array of function pointers for syscalls. */ @@ -61,7 +63,15 @@ static inline void syscall_set_return_value(struct task_struct *task, struct pt_regs *regs, int error, long val) { - regs->regs[0] = (long) error ?: val; + if (error) { + /* R0 is the passed-in negative error, R1 is positive. */ + regs->regs[0] = error; + regs->regs[1] = -error; + } else { + /* R1 set to zero to indicate no error. 
*/ + regs->regs[0] = val; + regs->regs[1] = 0; + } } static inline void syscall_get_arguments(struct task_struct *task, @@ -82,4 +92,20 @@ static inline void syscall_set_arguments(struct task_struct *task, memcpy(&regs[i], args, n * sizeof(args[0])); } +/* + * We don't care about endianness (__AUDIT_ARCH_LE bit) here because + * tile has the same system calls both on little- and big- endian. + */ +static inline int syscall_get_arch(void) +{ + if (is_compat_task()) + return AUDIT_ARCH_TILEGX32; + +#ifdef CONFIG_TILEGX + return AUDIT_ARCH_TILEGX; +#else + return AUDIT_ARCH_TILEPRO; +#endif +} + #endif /* _ASM_TILE_SYSCALL_H */ diff --git a/arch/tile/kernel/intvec_32.S b/arch/tile/kernel/intvec_32.S index cdbda45a4e4b..fbbe2ea882ea 100644 --- a/arch/tile/kernel/intvec_32.S +++ b/arch/tile/kernel/intvec_32.S @@ -1224,6 +1224,7 @@ handle_syscall: jal do_syscall_trace_enter } FEEDBACK_REENTER(handle_syscall) + blz r0, .Lsyscall_sigreturn_skip /* * We always reload our registers from the stack at this diff --git a/arch/tile/kernel/intvec_64.S b/arch/tile/kernel/intvec_64.S index 800b91d3f9dc..58964d209d4d 100644 --- a/arch/tile/kernel/intvec_64.S +++ b/arch/tile/kernel/intvec_64.S @@ -1247,6 +1247,7 @@ handle_syscall: jal do_syscall_trace_enter } FEEDBACK_REENTER(handle_syscall) + bltz r0, .Lsyscall_sigreturn_skip /* * We always reload our registers from the stack at this diff --git a/arch/tile/kernel/ptrace.c b/arch/tile/kernel/ptrace.c index f84eed8243da..bdc126faf741 100644 --- a/arch/tile/kernel/ptrace.c +++ b/arch/tile/kernel/ptrace.c @@ -262,6 +262,9 @@ int do_syscall_trace_enter(struct pt_regs *regs) if (work & _TIF_NOHZ) user_exit(); + if (secure_computing() == -1) + return -1; + if (work & _TIF_SYSCALL_TRACE) { if (tracehook_report_syscall_entry(regs)) regs->regs[TREG_SYSCALL_NR] = -1; diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index d3475e1f15ec..1f977dd4c370 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -382,6 +382,9 @@ enum { #define AUDIT_ARCH_SHEL64 (EM_SH|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) #define AUDIT_ARCH_SPARC (EM_SPARC) #define AUDIT_ARCH_SPARC64 (EM_SPARCV9|__AUDIT_ARCH_64BIT) +#define AUDIT_ARCH_TILEGX (EM_TILEGX|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) +#define AUDIT_ARCH_TILEGX32 (EM_TILEGX|__AUDIT_ARCH_LE) +#define AUDIT_ARCH_TILEPRO (EM_TILEPRO|__AUDIT_ARCH_LE) #define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) #define AUDIT_PERM_EXEC 1 diff --git a/include/uapi/linux/elf-em.h b/include/uapi/linux/elf-em.h index b08829667ed7..3429a3ba382b 100644 --- a/include/uapi/linux/elf-em.h +++ b/include/uapi/linux/elf-em.h @@ -38,6 +38,8 @@ #define EM_ALTERA_NIOS2 113 /* Altera Nios II soft-core processor */ #define EM_TI_C6000 140 /* TI C6X DSPs */ #define EM_AARCH64 183 /* ARM 64 bit */ +#define EM_TILEPRO 188 /* Tilera TILEPro */ +#define EM_TILEGX 191 /* Tilera TILE-Gx */ #define EM_FRV 0x5441 /* Fujitsu FR-V */ #define EM_AVR32 0x18ad /* Atmel AVR32 */ -- cgit v1.2.3 From 8013d1d7eafb0589ca766db6b74026f76b7f5cb4 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 30 Jul 2015 14:28:42 +0800 Subject: net/ipv6: add sysctl option accept_ra_min_hop_limit Commit 6fd99094de2b ("ipv6: Don't reduce hop limit for an interface") disabled accepting the hop limit from an RA when it is smaller than the current hop limit, for security reasons. But this behavior breaks the RFC definition. RFC 4861, 6.3.4.
Processing Received Router Advertisements A Router Advertisement field (e.g., Cur Hop Limit, Reachable Time, and Retrans Timer) may contain a value denoting that it is unspecified. In such cases, the parameter should be ignored and the host should continue using whatever value it is already using. If the received Cur Hop Limit value is non-zero, the host SHOULD set its CurHopLimit variable to the received value. So add sysctl option accept_ra_min_hop_limit to let user choose the minimum hop limit value they can accept from RA. And set default to 1 to meet RFC standards. Signed-off-by: Hangbin Liu Acked-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 8 ++++++++ include/linux/ipv6.h | 1 + include/uapi/linux/ipv6.h | 1 + net/ipv6/addrconf.c | 10 ++++++++++ net/ipv6/ndisc.c | 16 +++++++--------- 5 files changed, 27 insertions(+), 9 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 1a5ab21bcca5..00d26d919459 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1340,6 +1340,14 @@ accept_ra_from_local - BOOLEAN disabled if accept_ra_from_local is disabled on a specific interface. +accept_ra_min_hop_limit - INTEGER + Minimum hop limit Information in Router Advertisement. + + Hop limit Information in Router Advertisement less than this + variable shall be ignored. + + Default: 1 + accept_ra_pinfo - BOOLEAN Learn Prefix Information in Router Advertisement. diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 06ed637225b8..cb9dcad72372 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -29,6 +29,7 @@ struct ipv6_devconf { __s32 max_desync_factor; __s32 max_addresses; __s32 accept_ra_defrtr; + __s32 accept_ra_min_hop_limit; __s32 accept_ra_pinfo; #ifdef CONFIG_IPV6_ROUTER_PREF __s32 accept_ra_rtr_pref; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 641a146ead7d..80f3b74446a1 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -172,6 +172,7 @@ enum { DEVCONF_ACCEPT_RA_MTU, DEVCONF_STABLE_SECRET, DEVCONF_USE_OIF_ADDRS_ONLY, + DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index eb0c6a3a8a00..53e3a9d756b0 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -195,6 +195,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .max_addresses = IPV6_MAX_ADDRESSES, .accept_ra_defrtr = 1, .accept_ra_from_local = 0, + .accept_ra_min_hop_limit= 1, .accept_ra_pinfo = 1, #ifdef CONFIG_IPV6_ROUTER_PREF .accept_ra_rtr_pref = 1, @@ -237,6 +238,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .max_addresses = IPV6_MAX_ADDRESSES, .accept_ra_defrtr = 1, .accept_ra_from_local = 0, + .accept_ra_min_hop_limit= 1, .accept_ra_pinfo = 1, #ifdef CONFIG_IPV6_ROUTER_PREF .accept_ra_rtr_pref = 1, @@ -4588,6 +4590,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_MAX_DESYNC_FACTOR] = cnf->max_desync_factor; array[DEVCONF_MAX_ADDRESSES] = cnf->max_addresses; array[DEVCONF_ACCEPT_RA_DEFRTR] = cnf->accept_ra_defrtr; + array[DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT] = cnf->accept_ra_min_hop_limit; array[DEVCONF_ACCEPT_RA_PINFO] = cnf->accept_ra_pinfo; #ifdef CONFIG_IPV6_ROUTER_PREF array[DEVCONF_ACCEPT_RA_RTR_PREF] = cnf->accept_ra_rtr_pref; @@ -5484,6 +5487,13 @@ static struct addrconf_sysctl_table .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = 
"accept_ra_min_hop_limit", + .data = &ipv6_devconf.accept_ra_min_hop_limit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "accept_ra_pinfo", .data = &ipv6_devconf.accept_ra_pinfo, diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 0a05b35a90fc..6e184e02fd3c 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1225,18 +1225,16 @@ static void ndisc_router_discovery(struct sk_buff *skb) if (rt) rt6_set_expires(rt, jiffies + (HZ * lifetime)); - if (ra_msg->icmph.icmp6_hop_limit) { - /* Only set hop_limit on the interface if it is higher than - * the current hop_limit. - */ - if (in6_dev->cnf.hop_limit < ra_msg->icmph.icmp6_hop_limit) { + if (in6_dev->cnf.accept_ra_min_hop_limit < 256 && + ra_msg->icmph.icmp6_hop_limit) { + if (in6_dev->cnf.accept_ra_min_hop_limit <= ra_msg->icmph.icmp6_hop_limit) { in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit; + if (rt) + dst_metric_set(&rt->dst, RTAX_HOPLIMIT, + ra_msg->icmph.icmp6_hop_limit); } else { - ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than current\n"); + ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than minimum\n"); } - if (rt) - dst_metric_set(&rt->dst, RTAX_HOPLIMIT, - ra_msg->icmph.icmp6_hop_limit); } skip_defrtr: -- cgit v1.2.3 From d3aa45ce6b94c65b83971257317867db13e5f492 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 30 Jul 2015 15:36:57 -0700 Subject: bpf: add helpers to access tunnel metadata Introduce helpers to let eBPF programs attached to TC manipulate tunnel metadata: bpf_skb_[gs]et_tunnel_key(skb, key, size, flags) skb: pointer to skb key: pointer to 'struct bpf_tunnel_key' size: size of 'struct bpf_tunnel_key' flags: room for future extensions First eBPF program that uses these helpers will allocate per_cpu metadata_dst structures that will be used on TX. On RX metadata_dst is allocated by tunnel driver. Typical usage for TX: struct bpf_tunnel_key tkey; ... populate tkey ... bpf_skb_set_tunnel_key(skb, &tkey, sizeof(tkey), 0); bpf_clone_redirect(skb, vxlan_dev_ifindex, 0); RX: struct bpf_tunnel_key tkey = {}; bpf_skb_get_tunnel_key(skb, &tkey, sizeof(tkey), 0); ... lookup or redirect based on tkey ... 'struct bpf_tunnel_key' will be extended in the future by adding elements to the end and the 'size' argument will indicate which fields are populated, thereby keeping backwards compatibility. The 'flags' argument may be used as well when the 'size' is not enough or to indicate completely different layout of bpf_tunnel_key. Signed-off-by: Alexei Starovoitov Acked-by: Thomas Graf Signed-off-by: David S. 
Miller --- include/net/dst_metadata.h | 1 + include/uapi/linux/bpf.h | 17 ++++++++++ net/core/dst.c | 35 +++++++++++++++++---- net/core/filter.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 124 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 7b0306894663..075f523ff23f 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -51,5 +51,6 @@ static inline bool skb_valid_dst(const struct sk_buff *skb) } struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags); +struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags); #endif /* __NET_DST_METADATA_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2f6c83d714e9..bc0d27d3fbdd 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -258,6 +258,18 @@ enum bpf_func_id { BPF_FUNC_get_cgroup_classid, BPF_FUNC_skb_vlan_push, /* bpf_skb_vlan_push(skb, vlan_proto, vlan_tci) */ BPF_FUNC_skb_vlan_pop, /* bpf_skb_vlan_pop(skb) */ + + /** + * bpf_skb_[gs]et_tunnel_key(skb, key, size, flags) + * retrieve or populate tunnel metadata + * @skb: pointer to skb + * @key: pointer to 'struct bpf_tunnel_key' + * @size: size of 'struct bpf_tunnel_key' + * @flags: room for future extensions + * Return: 0 on success + */ + BPF_FUNC_skb_get_tunnel_key, + BPF_FUNC_skb_set_tunnel_key, __BPF_FUNC_MAX_ID, }; @@ -280,4 +292,9 @@ struct __sk_buff { __u32 cb[5]; }; +struct bpf_tunnel_key { + __u32 tunnel_id; + __u32 remote_ipv4; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/net/core/dst.c b/net/core/dst.c index 76a617f6d60a..f8694d1b8702 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -362,15 +362,10 @@ static int dst_md_discard(struct sk_buff *skb) return 0; } -struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags) +static void __metadata_dst_init(struct metadata_dst *md_dst, u8 optslen) { - struct metadata_dst *md_dst; struct dst_entry *dst; - md_dst = kmalloc(sizeof(*md_dst) + optslen, flags); - if (!md_dst) - return ERR_PTR(-ENOMEM); - dst = &md_dst->dst; dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE, DST_METADATA | DST_NOCACHE | DST_NOCOUNT); @@ -380,11 +375,39 @@ struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags) memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst)); md_dst->opts_len = optslen; +} + +struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags) +{ + struct metadata_dst *md_dst; + + md_dst = kmalloc(sizeof(*md_dst) + optslen, flags); + if (!md_dst) + return NULL; + + __metadata_dst_init(md_dst, optslen); return md_dst; } EXPORT_SYMBOL_GPL(metadata_dst_alloc); +struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags) +{ + int cpu; + struct metadata_dst __percpu *md_dst; + + md_dst = __alloc_percpu_gfp(sizeof(struct metadata_dst) + optslen, + __alignof__(struct metadata_dst), flags); + if (!md_dst) + return NULL; + + for_each_possible_cpu(cpu) + __metadata_dst_init(per_cpu_ptr(md_dst, cpu), optslen); + + return md_dst; +} +EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu); + /* Dirty hack.
We did it in 2.2 (in __dst_free), * we have _very_ good reasons not to repeat * this mistake in 2.3, but we have no choice diff --git a/net/core/filter.c b/net/core/filter.c index 786722a9c6f2..1b72264ff2ee 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -48,6 +48,7 @@ #include #include #include +#include /** * sk_filter - run a packet through a socket filter @@ -1483,6 +1484,78 @@ bool bpf_helper_changes_skb_data(void *func) return false; } +static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2; + struct ip_tunnel_info *info = skb_tunnel_info(skb, AF_INET); + + if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags || !info)) + return -EINVAL; + + to->tunnel_id = be64_to_cpu(info->key.tun_id); + to->remote_ipv4 = be32_to_cpu(info->key.ipv4_src); + + return 0; +} + +const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { + .func = bpf_skb_get_tunnel_key, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_STACK, + .arg3_type = ARG_CONST_STACK_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +static struct metadata_dst __percpu *md_dst; + +static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + struct bpf_tunnel_key *from = (struct bpf_tunnel_key *) (long) r2; + struct metadata_dst *md = this_cpu_ptr(md_dst); + struct ip_tunnel_info *info; + + if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags)) + return -EINVAL; + + skb_dst_drop(skb); + dst_hold((struct dst_entry *) md); + skb_dst_set(skb, (struct dst_entry *) md); + + info = &md->u.tun_info; + info->mode = IP_TUNNEL_INFO_TX; + info->key.tun_id = cpu_to_be64(from->tunnel_id); + info->key.ipv4_dst = cpu_to_be32(from->remote_ipv4); + + return 0; +} + +const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { + .func = bpf_skb_set_tunnel_key, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_STACK, + .arg3_type = ARG_CONST_STACK_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto *bpf_get_skb_set_tunnel_key_proto(void) +{ + if (!md_dst) { + /* race is not possible, since it's called from + * verifier that is holding verifier mutex + */ + md_dst = metadata_dst_alloc_percpu(0, GFP_KERNEL); + if (!md_dst) + return NULL; + } + return &bpf_skb_set_tunnel_key_proto; +} + static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id) { @@ -1526,6 +1599,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_skb_vlan_push_proto; case BPF_FUNC_skb_vlan_pop: return &bpf_skb_vlan_pop_proto; + case BPF_FUNC_skb_get_tunnel_key: + return &bpf_skb_get_tunnel_key_proto; + case BPF_FUNC_skb_set_tunnel_key: + return bpf_get_skb_set_tunnel_key_proto(); default: return sk_filter_func_proto(func_id); } -- cgit v1.2.3 From f8a9b1bc1b238eed9987da747a0e52f5bb009980 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 30 Jul 2015 20:10:22 -0700 Subject: vxlan: expose COLLECT_METADATA flag to user space Two vxlan driver flags FLOWBASED and COLLECT_METADATA need to be set to make use of its new flow mode. The former already exposed. Expose the latter. Signed-off-by: Alexei Starovoitov Acked-by: Thomas Graf Signed-off-by: David S. 
Miller --- drivers/net/vxlan.c | 5 +++++ include/uapi/linux/if_link.h | 1 + 2 files changed, 6 insertions(+) (limited to 'include/uapi') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index beed5d4025a3..e90f7a484e1c 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2463,6 +2463,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_L2MISS] = { .type = NLA_U8 }, [IFLA_VXLAN_L3MISS] = { .type = NLA_U8 }, [IFLA_VXLAN_FLOWBASED] = { .type = NLA_U8 }, + [IFLA_VXLAN_COLLECT_METADATA] = { .type = NLA_U8 }, [IFLA_VXLAN_PORT] = { .type = NLA_U16 }, [IFLA_VXLAN_UDP_CSUM] = { .type = NLA_U8 }, [IFLA_VXLAN_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 }, @@ -2817,6 +2818,10 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev, nla_get_u8(data[IFLA_VXLAN_FLOWBASED])) conf.flags |= VXLAN_F_FLOW_BASED; + if (data[IFLA_VXLAN_COLLECT_METADATA] && + nla_get_u8(data[IFLA_VXLAN_COLLECT_METADATA])) + conf.flags |= VXLAN_F_COLLECT_METADATA; + if (data[IFLA_VXLAN_PORT_RANGE]) { const struct ifla_vxlan_port_range *p = nla_data(data[IFLA_VXLAN_PORT_RANGE]); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 9eeb5d9cf8f0..24e22cd4be79 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -383,6 +383,7 @@ enum { IFLA_VXLAN_GBP, IFLA_VXLAN_REMCSUM_NOPARTIAL, IFLA_VXLAN_FLOWBASED, + IFLA_VXLAN_COLLECT_METADATA, __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) -- cgit v1.2.3 From 0f7bffd9e512b77279bbce704fad3cb1d6887958 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 31 Jul 2015 16:49:43 +0200 Subject: bonding: add tlb_dynamic_lb netlink support tlb_dynamic_lb could previously be set only via sysfs; this patch allows it to be set via netlink as well. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S.
Miller --- drivers/net/bonding/bond_netlink.c | 17 +++++++++++++++-- include/uapi/linux/if_link.h | 1 + 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c index 1bda29249d12..db760e84119f 100644 --- a/drivers/net/bonding/bond_netlink.c +++ b/drivers/net/bonding/bond_netlink.c @@ -111,6 +111,7 @@ static const struct nla_policy bond_policy[IFLA_BOND_MAX + 1] = { [IFLA_BOND_AD_USER_PORT_KEY] = { .type = NLA_U16 }, [IFLA_BOND_AD_ACTOR_SYSTEM] = { .type = NLA_BINARY, .len = ETH_ALEN }, + [IFLA_BOND_TLB_DYNAMIC_LB] = { .type = NLA_U8 }, }; static const struct nla_policy bond_slave_policy[IFLA_BOND_SLAVE_MAX + 1] = { @@ -405,7 +406,6 @@ static int bond_changelink(struct net_device *bond_dev, if (err) return err; } - if (data[IFLA_BOND_AD_USER_PORT_KEY]) { int port_key = nla_get_u16(data[IFLA_BOND_AD_USER_PORT_KEY]); @@ -415,7 +415,6 @@ static int bond_changelink(struct net_device *bond_dev, if (err) return err; } - if (data[IFLA_BOND_AD_ACTOR_SYSTEM]) { if (nla_len(data[IFLA_BOND_AD_ACTOR_SYSTEM]) != ETH_ALEN) return -EINVAL; @@ -426,6 +425,15 @@ static int bond_changelink(struct net_device *bond_dev, if (err) return err; } + if (data[IFLA_BOND_TLB_DYNAMIC_LB]) { + int dynamic_lb = nla_get_u8(data[IFLA_BOND_TLB_DYNAMIC_LB]); + + bond_opt_initval(&newval, dynamic_lb); + err = __bond_opt_set(bond, BOND_OPT_TLB_DYNAMIC_LB, &newval); + if (err) + return err; + } + return 0; } @@ -476,6 +484,7 @@ static size_t bond_get_size(const struct net_device *bond_dev) nla_total_size(sizeof(u16)) + /* IFLA_BOND_AD_ACTOR_SYS_PRIO */ nla_total_size(sizeof(u16)) + /* IFLA_BOND_AD_USER_PORT_KEY */ nla_total_size(ETH_ALEN) + /* IFLA_BOND_AD_ACTOR_SYSTEM */ + nla_total_size(sizeof(u8)) + /* IFLA_BOND_TLB_DYNAMIC_LB */ 0; } @@ -598,6 +607,10 @@ static int bond_fill_info(struct sk_buff *skb, bond->params.ad_select)) goto nla_put_failure; + if (nla_put_u8(skb, IFLA_BOND_TLB_DYNAMIC_LB, + bond->params.tlb_dynamic_lb)) + goto nla_put_failure; + if (BOND_MODE(bond) == BOND_MODE_8023AD) { struct ad_info info; diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 24e22cd4be79..ea047480a1f0 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -434,6 +434,7 @@ enum { IFLA_BOND_AD_ACTOR_SYS_PRIO, IFLA_BOND_AD_USER_PORT_KEY, IFLA_BOND_AD_ACTOR_SYSTEM, + IFLA_BOND_TLB_DYNAMIC_LB, __IFLA_BOND_MAX, }; -- cgit v1.2.3 From ba7591d8b28bd16a2eface5d009ab0b60c7629a4 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 1 Aug 2015 00:46:29 +0200 Subject: ebpf: add skb->hash to offset map for usage in {cls, act}_bpf or filters Add skb->hash to the __sk_buff offset map, so it can be accessed from an eBPF program. We currently already do this for classic BPF filters, but not yet on eBPF, it might be useful as a demuxer in combination with helpers like bpf_clone_redirect(), toy example: __section("cls-lb") int ingress_main(struct __sk_buff *skb) { unsigned int which = 3 + (skb->hash & 7); /* bpf_skb_store_bytes(skb, ...); */ /* bpf_l{3,4}_csum_replace(skb, ...); */ bpf_clone_redirect(skb, which, 0); return -1; } I was thinking whether to add skb_get_hash(), but then concluded the raw skb->hash seems fine in this case: we can directly access the hash w/o extra eBPF helper function call, it's filled out by many NICs on ingress, and in case the entropy level would not be sufficient, people can still implement their own specific sw fallback hash mix anyway. 
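If such a software fallback is wanted, it could look roughly like the following restricted-C sketch (my_hash() is purely illustrative and not part of this patch; a real mix would be chosen per use case):

	static inline __u32 my_hash(const struct __sk_buff *skb)
	{
		__u32 h = skb->hash;

		/* no hash filled in by the NIC/stack: mix something
		 * crude from other fields as a stand-in */
		if (!h)
			h = skb->ifindex ^ skb->protocol;
		return h;
	}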
Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 7 +++++++ 2 files changed, 8 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bc0d27d3fbdd..2ce13c109b00 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -290,6 +290,7 @@ struct __sk_buff { __u32 ifindex; __u32 tc_index; __u32 cb[5]; + __u32 hash; }; struct bpf_tunnel_key { diff --git a/net/core/filter.c b/net/core/filter.c index 1b72264ff2ee..a50dbfa83ad9 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1711,6 +1711,13 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, offsetof(struct net_device, ifindex)); break; + case offsetof(struct __sk_buff, hash): + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); + + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, hash)); + break; + case offsetof(struct __sk_buff, mark): BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); -- cgit v1.2.3 From 3c7c8468e5d993dfe377a67e41cbb23cda93af9e Mon Sep 17 00:00:00 2001 From: Tomas Winkler Date: Sun, 26 Jul 2015 09:54:20 +0300 Subject: mei: add async event notification ioctls Add ioctl IOCTL_MEI_NOTIFY_SET for enabling and disabling async event notification. Add ioctl IOCTL_MEI_NOTIFY_GET for receiving and acking an event notification. Signed-off-by: Tomas Winkler Signed-off-by: Alexander Usyskin Signed-off-by: Greg Kroah-Hartman --- Documentation/ioctl/ioctl-number.txt | 2 + Documentation/misc-devices/mei/mei.txt | 45 ++++++++++++++++++++++- drivers/misc/mei/main.c | 67 ++++++++++++++++++++++++++++++++++ include/uapi/linux/mei.h | 19 ++++++++++ 4 files changed, 132 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index 611c52267d24..141f847c7648 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -124,6 +124,8 @@ Code Seq#(hex) Include File Comments 'H' 00-7F linux/hiddev.h conflict! 'H' 00-0F linux/hidraw.h conflict! 'H' 01 linux/mei.h conflict! +'H' 02 linux/mei.h conflict! +'H' 03 linux/mei.h conflict! 'H' 00-0F sound/asound.h conflict! 'H' 20-40 sound/asound_fm.h conflict! 'H' 80-8F sound/sfnt_info.h conflict! diff --git a/Documentation/misc-devices/mei/mei.txt b/Documentation/misc-devices/mei/mei.txt index 8d47501bba0a..91c1fa34f48b 100644 --- a/Documentation/misc-devices/mei/mei.txt +++ b/Documentation/misc-devices/mei/mei.txt @@ -96,7 +96,7 @@ A code snippet for an application communicating with Intel AMTHI client: IOCTL ===== -The Intel MEI Driver supports the following IOCTL command: +The Intel MEI Driver supports the following IOCTL commands: IOCTL_MEI_CONNECT_CLIENT Connect to firmware Feature (client). usage: @@ -125,6 +125,49 @@ The Intel MEI Driver supports the following IOCTL command: data that can be sent or received. (e.g. if MTU=2K, can send requests up to bytes 2k and received responses up to 2k bytes). + IOCTL_MEI_NOTIFY_SET: enable or disable event notifications + + Usage: + uint32_t enable; + ioctl(fd, IOCTL_MEI_NOTIFY_SET, &enable); + + Inputs: + uint32_t enable = 1; + or + uint32_t enable[disable] = 0; + + Error returns: + EINVAL Wrong IOCTL Number + ENODEV Device is not initialized or the client not connected + ENOMEM Unable to allocate memory to client internal data. + EFAULT Fatal Error (e.g. 
Unable to access user input data) + EOPNOTSUPP if the device doesn't support the feature + + Notes: + The client must be connected in order to enable notification events + + + IOCTL_MEI_NOTIFY_GET : retrieve event + + Usage: + uint32_t event; + ioctl(fd, IOCTL_MEI_NOTIFY_GET, &event); + + Outputs: + 1 - if an event is pending + 0 - if there is no event pending + + Error returns: + EINVAL Wrong IOCTL Number + ENODEV Device is not initialized or the client not connected + ENOMEM Unable to allocate memory to client internal data. + EFAULT Fatal Error (e.g. Unable to access user input data) + EOPNOTSUPP if the device doesn't support the feature + + Notes: + The client must be connected and event notification has to be enabled + in order to receive an event + Intel ME Applications ===================== diff --git a/drivers/misc/mei/main.c b/drivers/misc/mei/main.c index e9513d651cd3..ffa70035af29 100644 --- a/drivers/misc/mei/main.c +++ b/drivers/misc/mei/main.c @@ -445,6 +445,45 @@ end: return rets; } +/** + * mei_ioctl_client_notify_request - + * propagate event notification request to client + * + * @file: pointer to file structure + * @request: 0 - disable, 1 - enable + * + * Return: 0 on success, <0 on error + */ +static int mei_ioctl_client_notify_request(struct file *file, u32 request) +{ + struct mei_cl *cl = file->private_data; + + return mei_cl_notify_request(cl, file, request); +} + +/** + * mei_ioctl_client_notify_get - wait for notification request + * + * @file: pointer to file structure + * @notify_get: 0 - disable, 1 - enable + * + * Return: 0 on success, <0 on error + */ +static int mei_ioctl_client_notify_get(struct file *file, u32 *notify_get) +{ + struct mei_cl *cl = file->private_data; + bool notify_ev; + bool block = (file->f_flags & O_NONBLOCK) == 0; + int rets; + + rets = mei_cl_notify_get(cl, block, &notify_ev); + if (rets) + return rets; + + *notify_get = notify_ev ?
1 : 0; + return 0; +} + /** * mei_ioctl - the IOCTL function * @@ -459,6 +498,7 @@ static long mei_ioctl(struct file *file, unsigned int cmd, unsigned long data) struct mei_device *dev; struct mei_cl *cl = file->private_data; struct mei_connect_client_data connect_data; + u32 notify_get, notify_req; int rets; @@ -499,6 +539,33 @@ static long mei_ioctl(struct file *file, unsigned int cmd, unsigned long data) break; + case IOCTL_MEI_NOTIFY_SET: + dev_dbg(dev->dev, ": IOCTL_MEI_NOTIFY_SET.\n"); + if (copy_from_user(&notify_req, + (char __user *)data, sizeof(notify_req))) { + dev_dbg(dev->dev, "failed to copy data from userland\n"); + rets = -EFAULT; + goto out; + } + rets = mei_ioctl_client_notify_request(file, notify_req); + break; + + case IOCTL_MEI_NOTIFY_GET: + dev_dbg(dev->dev, ": IOCTL_MEI_NOTIFY_GET.\n"); + rets = mei_ioctl_client_notify_get(file, &notify_get); + if (rets) + goto out; + + dev_dbg(dev->dev, "copy connect data to user\n"); + if (copy_to_user((char __user *)data, + &notify_get, sizeof(notify_get))) { + dev_dbg(dev->dev, "failed to copy data to userland\n"); + rets = -EFAULT; + goto out; + + } + break; + default: dev_err(dev->dev, ": unsupported ioctl %d.\n", cmd); rets = -ENOIOCTLCMD; diff --git a/include/uapi/linux/mei.h b/include/uapi/linux/mei.h index bc0d8b69c49e..7c3b64f6a215 100644 --- a/include/uapi/linux/mei.h +++ b/include/uapi/linux/mei.h @@ -107,4 +107,23 @@ struct mei_connect_client_data { }; }; +/** + * DOC: set and unset event notification for a connected client + * + * The IOCTL argument is 1 for enabling event notification and 0 for + * disabling the service + * Return: -EOPNOTSUPP if the device doesn't support the feature + */ +#define IOCTL_MEI_NOTIFY_SET _IOW('H', 0x02, __u32) + +/** + * DOC: retrieve notification + * + * The IOCTL output argument is 1 if an event is pending and 0 otherwise; + * the ioctl has to be called in order to acknowledge a pending event + * + * Return: -EOPNOTSUPP if the device doesn't support the feature + */ +#define IOCTL_MEI_NOTIFY_GET _IOR('H', 0x03, __u32) + #endif /* _LINUX_MEI_H */ -- cgit v1.2.3 From a6affd24f439feddec04bab4d1e3ad6579868367 Mon Sep 17 00:00:00 2001 From: Robert Shearman Date: Mon, 3 Aug 2015 17:50:04 +0100 Subject: mpls: Use definition for reserved label checks In multiple locations there are checks for whether the label in hand is a reserved label or not using the arbitrary value of 16. Factor this out into a #define for better maintainability and for documentation. Signed-off-by: Robert Shearman Acked-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/uapi/linux/mpls.h | 2 ++ net/mpls/af_mpls.c | 21 +++++++++++---------- 2 files changed, 13 insertions(+), 10 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/mpls.h b/include/uapi/linux/mpls.h index 139d4dd1cab8..24a6cb1aec86 100644 --- a/include/uapi/linux/mpls.h +++ b/include/uapi/linux/mpls.h @@ -41,4 +41,6 @@ struct mpls_label { #define MPLS_LABEL_OAMALERT 14 /* RFC3429 */ #define MPLS_LABEL_EXTENSION 15 /* RFC7274 */ +#define MPLS_LABEL_FIRST_UNRESERVED 16 /* RFC3032 */ + #endif /* _UAPI_MPLS_H */ diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 88cfaa241c07..b6b9a6c4e784 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -293,7 +293,7 @@ static void mpls_notify_route(struct net *net, unsigned index, struct mpls_route *rt = new ? new : old; unsigned nlm_flags = (old && new) ?
NLM_F_REPLACE : 0; /* Ignore reserved labels for now */ - if (rt && (index >= 16)) + if (rt && (index >= MPLS_LABEL_FIRST_UNRESERVED)) rtmsg_lfib(event, index, rt, nlh, net, portid, nlm_flags); } @@ -327,7 +327,8 @@ static unsigned find_free_label(struct net *net) platform_label = rtnl_dereference(net->mpls.platform_label); platform_labels = net->mpls.platform_labels; - for (index = 16; index < platform_labels; index++) { + for (index = MPLS_LABEL_FIRST_UNRESERVED; index < platform_labels; + index++) { if (!rtnl_dereference(platform_label[index])) return index; } @@ -436,8 +437,8 @@ static int mpls_route_add(struct mpls_route_config *cfg) index = find_free_label(net); } - /* The first 16 labels are reserved, and may not be set */ - if (index < 16) + /* Reserved labels may not be set */ + if (index < MPLS_LABEL_FIRST_UNRESERVED) goto errout; /* The full 20 bit range may not be supported. */ @@ -516,8 +517,8 @@ static int mpls_route_del(struct mpls_route_config *cfg) index = cfg->rc_label; - /* The first 16 labels are reserved, and may not be removed */ - if (index < 16) + /* Reserved labels may not be removed */ + if (index < MPLS_LABEL_FIRST_UNRESERVED) goto errout; /* The full 20 bit range may not be supported */ @@ -835,8 +836,8 @@ static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh, &cfg->rc_label)) goto errout; - /* The first 16 labels are reserved, and may not be set */ - if (cfg->rc_label < 16) + /* Reserved labels may not be set */ + if (cfg->rc_label < MPLS_LABEL_FIRST_UNRESERVED) goto errout; break; @@ -961,8 +962,8 @@ static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb) ASSERT_RTNL(); index = cb->args[0]; - if (index < 16) - index = 16; + if (index < MPLS_LABEL_FIRST_UNRESERVED) + index = MPLS_LABEL_FIRST_UNRESERVED; platform_label = rtnl_dereference(net->mpls.platform_label); platform_labels = net->mpls.platform_labels; -- cgit v1.2.3 From 71ef3c6b9d4665ee7afbbe4c208a98917dcfc32f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 10 May 2015 12:22:42 -0700 Subject: perf: Add cycles to branch_info Intel Skylake supports reporting the time in cycles a branch in the LBR took, to give a rough indication of the basic block performance. Export the cycle information in the branch_info structure. This can be done by just reusing some currently zero padding. This is just the generic header change. The architecture still needs to fill it in. There's no attempt to convert to real time, as we really want cycles here. 
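For completeness, a perf consumer that has decoded a PERF_SAMPLE_BRANCH_STACK sample could dump the new field roughly as follows (a sketch; how 'entries' and 'nr' are pulled out of the mmap'ed ring buffer is assumed and not shown):

	#include <stdio.h>
	#include <linux/perf_event.h>

	static void print_lbr_cycles(const struct perf_branch_entry *entries,
				     unsigned long nr)
	{
		unsigned long i;

		for (i = 0; i < nr; i++)
			printf("%#llx -> %#llx: %u cycles\n",
			       (unsigned long long)entries[i].from,
			       (unsigned long long)entries[i].to,
			       (unsigned int)entries[i].cycles); /* 0: unsupported */
	}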
Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: eranian@google.com Link: http://lkml.kernel.org/r/1431285767-27027-5-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- include/uapi/linux/perf_event.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 022d0acf7df0..2881145cda86 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -951,6 +951,7 @@ union perf_mem_data_src { * * in_tx: running in a hardware transaction * abort: aborting a hardware transaction + * cycles: cycles from last branch (or 0 if not supported) */ struct perf_branch_entry { __u64 from; @@ -959,7 +960,8 @@ struct perf_branch_entry { predicted:1,/* target predicted */ in_tx:1, /* in transaction */ abort:1, /* transaction abort */ - reserved:60; + cycles:16, /* cycle count to last branch */ + reserved:44; }; #endif /* _UAPI_LINUX_PERF_EVENT_H */ -- cgit v1.2.3 From 35c051258e8fd7cb97222f4aa887bcd404c156d0 Mon Sep 17 00:00:00 2001 From: Sinclair Yeh Date: Fri, 26 Jun 2015 01:42:06 -0700 Subject: drm/vmwgfx: Implement screen targets Add support for the screen target device interface. Add a getparam parameter and bump minor to signal availability. Signed-off-by: Sinclair Yeh Signed-off-by: Thomas Hellstrom --- drivers/gpu/drm/vmwgfx/Makefile | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 24 +- drivers/gpu/drm/vmwgfx/vmwgfx_drv.h | 15 +- drivers/gpu/drm/vmwgfx/vmwgfx_ioctl.c | 4 + drivers/gpu/drm/vmwgfx/vmwgfx_kms.c | 62 +- drivers/gpu/drm/vmwgfx/vmwgfx_kms.h | 20 + drivers/gpu/drm/vmwgfx/vmwgfx_mob.c | 3 +- drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c | 1364 +++++++++++++++++++++++++++++++ drivers/gpu/drm/vmwgfx/vmwgfx_surface.c | 4 + include/uapi/drm/vmwgfx_drm.h | 1 + 10 files changed, 1475 insertions(+), 24 deletions(-) create mode 100644 drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c (limited to 'include/uapi') diff --git a/drivers/gpu/drm/vmwgfx/Makefile b/drivers/gpu/drm/vmwgfx/Makefile index 529bc7217c72..484093986d5a 100644 --- a/drivers/gpu/drm/vmwgfx/Makefile +++ b/drivers/gpu/drm/vmwgfx/Makefile @@ -7,6 +7,6 @@ vmwgfx-y := vmwgfx_execbuf.o vmwgfx_gmr.o vmwgfx_kms.o vmwgfx_drv.o \ vmwgfx_overlay.o vmwgfx_marker.o vmwgfx_gmrid_manager.o \ vmwgfx_fence.o vmwgfx_dmabuf.o vmwgfx_scrn.o vmwgfx_context.o \ vmwgfx_surface.o vmwgfx_prime.o vmwgfx_mob.o vmwgfx_shader.o \ - vmwgfx_cmdbuf_res.o vmwgfx_cmdbuf.o \ + vmwgfx_cmdbuf_res.o vmwgfx_cmdbuf.o vmwgfx_stdu.o \ obj-$(CONFIG_DRM_VMWGFX) := vmwgfx.o diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c index 7e2b3c84119b..ab1b70ce19c1 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c @@ -693,22 +693,28 @@ static int vmw_driver_load(struct drm_device *dev, unsigned long chipset) SVGA_REG_MAX_PRIMARY_BOUNDING_BOX_MEM); dev_priv->max_mob_size = vmw_read(dev_priv, SVGA_REG_MOB_MAX_SIZE); + dev_priv->stdu_max_width = + vmw_read(dev_priv, SVGA_REG_SCREENTARGET_MAX_WIDTH); + dev_priv->stdu_max_height = + vmw_read(dev_priv, SVGA_REG_SCREENTARGET_MAX_HEIGHT); + + vmw_write(dev_priv, SVGA_REG_DEV_CAP, + SVGA3D_DEVCAP_MAX_TEXTURE_WIDTH); + dev_priv->texture_max_width = vmw_read(dev_priv, + SVGA_REG_DEV_CAP); + vmw_write(dev_priv, SVGA_REG_DEV_CAP, + SVGA3D_DEVCAP_MAX_TEXTURE_HEIGHT); + dev_priv->texture_max_height = vmw_read(dev_priv, + SVGA_REG_DEV_CAP); } else dev_priv->prim_bb_mem = 
dev_priv->vram_size; + + vmw_print_capabilities(dev_priv->capabilities); ret = vmw_dma_masks(dev_priv); if (unlikely(ret != 0)) goto out_err0; - /* - * Limit back buffer size to VRAM size. Remove this once - * screen targets are implemented. - */ - if (dev_priv->prim_bb_mem > dev_priv->vram_size) - dev_priv->prim_bb_mem = dev_priv->vram_size; - - vmw_print_capabilities(dev_priv->capabilities); - if (dev_priv->capabilities & SVGA_CAP_GMR2) { DRM_INFO("Max GMR ids is %u\n", (unsigned)dev_priv->max_gmr_ids); diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h index c3f8fc97b336..04f8bf21557f 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h @@ -40,17 +40,17 @@ #include #include "vmwgfx_fence.h" -#define VMWGFX_DRIVER_DATE "20140704" +#define VMWGFX_DRIVER_DATE "20150626" #define VMWGFX_DRIVER_MAJOR 2 -#define VMWGFX_DRIVER_MINOR 6 -#define VMWGFX_DRIVER_PATCHLEVEL 1 +#define VMWGFX_DRIVER_MINOR 7 +#define VMWGFX_DRIVER_PATCHLEVEL 0 #define VMWGFX_FILE_PAGE_OFFSET 0x00100000 #define VMWGFX_FIFO_STATIC_SIZE (1024*1024) #define VMWGFX_MAX_RELOCATIONS 2048 #define VMWGFX_MAX_VALIDATIONS 2048 #define VMWGFX_MAX_DISPLAYS 16 #define VMWGFX_CMD_BOUNCE_INIT_SIZE 32768 -#define VMWGFX_ENABLE_SCREEN_TARGET_OTABLE 0 +#define VMWGFX_ENABLE_SCREEN_TARGET_OTABLE 1 /* * Perhaps we should have sysfs entries for these. @@ -337,7 +337,8 @@ struct vmw_ctx_binding_state { enum vmw_display_unit_type { vmw_du_invalid = 0, vmw_du_legacy, - vmw_du_screen_object + vmw_du_screen_object, + vmw_du_screen_target }; @@ -402,6 +403,10 @@ struct vmw_private { uint32_t mmio_size; uint32_t fb_max_width; uint32_t fb_max_height; + uint32_t texture_max_width; + uint32_t texture_max_height; + uint32_t stdu_max_width; + uint32_t stdu_max_height; uint32_t initial_width; uint32_t initial_height; __le32 __iomem *mmio_virt; diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ioctl.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ioctl.c index 69c8ce23123c..55940bc0eb07 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_ioctl.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ioctl.c @@ -105,6 +105,10 @@ int vmw_getparam_ioctl(struct drm_device *dev, void *data, case DRM_VMW_PARAM_MAX_MOB_SIZE: param->value = dev_priv->max_mob_size; break; + case DRM_VMW_PARAM_SCREEN_TARGET: + param->value = + (dev_priv->active_display_unit == vmw_du_screen_target); + break; default: DRM_ERROR("Illegal vmwgfx get param request: %d\n", param->param); diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c index 7566a5a14004..6680aa67386f 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c @@ -463,6 +463,11 @@ static int vmw_framebuffer_surface_dirty(struct drm_framebuffer *framebuffer, flags, color, clips, num_clips, inc, NULL); + else + ret = vmw_kms_stdu_do_surface_dirty(dev_priv, file_priv, + &vfbs->base, + clips, num_clips, + inc); vmw_fifo_flush(dev_priv, false); ttm_read_unlock(&dev_priv->reservation_sem); @@ -636,6 +641,11 @@ static int vmw_framebuffer_dmabuf_dirty(struct drm_framebuffer *framebuffer, flags, color, clips, num_clips, increment, NULL); + } else { + ret = vmw_kms_stdu_do_surface_dirty(dev_priv, file_priv, + &vfbd->base, + clips, num_clips, + increment); } vmw_fifo_flush(dev_priv, false); @@ -999,8 +1009,6 @@ int vmw_kms_generic_present(struct vmw_private *dev_priv, break; } - vmw_fifo_flush(dev_priv, false); - kfree(cmd); out_free_tmp: kfree(tmp); @@ -1017,8 +1025,21 @@ int vmw_kms_present(struct vmw_private *dev_priv, struct 
drm_vmw_rect *clips, uint32_t num_clips) { - return vmw_kms_generic_present(dev_priv, file_priv, vfb, surface, sid, - destX, destY, clips, num_clips); + int ret; + + if (dev_priv->active_display_unit == vmw_du_screen_target) + ret = vmw_kms_stdu_present(dev_priv, file_priv, vfb, sid, + destX, destY, clips, num_clips); + else + ret = vmw_kms_generic_present(dev_priv, file_priv, vfb, + surface, sid, destX, destY, + clips, num_clips); + if (ret) + return ret; + + vmw_fifo_flush(dev_priv, false); + + return 0; } int vmw_kms_readback(struct vmw_private *dev_priv, @@ -1141,9 +1162,12 @@ int vmw_kms_init(struct vmw_private *dev_priv) dev->mode_config.max_width = 8192; dev->mode_config.max_height = 8192; - ret = vmw_kms_sou_init_display(dev_priv); - if (ret) /* Fallback */ - ret = vmw_kms_ldu_init_display(dev_priv); + ret = vmw_kms_stdu_init_display(dev_priv); + if (ret) { + ret = vmw_kms_sou_init_display(dev_priv); + if (ret) /* Fallback */ + ret = vmw_kms_ldu_init_display(dev_priv); + } return ret; } @@ -1160,6 +1184,8 @@ int vmw_kms_close(struct vmw_private *dev_priv) drm_mode_config_cleanup(dev_priv->dev); if (dev_priv->active_display_unit == vmw_du_screen_object) ret = vmw_kms_sou_close_display(dev_priv); + else if (dev_priv->active_display_unit == vmw_du_screen_target) + ret = vmw_kms_stdu_close_display(dev_priv); else ret = vmw_kms_ldu_close_display(dev_priv); @@ -1311,7 +1337,9 @@ bool vmw_kms_validate_mode_vram(struct vmw_private *dev_priv, uint32_t pitch, uint32_t height) { - return ((u64) pitch * (u64) height) < (u64) dev_priv->prim_bb_mem; + return ((u64) pitch * (u64) height) < (u64) + ((dev_priv->active_display_unit == vmw_du_screen_target) ? + dev_priv->prim_bb_mem : dev_priv->vram_size); } @@ -1558,6 +1586,11 @@ int vmw_du_connector_fill_modes(struct drm_connector *connector, if (dev_priv->active_display_unit == vmw_du_screen_object) assumed_bpp = 4; + if (dev_priv->active_display_unit == vmw_du_screen_target) { + max_width = min(max_width, dev_priv->stdu_max_width); + max_height = min(max_height, dev_priv->stdu_max_height); + } + /* Add preferred mode */ mode = drm_mode_duplicate(dev, &prefmode); if (!mode) @@ -1674,6 +1707,19 @@ int vmw_kms_update_layout_ioctl(struct drm_device *dev, void *data, bounding_box.h = rects[i].y + rects[i].h; } + /* + * For Screen Target Display Unit, all the displays must fit + * inside the maximum texture size.
+ */ + if (dev_priv->active_display_unit == vmw_du_screen_target) + if (bounding_box.w > dev_priv->texture_max_width || + bounding_box.h > dev_priv->texture_max_height) { + DRM_ERROR("Layout exceeds maximum texture size\n"); + ret = -EINVAL; + goto out_free; + } + + vmw_du_update_layout(dev_priv, arg->num_outputs, rects); out_free: diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h index 0f2c29166f7c..548fa872b39c 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h @@ -204,4 +204,24 @@ int vmw_kms_sou_do_dmabuf_dirty(struct drm_file *file_priv, struct drm_clip_rect *clips, unsigned num_clips, int increment, struct vmw_fence_obj **out_fence); + + +/* + * Screen Target Display Unit functions - vmwgfx_stdu.c + */ +int vmw_kms_stdu_init_display(struct vmw_private *dev_priv); +int vmw_kms_stdu_close_display(struct vmw_private *dev_priv); +int vmw_kms_stdu_do_surface_dirty(struct vmw_private *dev_priv, + struct drm_file *file_priv, + struct vmw_framebuffer *framebuffer, + struct drm_clip_rect *clips, + unsigned num_clips, int increment); +int vmw_kms_stdu_present(struct vmw_private *dev_priv, + struct drm_file *file_priv, + struct vmw_framebuffer *vfb, + uint32_t user_handle, + int32_t dest_x, int32_t dest_y, + struct drm_vmw_rect *clips, + uint32_t num_clips); + #endif diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_mob.c b/drivers/gpu/drm/vmwgfx/vmwgfx_mob.c index 46f975e57d06..0feac5675c51 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_mob.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_mob.c @@ -31,7 +31,8 @@ * If we set up the screen target otable, screen objects stop working. */ -#define VMW_OTABLE_SETUP_SUB ((VMWGFX_ENABLE_SCREEN_TARGET_OTABLE) ? 0 : 1) +#define VMW_OTABLE_SETUP_SUB ((VMWGFX_ENABLE_SCREEN_TARGET_OTABLE && \ + (dev_priv->capabilities & SVGA_CAP_3D)) ? 0 : 1) #ifdef CONFIG_64BIT #define VMW_PPN_SIZE 8 diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c b/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c new file mode 100644 index 000000000000..3b8235c7ee42 --- /dev/null +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c @@ -0,0 +1,1364 @@ +/****************************************************************************** + * + * Copyright © 2014 VMware, Inc., Palo Alto, CA., USA + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + ******************************************************************************/ + +#include "vmwgfx_kms.h" +#include "svga3d_surfacedefs.h" +#include + +#define vmw_crtc_to_stdu(x) \ + container_of(x, struct vmw_screen_target_display_unit, base.crtc) +#define vmw_encoder_to_stdu(x) \ + container_of(x, struct vmw_screen_target_display_unit, base.encoder) +#define vmw_connector_to_stdu(x) \ + container_of(x, struct vmw_screen_target_display_unit, base.connector) + + + +enum stdu_content_type { + SAME_AS_DISPLAY = 0, + SEPARATE_SURFACE, + SEPARATE_DMA +}; + + + +/** + * struct vmw_screen_target_display_unit + * + * @base: VMW specific DU structure + * @display_srf: surface to be displayed. The dimension of this will always + * match the display mode. If the display mode matches + * content_vfbs dimensions, then this is a pointer into the + * corresponding field in content_vfbs. If not, then this + * is a separate buffer to which content_vfbs will blit. + * @content_fb: holds the rendered content, can be a surface or DMA buffer + * @content_fb_type: content_fb type + * @defined: true if the current display unit has been initialized + */ +struct vmw_screen_target_display_unit { + struct vmw_display_unit base; + + struct vmw_surface *display_srf; + struct drm_framebuffer *content_fb; + + enum stdu_content_type content_fb_type; + + bool defined; +}; + + + +static void vmw_stdu_destroy(struct vmw_screen_target_display_unit *stdu); + + + +/****************************************************************************** + * Screen Target Display Unit Helper Functions + *****************************************************************************/ + +/** + * vmw_stdu_pin_display - pins the resource associated with the display surface + * + * @stdu: contains the display surface + * + * Since the display surface can either be a private surface allocated by us, + * or it can point to the content surface, we use this function to avoid pinning + * the same resource twice. + */ +static int vmw_stdu_pin_display(struct vmw_screen_target_display_unit *stdu) +{ + return vmw_resource_pin(&stdu->display_srf->res); +} + + + +/** + * vmw_stdu_unpin_display - unpins the resource associated with display surface + * + * @stdu: contains the display surface + * + * If the display surface was privately allocated by + * vmw_surface_gb_priv_define() and not registered as a framebuffer, then it + * won't be automatically cleaned up when all the framebuffers are freed. As + * such, we have to explicitly call vmw_resource_unreference() to get it freed.
+ */ +static void vmw_stdu_unpin_display(struct vmw_screen_target_display_unit *stdu) +{ + if (stdu->display_srf) { + struct vmw_resource *res = &stdu->display_srf->res; + + vmw_resource_unpin(res); + + if (stdu->content_fb_type != SAME_AS_DISPLAY) { + vmw_resource_unreference(&res); + stdu->content_fb_type = SAME_AS_DISPLAY; + } + + stdu->display_srf = NULL; + } +} + + + +/****************************************************************************** + * Screen Target Display Unit CRTC Functions + *****************************************************************************/ + + +/** + * vmw_stdu_crtc_destroy - cleans up the STDU + * + * @crtc: used to get a reference to the containing STDU + */ +static void vmw_stdu_crtc_destroy(struct drm_crtc *crtc) +{ + vmw_stdu_destroy(vmw_crtc_to_stdu(crtc)); +} + + + +/** + * vmw_stdu_content_copy - copies an area from the content to the display surface + * + * @dev_priv: VMW DRM device + * @file_priv: Pointer to a drm file private structure + * @stdu: STDU whose display surface will be blitted to + * @content_x: top/left corner of the content area to blit from + * @content_y: top/left corner of the content area to blit from + * @width: width of the blit area + * @height: height of the blit area + * @display_x: top/left corner of the display area to blit to + * @display_y: top/left corner of the display area to blit to + * + * Copies an area from the content surface to the display surface. + * + * RETURNS: + * 0 on success, error code on failure + */ +static int vmw_stdu_content_copy(struct vmw_private *dev_priv, + struct drm_file *file_priv, + struct vmw_screen_target_display_unit *stdu, + uint32_t content_x, uint32_t content_y, + uint32_t width, uint32_t height, + uint32_t display_x, uint32_t display_y) +{ + size_t fifo_size; + int ret; + void *cmd; + + struct vmw_surface_dma { + SVGA3dCmdHeader header; + SVGA3dCmdSurfaceDMA body; + SVGA3dCopyBox area; + SVGA3dCmdSurfaceDMASuffix suffix; + } surface_dma_cmd; + + struct { + SVGA3dCmdHeader header; + SVGA3dCmdSurfaceCopy body; + SVGA3dCopyBox area; + } surface_cpy_cmd; + + + /* + * Can only copy if content and display surfaces exist and are not + * the same surface + */ + if (stdu->display_srf == NULL || stdu->content_fb == NULL || + stdu->content_fb_type == SAME_AS_DISPLAY) { + return -EINVAL; + } + + if (stdu->content_fb_type == SEPARATE_DMA) { + struct vmw_framebuffer *content_vfb; + struct vmw_framebuffer_dmabuf *content_vfbd; + struct vmw_framebuffer_surface *content_vfbs; + struct drm_vmw_size cur_size = {0}; + const struct svga3d_surface_desc *desc; + SVGA3dCmdSurfaceDMASuffix *suffix; + SVGAGuestPtr ptr; + + content_vfb = vmw_framebuffer_to_vfb(stdu->content_fb); + content_vfbd = vmw_framebuffer_to_vfbd(stdu->content_fb); + content_vfbs = vmw_framebuffer_to_vfbs(stdu->content_fb); + + cur_size.width = width; + cur_size.height = height; + cur_size.depth = 1; + + desc = svga3dsurface_get_desc(content_vfbs->surface->format); + + + fifo_size = sizeof(surface_dma_cmd); + + memset(&surface_dma_cmd, 0, fifo_size); + + ptr.gmrId = content_vfb->user_handle; + ptr.offset = 0; + + surface_dma_cmd.header.id = SVGA_3D_CMD_SURFACE_DMA; + surface_dma_cmd.header.size = sizeof(surface_dma_cmd.body) + + sizeof(surface_dma_cmd.area) + + sizeof(surface_dma_cmd.suffix); + + surface_dma_cmd.body.guest.ptr = ptr; + surface_dma_cmd.body.guest.pitch = stdu->content_fb->pitches[0]; + surface_dma_cmd.body.host.sid = stdu->display_srf->res.id; + surface_dma_cmd.body.host.face = 0; + surface_dma_cmd.body.host.mipmap = 0; +
surface_dma_cmd.body.transfer = SVGA3D_WRITE_HOST_VRAM; + + surface_dma_cmd.area.srcx = content_x; + surface_dma_cmd.area.srcy = content_y; + surface_dma_cmd.area.x = display_x; + surface_dma_cmd.area.y = display_y; + surface_dma_cmd.area.d = 1; + surface_dma_cmd.area.w = width; + surface_dma_cmd.area.h = height; + + suffix = &surface_dma_cmd.suffix; + + suffix->suffixSize = sizeof(*suffix); + suffix->maximumOffset = svga3dsurface_get_image_buffer_size( + desc, + &cur_size, + stdu->content_fb->pitches[0]); + + cmd = (void *) &surface_dma_cmd; + } else { + struct vmw_framebuffer *content_vfb; + + content_vfb = vmw_framebuffer_to_vfb(stdu->content_fb); + + fifo_size = sizeof(surface_cpy_cmd); + + memset(&surface_cpy_cmd, 0, sizeof(surface_cpy_cmd)); + + surface_cpy_cmd.header.id = SVGA_3D_CMD_SURFACE_COPY; + surface_cpy_cmd.header.size = sizeof(surface_cpy_cmd.body) + + sizeof(surface_cpy_cmd.area); + + surface_cpy_cmd.body.src.sid = content_vfb->user_handle; + surface_cpy_cmd.body.dest.sid = stdu->display_srf->res.id; + + surface_cpy_cmd.area.srcx = content_x; + surface_cpy_cmd.area.srcy = content_y; + surface_cpy_cmd.area.x = display_x; + surface_cpy_cmd.area.y = display_y; + surface_cpy_cmd.area.d = 1; + surface_cpy_cmd.area.w = width; + surface_cpy_cmd.area.h = height; + + cmd = (void *) &surface_cpy_cmd; + } + + ret = vmw_execbuf_process(file_priv, dev_priv, NULL, cmd, + fifo_size, 0, VMW_QUIRK_SCREENTARGET, + NULL, NULL); + + return ret; +} + + + +/** + * vmw_stdu_define_st - Defines a Screen Target + * + * @dev_priv: VMW DRM device + * @stdu: display unit to create a Screen Target for + * + * Creates a STDU that we can use later. This function is called whenever the + * framebuffer size changes. + * + * RETURNS: + * 0 on success, error code on failure + */ +static int vmw_stdu_define_st(struct vmw_private *dev_priv, + struct vmw_screen_target_display_unit *stdu) +{ + struct { + SVGA3dCmdHeader header; + SVGA3dCmdDefineGBScreenTarget body; + } *cmd; + + cmd = vmw_fifo_reserve(dev_priv, sizeof(*cmd)); + + if (unlikely(cmd == NULL)) { + DRM_ERROR("Out of FIFO space defining Screen Target\n"); + return -ENOMEM; + } + + cmd->header.id = SVGA_3D_CMD_DEFINE_GB_SCREENTARGET; + cmd->header.size = sizeof(cmd->body); + + cmd->body.stid = stdu->base.unit; + cmd->body.width = stdu->display_srf->base_size.width; + cmd->body.height = stdu->display_srf->base_size.height; + cmd->body.flags = (0 == cmd->body.stid) ? SVGA_STFLAG_PRIMARY : 0; + cmd->body.dpi = 0; + cmd->body.xRoot = stdu->base.crtc.x; + cmd->body.yRoot = stdu->base.crtc.y; + + if (!stdu->base.is_implicit) { + cmd->body.xRoot = stdu->base.gui_x; + cmd->body.yRoot = stdu->base.gui_y; + } + + vmw_fifo_commit(dev_priv, sizeof(*cmd)); + + stdu->defined = true; + + return 0; +} + + + +/** + * vmw_stdu_bind_st - Binds a surface to a Screen Target + * + * @dev_priv: VMW DRM device + * @stdu: display unit affected + * @res: Buffer to bind to the screen target. Set to NULL to blank screen. + * + * Binding a surface to a Screen Target is the same as flipping + */ +static int vmw_stdu_bind_st(struct vmw_private *dev_priv, + struct vmw_screen_target_display_unit *stdu, + struct vmw_resource *res) +{ + SVGA3dSurfaceImageId image; + + struct { + SVGA3dCmdHeader header; + SVGA3dCmdBindGBScreenTarget body; + } *cmd; + + + if (!stdu->defined) { + DRM_ERROR("No screen target defined\n"); + return -EINVAL; + } + + /* Set up image using information in vfb */ + memset(&image, 0, sizeof(image)); + image.sid = res ?
res->id : SVGA3D_INVALID_ID; + + cmd = vmw_fifo_reserve(dev_priv, sizeof(*cmd)); + + if (unlikely(cmd == NULL)) { + DRM_ERROR("Out of FIFO space binding a screen target\n"); + return -ENOMEM; + } + + cmd->header.id = SVGA_3D_CMD_BIND_GB_SCREENTARGET; + cmd->header.size = sizeof(cmd->body); + + cmd->body.stid = stdu->base.unit; + cmd->body.image = image; + + vmw_fifo_commit(dev_priv, sizeof(*cmd)); + + return 0; +} + + + +/** + * vmw_stdu_update_st - Updates a Screen Target + * + * @dev_priv: VMW DRM device + * @file_priv: Pointer to a drm file private structure + * @stdu: display unit affected + * @update_area: area that needs to be updated + * + * This function needs to be called whenever the content of a screen + * target changes. + * If the display and content buffers are different, then this function does + * a blit first from the content buffer to the display buffer before issuing + * the Screen Target update command. + * + * RETURNS: + * 0 on success, error code on failure + */ +static int vmw_stdu_update_st(struct vmw_private *dev_priv, + struct drm_file *file_priv, + struct vmw_screen_target_display_unit *stdu, + struct drm_clip_rect *update_area) +{ + u32 width, height; + u32 display_update_x, display_update_y; + unsigned short display_x1, display_y1, display_x2, display_y2; + + struct { + SVGA3dCmdHeader header; + SVGA3dCmdUpdateGBScreenTarget body; + } *cmd; + + + if (!stdu->defined) { + DRM_ERROR("No screen target defined"); + return -EINVAL; + } + + /* Display coordinates relative to its position in content surface */ + display_x1 = stdu->base.crtc.x; + display_y1 = stdu->base.crtc.y; + display_x2 = display_x1 + stdu->display_srf->base_size.width; + display_y2 = display_y1 + stdu->display_srf->base_size.height; + + /* Do nothing if the update area is outside of the display surface */ + if (update_area->x2 <= display_x1 || update_area->x1 >= display_x2 || + update_area->y2 <= display_y1 || update_area->y1 >= display_y2) + return 0; + + /* The top-left hand corner of the update area in display surface */ + display_update_x = max(update_area->x1 - display_x1, 0); + display_update_y = max(update_area->y1 - display_y1, 0); + + width = min(update_area->x2, display_x2) - + max(update_area->x1, display_x1); + height = min(update_area->y2, display_y2) - + max(update_area->y1, display_y1); + + if (file_priv && stdu->content_fb_type != SAME_AS_DISPLAY) { + int ret; + + ret = vmw_stdu_content_copy(dev_priv, file_priv, + stdu, + max(update_area->x1, display_x1), + max(update_area->y1, display_y1), + width, height, + display_update_x, display_update_y); + if (unlikely(ret != 0)) { + DRM_ERROR("Failed to blit content\n"); + return ret; + } + } + + cmd = vmw_fifo_reserve(dev_priv, sizeof(*cmd)); + + if (unlikely(cmd == NULL)) { + DRM_ERROR("Out of FIFO space updating a Screen Target\n"); + return -ENOMEM; + } + + cmd->header.id = SVGA_3D_CMD_UPDATE_GB_SCREENTARGET; + cmd->header.size = sizeof(cmd->body); + + cmd->body.stid = stdu->base.unit; + cmd->body.rect.x = display_update_x; + cmd->body.rect.y = display_update_y; + cmd->body.rect.w = width; + cmd->body.rect.h = height; + + vmw_fifo_commit(dev_priv, sizeof(*cmd)); + + return 0; +} + + + +/** + * vmw_stdu_destroy_st - Destroy a Screen Target + * + * @dev_priv: VMW DRM device + * @stdu: display unit to destroy + */ +static int vmw_stdu_destroy_st(struct vmw_private *dev_priv, + struct vmw_screen_target_display_unit *stdu) +{ + int ret; + + struct { + SVGA3dCmdHeader header; + SVGA3dCmdDestroyGBScreenTarget body; + } *cmd; + + + /* Nothing 
to do if not successfully defined */ + if (unlikely(!stdu->defined)) + return 0; + + cmd = vmw_fifo_reserve(dev_priv, sizeof(*cmd)); + + if (unlikely(cmd == NULL)) { + DRM_ERROR("Out of FIFO space, screen target not destroyed\n"); + return -ENOMEM; + } + + cmd->header.id = SVGA_3D_CMD_DESTROY_GB_SCREENTARGET; + cmd->header.size = sizeof(cmd->body); + + cmd->body.stid = stdu->base.unit; + + vmw_fifo_commit(dev_priv, sizeof(*cmd)); + + /* Force sync */ + ret = vmw_fallback_wait(dev_priv, false, true, 0, false, 3*HZ); + if (unlikely(ret != 0)) + DRM_ERROR("Failed to sync with HW"); + + stdu->defined = false; + + return ret; +} + + + +/** + * vmw_stdu_crtc_set_config - Sets a mode + * + * @set: mode parameters + * + * This function is the device-specific portion of the DRM CRTC mode set. + * For the SVGA device, we do this by defining a Screen Target, binding a + * GB Surface to that target, and finally updating the screen target. + * + * RETURNS: + * 0 on success, error code otherwise + */ +static int vmw_stdu_crtc_set_config(struct drm_mode_set *set) +{ + struct vmw_private *dev_priv; + struct vmw_screen_target_display_unit *stdu; + struct vmw_framebuffer *vfb; + struct vmw_framebuffer_surface *new_vfbs; + struct drm_display_mode *mode; + struct drm_framebuffer *new_fb; + struct drm_crtc *crtc; + struct drm_encoder *encoder; + struct drm_connector *connector; + struct drm_clip_rect update_area = {0}; + int ret; + + + if (!set || !set->crtc) + return -EINVAL; + + crtc = set->crtc; + crtc->x = set->x; + crtc->y = set->y; + stdu = vmw_crtc_to_stdu(crtc); + mode = set->mode; + new_fb = set->fb; + dev_priv = vmw_priv(crtc->dev); + + + if (set->num_connectors > 1) { + DRM_ERROR("Too many connectors\n"); + return -EINVAL; + } + + if (set->num_connectors == 1 && + set->connectors[0] != &stdu->base.connector) { + DRM_ERROR("Connectors don't match %p %p\n", + set->connectors[0], &stdu->base.connector); + return -EINVAL; + } + + + /* Since they always map one to one these are safe */ + connector = &stdu->base.connector; + encoder = &stdu->base.encoder; + + + /* + * After this point the CRTC will be considered off unless a new fb + * is bound + */ + if (stdu->defined) { + /* Unbind current surface by binding an invalid one */ + ret = vmw_stdu_bind_st(dev_priv, stdu, NULL); + if (unlikely(ret != 0)) + return ret; + + /* Update Screen Target, display will now be blank */ + if (crtc->primary->fb) { + update_area.x2 = crtc->primary->fb->width; + update_area.y2 = crtc->primary->fb->height; + + ret = vmw_stdu_update_st(dev_priv, NULL, + stdu, + &update_area); + if (unlikely(ret != 0)) + return ret; + } + + crtc->primary->fb = NULL; + crtc->enabled = false; + encoder->crtc = NULL; + connector->encoder = NULL; + + vmw_stdu_unpin_display(stdu); + stdu->content_fb = NULL; + stdu->content_fb_type = SAME_AS_DISPLAY; + + ret = vmw_stdu_destroy_st(dev_priv, stdu); + /* The hardware is hung, give up */ + if (unlikely(ret != 0)) + return ret; + } + + + /* Any of these conditions means the caller wants CRTC off */ + if (set->num_connectors == 0 || !mode || !new_fb) + return 0; + + + if (set->x + mode->hdisplay > new_fb->width || + set->y + mode->vdisplay > new_fb->height) { + DRM_ERROR("Set outside of framebuffer\n"); + return -EINVAL; + } + + stdu->content_fb = new_fb; + vfb = vmw_framebuffer_to_vfb(stdu->content_fb); + + if (vfb->dmabuf) + stdu->content_fb_type = SEPARATE_DMA; + + /* + * If the requested mode is different than the width and height + * of the FB or if the content buffer is a DMA buf, then allocate + * a
display FB that matches the dimension of the mode + */ + if (mode->hdisplay != new_fb->width || + mode->vdisplay != new_fb->height || + stdu->content_fb_type != SAME_AS_DISPLAY) { + struct vmw_surface content_srf; + struct drm_vmw_size display_base_size = {0}; + struct vmw_surface *display_srf; + + + display_base_size.width = mode->hdisplay; + display_base_size.height = mode->vdisplay; + display_base_size.depth = 1; + + /* + * If content buffer is a DMA buf, then we have to construct + * surface info + */ + if (stdu->content_fb_type == SEPARATE_DMA) { + + switch (new_fb->bits_per_pixel) { + case 32: + content_srf.format = SVGA3D_X8R8G8B8; + break; + + case 16: + content_srf.format = SVGA3D_R5G6B5; + break; + + case 8: + content_srf.format = SVGA3D_P8; + break; + + default: + DRM_ERROR("Invalid format\n"); + ret = -EINVAL; + goto err_unref_content; + } + + content_srf.flags = 0; + content_srf.mip_levels[0] = 1; + content_srf.multisample_count = 0; + } else { + + stdu->content_fb_type = SEPARATE_SURFACE; + + new_vfbs = vmw_framebuffer_to_vfbs(new_fb); + content_srf = *new_vfbs->surface; + } + + + ret = vmw_surface_gb_priv_define(crtc->dev, + 0, /* because kernel visible only */ + content_srf.flags, + content_srf.format, + true, /* a scanout buffer */ + content_srf.mip_levels[0], + content_srf.multisample_count, + display_base_size, + &display_srf); + if (unlikely(ret != 0)) { + DRM_ERROR("Cannot allocate a display FB.\n"); + goto err_unref_content; + } + + stdu->display_srf = display_srf; + } else { + new_vfbs = vmw_framebuffer_to_vfbs(new_fb); + stdu->display_srf = new_vfbs->surface; + } + + + ret = vmw_stdu_pin_display(stdu); + if (unlikely(ret != 0)) { + stdu->display_srf = NULL; + goto err_unref_content; + } + + vmw_fb_off(dev_priv); + vmw_svga_enable(dev_priv); + + /* + * Steps to displaying a surface, assume surface is already + * bound: + * 1. define a screen target + * 2. bind a fb to the screen target + * 3. update that screen target (this is done later by + * vmw_kms_stdu_do_surface_dirty_or_present) + */ + ret = vmw_stdu_define_st(dev_priv, stdu); + if (unlikely(ret != 0)) + goto err_unpin_display_and_content; + + ret = vmw_stdu_bind_st(dev_priv, stdu, &stdu->display_srf->res); + if (unlikely(ret != 0)) + goto err_unpin_destroy_st; + + + connector->encoder = encoder; + encoder->crtc = crtc; + + crtc->mode = *mode; + crtc->primary->fb = new_fb; + crtc->enabled = true; + + return ret; + +err_unpin_destroy_st: + vmw_stdu_destroy_st(dev_priv, stdu); +err_unpin_display_and_content: + vmw_stdu_unpin_display(stdu); +err_unref_content: + stdu->content_fb = NULL; + return ret; +} + + + +/** + * vmw_stdu_crtc_page_flip - Binds a buffer to a screen target + * + * @crtc: CRTC to attach FB to + * @fb: FB to attach + * @event: Event to be posted. This event should've been alloced + * using k[mz]alloc, and should've been completely initialized. + * @page_flip_flags: Input flags. + * + * If the STDU uses the same display and content buffers, i.e. a true flip, + * this function will replace the existing display buffer with the new content + * buffer. + * + * If the STDU uses different display and content buffers, i.e. a blit, then + * only the content buffer will be updated. 
+ * + * RETURNS: + * 0 on success, error code on failure + */ +static int vmw_stdu_crtc_page_flip(struct drm_crtc *crtc, + struct drm_framebuffer *new_fb, + struct drm_pending_vblank_event *event, + uint32_t flags) + +{ + struct vmw_private *dev_priv = vmw_priv(crtc->dev); + struct vmw_screen_target_display_unit *stdu; + struct drm_file *file_priv; + struct drm_clip_rect update_area = {0}; + int ret; + + /* + * Temporarily don't support event == NULL. We need the + * @file_priv pointer! + */ + if (event == NULL) + return -EINVAL; + + if (crtc == NULL) + return -EINVAL; + + dev_priv = vmw_priv(crtc->dev); + stdu = vmw_crtc_to_stdu(crtc); + crtc->primary->fb = new_fb; + stdu->content_fb = new_fb; + + if (stdu->display_srf) { + update_area.x2 = stdu->display_srf->base_size.width; + update_area.y2 = stdu->display_srf->base_size.height; + + /* + * If the display surface is the same as the content surface + * then remove the reference + */ + if (stdu->content_fb_type == SAME_AS_DISPLAY) { + if (stdu->defined) { + /* Unbind the current surface */ + ret = vmw_stdu_bind_st(dev_priv, stdu, NULL); + if (unlikely(ret != 0)) + goto err_out; + } + vmw_stdu_unpin_display(stdu); + stdu->display_srf = NULL; + } + } + + + if (!new_fb) { + /* Blanks the display */ + (void) vmw_stdu_update_st(dev_priv, NULL, stdu, &update_area); + + return 0; + } + + + if (stdu->content_fb_type == SAME_AS_DISPLAY) { + stdu->display_srf = vmw_framebuffer_to_vfbs(new_fb)->surface; + ret = vmw_stdu_pin_display(stdu); + if (ret) { + stdu->display_srf = NULL; + goto err_out; + } + + /* Bind display surface */ + ret = vmw_stdu_bind_st(dev_priv, stdu, &stdu->display_srf->res); + if (unlikely(ret != 0)) + goto err_unpin_display_and_content; + } + + /* Update display surface: after this point everything is bound */ + update_area.x2 = stdu->display_srf->base_size.width; + update_area.y2 = stdu->display_srf->base_size.height; + + file_priv = event->base.file_priv; + ret = vmw_stdu_update_st(dev_priv, file_priv, stdu, &update_area); + if (unlikely(ret != 0)) + return ret; + + if (event) { + struct vmw_fence_obj *fence = NULL; + + vmw_execbuf_fence_commands(NULL, dev_priv, &fence, NULL); + if (!fence) + return -ENOMEM; + + ret = vmw_event_fence_action_queue(file_priv, fence, + &event->base, + &event->event.tv_sec, + &event->event.tv_usec, + true); + vmw_fence_obj_unreference(&fence); + } + + return ret; + +err_unpin_display_and_content: + vmw_stdu_unpin_display(stdu); +err_out: + crtc->primary->fb = NULL; + stdu->content_fb = NULL; + return ret; +} + + + +/* + * Screen Target CRTC dispatch table + */ +static struct drm_crtc_funcs vmw_stdu_crtc_funcs = { + .save = vmw_du_crtc_save, + .restore = vmw_du_crtc_restore, + .cursor_set = vmw_du_crtc_cursor_set, + .cursor_move = vmw_du_crtc_cursor_move, + .gamma_set = vmw_du_crtc_gamma_set, + .destroy = vmw_stdu_crtc_destroy, + .set_config = vmw_stdu_crtc_set_config, + .page_flip = vmw_stdu_crtc_page_flip, +}; + + + +/****************************************************************************** + * Screen Target Display Unit Encoder Functions + *****************************************************************************/ + +/** + * vmw_stdu_encoder_destroy - cleans up the STDU + * + * @encoder: used to get the containing STDU + * + * vmwgfx cleans up crtc/encoder/connector all at the same time so technically + * this can be a no-op. Nevertheless, it doesn't hurt to have this in case + * the common KMS code changes and somehow vmw_stdu_crtc_destroy() doesn't + * get called.
+ */ +static void vmw_stdu_encoder_destroy(struct drm_encoder *encoder) +{ + vmw_stdu_destroy(vmw_encoder_to_stdu(encoder)); +} + +static struct drm_encoder_funcs vmw_stdu_encoder_funcs = { + .destroy = vmw_stdu_encoder_destroy, +}; + + + +/****************************************************************************** + * Screen Target Display Unit Connector Functions + *****************************************************************************/ + +/** + * vmw_stdu_connector_destroy - cleans up the STDU + * + * @connector: used to get the containing STDU + * + * vmwgfx cleans up crtc/encoder/connector all at the same time so technically + * this can be a no-op. Nevertheless, it doesn't hurt to have this in case + * the common KMS code changes and somehow vmw_stdu_crtc_destroy() doesn't + * get called. + */ +static void vmw_stdu_connector_destroy(struct drm_connector *connector) +{ + vmw_stdu_destroy(vmw_connector_to_stdu(connector)); +} + + + +static struct drm_connector_funcs vmw_stdu_connector_funcs = { + .dpms = vmw_du_connector_dpms, + .save = vmw_du_connector_save, + .restore = vmw_du_connector_restore, + .detect = vmw_du_connector_detect, + .fill_modes = vmw_du_connector_fill_modes, + .set_property = vmw_du_connector_set_property, + .destroy = vmw_stdu_connector_destroy, +}; + + + +/** + * vmw_stdu_init - Sets up a Screen Target Display Unit + * + * @dev_priv: VMW DRM device + * @unit: unit number in the range 0 to VMWGFX_NUM_DISPLAY_UNITS + * + * This function is called once per CRTC, and allocates one Screen Target + * display unit to represent that CRTC. Since the SVGA device does not separate + * out encoder and connector, they are represented as part of the STDU as well. + */ +static int vmw_stdu_init(struct vmw_private *dev_priv, unsigned unit) +{ + struct vmw_screen_target_display_unit *stdu; + struct drm_device *dev = dev_priv->dev; + struct drm_connector *connector; + struct drm_encoder *encoder; + struct drm_crtc *crtc; + + + stdu = kzalloc(sizeof(*stdu), GFP_KERNEL); + if (!stdu) + return -ENOMEM; + + stdu->base.unit = unit; + crtc = &stdu->base.crtc; + encoder = &stdu->base.encoder; + connector = &stdu->base.connector; + + stdu->base.pref_active = (unit == 0); + stdu->base.pref_width = dev_priv->initial_width; + stdu->base.pref_height = dev_priv->initial_height; + stdu->base.pref_mode = NULL; + stdu->base.is_implicit = true; + + drm_connector_init(dev, connector, &vmw_stdu_connector_funcs, + DRM_MODE_CONNECTOR_VIRTUAL); + connector->status = vmw_du_connector_detect(connector, false); + + drm_encoder_init(dev, encoder, &vmw_stdu_encoder_funcs, + DRM_MODE_ENCODER_VIRTUAL); + drm_mode_connector_attach_encoder(connector, encoder); + encoder->possible_crtcs = (1 << unit); + encoder->possible_clones = 0; + + (void) drm_connector_register(connector); + + drm_crtc_init(dev, crtc, &vmw_stdu_crtc_funcs); + + drm_mode_crtc_set_gamma_size(crtc, 256); + + drm_object_attach_property(&connector->base, + dev->mode_config.dirty_info_property, + 1); + + return 0; +} + + + +/** + * vmw_stdu_destroy - Cleans up a vmw_screen_target_display_unit + * + * @stdu: Screen Target Display Unit to be destroyed + * + * Clean up after vmw_stdu_init + */ +static void vmw_stdu_destroy(struct vmw_screen_target_display_unit *stdu) +{ + vmw_stdu_unpin_display(stdu); + + vmw_du_cleanup(&stdu->base); + kfree(stdu); +} + + + +/****************************************************************************** + * Screen Target Display KMS Functions + * + * These functions are called by the common KMS code in
vmwgfx_kms.c + *****************************************************************************/ + +/** + * vmw_kms_stdu_init_display - Initializes a Screen Target based display + * + * @dev_priv: VMW DRM device + * + * This function initializes a Screen Target based display device. It checks + * the capability bits to make sure the underlying hardware can support + * screen targets, and then creates the maximum number of CRTCs, a.k.a. Display + * Units, as supported by the display hardware. + * + * RETURNS: + * 0 on success, error code otherwise + */ +int vmw_kms_stdu_init_display(struct vmw_private *dev_priv) +{ + struct drm_device *dev = dev_priv->dev; + int i, ret; + + + /* Do nothing if Screen Target support is turned off */ + if (!VMWGFX_ENABLE_SCREEN_TARGET_OTABLE) + return -ENOSYS; + + if (!(dev_priv->capabilities & SVGA_CAP_GBOBJECTS) || + !(dev_priv->capabilities & SVGA_CAP_3D)) + return -ENOSYS; + + ret = drm_vblank_init(dev, VMWGFX_NUM_DISPLAY_UNITS); + if (unlikely(ret != 0)) + return ret; + + ret = drm_mode_create_dirty_info_property(dev); + if (unlikely(ret != 0)) + goto err_vblank_cleanup; + + for (i = 0; i < VMWGFX_NUM_DISPLAY_UNITS; ++i) { + ret = vmw_stdu_init(dev_priv, i); + + if (unlikely(ret != 0)) { + DRM_ERROR("Failed to initialize STDU %d", i); + goto err_vblank_cleanup; + } + } + + dev_priv->active_display_unit = vmw_du_screen_target; + + DRM_INFO("Screen Target Display device initialized\n"); + + return 0; + +err_vblank_cleanup: + drm_vblank_cleanup(dev); + return ret; +} + + + +/** + * vmw_kms_stdu_close_display - Cleans up after vmw_kms_stdu_init_display + * + * @dev_priv: VMW DRM device + * + * Frees up any resources allocated by vmw_kms_stdu_init_display + * + * RETURNS: + * 0 on success + */ +int vmw_kms_stdu_close_display(struct vmw_private *dev_priv) +{ + struct drm_device *dev = dev_priv->dev; + + drm_vblank_cleanup(dev); + + return 0; +} + + + +/** + * vmw_kms_stdu_do_surface_dirty - updates dirty rectangles on the SVGA device + * + * @dev_priv: VMW DRM device + * @file_priv: Pointer to a drm file private structure + * @framebuffer: FB with the new content to be copied to the SVGA device + * @clip_rects: array of dirty rectangles + * @num_of_clip_rects: number of rectangles in @clip_rects + * @increment: increment to the next dirty rect in @clip_rects + * + * This function sends an Update command to the SVGA device. This will notify + * the device that a region needs to be copied to the screen. At this time + * we are not coalescing clip rects into one large clip rect because the SVGA + * device will do it for us.
+ * + * RETURNS: + * 0 on success, error code otherwise + */ +int vmw_kms_stdu_do_surface_dirty(struct vmw_private *dev_priv, + struct drm_file *file_priv, + struct vmw_framebuffer *framebuffer, + struct drm_clip_rect *clip_rects, + unsigned num_of_clip_rects, int increment) +{ + struct vmw_screen_target_display_unit *stdu[VMWGFX_NUM_DISPLAY_UNITS]; + struct drm_clip_rect *cur_rect; + struct drm_crtc *crtc; + + unsigned num_of_du = 0, cur_du, count = 0; + int ret = 0; + + + BUG_ON(!clip_rects || !num_of_clip_rects); + + /* Figure out all the DU affected by this surface */ + list_for_each_entry(crtc, &dev_priv->dev->mode_config.crtc_list, + head) { + if (crtc->primary->fb != &framebuffer->base) + continue; + + stdu[num_of_du++] = vmw_crtc_to_stdu(crtc); + } + + for (cur_du = 0; cur_du < num_of_du; cur_du++) + for (cur_rect = clip_rects, count = 0; + count < num_of_clip_rects && ret == 0; + cur_rect += increment, count++) { + ret = vmw_stdu_update_st(dev_priv, file_priv, + stdu[cur_du], + cur_rect); + } + + return ret; +} + + + +/** + * vmw_kms_stdu_present - present a surface to the display surface + * + * @dev_priv: VMW DRM device + * @file_priv: Pointer to a drm file private structure + * @vfb: Used to pick which STDU(s) is affected + * @user_handle: user handle for the source surface + * @dest_x: top/left corner of the display area to blit to + * @dest_y: top/left corner of the display area to blit to + * @clip_rects: array of dirty rectangles + * @num_of_clip_rects: number of rectangles in @clip_rects + * + * This function copies a surface onto the display surface, and + * updates the screen target. Stretch blit is currently not + * supported. + * + * RETURNS: + * 0 on success, error code otherwise + */ +int vmw_kms_stdu_present(struct vmw_private *dev_priv, + struct drm_file *file_priv, + struct vmw_framebuffer *vfb, + uint32_t user_handle, + int32_t dest_x, int32_t dest_y, + struct drm_vmw_rect *clip_rects, + uint32_t num_of_clip_rects) +{ + struct vmw_screen_target_display_unit *stdu[VMWGFX_NUM_DISPLAY_UNITS]; + struct drm_clip_rect *update_area; + struct drm_crtc *crtc; + size_t fifo_size; + int num_of_du = 0, cur_du, i; + int ret = 0; + struct vmw_clip_rect src_bb; + + struct { + SVGA3dCmdHeader header; + SVGA3dCmdSurfaceCopy body; + } *cmd; + SVGA3dCopyBox *blits; + + + BUG_ON(!clip_rects || !num_of_clip_rects); + + list_for_each_entry(crtc, &dev_priv->dev->mode_config.crtc_list, head) { + if (crtc->primary->fb != &vfb->base) + continue; + + stdu[num_of_du++] = vmw_crtc_to_stdu(crtc); + } + + + update_area = kcalloc(num_of_clip_rects, sizeof(*update_area), + GFP_KERNEL); + if (unlikely(update_area == NULL)) { + DRM_ERROR("Temporary clip rect memory alloc failed.\n"); + return -ENOMEM; + } + + + fifo_size = sizeof(*cmd) + sizeof(SVGA3dCopyBox) * num_of_clip_rects; + + cmd = kmalloc(fifo_size, GFP_KERNEL); + if (unlikely(cmd == NULL)) { + DRM_ERROR("Failed to allocate memory for surface copy.\n"); + ret = -ENOMEM; + goto out_free_update_area; + } + + memset(cmd, 0, fifo_size); + cmd->header.id = SVGA_3D_CMD_SURFACE_COPY; + + blits = (SVGA3dCopyBox *)&cmd[1]; + + + /* Figure out the source bounding box */ + src_bb.x1 = clip_rects->x; + src_bb.y1 = clip_rects->y; + src_bb.x2 = clip_rects->x + clip_rects->w; + src_bb.y2 = clip_rects->y + clip_rects->h; + + for (i = 1; i < num_of_clip_rects; i++) { + src_bb.x1 = min_t(int, src_bb.x1, clip_rects[i].x); + src_bb.x2 = max_t(int, src_bb.x2, + clip_rects[i].x + (int) clip_rects[i].w); + src_bb.y1 = min_t(int, src_bb.y1, clip_rects[i].y); + src_bb.y2
= max_t(int, src_bb.y2, + clip_rects[i].y + (int) clip_rects[i].h); + } + + for (i = 0; i < num_of_clip_rects; i++) { + update_area[i].x1 = clip_rects[i].x - src_bb.x1; + update_area[i].x2 = update_area[i].x1 + clip_rects[i].w; + update_area[i].y1 = clip_rects[i].y - src_bb.y1; + update_area[i].y2 = update_area[i].y1 + clip_rects[i].h; + } + + + for (cur_du = 0; cur_du < num_of_du; cur_du++) { + struct vmw_clip_rect dest_bb; + int num_of_blits; + + crtc = &stdu[cur_du]->base.crtc; + + dest_bb.x1 = src_bb.x1 + dest_x - crtc->x; + dest_bb.y1 = src_bb.y1 + dest_y - crtc->y; + dest_bb.x2 = src_bb.x2 + dest_x - crtc->x; + dest_bb.y2 = src_bb.y2 + dest_y - crtc->y; + + /* Skip any STDU outside of the destination bounding box */ + if (dest_bb.x1 >= crtc->mode.hdisplay || + dest_bb.y1 >= crtc->mode.vdisplay || + dest_bb.x2 <= 0 || dest_bb.y2 <= 0) + continue; + + /* Normalize to top-left of src bounding box in dest coord */ + dest_bb.x2 = crtc->mode.hdisplay - dest_bb.x1; + dest_bb.y2 = crtc->mode.vdisplay - dest_bb.y1; + dest_bb.x1 = 0 - dest_bb.x1; + dest_bb.y1 = 0 - dest_bb.y1; + + for (i = 0, num_of_blits = 0; i < num_of_clip_rects; i++) { + int x1 = max_t(int, dest_bb.x1, (int)update_area[i].x1); + int y1 = max_t(int, dest_bb.y1, (int)update_area[i].y1); + int x2 = min_t(int, dest_bb.x2, (int)update_area[i].x2); + int y2 = min_t(int, dest_bb.y2, (int)update_area[i].y2); + + if (x1 >= x2) + continue; + + if (y1 >= y2) + continue; + + blits[num_of_blits].srcx = src_bb.x1 + x1; + blits[num_of_blits].srcy = src_bb.y1 + y1; + blits[num_of_blits].x = -dest_bb.x1 + x1; + blits[num_of_blits].y = -dest_bb.y1 + y1; + blits[num_of_blits].d = 1; + blits[num_of_blits].w = x2 - x1; + blits[num_of_blits].h = y2 - y1; + num_of_blits++; + } + + if (num_of_blits == 0) + continue; + + /* Calculate new command size */ + fifo_size = sizeof(*cmd) + sizeof(SVGA3dCopyBox) * num_of_blits; + + cmd->header.size = cpu_to_le32(fifo_size - sizeof(cmd->header)); + + cmd->body.src.sid = user_handle; + cmd->body.dest.sid = stdu[cur_du]->display_srf->res.id; + + ret = vmw_execbuf_process(file_priv, dev_priv, NULL, cmd, + fifo_size, 0, VMW_QUIRK_SCREENTARGET, + NULL, NULL); + + if (unlikely(ret != 0)) + break; + + for (i = 0; i < num_of_blits; i++) { + struct drm_clip_rect blit_area; + + /* + * Add crtc offset because vmw_stdu_update_st expects + * desktop coordinates + */ + blit_area.x1 = blits[i].x + crtc->x; + blit_area.x2 = blit_area.x1 + blits[i].w; + blit_area.y1 = blits[i].y + crtc->y; + blit_area.y2 = blit_area.y1 + blits[i].h; + (void) vmw_stdu_update_st(dev_priv, NULL, stdu[cur_du], + &blit_area); + } + } + + kfree(cmd); + +out_free_update_area: + kfree(update_area); + + return ret; +} diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c index fb54ccd4e87d..835f3431574f 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c @@ -1486,6 +1486,10 @@ int vmw_surface_gb_priv_define(struct drm_device *dev, srf->mip_levels[0], srf->flags & SVGA3D_SURFACE_CUBEMAP); + if (dev_priv->active_display_unit == vmw_du_screen_target && + for_scanout) + srf->flags |= SVGA3D_SURFACE_SCREENTARGET; + /* * From this point, the generic resource management functions * destroy the object on failure. 
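As a usage sketch, here is how userspace might consume the new getparam defined in the uapi header below (illustrative only, not part of the patch; it assumes libdrm's drmCommandWriteRead() wrapper and the existing struct drm_vmw_getparam_arg layout):

#include <xf86drm.h>
#include "vmwgfx_drm.h"

/*
 * Returns non-zero when the kernel KMS layer is driving Screen Targets.
 * An older kernel rejects the unknown parameter with -EINVAL (the
 * "Illegal vmwgfx get param request" path in vmw_getparam_ioctl shown
 * earlier), so an ioctl failure is treated as "not supported".
 */
static int vmw_has_screen_targets(int fd)
{
	struct drm_vmw_getparam_arg arg = {
		.param = DRM_VMW_PARAM_SCREEN_TARGET,
	};

	if (drmCommandWriteRead(fd, DRM_VMW_GET_PARAM, &arg, sizeof(arg)) != 0)
		return 0;

	return arg.value != 0;
}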
diff --git a/include/uapi/drm/vmwgfx_drm.h b/include/uapi/drm/vmwgfx_drm.h index c472bedbe38e..c8a863180174 100644 --- a/include/uapi/drm/vmwgfx_drm.h +++ b/include/uapi/drm/vmwgfx_drm.h @@ -88,6 +88,7 @@ #define DRM_VMW_PARAM_3D_CAPS_SIZE 8 #define DRM_VMW_PARAM_MAX_MOB_MEMORY 9 #define DRM_VMW_PARAM_MAX_MOB_SIZE 10 +#define DRM_VMW_PARAM_SCREEN_TARGET 11 /** * enum drm_vmw_handle_type - handle type for ref ioctls -- cgit v1.2.3 From 34d99af52ad40bd498ba66970579a5bc1fb1a3bc Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 5 Aug 2015 16:29:37 -0400 Subject: audit: implement audit by executable This adds the ability to audit the actions of a not-yet-running process. This patch implements the ability to filter on the executable path. Instead of just hard coding the ino and dev of the executable we care about at the moment the rule is inserted into the kernel, use the new audit_fsnotify infrastructure to manage this dynamically. This means that if the filename does not yet exist but the containing directory does, or if the inode in question is unlinked and creat'd (aka updated), the rule will just continue to work. If the containing directory is moved or deleted or the filesystem is unmounted, the rule is deleted automatically. A future enhancement would be to have the rule survive across directory disruptions. This is a heavily modified version of a patch originally submitted by Eric Paris with some ideas from Peter Moody. Cc: Peter Moody Cc: Eric Paris Signed-off-by: Richard Guy Briggs [PM: minor whitespace clean to satisfy ./scripts/checkpatch] Signed-off-by: Paul Moore --- include/linux/audit.h | 1 + include/uapi/linux/audit.h | 5 ++++- kernel/audit.h | 4 ++++ kernel/audit_tree.c | 2 ++ kernel/audit_watch.c | 31 +++++++++++++++++++++++++++ kernel/auditfilter.c | 53 +++++++++++++++++++++++++++++++++++++++++++++- kernel/auditsc.c | 3 +++ 7 files changed, 97 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/audit.h b/include/linux/audit.h index 759feb0e9d13..b2abc996c25d 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -62,6 +62,7 @@ struct audit_krule { struct audit_field *inode_f; /* quick access to an inode field */ struct audit_watch *watch; /* associated watch */ struct audit_tree *tree; /* associated watched tree */ + struct audit_fsnotify_mark *exe; struct list_head rlist; /* entry in audit_{watch,tree}.rules list */ struct list_head list; /* for AUDIT_LIST* purposes only */ u64 prio; diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index d3475e1f15ec..f6ff62c24aba 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -266,6 +266,7 @@ #define AUDIT_OBJ_UID 109 #define AUDIT_OBJ_GID 110 #define AUDIT_FIELD_COMPARE 111 +#define AUDIT_EXE 112 #define AUDIT_ARG0 200 #define AUDIT_ARG1 (AUDIT_ARG0+1) @@ -324,8 +325,10 @@ enum { #define AUDIT_FEATURE_BITMAP_BACKLOG_LIMIT 0x00000001 #define AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME 0x00000002 +#define AUDIT_FEATURE_BITMAP_EXECUTABLE_PATH 0x00000004 #define AUDIT_FEATURE_BITMAP_ALL (AUDIT_FEATURE_BITMAP_BACKLOG_LIMIT | \ - AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME) + AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME | \ + AUDIT_FEATURE_BITMAP_EXECUTABLE_PATH) /* deprecated: AUDIT_VERSION_* */ #define AUDIT_VERSION_LATEST AUDIT_FEATURE_BITMAP_ALL diff --git a/kernel/audit.h b/kernel/audit.h index 7102d538737b..24ec86145667 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -274,6 +274,8 @@ extern char *audit_mark_path(struct audit_fsnotify_mark *mark); extern
void audit_remove_mark(struct audit_fsnotify_mark *audit_mark); extern void audit_remove_mark_rule(struct audit_krule *krule); extern int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev); +extern int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old); +extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark); #else #define audit_put_watch(w) {} @@ -289,6 +291,8 @@ extern int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long in #define audit_remove_mark(m) #define audit_remove_mark_rule(k) #define audit_mark_compare(m, i, d) 0 +#define audit_exe_compare(t, m) (-EINVAL) +#define audit_dupe_exe(n, o) (-EINVAL) #endif /* CONFIG_AUDIT_WATCH */ #ifdef CONFIG_AUDIT_TREE diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 2e0c97427b33..f41722506808 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -478,6 +478,8 @@ static void kill_rules(struct audit_tree *tree) if (rule->tree) { /* not a half-baked one */ audit_tree_log_remove_rule(rule); + if (entry->rule.exe) + audit_remove_mark(entry->rule.exe); rule->tree = NULL; list_del_rcu(&entry->list); list_del(&entry->rule.list); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 645c6884cee5..27ef8dcf7cd8 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -312,6 +312,8 @@ static void audit_update_watch(struct audit_parent *parent, list_replace(&oentry->rule.list, &nentry->rule.list); } + if (oentry->rule.exe) + audit_remove_mark(oentry->rule.exe); audit_watch_log_rule_change(r, owatch, "updated_rules"); @@ -342,6 +344,8 @@ static void audit_remove_parent_watches(struct audit_parent *parent) list_for_each_entry_safe(r, nextr, &w->rules, rlist) { e = container_of(r, struct audit_entry, rule); audit_watch_log_rule_change(r, w, "remove_rule"); + if (e->rule.exe) + audit_remove_mark(e->rule.exe); list_del(&r->rlist); list_del(&r->list); list_del_rcu(&e->list); @@ -514,3 +518,30 @@ static int __init audit_watch_init(void) return 0; } device_initcall(audit_watch_init); + +int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old) +{ + struct audit_fsnotify_mark *audit_mark; + char *pathname; + + pathname = kstrdup(audit_mark_path(old->exe), GFP_KERNEL); + if (!pathname) + return -ENOMEM; + + audit_mark = audit_alloc_mark(new, pathname, strlen(pathname)); + if (IS_ERR(audit_mark)) { + kfree(pathname); + return PTR_ERR(audit_mark); + } + new->exe = audit_mark; + + return 0; +} + +int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark) +{ + unsigned long ino = tsk->mm->exe_file->f_inode->i_ino; + dev_t dev = tsk->mm->exe_file->f_inode->i_sb->s_dev; + + return audit_mark_compare(mark, ino, dev); +} diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index b4d8c366ec30..7714d93edb85 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -405,6 +405,12 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) if (f->val > AUDIT_MAX_FIELD_COMPARE) return -EINVAL; break; + case AUDIT_EXE: + if (f->op != Audit_equal) + return -EINVAL; + if (entry->rule.listnr != AUDIT_FILTER_EXIT) + return -EINVAL; + break; }; return 0; } @@ -419,6 +425,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, size_t remain = datasz - sizeof(struct audit_rule_data); int i; char *str; + struct audit_fsnotify_mark *audit_mark; entry = audit_to_entry_common(data); if (IS_ERR(entry)) @@ -539,6 +546,24 @@ static struct audit_entry 
*audit_data_to_entry(struct audit_rule_data *data, entry->rule.buflen += f->val; entry->rule.filterkey = str; break; + case AUDIT_EXE: + if (entry->rule.exe || f->val > PATH_MAX) + goto exit_free; + str = audit_unpack_string(&bufp, &remain, f->val); + if (IS_ERR(str)) { + err = PTR_ERR(str); + goto exit_free; + } + entry->rule.buflen += f->val; + + audit_mark = audit_alloc_mark(&entry->rule, str, f->val); + if (IS_ERR(audit_mark)) { + kfree(str); + err = PTR_ERR(audit_mark); + goto exit_free; + } + entry->rule.exe = audit_mark; + break; } } @@ -551,6 +576,8 @@ exit_nofree: exit_free: if (entry->rule.tree) audit_put_tree(entry->rule.tree); /* that's the temporary one */ + if (entry->rule.exe) + audit_remove_mark(entry->rule.exe); /* that's the template one */ audit_free_rule(entry); return ERR_PTR(err); } @@ -615,6 +642,10 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) data->buflen += data->values[i] = audit_pack_string(&bufp, krule->filterkey); break; + case AUDIT_EXE: + data->buflen += data->values[i] = + audit_pack_string(&bufp, audit_mark_path(krule->exe)); + break; case AUDIT_LOGINUID_SET: if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) { data->fields[i] = AUDIT_LOGINUID; @@ -678,6 +709,12 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) if (strcmp(a->filterkey, b->filterkey)) return 1; break; + case AUDIT_EXE: + /* both paths exist based on above type compare */ + if (strcmp(audit_mark_path(a->exe), + audit_mark_path(b->exe))) + return 1; + break; case AUDIT_UID: case AUDIT_EUID: case AUDIT_SUID: @@ -799,8 +836,14 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old) err = -ENOMEM; else new->filterkey = fk; + break; + case AUDIT_EXE: + err = audit_dupe_exe(new, old); + break; } if (err) { + if (new->exe) + audit_remove_mark(new->exe); audit_free_rule(entry); return ERR_PTR(err); } @@ -963,6 +1006,9 @@ int audit_del_rule(struct audit_entry *entry) if (e->rule.tree) audit_remove_tree_rule(&e->rule); + if (e->rule.exe) + audit_remove_mark_rule(&e->rule); + #ifdef CONFIG_AUDITSYSCALL if (!dont_count) audit_n_rules--; @@ -1067,8 +1113,11 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data, WARN_ON(1); } - if (err || type == AUDIT_DEL_RULE) + if (err || type == AUDIT_DEL_RULE) { + if (entry->rule.exe) + audit_remove_mark(entry->rule.exe); audit_free_rule(entry); + } return err; } @@ -1360,6 +1409,8 @@ static int update_lsm_rule(struct audit_krule *r) return 0; nentry = audit_dupe_rule(r); + if (entry->rule.exe) + audit_remove_mark(entry->rule.exe); if (IS_ERR(nentry)) { /* save the first error encountered for the * return value */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ea3fe2b748a8..9b56b7ae053f 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -466,6 +466,9 @@ static int audit_filter_rules(struct task_struct *tsk, result = audit_comparator(ctx->ppid, f->op, f->val); } break; + case AUDIT_EXE: + result = audit_exe_compare(tsk, rule->exe); + break; case AUDIT_UID: result = audit_uid_comparator(cred->uid, f->op, f->uid); break; -- cgit v1.2.3 From d877f07112f1e5a247c6b585c971a93895c9f738 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 31 May 2015 18:04:11 +0200 Subject: netfilter: nf_tables: add nft_dup expression This new expression uses the nf_dup engine to clone packets to a given gateway. Unlike xt_TEE, we use an index to indicate output interface which should be fine at this stage. 
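Before the implementation details, one note on the duplication path: the nf_dup engines break duplication loops with the per-CPU nf_skb_duplicated flag, since the clone is re-injected through the regular output path and would otherwise hit the same rule again. A simplified sketch of that pattern in nf_dup_ipv4() (illustrative, not the literal kernel code):

	if (this_cpu_read(nf_skb_duplicated))
		return;				/* already inside a duplication */

	skb = pskb_copy(skb, GFP_ATOMIC);	/* clone the packet */
	if (skb == NULL)
		return;

	__this_cpu_write(nf_skb_duplicated, true);
	/* ... point the clone at the gateway/oif and reroute it ... */
	ip_local_out(skb);
	__this_cpu_write(nf_skb_duplicated, false);

The preemption-safe read on entry is the subject of the next paragraph.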
Moreover, change to the preemption-safe this_cpu_read(nf_skb_duplicated) from nf_dup_ipv{4,6} to silence a lockdep splat. Based on the original tee expression from Arturo Borrero Gonzalez, although this patch has diverged quite a bit from that initial effort due to the change to support maps. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_dup.h | 9 +++ include/uapi/linux/netfilter/nf_tables.h | 14 ++++ net/ipv4/netfilter/Kconfig | 6 ++ net/ipv4/netfilter/Makefile | 1 + net/ipv4/netfilter/nf_dup_ipv4.c | 2 +- net/ipv4/netfilter/nft_dup_ipv4.c | 110 +++++++++++++++++++++++++++++++ net/ipv6/netfilter/Kconfig | 6 ++ net/ipv6/netfilter/Makefile | 1 + net/ipv6/netfilter/nf_dup_ipv6.c | 2 +- net/ipv6/netfilter/nft_dup_ipv6.c | 108 ++++++++++++++++++++++++++++++ 10 files changed, 257 insertions(+), 2 deletions(-) create mode 100644 include/net/netfilter/nft_dup.h create mode 100644 net/ipv4/netfilter/nft_dup_ipv4.c create mode 100644 net/ipv6/netfilter/nft_dup_ipv6.c (limited to 'include/uapi') diff --git a/include/net/netfilter/nft_dup.h b/include/net/netfilter/nft_dup.h new file mode 100644 index 000000000000..6b84cf6491a2 --- /dev/null +++ b/include/net/netfilter/nft_dup.h @@ -0,0 +1,9 @@ +#ifndef _NFT_DUP_H_ +#define _NFT_DUP_H_ + +struct nft_dup_inet { + enum nft_registers sreg_addr:8; + enum nft_registers sreg_dev:8; +}; + +#endif /* _NFT_DUP_H_ */ diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index a99e6a997140..2ef35f2c9bda 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -935,6 +935,20 @@ enum nft_redir_attributes { }; #define NFTA_REDIR_MAX (__NFTA_REDIR_MAX - 1) +/** + * enum nft_dup_attributes - nf_tables dup expression netlink attributes + * + * @NFTA_DUP_SREG_ADDR: source register of address (NLA_U32: nft_registers) + * @NFTA_DUP_SREG_DEV: source register of output interface (NLA_U32: nft_registers) + */ +enum nft_dup_attributes { + NFTA_DUP_UNSPEC, + NFTA_DUP_SREG_ADDR, + NFTA_DUP_SREG_DEV, + __NFTA_DUP_MAX +}; +#define NFTA_DUP_MAX (__NFTA_DUP_MAX - 1) + /** * enum nft_gen_attributes - nf_tables ruleset generation attributes * diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 0142ea259d7d..690d27d3f2f9 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -58,6 +58,12 @@ config NFT_REJECT_IPV4 default NFT_REJECT tristate +config NFT_DUP_IPV4 + tristate "IPv4 nf_tables packet duplication support" + select NF_DUP_IPV4 + help + This module enables IPv4 packet duplication support for nf_tables.
+ endif # NF_TABLES_IPV4 config NF_TABLES_ARP diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 9136ffc2d474..87b073da14c9 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -41,6 +41,7 @@ obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redir_ipv4.o +obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o # generic IP tables diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c index eff85ab3f47d..b5bb37564b0e 100644 --- a/net/ipv4/netfilter/nf_dup_ipv4.c +++ b/net/ipv4/netfilter/nf_dup_ipv4.c @@ -69,7 +69,7 @@ void nf_dup_ipv4(struct sk_buff *skb, unsigned int hooknum, { struct iphdr *iph; - if (__this_cpu_read(nf_skb_duplicated)) + if (this_cpu_read(nf_skb_duplicated)) return; /* * Copy the skb, and route the copy. Will later return %XT_CONTINUE for diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c new file mode 100644 index 000000000000..25419fbddcb6 --- /dev/null +++ b/net/ipv4/netfilter/nft_dup_ipv4.c @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2015 Pablo Neira Ayuso + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +struct nft_dup_ipv4 { + enum nft_registers sreg_addr:8; + enum nft_registers sreg_dev:8; +}; + +static void nft_dup_ipv4_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_dup_ipv4 *priv = nft_expr_priv(expr); + struct in_addr gw = { + .s_addr = regs->data[priv->sreg_addr], + }; + int oif = regs->data[priv->sreg_dev]; + + nf_dup_ipv4(pkt->skb, pkt->ops->hooknum, &gw, oif); +} + +static int nft_dup_ipv4_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_dup_ipv4 *priv = nft_expr_priv(expr); + int err; + + if (tb[NFTA_DUP_SREG_ADDR] == NULL) + return -EINVAL; + + priv->sreg_addr = nft_parse_register(tb[NFTA_DUP_SREG_ADDR]); + err = nft_validate_register_load(priv->sreg_addr, sizeof(struct in_addr)); + if (err < 0) + return err; + + if (tb[NFTA_DUP_SREG_DEV] != NULL) { + priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]); + return nft_validate_register_load(priv->sreg_dev, sizeof(int)); + } + return 0; +} + +static int nft_dup_ipv4_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + struct nft_dup_ipv4 *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr) || + nft_dump_register(skb, NFTA_DUP_SREG_DEV, priv->sreg_dev)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_dup_ipv4_type; +static const struct nft_expr_ops nft_dup_ipv4_ops = { + .type = &nft_dup_ipv4_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_dup_ipv4)), + .eval = nft_dup_ipv4_eval, + .init = nft_dup_ipv4_init, + .dump = nft_dup_ipv4_dump, +}; + +static const struct nla_policy nft_dup_ipv4_policy[NFTA_DUP_MAX + 1] = { + [NFTA_DUP_SREG_ADDR] = { .type = NLA_U32 }, + [NFTA_DUP_SREG_DEV] = { .type = NLA_U32 }, +}; + +static struct nft_expr_type nft_dup_ipv4_type __read_mostly = { + .family = NFPROTO_IPV4, + .name = "dup", + .ops = &nft_dup_ipv4_ops, + .policy = 
nft_dup_ipv4_policy, + .maxattr = NFTA_DUP_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_dup_ipv4_module_init(void) +{ + return nft_register_expr(&nft_dup_ipv4_type); +} + +static void __exit nft_dup_ipv4_module_exit(void) +{ + nft_unregister_expr(&nft_dup_ipv4_type); +} + +module_init(nft_dup_ipv4_module_init); +module_exit(nft_dup_ipv4_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso "); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "dup"); diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 298daf30d857..96833e4b3193 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -47,6 +47,12 @@ config NFT_REJECT_IPV6 default NFT_REJECT tristate +config NFT_DUP_IPV6 + tristate "IPv6 nf_tables packet duplication support" + select NF_DUP_IPV6 + help + This module enables IPv6 packet duplication support for nf_tables. + endif # NF_TABLES_IPV6 endif # NF_TABLES diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index dc6c732f98ca..b4f7d0b4e2af 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -39,6 +39,7 @@ obj-$(CONFIG_NFT_CHAIN_NAT_IPV6) += nft_chain_nat_ipv6.o obj-$(CONFIG_NFT_REJECT_IPV6) += nft_reject_ipv6.o obj-$(CONFIG_NFT_MASQ_IPV6) += nft_masq_ipv6.o obj-$(CONFIG_NFT_REDIR_IPV6) += nft_redir_ipv6.o +obj-$(CONFIG_NFT_DUP_IPV6) += nft_dup_ipv6.o # matches obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c index 399fdda18447..d8ab654080b4 100644 --- a/net/ipv6/netfilter/nf_dup_ipv6.c +++ b/net/ipv6/netfilter/nf_dup_ipv6.c @@ -63,7 +63,7 @@ static bool nf_dup_ipv6_route(struct sk_buff *skb, const struct in6_addr *gw, void nf_dup_ipv6(struct sk_buff *skb, unsigned int hooknum, const struct in6_addr *gw, int oif) { - if (__this_cpu_read(nf_skb_duplicated)) + if (this_cpu_read(nf_skb_duplicated)) return; skb = pskb_copy(skb, GFP_ATOMIC); if (skb == NULL) diff --git a/net/ipv6/netfilter/nft_dup_ipv6.c b/net/ipv6/netfilter/nft_dup_ipv6.c new file mode 100644 index 000000000000..0eaa4f65fdea --- /dev/null +++ b/net/ipv6/netfilter/nft_dup_ipv6.c @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2015 Pablo Neira Ayuso + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +struct nft_dup_ipv6 { + enum nft_registers sreg_addr:8; + enum nft_registers sreg_dev:8; +}; + +static void nft_dup_ipv6_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_dup_ipv6 *priv = nft_expr_priv(expr); + struct in6_addr *gw = (struct in6_addr *)&regs->data[priv->sreg_addr]; + int oif = regs->data[priv->sreg_dev]; + + nf_dup_ipv6(pkt->skb, pkt->ops->hooknum, gw, oif); +} + +static int nft_dup_ipv6_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_dup_ipv6 *priv = nft_expr_priv(expr); + int err; + + if (tb[NFTA_DUP_SREG_ADDR] == NULL) + return -EINVAL; + + priv->sreg_addr = nft_parse_register(tb[NFTA_DUP_SREG_ADDR]); + err = nft_validate_register_load(priv->sreg_addr, sizeof(struct in6_addr)); + if (err < 0) + return err; + + if (tb[NFTA_DUP_SREG_DEV] != NULL) { + priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]); + return nft_validate_register_load(priv->sreg_dev, sizeof(int)); + } + return 0; +} + +static int nft_dup_ipv6_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + struct nft_dup_ipv6 *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr) || + nft_dump_register(skb, NFTA_DUP_SREG_DEV, priv->sreg_dev)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_dup_ipv6_type; +static const struct nft_expr_ops nft_dup_ipv6_ops = { + .type = &nft_dup_ipv6_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_dup_ipv6)), + .eval = nft_dup_ipv6_eval, + .init = nft_dup_ipv6_init, + .dump = nft_dup_ipv6_dump, +}; + +static const struct nla_policy nft_dup_ipv6_policy[NFTA_DUP_MAX + 1] = { + [NFTA_DUP_SREG_ADDR] = { .type = NLA_U32 }, + [NFTA_DUP_SREG_DEV] = { .type = NLA_U32 }, +}; + +static struct nft_expr_type nft_dup_ipv6_type __read_mostly = { + .family = NFPROTO_IPV6, + .name = "dup", + .ops = &nft_dup_ipv6_ops, + .policy = nft_dup_ipv6_policy, + .maxattr = NFTA_DUP_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_dup_ipv6_module_init(void) +{ + return nft_register_expr(&nft_dup_ipv6_type); +} + +static void __exit nft_dup_ipv6_module_exit(void) +{ + nft_unregister_expr(&nft_dup_ipv6_type); +} + +module_init(nft_dup_ipv6_module_init); +module_exit(nft_dup_ipv6_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso "); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "dup"); -- cgit v1.2.3 From 3e87baafa4f476017b2453e52a1ca7308b4e6ad5 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 2 Aug 2015 18:02:14 +0200 Subject: netfilter: nft_limit: add burst parameter This patch adds the burst parameter. This burst indicates the number of packets that can exceed the limit.
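For illustration, here is the token-bucket arithmetic that the burst folds into, as a minimal user-space sketch of nft_limit_init()/nft_limit_eval(); the struct and function names are hypothetical and the kernel's locking is omitted, so this is a model of the accounting rather than the kernel code itself:

	#include <stdbool.h>
	#include <stdint.h>

	struct limit {
		uint64_t tokens;	/* remaining budget, measured in nanoseconds */
		uint64_t tokens_max;	/* bucket depth, initialised to nsecs */
		uint64_t rate;		/* packets per unit, burst already folded in */
		uint64_t nsecs;		/* length of one refill unit, in nanoseconds */
		uint64_t last;		/* timestamp of the previous evaluation */
	};

	/* Returns true when the packet exceeds the limit (maps to NFT_BREAK). */
	static bool limit_eval(struct limit *l, uint64_t now, uint64_t cost)
	{
		uint64_t tokens = l->tokens + (now - l->last); /* elapsed time refills */

		if (tokens > l->tokens_max)
			tokens = l->tokens_max;
		l->last = now;
		if (tokens >= cost) {
			l->tokens = tokens - cost;	/* within rate: consume and pass */
			return false;
		}
		l->tokens = tokens;			/* over rate: break out of the rule */
		return true;
	}

Since each packet costs nsecs / rate, folding the burst into the rate at init time (and subtracting it again in the dump path, as the patch below does) simply makes every packet cheaper, so up to burst extra packets per unit can pass before the bucket runs dry.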
Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nft_limit.c | 20 ++++++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 2ef35f2c9bda..cafd78943c7d 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -761,11 +761,13 @@ enum nft_ct_attributes { * * @NFTA_LIMIT_RATE: refill rate (NLA_U64) * @NFTA_LIMIT_UNIT: refill unit (NLA_U64) + * @NFTA_LIMIT_BURST: burst (NLA_U32) */ enum nft_limit_attributes { NFTA_LIMIT_UNSPEC, NFTA_LIMIT_RATE, NFTA_LIMIT_UNIT, + NFTA_LIMIT_BURST, __NFTA_LIMIT_MAX }; #define NFTA_LIMIT_MAX (__NFTA_LIMIT_MAX - 1) diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index c4d1b1b75b8f..d8c5ff1bf7dd 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -25,6 +25,7 @@ struct nft_limit { u64 tokens_max; u64 rate; u64 nsecs; + u32 burst; }; static inline bool nft_limit_eval(struct nft_limit *limit, u64 cost) @@ -65,6 +66,18 @@ static int nft_limit_init(struct nft_limit *limit, if (limit->rate == 0 || limit->nsecs < unit) return -EOVERFLOW; limit->tokens = limit->tokens_max = limit->nsecs; + + if (tb[NFTA_LIMIT_BURST]) { + u64 rate; + + limit->burst = ntohl(nla_get_be32(tb[NFTA_LIMIT_BURST])); + + rate = limit->rate + limit->burst; + if (rate < limit->rate) + return -EOVERFLOW; + + limit->rate = rate; + } limit->last = ktime_get_ns(); return 0; @@ -73,9 +86,11 @@ static int nft_limit_init(struct nft_limit *limit, static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit *limit) { u64 secs = div_u64(limit->nsecs, NSEC_PER_SEC); + u64 rate = limit->rate - limit->burst; - if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(limit->rate)) || - nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(secs))) + if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(rate)) || + nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(secs)) || + nla_put_be32(skb, NFTA_LIMIT_BURST, htonl(limit->burst))) goto nla_put_failure; return 0; @@ -96,6 +111,7 @@ static void nft_limit_pkts_eval(const struct nft_expr *expr, static const struct nla_policy nft_limit_policy[NFTA_LIMIT_MAX + 1] = { [NFTA_LIMIT_RATE] = { .type = NLA_U64 }, [NFTA_LIMIT_UNIT] = { .type = NLA_U64 }, + [NFTA_LIMIT_BURST] = { .type = NLA_U32 }, }; static int nft_limit_pkts_init(const struct nft_ctx *ctx, -- cgit v1.2.3 From d2168e849ebf617b2b7feae44c0c0baf739cb610 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 5 Aug 2015 12:38:44 +0200 Subject: netfilter: nft_limit: add per-byte limiting This patch adds a new NFTA_LIMIT_TYPE netlink attribute to indicate the type of limiting. Unlike per-packet limiting, the cost is calculated in the packet path, since it depends on the packet length. The burst attribute indicates the number of bytes by which the rate can be exceeded.
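To make the difference between the two modes concrete, here is a hedged sketch of how the per-evaluation cost is derived in each case (simplified types; the per-byte line mirrors the div_u64() in nft_limit_pkt_bytes_eval() below, while the per-packet cost is a constant that can be computed once at init time):

	#include <stdint.h>

	/* Per-packet mode: every packet consumes the same slice of the unit. */
	static uint64_t cost_pkts(uint64_t nsecs, uint64_t rate)
	{
		return nsecs / rate;
	}

	/* Per-byte mode: the cost scales with the packet length, so it has to
	 * be computed in the packet path, and the rate is enforced in bytes
	 * per unit rather than packets per unit. */
	static uint64_t cost_pkt_bytes(uint64_t nsecs, uint64_t rate, uint32_t pkt_len)
	{
		return nsecs * pkt_len / rate;
	}

For example, with rate = 1048576 (1 MByte) and a one-second unit, a 1500-byte packet costs 1500/1048576 of the token budget, so roughly 700 such packets can pass per second.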
Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 7 ++++ net/netfilter/nft_limit.c | 63 ++++++++++++++++++++++++++++++-- 2 files changed, 66 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index cafd78943c7d..d8c8a7c9d88a 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -756,18 +756,25 @@ enum nft_ct_attributes { }; #define NFTA_CT_MAX (__NFTA_CT_MAX - 1) +enum nft_limit_type { + NFT_LIMIT_PKTS, + NFT_LIMIT_PKT_BYTES +}; + /** * enum nft_limit_attributes - nf_tables limit expression netlink attributes * * @NFTA_LIMIT_RATE: refill rate (NLA_U64) * @NFTA_LIMIT_UNIT: refill unit (NLA_U64) * @NFTA_LIMIT_BURST: burst (NLA_U32) + * @NFTA_LIMIT_TYPE: type of limit (NLA_U32: enum nft_limit_type) */ enum nft_limit_attributes { NFTA_LIMIT_UNSPEC, NFTA_LIMIT_RATE, NFTA_LIMIT_UNIT, NFTA_LIMIT_BURST, + NFTA_LIMIT_TYPE, __NFTA_LIMIT_MAX }; #define NFTA_LIMIT_MAX (__NFTA_LIMIT_MAX - 1) diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index b418698e26b0..5d67938f8b2f 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -83,14 +83,16 @@ static int nft_limit_init(struct nft_limit *limit, return 0; } -static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit *limit) +static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit *limit, + enum nft_limit_type type) { u64 secs = div_u64(limit->nsecs, NSEC_PER_SEC); u64 rate = limit->rate - limit->burst; if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(rate)) || nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(secs)) || - nla_put_be32(skb, NFTA_LIMIT_BURST, htonl(limit->burst))) + nla_put_be32(skb, NFTA_LIMIT_BURST, htonl(limit->burst)) || + nla_put_be32(skb, NFTA_LIMIT_TYPE, htonl(type))) goto nla_put_failure; return 0; @@ -117,6 +119,7 @@ static const struct nla_policy nft_limit_policy[NFTA_LIMIT_MAX + 1] = { [NFTA_LIMIT_RATE] = { .type = NLA_U64 }, [NFTA_LIMIT_UNIT] = { .type = NLA_U64 }, [NFTA_LIMIT_BURST] = { .type = NLA_U32 }, + [NFTA_LIMIT_TYPE] = { .type = NLA_U32 }, }; static int nft_limit_pkts_init(const struct nft_ctx *ctx, @@ -138,7 +141,7 @@ static int nft_limit_pkts_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_limit_pkts *priv = nft_expr_priv(expr); - return nft_limit_dump(skb, &priv->limit); + return nft_limit_dump(skb, &priv->limit, NFT_LIMIT_PKTS); } static struct nft_expr_type nft_limit_type; @@ -150,9 +153,61 @@ static const struct nft_expr_ops nft_limit_pkts_ops = { .dump = nft_limit_pkts_dump, }; +static void nft_limit_pkt_bytes_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_limit *priv = nft_expr_priv(expr); + u64 cost = div_u64(priv->nsecs * pkt->skb->len, priv->rate); + + if (nft_limit_eval(priv, cost)) + regs->verdict.code = NFT_BREAK; +} + +static int nft_limit_pkt_bytes_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_limit *priv = nft_expr_priv(expr); + + return nft_limit_init(priv, tb); +} + +static int nft_limit_pkt_bytes_dump(struct sk_buff *skb, + const struct nft_expr *expr) +{ + const struct nft_limit *priv = nft_expr_priv(expr); + + return nft_limit_dump(skb, priv, NFT_LIMIT_PKT_BYTES); +} + +static const struct nft_expr_ops nft_limit_pkt_bytes_ops = { + .type = &nft_limit_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_limit)), 
+ .eval = nft_limit_pkt_bytes_eval, + .init = nft_limit_pkt_bytes_init, + .dump = nft_limit_pkt_bytes_dump, +}; + +static const struct nft_expr_ops * +nft_limit_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + if (tb[NFTA_LIMIT_TYPE] == NULL) + return &nft_limit_pkts_ops; + + switch (ntohl(nla_get_be32(tb[NFTA_LIMIT_TYPE]))) { + case NFT_LIMIT_PKTS: + return &nft_limit_pkts_ops; + case NFT_LIMIT_PKT_BYTES: + return &nft_limit_pkt_bytes_ops; + } + return ERR_PTR(-EOPNOTSUPP); +} + static struct nft_expr_type nft_limit_type __read_mostly = { .name = "limit", - .ops = &nft_limit_pkts_ops, + .select_ops = nft_limit_select_ops, .policy = nft_limit_policy, .maxattr = NFTA_LIMIT_MAX, .flags = NFT_EXPR_STATEFUL, -- cgit v1.2.3 From da8b43c0e1dcea3bcac5f37ea59934ddaa137aed Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 4 Aug 2015 22:51:07 -0700 Subject: vxlan: combine VXLAN_FLOWBASED into VXLAN_COLLECT_METADATA IFLA_VXLAN_FLOWBASED is useless without IFLA_VXLAN_COLLECT_METADATA, so combine them into single IFLA_VXLAN_COLLECT_METADATA flag. 'flowbased' doesn't convey real meaning of the vxlan tunnel mode. This mode can be used by routing, tc+bpf and ovs. Only ovs is strictly flow based, so 'collect metadata' is a better name for this tunnel mode. Signed-off-by: Alexei Starovoitov Acked-by: Thomas Graf Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 17 ++++++----------- include/net/vxlan.h | 4 +--- include/uapi/linux/if_link.h | 1 - net/openvswitch/vport-vxlan.c | 2 +- 4 files changed, 8 insertions(+), 16 deletions(-) (limited to 'include/uapi') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index e90f7a484e1c..b6731fad19ba 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1141,7 +1141,7 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, union vxlan_addr *remote_ip; /* For flow based devices, map all packets to VNI 0 */ - if (vs->flags & VXLAN_F_FLOW_BASED) + if (vs->flags & VXLAN_F_COLLECT_METADATA) vni = 0; /* Is this VNI defined? 
*/ @@ -1183,7 +1183,7 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, skb_reset_network_header(skb); /* In flow-based mode, GBP is carried in dst_metadata */ - if (!(vs->flags & VXLAN_F_FLOW_BASED)) + if (!(vs->flags & VXLAN_F_COLLECT_METADATA)) skb->mark = md->gbp; if (oip6) @@ -2129,7 +2129,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) #endif } - if (vxlan->flags & VXLAN_F_FLOW_BASED && + if (vxlan->flags & VXLAN_F_COLLECT_METADATA && info && info->mode == IP_TUNNEL_INFO_TX) { vxlan_xmit_one(skb, dev, NULL, false); return NETDEV_TX_OK; @@ -2462,7 +2462,6 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_RSC] = { .type = NLA_U8 }, [IFLA_VXLAN_L2MISS] = { .type = NLA_U8 }, [IFLA_VXLAN_L3MISS] = { .type = NLA_U8 }, - [IFLA_VXLAN_FLOWBASED] = { .type = NLA_U8 }, [IFLA_VXLAN_COLLECT_METADATA] = { .type = NLA_U8 }, [IFLA_VXLAN_PORT] = { .type = NLA_U16 }, [IFLA_VXLAN_UDP_CSUM] = { .type = NLA_U8 }, @@ -2814,10 +2813,6 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev, if (data[IFLA_VXLAN_LIMIT]) conf.addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]); - if (data[IFLA_VXLAN_FLOWBASED] && - nla_get_u8(data[IFLA_VXLAN_FLOWBASED])) - conf.flags |= VXLAN_F_FLOW_BASED; - if (data[IFLA_VXLAN_COLLECT_METADATA] && nla_get_u8(data[IFLA_VXLAN_COLLECT_METADATA])) conf.flags |= VXLAN_F_COLLECT_METADATA; @@ -2903,7 +2898,7 @@ static size_t vxlan_get_size(const struct net_device *dev) nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_RSC */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L2MISS */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L3MISS */ - nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_FLOWBASED */ + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_COLLECT_METADATA */ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */ nla_total_size(sizeof(struct ifla_vxlan_port_range)) + @@ -2970,8 +2965,8 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) !!(vxlan->flags & VXLAN_F_L2MISS)) || nla_put_u8(skb, IFLA_VXLAN_L3MISS, !!(vxlan->flags & VXLAN_F_L3MISS)) || - nla_put_u8(skb, IFLA_VXLAN_FLOWBASED, - !!(vxlan->flags & VXLAN_F_FLOW_BASED)) || + nla_put_u8(skb, IFLA_VXLAN_COLLECT_METADATA, + !!(vxlan->flags & VXLAN_F_COLLECT_METADATA)) || nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->cfg.age_interval) || nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->cfg.addrmax) || nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port) || diff --git a/include/net/vxlan.h b/include/net/vxlan.h index eb8d721cdb67..e4534f1b2d8c 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -181,7 +181,6 @@ struct vxlan_dev { #define VXLAN_F_GBP 0x800 #define VXLAN_F_REMCSUM_NOPARTIAL 0x1000 #define VXLAN_F_COLLECT_METADATA 0x2000 -#define VXLAN_F_FLOW_BASED 0x4000 /* Flags that are used in the receive path. 
These flags must match in * order for a socket to be shareable @@ -190,8 +189,7 @@ struct vxlan_dev { VXLAN_F_UDP_ZERO_CSUM6_RX | \ VXLAN_F_REMCSUM_RX | \ VXLAN_F_REMCSUM_NOPARTIAL | \ - VXLAN_F_COLLECT_METADATA | \ - VXLAN_F_FLOW_BASED) + VXLAN_F_COLLECT_METADATA) struct net_device *vxlan_dev_create(struct net *net, const char *name, u8 name_assign_type, struct vxlan_config *conf); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index ea047480a1f0..f24ec99a2262 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -382,7 +382,6 @@ enum { IFLA_VXLAN_REMCSUM_RX, IFLA_VXLAN_GBP, IFLA_VXLAN_REMCSUM_NOPARTIAL, - IFLA_VXLAN_FLOWBASED, IFLA_VXLAN_COLLECT_METADATA, __IFLA_VXLAN_MAX }; diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 547173336cd3..c6e937e36f8b 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -90,7 +90,7 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms) int err; struct vxlan_config conf = { .no_share = true, - .flags = VXLAN_F_FLOW_BASED | VXLAN_F_COLLECT_METADATA, + .flags = VXLAN_F_COLLECT_METADATA, }; if (!options) { -- cgit v1.2.3 From ea317b267e9d03a8241893aa176fba7661d07579 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Thu, 6 Aug 2015 07:02:34 +0000 Subject: bpf: Add new bpf map type to store the pointer to struct perf_event Introduce a new bpf map type 'BPF_MAP_TYPE_PERF_EVENT_ARRAY'. This map only stores the pointer to struct perf_event. The user space event FDs from perf_event_open() syscall are converted to the pointer to struct perf_event and stored in map. Signed-off-by: Kaixu Xia Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/arraymap.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d495211d63d1..4fc1f4070789 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -10,6 +10,7 @@ #include #include #include +#include struct bpf_map; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2ce13c109b00..a1814e8e53a7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -114,6 +114,7 @@ enum bpf_map_type { BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PROG_ARRAY, + BPF_MAP_TYPE_PERF_EVENT_ARRAY, }; enum bpf_prog_type { diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 45df6572ecfd..29ace107f236 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -273,3 +273,60 @@ static int __init register_prog_array_map(void) return 0; } late_initcall(register_prog_array_map); + +static void perf_event_array_map_free(struct bpf_map *map) +{ + bpf_fd_array_map_clear(map); + fd_array_map_free(map); +} + +static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) +{ + struct perf_event *event; + const struct perf_event_attr *attr; + + event = perf_event_get(fd); + if (IS_ERR(event)) + return event; + + attr = perf_event_attrs(event); + if (IS_ERR(attr)) + return (void *)attr; + + if (attr->type != PERF_TYPE_RAW && + attr->type != PERF_TYPE_HARDWARE) { + perf_event_release_kernel(event); + return ERR_PTR(-EINVAL); + } + return event; +} + +static void perf_event_fd_array_put_ptr(void *ptr) +{ + struct perf_event *event = ptr; + + perf_event_release_kernel(event); +} + +static const struct bpf_map_ops perf_event_array_ops = { + .map_alloc = fd_array_map_alloc, + .map_free = 
perf_event_array_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = fd_array_map_lookup_elem, + .map_update_elem = fd_array_map_update_elem, + .map_delete_elem = fd_array_map_delete_elem, + .map_fd_get_ptr = perf_event_fd_array_get_ptr, + .map_fd_put_ptr = perf_event_fd_array_put_ptr, +}; + +static struct bpf_map_type_list perf_event_array_type __read_mostly = { + .ops = &perf_event_array_ops, + .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, +}; + +static int __init register_perf_event_array_map(void) +{ + bpf_register_map_type(&perf_event_array_type); + return 0; +} +late_initcall(register_perf_event_array_map); -- cgit v1.2.3 From 35578d7984003097af2b1e34502bc943d40c1804 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Thu, 6 Aug 2015 07:02:35 +0000 Subject: bpf: Implement function bpf_perf_event_read() that gets the selected hardware PMU counter According to the perf_event_map_fd and index, the function bpf_perf_event_read() can convert the corresponding map value to the pointer to struct perf_event and return the hardware PMU counter value. Signed-off-by: Kaixu Xia Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/verifier.c | 48 +++++++++++++++++++++++++++++++++--------------- kernel/trace/bpf_trace.c | 31 +++++++++++++++++++++++++++++++ 4 files changed, 66 insertions(+), 15 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4fc1f4070789..f57d7fed9ec3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -190,6 +190,7 @@ extern const struct bpf_func_proto bpf_map_lookup_elem_proto; extern const struct bpf_func_proto bpf_map_update_elem_proto; extern const struct bpf_func_proto bpf_map_delete_elem_proto; +extern const struct bpf_func_proto bpf_perf_event_read_proto; extern const struct bpf_func_proto bpf_get_prandom_u32_proto; extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; extern const struct bpf_func_proto bpf_tail_call_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a1814e8e53a7..92a48e2d5461 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -271,6 +271,7 @@ enum bpf_func_id { */ BPF_FUNC_skb_get_tunnel_key, BPF_FUNC_skb_set_tunnel_key, + BPF_FUNC_perf_event_read, /* u64 bpf_perf_event_read(&map, index) */ __BPF_FUNC_MAX_ID, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cd307df98cb3..48e1c7192560 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -238,6 +238,14 @@ static const char * const reg_type_str[] = { [CONST_IMM] = "imm", }; +static const struct { + int map_type; + int func_id; +} func_limit[] = { + {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call}, + {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read}, +}; + static void print_verifier_state(struct verifier_env *env) { enum bpf_reg_type t; @@ -837,6 +845,28 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return err; } +static int check_map_func_compatibility(struct bpf_map *map, int func_id) +{ + bool bool_map, bool_func; + int i; + + if (!map) + return 0; + + for (i = 0; i < ARRAY_SIZE(func_limit); i++) { + bool_map = (map->map_type == func_limit[i].map_type); + bool_func = (func_id == func_limit[i].func_id); + /* only when the map & func pair match can it continue;
+ * don't allow any other map type to be passed into + * the special func. + */ + if (bool_map != bool_func) + return -EINVAL; + } + + return 0; +} + static int check_call(struct verifier_env *env, int func_id) { struct verifier_state *state = &env->cur_state; @@ -912,21 +942,9 @@ static int check_call(struct verifier_env *env, int func_id) return -EINVAL; } - if (map && map->map_type == BPF_MAP_TYPE_PROG_ARRAY && - func_id != BPF_FUNC_tail_call) - /* prog_array map type needs extra care: - * only allow to pass it into bpf_tail_call() for now. - * bpf_map_delete_elem() can be allowed in the future, - * while bpf_map_update_elem() must only be done via syscall - */ - return -EINVAL; - - if (func_id == BPF_FUNC_tail_call && - map->map_type != BPF_MAP_TYPE_PROG_ARRAY) - /* don't allow any other map type to be passed into - * bpf_tail_call() - */ - return -EINVAL; + err = check_map_func_compatibility(map, func_id); + if (err) + return err; return 0; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 88a041adee90..ef9936df1b04 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -158,6 +158,35 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } +static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) +{ + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct perf_event *event; + + if (unlikely(index >= array->map.max_entries)) + return -E2BIG; + + event = (struct perf_event *)array->ptrs[index]; + if (!event) + return -ENOENT; + + /* + * the return value alone does not tell us whether the read + * succeeded; that has to be judged elsewhere, e.g. by the + * eBPF program itself. + */ + return perf_event_read_local(event); +} + +const struct bpf_func_proto bpf_perf_event_read_proto = { + .func = bpf_perf_event_read, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -183,6 +212,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return bpf_get_trace_printk_proto(); case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_perf_event_read: + return &bpf_perf_event_read_proto; default: return NULL; } -- cgit v1.2.3 From a7854037da006a7472c48773e3190db55217ec9b Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 7 Aug 2015 19:40:45 +0300 Subject: bridge: netlink: add support for vlan_filtering attribute This patch adds the ability to toggle the vlan filtering support via netlink. Since we're already running with rtnl in .changelink(), we don't need to take any additional locks. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S.
Miller --- include/uapi/linux/if_link.h | 1 + net/bridge/br_netlink.c | 14 +++++++++++++- net/bridge/br_private.h | 7 +++++++ net/bridge/br_vlan.c | 18 ++++++++++++------ 4 files changed, 33 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index f24ec99a2262..d450be36add2 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -230,6 +230,7 @@ enum { IFLA_BR_AGEING_TIME, IFLA_BR_STP_STATE, IFLA_BR_PRIORITY, + IFLA_BR_VLAN_FILTERING, __IFLA_BR_MAX, }; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 91a2e08c2bb8..6eb683d8e0c5 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -724,6 +724,7 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_AGEING_TIME] = { .type = NLA_U32 }, [IFLA_BR_STP_STATE] = { .type = NLA_U32 }, [IFLA_BR_PRIORITY] = { .type = NLA_U16 }, + [IFLA_BR_VLAN_FILTERING] = { .type = NLA_U8 }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -771,6 +772,14 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], br_stp_set_bridge_priority(br, priority); } + if (data[IFLA_BR_VLAN_FILTERING]) { + u8 vlan_filter = nla_get_u8(data[IFLA_BR_VLAN_FILTERING]); + + err = __br_vlan_filter_toggle(br, vlan_filter); + if (err) + return err; + } + return 0; } @@ -782,6 +791,7 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size(sizeof(u32)) + /* IFLA_BR_AGEING_TIME */ nla_total_size(sizeof(u32)) + /* IFLA_BR_STP_STATE */ nla_total_size(sizeof(u16)) + /* IFLA_BR_PRIORITY */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_FILTERING */ 0; } @@ -794,13 +804,15 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) u32 ageing_time = jiffies_to_clock_t(br->ageing_time); u32 stp_enabled = br->stp_enabled; u16 priority = (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]; + u8 vlan_enabled = br_vlan_enabled(br); if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, forward_delay) || nla_put_u32(skb, IFLA_BR_HELLO_TIME, hello_time) || nla_put_u32(skb, IFLA_BR_MAX_AGE, age_time) || nla_put_u32(skb, IFLA_BR_AGEING_TIME, ageing_time) || nla_put_u32(skb, IFLA_BR_STP_STATE, stp_enabled) || - nla_put_u16(skb, IFLA_BR_PRIORITY, priority)) + nla_put_u16(skb, IFLA_BR_PRIORITY, priority) || + nla_put_u8(skb, IFLA_BR_VLAN_FILTERING, vlan_enabled)) return -EMSGSIZE; return 0; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index e2cb359f9dd3..3d95647039d0 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -614,6 +614,7 @@ int br_vlan_delete(struct net_bridge *br, u16 vid); void br_vlan_flush(struct net_bridge *br); bool br_vlan_find(struct net_bridge *br, u16 vid); void br_recalculate_fwd_mask(struct net_bridge *br); +int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val); int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val); int br_vlan_set_proto(struct net_bridge *br, unsigned long val); int br_vlan_init(struct net_bridge *br); @@ -771,6 +772,12 @@ static inline int br_vlan_enabled(struct net_bridge *br) { return 0; } + +static inline int __br_vlan_filter_toggle(struct net_bridge *br, + unsigned long val) +{ + return -EOPNOTSUPP; +} #endif struct nf_br_ops { diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 0d41f81838ff..3cef6892c0bb 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -468,21 +468,27 @@ void br_recalculate_fwd_mask(struct net_bridge *br) ~(1u << 
br->group_addr[5]); } -int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val) +int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val) { - if (!rtnl_trylock()) - return restart_syscall(); - if (br->vlan_enabled == val) - goto unlock; + return 0; br->vlan_enabled = val; br_manage_promisc(br); recalculate_group_addr(br); br_recalculate_fwd_mask(br); -unlock: + return 0; +} + +int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val) +{ + if (!rtnl_trylock()) + return restart_syscall(); + + __br_vlan_filter_toggle(br, val); rtnl_unlock(); + return 0; } -- cgit v1.2.3 From fb811395cd5a71b9e94a068f524a6f4a21b67bdb Mon Sep 17 00:00:00 2001 From: Rick Jones Date: Fri, 7 Aug 2015 11:10:37 -0700 Subject: net: add explicit logging and stat for neighbour table overflow Add an explicit neighbour table overflow message (ratelimited) and statistic to make diagnosing neighbour table overflows tractable in the wild. Diagnosing a neighbour table overflow can be quite difficult in the wild because there is no explicit dmesg logged. Callers to neighbour code seem to use net_dbg_ratelimit when the neighbour call fails which means the "base message" is not emitted and the callback suppressed messages from the ratelimiting can end-up juxtaposed with unrelated messages. Further, a forced garbage collection will increment a stat on each call whether it was successful in freeing-up a table entry or not, so that statistic is only a hint. So, add a net_info_ratelimited message and explicit statistic to the neighbour code. Signed-off-by: Rick Jones Signed-off-by: David S. Miller --- include/net/neighbour.h | 1 + include/uapi/linux/neighbour.h | 1 + net/core/neighbour.c | 14 ++++++++++---- 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index bd33e66f49aa..8b683841e574 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -125,6 +125,7 @@ struct neigh_statistics { unsigned long forced_gc_runs; /* number of forced GC runs */ unsigned long unres_discards; /* number of unresolved drops */ + unsigned long table_fulls; /* times even gc couldn't help */ }; #define NEIGH_CACHE_STAT_INC(tbl, field) this_cpu_inc((tbl)->stats->field) diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index 2e35c61bbdd1..788655bfa0f3 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -106,6 +106,7 @@ struct ndt_stats { __u64 ndts_rcv_probes_ucast; __u64 ndts_periodic_gc_runs; __u64 ndts_forced_gc_runs; + __u64 ndts_table_fulls; }; enum { diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 84195dacb8b6..2b515ba7e94f 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -274,8 +274,12 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device (entries >= tbl->gc_thresh2 && time_after(now, tbl->last_flush + 5 * HZ))) { if (!neigh_forced_gc(tbl) && - entries >= tbl->gc_thresh3) + entries >= tbl->gc_thresh3) { + net_info_ratelimited("%s: neighbor table overflow!\n", + tbl->id); + NEIGH_CACHE_STAT_INC(tbl, table_fulls); goto out_entries; + } } n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC); @@ -1849,6 +1853,7 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, ndst.ndts_rcv_probes_ucast += st->rcv_probes_ucast; ndst.ndts_periodic_gc_runs += st->periodic_gc_runs; ndst.ndts_forced_gc_runs += st->forced_gc_runs; + ndst.ndts_table_fulls += st->table_fulls; } if 
(nla_put(skb, NDTA_STATS, sizeof(ndst), &ndst)) @@ -2717,12 +2722,12 @@ static int neigh_stat_seq_show(struct seq_file *seq, void *v) struct neigh_statistics *st = v; if (v == SEQ_START_TOKEN) { - seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards\n"); + seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards table_fulls\n"); return 0; } seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx " - "%08lx %08lx %08lx %08lx %08lx\n", + "%08lx %08lx %08lx %08lx %08lx %08lx\n", atomic_read(&tbl->entries), st->allocs, @@ -2739,7 +2744,8 @@ static int neigh_stat_seq_show(struct seq_file *seq, void *v) st->periodic_gc_runs, st->forced_gc_runs, - st->unres_discards + st->unres_discards, + st->table_fulls ); return 0; -- cgit v1.2.3 From 2e15ea390e6f4466655066d97e22ec66870a042c Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Fri, 7 Aug 2015 23:51:42 -0700 Subject: ip_gre: Add support to collect tunnel metadata. This patch creates a new tunnel flag which enables tunnel metadata collection on a given device. Signed-off-by: Pravin B Shelar Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 7 +- include/uapi/linux/if_tunnel.h | 1 + net/ipv4/ip_gre.c | 195 +++++++++++++++++++++++++++++++++++++---- net/ipv4/ip_tunnel.c | 37 ++++++-- net/ipv4/ipip.c | 2 +- net/ipv6/sit.c | 2 +- 6 files changed, 216 insertions(+), 28 deletions(-) (limited to 'include/uapi') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 47984415f5d1..984dbfa15e13 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -82,6 +82,8 @@ struct ip_tunnel_dst { __be32 saddr; }; +struct metadata_dst; + struct ip_tunnel { struct ip_tunnel __rcu *next; struct hlist_node hash_node; @@ -115,6 +117,7 @@ struct ip_tunnel { unsigned int prl_count; /* # of entries in PRL */ int ip_tnl_net_id; struct gro_cells gro_cells; + bool collect_md; }; #define TUNNEL_CSUM __cpu_to_be16(0x01) @@ -149,6 +152,7 @@ struct tnl_ptk_info { struct ip_tunnel_net { struct net_device *fb_tunnel_dev; struct hlist_head tunnels[IP_TNL_HASH_SIZE]; + struct ip_tunnel __rcu *collect_md_tun; }; struct ip_tunnel_encap_ops { @@ -235,7 +239,8 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, __be32 key); int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, - const struct tnl_ptk_info *tpi, bool log_ecn_error); + const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, + bool log_ecn_error); int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm *p); int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index bd3cc11a431f..af4de90ba27d 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -112,6 +112,7 @@ enum { IFLA_GRE_ENCAP_FLAGS, IFLA_GRE_ENCAP_SPORT, IFLA_GRE_ENCAP_DPORT, + IFLA_GRE_COLLECT_METADATA, __IFLA_GRE_MAX, }; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 5fd706473c73..554a760c2cd0 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -47,6 +48,7 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_IPV6) #include @@ -200,9 +202,29 @@ static int ipgre_err(struct sk_buff *skb, u32 info, return
PACKET_RCVD; } +static __be64 key_to_tunnel_id(__be32 key) +{ +#ifdef __BIG_ENDIAN + return (__force __be64)((__force u32)key); +#else + return (__force __be64)((__force u64)key << 32); +#endif +} + +/* Returns the least-significant 32 bits of a __be64. */ +static __be32 tunnel_id_to_key(__be64 x) +{ +#ifdef __BIG_ENDIAN + return (__force __be32)x; +#else + return (__force __be32)((__force u64)x >> 32); +#endif +} + static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) { struct net *net = dev_net(skb->dev); + struct metadata_dst *tun_dst = NULL; struct ip_tunnel_net *itn; const struct iphdr *iph; struct ip_tunnel *tunnel; @@ -218,40 +240,162 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) if (tunnel) { skb_pop_mac_header(skb); - ip_tunnel_rcv(tunnel, skb, tpi, log_ecn_error); + if (tunnel->collect_md) { + struct ip_tunnel_info *info; + + tun_dst = metadata_dst_alloc(0, GFP_ATOMIC); + if (!tun_dst) + return PACKET_REJECT; + + info = &tun_dst->u.tun_info; + info->key.ipv4_src = iph->saddr; + info->key.ipv4_dst = iph->daddr; + info->key.ipv4_tos = iph->tos; + info->key.ipv4_ttl = iph->ttl; + + info->mode = IP_TUNNEL_INFO_RX; + info->key.tun_flags = tpi->flags & + (TUNNEL_CSUM | TUNNEL_KEY); + info->key.tun_id = key_to_tunnel_id(tpi->key); + + info->key.tp_src = 0; + info->key.tp_dst = 0; + } + + ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); return PACKET_RCVD; } return PACKET_REJECT; } +static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags, + __be16 proto, __be32 key, __be32 seq) +{ + struct gre_base_hdr *greh; + + skb_push(skb, hdr_len); + + skb_reset_transport_header(skb); + greh = (struct gre_base_hdr *)skb->data; + greh->flags = tnl_flags_to_gre_flags(flags); + greh->protocol = proto; + + if (flags & (TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_SEQ)) { + __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4); + + if (flags & TUNNEL_SEQ) { + *ptr = seq; + ptr--; + } + if (flags & TUNNEL_KEY) { + *ptr = key; + ptr--; + } + if (flags & TUNNEL_CSUM && + !(skb_shinfo(skb)->gso_type & + (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) { + *ptr = 0; + *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0, + skb->len, 0)); + } + } +} + static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, __be16 proto) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct tnl_ptk_info tpi; - tpi.flags = tunnel->parms.o_flags; - tpi.proto = proto; - tpi.key = tunnel->parms.o_key; if (tunnel->parms.o_flags & TUNNEL_SEQ) tunnel->o_seqno++; - tpi.seq = htonl(tunnel->o_seqno); /* Push GRE header. 
*/ - gre_build_header(skb, &tpi, tunnel->tun_hlen); - - skb_set_inner_protocol(skb, tpi.proto); + build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, + proto, tunnel->parms.o_key, htonl(tunnel->o_seqno)); + skb_set_inner_protocol(skb, proto); ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); } +static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip_tunnel_info *tun_info; + struct net *net = dev_net(dev); + const struct ip_tunnel_key *key; + struct flowi4 fl; + struct rtable *rt; + int min_headroom; + int tunnel_hlen; + __be16 df, flags; + int err; + + tun_info = skb_tunnel_info(skb, AF_INET); + if (unlikely(!tun_info || tun_info->mode != IP_TUNNEL_INFO_TX)) + goto err_free_skb; + + key = &tun_info->key; + memset(&fl, 0, sizeof(fl)); + fl.daddr = key->ipv4_dst; + fl.saddr = key->ipv4_src; + fl.flowi4_tos = RT_TOS(key->ipv4_tos); + fl.flowi4_mark = skb->mark; + fl.flowi4_proto = IPPROTO_GRE; + + rt = ip_route_output_key(net, &fl); + if (IS_ERR(rt)) + goto err_free_skb; + + tunnel_hlen = ip_gre_calc_hlen(key->tun_flags); + + min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + + tunnel_hlen + sizeof(struct iphdr); + if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { + int head_delta = SKB_DATA_ALIGN(min_headroom - + skb_headroom(skb) + + 16); + err = pskb_expand_head(skb, max_t(int, head_delta, 0), + 0, GFP_ATOMIC); + if (unlikely(err)) + goto err_free_rt; + } + + /* Push Tunnel header. */ + skb = gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM)); + if (IS_ERR(skb)) { + skb = NULL; + goto err_free_rt; + } + + flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY); + build_header(skb, tunnel_hlen, flags, htons(ETH_P_TEB), + tunnel_id_to_key(tun_info->key.tun_id), 0); + + df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? 
htons(IP_DF) : 0; + err = iptunnel_xmit(skb->sk, rt, skb, fl.saddr, + key->ipv4_dst, IPPROTO_GRE, + key->ipv4_tos, key->ipv4_ttl, df, false); + iptunnel_xmit_stats(err, &dev->stats, dev->tstats); + return; + +err_free_rt: + ip_rt_put(rt); +err_free_skb: + kfree_skb(skb); + dev->stats.tx_dropped++; +} + static netdev_tx_t ipgre_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tnl_params; + if (tunnel->collect_md) { + gre_fb_xmit(skb, dev); + return NETDEV_TX_OK; + } + if (dev->header_ops) { /* Need space for new headers */ if (skb_cow_head(skb, dev->needed_headroom - @@ -277,7 +421,6 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb, goto out; __gre_xmit(skb, dev, tnl_params, skb->protocol); - return NETDEV_TX_OK; free_skb: @@ -292,6 +435,11 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, { struct ip_tunnel *tunnel = netdev_priv(dev); + if (tunnel->collect_md) { + gre_fb_xmit(skb, dev); + return NETDEV_TX_OK; + } + skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM)); if (IS_ERR(skb)) goto out; @@ -300,7 +448,6 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, goto free_skb; __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB)); - return NETDEV_TX_OK; free_skb: @@ -596,8 +743,10 @@ out: return ipgre_tunnel_validate(tb, data); } -static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[], - struct ip_tunnel_parm *parms) +static void ipgre_netlink_parms(struct net_device *dev, + struct nlattr *data[], + struct nlattr *tb[], + struct ip_tunnel_parm *parms) { memset(parms, 0, sizeof(*parms)); @@ -635,6 +784,12 @@ static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[], if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) parms->iph.frag_off = htons(IP_DF); + + if (data[IFLA_GRE_COLLECT_METADATA]) { + struct ip_tunnel *t = netdev_priv(dev); + + t->collect_md = true; + } } /* This function returns true when ENCAP attributes are present in the nl msg */ @@ -712,7 +867,7 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, return err; } - ipgre_netlink_parms(data, tb, &p); + ipgre_netlink_parms(dev, data, tb, &p); return ip_tunnel_newlink(dev, tb, &p); } @@ -730,7 +885,7 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], return err; } - ipgre_netlink_parms(data, tb, &p); + ipgre_netlink_parms(dev, data, tb, &p); return ip_tunnel_changelink(dev, tb, &p); } @@ -765,6 +920,8 @@ static size_t ipgre_get_size(const struct net_device *dev) nla_total_size(2) + /* IFLA_GRE_ENCAP_DPORT */ nla_total_size(2) + + /* IFLA_GRE_COLLECT_METADATA */ + nla_total_size(0) + 0; } @@ -796,6 +953,11 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) t->encap.flags)) goto nla_put_failure; + if (t->collect_md) { + if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA)) + goto nla_put_failure; + } + return 0; nla_put_failure: @@ -817,6 +979,7 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 }, + [IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG }, }; static struct rtnl_link_ops ipgre_link_ops __read_mostly = { @@ -851,7 +1014,7 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = { static int __net_init ipgre_tap_init_net(struct net *net) { - return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, NULL); + return ip_tunnel_init_net(net, 
gre_tap_net_id, &ipgre_tap_ops, "gretap0"); } static void __net_exit ipgre_tap_exit_net(struct net *net) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 626d9e56a6bd..cbb51f3fac06 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -230,10 +230,13 @@ skip_key_lookup: if (cand) return cand; + t = rcu_dereference(itn->collect_md_tun); + if (t) + return t; + if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP) return netdev_priv(itn->fb_tunnel_dev); - return NULL; } EXPORT_SYMBOL_GPL(ip_tunnel_lookup); @@ -261,11 +264,15 @@ static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t) { struct hlist_head *head = ip_bucket(itn, &t->parms); + if (t->collect_md) + rcu_assign_pointer(itn->collect_md_tun, t); hlist_add_head_rcu(&t->hash_node, head); } -static void ip_tunnel_del(struct ip_tunnel *t) +static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t) { + if (t->collect_md) + rcu_assign_pointer(itn->collect_md_tun, NULL); hlist_del_init_rcu(&t->hash_node); } @@ -419,7 +426,8 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net, } int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, - const struct tnl_ptk_info *tpi, bool log_ecn_error) + const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, + bool log_ecn_error) { struct pcpu_sw_netstats *tstats; const struct iphdr *iph = ip_hdr(skb); @@ -478,6 +486,9 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, skb->dev = tunnel->dev; } + if (tun_dst) + skb_dst_set(skb, (struct dst_entry *)tun_dst); + gro_cells_receive(&tunnel->gro_cells, skb); return 0; @@ -806,7 +817,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn, struct ip_tunnel_parm *p, bool set_mtu) { - ip_tunnel_del(t); + ip_tunnel_del(itn, t); t->parms.iph.saddr = p->iph.saddr; t->parms.iph.daddr = p->iph.daddr; t->parms.i_key = p->i_key; @@ -967,7 +978,7 @@ void ip_tunnel_dellink(struct net_device *dev, struct list_head *head) itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id); if (itn->fb_tunnel_dev != dev) { - ip_tunnel_del(netdev_priv(dev)); + ip_tunnel_del(itn, netdev_priv(dev)); unregister_netdevice_queue(dev, head); } } @@ -1072,8 +1083,13 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], nt = netdev_priv(dev); itn = net_generic(net, nt->ip_tnl_net_id); - if (ip_tunnel_find(itn, p, dev->type)) - return -EEXIST; + if (nt->collect_md) { + if (rtnl_dereference(itn->collect_md_tun)) + return -EEXIST; + } else { + if (ip_tunnel_find(itn, p, dev->type)) + return -EEXIST; + } nt->net = net; nt->parms = *p; @@ -1089,7 +1105,6 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], dev->mtu = mtu; ip_tunnel_add(itn, nt); - out: return err; } @@ -1163,6 +1178,10 @@ int ip_tunnel_init(struct net_device *dev) iph->version = 4; iph->ihl = 5; + if (tunnel->collect_md) { + dev->features |= NETIF_F_NETNS_LOCAL; + netif_keep_dst(dev); + } return 0; } EXPORT_SYMBOL_GPL(ip_tunnel_init); @@ -1176,7 +1195,7 @@ void ip_tunnel_uninit(struct net_device *dev) itn = net_generic(net, tunnel->ip_tnl_net_id); /* fb_tunnel_dev will be unregistered in the net-exit call.
*/ if (itn->fb_tunnel_dev != dev) - ip_tunnel_del(netdev_priv(dev)); + ip_tunnel_del(itn, netdev_priv(dev)); ip_tunnel_dst_reset_all(tunnel); } diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 254238daf58b..f34c31defafe 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -198,7 +198,7 @@ static int ipip_rcv(struct sk_buff *skb) goto drop; if (iptunnel_pull_header(skb, 0, tpi.proto)) goto drop; - return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error); + return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error); } return -1; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index ac35a28599be..94428fd85b2f 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -742,7 +742,7 @@ static int ipip_rcv(struct sk_buff *skb) goto drop; if (iptunnel_pull_header(skb, 0, tpi.proto)) goto drop; - return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error); + return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error); } return 1; -- cgit v1.2.3 From d80efd5cb3dec16a8d1aea9b8a4a7921972dba65 Mon Sep 17 00:00:00 2001 From: Thomas Hellstrom Date: Mon, 10 Aug 2015 10:39:35 -0700 Subject: drm/vmwgfx: Initial DX support Initial DX support. Co-authored with Sinclair Yeh, Charmaine Lee and Jakob Bornecrantz. Signed-off-by: Thomas Hellstrom Signed-off-by: Sinclair Yeh Signed-off-by: Charmaine Lee --- drivers/gpu/drm/vmwgfx/Makefile | 1 + drivers/gpu/drm/vmwgfx/vmwgfx_binding.c | 1294 ++++++++++++++++++++++ drivers/gpu/drm/vmwgfx/vmwgfx_binding.h | 209 ++++ drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf.c | 5 +- drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf_res.c | 24 +- drivers/gpu/drm/vmwgfx/vmwgfx_context.c | 735 ++++++------- drivers/gpu/drm/vmwgfx/vmwgfx_cotable.c | 662 +++++++++++ drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 54 +- drivers/gpu/drm/vmwgfx/vmwgfx_drv.h | 175 ++- drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c | 1468 ++++++++++++++++++++++--- drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c | 28 +- drivers/gpu/drm/vmwgfx/vmwgfx_ioctl.c | 7 +- drivers/gpu/drm/vmwgfx/vmwgfx_kms.c | 15 +- drivers/gpu/drm/vmwgfx/vmwgfx_mob.c | 169 +-- drivers/gpu/drm/vmwgfx/vmwgfx_resource.c | 48 +- drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h | 12 +- drivers/gpu/drm/vmwgfx/vmwgfx_shader.c | 484 +++++++- drivers/gpu/drm/vmwgfx/vmwgfx_so.c | 555 ++++++++++ drivers/gpu/drm/vmwgfx/vmwgfx_so.h | 160 +++ drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c | 1 + drivers/gpu/drm/vmwgfx/vmwgfx_surface.c | 109 +- include/uapi/drm/vmwgfx_drm.h | 35 +- 22 files changed, 5411 insertions(+), 839 deletions(-) create mode 100644 drivers/gpu/drm/vmwgfx/vmwgfx_binding.c create mode 100644 drivers/gpu/drm/vmwgfx/vmwgfx_binding.h create mode 100644 drivers/gpu/drm/vmwgfx/vmwgfx_cotable.c create mode 100644 drivers/gpu/drm/vmwgfx/vmwgfx_so.c create mode 100644 drivers/gpu/drm/vmwgfx/vmwgfx_so.h (limited to 'include/uapi') diff --git a/drivers/gpu/drm/vmwgfx/Makefile b/drivers/gpu/drm/vmwgfx/Makefile index 484093986d5a..d281575bbe11 100644 --- a/drivers/gpu/drm/vmwgfx/Makefile +++ b/drivers/gpu/drm/vmwgfx/Makefile @@ -8,5 +8,6 @@ vmwgfx-y := vmwgfx_execbuf.o vmwgfx_gmr.o vmwgfx_kms.o vmwgfx_drv.o \ vmwgfx_fence.o vmwgfx_dmabuf.o vmwgfx_scrn.o vmwgfx_context.o \ vmwgfx_surface.o vmwgfx_prime.o vmwgfx_mob.o vmwgfx_shader.o \ vmwgfx_cmdbuf_res.o vmwgfx_cmdbuf.o vmwgfx_stdu.o \ + vmwgfx_cotable.o vmwgfx_so.o vmwgfx_binding.o obj-$(CONFIG_DRM_VMWGFX) := vmwgfx.o diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_binding.c b/drivers/gpu/drm/vmwgfx/vmwgfx_binding.c new file mode 100644 index 000000000000..9c42e96da510 --- /dev/null +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_binding.c @@ -0,0 +1,1294 @@ 
+/************************************************************************** + * + * Copyright © 2015 VMware, Inc., Palo Alto, CA., USA + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ +/* + * This file implements the vmwgfx context binding manager. + * The sole reason for having to use this code is that vmware guest + * backed contexts can be swapped out to their backing mobs by the device + * at any time, and swapped in again at any time. At swapin time, the device + * validates the context bindings to make sure they point to valid resources. + * It's this outside-of-drawcall validation (which can happen at any time) + * that makes this code necessary. + * + * We therefore need to kill any context bindings pointing to a resource + * when the resource is swapped out. Furthermore, if the vmwgfx driver has + * swapped out the context we can't swap it in again to kill bindings because + * of backing mob reservation lockdep violations, so as part of + * context swapout, also kill all bindings of a context, so that they are + * already killed if a resource to which a binding points + * needs to be swapped out. + * + * Note that a resource can be pointed to by bindings from multiple contexts. + * Therefore we can't easily protect this data by a per context mutex + * (unless we use deadlock-safe WW mutexes). So we use a global binding_mutex + * to protect all binding manager data. + * + * Finally, any association between a context and a global resource + * (surface, shader or even DX query) is conceptually a context binding that + * needs to be tracked by this code. + */ + +#include "vmwgfx_drv.h" +#include "vmwgfx_binding.h" +#include "device_include/svga3d_reg.h" + +#define VMW_BINDING_RT_BIT 0 +#define VMW_BINDING_PS_BIT 1 +#define VMW_BINDING_SO_BIT 2 +#define VMW_BINDING_VB_BIT 3 +#define VMW_BINDING_NUM_BITS 4 + +#define VMW_BINDING_PS_SR_BIT 0 + +/** + * struct vmw_ctx_binding_state - per context binding state + * + * @dev_priv: Pointer to device private structure. + * @list: linked list of individual active bindings. + * @render_targets: Render target bindings. + * @texture_units: Texture units bindings. + * @ds_view: Depth-stencil view binding. + * @so_targets: StreamOutput target bindings. + * @vertex_buffers: Vertex buffer bindings. + * @index_buffer: Index buffer binding.
+ * @per_shader: Per shader-type bindings. + * @dirty: Bitmap tracking per binding-type changes that have not yet + * been emitted to the device. + * @dirty_vb: Bitmap tracking individual vertex buffer binding changes that + * have not yet been emitted to the device. + * @bind_cmd_buffer: Scratch space used to construct binding commands. + * @bind_cmd_count: Number of binding command data entries in @bind_cmd_buffer + * @bind_first_slot: Used together with @bind_cmd_buffer to indicate the + * device binding slot of the first command data entry in @bind_cmd_buffer. + * + * Note that this structure also provides storage space for the individual + * struct vmw_ctx_binding objects, so that no dynamic allocation is needed + * for individual bindings. + * + */ +struct vmw_ctx_binding_state { + struct vmw_private *dev_priv; + struct list_head list; + struct vmw_ctx_bindinfo_view render_targets[SVGA3D_RT_MAX]; + struct vmw_ctx_bindinfo_tex texture_units[SVGA3D_NUM_TEXTURE_UNITS]; + struct vmw_ctx_bindinfo_view ds_view; + struct vmw_ctx_bindinfo_so so_targets[SVGA3D_DX_MAX_SOTARGETS]; + struct vmw_ctx_bindinfo_vb vertex_buffers[SVGA3D_DX_MAX_VERTEXBUFFERS]; + struct vmw_ctx_bindinfo_ib index_buffer; + struct vmw_dx_shader_bindings per_shader[SVGA3D_NUM_SHADERTYPE_DX10]; + + unsigned long dirty; + DECLARE_BITMAP(dirty_vb, SVGA3D_DX_MAX_VERTEXBUFFERS); + + u32 bind_cmd_buffer[VMW_MAX_VIEW_BINDINGS]; + u32 bind_cmd_count; + u32 bind_first_slot; +}; + +static int vmw_binding_scrub_shader(struct vmw_ctx_bindinfo *bi, bool rebind); +static int vmw_binding_scrub_render_target(struct vmw_ctx_bindinfo *bi, + bool rebind); +static int vmw_binding_scrub_texture(struct vmw_ctx_bindinfo *bi, bool rebind); +static int vmw_binding_scrub_cb(struct vmw_ctx_bindinfo *bi, bool rebind); +static int vmw_binding_scrub_dx_rt(struct vmw_ctx_bindinfo *bi, bool rebind); +static int vmw_binding_scrub_sr(struct vmw_ctx_bindinfo *bi, bool rebind); +static int vmw_binding_scrub_so(struct vmw_ctx_bindinfo *bi, bool rebind); +static int vmw_binding_emit_dirty(struct vmw_ctx_binding_state *cbs); +static int vmw_binding_scrub_dx_shader(struct vmw_ctx_bindinfo *bi, + bool rebind); +static int vmw_binding_scrub_ib(struct vmw_ctx_bindinfo *bi, bool rebind); +static int vmw_binding_scrub_vb(struct vmw_ctx_bindinfo *bi, bool rebind); +static void vmw_binding_build_asserts(void) __attribute__ ((unused)); + +typedef int (*vmw_scrub_func)(struct vmw_ctx_bindinfo *, bool); + +/** + * struct vmw_binding_info - Per binding type information for the binding + * manager + * + * @size: The size of the struct binding derived from a struct vmw_ctx_bindinfo. + * @offsets: array[shader_slot] of offsets to the array[slot] + * of struct bindings for the binding type. + * @scrub_func: Pointer to the scrub function for this binding type. + * + * Holds static information to help optimize the binding manager and avoid + * an excessive amount of switch statements. + */ +struct vmw_binding_info { + size_t size; + const size_t *offsets; + vmw_scrub_func scrub_func; +}; + +/* + * A number of static variables that help determine the scrub func and the + * location of the struct vmw_ctx_bindinfo slots for each binding type. 
+ */ +static const size_t vmw_binding_shader_offsets[] = { + offsetof(struct vmw_ctx_binding_state, per_shader[0].shader), + offsetof(struct vmw_ctx_binding_state, per_shader[1].shader), + offsetof(struct vmw_ctx_binding_state, per_shader[2].shader), +}; +static const size_t vmw_binding_rt_offsets[] = { + offsetof(struct vmw_ctx_binding_state, render_targets), +}; +static const size_t vmw_binding_tex_offsets[] = { + offsetof(struct vmw_ctx_binding_state, texture_units), +}; +static const size_t vmw_binding_cb_offsets[] = { + offsetof(struct vmw_ctx_binding_state, per_shader[0].const_buffers), + offsetof(struct vmw_ctx_binding_state, per_shader[1].const_buffers), + offsetof(struct vmw_ctx_binding_state, per_shader[2].const_buffers), +}; +static const size_t vmw_binding_dx_ds_offsets[] = { + offsetof(struct vmw_ctx_binding_state, ds_view), +}; +static const size_t vmw_binding_sr_offsets[] = { + offsetof(struct vmw_ctx_binding_state, per_shader[0].shader_res), + offsetof(struct vmw_ctx_binding_state, per_shader[1].shader_res), + offsetof(struct vmw_ctx_binding_state, per_shader[2].shader_res), +}; +static const size_t vmw_binding_so_offsets[] = { + offsetof(struct vmw_ctx_binding_state, so_targets), +}; +static const size_t vmw_binding_vb_offsets[] = { + offsetof(struct vmw_ctx_binding_state, vertex_buffers), +}; +static const size_t vmw_binding_ib_offsets[] = { + offsetof(struct vmw_ctx_binding_state, index_buffer), +}; + +static const struct vmw_binding_info vmw_binding_infos[] = { + [vmw_ctx_binding_shader] = { + .size = sizeof(struct vmw_ctx_bindinfo_shader), + .offsets = vmw_binding_shader_offsets, + .scrub_func = vmw_binding_scrub_shader}, + [vmw_ctx_binding_rt] = { + .size = sizeof(struct vmw_ctx_bindinfo_view), + .offsets = vmw_binding_rt_offsets, + .scrub_func = vmw_binding_scrub_render_target}, + [vmw_ctx_binding_tex] = { + .size = sizeof(struct vmw_ctx_bindinfo_tex), + .offsets = vmw_binding_tex_offsets, + .scrub_func = vmw_binding_scrub_texture}, + [vmw_ctx_binding_cb] = { + .size = sizeof(struct vmw_ctx_bindinfo_cb), + .offsets = vmw_binding_cb_offsets, + .scrub_func = vmw_binding_scrub_cb}, + [vmw_ctx_binding_dx_shader] = { + .size = sizeof(struct vmw_ctx_bindinfo_shader), + .offsets = vmw_binding_shader_offsets, + .scrub_func = vmw_binding_scrub_dx_shader}, + [vmw_ctx_binding_dx_rt] = { + .size = sizeof(struct vmw_ctx_bindinfo_view), + .offsets = vmw_binding_rt_offsets, + .scrub_func = vmw_binding_scrub_dx_rt}, + [vmw_ctx_binding_sr] = { + .size = sizeof(struct vmw_ctx_bindinfo_view), + .offsets = vmw_binding_sr_offsets, + .scrub_func = vmw_binding_scrub_sr}, + [vmw_ctx_binding_ds] = { + .size = sizeof(struct vmw_ctx_bindinfo_view), + .offsets = vmw_binding_dx_ds_offsets, + .scrub_func = vmw_binding_scrub_dx_rt}, + [vmw_ctx_binding_so] = { + .size = sizeof(struct vmw_ctx_bindinfo_so), + .offsets = vmw_binding_so_offsets, + .scrub_func = vmw_binding_scrub_so}, + [vmw_ctx_binding_vb] = { + .size = sizeof(struct vmw_ctx_bindinfo_vb), + .offsets = vmw_binding_vb_offsets, + .scrub_func = vmw_binding_scrub_vb}, + [vmw_ctx_binding_ib] = { + .size = sizeof(struct vmw_ctx_bindinfo_ib), + .offsets = vmw_binding_ib_offsets, + .scrub_func = vmw_binding_scrub_ib}, +}; + +/** + * vmw_cbs_context - Return a pointer to the context resource of a + * context binding state tracker. + * + * @cbs: The context binding state tracker. 
+ *
+ * Provided there are any active bindings, this function will return an
+ * unreferenced pointer to the context resource that owns the context
+ * binding state tracker. If there are no active bindings, this function
+ * will return NULL. Note that the caller must somehow ensure that a reference
+ * is held on the context resource prior to calling this function.
+ */
+static const struct vmw_resource *
+vmw_cbs_context(const struct vmw_ctx_binding_state *cbs)
+{
+ if (list_empty(&cbs->list))
+ return NULL;
+
+ return list_first_entry(&cbs->list, struct vmw_ctx_bindinfo,
+ ctx_list)->ctx;
+}
+
+/**
+ * vmw_binding_loc - determine the struct vmw_ctx_bindinfo slot location.
+ *
+ * @cbs: Pointer to a struct vmw_ctx_binding_state which holds the slot.
+ * @bt: The binding type.
+ * @shader_slot: The shader slot of the binding. If none, then set to 0.
+ * @slot: The slot of the binding.
+ */
+static struct vmw_ctx_bindinfo *
+vmw_binding_loc(struct vmw_ctx_binding_state *cbs,
+ enum vmw_ctx_binding_type bt, u32 shader_slot, u32 slot)
+{
+ const struct vmw_binding_info *b = &vmw_binding_infos[bt];
+ size_t offset = b->offsets[shader_slot] + b->size*slot;
+
+ return (struct vmw_ctx_bindinfo *)((u8 *) cbs + offset);
+}
+
+/**
+ * vmw_binding_drop: Stop tracking a context binding
+ *
+ * @bi: Pointer to binding tracker storage.
+ *
+ * Stops tracking a context binding, and re-initializes its storage.
+ * Typically used when the context binding is replaced with a binding to
+ * another (or the same, for that matter) resource.
+ */
+static void vmw_binding_drop(struct vmw_ctx_bindinfo *bi)
+{
+ list_del(&bi->ctx_list);
+ if (!list_empty(&bi->res_list))
+ list_del(&bi->res_list);
+ bi->ctx = NULL;
+}
+
+/**
+ * vmw_binding_add: Start tracking a context binding
+ *
+ * @cbs: Pointer to the context binding state tracker.
+ * @bi: Information about the binding to track.
+ * @shader_slot: The shader slot of the binding. If none, then set to 0.
+ * @slot: The slot of the binding.
+ *
+ * Starts tracking the binding in the context binding
+ * state structure @cbs.
+ */
+void vmw_binding_add(struct vmw_ctx_binding_state *cbs,
+ const struct vmw_ctx_bindinfo *bi,
+ u32 shader_slot, u32 slot)
+{
+ struct vmw_ctx_bindinfo *loc =
+ vmw_binding_loc(cbs, bi->bt, shader_slot, slot);
+ const struct vmw_binding_info *b = &vmw_binding_infos[bi->bt];
+
+ if (loc->ctx != NULL)
+ vmw_binding_drop(loc);
+
+ memcpy(loc, bi, b->size);
+ loc->scrubbed = false;
+ list_add(&loc->ctx_list, &cbs->list);
+ INIT_LIST_HEAD(&loc->res_list);
+}
+
+/**
+ * vmw_binding_transfer: Transfer a context binding tracking entry.
+ *
+ * @cbs: Pointer to the persistent context binding state tracker.
+ * @from: Pointer to the temporary binding state tracker the entry is
+ * transferred from.
+ * @bi: Information about the binding to track.
+ */
+static void vmw_binding_transfer(struct vmw_ctx_binding_state *cbs,
+ const struct vmw_ctx_binding_state *from,
+ const struct vmw_ctx_bindinfo *bi)
+{
+ size_t offset = (unsigned long)bi - (unsigned long)from;
+ struct vmw_ctx_bindinfo *loc = (struct vmw_ctx_bindinfo *)
+ ((unsigned long) cbs + offset);
+
+ if (loc->ctx != NULL) {
+ WARN_ON(bi->scrubbed);
+
+ vmw_binding_drop(loc);
+ }
+
+ if (bi->res != NULL) {
+ memcpy(loc, bi, vmw_binding_infos[bi->bt].size);
+ list_add_tail(&loc->ctx_list, &cbs->list);
+ list_add_tail(&loc->res_list, &loc->res->binding_head);
+ }
+}
+
+/**
+ * vmw_binding_state_kill - Kill all bindings associated with a
+ * struct vmw_ctx_binding_state structure, and re-initialize the structure.
+ *
+ * @cbs: Pointer to the context binding state tracker.
+ *
+ * Emits commands to scrub all bindings associated with the
+ * context binding state tracker.
Then re-initializes the whole structure.
+ */
+void vmw_binding_state_kill(struct vmw_ctx_binding_state *cbs)
+{
+ struct vmw_ctx_bindinfo *entry, *next;
+
+ vmw_binding_state_scrub(cbs);
+ list_for_each_entry_safe(entry, next, &cbs->list, ctx_list)
+ vmw_binding_drop(entry);
+}
+
+/**
+ * vmw_binding_state_scrub - Scrub all bindings associated with a
+ * struct vmw_ctx_binding_state structure.
+ *
+ * @cbs: Pointer to the context binding state tracker.
+ *
+ * Emits commands to scrub all bindings associated with the
+ * context binding state tracker.
+ */
+void vmw_binding_state_scrub(struct vmw_ctx_binding_state *cbs)
+{
+ struct vmw_ctx_bindinfo *entry;
+
+ list_for_each_entry(entry, &cbs->list, ctx_list) {
+ if (!entry->scrubbed) {
+ (void) vmw_binding_infos[entry->bt].scrub_func
+ (entry, false);
+ entry->scrubbed = true;
+ }
+ }
+
+ (void) vmw_binding_emit_dirty(cbs);
+}
+
+/**
+ * vmw_binding_res_list_kill - Kill all bindings on a
+ * resource binding list
+ *
+ * @head: list head of resource binding list
+ *
+ * Kills all bindings associated with a specific resource. Typically
+ * called before the resource is destroyed.
+ */
+void vmw_binding_res_list_kill(struct list_head *head)
+{
+ struct vmw_ctx_bindinfo *entry, *next;
+
+ vmw_binding_res_list_scrub(head);
+ list_for_each_entry_safe(entry, next, head, res_list)
+ vmw_binding_drop(entry);
+}
+
+/**
+ * vmw_binding_res_list_scrub - Scrub all bindings on a
+ * resource binding list
+ *
+ * @head: list head of resource binding list
+ *
+ * Scrub all bindings associated with a specific resource. Typically
+ * called before the resource is evicted.
+ */
+void vmw_binding_res_list_scrub(struct list_head *head)
+{
+ struct vmw_ctx_bindinfo *entry;
+
+ list_for_each_entry(entry, head, res_list) {
+ if (!entry->scrubbed) {
+ (void) vmw_binding_infos[entry->bt].scrub_func
+ (entry, false);
+ entry->scrubbed = true;
+ }
+ }
+
+ list_for_each_entry(entry, head, res_list) {
+ struct vmw_ctx_binding_state *cbs =
+ vmw_context_binding_state(entry->ctx);
+
+ (void) vmw_binding_emit_dirty(cbs);
+ }
+}
+
+/**
+ * vmw_binding_state_commit - Commit staged binding info
+ *
+ * @to: Pointer to the context binding state tracker to commit the staged
+ * binding info to.
+ * @from: Staged binding info built during execbuf.
+ *
+ * Transfers binding info from a temporary structure
+ * (typically used by execbuf) to the persistent
+ * structure in the context. This can be done once commands have been
+ * submitted to hardware.
+ */
+void vmw_binding_state_commit(struct vmw_ctx_binding_state *to,
+ struct vmw_ctx_binding_state *from)
+{
+ struct vmw_ctx_bindinfo *entry, *next;
+
+ list_for_each_entry_safe(entry, next, &from->list, ctx_list) {
+ vmw_binding_transfer(to, from, entry);
+ vmw_binding_drop(entry);
+ }
+}
+
+/**
+ * vmw_binding_rebind_all - Rebind all scrubbed bindings of a context
+ *
+ * @cbs: Pointer to the context binding state tracker.
+ *
+ * Walks through the context binding list and rebinds all scrubbed
+ * resources.
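
Each binding sits on two intrusive lists at once: the owning context's list (walked at context swapout) and the bound resource's list (walked at resource eviction), which is what lets either side kill the binding first. A minimal userspace sketch of the add/drop bookkeeping, with trimmed-down stand-ins for the kernel's list primitives:

    #include <stdbool.h>
    #include <stdio.h>

    /* Trimmed-down stand-ins for the kernel's list_head primitives. */
    struct list_head { struct list_head *next, *prev; };

    static void list_init(struct list_head *h) { h->next = h->prev = h; }
    static bool list_empty(const struct list_head *h) { return h->next == h; }

    static void list_add(struct list_head *n, struct list_head *h)
    {
            n->next = h->next; n->prev = h;
            h->next->prev = n; h->next = n;
    }

    static void list_del(struct list_head *n)
    {
            n->prev->next = n->next; n->next->prev = n->prev;
            list_init(n);
    }

    struct bindinfo {
            struct list_head ctx_list;  /* membership in the context's list */
            struct list_head res_list;  /* membership in the resource's list */
            void *ctx;                  /* NULL means "slot not bound" */
    };

    /* Mirrors vmw_binding_drop(): unlink from both lists, mark the slot free. */
    static void drop(struct bindinfo *bi)
    {
            list_del(&bi->ctx_list);
            if (!list_empty(&bi->res_list))
                    list_del(&bi->res_list);
            bi->ctx = NULL;
    }

    int main(void)
    {
            struct list_head ctx_bindings;
            struct bindinfo bi = { .ctx = &ctx_bindings };

            list_init(&ctx_bindings);
            list_init(&bi.res_list);
            list_add(&bi.ctx_list, &ctx_bindings);

            drop(&bi);
            printf("bound: %d\n", bi.ctx != NULL);  /* bound: 0 */
            return 0;
    }
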
+ */ +int vmw_binding_rebind_all(struct vmw_ctx_binding_state *cbs) +{ + struct vmw_ctx_bindinfo *entry; + int ret; + + list_for_each_entry(entry, &cbs->list, ctx_list) { + if (likely(!entry->scrubbed)) + continue; + + if ((entry->res == NULL || entry->res->id == + SVGA3D_INVALID_ID)) + continue; + + ret = vmw_binding_infos[entry->bt].scrub_func(entry, true); + if (unlikely(ret != 0)) + return ret; + + entry->scrubbed = false; + } + + return vmw_binding_emit_dirty(cbs); +} + +/** + * vmw_binding_scrub_shader - scrub a shader binding from a context. + * + * @bi: single binding information. + * @rebind: Whether to issue a bind instead of scrub command. + */ +static int vmw_binding_scrub_shader(struct vmw_ctx_bindinfo *bi, bool rebind) +{ + struct vmw_ctx_bindinfo_shader *binding = + container_of(bi, typeof(*binding), bi); + struct vmw_private *dev_priv = bi->ctx->dev_priv; + struct { + SVGA3dCmdHeader header; + SVGA3dCmdSetShader body; + } *cmd; + + cmd = vmw_fifo_reserve(dev_priv, sizeof(*cmd)); + if (unlikely(cmd == NULL)) { + DRM_ERROR("Failed reserving FIFO space for shader " + "unbinding.\n"); + return -ENOMEM; + } + + cmd->header.id = SVGA_3D_CMD_SET_SHADER; + cmd->header.size = sizeof(cmd->body); + cmd->body.cid = bi->ctx->id; + cmd->body.type = binding->shader_slot + SVGA3D_SHADERTYPE_MIN; + cmd->body.shid = ((rebind) ? bi->res->id : SVGA3D_INVALID_ID); + vmw_fifo_commit(dev_priv, sizeof(*cmd)); + + return 0; +} + +/** + * vmw_binding_scrub_render_target - scrub a render target binding + * from a context. + * + * @bi: single binding information. + * @rebind: Whether to issue a bind instead of scrub command. + */ +static int vmw_binding_scrub_render_target(struct vmw_ctx_bindinfo *bi, + bool rebind) +{ + struct vmw_ctx_bindinfo_view *binding = + container_of(bi, typeof(*binding), bi); + struct vmw_private *dev_priv = bi->ctx->dev_priv; + struct { + SVGA3dCmdHeader header; + SVGA3dCmdSetRenderTarget body; + } *cmd; + + cmd = vmw_fifo_reserve(dev_priv, sizeof(*cmd)); + if (unlikely(cmd == NULL)) { + DRM_ERROR("Failed reserving FIFO space for render target " + "unbinding.\n"); + return -ENOMEM; + } + + cmd->header.id = SVGA_3D_CMD_SETRENDERTARGET; + cmd->header.size = sizeof(cmd->body); + cmd->body.cid = bi->ctx->id; + cmd->body.type = binding->slot; + cmd->body.target.sid = ((rebind) ? bi->res->id : SVGA3D_INVALID_ID); + cmd->body.target.face = 0; + cmd->body.target.mipmap = 0; + vmw_fifo_commit(dev_priv, sizeof(*cmd)); + + return 0; +} + +/** + * vmw_binding_scrub_texture - scrub a texture binding from a context. + * + * @bi: single binding information. + * @rebind: Whether to issue a bind instead of scrub command. + * + * TODO: Possibly complement this function with a function that takes + * a list of texture bindings and combines them to a single command. 
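
The scrub/kill/rebind split mirrors the lifecycle described in the file header: scrubbing tells the device to forget a binding while the tracking entry survives for a later rebind, and killing scrubs first and then drops the tracking too. The toy below captures those state transitions; it is illustrative only, with printf standing in for building a FIFO command:

    #include <stdbool.h>
    #include <stdio.h>

    struct binding { bool tracked, scrubbed; };

    /* Scrub: make the (imaginary) device forget the binding, keep tracking.
     * In the driver this emits the same command as a bind, with
     * SVGA3D_INVALID_ID in place of the resource id. */
    static void scrub(struct binding *b)
    {
            if (b->tracked && !b->scrubbed) {
                    printf("emit unbind command\n");
                    b->scrubbed = true;
            }
    }

    /* Kill: scrub first, then stop tracking (cf. vmw_binding_state_kill()). */
    static void kill_binding(struct binding *b)
    {
            scrub(b);
            b->tracked = false;
    }

    /* Rebind: only scrubbed-but-still-tracked bindings are re-emitted. */
    static void rebind(struct binding *b)
    {
            if (b->tracked && b->scrubbed) {
                    printf("emit bind command\n");
                    b->scrubbed = false;
            }
    }

    int main(void)
    {
            struct binding b = { .tracked = true };

            scrub(&b);        /* context swapout */
            rebind(&b);       /* next execbuf: restore the binding */
            kill_binding(&b); /* resource about to be destroyed */
            return 0;
    }
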
+ */
+static int vmw_binding_scrub_texture(struct vmw_ctx_bindinfo *bi,
+ bool rebind)
+{
+ struct vmw_ctx_bindinfo_tex *binding =
+ container_of(bi, typeof(*binding), bi);
+ struct vmw_private *dev_priv = bi->ctx->dev_priv;
+ struct {
+ SVGA3dCmdHeader header;
+ struct {
+ SVGA3dCmdSetTextureState c;
+ SVGA3dTextureState s1;
+ } body;
+ } *cmd;
+
+ cmd = vmw_fifo_reserve(dev_priv, sizeof(*cmd));
+ if (unlikely(cmd == NULL)) {
+ DRM_ERROR("Failed reserving FIFO space for texture "
+ "unbinding.\n");
+ return -ENOMEM;
+ }
+
+ cmd->header.id = SVGA_3D_CMD_SETTEXTURESTATE;
+ cmd->header.size = sizeof(cmd->body);
+ cmd->body.c.cid = bi->ctx->id;
+ cmd->body.s1.stage = binding->texture_stage;
+ cmd->body.s1.name = SVGA3D_TS_BIND_TEXTURE;
+ cmd->body.s1.value = ((rebind) ? bi->res->id : SVGA3D_INVALID_ID);
+ vmw_fifo_commit(dev_priv, sizeof(*cmd));
+
+ return 0;
+}
+
+/**
+ * vmw_binding_scrub_dx_shader - scrub a dx shader binding from a context.
+ *
+ * @bi: single binding information.
+ * @rebind: Whether to issue a bind instead of scrub command.
+ */
+static int vmw_binding_scrub_dx_shader(struct vmw_ctx_bindinfo *bi, bool rebind)
+{
+ struct vmw_ctx_bindinfo_shader *binding =
+ container_of(bi, typeof(*binding), bi);
+ struct vmw_private *dev_priv = bi->ctx->dev_priv;
+ struct {
+ SVGA3dCmdHeader header;
+ SVGA3dCmdDXSetShader body;
+ } *cmd;
+
+ cmd = vmw_fifo_reserve_dx(dev_priv, sizeof(*cmd), bi->ctx->id);
+ if (unlikely(cmd == NULL)) {
+ DRM_ERROR("Failed reserving FIFO space for DX shader "
+ "unbinding.\n");
+ return -ENOMEM;
+ }
+ cmd->header.id = SVGA_3D_CMD_DX_SET_SHADER;
+ cmd->header.size = sizeof(cmd->body);
+ cmd->body.type = binding->shader_slot + SVGA3D_SHADERTYPE_MIN;
+ cmd->body.shaderId = ((rebind) ? bi->res->id : SVGA3D_INVALID_ID);
+ vmw_fifo_commit(dev_priv, sizeof(*cmd));
+
+ return 0;
+}
+
+/**
+ * vmw_binding_scrub_cb - scrub a constant buffer binding from a context.
+ *
+ * @bi: single binding information.
+ * @rebind: Whether to issue a bind instead of scrub command.
+ */
+static int vmw_binding_scrub_cb(struct vmw_ctx_bindinfo *bi, bool rebind)
+{
+ struct vmw_ctx_bindinfo_cb *binding =
+ container_of(bi, typeof(*binding), bi);
+ struct vmw_private *dev_priv = bi->ctx->dev_priv;
+ struct {
+ SVGA3dCmdHeader header;
+ SVGA3dCmdDXSetSingleConstantBuffer body;
+ } *cmd;
+
+ cmd = vmw_fifo_reserve_dx(dev_priv, sizeof(*cmd), bi->ctx->id);
+ if (unlikely(cmd == NULL)) {
+ DRM_ERROR("Failed reserving FIFO space for DX constant buffer "
+ "unbinding.\n");
+ return -ENOMEM;
+ }
+
+ cmd->header.id = SVGA_3D_CMD_DX_SET_SINGLE_CONSTANT_BUFFER;
+ cmd->header.size = sizeof(cmd->body);
+ cmd->body.slot = binding->slot;
+ cmd->body.type = binding->shader_slot + SVGA3D_SHADERTYPE_MIN;
+ if (rebind) {
+ cmd->body.offsetInBytes = binding->offset;
+ cmd->body.sizeInBytes = binding->size;
+ cmd->body.sid = bi->res->id;
+ } else {
+ cmd->body.offsetInBytes = 0;
+ cmd->body.sizeInBytes = 0;
+ cmd->body.sid = SVGA3D_INVALID_ID;
+ }
+ vmw_fifo_commit(dev_priv, sizeof(*cmd));
+
+ return 0;
+}
+
+/**
+ * vmw_collect_view_ids - Build view id data for a view binding command
+ * without checking which bindings actually need to be emitted
+ *
+ * @cbs: Pointer to the context's struct vmw_ctx_binding_state
+ * @bi: Pointer to where the binding info array is stored in @cbs
+ * @max_num: Maximum number of entries in the @bi array.
+ *
+ * Scans the @bi array for bindings and builds a buffer of view id data.
+ * Stops at the first non-existing binding in the @bi array.
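
The "stop at the first hole" contract matters: the resulting command always describes a dense prefix of the slot array, so bound slots after an unbound one are deliberately not emitted. A runnable illustration of that contract, with toy types rather than the driver's:

    #include <stdint.h>
    #include <stdio.h>

    #define INVALID_ID 0xffffffffu  /* stand-in for SVGA3D_INVALID_ID */

    struct view { int bound; int scrubbed; uint32_t id; };

    /* Mirrors vmw_collect_view_ids(): densely packed ids, stop at first hole. */
    static unsigned int collect(const struct view *v, unsigned int max,
                                uint32_t *out)
    {
            unsigned int n = 0;

            for (unsigned int i = 0; i < max; ++i) {
                    if (!v[i].bound)
                            break;  /* first unbound slot ends the command */
                    out[n++] = v[i].scrubbed ? INVALID_ID : v[i].id;
            }
            return n;
    }

    int main(void)
    {
            struct view v[4] = { {1, 0, 10}, {1, 1, 11}, {0}, {1, 0, 13} };
            uint32_t ids[4];
            unsigned int n = collect(v, 4, ids);

            /* Two entries emitted; the bound slot after the hole is ignored. */
            for (unsigned int i = 0; i < n; ++i)
                    printf("%u: 0x%x\n", i, (unsigned)ids[i]);
            return 0;
    }
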
+ * On output, @cbs->bind_cmd_count contains the number of bindings to be + * emitted, @cbs->bind_first_slot is set to zero, and @cbs->bind_cmd_buffer + * contains the command data. + */ +static void vmw_collect_view_ids(struct vmw_ctx_binding_state *cbs, + const struct vmw_ctx_bindinfo *bi, + u32 max_num) +{ + const struct vmw_ctx_bindinfo_view *biv = + container_of(bi, struct vmw_ctx_bindinfo_view, bi); + unsigned long i; + + cbs->bind_cmd_count = 0; + cbs->bind_first_slot = 0; + + for (i = 0; i < max_num; ++i, ++biv) { + if (!biv->bi.ctx) + break; + + cbs->bind_cmd_buffer[cbs->bind_cmd_count++] = + ((biv->bi.scrubbed) ? + SVGA3D_INVALID_ID : biv->bi.res->id); + } +} + +/** + * vmw_collect_dirty_view_ids - Build view id data for a view binding command + * + * @cbs: Pointer to the context's struct vmw_ctx_binding_state + * @bi: Pointer to where the binding info array is stored in @cbs + * @dirty: Bitmap indicating which bindings need to be emitted. + * @max_num: Maximum number of entries in the @bi array. + * + * Scans the @bi array for bindings that need to be emitted and + * builds a buffer of view id data. + * On output, @cbs->bind_cmd_count contains the number of bindings to be + * emitted, @cbs->bind_first_slot indicates the index of the first emitted + * binding, and @cbs->bind_cmd_buffer contains the command data. + */ +static void vmw_collect_dirty_view_ids(struct vmw_ctx_binding_state *cbs, + const struct vmw_ctx_bindinfo *bi, + unsigned long *dirty, + u32 max_num) +{ + const struct vmw_ctx_bindinfo_view *biv = + container_of(bi, struct vmw_ctx_bindinfo_view, bi); + unsigned long i, next_bit; + + cbs->bind_cmd_count = 0; + i = find_first_bit(dirty, max_num); + next_bit = i; + cbs->bind_first_slot = i; + + biv += i; + for (; i < max_num; ++i, ++biv) { + cbs->bind_cmd_buffer[cbs->bind_cmd_count++] = + ((!biv->bi.ctx || biv->bi.scrubbed) ? 
+ SVGA3D_INVALID_ID : biv->bi.res->id);
+
+ if (next_bit == i) {
+ next_bit = find_next_bit(dirty, max_num, i + 1);
+ if (next_bit >= max_num)
+ break;
+ }
+ }
+}
+
+/**
+ * vmw_emit_set_sr - Issue delayed DX shader resource binding commands
+ *
+ * @cbs: Pointer to the context's struct vmw_ctx_binding_state
+ * @shader_slot: The shader slot of the bindings to emit.
+ */
+static int vmw_emit_set_sr(struct vmw_ctx_binding_state *cbs,
+ int shader_slot)
+{
+ const struct vmw_ctx_bindinfo *loc =
+ &cbs->per_shader[shader_slot].shader_res[0].bi;
+ struct {
+ SVGA3dCmdHeader header;
+ SVGA3dCmdDXSetShaderResources body;
+ } *cmd;
+ size_t cmd_size, view_id_size;
+ const struct vmw_resource *ctx = vmw_cbs_context(cbs);
+
+ vmw_collect_dirty_view_ids(cbs, loc,
+ cbs->per_shader[shader_slot].dirty_sr,
+ SVGA3D_DX_MAX_SRVIEWS);
+ if (cbs->bind_cmd_count == 0)
+ return 0;
+
+ view_id_size = cbs->bind_cmd_count*sizeof(uint32);
+ cmd_size = sizeof(*cmd) + view_id_size;
+ cmd = vmw_fifo_reserve_dx(ctx->dev_priv, cmd_size, ctx->id);
+ if (unlikely(cmd == NULL)) {
+ DRM_ERROR("Failed reserving FIFO space for DX shader"
+ " resource binding.\n");
+ return -ENOMEM;
+ }
+
+ cmd->header.id = SVGA_3D_CMD_DX_SET_SHADER_RESOURCES;
+ cmd->header.size = sizeof(cmd->body) + view_id_size;
+ cmd->body.type = shader_slot + SVGA3D_SHADERTYPE_MIN;
+ cmd->body.startView = cbs->bind_first_slot;
+
+ memcpy(&cmd[1], cbs->bind_cmd_buffer, view_id_size);
+
+ vmw_fifo_commit(ctx->dev_priv, cmd_size);
+ bitmap_clear(cbs->per_shader[shader_slot].dirty_sr,
+ cbs->bind_first_slot, cbs->bind_cmd_count);
+
+ return 0;
+}
+
+/**
+ * vmw_emit_set_rt - Issue delayed DX rendertarget binding commands
+ *
+ * @cbs: Pointer to the context's struct vmw_ctx_binding_state
+ */
+static int vmw_emit_set_rt(struct vmw_ctx_binding_state *cbs)
+{
+ const struct vmw_ctx_bindinfo *loc = &cbs->render_targets[0].bi;
+ struct {
+ SVGA3dCmdHeader header;
+ SVGA3dCmdDXSetRenderTargets body;
+ } *cmd;
+ size_t cmd_size, view_id_size;
+ const struct vmw_resource *ctx = vmw_cbs_context(cbs);
+
+ vmw_collect_view_ids(cbs, loc, SVGA3D_MAX_SIMULTANEOUS_RENDER_TARGETS);
+ view_id_size = cbs->bind_cmd_count*sizeof(uint32);
+ cmd_size = sizeof(*cmd) + view_id_size;
+ cmd = vmw_fifo_reserve_dx(ctx->dev_priv, cmd_size, ctx->id);
+ if (unlikely(cmd == NULL)) {
+ DRM_ERROR("Failed reserving FIFO space for DX render-target"
+ " binding.\n");
+ return -ENOMEM;
+ }
+
+ cmd->header.id = SVGA_3D_CMD_DX_SET_RENDERTARGETS;
+ cmd->header.size = sizeof(cmd->body) + view_id_size;
+
+ if (cbs->ds_view.bi.ctx && !cbs->ds_view.bi.scrubbed)
+ cmd->body.depthStencilViewId = cbs->ds_view.bi.res->id;
+ else
+ cmd->body.depthStencilViewId = SVGA3D_INVALID_ID;
+
+ memcpy(&cmd[1], cbs->bind_cmd_buffer, view_id_size);
+
+ vmw_fifo_commit(ctx->dev_priv, cmd_size);
+
+ return 0;
+}
+
+/**
+ * vmw_collect_so_targets - Build SVGA3dSoTarget data for a binding command
+ * without checking which bindings actually need to be emitted
+ *
+ * @cbs: Pointer to the context's struct vmw_ctx_binding_state
+ * @bi: Pointer to where the binding info array is stored in @cbs
+ * @max_num: Maximum number of entries in the @bi array.
+ *
+ * Scans the @bi array for bindings and builds a buffer of SVGA3dSoTarget data.
+ * Stops at the first non-existing binding in the @bi array.
+ * On output, @cbs->bind_cmd_count contains the number of bindings to be
+ * emitted, @cbs->bind_first_slot is set to zero, and @cbs->bind_cmd_buffer
+ * contains the command data.
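
The dirty variant emits one contiguous range per command: from the first dirty slot through the last dirty slot, re-sending any clean slots caught in between, which keeps the command count low at the cost of a few redundant entries. The sketch below reproduces that range computation; GCC-style bit builtins stand in for the kernel's find_first_bit()/find_next_bit(), and the toy handles at most 32 slots:

    #include <stdint.h>
    #include <stdio.h>

    /* Emit range = [first dirty slot, last dirty slot]; clean slots inside
     * the range are simply re-sent with their current id. */
    static unsigned int collect_dirty(const uint32_t *ids, uint32_t dirty,
                                      unsigned int max, uint32_t *out,
                                      unsigned int *first)
    {
            unsigned int n = 0, last;

            if (!dirty)
                    return 0;
            *first = (unsigned int)__builtin_ctz(dirty); /* find_first_bit() */
            last = 31u - (unsigned int)__builtin_clz(dirty);
            for (unsigned int i = *first; i <= last && i < max; ++i)
                    out[n++] = ids[i];
            return n;
    }

    int main(void)
    {
            uint32_t ids[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
            uint32_t out[8];
            unsigned int first;
            unsigned int n = collect_dirty(ids, 0x12, 8, out, &first);

            /* Slots 1 and 4 dirty: emit slots 1..4 as one range. */
            printf("startView=%u count=%u\n", first, n);
            return 0;
    }
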
+ */
+static void vmw_collect_so_targets(struct vmw_ctx_binding_state *cbs,
+ const struct vmw_ctx_bindinfo *bi,
+ u32 max_num)
+{
+ const struct vmw_ctx_bindinfo_so *biso =
+ container_of(bi, struct vmw_ctx_bindinfo_so, bi);
+ unsigned long i;
+ SVGA3dSoTarget *so_buffer = (SVGA3dSoTarget *) cbs->bind_cmd_buffer;
+
+ cbs->bind_cmd_count = 0;
+ cbs->bind_first_slot = 0;
+
+ for (i = 0; i < max_num; ++i, ++biso, ++so_buffer,
+ ++cbs->bind_cmd_count) {
+ if (!biso->bi.ctx)
+ break;
+
+ if (!biso->bi.scrubbed) {
+ so_buffer->sid = biso->bi.res->id;
+ so_buffer->offset = biso->offset;
+ so_buffer->sizeInBytes = biso->size;
+ } else {
+ so_buffer->sid = SVGA3D_INVALID_ID;
+ so_buffer->offset = 0;
+ so_buffer->sizeInBytes = 0;
+ }
+ }
+}
+
+/**
+ * vmw_emit_set_so - Issue delayed streamout binding commands
+ *
+ * @cbs: Pointer to the context's struct vmw_ctx_binding_state
+ */
+static int vmw_emit_set_so(struct vmw_ctx_binding_state *cbs)
+{
+ const struct vmw_ctx_bindinfo *loc = &cbs->so_targets[0].bi;
+ struct {
+ SVGA3dCmdHeader header;
+ SVGA3dCmdDXSetSOTargets body;
+ } *cmd;
+ size_t cmd_size, so_target_size;
+ const struct vmw_resource *ctx = vmw_cbs_context(cbs);
+
+ vmw_collect_so_targets(cbs, loc, SVGA3D_DX_MAX_SOTARGETS);
+ if (cbs->bind_cmd_count == 0)
+ return 0;
+
+ so_target_size = cbs->bind_cmd_count*sizeof(SVGA3dSoTarget);
+ cmd_size = sizeof(*cmd) + so_target_size;
+ cmd = vmw_fifo_reserve_dx(ctx->dev_priv, cmd_size, ctx->id);
+ if (unlikely(cmd == NULL)) {
+ DRM_ERROR("Failed reserving FIFO space for DX SO target"
+ " binding.\n");
+ return -ENOMEM;
+ }
+
+ cmd->header.id = SVGA_3D_CMD_DX_SET_SOTARGETS;
+ cmd->header.size = sizeof(cmd->body) + so_target_size;
+ memcpy(&cmd[1], cbs->bind_cmd_buffer, so_target_size);
+
+ vmw_fifo_commit(ctx->dev_priv, cmd_size);
+
+ return 0;
+}
+
+/**
+ * vmw_binding_emit_dirty_ps - Issue delayed per shader binding commands
+ *
+ * @cbs: Pointer to the context's struct vmw_ctx_binding_state
+ */
+static int vmw_binding_emit_dirty_ps(struct vmw_ctx_binding_state *cbs)
+{
+ struct vmw_dx_shader_bindings *sb = &cbs->per_shader[0];
+ u32 i;
+ int ret = 0;
+
+ for (i = 0; i < SVGA3D_NUM_SHADERTYPE_DX10; ++i, ++sb) {
+ if (!test_bit(VMW_BINDING_PS_SR_BIT, &sb->dirty))
+ continue;
+
+ ret = vmw_emit_set_sr(cbs, i);
+ if (ret)
+ break;
+
+ __clear_bit(VMW_BINDING_PS_SR_BIT, &sb->dirty);
+ }
+
+ return ret;
+}
+
+/**
+ * vmw_collect_dirty_vbs - Build SVGA3dVertexBuffer data for a
+ * SVGA3dCmdDXSetVertexBuffers command
+ *
+ * @cbs: Pointer to the context's struct vmw_ctx_binding_state
+ * @bi: Pointer to where the binding info array is stored in @cbs
+ * @dirty: Bitmap indicating which bindings need to be emitted.
+ * @max_num: Maximum number of entries in the @bi array.
+ *
+ * Scans the @bi array for bindings that need to be emitted and
+ * builds a buffer of SVGA3dVertexBuffer data.
+ * On output, @cbs->bind_cmd_count contains the number of bindings to be
+ * emitted, @cbs->bind_first_slot indicates the index of the first emitted
+ * binding, and @cbs->bind_cmd_buffer contains the command data.
+ */ +static void vmw_collect_dirty_vbs(struct vmw_ctx_binding_state *cbs, + const struct vmw_ctx_bindinfo *bi, + unsigned long *dirty, + u32 max_num) +{ + const struct vmw_ctx_bindinfo_vb *biv = + container_of(bi, struct vmw_ctx_bindinfo_vb, bi); + unsigned long i, next_bit; + SVGA3dVertexBuffer *vbs = (SVGA3dVertexBuffer *) &cbs->bind_cmd_buffer; + + cbs->bind_cmd_count = 0; + i = find_first_bit(dirty, max_num); + next_bit = i; + cbs->bind_first_slot = i; + + biv += i; + for (; i < max_num; ++i, ++biv, ++vbs) { + if (!biv->bi.ctx || biv->bi.scrubbed) { + vbs->sid = SVGA3D_INVALID_ID; + vbs->stride = 0; + vbs->offset = 0; + } else { + vbs->sid = biv->bi.res->id; + vbs->stride = biv->stride; + vbs->offset = biv->offset; + } + cbs->bind_cmd_count++; + if (next_bit == i) { + next_bit = find_next_bit(dirty, max_num, i + 1); + if (next_bit >= max_num) + break; + } + } +} + +/** + * vmw_binding_emit_set_vb - Issue delayed vertex buffer binding commands + * + * @cbs: Pointer to the context's struct vmw_ctx_binding_state + * + */ +static int vmw_emit_set_vb(struct vmw_ctx_binding_state *cbs) +{ + const struct vmw_ctx_bindinfo *loc = + &cbs->vertex_buffers[0].bi; + struct { + SVGA3dCmdHeader header; + SVGA3dCmdDXSetVertexBuffers body; + } *cmd; + size_t cmd_size, set_vb_size; + const struct vmw_resource *ctx = vmw_cbs_context(cbs); + + vmw_collect_dirty_vbs(cbs, loc, cbs->dirty_vb, + SVGA3D_DX_MAX_VERTEXBUFFERS); + if (cbs->bind_cmd_count == 0) + return 0; + + set_vb_size = cbs->bind_cmd_count*sizeof(SVGA3dVertexBuffer); + cmd_size = sizeof(*cmd) + set_vb_size; + cmd = vmw_fifo_reserve_dx(ctx->dev_priv, cmd_size, ctx->id); + if (unlikely(cmd == NULL)) { + DRM_ERROR("Failed reserving FIFO space for DX vertex buffer" + " binding.\n"); + return -ENOMEM; + } + + cmd->header.id = SVGA_3D_CMD_DX_SET_VERTEX_BUFFERS; + cmd->header.size = sizeof(cmd->body) + set_vb_size; + cmd->body.startBuffer = cbs->bind_first_slot; + + memcpy(&cmd[1], cbs->bind_cmd_buffer, set_vb_size); + + vmw_fifo_commit(ctx->dev_priv, cmd_size); + bitmap_clear(cbs->dirty_vb, + cbs->bind_first_slot, cbs->bind_cmd_count); + + return 0; +} + +/** + * vmw_binding_emit_dirty - Issue delayed binding commands + * + * @cbs: Pointer to the context's struct vmw_ctx_binding_state + * + * This function issues the delayed binding commands that arise from + * previous scrub / unscrub calls. These binding commands are typically + * commands that batch a number of bindings and therefore it makes sense + * to delay them. + */ +static int vmw_binding_emit_dirty(struct vmw_ctx_binding_state *cbs) +{ + int ret = 0; + unsigned long hit = 0; + + while ((hit = find_next_bit(&cbs->dirty, VMW_BINDING_NUM_BITS, hit)) + < VMW_BINDING_NUM_BITS) { + + switch (hit) { + case VMW_BINDING_RT_BIT: + ret = vmw_emit_set_rt(cbs); + break; + case VMW_BINDING_PS_BIT: + ret = vmw_binding_emit_dirty_ps(cbs); + break; + case VMW_BINDING_SO_BIT: + ret = vmw_emit_set_so(cbs); + break; + case VMW_BINDING_VB_BIT: + ret = vmw_emit_set_vb(cbs); + break; + default: + BUG(); + } + if (ret) + return ret; + + __clear_bit(hit, &cbs->dirty); + hit++; + } + + return 0; +} + +/** + * vmw_binding_scrub_sr - Schedule a dx shaderresource binding + * scrub from a context + * + * @bi: single binding information. + * @rebind: Whether to issue a bind instead of scrub command. 
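
vmw_binding_emit_dirty() above is a straight dispatch loop over the per-type dirty bits, clearing each bit once the batched command for that type has gone out. Its shape, reduced to a standard-C toy:

    #include <stdio.h>

    enum { RT_BIT, PS_BIT, SO_BIT, VB_BIT, NUM_BITS };

    /* Walk the dirty bits, emit one batched command per binding type,
     * clear each bit once it has been handled. */
    static int emit_dirty(unsigned long *dirty)
    {
            for (int bit = 0; bit < NUM_BITS; ++bit) {
                    if (!(*dirty & (1ul << bit)))
                            continue;
                    printf("emit batch for binding type %d\n", bit);
                    *dirty &= ~(1ul << bit);
            }
            return 0;
    }

    int main(void)
    {
            unsigned long dirty = (1ul << RT_BIT) | (1ul << VB_BIT);

            return emit_dirty(&dirty);
    }
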
+ */ +static int vmw_binding_scrub_sr(struct vmw_ctx_bindinfo *bi, bool rebind) +{ + struct vmw_ctx_bindinfo_view *biv = + container_of(bi, struct vmw_ctx_bindinfo_view, bi); + struct vmw_ctx_binding_state *cbs = + vmw_context_binding_state(bi->ctx); + + __set_bit(biv->slot, cbs->per_shader[biv->shader_slot].dirty_sr); + __set_bit(VMW_BINDING_PS_SR_BIT, + &cbs->per_shader[biv->shader_slot].dirty); + __set_bit(VMW_BINDING_PS_BIT, &cbs->dirty); + + return 0; +} + +/** + * vmw_binding_scrub_dx_rt - Schedule a dx rendertarget binding + * scrub from a context + * + * @bi: single binding information. + * @rebind: Whether to issue a bind instead of scrub command. + */ +static int vmw_binding_scrub_dx_rt(struct vmw_ctx_bindinfo *bi, bool rebind) +{ + struct vmw_ctx_binding_state *cbs = + vmw_context_binding_state(bi->ctx); + + __set_bit(VMW_BINDING_RT_BIT, &cbs->dirty); + + return 0; +} + +/** + * vmw_binding_scrub_so - Schedule a dx streamoutput buffer binding + * scrub from a context + * + * @bi: single binding information. + * @rebind: Whether to issue a bind instead of scrub command. + */ +static int vmw_binding_scrub_so(struct vmw_ctx_bindinfo *bi, bool rebind) +{ + struct vmw_ctx_binding_state *cbs = + vmw_context_binding_state(bi->ctx); + + __set_bit(VMW_BINDING_SO_BIT, &cbs->dirty); + + return 0; +} + +/** + * vmw_binding_scrub_vb - Schedule a dx vertex buffer binding + * scrub from a context + * + * @bi: single binding information. + * @rebind: Whether to issue a bind instead of scrub command. + */ +static int vmw_binding_scrub_vb(struct vmw_ctx_bindinfo *bi, bool rebind) +{ + struct vmw_ctx_bindinfo_vb *bivb = + container_of(bi, struct vmw_ctx_bindinfo_vb, bi); + struct vmw_ctx_binding_state *cbs = + vmw_context_binding_state(bi->ctx); + + __set_bit(bivb->slot, cbs->dirty_vb); + __set_bit(VMW_BINDING_VB_BIT, &cbs->dirty); + + return 0; +} + +/** + * vmw_binding_scrub_ib - scrub a dx index buffer binding from a context + * + * @bi: single binding information. + * @rebind: Whether to issue a bind instead of scrub command. + */ +static int vmw_binding_scrub_ib(struct vmw_ctx_bindinfo *bi, bool rebind) +{ + struct vmw_ctx_bindinfo_ib *binding = + container_of(bi, typeof(*binding), bi); + struct vmw_private *dev_priv = bi->ctx->dev_priv; + struct { + SVGA3dCmdHeader header; + SVGA3dCmdDXSetIndexBuffer body; + } *cmd; + + cmd = vmw_fifo_reserve_dx(dev_priv, sizeof(*cmd), bi->ctx->id); + if (unlikely(cmd == NULL)) { + DRM_ERROR("Failed reserving FIFO space for DX index buffer " + "binding.\n"); + return -ENOMEM; + } + cmd->header.id = SVGA_3D_CMD_DX_SET_INDEX_BUFFER; + cmd->header.size = sizeof(cmd->body); + if (rebind) { + cmd->body.sid = bi->res->id; + cmd->body.format = binding->format; + cmd->body.offset = binding->offset; + } else { + cmd->body.sid = SVGA3D_INVALID_ID; + cmd->body.format = 0; + cmd->body.offset = 0; + } + + vmw_fifo_commit(dev_priv, sizeof(*cmd)); + + return 0; +} + +/** + * vmw_binding_state_alloc - Allocate a struct vmw_ctx_binding_state with + * memory accounting. + * + * @dev_priv: Pointer to a device private structure. + * + * Returns a pointer to a newly allocated struct or an error pointer on error. 
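
vmw_binding_state_alloc() and vmw_binding_state_free() below keep the TTM memory accounting and the actual allocation in lockstep: account first, allocate second, and unwind the accounting if the allocation fails. The same pairing in a self-contained sketch, where a plain counter stands in for the TTM global memory account:

    #include <stdio.h>
    #include <stdlib.h>

    static size_t accounted;  /* stand-in for the TTM global memory account */

    static int account_alloc(size_t sz) { accounted += sz; return 0; }
    static void account_free(size_t sz) { accounted -= sz; }

    struct state { char blob[4096]; };

    /* Account first, then allocate; unwind the accounting on failure. */
    static struct state *state_alloc(void)
    {
            struct state *s;

            if (account_alloc(sizeof(*s)))
                    return NULL;
            s = calloc(1, sizeof(*s));
            if (!s) {
                    account_free(sizeof(*s));
                    return NULL;
            }
            return s;
    }

    static void state_free(struct state *s)
    {
            free(s);
            account_free(sizeof(*s));
    }

    int main(void)
    {
            struct state *s = state_alloc();

            if (s)
                    state_free(s);
            printf("accounted after free: %zu\n", accounted);  /* 0 */
            return 0;
    }
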
+ */
+struct vmw_ctx_binding_state *
+vmw_binding_state_alloc(struct vmw_private *dev_priv)
+{
+ struct vmw_ctx_binding_state *cbs;
+ int ret;
+
+ ret = ttm_mem_global_alloc(vmw_mem_glob(dev_priv), sizeof(*cbs),
+ false, false);
+ if (ret)
+ return ERR_PTR(ret);
+
+ cbs = vzalloc(sizeof(*cbs));
+ if (!cbs) {
+ ttm_mem_global_free(vmw_mem_glob(dev_priv), sizeof(*cbs));
+ return ERR_PTR(-ENOMEM);
+ }
+
+ cbs->dev_priv = dev_priv;
+ INIT_LIST_HEAD(&cbs->list);
+
+ return cbs;
+}
+
+/**
+ * vmw_binding_state_free - Free a struct vmw_ctx_binding_state and its
+ * memory accounting info.
+ *
+ * @cbs: Pointer to the struct vmw_ctx_binding_state to be freed.
+ */
+void vmw_binding_state_free(struct vmw_ctx_binding_state *cbs)
+{
+ struct vmw_private *dev_priv = cbs->dev_priv;
+
+ vfree(cbs);
+ ttm_mem_global_free(vmw_mem_glob(dev_priv), sizeof(*cbs));
+}
+
+/**
+ * vmw_binding_state_list - Get the binding list of a
+ * struct vmw_ctx_binding_state
+ *
+ * @cbs: Pointer to the struct vmw_ctx_binding_state
+ *
+ * Returns the binding list which can be used to traverse through the bindings
+ * and access the resource information of all bindings.
+ */
+struct list_head *vmw_binding_state_list(struct vmw_ctx_binding_state *cbs)
+{
+ return &cbs->list;
+}
+
+/**
+ * vmw_binding_state_reset - clear a struct vmw_ctx_binding_state
+ *
+ * @cbs: Pointer to the struct vmw_ctx_binding_state to be cleared
+ *
+ * Drops all bindings registered in @cbs. No device binding actions are
+ * performed.
+ */
+void vmw_binding_state_reset(struct vmw_ctx_binding_state *cbs)
+{
+ struct vmw_ctx_bindinfo *entry, *next;
+
+ list_for_each_entry_safe(entry, next, &cbs->list, ctx_list)
+ vmw_binding_drop(entry);
+}
+
+/*
+ * This function is unused at run-time, and only used to hold various build
+ * asserts important for code optimization assumptions.
+ */
+static void vmw_binding_build_asserts(void)
+{
+ BUILD_BUG_ON(SVGA3D_NUM_SHADERTYPE_DX10 != 3);
+ BUILD_BUG_ON(SVGA3D_MAX_SIMULTANEOUS_RENDER_TARGETS > SVGA3D_RT_MAX);
+ BUILD_BUG_ON(sizeof(uint32) != sizeof(u32));
+
+ /*
+ * struct vmw_ctx_binding_state::bind_cmd_buffer is used for various
+ * view id arrays.
+ */
+ BUILD_BUG_ON(VMW_MAX_VIEW_BINDINGS < SVGA3D_RT_MAX);
+ BUILD_BUG_ON(VMW_MAX_VIEW_BINDINGS < SVGA3D_DX_MAX_SRVIEWS);
+ BUILD_BUG_ON(VMW_MAX_VIEW_BINDINGS < SVGA3D_DX_MAX_CONSTBUFFERS);
+
+ /*
+ * struct vmw_ctx_binding_state::bind_cmd_buffer is used for
+ * u32 view ids, SVGA3dSoTargets and SVGA3dVertexBuffers
+ */
+ BUILD_BUG_ON(SVGA3D_DX_MAX_SOTARGETS*sizeof(SVGA3dSoTarget) >
+ VMW_MAX_VIEW_BINDINGS*sizeof(u32));
+ BUILD_BUG_ON(SVGA3D_DX_MAX_VERTEXBUFFERS*sizeof(SVGA3dVertexBuffer) >
+ VMW_MAX_VIEW_BINDINGS*sizeof(u32));
+}
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_binding.h b/drivers/gpu/drm/vmwgfx/vmwgfx_binding.h
new file mode 100644
index 000000000000..bf2e77ad5a20
--- /dev/null
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_binding.h
@@ -0,0 +1,209 @@
+/**************************************************************************
+ *
+ * Copyright © 2015 VMware, Inc., Palo Alto, CA., USA
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+#ifndef _VMWGFX_BINDING_H_
+#define _VMWGFX_BINDING_H_
+
+#include "device_include/svga3d_reg.h"
+#include <linux/list.h>
+
+#define VMW_MAX_VIEW_BINDINGS 128
+
+struct vmw_private;
+struct vmw_ctx_binding_state;
+
+/*
+ * enum vmw_ctx_binding_type - abstract resource to context binding types
+ */
+enum vmw_ctx_binding_type {
+ vmw_ctx_binding_shader,
+ vmw_ctx_binding_rt,
+ vmw_ctx_binding_tex,
+ vmw_ctx_binding_cb,
+ vmw_ctx_binding_dx_shader,
+ vmw_ctx_binding_dx_rt,
+ vmw_ctx_binding_sr,
+ vmw_ctx_binding_ds,
+ vmw_ctx_binding_so,
+ vmw_ctx_binding_vb,
+ vmw_ctx_binding_ib,
+ vmw_ctx_binding_max
+};
+
+/**
+ * struct vmw_ctx_bindinfo - single binding metadata
+ *
+ * @ctx_list: List head for the context's list of bindings.
+ * @res_list: List head for a resource's list of bindings.
+ * @ctx: Non-refcounted pointer to the context that owns the binding. NULL
+ * indicates no binding present.
+ * @res: Non-refcounted pointer to the resource the binding points to. This
+ * is typically a surface or a view.
+ * @bt: Binding type.
+ * @scrubbed: Whether the binding has been scrubbed from the context.
+ */
+struct vmw_ctx_bindinfo {
+ struct list_head ctx_list;
+ struct list_head res_list;
+ struct vmw_resource *ctx;
+ struct vmw_resource *res;
+ enum vmw_ctx_binding_type bt;
+ bool scrubbed;
+};
+
+/**
+ * struct vmw_ctx_bindinfo_tex - texture stage binding metadata
+ *
+ * @bi: struct vmw_ctx_bindinfo we derive from.
+ * @texture_stage: Device data used to reconstruct binding command.
+ */
+struct vmw_ctx_bindinfo_tex {
+ struct vmw_ctx_bindinfo bi;
+ uint32 texture_stage;
+};
+
+/**
+ * struct vmw_ctx_bindinfo_shader - Shader binding metadata
+ *
+ * @bi: struct vmw_ctx_bindinfo we derive from.
+ * @shader_slot: Device data used to reconstruct binding command.
+ */
+struct vmw_ctx_bindinfo_shader {
+ struct vmw_ctx_bindinfo bi;
+ SVGA3dShaderType shader_slot;
+};
+
+/**
+ * struct vmw_ctx_bindinfo_cb - Constant buffer binding metadata
+ *
+ * @bi: struct vmw_ctx_bindinfo we derive from.
+ * @shader_slot: Device data used to reconstruct binding command.
+ * @offset: Device data used to reconstruct binding command.
+ * @size: Device data used to reconstruct binding command.
+ * @slot: Device data used to reconstruct binding command.
+ */
+struct vmw_ctx_bindinfo_cb {
+ struct vmw_ctx_bindinfo bi;
+ SVGA3dShaderType shader_slot;
+ uint32 offset;
+ uint32 size;
+ uint32 slot;
+};
+
+/**
+ * struct vmw_ctx_bindinfo_view - View binding metadata
+ *
+ * @bi: struct vmw_ctx_bindinfo we derive from.
+ * @shader_slot: Device data used to reconstruct binding command.
+ * @slot: Device data used to reconstruct binding command.
+ */
+struct vmw_ctx_bindinfo_view {
+ struct vmw_ctx_bindinfo bi;
+ SVGA3dShaderType shader_slot;
+ uint32 slot;
+};
+
+/**
+ * struct vmw_ctx_bindinfo_so - StreamOutput binding metadata
+ *
+ * @bi: struct vmw_ctx_bindinfo we derive from.
+ * @offset: Device data used to reconstruct binding command.
+ * @size: Device data used to reconstruct binding command.
+ * @slot: Device data used to reconstruct binding command.
+ */
+struct vmw_ctx_bindinfo_so {
+ struct vmw_ctx_bindinfo bi;
+ uint32 offset;
+ uint32 size;
+ uint32 slot;
+};
+
+/**
+ * struct vmw_ctx_bindinfo_vb - Vertex buffer binding metadata
+ *
+ * @bi: struct vmw_ctx_bindinfo we derive from.
+ * @offset: Device data used to reconstruct binding command.
+ * @stride: Device data used to reconstruct binding command.
+ * @slot: Device data used to reconstruct binding command.
+ */
+struct vmw_ctx_bindinfo_vb {
+ struct vmw_ctx_bindinfo bi;
+ uint32 offset;
+ uint32 stride;
+ uint32 slot;
+};
+
+/**
+ * struct vmw_ctx_bindinfo_ib - Index buffer binding metadata
+ *
+ * @bi: struct vmw_ctx_bindinfo we derive from.
+ * @offset: Device data used to reconstruct binding command.
+ * @format: Device data used to reconstruct binding command.
+ */
+struct vmw_ctx_bindinfo_ib {
+ struct vmw_ctx_bindinfo bi;
+ uint32 offset;
+ uint32 format;
+};
+
+/**
+ * struct vmw_dx_shader_bindings - per shader type context binding state
+ *
+ * @shader: The shader binding for this shader type.
+ * @const_buffers: Const buffer bindings for this shader type.
+ * @shader_res: Shader resource view bindings for this shader type.
+ * @dirty_sr: Bitmap tracking individual shader resource bindings changes
+ * that have not yet been emitted to the device.
+ * @dirty: Bitmap tracking per-binding type binding changes that have not
+ * yet been emitted to the device.
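
All of the per-type structs above embed struct vmw_ctx_bindinfo as their member @bi, and the scrub functions recover the derived type with container_of(). The idiom in isolation, with a userspace macro definition included for completeness and hypothetical type names:

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    /* Generic tracking data shared by all binding types. */
    struct bindinfo { int type; };

    /* A derived binding embeds the generic part as member @bi. */
    struct bindinfo_vb {
            struct bindinfo bi;
            unsigned int offset, stride, slot;
    };

    /* Scrub callbacks get the generic pointer and recover their type. */
    static void scrub_vb(struct bindinfo *bi)
    {
            struct bindinfo_vb *vb = container_of(bi, struct bindinfo_vb, bi);

            printf("vb slot %u, stride %u\n", vb->slot, vb->stride);
    }

    int main(void)
    {
            struct bindinfo_vb vb = { .bi = { .type = 1 }, .stride = 16, .slot = 3 };

            scrub_vb(&vb.bi);
            return 0;
    }
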
+ */ +struct vmw_dx_shader_bindings { + struct vmw_ctx_bindinfo_shader shader; + struct vmw_ctx_bindinfo_cb const_buffers[SVGA3D_DX_MAX_CONSTBUFFERS]; + struct vmw_ctx_bindinfo_view shader_res[SVGA3D_DX_MAX_SRVIEWS]; + DECLARE_BITMAP(dirty_sr, SVGA3D_DX_MAX_SRVIEWS); + unsigned long dirty; +}; + +extern void vmw_binding_add(struct vmw_ctx_binding_state *cbs, + const struct vmw_ctx_bindinfo *ci, + u32 shader_slot, u32 slot); +extern void +vmw_binding_state_commit(struct vmw_ctx_binding_state *to, + struct vmw_ctx_binding_state *from); +extern void vmw_binding_res_list_kill(struct list_head *head); +extern void vmw_binding_res_list_scrub(struct list_head *head); +extern int vmw_binding_rebind_all(struct vmw_ctx_binding_state *cbs); +extern void vmw_binding_state_kill(struct vmw_ctx_binding_state *cbs); +extern void vmw_binding_state_scrub(struct vmw_ctx_binding_state *cbs); +extern struct vmw_ctx_binding_state * +vmw_binding_state_alloc(struct vmw_private *dev_priv); +extern void vmw_binding_state_free(struct vmw_ctx_binding_state *cbs); +extern struct list_head * +vmw_binding_state_list(struct vmw_ctx_binding_state *cbs); +extern void vmw_binding_state_reset(struct vmw_ctx_binding_state *cbs); + +#endif diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf.c b/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf.c index 04fa8526b55e..5ae8f921da2a 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf.c @@ -916,9 +916,8 @@ static void *vmw_cmdbuf_reserve_cur(struct vmw_cmdbuf_man *man, cur = man->cur; if (cur && (size + man->cur_pos > cur->size || - (ctx_id != SVGA3D_INVALID_ID && - (cur->cb_header->flags & SVGA_CB_FLAG_DX_CONTEXT) && - ctx_id != cur->cb_header->dxContext))) + ((cur->cb_header->flags & SVGA_CB_FLAG_DX_CONTEXT) && + ctx_id != cur->cb_header->dxContext))) __vmw_cmdbuf_cur_flush(man); if (!man->cur) { diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf_res.c b/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf_res.c index 21e9b7f8dad0..59d965f8b530 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf_res.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf_res.c @@ -26,15 +26,10 @@ **************************************************************************/ #include "vmwgfx_drv.h" +#include "vmwgfx_resource_priv.h" #define VMW_CMDBUF_RES_MAN_HT_ORDER 12 -enum vmw_cmdbuf_res_state { - VMW_CMDBUF_RES_COMMITED, - VMW_CMDBUF_RES_ADD, - VMW_CMDBUF_RES_DEL -}; - /** * struct vmw_cmdbuf_res - Command buffer managed resource entry. * @@ -132,9 +127,12 @@ void vmw_cmdbuf_res_commit(struct list_head *list) list_for_each_entry_safe(entry, next, list, head) { list_del(&entry->head); + if (entry->res->func->commit_notify) + entry->res->func->commit_notify(entry->res, + entry->state); switch (entry->state) { case VMW_CMDBUF_RES_ADD: - entry->state = VMW_CMDBUF_RES_COMMITED; + entry->state = VMW_CMDBUF_RES_COMMITTED; list_add_tail(&entry->head, &entry->man->list); break; case VMW_CMDBUF_RES_DEL: @@ -175,7 +173,7 @@ void vmw_cmdbuf_res_revert(struct list_head *list) &entry->hash); list_del(&entry->head); list_add_tail(&entry->head, &entry->man->list); - entry->state = VMW_CMDBUF_RES_COMMITED; + entry->state = VMW_CMDBUF_RES_COMMITTED; break; default: BUG(); @@ -231,6 +229,9 @@ out_invalid_key: * @res_type: The resource type. * @user_key: The user-space id of the resource. * @list: The staging list. + * @res_p: If the resource is in an already committed state, points to the + * struct vmw_resource on successful return. The pointer will be + * non ref-counted. 
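
The staged command-buffer resources touched by the hunks above follow a small state machine: an entry staged as ADD becomes COMMITTED on commit and disappears on revert, while an entry staged as DEL is freed on commit and returns to COMMITTED on revert. Sketched as plain transitions; the real code additionally moves entries between lists, maintains the hash table, and calls the new commit_notify hook:

    #include <stdio.h>

    /* Staging states, as in enum vmw_cmdbuf_res_state. */
    enum res_state { RES_ADD, RES_COMMITTED, RES_DEL };

    /* Commit: additions become visible; a DEL entry is freed for real. */
    static enum res_state commit(enum res_state s)
    {
            return (s == RES_ADD) ? RES_COMMITTED : s;
    }

    /* Revert: deletions are undone, additions are thrown away. */
    static enum res_state revert(enum res_state s)
    {
            return (s == RES_DEL) ? RES_COMMITTED : s;
    }

    int main(void)
    {
            printf("commit(ADD) -> %d, revert(DEL) -> %d\n",
                   commit(RES_ADD), revert(RES_DEL));
            return 0;
    }
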
* * This function looks up the struct vmw_cmdbuf_res entry from the manager * hash table and, if it exists, removes it. Depending on its current staging @@ -240,7 +241,8 @@ out_invalid_key: int vmw_cmdbuf_res_remove(struct vmw_cmdbuf_res_manager *man, enum vmw_cmdbuf_res_type res_type, u32 user_key, - struct list_head *list) + struct list_head *list, + struct vmw_resource **res_p) { struct vmw_cmdbuf_res *entry; struct drm_hash_item *hash; @@ -256,12 +258,14 @@ int vmw_cmdbuf_res_remove(struct vmw_cmdbuf_res_manager *man, switch (entry->state) { case VMW_CMDBUF_RES_ADD: vmw_cmdbuf_res_free(man, entry); + *res_p = NULL; break; - case VMW_CMDBUF_RES_COMMITED: + case VMW_CMDBUF_RES_COMMITTED: (void) drm_ht_remove_item(&man->resources, &entry->hash); list_del(&entry->head); entry->state = VMW_CMDBUF_RES_DEL; list_add_tail(&entry->head, list); + *res_p = entry->res; break; default: BUG(); diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_context.c b/drivers/gpu/drm/vmwgfx/vmwgfx_context.c index 15f954423e7c..abfe67c893c7 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_context.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_context.c @@ -27,19 +27,18 @@ #include "vmwgfx_drv.h" #include "vmwgfx_resource_priv.h" +#include "vmwgfx_binding.h" #include "ttm/ttm_placement.h" struct vmw_user_context { struct ttm_base_object base; struct vmw_resource res; - struct vmw_ctx_binding_state cbs; + struct vmw_ctx_binding_state *cbs; struct vmw_cmdbuf_res_manager *man; + struct vmw_resource *cotables[SVGA_COTABLE_DX10_MAX]; + spinlock_t cotable_lock; }; - - -typedef int (*vmw_scrub_func)(struct vmw_ctx_bindinfo *, bool); - static void vmw_user_context_free(struct vmw_resource *res); static struct vmw_resource * vmw_user_context_base_to_res(struct ttm_base_object *base); @@ -51,12 +50,14 @@ static int vmw_gb_context_unbind(struct vmw_resource *res, bool readback, struct ttm_validate_buffer *val_buf); static int vmw_gb_context_destroy(struct vmw_resource *res); -static int vmw_context_scrub_shader(struct vmw_ctx_bindinfo *bi, bool rebind); -static int vmw_context_scrub_render_target(struct vmw_ctx_bindinfo *bi, - bool rebind); -static int vmw_context_scrub_texture(struct vmw_ctx_bindinfo *bi, bool rebind); -static void vmw_context_binding_state_scrub(struct vmw_ctx_binding_state *cbs); -static void vmw_context_binding_state_kill(struct vmw_ctx_binding_state *cbs); +static int vmw_dx_context_create(struct vmw_resource *res); +static int vmw_dx_context_bind(struct vmw_resource *res, + struct ttm_validate_buffer *val_buf); +static int vmw_dx_context_unbind(struct vmw_resource *res, + bool readback, + struct ttm_validate_buffer *val_buf); +static int vmw_dx_context_destroy(struct vmw_resource *res); + static uint64_t vmw_user_context_size; static const struct vmw_user_resource_conv user_context_conv = { @@ -93,15 +94,36 @@ static const struct vmw_res_func vmw_gb_context_func = { .unbind = vmw_gb_context_unbind }; -static const vmw_scrub_func vmw_scrub_funcs[vmw_ctx_binding_max] = { - [vmw_ctx_binding_shader] = vmw_context_scrub_shader, - [vmw_ctx_binding_rt] = vmw_context_scrub_render_target, - [vmw_ctx_binding_tex] = vmw_context_scrub_texture }; +static const struct vmw_res_func vmw_dx_context_func = { + .res_type = vmw_res_dx_context, + .needs_backup = true, + .may_evict = true, + .type_name = "dx contexts", + .backup_placement = &vmw_mob_placement, + .create = vmw_dx_context_create, + .destroy = vmw_dx_context_destroy, + .bind = vmw_dx_context_bind, + .unbind = vmw_dx_context_unbind +}; /** * Context management: */ +static void 
vmw_context_cotables_unref(struct vmw_user_context *uctx) +{ + struct vmw_resource *res; + int i; + + for (i = 0; i < SVGA_COTABLE_DX10_MAX; ++i) { + spin_lock(&uctx->cotable_lock); + res = uctx->cotables[i]; + uctx->cotables[i] = NULL; + spin_unlock(&uctx->cotable_lock); + vmw_resource_unreference(&res); + } +} + static void vmw_hw_context_destroy(struct vmw_resource *res) { struct vmw_user_context *uctx = @@ -113,17 +135,19 @@ static void vmw_hw_context_destroy(struct vmw_resource *res) } *cmd; - if (res->func->destroy == vmw_gb_context_destroy) { + if (res->func->destroy == vmw_gb_context_destroy || + res->func->destroy == vmw_dx_context_destroy) { mutex_lock(&dev_priv->cmdbuf_mutex); vmw_cmdbuf_res_man_destroy(uctx->man); mutex_lock(&dev_priv->binding_mutex); - (void) vmw_context_binding_state_kill(&uctx->cbs); - (void) vmw_gb_context_destroy(res); + vmw_binding_state_kill(uctx->cbs); + (void) res->func->destroy(res); mutex_unlock(&dev_priv->binding_mutex); if (dev_priv->pinned_bo != NULL && !dev_priv->query_cid_valid) __vmw_execbuf_release_pinned_bo(dev_priv, NULL); mutex_unlock(&dev_priv->cmdbuf_mutex); + vmw_context_cotables_unref(uctx); return; } @@ -144,16 +168,20 @@ static void vmw_hw_context_destroy(struct vmw_resource *res) } static int vmw_gb_context_init(struct vmw_private *dev_priv, + bool dx, struct vmw_resource *res, - void (*res_free) (struct vmw_resource *res)) + void (*res_free)(struct vmw_resource *res)) { - int ret; + int ret, i; struct vmw_user_context *uctx = container_of(res, struct vmw_user_context, res); + res->backup_size = (dx ? sizeof(SVGADXContextMobFormat) : + SVGA3D_CONTEXT_DATA_SIZE); ret = vmw_resource_init(dev_priv, res, true, - res_free, &vmw_gb_context_func); - res->backup_size = SVGA3D_CONTEXT_DATA_SIZE; + res_free, + dx ? &vmw_dx_context_func : + &vmw_gb_context_func); if (unlikely(ret != 0)) goto out_err; @@ -166,12 +194,32 @@ static int vmw_gb_context_init(struct vmw_private *dev_priv, } } - memset(&uctx->cbs, 0, sizeof(uctx->cbs)); - INIT_LIST_HEAD(&uctx->cbs.list); + uctx->cbs = vmw_binding_state_alloc(dev_priv); + if (IS_ERR(uctx->cbs)) { + ret = PTR_ERR(uctx->cbs); + goto out_err; + } + + spin_lock_init(&uctx->cotable_lock); + + if (dx) { + for (i = 0; i < SVGA_COTABLE_DX10_MAX; ++i) { + uctx->cotables[i] = vmw_cotable_alloc(dev_priv, + &uctx->res, i); + if (unlikely(uctx->cotables[i] == NULL)) { + ret = -ENOMEM; + goto out_cotables; + } + } + } + + vmw_resource_activate(res, vmw_hw_context_destroy); return 0; +out_cotables: + vmw_context_cotables_unref(uctx); out_err: if (res_free) res_free(res); @@ -182,7 +230,8 @@ out_err: static int vmw_context_init(struct vmw_private *dev_priv, struct vmw_resource *res, - void (*res_free) (struct vmw_resource *res)) + void (*res_free)(struct vmw_resource *res), + bool dx) { int ret; @@ -192,7 +241,7 @@ static int vmw_context_init(struct vmw_private *dev_priv, } *cmd; if (dev_priv->has_mob) - return vmw_gb_context_init(dev_priv, res, res_free); + return vmw_gb_context_init(dev_priv, dx, res, res_free); ret = vmw_resource_init(dev_priv, res, false, res_free, &vmw_legacy_context_func); @@ -232,19 +281,10 @@ out_early: return ret; } -struct vmw_resource *vmw_context_alloc(struct vmw_private *dev_priv) -{ - struct vmw_resource *res = kmalloc(sizeof(*res), GFP_KERNEL); - int ret; - - if (unlikely(res == NULL)) - return NULL; - - ret = vmw_context_init(dev_priv, res, NULL); - - return (ret == 0) ? res : NULL; -} +/* + * GB context. 
+ */ static int vmw_gb_context_create(struct vmw_resource *res) { @@ -309,7 +349,6 @@ static int vmw_gb_context_bind(struct vmw_resource *res, "binding.\n"); return -ENOMEM; } - cmd->header.id = SVGA_3D_CMD_BIND_GB_CONTEXT; cmd->header.size = sizeof(cmd->body); cmd->body.cid = res->id; @@ -346,7 +385,7 @@ static int vmw_gb_context_unbind(struct vmw_resource *res, BUG_ON(bo->mem.mem_type != VMW_PL_MOB); mutex_lock(&dev_priv->binding_mutex); - vmw_context_binding_state_scrub(&uctx->cbs); + vmw_binding_state_scrub(uctx->cbs); submit_size = sizeof(*cmd2) + (readback ? sizeof(*cmd1) : 0); @@ -419,6 +458,221 @@ static int vmw_gb_context_destroy(struct vmw_resource *res) return 0; } +/* + * DX context. + */ + +static int vmw_dx_context_create(struct vmw_resource *res) +{ + struct vmw_private *dev_priv = res->dev_priv; + int ret; + struct { + SVGA3dCmdHeader header; + SVGA3dCmdDXDefineContext body; + } *cmd; + + if (likely(res->id != -1)) + return 0; + + ret = vmw_resource_alloc_id(res); + if (unlikely(ret != 0)) { + DRM_ERROR("Failed to allocate a context id.\n"); + goto out_no_id; + } + + if (unlikely(res->id >= VMWGFX_NUM_DXCONTEXT)) { + ret = -EBUSY; + goto out_no_fifo; + } + + cmd = vmw_fifo_reserve(dev_priv, sizeof(*cmd)); + if (unlikely(cmd == NULL)) { + DRM_ERROR("Failed reserving FIFO space for context " + "creation.\n"); + ret = -ENOMEM; + goto out_no_fifo; + } + + cmd->header.id = SVGA_3D_CMD_DX_DEFINE_CONTEXT; + cmd->header.size = sizeof(cmd->body); + cmd->body.cid = res->id; + vmw_fifo_commit(dev_priv, sizeof(*cmd)); + vmw_fifo_resource_inc(dev_priv); + + return 0; + +out_no_fifo: + vmw_resource_release_id(res); +out_no_id: + return ret; +} + +static int vmw_dx_context_bind(struct vmw_resource *res, + struct ttm_validate_buffer *val_buf) +{ + struct vmw_private *dev_priv = res->dev_priv; + struct { + SVGA3dCmdHeader header; + SVGA3dCmdDXBindContext body; + } *cmd; + struct ttm_buffer_object *bo = val_buf->bo; + + BUG_ON(bo->mem.mem_type != VMW_PL_MOB); + + cmd = vmw_fifo_reserve(dev_priv, sizeof(*cmd)); + if (unlikely(cmd == NULL)) { + DRM_ERROR("Failed reserving FIFO space for context " + "binding.\n"); + return -ENOMEM; + } + + cmd->header.id = SVGA_3D_CMD_DX_BIND_CONTEXT; + cmd->header.size = sizeof(cmd->body); + cmd->body.cid = res->id; + cmd->body.mobid = bo->mem.start; + cmd->body.validContents = res->backup_dirty; + res->backup_dirty = false; + vmw_fifo_commit(dev_priv, sizeof(*cmd)); + + + return 0; +} + +/** + * vmw_dx_context_scrub_cotables - Scrub all bindings and + * cotables from a context + * + * @ctx: Pointer to the context resource + * @readback: Whether to save the otable contents on scrubbing. + * + * COtables must be unbound before their context, but unbinding requires + * the backup buffer being reserved, whereas scrubbing does not. + * This function scrubs all cotables of a context, potentially reading back + * the contents into their backup buffers. However, scrubbing cotables + * also makes the device context invalid, so scrub all bindings first so + * that doesn't have to be done later with an invalid context. + */ +void vmw_dx_context_scrub_cotables(struct vmw_resource *ctx, + bool readback) +{ + struct vmw_user_context *uctx = + container_of(ctx, struct vmw_user_context, res); + int i; + + vmw_binding_state_scrub(uctx->cbs); + for (i = 0; i < SVGA_COTABLE_DX10_MAX; ++i) { + struct vmw_resource *res; + + /* Avoid racing with ongoing cotable destruction. 
*/ + spin_lock(&uctx->cotable_lock); + res = uctx->cotables[vmw_cotable_scrub_order[i]]; + if (res) + res = vmw_resource_reference_unless_doomed(res); + spin_unlock(&uctx->cotable_lock); + if (!res) + continue; + + WARN_ON(vmw_cotable_scrub(res, readback)); + vmw_resource_unreference(&res); + } +} + +static int vmw_dx_context_unbind(struct vmw_resource *res, + bool readback, + struct ttm_validate_buffer *val_buf) +{ + struct vmw_private *dev_priv = res->dev_priv; + struct ttm_buffer_object *bo = val_buf->bo; + struct vmw_fence_obj *fence; + + struct { + SVGA3dCmdHeader header; + SVGA3dCmdDXReadbackContext body; + } *cmd1; + struct { + SVGA3dCmdHeader header; + SVGA3dCmdDXBindContext body; + } *cmd2; + uint32_t submit_size; + uint8_t *cmd; + + + BUG_ON(bo->mem.mem_type != VMW_PL_MOB); + + mutex_lock(&dev_priv->binding_mutex); + vmw_dx_context_scrub_cotables(res, readback); + + submit_size = sizeof(*cmd2) + (readback ? sizeof(*cmd1) : 0); + + cmd = vmw_fifo_reserve(dev_priv, submit_size); + if (unlikely(cmd == NULL)) { + DRM_ERROR("Failed reserving FIFO space for context " + "unbinding.\n"); + mutex_unlock(&dev_priv->binding_mutex); + return -ENOMEM; + } + + cmd2 = (void *) cmd; + if (readback) { + cmd1 = (void *) cmd; + cmd1->header.id = SVGA_3D_CMD_DX_READBACK_CONTEXT; + cmd1->header.size = sizeof(cmd1->body); + cmd1->body.cid = res->id; + cmd2 = (void *) (&cmd1[1]); + } + cmd2->header.id = SVGA_3D_CMD_DX_BIND_CONTEXT; + cmd2->header.size = sizeof(cmd2->body); + cmd2->body.cid = res->id; + cmd2->body.mobid = SVGA3D_INVALID_ID; + + vmw_fifo_commit(dev_priv, submit_size); + mutex_unlock(&dev_priv->binding_mutex); + + /* + * Create a fence object and fence the backup buffer. + */ + + (void) vmw_execbuf_fence_commands(NULL, dev_priv, + &fence, NULL); + + vmw_fence_single_bo(bo, fence); + + if (likely(fence != NULL)) + vmw_fence_obj_unreference(&fence); + + return 0; +} + +static int vmw_dx_context_destroy(struct vmw_resource *res) +{ + struct vmw_private *dev_priv = res->dev_priv; + struct { + SVGA3dCmdHeader header; + SVGA3dCmdDXDestroyContext body; + } *cmd; + + if (likely(res->id == -1)) + return 0; + + cmd = vmw_fifo_reserve(dev_priv, sizeof(*cmd)); + if (unlikely(cmd == NULL)) { + DRM_ERROR("Failed reserving FIFO space for context " + "destruction.\n"); + return -ENOMEM; + } + + cmd->header.id = SVGA_3D_CMD_DX_DESTROY_CONTEXT; + cmd->header.size = sizeof(cmd->body); + cmd->body.cid = res->id; + vmw_fifo_commit(dev_priv, sizeof(*cmd)); + if (dev_priv->query_cid == res->id) + dev_priv->query_cid_valid = false; + vmw_resource_release_id(res); + vmw_fifo_resource_dec(dev_priv); + + return 0; +} + /** * User-space context management: */ @@ -435,6 +689,8 @@ static void vmw_user_context_free(struct vmw_resource *res) container_of(res, struct vmw_user_context, res); struct vmw_private *dev_priv = res->dev_priv; + if (ctx->cbs) + vmw_binding_state_free(ctx->cbs); ttm_base_object_kfree(ctx, base); ttm_mem_global_free(vmw_mem_glob(dev_priv), vmw_user_context_size); @@ -465,8 +721,8 @@ int vmw_context_destroy_ioctl(struct drm_device *dev, void *data, return ttm_ref_object_base_unref(tfile, arg->cid, TTM_REF_USAGE); } -int vmw_context_define_ioctl(struct drm_device *dev, void *data, - struct drm_file *file_priv) +static int vmw_context_define(struct drm_device *dev, void *data, + struct drm_file *file_priv, bool dx) { struct vmw_private *dev_priv = vmw_priv(dev); struct vmw_user_context *ctx; @@ -476,6 +732,10 @@ int vmw_context_define_ioctl(struct drm_device *dev, void *data, struct ttm_object_file 
*tfile = vmw_fpriv(file_priv)->tfile; int ret; + if (!dev_priv->has_dx && dx) { + DRM_ERROR("DX contexts not supported by device.\n"); + return -EINVAL; + } /* * Approximate idr memory usage with 128 bytes. It will be limited @@ -516,7 +776,7 @@ int vmw_context_define_ioctl(struct drm_device *dev, void *data, * From here on, the destructor takes over resource freeing. */ - ret = vmw_context_init(dev_priv, res, vmw_user_context_free); + ret = vmw_context_init(dev_priv, res, vmw_user_context_free, dx); if (unlikely(ret != 0)) goto out_unlock; @@ -535,387 +795,74 @@ out_err: out_unlock: ttm_read_unlock(&dev_priv->reservation_sem); return ret; - -} - -/** - * vmw_context_scrub_shader - scrub a shader binding from a context. - * - * @bi: single binding information. - * @rebind: Whether to issue a bind instead of scrub command. - */ -static int vmw_context_scrub_shader(struct vmw_ctx_bindinfo *bi, bool rebind) -{ - struct vmw_private *dev_priv = bi->ctx->dev_priv; - struct { - SVGA3dCmdHeader header; - SVGA3dCmdSetShader body; - } *cmd; - - cmd = vmw_fifo_reserve(dev_priv, sizeof(*cmd)); - if (unlikely(cmd == NULL)) { - DRM_ERROR("Failed reserving FIFO space for shader " - "unbinding.\n"); - return -ENOMEM; - } - - cmd->header.id = SVGA_3D_CMD_SET_SHADER; - cmd->header.size = sizeof(cmd->body); - cmd->body.cid = bi->ctx->id; - cmd->body.type = bi->i1.shader_type; - cmd->body.shid = ((rebind) ? bi->res->id : SVGA3D_INVALID_ID); - vmw_fifo_commit(dev_priv, sizeof(*cmd)); - - return 0; -} - -/** - * vmw_context_scrub_render_target - scrub a render target binding - * from a context. - * - * @bi: single binding information. - * @rebind: Whether to issue a bind instead of scrub command. - */ -static int vmw_context_scrub_render_target(struct vmw_ctx_bindinfo *bi, - bool rebind) -{ - struct vmw_private *dev_priv = bi->ctx->dev_priv; - struct { - SVGA3dCmdHeader header; - SVGA3dCmdSetRenderTarget body; - } *cmd; - - cmd = vmw_fifo_reserve(dev_priv, sizeof(*cmd)); - if (unlikely(cmd == NULL)) { - DRM_ERROR("Failed reserving FIFO space for render target " - "unbinding.\n"); - return -ENOMEM; - } - - cmd->header.id = SVGA_3D_CMD_SETRENDERTARGET; - cmd->header.size = sizeof(cmd->body); - cmd->body.cid = bi->ctx->id; - cmd->body.type = bi->i1.rt_type; - cmd->body.target.sid = ((rebind) ? bi->res->id : SVGA3D_INVALID_ID); - cmd->body.target.face = 0; - cmd->body.target.mipmap = 0; - vmw_fifo_commit(dev_priv, sizeof(*cmd)); - - return 0; -} - -/** - * vmw_context_scrub_texture - scrub a texture binding from a context. - * - * @bi: single binding information. - * @rebind: Whether to issue a bind instead of scrub command. - * - * TODO: Possibly complement this function with a function that takes - * a list of texture bindings and combines them to a single command. - */ -static int vmw_context_scrub_texture(struct vmw_ctx_bindinfo *bi, - bool rebind) -{ - struct vmw_private *dev_priv = bi->ctx->dev_priv; - struct { - SVGA3dCmdHeader header; - struct { - SVGA3dCmdSetTextureState c; - SVGA3dTextureState s1; - } body; - } *cmd; - - cmd = vmw_fifo_reserve(dev_priv, sizeof(*cmd)); - if (unlikely(cmd == NULL)) { - DRM_ERROR("Failed reserving FIFO space for texture " - "unbinding.\n"); - return -ENOMEM; - } - - - cmd->header.id = SVGA_3D_CMD_SETTEXTURESTATE; - cmd->header.size = sizeof(cmd->body); - cmd->body.c.cid = bi->ctx->id; - cmd->body.s1.stage = bi->i1.texture_stage; - cmd->body.s1.name = SVGA3D_TS_BIND_TEXTURE; - cmd->body.s1.value = ((rebind) ? 
bi->res->id : SVGA3D_INVALID_ID); - vmw_fifo_commit(dev_priv, sizeof(*cmd)); - - return 0; } -/** - * vmw_context_binding_drop: Stop tracking a context binding - * - * @cb: Pointer to binding tracker storage. - * - * Stops tracking a context binding, and re-initializes its storage. - * Typically used when the context binding is replaced with a binding to - * another (or the same, for that matter) resource. - */ -static void vmw_context_binding_drop(struct vmw_ctx_binding *cb) +int vmw_context_define_ioctl(struct drm_device *dev, void *data, + struct drm_file *file_priv) { - list_del(&cb->ctx_list); - if (!list_empty(&cb->res_list)) - list_del(&cb->res_list); - cb->bi.ctx = NULL; + return vmw_context_define(dev, data, file_priv, false); } -/** - * vmw_context_binding_add: Start tracking a context binding - * - * @cbs: Pointer to the context binding state tracker. - * @bi: Information about the binding to track. - * - * Performs basic checks on the binding to make sure arguments are within - * bounds and then starts tracking the binding in the context binding - * state structure @cbs. - */ -int vmw_context_binding_add(struct vmw_ctx_binding_state *cbs, - const struct vmw_ctx_bindinfo *bi) +int vmw_extended_context_define_ioctl(struct drm_device *dev, void *data, + struct drm_file *file_priv) { - struct vmw_ctx_binding *loc; - - switch (bi->bt) { - case vmw_ctx_binding_rt: - if (unlikely((unsigned)bi->i1.rt_type >= SVGA3D_RT_MAX)) { - DRM_ERROR("Illegal render target type %u.\n", - (unsigned) bi->i1.rt_type); - return -EINVAL; - } - loc = &cbs->render_targets[bi->i1.rt_type]; - break; - case vmw_ctx_binding_tex: - if (unlikely((unsigned)bi->i1.texture_stage >= - SVGA3D_NUM_TEXTURE_UNITS)) { - DRM_ERROR("Illegal texture/sampler unit %u.\n", - (unsigned) bi->i1.texture_stage); - return -EINVAL; - } - loc = &cbs->texture_units[bi->i1.texture_stage]; - break; - case vmw_ctx_binding_shader: - if (unlikely((unsigned)bi->i1.shader_type >= - SVGA3D_SHADERTYPE_PREDX_MAX)) { - DRM_ERROR("Illegal shader type %u.\n", - (unsigned) bi->i1.shader_type); - return -EINVAL; - } - loc = &cbs->shaders[bi->i1.shader_type]; - break; + union drm_vmw_extended_context_arg *arg = (typeof(arg)) data; + struct drm_vmw_context_arg *rep = &arg->rep; + + switch (arg->req) { + case drm_vmw_context_legacy: + return vmw_context_define(dev, rep, file_priv, false); + case drm_vmw_context_dx: + return vmw_context_define(dev, rep, file_priv, true); default: - BUG(); - } - - if (loc->bi.ctx != NULL) - vmw_context_binding_drop(loc); - - loc->bi = *bi; - loc->bi.scrubbed = false; - list_add_tail(&loc->ctx_list, &cbs->list); - INIT_LIST_HEAD(&loc->res_list); - - return 0; -} - -/** - * vmw_context_binding_transfer: Transfer a context binding tracking entry. - * - * @cbs: Pointer to the persistent context binding state tracker. - * @bi: Information about the binding to track. 
- * - */ -static void vmw_context_binding_transfer(struct vmw_ctx_binding_state *cbs, - const struct vmw_ctx_bindinfo *bi) -{ - struct vmw_ctx_binding *loc; - - switch (bi->bt) { - case vmw_ctx_binding_rt: - loc = &cbs->render_targets[bi->i1.rt_type]; - break; - case vmw_ctx_binding_tex: - loc = &cbs->texture_units[bi->i1.texture_stage]; - break; - case vmw_ctx_binding_shader: - loc = &cbs->shaders[bi->i1.shader_type]; break; - default: - BUG(); - } - - if (loc->bi.ctx != NULL) - vmw_context_binding_drop(loc); - - if (bi->res != NULL) { - loc->bi = *bi; - list_add_tail(&loc->ctx_list, &cbs->list); - list_add_tail(&loc->res_list, &bi->res->binding_head); - } -} - -/** - * vmw_context_binding_kill - Kill a binding on the device - * and stop tracking it. - * - * @cb: Pointer to binding tracker storage. - * - * Emits FIFO commands to scrub a binding represented by @cb. - * Then stops tracking the binding and re-initializes its storage. - */ -static void vmw_context_binding_kill(struct vmw_ctx_binding *cb) -{ - if (!cb->bi.scrubbed) { - (void) vmw_scrub_funcs[cb->bi.bt](&cb->bi, false); - cb->bi.scrubbed = true; - } - vmw_context_binding_drop(cb); -} - -/** - * vmw_context_binding_state_kill - Kill all bindings associated with a - * struct vmw_ctx_binding state structure, and re-initialize the structure. - * - * @cbs: Pointer to the context binding state tracker. - * - * Emits commands to scrub all bindings associated with the - * context binding state tracker. Then re-initializes the whole structure. - */ -static void vmw_context_binding_state_kill(struct vmw_ctx_binding_state *cbs) -{ - struct vmw_ctx_binding *entry, *next; - - list_for_each_entry_safe(entry, next, &cbs->list, ctx_list) - vmw_context_binding_kill(entry); -} - -/** - * vmw_context_binding_state_scrub - Scrub all bindings associated with a - * struct vmw_ctx_binding state structure. - * - * @cbs: Pointer to the context binding state tracker. - * - * Emits commands to scrub all bindings associated with the - * context binding state tracker. - */ -static void vmw_context_binding_state_scrub(struct vmw_ctx_binding_state *cbs) -{ - struct vmw_ctx_binding *entry; - - list_for_each_entry(entry, &cbs->list, ctx_list) { - if (!entry->bi.scrubbed) { - (void) vmw_scrub_funcs[entry->bi.bt](&entry->bi, false); - entry->bi.scrubbed = true; - } - } -} - -/** - * vmw_context_binding_res_list_kill - Kill all bindings on a - * resource binding list - * - * @head: list head of resource binding list - * - * Kills all bindings associated with a specific resource. Typically - * called before the resource is destroyed. - */ -void vmw_context_binding_res_list_kill(struct list_head *head) -{ - struct vmw_ctx_binding *entry, *next; - - list_for_each_entry_safe(entry, next, head, res_list) - vmw_context_binding_kill(entry); -} - -/** - * vmw_context_binding_res_list_scrub - Scrub all bindings on a - * resource binding list - * - * @head: list head of resource binding list - * - * Scrub all bindings associated with a specific resource. Typically - * called before the resource is evicted. 
- */ -void vmw_context_binding_res_list_scrub(struct list_head *head) -{ - struct vmw_ctx_binding *entry; - - list_for_each_entry(entry, head, res_list) { - if (!entry->bi.scrubbed) { - (void) vmw_scrub_funcs[entry->bi.bt](&entry->bi, false); - entry->bi.scrubbed = true; - } } + return -EINVAL; } /** - * vmw_context_binding_state_transfer - Commit staged binding info + * vmw_context_binding_list - Return a list of context bindings * - * @ctx: Pointer to context to commit the staged binding info to. - * @from: Staged binding info built during execbuf. + * @ctx: The context resource * - * Transfers binding info from a temporary structure to the persistent - * structure in the context. This can be done once commands + * Returns the current list of bindings of the given context. Note that + * this list becomes stale as soon as the dev_priv::binding_mutex is unlocked. */ -void vmw_context_binding_state_transfer(struct vmw_resource *ctx, - struct vmw_ctx_binding_state *from) +struct list_head *vmw_context_binding_list(struct vmw_resource *ctx) { struct vmw_user_context *uctx = container_of(ctx, struct vmw_user_context, res); - struct vmw_ctx_binding *entry, *next; - list_for_each_entry_safe(entry, next, &from->list, ctx_list) - vmw_context_binding_transfer(&uctx->cbs, &entry->bi); + return vmw_binding_state_list(uctx->cbs); } -/** - * vmw_context_rebind_all - Rebind all scrubbed bindings of a context - * - * @ctx: The context resource - * - * Walks through the context binding list and rebinds all scrubbed - * resources. - */ -int vmw_context_rebind_all(struct vmw_resource *ctx) +struct vmw_cmdbuf_res_manager *vmw_context_res_man(struct vmw_resource *ctx) { - struct vmw_ctx_binding *entry; - struct vmw_user_context *uctx = - container_of(ctx, struct vmw_user_context, res); - struct vmw_ctx_binding_state *cbs = &uctx->cbs; - int ret; - - list_for_each_entry(entry, &cbs->list, ctx_list) { - if (likely(!entry->bi.scrubbed)) - continue; - - if (WARN_ON(entry->bi.res == NULL || entry->bi.res->id == - SVGA3D_INVALID_ID)) - continue; - - ret = vmw_scrub_funcs[entry->bi.bt](&entry->bi, true); - if (unlikely(ret != 0)) - return ret; + return container_of(ctx, struct vmw_user_context, res)->man; +} - entry->bi.scrubbed = false; - } +struct vmw_resource *vmw_context_cotable(struct vmw_resource *ctx, + SVGACOTableType cotable_type) +{ + if (cotable_type >= SVGA_COTABLE_DX10_MAX) + return ERR_PTR(-EINVAL); - return 0; + return vmw_resource_reference + (container_of(ctx, struct vmw_user_context, res)-> + cotables[cotable_type]); } /** - * vmw_context_binding_list - Return a list of context bindings + * vmw_context_binding_state - + * Return a pointer to a context binding state structure * * @ctx: The context resource * - * Returns the current list of bindings of the given context. Note that - * this list becomes stale as soon as the dev_priv::binding_mutex is unlocked. + * Returns the current state of bindings of the given context. Note that + * this state becomes stale as soon as the dev_priv::binding_mutex is unlocked. 
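+ * The returned pointer is not refcounted; it is valid only for as long as
+ * a reference is held on @ctx.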
 */
-struct list_head *vmw_context_binding_list(struct vmw_resource *ctx)
-{
-	return &(container_of(ctx, struct vmw_user_context, res)->cbs.list);
-}
-
-struct vmw_cmdbuf_res_manager *vmw_context_res_man(struct vmw_resource *ctx)
+struct vmw_ctx_binding_state *
+vmw_context_binding_state(struct vmw_resource *ctx)
 {
-	return container_of(ctx, struct vmw_user_context, res)->man;
+	return container_of(ctx, struct vmw_user_context, res)->cbs;
 }
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_cotable.c b/drivers/gpu/drm/vmwgfx/vmwgfx_cotable.c
new file mode 100644
index 000000000000..22bb04ffec78
--- /dev/null
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_cotable.c
@@ -0,0 +1,662 @@
+/**************************************************************************
+ *
+ * Copyright © 2014 VMware, Inc., Palo Alto, CA., USA
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+/*
+ * Treat context OTables as resources to make use of the resource
+ * backing MOB eviction mechanism, which is used to read back the COTable
+ * whenever the backing MOB is evicted.
+ */
+
+#include "vmwgfx_drv.h"
+#include "vmwgfx_resource_priv.h"
+#include <drm/ttm/ttm_placement.h>
+#include "vmwgfx_so.h"
+
+/**
+ * struct vmw_cotable - Context Object Table resource
+ *
+ * @res: struct vmw_resource we are deriving from.
+ * @ctx: non-refcounted pointer to the owning context.
+ * @size_read_back: Size of data read back during eviction.
+ * @seen_entries: Seen entries in command stream for this cotable.
+ * @type: The cotable type.
+ * @scrubbed: Whether the cotable has been scrubbed.
+ * @resource_list: List of resources in the cotable.
+ */
+struct vmw_cotable {
+	struct vmw_resource res;
+	struct vmw_resource *ctx;
+	size_t size_read_back;
+	int seen_entries;
+	u32 type;
+	bool scrubbed;
+	struct list_head resource_list;
+};
+
+/**
+ * struct vmw_cotable_info - Static info about cotable types
+ *
+ * @min_initial_entries: Min number of initial entries at cotable allocation
+ * for this cotable type.
+ * @size: Size of each entry.
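+ * @unbind_func: Optional callback, invoked from vmw_cotable_scrub(), that
+ * scrubs or destroys the resources backed by entries of this cotable type
+ * before the table itself is scrubbed.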
+ */ +struct vmw_cotable_info { + u32 min_initial_entries; + u32 size; + void (*unbind_func)(struct vmw_private *, struct list_head *, + bool); +}; + +static const struct vmw_cotable_info co_info[] = { + {1, sizeof(SVGACOTableDXRTViewEntry), &vmw_view_cotable_list_destroy}, + {1, sizeof(SVGACOTableDXDSViewEntry), &vmw_view_cotable_list_destroy}, + {1, sizeof(SVGACOTableDXSRViewEntry), &vmw_view_cotable_list_destroy}, + {1, sizeof(SVGACOTableDXElementLayoutEntry), NULL}, + {1, sizeof(SVGACOTableDXBlendStateEntry), NULL}, + {1, sizeof(SVGACOTableDXDepthStencilEntry), NULL}, + {1, sizeof(SVGACOTableDXRasterizerStateEntry), NULL}, + {1, sizeof(SVGACOTableDXSamplerEntry), NULL}, + {1, sizeof(SVGACOTableDXStreamOutputEntry), NULL}, + {1, sizeof(SVGACOTableDXQueryEntry), NULL}, + {1, sizeof(SVGACOTableDXShaderEntry), &vmw_dx_shader_cotable_list_scrub} +}; + +/* + * Cotables with bindings that we remove must be scrubbed first, + * otherwise, the device will swap in an invalid context when we remove + * bindings before scrubbing a cotable... + */ +const SVGACOTableType vmw_cotable_scrub_order[] = { + SVGA_COTABLE_RTVIEW, + SVGA_COTABLE_DSVIEW, + SVGA_COTABLE_SRVIEW, + SVGA_COTABLE_DXSHADER, + SVGA_COTABLE_ELEMENTLAYOUT, + SVGA_COTABLE_BLENDSTATE, + SVGA_COTABLE_DEPTHSTENCIL, + SVGA_COTABLE_RASTERIZERSTATE, + SVGA_COTABLE_SAMPLER, + SVGA_COTABLE_STREAMOUTPUT, + SVGA_COTABLE_DXQUERY, +}; + +static int vmw_cotable_bind(struct vmw_resource *res, + struct ttm_validate_buffer *val_buf); +static int vmw_cotable_unbind(struct vmw_resource *res, + bool readback, + struct ttm_validate_buffer *val_buf); +static int vmw_cotable_create(struct vmw_resource *res); +static int vmw_cotable_destroy(struct vmw_resource *res); + +static const struct vmw_res_func vmw_cotable_func = { + .res_type = vmw_res_cotable, + .needs_backup = true, + .may_evict = true, + .type_name = "context guest backed object tables", + .backup_placement = &vmw_mob_placement, + .create = vmw_cotable_create, + .destroy = vmw_cotable_destroy, + .bind = vmw_cotable_bind, + .unbind = vmw_cotable_unbind, +}; + +/** + * vmw_cotable - Convert a struct vmw_resource pointer to a struct + * vmw_cotable pointer + * + * @res: Pointer to the resource. + */ +static struct vmw_cotable *vmw_cotable(struct vmw_resource *res) +{ + return container_of(res, struct vmw_cotable, res); +} + +/** + * vmw_cotable_destroy - Cotable resource destroy callback + * + * @res: Pointer to the cotable resource. + * + * There is no device cotable destroy command, so this function only + * makes sure that the resource id is set to invalid. + */ +static int vmw_cotable_destroy(struct vmw_resource *res) +{ + res->id = -1; + return 0; +} + +/** + * vmw_cotable_unscrub - Undo a cotable unscrub operation + * + * @res: Pointer to the cotable resource + * + * This function issues commands to (re)bind the cotable to + * its backing mob, which needs to be validated and reserved at this point. + * This is identical to bind() except the function interface looks different. 
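+ *
+ * Returns 0 on success, negative error code on failure.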
+ */ +static int vmw_cotable_unscrub(struct vmw_resource *res) +{ + struct vmw_cotable *vcotbl = vmw_cotable(res); + struct vmw_private *dev_priv = res->dev_priv; + struct ttm_buffer_object *bo = &res->backup->base; + struct { + SVGA3dCmdHeader header; + SVGA3dCmdDXSetCOTable body; + } *cmd; + + WARN_ON_ONCE(bo->mem.mem_type != VMW_PL_MOB); + lockdep_assert_held(&bo->resv->lock.base); + + cmd = vmw_fifo_reserve_dx(dev_priv, sizeof(*cmd), SVGA3D_INVALID_ID); + if (!cmd) { + DRM_ERROR("Failed reserving FIFO space for cotable " + "binding.\n"); + return -ENOMEM; + } + + WARN_ON(vcotbl->ctx->id == SVGA3D_INVALID_ID); + WARN_ON(bo->mem.mem_type != VMW_PL_MOB); + cmd->header.id = SVGA_3D_CMD_DX_SET_COTABLE; + cmd->header.size = sizeof(cmd->body); + cmd->body.cid = vcotbl->ctx->id; + cmd->body.type = vcotbl->type; + cmd->body.mobid = bo->mem.start; + cmd->body.validSizeInBytes = vcotbl->size_read_back; + + vmw_fifo_commit_flush(dev_priv, sizeof(*cmd)); + vcotbl->scrubbed = false; + + return 0; +} + +/** + * vmw_cotable_bind - Undo a cotable unscrub operation + * + * @res: Pointer to the cotable resource + * @val_buf: Pointer to a struct ttm_validate_buffer prepared by the caller + * for convenience / fencing. + * + * This function issues commands to (re)bind the cotable to + * its backing mob, which needs to be validated and reserved at this point. + */ +static int vmw_cotable_bind(struct vmw_resource *res, + struct ttm_validate_buffer *val_buf) +{ + /* + * The create() callback may have changed @res->backup without + * the caller noticing, and with val_buf->bo still pointing to + * the old backup buffer. Although hackish, and not used currently, + * take the opportunity to correct the value here so that it's not + * misused in the future. + */ + val_buf->bo = &res->backup->base; + + return vmw_cotable_unscrub(res); +} + +/** + * vmw_cotable_scrub - Scrub the cotable from the device. + * + * @res: Pointer to the cotable resource. + * @readback: Whether initiate a readback of the cotable data to the backup + * buffer. + * + * In some situations (context swapouts) it might be desirable to make the + * device forget about the cotable without performing a full unbind. A full + * unbind requires reserved backup buffers and it might not be possible to + * reserve them due to locking order violation issues. The vmw_cotable_scrub + * function implements a partial unbind() without that requirement but with the + * following restrictions. + * 1) Before the cotable is again used by the GPU, vmw_cotable_unscrub() must + * be called. + * 2) Before the cotable backing buffer is used by the CPU, or during the + * resource destruction, vmw_cotable_unbind() must be called. 
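+ *
+ * Returns 0 on success, negative error code on failure.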
+ */
+int vmw_cotable_scrub(struct vmw_resource *res, bool readback)
+{
+	struct vmw_cotable *vcotbl = vmw_cotable(res);
+	struct vmw_private *dev_priv = res->dev_priv;
+	size_t submit_size;
+
+	struct {
+		SVGA3dCmdHeader header;
+		SVGA3dCmdDXReadbackCOTable body;
+	} *cmd0;
+	struct {
+		SVGA3dCmdHeader header;
+		SVGA3dCmdDXSetCOTable body;
+	} *cmd1;
+
+	if (vcotbl->scrubbed)
+		return 0;
+
+	if (co_info[vcotbl->type].unbind_func)
+		co_info[vcotbl->type].unbind_func(dev_priv,
+						  &vcotbl->resource_list,
+						  readback);
+	submit_size = sizeof(*cmd1);
+	if (readback)
+		submit_size += sizeof(*cmd0);
+
+	cmd1 = vmw_fifo_reserve_dx(dev_priv, submit_size, SVGA3D_INVALID_ID);
+	if (!cmd1) {
+		DRM_ERROR("Failed reserving FIFO space for cotable "
+			  "unbinding.\n");
+		return -ENOMEM;
+	}
+
+	vcotbl->size_read_back = 0;
+	if (readback) {
+		cmd0 = (void *) cmd1;
+		cmd0->header.id = SVGA_3D_CMD_DX_READBACK_COTABLE;
+		cmd0->header.size = sizeof(cmd0->body);
+		cmd0->body.cid = vcotbl->ctx->id;
+		cmd0->body.type = vcotbl->type;
+		cmd1 = (void *) &cmd0[1];
+		vcotbl->size_read_back = res->backup_size;
+	}
+	cmd1->header.id = SVGA_3D_CMD_DX_SET_COTABLE;
+	cmd1->header.size = sizeof(cmd1->body);
+	cmd1->body.cid = vcotbl->ctx->id;
+	cmd1->body.type = vcotbl->type;
+	cmd1->body.mobid = SVGA3D_INVALID_ID;
+	cmd1->body.validSizeInBytes = 0;
+	vmw_fifo_commit_flush(dev_priv, submit_size);
+	vcotbl->scrubbed = true;
+
+	/* Trigger a create() on next validate. */
+	res->id = -1;
+
+	return 0;
+}
+
+/**
+ * vmw_cotable_unbind - Cotable resource unbind callback
+ *
+ * @res: Pointer to the cotable resource.
+ * @readback: Whether to read back cotable data to the backup buffer.
+ * @val_buf: Pointer to a struct ttm_validate_buffer prepared by the caller
+ * for convenience / fencing.
+ *
+ * Unbinds the cotable from the device and fences the backup buffer.
+ */
+static int vmw_cotable_unbind(struct vmw_resource *res,
+			      bool readback,
+			      struct ttm_validate_buffer *val_buf)
+{
+	struct vmw_cotable *vcotbl = vmw_cotable(res);
+	struct vmw_private *dev_priv = res->dev_priv;
+	struct ttm_buffer_object *bo = val_buf->bo;
+	struct vmw_fence_obj *fence;
+
+	if (list_empty(&res->mob_head))
+		return 0;
+
+	WARN_ON_ONCE(bo->mem.mem_type != VMW_PL_MOB);
+	lockdep_assert_held(&bo->resv->lock.base);
+
+	mutex_lock(&dev_priv->binding_mutex);
+	if (!vcotbl->scrubbed)
+		vmw_dx_context_scrub_cotables(vcotbl->ctx, readback);
+	mutex_unlock(&dev_priv->binding_mutex);
+	(void) vmw_execbuf_fence_commands(NULL, dev_priv, &fence, NULL);
+	vmw_fence_single_bo(bo, fence);
+	if (likely(fence != NULL))
+		vmw_fence_obj_unreference(&fence);
+
+	return 0;
+}
+
+/**
+ * vmw_cotable_readback - Read back a cotable without unbinding.
+ *
+ * @res: The cotable resource.
+ *
+ * Reads back a cotable to its backing mob without scrubbing the MOB from
+ * the cotable. The MOB is fenced for subsequent CPU access.
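+ *
+ * Returns 0 on success, negative error code on failure.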
+ */ +static int vmw_cotable_readback(struct vmw_resource *res) +{ + struct vmw_cotable *vcotbl = vmw_cotable(res); + struct vmw_private *dev_priv = res->dev_priv; + + struct { + SVGA3dCmdHeader header; + SVGA3dCmdDXReadbackCOTable body; + } *cmd; + struct vmw_fence_obj *fence; + + if (!vcotbl->scrubbed) { + cmd = vmw_fifo_reserve_dx(dev_priv, sizeof(*cmd), + SVGA3D_INVALID_ID); + if (!cmd) { + DRM_ERROR("Failed reserving FIFO space for cotable " + "readback.\n"); + return -ENOMEM; + } + cmd->header.id = SVGA_3D_CMD_DX_READBACK_COTABLE; + cmd->header.size = sizeof(cmd->body); + cmd->body.cid = vcotbl->ctx->id; + cmd->body.type = vcotbl->type; + vcotbl->size_read_back = res->backup_size; + vmw_fifo_commit(dev_priv, sizeof(*cmd)); + } + + (void) vmw_execbuf_fence_commands(NULL, dev_priv, &fence, NULL); + vmw_fence_single_bo(&res->backup->base, fence); + vmw_fence_obj_unreference(&fence); + + return 0; +} + +/** + * vmw_cotable_resize - Resize a cotable. + * + * @res: The cotable resource. + * @new_size: The new size. + * + * Resizes a cotable and binds the new backup buffer. + * On failure the cotable is left intact. + * Important! This function may not fail once the MOB switch has been + * committed to hardware. That would put the device context in an + * invalid state which we can't currently recover from. + */ +static int vmw_cotable_resize(struct vmw_resource *res, size_t new_size) +{ + struct vmw_private *dev_priv = res->dev_priv; + struct vmw_cotable *vcotbl = vmw_cotable(res); + struct vmw_dma_buffer *buf, *old_buf = res->backup; + struct ttm_buffer_object *bo, *old_bo = &res->backup->base; + size_t old_size = res->backup_size; + size_t old_size_read_back = vcotbl->size_read_back; + size_t cur_size_read_back; + struct ttm_bo_kmap_obj old_map, new_map; + int ret; + size_t i; + + ret = vmw_cotable_readback(res); + if (ret) + return ret; + + cur_size_read_back = vcotbl->size_read_back; + vcotbl->size_read_back = old_size_read_back; + + /* + * While device is processing, Allocate and reserve a buffer object + * for the new COTable. Initially pin the buffer object to make sure + * we can use tryreserve without failure. + */ + buf = kzalloc(sizeof(*buf), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + ret = vmw_dmabuf_init(dev_priv, buf, new_size, &vmw_mob_ne_placement, + true, vmw_dmabuf_bo_free); + if (ret) { + DRM_ERROR("Failed initializing new cotable MOB.\n"); + return ret; + } + + bo = &buf->base; + WARN_ON_ONCE(ttm_bo_reserve(bo, false, true, false, NULL)); + + ret = ttm_bo_wait(old_bo, false, false, false); + if (unlikely(ret != 0)) { + DRM_ERROR("Failed waiting for cotable unbind.\n"); + goto out_wait; + } + + /* + * Do a page by page copy of COTables. This eliminates slow vmap()s. + * This should really be a TTM utility. + */ + for (i = 0; i < old_bo->num_pages; ++i) { + bool dummy; + + ret = ttm_bo_kmap(old_bo, i, 1, &old_map); + if (unlikely(ret != 0)) { + DRM_ERROR("Failed mapping old COTable on resize.\n"); + goto out_wait; + } + ret = ttm_bo_kmap(bo, i, 1, &new_map); + if (unlikely(ret != 0)) { + DRM_ERROR("Failed mapping new COTable on resize.\n"); + goto out_map_new; + } + memcpy(ttm_kmap_obj_virtual(&new_map, &dummy), + ttm_kmap_obj_virtual(&old_map, &dummy), + PAGE_SIZE); + ttm_bo_kunmap(&new_map); + ttm_bo_kunmap(&old_map); + } + + /* Unpin new buffer, and switch backup buffers. 
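+	 * Revalidating with &vmw_mob_placement drops the no-evict placement
+	 * the buffer was created with, making it evictable like any other
+	 * MOB buffer.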
*/ + ret = ttm_bo_validate(bo, &vmw_mob_placement, false, false); + if (unlikely(ret != 0)) { + DRM_ERROR("Failed validating new COTable backup buffer.\n"); + goto out_wait; + } + + res->backup = buf; + res->backup_size = new_size; + vcotbl->size_read_back = cur_size_read_back; + + /* + * Now tell the device to switch. If this fails, then we need to + * revert the full resize. + */ + ret = vmw_cotable_unscrub(res); + if (ret) { + DRM_ERROR("Failed switching COTable backup buffer.\n"); + res->backup = old_buf; + res->backup_size = old_size; + vcotbl->size_read_back = old_size_read_back; + goto out_wait; + } + + /* Let go of the old mob. */ + list_del(&res->mob_head); + list_add_tail(&res->mob_head, &buf->res_list); + vmw_dmabuf_unreference(&old_buf); + res->id = vcotbl->type; + + return 0; + +out_map_new: + ttm_bo_kunmap(&old_map); +out_wait: + ttm_bo_unreserve(bo); + vmw_dmabuf_unreference(&buf); + + return ret; +} + +/** + * vmw_cotable_create - Cotable resource create callback + * + * @res: Pointer to a cotable resource. + * + * There is no separate create command for cotables, so this callback, which + * is called before bind() in the validation sequence is instead used for two + * things. + * 1) Unscrub the cotable if it is scrubbed and still attached to a backup + * buffer, that is, if @res->mob_head is non-empty. + * 2) Resize the cotable if needed. + */ +static int vmw_cotable_create(struct vmw_resource *res) +{ + struct vmw_cotable *vcotbl = vmw_cotable(res); + size_t new_size = res->backup_size; + size_t needed_size; + int ret; + + /* Check whether we need to resize the cotable */ + needed_size = (vcotbl->seen_entries + 1) * co_info[vcotbl->type].size; + while (needed_size > new_size) + new_size *= 2; + + if (likely(new_size <= res->backup_size)) { + if (vcotbl->scrubbed && !list_empty(&res->mob_head)) { + ret = vmw_cotable_unscrub(res); + if (ret) + return ret; + } + res->id = vcotbl->type; + return 0; + } + + return vmw_cotable_resize(res, new_size); +} + +/** + * vmw_hw_cotable_destroy - Cotable hw_destroy callback + * + * @res: Pointer to a cotable resource. + * + * The final (part of resource destruction) destroy callback. + */ +static void vmw_hw_cotable_destroy(struct vmw_resource *res) +{ + (void) vmw_cotable_destroy(res); +} + +static size_t cotable_acc_size; + +/** + * vmw_cotable_free - Cotable resource destructor + * + * @res: Pointer to a cotable resource. + */ +static void vmw_cotable_free(struct vmw_resource *res) +{ + struct vmw_private *dev_priv = res->dev_priv; + + kfree(res); + ttm_mem_global_free(vmw_mem_glob(dev_priv), cotable_acc_size); +} + +/** + * vmw_cotable_alloc - Create a cotable resource + * + * @dev_priv: Pointer to a device private struct. + * @ctx: Pointer to the context resource. + * The cotable resource will not add a refcount. + * @type: The cotable type. 
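+ *
+ * Returns a pointer to the new cotable resource on success, or an
+ * ERR_PTR() encoded error on failure.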
+ */
+struct vmw_resource *vmw_cotable_alloc(struct vmw_private *dev_priv,
+				       struct vmw_resource *ctx,
+				       u32 type)
+{
+	struct vmw_cotable *vcotbl;
+	int ret;
+	u32 num_entries;
+
+	if (unlikely(cotable_acc_size == 0))
+		cotable_acc_size = ttm_round_pot(sizeof(struct vmw_cotable));
+
+	ret = ttm_mem_global_alloc(vmw_mem_glob(dev_priv),
+				   cotable_acc_size, false, true);
+	if (unlikely(ret))
+		return ERR_PTR(ret);
+
+	vcotbl = kzalloc(sizeof(*vcotbl), GFP_KERNEL);
+	if (unlikely(vcotbl == NULL)) {
+		ret = -ENOMEM;
+		goto out_no_alloc;
+	}
+
+	ret = vmw_resource_init(dev_priv, &vcotbl->res, true,
+				vmw_cotable_free, &vmw_cotable_func);
+	if (unlikely(ret != 0))
+		goto out_no_init;
+
+	INIT_LIST_HEAD(&vcotbl->resource_list);
+	vcotbl->res.id = type;
+	vcotbl->res.backup_size = PAGE_SIZE;
+	num_entries = PAGE_SIZE / co_info[type].size;
+	if (num_entries < co_info[type].min_initial_entries) {
+		vcotbl->res.backup_size = co_info[type].min_initial_entries *
+			co_info[type].size;
+		vcotbl->res.backup_size =
+			(vcotbl->res.backup_size + PAGE_SIZE - 1) & PAGE_MASK;
+	}
+
+	vcotbl->scrubbed = true;
+	vcotbl->seen_entries = -1;
+	vcotbl->type = type;
+	vcotbl->ctx = ctx;
+
+	vmw_resource_activate(&vcotbl->res, vmw_hw_cotable_destroy);
+
+	return &vcotbl->res;
+
+out_no_init:
+	kfree(vcotbl);
+out_no_alloc:
+	ttm_mem_global_free(vmw_mem_glob(dev_priv), cotable_acc_size);
+	return ERR_PTR(ret);
+}
+
+/**
+ * vmw_cotable_notify - Notify the cotable about an item creation
+ *
+ * @res: Pointer to a cotable resource.
+ * @id: Item id.
+ */
+int vmw_cotable_notify(struct vmw_resource *res, int id)
+{
+	struct vmw_cotable *vcotbl = vmw_cotable(res);
+
+	if (id < 0 || id >= SVGA_COTABLE_MAX_IDS) {
+		DRM_ERROR("Illegal COTable id. Type is %u. Id is %d\n",
+			  (unsigned) vcotbl->type, id);
+		return -EINVAL;
+	}
+
+	if (vcotbl->seen_entries < id) {
+		/* Trigger a call to create() on next validate */
+		res->id = -1;
+		vcotbl->seen_entries = id;
+	}
+
+	return 0;
+}
+
+/**
+ * vmw_cotable_add_resource - add a resource to the cotable's list of
+ * active resources.
+ *
+ * @res: pointer to the struct vmw_resource representing the cotable.
+ * @head: pointer to the struct list_head member of the resource, dedicated
+ * to the cotable active resource list.
+ */ +void vmw_cotable_add_resource(struct vmw_resource *res, struct list_head *head) +{ + struct vmw_cotable *vcotbl = + container_of(res, struct vmw_cotable, res); + + list_add_tail(head, &vcotbl->resource_list); +} diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c index b83adea43f3a..fd0cb8c67d05 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c @@ -28,6 +28,7 @@ #include #include "vmwgfx_drv.h" +#include "vmwgfx_binding.h" #include #include #include @@ -127,6 +128,9 @@ #define DRM_IOCTL_VMW_SYNCCPU \ DRM_IOW(DRM_COMMAND_BASE + DRM_VMW_SYNCCPU, \ struct drm_vmw_synccpu_arg) +#define DRM_IOCTL_VMW_CREATE_EXTENDED_CONTEXT \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VMW_CREATE_EXTENDED_CONTEXT, \ + struct drm_vmw_context_arg) /** * The core DRM version of this macro doesn't account for @@ -168,8 +172,8 @@ static const struct drm_ioctl_desc vmw_ioctls[] = { DRM_UNLOCKED | DRM_RENDER_ALLOW), VMW_IOCTL_DEF(VMW_REF_SURFACE, vmw_surface_reference_ioctl, DRM_AUTH | DRM_UNLOCKED | DRM_RENDER_ALLOW), - VMW_IOCTL_DEF(VMW_EXECBUF, vmw_execbuf_ioctl, - DRM_AUTH | DRM_UNLOCKED | DRM_RENDER_ALLOW), + VMW_IOCTL_DEF(VMW_EXECBUF, NULL, DRM_AUTH | DRM_UNLOCKED | + DRM_RENDER_ALLOW), VMW_IOCTL_DEF(VMW_FENCE_WAIT, vmw_fence_obj_wait_ioctl, DRM_UNLOCKED | DRM_RENDER_ALLOW), VMW_IOCTL_DEF(VMW_FENCE_SIGNALED, @@ -206,6 +210,9 @@ static const struct drm_ioctl_desc vmw_ioctls[] = { VMW_IOCTL_DEF(VMW_SYNCCPU, vmw_user_dmabuf_synccpu_ioctl, DRM_UNLOCKED | DRM_RENDER_ALLOW), + VMW_IOCTL_DEF(VMW_CREATE_EXTENDED_CONTEXT, + vmw_extended_context_define_ioctl, + DRM_AUTH | DRM_UNLOCKED | DRM_RENDER_ALLOW), }; static struct pci_device_id vmw_pci_id_list[] = { @@ -390,8 +397,10 @@ static int vmw_request_device(struct vmw_private *dev_priv) } vmw_fence_fifo_up(dev_priv->fman); dev_priv->cman = vmw_cmdbuf_man_create(dev_priv); - if (IS_ERR(dev_priv->cman)) + if (IS_ERR(dev_priv->cman)) { dev_priv->cman = NULL; + dev_priv->has_dx = false; + } ret = vmw_request_device_late(dev_priv); if (ret) @@ -848,6 +857,14 @@ static int vmw_driver_load(struct drm_device *dev, unsigned long chipset) } } + if (dev_priv->has_mob) { + spin_lock(&dev_priv->cap_lock); + vmw_write(dev_priv, SVGA_REG_DEV_CAP, SVGA3D_DEVCAP_DX); + dev_priv->has_dx = !!vmw_read(dev_priv, SVGA_REG_DEV_CAP); + spin_unlock(&dev_priv->cap_lock); + } + + ret = vmw_kms_init(dev_priv); if (unlikely(ret != 0)) goto out_no_kms; @@ -857,6 +874,8 @@ static int vmw_driver_load(struct drm_device *dev, unsigned long chipset) if (ret) goto out_no_fifo; + DRM_INFO("DX: %s\n", dev_priv->has_dx ? "yes." 
: "no."); + if (dev_priv->enable_fb) { vmw_fifo_resource_inc(dev_priv); vmw_svga_enable(dev_priv); @@ -900,6 +919,8 @@ out_err0: for (i = vmw_res_context; i < vmw_res_max; ++i) idr_destroy(&dev_priv->res_idr[i]); + if (dev_priv->ctx.staged_bindings) + vmw_binding_state_free(dev_priv->ctx.staged_bindings); kfree(dev_priv); return ret; } @@ -945,6 +966,8 @@ static int vmw_driver_unload(struct drm_device *dev) iounmap(dev_priv->mmio_virt); arch_phys_wc_del(dev_priv->mmio_mtrr); (void)ttm_bo_device_release(&dev_priv->bdev); + if (dev_priv->ctx.staged_bindings) + vmw_binding_state_free(dev_priv->ctx.staged_bindings); vmw_ttm_global_release(dev_priv); for (i = vmw_res_context; i < vmw_res_max; ++i) @@ -1082,11 +1105,21 @@ static long vmw_generic_ioctl(struct file *filp, unsigned int cmd, const struct drm_ioctl_desc *ioctl = &vmw_ioctls[nr - DRM_COMMAND_BASE]; - if (unlikely(ioctl->cmd != cmd)) { - DRM_ERROR("Invalid command format, ioctl %d\n", - nr - DRM_COMMAND_BASE); - return -EINVAL; + if (nr == DRM_COMMAND_BASE + DRM_VMW_EXECBUF) { + ret = (long) drm_ioctl_permit(ioctl->flags, file_priv); + if (unlikely(ret != 0)) + return ret; + + if (unlikely((cmd & (IOC_IN | IOC_OUT)) != IOC_IN)) + goto out_io_encoding; + + return (long) vmw_execbuf_ioctl(dev, arg, file_priv, + _IOC_SIZE(cmd)); } + + if (unlikely(ioctl->cmd != cmd)) + goto out_io_encoding; + flags = ioctl->flags; } else if (!drm_ioctl_flags(nr, &flags)) return -EINVAL; @@ -1106,6 +1139,12 @@ static long vmw_generic_ioctl(struct file *filp, unsigned int cmd, ttm_read_unlock(&vmaster->lock); return ret; + +out_io_encoding: + DRM_ERROR("Invalid command format, ioctl %d\n", + nr - DRM_COMMAND_BASE); + + return -EINVAL; } static long vmw_unlocked_ioctl(struct file *filp, unsigned int cmd, @@ -1156,7 +1195,6 @@ static void vmw_master_destroy(struct drm_device *dev, kfree(vmaster); } - static int vmw_master_set(struct drm_device *dev, struct drm_file *file_priv, bool from_open) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h index f513e444125d..b88ea50b7d95 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h @@ -59,6 +59,8 @@ #define VMWGFX_NUM_GB_SHADER 20000 #define VMWGFX_NUM_GB_SURFACE 32768 #define VMWGFX_NUM_GB_SCREEN_TARGET VMWGFX_MAX_DISPLAYS +#define VMWGFX_NUM_DXCONTEXT 256 +#define VMWGFX_NUM_DXQUERY 512 #define VMWGFX_NUM_MOB (VMWGFX_NUM_GB_CONTEXT +\ VMWGFX_NUM_GB_SHADER +\ VMWGFX_NUM_GB_SURFACE +\ @@ -132,6 +134,9 @@ enum vmw_res_type { vmw_res_surface, vmw_res_stream, vmw_res_shader, + vmw_res_dx_context, + vmw_res_cotable, + vmw_res_view, vmw_res_max }; @@ -139,7 +144,8 @@ enum vmw_res_type { * Resources that are managed using command streams. 
*/ enum vmw_cmdbuf_res_type { - vmw_cmdbuf_res_compat_shader + vmw_cmdbuf_res_shader, + vmw_cmdbuf_res_view }; struct vmw_cmdbuf_res_manager; @@ -162,11 +168,13 @@ struct vmw_surface { struct drm_vmw_size *sizes; uint32_t num_sizes; bool scanout; + uint32_t array_size; /* TODO so far just a extra pointer */ struct vmw_cursor_snooper snooper; struct vmw_surface_offset *offsets; SVGA3dTextureFilter autogen_filter; uint32_t multisample_count; + struct list_head view_list; }; struct vmw_marker_queue { @@ -186,6 +194,7 @@ struct vmw_fifo_state { struct mutex fifo_mutex; struct rw_semaphore rwsem; struct vmw_marker_queue marker_queue; + bool dx; }; struct vmw_relocation { @@ -265,73 +274,6 @@ struct vmw_piter { struct page *(*page)(struct vmw_piter *); }; -/* - * enum vmw_ctx_binding_type - abstract resource to context binding types - */ -enum vmw_ctx_binding_type { - vmw_ctx_binding_shader, - vmw_ctx_binding_rt, - vmw_ctx_binding_tex, - vmw_ctx_binding_max -}; - -/** - * struct vmw_ctx_bindinfo - structure representing a single context binding - * - * @ctx: Pointer to the context structure. NULL means the binding is not - * active. - * @res: Non ref-counted pointer to the bound resource. - * @bt: The binding type. - * @i1: Union of information needed to unbind. - */ -struct vmw_ctx_bindinfo { - struct vmw_resource *ctx; - struct vmw_resource *res; - enum vmw_ctx_binding_type bt; - bool scrubbed; - union { - SVGA3dShaderType shader_type; - SVGA3dRenderTargetType rt_type; - uint32 texture_stage; - } i1; -}; - -/** - * struct vmw_ctx_binding - structure representing a single context binding - * - suitable for tracking in a context - * - * @ctx_list: List head for context. - * @res_list: List head for bound resource. - * @bi: Binding info - */ -struct vmw_ctx_binding { - struct list_head ctx_list; - struct list_head res_list; - struct vmw_ctx_bindinfo bi; -}; - - -/** - * struct vmw_ctx_binding_state - context binding state - * - * @list: linked list of individual bindings. - * @render_targets: Render target bindings. - * @texture_units: Texture units/samplers bindings. - * @shaders: Shader bindings. - * - * Note that this structure also provides storage space for the individual - * struct vmw_ctx_binding objects, so that no dynamic allocation is needed - * for individual bindings. 
- * - */ -struct vmw_ctx_binding_state { - struct list_head list; - struct vmw_ctx_binding render_targets[SVGA3D_RT_MAX]; - struct vmw_ctx_binding texture_units[SVGA3D_NUM_TEXTURE_UNITS]; - struct vmw_ctx_binding shaders[SVGA3D_SHADERTYPE_PREDX_MAX]; -}; - - /* * enum vmw_display_unit_type - Describes the display unit */ @@ -356,6 +298,7 @@ struct vmw_sw_context{ uint32_t *cmd_bounce; uint32_t cmd_bounce_size; struct list_head resource_list; + struct list_head ctx_resource_list; /* For contexts and cotables */ struct vmw_dma_buffer *cur_query_bo; struct list_head res_relocations; uint32_t *buf_start; @@ -363,8 +306,13 @@ struct vmw_sw_context{ struct vmw_resource *last_query_ctx; bool needs_post_query_barrier; struct vmw_resource *error_resource; - struct vmw_ctx_binding_state staged_bindings; + struct vmw_ctx_binding_state *staged_bindings; + bool staged_bindings_inuse; struct list_head staged_cmd_res; + struct vmw_resource_val_node *dx_ctx_node; + struct vmw_dma_buffer *dx_query_mob; + struct vmw_resource *dx_query_ctx; + struct vmw_cmdbuf_res_manager *man; }; struct vmw_legacy_display; @@ -382,6 +330,26 @@ struct vmw_vga_topology_state { uint32_t pos_y; }; + +/* + * struct vmw_otable - Guest Memory OBject table metadata + * + * @size: Size of the table (page-aligned). + * @page_table: Pointer to a struct vmw_mob holding the page table. + */ +struct vmw_otable { + unsigned long size; + struct vmw_mob *page_table; + bool enabled; +}; + +struct vmw_otable_batch { + unsigned num_otables; + struct vmw_otable *otables; + struct vmw_resource *context; + struct ttm_buffer_object *otable_bo; +}; + struct vmw_private { struct ttm_bo_device bdev; struct ttm_bo_global_ref bo_global_ref; @@ -417,6 +385,7 @@ struct vmw_private { bool has_mob; spinlock_t hw_lock; spinlock_t cap_lock; + bool has_dx; /* * VGA registers. 
@@ -552,8 +521,7 @@ struct vmw_private { /* * Guest Backed stuff */ - struct ttm_buffer_object *otable_bo; - struct vmw_otable *otables; + struct vmw_otable_batch otable_batch; struct vmw_cmdbuf_man *cman; }; @@ -685,6 +653,7 @@ extern int vmw_user_stream_lookup(struct vmw_private *dev_priv, uint32_t *inout_id, struct vmw_resource **out); extern void vmw_resource_unreserve(struct vmw_resource *res, + bool switch_backup, struct vmw_dma_buffer *new_backup, unsigned long new_backup_offset); extern void vmw_resource_move_notify(struct ttm_buffer_object *bo, @@ -742,7 +711,10 @@ extern int vmw_fifo_init(struct vmw_private *dev_priv, extern void vmw_fifo_release(struct vmw_private *dev_priv, struct vmw_fifo_state *fifo); extern void *vmw_fifo_reserve(struct vmw_private *dev_priv, uint32_t bytes); +extern void * +vmw_fifo_reserve_dx(struct vmw_private *dev_priv, uint32_t bytes, int ctx_id); extern void vmw_fifo_commit(struct vmw_private *dev_priv, uint32_t bytes); +extern void vmw_fifo_commit_flush(struct vmw_private *dev_priv, uint32_t bytes); extern int vmw_fifo_send_fence(struct vmw_private *dev_priv, uint32_t *seqno); extern void vmw_fifo_ping_host_locked(struct vmw_private *, uint32_t reason); @@ -828,14 +800,15 @@ static inline struct page *vmw_piter_page(struct vmw_piter *viter) * Command submission - vmwgfx_execbuf.c */ -extern int vmw_execbuf_ioctl(struct drm_device *dev, void *data, - struct drm_file *file_priv); +extern int vmw_execbuf_ioctl(struct drm_device *dev, unsigned long data, + struct drm_file *file_priv, size_t size); extern int vmw_execbuf_process(struct drm_file *file_priv, struct vmw_private *dev_priv, void __user *user_commands, void *kernel_commands, uint32_t command_size, uint64_t throttle_us, + uint32_t dx_context_handle, struct drm_vmw_fence_rep __user *user_fence_rep, struct vmw_fence_obj **out_fence); @@ -960,6 +933,7 @@ int vmw_dumb_destroy(struct drm_file *file_priv, uint32_t handle); extern int vmw_resource_pin(struct vmw_resource *res, bool interruptible); extern void vmw_resource_unpin(struct vmw_resource *res); +extern enum vmw_res_type vmw_res_type(const struct vmw_resource *res); /** * Overlay control - vmwgfx_overlay.c @@ -1016,27 +990,28 @@ extern void vmw_otables_takedown(struct vmw_private *dev_priv); extern const struct vmw_user_resource_conv *user_context_converter; -extern struct vmw_resource *vmw_context_alloc(struct vmw_private *dev_priv); - extern int vmw_context_check(struct vmw_private *dev_priv, struct ttm_object_file *tfile, int id, struct vmw_resource **p_res); extern int vmw_context_define_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); +extern int vmw_extended_context_define_ioctl(struct drm_device *dev, void *data, + struct drm_file *file_priv); extern int vmw_context_destroy_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); -extern int vmw_context_binding_add(struct vmw_ctx_binding_state *cbs, - const struct vmw_ctx_bindinfo *ci); -extern void -vmw_context_binding_state_transfer(struct vmw_resource *res, - struct vmw_ctx_binding_state *cbs); -extern void vmw_context_binding_res_list_kill(struct list_head *head); -extern void vmw_context_binding_res_list_scrub(struct list_head *head); -extern int vmw_context_rebind_all(struct vmw_resource *ctx); extern struct list_head *vmw_context_binding_list(struct vmw_resource *ctx); extern struct vmw_cmdbuf_res_manager * vmw_context_res_man(struct vmw_resource *ctx); +extern struct vmw_resource *vmw_context_cotable(struct vmw_resource *ctx, + SVGACOTableType 
cotable_type);
+struct vmw_ctx_binding_state;
+extern struct vmw_ctx_binding_state *
+vmw_context_binding_state(struct vmw_resource *ctx);
+extern void vmw_dx_context_scrub_cotables(struct vmw_resource *ctx,
+					  bool readback);
+
 /*
  * Surface management - vmwgfx_surface.c
  */
@@ -1066,6 +1041,7 @@ int vmw_surface_gb_priv_define(struct drm_device *dev,
 			       bool for_scanout,
 			       uint32_t num_mip_levels,
 			       uint32_t multisample_count,
+			       uint32_t array_size,
 			       struct drm_vmw_size size,
 			       struct vmw_surface **srf_out);
 
@@ -1085,12 +1061,21 @@ extern int vmw_compat_shader_add(struct vmw_private *dev_priv,
 				 SVGA3dShaderType shader_type,
 				 size_t size,
 				 struct list_head *list);
-extern int vmw_compat_shader_remove(struct vmw_cmdbuf_res_manager *man,
-				    u32 user_key, SVGA3dShaderType shader_type,
-				    struct list_head *list);
+extern int vmw_shader_remove(struct vmw_cmdbuf_res_manager *man,
+			     u32 user_key, SVGA3dShaderType shader_type,
+			     struct list_head *list);
+extern int vmw_dx_shader_add(struct vmw_cmdbuf_res_manager *man,
+			     struct vmw_resource *ctx,
+			     u32 user_key,
+			     SVGA3dShaderType shader_type,
+			     struct list_head *list);
+extern void vmw_dx_shader_cotable_list_scrub(struct vmw_private *dev_priv,
+					     struct list_head *list,
+					     bool readback);
+
 extern struct vmw_resource *
-vmw_compat_shader_lookup(struct vmw_cmdbuf_res_manager *man,
-			 u32 user_key, SVGA3dShaderType shader_type);
+vmw_shader_lookup(struct vmw_cmdbuf_res_manager *man,
+		  u32 user_key, SVGA3dShaderType shader_type);
 
 /*
  * Command buffer managed resources - vmwgfx_cmdbuf_res.c
@@ -1114,8 +1099,20 @@ extern int vmw_cmdbuf_res_add(struct vmw_cmdbuf_res_manager *man,
 extern int vmw_cmdbuf_res_remove(struct vmw_cmdbuf_res_manager *man,
 				 enum vmw_cmdbuf_res_type res_type,
 				 u32 user_key,
-				 struct list_head *list);
+				 struct list_head *list,
+				 struct vmw_resource **res);
+
+/*
+ * COTable management - vmwgfx_cotable.c
+ */
+extern const SVGACOTableType vmw_cotable_scrub_order[];
+extern struct vmw_resource *vmw_cotable_alloc(struct vmw_private *dev_priv,
+					      struct vmw_resource *ctx,
+					      u32 type);
+extern int vmw_cotable_notify(struct vmw_resource *res, int id);
+extern int vmw_cotable_scrub(struct vmw_resource *res, bool readback);
+extern void vmw_cotable_add_resource(struct vmw_resource *ctx,
+				     struct list_head *head);
 
 /*
  * Command buffer management - vmwgfx_cmdbuf.c
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
index 847264f8a33a..401305bbb810 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
@@ -29,6 +29,8 @@
 #include "vmwgfx_reg.h"
 #include <drm/ttm/ttm_bo_api.h>
 #include <drm/ttm/ttm_placement.h>
+#include "vmwgfx_so.h"
+#include "vmwgfx_binding.h"
 
 #define VMW_RES_HT_ORDER 12
 
@@ -59,8 +61,11 @@ struct vmw_resource_relocation {
 * @new_backup_offset: New backup buffer offset if @new_backup is non-NULL.
 * @first_usage: Set to true the first time the resource is referenced in
 * the command stream.
- * @no_buffer_needed: Resources do not need to allocate buffer backup on
- * reservation. The command stream will provide one.
+ * @switching_backup: The command stream provides a new backup buffer for a
+ * resource.
+ * @no_buffer_needed: This means @switching_backup is true on first buffer
+ * reference, so resource reservation does not need to allocate a backup
+ * buffer for the resource.
*/ struct vmw_resource_val_node { struct list_head head; @@ -69,8 +74,9 @@ struct vmw_resource_val_node { struct vmw_dma_buffer *new_backup; struct vmw_ctx_binding_state *staged_bindings; unsigned long new_backup_offset; - bool first_usage; - bool no_buffer_needed; + u32 first_usage : 1; + u32 switching_backup : 1; + u32 no_buffer_needed : 1; }; /** @@ -92,6 +98,10 @@ struct vmw_cmd_entry { [(_cmd) - SVGA_3D_CMD_BASE] = {(_func), (_user_allow),\ (_gb_disable), (_gb_enable)} +static int vmw_resource_context_res_add(struct vmw_private *dev_priv, + struct vmw_sw_context *sw_context, + struct vmw_resource *ctx); + /** * vmw_resource_unreserve - unreserve resources previously reserved for * command submission. @@ -99,15 +109,16 @@ struct vmw_cmd_entry { * @list_head: list of resources to unreserve. * @backoff: Whether command submission failed. */ -static void vmw_resource_list_unreserve(struct list_head *list, +static void vmw_resource_list_unreserve(struct vmw_sw_context *sw_context, + struct list_head *list, bool backoff) { struct vmw_resource_val_node *val; list_for_each_entry(val, list, head) { struct vmw_resource *res = val->res; - struct vmw_dma_buffer *new_backup = - backoff ? NULL : val->new_backup; + bool switch_backup = + (backoff) ? false : val->switching_backup; /* * Transfer staged context bindings to the @@ -115,18 +126,71 @@ static void vmw_resource_list_unreserve(struct list_head *list, */ if (unlikely(val->staged_bindings)) { if (!backoff) { - vmw_context_binding_state_transfer - (val->res, val->staged_bindings); + vmw_binding_state_commit + (vmw_context_binding_state(val->res), + val->staged_bindings); } - kfree(val->staged_bindings); + + if (val->staged_bindings != sw_context->staged_bindings) + vmw_binding_state_free(val->staged_bindings); + else + sw_context->staged_bindings_inuse = false; val->staged_bindings = NULL; } - vmw_resource_unreserve(res, new_backup, - val->new_backup_offset); + vmw_resource_unreserve(res, switch_backup, val->new_backup, + val->new_backup_offset); vmw_dmabuf_unreference(&val->new_backup); } } +/** + * vmw_cmd_ctx_first_setup - Perform the setup needed when a context is + * added to the validate list. + * + * @dev_priv: Pointer to the device private: + * @sw_context: The validation context: + * @node: The validation node holding this context. 
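+ *
+ * Returns 0 on success, negative error code on failure.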
+ */ +static int vmw_cmd_ctx_first_setup(struct vmw_private *dev_priv, + struct vmw_sw_context *sw_context, + struct vmw_resource_val_node *node) +{ + int ret; + + ret = vmw_resource_context_res_add(dev_priv, sw_context, node->res); + if (unlikely(ret != 0)) + goto out_err; + + if (!sw_context->staged_bindings) { + sw_context->staged_bindings = + vmw_binding_state_alloc(dev_priv); + if (IS_ERR(sw_context->staged_bindings)) { + DRM_ERROR("Failed to allocate context binding " + "information.\n"); + ret = PTR_ERR(sw_context->staged_bindings); + sw_context->staged_bindings = NULL; + goto out_err; + } + } + + if (sw_context->staged_bindings_inuse) { + node->staged_bindings = vmw_binding_state_alloc(dev_priv); + if (IS_ERR(node->staged_bindings)) { + DRM_ERROR("Failed to allocate context binding " + "information.\n"); + ret = PTR_ERR(node->staged_bindings); + node->staged_bindings = NULL; + goto out_err; + } + } else { + node->staged_bindings = sw_context->staged_bindings; + sw_context->staged_bindings_inuse = true; + } + + return 0; +out_err: + return ret; +} /** * vmw_resource_val_add - Add a resource to the software context's @@ -141,6 +205,7 @@ static int vmw_resource_val_add(struct vmw_sw_context *sw_context, struct vmw_resource *res, struct vmw_resource_val_node **p_node) { + struct vmw_private *dev_priv = res->dev_priv; struct vmw_resource_val_node *node; struct drm_hash_item *hash; int ret; @@ -169,14 +234,90 @@ static int vmw_resource_val_add(struct vmw_sw_context *sw_context, kfree(node); return ret; } - list_add_tail(&node->head, &sw_context->resource_list); node->res = vmw_resource_reference(res); node->first_usage = true; - if (unlikely(p_node != NULL)) *p_node = node; - return 0; + if (!dev_priv->has_mob) { + list_add_tail(&node->head, &sw_context->resource_list); + return 0; + } + + switch (vmw_res_type(res)) { + case vmw_res_context: + case vmw_res_dx_context: + list_add(&node->head, &sw_context->ctx_resource_list); + ret = vmw_cmd_ctx_first_setup(dev_priv, sw_context, node); + break; + case vmw_res_cotable: + list_add_tail(&node->head, &sw_context->ctx_resource_list); + break; + default: + list_add_tail(&node->head, &sw_context->resource_list); + break; + } + + return ret; +} + +/** + * vmw_view_res_val_add - Add a view and the surface it's pointing to + * to the validation list + * + * @sw_context: The software context holding the validation list. + * @view: Pointer to the view resource. + * + * Returns 0 if success, negative error code otherwise. + */ +static int vmw_view_res_val_add(struct vmw_sw_context *sw_context, + struct vmw_resource *view) +{ + int ret; + + /* + * First add the resource the view is pointing to, otherwise + * it may be swapped out when the view is validated. + */ + ret = vmw_resource_val_add(sw_context, vmw_view_srf(view), NULL); + if (ret) + return ret; + + return vmw_resource_val_add(sw_context, view, NULL); +} + +/** + * vmw_view_id_val_add - Look up a view and add it and the surface it's + * pointing to to the validation list. + * + * @sw_context: The software context holding the validation list. + * @view_type: The view type to look up. + * @id: view id of the view. + * + * The view is represented by a view id and the DX context it's created on, + * or scheduled for creation on. If there is no DX context set, the function + * will return -EINVAL. Otherwise returns 0 on success and -EINVAL on failure. 
+ */ +static int vmw_view_id_val_add(struct vmw_sw_context *sw_context, + enum vmw_view_type view_type, u32 id) +{ + struct vmw_resource_val_node *ctx_node = sw_context->dx_ctx_node; + struct vmw_resource *view; + int ret; + + if (!ctx_node) { + DRM_ERROR("DX Context not set.\n"); + return -EINVAL; + } + + view = vmw_view_lookup(sw_context->man, view_type, id); + if (IS_ERR(view)) + return PTR_ERR(view); + + ret = vmw_view_res_val_add(sw_context, view); + vmw_resource_unreference(&view); + + return ret; } /** @@ -195,19 +336,41 @@ static int vmw_resource_context_res_add(struct vmw_private *dev_priv, struct vmw_resource *ctx) { struct list_head *binding_list; - struct vmw_ctx_binding *entry; + struct vmw_ctx_bindinfo *entry; int ret = 0; struct vmw_resource *res; + u32 i; + + /* Add all cotables to the validation list. */ + if (dev_priv->has_dx && vmw_res_type(ctx) == vmw_res_dx_context) { + for (i = 0; i < SVGA_COTABLE_DX10_MAX; ++i) { + res = vmw_context_cotable(ctx, i); + if (IS_ERR(res)) + continue; + + ret = vmw_resource_val_add(sw_context, res, NULL); + vmw_resource_unreference(&res); + if (unlikely(ret != 0)) + return ret; + } + } + + /* Add all resources bound to the context to the validation list */ mutex_lock(&dev_priv->binding_mutex); binding_list = vmw_context_binding_list(ctx); list_for_each_entry(entry, binding_list, ctx_list) { - res = vmw_resource_reference_unless_doomed(entry->bi.res); + /* entry->res is not refcounted */ + res = vmw_resource_reference_unless_doomed(entry->res); if (unlikely(res == NULL)) continue; - ret = vmw_resource_val_add(sw_context, entry->bi.res, NULL); + if (vmw_res_type(entry->res) == vmw_res_view) + ret = vmw_view_res_val_add(sw_context, entry->res); + else + ret = vmw_resource_val_add(sw_context, entry->res, + NULL); vmw_resource_unreference(&res); if (unlikely(ret != 0)) break; @@ -409,6 +572,7 @@ static int vmw_resources_validate(struct vmw_sw_context *sw_context) list_for_each_entry(val, &sw_context->resource_list, head) { struct vmw_resource *res = val->res; + struct vmw_dma_buffer *backup = res->backup; ret = vmw_resource_validate(res); if (unlikely(ret != 0)) { @@ -416,18 +580,29 @@ static int vmw_resources_validate(struct vmw_sw_context *sw_context) DRM_ERROR("Failed to validate resource.\n"); return ret; } + + /* Check if the resource switched backup buffer */ + if (backup && res->backup && (backup != res->backup)) { + struct vmw_dma_buffer *vbo = res->backup; + + ret = vmw_bo_to_validate_list + (sw_context, vbo, + vmw_resource_needs_backup(res), NULL); + if (ret) { + ttm_bo_unreserve(&vbo->base); + return ret; + } + } } return 0; } - /** * vmw_cmd_res_reloc_add - Add a resource to a software context's * relocation- and validation lists. * * @dev_priv: Pointer to a struct vmw_private identifying the device. * @sw_context: Pointer to the software context. - * @res_type: Resource type. * @id_loc: Pointer to where the id that needs translation is located. * @res: Valid pointer to a struct vmw_resource. 
 * @p_val: If non null, a pointer to the struct vmw_resource_validate_node
@@ -435,7 +610,6 @@ static int vmw_resources_validate(struct vmw_sw_context *sw_context)
  */
 static int vmw_cmd_res_reloc_add(struct vmw_private *dev_priv,
 				 struct vmw_sw_context *sw_context,
-				 enum vmw_res_type res_type,
 				 uint32_t *id_loc,
 				 struct vmw_resource *res,
 				 struct vmw_resource_val_node **p_val)
@@ -454,29 +628,6 @@ static int vmw_cmd_res_reloc_add(struct vmw_private *dev_priv,
 	if (unlikely(ret != 0))
 		return ret;
 
-	if (res_type == vmw_res_context && dev_priv->has_mob &&
-	    node->first_usage) {
-
-		/*
-		 * Put contexts first on the list to be able to exit
-		 * list traversal for contexts early.
-		 */
-		list_del(&node->head);
-		list_add(&node->head, &sw_context->resource_list);
-
-		ret = vmw_resource_context_res_add(dev_priv, sw_context, res);
-		if (unlikely(ret != 0))
-			return ret;
-		node->staged_bindings =
-			kzalloc(sizeof(*node->staged_bindings), GFP_KERNEL);
-		if (node->staged_bindings == NULL) {
-			DRM_ERROR("Failed to allocate context binding "
-				  "information.\n");
-			return -ENOMEM;
-		}
-		INIT_LIST_HEAD(&node->staged_bindings->list);
-	}
-
 	if (p_val)
 		*p_val = node;
 
@@ -554,7 +705,7 @@ vmw_cmd_res_check(struct vmw_private *dev_priv,
 	rcache->res = res;
 	rcache->handle = *id_loc;
 
-	ret = vmw_cmd_res_reloc_add(dev_priv, sw_context, res_type, id_loc,
+	ret = vmw_cmd_res_reloc_add(dev_priv, sw_context, id_loc,
 				    res, &node);
 	if (unlikely(ret != 0))
 		goto out_no_reloc;
@@ -589,7 +740,8 @@ static int vmw_rebind_contexts(struct vmw_sw_context *sw_context)
 		if (unlikely(!val->staged_bindings))
 			break;
 
-		ret = vmw_context_rebind_all(val->res);
+		ret = vmw_binding_rebind_all
+			(vmw_context_binding_state(val->res));
 		if (unlikely(ret != 0)) {
 			if (ret != -ERESTARTSYS)
 				DRM_ERROR("Failed to rebind context.\n");
@@ -600,6 +752,69 @@ static int vmw_rebind_contexts(struct vmw_sw_context *sw_context)
 	return 0;
 }
 
+/**
+ * vmw_view_bindings_add - Add an array of view bindings to a context
+ * binding state tracker.
+ *
+ * @sw_context: The execbuf state used for this command.
+ * @view_type: View type for the bindings.
+ * @binding_type: Binding type for the bindings.
+ * @shader_slot: The shader slot to use for the bindings.
+ * @view_ids: Array of view ids to be bound.
+ * @num_views: Number of view ids in @view_ids.
+ * @first_slot: The binding slot to be used for the first view id in @view_ids.
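+ *
+ * Callers typically derive @num_views from the command header size and
+ * pass the view id array that trails the fixed command body, as
+ * vmw_cmd_dx_set_shader_res() does.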
+ */ +static int vmw_view_bindings_add(struct vmw_sw_context *sw_context, + enum vmw_view_type view_type, + enum vmw_ctx_binding_type binding_type, + uint32 shader_slot, + uint32 view_ids[], u32 num_views, + u32 first_slot) +{ + struct vmw_resource_val_node *ctx_node = sw_context->dx_ctx_node; + struct vmw_cmdbuf_res_manager *man; + u32 i; + int ret; + + if (!ctx_node) { + DRM_ERROR("DX Context not set.\n"); + return -EINVAL; + } + + man = sw_context->man; + for (i = 0; i < num_views; ++i) { + struct vmw_ctx_bindinfo_view binding; + struct vmw_resource *view = NULL; + + if (view_ids[i] != SVGA3D_INVALID_ID) { + view = vmw_view_lookup(man, view_type, view_ids[i]); + if (IS_ERR(view)) { + DRM_ERROR("View not found.\n"); + return PTR_ERR(view); + } + + ret = vmw_view_res_val_add(sw_context, view); + if (ret) { + DRM_ERROR("Could not add view to " + "validation list.\n"); + vmw_resource_unreference(&view); + return ret; + } + } + binding.bi.ctx = ctx_node->res; + binding.bi.res = view; + binding.bi.bt = binding_type; + binding.shader_slot = shader_slot; + binding.slot = first_slot + i; + vmw_binding_add(ctx_node->staged_bindings, &binding.bi, + shader_slot, binding.slot); + if (view) + vmw_resource_unreference(&view); + } + + return 0; +} + /** * vmw_cmd_cid_check - Check a command header for valid context information. * @@ -638,6 +853,12 @@ static int vmw_cmd_set_render_target_check(struct vmw_private *dev_priv, cmd = container_of(header, struct vmw_sid_cmd, header); + if (cmd->body.type >= SVGA3D_RT_MAX) { + DRM_ERROR("Illegal render target type %u.\n", + (unsigned) cmd->body.type); + return -EINVAL; + } + ret = vmw_cmd_res_check(dev_priv, sw_context, vmw_res_context, user_context_converter, &cmd->body.cid, &ctx_node); @@ -651,13 +872,14 @@ static int vmw_cmd_set_render_target_check(struct vmw_private *dev_priv, return ret; if (dev_priv->has_mob) { - struct vmw_ctx_bindinfo bi; + struct vmw_ctx_bindinfo_view binding; - bi.ctx = ctx_node->res; - bi.res = res_node ? res_node->res : NULL; - bi.bt = vmw_ctx_binding_rt; - bi.i1.rt_type = cmd->body.type; - return vmw_context_binding_add(ctx_node->staged_bindings, &bi); + binding.bi.ctx = ctx_node->res; + binding.bi.res = res_node ? res_node->res : NULL; + binding.bi.bt = vmw_ctx_binding_rt; + binding.slot = cmd->body.type; + vmw_binding_add(ctx_node->staged_bindings, + &binding.bi, 0, binding.slot); } return 0; @@ -1364,6 +1586,12 @@ static int vmw_cmd_tex_state(struct vmw_private *dev_priv, if (likely(cur_state->name != SVGA3D_TS_BIND_TEXTURE)) continue; + if (cur_state->stage >= SVGA3D_NUM_TEXTURE_UNITS) { + DRM_ERROR("Illegal texture/sampler unit %u.\n", + (unsigned) cur_state->stage); + return -EINVAL; + } + ret = vmw_cmd_res_check(dev_priv, sw_context, vmw_res_surface, user_surface_converter, &cur_state->value, &res_node); @@ -1371,14 +1599,14 @@ static int vmw_cmd_tex_state(struct vmw_private *dev_priv, return ret; if (dev_priv->has_mob) { - struct vmw_ctx_bindinfo bi; - - bi.ctx = ctx_node->res; - bi.res = res_node ? res_node->res : NULL; - bi.bt = vmw_ctx_binding_tex; - bi.i1.texture_stage = cur_state->stage; - vmw_context_binding_add(ctx_node->staged_bindings, - &bi); + struct vmw_ctx_bindinfo_tex binding; + + binding.bi.ctx = ctx_node->res; + binding.bi.res = res_node ? 
res_node->res : NULL;
+			binding.bi.bt = vmw_ctx_binding_tex;
+			binding.texture_stage = cur_state->stage;
+			vmw_binding_add(ctx_node->staged_bindings, &binding.bi,
+					0, binding.texture_stage);
 		}
 	}
 
@@ -1408,6 +1636,47 @@ static int vmw_cmd_check_define_gmrfb(struct vmw_private *dev_priv,
 	return ret;
 }
 
+
+/**
+ * vmw_cmd_res_switch_backup - Utility function to handle backup buffer
+ * switching
+ *
+ * @dev_priv: Pointer to a device private struct.
+ * @sw_context: The software context being used for this batch.
+ * @val_node: The validation node representing the resource.
+ * @buf_id: Pointer to the user-space backup buffer handle in the command
+ * stream.
+ * @backup_offset: Offset of backup into MOB.
+ *
+ * This function prepares for registering a switch of backup buffers
+ * in the resource metadata just prior to unreserving.
+ */
+static int vmw_cmd_res_switch_backup(struct vmw_private *dev_priv,
+				     struct vmw_sw_context *sw_context,
+				     struct vmw_resource_val_node *val_node,
+				     uint32_t *buf_id,
+				     unsigned long backup_offset)
+{
+	struct vmw_dma_buffer *dma_buf;
+	int ret;
+
+	ret = vmw_translate_mob_ptr(dev_priv, sw_context, buf_id, &dma_buf);
+	if (ret)
+		return ret;
+
+	val_node->switching_backup = true;
+	if (val_node->first_usage)
+		val_node->no_buffer_needed = true;
+
+	vmw_dmabuf_unreference(&val_node->new_backup);
+	val_node->new_backup = dma_buf;
+	val_node->new_backup_offset = backup_offset;
+
+	return 0;
+}
+
+
 /**
  * vmw_cmd_switch_backup - Utility function to handle backup buffer switching
  *
@@ -1421,7 +1690,8 @@ static int vmw_cmd_check_define_gmrfb(struct vmw_private *dev_priv,
  * @backup_offset: Offset of backup into MOB.
  *
  * This function prepares for registering a switch of backup buffers
- * in the resource metadata just prior to unreserving.
+ * in the resource metadata just prior to unreserving. It's basically a wrapper
+ * around vmw_cmd_res_switch_backup with a different interface.
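+ *
+ * The DX path, for example vmw_cmd_dx_bind_shader(), instead calls
+ * vmw_cmd_res_switch_backup() directly with an already looked-up
+ * validation node.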
*/ static int vmw_cmd_switch_backup(struct vmw_private *dev_priv, struct vmw_sw_context *sw_context, @@ -1432,27 +1702,16 @@ static int vmw_cmd_switch_backup(struct vmw_private *dev_priv, uint32_t *buf_id, unsigned long backup_offset) { - int ret; - struct vmw_dma_buffer *dma_buf; struct vmw_resource_val_node *val_node; + int ret; ret = vmw_cmd_res_check(dev_priv, sw_context, res_type, converter, res_id, &val_node); - if (unlikely(ret != 0)) - return ret; - - ret = vmw_translate_mob_ptr(dev_priv, sw_context, buf_id, &dma_buf); - if (unlikely(ret != 0)) + if (ret) return ret; - if (val_node->first_usage) - val_node->no_buffer_needed = true; - - vmw_dmabuf_unreference(&val_node->new_backup); - val_node->new_backup = dma_buf; - val_node->new_backup_offset = backup_offset; - - return 0; + return vmw_cmd_res_switch_backup(dev_priv, sw_context, val_node, + buf_id, backup_offset); } /** @@ -1704,10 +1963,10 @@ static int vmw_cmd_shader_destroy(struct vmw_private *dev_priv, if (unlikely(!dev_priv->has_mob)) return 0; - ret = vmw_compat_shader_remove(vmw_context_res_man(val->res), - cmd->body.shid, - cmd->body.type, - &sw_context->staged_cmd_res); + ret = vmw_shader_remove(vmw_context_res_man(val->res), + cmd->body.shid, + cmd->body.type, + &sw_context->staged_cmd_res); if (unlikely(ret != 0)) return ret; @@ -1735,13 +1994,19 @@ static int vmw_cmd_set_shader(struct vmw_private *dev_priv, SVGA3dCmdSetShader body; } *cmd; struct vmw_resource_val_node *ctx_node, *res_node = NULL; - struct vmw_ctx_bindinfo bi; + struct vmw_ctx_bindinfo_shader binding; struct vmw_resource *res = NULL; int ret; cmd = container_of(header, struct vmw_set_shader_cmd, header); + if (cmd->body.type >= SVGA3D_SHADERTYPE_PREDX_MAX) { + DRM_ERROR("Illegal shader type %u.\n", + (unsigned) cmd->body.type); + return -EINVAL; + } + ret = vmw_cmd_res_check(dev_priv, sw_context, vmw_res_context, user_context_converter, &cmd->body.cid, &ctx_node); @@ -1752,14 +2017,12 @@ static int vmw_cmd_set_shader(struct vmw_private *dev_priv, return 0; if (cmd->body.shid != SVGA3D_INVALID_ID) { - res = vmw_compat_shader_lookup - (vmw_context_res_man(ctx_node->res), - cmd->body.shid, - cmd->body.type); + res = vmw_shader_lookup(vmw_context_res_man(ctx_node->res), + cmd->body.shid, + cmd->body.type); if (!IS_ERR(res)) { ret = vmw_cmd_res_reloc_add(dev_priv, sw_context, - vmw_res_shader, &cmd->body.shid, res, &res_node); vmw_resource_unreference(&res); @@ -1777,11 +2040,13 @@ static int vmw_cmd_set_shader(struct vmw_private *dev_priv, return ret; } - bi.ctx = ctx_node->res; - bi.res = res_node ? res_node->res : NULL; - bi.bt = vmw_ctx_binding_shader; - bi.i1.shader_type = cmd->body.type; - return vmw_context_binding_add(ctx_node->staged_bindings, &bi); + binding.bi.ctx = ctx_node->res; + binding.bi.res = res_node ? res_node->res : NULL; + binding.bi.bt = vmw_ctx_binding_shader; + binding.shader_slot = cmd->body.type - SVGA3D_SHADERTYPE_MIN; + vmw_binding_add(ctx_node->staged_bindings, &binding.bi, + binding.shader_slot, 0); + return 0; } /** @@ -1843,78 +2108,705 @@ static int vmw_cmd_bind_gb_shader(struct vmw_private *dev_priv, cmd->body.offsetInBytes); } -static int vmw_cmd_check_not_3d(struct vmw_private *dev_priv, - struct vmw_sw_context *sw_context, - void *buf, uint32_t *size) +/** + * vmw_cmd_dx_set_single_constant_buffer - Validate an + * SVGA_3D_CMD_DX_SET_SINGLE_CONSTANT_BUFFER command. + * + * @dev_priv: Pointer to a device private struct. + * @sw_context: The software context being used for this batch. 
+ * @header: Pointer to the command header in the command stream. + */ +static int +vmw_cmd_dx_set_single_constant_buffer(struct vmw_private *dev_priv, + struct vmw_sw_context *sw_context, + SVGA3dCmdHeader *header) { - uint32_t size_remaining = *size; - uint32_t cmd_id; + struct { + SVGA3dCmdHeader header; + SVGA3dCmdDXSetSingleConstantBuffer body; + } *cmd; + struct vmw_resource_val_node *res_node = NULL; + struct vmw_resource_val_node *ctx_node = sw_context->dx_ctx_node; + struct vmw_ctx_bindinfo_cb binding; + int ret; - cmd_id = ((uint32_t *)buf)[0]; - switch (cmd_id) { - case SVGA_CMD_UPDATE: - *size = sizeof(uint32_t) + sizeof(SVGAFifoCmdUpdate); - break; - case SVGA_CMD_DEFINE_GMRFB: - *size = sizeof(uint32_t) + sizeof(SVGAFifoCmdDefineGMRFB); - break; - case SVGA_CMD_BLIT_GMRFB_TO_SCREEN: - *size = sizeof(uint32_t) + sizeof(SVGAFifoCmdBlitGMRFBToScreen); - break; - case SVGA_CMD_BLIT_SCREEN_TO_GMRFB: - *size = sizeof(uint32_t) + sizeof(SVGAFifoCmdBlitGMRFBToScreen); - break; - default: - DRM_ERROR("Unsupported SVGA command: %u.\n", cmd_id); + if (unlikely(ctx_node == NULL)) { + DRM_ERROR("DX Context not set.\n"); return -EINVAL; } - if (*size > size_remaining) { - DRM_ERROR("Invalid SVGA command (size mismatch):" - " %u.\n", cmd_id); - return -EINVAL; - } + cmd = container_of(header, typeof(*cmd), header); + ret = vmw_cmd_res_check(dev_priv, sw_context, vmw_res_surface, + user_surface_converter, + &cmd->body.sid, &res_node); + if (unlikely(ret != 0)) + return ret; - if (unlikely(!sw_context->kernel)) { - DRM_ERROR("Kernel only SVGA command: %u.\n", cmd_id); - return -EPERM; + binding.bi.ctx = ctx_node->res; + binding.bi.res = res_node ? res_node->res : NULL; + binding.bi.bt = vmw_ctx_binding_cb; + binding.shader_slot = cmd->body.type - SVGA3D_SHADERTYPE_MIN; + binding.offset = cmd->body.offsetInBytes; + binding.size = cmd->body.sizeInBytes; + binding.slot = cmd->body.slot; + + if (binding.shader_slot >= SVGA3D_NUM_SHADERTYPE_DX10 || + binding.slot >= SVGA3D_DX_MAX_CONSTBUFFERS) { + DRM_ERROR("Illegal const buffer shader %u slot %u.\n", + (unsigned) cmd->body.type, + (unsigned) binding.slot); + return -EINVAL; } - if (cmd_id == SVGA_CMD_DEFINE_GMRFB) - return vmw_cmd_check_define_gmrfb(dev_priv, sw_context, buf); + vmw_binding_add(ctx_node->staged_bindings, &binding.bi, + binding.shader_slot, binding.slot); return 0; } -static const struct vmw_cmd_entry vmw_cmd_entries[SVGA_3D_CMD_MAX] = { - VMW_CMD_DEF(SVGA_3D_CMD_SURFACE_DEFINE, &vmw_cmd_invalid, - false, false, false), - VMW_CMD_DEF(SVGA_3D_CMD_SURFACE_DESTROY, &vmw_cmd_invalid, - false, false, false), - VMW_CMD_DEF(SVGA_3D_CMD_SURFACE_COPY, &vmw_cmd_surface_copy_check, - true, false, false), - VMW_CMD_DEF(SVGA_3D_CMD_SURFACE_STRETCHBLT, &vmw_cmd_stretch_blt_check, - true, false, false), - VMW_CMD_DEF(SVGA_3D_CMD_SURFACE_DMA, &vmw_cmd_dma, - true, false, false), - VMW_CMD_DEF(SVGA_3D_CMD_CONTEXT_DEFINE, &vmw_cmd_invalid, - false, false, false), - VMW_CMD_DEF(SVGA_3D_CMD_CONTEXT_DESTROY, &vmw_cmd_invalid, - false, false, false), - VMW_CMD_DEF(SVGA_3D_CMD_SETTRANSFORM, &vmw_cmd_cid_check, - true, false, false), - VMW_CMD_DEF(SVGA_3D_CMD_SETZRANGE, &vmw_cmd_cid_check, - true, false, false), - VMW_CMD_DEF(SVGA_3D_CMD_SETRENDERSTATE, &vmw_cmd_cid_check, - true, false, false), - VMW_CMD_DEF(SVGA_3D_CMD_SETRENDERTARGET, - &vmw_cmd_set_render_target_check, true, false, false), - VMW_CMD_DEF(SVGA_3D_CMD_SETTEXTURESTATE, &vmw_cmd_tex_state, - true, false, false), - VMW_CMD_DEF(SVGA_3D_CMD_SETMATERIAL, &vmw_cmd_cid_check, - true, false, 
false), - VMW_CMD_DEF(SVGA_3D_CMD_SETLIGHTDATA, &vmw_cmd_cid_check, - true, false, false), +/** + * vmw_cmd_dx_set_shader_res - Validate an + * SVGA_3D_CMD_DX_SET_SHADER_RESOURCES command + * + * @dev_priv: Pointer to a device private struct. + * @sw_context: The software context being used for this batch. + * @header: Pointer to the command header in the command stream. + */ +static int vmw_cmd_dx_set_shader_res(struct vmw_private *dev_priv, + struct vmw_sw_context *sw_context, + SVGA3dCmdHeader *header) +{ + struct { + SVGA3dCmdHeader header; + SVGA3dCmdDXSetShaderResources body; + } *cmd = container_of(header, typeof(*cmd), header); + u32 num_sr_view = (cmd->header.size - sizeof(cmd->body)) / + sizeof(SVGA3dShaderResourceViewId); + + if ((u64) cmd->body.startView + (u64) num_sr_view > + (u64) SVGA3D_DX_MAX_SRVIEWS || + cmd->body.type >= SVGA3D_SHADERTYPE_DX10_MAX) { + DRM_ERROR("Invalid shader binding.\n"); + return -EINVAL; + } + + return vmw_view_bindings_add(sw_context, vmw_view_sr, + vmw_ctx_binding_sr, + cmd->body.type - SVGA3D_SHADERTYPE_MIN, + (void *) &cmd[1], num_sr_view, + cmd->body.startView); +} + +/** + * vmw_cmd_dx_set_shader - Validate an SVGA_3D_CMD_DX_SET_SHADER + * command + * + * @dev_priv: Pointer to a device private struct. + * @sw_context: The software context being used for this batch. + * @header: Pointer to the command header in the command stream. + */ +static int vmw_cmd_dx_set_shader(struct vmw_private *dev_priv, + struct vmw_sw_context *sw_context, + SVGA3dCmdHeader *header) +{ + struct { + SVGA3dCmdHeader header; + SVGA3dCmdDXSetShader body; + } *cmd; + struct vmw_resource *res = NULL; + struct vmw_resource_val_node *ctx_node = sw_context->dx_ctx_node; + struct vmw_ctx_bindinfo_shader binding; + int ret = 0; + + if (unlikely(ctx_node == NULL)) { + DRM_ERROR("DX Context not set.\n"); + return -EINVAL; + } + + cmd = container_of(header, typeof(*cmd), header); + + if (cmd->body.type >= SVGA3D_SHADERTYPE_DX10_MAX) { + DRM_ERROR("Illegal shader type %u.\n", + (unsigned) cmd->body.type); + return -EINVAL; + } + + if (cmd->body.shaderId != SVGA3D_INVALID_ID) { + res = vmw_shader_lookup(sw_context->man, cmd->body.shaderId, 0); + if (IS_ERR(res)) { + DRM_ERROR("Could not find shader for binding.\n"); + return PTR_ERR(res); + } + + ret = vmw_resource_val_add(sw_context, res, NULL); + if (ret) + goto out_unref; + } + + binding.bi.ctx = ctx_node->res; + binding.bi.res = res; + binding.bi.bt = vmw_ctx_binding_dx_shader; + binding.shader_slot = cmd->body.type - SVGA3D_SHADERTYPE_MIN; + + vmw_binding_add(ctx_node->staged_bindings, &binding.bi, + binding.shader_slot, 0); +out_unref: + if (res) + vmw_resource_unreference(&res); + + return ret; +} + +/** + * vmw_cmd_dx_set_vertex_buffers - Validates an + * SVGA_3D_CMD_DX_SET_VERTEX_BUFFERS command + * + * @dev_priv: Pointer to a device private struct. + * @sw_context: The software context being used for this batch. + * @header: Pointer to the command header in the command stream. 
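+ *
+ * The fixed command body is followed by a variable-length array of
+ * SVGA3dVertexBuffer entries; the entry count is derived from the
+ * command header size.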
+ */
+static int vmw_cmd_dx_set_vertex_buffers(struct vmw_private *dev_priv,
+					 struct vmw_sw_context *sw_context,
+					 SVGA3dCmdHeader *header)
+{
+	struct vmw_resource_val_node *ctx_node = sw_context->dx_ctx_node;
+	struct vmw_ctx_bindinfo_vb binding;
+	struct vmw_resource_val_node *res_node;
+	struct {
+		SVGA3dCmdHeader header;
+		SVGA3dCmdDXSetVertexBuffers body;
+		SVGA3dVertexBuffer buf[];
+	} *cmd;
+	int i, ret, num;
+
+	if (unlikely(ctx_node == NULL)) {
+		DRM_ERROR("DX Context not set.\n");
+		return -EINVAL;
+	}
+
+	cmd = container_of(header, typeof(*cmd), header);
+	num = (cmd->header.size - sizeof(cmd->body)) /
+		sizeof(SVGA3dVertexBuffer);
+	if ((u64)num + (u64)cmd->body.startBuffer >
+	    (u64)SVGA3D_DX_MAX_VERTEXBUFFERS) {
+		DRM_ERROR("Invalid number of vertex buffers.\n");
+		return -EINVAL;
+	}
+
+	for (i = 0; i < num; i++) {
+		ret = vmw_cmd_res_check(dev_priv, sw_context, vmw_res_surface,
+					user_surface_converter,
+					&cmd->buf[i].sid, &res_node);
+		if (unlikely(ret != 0))
+			return ret;
+
+		binding.bi.ctx = ctx_node->res;
+		binding.bi.bt = vmw_ctx_binding_vb;
+		binding.bi.res = ((res_node) ? res_node->res : NULL);
+		binding.offset = cmd->buf[i].offset;
+		binding.stride = cmd->buf[i].stride;
+		binding.slot = i + cmd->body.startBuffer;
+
+		vmw_binding_add(ctx_node->staged_bindings, &binding.bi,
+				0, binding.slot);
+	}
+
+	return 0;
+}
+
+/**
+ * vmw_cmd_dx_set_index_buffer - Validate an
+ * SVGA_3D_CMD_DX_SET_INDEX_BUFFER command.
+ *
+ * @dev_priv: Pointer to a device private struct.
+ * @sw_context: The software context being used for this batch.
+ * @header: Pointer to the command header in the command stream.
+ */
+static int vmw_cmd_dx_set_index_buffer(struct vmw_private *dev_priv,
+				       struct vmw_sw_context *sw_context,
+				       SVGA3dCmdHeader *header)
+{
+	struct vmw_resource_val_node *ctx_node = sw_context->dx_ctx_node;
+	struct vmw_ctx_bindinfo_ib binding;
+	struct vmw_resource_val_node *res_node;
+	struct {
+		SVGA3dCmdHeader header;
+		SVGA3dCmdDXSetIndexBuffer body;
+	} *cmd;
+	int ret;
+
+	if (unlikely(ctx_node == NULL)) {
+		DRM_ERROR("DX Context not set.\n");
+		return -EINVAL;
+	}
+
+	cmd = container_of(header, typeof(*cmd), header);
+	ret = vmw_cmd_res_check(dev_priv, sw_context, vmw_res_surface,
+				user_surface_converter,
+				&cmd->body.sid, &res_node);
+	if (unlikely(ret != 0))
+		return ret;
+
+	binding.bi.ctx = ctx_node->res;
+	binding.bi.res = ((res_node) ? res_node->res : NULL);
+	binding.bi.bt = vmw_ctx_binding_ib;
+	binding.offset = cmd->body.offset;
+	binding.format = cmd->body.format;
+
+	vmw_binding_add(ctx_node->staged_bindings, &binding.bi, 0, 0);
+
+	return 0;
+}
+
+/**
+ * vmw_cmd_dx_set_rendertargets - Validate an
+ * SVGA_3D_CMD_DX_SET_RENDERTARGETS command
+ *
+ * @dev_priv: Pointer to a device private struct.
+ * @sw_context: The software context being used for this batch.
+ * @header: Pointer to the command header in the command stream.
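+ *
+ * The fixed command body carries the depth-stencil view id; the render
+ * target view ids follow as a trailing array whose length is derived
+ * from the command header size.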
+ */
+static int vmw_cmd_dx_set_rendertargets(struct vmw_private *dev_priv,
+					struct vmw_sw_context *sw_context,
+					SVGA3dCmdHeader *header)
+{
+	struct {
+		SVGA3dCmdHeader header;
+		SVGA3dCmdDXSetRenderTargets body;
+	} *cmd = container_of(header, typeof(*cmd), header);
+	int ret;
+	u32 num_rt_view = (cmd->header.size - sizeof(cmd->body)) /
+		sizeof(SVGA3dRenderTargetViewId);
+
+	if (num_rt_view > SVGA3D_MAX_SIMULTANEOUS_RENDER_TARGETS) {
+		DRM_ERROR("Invalid DX Rendertarget binding.\n");
+		return -EINVAL;
+	}
+
+	ret = vmw_view_bindings_add(sw_context, vmw_view_ds,
+				    vmw_ctx_binding_ds, 0,
+				    &cmd->body.depthStencilViewId, 1, 0);
+	if (ret)
+		return ret;
+
+	return vmw_view_bindings_add(sw_context, vmw_view_rt,
+				     vmw_ctx_binding_dx_rt, 0,
+				     (void *)&cmd[1], num_rt_view, 0);
+}
+
+/**
+ * vmw_cmd_dx_clear_rendertarget_view - Validate an
+ * SVGA_3D_CMD_DX_CLEAR_RENDERTARGET_VIEW command
+ *
+ * @dev_priv: Pointer to a device private struct.
+ * @sw_context: The software context being used for this batch.
+ * @header: Pointer to the command header in the command stream.
+ */
+static int vmw_cmd_dx_clear_rendertarget_view(struct vmw_private *dev_priv,
+					      struct vmw_sw_context *sw_context,
+					      SVGA3dCmdHeader *header)
+{
+	struct {
+		SVGA3dCmdHeader header;
+		SVGA3dCmdDXClearRenderTargetView body;
+	} *cmd = container_of(header, typeof(*cmd), header);
+
+	return vmw_view_id_val_add(sw_context, vmw_view_rt,
+				   cmd->body.renderTargetViewId);
+}
+
+/**
+ * vmw_cmd_dx_clear_depthstencil_view - Validate an
+ * SVGA_3D_CMD_DX_CLEAR_DEPTHSTENCIL_VIEW command
+ *
+ * @dev_priv: Pointer to a device private struct.
+ * @sw_context: The software context being used for this batch.
+ * @header: Pointer to the command header in the command stream.
+ */
+static int vmw_cmd_dx_clear_depthstencil_view(struct vmw_private *dev_priv,
+					      struct vmw_sw_context *sw_context,
+					      SVGA3dCmdHeader *header)
+{
+	struct {
+		SVGA3dCmdHeader header;
+		SVGA3dCmdDXClearDepthStencilView body;
+	} *cmd = container_of(header, typeof(*cmd), header);
+
+	return vmw_view_id_val_add(sw_context, vmw_view_ds,
+				   cmd->body.depthStencilViewId);
+}
+
+static int vmw_cmd_dx_view_define(struct vmw_private *dev_priv,
+				  struct vmw_sw_context *sw_context,
+				  SVGA3dCmdHeader *header)
+{
+	struct vmw_resource_val_node *ctx_node = sw_context->dx_ctx_node;
+	struct vmw_resource_val_node *srf_node;
+	struct vmw_resource *res;
+	enum vmw_view_type view_type;
+	int ret;
+	/*
+	 * This is based on the fact that all affected define commands have
+	 * the same initial command body layout.
+ */ + struct { + SVGA3dCmdHeader header; + uint32 defined_id; + uint32 sid; + } *cmd; + + if (unlikely(ctx_node == NULL)) { + DRM_ERROR("DX Context not set.\n"); + return -EINVAL; + } + + view_type = vmw_view_cmd_to_type(header->id); + cmd = container_of(header, typeof(*cmd), header); + ret = vmw_cmd_res_check(dev_priv, sw_context, vmw_res_surface, + user_surface_converter, + &cmd->sid, &srf_node); + if (unlikely(ret != 0)) + return ret; + + res = vmw_context_cotable(ctx_node->res, vmw_view_cotables[view_type]); + ret = vmw_cotable_notify(res, cmd->defined_id); + vmw_resource_unreference(&res); + if (unlikely(ret != 0)) + return ret; + + return vmw_view_add(sw_context->man, + ctx_node->res, + srf_node->res, + view_type, + cmd->defined_id, + header, + header->size + sizeof(*header), + &sw_context->staged_cmd_res); +} + +static int vmw_cmd_dx_so_define(struct vmw_private *dev_priv, + struct vmw_sw_context *sw_context, + SVGA3dCmdHeader *header) +{ + struct vmw_resource_val_node *ctx_node = sw_context->dx_ctx_node; + struct vmw_resource *res; + /* + * This is based on the fact that all affected define commands have + * the same initial command body layout. + */ + struct { + SVGA3dCmdHeader header; + uint32 defined_id; + } *cmd; + enum vmw_so_type so_type; + int ret; + + if (unlikely(ctx_node == NULL)) { + DRM_ERROR("DX Context not set.\n"); + return -EINVAL; + } + + so_type = vmw_so_cmd_to_type(header->id); + res = vmw_context_cotable(ctx_node->res, vmw_so_cotables[so_type]); + cmd = container_of(header, typeof(*cmd), header); + ret = vmw_cotable_notify(res, cmd->defined_id); + vmw_resource_unreference(&res); + + return ret; +} + +/** + * vmw_cmd_dx_check_subresource - Validate an + * SVGA_3D_CMD_DX_[X]_SUBRESOURCE command + * + * @dev_priv: Pointer to a device private struct. + * @sw_context: The software context being used for this batch. + * @header: Pointer to the command header in the command stream. + */ +static int vmw_cmd_dx_check_subresource(struct vmw_private *dev_priv, + struct vmw_sw_context *sw_context, + SVGA3dCmdHeader *header) +{ + struct { + SVGA3dCmdHeader header; + union { + SVGA3dCmdDXReadbackSubResource r_body; + SVGA3dCmdDXInvalidateSubResource i_body; + SVGA3dCmdDXUpdateSubResource u_body; + SVGA3dSurfaceId sid; + }; + } *cmd; + + BUILD_BUG_ON(offsetof(typeof(*cmd), r_body.sid) != + offsetof(typeof(*cmd), sid)); + BUILD_BUG_ON(offsetof(typeof(*cmd), i_body.sid) != + offsetof(typeof(*cmd), sid)); + BUILD_BUG_ON(offsetof(typeof(*cmd), u_body.sid) != + offsetof(typeof(*cmd), sid)); + + cmd = container_of(header, typeof(*cmd), header); + + return vmw_cmd_res_check(dev_priv, sw_context, vmw_res_surface, + user_surface_converter, + &cmd->sid, NULL); +} + +static int vmw_cmd_dx_cid_check(struct vmw_private *dev_priv, + struct vmw_sw_context *sw_context, + SVGA3dCmdHeader *header) +{ + struct vmw_resource_val_node *ctx_node = sw_context->dx_ctx_node; + + if (unlikely(ctx_node == NULL)) { + DRM_ERROR("DX Context not set.\n"); + return -EINVAL; + } + + return 0; +} + +/** + * vmw_cmd_dx_view_remove - validate a view remove command and + * schedule the view resource for removal. + * + * @dev_priv: Pointer to a device private struct. + * @sw_context: The software context being used for this batch. + * @header: Pointer to the command header in the command stream. + * + * Check that the view exists, and if it was not created using this + * command batch, make sure it's validated (present in the device) so that + * the remove command will not confuse the device. 
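+ *
+ * vmw_view_remove() returns the view only when it was not created in
+ * this batch, in which case it is added to the validation list below.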
+ */ +static int vmw_cmd_dx_view_remove(struct vmw_private *dev_priv, + struct vmw_sw_context *sw_context, + SVGA3dCmdHeader *header) +{ + struct vmw_resource_val_node *ctx_node = sw_context->dx_ctx_node; + struct { + SVGA3dCmdHeader header; + union vmw_view_destroy body; + } *cmd = container_of(header, typeof(*cmd), header); + enum vmw_view_type view_type = vmw_view_cmd_to_type(header->id); + struct vmw_resource *view; + int ret; + + if (!ctx_node) { + DRM_ERROR("DX Context not set.\n"); + return -EINVAL; + } + + ret = vmw_view_remove(sw_context->man, + cmd->body.view_id, view_type, + &sw_context->staged_cmd_res, + &view); + if (ret || !view) + return ret; + + /* + * Add view to the validate list iff it was not created using this + * command batch. + */ + return vmw_view_res_val_add(sw_context, view); +} + +/** + * vmw_cmd_dx_define_shader - Validate an SVGA_3D_CMD_DX_DEFINE_SHADER + * command + * + * @dev_priv: Pointer to a device private struct. + * @sw_context: The software context being used for this batch. + * @header: Pointer to the command header in the command stream. + */ +static int vmw_cmd_dx_define_shader(struct vmw_private *dev_priv, + struct vmw_sw_context *sw_context, + SVGA3dCmdHeader *header) +{ + struct vmw_resource_val_node *ctx_node = sw_context->dx_ctx_node; + struct vmw_resource *res; + struct { + SVGA3dCmdHeader header; + SVGA3dCmdDXDefineShader body; + } *cmd = container_of(header, typeof(*cmd), header); + int ret; + + if (!ctx_node) { + DRM_ERROR("DX Context not set.\n"); + return -EINVAL; + } + + res = vmw_context_cotable(ctx_node->res, SVGA_COTABLE_DXSHADER); + ret = vmw_cotable_notify(res, cmd->body.shaderId); + vmw_resource_unreference(&res); + if (ret) + return ret; + + return vmw_dx_shader_add(sw_context->man, ctx_node->res, + cmd->body.shaderId, cmd->body.type, + &sw_context->staged_cmd_res); +} + +/** + * vmw_cmd_dx_destroy_shader - Validate an SVGA_3D_CMD_DX_DESTROY_SHADER + * command + * + * @dev_priv: Pointer to a device private struct. + * @sw_context: The software context being used for this batch. + * @header: Pointer to the command header in the command stream. + */ +static int vmw_cmd_dx_destroy_shader(struct vmw_private *dev_priv, + struct vmw_sw_context *sw_context, + SVGA3dCmdHeader *header) +{ + struct vmw_resource_val_node *ctx_node = sw_context->dx_ctx_node; + struct { + SVGA3dCmdHeader header; + SVGA3dCmdDXDestroyShader body; + } *cmd = container_of(header, typeof(*cmd), header); + int ret; + + if (!ctx_node) { + DRM_ERROR("DX Context not set.\n"); + return -EINVAL; + } + + ret = vmw_shader_remove(sw_context->man, cmd->body.shaderId, 0, + &sw_context->staged_cmd_res); + if (ret) + DRM_ERROR("Could not find shader to remove.\n"); + + return ret; +} + +/** + * vmw_cmd_dx_bind_shader - Validate an SVGA_3D_CMD_DX_BIND_SHADER + * command + * + * @dev_priv: Pointer to a device private struct. + * @sw_context: The software context being used for this batch. + * @header: Pointer to the command header in the command stream. 
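+ *
+ * The command may name the context explicitly through cmd->body.cid,
+ * or pass SVGA3D_INVALID_ID to bind on the currently set DX context.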
+ */ +static int vmw_cmd_dx_bind_shader(struct vmw_private *dev_priv, + struct vmw_sw_context *sw_context, + SVGA3dCmdHeader *header) +{ + struct vmw_resource_val_node *ctx_node; + struct vmw_resource_val_node *res_node; + struct vmw_resource *res; + struct { + SVGA3dCmdHeader header; + SVGA3dCmdDXBindShader body; + } *cmd = container_of(header, typeof(*cmd), header); + int ret; + + if (cmd->body.cid != SVGA3D_INVALID_ID) { + ret = vmw_cmd_res_check(dev_priv, sw_context, vmw_res_context, + user_context_converter, + &cmd->body.cid, &ctx_node); + if (ret) + return ret; + } else { + ctx_node = sw_context->dx_ctx_node; + if (!ctx_node) { + DRM_ERROR("DX Context not set.\n"); + return -EINVAL; + } + } + + res = vmw_shader_lookup(vmw_context_res_man(ctx_node->res), + cmd->body.shid, 0); + if (IS_ERR(res)) { + DRM_ERROR("Could not find shader to bind.\n"); + return PTR_ERR(res); + } + + ret = vmw_resource_val_add(sw_context, res, &res_node); + if (ret) { + DRM_ERROR("Error creating resource validation node.\n"); + goto out_unref; + } + + + ret = vmw_cmd_res_switch_backup(dev_priv, sw_context, res_node, + &cmd->body.mobid, + cmd->body.offsetInBytes); +out_unref: + vmw_resource_unreference(&res); + + return ret; +} + +static int vmw_cmd_check_not_3d(struct vmw_private *dev_priv, + struct vmw_sw_context *sw_context, + void *buf, uint32_t *size) +{ + uint32_t size_remaining = *size; + uint32_t cmd_id; + + cmd_id = ((uint32_t *)buf)[0]; + switch (cmd_id) { + case SVGA_CMD_UPDATE: + *size = sizeof(uint32_t) + sizeof(SVGAFifoCmdUpdate); + break; + case SVGA_CMD_DEFINE_GMRFB: + *size = sizeof(uint32_t) + sizeof(SVGAFifoCmdDefineGMRFB); + break; + case SVGA_CMD_BLIT_GMRFB_TO_SCREEN: + *size = sizeof(uint32_t) + sizeof(SVGAFifoCmdBlitGMRFBToScreen); + break; + case SVGA_CMD_BLIT_SCREEN_TO_GMRFB: + *size = sizeof(uint32_t) + sizeof(SVGAFifoCmdBlitGMRFBToScreen); + break; + default: + DRM_ERROR("Unsupported SVGA command: %u.\n", cmd_id); + return -EINVAL; + } + + if (*size > size_remaining) { + DRM_ERROR("Invalid SVGA command (size mismatch):" + " %u.\n", cmd_id); + return -EINVAL; + } + + if (unlikely(!sw_context->kernel)) { + DRM_ERROR("Kernel only SVGA command: %u.\n", cmd_id); + return -EPERM; + } + + if (cmd_id == SVGA_CMD_DEFINE_GMRFB) + return vmw_cmd_check_define_gmrfb(dev_priv, sw_context, buf); + + return 0; +} + +static const struct vmw_cmd_entry vmw_cmd_entries[SVGA_3D_CMD_MAX] = { + VMW_CMD_DEF(SVGA_3D_CMD_SURFACE_DEFINE, &vmw_cmd_invalid, + false, false, false), + VMW_CMD_DEF(SVGA_3D_CMD_SURFACE_DESTROY, &vmw_cmd_invalid, + false, false, false), + VMW_CMD_DEF(SVGA_3D_CMD_SURFACE_COPY, &vmw_cmd_surface_copy_check, + true, false, false), + VMW_CMD_DEF(SVGA_3D_CMD_SURFACE_STRETCHBLT, &vmw_cmd_stretch_blt_check, + true, false, false), + VMW_CMD_DEF(SVGA_3D_CMD_SURFACE_DMA, &vmw_cmd_dma, + true, false, false), + VMW_CMD_DEF(SVGA_3D_CMD_CONTEXT_DEFINE, &vmw_cmd_invalid, + false, false, false), + VMW_CMD_DEF(SVGA_3D_CMD_CONTEXT_DESTROY, &vmw_cmd_invalid, + false, false, false), + VMW_CMD_DEF(SVGA_3D_CMD_SETTRANSFORM, &vmw_cmd_cid_check, + true, false, false), + VMW_CMD_DEF(SVGA_3D_CMD_SETZRANGE, &vmw_cmd_cid_check, + true, false, false), + VMW_CMD_DEF(SVGA_3D_CMD_SETRENDERSTATE, &vmw_cmd_cid_check, + true, false, false), + VMW_CMD_DEF(SVGA_3D_CMD_SETRENDERTARGET, + &vmw_cmd_set_render_target_check, true, false, false), + VMW_CMD_DEF(SVGA_3D_CMD_SETTEXTURESTATE, &vmw_cmd_tex_state, + true, false, false), + VMW_CMD_DEF(SVGA_3D_CMD_SETMATERIAL, &vmw_cmd_cid_check, + true, false, false), + 
VMW_CMD_DEF(SVGA_3D_CMD_SETLIGHTDATA, &vmw_cmd_cid_check, + true, false, false), VMW_CMD_DEF(SVGA_3D_CMD_SETLIGHTENABLED, &vmw_cmd_cid_check, true, false, false), VMW_CMD_DEF(SVGA_3D_CMD_SETVIEWPORT, &vmw_cmd_cid_check, @@ -2050,7 +2942,136 @@ static const struct vmw_cmd_entry vmw_cmd_entries[SVGA_3D_CMD_MAX] = { VMW_CMD_DEF(SVGA_3D_CMD_INVALIDATE_GB_IMAGE_PARTIAL, &vmw_cmd_invalid, false, false, true), VMW_CMD_DEF(SVGA_3D_CMD_SET_GB_SHADERCONSTS_INLINE, &vmw_cmd_cid_check, - true, false, true) + true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_GB_SCREEN_DMA, &vmw_cmd_invalid, + false, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_BIND_GB_SURFACE_WITH_PITCH, &vmw_cmd_invalid, + false, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_GB_MOB_FENCE, &vmw_cmd_invalid, + false, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DEFINE_GB_SURFACE_V2, &vmw_cmd_invalid, + false, false, true), + + /* + * DX commands + */ + VMW_CMD_DEF(SVGA_3D_CMD_DX_DEFINE_CONTEXT, &vmw_cmd_invalid, + false, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_DESTROY_CONTEXT, &vmw_cmd_invalid, + false, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_BIND_CONTEXT, &vmw_cmd_invalid, + false, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_READBACK_CONTEXT, &vmw_cmd_invalid, + false, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_INVALIDATE_CONTEXT, &vmw_cmd_invalid, + false, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_SET_SINGLE_CONSTANT_BUFFER, + &vmw_cmd_dx_set_single_constant_buffer, true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_SET_SHADER_RESOURCES, + &vmw_cmd_dx_set_shader_res, true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_SET_SHADER, &vmw_cmd_dx_set_shader, + true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_DRAW_INSTANCED, &vmw_cmd_invalid, + true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_DRAW_INDEXED_INSTANCED, &vmw_cmd_invalid, + true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_DRAW_AUTO, &vmw_cmd_invalid, + true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_SET_VERTEX_BUFFERS, + &vmw_cmd_dx_set_vertex_buffers, true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_SET_INDEX_BUFFER, + &vmw_cmd_dx_set_index_buffer, true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_SET_RENDERTARGETS, + &vmw_cmd_dx_set_rendertargets, true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_SET_BLEND_STATE, &vmw_cmd_dx_cid_check, + true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_SET_RASTERIZER_STATE, &vmw_cmd_dx_cid_check, + true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_SET_DEPTHSTENCIL_STATE, + &vmw_cmd_dx_cid_check, + true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_DEFINE_QUERY, &vmw_cmd_invalid, + true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_DESTROY_QUERY, &vmw_cmd_invalid, + true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_BIND_QUERY, &vmw_cmd_invalid, + true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_BEGIN_QUERY, &vmw_cmd_invalid, + true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_END_QUERY, &vmw_cmd_invalid, + true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_READBACK_QUERY, &vmw_cmd_invalid, + true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_SET_PREDICATION, &vmw_cmd_invalid, + true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_SET_VIEWPORTS, &vmw_cmd_dx_cid_check, + true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_SET_SCISSORRECTS, &vmw_cmd_dx_cid_check, + true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_CLEAR_RENDERTARGET_VIEW, + &vmw_cmd_dx_clear_rendertarget_view, true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_CLEAR_DEPTHSTENCIL_VIEW, + &vmw_cmd_dx_clear_depthstencil_view, true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_PRED_COPY_REGION, &vmw_cmd_invalid, + true, false, 
true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_PRED_COPY, &vmw_cmd_invalid, + true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_GENMIPS, &vmw_cmd_invalid, + true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_UPDATE_SUBRESOURCE, + &vmw_cmd_dx_check_subresource, true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_READBACK_SUBRESOURCE, + &vmw_cmd_dx_check_subresource, true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_INVALIDATE_SUBRESOURCE, + &vmw_cmd_dx_check_subresource, true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_DEFINE_SHADERRESOURCE_VIEW, + &vmw_cmd_dx_view_define, true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_DESTROY_SHADERRESOURCE_VIEW, + &vmw_cmd_dx_view_remove, true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_DEFINE_RENDERTARGET_VIEW, + &vmw_cmd_dx_view_define, true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_DESTROY_RENDERTARGET_VIEW, + &vmw_cmd_dx_view_remove, true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_DEFINE_DEPTHSTENCIL_VIEW, + &vmw_cmd_dx_view_define, true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_DESTROY_DEPTHSTENCIL_VIEW, + &vmw_cmd_dx_view_remove, true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_DEFINE_ELEMENTLAYOUT, + &vmw_cmd_dx_so_define, true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_DESTROY_ELEMENTLAYOUT, + &vmw_cmd_dx_cid_check, true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_DEFINE_BLEND_STATE, + &vmw_cmd_dx_so_define, true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_DESTROY_BLEND_STATE, + &vmw_cmd_dx_cid_check, true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_DEFINE_DEPTHSTENCIL_STATE, + &vmw_cmd_dx_so_define, true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_DESTROY_DEPTHSTENCIL_STATE, + &vmw_cmd_dx_cid_check, true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_DEFINE_RASTERIZER_STATE, + &vmw_cmd_dx_so_define, true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_DESTROY_RASTERIZER_STATE, + &vmw_cmd_dx_cid_check, true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_DEFINE_SAMPLER_STATE, + &vmw_cmd_dx_so_define, true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_DESTROY_SAMPLER_STATE, + &vmw_cmd_dx_cid_check, true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_DEFINE_SHADER, + &vmw_cmd_dx_define_shader, true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_DESTROY_SHADER, + &vmw_cmd_dx_destroy_shader, true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_BIND_SHADER, + &vmw_cmd_dx_bind_shader, true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_DEFINE_STREAMOUTPUT, + &vmw_cmd_dx_so_define, true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_DESTROY_STREAMOUTPUT, + &vmw_cmd_dx_cid_check, true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_SET_STREAMOUTPUT, &vmw_cmd_invalid, + true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_SET_INPUT_LAYOUT, + &vmw_cmd_dx_cid_check, true, false, true), + VMW_CMD_DEF(SVGA_3D_CMD_DX_SET_TOPOLOGY, + &vmw_cmd_dx_cid_check, true, false, true), }; static int vmw_cmd_check(struct vmw_private *dev_priv, @@ -2183,7 +3204,8 @@ static void vmw_apply_relocations(struct vmw_sw_context *sw_context) * * @list: The resource list. 
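+ * @sw_context: The software context; used to tell whether a node's
+ * staged bindings are the context's shared binding state.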
 */
-static void vmw_resource_list_unreference(struct list_head *list)
+static void vmw_resource_list_unreference(struct vmw_sw_context *sw_context,
+					  struct list_head *list)
 {
 	struct vmw_resource_val_node *val, *val_next;
@@ -2194,8 +3216,15 @@ static void vmw_resource_list_unreference(struct list_head *list)
 	list_for_each_entry_safe(val, val_next, list, head) {
 		list_del_init(&val->head);
 		vmw_resource_unreference(&val->res);
-		if (unlikely(val->staged_bindings))
-			kfree(val->staged_bindings);
+
+		if (val->staged_bindings) {
+			if (val->staged_bindings != sw_context->staged_bindings)
+				vmw_binding_state_free(val->staged_bindings);
+			else
+				sw_context->staged_bindings_inuse = false;
+			val->staged_bindings = NULL;
+		}
+
 		kfree(val);
 	}
 }
@@ -2431,8 +3460,13 @@ static int vmw_execbuf_submit_fifo(struct vmw_private *dev_priv,
 				   u32 command_size,
 				   struct vmw_sw_context *sw_context)
 {
-	void *cmd = vmw_fifo_reserve(dev_priv, command_size);
+	void *cmd;
 
+	if (sw_context->dx_ctx_node)
+		cmd = vmw_fifo_reserve_dx(dev_priv, command_size,
+					  sw_context->dx_ctx_node->res->id);
+	else
+		cmd = vmw_fifo_reserve(dev_priv, command_size);
 	if (!cmd) {
 		DRM_ERROR("Failed reserving fifo space for commands.\n");
 		return -ENOMEM;
@@ -2464,8 +3498,10 @@ static int vmw_execbuf_submit_cmdbuf(struct vmw_private *dev_priv,
 				     u32 command_size,
 				     struct vmw_sw_context *sw_context)
 {
+	u32 id = ((sw_context->dx_ctx_node) ? sw_context->dx_ctx_node->res->id :
+		  SVGA3D_INVALID_ID);
 	void *cmd = vmw_cmdbuf_reserve(dev_priv->cman, command_size,
-				       SVGA3D_INVALID_ID, false, header);
+				       id, false, header);
 
 	vmw_apply_relocations(sw_context);
 	vmw_resource_relocations_apply(cmd, &sw_context->res_relocations);
@@ -2535,12 +3571,44 @@ static void *vmw_execbuf_cmdbuf(struct vmw_private *dev_priv,
 	return kernel_commands;
 }
 
+static int vmw_execbuf_tie_context(struct vmw_private *dev_priv,
+				   struct vmw_sw_context *sw_context,
+				   uint32_t handle)
+{
+	struct vmw_resource_val_node *ctx_node;
+	struct vmw_resource *res;
+	int ret;
+
+	if (handle == SVGA3D_INVALID_ID)
+		return 0;
+
+	ret = vmw_user_resource_lookup_handle(dev_priv, sw_context->fp->tfile,
+					      handle, user_context_converter,
+					      &res);
+	if (unlikely(ret != 0)) {
+		DRM_ERROR("Could not find or use DX context 0x%08x.\n",
+			  (unsigned) handle);
+		return ret;
+	}
+
+	ret = vmw_resource_val_add(sw_context, res, &ctx_node);
+	if (unlikely(ret != 0))
+		goto out_err;
+
+	sw_context->dx_ctx_node = ctx_node;
+	sw_context->man = vmw_context_res_man(res);
+out_err:
+	vmw_resource_unreference(&res);
+	return ret;
+}
+
 int vmw_execbuf_process(struct drm_file *file_priv,
 			struct vmw_private *dev_priv,
 			void __user *user_commands,
 			void *kernel_commands,
 			uint32_t command_size,
 			uint64_t throttle_us,
+			uint32_t dx_context_handle,
 			struct drm_vmw_fence_rep __user *user_fence_rep,
 			struct vmw_fence_obj **out_fence)
 {
@@ -2596,12 +3664,17 @@ int vmw_execbuf_process(struct drm_file *file_priv,
 	sw_context->cur_reloc = 0;
 	sw_context->cur_val_buf = 0;
 	INIT_LIST_HEAD(&sw_context->resource_list);
+	INIT_LIST_HEAD(&sw_context->ctx_resource_list);
 	sw_context->cur_query_bo = dev_priv->pinned_bo;
 	sw_context->last_query_ctx = NULL;
 	sw_context->needs_post_query_barrier = false;
+	sw_context->dx_ctx_node = NULL;
 	memset(sw_context->res_cache, 0, sizeof(sw_context->res_cache));
 	INIT_LIST_HEAD(&sw_context->validate_nodes);
 	INIT_LIST_HEAD(&sw_context->res_relocations);
+	if (sw_context->staged_bindings)
+		vmw_binding_state_reset(sw_context->staged_bindings);
+
 	if (!sw_context->res_ht_initialized) {
 		ret = drm_ht_create(&sw_context->res_ht,
VMW_RES_HT_ORDER); if (unlikely(ret != 0)) @@ -2610,11 +3683,20 @@ int vmw_execbuf_process(struct drm_file *file_priv, } INIT_LIST_HEAD(&sw_context->staged_cmd_res); INIT_LIST_HEAD(&resource_list); + ret = vmw_execbuf_tie_context(dev_priv, sw_context, dx_context_handle); + if (unlikely(ret != 0)) { + list_splice_init(&sw_context->ctx_resource_list, + &sw_context->resource_list); + goto out_err_nores; + } + ret = vmw_cmd_check_all(dev_priv, sw_context, kernel_commands, command_size); if (unlikely(ret != 0)) goto out_err_nores; + list_splice_init(&sw_context->ctx_resource_list, + &sw_context->resource_list); ret = vmw_resources_reserve(sw_context); if (unlikely(ret != 0)) goto out_err_nores; @@ -2622,7 +3704,7 @@ int vmw_execbuf_process(struct drm_file *file_priv, ret = ttm_eu_reserve_buffers(&ticket, &sw_context->validate_nodes, true, NULL); if (unlikely(ret != 0)) - goto out_err; + goto out_err_nores; ret = vmw_validate_buffers(dev_priv, sw_context); if (unlikely(ret != 0)) @@ -2652,8 +3734,9 @@ int vmw_execbuf_process(struct drm_file *file_priv, sw_context); header = NULL; } + mutex_unlock(&dev_priv->binding_mutex); if (ret) - goto out_unlock_binding; + goto out_err; vmw_query_bo_switch_commit(dev_priv, sw_context); ret = vmw_execbuf_fence_commands(file_priv, dev_priv, @@ -2668,8 +3751,8 @@ int vmw_execbuf_process(struct drm_file *file_priv, if (ret != 0) DRM_ERROR("Fence submission error. Syncing.\n"); - vmw_resource_list_unreserve(&sw_context->resource_list, false); - mutex_unlock(&dev_priv->binding_mutex); + vmw_resource_list_unreserve(sw_context, &sw_context->resource_list, + false); ttm_eu_fence_buffer_objects(&ticket, &sw_context->validate_nodes, (void *) fence); @@ -2698,7 +3781,7 @@ int vmw_execbuf_process(struct drm_file *file_priv, * Unreference resources outside of the cmdbuf_mutex to * avoid deadlocks in resource destruction paths. */ - vmw_resource_list_unreference(&resource_list); + vmw_resource_list_unreference(sw_context, &resource_list); return 0; @@ -2707,7 +3790,8 @@ out_unlock_binding: out_err: ttm_eu_backoff_reservation(&ticket, &sw_context->validate_nodes); out_err_nores: - vmw_resource_list_unreserve(&sw_context->resource_list, true); + vmw_resource_list_unreserve(sw_context, &sw_context->resource_list, + true); vmw_resource_relocations_free(&sw_context->res_relocations); vmw_free_relocations(sw_context); vmw_clear_validations(sw_context); @@ -2725,7 +3809,7 @@ out_unlock: * Unreference resources outside of the cmdbuf_mutex to * avoid deadlocks in resource destruction paths. 
*/ - vmw_resource_list_unreference(&resource_list); + vmw_resource_list_unreference(sw_context, &resource_list); if (unlikely(error_resource != NULL)) vmw_resource_unreference(&error_resource); out_free_header: @@ -2877,36 +3961,68 @@ void vmw_execbuf_release_pinned_bo(struct vmw_private *dev_priv) mutex_unlock(&dev_priv->cmdbuf_mutex); } - -int vmw_execbuf_ioctl(struct drm_device *dev, void *data, - struct drm_file *file_priv) +int vmw_execbuf_ioctl(struct drm_device *dev, unsigned long data, + struct drm_file *file_priv, size_t size) { struct vmw_private *dev_priv = vmw_priv(dev); - struct drm_vmw_execbuf_arg *arg = (struct drm_vmw_execbuf_arg *)data; + struct drm_vmw_execbuf_arg arg; int ret; + static const size_t copy_offset[] = { + offsetof(struct drm_vmw_execbuf_arg, context_handle), + sizeof(struct drm_vmw_execbuf_arg)}; + + if (unlikely(size < copy_offset[0])) { + DRM_ERROR("Invalid command size, ioctl %d\n", + DRM_VMW_EXECBUF); + return -EINVAL; + } + + if (copy_from_user(&arg, (void __user *) data, copy_offset[0]) != 0) + return -EFAULT; /* - * This will allow us to extend the ioctl argument while + * Extend the ioctl argument while * maintaining backwards compatibility: * We take different code paths depending on the value of - * arg->version. + * arg.version. */ - if (unlikely(arg->version != DRM_VMW_EXECBUF_VERSION)) { + if (unlikely(arg.version > DRM_VMW_EXECBUF_VERSION || + arg.version == 0)) { DRM_ERROR("Incorrect execbuf version.\n"); - DRM_ERROR("You're running outdated experimental " - "vmwgfx user-space drivers."); return -EINVAL; } + if (arg.version > 1 && + copy_from_user(&arg.context_handle, + (void __user *) (data + copy_offset[0]), + copy_offset[arg.version - 1] - + copy_offset[0]) != 0) + return -EFAULT; + + switch (arg.version) { + case 1: + arg.context_handle = (uint32_t) -1; + break; + case 2: + if (arg.pad64 != 0) { + DRM_ERROR("Unused IOCTL data not set to zero.\n"); + return -EINVAL; + } + break; + default: + break; + } + ret = ttm_read_lock(&dev_priv->reservation_sem, true); if (unlikely(ret != 0)) return ret; ret = vmw_execbuf_process(file_priv, dev_priv, - (void __user *)(unsigned long)arg->commands, - NULL, arg->command_size, arg->throttle_us, - (void __user *)(unsigned long)arg->fence_rep, + (void __user *)(unsigned long)arg.commands, + NULL, arg.command_size, arg.throttle_us, + arg.context_handle, + (void __user *)(unsigned long)arg.fence_rep, NULL); ttm_read_unlock(&dev_priv->reservation_sem); if (unlikely(ret != 0)) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c index cb24936a18c1..3c876d4826c0 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c @@ -29,6 +29,11 @@ #include #include +struct vmw_temp_set_context { + SVGA3dCmdHeader header; + SVGA3dCmdDXTempSetContext body; +}; + bool vmw_fifo_have_3d(struct vmw_private *dev_priv) { u32 __iomem *fifo_mem = dev_priv->mmio_virt; @@ -99,6 +104,7 @@ int vmw_fifo_init(struct vmw_private *dev_priv, struct vmw_fifo_state *fifo) uint32_t max; uint32_t min; + fifo->dx = false; fifo->static_buffer_size = VMWGFX_FIFO_STATIC_SIZE; fifo->static_buffer = vmalloc(fifo->static_buffer_size); if (unlikely(fifo->static_buffer == NULL)) @@ -396,15 +402,20 @@ out_err: return NULL; } -void *vmw_fifo_reserve(struct vmw_private *dev_priv, uint32_t bytes) +void *vmw_fifo_reserve_dx(struct vmw_private *dev_priv, uint32_t bytes, + int ctx_id) { void *ret; if (dev_priv->cman) ret = vmw_cmdbuf_reserve(dev_priv->cman, bytes, - SVGA3D_INVALID_ID, 
false, NULL);
-	else
+					 ctx_id, false, NULL);
+	else if (ctx_id == SVGA3D_INVALID_ID)
 		ret = vmw_local_fifo_reserve(dev_priv, bytes);
+	else {
+		WARN(1, "Command buffer has not been allocated.\n");
+		ret = NULL;
+	}
 	if (IS_ERR_OR_NULL(ret)) {
 		DRM_ERROR("Fifo reserve failure of %u bytes.\n",
 			  (unsigned) bytes);
@@ -466,6 +477,10 @@ static void vmw_local_fifo_commit(struct vmw_private *dev_priv, uint32_t bytes)
 	uint32_t min = ioread32(fifo_mem + SVGA_FIFO_MIN);
 	bool reserveable = fifo_state->capabilities & SVGA_FIFO_CAP_RESERVE;
 
+	if (fifo_state->dx)
+		bytes += sizeof(struct vmw_temp_set_context);
+
+	fifo_state->dx = false;
 	BUG_ON((bytes & 3) != 0);
 	BUG_ON(bytes > fifo_state->reserved_size);
 
@@ -518,7 +533,7 @@ void vmw_fifo_commit(struct vmw_private *dev_priv, uint32_t bytes)
  * @dev_priv: Pointer to device private structure.
  * @bytes: Number of bytes to commit.
  */
-static void vmw_fifo_commit_flush(struct vmw_private *dev_priv, uint32_t bytes)
+void vmw_fifo_commit_flush(struct vmw_private *dev_priv, uint32_t bytes)
 {
 	if (dev_priv->cman)
 		vmw_cmdbuf_commit(dev_priv->cman, bytes, NULL, true);
@@ -706,3 +721,8 @@ int vmw_fifo_emit_dummy_query(struct vmw_private *dev_priv,
 
 	return vmw_fifo_emit_dummy_legacy_query(dev_priv, cid);
 }
+
+void *vmw_fifo_reserve(struct vmw_private *dev_priv, uint32_t bytes)
+{
+	return vmw_fifo_reserve_dx(dev_priv, bytes, SVGA3D_INVALID_ID);
+}
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ioctl.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ioctl.c
index 91efe9cdd822..dca7f7f41aab 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_ioctl.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ioctl.c
@@ -110,6 +110,9 @@ int vmw_getparam_ioctl(struct drm_device *dev, void *data,
 		param->value =
 			(dev_priv->active_display_unit == vmw_du_screen_target);
 		break;
+	case DRM_VMW_PARAM_DX:
+		param->value = dev_priv->has_dx;
+		break;
 	default:
 		DRM_ERROR("Illegal vmwgfx get param request: %d\n",
 			  param->param);
@@ -193,8 +196,8 @@ int vmw_get_cap_3d_ioctl(struct drm_device *dev, void *data,
 		uint32_t *bounce32 = (uint32_t *) bounce;
 
 		num = size / sizeof(uint32_t);
-		if (num > SVGA3D_DEVCAP_MAX)
-			num = SVGA3D_DEVCAP_MAX;
+		if (num > SVGA3D_DEVCAP_DX)
+			num = SVGA3D_DEVCAP_DX;
 
 		spin_lock(&dev_priv->cap_lock);
 		for (i = 0; i < num; ++i) {
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c
index 34d04bf17dfa..f961bb98cdaa 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c
@@ -528,7 +528,11 @@ static int vmw_kms_new_framebuffer_surface(struct vmw_private *dev_priv,
 		return -EINVAL;
 	}
 
-	if (unlikely(format != surface->format)) {
+	/*
+	 * For DX, surface format validation is done when surface->scanout
+	 * is set.
+ */ + if (!dev_priv->has_dx && format != surface->format) { DRM_ERROR("Invalid surface format for requested mode.\n"); return -EINVAL; } @@ -754,6 +758,7 @@ static int vmw_create_dmabuf_proxy(struct drm_device *dev, true, /* can be a scanout buffer */ 1, /* num of mip levels */ 0, + 0, content_base_size, srf_out); if (ret) { @@ -769,7 +774,7 @@ static int vmw_create_dmabuf_proxy(struct drm_device *dev, vmw_dmabuf_unreference(&res->backup); res->backup = vmw_dmabuf_reference(dmabuf_mob); res->backup_offset = 0; - vmw_resource_unreserve(res, NULL, 0); + vmw_resource_unreserve(res, false, NULL, 0); mutex_unlock(&res->dev_priv->cmdbuf_mutex); return 0; @@ -1869,7 +1874,7 @@ void vmw_kms_helper_buffer_finish(struct vmw_private *dev_priv, void vmw_kms_helper_resource_revert(struct vmw_resource *res) { vmw_kms_helper_buffer_revert(res->backup); - vmw_resource_unreserve(res, NULL, 0); + vmw_resource_unreserve(res, false, NULL, 0); mutex_unlock(&res->dev_priv->cmdbuf_mutex); } @@ -1916,7 +1921,7 @@ int vmw_kms_helper_resource_prepare(struct vmw_resource *res, out_revert: vmw_kms_helper_buffer_revert(res->backup); out_unreserve: - vmw_resource_unreserve(res, NULL, 0); + vmw_resource_unreserve(res, false, NULL, 0); out_unlock: mutex_unlock(&res->dev_priv->cmdbuf_mutex); return ret; @@ -1937,7 +1942,7 @@ void vmw_kms_helper_resource_finish(struct vmw_resource *res, vmw_kms_helper_buffer_finish(res->dev_priv, NULL, res->backup, out_fence, NULL); - vmw_resource_unreserve(res, NULL, 0); + vmw_resource_unreserve(res, false, NULL, 0); mutex_unlock(&res->dev_priv->cmdbuf_mutex); } diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_mob.c b/drivers/gpu/drm/vmwgfx/vmwgfx_mob.c index 5b0287eba30d..a8203a9e1050 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_mob.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_mob.c @@ -67,9 +67,23 @@ struct vmw_mob { * @size: Size of the table (page-aligned). * @page_table: Pointer to a struct vmw_mob holding the page table. 
*/ -struct vmw_otable { - unsigned long size; - struct vmw_mob *page_table; +static const struct vmw_otable pre_dx_tables[] = { + {VMWGFX_NUM_MOB * SVGA3D_OTABLE_MOB_ENTRY_SIZE, NULL, true}, + {VMWGFX_NUM_GB_SURFACE * SVGA3D_OTABLE_SURFACE_ENTRY_SIZE, NULL, true}, + {VMWGFX_NUM_GB_CONTEXT * SVGA3D_OTABLE_CONTEXT_ENTRY_SIZE, NULL, true}, + {VMWGFX_NUM_GB_SHADER * SVGA3D_OTABLE_SHADER_ENTRY_SIZE, NULL, true}, + {VMWGFX_NUM_GB_SCREEN_TARGET * SVGA3D_OTABLE_SCREEN_TARGET_ENTRY_SIZE, + NULL, VMWGFX_ENABLE_SCREEN_TARGET_OTABLE} +}; + +static const struct vmw_otable dx_tables[] = { + {VMWGFX_NUM_MOB * SVGA3D_OTABLE_MOB_ENTRY_SIZE, NULL, true}, + {VMWGFX_NUM_GB_SURFACE * SVGA3D_OTABLE_SURFACE_ENTRY_SIZE, NULL, true}, + {VMWGFX_NUM_GB_CONTEXT * SVGA3D_OTABLE_CONTEXT_ENTRY_SIZE, NULL, true}, + {VMWGFX_NUM_GB_SHADER * SVGA3D_OTABLE_SHADER_ENTRY_SIZE, NULL, true}, + {VMWGFX_NUM_GB_SCREEN_TARGET * SVGA3D_OTABLE_SCREEN_TARGET_ENTRY_SIZE, + NULL, VMWGFX_ENABLE_SCREEN_TARGET_OTABLE}, + {VMWGFX_NUM_DXCONTEXT * sizeof(SVGAOTableDXContextEntry), NULL, true}, }; static int vmw_mob_pt_populate(struct vmw_private *dev_priv, @@ -92,6 +106,7 @@ static void vmw_mob_pt_setup(struct vmw_mob *mob, */ static int vmw_setup_otable_base(struct vmw_private *dev_priv, SVGAOTableType type, + struct ttm_buffer_object *otable_bo, unsigned long offset, struct vmw_otable *otable) { @@ -106,7 +121,7 @@ static int vmw_setup_otable_base(struct vmw_private *dev_priv, BUG_ON(otable->page_table != NULL); - vsgt = vmw_bo_sg_table(dev_priv->otable_bo); + vsgt = vmw_bo_sg_table(otable_bo); vmw_piter_start(&iter, vsgt, offset >> PAGE_SHIFT); WARN_ON(!vmw_piter_next(&iter)); @@ -193,7 +208,7 @@ static void vmw_takedown_otable_base(struct vmw_private *dev_priv, "takedown.\n"); return; } - + memset(cmd, 0, sizeof(*cmd)); cmd->header.id = SVGA_3D_CMD_SET_OTABLE_BASE; cmd->header.size = sizeof(cmd->body); @@ -218,47 +233,21 @@ static void vmw_takedown_otable_base(struct vmw_private *dev_priv, otable->page_table = NULL; } -/* - * vmw_otables_setup - Set up guest backed memory object tables - * - * @dev_priv: Pointer to a device private structure - * - * Takes care of the device guest backed surface - * initialization, by setting up the guest backed memory object tables. - * Returns 0 on success and various error codes on failure. A succesful return - * means the object tables can be taken down using the vmw_otables_takedown - * function. 
- */ -int vmw_otables_setup(struct vmw_private *dev_priv) + +static int vmw_otable_batch_setup(struct vmw_private *dev_priv, + struct vmw_otable_batch *batch) { unsigned long offset; unsigned long bo_size; - struct vmw_otable *otables; + struct vmw_otable *otables = batch->otables; SVGAOTableType i; int ret; - otables = kzalloc(SVGA_OTABLE_DX9_MAX * sizeof(*otables), - GFP_KERNEL); - if (unlikely(otables == NULL)) { - DRM_ERROR("Failed to allocate space for otable " - "metadata.\n"); - return -ENOMEM; - } - - otables[SVGA_OTABLE_MOB].size = - VMWGFX_NUM_MOB * SVGA3D_OTABLE_MOB_ENTRY_SIZE; - otables[SVGA_OTABLE_SURFACE].size = - VMWGFX_NUM_GB_SURFACE * SVGA3D_OTABLE_SURFACE_ENTRY_SIZE; - otables[SVGA_OTABLE_CONTEXT].size = - VMWGFX_NUM_GB_CONTEXT * SVGA3D_OTABLE_CONTEXT_ENTRY_SIZE; - otables[SVGA_OTABLE_SHADER].size = - VMWGFX_NUM_GB_SHADER * SVGA3D_OTABLE_SHADER_ENTRY_SIZE; - otables[SVGA_OTABLE_SCREENTARGET].size = - VMWGFX_NUM_GB_SCREEN_TARGET * - SVGA3D_OTABLE_SCREEN_TARGET_ENTRY_SIZE; - bo_size = 0; - for (i = 0; i < SVGA_OTABLE_DX9_MAX; ++i) { + for (i = 0; i < batch->num_otables; ++i) { + if (!otables[i].enabled) + continue; + otables[i].size = (otables[i].size + PAGE_SIZE - 1) & PAGE_MASK; bo_size += otables[i].size; @@ -268,63 +257,105 @@ int vmw_otables_setup(struct vmw_private *dev_priv) ttm_bo_type_device, &vmw_sys_ne_placement, 0, false, NULL, - &dev_priv->otable_bo); + &batch->otable_bo); if (unlikely(ret != 0)) goto out_no_bo; - ret = ttm_bo_reserve(dev_priv->otable_bo, false, true, false, NULL); + ret = ttm_bo_reserve(batch->otable_bo, false, true, false, NULL); BUG_ON(ret != 0); - ret = vmw_bo_driver.ttm_tt_populate(dev_priv->otable_bo->ttm); + ret = vmw_bo_driver.ttm_tt_populate(batch->otable_bo->ttm); if (unlikely(ret != 0)) goto out_unreserve; - ret = vmw_bo_map_dma(dev_priv->otable_bo); + ret = vmw_bo_map_dma(batch->otable_bo); if (unlikely(ret != 0)) goto out_unreserve; - ttm_bo_unreserve(dev_priv->otable_bo); + ttm_bo_unreserve(batch->otable_bo); offset = 0; - for (i = 0; i < SVGA_OTABLE_DX9_MAX - VMW_OTABLE_SETUP_SUB; ++i) { - ret = vmw_setup_otable_base(dev_priv, i, offset, + for (i = 0; i < batch->num_otables; ++i) { + if (!batch->otables[i].enabled) + continue; + + ret = vmw_setup_otable_base(dev_priv, i, batch->otable_bo, + offset, &otables[i]); if (unlikely(ret != 0)) goto out_no_setup; offset += otables[i].size; } - dev_priv->otables = otables; return 0; out_unreserve: - ttm_bo_unreserve(dev_priv->otable_bo); + ttm_bo_unreserve(batch->otable_bo); out_no_setup: - for (i = 0; i < SVGA_OTABLE_DX9_MAX - VMW_OTABLE_SETUP_SUB; ++i) - vmw_takedown_otable_base(dev_priv, i, &otables[i]); + for (i = 0; i < batch->num_otables; ++i) { + if (batch->otables[i].enabled) + vmw_takedown_otable_base(dev_priv, i, + &batch->otables[i]); + } - ttm_bo_unref(&dev_priv->otable_bo); + ttm_bo_unref(&batch->otable_bo); out_no_bo: - kfree(otables); return ret; } - /* - * vmw_otables_takedown - Take down guest backed memory object tables + * vmw_otables_setup - Set up guest backed memory object tables * * @dev_priv: Pointer to a device private structure * - * Take down the Guest Memory Object tables. + * Takes care of the device guest backed surface + * initialization, by setting up the guest backed memory object tables. + * Returns 0 on success and various error codes on failure. A successful return + * means the object tables can be taken down using the vmw_otables_takedown + * function. 
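+ *
+ * Depending on whether the device has DX support, either the dx_tables or
+ * the pre_dx_tables template is copied into the otable batch before setup.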
*/ -void vmw_otables_takedown(struct vmw_private *dev_priv) +int vmw_otables_setup(struct vmw_private *dev_priv) +{ + struct vmw_otable **otables = &dev_priv->otable_batch.otables; + int ret; + + if (dev_priv->has_dx) { + *otables = kmalloc(sizeof(dx_tables), GFP_KERNEL); + if (*otables == NULL) + return -ENOMEM; + + memcpy(*otables, dx_tables, sizeof(dx_tables)); + dev_priv->otable_batch.num_otables = ARRAY_SIZE(dx_tables); + } else { + *otables = kmalloc(sizeof(pre_dx_tables), GFP_KERNEL); + if (*otables == NULL) + return -ENOMEM; + + memcpy(*otables, pre_dx_tables, sizeof(pre_dx_tables)); + dev_priv->otable_batch.num_otables = ARRAY_SIZE(pre_dx_tables); + } + + ret = vmw_otable_batch_setup(dev_priv, &dev_priv->otable_batch); + if (unlikely(ret != 0)) + goto out_setup; + + return 0; + +out_setup: + kfree(*otables); + return ret; +} + +static void vmw_otable_batch_takedown(struct vmw_private *dev_priv, + struct vmw_otable_batch *batch) { SVGAOTableType i; - struct ttm_buffer_object *bo = dev_priv->otable_bo; + struct ttm_buffer_object *bo = batch->otable_bo; int ret; - for (i = 0; i < SVGA_OTABLE_DX9_MAX - VMW_OTABLE_SETUP_SUB; ++i) - vmw_takedown_otable_base(dev_priv, i, - &dev_priv->otables[i]); + for (i = 0; i < batch->num_otables; ++i) + if (batch->otables[i].enabled) + vmw_takedown_otable_base(dev_priv, i, + &batch->otables[i]); ret = ttm_bo_reserve(bo, false, true, false, NULL); BUG_ON(ret != 0); @@ -332,11 +363,21 @@ void vmw_otables_takedown(struct vmw_private *dev_priv) vmw_fence_single_bo(bo, NULL); ttm_bo_unreserve(bo); - ttm_bo_unref(&dev_priv->otable_bo); - kfree(dev_priv->otables); - dev_priv->otables = NULL; + ttm_bo_unref(&batch->otable_bo); } +/* + * vmw_otables_takedown - Take down guest backed memory object tables + * + * @dev_priv: Pointer to a device private structure + * + * Take down the Guest Memory Object tables. + */ +void vmw_otables_takedown(struct vmw_private *dev_priv) +{ + vmw_otable_batch_takedown(dev_priv, &dev_priv->otable_batch); + kfree(dev_priv->otable_batch.otables); +} /* * vmw_mob_calculate_pt_pages - Calculate the number of page table pages @@ -410,7 +451,7 @@ static int vmw_mob_pt_populate(struct vmw_private *dev_priv, goto out_unreserve; ttm_bo_unreserve(mob->pt_bo); - + return 0; out_unreserve: diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c index be2809aaa7cb..6186e859dab0 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c @@ -31,6 +31,7 @@ #include #include #include "vmwgfx_resource_priv.h" +#include "vmwgfx_binding.h" #define VMW_RES_EVICT_ERR_COUNT 10 @@ -144,10 +145,10 @@ static void vmw_resource_release(struct kref *kref) } if (likely(res->hw_destroy != NULL)) { - res->hw_destroy(res); mutex_lock(&dev_priv->binding_mutex); - vmw_context_binding_res_list_kill(&res->binding_head); + vmw_binding_res_list_kill(&res->binding_head); mutex_unlock(&dev_priv->binding_mutex); + res->hw_destroy(res); } id = res->id; @@ -1149,14 +1150,16 @@ out_bind_failed: * command submission. * * @res: Pointer to the struct vmw_resource to unreserve. + * @switch_backup: Backup buffer has been switched. * @new_backup: Pointer to new backup buffer if command submission - * switched. - * @new_backup_offset: New backup offset if @new_backup is !NULL. + * switched. May be NULL. + * @new_backup_offset: New backup offset if @switch_backup is true. 
 *
 * Currently unreserving a resource means putting it back on the device's
 * resource lru list, so that it can be evicted if necessary.
 */
void vmw_resource_unreserve(struct vmw_resource *res,
+ bool switch_backup,
 struct vmw_dma_buffer *new_backup,
 unsigned long new_backup_offset)
 {
@@ -1165,19 +1168,22 @@ void vmw_resource_unreserve(struct vmw_resource *res,
 if (!list_empty(&res->lru_head))
 return;
- if (new_backup && new_backup != res->backup) {
-
+ if (switch_backup && new_backup != res->backup) {
 if (res->backup) {
 lockdep_assert_held(&res->backup->base.resv->lock.base);
 list_del_init(&res->mob_head);
 vmw_dmabuf_unreference(&res->backup);
 }
- res->backup = vmw_dmabuf_reference(new_backup);
- lockdep_assert_held(&new_backup->base.resv->lock.base);
- list_add_tail(&res->mob_head, &new_backup->res_list);
+ if (new_backup) {
+ res->backup = vmw_dmabuf_reference(new_backup);
+ lockdep_assert_held(&new_backup->base.resv->lock.base);
+ list_add_tail(&res->mob_head, &new_backup->res_list);
+ } else {
+ res->backup = NULL;
+ }
 }
- if (new_backup)
+ if (switch_backup)
 res->backup_offset = new_backup_offset;
 if (!res->func->may_evict || res->id == -1 || res->pin_count)
@@ -1269,8 +1275,12 @@ int vmw_resource_reserve(struct vmw_resource *res, bool interruptible,
 if (res->func->needs_backup && res->backup == NULL &&
 !no_backup) {
 ret = vmw_resource_buf_alloc(res, interruptible);
- if (unlikely(ret != 0))
+ if (unlikely(ret != 0)) {
+ DRM_ERROR("Failed to allocate a backup buffer "
+ "of size %lu bytes\n",
+ (unsigned long) res->backup_size);
 return ret;
+ }
 }
 return 0;
@@ -1354,7 +1364,7 @@ int vmw_resource_validate(struct vmw_resource *res)
 struct ttm_validate_buffer val_buf;
 unsigned err_count = 0;
- if (likely(!res->func->may_evict))
+ if (!res->func->create)
 return 0;
 val_buf.bo = NULL;
@@ -1624,7 +1634,7 @@ int vmw_resource_pin(struct vmw_resource *res, bool interruptible)
 res->pin_count++;
out_no_validate:
- vmw_resource_unreserve(res, NULL, 0UL);
+ vmw_resource_unreserve(res, false, NULL, 0UL);
out_no_reserve:
 mutex_unlock(&dev_priv->cmdbuf_mutex);
 ttm_write_unlock(&dev_priv->reservation_sem);
@@ -1660,8 +1670,18 @@ void vmw_resource_unpin(struct vmw_resource *res)
 ttm_bo_unreserve(&vbo->base);
 }
- vmw_resource_unreserve(res, NULL, 0UL);
+ vmw_resource_unreserve(res, false, NULL, 0UL);
 mutex_unlock(&dev_priv->cmdbuf_mutex);
 ttm_read_unlock(&dev_priv->reservation_sem);
 }
+
+/**
+ * vmw_res_type - Return the resource type
+ *
+ * @res: Pointer to the resource
+ */
+enum vmw_res_type vmw_res_type(const struct vmw_resource *res)
+{
+ return res->func->res_type;
+}
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h b/drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h
index f3adeed2854c..743e2adafed2 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h
@@ -30,6 +30,12 @@
 #include "vmwgfx_drv.h"
+enum vmw_cmdbuf_res_state {
+ VMW_CMDBUF_RES_COMMITTED,
+ VMW_CMDBUF_RES_ADD,
+ VMW_CMDBUF_RES_DEL
+};
+
 /**
 * struct vmw_user_resource_conv - Identify a derived user-exported resource
 * type and provide a function to convert its ttm_base_object pointer to
@@ -55,8 +61,10 @@ struct vmw_user_resource_conv {
 * @bind: Bind a hardware resource to persistent buffer storage.
 * @unbind: Unbind a hardware resource from persistent
 * buffer storage.
+ * @commit_notify: If the resource is a command buffer managed resource,
+ * callback to notify that a define or remove command
+ * has been committed to the device.
*/ - struct vmw_res_func { enum vmw_res_type res_type; bool needs_backup; @@ -71,6 +79,8 @@ struct vmw_res_func { int (*unbind) (struct vmw_resource *res, bool readback, struct ttm_validate_buffer *val_buf); + void (*commit_notify)(struct vmw_resource *res, + enum vmw_cmdbuf_res_state state); }; int vmw_resource_alloc_id(struct vmw_resource *res); diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_shader.c b/drivers/gpu/drm/vmwgfx/vmwgfx_shader.c index 11bc60c2771a..61403ebe3a1e 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_shader.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_shader.c @@ -27,12 +27,15 @@ #include "vmwgfx_drv.h" #include "vmwgfx_resource_priv.h" +#include "vmwgfx_binding.h" #include "ttm/ttm_placement.h" struct vmw_shader { struct vmw_resource res; SVGA3dShaderType type; uint32_t size; + uint8_t num_input_sig; + uint8_t num_output_sig; }; struct vmw_user_shader { @@ -40,8 +43,18 @@ struct vmw_user_shader { struct vmw_shader shader; }; +struct vmw_dx_shader { + struct vmw_resource res; + struct vmw_resource *ctx; + struct vmw_resource *cotable; + u32 id; + bool committed; + struct list_head cotable_head; +}; + static uint64_t vmw_user_shader_size; static uint64_t vmw_shader_size; +static size_t vmw_shader_dx_size; static void vmw_user_shader_free(struct vmw_resource *res); static struct vmw_resource * @@ -55,6 +68,18 @@ static int vmw_gb_shader_unbind(struct vmw_resource *res, struct ttm_validate_buffer *val_buf); static int vmw_gb_shader_destroy(struct vmw_resource *res); +static int vmw_dx_shader_create(struct vmw_resource *res); +static int vmw_dx_shader_bind(struct vmw_resource *res, + struct ttm_validate_buffer *val_buf); +static int vmw_dx_shader_unbind(struct vmw_resource *res, + bool readback, + struct ttm_validate_buffer *val_buf); +static void vmw_dx_shader_commit_notify(struct vmw_resource *res, + enum vmw_cmdbuf_res_state state); +static bool vmw_shader_id_ok(u32 user_key, SVGA3dShaderType shader_type); +static u32 vmw_shader_key(u32 user_key, SVGA3dShaderType shader_type); +static uint64_t vmw_user_shader_size; + static const struct vmw_user_resource_conv user_shader_conv = { .object_type = VMW_RES_SHADER, .base_obj_to_res = vmw_user_shader_base_to_res, @@ -77,6 +102,24 @@ static const struct vmw_res_func vmw_gb_shader_func = { .unbind = vmw_gb_shader_unbind }; +static const struct vmw_res_func vmw_dx_shader_func = { + .res_type = vmw_res_shader, + .needs_backup = true, + .may_evict = false, + .type_name = "dx shaders", + .backup_placement = &vmw_mob_placement, + .create = vmw_dx_shader_create, + /* + * The destroy callback is only called with a committed resource on + * context destroy, in which case we destroy the cotable anyway, + * so there's no need to destroy DX shaders separately. + */ + .destroy = NULL, + .bind = vmw_dx_shader_bind, + .unbind = vmw_dx_shader_unbind, + .commit_notify = vmw_dx_shader_commit_notify, +}; + /** * Shader management: */ @@ -87,25 +130,42 @@ vmw_res_to_shader(struct vmw_resource *res) return container_of(res, struct vmw_shader, res); } +/** + * vmw_res_to_dx_shader - typecast a struct vmw_resource to a + * struct vmw_dx_shader + * + * @res: Pointer to the struct vmw_resource. 
+ */ +static inline struct vmw_dx_shader * +vmw_res_to_dx_shader(struct vmw_resource *res) +{ + return container_of(res, struct vmw_dx_shader, res); +} + static void vmw_hw_shader_destroy(struct vmw_resource *res) { - (void) vmw_gb_shader_destroy(res); + if (likely(res->func->destroy)) + (void) res->func->destroy(res); + else + res->id = -1; } + static int vmw_gb_shader_init(struct vmw_private *dev_priv, struct vmw_resource *res, uint32_t size, uint64_t offset, SVGA3dShaderType type, + uint8_t num_input_sig, + uint8_t num_output_sig, struct vmw_dma_buffer *byte_code, void (*res_free) (struct vmw_resource *res)) { struct vmw_shader *shader = vmw_res_to_shader(res); int ret; - ret = vmw_resource_init(dev_priv, res, true, - res_free, &vmw_gb_shader_func); - + ret = vmw_resource_init(dev_priv, res, true, res_free, + &vmw_gb_shader_func); if (unlikely(ret != 0)) { if (res_free) @@ -122,11 +182,17 @@ static int vmw_gb_shader_init(struct vmw_private *dev_priv, } shader->size = size; shader->type = type; + shader->num_input_sig = num_input_sig; + shader->num_output_sig = num_output_sig; vmw_resource_activate(res, vmw_hw_shader_destroy); return 0; } +/* + * GB shader code: + */ + static int vmw_gb_shader_create(struct vmw_resource *res) { struct vmw_private *dev_priv = res->dev_priv; @@ -259,7 +325,7 @@ static int vmw_gb_shader_destroy(struct vmw_resource *res) return 0; mutex_lock(&dev_priv->binding_mutex); - vmw_context_binding_res_list_scrub(&res->binding_head); + vmw_binding_res_list_scrub(&res->binding_head); cmd = vmw_fifo_reserve(dev_priv, sizeof(*cmd)); if (unlikely(cmd == NULL)) { @@ -280,6 +346,321 @@ static int vmw_gb_shader_destroy(struct vmw_resource *res) return 0; } +/* + * DX shader code: + */ + +/** + * vmw_dx_shader_commit_notify - Notify that a shader operation has been + * committed to hardware from a user-supplied command stream. + * + * @res: Pointer to the shader resource. + * @state: Indicating whether a creation or removal has been committed. + * + */ +static void vmw_dx_shader_commit_notify(struct vmw_resource *res, + enum vmw_cmdbuf_res_state state) +{ + struct vmw_dx_shader *shader = vmw_res_to_dx_shader(res); + struct vmw_private *dev_priv = res->dev_priv; + + if (state == VMW_CMDBUF_RES_ADD) { + mutex_lock(&dev_priv->binding_mutex); + vmw_cotable_add_resource(shader->cotable, + &shader->cotable_head); + shader->committed = true; + res->id = shader->id; + mutex_unlock(&dev_priv->binding_mutex); + } else { + mutex_lock(&dev_priv->binding_mutex); + list_del_init(&shader->cotable_head); + shader->committed = false; + res->id = -1; + mutex_unlock(&dev_priv->binding_mutex); + } +} + +/** + * vmw_dx_shader_unscrub - Have the device reattach a MOB to a DX shader. + * + * @res: The shader resource + * + * This function reverts a scrub operation. 
+ */ +static int vmw_dx_shader_unscrub(struct vmw_resource *res) +{ + struct vmw_dx_shader *shader = vmw_res_to_dx_shader(res); + struct vmw_private *dev_priv = res->dev_priv; + struct { + SVGA3dCmdHeader header; + SVGA3dCmdDXBindShader body; + } *cmd; + + if (!list_empty(&shader->cotable_head) || !shader->committed) + return 0; + + cmd = vmw_fifo_reserve_dx(dev_priv, sizeof(*cmd), + shader->ctx->id); + if (unlikely(cmd == NULL)) { + DRM_ERROR("Failed reserving FIFO space for shader " + "scrubbing.\n"); + return -ENOMEM; + } + + cmd->header.id = SVGA_3D_CMD_DX_BIND_SHADER; + cmd->header.size = sizeof(cmd->body); + cmd->body.cid = shader->ctx->id; + cmd->body.shid = shader->id; + cmd->body.mobid = res->backup->base.mem.start; + cmd->body.offsetInBytes = res->backup_offset; + vmw_fifo_commit(dev_priv, sizeof(*cmd)); + + vmw_cotable_add_resource(shader->cotable, &shader->cotable_head); + + return 0; +} + +/** + * vmw_dx_shader_create - The DX shader create callback + * + * @res: The DX shader resource + * + * The create callback is called as part of resource validation and + * makes sure that we unscrub the shader if it's previously been scrubbed. + */ +static int vmw_dx_shader_create(struct vmw_resource *res) +{ + struct vmw_private *dev_priv = res->dev_priv; + struct vmw_dx_shader *shader = vmw_res_to_dx_shader(res); + int ret = 0; + + WARN_ON_ONCE(!shader->committed); + + if (!list_empty(&res->mob_head)) { + mutex_lock(&dev_priv->binding_mutex); + ret = vmw_dx_shader_unscrub(res); + mutex_unlock(&dev_priv->binding_mutex); + } + + res->id = shader->id; + return ret; +} + +/** + * vmw_dx_shader_bind - The DX shader bind callback + * + * @res: The DX shader resource + * @val_buf: Pointer to the validate buffer. + * + */ +static int vmw_dx_shader_bind(struct vmw_resource *res, + struct ttm_validate_buffer *val_buf) +{ + struct vmw_private *dev_priv = res->dev_priv; + struct ttm_buffer_object *bo = val_buf->bo; + + BUG_ON(bo->mem.mem_type != VMW_PL_MOB); + mutex_lock(&dev_priv->binding_mutex); + vmw_dx_shader_unscrub(res); + mutex_unlock(&dev_priv->binding_mutex); + + return 0; +} + +/** + * vmw_dx_shader_scrub - Have the device unbind a MOB from a DX shader. + * + * @res: The shader resource + * + * This function unbinds a MOB from the DX shader without requiring the + * MOB dma_buffer to be reserved. The driver still considers the MOB bound. + * However, once the driver eventually decides to unbind the MOB, it doesn't + * need to access the context. + */ +static int vmw_dx_shader_scrub(struct vmw_resource *res) +{ + struct vmw_dx_shader *shader = vmw_res_to_dx_shader(res); + struct vmw_private *dev_priv = res->dev_priv; + struct { + SVGA3dCmdHeader header; + SVGA3dCmdDXBindShader body; + } *cmd; + + if (list_empty(&shader->cotable_head)) + return 0; + + WARN_ON_ONCE(!shader->committed); + cmd = vmw_fifo_reserve(dev_priv, sizeof(*cmd)); + if (unlikely(cmd == NULL)) { + DRM_ERROR("Failed reserving FIFO space for shader " + "scrubbing.\n"); + return -ENOMEM; + } + + cmd->header.id = SVGA_3D_CMD_DX_BIND_SHADER; + cmd->header.size = sizeof(cmd->body); + cmd->body.cid = shader->ctx->id; + cmd->body.shid = res->id; + cmd->body.mobid = SVGA3D_INVALID_ID; + cmd->body.offsetInBytes = 0; + vmw_fifo_commit(dev_priv, sizeof(*cmd)); + res->id = -1; + list_del_init(&shader->cotable_head); + + return 0; +} + +/** + * vmw_dx_shader_unbind - The dx shader unbind callback. + * + * @res: The shader resource + * @readback: Whether this is a readback unbind. Currently unused. + * @val_buf: MOB buffer information. 
+ */ +static int vmw_dx_shader_unbind(struct vmw_resource *res, + bool readback, + struct ttm_validate_buffer *val_buf) +{ + struct vmw_private *dev_priv = res->dev_priv; + struct vmw_fence_obj *fence; + int ret; + + BUG_ON(res->backup->base.mem.mem_type != VMW_PL_MOB); + + mutex_lock(&dev_priv->binding_mutex); + ret = vmw_dx_shader_scrub(res); + mutex_unlock(&dev_priv->binding_mutex); + + if (ret) + return ret; + + (void) vmw_execbuf_fence_commands(NULL, dev_priv, + &fence, NULL); + vmw_fence_single_bo(val_buf->bo, fence); + + if (likely(fence != NULL)) + vmw_fence_obj_unreference(&fence); + + return 0; +} + +/** + * vmw_dx_shader_cotable_list_scrub - The cotable unbind_func callback for + * DX shaders. + * + * @dev_priv: Pointer to device private structure. + * @list: The list of cotable resources. + * @readback: Whether the call was part of a readback unbind. + * + * Scrubs all shader MOBs so that any subsequent shader unbind or shader + * destroy operation won't need to swap in the context. + */ +void vmw_dx_shader_cotable_list_scrub(struct vmw_private *dev_priv, + struct list_head *list, + bool readback) +{ + struct vmw_dx_shader *entry, *next; + + WARN_ON_ONCE(!mutex_is_locked(&dev_priv->binding_mutex)); + + list_for_each_entry_safe(entry, next, list, cotable_head) { + WARN_ON(vmw_dx_shader_scrub(&entry->res)); + if (!readback) + entry->committed = false; + } +} + +/** + * vmw_dx_shader_res_free - The DX shader free callback + * + * @res: The shader resource + * + * Frees the DX shader resource and updates memory accounting. + */ +static void vmw_dx_shader_res_free(struct vmw_resource *res) +{ + struct vmw_private *dev_priv = res->dev_priv; + struct vmw_dx_shader *shader = vmw_res_to_dx_shader(res); + + vmw_resource_unreference(&shader->cotable); + kfree(shader); + ttm_mem_global_free(vmw_mem_glob(dev_priv), vmw_shader_dx_size); +} + +/** + * vmw_dx_shader_add - Add a shader resource as a command buffer managed + * resource. + * + * @man: The command buffer resource manager. + * @ctx: Pointer to the context resource. + * @user_key: The id used for this shader. + * @shader_type: The shader type. + * @list: The list of staged command buffer managed resources. + */ +int vmw_dx_shader_add(struct vmw_cmdbuf_res_manager *man, + struct vmw_resource *ctx, + u32 user_key, + SVGA3dShaderType shader_type, + struct list_head *list) +{ + struct vmw_dx_shader *shader; + struct vmw_resource *res; + struct vmw_private *dev_priv = ctx->dev_priv; + int ret; + + if (!vmw_shader_dx_size) + vmw_shader_dx_size = ttm_round_pot(sizeof(*shader)); + + if (!vmw_shader_id_ok(user_key, shader_type)) + return -EINVAL; + + ret = ttm_mem_global_alloc(vmw_mem_glob(dev_priv), vmw_shader_dx_size, + false, true); + if (ret) { + if (ret != -ERESTARTSYS) + DRM_ERROR("Out of graphics memory for shader " + "creation.\n"); + return ret; + } + + shader = kmalloc(sizeof(*shader), GFP_KERNEL); + if (!shader) { + ttm_mem_global_free(vmw_mem_glob(dev_priv), vmw_shader_dx_size); + return -ENOMEM; + } + + res = &shader->res; + shader->ctx = ctx; + shader->cotable = vmw_context_cotable(ctx, SVGA_COTABLE_DXSHADER); + shader->id = user_key; + shader->committed = false; + INIT_LIST_HEAD(&shader->cotable_head); + ret = vmw_resource_init(dev_priv, res, true, + vmw_dx_shader_res_free, &vmw_dx_shader_func); + if (ret) + goto out_resource_init; + + /* + * The user_key name-space is not per shader type for DX shaders, + * so when hashing, use a single zero shader type. 
+ */ + ret = vmw_cmdbuf_res_add(man, vmw_cmdbuf_res_shader, + vmw_shader_key(user_key, 0), + res, list); + if (ret) + goto out_resource_init; + + res->id = shader->id; + vmw_resource_activate(res, vmw_hw_shader_destroy); + +out_resource_init: + vmw_resource_unreference(&res); + + return ret; +} + + + /** * User-space shader management: */ @@ -341,6 +722,8 @@ static int vmw_user_shader_alloc(struct vmw_private *dev_priv, size_t shader_size, size_t offset, SVGA3dShaderType shader_type, + uint8_t num_input_sig, + uint8_t num_output_sig, struct ttm_object_file *tfile, u32 *handle) { @@ -383,7 +766,8 @@ static int vmw_user_shader_alloc(struct vmw_private *dev_priv, */ ret = vmw_gb_shader_init(dev_priv, res, shader_size, - offset, shader_type, buffer, + offset, shader_type, num_input_sig, + num_output_sig, buffer, vmw_user_shader_free); if (unlikely(ret != 0)) goto out; @@ -449,7 +833,7 @@ static struct vmw_resource *vmw_shader_alloc(struct vmw_private *dev_priv, * From here on, the destructor takes over resource freeing. */ ret = vmw_gb_shader_init(dev_priv, res, shader_size, - offset, shader_type, buffer, + offset, shader_type, 0, 0, buffer, vmw_shader_free); out_err: @@ -457,19 +841,20 @@ out_err: } -int vmw_shader_define_ioctl(struct drm_device *dev, void *data, - struct drm_file *file_priv) +static int vmw_shader_define(struct drm_device *dev, struct drm_file *file_priv, + enum drm_vmw_shader_type shader_type_drm, + u32 buffer_handle, size_t size, size_t offset, + uint8_t num_input_sig, uint8_t num_output_sig, + uint32_t *shader_handle) { struct vmw_private *dev_priv = vmw_priv(dev); - struct drm_vmw_shader_create_arg *arg = - (struct drm_vmw_shader_create_arg *)data; struct ttm_object_file *tfile = vmw_fpriv(file_priv)->tfile; struct vmw_dma_buffer *buffer = NULL; SVGA3dShaderType shader_type; int ret; - if (arg->buffer_handle != SVGA3D_INVALID_ID) { - ret = vmw_user_dmabuf_lookup(tfile, arg->buffer_handle, + if (buffer_handle != SVGA3D_INVALID_ID) { + ret = vmw_user_dmabuf_lookup(tfile, buffer_handle, &buffer); if (unlikely(ret != 0)) { DRM_ERROR("Could not find buffer for shader " @@ -478,23 +863,20 @@ int vmw_shader_define_ioctl(struct drm_device *dev, void *data, } if ((u64)buffer->base.num_pages * PAGE_SIZE < - (u64)arg->size + (u64)arg->offset) { + (u64)size + (u64)offset) { DRM_ERROR("Illegal buffer- or shader size.\n"); ret = -EINVAL; goto out_bad_arg; } } - switch (arg->shader_type) { + switch (shader_type_drm) { case drm_vmw_shader_type_vs: shader_type = SVGA3D_SHADERTYPE_VS; break; case drm_vmw_shader_type_ps: shader_type = SVGA3D_SHADERTYPE_PS; break; - case drm_vmw_shader_type_gs: - shader_type = SVGA3D_SHADERTYPE_GS; - break; default: DRM_ERROR("Illegal shader type.\n"); ret = -EINVAL; @@ -505,8 +887,9 @@ int vmw_shader_define_ioctl(struct drm_device *dev, void *data, if (unlikely(ret != 0)) goto out_bad_arg; - ret = vmw_user_shader_alloc(dev_priv, buffer, arg->size, arg->offset, - shader_type, tfile, &arg->shader_handle); + ret = vmw_user_shader_alloc(dev_priv, buffer, size, offset, + shader_type, num_input_sig, + num_output_sig, tfile, shader_handle); ttm_read_unlock(&dev_priv->reservation_sem); out_bad_arg: @@ -515,7 +898,7 @@ out_bad_arg: } /** - * vmw_compat_shader_id_ok - Check whether a compat shader user key and + * vmw_shader_id_ok - Check whether a compat shader user key and * shader type are within valid bounds. * * @user_key: User space id of the shader. @@ -523,13 +906,13 @@ out_bad_arg: * * Returns true if valid false if not. 
*/ -static bool vmw_compat_shader_id_ok(u32 user_key, SVGA3dShaderType shader_type) +static bool vmw_shader_id_ok(u32 user_key, SVGA3dShaderType shader_type) { return user_key <= ((1 << 20) - 1) && (unsigned) shader_type < 16; } /** - * vmw_compat_shader_key - Compute a hash key suitable for a compat shader. + * vmw_shader_key - Compute a hash key suitable for a compat shader. * * @user_key: User space id of the shader. * @shader_type: Shader type. @@ -537,13 +920,13 @@ static bool vmw_compat_shader_id_ok(u32 user_key, SVGA3dShaderType shader_type) * Returns a hash key suitable for a command buffer managed resource * manager hash table. */ -static u32 vmw_compat_shader_key(u32 user_key, SVGA3dShaderType shader_type) +static u32 vmw_shader_key(u32 user_key, SVGA3dShaderType shader_type) { return user_key | (shader_type << 20); } /** - * vmw_compat_shader_remove - Stage a compat shader for removal. + * vmw_shader_remove - Stage a compat shader for removal. * * @man: Pointer to the compat shader manager identifying the shader namespace. * @user_key: The key that is used to identify the shader. The key is @@ -551,17 +934,18 @@ static u32 vmw_compat_shader_key(u32 user_key, SVGA3dShaderType shader_type) * @shader_type: Shader type. * @list: Caller's list of staged command buffer resource actions. */ -int vmw_compat_shader_remove(struct vmw_cmdbuf_res_manager *man, - u32 user_key, SVGA3dShaderType shader_type, - struct list_head *list) +int vmw_shader_remove(struct vmw_cmdbuf_res_manager *man, + u32 user_key, SVGA3dShaderType shader_type, + struct list_head *list) { - if (!vmw_compat_shader_id_ok(user_key, shader_type)) + struct vmw_resource *dummy; + + if (!vmw_shader_id_ok(user_key, shader_type)) return -EINVAL; - return vmw_cmdbuf_res_remove(man, vmw_cmdbuf_res_compat_shader, - vmw_compat_shader_key(user_key, - shader_type), - list); + return vmw_cmdbuf_res_remove(man, vmw_cmdbuf_res_shader, + vmw_shader_key(user_key, shader_type), + list, &dummy); } /** @@ -591,7 +975,7 @@ int vmw_compat_shader_add(struct vmw_private *dev_priv, int ret; struct vmw_resource *res; - if (!vmw_compat_shader_id_ok(user_key, shader_type)) + if (!vmw_shader_id_ok(user_key, shader_type)) return -EINVAL; /* Allocate and pin a DMA buffer */ @@ -628,8 +1012,8 @@ int vmw_compat_shader_add(struct vmw_private *dev_priv, if (unlikely(ret != 0)) goto no_reserve; - ret = vmw_cmdbuf_res_add(man, vmw_cmdbuf_res_compat_shader, - vmw_compat_shader_key(user_key, shader_type), + ret = vmw_cmdbuf_res_add(man, vmw_cmdbuf_res_shader, + vmw_shader_key(user_key, shader_type), res, list); vmw_resource_unreference(&res); no_reserve: @@ -639,7 +1023,7 @@ out: } /** - * vmw_compat_shader_lookup - Look up a compat shader + * vmw_shader_lookup - Look up a compat shader * * @man: Pointer to the command buffer managed resource manager identifying * the shader namespace. @@ -650,14 +1034,26 @@ out: * found. An error pointer otherwise. 
 */
struct vmw_resource *
-vmw_compat_shader_lookup(struct vmw_cmdbuf_res_manager *man,
- u32 user_key,
- SVGA3dShaderType shader_type)
+vmw_shader_lookup(struct vmw_cmdbuf_res_manager *man,
+ u32 user_key,
+ SVGA3dShaderType shader_type)
 {
- if (!vmw_compat_shader_id_ok(user_key, shader_type))
+ if (!vmw_shader_id_ok(user_key, shader_type))
 return ERR_PTR(-EINVAL);
- return vmw_cmdbuf_res_lookup(man, vmw_cmdbuf_res_compat_shader,
- vmw_compat_shader_key(user_key,
- shader_type));
+ return vmw_cmdbuf_res_lookup(man, vmw_cmdbuf_res_shader,
+ vmw_shader_key(user_key, shader_type));
+}
+
+int vmw_shader_define_ioctl(struct drm_device *dev, void *data,
+ struct drm_file *file_priv)
+{
+ struct drm_vmw_shader_create_arg *arg =
+ (struct drm_vmw_shader_create_arg *)data;
+
+ return vmw_shader_define(dev, file_priv, arg->shader_type,
+ arg->buffer_handle,
+ arg->size, arg->offset,
+ 0, 0,
+ &arg->shader_handle);
 }
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_so.c b/drivers/gpu/drm/vmwgfx/vmwgfx_so.c
new file mode 100644
index 000000000000..4dfdc95b2cfe
--- /dev/null
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_so.c
@@ -0,0 +1,555 @@
+/**************************************************************************
+ * Copyright © 2014 VMware, Inc., Palo Alto, CA., USA
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "vmwgfx_drv.h"
+#include "vmwgfx_resource_priv.h"
+#include "vmwgfx_so.h"
+#include "vmwgfx_binding.h"
+
+/*
+ * Currently, the only reason we need to keep track of views is that if we
+ * destroy a hardware surface, all views pointing to it must also be destroyed,
+ * otherwise the device will error.
+ * So in particular if a surface is evicted, we must destroy all views pointing
+ * to it, and all context bindings of that view. Similarly we must restore
+ * the view bindings, views and surfaces pointed to by the views when a
+ * context is referenced in the command stream.
+ */
+
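The tracking rule stated in the comment above can be illustrated outside the driver. The sketch below is an editorial illustration, not driver code: it reduces the surface-to-view relationship to a plain singly-linked list, where the real code uses struct list_head and list_for_each_entry_safe(), but the invariant is the same: destroying a surface must first destroy every view that points at it.

/*
 * Editorial sketch (not driver code): a "surface" keeps a list of the
 * "views" that reference it; destroying the surface first walks that
 * list and destroys every view, mirroring what
 * vmw_view_surface_list_destroy() does with list_for_each_entry_safe().
 */
#include <stdio.h>
#include <stdlib.h>

struct view {
	int id;
	struct view *next;	/* simplified stand-in for srf_head */
};

struct surface {
	struct view *views;
};

static void view_destroy(struct view *v)
{
	printf("destroying view %d\n", v->id);
	free(v);
}

static void surface_destroy(struct surface *s)
{
	/* Safe iteration: grab ->next before freeing the entry. */
	struct view *v = s->views, *next;

	while (v) {
		next = v->next;
		view_destroy(v);
		v = next;
	}
	s->views = NULL;
	printf("destroying surface\n");
}

int main(void)
{
	struct surface s = { NULL };

	for (int i = 0; i < 3; i++) {
		struct view *v = malloc(sizeof(*v));

		v->id = i;
		v->next = s.views;
		s.views = v;
	}
	surface_destroy(&s);
	return 0;
}

The safe-iteration pattern matters because each view is freed while the list is still being walked; the same reasoning applies to the kernel's *_safe list iterators used throughout this file.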
+/**
+ * struct vmw_view - view metadata
+ *
+ * @res: The struct vmw_resource we derive from
+ * @ctx: Non-refcounted pointer to the context this view belongs to.
+ * @srf: Refcounted pointer to the surface pointed to by this view.
+ * @cotable: Refcounted pointer to the cotable holding this view.
+ * @srf_head: List head for the surface-to-view list.
+ * @cotable_head: List head for the cotable-to-view list.
+ * @view_type: View type.
+ * @view_id: User-space per context view id. Currently used also as per
+ * context device view id.
+ * @cmd_size: Size of the SVGA3D define view command that we've copied from the
+ * command stream.
+ * @committed: Whether the view is actually created or pending creation at the
+ * device level.
+ * @cmd: The SVGA3D define view command copied from the command stream.
+ */
+struct vmw_view {
+ struct rcu_head rcu;
+ struct vmw_resource res;
+ struct vmw_resource *ctx; /* Immutable */
+ struct vmw_resource *srf; /* Immutable */
+ struct vmw_resource *cotable; /* Immutable */
+ struct list_head srf_head; /* Protected by binding_mutex */
+ struct list_head cotable_head; /* Protected by binding_mutex */
+ unsigned view_type; /* Immutable */
+ unsigned view_id; /* Immutable */
+ u32 cmd_size; /* Immutable */
+ bool committed; /* Protected by binding_mutex */
+ u32 cmd[1]; /* Immutable */
+};
+
+static int vmw_view_create(struct vmw_resource *res);
+static int vmw_view_destroy(struct vmw_resource *res);
+static void vmw_hw_view_destroy(struct vmw_resource *res);
+static void vmw_view_commit_notify(struct vmw_resource *res,
+ enum vmw_cmdbuf_res_state state);
+
+static const struct vmw_res_func vmw_view_func = {
+ .res_type = vmw_res_view,
+ .needs_backup = false,
+ .may_evict = false,
+ .type_name = "DX view",
+ .backup_placement = NULL,
+ .create = vmw_view_create,
+ .commit_notify = vmw_view_commit_notify,
+};
+
+/**
+ * struct vmw_view_define - view define command body stub
+ *
+ * @view_id: The device id of the view being defined
+ * @sid: The surface id of the view being defined
+ *
+ * This generic struct is used by the code to change @view_id and @sid of a
+ * saved view define command.
+ */
+struct vmw_view_define {
+ uint32 view_id;
+ uint32 sid;
+};
+
+/**
+ * vmw_view - Convert a struct vmw_resource to a struct vmw_view
+ *
+ * @res: Pointer to the resource to convert.
+ *
+ * Returns a pointer to a struct vmw_view.
+ */
+static struct vmw_view *vmw_view(struct vmw_resource *res)
+{
+ return container_of(res, struct vmw_view, res);
+}
+
+/**
+ * vmw_view_commit_notify - Notify that a view operation has been committed to
+ * hardware from a user-supplied command stream.
+ *
+ * @res: Pointer to the view resource.
+ * @state: Indicating whether a creation or removal has been committed.
+ *
+ */
+static void vmw_view_commit_notify(struct vmw_resource *res,
+ enum vmw_cmdbuf_res_state state)
+{
+ struct vmw_view *view = vmw_view(res);
+ struct vmw_private *dev_priv = res->dev_priv;
+
+ mutex_lock(&dev_priv->binding_mutex);
+ if (state == VMW_CMDBUF_RES_ADD) {
+ struct vmw_surface *srf = vmw_res_to_srf(view->srf);
+
+ list_add_tail(&view->srf_head, &srf->view_list);
+ vmw_cotable_add_resource(view->cotable, &view->cotable_head);
+ view->committed = true;
+ res->id = view->view_id;
+
+ } else {
+ list_del_init(&view->cotable_head);
+ list_del_init(&view->srf_head);
+ view->committed = false;
+ res->id = -1;
+ }
+ mutex_unlock(&dev_priv->binding_mutex);
+}
+
+/**
+ * vmw_view_create - Create a hardware view.
+ *
+ * @res: Pointer to the view resource.
+ *
+ * Create a hardware view. Typically used if that view has previously been
+ * destroyed by an eviction operation.
+ */
+static int vmw_view_create(struct vmw_resource *res)
+{
+ struct vmw_view *view = vmw_view(res);
+ struct vmw_surface *srf = vmw_res_to_srf(view->srf);
+ struct vmw_private *dev_priv = res->dev_priv;
+ struct {
+ SVGA3dCmdHeader header;
+ struct vmw_view_define body;
+ } *cmd;
+
+ mutex_lock(&dev_priv->binding_mutex);
+ if (!view->committed) {
+ mutex_unlock(&dev_priv->binding_mutex);
+ return 0;
+ }
+
+ cmd = vmw_fifo_reserve_dx(res->dev_priv, view->cmd_size,
+ view->ctx->id);
+ if (!cmd) {
+ DRM_ERROR("Failed reserving FIFO space for view creation.\n");
+ mutex_unlock(&dev_priv->binding_mutex);
+ return -ENOMEM;
+ }
+ memcpy(cmd, &view->cmd, view->cmd_size);
+ WARN_ON(cmd->body.view_id != view->view_id);
+ /* Sid may have changed due to surface eviction. */
+ WARN_ON(view->srf->id == SVGA3D_INVALID_ID);
+ cmd->body.sid = view->srf->id;
+ vmw_fifo_commit(res->dev_priv, view->cmd_size);
+ res->id = view->view_id;
+ list_add_tail(&view->srf_head, &srf->view_list);
+ vmw_cotable_add_resource(view->cotable, &view->cotable_head);
+ mutex_unlock(&dev_priv->binding_mutex);
+
+ return 0;
+}
+
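vmw_view_create() above replays the saved define command and only patches body.sid, relying on every SVGA3D view define body starting with the (view_id, sid) pair. The following standalone sketch is an editorial illustration of that overlay trick under the same layout assumption; the command structs here are hypothetical stand-ins, not the real SVGA3D definitions, and the _Static_assert mirrors the checks done in vmw_so_build_asserts() further down.

/*
 * Editorial sketch (not driver code): a generic two-field overlay,
 * cf. struct vmw_view_define, rewrites the surface id stored in a
 * saved copy of any define command whose body starts with
 * (view_id, sid).  The layout assumption is checked at compile time.
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct define_view_generic {	/* overlay used for patching */
	uint32_t view_id;
	uint32_t sid;
};

struct define_rt_view {		/* hypothetical define command body */
	uint32_t view_id;
	uint32_t sid;
	uint32_t format;
	uint32_t dim;
};

static void patch_sid(void *saved_cmd, uint32_t new_sid)
{
	struct define_view_generic *g = saved_cmd;

	g->sid = new_sid;	/* the surface id may change across eviction */
}

int main(void)
{
	struct define_rt_view cmd = {
		.view_id = 7, .sid = 100, .format = 2, .dim = 1
	};

	/* Same compile-time layout guarantee the driver asserts. */
	_Static_assert(offsetof(struct define_rt_view, sid) ==
		       offsetof(struct define_view_generic, sid),
		       "sid offset mismatch");

	patch_sid(&cmd, 200);
	assert(cmd.sid == 200 && cmd.view_id == 7);
	return 0;
}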
+/**
+ * vmw_view_destroy - Destroy a hardware view.
+ *
+ * @res: Pointer to the view resource.
+ *
+ * Destroy a hardware view. Typically used on unexpected termination of the
+ * owning process or if the surface the view is pointing to is destroyed.
+ */
+static int vmw_view_destroy(struct vmw_resource *res)
+{
+ struct vmw_private *dev_priv = res->dev_priv;
+ struct vmw_view *view = vmw_view(res);
+ struct {
+ SVGA3dCmdHeader header;
+ union vmw_view_destroy body;
+ } *cmd;
+
+ WARN_ON_ONCE(!mutex_is_locked(&dev_priv->binding_mutex));
+ vmw_binding_res_list_scrub(&res->binding_head);
+
+ if (!view->committed || res->id == -1)
+ return 0;
+
+ cmd = vmw_fifo_reserve_dx(dev_priv, sizeof(*cmd), view->ctx->id);
+ if (!cmd) {
+ DRM_ERROR("Failed reserving FIFO space for view "
+ "destruction.\n");
+ return -ENOMEM;
+ }
+
+ cmd->header.id = vmw_view_destroy_cmds[view->view_type];
+ cmd->header.size = sizeof(cmd->body);
+ cmd->body.view_id = view->view_id;
+ vmw_fifo_commit(dev_priv, sizeof(*cmd));
+ res->id = -1;
+ list_del_init(&view->cotable_head);
+ list_del_init(&view->srf_head);
+
+ return 0;
+}
+
+/**
+ * vmw_hw_view_destroy - Destroy a hardware view as part of resource cleanup.
+ *
+ * @res: Pointer to the view resource.
+ *
+ * Destroy a hardware view if it's still present.
+ */
+static void vmw_hw_view_destroy(struct vmw_resource *res)
+{
+ struct vmw_private *dev_priv = res->dev_priv;
+
+ mutex_lock(&dev_priv->binding_mutex);
+ WARN_ON(vmw_view_destroy(res));
+ res->id = -1;
+ mutex_unlock(&dev_priv->binding_mutex);
+}
+
+/**
+ * vmw_view_key - Compute a view key suitable for the cmdbuf resource manager
+ *
+ * @user_key: The user-space id used for the view.
+ * @view_type: The view type.
+ *
+ * Returns a hash key suitable for a command buffer managed resource
+ * manager hash table.
+ */
+static u32 vmw_view_key(u32 user_key, enum vmw_view_type view_type)
+{
+ return user_key | (view_type << 20);
+}
+
+/**
+ * vmw_view_id_ok - Basic view id and type range checks.
+ *
+ * @user_key: The user-space id used for the view.
+ * @view_type: The view type.
+ *
+ * Checks that the view id and type (typically provided by user-space) are
+ * valid.
+ */ +static bool vmw_view_id_ok(u32 user_key, enum vmw_view_type view_type) +{ + return (user_key < SVGA_COTABLE_MAX_IDS && + view_type < vmw_view_max); +} + +/** + * vmw_view_res_free - resource res_free callback for view resources + * + * @res: Pointer to a struct vmw_resource + * + * Frees memory and memory accounting held by a struct vmw_view. + */ +static void vmw_view_res_free(struct vmw_resource *res) +{ + struct vmw_view *view = vmw_view(res); + size_t size = offsetof(struct vmw_view, cmd) + view->cmd_size; + struct vmw_private *dev_priv = res->dev_priv; + + vmw_resource_unreference(&view->cotable); + vmw_resource_unreference(&view->srf); + kfree_rcu(view, rcu); + ttm_mem_global_free(vmw_mem_glob(dev_priv), size); +} + +/** + * vmw_view_add - Create a view resource and stage it for addition + * as a command buffer managed resource. + * + * @man: Pointer to the compat shader manager identifying the shader namespace. + * @ctx: Pointer to a struct vmw_resource identifying the active context. + * @srf: Pointer to a struct vmw_resource identifying the surface the view + * points to. + * @view_type: The view type deduced from the view create command. + * @user_key: The key that is used to identify the shader. The key is + * unique to the view type and to the context. + * @cmd: Pointer to the view create command in the command stream. + * @cmd_size: Size of the view create command in the command stream. + * @list: Caller's list of staged command buffer resource actions. + */ +int vmw_view_add(struct vmw_cmdbuf_res_manager *man, + struct vmw_resource *ctx, + struct vmw_resource *srf, + enum vmw_view_type view_type, + u32 user_key, + const void *cmd, + size_t cmd_size, + struct list_head *list) +{ + static const size_t vmw_view_define_sizes[] = { + [vmw_view_sr] = sizeof(SVGA3dCmdDXDefineShaderResourceView), + [vmw_view_rt] = sizeof(SVGA3dCmdDXDefineRenderTargetView), + [vmw_view_ds] = sizeof(SVGA3dCmdDXDefineDepthStencilView) + }; + + struct vmw_private *dev_priv = ctx->dev_priv; + struct vmw_resource *res; + struct vmw_view *view; + size_t size; + int ret; + + if (cmd_size != vmw_view_define_sizes[view_type] + + sizeof(SVGA3dCmdHeader)) { + DRM_ERROR("Illegal view create command size.\n"); + return -EINVAL; + } + + if (!vmw_view_id_ok(user_key, view_type)) { + DRM_ERROR("Illegal view add view id.\n"); + return -EINVAL; + } + + size = offsetof(struct vmw_view, cmd) + cmd_size; + + ret = ttm_mem_global_alloc(vmw_mem_glob(dev_priv), size, false, true); + if (ret) { + if (ret != -ERESTARTSYS) + DRM_ERROR("Out of graphics memory for view" + " creation.\n"); + return ret; + } + + view = kmalloc(size, GFP_KERNEL); + if (!view) { + ttm_mem_global_free(vmw_mem_glob(dev_priv), size); + return -ENOMEM; + } + + res = &view->res; + view->ctx = ctx; + view->srf = vmw_resource_reference(srf); + view->cotable = vmw_context_cotable(ctx, vmw_view_cotables[view_type]); + view->view_type = view_type; + view->view_id = user_key; + view->cmd_size = cmd_size; + view->committed = false; + INIT_LIST_HEAD(&view->srf_head); + INIT_LIST_HEAD(&view->cotable_head); + memcpy(&view->cmd, cmd, cmd_size); + ret = vmw_resource_init(dev_priv, res, true, + vmw_view_res_free, &vmw_view_func); + if (ret) + goto out_resource_init; + + ret = vmw_cmdbuf_res_add(man, vmw_cmdbuf_res_view, + vmw_view_key(user_key, view_type), + res, list); + if (ret) + goto out_resource_init; + + res->id = view->view_id; + vmw_resource_activate(res, vmw_hw_view_destroy); + +out_resource_init: + vmw_resource_unreference(&res); + + return ret; +} + 
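vmw_view_key() and vmw_view_id_ok() above, like their shader counterparts vmw_shader_key() and vmw_shader_id_ok(), multiplex a user-space id and a type enum into one 32-bit key for the per-context command buffer resource manager: the id occupies the low 20 bits and the type the bits above them. Below is a minimal standalone sketch of that packing; the constants are illustrative assumptions for the example, not the driver's definitions.

/*
 * Editorial sketch (not driver code) of the cmdbuf-resource key
 * packing used above: low 20 bits carry the user-space id, the bits
 * above carry the resource (view/shader) type.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define KEY_ID_BITS 20
#define KEY_ID_MASK ((1u << KEY_ID_BITS) - 1)

static uint32_t pack_key(uint32_t user_key, uint32_t type)
{
	return user_key | (type << KEY_ID_BITS);
}

static uint32_t key_to_id(uint32_t key)
{
	return key & KEY_ID_MASK;
}

static uint32_t key_to_type(uint32_t key)
{
	return key >> KEY_ID_BITS;
}

int main(void)
{
	uint32_t key = pack_key(0x1234, 2);	/* e.g. a depth-stencil view */

	assert(key_to_id(key) == 0x1234);
	assert(key_to_type(key) == 2);
	printf("key=0x%08x id=0x%x type=%u\n",
	       key, key_to_id(key), key_to_type(key));
	return 0;
}

Because both namespaces reserve only 20 bits for the id, any user key must stay below 1 << 20, which is exactly what the *_id_ok() helpers and the BUILD_BUG_ON() on SVGA_COTABLE_MAX_IDS in vmw_so_build_asserts() enforce.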
+/**
+ * vmw_view_remove - Stage a view for removal.
+ *
+ * @man: Pointer to the view manager identifying the view namespace.
+ * @user_key: The key that is used to identify the view. The key is
+ * unique to the view type.
+ * @view_type: View type
+ * @list: Caller's list of staged command buffer resource actions.
+ * @res_p: If the resource is in an already committed state, points to the
+ * struct vmw_resource on successful return. The pointer will be
+ * non-refcounted.
+ */
+int vmw_view_remove(struct vmw_cmdbuf_res_manager *man,
+ u32 user_key, enum vmw_view_type view_type,
+ struct list_head *list,
+ struct vmw_resource **res_p)
+{
+ if (!vmw_view_id_ok(user_key, view_type)) {
+ DRM_ERROR("Illegal view remove view id.\n");
+ return -EINVAL;
+ }
+
+ return vmw_cmdbuf_res_remove(man, vmw_cmdbuf_res_view,
+ vmw_view_key(user_key, view_type),
+ list, res_p);
+}
+
+/**
+ * vmw_view_cotable_list_destroy - Evict all views belonging to a cotable.
+ *
+ * @dev_priv: Pointer to a device private struct.
+ * @list: List of views belonging to a cotable.
+ * @readback: Unused. Needed for function interface only.
+ *
+ * This function evicts all views belonging to a cotable.
+ * It must be called with the binding_mutex held, and the caller must hold
+ * a reference to the view resource. This is typically called before the
+ * cotable is paged out.
+ */
+void vmw_view_cotable_list_destroy(struct vmw_private *dev_priv,
+ struct list_head *list,
+ bool readback)
+{
+ struct vmw_view *entry, *next;
+
+ WARN_ON_ONCE(!mutex_is_locked(&dev_priv->binding_mutex));
+
+ list_for_each_entry_safe(entry, next, list, cotable_head)
+ WARN_ON(vmw_view_destroy(&entry->res));
+}
+
+/**
+ * vmw_view_surface_list_destroy - Evict all views pointing to a surface
+ *
+ * @dev_priv: Pointer to a device private struct.
+ * @list: List of views pointing to a surface.
+ *
+ * This function evicts all views pointing to a surface. This is typically
+ * called before the surface is evicted.
+ */
+void vmw_view_surface_list_destroy(struct vmw_private *dev_priv,
+ struct list_head *list)
+{
+ struct vmw_view *entry, *next;
+
+ WARN_ON_ONCE(!mutex_is_locked(&dev_priv->binding_mutex));
+
+ list_for_each_entry_safe(entry, next, list, srf_head)
+ WARN_ON(vmw_view_destroy(&entry->res));
+}
+
+/**
+ * vmw_view_srf - Return a non-refcounted pointer to the surface a view is
+ * pointing to.
+ *
+ * @res: pointer to a view resource.
+ *
+ * Note that the view itself is holding a reference, so as long as
+ * the view resource is alive, the surface resource will be.
+ */
+struct vmw_resource *vmw_view_srf(struct vmw_resource *res)
+{
+ return vmw_view(res)->srf;
+}
+
+/**
+ * vmw_view_lookup - Look up a view.
+ *
+ * @man: The context's cmdbuf ref manager.
+ * @view_type: The view type.
+ * @user_key: The view user id.
+ *
+ * Returns a refcounted pointer to a view or an error pointer if not found.
+ */ +struct vmw_resource *vmw_view_lookup(struct vmw_cmdbuf_res_manager *man, + enum vmw_view_type view_type, + u32 user_key) +{ + return vmw_cmdbuf_res_lookup(man, vmw_cmdbuf_res_view, + vmw_view_key(user_key, view_type)); +} + +const u32 vmw_view_destroy_cmds[] = { + [vmw_view_sr] = SVGA_3D_CMD_DX_DESTROY_SHADERRESOURCE_VIEW, + [vmw_view_rt] = SVGA_3D_CMD_DX_DESTROY_RENDERTARGET_VIEW, + [vmw_view_ds] = SVGA_3D_CMD_DX_DESTROY_DEPTHSTENCIL_VIEW, +}; + +const SVGACOTableType vmw_view_cotables[] = { + [vmw_view_sr] = SVGA_COTABLE_SRVIEW, + [vmw_view_rt] = SVGA_COTABLE_RTVIEW, + [vmw_view_ds] = SVGA_COTABLE_DSVIEW, +}; + +const SVGACOTableType vmw_so_cotables[] = { + [vmw_so_el] = SVGA_COTABLE_ELEMENTLAYOUT, + [vmw_so_bs] = SVGA_COTABLE_BLENDSTATE, + [vmw_so_ds] = SVGA_COTABLE_DEPTHSTENCIL, + [vmw_so_rs] = SVGA_COTABLE_RASTERIZERSTATE, + [vmw_so_ss] = SVGA_COTABLE_SAMPLER, + [vmw_so_so] = SVGA_COTABLE_STREAMOUTPUT +}; + + +/* To remove unused function warning */ +static void vmw_so_build_asserts(void) __attribute__((used)); + + +/* + * This function is unused at run-time, and only used to dump various build + * asserts important for code optimization assumptions. + */ +static void vmw_so_build_asserts(void) +{ + /* Assert that our vmw_view_cmd_to_type() function is correct. */ + BUILD_BUG_ON(SVGA_3D_CMD_DX_DESTROY_SHADERRESOURCE_VIEW != + SVGA_3D_CMD_DX_DEFINE_SHADERRESOURCE_VIEW + 1); + BUILD_BUG_ON(SVGA_3D_CMD_DX_DEFINE_RENDERTARGET_VIEW != + SVGA_3D_CMD_DX_DEFINE_SHADERRESOURCE_VIEW + 2); + BUILD_BUG_ON(SVGA_3D_CMD_DX_DESTROY_RENDERTARGET_VIEW != + SVGA_3D_CMD_DX_DEFINE_SHADERRESOURCE_VIEW + 3); + BUILD_BUG_ON(SVGA_3D_CMD_DX_DEFINE_DEPTHSTENCIL_VIEW != + SVGA_3D_CMD_DX_DEFINE_SHADERRESOURCE_VIEW + 4); + BUILD_BUG_ON(SVGA_3D_CMD_DX_DESTROY_DEPTHSTENCIL_VIEW != + SVGA_3D_CMD_DX_DEFINE_SHADERRESOURCE_VIEW + 5); + + /* Assert that our "one body fits all" assumption is valid */ + BUILD_BUG_ON(sizeof(union vmw_view_destroy) != sizeof(u32)); + + /* Assert that the view key space can hold all view ids. */ + BUILD_BUG_ON(SVGA_COTABLE_MAX_IDS >= ((1 << 20) - 1)); + + /* + * Assert that the offset of sid in all view define commands + * is what we assume it to be. + */ + BUILD_BUG_ON(offsetof(struct vmw_view_define, sid) != + offsetof(SVGA3dCmdDXDefineShaderResourceView, sid)); + BUILD_BUG_ON(offsetof(struct vmw_view_define, sid) != + offsetof(SVGA3dCmdDXDefineRenderTargetView, sid)); + BUILD_BUG_ON(offsetof(struct vmw_view_define, sid) != + offsetof(SVGA3dCmdDXDefineDepthStencilView, sid)); +} diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_so.h b/drivers/gpu/drm/vmwgfx/vmwgfx_so.h new file mode 100644 index 000000000000..5ef867a9e0d5 --- /dev/null +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_so.h @@ -0,0 +1,160 @@ +/************************************************************************** + * Copyright © 2014 VMware, Inc., Palo Alto, CA., USA + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+#ifndef VMW_SO_H
+#define VMW_SO_H
+
+enum vmw_view_type {
+ vmw_view_sr,
+ vmw_view_rt,
+ vmw_view_ds,
+ vmw_view_max,
+};
+
+enum vmw_so_type {
+ vmw_so_el,
+ vmw_so_bs,
+ vmw_so_ds,
+ vmw_so_rs,
+ vmw_so_ss,
+ vmw_so_so,
+ vmw_so_max,
+};
+
+/**
+ * union vmw_view_destroy - view destruction command body
+ *
+ * @rtv: RenderTarget view destruction command body
+ * @srv: ShaderResource view destruction command body
+ * @dsv: DepthStencil view destruction command body
+ * @view_id: A single u32 view id.
+ *
+ * The assumption here is that all union members are really represented by a
+ * single u32 in the command stream. If that's not the case,
+ * the size of this union will not equal the size of a u32, and the
+ * assumption is invalid, and we detect that at compile time in the
+ * vmw_so_build_asserts() function.
+ */
+union vmw_view_destroy {
+ struct SVGA3dCmdDXDestroyRenderTargetView rtv;
+ struct SVGA3dCmdDXDestroyShaderResourceView srv;
+ struct SVGA3dCmdDXDestroyDepthStencilView dsv;
+ u32 view_id;
+};
+
+/* Map enum vmw_view_type to view destroy command ids */
+extern const u32 vmw_view_destroy_cmds[];
+
+/* Map enum vmw_view_type to SVGACOTableType */
+extern const SVGACOTableType vmw_view_cotables[];
+
+/* Map enum vmw_so_type to SVGACOTableType */
+extern const SVGACOTableType vmw_so_cotables[];
+
+/*
+ * vmw_view_cmd_to_type - Return the view type for a create or destroy command
+ *
+ * @id: The SVGA3D command id.
+ *
+ * For a given view create or destroy command id, return the corresponding
+ * enum vmw_view_type. If the command is unknown, return vmw_view_max.
+ * The validity of the simplified calculation is verified in the
+ * vmw_so_build_asserts() function.
+ */
+static inline enum vmw_view_type vmw_view_cmd_to_type(u32 id)
+{
+ u32 tmp = (id - SVGA_3D_CMD_DX_DEFINE_SHADERRESOURCE_VIEW) / 2;
+
+ if (tmp > (u32)vmw_view_max)
+ return vmw_view_max;
+
+ return (enum vmw_view_type) tmp;
+}
+
+/*
+ * vmw_so_cmd_to_type - Return the state object type for a
+ * create or destroy command
+ *
+ * @id: The SVGA3D command id.
+ *
+ * For a given state object create or destroy command id,
+ * return the corresponding enum vmw_so_type. If the command is unknown,
+ * return vmw_so_max. We should perhaps optimize this function using
+ * a similar strategy as vmw_view_cmd_to_type().
+ */ +static inline enum vmw_so_type vmw_so_cmd_to_type(u32 id) +{ + switch (id) { + case SVGA_3D_CMD_DX_DEFINE_ELEMENTLAYOUT: + case SVGA_3D_CMD_DX_DESTROY_ELEMENTLAYOUT: + return vmw_so_el; + case SVGA_3D_CMD_DX_DEFINE_BLEND_STATE: + case SVGA_3D_CMD_DX_DESTROY_BLEND_STATE: + return vmw_so_bs; + case SVGA_3D_CMD_DX_DEFINE_DEPTHSTENCIL_STATE: + case SVGA_3D_CMD_DX_DESTROY_DEPTHSTENCIL_STATE: + return vmw_so_ds; + case SVGA_3D_CMD_DX_DEFINE_RASTERIZER_STATE: + case SVGA_3D_CMD_DX_DESTROY_RASTERIZER_STATE: + return vmw_so_rs; + case SVGA_3D_CMD_DX_DEFINE_SAMPLER_STATE: + case SVGA_3D_CMD_DX_DESTROY_SAMPLER_STATE: + return vmw_so_ss; + case SVGA_3D_CMD_DX_DEFINE_STREAMOUTPUT: + case SVGA_3D_CMD_DX_DESTROY_STREAMOUTPUT: + return vmw_so_so; + default: + break; + } + return vmw_so_max; +} + +/* + * View management - vmwgfx_so.c + */ +extern int vmw_view_add(struct vmw_cmdbuf_res_manager *man, + struct vmw_resource *ctx, + struct vmw_resource *srf, + enum vmw_view_type view_type, + u32 user_key, + const void *cmd, + size_t cmd_size, + struct list_head *list); + +extern int vmw_view_remove(struct vmw_cmdbuf_res_manager *man, + u32 user_key, enum vmw_view_type view_type, + struct list_head *list, + struct vmw_resource **res_p); + +extern void vmw_view_surface_list_destroy(struct vmw_private *dev_priv, + struct list_head *view_list); +extern void vmw_view_cotable_list_destroy(struct vmw_private *dev_priv, + struct list_head *list, + bool readback); +extern struct vmw_resource *vmw_view_srf(struct vmw_resource *res); +extern struct vmw_resource *vmw_view_lookup(struct vmw_cmdbuf_res_manager *man, + enum vmw_view_type view_type, + u32 user_key); +#endif diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c b/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c index d4a453703eed..ae6773e171b0 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c @@ -561,6 +561,7 @@ static int vmw_stdu_crtc_set_config(struct drm_mode_set *set) true, /* a scanout buffer */ content_srf.mip_levels[0], content_srf.multisample_count, + 0, display_base_size, &display_srf); if (unlikely(ret != 0)) { diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c index eea1790eed6a..12ade0cf98d0 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c @@ -27,9 +27,12 @@ #include "vmwgfx_drv.h" #include "vmwgfx_resource_priv.h" +#include "vmwgfx_so.h" +#include "vmwgfx_binding.h" #include #include "device_include/svga3d_surfacedefs.h" + /** * struct vmw_user_surface - User-space visible surface resource * @@ -593,6 +596,7 @@ static int vmw_surface_init(struct vmw_private *dev_priv, * surface validate. 
*/ + INIT_LIST_HEAD(&srf->view_list); vmw_resource_activate(res, vmw_hw_surface_destroy); return ret; } @@ -723,6 +727,7 @@ int vmw_surface_define_ioctl(struct drm_device *dev, void *data, desc = svga3dsurface_get_desc(req->format); if (unlikely(desc->block_desc == SVGA3DBLOCKDESC_NONE)) { DRM_ERROR("Invalid surface format for surface creation.\n"); + DRM_ERROR("Format requested is: %d\n", req->format); return -EINVAL; } @@ -1018,12 +1023,16 @@ static int vmw_gb_surface_create(struct vmw_resource *res) { struct vmw_private *dev_priv = res->dev_priv; struct vmw_surface *srf = vmw_res_to_srf(res); - uint32_t cmd_len, submit_len; + uint32_t cmd_len, cmd_id, submit_len; int ret; struct { SVGA3dCmdHeader header; SVGA3dCmdDefineGBSurface body; } *cmd; + struct { + SVGA3dCmdHeader header; + SVGA3dCmdDefineGBSurface_v2 body; + } *cmd2; if (likely(res->id != -1)) return 0; @@ -1040,9 +1049,19 @@ static int vmw_gb_surface_create(struct vmw_resource *res) goto out_no_fifo; } - cmd_len = sizeof(cmd->body); - submit_len = sizeof(*cmd); + if (srf->array_size > 0) { + /* has_dx checked on creation time. */ + cmd_id = SVGA_3D_CMD_DEFINE_GB_SURFACE_V2; + cmd_len = sizeof(cmd2->body); + submit_len = sizeof(*cmd2); + } else { + cmd_id = SVGA_3D_CMD_DEFINE_GB_SURFACE; + cmd_len = sizeof(cmd->body); + submit_len = sizeof(*cmd); + } + cmd = vmw_fifo_reserve(dev_priv, submit_len); + cmd2 = (typeof(cmd2))cmd; if (unlikely(cmd == NULL)) { DRM_ERROR("Failed reserving FIFO space for surface " "creation.\n"); @@ -1050,17 +1069,33 @@ static int vmw_gb_surface_create(struct vmw_resource *res) goto out_no_fifo; } - cmd->header.id = SVGA_3D_CMD_DEFINE_GB_SURFACE; - cmd->header.size = cmd_len; - cmd->body.sid = srf->res.id; - cmd->body.surfaceFlags = srf->flags; - cmd->body.format = srf->format; - cmd->body.numMipLevels = srf->mip_levels[0]; - cmd->body.multisampleCount = srf->multisample_count; - cmd->body.autogenFilter = srf->autogen_filter; - cmd->body.size.width = srf->base_size.width; - cmd->body.size.height = srf->base_size.height; - cmd->body.size.depth = srf->base_size.depth; + if (srf->array_size > 0) { + cmd2->header.id = cmd_id; + cmd2->header.size = cmd_len; + cmd2->body.sid = srf->res.id; + cmd2->body.surfaceFlags = srf->flags; + cmd2->body.format = cpu_to_le32(srf->format); + cmd2->body.numMipLevels = srf->mip_levels[0]; + cmd2->body.multisampleCount = srf->multisample_count; + cmd2->body.autogenFilter = srf->autogen_filter; + cmd2->body.size.width = srf->base_size.width; + cmd2->body.size.height = srf->base_size.height; + cmd2->body.size.depth = srf->base_size.depth; + cmd2->body.arraySize = srf->array_size; + } else { + cmd->header.id = cmd_id; + cmd->header.size = cmd_len; + cmd->body.sid = srf->res.id; + cmd->body.surfaceFlags = srf->flags; + cmd->body.format = cpu_to_le32(srf->format); + cmd->body.numMipLevels = srf->mip_levels[0]; + cmd->body.multisampleCount = srf->multisample_count; + cmd->body.autogenFilter = srf->autogen_filter; + cmd->body.size.width = srf->base_size.width; + cmd->body.size.height = srf->base_size.height; + cmd->body.size.depth = srf->base_size.depth; + } + vmw_fifo_commit(dev_priv, submit_len); return 0; @@ -1188,6 +1223,7 @@ static int vmw_gb_surface_unbind(struct vmw_resource *res, static int vmw_gb_surface_destroy(struct vmw_resource *res) { struct vmw_private *dev_priv = res->dev_priv; + struct vmw_surface *srf = vmw_res_to_srf(res); struct { SVGA3dCmdHeader header; SVGA3dCmdDestroyGBSurface body; @@ -1197,7 +1233,8 @@ static int vmw_gb_surface_destroy(struct vmw_resource 
*res) return 0; mutex_lock(&dev_priv->binding_mutex); - vmw_context_binding_res_list_scrub(&res->binding_head); + vmw_view_surface_list_destroy(dev_priv, &srf->view_list); + vmw_binding_res_list_scrub(&res->binding_head); cmd = vmw_fifo_reserve(dev_priv, sizeof(*cmd)); if (unlikely(cmd == NULL)) { @@ -1259,6 +1296,7 @@ int vmw_gb_surface_define_ioctl(struct drm_device *dev, void *data, req->drm_surface_flags & drm_vmw_surface_flag_scanout, req->mip_levels, req->multisample_count, + req->array_size, req->base_size, &srf); if (unlikely(ret != 0)) @@ -1275,10 +1313,17 @@ int vmw_gb_surface_define_ioctl(struct drm_device *dev, void *data, res = &user_srf->srf.res; - if (req->buffer_handle != SVGA3D_INVALID_ID) + if (req->buffer_handle != SVGA3D_INVALID_ID) { ret = vmw_user_dmabuf_lookup(tfile, req->buffer_handle, &res->backup); - else if (req->drm_surface_flags & drm_vmw_surface_flag_create_buffer) + if (ret == 0 && res->backup->base.num_pages * PAGE_SIZE < + res->backup_size) { + DRM_ERROR("Surface backup buffer is too small.\n"); + vmw_dmabuf_unreference(&res->backup); + ret = -EINVAL; + goto out_unlock; + } + } else if (req->drm_surface_flags & drm_vmw_surface_flag_create_buffer) ret = vmw_user_dmabuf_alloc(dev_priv, tfile, res->backup_size, req->drm_surface_flags & @@ -1378,6 +1423,7 @@ int vmw_gb_surface_reference_ioctl(struct drm_device *dev, void *data, rep->creq.drm_surface_flags = 0; rep->creq.multisample_count = srf->multisample_count; rep->creq.autogen_filter = srf->autogen_filter; + rep->creq.array_size = srf->array_size; rep->creq.buffer_handle = backup_handle; rep->creq.base_size = srf->base_size; rep->crep.handle = user_srf->prime.base.hash.key; @@ -1404,6 +1450,7 @@ out_bad_resource: * @for_scanout: true if inteded to be used for scanout buffer * @num_mip_levels: number of MIP levels * @multisample_count: + * @array_size: Surface array size. * @size: width, heigh, depth of the surface requested * @user_srf_out: allocated user_srf. Set to NULL on failure. * @@ -1419,6 +1466,7 @@ int vmw_surface_gb_priv_define(struct drm_device *dev, bool for_scanout, uint32_t num_mip_levels, uint32_t multisample_count, + uint32_t array_size, struct drm_vmw_size size, struct vmw_surface **srf_out) { @@ -1426,7 +1474,7 @@ int vmw_surface_gb_priv_define(struct drm_device *dev, struct vmw_user_surface *user_srf; struct vmw_surface *srf; int ret; - + u32 num_layers; *srf_out = NULL; @@ -1445,6 +1493,12 @@ int vmw_surface_gb_priv_define(struct drm_device *dev, } } + /* array_size must be null for non-GL3 host. 
*/ + if (array_size > 0 && !dev_priv->has_dx) { + DRM_ERROR("Tried to create DX surface on non-DX host.\n"); + return -EINVAL; + } + ret = ttm_read_lock(&dev_priv->reservation_sem, true); if (unlikely(ret != 0)) return ret; @@ -1481,10 +1535,21 @@ int vmw_surface_gb_priv_define(struct drm_device *dev, srf->autogen_filter = SVGA3D_TEX_FILTER_NONE; srf->multisample_count = multisample_count; - srf->res.backup_size = svga3dsurface_get_serialized_size(srf->format, - srf->base_size, - srf->mip_levels[0], - srf->flags & SVGA3D_SURFACE_CUBEMAP); + if (array_size) + num_layers = array_size; + else if (svga3d_flags & SVGA3D_SURFACE_CUBEMAP) + num_layers = SVGA3D_MAX_SURFACE_FACES; + else + num_layers = 1; + + srf->res.backup_size = + svga3dsurface_get_serialized_size(srf->format, + srf->base_size, + srf->mip_levels[0], + num_layers); + + if (srf->flags & SVGA3D_SURFACE_BIND_STREAM_OUTPUT) + srf->res.backup_size += sizeof(SVGA3dDXSOState); if (dev_priv->active_display_unit == vmw_du_screen_target && for_scanout) diff --git a/include/uapi/drm/vmwgfx_drm.h b/include/uapi/drm/vmwgfx_drm.h index c8a863180174..c5bcddd9f58c 100644 --- a/include/uapi/drm/vmwgfx_drm.h +++ b/include/uapi/drm/vmwgfx_drm.h @@ -64,6 +64,7 @@ #define DRM_VMW_GB_SURFACE_CREATE 23 #define DRM_VMW_GB_SURFACE_REF 24 #define DRM_VMW_SYNCCPU 25 +#define DRM_VMW_CREATE_EXTENDED_CONTEXT 26 /*************************************************************************/ /** @@ -89,6 +90,7 @@ #define DRM_VMW_PARAM_MAX_MOB_MEMORY 9 #define DRM_VMW_PARAM_MAX_MOB_SIZE 10 #define DRM_VMW_PARAM_SCREEN_TARGET 11 +#define DRM_VMW_PARAM_DX 12 /** * enum drm_vmw_handle_type - handle type for ref ioctls @@ -297,7 +299,7 @@ union drm_vmw_surface_reference_arg { * Argument to the DRM_VMW_EXECBUF Ioctl. */ -#define DRM_VMW_EXECBUF_VERSION 1 +#define DRM_VMW_EXECBUF_VERSION 2 struct drm_vmw_execbuf_arg { uint64_t commands; @@ -306,6 +308,8 @@ struct drm_vmw_execbuf_arg { uint64_t fence_rep; uint32_t version; uint32_t flags; + uint32_t context_handle; + uint32_t pad64; }; /** @@ -826,7 +830,6 @@ struct drm_vmw_update_layout_arg { enum drm_vmw_shader_type { drm_vmw_shader_type_vs = 0, drm_vmw_shader_type_ps, - drm_vmw_shader_type_gs }; @@ -908,6 +911,8 @@ enum drm_vmw_surface_flags { * @buffer_handle Buffer handle of backup buffer. SVGA3D_INVALID_ID * if none. * @base_size Size of the base mip level for all faces. + * @array_size Must be zero for non-DX hardware, and if non-zero + * svga3d_flags must have proper bind flags setup. * * Input argument to the DRM_VMW_GB_SURFACE_CREATE Ioctl. * Part of output argument for the DRM_VMW_GB_SURFACE_REF Ioctl. @@ -920,7 +925,7 @@ struct drm_vmw_gb_surface_create_req { uint32_t multisample_count; uint32_t autogen_filter; uint32_t buffer_handle; - uint32_t pad64; + uint32_t array_size; struct drm_vmw_size base_size; }; @@ -1060,4 +1065,28 @@ struct drm_vmw_synccpu_arg { uint32_t pad64; }; +/*************************************************************************/ +/** + * DRM_VMW_CREATE_EXTENDED_CONTEXT - Create a host context. + * + * Allocates a device unique context id, and queues a create context command + * for the host. Does not wait for host completion. + */ +enum drm_vmw_extended_context { + drm_vmw_context_legacy, + drm_vmw_context_dx +}; + +/** + * union drm_vmw_extended_context_arg + * + * @req: Context type. + * @rep: Context identifier. + * + * Argument to the DRM_VMW_CREATE_EXTENDED_CONTEXT Ioctl. 
+ */ +union drm_vmw_extended_context_arg { + enum drm_vmw_extended_context req; + struct drm_vmw_context_arg rep; +}; #endif -- cgit v1.2.3 From 54fbde8a94a8a78547597215c9e4be590d075ee0 Mon Sep 17 00:00:00 2001 From: Sinclair Yeh Date: Wed, 29 Jul 2015 12:38:02 -0700 Subject: drm/vmwgfx: Fix copyright headers Updating and fixing copyright headers. Bump version minor to signal vgpu10 support. Signed-off-by: Sinclair Yeh Signed-off-by: Thomas Hellstrom Reviewed-by: Brian Paul --- drivers/gpu/drm/vmwgfx/vmwgfx_buffer.c | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf_res.c | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_context.c | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_cotable.c | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_dmabuf.c | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_drv.h | 6 +++--- drivers/gpu/drm/vmwgfx/vmwgfx_fb.c | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_fence.c | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_fence.h | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_gmr.c | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_ioctl.c | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_irq.c | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_kms.c | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_kms.h | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_ldu.c | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_mob.c | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_overlay.c | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_reg.h | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_resource.c | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_shader.c | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_so.c | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_so.h | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_surface.c | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c | 2 +- include/uapi/drm/vmwgfx_drm.h | 2 +- 30 files changed, 32 insertions(+), 32 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_buffer.c b/drivers/gpu/drm/vmwgfx/vmwgfx_buffer.c index 469a7042037d..3329f623c8bf 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_buffer.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_buffer.c @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright © 2009 VMware, Inc., Palo Alto, CA., USA + * Copyright © 2009-2015 VMware, Inc., Palo Alto, CA., USA * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf_res.c b/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf_res.c index 59d965f8b530..13db8a2851ed 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf_res.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf_res.c @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright © 2014 VMware, Inc., Palo Alto, CA., USA + * Copyright © 2014-2015 VMware, Inc., Palo Alto, CA., USA * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_context.c b/drivers/gpu/drm/vmwgfx/vmwgfx_context.c index 7b3356fed205..7ef77640028d 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_context.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_context.c @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright © 2009-2012 VMware, Inc., Palo Alto, CA., USA + * Copyright © 2009-2015 VMware, Inc., Palo Alto, CA., USA * All Rights Reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_cotable.c b/drivers/gpu/drm/vmwgfx/vmwgfx_cotable.c index 22bb04ffec78..ce659a125f2b 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_cotable.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_cotable.c @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright © 2014 VMware, Inc., Palo Alto, CA., USA + * Copyright © 2014-2015 VMware, Inc., Palo Alto, CA., USA * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_dmabuf.c b/drivers/gpu/drm/vmwgfx/vmwgfx_dmabuf.c index 9b4f0939d7bd..299925a1f6c6 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_dmabuf.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_dmabuf.c @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright © 2011 VMware, Inc., Palo Alto, CA., USA + * Copyright © 2011-2015 VMware, Inc., Palo Alto, CA., USA * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c index fd0cb8c67d05..5f849435ca4c 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright © 2009 VMware, Inc., Palo Alto, CA., USA + * Copyright © 2009-2015 VMware, Inc., Palo Alto, CA., USA * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h index 0e18dfb28ad5..8f40692cf48a 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright © 2009-2014 VMware, Inc., Palo Alto, CA., USA + * Copyright © 2009-2015 VMware, Inc., Palo Alto, CA., USA * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a @@ -40,9 +40,9 @@ #include #include "vmwgfx_fence.h" -#define VMWGFX_DRIVER_DATE "20150626" +#define VMWGFX_DRIVER_DATE "20150810" #define VMWGFX_DRIVER_MAJOR 2 -#define VMWGFX_DRIVER_MINOR 7 +#define VMWGFX_DRIVER_MINOR 9 #define VMWGFX_DRIVER_PATCHLEVEL 0 #define VMWGFX_FILE_PAGE_OFFSET 0x00100000 #define VMWGFX_FIFO_STATIC_SIZE (1024*1024) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fb.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fb.c index 9856803e7aba..042c5b4c706c 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_fb.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fb.c @@ -1,7 +1,7 @@ /************************************************************************** * * Copyright © 2007 David Airlie - * Copyright © 2009 VMware, Inc., Palo Alto, CA., USA + * Copyright © 2009-2015 VMware, Inc., Palo Alto, CA., USA * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c index 75d6222b510a..567ddede51d1 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright © 2011 VMware, Inc., Palo Alto, CA., USA + * Copyright © 2011-2014 VMware, Inc., Palo Alto, CA., USA * All Rights Reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fence.h b/drivers/gpu/drm/vmwgfx/vmwgfx_fence.h index 26a4add39208..8be6c29f5eb5 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_fence.h +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fence.h @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright © 2011 VMware, Inc., Palo Alto, CA., USA + * Copyright © 2011-2012 VMware, Inc., Palo Alto, CA., USA * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c index 3c876d4826c0..80c40c31d4f8 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright © 2009 VMware, Inc., Palo Alto, CA., USA + * Copyright © 2009-2015 VMware, Inc., Palo Alto, CA., USA * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_gmr.c b/drivers/gpu/drm/vmwgfx/vmwgfx_gmr.c index 61d8d803199f..66ffa1d4759c 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_gmr.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_gmr.c @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright © 2009-2011 VMware, Inc., Palo Alto, CA., USA + * Copyright © 2009-2015 VMware, Inc., Palo Alto, CA., USA * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ioctl.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ioctl.c index 893359c8d522..0a970afed93b 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_ioctl.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ioctl.c @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright © 2009 VMware, Inc., Palo Alto, CA., USA + * Copyright © 2009-2015 VMware, Inc., Palo Alto, CA., USA * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c index 2c2bac4a0fd6..9498a5e33c12 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright © 2009 VMware, Inc., Palo Alto, CA., USA + * Copyright © 2009-2015 VMware, Inc., Palo Alto, CA., USA * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c index f961bb98cdaa..5d72298918d9 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright © 2009-2014 VMware, Inc., Palo Alto, CA., USA + * Copyright © 2009-2015 VMware, Inc., Palo Alto, CA., USA * All Rights Reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h index eb6c8536866f..876de908fce3 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright © 2009-2014 VMware, Inc., Palo Alto, CA., USA + * Copyright © 2009-2015 VMware, Inc., Palo Alto, CA., USA * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ldu.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ldu.c index 55038457a096..bb63e4d795fa 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_ldu.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ldu.c @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright © 2009 VMware, Inc., Palo Alto, CA., USA + * Copyright © 2009-2015 VMware, Inc., Palo Alto, CA., USA * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_mob.c b/drivers/gpu/drm/vmwgfx/vmwgfx_mob.c index a8203a9e1050..23db16008e39 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_mob.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_mob.c @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright © 2012 VMware, Inc., Palo Alto, CA., USA + * Copyright © 2012-2015 VMware, Inc., Palo Alto, CA., USA * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_overlay.c b/drivers/gpu/drm/vmwgfx/vmwgfx_overlay.c index 755e94132a3b..76069f093ccf 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_overlay.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_overlay.c @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright © 2009 VMware, Inc., Palo Alto, CA., USA + * Copyright © 2009-2014 VMware, Inc., Palo Alto, CA., USA * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_reg.h b/drivers/gpu/drm/vmwgfx/vmwgfx_reg.h index 529295da1fe9..dce798053a96 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_reg.h +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_reg.h @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright © 2009 VMware, Inc., Palo Alto, CA., USA + * Copyright © 2009-2014 VMware, Inc., Palo Alto, CA., USA * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c index bcd342dd8b96..c1912f852b42 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright © 2009 VMware, Inc., Palo Alto, CA., USA + * Copyright © 2009-2015 VMware, Inc., Palo Alto, CA., USA * All Rights Reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h b/drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h index 743e2adafed2..5994ef6265e0 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright © 2012 VMware, Inc., Palo Alto, CA., USA + * Copyright © 2012-2014 VMware, Inc., Palo Alto, CA., USA * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c b/drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c index 2af3fa1b1904..b96d1ab610c5 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright © 2011-2014 VMware, Inc., Palo Alto, CA., USA + * Copyright © 2011-2015 VMware, Inc., Palo Alto, CA., USA * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_shader.c b/drivers/gpu/drm/vmwgfx/vmwgfx_shader.c index 61403ebe3a1e..bba1ee395478 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_shader.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_shader.c @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright © 2009-2012 VMware, Inc., Palo Alto, CA., USA + * Copyright © 2009-2015 VMware, Inc., Palo Alto, CA., USA * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_so.c b/drivers/gpu/drm/vmwgfx/vmwgfx_so.c index 4dfdc95b2cfe..5a73eebd0f35 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_so.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_so.c @@ -1,5 +1,5 @@ /************************************************************************** - * Copyright © 2014 VMware, Inc., Palo Alto, CA., USA + * Copyright © 2014-2015 VMware, Inc., Palo Alto, CA., USA * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_so.h b/drivers/gpu/drm/vmwgfx/vmwgfx_so.h index 5ef867a9e0d5..268738387b5e 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_so.h +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_so.h @@ -1,5 +1,5 @@ /************************************************************************** - * Copyright © 2014 VMware, Inc., Palo Alto, CA., USA + * Copyright © 2014-2015 VMware, Inc., Palo Alto, CA., USA * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c b/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c index ae6773e171b0..c22e2df1b336 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c @@ -1,6 +1,6 @@ /****************************************************************************** * - * COPYRIGHT © 2014 VMware, Inc., Palo Alto, CA., USA + * COPYRIGHT © 2014-2015 VMware, Inc., Palo Alto, CA., USA * All Rights Reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c index ca496a6eb59f..5b8595b78429 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright © 2009-2014 VMware, Inc., Palo Alto, CA., USA + * Copyright © 2009-2015 VMware, Inc., Palo Alto, CA., USA * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c index 98d6bfb3a997..e771091d2cd3 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright © 2009 VMware, Inc., Palo Alto, CA., USA + * Copyright © 2009-2011 VMware, Inc., Palo Alto, CA., USA * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a diff --git a/include/uapi/drm/vmwgfx_drm.h b/include/uapi/drm/vmwgfx_drm.h index c5bcddd9f58c..05b204954d16 100644 --- a/include/uapi/drm/vmwgfx_drm.h +++ b/include/uapi/drm/vmwgfx_drm.h @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright © 2009 VMware, Inc., Palo Alto, CA., USA + * Copyright © 2009-2015 VMware, Inc., Palo Alto, CA., USA * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a -- cgit v1.2.3 From 35103d11173b8fea874183f8aa508ae71234d299 Mon Sep 17 00:00:00 2001 From: Andy Gospodarek Date: Thu, 13 Aug 2015 10:39:01 -0400 Subject: net: ipv6 sysctl option to ignore routes when nexthop link is down Like the ipv4 patch with a similar title, this adds a sysctl to allow the user to change routing behavior based on whether or not the interface associated with the nexthop was an up or down link. The default setting preserves the current behavior, but anyone that enables it will notice that nexthops on down interfaces will no longer be selected: net.ipv6.conf.all.ignore_routes_with_linkdown = 0 net.ipv6.conf.default.ignore_routes_with_linkdown = 0 net.ipv6.conf.lo.ignore_routes_with_linkdown = 0 ... When the above sysctls are set, not only will link status be reported to userspace, but an indication that a nexthop is dead and will not be used is also reported. 1000::/8 via 7000::2 dev p7p1 metric 1024 dead linkdown pref medium 1000::/8 via 8000::2 dev p8p1 metric 1024 pref medium 7000::/8 dev p7p1 proto kernel metric 256 dead linkdown pref medium 8000::/8 dev p8p1 proto kernel metric 256 pref medium 9000::/8 via 8000::2 dev p8p1 metric 2048 pref medium 9000::/8 via 7000::2 dev p7p1 metric 1024 dead linkdown pref medium fe80::/64 dev p7p1 proto kernel metric 256 dead linkdown pref medium fe80::/64 dev p8p1 proto kernel metric 256 pref medium This also adds devconf support and notification when sysctl values change. v2: drop use of rt6i_nhflags since it is not needed right now Signed-off-by: Andy Gospodarek Signed-off-by: Dinesh Dutt Signed-off-by: David S. 
Miller --- include/linux/ipv6.h | 1 + include/uapi/linux/ipv6.h | 1 + net/ipv6/addrconf.c | 105 +++++++++++++++++++++++++++++++++++++++++++++- net/ipv6/route.c | 11 ++++- 4 files changed, 116 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index cb9dcad72372..f1f32af6d9b9 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -31,6 +31,7 @@ struct ipv6_devconf { __s32 accept_ra_defrtr; __s32 accept_ra_min_hop_limit; __s32 accept_ra_pinfo; + __s32 ignore_routes_with_linkdown; #ifdef CONFIG_IPV6_ROUTER_PREF __s32 accept_ra_rtr_pref; __s32 rtr_probe_interval; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 80f3b74446a1..38b4fef20219 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -173,6 +173,7 @@ enum { DEVCONF_STABLE_SECRET, DEVCONF_USE_OIF_ADDRS_ONLY, DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT, + DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 53e3a9d756b0..5dfbac72f1ab 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -214,6 +214,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .initialized = false, }, .use_oif_addrs_only = 0, + .ignore_routes_with_linkdown = 0, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -257,6 +258,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .initialized = false, }, .use_oif_addrs_only = 0, + .ignore_routes_with_linkdown = 0, }; /* Check if a valid qdisc is available */ @@ -472,6 +474,9 @@ static int inet6_netconf_msgsize_devconf(int type) if (type == -1 || type == NETCONFA_PROXY_NEIGH) size += nla_total_size(4); + if (type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) + size += nla_total_size(4); + return size; } @@ -508,6 +513,11 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex, nla_put_s32(skb, NETCONFA_PROXY_NEIGH, devconf->proxy_ndp) < 0) goto nla_put_failure; + if ((type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) && + nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, + devconf->ignore_routes_with_linkdown) < 0) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; @@ -544,6 +554,7 @@ static const struct nla_policy devconf_ipv6_policy[NETCONFA_MAX+1] = { [NETCONFA_IFINDEX] = { .len = sizeof(int) }, [NETCONFA_FORWARDING] = { .len = sizeof(int) }, [NETCONFA_PROXY_NEIGH] = { .len = sizeof(int) }, + [NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN] = { .len = sizeof(int) }, }; static int inet6_netconf_get_devconf(struct sk_buff *in_skb, @@ -766,6 +777,63 @@ static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf) rt6_purge_dflt_routers(net); return 1; } + +static void addrconf_linkdown_change(struct net *net, __s32 newf) +{ + struct net_device *dev; + struct inet6_dev *idev; + + for_each_netdev(net, dev) { + idev = __in6_dev_get(dev); + if (idev) { + int changed = (!idev->cnf.ignore_routes_with_linkdown) ^ (!newf); + + idev->cnf.ignore_routes_with_linkdown = newf; + if (changed) + inet6_netconf_notify_devconf(dev_net(dev), + NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, + dev->ifindex, + &idev->cnf); + } + } +} + +static int addrconf_fixup_linkdown(struct ctl_table *table, int *p, int newf) +{ + struct net *net; + int old; + + if (!rtnl_trylock()) + return restart_syscall(); + + net = (struct net *)table->extra2; + old = *p; + *p = newf; + + if (p == &net->ipv6.devconf_dflt->ignore_routes_with_linkdown) { + if ((!newf) ^ (!old)) + 
inet6_netconf_notify_devconf(net, + NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, + NETCONFA_IFINDEX_DEFAULT, + net->ipv6.devconf_dflt); + rtnl_unlock(); + return 0; + } + + if (p == &net->ipv6.devconf_all->ignore_routes_with_linkdown) { + net->ipv6.devconf_dflt->ignore_routes_with_linkdown = newf; + addrconf_linkdown_change(net, newf); + if ((!newf) ^ (!old)) + inet6_netconf_notify_devconf(net, + NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, + NETCONFA_IFINDEX_ALL, + net->ipv6.devconf_all); + } + rtnl_unlock(); + + return 1; +} + #endif /* Nobody refers to this ifaddr, destroy it */ @@ -4616,6 +4684,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_SUPPRESS_FRAG_NDISC] = cnf->suppress_frag_ndisc; array[DEVCONF_ACCEPT_RA_FROM_LOCAL] = cnf->accept_ra_from_local; array[DEVCONF_ACCEPT_RA_MTU] = cnf->accept_ra_mtu; + array[DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN] = cnf->ignore_routes_with_linkdown; /* we omit DEVCONF_STABLE_SECRET for now */ array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only; } @@ -5338,6 +5407,34 @@ out: return err; } +static +int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl, + int write, + void __user *buffer, + size_t *lenp, + loff_t *ppos) +{ + int *valp = ctl->data; + int val = *valp; + loff_t pos = *ppos; + struct ctl_table lctl; + int ret; + + /* ctl->data points to idev->cnf.ignore_routes_when_linkdown + * we should not modify it until we get the rtnl lock. + */ + lctl = *ctl; + lctl.data = &val; + + ret = proc_dointvec(&lctl, write, buffer, lenp, ppos); + + if (write) + ret = addrconf_fixup_linkdown(ctl, valp, val); + if (ret) + *ppos = pos; + return ret; +} + static struct addrconf_sysctl_table { struct ctl_table_header *sysctl_header; @@ -5629,7 +5726,13 @@ static struct addrconf_sysctl_table .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, - + }, + { + .procname = "ignore_routes_with_linkdown", + .data = &ipv6_devconf.ignore_routes_with_linkdown, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = addrconf_sysctl_ignore_routes_with_linkdown, }, { /* sentinel */ diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 370f72785385..1c0217e61357 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -665,6 +665,12 @@ static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict, { int m; bool match_do_rr = false; + struct inet6_dev *idev = rt->rt6i_idev; + struct net_device *dev = rt->dst.dev; + + if (dev && !netif_carrier_ok(dev) && + idev->cnf.ignore_routes_with_linkdown) + goto out; if (rt6_check_expired(rt)) goto out; @@ -2887,8 +2893,11 @@ static int rt6_fill_node(struct net *net, else rtm->rtm_type = RTN_UNICAST; rtm->rtm_flags = 0; - if (!netif_carrier_ok(rt->dst.dev)) + if (!netif_carrier_ok(rt->dst.dev)) { rtm->rtm_flags |= RTNH_F_LINKDOWN; + if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown) + rtm->rtm_flags |= RTNH_F_DEAD; + } rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_protocol = rt->rt6i_protocol; if (rt->rt6i_flags & RTF_DYNAMIC) -- cgit v1.2.3 From 4e3c89920cd3a6cfce22c6f537690747c26128dd Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 13 Aug 2015 14:59:00 -0600 Subject: net: Introduce VRF related flags and helpers Add a VRF_MASTER flag for interfaces and helper functions for determining if a device is a VRF_MASTER. Add link attribute for passing VRF_TABLE id. Add vrf_ptr to netdevice. Add various macros for determining if a device is a VRF device, the index of the master VRF device and table associated with VRF device. 
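For illustration, these helpers compose roughly as follows (a minimal sketch; the function name and the RT_TABLE_MAIN fallback are illustrative assumptions, not part of this patch):

/* Hypothetical caller: pick the FIB table to use for lookups on a
 * device. vrf_dev_table() (added below) returns the VRF table id, or
 * 0 when the device is not related to a VRF, so fall back to main.
 */
static u32 example_fib_table(const struct net_device *dev)
{
	int tb_id = vrf_dev_table(dev);

	return tb_id ? : RT_TABLE_MAIN;
}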
Signed-off-by: Shrijeet Mukherjee Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/linux/netdevice.h | 20 +++++++ include/net/vrf.h | 139 +++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/if_link.h | 9 +++ 3 files changed, 168 insertions(+) create mode 100644 include/net/vrf.h (limited to 'include/uapi') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 607b5f41f46f..f7a6ef2fae3a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1289,6 +1289,7 @@ enum netdev_priv_flags { IFF_XMIT_DST_RELEASE_PERM = 1<<22, IFF_IPVLAN_MASTER = 1<<23, IFF_IPVLAN_SLAVE = 1<<24, + IFF_VRF_MASTER = 1<<25, }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1316,6 +1317,7 @@ enum netdev_priv_flags { #define IFF_XMIT_DST_RELEASE_PERM IFF_XMIT_DST_RELEASE_PERM #define IFF_IPVLAN_MASTER IFF_IPVLAN_MASTER #define IFF_IPVLAN_SLAVE IFF_IPVLAN_SLAVE +#define IFF_VRF_MASTER IFF_VRF_MASTER /** * struct net_device - The DEVICE structure. @@ -1432,6 +1434,7 @@ enum netdev_priv_flags { * @dn_ptr: DECnet specific data * @ip6_ptr: IPv6 specific data * @ax25_ptr: AX.25 specific data + * @vrf_ptr: VRF specific data * @ieee80211_ptr: IEEE 802.11 specific data, assign before registering * * @last_rx: Time of last Rx @@ -1650,6 +1653,7 @@ struct net_device { struct dn_dev __rcu *dn_ptr; struct inet6_dev __rcu *ip6_ptr; void *ax25_ptr; + struct net_vrf_dev __rcu *vrf_ptr; struct wireless_dev *ieee80211_ptr; struct wpan_dev *ieee802154_ptr; #if IS_ENABLED(CONFIG_MPLS_ROUTING) @@ -3808,6 +3812,22 @@ static inline bool netif_supports_nofcs(struct net_device *dev) return dev->priv_flags & IFF_SUPP_NOFCS; } +static inline bool netif_is_vrf(const struct net_device *dev) +{ + return dev->priv_flags & IFF_VRF_MASTER; +} + +static inline bool netif_index_is_vrf(struct net *net, int ifindex) +{ + struct net_device *dev = dev_get_by_index_rcu(net, ifindex); + bool rc = false; + + if (dev) + rc = netif_is_vrf(dev); + + return rc; +} + /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */ static inline void netif_keep_dst(struct net_device *dev) { diff --git a/include/net/vrf.h b/include/net/vrf.h new file mode 100644 index 000000000000..0484d29d4589 --- /dev/null +++ b/include/net/vrf.h @@ -0,0 +1,139 @@ +/* + * include/net/net_vrf.h - adds vrf dev structure definitions + * Copyright (c) 2015 Cumulus Networks + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#ifndef __LINUX_NET_VRF_H +#define __LINUX_NET_VRF_H + +struct net_vrf_dev { + struct rcu_head rcu; + int ifindex; /* ifindex of master dev */ + u32 tb_id; /* table id for VRF */ +}; + +struct slave { + struct list_head list; + struct net_device *dev; +}; + +struct slave_queue { + struct list_head all_slaves; + int num_slaves; +}; + +struct net_vrf { + struct slave_queue queue; + struct rtable *rth; + u32 tb_id; +}; + + +#if IS_ENABLED(CONFIG_NET_VRF) +/* called with rcu_read_lock() */ +static inline int vrf_master_ifindex_rcu(const struct net_device *dev) +{ + struct net_vrf_dev *vrf_ptr; + int ifindex = 0; + + if (!dev) + return 0; + + if (netif_is_vrf(dev)) + ifindex = dev->ifindex; + else { + vrf_ptr = rcu_dereference(dev->vrf_ptr); + if (vrf_ptr) + ifindex = vrf_ptr->ifindex; + } + + return ifindex; +} + +/* called with rcu_read_lock */ +static inline int vrf_dev_table_rcu(const struct net_device *dev) +{ + int tb_id = 0; + + if (dev) { + struct net_vrf_dev *vrf_ptr; + + vrf_ptr = rcu_dereference(dev->vrf_ptr); + if (vrf_ptr) + tb_id = vrf_ptr->tb_id; + } + return tb_id; +} + +static inline int vrf_dev_table(const struct net_device *dev) +{ + int tb_id; + + rcu_read_lock(); + tb_id = vrf_dev_table_rcu(dev); + rcu_read_unlock(); + + return tb_id; +} + +/* called with rtnl */ +static inline int vrf_dev_table_rtnl(const struct net_device *dev) +{ + int tb_id = 0; + + if (dev) { + struct net_vrf_dev *vrf_ptr; + + vrf_ptr = rtnl_dereference(dev->vrf_ptr); + if (vrf_ptr) + tb_id = vrf_ptr->tb_id; + } + return tb_id; +} + +/* caller has already checked netif_is_vrf(dev) */ +static inline struct rtable *vrf_dev_get_rth(const struct net_device *dev) +{ + struct rtable *rth = ERR_PTR(-ENETUNREACH); + struct net_vrf *vrf = netdev_priv(dev); + + if (vrf) { + rth = vrf->rth; + atomic_inc(&rth->dst.__refcnt); + } + return rth; +} + +#else +static inline int vrf_master_ifindex_rcu(const struct net_device *dev) +{ + return 0; +} + +static inline int vrf_dev_table_rcu(const struct net_device *dev) +{ + return 0; +} + +static inline int vrf_dev_table(const struct net_device *dev) +{ + return 0; +} + +static inline int vrf_dev_table_rtnl(const struct net_device *dev) +{ + return 0; +} + +static inline struct rtable *vrf_dev_get_rth(const struct net_device *dev) +{ + return ERR_PTR(-ENETUNREACH); +} +#endif + +#endif /* __LINUX_NET_VRF_H */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index d450be36add2..313c305fd1ad 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -341,6 +341,15 @@ enum macvlan_macaddr_mode { #define MACVLAN_FLAG_NOPROMISC 1 +/* VRF section */ +enum { + IFLA_VRF_UNSPEC, + IFLA_VRF_TABLE, + __IFLA_VRF_MAX +}; + +#define IFLA_VRF_MAX (__IFLA_VRF_MAX - 1) + /* IPVLAN section */ enum { IFLA_IPVLAN_UNSPEC, -- cgit v1.2.3 From a1c234f95cae2d293047bb6c36e7a4840dbac815 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Fri, 14 Aug 2015 16:40:40 +0200 Subject: lwtunnel: rename ip lwtunnel attributes We already have IFLA_IPTUN_ netlink attributes. The IP_TUN_ attributes look very similar, yet they serve very different purpose. This is confusing for anyone trying to implement a user space tool supporting lwt. As the IP_TUN_ attributes are used only for the lightweight tunnels, prefix them with LWTUNNEL_IP_ instead to make their purpose clear. Also, it's more logical to have them in lwtunnel.h together with the encap enum. 
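For illustration, after this rename a user space tool would nest the attributes under RTA_ENCAP roughly as follows (a sketch assuming libmnl helpers and a partially built nlmsghdr *nlh; the value variables are made up):

struct nlattr *nest = mnl_attr_nest_start(nlh, RTA_ENCAP);

/* The renamed attributes, typed as the kernel's policy below expects. */
mnl_attr_put_u64(nlh, LWTUNNEL_IP_ID, tun_id);        /* NLA_U64 tunnel key */
mnl_attr_put_u32(nlh, LWTUNNEL_IP_DST, remote_ipv4);  /* NLA_U32, network byte order */
mnl_attr_nest_end(nlh, nest);
mnl_attr_put_u16(nlh, RTA_ENCAP_TYPE, LWTUNNEL_ENCAP_IP);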
Fixes: 3093fbe7ff4b ("route: Per route IP tunnel metadata via lightweight tunnel") Signed-off-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/uapi/linux/lwtunnel.h | 14 +++++++ include/uapi/linux/rtnetlink.h | 15 -------- net/ipv4/ip_tunnel_core.c | 86 +++++++++++++++++++++--------------------- 3 files changed, 57 insertions(+), 58 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index 31377bbea3f8..3bf223bc2367 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -12,5 +12,19 @@ enum lwtunnel_encap_types { #define LWTUNNEL_ENCAP_MAX (__LWTUNNEL_ENCAP_MAX - 1) +enum lwtunnel_ip_t { + LWTUNNEL_IP_UNSPEC, + LWTUNNEL_IP_ID, + LWTUNNEL_IP_DST, + LWTUNNEL_IP_SRC, + LWTUNNEL_IP_TTL, + LWTUNNEL_IP_TOS, + LWTUNNEL_IP_SPORT, + LWTUNNEL_IP_DPORT, + LWTUNNEL_IP_FLAGS, + __LWTUNNEL_IP_MAX, +}; + +#define LWTUNNEL_IP_MAX (__LWTUNNEL_IP_MAX - 1) #endif /* _UAPI_LWTUNNEL_H_ */ diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 47d24cb3fbc1..0d3d3cc43356 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -286,21 +286,6 @@ enum rt_class_t { /* Routing message attributes */ -enum ip_tunnel_t { - IP_TUN_UNSPEC, - IP_TUN_ID, - IP_TUN_DST, - IP_TUN_SRC, - IP_TUN_TTL, - IP_TUN_TOS, - IP_TUN_SPORT, - IP_TUN_DPORT, - IP_TUN_FLAGS, - __IP_TUN_MAX, -}; - -#define IP_TUN_MAX (__IP_TUN_MAX - 1) - enum rtattr_type_t { RTA_UNSPEC, RTA_DST, diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 5512f4e4ec1b..fd6319681c50 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -192,15 +192,15 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, } EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64); -static const struct nla_policy ip_tun_policy[IP_TUN_MAX + 1] = { - [IP_TUN_ID] = { .type = NLA_U64 }, - [IP_TUN_DST] = { .type = NLA_U32 }, - [IP_TUN_SRC] = { .type = NLA_U32 }, - [IP_TUN_TTL] = { .type = NLA_U8 }, - [IP_TUN_TOS] = { .type = NLA_U8 }, - [IP_TUN_SPORT] = { .type = NLA_U16 }, - [IP_TUN_DPORT] = { .type = NLA_U16 }, - [IP_TUN_FLAGS] = { .type = NLA_U16 }, +static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = { + [LWTUNNEL_IP_ID] = { .type = NLA_U64 }, + [LWTUNNEL_IP_DST] = { .type = NLA_U32 }, + [LWTUNNEL_IP_SRC] = { .type = NLA_U32 }, + [LWTUNNEL_IP_TTL] = { .type = NLA_U8 }, + [LWTUNNEL_IP_TOS] = { .type = NLA_U8 }, + [LWTUNNEL_IP_SPORT] = { .type = NLA_U16 }, + [LWTUNNEL_IP_DPORT] = { .type = NLA_U16 }, + [LWTUNNEL_IP_FLAGS] = { .type = NLA_U16 }, }; static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr, @@ -208,10 +208,10 @@ static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr, { struct ip_tunnel_info *tun_info; struct lwtunnel_state *new_state; - struct nlattr *tb[IP_TUN_MAX + 1]; + struct nlattr *tb[LWTUNNEL_IP_MAX + 1]; int err; - err = nla_parse_nested(tb, IP_TUN_MAX, attr, ip_tun_policy); + err = nla_parse_nested(tb, LWTUNNEL_IP_MAX, attr, ip_tun_policy); if (err < 0) return err; @@ -223,29 +223,29 @@ static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr, tun_info = lwt_tun_info(new_state); - if (tb[IP_TUN_ID]) - tun_info->key.tun_id = nla_get_u64(tb[IP_TUN_ID]); + if (tb[LWTUNNEL_IP_ID]) + tun_info->key.tun_id = nla_get_u64(tb[LWTUNNEL_IP_ID]); - if (tb[IP_TUN_DST]) - tun_info->key.ipv4_dst = nla_get_be32(tb[IP_TUN_DST]); + if (tb[LWTUNNEL_IP_DST]) + tun_info->key.ipv4_dst = 
nla_get_be32(tb[LWTUNNEL_IP_DST]); - if (tb[IP_TUN_SRC]) - tun_info->key.ipv4_src = nla_get_be32(tb[IP_TUN_SRC]); + if (tb[LWTUNNEL_IP_SRC]) + tun_info->key.ipv4_src = nla_get_be32(tb[LWTUNNEL_IP_SRC]); - if (tb[IP_TUN_TTL]) - tun_info->key.ipv4_ttl = nla_get_u8(tb[IP_TUN_TTL]); + if (tb[LWTUNNEL_IP_TTL]) + tun_info->key.ipv4_ttl = nla_get_u8(tb[LWTUNNEL_IP_TTL]); - if (tb[IP_TUN_TOS]) - tun_info->key.ipv4_tos = nla_get_u8(tb[IP_TUN_TOS]); + if (tb[LWTUNNEL_IP_TOS]) + tun_info->key.ipv4_tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]); - if (tb[IP_TUN_SPORT]) - tun_info->key.tp_src = nla_get_be16(tb[IP_TUN_SPORT]); + if (tb[LWTUNNEL_IP_SPORT]) + tun_info->key.tp_src = nla_get_be16(tb[LWTUNNEL_IP_SPORT]); - if (tb[IP_TUN_DPORT]) - tun_info->key.tp_dst = nla_get_be16(tb[IP_TUN_DPORT]); + if (tb[LWTUNNEL_IP_DPORT]) + tun_info->key.tp_dst = nla_get_be16(tb[LWTUNNEL_IP_DPORT]); - if (tb[IP_TUN_FLAGS]) - tun_info->key.tun_flags = nla_get_u16(tb[IP_TUN_FLAGS]); + if (tb[LWTUNNEL_IP_FLAGS]) + tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP_FLAGS]); tun_info->mode = IP_TUNNEL_INFO_TX; tun_info->options = NULL; @@ -261,14 +261,14 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb, { struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate); - if (nla_put_u64(skb, IP_TUN_ID, tun_info->key.tun_id) || - nla_put_be32(skb, IP_TUN_DST, tun_info->key.ipv4_dst) || - nla_put_be32(skb, IP_TUN_SRC, tun_info->key.ipv4_src) || - nla_put_u8(skb, IP_TUN_TOS, tun_info->key.ipv4_tos) || - nla_put_u8(skb, IP_TUN_TTL, tun_info->key.ipv4_ttl) || - nla_put_u16(skb, IP_TUN_SPORT, tun_info->key.tp_src) || - nla_put_u16(skb, IP_TUN_DPORT, tun_info->key.tp_dst) || - nla_put_u16(skb, IP_TUN_FLAGS, tun_info->key.tun_flags)) + if (nla_put_u64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id) || + nla_put_be32(skb, LWTUNNEL_IP_DST, tun_info->key.ipv4_dst) || + nla_put_be32(skb, LWTUNNEL_IP_SRC, tun_info->key.ipv4_src) || + nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.ipv4_tos) || + nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ipv4_ttl) || + nla_put_u16(skb, LWTUNNEL_IP_SPORT, tun_info->key.tp_src) || + nla_put_u16(skb, LWTUNNEL_IP_DPORT, tun_info->key.tp_dst) || + nla_put_u16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags)) return -ENOMEM; return 0; @@ -276,14 +276,14 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb, static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate) { - return nla_total_size(8) /* IP_TUN_ID */ - + nla_total_size(4) /* IP_TUN_DST */ - + nla_total_size(4) /* IP_TUN_SRC */ - + nla_total_size(1) /* IP_TUN_TOS */ - + nla_total_size(1) /* IP_TUN_TTL */ - + nla_total_size(2) /* IP_TUN_SPORT */ - + nla_total_size(2) /* IP_TUN_DPORT */ - + nla_total_size(2); /* IP_TUN_FLAGS */ + return nla_total_size(8) /* LWTUNNEL_IP_ID */ + + nla_total_size(4) /* LWTUNNEL_IP_DST */ + + nla_total_size(4) /* LWTUNNEL_IP_SRC */ + + nla_total_size(1) /* LWTUNNEL_IP_TOS */ + + nla_total_size(1) /* LWTUNNEL_IP_TTL */ + + nla_total_size(2) /* LWTUNNEL_IP_SPORT */ + + nla_total_size(2) /* LWTUNNEL_IP_DPORT */ + + nla_total_size(2); /* LWTUNNEL_IP_FLAGS */ } static const struct lwtunnel_encap_ops ip_tun_lwt_ops = { -- cgit v1.2.3 From 47dceb8ecdc1c3ad1818dfea3d659a05b74c3fc2 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 14 Aug 2015 22:31:34 -0400 Subject: packet: add classic BPF fanout mode Add fanout mode PACKET_FANOUT_CBPF that accepts a classic BPF program to select a socket. This avoids having to keep adding special case fanout modes. One example use case is application layer load balancing. 
The QUIC protocol, for instance, encodes a connection ID in UDP payload. Also add socket option SOL_PACKET/PACKET_FANOUT_DATA that updates data associated with the socket group. Fanout mode PACKET_FANOUT_CBPF is the only user so far. Signed-off-by: Willem de Bruijn Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/if_packet.h | 2 + net/packet/af_packet.c | 99 +++++++++++++++++++++++++++++++++++++++++- net/packet/internal.h | 5 ++- 3 files changed, 104 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index d3d715f8c88f..a4bb16fa822e 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -55,6 +55,7 @@ struct sockaddr_ll { #define PACKET_TX_HAS_OFF 19 #define PACKET_QDISC_BYPASS 20 #define PACKET_ROLLOVER_STATS 21 +#define PACKET_FANOUT_DATA 22 #define PACKET_FANOUT_HASH 0 #define PACKET_FANOUT_LB 1 @@ -62,6 +63,7 @@ struct sockaddr_ll { #define PACKET_FANOUT_ROLLOVER 3 #define PACKET_FANOUT_RND 4 #define PACKET_FANOUT_QM 5 +#define PACKET_FANOUT_CBPF 6 #define PACKET_FANOUT_FLAG_ROLLOVER 0x1000 #define PACKET_FANOUT_FLAG_DEFRAG 0x8000 diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index b5afe538bb88..8869d07013e6 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -92,6 +92,7 @@ #ifdef CONFIG_INET #include #endif +#include #include "internal.h" @@ -1410,6 +1411,22 @@ static unsigned int fanout_demux_qm(struct packet_fanout *f, return skb_get_queue_mapping(skb) % num; } +static unsigned int fanout_demux_bpf(struct packet_fanout *f, + struct sk_buff *skb, + unsigned int num) +{ + struct bpf_prog *prog; + unsigned int ret = 0; + + rcu_read_lock(); + prog = rcu_dereference(f->bpf_prog); + if (prog) + ret = BPF_PROG_RUN(prog, skb) % num; + rcu_read_unlock(); + + return ret; +} + static bool fanout_has_flag(struct packet_fanout *f, u16 flag) { return f->flags & (flag >> 8); @@ -1454,6 +1471,9 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, case PACKET_FANOUT_ROLLOVER: idx = fanout_demux_rollover(f, skb, 0, false, num); break; + case PACKET_FANOUT_CBPF: + idx = fanout_demux_bpf(f, skb, num); + break; } if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER)) @@ -1502,6 +1522,74 @@ static bool match_fanout_group(struct packet_type *ptype, struct sock *sk) return false; } +static void fanout_init_data(struct packet_fanout *f) +{ + switch (f->type) { + case PACKET_FANOUT_LB: + atomic_set(&f->rr_cur, 0); + break; + case PACKET_FANOUT_CBPF: + RCU_INIT_POINTER(f->bpf_prog, NULL); + break; + } +} + +static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new) +{ + struct bpf_prog *old; + + spin_lock(&f->lock); + old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock)); + rcu_assign_pointer(f->bpf_prog, new); + spin_unlock(&f->lock); + + if (old) { + synchronize_net(); + bpf_prog_destroy(old); + } +} + +static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data, + unsigned int len) +{ + struct bpf_prog *new; + struct sock_fprog fprog; + int ret; + + if (sock_flag(&po->sk, SOCK_FILTER_LOCKED)) + return -EPERM; + if (len != sizeof(fprog)) + return -EINVAL; + if (copy_from_user(&fprog, data, len)) + return -EFAULT; + + ret = bpf_prog_create_from_user(&new, &fprog, NULL); + if (ret) + return ret; + + __fanout_set_data_bpf(po->fanout, new); + return 0; +} + +static int fanout_set_data(struct packet_sock *po, char 
__user *data, + unsigned int len) +{ + switch (po->fanout->type) { + case PACKET_FANOUT_CBPF: + return fanout_set_data_cbpf(po, data, len); + default: + return -EINVAL; + }; +} + +static void fanout_release_data(struct packet_fanout *f) +{ + switch (f->type) { + case PACKET_FANOUT_CBPF: + __fanout_set_data_bpf(f, NULL); + }; +} + static int fanout_add(struct sock *sk, u16 id, u16 type_flags) { struct packet_sock *po = pkt_sk(sk); @@ -1519,6 +1607,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) case PACKET_FANOUT_CPU: case PACKET_FANOUT_RND: case PACKET_FANOUT_QM: + case PACKET_FANOUT_CBPF: break; default: return -EINVAL; @@ -1561,10 +1650,10 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) match->id = id; match->type = type; match->flags = flags; - atomic_set(&match->rr_cur, 0); INIT_LIST_HEAD(&match->list); spin_lock_init(&match->lock); atomic_set(&match->sk_ref, 0); + fanout_init_data(match); match->prot_hook.type = po->prot_hook.type; match->prot_hook.dev = po->prot_hook.dev; match->prot_hook.func = packet_rcv_fanout; @@ -1610,6 +1699,7 @@ static void fanout_release(struct sock *sk) if (atomic_dec_and_test(&f->sk_ref)) { list_del(&f->list); dev_remove_pack(&f->prot_hook); + fanout_release_data(f); kfree(f); } mutex_unlock(&fanout_mutex); @@ -3529,6 +3619,13 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv return fanout_add(sk, val & 0xffff, val >> 16); } + case PACKET_FANOUT_DATA: + { + if (!po->fanout) + return -EINVAL; + + return fanout_set_data(po, optval, optlen); + } case PACKET_TX_HAS_OFF: { unsigned int val; diff --git a/net/packet/internal.h b/net/packet/internal.h index e20b3e8829b8..9ee46314b7d7 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -79,7 +79,10 @@ struct packet_fanout { u16 id; u8 type; u8 flags; - atomic_t rr_cur; + union { + atomic_t rr_cur; + struct bpf_prog __rcu *bpf_prog; + }; struct list_head list; struct sock *arr[PACKET_FANOUT_MAX]; spinlock_t lock; -- cgit v1.2.3 From f2e520956a1ab636698f8160194c9b8ac0989aab Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 14 Aug 2015 22:31:35 -0400 Subject: packet: add extended BPF fanout mode Add fanout mode PACKET_FANOUT_EBPF that accepts an extended BPF program to select a socket. The internal eBPF program is updated by passing a file descriptor returned by bpf() to socket option SOL_PACKET/PACKET_FANOUT_DATA. Signed-off-by: Willem de Bruijn Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S.
Miller --- include/uapi/linux/if_packet.h | 1 + net/packet/af_packet.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index a4bb16fa822e..9e7edfd8141e 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -64,6 +64,7 @@ struct sockaddr_ll { #define PACKET_FANOUT_RND 4 #define PACKET_FANOUT_QM 5 #define PACKET_FANOUT_CBPF 6 +#define PACKET_FANOUT_EBPF 7 #define PACKET_FANOUT_FLAG_ROLLOVER 0x1000 #define PACKET_FANOUT_FLAG_DEFRAG 0x8000 diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 8869d07013e6..7b8e39a22387 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1472,6 +1472,7 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, idx = fanout_demux_rollover(f, skb, 0, false, num); break; case PACKET_FANOUT_CBPF: + case PACKET_FANOUT_EBPF: idx = fanout_demux_bpf(f, skb, num); break; } @@ -1529,6 +1530,7 @@ static void fanout_init_data(struct packet_fanout *f) atomic_set(&f->rr_cur, 0); break; case PACKET_FANOUT_CBPF: + case PACKET_FANOUT_EBPF: RCU_INIT_POINTER(f->bpf_prog, NULL); break; } @@ -1571,12 +1573,39 @@ static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data, return 0; } +static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data, + unsigned int len) +{ + struct bpf_prog *new; + u32 fd; + + if (sock_flag(&po->sk, SOCK_FILTER_LOCKED)) + return -EPERM; + if (len != sizeof(fd)) + return -EINVAL; + if (copy_from_user(&fd, data, len)) + return -EFAULT; + + new = bpf_prog_get(fd); + if (IS_ERR(new)) + return PTR_ERR(new); + if (new->type != BPF_PROG_TYPE_SOCKET_FILTER) { + bpf_prog_put(new); + return -EINVAL; + } + + __fanout_set_data_bpf(po->fanout, new); + return 0; +} + static int fanout_set_data(struct packet_sock *po, char __user *data, unsigned int len) { switch (po->fanout->type) { case PACKET_FANOUT_CBPF: return fanout_set_data_cbpf(po, data, len); + case PACKET_FANOUT_EBPF: + return fanout_set_data_ebpf(po, data, len); default: return -EINVAL; }; @@ -1586,6 +1615,7 @@ static void fanout_release_data(struct packet_fanout *f) { switch (f->type) { case PACKET_FANOUT_CBPF: + case PACKET_FANOUT_EBPF: __fanout_set_data_bpf(f, NULL); }; } @@ -1608,6 +1638,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) case PACKET_FANOUT_RND: case PACKET_FANOUT_QM: case PACKET_FANOUT_CBPF: + case PACKET_FANOUT_EBPF: break; default: return -EINVAL; -- cgit v1.2.3 From deedb59039f111c41aa5a54ee384c8e7c08bc78a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 14 Aug 2015 16:03:39 +0200 Subject: netfilter: nf_conntrack: add direction support for zones This work adds a direction parameter to netfilter zones, so identity separation can be performed only in original/reply or both directions (default). This basically opens up the possibility of doing NAT with conflicting IP address/port tuples from multiple, isolated tenants on a host (e.g. from a netns) without requiring each tenant to NAT twice resp. to use its own dedicated IP address to SNAT to, meaning overlapping tuples can be made unique with the zone identifier in original direction, where the NAT engine will then allocate a unique tuple in the commonly shared default zone for the reply direction. In some restricted, local DNAT cases, also port redirection could be used for making the reply traffic unique w/o requiring SNAT. 
The consensus we've reached and discussed at NFWS and since the initial implementation [1] was to directly integrate the direction metadata into the existing zones infrastructure, as opposed to the ct->mark approach we proposed initially. As we pass the nf_conntrack_zone object directly around, we don't have to touch all call-sites, but only those that contain equality checks of zones. Thus, based on the current direction (original or reply), we either return the actual id, or the default NF_CT_DEFAULT_ZONE_ID. CT expectations are direction-agnostic entities when expectations are being compared among themselves, so we can only use the identifier in this case. Note that zone identifiers cannot be included into the hash mix anymore as they don't contain a "stable" value that would be equal for both directions at all times, f.e. if only zone->id would unconditionally be xor'ed into the table slot hash, then replies won't find the corresponding conntracking entry anymore. If no particular direction is specified when configuring zones, the behaviour is exactly as we expect currently (both directions). Support has been added for the CT netlink interface as well as the x_tables raw CT target, which both already offer existing interfaces to user space for the configuration of zones. Below is a minimal, simplified collision example (script in [2]) with netperf sessions:

  +--- tenant-1 ---+   mark := 1
  |        netperf |--+
  +----------------+  |                 CT zone := mark [ORIGINAL]
              [ip,sport] := X      +--------------+  +--- gateway ---+
                                   | mark routing |--|     SNAT      |-- ... +
                                   +--------------+  +---------------+      |
  +--- tenant-2 ---+  |                                             ~~~|~~~
  |        netperf |--+                               +-----------+     |
  +----------------+   mark := 2                      | netserver |------ ... +
              [ip,sport] := X                         +-----------+
                                                      [ip,port] := Y

On the gateway netns, example:

  iptables -t raw -A PREROUTING -j CT --zone mark --zone-dir ORIGINAL
  iptables -t nat -A POSTROUTING -o -j SNAT --to-source --random-fully
  iptables -t mangle -A PREROUTING -m conntrack --ctdir ORIGINAL -j CONNMARK --save-mark
  iptables -t mangle -A POSTROUTING -m conntrack --ctdir REPLY -j CONNMARK --restore-mark

conntrack dump from gateway netns (netperf -H 10.1.1.2 -t TCP_STREAM -l60 -p12865,5555 from each tenant netns):

  tcp 6 431995 ESTABLISHED src=40.1.1.1 dst=10.1.1.2 sport=5555 dport=12865 zone-orig=1 src=10.1.1.2 dst=10.1.1.1 sport=12865 dport=1024 [ASSURED] mark=1 secctx=system_u:object_r:unlabeled_t:s0 use=1
  tcp 6 431994 ESTABLISHED src=40.1.1.1 dst=10.1.1.2 sport=5555 dport=12865 zone-orig=2 src=10.1.1.2 dst=10.1.1.1 sport=12865 dport=5555 [ASSURED] mark=2 secctx=system_u:object_r:unlabeled_t:s0 use=1
  tcp 6 299 ESTABLISHED src=40.1.1.1 dst=10.1.1.2 sport=39438 dport=33768 zone-orig=1 src=10.1.1.2 dst=10.1.1.1 sport=33768 dport=39438 [ASSURED] mark=1 secctx=system_u:object_r:unlabeled_t:s0 use=1
  tcp 6 300 ESTABLISHED src=40.1.1.1 dst=10.1.1.2 sport=32889 dport=40206 zone-orig=2 src=10.1.1.2 dst=10.1.1.1 sport=40206 dport=32889 [ASSURED] mark=2 secctx=system_u:object_r:unlabeled_t:s0 use=2

Taking this further, test script in [2] creates 200 tenants and runs original-tuple colliding netperf sessions each. A conntrack -L dump in the gateway netns also confirms 200 overlapping entries, all in ESTABLISHED state as expected. I also did run various other tests with some permutations of the script, to mention some: SNAT in random/random-fully/persistent mode, no zones (no overlaps), static zones (original, reply, both directions), etc.
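The direction-dependent id collapse at the heart of this can be spelled out with the new helper (a sketch; nf_ct_zone_id() and the NF_CT_ZONE_DIR_* flags are introduced below):

/* Zone set up for the original direction only, e.g. via the
 * "--zone mark --zone-dir ORIGINAL" rule above:
 */
const struct nf_conntrack_zone zone = { .id = 1, .dir = NF_CT_ZONE_DIR_ORIG };

nf_ct_zone_id(&zone, IP_CT_DIR_ORIGINAL); /* -> 1, tenants stay separated */
nf_ct_zone_id(&zone, IP_CT_DIR_REPLY);    /* -> NF_CT_DEFAULT_ZONE_ID, replies
                                           * land in the shared default zone,
                                           * where SNAT made the tuples unique */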
[1] http://thread.gmane.org/gmane.comp.security.firewalls.netfilter.devel/57412/ [2] https://paste.fedoraproject.org/242835/65657871/ Signed-off-by: Daniel Borkmann Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_zones.h | 31 +++- include/uapi/linux/netfilter/nfnetlink_conntrack.h | 1 + include/uapi/linux/netfilter/xt_CT.h | 6 +- net/ipv4/netfilter/nf_defrag_ipv4.c | 8 +- net/ipv6/netfilter/nf_defrag_ipv6_hooks.c | 8 +- net/netfilter/nf_conntrack_core.c | 53 +++--- net/netfilter/nf_conntrack_expect.c | 8 +- net/netfilter/nf_conntrack_netlink.c | 177 +++++++++++++++------ net/netfilter/nf_conntrack_standalone.c | 30 +++- net/netfilter/nf_nat_core.c | 13 +- net/netfilter/xt_CT.c | 17 +- net/sched/act_connmark.c | 1 + 12 files changed, 259 insertions(+), 94 deletions(-) (limited to 'include/uapi') diff --git a/include/net/netfilter/nf_conntrack_zones.h b/include/net/netfilter/nf_conntrack_zones.h index 0788bb0f267d..3942ddf0d4ff 100644 --- a/include/net/netfilter/nf_conntrack_zones.h +++ b/include/net/netfilter/nf_conntrack_zones.h @@ -1,10 +1,18 @@ #ifndef _NF_CONNTRACK_ZONES_H #define _NF_CONNTRACK_ZONES_H +#include + #define NF_CT_DEFAULT_ZONE_ID 0 +#define NF_CT_ZONE_DIR_ORIG (1 << IP_CT_DIR_ORIGINAL) +#define NF_CT_ZONE_DIR_REPL (1 << IP_CT_DIR_REPLY) + +#define NF_CT_DEFAULT_ZONE_DIR (NF_CT_ZONE_DIR_ORIG | NF_CT_ZONE_DIR_REPL) + struct nf_conntrack_zone { u16 id; + u16 dir; }; extern const struct nf_conntrack_zone nf_ct_zone_dflt; @@ -29,8 +37,29 @@ nf_ct_zone_tmpl(const struct nf_conn *tmpl) return tmpl ? nf_ct_zone(tmpl) : &nf_ct_zone_dflt; } +static inline bool nf_ct_zone_matches_dir(const struct nf_conntrack_zone *zone, + enum ip_conntrack_dir dir) +{ + return zone->dir & (1 << dir); +} + +static inline u16 nf_ct_zone_id(const struct nf_conntrack_zone *zone, + enum ip_conntrack_dir dir) +{ + return nf_ct_zone_matches_dir(zone, dir) ? 
+ zone->id : NF_CT_DEFAULT_ZONE_ID; +} + static inline bool nf_ct_zone_equal(const struct nf_conn *a, - const struct nf_conntrack_zone *b) + const struct nf_conntrack_zone *b, + enum ip_conntrack_dir dir) +{ + return nf_ct_zone_id(nf_ct_zone(a), dir) == + nf_ct_zone_id(b, dir); +} + +static inline bool nf_ct_zone_equal_any(const struct nf_conn *a, + const struct nf_conntrack_zone *b) { return nf_ct_zone(a)->id == b->id; } diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h index acad6c52a652..c1a4e1441a25 100644 --- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h +++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h @@ -61,6 +61,7 @@ enum ctattr_tuple { CTA_TUPLE_UNSPEC, CTA_TUPLE_IP, CTA_TUPLE_PROTO, + CTA_TUPLE_ZONE, __CTA_TUPLE_MAX }; #define CTA_TUPLE_MAX (__CTA_TUPLE_MAX - 1) diff --git a/include/uapi/linux/netfilter/xt_CT.h b/include/uapi/linux/netfilter/xt_CT.h index 5a688c1ca4d7..452005ff0e9e 100644 --- a/include/uapi/linux/netfilter/xt_CT.h +++ b/include/uapi/linux/netfilter/xt_CT.h @@ -6,7 +6,11 @@ enum { XT_CT_NOTRACK = 1 << 0, XT_CT_NOTRACK_ALIAS = 1 << 1, - XT_CT_MASK = XT_CT_NOTRACK | XT_CT_NOTRACK_ALIAS, + XT_CT_ZONE_DIR_ORIG = 1 << 2, + XT_CT_ZONE_DIR_REPL = 1 << 3, + + XT_CT_MASK = XT_CT_NOTRACK | XT_CT_NOTRACK_ALIAS | + XT_CT_ZONE_DIR_ORIG | XT_CT_ZONE_DIR_REPL, }; struct xt_ct_target_info { diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 20fe8e67c09b..9306ec4fab41 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -45,8 +45,12 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum, { u16 zone_id = NF_CT_DEFAULT_ZONE_ID; #if IS_ENABLED(CONFIG_NF_CONNTRACK) - if (skb->nfct) - zone_id = nf_ct_zone((struct nf_conn *)skb->nfct)->id; + if (skb->nfct) { + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + + zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo)); + } #endif if (nf_bridge_in_prerouting(skb)) return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id; diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index 9d3de9b74856..6d9c0b3d5b8c 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -35,8 +35,12 @@ static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum, { u16 zone_id = NF_CT_DEFAULT_ZONE_ID; #if IS_ENABLED(CONFIG_NF_CONNTRACK) - if (skb->nfct) - zone_id = nf_ct_zone((struct nf_conn *)skb->nfct)->id; + if (skb->nfct) { + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + + zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo)); + } #endif if (nf_bridge_in_prerouting(skb)) return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 0bb26e84f849..acc06222ce6a 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -126,8 +126,7 @@ EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked); unsigned int nf_conntrack_hash_rnd __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_hash_rnd); -static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) +static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple) { unsigned int n; @@ -136,7 +135,7 @@ static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, * three bytes manually. 
*/ n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32); - return jhash2((u32 *)tuple, n, zone->id ^ nf_conntrack_hash_rnd ^ + return jhash2((u32 *)tuple, n, nf_conntrack_hash_rnd ^ (((__force __u16)tuple->dst.u.all << 16) | tuple->dst.protonum)); } @@ -152,17 +151,15 @@ static u32 hash_bucket(u32 hash, const struct net *net) } static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone, unsigned int size) { - return __hash_bucket(hash_conntrack_raw(tuple, zone), size); + return __hash_bucket(hash_conntrack_raw(tuple), size); } static inline u_int32_t hash_conntrack(const struct net *net, - const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { - return __hash_conntrack(tuple, zone, net->ct.htable_size); + return __hash_conntrack(tuple, net->ct.htable_size); } bool @@ -312,6 +309,7 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, if (!nf_ct_zone) goto out_free; nf_ct_zone->id = zone->id; + nf_ct_zone->dir = zone->dir; } #endif atomic_set(&tmpl->ct_general.use, 0); @@ -376,20 +374,18 @@ destroy_conntrack(struct nf_conntrack *nfct) static void nf_ct_delete_from_lists(struct nf_conn *ct) { - const struct nf_conntrack_zone *zone; struct net *net = nf_ct_net(ct); unsigned int hash, reply_hash; unsigned int sequence; - zone = nf_ct_zone(ct); nf_ct_helper_destroy(ct); local_bh_disable(); do { sequence = read_seqcount_begin(&net->ct.generation); - hash = hash_conntrack(net, zone, + hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - reply_hash = hash_conntrack(net, zone, + reply_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); @@ -446,7 +442,7 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, * so we need to check that the conntrack is confirmed */ return nf_ct_tuple_equal(tuple, &h->tuple) && - nf_ct_zone_equal(ct, zone) && + nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) && nf_ct_is_confirmed(ct); } @@ -523,7 +519,7 @@ nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { return __nf_conntrack_find_get(net, zone, tuple, - hash_conntrack_raw(tuple, zone)); + hash_conntrack_raw(tuple)); } EXPORT_SYMBOL_GPL(nf_conntrack_find_get); @@ -554,9 +550,9 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) local_bh_disable(); do { sequence = read_seqcount_begin(&net->ct.generation); - hash = hash_conntrack(net, zone, + hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - reply_hash = hash_conntrack(net, zone, + reply_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); @@ -564,12 +560,14 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, &h->tuple) && - nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone)) + nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, + NF_CT_DIRECTION(h))) goto out; hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, &h->tuple) && - nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone)) + nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, + NF_CT_DIRECTION(h))) goto out; add_timer(&ct->timeout); @@ -623,7 +621,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) /* reuse the hash saved before */ hash = *(unsigned 
long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; hash = hash_bucket(hash, net); - reply_hash = hash_conntrack(net, zone, + reply_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); @@ -655,12 +653,14 @@ __nf_conntrack_confirm(struct sk_buff *skb) hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, &h->tuple) && - nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone)) + nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, + NF_CT_DIRECTION(h))) goto out; hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, &h->tuple) && - nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone)) + nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, + NF_CT_DIRECTION(h))) goto out; /* Timer relative to confirmation time, not original @@ -720,7 +720,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, unsigned int hash; zone = nf_ct_zone(ignored_conntrack); - hash = hash_conntrack(net, zone, tuple); + hash = hash_conntrack(net, tuple); /* Disable BHs the entire time since we need to disable them at * least once for the stats anyway. @@ -730,7 +730,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, ct = nf_ct_tuplehash_to_ctrack(h); if (ct != ignored_conntrack && nf_ct_tuple_equal(tuple, &h->tuple) && - nf_ct_zone_equal(ct, zone)) { + nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h))) { NF_CT_STAT_INC(net, found); rcu_read_unlock_bh(); return 1; @@ -830,7 +830,7 @@ __nf_conntrack_alloc(struct net *net, if (unlikely(!nf_conntrack_hash_rnd)) { init_nf_conntrack_hash_rnd(); /* recompute the hash as nf_conntrack_hash_rnd is initialized */ - hash = hash_conntrack_raw(orig, zone); + hash = hash_conntrack_raw(orig); } /* We don't want any race condition at early drop stage */ @@ -875,6 +875,7 @@ __nf_conntrack_alloc(struct net *net, if (!nf_ct_zone) goto out_free; nf_ct_zone->id = zone->id; + nf_ct_zone->dir = zone->dir; } #endif /* Because we use RCU lookups, we set ct_general.use to zero before @@ -1053,7 +1054,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, /* look for tuple match */ zone = nf_ct_zone_tmpl(tmpl); - hash = hash_conntrack_raw(&tuple, zone); + hash = hash_conntrack_raw(&tuple); h = __nf_conntrack_find_get(net, zone, &tuple, hash); if (!h) { h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto, @@ -1306,6 +1307,7 @@ EXPORT_SYMBOL_GPL(__nf_ct_kill_acct); /* Built-in default zone used e.g. by modules. 
*/ const struct nf_conntrack_zone nf_ct_zone_dflt = { .id = NF_CT_DEFAULT_ZONE_ID, + .dir = NF_CT_DEFAULT_ZONE_DIR, }; EXPORT_SYMBOL_GPL(nf_ct_zone_dflt); @@ -1617,8 +1619,7 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) struct nf_conntrack_tuple_hash, hnnode); ct = nf_ct_tuplehash_to_ctrack(h); hlist_nulls_del_rcu(&h->hnnode); - bucket = __hash_conntrack(&h->tuple, nf_ct_zone(ct), - hashsize); + bucket = __hash_conntrack(&h->tuple, hashsize); hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); } } diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 980db854c3c8..acf5c7b3f378 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -101,7 +101,7 @@ __nf_ct_expect_find(struct net *net, h = nf_ct_expect_dst_hash(tuple); hlist_for_each_entry_rcu(i, &net->ct.expect_hash[h], hnode) { if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && - nf_ct_zone_equal(i->master, zone)) + nf_ct_zone_equal_any(i->master, zone)) return i; } return NULL; @@ -143,7 +143,7 @@ nf_ct_find_expectation(struct net *net, hlist_for_each_entry(i, &net->ct.expect_hash[h], hnode) { if (!(i->flags & NF_CT_EXPECT_INACTIVE) && nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && - nf_ct_zone_equal(i->master, zone)) { + nf_ct_zone_equal_any(i->master, zone)) { exp = i; break; } @@ -223,7 +223,7 @@ static inline int expect_clash(const struct nf_conntrack_expect *a, } return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask) && - nf_ct_zone_equal(a->master, nf_ct_zone(b->master)); + nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master)); } static inline int expect_matches(const struct nf_conntrack_expect *a, @@ -232,7 +232,7 @@ static inline int expect_matches(const struct nf_conntrack_expect *a, return a->master == b->master && a->class == b->class && nf_ct_tuple_equal(&a->tuple, &b->tuple) && nf_ct_tuple_mask_equal(&a->mask, &b->mask) && - nf_ct_zone_equal(a->master, nf_ct_zone(b->master)); + nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master)); } /* Generally a bad idea to call this: could have matched already. 
*/ diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 95f7f01e253d..4eaf925bead4 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -127,6 +127,20 @@ ctnetlink_dump_tuples(struct sk_buff *skb, return ret; } +static inline int +ctnetlink_dump_zone_id(struct sk_buff *skb, int attrtype, + const struct nf_conntrack_zone *zone, int dir) +{ + if (zone->id == NF_CT_DEFAULT_ZONE_ID || zone->dir != dir) + return 0; + if (nla_put_be16(skb, attrtype, htons(zone->id))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + static inline int ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct) { @@ -474,11 +488,16 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, nfmsg->version = NFNETLINK_V0; nfmsg->res_id = 0; + zone = nf_ct_zone(ct); + nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) goto nla_put_failure; + if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, + NF_CT_ZONE_DIR_ORIG) < 0) + goto nla_put_failure; nla_nest_end(skb, nest_parms); nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED); @@ -486,11 +505,13 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0) goto nla_put_failure; + if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, + NF_CT_ZONE_DIR_REPL) < 0) + goto nla_put_failure; nla_nest_end(skb, nest_parms); - zone = nf_ct_zone(ct); - if (zone->id != NF_CT_DEFAULT_ZONE_ID && - nla_put_be16(skb, CTA_ZONE, htons(zone->id))) + if (ctnetlink_dump_zone_id(skb, CTA_ZONE, zone, + NF_CT_DEFAULT_ZONE_DIR) < 0) goto nla_put_failure; if (ctnetlink_dump_status(skb, ct) < 0 || @@ -600,7 +621,7 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct) + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */ #endif #ifdef CONFIG_NF_CONNTRACK_ZONES - + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */ + + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE|CTA_TUPLE_ZONE */ #endif + ctnetlink_proto_size(ct) + ctnetlink_label_size(ct) @@ -658,11 +679,16 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) nfmsg->res_id = 0; rcu_read_lock(); + zone = nf_ct_zone(ct); + nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) goto nla_put_failure; + if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, + NF_CT_ZONE_DIR_ORIG) < 0) + goto nla_put_failure; nla_nest_end(skb, nest_parms); nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED); @@ -670,11 +696,13 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0) goto nla_put_failure; + if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, + NF_CT_ZONE_DIR_REPL) < 0) + goto nla_put_failure; nla_nest_end(skb, nest_parms); - zone = nf_ct_zone(ct); - if (zone->id != NF_CT_DEFAULT_ZONE_ID && - nla_put_be16(skb, CTA_ZONE, htons(zone->id))) + if (ctnetlink_dump_zone_id(skb, CTA_ZONE, zone, + NF_CT_DEFAULT_ZONE_DIR) < 0) goto nla_put_failure; if (ctnetlink_dump_id(skb, ct) < 0) @@ -924,15 +952,55 @@ ctnetlink_parse_tuple_proto(struct nlattr *attr, return ret; } +static int +ctnetlink_parse_zone(const struct nlattr *attr, + 
struct nf_conntrack_zone *zone) +{ + zone->id = NF_CT_DEFAULT_ZONE_ID; + zone->dir = NF_CT_DEFAULT_ZONE_DIR; + +#ifdef CONFIG_NF_CONNTRACK_ZONES + if (attr) + zone->id = ntohs(nla_get_be16(attr)); +#else + if (attr) + return -EOPNOTSUPP; +#endif + return 0; +} + +static int +ctnetlink_parse_tuple_zone(struct nlattr *attr, enum ctattr_type type, + struct nf_conntrack_zone *zone) +{ + int ret; + + if (zone->id != NF_CT_DEFAULT_ZONE_ID) + return -EINVAL; + + ret = ctnetlink_parse_zone(attr, zone); + if (ret < 0) + return ret; + + if (type == CTA_TUPLE_REPLY) + zone->dir = NF_CT_ZONE_DIR_REPL; + else + zone->dir = NF_CT_ZONE_DIR_ORIG; + + return 0; +} + static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = { [CTA_TUPLE_IP] = { .type = NLA_NESTED }, [CTA_TUPLE_PROTO] = { .type = NLA_NESTED }, + [CTA_TUPLE_ZONE] = { .type = NLA_U16 }, }; static int ctnetlink_parse_tuple(const struct nlattr * const cda[], struct nf_conntrack_tuple *tuple, - enum ctattr_type type, u_int8_t l3num) + enum ctattr_type type, u_int8_t l3num, + struct nf_conntrack_zone *zone) { struct nlattr *tb[CTA_TUPLE_MAX+1]; int err; @@ -959,6 +1027,16 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[], if (err < 0) return err; + if (tb[CTA_TUPLE_ZONE]) { + if (!zone) + return -EINVAL; + + err = ctnetlink_parse_tuple_zone(tb[CTA_TUPLE_ZONE], + type, zone); + if (err < 0) + return err; + } + /* orig and expect tuples get DIR_ORIGINAL */ if (type == CTA_TUPLE_REPLY) tuple->dst.dir = IP_CT_DIR_REPLY; @@ -968,22 +1046,6 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[], return 0; } -static int -ctnetlink_parse_zone(const struct nlattr *attr, - struct nf_conntrack_zone *zone) -{ - zone->id = NF_CT_DEFAULT_ZONE_ID; - -#ifdef CONFIG_NF_CONNTRACK_ZONES - if (attr) - zone->id = ntohs(nla_get_be16(attr)); -#else - if (attr) - return -EOPNOTSUPP; -#endif - return 0; -} - static const struct nla_policy help_nla_policy[CTA_HELP_MAX+1] = { [CTA_HELP_NAME] = { .type = NLA_NUL_STRING, .len = NF_CT_HELPER_NAME_LEN - 1 }, @@ -1071,9 +1133,11 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, return err; if (cda[CTA_TUPLE_ORIG]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, + u3, &zone); else if (cda[CTA_TUPLE_REPLY]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, + u3, &zone); else { return ctnetlink_flush_conntrack(net, cda, NETLINK_CB(skb).portid, @@ -1143,9 +1207,11 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, return err; if (cda[CTA_TUPLE_ORIG]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, + u3, &zone); else if (cda[CTA_TUPLE_REPLY]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, + u3, &zone); else return -EINVAL; @@ -1767,7 +1833,8 @@ ctnetlink_create_conntrack(struct net *net, struct nf_conntrack_tuple_hash *master_h; struct nf_conn *master_ct; - err = ctnetlink_parse_tuple(cda, &master, CTA_TUPLE_MASTER, u3); + err = ctnetlink_parse_tuple(cda, &master, CTA_TUPLE_MASTER, + u3, NULL); if (err < 0) goto err2; @@ -1818,13 +1885,15 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, return err; if (cda[CTA_TUPLE_ORIG]) { - err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG, u3); + err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG, + u3, &zone); if 
(err < 0) return err; } if (cda[CTA_TUPLE_REPLY]) { - err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY, u3); + err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY, + u3, &zone); if (err < 0) return err; } @@ -2088,7 +2157,7 @@ ctnetlink_nfqueue_build_size(const struct nf_conn *ct) + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */ #endif #ifdef CONFIG_NF_CONNTRACK_ZONES - + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */ + + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE|CTA_TUPLE_ZONE */ #endif + ctnetlink_proto_size(ct) ; @@ -2101,11 +2170,16 @@ ctnetlink_nfqueue_build(struct sk_buff *skb, struct nf_conn *ct) struct nlattr *nest_parms; rcu_read_lock(); + zone = nf_ct_zone(ct); + nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) goto nla_put_failure; + if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, + NF_CT_ZONE_DIR_ORIG) < 0) + goto nla_put_failure; nla_nest_end(skb, nest_parms); nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED); @@ -2113,11 +2187,13 @@ ctnetlink_nfqueue_build(struct sk_buff *skb, struct nf_conn *ct) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0) goto nla_put_failure; + if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, + NF_CT_ZONE_DIR_REPL) < 0) + goto nla_put_failure; nla_nest_end(skb, nest_parms); - zone = nf_ct_zone(ct); - if (zone->id != NF_CT_DEFAULT_ZONE_ID && - nla_put_be16(skb, CTA_ZONE, htons(zone->id))) + if (ctnetlink_dump_zone_id(skb, CTA_ZONE, zone, + NF_CT_DEFAULT_ZONE_DIR) < 0) goto nla_put_failure; if (ctnetlink_dump_id(skb, ct) < 0) @@ -2225,12 +2301,12 @@ static int ctnetlink_nfqueue_exp_parse(const struct nlattr * const *cda, int err; err = ctnetlink_parse_tuple(cda, tuple, CTA_EXPECT_TUPLE, - nf_ct_l3num(ct)); + nf_ct_l3num(ct), NULL); if (err < 0) return err; return ctnetlink_parse_tuple(cda, mask, CTA_EXPECT_MASK, - nf_ct_l3num(ct)); + nf_ct_l3num(ct), NULL); } static int @@ -2625,7 +2701,8 @@ static int ctnetlink_dump_exp_ct(struct sock *ctnl, struct sk_buff *skb, .done = ctnetlink_exp_done, }; - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, + u3, NULL); if (err < 0) return err; @@ -2677,9 +2754,11 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, return err; if (cda[CTA_EXPECT_TUPLE]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, + u3, NULL); else if (cda[CTA_EXPECT_MASTER]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, + u3, NULL); else return -EINVAL; @@ -2747,7 +2826,8 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, if (err < 0) return err; - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, + u3, NULL); if (err < 0) return err; @@ -2854,7 +2934,8 @@ ctnetlink_parse_expect_nat(const struct nlattr *attr, return -EINVAL; err = ctnetlink_parse_tuple((const struct nlattr * const *)tb, - &nat_tuple, CTA_EXPECT_NAT_TUPLE, u3); + &nat_tuple, CTA_EXPECT_NAT_TUPLE, + u3, NULL); if (err < 0) return err; @@ -2955,13 +3036,16 @@ ctnetlink_create_expect(struct net *net, int err; /* caller guarantees that those three CTA_EXPECT_* exist */ - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); + 
err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, + u3, NULL); if (err < 0) return err; - err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK, u3); + err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK, + u3, NULL); if (err < 0) return err; - err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER, u3); + err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER, + u3, NULL); if (err < 0) return err; @@ -3029,7 +3113,8 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, if (err < 0) return err; - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, + u3, NULL); if (err < 0) return err; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 28c8b2b982ec..1fb3cacc04e1 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -141,12 +141,30 @@ static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) #endif #ifdef CONFIG_NF_CONNTRACK_ZONES -static void ct_show_zone(struct seq_file *s, const struct nf_conn *ct) +static void ct_show_zone(struct seq_file *s, const struct nf_conn *ct, + int dir) { - seq_printf(s, "zone=%u ", nf_ct_zone(ct)->id); + const struct nf_conntrack_zone *zone = nf_ct_zone(ct); + + if (zone->dir != dir) + return; + switch (zone->dir) { + case NF_CT_DEFAULT_ZONE_DIR: + seq_printf(s, "zone=%u ", zone->id); + break; + case NF_CT_ZONE_DIR_ORIG: + seq_printf(s, "zone-orig=%u ", zone->id); + break; + case NF_CT_ZONE_DIR_REPL: + seq_printf(s, "zone-reply=%u ", zone->id); + break; + default: + break; + } } #else -static inline void ct_show_zone(struct seq_file *s, const struct nf_conn *ct) +static inline void ct_show_zone(struct seq_file *s, const struct nf_conn *ct, + int dir) { } #endif @@ -213,6 +231,8 @@ static int ct_seq_show(struct seq_file *s, void *v) print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, l3proto, l4proto); + ct_show_zone(s, ct, NF_CT_ZONE_DIR_ORIG); + if (seq_has_overflowed(s)) goto release; @@ -225,6 +245,8 @@ static int ct_seq_show(struct seq_file *s, void *v) print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, l3proto, l4proto); + ct_show_zone(s, ct, NF_CT_ZONE_DIR_REPL); + if (seq_print_acct(s, ct, IP_CT_DIR_REPLY)) goto release; @@ -239,7 +261,7 @@ static int ct_seq_show(struct seq_file *s, void *v) #endif ct_show_secctx(s, ct); - ct_show_zone(s, ct); + ct_show_zone(s, ct, NF_CT_DEFAULT_ZONE_DIR); ct_show_delta_time(s, ct); seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)); diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 65ebaf9fc4f9..5113dfd39df9 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -118,15 +118,13 @@ EXPORT_SYMBOL(nf_xfrm_me_harder); /* We keep an extra hash for each conntrack, for fast searching. */ static inline unsigned int -hash_by_src(const struct net *net, - const struct nf_conntrack_zone *zone, - const struct nf_conntrack_tuple *tuple) +hash_by_src(const struct net *net, const struct nf_conntrack_tuple *tuple) { unsigned int hash; /* Original src, to ensure we map it consistently if poss. 
*/ hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), - tuple->dst.protonum ^ zone->id ^ nf_conntrack_hash_rnd); + tuple->dst.protonum ^ nf_conntrack_hash_rnd); return reciprocal_scale(hash, net->ct.nat_htable_size); } @@ -194,13 +192,14 @@ find_appropriate_src(struct net *net, struct nf_conntrack_tuple *result, const struct nf_nat_range *range) { - unsigned int h = hash_by_src(net, zone, tuple); + unsigned int h = hash_by_src(net, tuple); const struct nf_conn_nat *nat; const struct nf_conn *ct; hlist_for_each_entry_rcu(nat, &net->ct.nat_bysource[h], bysource) { ct = nat->ct; - if (same_src(ct, tuple) && nf_ct_zone_equal(ct, zone)) { + if (same_src(ct, tuple) && + nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) { /* Copy source part from reply tuple. */ nf_ct_invert_tuplepr(result, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); @@ -425,7 +424,7 @@ nf_nat_setup_info(struct nf_conn *ct, if (maniptype == NF_NAT_MANIP_SRC) { unsigned int srchash; - srchash = hash_by_src(net, nf_ct_zone(ct), + srchash = hash_by_src(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); spin_lock_bh(&nf_nat_lock); /* nf_conntrack_alter_reply might re-allocate extension aera */ diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 29e2856063ff..536cb67928ad 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -181,6 +181,19 @@ out: #endif } +static u16 xt_ct_flags_to_dir(const struct xt_ct_target_info_v1 *info) +{ + switch (info->flags & (XT_CT_ZONE_DIR_ORIG | + XT_CT_ZONE_DIR_REPL)) { + case XT_CT_ZONE_DIR_ORIG: + return NF_CT_ZONE_DIR_ORIG; + case XT_CT_ZONE_DIR_REPL: + return NF_CT_ZONE_DIR_REPL; + default: + return NF_CT_DEFAULT_ZONE_DIR; + } +} + static int xt_ct_tg_check(const struct xt_tgchk_param *par, struct xt_ct_target_info_v1 *info) { @@ -194,7 +207,8 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, } #ifndef CONFIG_NF_CONNTRACK_ZONES - if (info->zone) + if (info->zone || info->flags & (XT_CT_ZONE_DIR_ORIG | + XT_CT_ZONE_DIR_REPL)) goto err1; #endif @@ -204,6 +218,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, memset(&zone, 0, sizeof(zone)); zone.id = info->zone; + zone.dir = xt_ct_flags_to_dir(info); ct = nf_ct_tmpl_alloc(par->net, &zone, GFP_KERNEL); ret = PTR_ERR(ct); diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index e67a1bdd0929..5019a47b9270 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -72,6 +72,7 @@ static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a, goto out; zone.id = ca->zone; + zone.dir = NF_CT_DEFAULT_ZONE_DIR; thash = nf_conntrack_find_get(dev_net(skb->dev), &zone, &tuple); if (!thash) -- cgit v1.2.3 From 5e8018fc61423e677398d4ad4d72df70b9788e77 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 14 Aug 2015 16:03:40 +0200 Subject: netfilter: nf_conntrack: add efficient mark to zone mapping This work adds the possibility of deriving the zone id from the skb->mark field in a scalable manner. This allows for having only a single template serving hundreds/thousands of different zones, for example, instead of the need to have one match for each zone as an extra CT jump target. Note that we'd need to have this information attached to the template as at the time when we're trying to lookup a possible ct object, we already need to know zone information for a possible match when going into __nf_conntrack_find_get(). This work provides a minimal implementation for a possible mapping. 
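As a usage sketch, a single CT template can then serve all tenants; the per-tenant MARK rules and interface names below are hypothetical, while the "--zone mark" notation matches the example from the previous commit:

  iptables -t mangle -A PREROUTING -i veth-t1 -j MARK --set-mark 1
  iptables -t mangle -A PREROUTING -i veth-t2 -j MARK --set-mark 2
  iptables -t raw -A PREROUTING -j CT --zone mark

With this, the zone id for each new connection is derived from skb->mark at conntrack lookup time instead of requiring one CT rule per zone.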
In order to not add/expose an extra ct->status bit, the zone structure has been extended to carry a flag for deriving the mark. Signed-off-by: Daniel Borkmann Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_zones.h | 45 +++++++++++++++++++++-- include/uapi/linux/netfilter/xt_CT.h | 4 ++- net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 3 +- net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 4 ++- net/netfilter/nf_conntrack_core.c | 50 +++++++++----------------- net/netfilter/nf_conntrack_netlink.c | 5 ++- net/netfilter/xt_CT.c | 5 ++- 7 files changed, 72 insertions(+), 44 deletions(-) (limited to 'include/uapi') diff --git a/include/net/netfilter/nf_conntrack_zones.h b/include/net/netfilter/nf_conntrack_zones.h index 3942ddf0d4ff..5316c7b3a374 100644 --- a/include/net/netfilter/nf_conntrack_zones.h +++ b/include/net/netfilter/nf_conntrack_zones.h @@ -10,9 +10,12 @@ #define NF_CT_DEFAULT_ZONE_DIR (NF_CT_ZONE_DIR_ORIG | NF_CT_ZONE_DIR_REPL) +#define NF_CT_FLAG_MARK 1 + struct nf_conntrack_zone { u16 id; - u16 dir; + u8 flags; + u8 dir; }; extern const struct nf_conntrack_zone nf_ct_zone_dflt; @@ -32,9 +35,45 @@ nf_ct_zone(const struct nf_conn *ct) } static inline const struct nf_conntrack_zone * -nf_ct_zone_tmpl(const struct nf_conn *tmpl) +nf_ct_zone_init(struct nf_conntrack_zone *zone, u16 id, u8 dir, u8 flags) +{ + zone->id = id; + zone->flags = flags; + zone->dir = dir; + + return zone; +} + +static inline const struct nf_conntrack_zone * +nf_ct_zone_tmpl(const struct nf_conn *tmpl, const struct sk_buff *skb, + struct nf_conntrack_zone *tmp) +{ + const struct nf_conntrack_zone *zone; + + if (!tmpl) + return &nf_ct_zone_dflt; + + zone = nf_ct_zone(tmpl); + if (zone->flags & NF_CT_FLAG_MARK) + zone = nf_ct_zone_init(tmp, skb->mark, zone->dir, 0); + + return zone; +} + +static inline int nf_ct_zone_add(struct nf_conn *ct, gfp_t flags, + const struct nf_conntrack_zone *info) { - return tmpl ? nf_ct_zone(tmpl) : &nf_ct_zone_dflt; +#ifdef CONFIG_NF_CONNTRACK_ZONES + struct nf_conntrack_zone *nf_ct_zone; + + nf_ct_zone = nf_ct_ext_add(ct, NF_CT_EXT_ZONE, flags); + if (!nf_ct_zone) + return -ENOMEM; + + nf_ct_zone_init(nf_ct_zone, info->id, info->dir, + info->flags); +#endif + return 0; } static inline bool nf_ct_zone_matches_dir(const struct nf_conntrack_zone *zone, diff --git a/include/uapi/linux/netfilter/xt_CT.h b/include/uapi/linux/netfilter/xt_CT.h index 452005ff0e9e..9e520418b858 100644 --- a/include/uapi/linux/netfilter/xt_CT.h +++ b/include/uapi/linux/netfilter/xt_CT.h @@ -8,9 +8,11 @@ enum { XT_CT_NOTRACK_ALIAS = 1 << 1, XT_CT_ZONE_DIR_ORIG = 1 << 2, XT_CT_ZONE_DIR_REPL = 1 << 3, + XT_CT_ZONE_MARK = 1 << 4, XT_CT_MASK = XT_CT_NOTRACK | XT_CT_NOTRACK_ALIAS | - XT_CT_ZONE_DIR_ORIG | XT_CT_ZONE_DIR_REPL, + XT_CT_ZONE_DIR_ORIG | XT_CT_ZONE_DIR_REPL | + XT_CT_ZONE_MARK, }; struct xt_ct_target_info { diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 8a2f41c2fe6f..cdde3ec496e9 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -135,9 +135,10 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, const struct nf_conntrack_l4proto *innerproto; const struct nf_conntrack_tuple_hash *h; const struct nf_conntrack_zone *zone; + struct nf_conntrack_zone tmp; NF_CT_ASSERT(skb->nfct == NULL); - zone = nf_ct_zone_tmpl(tmpl); + zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); /* Are they talking about one of our connections? 
*/ if (!nf_ct_get_tuplepr(skb, diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 202914151360..0e6fae103d33 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -150,6 +150,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl, struct nf_conntrack_tuple intuple, origtuple; const struct nf_conntrack_tuple_hash *h; const struct nf_conntrack_l4proto *inproto; + struct nf_conntrack_zone tmp; NF_CT_ASSERT(skb->nfct == NULL); @@ -176,7 +177,8 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl, *ctinfo = IP_CT_RELATED; - h = nf_conntrack_find_get(net, nf_ct_zone_tmpl(tmpl), &intuple); + h = nf_conntrack_find_get(net, nf_ct_zone_tmpl(tmpl, skb, &tmp), + &intuple); if (!h) { pr_debug("icmpv6_error: no match\n"); return -NF_ACCEPT; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index acc06222ce6a..48521d62c672 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -301,25 +301,15 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, tmpl->status = IPS_TEMPLATE; write_pnet(&tmpl->ct_net, net); -#ifdef CONFIG_NF_CONNTRACK_ZONES - if (zone) { - struct nf_conntrack_zone *nf_ct_zone; - - nf_ct_zone = nf_ct_ext_add(tmpl, NF_CT_EXT_ZONE, GFP_ATOMIC); - if (!nf_ct_zone) - goto out_free; - nf_ct_zone->id = zone->id; - nf_ct_zone->dir = zone->dir; - } -#endif + if (nf_ct_zone_add(tmpl, flags, zone) < 0) + goto out_free; + atomic_set(&tmpl->ct_general.use, 0); return tmpl; -#ifdef CONFIG_NF_CONNTRACK_ZONES out_free: kfree(tmpl); return NULL; -#endif } EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc); @@ -850,10 +840,9 @@ __nf_conntrack_alloc(struct net *net, * SLAB_DESTROY_BY_RCU. */ ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp); - if (ct == NULL) { - atomic_dec(&net->ct.count); - return ERR_PTR(-ENOMEM); - } + if (ct == NULL) + goto out; + spin_lock_init(&ct->lock); ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL; @@ -867,29 +856,20 @@ __nf_conntrack_alloc(struct net *net, memset(&ct->__nfct_init_offset[0], 0, offsetof(struct nf_conn, proto) - offsetof(struct nf_conn, __nfct_init_offset[0])); -#ifdef CONFIG_NF_CONNTRACK_ZONES - if (zone) { - struct nf_conntrack_zone *nf_ct_zone; - - nf_ct_zone = nf_ct_ext_add(ct, NF_CT_EXT_ZONE, GFP_ATOMIC); - if (!nf_ct_zone) - goto out_free; - nf_ct_zone->id = zone->id; - nf_ct_zone->dir = zone->dir; - } -#endif + + if (zone && nf_ct_zone_add(ct, GFP_ATOMIC, zone) < 0) + goto out_free; + /* Because we use RCU lookups, we set ct_general.use to zero before * this is inserted in any list. 
*/ atomic_set(&ct->ct_general.use, 0); return ct; - -#ifdef CONFIG_NF_CONNTRACK_ZONES out_free: - atomic_dec(&net->ct.count); kmem_cache_free(net->ct.nf_conntrack_cachep, ct); +out: + atomic_dec(&net->ct.count); return ERR_PTR(-ENOMEM); -#endif } struct nf_conn *nf_conntrack_alloc(struct net *net, @@ -937,6 +917,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, struct nf_conntrack_expect *exp = NULL; const struct nf_conntrack_zone *zone; struct nf_conn_timeout *timeout_ext; + struct nf_conntrack_zone tmp; unsigned int *timeouts; if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) { @@ -944,7 +925,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, return NULL; } - zone = nf_ct_zone_tmpl(tmpl); + zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC, hash); if (IS_ERR(ct)) @@ -1042,6 +1023,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, const struct nf_conntrack_zone *zone; struct nf_conntrack_tuple tuple; struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_zone tmp; struct nf_conn *ct; u32 hash; @@ -1053,7 +1035,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, } /* look for tuple match */ - zone = nf_ct_zone_tmpl(tmpl); + zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); hash = hash_conntrack_raw(&tuple); h = __nf_conntrack_find_get(net, zone, &tuple, hash); if (!h) { diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 4eaf925bead4..94a66541e0b7 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -956,9 +956,8 @@ static int ctnetlink_parse_zone(const struct nlattr *attr, struct nf_conntrack_zone *zone) { - zone->id = NF_CT_DEFAULT_ZONE_ID; - zone->dir = NF_CT_DEFAULT_ZONE_DIR; - + nf_ct_zone_init(zone, NF_CT_DEFAULT_ZONE_ID, + NF_CT_DEFAULT_ZONE_DIR, 0); #ifdef CONFIG_NF_CONNTRACK_ZONES if (attr) zone->id = ntohs(nla_get_be16(attr)); diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 536cb67928ad..346509825a80 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -208,7 +208,8 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, #ifndef CONFIG_NF_CONNTRACK_ZONES if (info->zone || info->flags & (XT_CT_ZONE_DIR_ORIG | - XT_CT_ZONE_DIR_REPL)) + XT_CT_ZONE_DIR_REPL | + XT_CT_ZONE_MARK)) goto err1; #endif @@ -219,6 +220,8 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, memset(&zone, 0, sizeof(zone)); zone.id = info->zone; zone.dir = xt_ct_flags_to_dir(info); + if (info->flags & XT_CT_ZONE_MARK) + zone.flags |= NF_CT_FLAG_MARK; ct = nf_ct_tmpl_alloc(par->net, &zone, GFP_KERNEL); ret = PTR_ERR(ct); -- cgit v1.2.3 From 65d7ab8de582bc668e3dabb6ff48f750098a6e78 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 17 Aug 2015 13:42:27 -0700 Subject: net: Identifier Locator Addressing module Adds a new module named ila. This implements ILA translation. Lightweight tunnel redirection is used to perform the translation in the data path. This is configured by the "ip -6 route" command using the "encap ila <locator>" option, where <locator> is the value to set in the destination locator of the packet, e.g.:

  ip -6 route add 3333:0:0:1:5555:0:1:0/128 \
       encap ila 2001:0:0:1 via 2401:db00:20:911a:face:0:25:0

This sets a route where 3333:0:0:1 will be overwritten by 2001:0:0:1 on output. Signed-off-by: Tom Herbert Signed-off-by: David S.
Miller --- include/uapi/linux/ila.h | 15 +++ include/uapi/linux/lwtunnel.h | 1 + net/ipv6/Kconfig | 19 ++++ net/ipv6/Makefile | 1 + net/ipv6/ila.c | 216 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 252 insertions(+) create mode 100644 include/uapi/linux/ila.h create mode 100644 net/ipv6/ila.c (limited to 'include/uapi') diff --git a/include/uapi/linux/ila.h b/include/uapi/linux/ila.h new file mode 100644 index 000000000000..7ed9e670814e --- /dev/null +++ b/include/uapi/linux/ila.h @@ -0,0 +1,15 @@ +/* ila.h - ILA Interface */ + +#ifndef _UAPI_LINUX_ILA_H +#define _UAPI_LINUX_ILA_H + +enum { + ILA_ATTR_UNSPEC, + ILA_ATTR_LOCATOR, /* u64 */ + + __ILA_ATTR_MAX, +}; + +#define ILA_ATTR_MAX (__ILA_ATTR_MAX - 1) + +#endif /* _UAPI_LINUX_ILA_H */ diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index 3bf223bc2367..aa84ca396bcb 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -7,6 +7,7 @@ enum lwtunnel_encap_types { LWTUNNEL_ENCAP_NONE, LWTUNNEL_ENCAP_MPLS, LWTUNNEL_ENCAP_IP, + LWTUNNEL_ENCAP_ILA, __LWTUNNEL_ENCAP_MAX, }; diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 643f61339e7b..983bb999738c 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -92,6 +92,25 @@ config IPV6_MIP6 If unsure, say N. +config IPV6_ILA + tristate "IPv6: Identifier Locator Addressing (ILA)" + select LWTUNNEL + ---help--- + Support for IPv6 Identifier Locator Addressing (ILA). + + ILA is a mechanism to do network virtualization without + encapsulation. The basic concept of ILA is that we split an + IPv6 address into a 64 bit locator and 64 bit identifier. The + identifier is the identity of an entity in communication + ("who") and the locator expresses the location of the + entity ("where"). + + ILA can be configured using the "encap ila" option with + "ip -6 route" command. ILA is described in + https://tools.ietf.org/html/draft-herbert-nvo3-ila-00. + + If unsure, say N. 
+ config INET6_XFRM_TUNNEL tristate select INET6_TUNNEL diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 0f3f1999719a..2c900c7b7eb1 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_INET6_XFRM_MODE_TUNNEL) += xfrm6_mode_tunnel.o obj-$(CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION) += xfrm6_mode_ro.o obj-$(CONFIG_INET6_XFRM_MODE_BEET) += xfrm6_mode_beet.o obj-$(CONFIG_IPV6_MIP6) += mip6.o +obj-$(CONFIG_IPV6_ILA) += ila.o obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_IPV6_VTI) += ip6_vti.o diff --git a/net/ipv6/ila.c b/net/ipv6/ila.c new file mode 100644 index 000000000000..2540ab4b76d1 --- /dev/null +++ b/net/ipv6/ila.c @@ -0,0 +1,216 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct ila_params { + __be64 locator; +}; + +static inline struct ila_params *ila_params_lwtunnel( + struct lwtunnel_state *lwstate) +{ + return (struct ila_params *)lwstate->data; +} + +static inline __wsum compute_csum_diff8(const __be32 *from, const __be32 *to) +{ + __be32 diff[] = { + ~from[0], ~from[1], to[0], to[1], + }; + + return csum_partial(diff, sizeof(diff), 0); +} + +static inline __wsum get_csum_diff(struct ipv6hdr *ip6h, struct ila_params *p) +{ + return compute_csum_diff8((__be32 *)&ip6h->daddr, + (__be32 *)&p->locator); +} + +static void update_ipv6_locator(struct sk_buff *skb, struct ila_params *p) +{ + __wsum diff; + struct ipv6hdr *ip6h = ipv6_hdr(skb); + size_t nhoff = sizeof(struct ipv6hdr); + + /* First update checksum */ + switch (ip6h->nexthdr) { + case NEXTHDR_TCP: + if (likely(pskb_may_pull(skb, nhoff + sizeof(struct tcphdr)))) { + struct tcphdr *th = (struct tcphdr *) + (skb_network_header(skb) + nhoff); + + diff = get_csum_diff(ip6h, p); + inet_proto_csum_replace_by_diff(&th->check, skb, + diff, true); + } + break; + case NEXTHDR_UDP: + if (likely(pskb_may_pull(skb, nhoff + sizeof(struct udphdr)))) { + struct udphdr *uh = (struct udphdr *) + (skb_network_header(skb) + nhoff); + + if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { + diff = get_csum_diff(ip6h, p); + inet_proto_csum_replace_by_diff(&uh->check, skb, + diff, true); + if (!uh->check) + uh->check = CSUM_MANGLED_0; + } + } + break; + case NEXTHDR_ICMP: + if (likely(pskb_may_pull(skb, + nhoff + sizeof(struct icmp6hdr)))) { + struct icmp6hdr *ih = (struct icmp6hdr *) + (skb_network_header(skb) + nhoff); + + diff = get_csum_diff(ip6h, p); + inet_proto_csum_replace_by_diff(&ih->icmp6_cksum, skb, + diff, true); + } + break; + } + + /* Now change destination address */ + *(__be64 *)&ip6h->daddr = p->locator; +} + +static int ila_output(struct sock *sk, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct rt6_info *rt6 = NULL; + + if (skb->protocol != htons(ETH_P_IPV6)) + goto drop; + + rt6 = (struct rt6_info *)dst; + + update_ipv6_locator(skb, ila_params_lwtunnel(rt6->rt6i_lwtstate)); + + return rt6->rt6i_lwtstate->orig_output(sk, skb); + +drop: + kfree_skb(skb); + return -EINVAL; +} + +static int ila_input(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct rt6_info *rt6 = NULL; + + if (skb->protocol != htons(ETH_P_IPV6)) + goto drop; + + rt6 = (struct rt6_info *)dst; + + update_ipv6_locator(skb, ila_params_lwtunnel(rt6->rt6i_lwtstate)); + + return rt6->rt6i_lwtstate->orig_input(skb); + +drop: + kfree_skb(skb); + return -EINVAL; +} + +static struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { + [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, +}; + +static int 
ila_build_state(struct net_device *dev, struct nlattr *nla, + struct lwtunnel_state **ts) +{ + struct ila_params *p; + struct nlattr *tb[ILA_ATTR_MAX + 1]; + size_t encap_len = sizeof(*p); + struct lwtunnel_state *newts; + int ret; + + ret = nla_parse_nested(tb, ILA_ATTR_MAX, nla, + ila_nl_policy); + if (ret < 0) + return ret; + + if (!tb[ILA_ATTR_LOCATOR]) + return -EINVAL; + + newts = lwtunnel_state_alloc(encap_len); + if (!newts) + return -ENOMEM; + + newts->len = encap_len; + p = ila_params_lwtunnel(newts); + + p->locator = (__force __be64)nla_get_u64(tb[ILA_ATTR_LOCATOR]); + + newts->type = LWTUNNEL_ENCAP_ILA; + newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT | + LWTUNNEL_STATE_INPUT_REDIRECT; + + *ts = newts; + + return 0; +} + +static int ila_fill_encap_info(struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + struct ila_params *p = ila_params_lwtunnel(lwtstate); + + if (nla_put_u64(skb, ILA_ATTR_LOCATOR, (__force u64)p->locator)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static int ila_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + /* No encapsulation overhead */ + return 0; +} + +static int ila_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + struct ila_params *a_p = ila_params_lwtunnel(a); + struct ila_params *b_p = ila_params_lwtunnel(b); + + return (a_p->locator != b_p->locator); +} + +static const struct lwtunnel_encap_ops ila_encap_ops = { + .build_state = ila_build_state, + .output = ila_output, + .input = ila_input, + .fill_encap = ila_fill_encap_info, + .get_encap_size = ila_encap_nlsize, + .cmp_encap = ila_encap_cmp, +}; + +static int __init ila_init(void) +{ + return lwtunnel_encap_add_ops(&ila_encap_ops, LWTUNNEL_ENCAP_ILA); +} + +static void __exit ila_fini(void) +{ + lwtunnel_encap_del_ops(&ila_encap_ops, LWTUNNEL_ENCAP_ILA); +} + +module_init(ila_init); +module_exit(ila_fini); +MODULE_AUTHOR("Tom Herbert "); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From d9232a3da8683cd9c9854a858bcca968fe5f3bca Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Thu, 23 Jul 2015 16:43:56 +1000 Subject: cxl: Add alternate MMIO error handling userspace programs using cxl currently have to use two strategies for dealing with MMIO errors simultaneously. They have to check every read for a return of all Fs in case the adapter has gone away and the kernel has not yet noticed, and they have to deal with SIGBUS in case the kernel has already noticed, invalidated the mapping and marked the context as failed. In order to simplify things, this patch adds an alternative approach where the kernel will return a page filled with Fs instead of delivering a SIGBUS. This allows userspace to only need to deal with one of these two error paths, and is intended for use in libraries that use cxl transparently and may not be able to safely install a signal handler. This approach will only work if certain constraints are met. Namely, if the application is both reading and writing to an address in the problem state area it cannot assume that a non-FF read is OK, as it may just be reading out a value it has previously written. Further - since only one page is used per context a write to a given offset would be visible when reading the same offset from a different page in the mapping (this only applies within a single context, not between contexts). An application could deal with this by e.g. making sure it also reads from a read-only offset after any reads to a read/write offset. 
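A sketch of the single userspace error path this enables follows; the helper and the problem-state-area mapping are hypothetical and not part of this patch:

  #include <stdint.h>

  /* With CXL_START_WORK_ERR_FF set, a dead context reads back as all-Fs
   * instead of raising SIGBUS, so one check covers both the case where
   * the kernel has already noticed and where it has not yet noticed. */
  static int psa_read(const volatile uint64_t *psa, unsigned int idx,
                      uint64_t *val)
  {
          *val = psa[idx];
          return *val == (uint64_t)-1 ? -1 : 0;
  }

Per the constraint above, a non-FF value from an offset the application also writes is not proof of a live adapter, so such a check is only conclusive for read-only offsets.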
Due to these constraints, this functionality must be explicitly requested by userspace when starting the context by passing in the CXL_START_WORK_ERR_FF flag. Signed-off-by: Ian Munsie Acked-by: Michael Neuling Signed-off-by: Michael Ellerman --- drivers/misc/cxl/context.c | 14 ++++++++++++++ drivers/misc/cxl/cxl.h | 4 +++- drivers/misc/cxl/file.c | 4 +++- include/uapi/misc/cxl.h | 4 +++- 4 files changed, 23 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c index 615842115848..941fda04aa9a 100644 --- a/drivers/misc/cxl/context.c +++ b/drivers/misc/cxl/context.c @@ -126,6 +126,18 @@ static int cxl_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (ctx->status != STARTED) { mutex_unlock(&ctx->status_mutex); pr_devel("%s: Context not started, failing problem state access\n", __func__); + if (ctx->mmio_err_ff) { + if (!ctx->ff_page) { + ctx->ff_page = alloc_page(GFP_USER); + if (!ctx->ff_page) + return VM_FAULT_OOM; + memset(page_address(ctx->ff_page), 0xff, PAGE_SIZE); + } + get_page(ctx->ff_page); + vmf->page = ctx->ff_page; + vma->vm_page_prot = pgprot_cached(vma->vm_page_prot); + return 0; + } return VM_FAULT_SIGBUS; } @@ -257,6 +269,8 @@ static void reclaim_ctx(struct rcu_head *rcu) struct cxl_context *ctx = container_of(rcu, struct cxl_context, rcu); free_page((u64)ctx->sstp); + if (ctx->ff_page) + __free_page(ctx->ff_page); ctx->sstp = NULL; kfree(ctx); diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h index 6f5386653dae..e7af256f60c5 100644 --- a/drivers/misc/cxl/cxl.h +++ b/drivers/misc/cxl/cxl.h @@ -34,7 +34,7 @@ extern uint cxl_verbose; * Bump version each time a user API change is made, whether it is * backwards compatible ot not. */ -#define CXL_API_VERSION 1 +#define CXL_API_VERSION 2 #define CXL_API_VERSION_COMPATIBLE 1 /* @@ -418,6 +418,8 @@ struct cxl_context { /* Used to unmap any mmaps when force detaching */ struct address_space *mapping; struct mutex mapping_lock; + struct page *ff_page; + bool mmio_err_ff; spinlock_t sste_lock; /* Protects segment table entries */ struct cxl_sste *sstp; diff --git a/drivers/misc/cxl/file.c b/drivers/misc/cxl/file.c index 57bdb473749f..a30bf285b5bd 100644 --- a/drivers/misc/cxl/file.c +++ b/drivers/misc/cxl/file.c @@ -184,6 +184,8 @@ static long afu_ioctl_start_work(struct cxl_context *ctx, if (work.flags & CXL_START_WORK_AMR) amr = work.amr & mfspr(SPRN_UAMOR); + ctx->mmio_err_ff = !!(work.flags & CXL_START_WORK_ERR_FF); + /* * We grab the PID here and not in the file open to allow for the case * where a process (master, some daemon, etc) has opened the chardev on @@ -538,7 +540,7 @@ int __init cxl_file_init(void) * If these change we really need to update API. Either change some * flags or update API version number CXL_API_VERSION. 
*/ - BUILD_BUG_ON(CXL_API_VERSION != 1); + BUILD_BUG_ON(CXL_API_VERSION != 2); BUILD_BUG_ON(sizeof(struct cxl_ioctl_start_work) != 64); BUILD_BUG_ON(sizeof(struct cxl_event_header) != 8); BUILD_BUG_ON(sizeof(struct cxl_event_afu_interrupt) != 8); diff --git a/include/uapi/misc/cxl.h b/include/uapi/misc/cxl.h index 99a8ca15fe64..1e889aa8a36e 100644 --- a/include/uapi/misc/cxl.h +++ b/include/uapi/misc/cxl.h @@ -29,8 +29,10 @@ struct cxl_ioctl_start_work { #define CXL_START_WORK_AMR 0x0000000000000001ULL #define CXL_START_WORK_NUM_IRQS 0x0000000000000002ULL +#define CXL_START_WORK_ERR_FF 0x0000000000000004ULL #define CXL_START_WORK_ALL (CXL_START_WORK_AMR |\ - CXL_START_WORK_NUM_IRQS) + CXL_START_WORK_NUM_IRQS |\ + CXL_START_WORK_ERR_FF) /* Possible modes that an afu can be in */ -- cgit v1.2.3 From 81f03fedcce7ee7e83c37237ecaa2f68aad236fd Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Mon, 10 Aug 2015 15:20:41 -0600 Subject: NVMe: Add nvme subsystem reset IOCTL Controllers can perform optional subsystem resets as introduced in NVMe 1.1. This patch adds an IOCTL to trigger the subsystem reset by writing "NVMe" to the NSSR register. Signed-off-by: Jon Derrick Acked-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 11 +++++++++++ include/linux/nvme.h | 2 +- include/uapi/linux/nvme.h | 1 + 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index e318a992e2e6..3a35c5807bb7 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1901,6 +1901,15 @@ static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns, return status; } +static int nvme_subsys_reset(struct nvme_dev *dev) +{ + if (!dev->subsystem) + return -ENOTTY; + + writel(0x4E564D65, &dev->bar->nssr); /* "NVMe" */ + return 0; +} + static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { @@ -2932,6 +2941,8 @@ static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) case NVME_IOCTL_RESET: dev_warn(dev->dev, "resetting controller\n"); return nvme_reset(dev); + case NVME_IOCTL_SUBSYS_RESET: + return nvme_subsys_reset(dev); default: return -ENOTTY; } diff --git a/include/linux/nvme.h b/include/linux/nvme.h index d6b5600cfa47..b5812c395351 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -28,7 +28,7 @@ struct nvme_bar { __u32 cc; /* Controller Configuration */ __u32 rsvd1; /* Reserved */ __u32 csts; /* Controller Status */ - __u32 rsvd2; /* Reserved */ + __u32 nssr; /* Subsystem Reset */ __u32 aqa; /* Admin Queue Attributes */ __u64 asq; /* Admin SQ Base Address */ __u64 acq; /* Admin CQ Base Address */ diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h index 732b32e92b02..8864194a4151 100644 --- a/include/uapi/linux/nvme.h +++ b/include/uapi/linux/nvme.h @@ -584,5 +584,6 @@ struct nvme_passthru_cmd { #define NVME_IOCTL_SUBMIT_IO _IOW('N', 0x42, struct nvme_user_io) #define NVME_IOCTL_IO_CMD _IOWR('N', 0x43, struct nvme_passthru_cmd) #define NVME_IOCTL_RESET _IO('N', 0x44) +#define NVME_IOCTL_SUBSYS_RESET _IO('N', 0x45) #endif /* _UAPI_LINUX_NVME_H */ -- cgit v1.2.3 From bd49784fd1e8f42c7600fbfa206361324857f373 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 18 Aug 2015 16:26:16 -0400 Subject: dm stats: report precise_timestamps and histogram in @stats_list output If the user selected the precise_timestamps or histogram options, report it in the @stats_list message output. 
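For example, a region created with both options could now be listed as follows (field values hypothetical, matching the id/start+len/step/program_id/aux_data layout emitted by the dm-stats.c change below):

  0: 0+409600 1024 dmstats_test - precise_timestamps histogram:128,256,512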
If the user didn't select these options, no extra tokens are reported, thus it is backward compatible with old software that doesn't know about precise timestamps and histogram. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org # 4.2 --- Documentation/device-mapper/statistics.txt | 4 ++++ drivers/md/dm-stats.c | 14 +++++++++++++- include/uapi/linux/dm-ioctl.h | 4 ++-- 3 files changed, 19 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/device-mapper/statistics.txt b/Documentation/device-mapper/statistics.txt index 4919b2dfd1b3..6f5ef944ca4c 100644 --- a/Documentation/device-mapper/statistics.txt +++ b/Documentation/device-mapper/statistics.txt @@ -121,6 +121,10 @@ Messages Output format: : + + precise_timestamps histogram:n1,n2,n3,... + + The strings "precise_timestamps" and "histogram" are printed only + if they were specified when creating the region. @stats_print [ ] diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index 8a8b48fa901a..8289804ccd99 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c @@ -457,12 +457,24 @@ static int dm_stats_list(struct dm_stats *stats, const char *program, list_for_each_entry(s, &stats->list, list_entry) { if (!program || !strcmp(program, s->program_id)) { len = s->end - s->start; - DMEMIT("%d: %llu+%llu %llu %s %s\n", s->id, + DMEMIT("%d: %llu+%llu %llu %s %s", s->id, (unsigned long long)s->start, (unsigned long long)len, (unsigned long long)s->step, s->program_id, s->aux_data); + if (s->stat_flags & STAT_PRECISE_TIMESTAMPS) + DMEMIT(" precise_timestamps"); + if (s->n_histogram_entries) { + unsigned i; + DMEMIT(" histogram:"); + for (i = 0; i < s->n_histogram_entries; i++) { + if (i) + DMEMIT(","); + DMEMIT("%llu", s->histogram_boundaries[i]); + } + } + DMEMIT("\n"); } } mutex_unlock(&stats->mutex); diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index 061aca3a962d..d34611e35a30 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -267,9 +267,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 32 +#define DM_VERSION_MINOR 33 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2015-6-26)" +#define DM_VERSION_EXTRA "-ioctl (2015-8-18)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ -- cgit v1.2.3 From 32a2b002ce615eadd3bfaddabde290f70a1dd17b Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Aug 2015 13:56:32 +0200 Subject: ipv6: route: per route IP tunnel metadata via lightweight tunnel Allow specification of per route IP tunnel instructions also for IPv6. This complements commit 3093fbe7ff4b ("route: Per route IP tunnel metadata via lightweight tunnel"). Signed-off-by: Jiri Benc CC: YOSHIFUJI Hideaki Acked-by: Thomas Graf Signed-off-by: David S. 
Miller --- include/uapi/linux/lwtunnel.h | 16 +++++++ net/ipv4/ip_tunnel_core.c | 102 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index aa84ca396bcb..34141a5dfe74 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -8,6 +8,7 @@ enum lwtunnel_encap_types { LWTUNNEL_ENCAP_MPLS, LWTUNNEL_ENCAP_IP, LWTUNNEL_ENCAP_ILA, + LWTUNNEL_ENCAP_IP6, __LWTUNNEL_ENCAP_MAX, }; @@ -28,4 +29,19 @@ enum lwtunnel_ip_t { #define LWTUNNEL_IP_MAX (__LWTUNNEL_IP_MAX - 1) +enum lwtunnel_ip6_t { + LWTUNNEL_IP6_UNSPEC, + LWTUNNEL_IP6_ID, + LWTUNNEL_IP6_DST, + LWTUNNEL_IP6_SRC, + LWTUNNEL_IP6_HOPLIMIT, + LWTUNNEL_IP6_TC, + LWTUNNEL_IP6_SPORT, + LWTUNNEL_IP6_DPORT, + LWTUNNEL_IP6_FLAGS, + __LWTUNNEL_IP6_MAX, +}; + +#define LWTUNNEL_IP6_MAX (__LWTUNNEL_IP6_MAX - 1) + #endif /* _UAPI_LWTUNNEL_H_ */ diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index f0514e39e57c..289b6c26ce37 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -299,9 +299,111 @@ static const struct lwtunnel_encap_ops ip_tun_lwt_ops = { .cmp_encap = ip_tun_cmp_encap, }; +static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = { + [LWTUNNEL_IP6_ID] = { .type = NLA_U64 }, + [LWTUNNEL_IP6_DST] = { .len = sizeof(struct in6_addr) }, + [LWTUNNEL_IP6_SRC] = { .len = sizeof(struct in6_addr) }, + [LWTUNNEL_IP6_HOPLIMIT] = { .type = NLA_U8 }, + [LWTUNNEL_IP6_TC] = { .type = NLA_U8 }, + [LWTUNNEL_IP6_SPORT] = { .type = NLA_U16 }, + [LWTUNNEL_IP6_DPORT] = { .type = NLA_U16 }, + [LWTUNNEL_IP6_FLAGS] = { .type = NLA_U16 }, +}; + +static int ip6_tun_build_state(struct net_device *dev, struct nlattr *attr, + struct lwtunnel_state **ts) +{ + struct ip_tunnel_info *tun_info; + struct lwtunnel_state *new_state; + struct nlattr *tb[LWTUNNEL_IP6_MAX + 1]; + int err; + + err = nla_parse_nested(tb, LWTUNNEL_IP6_MAX, attr, ip6_tun_policy); + if (err < 0) + return err; + + new_state = lwtunnel_state_alloc(sizeof(*tun_info)); + if (!new_state) + return -ENOMEM; + + new_state->type = LWTUNNEL_ENCAP_IP6; + + tun_info = lwt_tun_info(new_state); + + if (tb[LWTUNNEL_IP6_ID]) + tun_info->key.tun_id = nla_get_u64(tb[LWTUNNEL_IP6_ID]); + + if (tb[LWTUNNEL_IP6_DST]) + tun_info->key.u.ipv6.dst = nla_get_in6_addr(tb[LWTUNNEL_IP6_DST]); + + if (tb[LWTUNNEL_IP6_SRC]) + tun_info->key.u.ipv6.src = nla_get_in6_addr(tb[LWTUNNEL_IP6_SRC]); + + if (tb[LWTUNNEL_IP6_HOPLIMIT]) + tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP6_HOPLIMIT]); + + if (tb[LWTUNNEL_IP6_TC]) + tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]); + + if (tb[LWTUNNEL_IP6_SPORT]) + tun_info->key.tp_src = nla_get_be16(tb[LWTUNNEL_IP6_SPORT]); + + if (tb[LWTUNNEL_IP6_DPORT]) + tun_info->key.tp_dst = nla_get_be16(tb[LWTUNNEL_IP6_DPORT]); + + if (tb[LWTUNNEL_IP6_FLAGS]) + tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP6_FLAGS]); + + tun_info->mode = IP_TUNNEL_INFO_TX; + tun_info->options = NULL; + tun_info->options_len = 0; + + *ts = new_state; + + return 0; +} + +static int ip6_tun_fill_encap_info(struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate); + + if (nla_put_u64(skb, LWTUNNEL_IP6_ID, tun_info->key.tun_id) || + nla_put_in6_addr(skb, LWTUNNEL_IP6_DST, &tun_info->key.u.ipv6.dst) || + nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) || + nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.tos) || + nla_put_u8(skb, 
LWTUNNEL_IP6_TC, tun_info->key.ttl) || + nla_put_u16(skb, LWTUNNEL_IP6_SPORT, tun_info->key.tp_src) || + nla_put_u16(skb, LWTUNNEL_IP6_DPORT, tun_info->key.tp_dst) || + nla_put_u16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags)) + return -ENOMEM; + + return 0; +} + +static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + return nla_total_size(8) /* LWTUNNEL_IP6_ID */ + + nla_total_size(16) /* LWTUNNEL_IP6_DST */ + + nla_total_size(16) /* LWTUNNEL_IP6_SRC */ + + nla_total_size(1) /* LWTUNNEL_IP6_HOPLIMIT */ + + nla_total_size(1) /* LWTUNNEL_IP6_TC */ + + nla_total_size(2) /* LWTUNNEL_IP6_SPORT */ + + nla_total_size(2) /* LWTUNNEL_IP6_DPORT */ + + nla_total_size(2); /* LWTUNNEL_IP6_FLAGS */ +} + +static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = { + .build_state = ip6_tun_build_state, + .fill_encap = ip6_tun_fill_encap_info, + .get_encap_size = ip6_tun_encap_nlsize, + .cmp_encap = ip_tun_cmp_encap, +}; + void __init ip_tunnel_core_init(void) { lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP); + lwtunnel_encap_add_ops(&ip6_tun_lwt_ops, LWTUNNEL_ENCAP_IP6); } struct static_key ip_tunnel_metadata_cnt = STATIC_KEY_INIT_FALSE; -- cgit v1.2.3 From e4ff67513096e6e196ca58043fce04d0f87babbe Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 26 Jul 2015 15:03:27 +0300 Subject: ipvs: add sync_maxlen parameter for the sync daemon Allow setups with large MTU to send large sync packets by adding sync_maxlen parameter. The default value is now based on MTU but no more than 1500 for compatibility reasons. To avoid problems if MTU changes allow fragmentation by sending packets with DF=0. Problem reported by Dan Carpenter. Reported-by: Dan Carpenter Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 19 +++--- include/uapi/linux/ip_vs.h | 1 + net/netfilter/ipvs/ip_vs_ctl.c | 53 ++++++++++------ net/netfilter/ipvs/ip_vs_sync.c | 137 ++++++++++++++++++---------------------- 4 files changed, 108 insertions(+), 102 deletions(-) (limited to 'include/uapi') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 4e3731ee4eac..2fdc13caf712 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -846,6 +846,13 @@ struct ipvs_master_sync_state { /* How much time to keep dests in trash */ #define IP_VS_DEST_TRASH_PERIOD (120 * HZ) +struct ipvs_sync_daemon_cfg { + int syncid; + u16 sync_maxlen; + /* multicast interface name */ + char mcast_ifn[IP_VS_IFNAME_MAXLEN]; +}; + /* IPVS in network namespace */ struct netns_ipvs { int gen; /* Generation */ @@ -961,15 +968,10 @@ struct netns_ipvs { spinlock_t sync_buff_lock; struct task_struct **backup_threads; int threads_mask; - int send_mesg_maxlen; - int recv_mesg_maxlen; volatile int sync_state; - volatile int master_syncid; - volatile int backup_syncid; struct mutex sync_mutex; - /* multicast interface name */ - char master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; - char backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; + struct ipvs_sync_daemon_cfg mcfg; /* Master Configuration */ + struct ipvs_sync_daemon_cfg bcfg; /* Backup Configuration */ /* net name space ptr */ struct net *net; /* Needed by timer routines */ /* Number of heterogeneous destinations, needed becaus heterogeneous @@ -1408,7 +1410,8 @@ static inline void ip_vs_dest_put_and_free(struct ip_vs_dest *dest) /* IPVS sync daemon data and function prototypes * (from ip_vs_sync.c) */ -int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid); +int start_sync_thread(struct net *net, struct ipvs_sync_daemon_cfg *cfg, + int 
state); int stop_sync_thread(struct net *net, int state); void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts); diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h index 3199243f2028..68377d8c8870 100644 --- a/include/uapi/linux/ip_vs.h +++ b/include/uapi/linux/ip_vs.h @@ -406,6 +406,7 @@ enum { IPVS_DAEMON_ATTR_STATE, /* sync daemon state (master/backup) */ IPVS_DAEMON_ATTR_MCAST_IFN, /* multicast interface name */ IPVS_DAEMON_ATTR_SYNC_ID, /* SyncID we belong to */ + IPVS_DAEMON_ATTR_SYNC_MAXLEN, /* UDP Payload Size */ __IPVS_DAEMON_ATTR_MAX, }; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index af0b69e411b7..96f7bbfd5e1d 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2336,10 +2336,15 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; if (cmd == IP_VS_SO_SET_STARTDAEMON) { + struct ipvs_sync_daemon_cfg cfg; + + memset(&cfg, 0, sizeof(cfg)); + strlcpy(cfg.mcast_ifn, dm->mcast_ifn, + sizeof(cfg.mcast_ifn)); + cfg.syncid = dm->syncid; rtnl_lock(); mutex_lock(&ipvs->sync_mutex); - ret = start_sync_thread(net, dm->state, dm->mcast_ifn, - dm->syncid); + ret = start_sync_thread(net, &cfg, dm->state); mutex_unlock(&ipvs->sync_mutex); rtnl_unlock(); } else { @@ -2650,15 +2655,15 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) mutex_lock(&ipvs->sync_mutex); if (ipvs->sync_state & IP_VS_STATE_MASTER) { d[0].state = IP_VS_STATE_MASTER; - strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn, + strlcpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn, sizeof(d[0].mcast_ifn)); - d[0].syncid = ipvs->master_syncid; + d[0].syncid = ipvs->mcfg.syncid; } if (ipvs->sync_state & IP_VS_STATE_BACKUP) { d[1].state = IP_VS_STATE_BACKUP; - strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn, + strlcpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn, sizeof(d[1].mcast_ifn)); - d[1].syncid = ipvs->backup_syncid; + d[1].syncid = ipvs->bcfg.syncid; } if (copy_to_user(user, &d, sizeof(d)) != 0) ret = -EFAULT; @@ -2813,6 +2818,7 @@ static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = { [IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING, .len = IP_VS_IFNAME_MAXLEN }, [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 }, + [IPVS_DAEMON_ATTR_SYNC_MAXLEN] = { .type = NLA_U16 }, }; /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */ @@ -3271,7 +3277,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, } static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state, - const char *mcast_ifn, __u32 syncid) + struct ipvs_sync_daemon_cfg *c) { struct nlattr *nl_daemon; @@ -3280,8 +3286,9 @@ static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state, return -EMSGSIZE; if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) || - nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn) || - nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid)) + nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, c->mcast_ifn) || + nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, c->syncid) || + nla_put_u16(skb, IPVS_DAEMON_ATTR_SYNC_MAXLEN, c->sync_maxlen)) goto nla_put_failure; nla_nest_end(skb, nl_daemon); @@ -3293,7 +3300,7 @@ nla_put_failure: } static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state, - const char *mcast_ifn, __u32 syncid, + struct ipvs_sync_daemon_cfg *c, struct netlink_callback *cb) { void *hdr; @@ -3303,7 +3310,7 @@ static int ip_vs_genl_dump_daemon(struct 
sk_buff *skb, __u32 state, if (!hdr) return -EMSGSIZE; - if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid)) + if (ip_vs_genl_fill_daemon(skb, state, c)) goto nla_put_failure; genlmsg_end(skb, hdr); @@ -3323,8 +3330,7 @@ static int ip_vs_genl_dump_daemons(struct sk_buff *skb, mutex_lock(&ipvs->sync_mutex); if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER, - ipvs->master_mcast_ifn, - ipvs->master_syncid, cb) < 0) + &ipvs->mcfg, cb) < 0) goto nla_put_failure; cb->args[0] = 1; @@ -3332,8 +3338,7 @@ static int ip_vs_genl_dump_daemons(struct sk_buff *skb, if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) { if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP, - ipvs->backup_mcast_ifn, - ipvs->backup_syncid, cb) < 0) + &ipvs->bcfg, cb) < 0) goto nla_put_failure; cb->args[1] = 1; @@ -3348,25 +3353,33 @@ nla_put_failure: static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs) { struct netns_ipvs *ipvs = net_ipvs(net); + struct ipvs_sync_daemon_cfg c; + struct nlattr *a; int ret; + memset(&c, 0, sizeof(c)); if (!(attrs[IPVS_DAEMON_ATTR_STATE] && attrs[IPVS_DAEMON_ATTR_MCAST_IFN] && attrs[IPVS_DAEMON_ATTR_SYNC_ID])) return -EINVAL; + strlcpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), + sizeof(c.mcast_ifn)); + c.syncid = nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]); + + a = attrs[IPVS_DAEMON_ATTR_SYNC_MAXLEN]; + if (a) + c.sync_maxlen = nla_get_u16(a); /* The synchronization protocol is incompatible with mixed family * services */ - if (net_ipvs(net)->mixed_address_family_dests > 0) + if (ipvs->mixed_address_family_dests > 0) return -EINVAL; rtnl_lock(); mutex_lock(&ipvs->sync_mutex); - ret = start_sync_thread(net, - nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]), - nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), - nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID])); + ret = start_sync_thread(net, &c, + nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); mutex_unlock(&ipvs->sync_mutex); rtnl_unlock(); return ret; diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 6bc6dca9bca8..e68a43421479 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -320,26 +320,28 @@ sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) * Create a new sync buffer for Version 1 proto. */ static inline struct ip_vs_sync_buff * -ip_vs_sync_buff_create(struct netns_ipvs *ipvs) +ip_vs_sync_buff_create(struct netns_ipvs *ipvs, unsigned int len) { struct ip_vs_sync_buff *sb; if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) return NULL; - sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC); + len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg), + ipvs->mcfg.sync_maxlen); + sb->mesg = kmalloc(len, GFP_ATOMIC); if (!sb->mesg) { kfree(sb); return NULL; } sb->mesg->reserved = 0; /* old nr_conns i.e. must be zero now */ sb->mesg->version = SYNC_PROTO_VER; - sb->mesg->syncid = ipvs->master_syncid; + sb->mesg->syncid = ipvs->mcfg.syncid; sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg)); sb->mesg->nr_conns = 0; sb->mesg->spare = 0; sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg); - sb->end = (unsigned char *)sb->mesg + ipvs->send_mesg_maxlen; + sb->end = (unsigned char *)sb->mesg + len; sb->firstuse = jiffies; return sb; @@ -402,7 +404,7 @@ select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp) * Create a new sync buffer for Version 0 proto. 
*/ static inline struct ip_vs_sync_buff * -ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs) +ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs, unsigned int len) { struct ip_vs_sync_buff *sb; struct ip_vs_sync_mesg_v0 *mesg; @@ -410,17 +412,19 @@ ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs) if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) return NULL; - sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC); + len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg_v0), + ipvs->mcfg.sync_maxlen); + sb->mesg = kmalloc(len, GFP_ATOMIC); if (!sb->mesg) { kfree(sb); return NULL; } mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg; mesg->nr_conns = 0; - mesg->syncid = ipvs->master_syncid; + mesg->syncid = ipvs->mcfg.syncid; mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0)); sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0); - sb->end = (unsigned char *)mesg + ipvs->send_mesg_maxlen; + sb->end = (unsigned char *)mesg + len; sb->firstuse = jiffies; return sb; } @@ -533,7 +537,7 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, struct ip_vs_sync_buff *buff; struct ipvs_master_sync_state *ms; int id; - int len; + unsigned int len; if (unlikely(cp->af != AF_INET)) return; @@ -553,17 +557,19 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, id = select_master_thread_id(ipvs, cp); ms = &ipvs->ms[id]; buff = ms->sync_buff; + len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : + SIMPLE_CONN_SIZE; if (buff) { m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; /* Send buffer if it is for v1 */ - if (!m->nr_conns) { + if (buff->head + len > buff->end || !m->nr_conns) { sb_queue_tail(ipvs, ms); ms->sync_buff = NULL; buff = NULL; } } if (!buff) { - buff = ip_vs_sync_buff_create_v0(ipvs); + buff = ip_vs_sync_buff_create_v0(ipvs, len); if (!buff) { spin_unlock_bh(&ipvs->sync_buff_lock); pr_err("ip_vs_sync_buff_create failed.\n"); @@ -572,8 +578,6 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, ms->sync_buff = buff; } - len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? 
FULL_CONN_SIZE : - SIMPLE_CONN_SIZE; m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; s = (struct ip_vs_sync_conn_v0 *) buff->head; @@ -597,12 +601,6 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, m->nr_conns++; m->size = htons(ntohs(m->size) + len); buff->head += len; - - /* check if there is a space for next one */ - if (buff->head + FULL_CONN_SIZE > buff->end) { - sb_queue_tail(ipvs, ms); - ms->sync_buff = NULL; - } spin_unlock_bh(&ipvs->sync_buff_lock); /* synchronize its controller if it has */ @@ -694,7 +692,7 @@ sloop: } if (!buff) { - buff = ip_vs_sync_buff_create(ipvs); + buff = ip_vs_sync_buff_create(ipvs, len); if (!buff) { spin_unlock_bh(&ipvs->sync_buff_lock); pr_err("ip_vs_sync_buff_create failed.\n"); @@ -1219,7 +1217,7 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer, return; } /* SyncID sanity check */ - if (ipvs->backup_syncid != 0 && m2->syncid != ipvs->backup_syncid) { + if (ipvs->bcfg.syncid != 0 && m2->syncid != ipvs->bcfg.syncid) { IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid); return; } @@ -1319,6 +1317,17 @@ static void set_mcast_ttl(struct sock *sk, u_char ttl) release_sock(sk); } +/* Control fragmentation of messages */ +static void set_mcast_pmtudisc(struct sock *sk, int val) +{ + struct inet_sock *inet = inet_sk(sk); + + /* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */ + lock_sock(sk); + inet->pmtudisc = val; + release_sock(sk); +} + /* * Specifiy default interface for outgoing multicasts */ @@ -1344,43 +1353,6 @@ static int set_mcast_if(struct sock *sk, char *ifname) } -/* - * Set the maximum length of sync message according to the - * specified interface's MTU. - */ -static int set_sync_mesg_maxlen(struct net *net, int sync_state) -{ - struct netns_ipvs *ipvs = net_ipvs(net); - struct net_device *dev; - int num; - - if (sync_state == IP_VS_STATE_MASTER) { - dev = __dev_get_by_name(net, ipvs->master_mcast_ifn); - if (!dev) - return -ENODEV; - - num = (dev->mtu - sizeof(struct iphdr) - - sizeof(struct udphdr) - - SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE; - ipvs->send_mesg_maxlen = SYNC_MESG_HEADER_LEN + - SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF); - IP_VS_DBG(7, "setting the maximum length of sync sending " - "message %d.\n", ipvs->send_mesg_maxlen); - } else if (sync_state == IP_VS_STATE_BACKUP) { - dev = __dev_get_by_name(net, ipvs->backup_mcast_ifn); - if (!dev) - return -ENODEV; - - ipvs->recv_mesg_maxlen = dev->mtu - - sizeof(struct iphdr) - sizeof(struct udphdr); - IP_VS_DBG(7, "setting the maximum length of sync receiving " - "message %d.\n", ipvs->recv_mesg_maxlen); - } - - return 0; -} - - /* * Join a multicast group. 
* the group is specified by a class D multicast address 224.0.0.0/8 @@ -1461,7 +1433,7 @@ static struct socket *make_send_sock(struct net *net, int id) pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); } - result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn); + result = set_mcast_if(sock->sk, ipvs->mcfg.mcast_ifn); if (result < 0) { pr_err("Error setting outbound mcast interface\n"); goto error; @@ -1469,11 +1441,13 @@ static struct socket *make_send_sock(struct net *net, int id) set_mcast_loop(sock->sk, 0); set_mcast_ttl(sock->sk, 1); + /* Allow fragmentation if MTU changes */ + set_mcast_pmtudisc(sock->sk, IP_PMTUDISC_DONT); result = sysctl_sync_sock_size(ipvs); if (result > 0) set_sock_size(sock->sk, 1, result); - result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn); + result = bind_mcastif_addr(sock, ipvs->mcfg.mcast_ifn); if (result < 0) { pr_err("Error binding address of the mcast interface\n"); goto error; @@ -1531,7 +1505,7 @@ static struct socket *make_receive_sock(struct net *net, int id) /* join the multicast group */ result = join_mcast_group(sock->sk, (struct in_addr *) &mcast_addr.sin_addr, - ipvs->backup_mcast_ifn); + ipvs->bcfg.mcast_ifn); if (result < 0) { pr_err("Error joining to the multicast group\n"); goto error; @@ -1639,7 +1613,7 @@ static int sync_thread_master(void *data) pr_info("sync thread started: state = MASTER, mcast_ifn = %s, " "syncid = %d, id = %d\n", - ipvs->master_mcast_ifn, ipvs->master_syncid, tinfo->id); + ipvs->mcfg.mcast_ifn, ipvs->mcfg.syncid, tinfo->id); for (;;) { sb = next_sync_buff(ipvs, ms); @@ -1693,7 +1667,7 @@ static int sync_thread_backup(void *data) pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, " "syncid = %d, id = %d\n", - ipvs->backup_mcast_ifn, ipvs->backup_syncid, tinfo->id); + ipvs->bcfg.mcast_ifn, ipvs->bcfg.syncid, tinfo->id); while (!kthread_should_stop()) { wait_event_interruptible(*sk_sleep(tinfo->sock->sk), @@ -1703,7 +1677,7 @@ static int sync_thread_backup(void *data) /* do we have data now? */ while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) { len = ip_vs_receive(tinfo->sock, tinfo->buf, - ipvs->recv_mesg_maxlen); + ipvs->bcfg.sync_maxlen); if (len <= 0) { if (len != -EAGAIN) pr_err("receiving message error\n"); @@ -1723,16 +1697,19 @@ static int sync_thread_backup(void *data) } -int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) +int start_sync_thread(struct net *net, struct ipvs_sync_daemon_cfg *c, + int state) { struct ip_vs_sync_thread_data *tinfo; struct task_struct **array = NULL, *task; struct socket *sock; struct netns_ipvs *ipvs = net_ipvs(net); + struct net_device *dev; char *name; int (*threadfn)(void *data); - int id, count; + int id, count, hlen; int result = -ENOMEM; + u16 mtu, min_mtu; IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n", @@ -1744,22 +1721,35 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) } else count = ipvs->threads_mask + 1; + dev = __dev_get_by_name(net, c->mcast_ifn); + if (!dev) { + pr_err("Unknown mcast interface: %s\n", c->mcast_ifn); + return -ENODEV; + } + hlen = sizeof(struct iphdr) + sizeof(struct udphdr); + mtu = (state == IP_VS_STATE_BACKUP) ? + clamp(dev->mtu, 1500U, 65535U) : 1500U; + min_mtu = (state == IP_VS_STATE_BACKUP) ? 
1024 : 1; + + if (c->sync_maxlen) + c->sync_maxlen = clamp_t(unsigned int, + c->sync_maxlen, min_mtu, + 65535 - hlen); + else + c->sync_maxlen = mtu - hlen; + if (state == IP_VS_STATE_MASTER) { if (ipvs->ms) return -EEXIST; - strlcpy(ipvs->master_mcast_ifn, mcast_ifn, - sizeof(ipvs->master_mcast_ifn)); - ipvs->master_syncid = syncid; + ipvs->mcfg = *c; name = "ipvs-m:%d:%d"; threadfn = sync_thread_master; } else if (state == IP_VS_STATE_BACKUP) { if (ipvs->backup_threads) return -EEXIST; - strlcpy(ipvs->backup_mcast_ifn, mcast_ifn, - sizeof(ipvs->backup_mcast_ifn)); - ipvs->backup_syncid = syncid; + ipvs->bcfg = *c; name = "ipvs-b:%d:%d"; threadfn = sync_thread_backup; } else { @@ -1787,7 +1777,6 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) if (!array) goto out; } - set_sync_mesg_maxlen(net, state); tinfo = NULL; for (id = 0; id < count; id++) { @@ -1805,7 +1794,7 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) tinfo->net = net; tinfo->sock = sock; if (state == IP_VS_STATE_BACKUP) { - tinfo->buf = kmalloc(ipvs->recv_mesg_maxlen, + tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen, GFP_KERNEL); if (!tinfo->buf) goto outtinfo; -- cgit v1.2.3 From d33288172e72c4729e8b9f2243fb40601afabc8f Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 26 Jul 2015 15:03:28 +0300 Subject: ipvs: add more mcast parameters for the sync daemon - mcast_group: configure the multicast address, now IPv6 is supported too - mcast_port: configure the multicast port - mcast_ttl: configure the multicast TTL/HOP_LIMIT Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 4 ++ include/uapi/linux/ip_vs.h | 4 ++ net/netfilter/ipvs/ip_vs_ctl.c | 50 ++++++++++++++- net/netfilter/ipvs/ip_vs_sync.c | 138 +++++++++++++++++++++++++++++++++------- 4 files changed, 172 insertions(+), 24 deletions(-) (limited to 'include/uapi') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 2fdc13caf712..9b9ca87a4210 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -847,8 +847,12 @@ struct ipvs_master_sync_state { #define IP_VS_DEST_TRASH_PERIOD (120 * HZ) struct ipvs_sync_daemon_cfg { + union nf_inet_addr mcast_group; int syncid; u16 sync_maxlen; + u16 mcast_port; + u8 mcast_af; + u8 mcast_ttl; /* multicast interface name */ char mcast_ifn[IP_VS_IFNAME_MAXLEN]; }; diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h index 68377d8c8870..391395c06c7e 100644 --- a/include/uapi/linux/ip_vs.h +++ b/include/uapi/linux/ip_vs.h @@ -407,6 +407,10 @@ enum { IPVS_DAEMON_ATTR_MCAST_IFN, /* multicast interface name */ IPVS_DAEMON_ATTR_SYNC_ID, /* SyncID we belong to */ IPVS_DAEMON_ATTR_SYNC_MAXLEN, /* UDP Payload Size */ + IPVS_DAEMON_ATTR_MCAST_GROUP, /* IPv4 Multicast Address */ + IPVS_DAEMON_ATTR_MCAST_GROUP6, /* IPv6 Multicast Address */ + IPVS_DAEMON_ATTR_MCAST_PORT, /* Multicast Port (base) */ + IPVS_DAEMON_ATTR_MCAST_TTL, /* Multicast TTL */ __IPVS_DAEMON_ATTR_MAX, }; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 96f7bbfd5e1d..1a23e91d50d8 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2819,6 +2819,10 @@ static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = { .len = IP_VS_IFNAME_MAXLEN }, [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 }, [IPVS_DAEMON_ATTR_SYNC_MAXLEN] = { .type = NLA_U16 }, + [IPVS_DAEMON_ATTR_MCAST_GROUP] = { .type = NLA_U32 }, + [IPVS_DAEMON_ATTR_MCAST_GROUP6] = { .len = sizeof(struct 
in6_addr) }, + [IPVS_DAEMON_ATTR_MCAST_PORT] = { .type = NLA_U16 }, + [IPVS_DAEMON_ATTR_MCAST_TTL] = { .type = NLA_U8 }, }; /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */ @@ -3288,8 +3292,21 @@ static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state, if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) || nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, c->mcast_ifn) || nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, c->syncid) || - nla_put_u16(skb, IPVS_DAEMON_ATTR_SYNC_MAXLEN, c->sync_maxlen)) + nla_put_u16(skb, IPVS_DAEMON_ATTR_SYNC_MAXLEN, c->sync_maxlen) || + nla_put_u16(skb, IPVS_DAEMON_ATTR_MCAST_PORT, c->mcast_port) || + nla_put_u8(skb, IPVS_DAEMON_ATTR_MCAST_TTL, c->mcast_ttl)) goto nla_put_failure; +#ifdef CONFIG_IP_VS_IPV6 + if (c->mcast_af == AF_INET6) { + if (nla_put_in6_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP6, + &c->mcast_group.in6)) + goto nla_put_failure; + } else +#endif + if (c->mcast_af == AF_INET && + nla_put_in_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP, + c->mcast_group.ip)) + goto nla_put_failure; nla_nest_end(skb, nl_daemon); return 0; @@ -3370,6 +3387,37 @@ static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs) if (a) c.sync_maxlen = nla_get_u16(a); + a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP]; + if (a) { + c.mcast_af = AF_INET; + c.mcast_group.ip = nla_get_in_addr(a); + if (!ipv4_is_multicast(c.mcast_group.ip)) + return -EINVAL; + } else { + a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP6]; + if (a) { +#ifdef CONFIG_IP_VS_IPV6 + int addr_type; + + c.mcast_af = AF_INET6; + c.mcast_group.in6 = nla_get_in6_addr(a); + addr_type = ipv6_addr_type(&c.mcast_group.in6); + if (!(addr_type & IPV6_ADDR_MULTICAST)) + return -EINVAL; +#else + return -EAFNOSUPPORT; +#endif + } + } + + a = attrs[IPVS_DAEMON_ATTR_MCAST_PORT]; + if (a) + c.mcast_port = nla_get_u16(a); + + a = attrs[IPVS_DAEMON_ATTR_MCAST_TTL]; + if (a) + c.mcast_ttl = nla_get_u8(a); + /* The synchronization protocol is incompatible with mixed family * services */ diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index e68a43421479..43f140950075 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -262,6 +262,11 @@ struct ip_vs_sync_mesg { /* ip_vs_sync_conn entries start here */ }; +union ipvs_sockaddr { + struct sockaddr_in in; + struct sockaddr_in6 in6; +}; + struct ip_vs_sync_buff { struct list_head list; unsigned long firstuse; @@ -1301,6 +1306,14 @@ static void set_mcast_loop(struct sock *sk, u_char loop) /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ lock_sock(sk); inet->mc_loop = loop ? 1 : 0; +#ifdef CONFIG_IP_VS_IPV6 + if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + /* IPV6_MULTICAST_LOOP */ + np->mc_loop = loop ? 
1 : 0; + } +#endif release_sock(sk); } @@ -1314,6 +1327,14 @@ static void set_mcast_ttl(struct sock *sk, u_char ttl) /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ lock_sock(sk); inet->mc_ttl = ttl; +#ifdef CONFIG_IP_VS_IPV6 + if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + /* IPV6_MULTICAST_HOPS */ + np->mcast_hops = ttl; + } +#endif release_sock(sk); } @@ -1325,6 +1346,14 @@ static void set_mcast_pmtudisc(struct sock *sk, int val) /* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */ lock_sock(sk); inet->pmtudisc = val; +#ifdef CONFIG_IP_VS_IPV6 + if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + /* IPV6_MTU_DISCOVER */ + np->pmtudisc = val; + } +#endif release_sock(sk); } @@ -1347,6 +1376,14 @@ static int set_mcast_if(struct sock *sk, char *ifname) lock_sock(sk); inet->mc_index = dev->ifindex; /* inet->mc_addr = 0; */ +#ifdef CONFIG_IP_VS_IPV6 + if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + /* IPV6_MULTICAST_IF */ + np->mcast_oif = dev->ifindex; + } +#endif release_sock(sk); return 0; @@ -1384,6 +1421,27 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) return ret; } +#ifdef CONFIG_IP_VS_IPV6 +static int join_mcast_group6(struct sock *sk, struct in6_addr *addr, + char *ifname) +{ + struct net *net = sock_net(sk); + struct net_device *dev; + int ret; + + dev = __dev_get_by_name(net, ifname); + if (!dev) + return -ENODEV; + if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) + return -EINVAL; + + lock_sock(sk); + ret = ipv6_sock_mc_join(sk, dev->ifindex, addr); + release_sock(sk); + + return ret; +} +#endif static int bind_mcastif_addr(struct socket *sock, char *ifname) { @@ -1412,6 +1470,26 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname) return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); } +static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen, + struct ipvs_sync_daemon_cfg *c, int id) +{ + if (AF_INET6 == c->mcast_af) { + sa->in6 = (struct sockaddr_in6) { + .sin6_family = AF_INET6, + .sin6_port = htons(c->mcast_port + id), + }; + sa->in6.sin6_addr = c->mcast_group.in6; + *salen = sizeof(sa->in6); + } else { + sa->in = (struct sockaddr_in) { + .sin_family = AF_INET, + .sin_port = htons(c->mcast_port + id), + }; + sa->in.sin_addr = c->mcast_group.in; + *salen = sizeof(sa->in); + } +} + /* * Set up sending multicast socket over UDP */ @@ -1419,16 +1497,13 @@ static struct socket *make_send_sock(struct net *net, int id) { struct netns_ipvs *ipvs = net_ipvs(net); /* multicast addr */ - struct sockaddr_in mcast_addr = { - .sin_family = AF_INET, - .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id), - .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), - }; + union ipvs_sockaddr mcast_addr; struct socket *sock; - int result; + int result, salen; /* First create a socket */ - result = sock_create_kern(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + result = sock_create_kern(net, ipvs->mcfg.mcast_af, SOCK_DGRAM, + IPPROTO_UDP, &sock); if (result < 0) { pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); @@ -1440,21 +1515,25 @@ static struct socket *make_send_sock(struct net *net, int id) } set_mcast_loop(sock->sk, 0); - set_mcast_ttl(sock->sk, 1); + set_mcast_ttl(sock->sk, ipvs->mcfg.mcast_ttl); /* Allow fragmentation if MTU changes */ set_mcast_pmtudisc(sock->sk, IP_PMTUDISC_DONT); result = sysctl_sync_sock_size(ipvs); if (result > 0) set_sock_size(sock->sk, 1, result); 
- result = bind_mcastif_addr(sock, ipvs->mcfg.mcast_ifn); + if (AF_INET == ipvs->mcfg.mcast_af) + result = bind_mcastif_addr(sock, ipvs->mcfg.mcast_ifn); + else + result = 0; if (result < 0) { pr_err("Error binding address of the mcast interface\n"); goto error; } + get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id); result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, - sizeof(struct sockaddr), 0); + salen, 0); if (result < 0) { pr_err("Error connecting to the multicast addr\n"); goto error; @@ -1475,16 +1554,13 @@ static struct socket *make_receive_sock(struct net *net, int id) { struct netns_ipvs *ipvs = net_ipvs(net); /* multicast addr */ - struct sockaddr_in mcast_addr = { - .sin_family = AF_INET, - .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id), - .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), - }; + union ipvs_sockaddr mcast_addr; struct socket *sock; - int result; + int result, salen; /* First create a socket */ - result = sock_create_kern(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + result = sock_create_kern(net, ipvs->bcfg.mcast_af, SOCK_DGRAM, + IPPROTO_UDP, &sock); if (result < 0) { pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); @@ -1495,17 +1571,22 @@ static struct socket *make_receive_sock(struct net *net, int id) if (result > 0) set_sock_size(sock->sk, 0, result); - result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr, - sizeof(struct sockaddr)); + get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id); + result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen); if (result < 0) { pr_err("Error binding to the multicast addr\n"); goto error; } /* join the multicast group */ - result = join_mcast_group(sock->sk, - (struct in_addr *) &mcast_addr.sin_addr, - ipvs->bcfg.mcast_ifn); +#ifdef CONFIG_IP_VS_IPV6 + if (ipvs->bcfg.mcast_af == AF_INET6) + result = join_mcast_group6(sock->sk, &mcast_addr.in6.sin6_addr, + ipvs->bcfg.mcast_ifn); + else +#endif + result = join_mcast_group(sock->sk, &mcast_addr.in.sin_addr, + ipvs->bcfg.mcast_ifn); if (result < 0) { pr_err("Error joining to the multicast group\n"); goto error; @@ -1721,12 +1802,23 @@ int start_sync_thread(struct net *net, struct ipvs_sync_daemon_cfg *c, } else count = ipvs->threads_mask + 1; + if (c->mcast_af == AF_UNSPEC) { + c->mcast_af = AF_INET; + c->mcast_group.ip = cpu_to_be32(IP_VS_SYNC_GROUP); + } + if (!c->mcast_port) + c->mcast_port = IP_VS_SYNC_PORT; + if (!c->mcast_ttl) + c->mcast_ttl = 1; + dev = __dev_get_by_name(net, c->mcast_ifn); if (!dev) { pr_err("Unknown mcast interface: %s\n", c->mcast_ifn); return -ENODEV; } - hlen = sizeof(struct iphdr) + sizeof(struct udphdr); + hlen = (AF_INET6 == c->mcast_af) ? + sizeof(struct ipv6hdr) + sizeof(struct udphdr) : + sizeof(struct iphdr) + sizeof(struct udphdr); mtu = (state == IP_VS_STATE_BACKUP) ? clamp(dev->mtu, 1500U, 65535U) : 1500U; min_mtu = (state == IP_VS_STATE_BACKUP) ? 1024 : 1; -- cgit v1.2.3 From b96f465035f9fae83c1d8de3e80eecfe6877608c Mon Sep 17 00:00:00 2001 From: David Teigland Date: Tue, 25 Aug 2015 12:51:44 -0500 Subject: dlm: fix lvb copy for user locks For a userland lock request, the previous and current lock modes are used to decide when the lvb should be copied back to the user. The wrong previous value was used, so that it always matched the current value. This caused the lvb to be copied back to the user in the wrong cases. 
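Condensed, this is a read-after-side-effect ordering bug. A self-contained toy model in plain user-space C (the types below are stand-ins, not the real dlm structures) shows why sampling lkb_last_cast.mode only after dlm_rem_lkb_callback() had run always produced old_mode == new_mode:

#include <stdio.h>

struct cast { int mode; };
struct toy_lkb { struct cast lkb_last_cast; }; /* stand-in for struct dlm_lkb */

/* Models dlm_rem_lkb_callback(): hands back the queued callback and, as a
 * side effect, installs it as the new lkb_last_cast. */
static void toy_rem_callback(struct toy_lkb *lkb, struct cast *cb)
{
	cb->mode = 5;			/* the newly granted mode */
	lkb->lkb_last_cast = *cb;	/* the side effect behind the bug */
}

int main(void)
{
	struct toy_lkb lkb = { .lkb_last_cast = { .mode = 3 } };
	struct cast cb;
	int old_mode, new_mode;

	old_mode = lkb.lkb_last_cast.mode;	/* the fix: sample before the call */
	toy_rem_callback(&lkb, &cb);
	new_mode = cb.mode;

	/* Before the fix, old_mode was read here instead, so it always equalled
	 * new_mode and the old-vs-new comparison gating the lvb copy never saw
	 * a real mode transition. */
	printf("old=%d new=%d\n", old_mode, new_mode);
	return 0;
}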
Signed-off-by: David Teigland --- fs/dlm/user.c | 7 ++++--- include/uapi/linux/dlm_device.h | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/fs/dlm/user.c b/fs/dlm/user.c index fb85f32e9eca..35960502ec8d 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -785,6 +785,7 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, DECLARE_WAITQUEUE(wait, current); struct dlm_callback cb; int rv, resid, copy_lvb = 0; + int old_mode, new_mode; if (count == sizeof(struct dlm_device_version)) { rv = copy_version_to_user(buf, count); @@ -841,6 +842,9 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_cb_list); + /* rem_lkb_callback sets a new lkb_last_cast */ + old_mode = lkb->lkb_last_cast.mode; + rv = dlm_rem_lkb_callback(lkb->lkb_resource->res_ls, lkb, &cb, &resid); if (rv < 0) { /* this shouldn't happen; lkb should have been removed from @@ -864,9 +868,6 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, } if (cb.flags & DLM_CB_CAST) { - int old_mode, new_mode; - - old_mode = lkb->lkb_last_cast.mode; new_mode = cb.mode; if (!cb.sb_status && lkb->lkb_lksb->sb_lvbptr && diff --git a/include/uapi/linux/dlm_device.h b/include/uapi/linux/dlm_device.h index 3060783c4191..df56c8ff0769 100644 --- a/include/uapi/linux/dlm_device.h +++ b/include/uapi/linux/dlm_device.h @@ -26,7 +26,7 @@ /* Version of the device interface */ #define DLM_DEVICE_VERSION_MAJOR 6 #define DLM_DEVICE_VERSION_MINOR 0 -#define DLM_DEVICE_VERSION_PATCH 1 +#define DLM_DEVICE_VERSION_PATCH 2 /* struct passed to the lock write */ struct dlm_lock_params { -- cgit v1.2.3 From 1afe839e6b31a85fc53adbf8757d6373908d414d Mon Sep 17 00:00:00 2001 From: Andreas Herz Date: Fri, 21 Aug 2015 11:31:32 +0200 Subject: netfilter: ip6t_REJECT: added missing icmpv6 codes RFC 4443 added two new code values for ICMPv6 type 1: 5 - Source address failed ingress/egress policy 6 - Reject route to destination And RFC 7084 states in L-14 that an IPv6 router MUST send an ICMPv6 Destination Unreachable with code 5 for packets forwarded to it that use an address from a prefix that has been invalidated. Codes 5 and 6 are more informative subsets of code 1.
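For illustration, rule-building user space carries the chosen reject type in the target's uapi structure. A minimal sketch using only names this patch introduces or already ships (the ip6tables command-line spelling for these codes is outside the patch's scope):

#include <stdio.h>
#include <linux/netfilter_ipv6/ip6t_REJECT.h>

int main(void)
{
	/* ICMPv6 type 1, code 5 and code 6 respectively, per the mapping
	 * added to reject_tg6() below. */
	struct ip6t_reject_info policy_fail = { .with = IP6T_ICMP6_POLICY_FAIL };
	struct ip6t_reject_info reject_route = { .with = IP6T_ICMP6_REJECT_ROUTE };

	printf("policy fail -> %u, reject route -> %u\n",
	       policy_fail.with, reject_route.with);
	return 0;
}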
Signed-off-by: Andreas Herz Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter_ipv6/ip6t_REJECT.h | 4 +++- net/ipv6/netfilter/ip6t_REJECT.c | 6 ++++++ 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter_ipv6/ip6t_REJECT.h b/include/uapi/linux/netfilter_ipv6/ip6t_REJECT.h index 205ed62e4605..cd2e940c8bf5 100644 --- a/include/uapi/linux/netfilter_ipv6/ip6t_REJECT.h +++ b/include/uapi/linux/netfilter_ipv6/ip6t_REJECT.h @@ -10,7 +10,9 @@ enum ip6t_reject_with { IP6T_ICMP6_ADDR_UNREACH, IP6T_ICMP6_PORT_UNREACH, IP6T_ICMP6_ECHOREPLY, - IP6T_TCP_RESET + IP6T_TCP_RESET, + IP6T_ICMP6_POLICY_FAIL, + IP6T_ICMP6_REJECT_ROUTE }; struct ip6t_reject_info { diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c index 567367a75172..0ed841a3fa33 100644 --- a/net/ipv6/netfilter/ip6t_REJECT.c +++ b/net/ipv6/netfilter/ip6t_REJECT.c @@ -63,6 +63,12 @@ reject_tg6(struct sk_buff *skb, const struct xt_action_param *par) case IP6T_TCP_RESET: nf_send_reset6(net, skb, par->hooknum); break; + case IP6T_ICMP6_POLICY_FAIL: + nf_send_unreach6(net, skb, ICMPV6_POLICY_FAIL, par->hooknum); + break; + case IP6T_ICMP6_REJECT_ROUTE: + nf_send_unreach6(net, skb, ICMPV6_REJECT_ROUTE, par->hooknum); + break; } return NF_DROP; -- cgit v1.2.3 From 65be2c79acc3aa0f9c0e8d4871f5a451d854465a Mon Sep 17 00:00:00 2001 From: "Matthew R. Ochs" Date: Thu, 13 Aug 2015 21:47:43 -0500 Subject: cxlflash: Superpipe support Add superpipe supporting infrastructure to device driver for the IBM CXL Flash adapter. This patch allows userspace applications to take advantage of the accelerated I/O features that this adapter provides and bypass the traditional filesystem stack. Signed-off-by: Matthew R. Ochs Signed-off-by: Manoj N. 
Kumar Reviewed-by: Michael Neuling Reviewed-by: Wen Xiong Reviewed-by: Brian King Signed-off-by: James Bottomley --- Documentation/ioctl/ioctl-number.txt | 1 + Documentation/powerpc/cxlflash.txt | 257 +++++ drivers/scsi/cxlflash/Makefile | 2 +- drivers/scsi/cxlflash/common.h | 19 + drivers/scsi/cxlflash/lunmgt.c | 263 +++++ drivers/scsi/cxlflash/main.c | 38 +- drivers/scsi/cxlflash/sislite.h | 5 +- drivers/scsi/cxlflash/superpipe.c | 2014 ++++++++++++++++++++++++++++++++++ drivers/scsi/cxlflash/superpipe.h | 132 +++ include/uapi/scsi/Kbuild | 1 + include/uapi/scsi/cxlflash_ioctl.h | 140 +++ 11 files changed, 2868 insertions(+), 4 deletions(-) create mode 100644 Documentation/powerpc/cxlflash.txt create mode 100644 drivers/scsi/cxlflash/lunmgt.c create mode 100644 drivers/scsi/cxlflash/superpipe.c create mode 100644 drivers/scsi/cxlflash/superpipe.h create mode 100644 include/uapi/scsi/cxlflash_ioctl.h (limited to 'include/uapi') diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index 611c52267d24..9bd118d26a8a 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -314,6 +314,7 @@ Code Seq#(hex) Include File Comments 0xB3 00 linux/mmc/ioctl.h 0xC0 00-0F linux/usb/iowarrior.h 0xCA 00-0F uapi/misc/cxl.h +0xCA 80-8F uapi/scsi/cxlflash_ioctl.h 0xCB 00-1F CBM serial IEC bus in development: 0xCD 01 linux/reiserfs_fs.h diff --git a/Documentation/powerpc/cxlflash.txt b/Documentation/powerpc/cxlflash.txt new file mode 100644 index 000000000000..f943967f90ce --- /dev/null +++ b/Documentation/powerpc/cxlflash.txt @@ -0,0 +1,257 @@ +Introduction +============ + + The IBM Power architecture provides support for CAPI (Coherent + Accelerator Power Interface), which is available to certain PCIe slots + on Power 8 systems. CAPI can be thought of as a special tunneling + protocol through PCIe that allow PCIe adapters to look like special + purpose co-processors which can read or write an application's + memory and generate page faults. As a result, the host interface to + an adapter running in CAPI mode does not require the data buffers to + be mapped to the device's memory (IOMMU bypass) nor does it require + memory to be pinned. + + On Linux, Coherent Accelerator (CXL) kernel services present CAPI + devices as a PCI device by implementing a virtual PCI host bridge. + This abstraction simplifies the infrastructure and programming + model, allowing for drivers to look similar to other native PCI + device drivers. + + CXL provides a mechanism by which user space applications can + directly talk to a device (network or storage) bypassing the typical + kernel/device driver stack. The CXL Flash Adapter Driver enables a + user space application direct access to Flash storage. + + The CXL Flash Adapter Driver is a kernel module that sits in the + SCSI stack as a low level device driver (below the SCSI disk and + protocol drivers) for the IBM CXL Flash Adapter. This driver is + responsible for the initialization of the adapter, setting up the + special path for user space access, and performing error recovery. It + communicates directly the Flash Accelerator Functional Unit (AFU) + as described in Documentation/powerpc/cxl.txt. + + The cxlflash driver supports two, mutually exclusive, modes of + operation at the device (LUN) level: + + - Any flash device (LUN) can be configured to be accessed as a + regular disk device (i.e.: /dev/sdc). This is the default mode. 
+ + - Any flash device (LUN) can be configured to be accessed from + user space with a special block library. This mode further + specifies the means of accessing the device and provides for + either raw access to the entire LUN (referred to as direct + or physical LUN access) or access to a kernel/AFU-mediated + partition of the LUN (referred to as virtual LUN access). The + segmentation of a disk device into virtual LUNs is assisted + by special translation services provided by the Flash AFU. + +Overview +======== + + The Coherent Accelerator Interface Architecture (CAIA) introduces a + concept of a master context. A master typically has special privileges + granted to it by the kernel or hypervisor allowing it to perform AFU + wide management and control. The master may or may not be involved + directly in each user I/O, but at the minimum is involved in the + initial setup before the user application is allowed to send requests + directly to the AFU. + + The CXL Flash Adapter Driver establishes a master context with the + AFU. It uses memory mapped I/O (MMIO) for this control and setup. The + Adapter Problem Space Memory Map looks like this: + + +-------------------------------+ + | 512 * 64 KB User MMIO | + | (per context) | + | User Accessible | + +-------------------------------+ + | 512 * 128 B per context | + | Provisioning and Control | + | Trusted Process accessible | + +-------------------------------+ + | 64 KB Global | + | Trusted Process accessible | + +-------------------------------+ + + This driver configures itself into the SCSI software stack as an + adapter driver. The driver is the only entity that is considered a + Trusted Process to program the Provisioning and Control and Global + areas in the MMIO Space shown above. The master context driver + discovers all LUNs attached to the CXL Flash adapter and instantiates + scsi block devices (/dev/sdb, /dev/sdc etc.) for each unique LUN + seen from each path. + + Once these scsi block devices are instantiated, an application + written to a specification provided by the block library may get + access to the Flash from user space (without requiring a system call). + + This master context driver also provides a series of ioctls for this + block library to enable this user space access. The driver supports + two modes for accessing the block device. + + The first mode is called a virtual mode. In this mode a single scsi + block device (/dev/sdb) may be carved up into any number of distinct + virtual LUNs. The virtual LUNs may be resized as long as the sum of + the sizes of all the virtual LUNs, along with the meta-data associated + with it does not exceed the physical capacity. + + The second mode is called the physical mode. In this mode a single + block device (/dev/sdb) may be opened directly by the block library + and the entire space for the LUN is available to the application. + + Only the physical mode provides persistence of the data. i.e. The + data written to the block device will survive application exit and + restart and also reboot. The virtual LUNs do not persist (i.e. do + not survive after the application terminates or the system reboots). + + +Block library API +================= + + Applications intending to get access to the CXL Flash from user + space should use the block library, as it abstracts the details of + interfacing directly with the cxlflash driver that are necessary for + performing administrative actions (i.e.: setup, tear down, resize). 
+ The block library can be thought of as a 'user' of services, + implemented as IOCTLs, that are provided by the cxlflash driver + specifically for devices (LUNs) operating in user space access + mode. While it is not a requirement that applications understand + the interface between the block library and the cxlflash driver, + a high-level overview of each supported service (IOCTL) is provided + below. + + The block library can be found on GitHub: + http://www.github.com/mikehollinger/ibmcapikv + + +CXL Flash Driver IOCTLs +======================= + + Users, such as the block library, that wish to interface with a flash + device (LUN) via user space access need to use the services provided + by the cxlflash driver. As these services are implemented as ioctls, + a file descriptor handle must first be obtained in order to establish + the communication channel between a user and the kernel. This file + descriptor is obtained by opening the device special file associated + with the scsi disk device (/dev/sdb) that was created during LUN + discovery. As per the location of the cxlflash driver within the + SCSI protocol stack, this open is actually not seen by the cxlflash + driver. Upon successful open, the user receives a file descriptor + (herein referred to as fd1) that should be used for issuing the + subsequent ioctls listed below. + + The structure definitions for these IOCTLs are available in: + uapi/scsi/cxlflash_ioctl.h + +DK_CXLFLASH_ATTACH +------------------ + + This ioctl obtains, initializes, and starts a context using the CXL + kernel services. These services specify a context id (u16) by which + to uniquely identify the context and its allocated resources. The + services additionally provide a second file descriptor (herein + referred to as fd2) that is used by the block library to initiate + memory mapped I/O (via mmap()) to the CXL flash device and poll for + completion events. This file descriptor is intentionally installed by + this driver and not the CXL kernel services to allow for intermediary + notification and access in the event of a non-user-initiated close(), + such as a killed process. This design point is described in further + detail in the description for the DK_CXLFLASH_DETACH ioctl. + + There are a few important aspects regarding the "tokens" (context id + and fd2) that are provided back to the user: + + - These tokens are only valid for the process under which they + were created. The child of a forked process cannot continue + to use the context id or file descriptor created by its parent. + + - These tokens are only valid for the lifetime of the context and + the process under which they were created. Once either is + destroyed, the tokens are to be considered stale and subsequent + usage will result in errors. + + - When a context is no longer needed, the user shall detach from + the context via the DK_CXLFLASH_DETACH ioctl. + + - A close on fd2 will invalidate the tokens. This operation is not + required by the user. + +DK_CXLFLASH_USER_DIRECT +----------------------- + This ioctl is responsible for transitioning the LUN to direct + (physical) mode access and configuring the AFU for direct access from + user space on a per-context basis. Additionally, the block size and + last logical block address (LBA) are returned to the user. + + As mentioned previously, when operating in user space access mode, + LUNs may be accessed in whole or in part. 
Only one mode is allowed + at a time and if one mode is active (outstanding references exist), + requests to use the LUN in a different mode are denied. + + The AFU is configured for direct access from user space by adding an + entry to the AFU's resource handle table. The index of the entry is + treated as a resource handle that is returned to the user. The user + is then able to use the handle to reference the LUN during I/O. + +DK_CXLFLASH_RELEASE +------------------- + This ioctl is responsible for releasing a previously obtained + reference to either a physical or virtual LUN. This can be + thought of as the inverse of the DK_CXLFLASH_USER_DIRECT or + DK_CXLFLASH_USER_VIRTUAL ioctls. Upon success, the resource handle + is no longer valid and the entry in the resource handle table is + made available to be used again. + + As part of the release process for virtual LUNs, the virtual LUN + is first resized to 0 to clear out and free the translation tables + associated with the virtual LUN reference. + +DK_CXLFLASH_DETACH +------------------ + This ioctl is responsible for unregistering a context with the + cxlflash driver and release outstanding resources that were + not explicitly released via the DK_CXLFLASH_RELEASE ioctl. Upon + success, all "tokens" which had been provided to the user from the + DK_CXLFLASH_ATTACH onward are no longer valid. + +DK_CXLFLASH_VERIFY +------------------ + This ioctl is used to detect various changes such as the capacity of + the disk changing, the number of LUNs visible changing, etc. In cases + where the changes affect the application (such as a LUN resize), the + cxlflash driver will report the changed state to the application. + + The user calls in when they want to validate that a LUN hasn't been + changed in response to a check condition. As the user is operating out + of band from the kernel, they will see these types of events without + the kernel's knowledge. When encountered, the user's architected + behavior is to call in to this ioctl, indicating what they want to + verify and passing along any appropriate information. For now, only + verifying a LUN change (ie: size different) with sense data is + supported. + +DK_CXLFLASH_RECOVER_AFU +----------------------- + This ioctl is used to drive recovery (if such an action is warranted) + of a specified user context. Any state associated with the user context + is re-established upon successful recovery. + + User contexts are put into an error condition when the device needs to + be reset or is terminating. Users are notified of this error condition + by seeing all 0xF's on an MMIO read. Upon encountering this, the + architected behavior for a user is to call into this ioctl to recover + their context. A user may also call into this ioctl at any time to + check if the device is operating normally. If a failure is returned + from this ioctl, the user is expected to gracefully clean up their + context via release/detach ioctls. Until they do, the context they + hold is not relinquished. The user may also optionally exit the process + at which time the context/resources they held will be freed as part of + the release fop. + +DK_CXLFLASH_MANAGE_LUN +---------------------- + This ioctl is used to switch a LUN from a mode where it is available + for file-system access (legacy), to a mode where it is set aside for + exclusive user space access (superpipe). In case a LUN is visible + across multiple ports and adapters, this ioctl is used to uniquely + identify each LUN by its World Wide Node Name (WWNN). 
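To make the plumbing concrete, a user might flip a LUN into superpipe mode through the manage ioctl roughly as follows. This is a sketch, not the block library: the full layouts of struct dk_cxlflash_manage_lun and its embedded hdr live in uapi/scsi/cxlflash_ioctl.h (not reproduced in this excerpt), and only the field, flag, and length names referenced by lunmgt.c below are assumed here.

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <scsi/cxlflash_ioctl.h>	/* uapi header added by this patch */

int main(void)
{
	unsigned char wwid[DK_CXLFLASH_MANAGE_LUN_WWID_LEN] = { 0 }; /* real LUN WWID goes here */
	struct dk_cxlflash_manage_lun manage;
	int fd1;

	/* fd1 comes from opening the scsi disk special file created during
	 * LUN discovery, as described above. */
	fd1 = open("/dev/sdb", O_RDWR);
	if (fd1 < 0) {
		perror("open");
		return 1;
	}

	memset(&manage, 0, sizeof(manage));	/* any hdr setup beyond flags is per the uapi header */
	memcpy(manage.wwid, wwid, sizeof(wwid));
	manage.hdr.flags = DK_CXLFLASH_MANAGE_LUN_ENABLE_SUPERPIPE;

	if (ioctl(fd1, DK_CXLFLASH_MANAGE_LUN, &manage) < 0)
		perror("DK_CXLFLASH_MANAGE_LUN");

	close(fd1);
	return 0;
}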
diff --git a/drivers/scsi/cxlflash/Makefile b/drivers/scsi/cxlflash/Makefile index dc95e203e3af..c14d24c720d6 100644 --- a/drivers/scsi/cxlflash/Makefile +++ b/drivers/scsi/cxlflash/Makefile @@ -1,2 +1,2 @@ obj-$(CONFIG_CXLFLASH) += cxlflash.o -cxlflash-y += main.o +cxlflash-y += main.o superpipe.o lunmgt.o diff --git a/drivers/scsi/cxlflash/common.h b/drivers/scsi/cxlflash/common.h index ffdbc572d180..d3e54e61c7a5 100644 --- a/drivers/scsi/cxlflash/common.h +++ b/drivers/scsi/cxlflash/common.h @@ -107,6 +107,17 @@ struct cxlflash_cfg { struct pci_pool *cxlflash_cmd_pool; struct pci_dev *parent_dev; + atomic_t recovery_threads; + struct mutex ctx_recovery_mutex; + struct mutex ctx_tbl_list_mutex; + struct ctx_info *ctx_tbl[MAX_CONTEXT]; + struct list_head ctx_err_recovery; /* contexts w/ recovery pending */ + struct file_operations cxl_fops; + + atomic_t num_user_contexts; + + struct list_head lluns; /* list of llun_info structs */ + wait_queue_head_t tmf_waitq; bool tmf_active; wait_queue_head_t limbo_waitq; @@ -182,4 +193,12 @@ int cxlflash_afu_reset(struct cxlflash_cfg *); struct afu_cmd *cxlflash_cmd_checkout(struct afu *); void cxlflash_cmd_checkin(struct afu_cmd *); int cxlflash_afu_sync(struct afu *, ctx_hndl_t, res_hndl_t, u8); +void cxlflash_list_init(void); +void cxlflash_term_global_luns(void); +void cxlflash_free_errpage(void); +int cxlflash_ioctl(struct scsi_device *, int, void __user *); +void cxlflash_stop_term_user_contexts(struct cxlflash_cfg *); +int cxlflash_mark_contexts_error(struct cxlflash_cfg *); +void cxlflash_term_local_luns(struct cxlflash_cfg *); + #endif /* ifndef _CXLFLASH_COMMON_H */ diff --git a/drivers/scsi/cxlflash/lunmgt.c b/drivers/scsi/cxlflash/lunmgt.c new file mode 100644 index 000000000000..66d5bef11ee6 --- /dev/null +++ b/drivers/scsi/cxlflash/lunmgt.c @@ -0,0 +1,263 @@ +/* + * CXL Flash Device Driver + * + * Written by: Manoj N. Kumar , IBM Corporation + * Matthew R. Ochs , IBM Corporation + * + * Copyright (C) 2015 IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include + +#include +#include + +#include "sislite.h" +#include "common.h" +#include "superpipe.h" + +/** + * create_local() - allocate and initialize a local LUN information structure + * @sdev: SCSI device associated with LUN. + * @wwid: World Wide Node Name for LUN. + * + * Return: Allocated local llun_info structure on success, NULL on failure + */ +static struct llun_info *create_local(struct scsi_device *sdev, u8 *wwid) +{ + struct llun_info *lli = NULL; + + lli = kzalloc(sizeof(*lli), GFP_KERNEL); + if (unlikely(!lli)) { + pr_err("%s: could not allocate lli\n", __func__); + goto out; + } + + lli->sdev = sdev; + lli->newly_created = true; + lli->host_no = sdev->host->host_no; + + memcpy(lli->wwid, wwid, DK_CXLFLASH_MANAGE_LUN_WWID_LEN); +out: + return lli; +} + +/** + * create_global() - allocate and initialize a global LUN information structure + * @sdev: SCSI device associated with LUN. + * @wwid: World Wide Node Name for LUN. 
+ *
+ * Return: Allocated global glun_info structure on success, NULL on failure
+ */
+static struct glun_info *create_global(struct scsi_device *sdev, u8 *wwid)
+{
+	struct glun_info *gli = NULL;
+
+	gli = kzalloc(sizeof(*gli), GFP_KERNEL);
+	if (unlikely(!gli)) {
+		pr_err("%s: could not allocate gli\n", __func__);
+		goto out;
+	}
+
+	mutex_init(&gli->mutex);
+	memcpy(gli->wwid, wwid, DK_CXLFLASH_MANAGE_LUN_WWID_LEN);
+out:
+	return gli;
+}
+
+/**
+ * refresh_local() - find and update local LUN information structure by WWID
+ * @cfg:	Internal structure associated with the host.
+ * @wwid:	WWID associated with LUN.
+ *
+ * When a LUN with a matching WWID is found in the list, refresh its state
+ * and mark it by clearing its newly_created field.
+ *
+ * Return: Found local llun_info structure on success, NULL on failure
+ */
+static struct llun_info *refresh_local(struct cxlflash_cfg *cfg, u8 *wwid)
+{
+	struct llun_info *lli, *temp;
+
+	list_for_each_entry_safe(lli, temp, &cfg->lluns, list)
+		if (!memcmp(lli->wwid, wwid, DK_CXLFLASH_MANAGE_LUN_WWID_LEN)) {
+			lli->newly_created = false;
+			return lli;
+		}
+
+	return NULL;
+}
+
+/**
+ * lookup_global() - find a global LUN information structure by WWID
+ * @wwid:	WWID associated with LUN.
+ *
+ * Return: Found global lun_info structure on success, NULL on failure
+ */
+static struct glun_info *lookup_global(u8 *wwid)
+{
+	struct glun_info *gli, *temp;
+
+	list_for_each_entry_safe(gli, temp, &global.gluns, list)
+		if (!memcmp(gli->wwid, wwid, DK_CXLFLASH_MANAGE_LUN_WWID_LEN))
+			return gli;
+
+	return NULL;
+}
+
+/**
+ * find_and_create_lun() - find or create a local LUN information structure
+ * @sdev:	SCSI device associated with LUN.
+ * @wwid:	WWID associated with LUN.
+ *
+ * The LUN is kept both in a local list (per adapter) and in a global list
+ * (across all adapters). Certain attributes of the LUN are local to the
+ * adapter (such as index, port selection mask, etc.). The block allocation
+ * map is shared across all adapters (i.e. associated with the global list).
+ * Since different attributes are associated with the per-adapter and global
+ * entries, allocate two separate structures for each LUN (one local, one
+ * global).
+ *
+ * Keep a pointer back from the local to the global entry.
+ *
+ * Return: Found/Allocated local lun_info structure on success, NULL on failure
+ */
+static struct llun_info *find_and_create_lun(struct scsi_device *sdev, u8 *wwid)
+{
+	struct llun_info *lli = NULL;
+	struct glun_info *gli = NULL;
+	struct Scsi_Host *shost = sdev->host;
+	struct cxlflash_cfg *cfg = shost_priv(shost);
+
+	mutex_lock(&global.mutex);
+	if (unlikely(!wwid))
+		goto out;
+
+	lli = refresh_local(cfg, wwid);
+	if (lli)
+		goto out;
+
+	lli = create_local(sdev, wwid);
+	if (unlikely(!lli))
+		goto out;
+
+	gli = lookup_global(wwid);
+	if (gli) {
+		lli->parent = gli;
+		list_add(&lli->list, &cfg->lluns);
+		goto out;
+	}
+
+	gli = create_global(sdev, wwid);
+	if (unlikely(!gli)) {
+		kfree(lli);
+		lli = NULL;
+		goto out;
+	}
+
+	lli->parent = gli;
+	list_add(&lli->list, &cfg->lluns);
+
+	list_add(&gli->list, &global.gluns);
+
+out:
+	mutex_unlock(&global.mutex);
+	pr_debug("%s: returning %p\n", __func__, lli);
+	return lli;
+}
+
+/**
+ * cxlflash_term_local_luns() - deletes all entries from the local LUN list
+ * @cfg:	Internal structure associated with the host.
+ */ +void cxlflash_term_local_luns(struct cxlflash_cfg *cfg) +{ + struct llun_info *lli, *temp; + + mutex_lock(&global.mutex); + list_for_each_entry_safe(lli, temp, &cfg->lluns, list) { + list_del(&lli->list); + kfree(lli); + } + mutex_unlock(&global.mutex); +} + +/** + * cxlflash_list_init() - initializes the global LUN list + */ +void cxlflash_list_init(void) +{ + INIT_LIST_HEAD(&global.gluns); + mutex_init(&global.mutex); + global.err_page = NULL; +} + +/** + * cxlflash_term_global_luns() - frees resources associated with global LUN list + */ +void cxlflash_term_global_luns(void) +{ + struct glun_info *gli, *temp; + + mutex_lock(&global.mutex); + list_for_each_entry_safe(gli, temp, &global.gluns, list) { + list_del(&gli->list); + kfree(gli); + } + mutex_unlock(&global.mutex); +} + +/** + * cxlflash_manage_lun() - handles LUN management activities + * @sdev: SCSI device associated with LUN. + * @manage: Manage ioctl data structure. + * + * This routine is used to notify the driver about a LUN's WWID and associate + * SCSI devices (sdev) with a global LUN instance. Additionally it serves to + * change a LUN's operating mode: legacy or superpipe. + * + * Return: 0 on success, -errno on failure + */ +int cxlflash_manage_lun(struct scsi_device *sdev, + struct dk_cxlflash_manage_lun *manage) +{ + int rc = 0; + struct llun_info *lli = NULL; + u64 flags = manage->hdr.flags; + u32 chan = sdev->channel; + + lli = find_and_create_lun(sdev, manage->wwid); + pr_debug("%s: ENTER: WWID = %016llX%016llX, flags = %016llX li = %p\n", + __func__, get_unaligned_le64(&manage->wwid[0]), + get_unaligned_le64(&manage->wwid[8]), + manage->hdr.flags, lli); + if (unlikely(!lli)) { + rc = -ENOMEM; + goto out; + } + + if (flags & DK_CXLFLASH_MANAGE_LUN_ENABLE_SUPERPIPE) { + if (lli->newly_created) + lli->port_sel = CHAN2PORT(chan); + else + lli->port_sel = BOTH_PORTS; + /* Store off lun in unpacked, AFU-friendly format */ + lli->lun_id[chan] = lun_to_lunid(sdev->lun); + sdev->hostdata = lli; + } else if (flags & DK_CXLFLASH_MANAGE_LUN_DISABLE_SUPERPIPE) { + if (lli->parent->mode != MODE_NONE) + rc = -EBUSY; + else + sdev->hostdata = NULL; + } + +out: + pr_debug("%s: returning rc=%d\n", __func__, rc); + return rc; +} diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c index 3ae8dca236ef..02d464f41b7f 100644 --- a/drivers/scsi/cxlflash/main.c +++ b/drivers/scsi/cxlflash/main.c @@ -23,6 +23,7 @@ #include #include +#include #include "main.h" #include "sislite.h" @@ -519,7 +520,7 @@ static int cxlflash_eh_host_reset_handler(struct scsi_cmnd *scp) case STATE_NORMAL: cfg->state = STATE_LIMBO; scsi_block_requests(cfg->host); - + cxlflash_mark_contexts_error(cfg); rcr = cxlflash_afu_reset(cfg); if (rcr) { rc = FAILED; @@ -662,6 +663,21 @@ static ssize_t cxlflash_store_lun_mode(struct device *dev, return count; } +/** + * cxlflash_show_ioctl_version() - presents the current ioctl version of the host + * @dev: Generic device associated with the host. + * @attr: Device attribute representing the ioctl version. + * @buf: Buffer of length PAGE_SIZE to report back the ioctl version. + * + * Return: The size of the ASCII string returned in @buf. + */ +static ssize_t cxlflash_show_ioctl_version(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%u\n", DK_CXLFLASH_VERSION_0); +} + /** * cxlflash_show_dev_mode() - presents the current mode of the device * @dev: Generic device associated with the device. 
@@ -700,11 +716,13 @@ static DEVICE_ATTR(port0, S_IRUGO, cxlflash_show_port_status, NULL); static DEVICE_ATTR(port1, S_IRUGO, cxlflash_show_port_status, NULL); static DEVICE_ATTR(lun_mode, S_IRUGO | S_IWUSR, cxlflash_show_lun_mode, cxlflash_store_lun_mode); +static DEVICE_ATTR(ioctl_version, S_IRUGO, cxlflash_show_ioctl_version, NULL); static struct device_attribute *cxlflash_host_attrs[] = { &dev_attr_port0, &dev_attr_port1, &dev_attr_lun_mode, + &dev_attr_ioctl_version, NULL }; @@ -725,6 +743,7 @@ static struct scsi_host_template driver_template = { .module = THIS_MODULE, .name = CXLFLASH_ADAPTER_NAME, .info = cxlflash_driver_info, + .ioctl = cxlflash_ioctl, .proc_name = CXLFLASH_NAME, .queuecommand = cxlflash_queuecommand, .eh_device_reset_handler = cxlflash_eh_device_reset_handler, @@ -872,9 +891,11 @@ static void cxlflash_remove(struct pci_dev *pdev) spin_unlock_irqrestore(&cfg->tmf_waitq.lock, lock_flags); cfg->state = STATE_FAILTERM; + cxlflash_stop_term_user_contexts(cfg); switch (cfg->init_state) { case INIT_STATE_SCSI: + cxlflash_term_local_luns(cfg); scsi_remove_host(cfg->host); scsi_host_put(cfg->host); /* Fall through */ @@ -2274,6 +2295,10 @@ static int cxlflash_probe(struct pci_dev *pdev, INIT_WORK(&cfg->work_q, cxlflash_worker_thread); cfg->lr_state = LINK_RESET_INVALID; cfg->lr_port = -1; + mutex_init(&cfg->ctx_tbl_list_mutex); + mutex_init(&cfg->ctx_recovery_mutex); + INIT_LIST_HEAD(&cfg->ctx_err_recovery); + INIT_LIST_HEAD(&cfg->lluns); pci_set_drvdata(pdev, cfg); @@ -2335,6 +2360,7 @@ out_remove: static pci_ers_result_t cxlflash_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) { + int rc = 0; struct cxlflash_cfg *cfg = pci_get_drvdata(pdev); struct device *dev = &cfg->dev->dev; @@ -2346,7 +2372,10 @@ static pci_ers_result_t cxlflash_pci_error_detected(struct pci_dev *pdev, /* Turn off legacy I/O */ scsi_block_requests(cfg->host); - + rc = cxlflash_mark_contexts_error(cfg); + if (unlikely(rc)) + dev_err(dev, "%s: Failed to mark user contexts!(%d)\n", + __func__, rc); term_mc(cfg, UNDO_START); stop_afu(cfg); @@ -2431,6 +2460,8 @@ static int __init init_cxlflash(void) pr_info("%s: IBM Power CXL Flash Adapter: %s\n", __func__, CXLFLASH_DRIVER_DATE); + cxlflash_list_init(); + return pci_register_driver(&cxlflash_driver); } @@ -2439,6 +2470,9 @@ static int __init init_cxlflash(void) */ static void __exit exit_cxlflash(void) { + cxlflash_term_global_luns(); + cxlflash_free_errpage(); + pci_unregister_driver(&cxlflash_driver); } diff --git a/drivers/scsi/cxlflash/sislite.h b/drivers/scsi/cxlflash/sislite.h index bf5d39978630..66b889151a4c 100644 --- a/drivers/scsi/cxlflash/sislite.h +++ b/drivers/scsi/cxlflash/sislite.h @@ -409,7 +409,10 @@ struct sisl_lxt_entry { }; -/* Per the SISlite spec, RHT entries are to be 16-byte aligned */ +/* + * RHT - Resource Handle Table + * Per the SISlite spec, RHT entries are to be 16-byte aligned + */ struct sisl_rht_entry { struct sisl_lxt_entry *lxt_start; u32 lxt_cnt; diff --git a/drivers/scsi/cxlflash/superpipe.c b/drivers/scsi/cxlflash/superpipe.c new file mode 100644 index 000000000000..3c8bce8bbb0b --- /dev/null +++ b/drivers/scsi/cxlflash/superpipe.c @@ -0,0 +1,2014 @@ +/* + * CXL Flash Device Driver + * + * Written by: Manoj N. Kumar , IBM Corporation + * Matthew R. 
Ochs , IBM Corporation
+ *
+ * Copyright (C) 2015 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include "sislite.h"
+#include "common.h"
+#include "superpipe.h"
+
+struct cxlflash_global global;
+
+/**
+ * marshal_det_to_rele() - translate detach to release structure
+ * @detach:	Source structure from which to translate/copy.
+ * @release:	Destination structure for the translate/copy.
+ */
+static void marshal_det_to_rele(struct dk_cxlflash_detach *detach,
+				struct dk_cxlflash_release *release)
+{
+	release->hdr = detach->hdr;
+	release->context_id = detach->context_id;
+}
+
+/**
+ * cxlflash_free_errpage() - frees resources associated with global error page
+ */
+void cxlflash_free_errpage(void)
+{
+	mutex_lock(&global.mutex);
+	if (global.err_page) {
+		__free_page(global.err_page);
+		global.err_page = NULL;
+	}
+	mutex_unlock(&global.mutex);
+}
+
+/**
+ * cxlflash_stop_term_user_contexts() - stops/terminates known user contexts
+ * @cfg:	Internal structure associated with the host.
+ *
+ * When the host needs to go down, all users must be quiesced and their
+ * memory freed. This is accomplished by putting the contexts in error
+ * state which will notify the user and let them 'drive' the tear-down.
+ * Meanwhile, this routine camps until all user contexts have been removed.
+ */
+void cxlflash_stop_term_user_contexts(struct cxlflash_cfg *cfg)
+{
+	struct device *dev = &cfg->dev->dev;
+	int i, found;
+
+	cxlflash_mark_contexts_error(cfg);
+
+	while (true) {
+		found = false;
+
+		for (i = 0; i < MAX_CONTEXT; i++)
+			if (cfg->ctx_tbl[i]) {
+				found = true;
+				break;
+			}
+
+		if (!found && list_empty(&cfg->ctx_err_recovery))
+			return;
+
+		dev_dbg(dev, "%s: Wait for user contexts to quiesce...\n",
+			__func__);
+		wake_up_all(&cfg->limbo_waitq);
+		ssleep(1);
+	}
+}
+
+/**
+ * find_error_context() - locates a context by cookie on the error recovery list
+ * @cfg:	Internal structure associated with the host.
+ * @rctxid:	Desired context by id.
+ * @file:	Desired context by file.
+ *
+ * Return: Found context on success, NULL on failure
+ */
+static struct ctx_info *find_error_context(struct cxlflash_cfg *cfg, u64 rctxid,
+					   struct file *file)
+{
+	struct ctx_info *ctxi;
+
+	list_for_each_entry(ctxi, &cfg->ctx_err_recovery, list)
+		if ((ctxi->ctxid == rctxid) || (ctxi->file == file))
+			return ctxi;
+
+	return NULL;
+}
+
+/**
+ * get_context() - obtains a validated and locked context reference
+ * @cfg:	Internal structure associated with the host.
+ * @rctxid:	Desired context (raw, un-decoded format).
+ * @arg:	LUN information or file associated with request.
+ * @ctx_ctrl:	Control information to 'steer' desired lookup.
+ *
+ * NOTE: despite the name pid, in Linux, current->pid actually refers
+ * to the lightweight process id (tid) and can change if the process is
+ * multi-threaded. The tgid remains constant for the process and only
+ * changes when the process forks. For all intents and purposes, think
+ * of tgid as a pid in the traditional sense.
+ *
+ * Return: Validated context on success, NULL on failure
+ */
+struct ctx_info *get_context(struct cxlflash_cfg *cfg, u64 rctxid,
+			     void *arg, enum ctx_ctrl ctx_ctrl)
+{
+	struct device *dev = &cfg->dev->dev;
+	struct ctx_info *ctxi = NULL;
+	struct lun_access *lun_access = NULL;
+	struct file *file = NULL;
+	struct llun_info *lli = arg;
+	u64 ctxid = DECODE_CTXID(rctxid);
+	int rc;
+	pid_t pid = current->tgid, ctxpid = 0;
+
+	if (ctx_ctrl & CTX_CTRL_FILE) {
+		lli = NULL;
+		file = (struct file *)arg;
+	}
+
+	if (ctx_ctrl & CTX_CTRL_CLONE)
+		pid = current->parent->tgid;
+
+	if (likely(ctxid < MAX_CONTEXT)) {
+		while (true) {
+			rc = mutex_lock_interruptible(&cfg->ctx_tbl_list_mutex);
+			if (rc)
+				goto out;
+
+			ctxi = cfg->ctx_tbl[ctxid];
+			if (ctxi)
+				if ((file && (ctxi->file != file)) ||
+				    (!file && (ctxi->ctxid != rctxid)))
+					ctxi = NULL;
+
+			if ((ctx_ctrl & CTX_CTRL_ERR) ||
+			    (!ctxi && (ctx_ctrl & CTX_CTRL_ERR_FALLBACK)))
+				ctxi = find_error_context(cfg, rctxid, file);
+			if (!ctxi) {
+				mutex_unlock(&cfg->ctx_tbl_list_mutex);
+				goto out;
+			}
+
+			/*
+			 * Need to acquire ownership of the context while still
+			 * under the table/list lock to serialize with a remove
+			 * thread. Use the 'try' to avoid stalling the
+			 * table/list lock for a single context.
+			 *
+			 * Note that the lock order is:
+			 *
+			 *	cfg->ctx_tbl_list_mutex -> ctxi->mutex
+			 *
+			 * Therefore release ctx_tbl_list_mutex before retrying.
+			 */
+			rc = mutex_trylock(&ctxi->mutex);
+			mutex_unlock(&cfg->ctx_tbl_list_mutex);
+			if (rc)
+				break; /* got the context's lock! */
+		}
+
+		if (ctxi->unavail)
+			goto denied;
+
+		ctxpid = ctxi->pid;
+		if (likely(!(ctx_ctrl & CTX_CTRL_NOPID)))
+			if (pid != ctxpid)
+				goto denied;
+
+		if (lli) {
+			list_for_each_entry(lun_access, &ctxi->luns, list)
+				if (lun_access->lli == lli)
+					goto out;
+			goto denied;
+		}
+	}
+
+out:
+	dev_dbg(dev, "%s: rctxid=%016llX ctxinfo=%p ctxpid=%u pid=%u "
+		"ctx_ctrl=%u\n", __func__, rctxid, ctxi, ctxpid, pid,
+		ctx_ctrl);
+
+	return ctxi;
+
+denied:
+	mutex_unlock(&ctxi->mutex);
+	ctxi = NULL;
+	goto out;
+}
+
+/**
+ * put_context() - release a context that was retrieved from get_context()
+ * @ctxi:	Context to release.
+ *
+ * For now, releasing the context equates to unlocking its mutex.
+ */
+void put_context(struct ctx_info *ctxi)
+{
+	mutex_unlock(&ctxi->mutex);
+}
+
+/**
+ * afu_attach() - attach a context to the AFU
+ * @cfg:	Internal structure associated with the host.
+ * @ctxi:	Context to attach.
+ *
+ * Upon setting the context capabilities, they must be confirmed with
+ * a read-back operation as the context might have been closed since
+ * the mailbox was unlocked. When this occurs, registration fails.
+ * + * Return: 0 on success, -errno on failure + */ +static int afu_attach(struct cxlflash_cfg *cfg, struct ctx_info *ctxi) +{ + struct device *dev = &cfg->dev->dev; + struct afu *afu = cfg->afu; + struct sisl_ctrl_map *ctrl_map = ctxi->ctrl_map; + int rc = 0; + u64 val; + + /* Unlock cap and restrict user to read/write cmds in translated mode */ + readq_be(&ctrl_map->mbox_r); + val = (SISL_CTX_CAP_READ_CMD | SISL_CTX_CAP_WRITE_CMD); + writeq_be(val, &ctrl_map->ctx_cap); + val = readq_be(&ctrl_map->ctx_cap); + if (val != (SISL_CTX_CAP_READ_CMD | SISL_CTX_CAP_WRITE_CMD)) { + dev_err(dev, "%s: ctx may be closed val=%016llX\n", + __func__, val); + rc = -EAGAIN; + goto out; + } + + /* Set up MMIO registers pointing to the RHT */ + writeq_be((u64)ctxi->rht_start, &ctrl_map->rht_start); + val = SISL_RHT_CNT_ID((u64)MAX_RHT_PER_CONTEXT, (u64)(afu->ctx_hndl)); + writeq_be(val, &ctrl_map->rht_cnt_id); +out: + dev_dbg(dev, "%s: returning rc=%d\n", __func__, rc); + return rc; +} + +/** + * read_cap16() - issues a SCSI READ_CAP16 command + * @sdev: SCSI device associated with LUN. + * @lli: LUN destined for capacity request. + * + * Return: 0 on success, -errno on failure + */ +static int read_cap16(struct scsi_device *sdev, struct llun_info *lli) +{ + struct cxlflash_cfg *cfg = (struct cxlflash_cfg *)sdev->host->hostdata; + struct device *dev = &cfg->dev->dev; + struct glun_info *gli = lli->parent; + u8 *cmd_buf = NULL; + u8 *scsi_cmd = NULL; + u8 *sense_buf = NULL; + int rc = 0; + int result = 0; + int retry_cnt = 0; + u32 tout = (MC_DISCOVERY_TIMEOUT * HZ); + +retry: + cmd_buf = kzalloc(CMD_BUFSIZE, GFP_KERNEL); + scsi_cmd = kzalloc(MAX_COMMAND_SIZE, GFP_KERNEL); + sense_buf = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_KERNEL); + if (unlikely(!cmd_buf || !scsi_cmd || !sense_buf)) { + rc = -ENOMEM; + goto out; + } + + scsi_cmd[0] = SERVICE_ACTION_IN_16; /* read cap(16) */ + scsi_cmd[1] = SAI_READ_CAPACITY_16; /* service action */ + put_unaligned_be32(CMD_BUFSIZE, &scsi_cmd[10]); + + dev_dbg(dev, "%s: %ssending cmd(0x%x)\n", __func__, + retry_cnt ? "re" : "", scsi_cmd[0]); + + result = scsi_execute(sdev, scsi_cmd, DMA_FROM_DEVICE, cmd_buf, + CMD_BUFSIZE, sense_buf, tout, 5, 0, NULL); + + if (driver_byte(result) == DRIVER_SENSE) { + result &= ~(0xFF<<24); /* DRIVER_SENSE is not an error */ + if (result & SAM_STAT_CHECK_CONDITION) { + struct scsi_sense_hdr sshdr; + + scsi_normalize_sense(sense_buf, SCSI_SENSE_BUFFERSIZE, + &sshdr); + switch (sshdr.sense_key) { + case NO_SENSE: + case RECOVERED_ERROR: + /* fall through */ + case NOT_READY: + result &= ~SAM_STAT_CHECK_CONDITION; + break; + case UNIT_ATTENTION: + switch (sshdr.asc) { + case 0x29: /* Power on Reset or Device Reset */ + /* fall through */ + case 0x2A: /* Device capacity changed */ + case 0x3F: /* Report LUNs changed */ + /* Retry the command once more */ + if (retry_cnt++ < 1) { + kfree(cmd_buf); + kfree(scsi_cmd); + kfree(sense_buf); + goto retry; + } + } + break; + default: + break; + } + } + } + + if (result) { + dev_err(dev, "%s: command failed, result=0x%x\n", + __func__, result); + rc = -EIO; + goto out; + } + + /* + * Read cap was successful, grab values from the buffer; + * note that we don't need to worry about unaligned access + * as the buffer is allocated on an aligned boundary. 
+ */
+	mutex_lock(&gli->mutex);
+	gli->max_lba = be64_to_cpu(*((u64 *)&cmd_buf[0]));
+	gli->blk_len = be32_to_cpu(*((u32 *)&cmd_buf[8]));
+	mutex_unlock(&gli->mutex);
+
+out:
+	kfree(cmd_buf);
+	kfree(scsi_cmd);
+	kfree(sense_buf);
+
+	dev_dbg(dev, "%s: maxlba=%lld blklen=%d rc=%d\n",
+		__func__, gli->max_lba, gli->blk_len, rc);
+	return rc;
+}
+
+/**
+ * get_rhte() - obtains validated resource handle table entry reference
+ * @ctxi:	Context owning the resource handle.
+ * @rhndl:	Resource handle associated with entry.
+ * @lli:	LUN associated with request.
+ *
+ * Return: Validated RHTE on success, NULL on failure
+ */
+struct sisl_rht_entry *get_rhte(struct ctx_info *ctxi, res_hndl_t rhndl,
+				struct llun_info *lli)
+{
+	struct sisl_rht_entry *rhte = NULL;
+
+	if (unlikely(!ctxi->rht_start)) {
+		pr_debug("%s: Context does not have allocated RHT!\n",
+			 __func__);
+		goto out;
+	}
+
+	if (unlikely(rhndl >= MAX_RHT_PER_CONTEXT)) {
+		pr_debug("%s: Bad resource handle! (%d)\n", __func__, rhndl);
+		goto out;
+	}
+
+	if (unlikely(ctxi->rht_lun[rhndl] != lli)) {
+		pr_debug("%s: Bad resource handle LUN! (%d)\n",
+			 __func__, rhndl);
+		goto out;
+	}
+
+	rhte = &ctxi->rht_start[rhndl];
+	if (unlikely(rhte->nmask == 0)) {
+		pr_debug("%s: Unopened resource handle! (%d)\n",
+			 __func__, rhndl);
+		rhte = NULL;
+		goto out;
+	}
+
+out:
+	return rhte;
+}
+
+/**
+ * rhte_checkout() - obtains free/empty resource handle table entry
+ * @ctxi:	Context owning the resource handle.
+ * @lli:	LUN associated with request.
+ *
+ * Return: Free RHTE on success, NULL on failure
+ */
+struct sisl_rht_entry *rhte_checkout(struct ctx_info *ctxi,
+				     struct llun_info *lli)
+{
+	struct sisl_rht_entry *rhte = NULL;
+	int i;
+
+	/* Find a free RHT entry */
+	for (i = 0; i < MAX_RHT_PER_CONTEXT; i++)
+		if (ctxi->rht_start[i].nmask == 0) {
+			rhte = &ctxi->rht_start[i];
+			ctxi->rht_out++;
+			break;
+		}
+
+	if (likely(rhte))
+		ctxi->rht_lun[i] = lli;
+
+	pr_debug("%s: returning rhte=%p (%d)\n", __func__, rhte, i);
+	return rhte;
+}
+
+/**
+ * rhte_checkin() - releases a resource handle table entry
+ * @ctxi:	Context owning the resource handle.
+ * @rhte:	RHTE to release.
+ */
+void rhte_checkin(struct ctx_info *ctxi,
+		  struct sisl_rht_entry *rhte)
+{
+	u32 rsrc_handle = rhte - ctxi->rht_start;
+
+	rhte->nmask = 0;
+	rhte->fp = 0;
+	ctxi->rht_out--;
+	ctxi->rht_lun[rsrc_handle] = NULL;
+}
+
+/**
+ * rht_format1() - populates a RHTE for format 1
+ * @rhte:	RHTE to populate.
+ * @lun_id:	LUN ID of LUN associated with RHTE.
+ * @perm:	Desired permissions for RHTE.
+ * @port_sel:	Port selection mask.
+ */
+static void rht_format1(struct sisl_rht_entry *rhte, u64 lun_id, u32 perm,
+			u32 port_sel)
+{
+	/*
+	 * Populate the Format 1 RHT entry for direct access (physical
+	 * LUN) using the synchronization sequence defined in the
+	 * SISLite specification.
+	 */
+	struct sisl_rht_entry_f1 dummy = { 0 };
+	struct sisl_rht_entry_f1 *rhte_f1 = (struct sisl_rht_entry_f1 *)rhte;
+
+	memset(rhte_f1, 0, sizeof(*rhte_f1));
+	rhte_f1->fp = SISL_RHT_FP(1U, 0);
+	dma_wmb(); /* Make setting of format bit visible */
+
+	rhte_f1->lun_id = lun_id;
+	dma_wmb(); /* Make setting of LUN id visible */
+
+	/*
+	 * Use a dummy RHT Format 1 entry to build the second dword
+	 * of the entry that must be populated in a single write when
+	 * enabled (valid bit set to TRUE).
+ */ + dummy.valid = 0x80; + dummy.fp = SISL_RHT_FP(1U, perm); + dummy.port_sel = port_sel; + rhte_f1->dw = dummy.dw; + + dma_wmb(); /* Make remaining RHT entry fields visible */ +} + +/** + * cxlflash_lun_attach() - attaches a user to a LUN and manages the LUN's mode + * @gli: LUN to attach. + * @mode: Desired mode of the LUN. + * @locked: Mutex status on current thread. + * + * Return: 0 on success, -errno on failure + */ +int cxlflash_lun_attach(struct glun_info *gli, enum lun_mode mode, bool locked) +{ + int rc = 0; + + if (!locked) + mutex_lock(&gli->mutex); + + if (gli->mode == MODE_NONE) + gli->mode = mode; + else if (gli->mode != mode) { + pr_debug("%s: LUN operating in mode %d, requested mode %d\n", + __func__, gli->mode, mode); + rc = -EINVAL; + goto out; + } + + gli->users++; + WARN_ON(gli->users <= 0); +out: + pr_debug("%s: Returning rc=%d gli->mode=%u gli->users=%u\n", + __func__, rc, gli->mode, gli->users); + if (!locked) + mutex_unlock(&gli->mutex); + return rc; +} + +/** + * cxlflash_lun_detach() - detaches a user from a LUN and resets the LUN's mode + * @gli: LUN to detach. + */ +void cxlflash_lun_detach(struct glun_info *gli) +{ + mutex_lock(&gli->mutex); + WARN_ON(gli->mode == MODE_NONE); + if (--gli->users == 0) + gli->mode = MODE_NONE; + pr_debug("%s: gli->users=%u\n", __func__, gli->users); + WARN_ON(gli->users < 0); + mutex_unlock(&gli->mutex); +} + +/** + * _cxlflash_disk_release() - releases the specified resource entry + * @sdev: SCSI device associated with LUN. + * @ctxi: Context owning resources. + * @release: Release ioctl data structure. + * + * Note that the AFU sync should _not_ be performed when the context is sitting + * on the error recovery list. A context on the error recovery list is not known + * to the AFU due to reset. When the context is recovered, it will be reattached + * and made known again to the AFU. + * + * Return: 0 on success, -errno on failure + */ +int _cxlflash_disk_release(struct scsi_device *sdev, + struct ctx_info *ctxi, + struct dk_cxlflash_release *release) +{ + struct cxlflash_cfg *cfg = (struct cxlflash_cfg *)sdev->host->hostdata; + struct device *dev = &cfg->dev->dev; + struct llun_info *lli = sdev->hostdata; + struct glun_info *gli = lli->parent; + struct afu *afu = cfg->afu; + bool put_ctx = false; + + res_hndl_t rhndl = release->rsrc_handle; + + int rc = 0; + u64 ctxid = DECODE_CTXID(release->context_id), + rctxid = release->context_id; + + struct sisl_rht_entry *rhte; + struct sisl_rht_entry_f1 *rhte_f1; + + dev_dbg(dev, "%s: ctxid=%llu rhndl=0x%llx gli->mode=%u gli->users=%u\n", + __func__, ctxid, release->rsrc_handle, gli->mode, gli->users); + + if (!ctxi) { + ctxi = get_context(cfg, rctxid, lli, CTX_CTRL_ERR_FALLBACK); + if (unlikely(!ctxi)) { + dev_dbg(dev, "%s: Bad context! (%llu)\n", + __func__, ctxid); + rc = -EINVAL; + goto out; + } + + put_ctx = true; + } + + rhte = get_rhte(ctxi, rhndl, lli); + if (unlikely(!rhte)) { + dev_dbg(dev, "%s: Bad resource handle! (%d)\n", + __func__, rhndl); + rc = -EINVAL; + goto out; + } + + switch (gli->mode) { + case MODE_PHYSICAL: + /* + * Clear the Format 1 RHT entry for direct access + * (physical LUN) using the synchronization sequence + * defined in the SISLite specification. 
+ */ + rhte_f1 = (struct sisl_rht_entry_f1 *)rhte; + + rhte_f1->valid = 0; + dma_wmb(); /* Make revocation of RHT entry visible */ + + rhte_f1->lun_id = 0; + dma_wmb(); /* Make clearing of LUN id visible */ + + rhte_f1->dw = 0; + dma_wmb(); /* Make RHT entry bottom-half clearing visible */ + + if (!ctxi->err_recovery_active) + cxlflash_afu_sync(afu, ctxid, rhndl, AFU_HW_SYNC); + break; + default: + WARN(1, "Unsupported LUN mode!"); + goto out; + } + + rhte_checkin(ctxi, rhte); + cxlflash_lun_detach(gli); + +out: + if (put_ctx) + put_context(ctxi); + dev_dbg(dev, "%s: returning rc=%d\n", __func__, rc); + return rc; +} + +int cxlflash_disk_release(struct scsi_device *sdev, + struct dk_cxlflash_release *release) +{ + return _cxlflash_disk_release(sdev, NULL, release); +} + +/** + * destroy_context() - releases a context + * @cfg: Internal structure associated with the host. + * @ctxi: Context to release. + * + * Note that the rht_lun member of the context was cut from a single + * allocation when the context was created and therefore does not need + * to be explicitly freed. Also note that we conditionally check for the + * existence of the context control map before clearing the RHT registers + * and context capabilities because it is possible to destroy a context + * while the context is in the error state (previous mapping was removed + * [so we don't have to worry about clearing] and context is waiting for + * a new mapping). + */ +static void destroy_context(struct cxlflash_cfg *cfg, + struct ctx_info *ctxi) +{ + struct afu *afu = cfg->afu; + + WARN_ON(!list_empty(&ctxi->luns)); + + /* Clear RHT registers and drop all capabilities for this context */ + if (afu->afu_map && ctxi->ctrl_map) { + writeq_be(0, &ctxi->ctrl_map->rht_start); + writeq_be(0, &ctxi->ctrl_map->rht_cnt_id); + writeq_be(0, &ctxi->ctrl_map->ctx_cap); + } + + /* Free memory associated with context */ + free_page((ulong)ctxi->rht_start); + kfree(ctxi->rht_lun); + kfree(ctxi); + atomic_dec_if_positive(&cfg->num_user_contexts); +} + +/** + * create_context() - allocates and initializes a context + * @cfg: Internal structure associated with the host. + * @ctx: Previously obtained CXL context reference. + * @ctxid: Previously obtained process element associated with CXL context. + * @adap_fd: Previously obtained adapter fd associated with CXL context. + * @file: Previously obtained file associated with CXL context. + * @perms: User-specified permissions. + * + * The context's mutex is locked when an allocated context is returned. 
+ * + * Return: Allocated context on success, NULL on failure + */ +static struct ctx_info *create_context(struct cxlflash_cfg *cfg, + struct cxl_context *ctx, int ctxid, + int adap_fd, struct file *file, + u32 perms) +{ + struct device *dev = &cfg->dev->dev; + struct afu *afu = cfg->afu; + struct ctx_info *ctxi = NULL; + struct llun_info **lli = NULL; + struct sisl_rht_entry *rhte; + + ctxi = kzalloc(sizeof(*ctxi), GFP_KERNEL); + lli = kzalloc((MAX_RHT_PER_CONTEXT * sizeof(*lli)), GFP_KERNEL); + if (unlikely(!ctxi || !lli)) { + dev_err(dev, "%s: Unable to allocate context!\n", __func__); + goto err; + } + + rhte = (struct sisl_rht_entry *)get_zeroed_page(GFP_KERNEL); + if (unlikely(!rhte)) { + dev_err(dev, "%s: Unable to allocate RHT!\n", __func__); + goto err; + } + + ctxi->rht_lun = lli; + ctxi->rht_start = rhte; + ctxi->rht_perms = perms; + + ctxi->ctrl_map = &afu->afu_map->ctrls[ctxid].ctrl; + ctxi->ctxid = ENCODE_CTXID(ctxi, ctxid); + ctxi->lfd = adap_fd; + ctxi->pid = current->tgid; /* tgid = pid */ + ctxi->ctx = ctx; + ctxi->file = file; + mutex_init(&ctxi->mutex); + INIT_LIST_HEAD(&ctxi->luns); + INIT_LIST_HEAD(&ctxi->list); /* initialize for list_empty() */ + + atomic_inc(&cfg->num_user_contexts); + mutex_lock(&ctxi->mutex); +out: + return ctxi; + +err: + kfree(lli); + kfree(ctxi); + ctxi = NULL; + goto out; +} + +/** + * _cxlflash_disk_detach() - detaches a LUN from a context + * @sdev: SCSI device associated with LUN. + * @ctxi: Context owning resources. + * @detach: Detach ioctl data structure. + * + * As part of the detach, all per-context resources associated with the LUN + * are cleaned up. When detaching the last LUN for a context, the context + * itself is cleaned up and released. + * + * Return: 0 on success, -errno on failure + */ +static int _cxlflash_disk_detach(struct scsi_device *sdev, + struct ctx_info *ctxi, + struct dk_cxlflash_detach *detach) +{ + struct cxlflash_cfg *cfg = (struct cxlflash_cfg *)sdev->host->hostdata; + struct device *dev = &cfg->dev->dev; + struct llun_info *lli = sdev->hostdata; + struct lun_access *lun_access, *t; + struct dk_cxlflash_release rel; + bool put_ctx = false; + + int i; + int rc = 0; + int lfd; + u64 ctxid = DECODE_CTXID(detach->context_id), + rctxid = detach->context_id; + + dev_dbg(dev, "%s: ctxid=%llu\n", __func__, ctxid); + + if (!ctxi) { + ctxi = get_context(cfg, rctxid, lli, CTX_CTRL_ERR_FALLBACK); + if (unlikely(!ctxi)) { + dev_dbg(dev, "%s: Bad context! 
(%llu)\n", + __func__, ctxid); + rc = -EINVAL; + goto out; + } + + put_ctx = true; + } + + /* Cleanup outstanding resources tied to this LUN */ + if (ctxi->rht_out) { + marshal_det_to_rele(detach, &rel); + for (i = 0; i < MAX_RHT_PER_CONTEXT; i++) { + if (ctxi->rht_lun[i] == lli) { + rel.rsrc_handle = i; + _cxlflash_disk_release(sdev, ctxi, &rel); + } + + /* No need to loop further if we're done */ + if (ctxi->rht_out == 0) + break; + } + } + + /* Take our LUN out of context, free the node */ + list_for_each_entry_safe(lun_access, t, &ctxi->luns, list) + if (lun_access->lli == lli) { + list_del(&lun_access->list); + kfree(lun_access); + lun_access = NULL; + break; + } + + /* Tear down context following last LUN cleanup */ + if (list_empty(&ctxi->luns)) { + ctxi->unavail = true; + mutex_unlock(&ctxi->mutex); + mutex_lock(&cfg->ctx_tbl_list_mutex); + mutex_lock(&ctxi->mutex); + + /* Might not have been in error list so conditionally remove */ + if (!list_empty(&ctxi->list)) + list_del(&ctxi->list); + cfg->ctx_tbl[ctxid] = NULL; + mutex_unlock(&cfg->ctx_tbl_list_mutex); + mutex_unlock(&ctxi->mutex); + + lfd = ctxi->lfd; + destroy_context(cfg, ctxi); + ctxi = NULL; + put_ctx = false; + + /* + * As a last step, clean up external resources when not + * already on an external cleanup thread, i.e.: close(adap_fd). + * + * NOTE: this will free up the context from the CXL services, + * allowing it to dole out the same context_id on a future + * (or even currently in-flight) disk_attach operation. + */ + if (lfd != -1) + sys_close(lfd); + } + +out: + if (put_ctx) + put_context(ctxi); + dev_dbg(dev, "%s: returning rc=%d\n", __func__, rc); + return rc; +} + +static int cxlflash_disk_detach(struct scsi_device *sdev, + struct dk_cxlflash_detach *detach) +{ + return _cxlflash_disk_detach(sdev, NULL, detach); +} + +/** + * cxlflash_cxl_release() - release handler for adapter file descriptor + * @inode: File-system inode associated with fd. + * @file: File installed with adapter file descriptor. + * + * This routine is the release handler for the fops registered with + * the CXL services on an initial attach for a context. It is called + * when a close is performed on the adapter file descriptor returned + * to the user. Programmatically, the user is not required to perform + * the close, as it is handled internally via the detach ioctl when + * a context is being removed. Note that nothing prevents the user + * from performing a close, but the user should be aware that doing + * so is considered catastrophic and subsequent usage of the superpipe + * API with previously saved off tokens will fail. + * + * When initiated from an external close (either by the user or via + * a process tear down), the routine derives the context reference + * and calls detach for each LUN associated with the context. The + * final detach operation will cause the context itself to be freed. + * Note that the saved off lfd is reset prior to calling detach to + * signify that the final detach should not perform a close. + * + * When initiated from a detach operation as part of the tear down + * of a context, the context is first completely freed and then the + * close is performed. This routine will fail to derive the context + * reference (due to the context having already been freed) and then + * call into the CXL release entry point. 
+ * + * Thus, with exception to when the CXL process element (context id) + * lookup fails (a case that should theoretically never occur), every + * call into this routine results in a complete freeing of a context. + * + * As part of the detach, all per-context resources associated with the LUN + * are cleaned up. When detaching the last LUN for a context, the context + * itself is cleaned up and released. + * + * Return: 0 on success + */ +static int cxlflash_cxl_release(struct inode *inode, struct file *file) +{ + struct cxl_context *ctx = cxl_fops_get_context(file); + struct cxlflash_cfg *cfg = container_of(file->f_op, struct cxlflash_cfg, + cxl_fops); + struct device *dev = &cfg->dev->dev; + struct ctx_info *ctxi = NULL; + struct dk_cxlflash_detach detach = { { 0 }, 0 }; + struct lun_access *lun_access, *t; + enum ctx_ctrl ctrl = CTX_CTRL_ERR_FALLBACK | CTX_CTRL_FILE; + int ctxid; + + ctxid = cxl_process_element(ctx); + if (unlikely(ctxid < 0)) { + dev_err(dev, "%s: Context %p was closed! (%d)\n", + __func__, ctx, ctxid); + goto out; + } + + ctxi = get_context(cfg, ctxid, file, ctrl); + if (unlikely(!ctxi)) { + ctxi = get_context(cfg, ctxid, file, ctrl | CTX_CTRL_CLONE); + if (!ctxi) { + dev_dbg(dev, "%s: Context %d already free!\n", + __func__, ctxid); + goto out_release; + } + + dev_dbg(dev, "%s: Another process owns context %d!\n", + __func__, ctxid); + put_context(ctxi); + goto out; + } + + dev_dbg(dev, "%s: close(%d) for context %d\n", + __func__, ctxi->lfd, ctxid); + + /* Reset the file descriptor to indicate we're on a close() thread */ + ctxi->lfd = -1; + detach.context_id = ctxi->ctxid; + list_for_each_entry_safe(lun_access, t, &ctxi->luns, list) + _cxlflash_disk_detach(lun_access->sdev, ctxi, &detach); +out_release: + cxl_fd_release(inode, file); +out: + dev_dbg(dev, "%s: returning\n", __func__); + return 0; +} + +/** + * unmap_context() - clears a previously established mapping + * @ctxi: Context owning the mapping. + * + * This routine is used to switch between the error notification page + * (dummy page of all 1's) and the real mapping (established by the CXL + * fault handler). + */ +static void unmap_context(struct ctx_info *ctxi) +{ + unmap_mapping_range(ctxi->file->f_mapping, 0, 0, 1); +} + +/** + * get_err_page() - obtains and allocates the error notification page + * + * Return: error notification page on success, NULL on failure + */ +static struct page *get_err_page(void) +{ + struct page *err_page = global.err_page; + + if (unlikely(!err_page)) { + err_page = alloc_page(GFP_KERNEL); + if (unlikely(!err_page)) { + pr_err("%s: Unable to allocate err_page!\n", __func__); + goto out; + } + + memset(page_address(err_page), -1, PAGE_SIZE); + + /* Serialize update w/ other threads to avoid a leak */ + mutex_lock(&global.mutex); + if (likely(!global.err_page)) + global.err_page = err_page; + else { + __free_page(err_page); + err_page = global.err_page; + } + mutex_unlock(&global.mutex); + } + +out: + pr_debug("%s: returning err_page=%p\n", __func__, err_page); + return err_page; +} + +/** + * cxlflash_mmap_fault() - mmap fault handler for adapter file descriptor + * @vma: VM area associated with mapping. + * @vmf: VM fault associated with current fault. + * + * To support error notification via MMIO, faults are 'caught' by this routine + * that was inserted before passing back the adapter file descriptor on attach. + * When a fault occurs, this routine evaluates if error recovery is active and + * if so, installs the error page to 'notify' the user about the error state. 
+ * During normal operation, the fault is simply handled by the original fault + * handler that was installed by CXL services as part of initializing the + * adapter file descriptor. The VMA's page protection bits are toggled to + * indicate cached/not-cached depending on the memory backing the fault. + * + * Return: 0 on success, VM_FAULT_SIGBUS on failure + */ +static int cxlflash_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct file *file = vma->vm_file; + struct cxl_context *ctx = cxl_fops_get_context(file); + struct cxlflash_cfg *cfg = container_of(file->f_op, struct cxlflash_cfg, + cxl_fops); + struct device *dev = &cfg->dev->dev; + struct ctx_info *ctxi = NULL; + struct page *err_page = NULL; + enum ctx_ctrl ctrl = CTX_CTRL_ERR_FALLBACK | CTX_CTRL_FILE; + int rc = 0; + int ctxid; + + ctxid = cxl_process_element(ctx); + if (unlikely(ctxid < 0)) { + dev_err(dev, "%s: Context %p was closed! (%d)\n", + __func__, ctx, ctxid); + goto err; + } + + ctxi = get_context(cfg, ctxid, file, ctrl); + if (unlikely(!ctxi)) { + dev_dbg(dev, "%s: Bad context! (%d)\n", __func__, ctxid); + goto err; + } + + dev_dbg(dev, "%s: fault(%d) for context %d\n", + __func__, ctxi->lfd, ctxid); + + if (likely(!ctxi->err_recovery_active)) { + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + rc = ctxi->cxl_mmap_vmops->fault(vma, vmf); + } else { + dev_dbg(dev, "%s: err recovery active, use err_page!\n", + __func__); + + err_page = get_err_page(); + if (unlikely(!err_page)) { + dev_err(dev, "%s: Could not obtain error page!\n", + __func__); + rc = VM_FAULT_RETRY; + goto out; + } + + get_page(err_page); + vmf->page = err_page; + vma->vm_page_prot = pgprot_cached(vma->vm_page_prot); + } + +out: + if (likely(ctxi)) + put_context(ctxi); + dev_dbg(dev, "%s: returning rc=%d\n", __func__, rc); + return rc; + +err: + rc = VM_FAULT_SIGBUS; + goto out; +} + +/* + * Local MMAP vmops to 'catch' faults + */ +static const struct vm_operations_struct cxlflash_mmap_vmops = { + .fault = cxlflash_mmap_fault, +}; + +/** + * cxlflash_cxl_mmap() - mmap handler for adapter file descriptor + * @file: File installed with adapter file descriptor. + * @vma: VM area associated with mapping. + * + * Installs local mmap vmops to 'catch' faults for error notification support. + * + * Return: 0 on success, -errno on failure + */ +static int cxlflash_cxl_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct cxl_context *ctx = cxl_fops_get_context(file); + struct cxlflash_cfg *cfg = container_of(file->f_op, struct cxlflash_cfg, + cxl_fops); + struct device *dev = &cfg->dev->dev; + struct ctx_info *ctxi = NULL; + enum ctx_ctrl ctrl = CTX_CTRL_ERR_FALLBACK | CTX_CTRL_FILE; + int ctxid; + int rc = 0; + + ctxid = cxl_process_element(ctx); + if (unlikely(ctxid < 0)) { + dev_err(dev, "%s: Context %p was closed! (%d)\n", + __func__, ctx, ctxid); + rc = -EIO; + goto out; + } + + ctxi = get_context(cfg, ctxid, file, ctrl); + if (unlikely(!ctxi)) { + dev_dbg(dev, "%s: Bad context! 
(%d)\n", __func__, ctxid); + rc = -EIO; + goto out; + } + + dev_dbg(dev, "%s: mmap(%d) for context %d\n", + __func__, ctxi->lfd, ctxid); + + rc = cxl_fd_mmap(file, vma); + if (likely(!rc)) { + /* Insert ourself in the mmap fault handler path */ + ctxi->cxl_mmap_vmops = vma->vm_ops; + vma->vm_ops = &cxlflash_mmap_vmops; + } + +out: + if (likely(ctxi)) + put_context(ctxi); + return rc; +} + +/* + * Local fops for adapter file descriptor + */ +static const struct file_operations cxlflash_cxl_fops = { + .owner = THIS_MODULE, + .mmap = cxlflash_cxl_mmap, + .release = cxlflash_cxl_release, +}; + +/** + * cxlflash_mark_contexts_error() - move contexts to error state and list + * @cfg: Internal structure associated with the host. + * + * A context is only moved over to the error list when there are no outstanding + * references to it. This ensures that a running operation has completed. + * + * Return: 0 on success, -errno on failure + */ +int cxlflash_mark_contexts_error(struct cxlflash_cfg *cfg) +{ + int i, rc = 0; + struct ctx_info *ctxi = NULL; + + mutex_lock(&cfg->ctx_tbl_list_mutex); + + for (i = 0; i < MAX_CONTEXT; i++) { + ctxi = cfg->ctx_tbl[i]; + if (ctxi) { + mutex_lock(&ctxi->mutex); + cfg->ctx_tbl[i] = NULL; + list_add(&ctxi->list, &cfg->ctx_err_recovery); + ctxi->err_recovery_active = true; + ctxi->ctrl_map = NULL; + unmap_context(ctxi); + mutex_unlock(&ctxi->mutex); + } + } + + mutex_unlock(&cfg->ctx_tbl_list_mutex); + return rc; +} + +/* + * Dummy NULL fops + */ +static const struct file_operations null_fops = { + .owner = THIS_MODULE, +}; + +/** + * cxlflash_disk_attach() - attach a LUN to a context + * @sdev: SCSI device associated with LUN. + * @attach: Attach ioctl data structure. + * + * Creates a context and attaches LUN to it. A LUN can only be attached + * one time to a context (subsequent attaches for the same context/LUN pair + * are not supported). Additional LUNs can be attached to a context by + * specifying the 'reuse' flag defined in the cxlflash_ioctl.h header. + * + * Return: 0 on success, -errno on failure + */ +static int cxlflash_disk_attach(struct scsi_device *sdev, + struct dk_cxlflash_attach *attach) +{ + struct cxlflash_cfg *cfg = (struct cxlflash_cfg *)sdev->host->hostdata; + struct device *dev = &cfg->dev->dev; + struct afu *afu = cfg->afu; + struct llun_info *lli = sdev->hostdata; + struct glun_info *gli = lli->parent; + struct cxl_ioctl_start_work *work; + struct ctx_info *ctxi = NULL; + struct lun_access *lun_access = NULL; + int rc = 0; + u32 perms; + int ctxid = -1; + u64 rctxid = 0UL; + struct file *file; + + struct cxl_context *ctx; + + int fd = -1; + + /* On first attach set fileops */ + if (atomic_read(&cfg->num_user_contexts) == 0) + cfg->cxl_fops = cxlflash_cxl_fops; + + if (attach->num_interrupts > 4) { + dev_dbg(dev, "%s: Cannot support this many interrupts %llu\n", + __func__, attach->num_interrupts); + rc = -EINVAL; + goto out; + } + + if (gli->max_lba == 0) { + dev_dbg(dev, "%s: No capacity info for this LUN (%016llX)\n", + __func__, lli->lun_id[sdev->channel]); + rc = read_cap16(sdev, lli); + if (rc) { + dev_err(dev, "%s: Invalid device! (%d)\n", + __func__, rc); + rc = -ENODEV; + goto out; + } + dev_dbg(dev, "%s: LBA = %016llX\n", __func__, gli->max_lba); + dev_dbg(dev, "%s: BLK_LEN = %08X\n", __func__, gli->blk_len); + } + + if (attach->hdr.flags & DK_CXLFLASH_ATTACH_REUSE_CONTEXT) { + rctxid = attach->context_id; + ctxi = get_context(cfg, rctxid, NULL, 0); + if (!ctxi) { + dev_dbg(dev, "%s: Bad context! 
(%016llX)\n", + __func__, rctxid); + rc = -EINVAL; + goto out; + } + + list_for_each_entry(lun_access, &ctxi->luns, list) + if (lun_access->lli == lli) { + dev_dbg(dev, "%s: Already attached!\n", + __func__); + rc = -EINVAL; + goto out; + } + } + + lun_access = kzalloc(sizeof(*lun_access), GFP_KERNEL); + if (unlikely(!lun_access)) { + dev_err(dev, "%s: Unable to allocate lun_access!\n", __func__); + rc = -ENOMEM; + goto out; + } + + lun_access->lli = lli; + lun_access->sdev = sdev; + + /* Non-NULL context indicates reuse */ + if (ctxi) { + dev_dbg(dev, "%s: Reusing context for LUN! (%016llX)\n", + __func__, rctxid); + list_add(&lun_access->list, &ctxi->luns); + fd = ctxi->lfd; + goto out_attach; + } + + ctx = cxl_dev_context_init(cfg->dev); + if (unlikely(IS_ERR_OR_NULL(ctx))) { + dev_err(dev, "%s: Could not initialize context %p\n", + __func__, ctx); + rc = -ENODEV; + goto err0; + } + + ctxid = cxl_process_element(ctx); + if (unlikely((ctxid > MAX_CONTEXT) || (ctxid < 0))) { + dev_err(dev, "%s: ctxid (%d) invalid!\n", __func__, ctxid); + rc = -EPERM; + goto err1; + } + + file = cxl_get_fd(ctx, &cfg->cxl_fops, &fd); + if (unlikely(fd < 0)) { + rc = -ENODEV; + dev_err(dev, "%s: Could not get file descriptor\n", __func__); + goto err1; + } + + /* Translate read/write O_* flags from fcntl.h to AFU permission bits */ + perms = SISL_RHT_PERM(attach->hdr.flags + 1); + + ctxi = create_context(cfg, ctx, ctxid, fd, file, perms); + if (unlikely(!ctxi)) { + dev_err(dev, "%s: Failed to create context! (%d)\n", + __func__, ctxid); + goto err2; + } + + work = &ctxi->work; + work->num_interrupts = attach->num_interrupts; + work->flags = CXL_START_WORK_NUM_IRQS; + + rc = cxl_start_work(ctx, work); + if (unlikely(rc)) { + dev_dbg(dev, "%s: Could not start context rc=%d\n", + __func__, rc); + goto err3; + } + + rc = afu_attach(cfg, ctxi); + if (unlikely(rc)) { + dev_err(dev, "%s: Could not attach AFU rc %d\n", __func__, rc); + goto err4; + } + + /* + * No error paths after this point. Once the fd is installed it's + * visible to user space and can't be undone safely on this thread. + * There is no need to worry about a deadlock here because no one + * knows about us yet; we can be the only one holding our mutex. + */ + list_add(&lun_access->list, &ctxi->luns); + mutex_unlock(&ctxi->mutex); + mutex_lock(&cfg->ctx_tbl_list_mutex); + mutex_lock(&ctxi->mutex); + cfg->ctx_tbl[ctxid] = ctxi; + mutex_unlock(&cfg->ctx_tbl_list_mutex); + fd_install(fd, file); + +out_attach: + attach->hdr.return_flags = 0; + attach->context_id = ctxi->ctxid; + attach->block_size = gli->blk_len; + attach->mmio_size = sizeof(afu->afu_map->hosts[0].harea); + attach->last_lba = gli->max_lba; + attach->max_xfer = (sdev->host->max_sectors * 512) / gli->blk_len; + +out: + attach->adap_fd = fd; + + if (ctxi) + put_context(ctxi); + + dev_dbg(dev, "%s: returning ctxid=%d fd=%d bs=%lld rc=%d llba=%lld\n", + __func__, ctxid, fd, attach->block_size, rc, attach->last_lba); + return rc; + +err4: + cxl_stop_context(ctx); +err3: + put_context(ctxi); + destroy_context(cfg, ctxi); + ctxi = NULL; +err2: + /* + * Here, we're overriding the fops with a dummy all-NULL fops because + * fput() calls the release fop, which will cause us to mistakenly + * call into the CXL code. Rather than try to add yet more complexity + * to that routine (cxlflash_cxl_release) we should try to fix the + * issue here. 
+ */
+	file->f_op = &null_fops;
+	fput(file);
+	put_unused_fd(fd);
+	fd = -1;
+err1:
+	cxl_release_context(ctx);
+err0:
+	kfree(lun_access);
+	goto out;
+}
+
+/**
+ * recover_context() - recovers a context in error
+ * @cfg:	Internal structure associated with the host.
+ * @ctxi:	Context to recover.
+ *
+ * Re-establishes the state for a context-in-error.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+static int recover_context(struct cxlflash_cfg *cfg, struct ctx_info *ctxi)
+{
+	struct device *dev = &cfg->dev->dev;
+	int rc = 0;
+	int old_fd, fd = -1;
+	int ctxid = -1;
+	struct file *file;
+	struct cxl_context *ctx;
+	struct afu *afu = cfg->afu;
+
+	ctx = cxl_dev_context_init(cfg->dev);
+	if (unlikely(IS_ERR_OR_NULL(ctx))) {
+		dev_err(dev, "%s: Could not initialize context %p\n",
+			__func__, ctx);
+		rc = -ENODEV;
+		goto out;
+	}
+
+	ctxid = cxl_process_element(ctx);
+	if (unlikely((ctxid > MAX_CONTEXT) || (ctxid < 0))) {
+		dev_err(dev, "%s: ctxid (%d) invalid!\n", __func__, ctxid);
+		rc = -EPERM;
+		goto err1;
+	}
+
+	file = cxl_get_fd(ctx, &cfg->cxl_fops, &fd);
+	if (unlikely(fd < 0)) {
+		rc = -ENODEV;
+		dev_err(dev, "%s: Could not get file descriptor\n", __func__);
+		goto err1;
+	}
+
+	rc = cxl_start_work(ctx, &ctxi->work);
+	if (unlikely(rc)) {
+		dev_dbg(dev, "%s: Could not start context rc=%d\n",
+			__func__, rc);
+		goto err2;
+	}
+
+	/* Update with new MMIO area based on updated context id */
+	ctxi->ctrl_map = &afu->afu_map->ctrls[ctxid].ctrl;
+
+	rc = afu_attach(cfg, ctxi);
+	if (rc) {
+		dev_err(dev, "%s: Could not attach AFU rc %d\n", __func__, rc);
+		goto err3;
+	}
+
+	/*
+	 * No error paths after this point. Once the fd is installed it's
+	 * visible to user space and can't be undone safely on this thread.
+	 */
+	old_fd = ctxi->lfd;
+	ctxi->ctxid = ENCODE_CTXID(ctxi, ctxid);
+	ctxi->lfd = fd;
+	ctxi->ctx = ctx;
+	ctxi->file = file;
+
+	/*
+	 * Put context back in table (note the reinit of the context list);
+	 * we must first drop the context's mutex and then acquire it in
+	 * order with the table/list mutex to avoid a deadlock - safe to do
+	 * here because no one can find us at this moment in time.
+	 */
+	mutex_unlock(&ctxi->mutex);
+	mutex_lock(&cfg->ctx_tbl_list_mutex);
+	mutex_lock(&ctxi->mutex);
+	list_del_init(&ctxi->list);
+	cfg->ctx_tbl[ctxid] = ctxi;
+	mutex_unlock(&cfg->ctx_tbl_list_mutex);
+	fd_install(fd, file);
+
+	/* Release the original adapter fd and associated CXL resources */
+	sys_close(old_fd);
+out:
+	dev_dbg(dev, "%s: returning ctxid=%d fd=%d rc=%d\n",
+		__func__, ctxid, fd, rc);
+	return rc;
+
+err3:
+	cxl_stop_context(ctx);
+err2:
+	fput(file);
+	put_unused_fd(fd);
+err1:
+	cxl_release_context(ctx);
+	goto out;
+}
+
+/**
+ * check_state() - checks and responds to the current adapter state
+ * @cfg:	Internal structure associated with the host.
+ *
+ * This routine can block and should only be used in process context.
+ * Note that when waking up from waiting in limbo, the state is unknown
+ * and must be checked again before proceeding.
+ * + * Return: 0 on success, -errno on failure + */ +static int check_state(struct cxlflash_cfg *cfg) +{ + struct device *dev = &cfg->dev->dev; + int rc = 0; + +retry: + switch (cfg->state) { + case STATE_LIMBO: + dev_dbg(dev, "%s: Limbo, going to wait...\n", __func__); + rc = wait_event_interruptible(cfg->limbo_waitq, + cfg->state != STATE_LIMBO); + if (unlikely(rc)) + break; + goto retry; + case STATE_FAILTERM: + dev_dbg(dev, "%s: Failed/Terminating!\n", __func__); + rc = -ENODEV; + break; + default: + break; + } + + return rc; +} + +/** + * cxlflash_afu_recover() - initiates AFU recovery + * @sdev: SCSI device associated with LUN. + * @recover: Recover ioctl data structure. + * + * Only a single recovery is allowed at a time to avoid exhausting CXL + * resources (leading to recovery failure) in the event that we're up + * against the maximum number of contexts limit. For similar reasons, + * a context recovery is retried if there are multiple recoveries taking + * place at the same time and the failure was due to CXL services being + * unable to keep up. + * + * Because a user can detect an error condition before the kernel, it is + * quite possible for this routine to act as the kernel's EEH detection + * source (MMIO read of mbox_r). Because of this, there is a window of + * time where an EEH might have been detected but not yet 'serviced' + * (callback invoked, causing the device to enter limbo state). To avoid + * looping in this routine during that window, a 1 second sleep is in place + * between the time the MMIO failure is detected and the time a wait on the + * limbo wait queue is attempted via check_state(). + * + * Return: 0 on success, -errno on failure + */ +static int cxlflash_afu_recover(struct scsi_device *sdev, + struct dk_cxlflash_recover_afu *recover) +{ + struct cxlflash_cfg *cfg = (struct cxlflash_cfg *)sdev->host->hostdata; + struct device *dev = &cfg->dev->dev; + struct llun_info *lli = sdev->hostdata; + struct afu *afu = cfg->afu; + struct ctx_info *ctxi = NULL; + struct mutex *mutex = &cfg->ctx_recovery_mutex; + u64 ctxid = DECODE_CTXID(recover->context_id), + rctxid = recover->context_id; + long reg; + int lretry = 20; /* up to 2 seconds */ + int rc = 0; + + atomic_inc(&cfg->recovery_threads); + rc = mutex_lock_interruptible(mutex); + if (rc) + goto out; + + dev_dbg(dev, "%s: reason 0x%016llX rctxid=%016llX\n", + __func__, recover->reason, rctxid); + +retry: + /* Ensure that this process is attached to the context */ + ctxi = get_context(cfg, rctxid, lli, CTX_CTRL_ERR_FALLBACK); + if (unlikely(!ctxi)) { + dev_dbg(dev, "%s: Bad context! (%llu)\n", __func__, ctxid); + rc = -EINVAL; + goto out; + } + + if (ctxi->err_recovery_active) { +retry_recover: + rc = recover_context(cfg, ctxi); + if (unlikely(rc)) { + dev_err(dev, "%s: Recovery failed for context %llu (rc=%d)\n", + __func__, ctxid, rc); + if ((rc == -ENODEV) && + ((atomic_read(&cfg->recovery_threads) > 1) || + (lretry--))) { + dev_dbg(dev, "%s: Going to try again!\n", + __func__); + mutex_unlock(mutex); + msleep(100); + rc = mutex_lock_interruptible(mutex); + if (rc) + goto out; + goto retry_recover; + } + + goto out; + } + + ctxi->err_recovery_active = false; + recover->context_id = ctxi->ctxid; + recover->adap_fd = ctxi->lfd; + recover->mmio_size = sizeof(afu->afu_map->hosts[0].harea); + recover->hdr.return_flags |= + DK_CXLFLASH_RECOVER_AFU_CONTEXT_RESET; + goto out; + } + + /* Test if in error state */ + reg = readq_be(&afu->ctrl_map->mbox_r); + if (reg == -1) { + dev_dbg(dev, "%s: MMIO read fail! 
Wait for recovery...\n", + __func__); + mutex_unlock(&ctxi->mutex); + ctxi = NULL; + ssleep(1); + rc = check_state(cfg); + if (unlikely(rc)) + goto out; + goto retry; + } + + dev_dbg(dev, "%s: MMIO working, no recovery required!\n", __func__); +out: + if (likely(ctxi)) + put_context(ctxi); + mutex_unlock(mutex); + atomic_dec_if_positive(&cfg->recovery_threads); + return rc; +} + +/** + * process_sense() - evaluates and processes sense data + * @sdev: SCSI device associated with LUN. + * @verify: Verify ioctl data structure. + * + * Return: 0 on success, -errno on failure + */ +static int process_sense(struct scsi_device *sdev, + struct dk_cxlflash_verify *verify) +{ + struct cxlflash_cfg *cfg = (struct cxlflash_cfg *)sdev->host->hostdata; + struct device *dev = &cfg->dev->dev; + struct llun_info *lli = sdev->hostdata; + struct glun_info *gli = lli->parent; + u64 prev_lba = gli->max_lba; + struct scsi_sense_hdr sshdr = { 0 }; + int rc = 0; + + rc = scsi_normalize_sense((const u8 *)&verify->sense_data, + DK_CXLFLASH_VERIFY_SENSE_LEN, &sshdr); + if (!rc) { + dev_err(dev, "%s: Failed to normalize sense data!\n", __func__); + rc = -EINVAL; + goto out; + } + + switch (sshdr.sense_key) { + case NO_SENSE: + case RECOVERED_ERROR: + /* fall through */ + case NOT_READY: + break; + case UNIT_ATTENTION: + switch (sshdr.asc) { + case 0x29: /* Power on Reset or Device Reset */ + /* fall through */ + case 0x2A: /* Device settings/capacity changed */ + rc = read_cap16(sdev, lli); + if (rc) { + rc = -ENODEV; + break; + } + if (prev_lba != gli->max_lba) + dev_dbg(dev, "%s: Capacity changed old=%lld " + "new=%lld\n", __func__, prev_lba, + gli->max_lba); + break; + case 0x3F: /* Report LUNs changed, Rescan. */ + scsi_scan_host(cfg->host); + break; + default: + rc = -EIO; + break; + } + break; + default: + rc = -EIO; + break; + } +out: + dev_dbg(dev, "%s: sense_key %x asc %x ascq %x rc %d\n", __func__, + sshdr.sense_key, sshdr.asc, sshdr.ascq, rc); + return rc; +} + +/** + * cxlflash_disk_verify() - verifies a LUN is the same and handle size changes + * @sdev: SCSI device associated with LUN. + * @verify: Verify ioctl data structure. + * + * Return: 0 on success, -errno on failure + */ +static int cxlflash_disk_verify(struct scsi_device *sdev, + struct dk_cxlflash_verify *verify) +{ + int rc = 0; + struct ctx_info *ctxi = NULL; + struct cxlflash_cfg *cfg = (struct cxlflash_cfg *)sdev->host->hostdata; + struct device *dev = &cfg->dev->dev; + struct llun_info *lli = sdev->hostdata; + struct glun_info *gli = lli->parent; + struct sisl_rht_entry *rhte = NULL; + res_hndl_t rhndl = verify->rsrc_handle; + u64 ctxid = DECODE_CTXID(verify->context_id), + rctxid = verify->context_id; + u64 last_lba = 0; + + dev_dbg(dev, "%s: ctxid=%llu rhndl=%016llX, hint=%016llX, " + "flags=%016llX\n", __func__, ctxid, verify->rsrc_handle, + verify->hint, verify->hdr.flags); + + ctxi = get_context(cfg, rctxid, lli, 0); + if (unlikely(!ctxi)) { + dev_dbg(dev, "%s: Bad context! (%llu)\n", __func__, ctxid); + rc = -EINVAL; + goto out; + } + + rhte = get_rhte(ctxi, rhndl, lli); + if (unlikely(!rhte)) { + dev_dbg(dev, "%s: Bad resource handle! (%d)\n", + __func__, rhndl); + rc = -EINVAL; + goto out; + } + + /* + * Look at the hint/sense to see if it requires us to redrive + * inquiry (i.e. the Unit attention is due to the WWN changing). 
+ */ + if (verify->hint & DK_CXLFLASH_VERIFY_HINT_SENSE) { + rc = process_sense(sdev, verify); + if (unlikely(rc)) { + dev_err(dev, "%s: Failed to validate sense data (%d)\n", + __func__, rc); + goto out; + } + } + + switch (gli->mode) { + case MODE_PHYSICAL: + last_lba = gli->max_lba; + break; + default: + WARN(1, "Unsupported LUN mode!"); + } + + verify->last_lba = last_lba; + +out: + if (likely(ctxi)) + put_context(ctxi); + dev_dbg(dev, "%s: returning rc=%d llba=%llX\n", + __func__, rc, verify->last_lba); + return rc; +} + +/** + * decode_ioctl() - translates an encoded ioctl to an easily identifiable string + * @cmd: The ioctl command to decode. + * + * Return: A string identifying the decoded ioctl. + */ +static char *decode_ioctl(int cmd) +{ + switch (cmd) { + case DK_CXLFLASH_ATTACH: + return __stringify_1(DK_CXLFLASH_ATTACH); + case DK_CXLFLASH_USER_DIRECT: + return __stringify_1(DK_CXLFLASH_USER_DIRECT); + case DK_CXLFLASH_RELEASE: + return __stringify_1(DK_CXLFLASH_RELEASE); + case DK_CXLFLASH_DETACH: + return __stringify_1(DK_CXLFLASH_DETACH); + case DK_CXLFLASH_VERIFY: + return __stringify_1(DK_CXLFLASH_VERIFY); + case DK_CXLFLASH_RECOVER_AFU: + return __stringify_1(DK_CXLFLASH_RECOVER_AFU); + case DK_CXLFLASH_MANAGE_LUN: + return __stringify_1(DK_CXLFLASH_MANAGE_LUN); + } + + return "UNKNOWN"; +} + +/** + * cxlflash_disk_direct_open() - opens a direct (physical) disk + * @sdev: SCSI device associated with LUN. + * @arg: UDirect ioctl data structure. + * + * On successful return, the user is informed of the resource handle + * to be used to identify the direct lun and the size (in blocks) of + * the direct lun in last LBA format. + * + * Return: 0 on success, -errno on failure + */ +static int cxlflash_disk_direct_open(struct scsi_device *sdev, void *arg) +{ + struct cxlflash_cfg *cfg = (struct cxlflash_cfg *)sdev->host->hostdata; + struct device *dev = &cfg->dev->dev; + struct afu *afu = cfg->afu; + struct llun_info *lli = sdev->hostdata; + struct glun_info *gli = lli->parent; + + struct dk_cxlflash_udirect *pphys = (struct dk_cxlflash_udirect *)arg; + + u64 ctxid = DECODE_CTXID(pphys->context_id), + rctxid = pphys->context_id; + u64 lun_size = 0; + u64 last_lba = 0; + u64 rsrc_handle = -1; + u32 port = CHAN2PORT(sdev->channel); + + int rc = 0; + + struct ctx_info *ctxi = NULL; + struct sisl_rht_entry *rhte = NULL; + + pr_debug("%s: ctxid=%llu ls=0x%llx\n", __func__, ctxid, lun_size); + + rc = cxlflash_lun_attach(gli, MODE_PHYSICAL, false); + if (unlikely(rc)) { + dev_dbg(dev, "%s: Failed to attach to LUN! (PHYSICAL)\n", + __func__); + goto out; + } + + ctxi = get_context(cfg, rctxid, lli, 0); + if (unlikely(!ctxi)) { + dev_dbg(dev, "%s: Bad context! 
(%llu)\n", __func__, ctxid); + rc = -EINVAL; + goto err1; + } + + rhte = rhte_checkout(ctxi, lli); + if (unlikely(!rhte)) { + dev_dbg(dev, "%s: too many opens for this context\n", __func__); + rc = -EMFILE; /* too many opens */ + goto err1; + } + + rsrc_handle = (rhte - ctxi->rht_start); + + rht_format1(rhte, lli->lun_id[sdev->channel], ctxi->rht_perms, port); + cxlflash_afu_sync(afu, ctxid, rsrc_handle, AFU_LW_SYNC); + + last_lba = gli->max_lba; + pphys->hdr.return_flags = 0; + pphys->last_lba = last_lba; + pphys->rsrc_handle = rsrc_handle; + +out: + if (likely(ctxi)) + put_context(ctxi); + dev_dbg(dev, "%s: returning handle 0x%llx rc=%d llba %lld\n", + __func__, rsrc_handle, rc, last_lba); + return rc; + +err1: + cxlflash_lun_detach(gli); + goto out; +} + +/** + * ioctl_common() - common IOCTL handler for driver + * @sdev: SCSI device associated with LUN. + * @cmd: IOCTL command. + * + * Handles common fencing operations that are valid for multiple ioctls. Always + * allow through ioctls that are cleanup oriented in nature, even when operating + * in a failed/terminating state. + * + * Return: 0 on success, -errno on failure + */ +static int ioctl_common(struct scsi_device *sdev, int cmd) +{ + struct cxlflash_cfg *cfg = (struct cxlflash_cfg *)sdev->host->hostdata; + struct device *dev = &cfg->dev->dev; + struct llun_info *lli = sdev->hostdata; + int rc = 0; + + if (unlikely(!lli)) { + dev_dbg(dev, "%s: Unknown LUN\n", __func__); + rc = -EINVAL; + goto out; + } + + rc = check_state(cfg); + if (unlikely(rc) && (cfg->state == STATE_FAILTERM)) { + switch (cmd) { + case DK_CXLFLASH_RELEASE: + case DK_CXLFLASH_DETACH: + dev_dbg(dev, "%s: Command override! (%d)\n", + __func__, rc); + rc = 0; + break; + } + } +out: + return rc; +} + +/** + * cxlflash_ioctl() - IOCTL handler for driver + * @sdev: SCSI device associated with LUN. + * @cmd: IOCTL command. + * @arg: Userspace ioctl data structure. 
+ * + * Return: 0 on success, -errno on failure + */ +int cxlflash_ioctl(struct scsi_device *sdev, int cmd, void __user *arg) +{ + typedef int (*sioctl) (struct scsi_device *, void *); + + struct cxlflash_cfg *cfg = (struct cxlflash_cfg *)sdev->host->hostdata; + struct device *dev = &cfg->dev->dev; + struct afu *afu = cfg->afu; + struct dk_cxlflash_hdr *hdr; + char buf[sizeof(union cxlflash_ioctls)]; + size_t size = 0; + bool known_ioctl = false; + int idx; + int rc = 0; + struct Scsi_Host *shost = sdev->host; + sioctl do_ioctl = NULL; + + static const struct { + size_t size; + sioctl ioctl; + } ioctl_tbl[] = { /* NOTE: order matters here */ + {sizeof(struct dk_cxlflash_attach), (sioctl)cxlflash_disk_attach}, + {sizeof(struct dk_cxlflash_udirect), cxlflash_disk_direct_open}, + {sizeof(struct dk_cxlflash_release), (sioctl)cxlflash_disk_release}, + {sizeof(struct dk_cxlflash_detach), (sioctl)cxlflash_disk_detach}, + {sizeof(struct dk_cxlflash_verify), (sioctl)cxlflash_disk_verify}, + {sizeof(struct dk_cxlflash_recover_afu), (sioctl)cxlflash_afu_recover}, + {sizeof(struct dk_cxlflash_manage_lun), (sioctl)cxlflash_manage_lun}, + }; + + /* Restrict command set to physical support only for internal LUN */ + if (afu->internal_lun) + switch (cmd) { + case DK_CXLFLASH_RELEASE: + dev_dbg(dev, "%s: %s not supported for lun_mode=%d\n", + __func__, decode_ioctl(cmd), afu->internal_lun); + rc = -EINVAL; + goto cxlflash_ioctl_exit; + } + + switch (cmd) { + case DK_CXLFLASH_ATTACH: + case DK_CXLFLASH_USER_DIRECT: + case DK_CXLFLASH_RELEASE: + case DK_CXLFLASH_DETACH: + case DK_CXLFLASH_VERIFY: + case DK_CXLFLASH_RECOVER_AFU: + dev_dbg(dev, "%s: %s (%08X) on dev(%d/%d/%d/%llu)\n", + __func__, decode_ioctl(cmd), cmd, shost->host_no, + sdev->channel, sdev->id, sdev->lun); + rc = ioctl_common(sdev, cmd); + if (unlikely(rc)) + goto cxlflash_ioctl_exit; + + /* fall through */ + + case DK_CXLFLASH_MANAGE_LUN: + known_ioctl = true; + idx = _IOC_NR(cmd) - _IOC_NR(DK_CXLFLASH_ATTACH); + size = ioctl_tbl[idx].size; + do_ioctl = ioctl_tbl[idx].ioctl; + + if (likely(do_ioctl)) + break; + + /* fall through */ + default: + rc = -EINVAL; + goto cxlflash_ioctl_exit; + } + + if (unlikely(copy_from_user(&buf, arg, size))) { + dev_err(dev, "%s: copy_from_user() fail! " + "size=%lu cmd=%d (%s) arg=%p\n", + __func__, size, cmd, decode_ioctl(cmd), arg); + rc = -EFAULT; + goto cxlflash_ioctl_exit; + } + + hdr = (struct dk_cxlflash_hdr *)&buf; + if (hdr->version != DK_CXLFLASH_VERSION_0) { + dev_dbg(dev, "%s: Version %u not supported for %s\n", + __func__, hdr->version, decode_ioctl(cmd)); + rc = -EINVAL; + goto cxlflash_ioctl_exit; + } + + if (hdr->rsvd[0] || hdr->rsvd[1] || hdr->rsvd[2] || hdr->return_flags) { + dev_dbg(dev, "%s: Reserved/rflags populated!\n", __func__); + rc = -EINVAL; + goto cxlflash_ioctl_exit; + } + + rc = do_ioctl(sdev, (void *)&buf); + if (likely(!rc)) + if (unlikely(copy_to_user(arg, &buf, size))) { + dev_err(dev, "%s: copy_to_user() fail! 
" + "size=%lu cmd=%d (%s) arg=%p\n", + __func__, size, cmd, decode_ioctl(cmd), arg); + rc = -EFAULT; + } + + /* fall through to exit */ + +cxlflash_ioctl_exit: + if (unlikely(rc && known_ioctl)) + dev_err(dev, "%s: ioctl %s (%08X) on dev(%d/%d/%d/%llu) " + "returned rc %d\n", __func__, + decode_ioctl(cmd), cmd, shost->host_no, + sdev->channel, sdev->id, sdev->lun, rc); + else + dev_dbg(dev, "%s: ioctl %s (%08X) on dev(%d/%d/%d/%llu) " + "returned rc %d\n", __func__, decode_ioctl(cmd), + cmd, shost->host_no, sdev->channel, sdev->id, + sdev->lun, rc); + return rc; +} diff --git a/drivers/scsi/cxlflash/superpipe.h b/drivers/scsi/cxlflash/superpipe.h new file mode 100644 index 000000000000..ae39b9627118 --- /dev/null +++ b/drivers/scsi/cxlflash/superpipe.h @@ -0,0 +1,132 @@ +/* + * CXL Flash Device Driver + * + * Written by: Manoj N. Kumar , IBM Corporation + * Matthew R. Ochs , IBM Corporation + * + * Copyright (C) 2015 IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _CXLFLASH_SUPERPIPE_H +#define _CXLFLASH_SUPERPIPE_H + +extern struct cxlflash_global global; + +/* + * Terminology: use afu (and not adapter) to refer to the HW. + * Adapter is the entire slot and includes PSL out of which + * only the AFU is visible to user space. + */ + +/* Chunk size parms: note sislite minimum chunk size is + 0x10000 LBAs corresponding to a NMASK or 16. +*/ +#define MC_CHUNK_SIZE (1 << MC_RHT_NMASK) /* in LBAs */ + +#define MC_DISCOVERY_TIMEOUT 5 /* 5 secs */ + +#define CHAN2PORT(_x) ((_x) + 1) + +enum lun_mode { + MODE_NONE = 0, + MODE_PHYSICAL +}; + +/* Global (entire driver, spans adapters) lun_info structure */ +struct glun_info { + u64 max_lba; /* from read cap(16) */ + u32 blk_len; /* from read cap(16) */ + enum lun_mode mode; /* NONE, PHYSICAL */ + int users; /* Number of users w/ references to LUN */ + + u8 wwid[16]; + + struct mutex mutex; + + struct list_head list; +}; + +/* Local (per-adapter) lun_info structure */ +struct llun_info { + u64 lun_id[CXLFLASH_NUM_FC_PORTS]; /* from REPORT_LUNS */ + u32 lun_index; /* Index in the LUN table */ + u32 host_no; /* host_no from Scsi_host */ + u32 port_sel; /* What port to use for this LUN */ + bool newly_created; /* Whether the LUN was just discovered */ + + u8 wwid[16]; /* Keep a duplicate copy here? 
*/ + + struct glun_info *parent; /* Pointer to entry in global LUN structure */ + struct scsi_device *sdev; + struct list_head list; +}; + +struct lun_access { + struct llun_info *lli; + struct scsi_device *sdev; + struct list_head list; +}; + +enum ctx_ctrl { + CTX_CTRL_CLONE = (1 << 1), + CTX_CTRL_ERR = (1 << 2), + CTX_CTRL_ERR_FALLBACK = (1 << 3), + CTX_CTRL_NOPID = (1 << 4), + CTX_CTRL_FILE = (1 << 5) +}; + +#define ENCODE_CTXID(_ctx, _id) (((((u64)_ctx) & 0xFFFFFFFF0) << 28) | _id) +#define DECODE_CTXID(_val) (_val & 0xFFFFFFFF) + +struct ctx_info { + struct sisl_ctrl_map *ctrl_map; /* initialized at startup */ + struct sisl_rht_entry *rht_start; /* 1 page (req'd for alignment), + alloc/free on attach/detach */ + u32 rht_out; /* Number of checked out RHT entries */ + u32 rht_perms; /* User-defined permissions for RHT entries */ + struct llun_info **rht_lun; /* Mapping of RHT entries to LUNs */ + + struct cxl_ioctl_start_work work; + u64 ctxid; + int lfd; + pid_t pid; + bool unavail; + bool err_recovery_active; + struct mutex mutex; /* Context protection */ + struct cxl_context *ctx; + struct list_head luns; /* LUNs attached to this context */ + const struct vm_operations_struct *cxl_mmap_vmops; + struct file *file; + struct list_head list; /* Link contexts in error recovery */ +}; + +struct cxlflash_global { + struct mutex mutex; + struct list_head gluns;/* list of glun_info structs */ + struct page *err_page; /* One page of all 0xF for error notification */ +}; + +int cxlflash_disk_release(struct scsi_device *, struct dk_cxlflash_release *); +int _cxlflash_disk_release(struct scsi_device *, struct ctx_info *, + struct dk_cxlflash_release *); + +int cxlflash_lun_attach(struct glun_info *, enum lun_mode, bool); +void cxlflash_lun_detach(struct glun_info *); + +struct ctx_info *get_context(struct cxlflash_cfg *, u64, void *, enum ctx_ctrl); +void put_context(struct ctx_info *); + +struct sisl_rht_entry *get_rhte(struct ctx_info *, res_hndl_t, + struct llun_info *); + +struct sisl_rht_entry *rhte_checkout(struct ctx_info *, struct llun_info *); +void rhte_checkin(struct ctx_info *, struct sisl_rht_entry *); + +int cxlflash_manage_lun(struct scsi_device *, struct dk_cxlflash_manage_lun *); + +#endif /* ifndef _CXLFLASH_SUPERPIPE_H */ diff --git a/include/uapi/scsi/Kbuild b/include/uapi/scsi/Kbuild index 75746d52f208..d791e0ad509d 100644 --- a/include/uapi/scsi/Kbuild +++ b/include/uapi/scsi/Kbuild @@ -3,3 +3,4 @@ header-y += fc/ header-y += scsi_bsg_fc.h header-y += scsi_netlink.h header-y += scsi_netlink_fc.h +header-y += cxlflash_ioctl.h diff --git a/include/uapi/scsi/cxlflash_ioctl.h b/include/uapi/scsi/cxlflash_ioctl.h new file mode 100644 index 000000000000..570773406531 --- /dev/null +++ b/include/uapi/scsi/cxlflash_ioctl.h @@ -0,0 +1,140 @@ +/* + * CXL Flash Device Driver + * + * Written by: Manoj N. Kumar , IBM Corporation + * Matthew R. Ochs , IBM Corporation + * + * Copyright (C) 2015 IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#ifndef _CXLFLASH_IOCTL_H +#define _CXLFLASH_IOCTL_H + +#include + +/* + * Structure and flag definitions CXL Flash superpipe ioctls + */ + +#define DK_CXLFLASH_VERSION_0 0 + +struct dk_cxlflash_hdr { + __u16 version; /* Version data */ + __u16 rsvd[3]; /* Reserved for future use */ + __u64 flags; /* Input flags */ + __u64 return_flags; /* Returned flags */ +}; + +/* + * Notes: + * ----- + * The 'context_id' field of all ioctl structures contains the context + * identifier for a context in the lower 32-bits (upper 32-bits are not + * to be used when identifying a context to the AFU). That said, the value + * in its entirety (all 64-bits) is to be treated as an opaque cookie and + * should be presented as such when issuing ioctls. + * + * For DK_CXLFLASH_ATTACH ioctl, user specifies read/write access + * permissions via the O_RDONLY, O_WRONLY, and O_RDWR flags defined in + * the fcntl.h header file. + */ +#define DK_CXLFLASH_ATTACH_REUSE_CONTEXT 0x8000000000000000ULL + +struct dk_cxlflash_attach { + struct dk_cxlflash_hdr hdr; /* Common fields */ + __u64 num_interrupts; /* Requested number of interrupts */ + __u64 context_id; /* Returned context */ + __u64 mmio_size; /* Returned size of MMIO area */ + __u64 block_size; /* Returned block size, in bytes */ + __u64 adap_fd; /* Returned adapter file descriptor */ + __u64 last_lba; /* Returned last LBA on the device */ + __u64 max_xfer; /* Returned max transfer size, blocks */ + __u64 reserved[8]; /* Reserved for future use */ +}; + +struct dk_cxlflash_detach { + struct dk_cxlflash_hdr hdr; /* Common fields */ + __u64 context_id; /* Context to detach */ + __u64 reserved[8]; /* Reserved for future use */ +}; + +struct dk_cxlflash_udirect { + struct dk_cxlflash_hdr hdr; /* Common fields */ + __u64 context_id; /* Context to own physical resources */ + __u64 rsrc_handle; /* Returned resource handle */ + __u64 last_lba; /* Returned last LBA on the device */ + __u64 reserved[8]; /* Reserved for future use */ +}; + +struct dk_cxlflash_release { + struct dk_cxlflash_hdr hdr; /* Common fields */ + __u64 context_id; /* Context owning resources */ + __u64 rsrc_handle; /* Resource handle to release */ + __u64 reserved[8]; /* Reserved for future use */ +}; + +#define DK_CXLFLASH_VERIFY_SENSE_LEN 18 +#define DK_CXLFLASH_VERIFY_HINT_SENSE 0x8000000000000000ULL + +struct dk_cxlflash_verify { + struct dk_cxlflash_hdr hdr; /* Common fields */ + __u64 context_id; /* Context owning resources to verify */ + __u64 rsrc_handle; /* Resource handle of LUN */ + __u64 hint; /* Reasons for verify */ + __u64 last_lba; /* Returned last LBA of device */ + __u8 sense_data[DK_CXLFLASH_VERIFY_SENSE_LEN]; /* SCSI sense data */ + __u8 pad[6]; /* Pad to next 8-byte boundary */ + __u64 reserved[8]; /* Reserved for future use */ +}; + +#define DK_CXLFLASH_RECOVER_AFU_CONTEXT_RESET 0x8000000000000000ULL + +struct dk_cxlflash_recover_afu { + struct dk_cxlflash_hdr hdr; /* Common fields */ + __u64 reason; /* Reason for recovery request */ + __u64 context_id; /* Context to recover / updated ID */ + __u64 mmio_size; /* Returned size of MMIO area */ + __u64 adap_fd; /* Returned adapter file descriptor */ + __u64 reserved[8]; /* Reserved for future use */ +}; + +#define DK_CXLFLASH_MANAGE_LUN_WWID_LEN 16 +#define DK_CXLFLASH_MANAGE_LUN_ENABLE_SUPERPIPE 0x8000000000000000ULL +#define DK_CXLFLASH_MANAGE_LUN_DISABLE_SUPERPIPE 0x4000000000000000ULL +#define DK_CXLFLASH_MANAGE_LUN_ALL_PORTS_ACCESSIBLE 0x2000000000000000ULL + +struct dk_cxlflash_manage_lun { + struct dk_cxlflash_hdr hdr; 
/* Common fields */ + __u8 wwid[DK_CXLFLASH_MANAGE_LUN_WWID_LEN]; /* Page83 WWID, NAA-6 */ + __u64 reserved[8]; /* Rsvd, future use */ +}; + +union cxlflash_ioctls { + struct dk_cxlflash_attach attach; + struct dk_cxlflash_detach detach; + struct dk_cxlflash_udirect udirect; + struct dk_cxlflash_release release; + struct dk_cxlflash_verify verify; + struct dk_cxlflash_recover_afu recover_afu; + struct dk_cxlflash_manage_lun manage_lun; +}; + +#define MAX_CXLFLASH_IOCTL_SZ (sizeof(union cxlflash_ioctls)) + +#define CXL_MAGIC 0xCA +#define CXL_IOWR(_n, _s) _IOWR(CXL_MAGIC, _n, struct _s) + +#define DK_CXLFLASH_ATTACH CXL_IOWR(0x80, dk_cxlflash_attach) +#define DK_CXLFLASH_USER_DIRECT CXL_IOWR(0x81, dk_cxlflash_udirect) +#define DK_CXLFLASH_RELEASE CXL_IOWR(0x82, dk_cxlflash_release) +#define DK_CXLFLASH_DETACH CXL_IOWR(0x83, dk_cxlflash_detach) +#define DK_CXLFLASH_VERIFY CXL_IOWR(0x84, dk_cxlflash_verify) +#define DK_CXLFLASH_RECOVER_AFU CXL_IOWR(0x85, dk_cxlflash_recover_afu) +#define DK_CXLFLASH_MANAGE_LUN CXL_IOWR(0x86, dk_cxlflash_manage_lun) + +#endif /* ifndef _CXLFLASH_IOCTL_H */ -- cgit v1.2.3 From 2cb79266d6b229dbebd31fe114af1bdab25c8076 Mon Sep 17 00:00:00 2001 From: "Matthew R. Ochs" Date: Thu, 13 Aug 2015 21:47:53 -0500 Subject: cxlflash: Virtual LUN support Add support for physical LUN segmentation (virtual LUNs) to device driver supporting the IBM CXL Flash adapter. This patch allows user space applications to virtually segment a physical LUN into N virtual LUNs, taking advantage of the translation features provided by this adapter. Signed-off-by: Matthew R. Ochs Signed-off-by: Manoj N. Kumar Reviewed-by: Michael Neuling Reviewed-by: Wen Xiong Signed-off-by: James Bottomley --- Documentation/powerpc/cxlflash.txt | 63 +- drivers/scsi/cxlflash/Makefile | 2 +- drivers/scsi/cxlflash/common.h | 4 + drivers/scsi/cxlflash/lunmgt.c | 3 + drivers/scsi/cxlflash/main.c | 13 + drivers/scsi/cxlflash/sislite.h | 20 +- drivers/scsi/cxlflash/superpipe.c | 82 ++- drivers/scsi/cxlflash/superpipe.h | 17 +- drivers/scsi/cxlflash/vlun.c | 1243 ++++++++++++++++++++++++++++++++++++ drivers/scsi/cxlflash/vlun.h | 86 +++ include/uapi/scsi/cxlflash_ioctl.h | 34 + 11 files changed, 1550 insertions(+), 17 deletions(-) create mode 100644 drivers/scsi/cxlflash/vlun.c create mode 100644 drivers/scsi/cxlflash/vlun.h (limited to 'include/uapi') diff --git a/Documentation/powerpc/cxlflash.txt b/Documentation/powerpc/cxlflash.txt index f943967f90ce..4202d1bc583c 100644 --- a/Documentation/powerpc/cxlflash.txt +++ b/Documentation/powerpc/cxlflash.txt @@ -163,7 +163,8 @@ DK_CXLFLASH_ATTACH - These tokens are only valid for the process under which they were created. The child of a forked process cannot continue - to use the context id or file descriptor created by its parent. + to use the context id or file descriptor created by its parent + (see DK_CXLFLASH_VLUN_CLONE for further details). - These tokens are only valid for the lifetime of the context and the process under which they were created. Once either is @@ -193,6 +194,45 @@ DK_CXLFLASH_USER_DIRECT treated as a resource handle that is returned to the user. The user is then able to use the handle to reference the LUN during I/O. +DK_CXLFLASH_USER_VIRTUAL +------------------------ + This ioctl is responsible for transitioning the LUN to virtual mode + of access and configuring the AFU for virtual access from user space + on a per-context basis. Additionally, the block size and last logical + block address (LBA) are returned to the user. 
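+
+    A minimal sketch of the call as described above (editorial example,
+    not part of the original patch; assumes fd is open on the CXL flash
+    block device, ctx_id came from a prior DK_CXLFLASH_ATTACH, and the
+    create_vlun() helper name is made up for illustration):
+
+        #include <string.h>
+        #include <sys/ioctl.h>
+        #include <scsi/cxlflash_ioctl.h>
+
+        static int create_vlun(int fd, __u64 ctx_id, __u64 nblks_4k,
+                               __u64 *rhndl, __u64 *last_lba)
+        {
+                struct dk_cxlflash_uvirtual virt;
+
+                memset(&virt, 0, sizeof(virt));
+                virt.hdr.version = DK_CXLFLASH_VERSION_0;
+                virt.context_id = ctx_id;   /* opaque cookie from ATTACH */
+                virt.lun_size = nblks_4k;   /* 4K blocks; 0 = empty vlun */
+
+                if (ioctl(fd, DK_CXLFLASH_USER_VIRTUAL, &virt))
+                        return -1;
+
+                *rhndl = virt.rsrc_handle;  /* handle for subsequent I/O */
+                *last_lba = virt.last_lba;  /* -1 while no storage backs it */
+                return 0;
+        }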
+ + As mentioned previously, when operating in user space access mode, + LUNs may be accessed in whole or in part. Only one mode is allowed + at a time and if one mode is active (outstanding references exist), + requests to use the LUN in a different mode are denied. + + The AFU is configured for virtual access from user space by adding + an entry to the AFU's resource handle table. The index of the entry + is treated as a resource handle that is returned to the user. The + user is then able to use the handle to reference the LUN during I/O. + + By default, the virtual LUN is created with a size of 0. The user + would need to use the DK_CXLFLASH_VLUN_RESIZE ioctl to grow + the virtual LUN to a desired size. To avoid having to perform this + resize for the initial creation of the virtual LUN, the user has the + option of specifying a size as part of the DK_CXLFLASH_USER_VIRTUAL + ioctl, such that when success is returned to the user, the + resource handle that is provided is already referencing provisioned + storage. This is reflected by the last LBA being a non-zero value. + +DK_CXLFLASH_VLUN_RESIZE +----------------------- + This ioctl is responsible for resizing a previously created virtual + LUN and will fail if invoked upon a LUN that is not in virtual + mode. Upon success, an updated last LBA is returned to the user + indicating the new size of the virtual LUN associated with the + resource handle. + + The partitioning of virtual LUNs is jointly mediated by the cxlflash + driver and the AFU. An allocation table is kept for each LUN that is + operating in the virtual mode and used to program a LUN translation + table that the AFU references when provided with a resource handle. + DK_CXLFLASH_RELEASE ------------------- This ioctl is responsible for releasing a previously obtained @@ -214,6 +254,27 @@ DK_CXLFLASH_DETACH success, all "tokens" which had been provided to the user from the DK_CXLFLASH_ATTACH onward are no longer valid. +DK_CXLFLASH_VLUN_CLONE +---------------------- + This ioctl is responsible for cloning a previously created + context to a more recently created context. It exists solely to + support maintaining user space access to storage after a process + forks. Upon success, the child process (which invoked the ioctl) + will have access to the same LUNs via the same resource handle(s) + and fd2 as the parent, but under a different context. + + Context sharing across processes is not supported with CXL and + therefore each fork must be met with establishing a new context + for the child process. This ioctl simplifies the state management + and playback required by a user in such a scenario. When a process + forks, the child process can clone the parent's context by first creating + a context (via DK_CXLFLASH_ATTACH) and then using this ioctl to + perform the clone from the parent to the child. + + The clone itself is fairly simple. The resource handle and lun + translation tables are copied from the parent context to the child's + and then synced with the AFU. 
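+
+    A minimal sketch of the post-fork sequence described above (editorial
+    example, not part of the original patch; the context_id_src field
+    name is an assumption, as only context_id_dst appears in the hunks
+    shown here, and interrupt setup is omitted for brevity):
+
+        #include <fcntl.h>
+        #include <string.h>
+        #include <sys/ioctl.h>
+        #include <scsi/cxlflash_ioctl.h>
+
+        /* Runs in the child, immediately after fork(). */
+        static int clone_after_fork(int fd, __u64 parent_ctx,
+                                    __u64 *child_ctx)
+        {
+                struct dk_cxlflash_attach attach;
+                struct dk_cxlflash_clone clone;
+
+                /* 1. Establish a fresh context for the child. */
+                memset(&attach, 0, sizeof(attach));
+                attach.hdr.version = DK_CXLFLASH_VERSION_0;
+                attach.hdr.flags = O_RDWR;     /* access mode, per header */
+                if (ioctl(fd, DK_CXLFLASH_ATTACH, &attach))
+                        return -1;
+
+                /* 2. Replay the parent's resources onto the new context. */
+                memset(&clone, 0, sizeof(clone));
+                clone.hdr.version = DK_CXLFLASH_VERSION_0;
+                clone.context_id_src = parent_ctx;       /* assumed name */
+                clone.context_id_dst = attach.context_id;
+                if (ioctl(fd, DK_CXLFLASH_VLUN_CLONE, &clone))
+                        return -1;
+
+                *child_ctx = attach.context_id;
+                return 0;
+        }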
+ DK_CXLFLASH_VERIFY ------------------ This ioctl is used to detect various changes such as the capacity of diff --git a/drivers/scsi/cxlflash/Makefile b/drivers/scsi/cxlflash/Makefile index c14d24c720d6..9e39866d473b 100644 --- a/drivers/scsi/cxlflash/Makefile +++ b/drivers/scsi/cxlflash/Makefile @@ -1,2 +1,2 @@ obj-$(CONFIG_CXLFLASH) += cxlflash.o -cxlflash-y += main.o superpipe.o lunmgt.o +cxlflash-y += main.o superpipe.o lunmgt.o vlun.o diff --git a/drivers/scsi/cxlflash/common.h b/drivers/scsi/cxlflash/common.h index d3e54e61c7a5..1c56037146e1 100644 --- a/drivers/scsi/cxlflash/common.h +++ b/drivers/scsi/cxlflash/common.h @@ -116,6 +116,9 @@ struct cxlflash_cfg { atomic_t num_user_contexts; + /* Parameters that are LUN table related */ + int last_lun_index[CXLFLASH_NUM_FC_PORTS]; + int promote_lun_index; struct list_head lluns; /* list of llun_info structs */ wait_queue_head_t tmf_waitq; @@ -200,5 +203,6 @@ int cxlflash_ioctl(struct scsi_device *, int, void __user *); void cxlflash_stop_term_user_contexts(struct cxlflash_cfg *); int cxlflash_mark_contexts_error(struct cxlflash_cfg *); void cxlflash_term_local_luns(struct cxlflash_cfg *); +void cxlflash_restore_luntable(struct cxlflash_cfg *); #endif /* ifndef _CXLFLASH_COMMON_H */ diff --git a/drivers/scsi/cxlflash/lunmgt.c b/drivers/scsi/cxlflash/lunmgt.c index 66d5bef11ee6..d98ad0ff64c1 100644 --- a/drivers/scsi/cxlflash/lunmgt.c +++ b/drivers/scsi/cxlflash/lunmgt.c @@ -20,6 +20,7 @@ #include "sislite.h" #include "common.h" +#include "vlun.h" #include "superpipe.h" /** @@ -42,6 +43,7 @@ static struct llun_info *create_local(struct scsi_device *sdev, u8 *wwid) lli->sdev = sdev; lli->newly_created = true; lli->host_no = sdev->host->host_no; + lli->in_table = false; memcpy(lli->wwid, wwid, DK_CXLFLASH_MANAGE_LUN_WWID_LEN); out: @@ -208,6 +210,7 @@ void cxlflash_term_global_luns(void) mutex_lock(&global.mutex); list_for_each_entry_safe(gli, temp, &global.gluns, list) { list_del(&gli->list); + cxlflash_ba_terminate(&gli->blka.ba_lun); kfree(gli); } mutex_unlock(&global.mutex); diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c index 02d464f41b7f..458ed838f83a 100644 --- a/drivers/scsi/cxlflash/main.c +++ b/drivers/scsi/cxlflash/main.c @@ -1989,6 +1989,8 @@ static int init_afu(struct cxlflash_cfg *cfg) afu_err_intr_init(cfg->afu); atomic64_set(&afu->room, readq_be(&afu->host_map->cmd_room)); + /* Restore the LUN mappings */ + cxlflash_restore_luntable(cfg); err1: pr_debug("%s: returning rc=%d\n", __func__, rc); return rc; @@ -2286,6 +2288,17 @@ static int cxlflash_probe(struct pci_dev *pdev, cfg->init_state = INIT_STATE_NONE; cfg->dev = pdev; + + /* + * The promoted LUNs move to the top of the LUN table. The rest stay + * on the bottom half. The bottom half grows from the end + * (index = 255), whereas the top half grows from the beginning + * (index = 0). 
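+ * + * (CXLFLASH_NUM_VLUNS/2 - 1 below evaluates to the 255 mentioned above; + * the halves grow toward each other and LUN table allocation fails with + * -ENOSPC once the indices meet.)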
+ */ + cfg->promote_lun_index = 0; + cfg->last_lun_index[0] = CXLFLASH_NUM_VLUNS/2 - 1; + cfg->last_lun_index[1] = CXLFLASH_NUM_VLUNS/2 - 1; + cfg->dev_id = (struct pci_device_id *)dev_id; cfg->mcctx = NULL; diff --git a/drivers/scsi/cxlflash/sislite.h b/drivers/scsi/cxlflash/sislite.h index 66b889151a4c..63bf394fe78c 100644 --- a/drivers/scsi/cxlflash/sislite.h +++ b/drivers/scsi/cxlflash/sislite.h @@ -397,16 +397,17 @@ struct cxlflash_afu_map { }; }; -/* LBA translation control blocks */ - +/* + * LXT - LBA Translation Table + * LXT control blocks + */ struct sisl_lxt_entry { u64 rlba_base; /* bits 0:47 is base - * b48:55 is lun index - * b58:59 is write & read perms - * (if no perm, afu_rc=0x15) - * b60:63 is port_sel mask - */ - + * b48:55 is lun index + * b58:59 is write & read perms + * (if no perm, afu_rc=0x15) + * b60:63 is port_sel mask + */ }; /* @@ -465,4 +466,7 @@ struct sisl_rht_entry_f1 { #define TMF_LUN_RESET 0x1U #define TMF_CLEAR_ACA 0x2U + +#define SISLITE_MAX_WS_BLOCKS 512 + #endif /* _SISLITE_H */ diff --git a/drivers/scsi/cxlflash/superpipe.c b/drivers/scsi/cxlflash/superpipe.c index 3c8bce8bbb0b..f1b62cea75b1 100644 --- a/drivers/scsi/cxlflash/superpipe.c +++ b/drivers/scsi/cxlflash/superpipe.c @@ -26,10 +26,24 @@ #include "sislite.h" #include "common.h" +#include "vlun.h" #include "superpipe.h" struct cxlflash_global global; +/** + * marshal_rele_to_resize() - translate release to resize structure + * @rele: Source structure from which to translate/copy. + * @resize: Destination structure for the translate/copy. + */ +static void marshal_rele_to_resize(struct dk_cxlflash_release *release, + struct dk_cxlflash_resize *resize) +{ + resize->hdr = release->hdr; + resize->context_id = release->context_id; + resize->rsrc_handle = release->rsrc_handle; +} + /** * marshal_det_to_rele() - translate detach to release structure * @detach: Destination structure for the translate/copy. @@ -449,6 +463,7 @@ void rhte_checkin(struct ctx_info *ctxi, rhte->fp = 0; ctxi->rht_out--; ctxi->rht_lun[rsrc_handle] = NULL; + ctxi->rht_needs_ws[rsrc_handle] = false; } /** @@ -526,13 +541,21 @@ out: /** * cxlflash_lun_detach() - detaches a user from a LUN and resets the LUN's mode * @gli: LUN to detach. + * + * When resetting the mode, terminate block allocation resources as they + * are no longer required (service is safe to call even when block allocation + * resources were not present - such as when transitioning from physical mode). + * These resources will be reallocated when needed (subsequent transition to + * virtual mode). */ void cxlflash_lun_detach(struct glun_info *gli) { mutex_lock(&gli->mutex); WARN_ON(gli->mode == MODE_NONE); - if (--gli->users == 0) + if (--gli->users == 0) { gli->mode = MODE_NONE; + cxlflash_ba_terminate(&gli->blka.ba_lun); + } pr_debug("%s: gli->users=%u\n", __func__, gli->users); WARN_ON(gli->users < 0); mutex_unlock(&gli->mutex); @@ -544,10 +567,12 @@ void cxlflash_lun_detach(struct glun_info *gli) * @ctxi: Context owning resources. * @release: Release ioctl data structure. * - * Note that the AFU sync should _not_ be performed when the context is sitting - * on the error recovery list. A context on the error recovery list is not known - * to the AFU due to reset. When the context is recovered, it will be reattached - * and made known again to the AFU. + * For LUNs in virtual mode, the virtual LUN associated with the specified + * resource handle is resized to 0 prior to releasing the RHTE. 
Note that the + * AFU sync should _not_ be performed when the context is sitting on the error + * recovery list. A context on the error recovery list is not known to the AFU + * due to reset. When the context is recovered, it will be reattached and made + * known again to the AFU. * * Return: 0 on success, -errno on failure */ @@ -562,6 +587,7 @@ int _cxlflash_disk_release(struct scsi_device *sdev, struct afu *afu = cfg->afu; bool put_ctx = false; + struct dk_cxlflash_resize size; res_hndl_t rhndl = release->rsrc_handle; int rc = 0; @@ -594,7 +620,24 @@ int _cxlflash_disk_release(struct scsi_device *sdev, goto out; } + /* + * Resize to 0 for virtual LUNS by setting the size + * to 0. This will clear LXT_START and LXT_CNT fields + * in the RHT entry and properly sync with the AFU. + * + * Afterwards we clear the remaining fields. + */ switch (gli->mode) { + case MODE_VIRTUAL: + marshal_rele_to_resize(release, &size); + size.req_size = 0; + rc = _cxlflash_vlun_resize(sdev, ctxi, &size); + if (rc) { + dev_dbg(dev, "%s: resize failed rc %d\n", __func__, rc); + goto out; + } + + break; case MODE_PHYSICAL: /* * Clear the Format 1 RHT entry for direct access @@ -666,6 +709,7 @@ static void destroy_context(struct cxlflash_cfg *cfg, /* Free memory associated with context */ free_page((ulong)ctxi->rht_start); + kfree(ctxi->rht_needs_ws); kfree(ctxi->rht_lun); kfree(ctxi); atomic_dec_if_positive(&cfg->num_user_contexts); @@ -693,11 +737,13 @@ static struct ctx_info *create_context(struct cxlflash_cfg *cfg, struct afu *afu = cfg->afu; struct ctx_info *ctxi = NULL; struct llun_info **lli = NULL; + bool *ws = NULL; struct sisl_rht_entry *rhte; ctxi = kzalloc(sizeof(*ctxi), GFP_KERNEL); lli = kzalloc((MAX_RHT_PER_CONTEXT * sizeof(*lli)), GFP_KERNEL); - if (unlikely(!ctxi || !lli)) { + ws = kzalloc((MAX_RHT_PER_CONTEXT * sizeof(*ws)), GFP_KERNEL); + if (unlikely(!ctxi || !lli || !ws)) { dev_err(dev, "%s: Unable to allocate context!\n", __func__); goto err; } @@ -709,6 +755,7 @@ static struct ctx_info *create_context(struct cxlflash_cfg *cfg, } ctxi->rht_lun = lli; + ctxi->rht_needs_ws = ws; ctxi->rht_start = rhte; ctxi->rht_perms = perms; @@ -728,6 +775,7 @@ out: return ctxi; err: + kfree(ws); kfree(lli); kfree(ctxi); ctxi = NULL; @@ -1729,6 +1777,12 @@ static int cxlflash_disk_verify(struct scsi_device *sdev, case MODE_PHYSICAL: last_lba = gli->max_lba; break; + case MODE_VIRTUAL: + /* Cast lxt_cnt to u64 for multiply to be treated as 64bit op */ + last_lba = ((u64)rhte->lxt_cnt * MC_CHUNK_SIZE * gli->blk_len); + last_lba /= CXLFLASH_BLOCK_SIZE; + last_lba--; + break; default: WARN(1, "Unsupported LUN mode!"); } @@ -1756,12 +1810,18 @@ static char *decode_ioctl(int cmd) return __stringify_1(DK_CXLFLASH_ATTACH); case DK_CXLFLASH_USER_DIRECT: return __stringify_1(DK_CXLFLASH_USER_DIRECT); + case DK_CXLFLASH_USER_VIRTUAL: + return __stringify_1(DK_CXLFLASH_USER_VIRTUAL); + case DK_CXLFLASH_VLUN_RESIZE: + return __stringify_1(DK_CXLFLASH_VLUN_RESIZE); case DK_CXLFLASH_RELEASE: return __stringify_1(DK_CXLFLASH_RELEASE); case DK_CXLFLASH_DETACH: return __stringify_1(DK_CXLFLASH_DETACH); case DK_CXLFLASH_VERIFY: return __stringify_1(DK_CXLFLASH_VERIFY); + case DK_CXLFLASH_VLUN_CLONE: + return __stringify_1(DK_CXLFLASH_VLUN_CLONE); case DK_CXLFLASH_RECOVER_AFU: return __stringify_1(DK_CXLFLASH_RECOVER_AFU); case DK_CXLFLASH_MANAGE_LUN: @@ -1876,6 +1936,7 @@ static int ioctl_common(struct scsi_device *sdev, int cmd) rc = check_state(cfg); if (unlikely(rc) && (cfg->state == STATE_FAILTERM)) { switch (cmd) { + case 
DK_CXLFLASH_VLUN_RESIZE: case DK_CXLFLASH_RELEASE: case DK_CXLFLASH_DETACH: dev_dbg(dev, "%s: Command override! (%d)\n", @@ -1923,12 +1984,18 @@ int cxlflash_ioctl(struct scsi_device *sdev, int cmd, void __user *arg) {sizeof(struct dk_cxlflash_verify), (sioctl)cxlflash_disk_verify}, {sizeof(struct dk_cxlflash_recover_afu), (sioctl)cxlflash_afu_recover}, {sizeof(struct dk_cxlflash_manage_lun), (sioctl)cxlflash_manage_lun}, + {sizeof(struct dk_cxlflash_uvirtual), cxlflash_disk_virtual_open}, + {sizeof(struct dk_cxlflash_resize), (sioctl)cxlflash_vlun_resize}, + {sizeof(struct dk_cxlflash_clone), (sioctl)cxlflash_disk_clone}, }; /* Restrict command set to physical support only for internal LUN */ if (afu->internal_lun) switch (cmd) { case DK_CXLFLASH_RELEASE: + case DK_CXLFLASH_USER_VIRTUAL: + case DK_CXLFLASH_VLUN_RESIZE: + case DK_CXLFLASH_VLUN_CLONE: dev_dbg(dev, "%s: %s not supported for lun_mode=%d\n", __func__, decode_ioctl(cmd), afu->internal_lun); rc = -EINVAL; @@ -1942,6 +2009,9 @@ int cxlflash_ioctl(struct scsi_device *sdev, int cmd, void __user *arg) case DK_CXLFLASH_DETACH: case DK_CXLFLASH_VERIFY: case DK_CXLFLASH_RECOVER_AFU: + case DK_CXLFLASH_USER_VIRTUAL: + case DK_CXLFLASH_VLUN_RESIZE: + case DK_CXLFLASH_VLUN_CLONE: dev_dbg(dev, "%s: %s (%08X) on dev(%d/%d/%d/%llu)\n", __func__, decode_ioctl(cmd), cmd, shost->host_no, sdev->channel, sdev->id, sdev->lun); diff --git a/drivers/scsi/cxlflash/superpipe.h b/drivers/scsi/cxlflash/superpipe.h index ae39b9627118..d7dc88bc64a4 100644 --- a/drivers/scsi/cxlflash/superpipe.h +++ b/drivers/scsi/cxlflash/superpipe.h @@ -31,9 +31,11 @@ extern struct cxlflash_global global; #define MC_DISCOVERY_TIMEOUT 5 /* 5 secs */ #define CHAN2PORT(_x) ((_x) + 1) +#define PORT2CHAN(_x) ((_x) - 1) enum lun_mode { MODE_NONE = 0, + MODE_VIRTUAL, MODE_PHYSICAL }; @@ -41,13 +43,14 @@ enum lun_mode { struct glun_info { u64 max_lba; /* from read cap(16) */ u32 blk_len; /* from read cap(16) */ - enum lun_mode mode; /* NONE, PHYSICAL */ + enum lun_mode mode; /* NONE, VIRTUAL, PHYSICAL */ int users; /* Number of users w/ references to LUN */ u8 wwid[16]; struct mutex mutex; + struct blka blka; struct list_head list; }; @@ -58,6 +61,7 @@ struct llun_info { u32 host_no; /* host_no from Scsi_host */ u32 port_sel; /* What port to use for this LUN */ bool newly_created; /* Whether the LUN was just discovered */ + bool in_table; /* Whether a LUN table entry was created */ u8 wwid[16]; /* Keep a duplicate copy here? 
*/ @@ -90,6 +94,7 @@ struct ctx_info { u32 rht_out; /* Number of checked out RHT entries */ u32 rht_perms; /* User-defined permissions for RHT entries */ struct llun_info **rht_lun; /* Mapping of RHT entries to LUNs */ + bool *rht_needs_ws; /* User-desired write-same function per RHTE */ struct cxl_ioctl_start_work work; u64 ctxid; @@ -111,10 +116,18 @@ struct cxlflash_global { struct page *err_page; /* One page of all 0xF for error notification */ }; +int cxlflash_vlun_resize(struct scsi_device *, struct dk_cxlflash_resize *); +int _cxlflash_vlun_resize(struct scsi_device *, struct ctx_info *, + struct dk_cxlflash_resize *); + int cxlflash_disk_release(struct scsi_device *, struct dk_cxlflash_release *); int _cxlflash_disk_release(struct scsi_device *, struct ctx_info *, struct dk_cxlflash_release *); +int cxlflash_disk_clone(struct scsi_device *, struct dk_cxlflash_clone *); + +int cxlflash_disk_virtual_open(struct scsi_device *, void *); + int cxlflash_lun_attach(struct glun_info *, enum lun_mode, bool); void cxlflash_lun_detach(struct glun_info *); @@ -127,6 +140,8 @@ struct sisl_rht_entry *get_rhte(struct ctx_info *, res_hndl_t, struct sisl_rht_entry *rhte_checkout(struct ctx_info *, struct llun_info *); void rhte_checkin(struct ctx_info *, struct sisl_rht_entry *); +void cxlflash_ba_terminate(struct ba_lun *); + int cxlflash_manage_lun(struct scsi_device *, struct dk_cxlflash_manage_lun *); #endif /* ifndef _CXLFLASH_SUPERPIPE_H */ diff --git a/drivers/scsi/cxlflash/vlun.c b/drivers/scsi/cxlflash/vlun.c new file mode 100644 index 000000000000..6155cb1d4ed3 --- /dev/null +++ b/drivers/scsi/cxlflash/vlun.c @@ -0,0 +1,1243 @@ +/* + * CXL Flash Device Driver + * + * Written by: Manoj N. Kumar , IBM Corporation + * Matthew R. Ochs , IBM Corporation + * + * Copyright (C) 2015 IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include "sislite.h" +#include "common.h" +#include "vlun.h" +#include "superpipe.h" + +/** + * marshal_virt_to_resize() - translate uvirtual to resize structure + * @virt: Source structure from which to translate/copy. + * @resize: Destination structure for the translate/copy. + */ +static void marshal_virt_to_resize(struct dk_cxlflash_uvirtual *virt, + struct dk_cxlflash_resize *resize) +{ + resize->hdr = virt->hdr; + resize->context_id = virt->context_id; + resize->rsrc_handle = virt->rsrc_handle; + resize->req_size = virt->lun_size; + resize->last_lba = virt->last_lba; +} + +/** + * marshal_clone_to_rele() - translate clone to release structure + * @clone: Source structure from which to translate/copy. + * @rele: Destination structure for the translate/copy. + */ +static void marshal_clone_to_rele(struct dk_cxlflash_clone *clone, + struct dk_cxlflash_release *release) +{ + release->hdr = clone->hdr; + release->context_id = clone->context_id_dst; +} + +/** + * ba_init() - initializes a block allocator + * @ba_lun: Block allocator to initialize. 
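+ * + * One bit is kept per allocation unit (AU). The map starts out all ones + * (every AU free) and any excess bits in a partial last word are + * pre-cleared so they can never be handed out.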
+ * + * Return: 0 on success, -errno on failure + */ +static int ba_init(struct ba_lun *ba_lun) +{ + struct ba_lun_info *bali = NULL; + int lun_size_au = 0, i = 0; + int last_word_underflow = 0; + u64 *lam; + + pr_debug("%s: Initializing LUN: lun_id = %llX, " + "ba_lun->lsize = %lX, ba_lun->au_size = %lX\n", + __func__, ba_lun->lun_id, ba_lun->lsize, ba_lun->au_size); + + /* Calculate bit map size */ + lun_size_au = ba_lun->lsize / ba_lun->au_size; + if (lun_size_au == 0) { + pr_debug("%s: Requested LUN size of 0!\n", __func__); + return -EINVAL; + } + + /* Allocate lun information container */ + bali = kzalloc(sizeof(struct ba_lun_info), GFP_KERNEL); + if (unlikely(!bali)) { + pr_err("%s: Failed to allocate lun_info for lun_id %llX\n", + __func__, ba_lun->lun_id); + return -ENOMEM; + } + + bali->total_aus = lun_size_au; + bali->lun_bmap_size = lun_size_au / BITS_PER_LONG; + + if (lun_size_au % BITS_PER_LONG) + bali->lun_bmap_size++; + + /* Allocate bitmap space */ + bali->lun_alloc_map = kzalloc((bali->lun_bmap_size * sizeof(u64)), + GFP_KERNEL); + if (unlikely(!bali->lun_alloc_map)) { + pr_err("%s: Failed to allocate lun allocation map: " + "lun_id = %llX\n", __func__, ba_lun->lun_id); + kfree(bali); + return -ENOMEM; + } + + /* Initialize the bit map size and set all bits to '1' */ + bali->free_aun_cnt = lun_size_au; + + for (i = 0; i < bali->lun_bmap_size; i++) + bali->lun_alloc_map[i] = 0xFFFFFFFFFFFFFFFFULL; + + /* If the last word not fully utilized, mark extra bits as allocated */ + last_word_underflow = (bali->lun_bmap_size * BITS_PER_LONG); + last_word_underflow -= bali->free_aun_cnt; + if (last_word_underflow > 0) { + lam = &bali->lun_alloc_map[bali->lun_bmap_size - 1]; + for (i = (HIBIT - last_word_underflow + 1); + i < BITS_PER_LONG; + i++) + clear_bit(i, (ulong *)lam); + } + + /* Initialize high elevator index, low/curr already at 0 from kzalloc */ + bali->free_high_idx = bali->lun_bmap_size; + + /* Allocate clone map */ + bali->aun_clone_map = kzalloc((bali->total_aus * sizeof(u8)), + GFP_KERNEL); + if (unlikely(!bali->aun_clone_map)) { + pr_err("%s: Failed to allocate clone map: lun_id = %llX\n", + __func__, ba_lun->lun_id); + kfree(bali->lun_alloc_map); + kfree(bali); + return -ENOMEM; + } + + /* Pass the allocated lun info as a handle to the user */ + ba_lun->ba_lun_handle = bali; + + pr_debug("%s: Successfully initialized the LUN: " + "lun_id = %llX, bitmap size = %X, free_aun_cnt = %llX\n", + __func__, ba_lun->lun_id, bali->lun_bmap_size, + bali->free_aun_cnt); + return 0; +} + +/** + * find_free_range() - locates a free bit within the block allocator + * @low: First word in block allocator to start search. + * @high: Last word in block allocator to search. + * @bali: LUN information structure owning the block allocator to search. + * @bit_word: Passes back the word in the block allocator owning the free bit. 
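+ * + * Note the side effects: on success, the discovered bit is cleared and + * bali->free_aun_cnt is decremented before returning.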
+ * + * Return: The bit position within the passed back word, -1 on failure + */ +static int find_free_range(u32 low, + u32 high, + struct ba_lun_info *bali, int *bit_word) +{ + int i; + u64 bit_pos = -1; + ulong *lam, num_bits; + + for (i = low; i < high; i++) + if (bali->lun_alloc_map[i] != 0) { + lam = (ulong *)&bali->lun_alloc_map[i]; + num_bits = (sizeof(*lam) * BITS_PER_BYTE); + bit_pos = find_first_bit(lam, num_bits); + + pr_devel("%s: Found free bit %llX in lun " + "map entry %llX at bitmap index = %X\n", + __func__, bit_pos, bali->lun_alloc_map[i], + i); + + *bit_word = i; + bali->free_aun_cnt--; + clear_bit(bit_pos, lam); + break; + } + + return bit_pos; +} + +/** + * ba_alloc() - allocates a block from the block allocator + * @ba_lun: Block allocator from which to allocate a block. + * + * Return: The allocated block, -1 on failure + */ +static u64 ba_alloc(struct ba_lun *ba_lun) +{ + u64 bit_pos = -1; + int bit_word = 0; + struct ba_lun_info *bali = NULL; + + bali = ba_lun->ba_lun_handle; + + pr_debug("%s: Received block allocation request: " + "lun_id = %llX, free_aun_cnt = %llX\n", + __func__, ba_lun->lun_id, bali->free_aun_cnt); + + if (bali->free_aun_cnt == 0) { + pr_debug("%s: No space left on LUN: lun_id = %llX\n", + __func__, ba_lun->lun_id); + return -1ULL; + } + + /* Search to find a free entry, curr->high then low->curr */ + bit_pos = find_free_range(bali->free_curr_idx, + bali->free_high_idx, bali, &bit_word); + if (bit_pos == -1) { + bit_pos = find_free_range(bali->free_low_idx, + bali->free_curr_idx, + bali, &bit_word); + if (bit_pos == -1) { + pr_debug("%s: Could not find an allocation unit on LUN:" + " lun_id = %llX\n", __func__, ba_lun->lun_id); + return -1ULL; + } + } + + /* Update the free_curr_idx */ + if (bit_pos == HIBIT) + bali->free_curr_idx = bit_word + 1; + else + bali->free_curr_idx = bit_word; + + pr_debug("%s: Allocating AU number %llX, on lun_id %llX, " + "free_aun_cnt = %llX\n", __func__, + ((bit_word * BITS_PER_LONG) + bit_pos), ba_lun->lun_id, + bali->free_aun_cnt); + + return (u64) ((bit_word * BITS_PER_LONG) + bit_pos); +} + +/** + * validate_alloc() - validates the specified block has been allocated + * @ba_lun_info: LUN info owning the block allocator. + * @aun: Block to validate. + * + * Return: 0 on success, -1 on failure + */ +static int validate_alloc(struct ba_lun_info *bali, u64 aun) +{ + int idx = 0, bit_pos = 0; + + idx = aun / BITS_PER_LONG; + bit_pos = aun % BITS_PER_LONG; + + if (test_bit(bit_pos, (ulong *)&bali->lun_alloc_map[idx])) + return -1; + + return 0; +} + +/** + * ba_free() - frees a block from the block allocator + * @ba_lun: Block allocator from which to allocate a block. + * @to_free: Block to free. + * + * Return: 0 on success, -1 on failure + */ +static int ba_free(struct ba_lun *ba_lun, u64 to_free) +{ + int idx = 0, bit_pos = 0; + struct ba_lun_info *bali = NULL; + + bali = ba_lun->ba_lun_handle; + + if (validate_alloc(bali, to_free)) { + pr_debug("%s: The AUN %llX is not allocated on lun_id %llX\n", + __func__, to_free, ba_lun->lun_id); + return -1; + } + + pr_debug("%s: Received a request to free AU %llX on lun_id %llX, " + "free_aun_cnt = %llX\n", __func__, to_free, ba_lun->lun_id, + bali->free_aun_cnt); + + if (bali->aun_clone_map[to_free] > 0) { + pr_debug("%s: AUN %llX on lun_id %llX has been cloned. 
Clone " + "count = %X\n", __func__, to_free, ba_lun->lun_id, + bali->aun_clone_map[to_free]); + bali->aun_clone_map[to_free]--; + return 0; + } + + idx = to_free / BITS_PER_LONG; + bit_pos = to_free % BITS_PER_LONG; + + set_bit(bit_pos, (ulong *)&bali->lun_alloc_map[idx]); + bali->free_aun_cnt++; + + if (idx < bali->free_low_idx) + bali->free_low_idx = idx; + else if (idx > bali->free_high_idx) + bali->free_high_idx = idx; + + pr_debug("%s: Successfully freed AU at bit_pos %X, bit map index %X on " + "lun_id %llX, free_aun_cnt = %llX\n", __func__, bit_pos, idx, + ba_lun->lun_id, bali->free_aun_cnt); + + return 0; +} + +/** + * ba_clone() - Clone a chunk of the block allocation table + * @ba_lun: Block allocator from which to allocate a block. + * @to_free: Block to free. + * + * Return: 0 on success, -1 on failure + */ +static int ba_clone(struct ba_lun *ba_lun, u64 to_clone) +{ + struct ba_lun_info *bali = ba_lun->ba_lun_handle; + + if (validate_alloc(bali, to_clone)) { + pr_debug("%s: AUN %llX is not allocated on lun_id %llX\n", + __func__, to_clone, ba_lun->lun_id); + return -1; + } + + pr_debug("%s: Received a request to clone AUN %llX on lun_id %llX\n", + __func__, to_clone, ba_lun->lun_id); + + if (bali->aun_clone_map[to_clone] == MAX_AUN_CLONE_CNT) { + pr_debug("%s: AUN %llX on lun_id %llX hit max clones already\n", + __func__, to_clone, ba_lun->lun_id); + return -1; + } + + bali->aun_clone_map[to_clone]++; + + return 0; +} + +/** + * ba_space() - returns the amount of free space left in the block allocator + * @ba_lun: Block allocator. + * + * Return: Amount of free space in block allocator + */ +static u64 ba_space(struct ba_lun *ba_lun) +{ + struct ba_lun_info *bali = ba_lun->ba_lun_handle; + + return bali->free_aun_cnt; +} + +/** + * cxlflash_ba_terminate() - frees resources associated with the block allocator + * @ba_lun: Block allocator. + * + * Safe to call in a partially allocated state. + */ +void cxlflash_ba_terminate(struct ba_lun *ba_lun) +{ + struct ba_lun_info *bali = ba_lun->ba_lun_handle; + + if (bali) { + kfree(bali->aun_clone_map); + kfree(bali->lun_alloc_map); + kfree(bali); + ba_lun->ba_lun_handle = NULL; + } +} + +/** + * init_vlun() - initializes a LUN for virtual use + * @lun_info: LUN information structure that owns the block allocator. + * + * Return: 0 on success, -errno on failure + */ +static int init_vlun(struct llun_info *lli) +{ + int rc = 0; + struct glun_info *gli = lli->parent; + struct blka *blka = &gli->blka; + + memset(blka, 0, sizeof(*blka)); + mutex_init(&blka->mutex); + + /* LUN IDs are unique per port, save the index instead */ + blka->ba_lun.lun_id = lli->lun_index; + blka->ba_lun.lsize = gli->max_lba + 1; + blka->ba_lun.lba_size = gli->blk_len; + + blka->ba_lun.au_size = MC_CHUNK_SIZE; + blka->nchunk = blka->ba_lun.lsize / MC_CHUNK_SIZE; + + rc = ba_init(&blka->ba_lun); + if (unlikely(rc)) + pr_debug("%s: cannot init block_alloc, rc=%d\n", __func__, rc); + + pr_debug("%s: returning rc=%d lli=%p\n", __func__, rc, lli); + return rc; +} + +/** + * write_same16() - sends a SCSI WRITE_SAME16 (0) command to specified LUN + * @sdev: SCSI device associated with LUN. + * @lba: Logical block address to start write same. + * @nblks: Number of logical blocks to write same. 
+ * + * Return: 0 on success, -errno on failure + */ +static int write_same16(struct scsi_device *sdev, + u64 lba, + u32 nblks) +{ + u8 *cmd_buf = NULL; + u8 *scsi_cmd = NULL; + u8 *sense_buf = NULL; + int rc = 0; + int result = 0; + int ws_limit = SISLITE_MAX_WS_BLOCKS; + u64 offset = lba; + int left = nblks; + u32 tout = sdev->request_queue->rq_timeout; + struct cxlflash_cfg *cfg = (struct cxlflash_cfg *)sdev->host->hostdata; + struct device *dev = &cfg->dev->dev; + + cmd_buf = kzalloc(CMD_BUFSIZE, GFP_KERNEL); + scsi_cmd = kzalloc(MAX_COMMAND_SIZE, GFP_KERNEL); + sense_buf = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_KERNEL); + if (unlikely(!cmd_buf || !scsi_cmd || !sense_buf)) { + rc = -ENOMEM; + goto out; + } + + while (left > 0) { + + scsi_cmd[0] = WRITE_SAME_16; + put_unaligned_be64(offset, &scsi_cmd[2]); + put_unaligned_be32(ws_limit < left ? ws_limit : left, + &scsi_cmd[10]); + + result = scsi_execute(sdev, scsi_cmd, DMA_TO_DEVICE, cmd_buf, + CMD_BUFSIZE, sense_buf, tout, 5, 0, NULL); + if (result) { + dev_err_ratelimited(dev, "%s: command failed for " + "offset %lld result=0x%x\n", + __func__, offset, result); + rc = -EIO; + goto out; + } + left -= ws_limit; + offset += ws_limit; + } + +out: + kfree(cmd_buf); + kfree(scsi_cmd); + kfree(sense_buf); + pr_debug("%s: returning rc=%d\n", __func__, rc); + return rc; +} + +/** + * grow_lxt() - expands the translation table associated with the specified RHTE + * @afu: AFU associated with the host. + * @sdev: SCSI device associated with LUN. + * @ctxid: Context ID of context owning the RHTE. + * @rhndl: Resource handle associated with the RHTE. + * @rhte: Resource handle entry (RHTE). + * @new_size: Number of translation entries associated with RHTE. + * + * By design, this routine employs a 'best attempt' allocation and will + * truncate the requested size down if there is not sufficient space in + * the block allocator to satisfy the request but there does exist some + * amount of space. The user is made aware of this by returning the size + * allocated. + * + * Return: 0 on success, -errno on failure + */ +static int grow_lxt(struct afu *afu, + struct scsi_device *sdev, + ctx_hndl_t ctxid, + res_hndl_t rhndl, + struct sisl_rht_entry *rhte, + u64 *new_size) +{ + struct sisl_lxt_entry *lxt = NULL, *lxt_old = NULL; + struct llun_info *lli = sdev->hostdata; + struct glun_info *gli = lli->parent; + struct blka *blka = &gli->blka; + u32 av_size; + u32 ngrps, ngrps_old; + u64 aun; /* chunk# allocated by block allocator */ + u64 delta = *new_size - rhte->lxt_cnt; + u64 my_new_size; + int i, rc = 0; + + /* + * Check what is available in the block allocator before re-allocating + * LXT array. This is done up front under the mutex which must not be + * released until after allocation is complete. 
+ */ + mutex_lock(&blka->mutex); + av_size = ba_space(&blka->ba_lun); + if (unlikely(av_size <= 0)) { + pr_debug("%s: ba_space error: av_size %d\n", __func__, av_size); + mutex_unlock(&blka->mutex); + rc = -ENOSPC; + goto out; + } + + if (av_size < delta) + delta = av_size; + + lxt_old = rhte->lxt_start; + ngrps_old = LXT_NUM_GROUPS(rhte->lxt_cnt); + ngrps = LXT_NUM_GROUPS(rhte->lxt_cnt + delta); + + if (ngrps != ngrps_old) { + /* reallocate to fit new size */ + lxt = kzalloc((sizeof(*lxt) * LXT_GROUP_SIZE * ngrps), + GFP_KERNEL); + if (unlikely(!lxt)) { + mutex_unlock(&blka->mutex); + rc = -ENOMEM; + goto out; + } + + /* copy over all old entries */ + memcpy(lxt, lxt_old, (sizeof(*lxt) * rhte->lxt_cnt)); + } else + lxt = lxt_old; + + /* nothing can fail from now on */ + my_new_size = rhte->lxt_cnt + delta; + + /* add new entries to the end */ + for (i = rhte->lxt_cnt; i < my_new_size; i++) { + /* + * Due to the earlier check of available space, ba_alloc + * cannot fail here. If it did due to internal error, + * leave a rlba_base of -1u which will likely be a + * invalid LUN (too large). + */ + aun = ba_alloc(&blka->ba_lun); + if ((aun == -1ULL) || (aun >= blka->nchunk)) + pr_debug("%s: ba_alloc error: allocated chunk# %llX, " + "max %llX\n", __func__, aun, blka->nchunk - 1); + + /* select both ports, use r/w perms from RHT */ + lxt[i].rlba_base = ((aun << MC_CHUNK_SHIFT) | + (lli->lun_index << LXT_LUNIDX_SHIFT) | + (RHT_PERM_RW << LXT_PERM_SHIFT | + lli->port_sel)); + } + + mutex_unlock(&blka->mutex); + + /* + * The following sequence is prescribed in the SISlite spec + * for syncing up with the AFU when adding LXT entries. + */ + dma_wmb(); /* Make LXT updates are visible */ + + rhte->lxt_start = lxt; + dma_wmb(); /* Make RHT entry's LXT table update visible */ + + rhte->lxt_cnt = my_new_size; + dma_wmb(); /* Make RHT entry's LXT table size update visible */ + + cxlflash_afu_sync(afu, ctxid, rhndl, AFU_LW_SYNC); + + /* free old lxt if reallocated */ + if (lxt != lxt_old) + kfree(lxt_old); + *new_size = my_new_size; +out: + pr_debug("%s: returning rc=%d\n", __func__, rc); + return rc; +} + +/** + * shrink_lxt() - reduces translation table associated with the specified RHTE + * @afu: AFU associated with the host. + * @sdev: SCSI device associated with LUN. + * @rhndl: Resource handle associated with the RHTE. + * @rhte: Resource handle entry (RHTE). + * @ctxi: Context owning resources. + * @new_size: Number of translation entries associated with RHTE. 
+ * + * Return: 0 on success, -errno on failure + */ +static int shrink_lxt(struct afu *afu, + struct scsi_device *sdev, + res_hndl_t rhndl, + struct sisl_rht_entry *rhte, + struct ctx_info *ctxi, + u64 *new_size) +{ + struct sisl_lxt_entry *lxt, *lxt_old; + struct llun_info *lli = sdev->hostdata; + struct glun_info *gli = lli->parent; + struct blka *blka = &gli->blka; + ctx_hndl_t ctxid = DECODE_CTXID(ctxi->ctxid); + bool needs_ws = ctxi->rht_needs_ws[rhndl]; + bool needs_sync = !ctxi->err_recovery_active; + u32 ngrps, ngrps_old; + u64 aun; /* chunk# allocated by block allocator */ + u64 delta = rhte->lxt_cnt - *new_size; + u64 my_new_size; + int i, rc = 0; + + lxt_old = rhte->lxt_start; + ngrps_old = LXT_NUM_GROUPS(rhte->lxt_cnt); + ngrps = LXT_NUM_GROUPS(rhte->lxt_cnt - delta); + + if (ngrps != ngrps_old) { + /* Reallocate to fit new size unless new size is 0 */ + if (ngrps) { + lxt = kzalloc((sizeof(*lxt) * LXT_GROUP_SIZE * ngrps), + GFP_KERNEL); + if (unlikely(!lxt)) { + rc = -ENOMEM; + goto out; + } + + /* Copy over old entries that will remain */ + memcpy(lxt, lxt_old, + (sizeof(*lxt) * (rhte->lxt_cnt - delta))); + } else + lxt = NULL; + } else + lxt = lxt_old; + + /* Nothing can fail from now on */ + my_new_size = rhte->lxt_cnt - delta; + + /* + * The following sequence is prescribed in the SISlite spec + * for syncing up with the AFU when removing LXT entries. + */ + rhte->lxt_cnt = my_new_size; + dma_wmb(); /* Make RHT entry's LXT table size update visible */ + + rhte->lxt_start = lxt; + dma_wmb(); /* Make RHT entry's LXT table update visible */ + + if (needs_sync) + cxlflash_afu_sync(afu, ctxid, rhndl, AFU_HW_SYNC); + + if (needs_ws) { + /* + * Mark the context as unavailable, so that we can release + * the mutex safely. + */ + ctxi->unavail = true; + mutex_unlock(&ctxi->mutex); + } + + /* Free LBAs allocated to freed chunks */ + mutex_lock(&blka->mutex); + for (i = delta - 1; i >= 0; i--) { + /* Mask the higher 48 bits before shifting, even though + * it is a noop + */ + aun = (lxt_old[my_new_size + i].rlba_base & SISL_ASTATUS_MASK); + aun = (aun >> MC_CHUNK_SHIFT); + if (needs_ws) + write_same16(sdev, aun, MC_CHUNK_SIZE); + ba_free(&blka->ba_lun, aun); + } + mutex_unlock(&blka->mutex); + + if (needs_ws) { + /* Make the context visible again */ + mutex_lock(&ctxi->mutex); + ctxi->unavail = false; + } + + /* Free old lxt if reallocated */ + if (lxt != lxt_old) + kfree(lxt_old); + *new_size = my_new_size; +out: + pr_debug("%s: returning rc=%d\n", __func__, rc); + return rc; +} + +/** + * _cxlflash_vlun_resize() - changes the size of a virtual lun + * @sdev: SCSI device associated with LUN owning virtual LUN. + * @ctxi: Context owning resources. + * @resize: Resize ioctl data structure. + * + * On successful return, the user is informed of the new size (in blocks) + * of the virtual lun in last LBA format. When the size of the virtual + * lun is zero, the last LBA is reflected as -1. See comment in the + * prologue for _cxlflash_disk_release() regarding AFU syncs and contexts + * on the error recovery list. 
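+ * + * The requested size is expressed in 4K blocks and is rounded up to + * whole MC_CHUNK_SIZE translation entries before the LXT is grown or + * shrunk.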
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int _cxlflash_vlun_resize(struct scsi_device *sdev,
+ struct ctx_info *ctxi,
+ struct dk_cxlflash_resize *resize)
+{
+ struct cxlflash_cfg *cfg = (struct cxlflash_cfg *)sdev->host->hostdata;
+ struct llun_info *lli = sdev->hostdata;
+ struct glun_info *gli = lli->parent;
+ struct afu *afu = cfg->afu;
+ bool put_ctx = false;
+
+ res_hndl_t rhndl = resize->rsrc_handle;
+ u64 new_size;
+ u64 nsectors;
+ u64 ctxid = DECODE_CTXID(resize->context_id),
+ rctxid = resize->context_id;
+
+ struct sisl_rht_entry *rhte;
+
+ int rc = 0;
+
+ /*
+ * The requested size (req_size) is always assumed to be in 4k blocks,
+ * so we have to convert it here from 4k to chunk size.
+ */
+ nsectors = (resize->req_size * CXLFLASH_BLOCK_SIZE) / gli->blk_len;
+ new_size = DIV_ROUND_UP(nsectors, MC_CHUNK_SIZE);
+
+ pr_debug("%s: ctxid=%llu rhndl=0x%llx, req_size=0x%llx, "
+ "new_size=%llx\n", __func__, ctxid, resize->rsrc_handle,
+ resize->req_size, new_size);
+
+ if (unlikely(gli->mode != MODE_VIRTUAL)) {
+ pr_debug("%s: LUN mode does not support resize! (%d)\n",
+ __func__, gli->mode);
+ rc = -EINVAL;
+ goto out;
+
+ }
+
+ if (!ctxi) {
+ ctxi = get_context(cfg, rctxid, lli, CTX_CTRL_ERR_FALLBACK);
+ if (unlikely(!ctxi)) {
+ pr_debug("%s: Bad context! (%llu)\n", __func__, ctxid);
+ rc = -EINVAL;
+ goto out;
+ }
+
+ put_ctx = true;
+ }
+
+ rhte = get_rhte(ctxi, rhndl, lli);
+ if (unlikely(!rhte)) {
+ pr_debug("%s: Bad resource handle! (%u)\n", __func__, rhndl);
+ rc = -EINVAL;
+ goto out;
+ }
+
+ if (new_size > rhte->lxt_cnt)
+ rc = grow_lxt(afu, sdev, ctxid, rhndl, rhte, &new_size);
+ else if (new_size < rhte->lxt_cnt)
+ rc = shrink_lxt(afu, sdev, rhndl, rhte, ctxi, &new_size);
+
+ resize->hdr.return_flags = 0;
+ resize->last_lba = (new_size * MC_CHUNK_SIZE * gli->blk_len);
+ resize->last_lba /= CXLFLASH_BLOCK_SIZE;
+ resize->last_lba--;
+
+out:
+ if (put_ctx)
+ put_context(ctxi);
+ pr_debug("%s: resized to %lld returning rc=%d\n",
+ __func__, resize->last_lba, rc);
+ return rc;
+}
+
+int cxlflash_vlun_resize(struct scsi_device *sdev,
+ struct dk_cxlflash_resize *resize)
+{
+ return _cxlflash_vlun_resize(sdev, NULL, resize);
+}
+
+/**
+ * cxlflash_restore_luntable() - Restore LUN table to prior state
+ * @cfg: Internal structure associated with the host.
+ */
+void cxlflash_restore_luntable(struct cxlflash_cfg *cfg)
+{
+ struct llun_info *lli, *temp;
+ u32 chan;
+ u32 lind;
+ struct afu *afu = cfg->afu;
+ struct sisl_global_map *agm = &afu->afu_map->global;
+
+ mutex_lock(&global.mutex);
+
+ list_for_each_entry_safe(lli, temp, &cfg->lluns, list) {
+ if (!lli->in_table)
+ continue;
+
+ lind = lli->lun_index;
+
+ if (lli->port_sel == BOTH_PORTS) {
+ writeq_be(lli->lun_id[0], &agm->fc_port[0][lind]);
+ writeq_be(lli->lun_id[1], &agm->fc_port[1][lind]);
+ pr_debug("%s: Virtual LUN on slot %d id0=%llx, "
+ "id1=%llx\n", __func__, lind,
+ lli->lun_id[0], lli->lun_id[1]);
+ } else {
+ chan = PORT2CHAN(lli->port_sel);
+ writeq_be(lli->lun_id[chan], &agm->fc_port[chan][lind]);
+ pr_debug("%s: Virtual LUN on slot %d chan=%d, "
+ "id=%llx\n", __func__, lind, chan,
+ lli->lun_id[chan]);
+ }
+ }
+
+ mutex_unlock(&global.mutex);
+}
+
+/**
+ * init_luntable() - write an entry in the LUN table
+ * @cfg: Internal structure associated with the host.
+ * @lli: Per adapter LUN information structure.
+ *
+ * On successful return, a LUN table entry is created: at the top of the
+ * table for LUNs visible on both ports, and at the bottom for LUNs
+ * visible on only one port.
+ * + * Return: 0 on success, -errno on failure + */ +static int init_luntable(struct cxlflash_cfg *cfg, struct llun_info *lli) +{ + u32 chan; + u32 lind; + int rc = 0; + struct afu *afu = cfg->afu; + struct sisl_global_map *agm = &afu->afu_map->global; + + mutex_lock(&global.mutex); + + if (lli->in_table) + goto out; + + if (lli->port_sel == BOTH_PORTS) { + /* + * If this LUN is visible from both ports, we will put + * it in the top half of the LUN table. + */ + if ((cfg->promote_lun_index == cfg->last_lun_index[0]) || + (cfg->promote_lun_index == cfg->last_lun_index[1])) { + rc = -ENOSPC; + goto out; + } + + lind = lli->lun_index = cfg->promote_lun_index; + writeq_be(lli->lun_id[0], &agm->fc_port[0][lind]); + writeq_be(lli->lun_id[1], &agm->fc_port[1][lind]); + cfg->promote_lun_index++; + pr_debug("%s: Virtual LUN on slot %d id0=%llx, id1=%llx\n", + __func__, lind, lli->lun_id[0], lli->lun_id[1]); + } else { + /* + * If this LUN is visible only from one port, we will put + * it in the bottom half of the LUN table. + */ + chan = PORT2CHAN(lli->port_sel); + if (cfg->promote_lun_index == cfg->last_lun_index[chan]) { + rc = -ENOSPC; + goto out; + } + + lind = lli->lun_index = cfg->last_lun_index[chan]; + writeq_be(lli->lun_id[chan], &agm->fc_port[chan][lind]); + cfg->last_lun_index[chan]--; + pr_debug("%s: Virtual LUN on slot %d chan=%d, id=%llx\n", + __func__, lind, chan, lli->lun_id[chan]); + } + + lli->in_table = true; +out: + mutex_unlock(&global.mutex); + pr_debug("%s: returning rc=%d\n", __func__, rc); + return rc; +} + +/** + * cxlflash_disk_virtual_open() - open a virtual disk of specified size + * @sdev: SCSI device associated with LUN owning virtual LUN. + * @arg: UVirtual ioctl data structure. + * + * On successful return, the user is informed of the resource handle + * to be used to identify the virtual lun and the size (in blocks) of + * the virtual lun in last LBA format. When the size of the virtual lun + * is zero, the last LBA is reflected as -1. + * + * Return: 0 on success, -errno on failure + */ +int cxlflash_disk_virtual_open(struct scsi_device *sdev, void *arg) +{ + struct cxlflash_cfg *cfg = (struct cxlflash_cfg *)sdev->host->hostdata; + struct device *dev = &cfg->dev->dev; + struct llun_info *lli = sdev->hostdata; + struct glun_info *gli = lli->parent; + + struct dk_cxlflash_uvirtual *virt = (struct dk_cxlflash_uvirtual *)arg; + struct dk_cxlflash_resize resize; + + u64 ctxid = DECODE_CTXID(virt->context_id), + rctxid = virt->context_id; + u64 lun_size = virt->lun_size; + u64 last_lba = 0; + u64 rsrc_handle = -1; + + int rc = 0; + + struct ctx_info *ctxi = NULL; + struct sisl_rht_entry *rhte = NULL; + + pr_debug("%s: ctxid=%llu ls=0x%llx\n", __func__, ctxid, lun_size); + + mutex_lock(&gli->mutex); + if (gli->mode == MODE_NONE) { + /* Setup the LUN table and block allocator on first call */ + rc = init_luntable(cfg, lli); + if (rc) { + dev_err(dev, "%s: call to init_luntable failed " + "rc=%d!\n", __func__, rc); + goto err0; + } + + rc = init_vlun(lli); + if (rc) { + dev_err(dev, "%s: call to init_vlun failed rc=%d!\n", + __func__, rc); + rc = -ENOMEM; + goto err0; + } + } + + rc = cxlflash_lun_attach(gli, MODE_VIRTUAL, true); + if (unlikely(rc)) { + dev_err(dev, "%s: Failed to attach to LUN! (VIRTUAL)\n", + __func__); + goto err0; + } + mutex_unlock(&gli->mutex); + + ctxi = get_context(cfg, rctxid, lli, 0); + if (unlikely(!ctxi)) { + dev_err(dev, "%s: Bad context! 
 (%llu)\n", __func__, ctxid);
+ rc = -EINVAL;
+ goto err1;
+ }
+
+ rhte = rhte_checkout(ctxi, lli);
+ if (unlikely(!rhte)) {
+ dev_err(dev, "%s: too many opens for this context\n", __func__);
+ rc = -EMFILE; /* too many opens */
+ goto err1;
+ }
+
+ rsrc_handle = (rhte - ctxi->rht_start);
+
+ /* Populate RHT format 0 */
+ rhte->nmask = MC_RHT_NMASK;
+ rhte->fp = SISL_RHT_FP(0U, ctxi->rht_perms);
+
+ /* Resize even if requested size is 0 */
+ marshal_virt_to_resize(virt, &resize);
+ resize.rsrc_handle = rsrc_handle;
+ rc = _cxlflash_vlun_resize(sdev, ctxi, &resize);
+ if (rc) {
+ dev_err(dev, "%s: resize failed rc %d\n", __func__, rc);
+ goto err2;
+ }
+ last_lba = resize.last_lba;
+
+ if (virt->hdr.flags & DK_CXLFLASH_UVIRTUAL_NEED_WRITE_SAME)
+ ctxi->rht_needs_ws[rsrc_handle] = true;
+
+ virt->hdr.return_flags = 0;
+ virt->last_lba = last_lba;
+ virt->rsrc_handle = rsrc_handle;
+
+out:
+ if (likely(ctxi))
+ put_context(ctxi);
+ pr_debug("%s: returning handle 0x%llx rc=%d llba %lld\n",
+ __func__, rsrc_handle, rc, last_lba);
+ return rc;
+
+err2:
+ rhte_checkin(ctxi, rhte);
+err1:
+ cxlflash_lun_detach(gli);
+ goto out;
+err0:
+ /* Common cleanup for failures that occur before the LUN attach succeeds */
+ cxlflash_ba_terminate(&gli->blka.ba_lun);
+ mutex_unlock(&gli->mutex);
+ goto out;
+}
+
+/**
+ * clone_lxt() - copies translation tables from source to destination RHTE
+ * @afu: AFU associated with the host.
+ * @blka: Block allocator associated with LUN.
+ * @ctxid: Context ID of context owning the RHTE.
+ * @rhndl: Resource handle associated with the RHTE.
+ * @rhte: Destination resource handle entry (RHTE).
+ * @rhte_src: Source resource handle entry (RHTE).
+ *
+ * Return: 0 on success, -errno on failure
+ */
+static int clone_lxt(struct afu *afu,
+ struct blka *blka,
+ ctx_hndl_t ctxid,
+ res_hndl_t rhndl,
+ struct sisl_rht_entry *rhte,
+ struct sisl_rht_entry *rhte_src)
+{
+ struct sisl_lxt_entry *lxt;
+ u32 ngrps;
+ u64 aun; /* chunk# allocated by block allocator */
+ int i, j;
+
+ ngrps = LXT_NUM_GROUPS(rhte_src->lxt_cnt);
+
+ if (ngrps) {
+ /* allocate new LXTs for clone */
+ lxt = kzalloc((sizeof(*lxt) * LXT_GROUP_SIZE * ngrps),
+ GFP_KERNEL);
+ if (unlikely(!lxt))
+ return -ENOMEM;
+
+ /* copy over */
+ memcpy(lxt, rhte_src->lxt_start,
+ (sizeof(*lxt) * rhte_src->lxt_cnt));
+
+ /* clone the LBAs in block allocator via ref_cnt */
+ mutex_lock(&blka->mutex);
+ for (i = 0; i < rhte_src->lxt_cnt; i++) {
+ aun = (lxt[i].rlba_base >> MC_CHUNK_SHIFT);
+ if (ba_clone(&blka->ba_lun, aun) == -1ULL) {
+ /* free the clones already made */
+ for (j = 0; j < i; j++) {
+ aun = (lxt[j].rlba_base >>
+ MC_CHUNK_SHIFT);
+ ba_free(&blka->ba_lun, aun);
+ }
+
+ mutex_unlock(&blka->mutex);
+ kfree(lxt);
+ return -EIO;
+ }
+ }
+ mutex_unlock(&blka->mutex);
+ } else {
+ lxt = NULL;
+ }
+
+ /*
+ * The following sequence is prescribed in the SISlite spec
+ * for syncing up with the AFU when adding LXT entries.
+ */
+ dma_wmb(); /* Make LXT updates visible */
+
+ rhte->lxt_start = lxt;
+ dma_wmb(); /* Make RHT entry's LXT table update visible */
+
+ rhte->lxt_cnt = rhte_src->lxt_cnt;
+ dma_wmb(); /* Make RHT entry's LXT table size update visible */
+
+ cxlflash_afu_sync(afu, ctxid, rhndl, AFU_LW_SYNC);
+
+ pr_debug("%s: returning\n", __func__);
+ return 0;
+}
+
+/**
+ * cxlflash_disk_clone() - clone a context by making a snapshot of another
+ * @sdev: SCSI device associated with LUN owning virtual LUN.
+ * @clone: Clone ioctl data structure.
+ *
+ * This routine effectively performs the cxlflash_disk_open operation for
+ * each in-use virtual resource in the source context. Note that the
+ * destination context must be in a pristine state and cannot have any
+ * resource handles open at the time of the clone.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int cxlflash_disk_clone(struct scsi_device *sdev,
+ struct dk_cxlflash_clone *clone)
+{
+ struct cxlflash_cfg *cfg = (struct cxlflash_cfg *)sdev->host->hostdata;
+ struct llun_info *lli = sdev->hostdata;
+ struct glun_info *gli = lli->parent;
+ struct blka *blka = &gli->blka;
+ struct afu *afu = cfg->afu;
+ struct dk_cxlflash_release release = { { 0 }, 0 };
+
+ struct ctx_info *ctxi_src = NULL,
+ *ctxi_dst = NULL;
+ struct lun_access *lun_access_src, *lun_access_dst;
+ u32 perms;
+ u64 ctxid_src = DECODE_CTXID(clone->context_id_src),
+ ctxid_dst = DECODE_CTXID(clone->context_id_dst),
+ rctxid_src = clone->context_id_src,
+ rctxid_dst = clone->context_id_dst;
+ int adap_fd_src = clone->adap_fd_src;
+ int i, j;
+ int rc = 0;
+ bool found;
+ LIST_HEAD(sidecar);
+
+ pr_debug("%s: ctxid_src=%llu ctxid_dst=%llu adap_fd_src=%d\n",
+ __func__, ctxid_src, ctxid_dst, adap_fd_src);
+
+ /* Do not clone yourself */
+ if (unlikely(rctxid_src == rctxid_dst)) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ if (unlikely(gli->mode != MODE_VIRTUAL)) {
+ rc = -EINVAL;
+ pr_debug("%s: Clone not supported on physical LUNs! (%d)\n",
+ __func__, gli->mode);
+ goto out;
+ }
+
+ ctxi_src = get_context(cfg, rctxid_src, lli, CTX_CTRL_CLONE);
+ ctxi_dst = get_context(cfg, rctxid_dst, lli, 0);
+ if (unlikely(!ctxi_src || !ctxi_dst)) {
+ pr_debug("%s: Bad context! (%llu,%llu)\n", __func__,
+ ctxid_src, ctxid_dst);
+ rc = -EINVAL;
+ goto out;
+ }
+
+ if (unlikely(adap_fd_src != ctxi_src->lfd)) {
+ pr_debug("%s: Invalid source adapter fd! (%d)\n",
+ __func__, adap_fd_src);
+ rc = -EINVAL;
+ goto out;
+ }
+
+ /* Verify there is no open resource handle in the destination context */
+ for (i = 0; i < MAX_RHT_PER_CONTEXT; i++)
+ if (ctxi_dst->rht_start[i].nmask != 0) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ /* Clone LUN access list */
+ list_for_each_entry(lun_access_src, &ctxi_src->luns, list) {
+ found = false;
+ list_for_each_entry(lun_access_dst, &ctxi_dst->luns, list)
+ if (lun_access_dst->sdev == lun_access_src->sdev) {
+ found = true;
+ break;
+ }
+
+ if (!found) {
+ lun_access_dst = kzalloc(sizeof(*lun_access_dst),
+ GFP_KERNEL);
+ if (unlikely(!lun_access_dst)) {
+ pr_err("%s: Unable to allocate lun_access!\n",
+ __func__);
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ *lun_access_dst = *lun_access_src;
+ list_add(&lun_access_dst->list, &sidecar);
+ }
+ }
+
+ if (unlikely(!ctxi_src->rht_out)) {
+ pr_debug("%s: Nothing to clone!\n", __func__);
+ goto out_success;
+ }
+
+ /* User specified permission on attach */
+ perms = ctxi_dst->rht_perms;
+
+ /*
+ * Copy over checked-out RHT (and their associated LXT) entries by
+ * hand, stopping after we've copied all outstanding entries and
+ * cleaning up if the clone fails.
+ *
+ * Note: This loop is equivalent to performing cxlflash_disk_open and
+ * cxlflash_vlun_resize. As such, LUN accounting needs to be updated
+ * by attaching after each successful RHT entry clone. In the event
+ * that a clone failure is experienced, the LUN detach is handled
+ * via the cleanup performed by _cxlflash_disk_release.
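+ *
+ * Editor's note (inferred from the ref_cnt comment in clone_lxt() above,
+ * not stated in the original patch): after a clone, the source and
+ * destination LXTs deliberately reference the same physical chunks;
+ * ba_clone() bumps a per-chunk reference count, so a chunk is only handed
+ * back to the allocator once every context that references it has freed it.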
+ */
+ for (i = 0; i < MAX_RHT_PER_CONTEXT; i++) {
+ if (ctxi_src->rht_out == ctxi_dst->rht_out)
+ break;
+ if (ctxi_src->rht_start[i].nmask == 0)
+ continue;
+
+ /* Consume a destination RHT entry */
+ ctxi_dst->rht_out++;
+ ctxi_dst->rht_start[i].nmask = ctxi_src->rht_start[i].nmask;
+ ctxi_dst->rht_start[i].fp =
+ SISL_RHT_FP_CLONE(ctxi_src->rht_start[i].fp, perms);
+ ctxi_dst->rht_lun[i] = ctxi_src->rht_lun[i];
+
+ rc = clone_lxt(afu, blka, ctxid_dst, i,
+ &ctxi_dst->rht_start[i],
+ &ctxi_src->rht_start[i]);
+ if (rc) {
+ marshal_clone_to_rele(clone, &release);
+ for (j = 0; j < i; j++) {
+ release.rsrc_handle = j;
+ _cxlflash_disk_release(sdev, ctxi_dst,
+ &release);
+ }
+
+ /* Put back the one we failed on */
+ rhte_checkin(ctxi_dst, &ctxi_dst->rht_start[i]);
+ goto err;
+ }
+
+ cxlflash_lun_attach(gli, gli->mode, false);
+ }
+
+out_success:
+ list_splice(&sidecar, &ctxi_dst->luns);
+ sys_close(adap_fd_src);
+
+ /* fall through */
+out:
+ if (ctxi_src)
+ put_context(ctxi_src);
+ if (ctxi_dst)
+ put_context(ctxi_dst);
+ pr_debug("%s: returning rc=%d\n", __func__, rc);
+ return rc;
+
+err:
+ list_for_each_entry_safe(lun_access_src, lun_access_dst, &sidecar, list)
+ kfree(lun_access_src);
+ goto out;
+}
diff --git a/drivers/scsi/cxlflash/vlun.h b/drivers/scsi/cxlflash/vlun.h
new file mode 100644
index 000000000000..8b29a74946e4
--- /dev/null
+++ b/drivers/scsi/cxlflash/vlun.h
@@ -0,0 +1,86 @@
+/*
+ * CXL Flash Device Driver
+ *
+ * Written by: Manoj N. Kumar , IBM Corporation
+ * Matthew R. Ochs , IBM Corporation
+ *
+ * Copyright (C) 2015 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _CXLFLASH_VLUN_H
+#define _CXLFLASH_VLUN_H
+
+/* RHT - Resource Handle Table */
+#define MC_RHT_NMASK 16 /* in bits */
+#define MC_CHUNK_SHIFT MC_RHT_NMASK /* shift to go from LBA to chunk# */
+
+#define HIBIT (BITS_PER_LONG - 1)
+
+#define MAX_AUN_CLONE_CNT 0xFF
+
+/*
+ * LXT - LBA Translation Table
+ *
+ * +-------+-------+-------+-------+-------+-------+-------+---+---+
+ * | RLBA_BASE |LUN_IDX| P |SEL|
+ * +-------+-------+-------+-------+-------+-------+-------+---+---+
+ *
+ * The LXT Entry contains the physical LBA where the chunk starts (RLBA_BASE).
+ * The AFU ORs the low-order bits from the virtual LBA (offset into the chunk)
+ * with RLBA_BASE. The result is the physical LBA to be sent to storage.
+ * The LXT Entry also contains an index to a LUN TBL and a bitmask of which
+ * outgoing (FC) ports can be selected. The port select bit-mask is ANDed
+ * with a global port select bit-mask maintained by the driver.
+ * In addition, it has permission bits that are ANDed with the
+ * RHT permissions to arrive at the final permissions for the chunk.
+ *
+ * LXT tables are allocated dynamically in groups. This is done to avoid
+ * malloc/free overhead each time the LXT has to grow or shrink.
+ *
+ * Based on the current lxt_cnt (used), it is always possible to know
+ * how many are allocated (used+free). The number of allocated entries is
+ * not stored anywhere.
+ *
+ * The LXT table is re-allocated whenever it needs to cross into another group.
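+ *
+ * Worked example (editor's illustration, not part of the original patch):
+ * with LXT_GROUP_SIZE of 8, LXT_NUM_GROUPS(8) == 1 while
+ * LXT_NUM_GROUPS(9) == 2, so growing a table from 8 to 9 entries
+ * reallocates it from 8 to 16 slots, whereas growing from 9 to 16 entries
+ * reuses the existing allocation.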
+*/
+#define LXT_GROUP_SIZE 8
+#define LXT_NUM_GROUPS(lxt_cnt) (((lxt_cnt) + 7)/8) /* alloc'ed groups */
+#define LXT_LUNIDX_SHIFT 8 /* LXT entry, shift for LUN index */
+#define LXT_PERM_SHIFT 4 /* LXT entry, shift for permission bits */
+
+struct ba_lun_info {
+ u64 *lun_alloc_map;
+ u32 lun_bmap_size;
+ u32 total_aus;
+ u64 free_aun_cnt;
+
+ /* indices to be used for elevator lookup of free map */
+ u32 free_low_idx;
+ u32 free_curr_idx;
+ u32 free_high_idx;
+
+ u8 *aun_clone_map;
+};
+
+struct ba_lun {
+ u64 lun_id;
+ u64 wwpn;
+ size_t lsize; /* LUN size in number of LBAs */
+ size_t lba_size; /* LBA size in number of bytes */
+ size_t au_size; /* Allocation Unit size in number of LBAs */
+ struct ba_lun_info *ba_lun_handle;
+};
+
+/* Block Allocator */
+struct blka {
+ struct ba_lun ba_lun;
+ u64 nchunk; /* number of chunks */
+ struct mutex mutex;
+};
+
+#endif /* ifndef _CXLFLASH_VLUN_H */
diff --git a/include/uapi/scsi/cxlflash_ioctl.h b/include/uapi/scsi/cxlflash_ioctl.h
index 570773406531..831351b2e660 100644
--- a/include/uapi/scsi/cxlflash_ioctl.h
+++ b/include/uapi/scsi/cxlflash_ioctl.h
@@ -71,6 +71,17 @@ struct dk_cxlflash_udirect { __u64 reserved[8]; /* Reserved for future use */ };
+#define DK_CXLFLASH_UVIRTUAL_NEED_WRITE_SAME 0x8000000000000000ULL
+
+struct dk_cxlflash_uvirtual {
+ struct dk_cxlflash_hdr hdr; /* Common fields */
+ __u64 context_id; /* Context to own virtual resources */
+ __u64 lun_size; /* Requested size, in 4K blocks */
+ __u64 rsrc_handle; /* Returned resource handle */
+ __u64 last_lba; /* Returned last LBA of LUN */
+ __u64 reserved[8]; /* Reserved for future use */
+};
+
 struct dk_cxlflash_release { struct dk_cxlflash_hdr hdr; /* Common fields */ __u64 context_id; /* Context owning resources */ @@ -78,6 +89,23 @@ struct dk_cxlflash_release { __u64 reserved[8]; /* Reserved for future use */ };
+struct dk_cxlflash_resize {
+ struct dk_cxlflash_hdr hdr; /* Common fields */
+ __u64 context_id; /* Context owning resources */
+ __u64 rsrc_handle; /* Resource handle of LUN to resize */
+ __u64 req_size; /* New requested size, in 4K blocks */
+ __u64 last_lba; /* Returned last LBA of LUN */
+ __u64 reserved[8]; /* Reserved for future use */
+};
+
+struct dk_cxlflash_clone {
+ struct dk_cxlflash_hdr hdr; /* Common fields */
+ __u64 context_id_src; /* Context to clone from */
+ __u64 context_id_dst; /* Context to clone to */
+ __u64 adap_fd_src; /* Source context adapter fd */
+ __u64 reserved[8]; /* Reserved for future use */
+};
+
 #define DK_CXLFLASH_VERIFY_SENSE_LEN 18 #define DK_CXLFLASH_VERIFY_HINT_SENSE 0x8000000000000000ULL @@ -118,7 +146,10 @@ union cxlflash_ioctls { struct dk_cxlflash_attach attach; struct dk_cxlflash_detach detach; struct dk_cxlflash_udirect udirect; + struct dk_cxlflash_uvirtual uvirtual; struct dk_cxlflash_release release; + struct dk_cxlflash_resize resize; + struct dk_cxlflash_clone clone; struct dk_cxlflash_verify verify; struct dk_cxlflash_recover_afu recover_afu; struct dk_cxlflash_manage_lun manage_lun; @@ -136,5 +167,8 @@ union cxlflash_ioctls { #define DK_CXLFLASH_VERIFY CXL_IOWR(0x84, dk_cxlflash_verify) #define DK_CXLFLASH_RECOVER_AFU CXL_IOWR(0x85, dk_cxlflash_recover_afu) #define DK_CXLFLASH_MANAGE_LUN CXL_IOWR(0x86, dk_cxlflash_manage_lun) +#define DK_CXLFLASH_USER_VIRTUAL CXL_IOWR(0x87, dk_cxlflash_uvirtual) +#define DK_CXLFLASH_VLUN_RESIZE CXL_IOWR(0x88, dk_cxlflash_resize) +#define DK_CXLFLASH_VLUN_CLONE CXL_IOWR(0x89, dk_cxlflash_clone) #endif /* ifndef _CXLFLASH_IOCTL_H */ -- cgit v1.2.3 From
7f8a436eaa2c3ddd8e1ff2fbca267e6275085536 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Wed, 26 Aug 2015 11:31:48 -0700 Subject: openvswitch: Add conntrack action Expose the kernel connection tracker via OVS. Userspace components can make use of the CT action to populate the connection state (ct_state) field for a flow. This state can be subsequently matched. Exposed connection states are OVS_CS_F_*: - NEW (0x01) - Beginning of a new connection. - ESTABLISHED (0x02) - Part of an existing connection. - RELATED (0x04) - Related to an established connection. - INVALID (0x20) - Could not track the connection for this packet. - REPLY_DIR (0x40) - This packet is in the reply direction for the flow. - TRACKED (0x80) - This packet has been sent through conntrack. When the CT action is executed by itself, it will send the packet through the connection tracker and populate the ct_state field with one or more of the connection state flags above. The CT action will always set the TRACKED bit. When the COMMIT flag is passed to the conntrack action, this specifies that information about the connection should be stored. This allows subsequent packets for the same (or related) connections to be correlated with this connection. Sending subsequent packets for the connection through conntrack allows the connection tracker to consider the packets as ESTABLISHED, RELATED, and/or REPLY_DIR. The CT action may optionally take a zone to track the flow within. This allows connections with the same 5-tuple to be kept logically separate from connections in other zones. If the zone is specified, then the "ct_zone" match field will be subsequently populated with the zone id. IP fragments are handled by transparently assembling them as part of the CT action. The maximum received unit (MRU) size is tracked so that refragmentation can occur during output. IP frag handling contributed by Andy Zhou. Based on original design by Justin Pettit. Signed-off-by: Joe Stringer Signed-off-by: Justin Pettit Signed-off-by: Andy Zhou Acked-by: Thomas Graf Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 40 ++++ net/openvswitch/Kconfig | 11 + net/openvswitch/Makefile | 2 + net/openvswitch/actions.c | 175 ++++++++++++++- net/openvswitch/conntrack.c | 454 +++++++++++++++++++++++++++++++++++++++ net/openvswitch/conntrack.h | 78 +++++++ net/openvswitch/datapath.c | 66 ++++-- net/openvswitch/datapath.h | 6 + net/openvswitch/flow.c | 2 + net/openvswitch/flow.h | 6 + net/openvswitch/flow_netlink.c | 69 ++++-- net/openvswitch/flow_netlink.h | 4 +- net/openvswitch/vport.c | 1 + 13 files changed, 877 insertions(+), 37 deletions(-) create mode 100644 net/openvswitch/conntrack.c create mode 100644 net/openvswitch/conntrack.h (limited to 'include/uapi') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index d6b885460187..55f599792673 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -164,6 +164,9 @@ enum ovs_packet_cmd { * %OVS_USERSPACE_ATTR_EGRESS_TUN_PORT attribute, which is sent only if the * output port is actually a tunnel port. Contains the output tunnel key * extracted from the packet as nested %OVS_TUNNEL_KEY_ATTR_* attributes. + * @OVS_PACKET_ATTR_MRU: Present for an %OVS_PACKET_CMD_ACTION and + * %OVS_PACKET_ATTR_USERSPACE action; specifies the maximum received fragment + * size. * * These attributes follow the &struct ovs_header within the Generic Netlink * payload for %OVS_PACKET_* commands.
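[Editor's note, not part of the original patch: the MRU introduced here round-trips through userspace. queue_userspace_packet() attaches it to upcalls as OVS_PACKET_ATTR_MRU, ovs_packet_cmd_execute() reads it back into OVS_CB(skb)->mru when the packet is re-injected, and do_output() uses it to refragment packets that exceed it; see the datapath.c and actions.c hunks below.]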
@@ -180,6 +183,7 @@ enum ovs_packet_attr { OVS_PACKET_ATTR_UNUSED2, OVS_PACKET_ATTR_PROBE, /* Packet operation is a feature probe, error logging should be suppressed. */ + OVS_PACKET_ATTR_MRU, /* Maximum received IP fragment size. */ __OVS_PACKET_ATTR_MAX }; @@ -319,6 +323,8 @@ enum ovs_key_attr { OVS_KEY_ATTR_MPLS, /* array of struct ovs_key_mpls. * The implementation may restrict * the accepted length of the array. */ + OVS_KEY_ATTR_CT_STATE, /* u8 bitmask of OVS_CS_F_* */ + OVS_KEY_ATTR_CT_ZONE, /* u16 connection tracking zone. */ #ifdef __KERNEL__ OVS_KEY_ATTR_TUNNEL_INFO, /* struct ip_tunnel_info */ @@ -431,6 +437,15 @@ struct ovs_key_nd { __u8 nd_tll[ETH_ALEN]; }; +/* OVS_KEY_ATTR_CT_STATE flags */ +#define OVS_CS_F_NEW 0x01 /* Beginning of a new connection. */ +#define OVS_CS_F_ESTABLISHED 0x02 /* Part of an existing connection. */ +#define OVS_CS_F_RELATED 0x04 /* Related to an established + * connection. */ +#define OVS_CS_F_INVALID 0x20 /* Could not track connection. */ +#define OVS_CS_F_REPLY_DIR 0x40 /* Flow is in the reply direction. */ +#define OVS_CS_F_TRACKED 0x80 /* Conntrack has occurred. */ + /** * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands. * @OVS_FLOW_ATTR_KEY: Nested %OVS_KEY_ATTR_* attributes specifying the flow @@ -594,6 +609,28 @@ struct ovs_action_hash { uint32_t hash_basis; }; +/** + * enum ovs_ct_attr - Attributes for %OVS_ACTION_ATTR_CT action. + * @OVS_CT_ATTR_FLAGS: u32 connection tracking flags. + * @OVS_CT_ATTR_ZONE: u16 connection tracking zone. + */ +enum ovs_ct_attr { + OVS_CT_ATTR_UNSPEC, + OVS_CT_ATTR_FLAGS, /* u32 bitmask of OVS_CT_F_*. */ + OVS_CT_ATTR_ZONE, /* u16 zone id. */ + __OVS_CT_ATTR_MAX +}; + +#define OVS_CT_ATTR_MAX (__OVS_CT_ATTR_MAX - 1) + +/* + * OVS_CT_ATTR_FLAGS flags - bitmask of %OVS_CT_F_* + * @OVS_CT_F_COMMIT: Commits the flow to the conntrack table. This allows + * future packets for the same connection to be identified as 'established' + * or 'related'. + */ +#define OVS_CT_F_COMMIT 0x01 + /** * enum ovs_action_attr - Action types. * @@ -623,6 +660,8 @@ struct ovs_action_hash { * indicate the new packet contents. This could potentially still be * %ETH_P_MPLS if the resulting MPLS label stack is not empty. If there * is no MPLS label stack, as determined by ethertype, no action is taken. + * @OVS_ACTION_ATTR_CT: Track the connection. Populate the conntrack-related + * entries in the flow key. * * Only a single header can be set with a single %OVS_ACTION_ATTR_SET. Not all * fields within a header are modifiable, e.g. the IPv4 protocol and fragment @@ -648,6 +687,7 @@ enum ovs_action_attr { * data immediately followed by a mask. * The data must be zero for the unmasked * bits. */ + OVS_ACTION_ATTR_CT, /* One nested OVS_CT_ATTR_*. */ __OVS_ACTION_ATTR_MAX, /* Nothing past this will be accepted * from userspace. */ diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index 422dc0567de9..98f343d0d6dd 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -31,6 +31,17 @@ config OPENVSWITCH If unsure, say N. +config OPENVSWITCH_CONNTRACK + bool "Open vSwitch conntrack action support" + depends on OPENVSWITCH + depends on NF_CONNTRACK + default OPENVSWITCH + ---help--- + If you say Y here, then the Open vSwitch module will be able to pass + packets through conntrack. + + Say N to exclude this support and reduce the binary size.
+ config OPENVSWITCH_GRE tristate "Open vSwitch GRE tunneling support" depends on OPENVSWITCH diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile index 6e1701de04d8..5b5913b06f54 100644 --- a/net/openvswitch/Makefile +++ b/net/openvswitch/Makefile @@ -15,6 +15,8 @@ openvswitch-y := \ vport-internal_dev.o \ vport-netdev.o +openvswitch-$(CONFIG_OPENVSWITCH_CONNTRACK) += conntrack.o + obj-$(CONFIG_OPENVSWITCH_VXLAN)+= vport-vxlan.o obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o obj-$(CONFIG_OPENVSWITCH_GRE) += vport-gre.o diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 520438b77dc8..72ca2c491b0a 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -29,6 +30,7 @@ #include #include +#include #include #include #include @@ -38,6 +40,7 @@ #include "datapath.h" #include "flow.h" +#include "conntrack.h" #include "vport.h" static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, @@ -52,6 +55,20 @@ struct deferred_action { struct sw_flow_key pkt_key; }; +#define MAX_L2_LEN (VLAN_ETH_HLEN + 3 * MPLS_HLEN) +struct ovs_frag_data { + unsigned long dst; + struct vport *vport; + struct ovs_skb_cb cb; + __be16 inner_protocol; + __u16 vlan_tci; + __be16 vlan_proto; + unsigned int l2_len; + u8 l2_data[MAX_L2_LEN]; +}; + +static DEFINE_PER_CPU(struct ovs_frag_data, ovs_frag_data_storage); + #define DEFERRED_ACTION_FIFO_SIZE 10 struct action_fifo { int head; @@ -602,14 +619,145 @@ static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key, return 0; } -static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port) +static int ovs_vport_output(struct sock *sock, struct sk_buff *skb) +{ + struct ovs_frag_data *data = this_cpu_ptr(&ovs_frag_data_storage); + struct vport *vport = data->vport; + + if (skb_cow_head(skb, data->l2_len) < 0) { + kfree_skb(skb); + return -ENOMEM; + } + + __skb_dst_copy(skb, data->dst); + *OVS_CB(skb) = data->cb; + skb->inner_protocol = data->inner_protocol; + skb->vlan_tci = data->vlan_tci; + skb->vlan_proto = data->vlan_proto; + + /* Reconstruct the MAC header. */ + skb_push(skb, data->l2_len); + memcpy(skb->data, &data->l2_data, data->l2_len); + ovs_skb_postpush_rcsum(skb, skb->data, data->l2_len); + skb_reset_mac_header(skb); + + ovs_vport_send(vport, skb); + return 0; +} + +static unsigned int +ovs_dst_get_mtu(const struct dst_entry *dst) +{ + return dst->dev->mtu; +} + +static struct dst_ops ovs_dst_ops = { + .family = AF_UNSPEC, + .mtu = ovs_dst_get_mtu, +}; + +/* prepare_frag() is called once per (larger-than-MTU) frame; its inverse is + * ovs_vport_output(), which is called once per fragmented packet. 
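+ *
+ * Editor's note (an assumption about the fragmentation paths, not stated
+ * in the patch): stashing the L2 header and skb metadata in the per-cpu
+ * ovs_frag_data_storage is safe because ip_do_fragment() and the IPv6
+ * fragment hook invoke the output callback synchronously, so no other
+ * packet can overwrite the per-cpu data in between.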
+ */ +static void prepare_frag(struct vport *vport, struct sk_buff *skb) +{ + unsigned int hlen = skb_network_offset(skb); + struct ovs_frag_data *data; + + data = this_cpu_ptr(&ovs_frag_data_storage); + data->dst = skb->_skb_refdst; + data->vport = vport; + data->cb = *OVS_CB(skb); + data->inner_protocol = skb->inner_protocol; + data->vlan_tci = skb->vlan_tci; + data->vlan_proto = skb->vlan_proto; + data->l2_len = hlen; + memcpy(&data->l2_data, skb->data, hlen); + + memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + skb_pull(skb, hlen); +} + +static void ovs_fragment(struct vport *vport, struct sk_buff *skb, u16 mru, + __be16 ethertype) +{ + if (skb_network_offset(skb) > MAX_L2_LEN) { + OVS_NLERR(1, "L2 header too long to fragment"); + return; + } + + if (ethertype == htons(ETH_P_IP)) { + struct dst_entry ovs_dst; + unsigned long orig_dst; + + prepare_frag(vport, skb); + dst_init(&ovs_dst, &ovs_dst_ops, NULL, 1, + DST_OBSOLETE_NONE, DST_NOCOUNT); + ovs_dst.dev = vport->dev; + + orig_dst = skb->_skb_refdst; + skb_dst_set_noref(skb, &ovs_dst); + IPCB(skb)->frag_max_size = mru; + + ip_do_fragment(skb->sk, skb, ovs_vport_output); + refdst_drop(orig_dst); + } else if (ethertype == htons(ETH_P_IPV6)) { + const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); + unsigned long orig_dst; + struct rt6_info ovs_rt; + + if (!v6ops) { + kfree_skb(skb); + return; + } + + prepare_frag(vport, skb); + memset(&ovs_rt, 0, sizeof(ovs_rt)); + dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL, 1, + DST_OBSOLETE_NONE, DST_NOCOUNT); + ovs_rt.dst.dev = vport->dev; + + orig_dst = skb->_skb_refdst; + skb_dst_set_noref(skb, &ovs_rt.dst); + IP6CB(skb)->frag_max_size = mru; + + v6ops->fragment(skb->sk, skb, ovs_vport_output); + refdst_drop(orig_dst); + } else { + WARN_ONCE(1, "Failed fragment ->%s: eth=%04x, MRU=%d, MTU=%d.", + ovs_vport_name(vport), ntohs(ethertype), mru, + vport->dev->mtu); + kfree_skb(skb); + } +} + +static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, + struct sw_flow_key *key) { struct vport *vport = ovs_vport_rcu(dp, out_port); - if (likely(vport)) - ovs_vport_send(vport, skb); - else + if (likely(vport)) { + u16 mru = OVS_CB(skb)->mru; + + if (likely(!mru || (skb->len <= mru + ETH_HLEN))) { + ovs_vport_send(vport, skb); + } else if (mru <= vport->dev->mtu) { + __be16 ethertype = key->eth.type; + + if (!is_flow_key_valid(key)) { + if (eth_p_mpls(skb->protocol)) + ethertype = skb->inner_protocol; + else + ethertype = vlan_get_protocol(skb); + } + + ovs_fragment(vport, skb, mru, ethertype); + } else { + kfree_skb(skb); + } + } else { kfree_skb(skb); + } } static int output_userspace(struct datapath *dp, struct sk_buff *skb, @@ -623,6 +771,7 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, memset(&upcall, 0, sizeof(upcall)); upcall.cmd = OVS_PACKET_CMD_ACTION; + upcall.mru = OVS_CB(skb)->mru; for (a = nla_data(attr), rem = nla_len(attr); rem > 0; a = nla_next(a, &rem)) { @@ -816,6 +965,11 @@ static int execute_masked_set_action(struct sk_buff *skb, err = set_mpls(skb, flow_key, nla_data(a), get_mask(a, __be32 *)); break; + + case OVS_KEY_ATTR_CT_STATE: + case OVS_KEY_ATTR_CT_ZONE: + err = -EINVAL; + break; } return err; @@ -885,7 +1039,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC); if (out_skb) - do_output(dp, out_skb, prev_port); + do_output(dp, out_skb, prev_port, key); prev_port = -1; } @@ -942,6 +1096,15 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff 
*skb, case OVS_ACTION_ATTR_SAMPLE: err = sample(dp, skb, key, a, attr, len); break; + + case OVS_ACTION_ATTR_CT: + err = ovs_ct_execute(ovs_dp_get_net(dp), skb, key, + nla_data(a)); + + /* Hide stolen IP fragments from user space. */ + if (err == -EINPROGRESS) + return 0; + break; } if (unlikely(err)) { @@ -951,7 +1114,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, } if (prev_port != -1) - do_output(dp, skb, prev_port); + do_output(dp, skb, prev_port, key); else consume_skb(skb); diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c new file mode 100644 index 000000000000..1189fd50f1cf --- /dev/null +++ b/net/openvswitch/conntrack.c @@ -0,0 +1,454 @@ +/* + * Copyright (c) 2015 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include + +#include "datapath.h" +#include "conntrack.h" +#include "flow.h" +#include "flow_netlink.h" + +struct ovs_ct_len_tbl { + size_t maxlen; + size_t minlen; +}; + +/* Conntrack action context for execution. */ +struct ovs_conntrack_info { + struct nf_conntrack_zone zone; + struct nf_conn *ct; + u32 flags; + u16 family; +}; + +static u16 key_to_nfproto(const struct sw_flow_key *key) +{ + switch (ntohs(key->eth.type)) { + case ETH_P_IP: + return NFPROTO_IPV4; + case ETH_P_IPV6: + return NFPROTO_IPV6; + default: + return NFPROTO_UNSPEC; + } +} + +/* Map SKB connection state into the values used by flow definition. */ +static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo) +{ + u8 ct_state = OVS_CS_F_TRACKED; + + switch (ctinfo) { + case IP_CT_ESTABLISHED_REPLY: + case IP_CT_RELATED_REPLY: + case IP_CT_NEW_REPLY: + ct_state |= OVS_CS_F_REPLY_DIR; + break; + default: + break; + } + + switch (ctinfo) { + case IP_CT_ESTABLISHED: + case IP_CT_ESTABLISHED_REPLY: + ct_state |= OVS_CS_F_ESTABLISHED; + break; + case IP_CT_RELATED: + case IP_CT_RELATED_REPLY: + ct_state |= OVS_CS_F_RELATED; + break; + case IP_CT_NEW: + case IP_CT_NEW_REPLY: + ct_state |= OVS_CS_F_NEW; + break; + default: + break; + } + + return ct_state; +} + +static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state, + const struct nf_conntrack_zone *zone) +{ + key->ct.state = state; + key->ct.zone = zone->id; +} + +/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has + * previously sent the packet to conntrack via the ct action. 
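+ *
+ * Editor's illustration (not part of the original patch): a packet that
+ * went through the ct action but has no conntrack entry attached ends up
+ * with state == OVS_CS_F_TRACKED | OVS_CS_F_INVALID (0x80 | 0x20 == 0xa0),
+ * which is the INVALID ct_state described in the commit message.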
+ */ +static void ovs_ct_update_key(const struct sk_buff *skb, + struct sw_flow_key *key, bool post_ct) +{ + const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + u8 state = 0; + + ct = nf_ct_get(skb, &ctinfo); + if (ct) { + state = ovs_ct_get_state(ctinfo); + if (ct->master) + state |= OVS_CS_F_RELATED; + zone = nf_ct_zone(ct); + } else if (post_ct) { + state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID; + } + __ovs_ct_update_key(key, state, zone); +} + +void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key) +{ + ovs_ct_update_key(skb, key, false); +} + +int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb) +{ + if (nla_put_u8(skb, OVS_KEY_ATTR_CT_STATE, key->ct.state)) + return -EMSGSIZE; + + if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && + nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, key->ct.zone)) + return -EMSGSIZE; + + return 0; +} + +static int handle_fragments(struct net *net, struct sw_flow_key *key, + u16 zone, struct sk_buff *skb) +{ + struct ovs_skb_cb ovs_cb = *OVS_CB(skb); + + if (key->eth.type == htons(ETH_P_IP)) { + enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone; + int err; + + memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + err = ip_defrag(skb, user); + if (err) + return err; + + ovs_cb.mru = IPCB(skb)->frag_max_size; + } else if (key->eth.type == htons(ETH_P_IPV6)) { +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) + enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone; + struct sk_buff *reasm; + + memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); + reasm = nf_ct_frag6_gather(skb, user); + if (!reasm) + return -EINPROGRESS; + + if (skb == reasm) + return -EINVAL; + + key->ip.proto = ipv6_hdr(reasm)->nexthdr; + skb_morph(skb, reasm); + consume_skb(reasm); + ovs_cb.mru = IP6CB(skb)->frag_max_size; +#else + return -EPFNOSUPPORT; +#endif + } else { + return -EPFNOSUPPORT; + } + + key->ip.frag = OVS_FRAG_TYPE_NONE; + skb_clear_hash(skb); + skb->ignore_df = 1; + *OVS_CB(skb) = ovs_cb; + + return 0; +} + +static struct nf_conntrack_expect * +ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, + u16 proto, const struct sk_buff *skb) +{ + struct nf_conntrack_tuple tuple; + + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, &tuple)) + return NULL; + return __nf_ct_expect_find(net, zone, &tuple); +} + +/* Determine whether skb->nfct is equal to the result of conntrack lookup. */ +static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb, + const struct ovs_conntrack_info *info) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return false; + if (!net_eq(net, read_pnet(&ct->ct_net))) + return false; + if (!nf_ct_zone_equal_any(info->ct, nf_ct_zone(ct))) + return false; + + return true; +} + +static int __ovs_ct_lookup(struct net *net, const struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb) +{ + /* If we are recirculating packets to match on conntrack fields and + * committing with a separate conntrack action, then we don't need to + * actually run the packet through conntrack twice unless it's for a + * different zone. + */ + if (!skb_nfct_cached(net, skb, info)) { + struct nf_conn *tmpl = info->ct; + + /* Associate skb with specified zone. 
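+ * (Editor's note, an assumption based on how conntrack templates are
+ * used elsewhere in the kernel: attaching the template entry before
+ * nf_conntrack_in() below is what steers the lookup into the zone set
+ * via OVS_CT_ATTR_ZONE rather than the default zone.)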
*/ + if (tmpl) { + if (skb->nfct) + nf_conntrack_put(skb->nfct); + nf_conntrack_get(&tmpl->ct_general); + skb->nfct = &tmpl->ct_general; + skb->nfctinfo = IP_CT_NEW; + } + + if (nf_conntrack_in(net, info->family, NF_INET_PRE_ROUTING, + skb) != NF_ACCEPT) + return -ENOENT; + } + + return 0; +} + +/* Lookup connection and read fields into key. */ +static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb) +{ + struct nf_conntrack_expect *exp; + + exp = ovs_ct_expect_find(net, &info->zone, info->family, skb); + if (exp) { + u8 state; + + state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED; + __ovs_ct_update_key(key, state, &info->zone); + } else { + int err; + + err = __ovs_ct_lookup(net, key, info, skb); + if (err) + return err; + + ovs_ct_update_key(skb, key, true); + } + + return 0; +} + +/* Lookup connection and confirm if unconfirmed. */ +static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb) +{ + u8 state; + int err; + + state = key->ct.state; + if (key->ct.zone == info->zone.id && + ((state & OVS_CS_F_TRACKED) && !(state & OVS_CS_F_NEW))) { + /* Previous lookup has shown that this connection is already + * tracked and committed. Skip committing. + */ + return 0; + } + + err = __ovs_ct_lookup(net, key, info, skb); + if (err) + return err; + if (nf_conntrack_confirm(skb) != NF_ACCEPT) + return -EINVAL; + + ovs_ct_update_key(skb, key, true); + + return 0; +} + +int ovs_ct_execute(struct net *net, struct sk_buff *skb, + struct sw_flow_key *key, + const struct ovs_conntrack_info *info) +{ + int nh_ofs; + int err; + + /* The conntrack module expects to be working at L3. */ + nh_ofs = skb_network_offset(skb); + skb_pull(skb, nh_ofs); + + if (key->ip.frag != OVS_FRAG_TYPE_NONE) { + err = handle_fragments(net, key, info->zone.id, skb); + if (err) + return err; + } + + if (info->flags & OVS_CT_F_COMMIT) + err = ovs_ct_commit(net, key, info, skb); + else + err = ovs_ct_lookup(net, key, info, skb); + + skb_push(skb, nh_ofs); + return err; +} + +static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { + [OVS_CT_ATTR_FLAGS] = { .minlen = sizeof(u32), + .maxlen = sizeof(u32) }, + [OVS_CT_ATTR_ZONE] = { .minlen = sizeof(u16), + .maxlen = sizeof(u16) }, +}; + +static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, + bool log) +{ + struct nlattr *a; + int rem; + + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + int maxlen = ovs_ct_attr_lens[type].maxlen; + int minlen = ovs_ct_attr_lens[type].minlen; + + if (type > OVS_CT_ATTR_MAX) { + OVS_NLERR(log, + "Unknown conntrack attr (type=%d, max=%d)", + type, OVS_CT_ATTR_MAX); + return -EINVAL; + } + if (nla_len(a) < minlen || nla_len(a) > maxlen) { + OVS_NLERR(log, + "Conntrack attr type has unexpected length (type=%d, length=%d, expected=%d)", + type, nla_len(a), maxlen); + return -EINVAL; + } + + switch (type) { + case OVS_CT_ATTR_FLAGS: + info->flags = nla_get_u32(a); + break; +#ifdef CONFIG_NF_CONNTRACK_ZONES + case OVS_CT_ATTR_ZONE: + info->zone.id = nla_get_u16(a); + break; +#endif + default: + OVS_NLERR(log, "Unknown conntrack attr (%d)", + type); + return -EINVAL; + } + } + + if (rem > 0) { + OVS_NLERR(log, "Conntrack attr has %d unknown bytes", rem); + return -EINVAL; + } + + return 0; +} + +bool ovs_ct_verify(enum ovs_key_attr attr) +{ + if (attr == OVS_KEY_ATTR_CT_STATE) + return true; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && + 
attr == OVS_KEY_ATTR_CT_ZONE) + return true; + + return false; +} + +int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, + const struct sw_flow_key *key, + struct sw_flow_actions **sfa, bool log) +{ + struct ovs_conntrack_info ct_info; + u16 family; + int err; + + family = key_to_nfproto(key); + if (family == NFPROTO_UNSPEC) { + OVS_NLERR(log, "ct family unspecified"); + return -EINVAL; + } + + memset(&ct_info, 0, sizeof(ct_info)); + ct_info.family = family; + + nf_ct_zone_init(&ct_info.zone, NF_CT_DEFAULT_ZONE_ID, + NF_CT_DEFAULT_ZONE_DIR, 0); + + err = parse_ct(attr, &ct_info, log); + if (err) + return err; + + /* Set up template for tracking connections in specific zones. */ + ct_info.ct = nf_ct_tmpl_alloc(net, &ct_info.zone, GFP_KERNEL); + if (!ct_info.ct) { + OVS_NLERR(log, "Failed to allocate conntrack template"); + return -ENOMEM; + } + + err = ovs_nla_add_action(sfa, OVS_ACTION_ATTR_CT, &ct_info, + sizeof(ct_info), log); + if (err) + goto err_free_ct; + + __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); + nf_conntrack_get(&ct_info.ct->ct_general); + return 0; +err_free_ct: + nf_conntrack_free(ct_info.ct); + return err; +} + +int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, + struct sk_buff *skb) +{ + struct nlattr *start; + + start = nla_nest_start(skb, OVS_ACTION_ATTR_CT); + if (!start) + return -EMSGSIZE; + + if (nla_put_u32(skb, OVS_CT_ATTR_FLAGS, ct_info->flags)) + return -EMSGSIZE; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && + nla_put_u16(skb, OVS_CT_ATTR_ZONE, ct_info->zone.id)) + return -EMSGSIZE; + + nla_nest_end(skb, start); + + return 0; +} + +void ovs_ct_free_action(const struct nlattr *a) +{ + struct ovs_conntrack_info *ct_info = nla_data(a); + + if (ct_info->ct) + nf_ct_put(ct_info->ct); +} diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h new file mode 100644 index 000000000000..e812ee64a718 --- /dev/null +++ b/net/openvswitch/conntrack.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2015 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ + +#ifndef OVS_CONNTRACK_H +#define OVS_CONNTRACK_H 1 + +#include "flow.h" + +struct ovs_conntrack_info; +enum ovs_key_attr; + +#if defined(CONFIG_OPENVSWITCH_CONNTRACK) +bool ovs_ct_verify(enum ovs_key_attr attr); +int ovs_ct_copy_action(struct net *, const struct nlattr *, + const struct sw_flow_key *, struct sw_flow_actions **, + bool log); +int ovs_ct_action_to_attr(const struct ovs_conntrack_info *, struct sk_buff *); + +int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *, + const struct ovs_conntrack_info *); + +void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key); +int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb); +void ovs_ct_free_action(const struct nlattr *a); +#else +#include + +static inline bool ovs_ct_verify(int attr) +{ + return false; +} + +static inline int ovs_ct_copy_action(struct net *net, const struct nlattr *nla, + const struct sw_flow_key *key, + struct sw_flow_actions **acts, bool log) +{ + return -ENOTSUPP; +} + +static inline int ovs_ct_action_to_attr(const struct ovs_conntrack_info *info, + struct sk_buff *skb) +{ + return -ENOTSUPP; +} + +static inline int ovs_ct_execute(struct net *net, struct sk_buff *skb, + struct sw_flow_key *key, + const struct ovs_conntrack_info *info) +{ + return -ENOTSUPP; +} + +static inline void ovs_ct_fill_key(const struct sk_buff *skb, + struct sw_flow_key *key) +{ + key->ct.state = 0; + key->ct.zone = 0; +} + +static inline int ovs_ct_put_key(const struct sw_flow_key *key, + struct sk_buff *skb) +{ + return 0; +} + +static inline void ovs_ct_free_action(const struct nlattr *a) { } +#endif +#endif /* ovs_conntrack.h */ diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index d5b547375887..72e63726efa0 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -275,6 +275,7 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) memset(&upcall, 0, sizeof(upcall)); upcall.cmd = OVS_PACKET_CMD_MISS; upcall.portid = ovs_vport_find_upcall_portid(p, skb); + upcall.mru = OVS_CB(skb)->mru; error = ovs_dp_upcall(dp, skb, key, &upcall); if (unlikely(error)) kfree_skb(skb); @@ -400,9 +401,23 @@ static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info, if (upcall_info->actions_len) size += nla_total_size(upcall_info->actions_len); + /* OVS_PACKET_ATTR_MRU */ + if (upcall_info->mru) + size += nla_total_size(sizeof(upcall_info->mru)); + return size; } +static void pad_packet(struct datapath *dp, struct sk_buff *skb) +{ + if (!(dp->user_features & OVS_DP_F_UNALIGNED)) { + size_t plen = NLA_ALIGN(skb->len) - skb->len; + + if (plen > 0) + memset(skb_put(skb, plen), 0, plen); + } +} + static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_key *key, const struct dp_upcall_info *upcall_info) @@ -492,6 +507,16 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, nla_nest_cancel(user_skb, nla); } + /* Add OVS_PACKET_ATTR_MRU */ + if (upcall_info->mru) { + if (nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU, + upcall_info->mru)) { + err = -ENOBUFS; + goto out; + } + pad_packet(dp, user_skb); + } + /* Only reserve room for attribute header, packet data is added * in skb_zerocopy() */ if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) { @@ -505,12 +530,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, goto out; /* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */ - if (!(dp->user_features & OVS_DP_F_UNALIGNED)) { 
- size_t plen = NLA_ALIGN(user_skb->len) - user_skb->len; - - if (plen > 0) - memset(skb_put(user_skb, plen), 0, plen); - } + pad_packet(dp, user_skb); ((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len; @@ -527,6 +547,7 @@ out: static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) { struct ovs_header *ovs_header = info->userhdr; + struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; struct sw_flow_actions *acts; struct sk_buff *packet; @@ -535,6 +556,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; struct ethhdr *eth; struct vport *input_vport; + u16 mru = 0; int len; int err; bool log = !a[OVS_PACKET_ATTR_PROBE]; @@ -564,6 +586,13 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) else packet->protocol = htons(ETH_P_802_2); + /* Set packet's mru */ + if (a[OVS_PACKET_ATTR_MRU]) { + mru = nla_get_u16(a[OVS_PACKET_ATTR_MRU]); + packet->ignore_df = 1; + } + OVS_CB(packet)->mru = mru; + /* Build an sw_flow for sending this packet. */ flow = ovs_flow_alloc(); err = PTR_ERR(flow); @@ -575,7 +604,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) if (err) goto err_flow_free; - err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS], + err = ovs_nla_copy_actions(net, a[OVS_PACKET_ATTR_ACTIONS], &flow->key, &acts, log); if (err) goto err_flow_free; @@ -586,7 +615,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) packet->mark = flow->key.phy.skb_mark; rcu_read_lock(); - dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex); + dp = get_dp_rcu(net, ovs_header->dp_ifindex); err = -ENODEV; if (!dp) goto err_unlock; @@ -598,6 +627,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) if (!input_vport) goto err_unlock; + packet->dev = input_vport->dev; OVS_CB(packet)->input_vport = input_vport; sf_acts = rcu_dereference(flow->sf_acts); @@ -624,6 +654,7 @@ static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = { [OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED }, [OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED }, [OVS_PACKET_ATTR_PROBE] = { .type = NLA_FLAG }, + [OVS_PACKET_ATTR_MRU] = { .type = NLA_U16 }, }; static const struct genl_ops dp_packet_genl_ops[] = { @@ -880,6 +911,7 @@ static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow, static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) { + struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; struct ovs_header *ovs_header = info->userhdr; struct sw_flow *flow = NULL, *new_flow; @@ -929,8 +961,8 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) goto err_kfree_flow; /* Validate actions. */ - error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key, - &acts, log); + error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS], + &new_flow->key, &acts, log); if (error) { OVS_NLERR(log, "Flow actions may not be safe on all matching packets."); goto err_kfree_flow; @@ -944,7 +976,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) } ovs_lock(); - dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + dp = get_dp(net, ovs_header->dp_ifindex); if (unlikely(!dp)) { error = -ENODEV; goto err_unlock_ovs; @@ -1038,7 +1070,8 @@ error: } /* Factor out action copy to avoid "Wframe-larger-than=1024" warning. 
*/ -static struct sw_flow_actions *get_flow_actions(const struct nlattr *a, +static struct sw_flow_actions *get_flow_actions(struct net *net, + const struct nlattr *a, const struct sw_flow_key *key, const struct sw_flow_mask *mask, bool log) @@ -1048,7 +1081,7 @@ static struct sw_flow_actions *get_flow_actions(const struct nlattr *a, int error; ovs_flow_mask_key(&masked_key, key, mask); - error = ovs_nla_copy_actions(a, &masked_key, &acts, log); + error = ovs_nla_copy_actions(net, a, &masked_key, &acts, log); if (error) { OVS_NLERR(log, "Actions may not be safe on all matching packets"); @@ -1060,6 +1093,7 @@ static struct sw_flow_actions *get_flow_actions(const struct nlattr *a, static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) { + struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; struct ovs_header *ovs_header = info->userhdr; struct sw_flow_key key; @@ -1091,8 +1125,8 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) /* Validate actions. */ if (a[OVS_FLOW_ATTR_ACTIONS]) { - acts = get_flow_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, &mask, - log); + acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &key, + &mask, log); if (IS_ERR(acts)) { error = PTR_ERR(acts); goto error; @@ -1108,7 +1142,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) } ovs_lock(); - dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + dp = get_dp(net, ovs_header->dp_ifindex); if (unlikely(!dp)) { error = -ENODEV; goto err_unlock_ovs; diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 487a85f7d967..d24ba98024be 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -27,6 +27,7 @@ #include #include +#include "conntrack.h" #include "flow.h" #include "flow_table.h" #include "vport.h" @@ -97,10 +98,13 @@ struct datapath { * NULL if the packet is not being tunneled. * @input_vport: The original vport packet came in on. This value is cached * when a packet is received by OVS. + * @mru: The maximum received fragment size; 0 if the packet is not + * fragmented. */ struct ovs_skb_cb { struct ip_tunnel_info *egress_tun_info; struct vport *input_vport; + u16 mru; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) @@ -113,6 +117,7 @@ struct ovs_skb_cb { * then no packet is sent and the packet is accounted in the datapath's @n_lost * counter. * @egress_tun_info: If nonnull, becomes %OVS_PACKET_ATTR_EGRESS_TUN_KEY. + * @mru: If not zero, the maximum received IP fragment size.
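+ *
+ * Editor's illustration (hypothetical numbers, not part of the original
+ * patch): an IPv4 datagram that arrived as 1500-byte fragments is
+ * reassembled with mru == 1500; if the actions output it again,
+ * do_output() sees skb->len > mru + ETH_HLEN and refragments it into
+ * 1500-byte pieces.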
*/ struct dp_upcall_info { const struct ip_tunnel_info *egress_tun_info; @@ -121,6 +126,7 @@ struct dp_upcall_info { int actions_len; u32 portid; u8 cmd; + u16 mru; }; /** diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 8db22ef73626..376ca8738fd4 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -49,6 +49,7 @@ #include "datapath.h" #include "flow.h" #include "flow_netlink.h" +#include "conntrack.h" u64 ovs_flow_used_time(unsigned long flow_jiffies) { @@ -707,6 +708,7 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, key->phy.priority = skb->priority; key->phy.in_port = OVS_CB(skb)->input_vport->port_no; key->phy.skb_mark = skb->mark; + ovs_ct_fill_key(skb, key); key->ovs_flow_hash = 0; key->recirc_id = 0; diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 082a87bac819..312c7d755b9b 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -111,6 +111,12 @@ struct sw_flow_key { } nd; } ipv6; }; + struct { + /* Connection tracking fields. */ + u16 zone; + u8 state; + } ct; + } __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */ struct sw_flow_key_range { diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index c182b28c0884..4e795b289eb7 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -281,7 +281,7 @@ size_t ovs_key_attr_size(void) /* Whenever adding new OVS_KEY_ FIELDS, we should consider * updating this function. */ - BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 22); + BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 24); return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ @@ -290,6 +290,8 @@ size_t ovs_key_attr_size(void) + nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */ + nla_total_size(4) /* OVS_KEY_ATTR_DP_HASH */ + nla_total_size(4) /* OVS_KEY_ATTR_RECIRC_ID */ + + nla_total_size(1) /* OVS_KEY_ATTR_CT_STATE */ + + nla_total_size(2) /* OVS_KEY_ATTR_CT_ZONE */ + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + nla_total_size(4) /* OVS_KEY_ATTR_VLAN */ @@ -339,6 +341,8 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_TUNNEL] = { .len = OVS_ATTR_NESTED, .next = ovs_tunnel_key_lens, }, [OVS_KEY_ATTR_MPLS] = { .len = sizeof(struct ovs_key_mpls) }, + [OVS_KEY_ATTR_CT_STATE] = { .len = sizeof(u8) }, + [OVS_KEY_ATTR_CT_ZONE] = { .len = sizeof(u16) }, }; static bool is_all_zero(const u8 *fp, size_t size) @@ -768,6 +772,21 @@ static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, return -EINVAL; *attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL); } + + if (*attrs & (1 << OVS_KEY_ATTR_CT_STATE) && + ovs_ct_verify(OVS_KEY_ATTR_CT_STATE)) { + u8 ct_state = nla_get_u8(a[OVS_KEY_ATTR_CT_STATE]); + + SW_FLOW_KEY_PUT(match, ct.state, ct_state, is_mask); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_STATE); + } + if (*attrs & (1 << OVS_KEY_ATTR_CT_ZONE) && + ovs_ct_verify(OVS_KEY_ATTR_CT_ZONE)) { + u16 ct_zone = nla_get_u16(a[OVS_KEY_ATTR_CT_ZONE]); + + SW_FLOW_KEY_PUT(match, ct.zone, ct_zone, is_mask); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ZONE); + } return 0; } @@ -1266,6 +1285,7 @@ int ovs_nla_get_flow_metadata(const struct nlattr *attr, memset(&match, 0, sizeof(match)); match.key = key; + memset(&key->ct, 0, sizeof(key->ct)); key->phy.in_port = DP_MAX_PORTS; return metadata_from_nlattrs(&match, &attrs, a, false, log); @@ -1314,6 +1334,9 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, if 
(nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark)) goto nla_put_failure; + if (ovs_ct_put_key(output, skb)) + goto nla_put_failure; + nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key)); if (!nla) goto nla_put_failure; @@ -1574,6 +1597,9 @@ void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts) case OVS_ACTION_ATTR_SET: ovs_nla_free_set_action(a); break; + case OVS_ACTION_ATTR_CT: + ovs_ct_free_action(a); + break; } } @@ -1647,8 +1673,8 @@ static struct nlattr *__add_action(struct sw_flow_actions **sfa, return a; } -static int add_action(struct sw_flow_actions **sfa, int attrtype, - void *data, int len, bool log) +int ovs_nla_add_action(struct sw_flow_actions **sfa, int attrtype, void *data, + int len, bool log) { struct nlattr *a; @@ -1663,7 +1689,7 @@ static inline int add_nested_action_start(struct sw_flow_actions **sfa, int used = (*sfa)->actions_len; int err; - err = add_action(sfa, attrtype, NULL, 0, log); + err = ovs_nla_add_action(sfa, attrtype, NULL, 0, log); if (err) return err; @@ -1679,12 +1705,12 @@ static inline void add_nested_action_end(struct sw_flow_actions *sfa, a->nla_len = sfa->actions_len - st_offset; } -static int __ovs_nla_copy_actions(const struct nlattr *attr, +static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, int depth, struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, bool log); -static int validate_and_copy_sample(const struct nlattr *attr, +static int validate_and_copy_sample(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, int depth, struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, bool log) @@ -1716,15 +1742,15 @@ static int validate_and_copy_sample(const struct nlattr *attr, start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SAMPLE, log); if (start < 0) return start; - err = add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY, - nla_data(probability), sizeof(u32), log); + err = ovs_nla_add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY, + nla_data(probability), sizeof(u32), log); if (err) return err; st_acts = add_nested_action_start(sfa, OVS_SAMPLE_ATTR_ACTIONS, log); if (st_acts < 0) return st_acts; - err = __ovs_nla_copy_actions(actions, key, depth + 1, sfa, + err = __ovs_nla_copy_actions(net, actions, key, depth + 1, sfa, eth_type, vlan_tci, log); if (err) return err; @@ -2058,7 +2084,7 @@ static int copy_action(const struct nlattr *from, return 0; } -static int __ovs_nla_copy_actions(const struct nlattr *attr, +static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, int depth, struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, bool log) @@ -2082,7 +2108,8 @@ static int __ovs_nla_copy_actions(const struct nlattr *attr, [OVS_ACTION_ATTR_SET] = (u32)-1, [OVS_ACTION_ATTR_SET_MASKED] = (u32)-1, [OVS_ACTION_ATTR_SAMPLE] = (u32)-1, - [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash) + [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash), + [OVS_ACTION_ATTR_CT] = (u32)-1, }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -2189,13 +2216,20 @@ static int __ovs_nla_copy_actions(const struct nlattr *attr, break; case OVS_ACTION_ATTR_SAMPLE: - err = validate_and_copy_sample(a, key, depth, sfa, + err = validate_and_copy_sample(net, a, key, depth, sfa, eth_type, vlan_tci, log); if (err) return err; skip_copy = true; break; + case OVS_ACTION_ATTR_CT: + err = ovs_ct_copy_action(net, a, key, sfa, log); + if (err) + return err; + skip_copy = 
true; + break; + default: OVS_NLERR(log, "Unknown Action type %d", type); return -EINVAL; @@ -2214,7 +2248,7 @@ static int __ovs_nla_copy_actions(const struct nlattr *attr, } /* 'key' must be the masked key. */ -int ovs_nla_copy_actions(const struct nlattr *attr, +int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, struct sw_flow_actions **sfa, bool log) { @@ -2225,7 +2259,7 @@ int ovs_nla_copy_actions(const struct nlattr *attr, return PTR_ERR(*sfa); (*sfa)->orig_len = nla_len(attr); - err = __ovs_nla_copy_actions(attr, key, 0, sfa, key->eth.type, + err = __ovs_nla_copy_actions(net, attr, key, 0, sfa, key->eth.type, key->eth.tci, log); if (err) ovs_nla_free_flow_actions(*sfa); @@ -2350,6 +2384,13 @@ int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb) if (err) return err; break; + + case OVS_ACTION_ATTR_CT: + err = ovs_ct_action_to_attr(nla_data(a), skb); + if (err) + return err; + break; + default: if (nla_put(skb, type, nla_len(a), nla_data(a))) return -EMSGSIZE; diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index acd074408f0a..c0b484b237c9 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -62,9 +62,11 @@ int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid, const struct sw_flow_key *key, bool log); u32 ovs_nla_get_ufid_flags(const struct nlattr *attr); -int ovs_nla_copy_actions(const struct nlattr *attr, +int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, struct sw_flow_actions **sfa, bool log); +int ovs_nla_add_action(struct sw_flow_actions **sfa, int attrtype, + void *data, int len, bool log); int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb); diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index d73e5a16e7ca..e2dc9dac59e6 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -484,6 +484,7 @@ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, OVS_CB(skb)->input_vport = vport; OVS_CB(skb)->egress_tun_info = NULL; + OVS_CB(skb)->mru = 0; /* Extract flow from 'skb' into 'key'. */ error = ovs_flow_key_extract(tun_info, skb, &key); if (unlikely(error)) { -- cgit v1.2.3 From 182e3042e15de759e81618d11fe4f62f5259d982 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Wed, 26 Aug 2015 11:31:49 -0700 Subject: openvswitch: Allow matching on conntrack mark Allow matching and setting the ct_mark field. As with ct_state and ct_zone, these fields are populated when the CT action is executed. To write to this field, a value and mask can be specified as a nested attribute under the CT action. This data is stored with the conntrack entry, and is executed after the lookup occurs for the CT action. The conntrack entry itself must be committed using the COMMIT flag in the CT action flags for this change to persist. Signed-off-by: Justin Pettit Signed-off-by: Joe Stringer Acked-by: Thomas Graf Acked-by: Pravin B Shelar Signed-off-by: David S. 
Miller --- include/uapi/linux/openvswitch.h | 5 +++ net/openvswitch/actions.c | 1 + net/openvswitch/conntrack.c | 67 ++++++++++++++++++++++++++++++++++++++-- net/openvswitch/conntrack.h | 1 + net/openvswitch/flow.h | 1 + net/openvswitch/flow_netlink.c | 12 ++++++- 6 files changed, 83 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 55f599792673..7a185b554343 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -325,6 +325,7 @@ enum ovs_key_attr { * the accepted length of the array. */ OVS_KEY_ATTR_CT_STATE, /* u8 bitmask of OVS_CS_F_* */ OVS_KEY_ATTR_CT_ZONE, /* u16 connection tracking zone. */ + OVS_KEY_ATTR_CT_MARK, /* u32 connection tracking mark */ #ifdef __KERNEL__ OVS_KEY_ATTR_TUNNEL_INFO, /* struct ip_tunnel_info */ @@ -613,11 +614,15 @@ struct ovs_action_hash { * enum ovs_ct_attr - Attributes for %OVS_ACTION_ATTR_CT action. * @OVS_CT_ATTR_FLAGS: u32 connection tracking flags. * @OVS_CT_ATTR_ZONE: u16 connection tracking zone. + * @OVS_CT_ATTR_MARK: u32 value followed by u32 mask. For each bit set in the + * mask, the corresponding bit in the value is copied to the connection + * tracking mark field in the connection. */ enum ovs_ct_attr { OVS_CT_ATTR_UNSPEC, OVS_CT_ATTR_FLAGS, /* u8 bitmask of OVS_CT_F_*. */ OVS_CT_ATTR_ZONE, /* u16 zone id. */ + OVS_CT_ATTR_MARK, /* mark to associate with this connection. */ __OVS_CT_ATTR_MAX }; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 72ca2c491b0a..9741d2c703df 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -968,6 +968,7 @@ static int execute_masked_set_action(struct sk_buff *skb, case OVS_KEY_ATTR_CT_STATE: case OVS_KEY_ATTR_CT_ZONE: + case OVS_KEY_ATTR_CT_MARK: err = -EINVAL; break; } diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 1189fd50f1cf..e53dafccf21f 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -28,12 +28,19 @@ struct ovs_ct_len_tbl { size_t minlen; }; +/* Metadata mark for masked write to conntrack mark */ +struct md_mark { + u32 value; + u32 mask; +}; + /* Conntrack action context for execution. */ struct ovs_conntrack_info { struct nf_conntrack_zone zone; struct nf_conn *ct; u32 flags; u16 family; + struct md_mark mark; }; static u16 key_to_nfproto(const struct sw_flow_key *key) @@ -84,10 +91,12 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo) } static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state, - const struct nf_conntrack_zone *zone) + const struct nf_conntrack_zone *zone, + const struct nf_conn *ct) { key->ct.state = state; key->ct.zone = zone->id; + key->ct.mark = ct ? ct->mark : 0; } /* Update 'key' based on skb->nfct. 
If 'post_ct' is true, then OVS has @@ -110,7 +119,7 @@ static void ovs_ct_update_key(const struct sk_buff *skb, } else if (post_ct) { state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID; } - __ovs_ct_update_key(key, state, zone); + __ovs_ct_update_key(key, state, zone, ct); } void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key) @@ -127,6 +136,35 @@ int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb) nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, key->ct.zone)) return -EMSGSIZE; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && + nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, key->ct.mark)) + return -EMSGSIZE; + + return 0; +} + +static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key, + u32 ct_mark, u32 mask) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + u32 new_mark; + + if (!IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)) + return -ENOTSUPP; + + /* The connection could be invalid, in which case set_mark is no-op. */ + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return 0; + + new_mark = ct_mark | (ct->mark & ~(mask)); + if (ct->mark != new_mark) { + ct->mark = new_mark; + nf_conntrack_event_cache(IPCT_MARK, ct); + key->ct.mark = new_mark; + } + return 0; } @@ -247,7 +285,7 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key, u8 state; state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED; - __ovs_ct_update_key(key, state, &info->zone); + __ovs_ct_update_key(key, state, &info->zone, exp->master); } else { int err; @@ -310,7 +348,13 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb, err = ovs_ct_commit(net, key, info, skb); else err = ovs_ct_lookup(net, key, info, skb); + if (err) + goto err; + if (info->mark.mask) + err = ovs_ct_set_mark(skb, key, info->mark.value, + info->mark.mask); +err: skb_push(skb, nh_ofs); return err; } @@ -320,6 +364,8 @@ static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { .maxlen = sizeof(u32) }, [OVS_CT_ATTR_ZONE] = { .minlen = sizeof(u16), .maxlen = sizeof(u16) }, + [OVS_CT_ATTR_MARK] = { .minlen = sizeof(struct md_mark), + .maxlen = sizeof(struct md_mark) }, }; static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, @@ -354,6 +400,14 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, case OVS_CT_ATTR_ZONE: info->zone.id = nla_get_u16(a); break; +#endif +#ifdef CONFIG_NF_CONNTRACK_MARK + case OVS_CT_ATTR_MARK: { + struct md_mark *mark = nla_data(a); + + info->mark = *mark; + break; + } #endif default: OVS_NLERR(log, "Unknown conntrack attr (%d)", @@ -377,6 +431,9 @@ bool ovs_ct_verify(enum ovs_key_attr attr) if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && attr == OVS_KEY_ATTR_CT_ZONE) return true; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && + attr == OVS_KEY_ATTR_CT_MARK) + return true; return false; } @@ -439,6 +496,10 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && nla_put_u16(skb, OVS_CT_ATTR_ZONE, ct_info->zone.id)) return -EMSGSIZE; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && + nla_put(skb, OVS_CT_ATTR_MARK, sizeof(ct_info->mark), + &ct_info->mark)) + return -EMSGSIZE; nla_nest_end(skb, start); diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h index e812ee64a718..87b289c58978 100644 --- a/net/openvswitch/conntrack.h +++ b/net/openvswitch/conntrack.h @@ -65,6 +65,7 @@ static inline void ovs_ct_fill_key(const struct sk_buff *skb, { key->ct.state = 0; key->ct.zone = 0; + key->ct.mark = 0; } static inline int ovs_ct_put_key(const 
struct sw_flow_key *key, diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 312c7d755b9b..e05e69711ce1 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -114,6 +114,7 @@ struct sw_flow_key { struct { /* Connection tracking fields. */ u16 zone; + u32 mark; u8 state; } ct; diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 4e795b289eb7..b17a4ec348f9 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -281,7 +281,7 @@ size_t ovs_key_attr_size(void) /* Whenever adding new OVS_KEY_ FIELDS, we should consider * updating this function. */ - BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 24); + BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 25); return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ @@ -292,6 +292,7 @@ size_t ovs_key_attr_size(void) + nla_total_size(4) /* OVS_KEY_ATTR_RECIRC_ID */ + nla_total_size(1) /* OVS_KEY_ATTR_CT_STATE */ + nla_total_size(2) /* OVS_KEY_ATTR_CT_ZONE */ + + nla_total_size(4) /* OVS_KEY_ATTR_CT_MARK */ + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + nla_total_size(4) /* OVS_KEY_ATTR_VLAN */ @@ -343,6 +344,7 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_MPLS] = { .len = sizeof(struct ovs_key_mpls) }, [OVS_KEY_ATTR_CT_STATE] = { .len = sizeof(u8) }, [OVS_KEY_ATTR_CT_ZONE] = { .len = sizeof(u16) }, + [OVS_KEY_ATTR_CT_MARK] = { .len = sizeof(u32) }, }; static bool is_all_zero(const u8 *fp, size_t size) @@ -787,6 +789,13 @@ static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, SW_FLOW_KEY_PUT(match, ct.zone, ct_zone, is_mask); *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ZONE); } + if (*attrs & (1 << OVS_KEY_ATTR_CT_MARK) && + ovs_ct_verify(OVS_KEY_ATTR_CT_MARK)) { + u32 mark = nla_get_u32(a[OVS_KEY_ATTR_CT_MARK]); + + SW_FLOW_KEY_PUT(match, ct.mark, mark, is_mask); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_MARK); + } return 0; } @@ -1919,6 +1928,7 @@ static int validate_set(const struct nlattr *a, case OVS_KEY_ATTR_PRIORITY: case OVS_KEY_ATTR_SKB_MARK: + case OVS_KEY_ATTR_CT_MARK: case OVS_KEY_ATTR_ETHERNET: break; -- cgit v1.2.3 From c2ac667358708d7cce64c78f58af6adf4c1e848b Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Wed, 26 Aug 2015 11:31:52 -0700 Subject: openvswitch: Allow matching on conntrack label Allow matching and setting the ct_label field. As with ct_mark, this is populated by executing the CT action. The label field may be modified by specifying a label and mask nested under the CT action. It is stored as metadata attached to the connection. Label modification occurs after lookup, and will only persist when the conntrack entry is committed by providing the COMMIT flag to the CT action. Labels are currently fixed to 128 bits in size. Signed-off-by: Joe Stringer Acked-by: Thomas Graf Acked-by: Pravin B Shelar Signed-off-by: David S. 
Miller --- include/uapi/linux/openvswitch.h | 10 +++ net/openvswitch/actions.c | 1 + net/openvswitch/conntrack.c | 128 ++++++++++++++++++++++++++++++++++++++- net/openvswitch/conntrack.h | 11 +++- net/openvswitch/datapath.c | 18 +++--- net/openvswitch/datapath.h | 3 + net/openvswitch/flow.c | 4 +- net/openvswitch/flow.h | 3 +- net/openvswitch/flow_netlink.c | 46 +++++++++----- net/openvswitch/flow_netlink.h | 9 +-- 10 files changed, 199 insertions(+), 34 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 7a185b554343..9d52058a9330 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -326,6 +326,7 @@ enum ovs_key_attr { OVS_KEY_ATTR_CT_STATE, /* u8 bitmask of OVS_CS_F_* */ OVS_KEY_ATTR_CT_ZONE, /* u16 connection tracking zone. */ OVS_KEY_ATTR_CT_MARK, /* u32 connection tracking mark */ + OVS_KEY_ATTR_CT_LABEL, /* 16-octet connection tracking label */ #ifdef __KERNEL__ OVS_KEY_ATTR_TUNNEL_INFO, /* struct ip_tunnel_info */ @@ -438,6 +439,11 @@ struct ovs_key_nd { __u8 nd_tll[ETH_ALEN]; }; +#define OVS_CT_LABEL_LEN 16 +struct ovs_key_ct_label { + __u8 ct_label[OVS_CT_LABEL_LEN]; +}; + /* OVS_KEY_ATTR_CT_STATE flags */ #define OVS_CS_F_NEW 0x01 /* Beginning of a new connection. */ #define OVS_CS_F_ESTABLISHED 0x02 /* Part of an existing connection. */ @@ -617,12 +623,16 @@ struct ovs_action_hash { * @OVS_CT_ATTR_MARK: u32 value followed by u32 mask. For each bit set in the * mask, the corresponding bit in the value is copied to the connection * tracking mark field in the connection. + * @OVS_CT_ATTR_LABEL: %OVS_CT_LABEL_LEN value followed by %OVS_CT_LABEL_LEN + * mask. For each bit set in the mask, the corresponding bit in the value is + * copied to the connection tracking label field in the connection. */ enum ovs_ct_attr { OVS_CT_ATTR_UNSPEC, OVS_CT_ATTR_FLAGS, /* u8 bitmask of OVS_CT_F_*. */ OVS_CT_ATTR_ZONE, /* u16 zone id. */ OVS_CT_ATTR_MARK, /* mark to associate with this connection. */ + OVS_CT_ATTR_LABEL, /* label to associate with this connection. */ __OVS_CT_ATTR_MAX }; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 9741d2c703df..736a113a75c3 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -969,6 +969,7 @@ static int execute_masked_set_action(struct sk_buff *skb, case OVS_KEY_ATTR_CT_STATE: case OVS_KEY_ATTR_CT_ZONE: case OVS_KEY_ATTR_CT_MARK: + case OVS_KEY_ATTR_CT_LABEL: err = -EINVAL; break; } diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index e53dafccf21f..a0417fb33d1d 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -34,6 +35,12 @@ struct md_mark { u32 mask; }; +/* Metadata label for masked write to conntrack label. */ +struct md_label { + struct ovs_key_ct_label value; + struct ovs_key_ct_label mask; +}; + /* Conntrack action context for execution. */ struct ovs_conntrack_info { struct nf_conntrack_zone zone; @@ -41,6 +48,7 @@ struct ovs_conntrack_info { u32 flags; u16 family; struct md_mark mark; + struct md_label label; }; static u16 key_to_nfproto(const struct sw_flow_key *key) @@ -90,6 +98,24 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo) return ct_state; } +static void ovs_ct_get_label(const struct nf_conn *ct, + struct ovs_key_ct_label *label) +{ + struct nf_conn_labels *cl = ct ? 
nf_ct_labels_find(ct) : NULL; + + if (cl) { + size_t len = cl->words * sizeof(long); + + if (len > OVS_CT_LABEL_LEN) + len = OVS_CT_LABEL_LEN; + else if (len < OVS_CT_LABEL_LEN) + memset(label, 0, OVS_CT_LABEL_LEN); + memcpy(label, cl->bits, len); + } else { + memset(label, 0, OVS_CT_LABEL_LEN); + } +} + static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state, const struct nf_conntrack_zone *zone, const struct nf_conn *ct) @@ -97,6 +123,7 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state, key->ct.state = state; key->ct.zone = zone->id; key->ct.mark = ct ? ct->mark : 0; + ovs_ct_get_label(ct, &key->ct.label); } /* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has @@ -140,6 +167,11 @@ int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb) nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, key->ct.mark)) return -EMSGSIZE; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && + nla_put(skb, OVS_KEY_ATTR_CT_LABEL, sizeof(key->ct.label), + &key->ct.label)) + return -EMSGSIZE; + return 0; } @@ -168,6 +200,40 @@ static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key, return 0; } +static int ovs_ct_set_label(struct sk_buff *skb, struct sw_flow_key *key, + const struct ovs_key_ct_label *label, + const struct ovs_key_ct_label *mask) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn_labels *cl; + struct nf_conn *ct; + int err; + + if (!IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)) + return -ENOTSUPP; + + /* The connection could be invalid, in which case set_label is no-op.*/ + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return 0; + + cl = nf_ct_labels_find(ct); + if (!cl) { + nf_ct_labels_ext_add(ct); + cl = nf_ct_labels_find(ct); + } + if (!cl || cl->words * sizeof(long) < OVS_CT_LABEL_LEN) + return -ENOSPC; + + err = nf_connlabels_replace(ct, (u32 *)label, (u32 *)mask, + OVS_CT_LABEL_LEN / sizeof(u32)); + if (err) + return err; + + ovs_ct_get_label(ct, &key->ct.label); + return 0; +} + static int handle_fragments(struct net *net, struct sw_flow_key *key, u16 zone, struct sk_buff *skb) { @@ -327,6 +393,17 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, return 0; } +static bool label_nonzero(const struct ovs_key_ct_label *label) +{ + size_t i; + + for (i = 0; i < sizeof(*label); i++) + if (label->ct_label[i]) + return true; + + return false; +} + int ovs_ct_execute(struct net *net, struct sk_buff *skb, struct sw_flow_key *key, const struct ovs_conntrack_info *info) @@ -351,9 +428,15 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb, if (err) goto err; - if (info->mark.mask) + if (info->mark.mask) { err = ovs_ct_set_mark(skb, key, info->mark.value, info->mark.mask); + if (err) + goto err; + } + if (label_nonzero(&info->label.mask)) + err = ovs_ct_set_label(skb, key, &info->label.value, + &info->label.mask); err: skb_push(skb, nh_ofs); return err; @@ -366,6 +449,8 @@ static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { .maxlen = sizeof(u16) }, [OVS_CT_ATTR_MARK] = { .minlen = sizeof(struct md_mark), .maxlen = sizeof(struct md_mark) }, + [OVS_CT_ATTR_LABEL] = { .minlen = sizeof(struct md_label), + .maxlen = sizeof(struct md_label) }, }; static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, @@ -408,6 +493,14 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, info->mark = *mark; break; } +#endif +#ifdef CONFIG_NF_CONNTRACK_LABELS + case OVS_CT_ATTR_LABEL: { + struct md_label *label = nla_data(a); + + info->label = *label; + break; + }
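/*
 * Illustrative note, not part of the original patch: nf_connlabels_replace()
 * applies the same masked-write rule to the 128-bit label that
 * ovs_ct_set_mark() above applies to the 32-bit mark, one 32-bit word at a
 * time. A minimal sketch of that rule (the helper name is hypothetical):
 *
 *	static u32 masked_write(u32 old, u32 value, u32 mask)
 *	{
 *		return (value & mask) | (old & ~mask);
 *	}
 *
 * Only the bits selected by the mask nested under the CT action are written;
 * all other label bits stored with the connection are preserved.
 */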
#endif default: OVS_NLERR(log, "Unknown conntrack attr (%d)", @@ -424,7 +517,7 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, return 0; } -bool ovs_ct_verify(enum ovs_key_attr attr) +bool ovs_ct_verify(struct net *net, enum ovs_key_attr attr) { if (attr == OVS_KEY_ATTR_CT_STATE) return true; @@ -434,6 +527,12 @@ bool ovs_ct_verify(enum ovs_key_attr attr) if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && attr == OVS_KEY_ATTR_CT_MARK) return true; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && + attr == OVS_KEY_ATTR_CT_LABEL) { + struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + + return ovs_net->xt_label; + } return false; } @@ -500,6 +599,10 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, nla_put(skb, OVS_CT_ATTR_MARK, sizeof(ct_info->mark), &ct_info->mark)) return -EMSGSIZE; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && + nla_put(skb, OVS_CT_ATTR_LABEL, sizeof(ct_info->label), + &ct_info->label)) + return -EMSGSIZE; nla_nest_end(skb, start); @@ -513,3 +616,24 @@ void ovs_ct_free_action(const struct nlattr *a) if (ct_info->ct) nf_ct_put(ct_info->ct); } + +void ovs_ct_init(struct net *net) +{ + unsigned int n_bits = sizeof(struct ovs_key_ct_label) * BITS_PER_BYTE; + struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + + if (nf_connlabels_get(net, n_bits)) { + ovs_net->xt_label = false; + OVS_NLERR(true, "Failed to set connlabel length"); + } else { + ovs_net->xt_label = true; + } +} + +void ovs_ct_exit(struct net *net) +{ + struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + + if (ovs_net->xt_label) + nf_connlabels_put(net); +} diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h index 87b289c58978..3cb30667a7dc 100644 --- a/net/openvswitch/conntrack.h +++ b/net/openvswitch/conntrack.h @@ -20,7 +20,9 @@ struct ovs_conntrack_info; enum ovs_key_attr; #if defined(CONFIG_OPENVSWITCH_CONNTRACK) -bool ovs_ct_verify(enum ovs_key_attr attr); +void ovs_ct_init(struct net *); +void ovs_ct_exit(struct net *); +bool ovs_ct_verify(struct net *, enum ovs_key_attr attr); int ovs_ct_copy_action(struct net *, const struct nlattr *, const struct sw_flow_key *, struct sw_flow_actions **, bool log); @@ -35,7 +37,11 @@ void ovs_ct_free_action(const struct nlattr *a); #else #include -static inline bool ovs_ct_verify(int attr) +static inline void ovs_ct_init(struct net *net) { } + +static inline void ovs_ct_exit(struct net *net) { } + +static inline bool ovs_ct_verify(struct net *net, int attr) { return false; } @@ -66,6 +72,7 @@ static inline void ovs_ct_fill_key(const struct sk_buff *skb, key->ct.state = 0; key->ct.zone = 0; key->ct.mark = 0; + memset(&key->ct.label, 0, sizeof(key->ct.label)); } static inline int ovs_ct_put_key(const struct sw_flow_key *key, diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 72e63726efa0..ec0f8d9cee73 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -599,8 +599,8 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) if (IS_ERR(flow)) goto err_kfree_skb; - err = ovs_flow_key_extract_userspace(a[OVS_PACKET_ATTR_KEY], packet, - &flow->key, log); + err = ovs_flow_key_extract_userspace(net, a[OVS_PACKET_ATTR_KEY], + packet, &flow->key, log); if (err) goto err_flow_free; @@ -947,7 +947,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) /* Extract key. 
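 * The match below is built from the OVS_FLOW_ATTR_KEY netlink attribute
 * plus the optional OVS_FLOW_ATTR_MASK attribute.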
*/ ovs_match_init(&match, &key, &mask); - error = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], + error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); if (error) goto err_kfree_flow; @@ -1118,7 +1118,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log); ovs_match_init(&match, &key, &mask); - error = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], + error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); if (error) goto error; @@ -1208,6 +1208,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct ovs_header *ovs_header = info->userhdr; + struct net *net = sock_net(skb->sk); struct sw_flow_key key; struct sk_buff *reply; struct sw_flow *flow; @@ -1222,7 +1223,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); if (a[OVS_FLOW_ATTR_KEY]) { ovs_match_init(&match, &key, NULL); - err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL, + err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], NULL, log); } else if (!ufid_present) { OVS_NLERR(log, @@ -1266,6 +1267,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct ovs_header *ovs_header = info->userhdr; + struct net *net = sock_net(skb->sk); struct sw_flow_key key; struct sk_buff *reply; struct sw_flow *flow = NULL; @@ -1280,8 +1282,8 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); if (a[OVS_FLOW_ATTR_KEY]) { ovs_match_init(&match, &key, NULL); - err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL, - log); + err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], + NULL, log); if (unlikely(err)) return err; } @@ -2237,6 +2239,7 @@ static int __net_init ovs_init_net(struct net *net) INIT_LIST_HEAD(&ovs_net->dps); INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq); + ovs_ct_init(net); return 0; } @@ -2271,6 +2274,7 @@ static void __net_exit ovs_exit_net(struct net *dnet) struct net *net; LIST_HEAD(head); + ovs_ct_exit(dnet); ovs_lock(); list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node) __dp_destroy(dp); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index d24ba98024be..4e785ab88973 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -138,6 +138,9 @@ struct ovs_net { struct list_head dps; struct work_struct dp_notify_work; struct vport_net vport_net; + + /* Module reference for configuring conntrack. */ + bool xt_label; }; extern int ovs_net_id; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 376ca8738fd4..5a3195e538ce 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -715,7 +715,7 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, return key_extract(skb, key); } -int ovs_flow_key_extract_userspace(const struct nlattr *attr, +int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr, struct sk_buff *skb, struct sw_flow_key *key, bool log) { @@ -724,7 +724,7 @@ int ovs_flow_key_extract_userspace(const struct nlattr *attr, memset(key, 0, OVS_SW_FLOW_KEY_METADATA_SIZE); /* Extract metadata from netlink attributes. 
*/ - err = ovs_nla_get_flow_metadata(attr, key, log); + err = ovs_nla_get_flow_metadata(net, attr, key, log); if (err) return err; diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index e05e69711ce1..fe527d2dd4b7 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -116,6 +116,7 @@ struct sw_flow_key { u16 zone; u32 mark; u8 state; + struct ovs_key_ct_label label; } ct; } __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */ @@ -220,7 +221,7 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key); /* Extract key from packet coming from userspace. */ -int ovs_flow_key_extract_userspace(const struct nlattr *attr, +int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr, struct sk_buff *skb, struct sw_flow_key *key, bool log); diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index b17a4ec348f9..e22c5bfe8575 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -281,7 +281,7 @@ size_t ovs_key_attr_size(void) /* Whenever adding new OVS_KEY_ FIELDS, we should consider * updating this function. */ - BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 25); + BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 26); return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ @@ -293,6 +293,7 @@ size_t ovs_key_attr_size(void) + nla_total_size(1) /* OVS_KEY_ATTR_CT_STATE */ + nla_total_size(2) /* OVS_KEY_ATTR_CT_ZONE */ + nla_total_size(4) /* OVS_KEY_ATTR_CT_MARK */ + + nla_total_size(16) /* OVS_KEY_ATTR_CT_LABEL */ + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + nla_total_size(4) /* OVS_KEY_ATTR_VLAN */ @@ -345,6 +346,7 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_CT_STATE] = { .len = sizeof(u8) }, [OVS_KEY_ATTR_CT_ZONE] = { .len = sizeof(u16) }, [OVS_KEY_ATTR_CT_MARK] = { .len = sizeof(u32) }, + [OVS_KEY_ATTR_CT_LABEL] = { .len = sizeof(struct ovs_key_ct_label) }, }; static bool is_all_zero(const u8 *fp, size_t size) @@ -721,9 +723,9 @@ int ovs_nla_put_egress_tunnel_key(struct sk_buff *skb, egress_tun_info->options_len); } -static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, - const struct nlattr **a, bool is_mask, - bool log) +static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match, + u64 *attrs, const struct nlattr **a, + bool is_mask, bool log) { if (*attrs & (1 << OVS_KEY_ATTR_DP_HASH)) { u32 hash_val = nla_get_u32(a[OVS_KEY_ATTR_DP_HASH]); @@ -776,36 +778,45 @@ static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, } if (*attrs & (1 << OVS_KEY_ATTR_CT_STATE) && - ovs_ct_verify(OVS_KEY_ATTR_CT_STATE)) { + ovs_ct_verify(net, OVS_KEY_ATTR_CT_STATE)) { u8 ct_state = nla_get_u8(a[OVS_KEY_ATTR_CT_STATE]); SW_FLOW_KEY_PUT(match, ct.state, ct_state, is_mask); *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_STATE); } if (*attrs & (1 << OVS_KEY_ATTR_CT_ZONE) && - ovs_ct_verify(OVS_KEY_ATTR_CT_ZONE)) { + ovs_ct_verify(net, OVS_KEY_ATTR_CT_ZONE)) { u16 ct_zone = nla_get_u16(a[OVS_KEY_ATTR_CT_ZONE]); SW_FLOW_KEY_PUT(match, ct.zone, ct_zone, is_mask); *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ZONE); } if (*attrs & (1 << OVS_KEY_ATTR_CT_MARK) && - ovs_ct_verify(OVS_KEY_ATTR_CT_MARK)) { + ovs_ct_verify(net, OVS_KEY_ATTR_CT_MARK)) { u32 mark = nla_get_u32(a[OVS_KEY_ATTR_CT_MARK]); SW_FLOW_KEY_PUT(match, ct.mark, mark, is_mask); *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_MARK); } 
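	/* As with ct_state, ct_zone and ct_mark above, the ct_label attribute
	 * is consumed only when ovs_ct_verify() reports that this kernel (and,
	 * for labels, this net namespace) can populate the field; otherwise
	 * the bit stays set in *attrs and the caller can reject the
	 * unsupported match.
	 */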
+ if (*attrs & (1 << OVS_KEY_ATTR_CT_LABEL) && + ovs_ct_verify(net, OVS_KEY_ATTR_CT_LABEL)) { + const struct ovs_key_ct_label *cl; + + cl = nla_data(a[OVS_KEY_ATTR_CT_LABEL]); + SW_FLOW_KEY_MEMCPY(match, ct.label, cl->ct_label, + sizeof(*cl), is_mask); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_LABEL); + } return 0; } -static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, - const struct nlattr **a, bool is_mask, - bool log) +static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match, + u64 attrs, const struct nlattr **a, + bool is_mask, bool log) { int err; - err = metadata_from_nlattrs(match, &attrs, a, is_mask, log); + err = metadata_from_nlattrs(net, match, &attrs, a, is_mask, log); if (err) return err; @@ -1057,6 +1068,7 @@ static void mask_set_nlattr(struct nlattr *attr, u8 val) * mask. In case the 'mask' is NULL, the flow is treated as exact match * flow. Otherwise, it is treated as a wildcarded flow, except the mask * does not include any don't care bit. + * @net: Used to determine per-namespace field support. * @match: receives the extracted flow match information. * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute * sequence. The fields should of the packet that triggered the creation @@ -1067,7 +1079,7 @@ static void mask_set_nlattr(struct nlattr *attr, u8 val) * probing for feature compatibility this should be passed in as false to * suppress unnecessary error logging. */ -int ovs_nla_get_match(struct sw_flow_match *match, +int ovs_nla_get_match(struct net *net, struct sw_flow_match *match, const struct nlattr *nla_key, const struct nlattr *nla_mask, bool log) @@ -1117,7 +1129,7 @@ int ovs_nla_get_match(struct sw_flow_match *match, } } - err = ovs_key_from_nlattrs(match, key_attrs, a, false, log); + err = ovs_key_from_nlattrs(net, match, key_attrs, a, false, log); if (err) return err; @@ -1197,7 +1209,8 @@ int ovs_nla_get_match(struct sw_flow_match *match, } } - err = ovs_key_from_nlattrs(match, mask_attrs, a, true, log); + err = ovs_key_from_nlattrs(net, match, mask_attrs, a, true, + log); if (err) goto free_newmask; } @@ -1278,7 +1291,7 @@ u32 ovs_nla_get_ufid_flags(const struct nlattr *attr) * extracted from the packet itself. 
*/ -int ovs_nla_get_flow_metadata(const struct nlattr *attr, +int ovs_nla_get_flow_metadata(struct net *net, const struct nlattr *attr, struct sw_flow_key *key, bool log) { @@ -1297,7 +1310,7 @@ int ovs_nla_get_flow_metadata(const struct nlattr *attr, memset(&key->ct, 0, sizeof(key->ct)); key->phy.in_port = DP_MAX_PORTS; - return metadata_from_nlattrs(&match, &attrs, a, false, log); + return metadata_from_nlattrs(net, &match, &attrs, a, false, log); } static int __ovs_nla_put_key(const struct sw_flow_key *swkey, @@ -1929,6 +1942,7 @@ static int validate_set(const struct nlattr *a, case OVS_KEY_ATTR_PRIORITY: case OVS_KEY_ATTR_SKB_MARK: case OVS_KEY_ATTR_CT_MARK: + case OVS_KEY_ATTR_CT_LABEL: case OVS_KEY_ATTR_ETHERNET: break; diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index c0b484b237c9..07878e22e783 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -45,15 +45,16 @@ void ovs_match_init(struct sw_flow_match *match, int ovs_nla_put_key(const struct sw_flow_key *, const struct sw_flow_key *, int attr, bool is_mask, struct sk_buff *); -int ovs_nla_get_flow_metadata(const struct nlattr *, struct sw_flow_key *, - bool log); +int ovs_nla_get_flow_metadata(struct net *, const struct nlattr *, + struct sw_flow_key *, bool log); int ovs_nla_put_identifier(const struct sw_flow *flow, struct sk_buff *skb); int ovs_nla_put_masked_key(const struct sw_flow *flow, struct sk_buff *skb); int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb); -int ovs_nla_get_match(struct sw_flow_match *, const struct nlattr *key, - const struct nlattr *mask, bool log); +int ovs_nla_get_match(struct net *, struct sw_flow_match *, + const struct nlattr *key, const struct nlattr *mask, + bool log); int ovs_nla_put_egress_tunnel_key(struct sk_buff *, const struct ip_tunnel_info *); -- cgit v1.2.3 From cae3a2627520c3795b54533c5328b77af3405dbe Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Wed, 26 Aug 2015 11:31:53 -0700 Subject: openvswitch: Allow attaching helpers to ct action Add support for using conntrack helpers to assist protocol detection. The new OVS_CT_ATTR_HELPER attribute of the CT action specifies a helper to be used for this connection. If no helper is specified, then helpers will be automatically applied as per the sysctl configuration of net.netfilter.nf_conntrack_helper. The helper may be specified as part of the conntrack action, eg: ct(helper=ftp). Initial packets for related connections should be committed to allow later packets for the flow to be considered established. Example ovs-ofctl flows allowing FTP connections from ports 1->2: in_port=1,tcp,action=ct(helper=ftp,commit),2 in_port=2,tcp,ct_state=-trk,action=ct(recirc) in_port=2,tcp,ct_state=+trk-new+est,action=1 in_port=2,tcp,ct_state=+trk+rel,action=1 Signed-off-by: Joe Stringer Acked-by: Thomas Graf Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 3 ++ net/openvswitch/conntrack.c | 109 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 110 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 9d52058a9330..32e07d8cbaf4 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -626,6 +626,7 @@ struct ovs_action_hash { * @OVS_CT_ATTR_LABEL: %OVS_CT_LABEL_LEN value followed by %OVS_CT_LABEL_LEN * mask. 
For each bit set in the mask, the corresponding bit in the value is * copied to the connection tracking label field in the connection. + * @OVS_CT_ATTR_HELPER: variable length string defining conntrack ALG. */ enum ovs_ct_attr { OVS_CT_ATTR_UNSPEC, @@ -633,6 +634,8 @@ enum ovs_ct_attr { OVS_CT_ATTR_ZONE, /* u16 zone id. */ OVS_CT_ATTR_MARK, /* mark to associate with this connection. */ OVS_CT_ATTR_LABEL, /* label to associate with this connection. */ + OVS_CT_ATTR_HELPER, /* netlink helper to assist detection of + related connections. */ __OVS_CT_ATTR_MAX }; diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index a0417fb33d1d..890d3eedb447 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -43,6 +44,7 @@ struct md_label { /* Conntrack action context for execution. */ struct ovs_conntrack_info { + struct nf_conntrack_helper *helper; struct nf_conntrack_zone zone; struct nf_conn *ct; u32 flags; @@ -234,6 +236,51 @@ static int ovs_ct_set_label(struct sk_buff *skb, struct sw_flow_key *key, return 0; } +/* 'skb' should already be pulled to nh_ofs. */ +static int ovs_ct_helper(struct sk_buff *skb, u16 proto) +{ + const struct nf_conntrack_helper *helper; + const struct nf_conn_help *help; + enum ip_conntrack_info ctinfo; + unsigned int protoff; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct || ctinfo == IP_CT_RELATED_REPLY) + return NF_ACCEPT; + + help = nfct_help(ct); + if (!help) + return NF_ACCEPT; + + helper = rcu_dereference(help->helper); + if (!helper) + return NF_ACCEPT; + + switch (proto) { + case NFPROTO_IPV4: + protoff = ip_hdrlen(skb); + break; + case NFPROTO_IPV6: { + u8 nexthdr = ipv6_hdr(skb)->nexthdr; + __be16 frag_off; + + protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), + &nexthdr, &frag_off); + if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { + pr_debug("proto header not found\n"); + return NF_ACCEPT; + } + break; + } + default: + WARN_ONCE(1, "helper invoked on non-IP family!"); + return NF_DROP; + } + + return helper->help(skb, protoff, ct, ctinfo); +} + static int handle_fragments(struct net *net, struct sw_flow_key *key, u16 zone, struct sk_buff *skb) { @@ -306,6 +353,13 @@ static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb, return false; if (!nf_ct_zone_equal_any(info->ct, nf_ct_zone(ct))) return false; + if (info->helper) { + struct nf_conn_help *help; + + help = nf_ct_ext_find(ct, NF_CT_EXT_HELPER); + if (help && rcu_access_pointer(help->helper) != info->helper) + return false; + } return true; } @@ -334,6 +388,11 @@ static int __ovs_ct_lookup(struct net *net, const struct sw_flow_key *key, if (nf_conntrack_in(net, info->family, NF_INET_PRE_ROUTING, skb) != NF_ACCEPT) return -ENOENT; + + if (ovs_ct_helper(skb, info->family) != NF_ACCEPT) { + WARN_ONCE(1, "helper rejected packet"); + return -EINVAL; + } } return 0; @@ -442,6 +501,30 @@ err: return err; } +static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name, + const struct sw_flow_key *key, bool log) +{ + struct nf_conntrack_helper *helper; + struct nf_conn_help *help; + + helper = nf_conntrack_helper_try_module_get(name, info->family, + key->ip.proto); + if (!helper) { + OVS_NLERR(log, "Unknown helper \"%s\"", name); + return -EINVAL; + } + + help = nf_ct_helper_ext_add(info->ct, helper, GFP_KERNEL); + if (!help) { + module_put(helper->me); + return -ENOMEM; + } + + rcu_assign_pointer(help->helper, helper); + 
info->helper = helper; + return 0; +} + static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { [OVS_CT_ATTR_FLAGS] = { .minlen = sizeof(u32), .maxlen = sizeof(u32) }, @@ -451,10 +534,12 @@ static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { .maxlen = sizeof(struct md_mark) }, [OVS_CT_ATTR_LABEL] = { .minlen = sizeof(struct md_label), .maxlen = sizeof(struct md_label) }, + [OVS_CT_ATTR_HELPER] = { .minlen = 1, + .maxlen = NF_CT_HELPER_NAME_LEN } }; static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, - bool log) + const char **helper, bool log) { struct nlattr *a; int rem; @@ -502,6 +587,13 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, break; } #endif + case OVS_CT_ATTR_HELPER: + *helper = nla_data(a); + if (!memchr(*helper, '\0', nla_len(a))) { + OVS_NLERR(log, "Invalid conntrack helper"); + return -EINVAL; + } + break; default: OVS_NLERR(log, "Unknown conntrack attr (%d)", type); @@ -542,6 +634,7 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, struct sw_flow_actions **sfa, bool log) { struct ovs_conntrack_info ct_info; + const char *helper = NULL; u16 family; int err; @@ -557,7 +650,7 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, nf_ct_zone_init(&ct_info.zone, NF_CT_DEFAULT_ZONE_ID, NF_CT_DEFAULT_ZONE_DIR, 0); - err = parse_ct(attr, &ct_info, log); + err = parse_ct(attr, &ct_info, &helper, log); if (err) return err; @@ -567,6 +660,11 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, OVS_NLERR(log, "Failed to allocate conntrack template"); return -ENOMEM; } + if (helper) { + err = ovs_ct_add_helper(&ct_info, helper, key, log); + if (err) + goto err_free_ct; + } err = ovs_nla_add_action(sfa, OVS_ACTION_ATTR_CT, &ct_info, sizeof(ct_info), log); @@ -603,6 +701,11 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, nla_put(skb, OVS_CT_ATTR_LABEL, sizeof(ct_info->label), &ct_info->label)) return -EMSGSIZE; + if (ct_info->helper) { + if (nla_put_string(skb, OVS_CT_ATTR_HELPER, + ct_info->helper->name)) + return -EMSGSIZE; + } nla_nest_end(skb, start); @@ -613,6 +716,8 @@ void ovs_ct_free_action(const struct nlattr *a) { struct ovs_conntrack_info *ct_info = nla_data(a); + if (ct_info->helper) + module_put(ct_info->helper->me); if (ct_info->ct) nf_ct_put(ct_info->ct); } -- cgit v1.2.3 From d2d427b3927bd7a0348fc7f323d0e291f79a2779 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Thu, 27 Aug 2015 15:32:26 +0900 Subject: bridge: Add netlink support for vlan_protocol attribute This enables the bridge vlan_protocol to be configured through netlink. When CONFIG_BRIDGE_VLAN_FILTERING is disabled, the kernel behaves the same way as if this feature were not implemented. Signed-off-by: Toshiaki Makita Signed-off-by: David S.
Miller --- include/uapi/linux/if_link.h | 1 + net/bridge/br_netlink.c | 34 ++++++++++++++++++++++++++++++++++ net/bridge/br_private.h | 1 + net/bridge/br_vlan.c | 35 +++++++++++++++++++++-------------- 4 files changed, 57 insertions(+), 14 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 313c305fd1ad..2d13dd44ecaa 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -231,6 +231,7 @@ enum { IFLA_BR_STP_STATE, IFLA_BR_PRIORITY, IFLA_BR_VLAN_FILTERING, + IFLA_BR_VLAN_PROTOCOL, __IFLA_BR_MAX, }; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index dbcb1949ea58..af5e187553fd 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -673,6 +673,21 @@ static int br_validate(struct nlattr *tb[], struct nlattr *data[]) return -EADDRNOTAVAIL; } + if (!data) + return 0; + +#ifdef CONFIG_BRIDGE_VLAN_FILTERING + if (data[IFLA_BR_VLAN_PROTOCOL]) { + switch (nla_get_be16(data[IFLA_BR_VLAN_PROTOCOL])) { + case htons(ETH_P_8021Q): + case htons(ETH_P_8021AD): + break; + default: + return -EPROTONOSUPPORT; + } + } +#endif + return 0; } @@ -729,6 +744,7 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_STP_STATE] = { .type = NLA_U32 }, [IFLA_BR_PRIORITY] = { .type = NLA_U16 }, [IFLA_BR_VLAN_FILTERING] = { .type = NLA_U8 }, + [IFLA_BR_VLAN_PROTOCOL] = { .type = NLA_U16 }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -784,6 +800,16 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], return err; } +#ifdef CONFIG_BRIDGE_VLAN_FILTERING + if (data[IFLA_BR_VLAN_PROTOCOL]) { + __be16 vlan_proto = nla_get_be16(data[IFLA_BR_VLAN_PROTOCOL]); + + err = __br_vlan_set_proto(br, vlan_proto); + if (err) + return err; + } +#endif + return 0; } @@ -796,6 +822,9 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size(sizeof(u32)) + /* IFLA_BR_STP_STATE */ nla_total_size(sizeof(u16)) + /* IFLA_BR_PRIORITY */ nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_FILTERING */ +#ifdef CONFIG_BRIDGE_VLAN_FILTERING + nla_total_size(sizeof(__be16)) + /* IFLA_BR_VLAN_PROTOCOL */ +#endif 0; } @@ -819,6 +848,11 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) nla_put_u8(skb, IFLA_BR_VLAN_FILTERING, vlan_enabled)) return -EMSGSIZE; +#ifdef CONFIG_BRIDGE_VLAN_FILTERING + if (nla_put_be16(skb, IFLA_BR_VLAN_PROTOCOL, br->vlan_proto)) + return -EMSGSIZE; +#endif + return 0; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 3d95647039d0..19e8f79b6b99 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -616,6 +616,7 @@ bool br_vlan_find(struct net_bridge *br, u16 vid); void br_recalculate_fwd_mask(struct net_bridge *br); int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val); int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val); +int __br_vlan_set_proto(struct net_bridge *br, __be16 proto); int br_vlan_set_proto(struct net_bridge *br, unsigned long val); int br_vlan_init(struct net_bridge *br); int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val); diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 3cef6892c0bb..3cd8cc9e804b 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -492,23 +492,16 @@ int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val) return 0; } -int br_vlan_set_proto(struct net_bridge *br, unsigned long val) +int __br_vlan_set_proto(struct net_bridge *br, __be16 
proto) { int err = 0; struct net_bridge_port *p; struct net_port_vlans *pv; - __be16 proto, oldproto; + __be16 oldproto; u16 vid, errvid; - if (val != ETH_P_8021Q && val != ETH_P_8021AD) - return -EPROTONOSUPPORT; - - if (!rtnl_trylock()) - return restart_syscall(); - - proto = htons(val); if (br->vlan_proto == proto) - goto unlock; + return 0; /* Add VLANs for the new proto to the device filter. */ list_for_each_entry(p, &br->port_list, list) { @@ -539,9 +532,7 @@ int br_vlan_set_proto(struct net_bridge *br, unsigned long val) vlan_vid_del(p->dev, oldproto, vid); } -unlock: - rtnl_unlock(); - return err; + return 0; err_filt: errvid = vid; @@ -557,7 +548,23 @@ err_filt: vlan_vid_del(p->dev, proto, vid); } - goto unlock; + return err; +} + +int br_vlan_set_proto(struct net_bridge *br, unsigned long val) +{ + int err; + + if (val != ETH_P_8021Q && val != ETH_P_8021AD) + return -EPROTONOSUPPORT; + + if (!rtnl_trylock()) + return restart_syscall(); + + err = __br_vlan_set_proto(br, htons(val)); + rtnl_unlock(); + + return err; } static bool vlan_default_pvid(struct net_port_vlans *pv, u16 vid) -- cgit v1.2.3 From cd7918b35f0ee0106bbe2ce4a14b5a8c9763deb8 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Wed, 26 Aug 2015 23:46:51 -0700 Subject: geneve: Make dst-port configurable. Add a netlink interface to configure the Geneve UDP port number, so that users can configure it for a given Geneve device. Signed-off-by: Pravin B Shelar Reviewed-by: Jesse Gross Acked-by: Thomas Graf Acked-by: John W. Linville Signed-off-by: David S. Miller --- drivers/net/geneve.c | 25 +++++++++++++++++++++---- include/uapi/linux/if_link.h | 1 + 2 files changed, 22 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 3c5b2b100943..0a6d9741d956 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -49,6 +49,7 @@ struct geneve_dev { u8 tos; /* TOS override */ struct sockaddr_in remote; /* IPv4 address for link partner */ struct list_head next; /* geneve's per namespace list */ + __be16 dst_port; }; static int geneve_net_id; @@ -64,6 +65,7 @@ static inline __u32 geneve_net_vni_hash(u8 vni[3]) /* geneve receive/decap routine */ static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb) { + struct inet_sock *sk = inet_sk(gs->sock->sk); struct genevehdr *gnvh = geneve_hdr(skb); struct geneve_dev *dummy, *geneve = NULL; struct geneve_net *gn; @@ -82,7 +84,8 @@ static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb) vni_list_head = &gn->vni_list[hash]; hlist_for_each_entry_rcu(dummy, vni_list_head, hlist) { if (!memcmp(gnvh->vni, dummy->vni, sizeof(dummy->vni)) && - iph->saddr == dummy->remote.sin_addr.s_addr) { + iph->saddr == dummy->remote.sin_addr.s_addr && + sk->inet_sport == dummy->dst_port) { geneve = dummy; break; } @@ -157,7 +160,7 @@ static int geneve_open(struct net_device *dev) struct geneve_net *gn = net_generic(geneve->net, geneve_net_id); struct geneve_sock *gs; - gs = geneve_sock_add(net, htons(GENEVE_UDP_PORT), geneve_rx, gn, + gs = geneve_sock_add(net, geneve->dst_port, geneve_rx, gn, false, false); if (IS_ERR(gs)) return PTR_ERR(gs); @@ -228,7 +231,7 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev) /* no need to handle local destination and encap bypass...yet...
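 * (presumably the shortcut the VXLAN driver takes when the tunnel
 * destination resolves to a local device; an illustrative gloss, not
 * part of the original patch)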
*/ err = geneve_xmit_skb(gs, rt, skb, fl4.saddr, fl4.daddr, - tos, ttl, 0, sport, htons(GENEVE_UDP_PORT), 0, + tos, ttl, 0, sport, geneve->dst_port, 0, geneve->vni, 0, NULL, false, !net_eq(geneve->net, dev_net(geneve->dev))); if (err < 0) @@ -308,6 +311,7 @@ static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = { [IFLA_GENEVE_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) }, [IFLA_GENEVE_TTL] = { .type = NLA_U8 }, [IFLA_GENEVE_TOS] = { .type = NLA_U8 }, + [IFLA_GENEVE_PORT] = { .type = NLA_U16 }, }; static int geneve_validate(struct nlattr *tb[], struct nlattr *data[]) @@ -341,6 +345,7 @@ static int geneve_newlink(struct net *net, struct net_device *dev, struct hlist_head *vni_list_head; struct sockaddr_in remote; /* IPv4 address for link partner */ __u32 vni, hash; + __be16 dst_port; int err; if (!data[IFLA_GENEVE_ID] || !data[IFLA_GENEVE_REMOTE]) @@ -359,13 +364,20 @@ static int geneve_newlink(struct net *net, struct net_device *dev, if (IN_MULTICAST(ntohl(geneve->remote.sin_addr.s_addr))) return -EINVAL; + if (data[IFLA_GENEVE_PORT]) + dst_port = htons(nla_get_u16(data[IFLA_GENEVE_PORT])); + else + dst_port = htons(GENEVE_UDP_PORT); + remote = geneve->remote; hash = geneve_net_vni_hash(geneve->vni); vni_list_head = &gn->vni_list[hash]; hlist_for_each_entry_rcu(dummy, vni_list_head, hlist) { if (!memcmp(geneve->vni, dummy->vni, sizeof(dummy->vni)) && - !memcmp(&remote, &dummy->remote, sizeof(dummy->remote))) + !memcmp(&remote, &dummy->remote, sizeof(dummy->remote)) && + dst_port == dummy->dst_port) { return -EBUSY; + } } err = register_netdevice(dev); @@ -378,6 +390,7 @@ static int geneve_newlink(struct net *net, struct net_device *dev, if (data[IFLA_GENEVE_TOS]) geneve->tos = nla_get_u8(data[IFLA_GENEVE_TOS]); + geneve->dst_port = dst_port; list_add(&geneve->next, &gn->geneve_list); hlist_add_head_rcu(&geneve->hlist, &gn->vni_list[hash]); @@ -402,6 +415,7 @@ static size_t geneve_get_size(const struct net_device *dev) nla_total_size(sizeof(struct in_addr)) + /* IFLA_GENEVE_REMOTE */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TTL */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TOS */ + nla_total_size(sizeof(__u16)) + /* IFLA_GENEVE_PORT */ 0; } @@ -422,6 +436,9 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_GENEVE_TOS, geneve->tos)) goto nla_put_failure; + if (nla_put_u16(skb, IFLA_GENEVE_PORT, ntohs(geneve->dst_port))) + goto nla_put_failure; + return 0; nla_put_failure: diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 2d13dd44ecaa..9d73c31896d0 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -410,6 +410,7 @@ enum { IFLA_GENEVE_REMOTE, IFLA_GENEVE_TTL, IFLA_GENEVE_TOS, + IFLA_GENEVE_PORT, /* destination port */ __IFLA_GENEVE_MAX }; #define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) -- cgit v1.2.3 From e305ac6cf5a1e1386aedce7ef9cb773635d5845c Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Wed, 26 Aug 2015 23:46:52 -0700 Subject: geneve: Add support to collect tunnel metadata. The following patch creates a new tunnel flag which enables tunnel metadata collection on a given device. These devices can be used by tunnel-metadata-based routing or by OVS. A later Geneve consolidation patch gets rid of collect_md_tun to simplify tunnel lookup further. Signed-off-by: Pravin B Shelar Reviewed-by: Jesse Gross Acked-by: Thomas Graf Signed-off-by: David S.
Miller --- drivers/net/geneve.c | 356 ++++++++++++++++++++++++++++++++----------- include/net/geneve.h | 3 + include/uapi/linux/if_link.h | 1 + 3 files changed, 275 insertions(+), 85 deletions(-) (limited to 'include/uapi') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 0a6d9741d956..d05150cc25d4 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -36,6 +37,7 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); struct geneve_net { struct list_head geneve_list; struct hlist_head vni_list[VNI_HASH_SIZE]; + struct geneve_dev __rcu *collect_md_tun; }; /* Pseudo network device */ @@ -50,6 +52,7 @@ struct geneve_dev { struct sockaddr_in remote; /* IPv4 address for link partner */ struct list_head next; /* geneve's per namespace list */ __be16 dst_port; + bool collect_md; }; static int geneve_net_id; @@ -62,48 +65,95 @@ static inline __u32 geneve_net_vni_hash(u8 vni[3]) return hash_32(vnid, VNI_HASH_BITS); } -/* geneve receive/decap routine */ -static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb) +static __be64 vni_to_tunnel_id(const __u8 *vni) +{ +#ifdef __BIG_ENDIAN + return (vni[0] << 16) | (vni[1] << 8) | vni[2]; +#else + return (__force __be64)(((__force u64)vni[0] << 40) | + ((__force u64)vni[1] << 48) | + ((__force u64)vni[2] << 56)); +#endif +} + +static struct geneve_dev *geneve_lookup(struct geneve_net *gn, + struct geneve_sock *gs, + struct iphdr *iph, + struct genevehdr *gnvh) { struct inet_sock *sk = inet_sk(gs->sock->sk); - struct genevehdr *gnvh = geneve_hdr(skb); - struct geneve_dev *dummy, *geneve = NULL; - struct geneve_net *gn; - struct iphdr *iph = NULL; - struct pcpu_sw_netstats *stats; struct hlist_head *vni_list_head; - int err = 0; + struct geneve_dev *geneve; __u32 hash; - iph = ip_hdr(skb); /* Still outer IP header... */ - - gn = gs->rcv_data; + geneve = rcu_dereference(gn->collect_md_tun); + if (geneve) + return geneve; /* Find the device for this VNI */ hash = geneve_net_vni_hash(gnvh->vni); vni_list_head = &gn->vni_list[hash]; - hlist_for_each_entry_rcu(dummy, vni_list_head, hlist) { - if (!memcmp(gnvh->vni, dummy->vni, sizeof(dummy->vni)) && - iph->saddr == dummy->remote.sin_addr.s_addr && - sk->inet_sport == dummy->dst_port) { - geneve = dummy; - break; + hlist_for_each_entry_rcu(geneve, vni_list_head, hlist) { + if (!memcmp(gnvh->vni, geneve->vni, sizeof(geneve->vni)) && + iph->saddr == geneve->remote.sin_addr.s_addr && + sk->inet_sport == geneve->dst_port) { + return geneve; } } + return NULL; +} + +/* geneve receive/decap routine */ +static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb) +{ + struct genevehdr *gnvh = geneve_hdr(skb); + struct metadata_dst *tun_dst = NULL; + struct geneve_dev *geneve = NULL; + struct pcpu_sw_netstats *stats; + struct geneve_net *gn; + struct iphdr *iph; + int err; + + iph = ip_hdr(skb); /* Still outer IP header... */ + gn = gs->rcv_data; + geneve = geneve_lookup(gn, gs, iph, gnvh); if (!geneve) goto drop; - /* Drop packets w/ critical options, - * since we don't support any... - */ - if (gnvh->critical) - goto drop; + if (ip_tunnel_collect_metadata() || geneve->collect_md) { + __be16 flags; + void *opts; + + flags = TUNNEL_KEY | TUNNEL_GENEVE_OPT | + (gnvh->oam ? TUNNEL_OAM : 0) | + (gnvh->critical ? 
TUNNEL_CRIT_OPT : 0); + + tun_dst = udp_tun_rx_dst(skb, AF_INET, flags, + vni_to_tunnel_id(gnvh->vni), + gnvh->opt_len * 4); + if (!tun_dst) + goto drop; + + /* Update tunnel dst according to Geneve options. */ + opts = ip_tunnel_info_opts(&tun_dst->u.tun_info, + gnvh->opt_len * 4); + memcpy(opts, gnvh->options, gnvh->opt_len * 4); + } else { + /* Drop packets w/ critical options, + * since we don't support any... + */ + if (gnvh->critical) + goto drop; + } skb_reset_mac_header(skb); skb_scrub_packet(skb, !net_eq(geneve->net, dev_net(geneve->dev))); skb->protocol = eth_type_trans(skb, geneve->dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + if (tun_dst) + skb_dst_set(skb, &tun_dst->dst); + /* Ignore packet loops (and multicast echo) */ if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr)) goto drop; @@ -131,7 +181,6 @@ static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb) u64_stats_update_end(&stats->syncp); netif_rx(skb); - return; drop: /* Consume bad packet */ @@ -144,7 +193,6 @@ static int geneve_init(struct net_device *dev) dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; - return 0; } @@ -180,69 +228,137 @@ static int geneve_stop(struct net_device *dev) return 0; } +static struct rtable *geneve_get_rt(struct sk_buff *skb, + struct net_device *dev, + struct flowi4 *fl4, + struct ip_tunnel_info *info) +{ + struct geneve_dev *geneve = netdev_priv(dev); + struct rtable *rt = NULL; + __u8 tos; + + memset(fl4, 0, sizeof(*fl4)); + fl4->flowi4_mark = skb->mark; + fl4->flowi4_proto = IPPROTO_UDP; + + if (info) { + fl4->daddr = info->key.u.ipv4.dst; + fl4->saddr = info->key.u.ipv4.src; + fl4->flowi4_tos = RT_TOS(info->key.tos); + } else { + tos = geneve->tos; + if (tos == 1) { + const struct iphdr *iip = ip_hdr(skb); + + tos = ip_tunnel_get_dsfield(iip, skb); + } + + fl4->flowi4_tos = RT_TOS(tos); + fl4->daddr = geneve->remote.sin_addr.s_addr; + } + + rt = ip_route_output_key(geneve->net, fl4); + if (IS_ERR(rt)) { + netdev_dbg(dev, "no route to %pI4\n", &fl4->daddr); + dev->stats.tx_carrier_errors++; + return rt; + } + if (rt->dst.dev == dev) { /* is this necessary? */ + netdev_dbg(dev, "circular route to %pI4\n", &fl4->daddr); + dev->stats.collisions++; + ip_rt_put(rt); + return ERR_PTR(-EINVAL); + } + + return rt; +} + +/* Convert 64 bit tunnel ID to 24 bit VNI. 
*/ +static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni) +{ +#ifdef __BIG_ENDIAN + vni[0] = (__force __u8)(tun_id >> 16); + vni[1] = (__force __u8)(tun_id >> 8); + vni[2] = (__force __u8)tun_id; +#else + vni[0] = (__force __u8)((__force u64)tun_id >> 40); + vni[1] = (__force __u8)((__force u64)tun_id >> 48); + vni[2] = (__force __u8)((__force u64)tun_id >> 56); +#endif +} + static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev) { struct geneve_dev *geneve = netdev_priv(dev); struct geneve_sock *gs = geneve->sock; + struct ip_tunnel_info *info = NULL; struct rtable *rt = NULL; const struct iphdr *iip; /* interior IP header */ struct flowi4 fl4; - int err; - __be16 sport; __u8 tos, ttl; + __be16 sport; + bool xnet; + int err; - iip = ip_hdr(skb); - - skb_reset_mac_header(skb); - - /* TODO: port min/max limits should be configurable */ - sport = udp_flow_src_port(dev_net(dev), skb, 0, 0, true); - - tos = geneve->tos; - if (tos == 1) - tos = ip_tunnel_get_dsfield(iip, skb); + sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true); - memset(&fl4, 0, sizeof(fl4)); - fl4.flowi4_tos = RT_TOS(tos); - fl4.daddr = geneve->remote.sin_addr.s_addr; - fl4.flowi4_mark = skb->mark; - fl4.flowi4_proto = IPPROTO_UDP; + if (geneve->collect_md) { + info = skb_tunnel_info(skb); + if (unlikely(info && info->mode != IP_TUNNEL_INFO_TX)) { + netdev_dbg(dev, "no tunnel metadata\n"); + goto tx_error; + } + } - rt = ip_route_output_key(geneve->net, &fl4); + rt = geneve_get_rt(skb, dev, &fl4, info); if (IS_ERR(rt)) { netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr); dev->stats.tx_carrier_errors++; goto tx_error; } - if (rt->dst.dev == dev) { /* is this necessary? */ - netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr); - dev->stats.collisions++; - goto rt_tx_error; + skb_reset_mac_header(skb); + xnet = !net_eq(geneve->net, dev_net(geneve->dev)); + + if (info) { + const struct ip_tunnel_key *key = &info->key; + bool udp_csum; + u8 *opts = NULL; + u8 vni[3]; + __be16 df; + + tunnel_id_to_vni(key->tun_id, vni); + df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; + udp_csum = !!(key->tun_flags & TUNNEL_CSUM); + + if (key->tun_flags & TUNNEL_GENEVE_OPT) + opts = ip_tunnel_info_opts(info, info->options_len); + + err = geneve_xmit_skb(gs, rt, skb, fl4.saddr, fl4.daddr, + key->tos, key->ttl, df, + sport, geneve->dst_port, + key->tun_flags, vni, + info->options_len, opts, udp_csum, xnet); + } else { + iip = ip_hdr(skb); + tos = ip_tunnel_ecn_encap(fl4.flowi4_tos, iip, skb); + + ttl = geneve->ttl; + if (!ttl && IN_MULTICAST(ntohl(fl4.daddr))) + ttl = 1; + + ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); + + /* no need to handle local destination and encap bypass...yet... */ + err = geneve_xmit_skb(gs, rt, skb, fl4.saddr, fl4.daddr, tos, + ttl, 0, sport, geneve->dst_port, 0, + geneve->vni, 0, NULL, false, xnet); } - - tos = ip_tunnel_ecn_encap(tos, iip, skb); - - ttl = geneve->ttl; - if (!ttl && IN_MULTICAST(ntohl(fl4.daddr))) - ttl = 1; - - ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); - - /* no need to handle local destination and encap bypass...yet... 
*/ - - err = geneve_xmit_skb(gs, rt, skb, fl4.saddr, fl4.daddr, - tos, ttl, 0, sport, geneve->dst_port, 0, - geneve->vni, 0, NULL, false, - !net_eq(geneve->net, dev_net(geneve->dev))); if (err < 0) ip_rt_put(rt); iptunnel_xmit_stats(err, &dev->stats, dev->tstats); - return NETDEV_TX_OK; -rt_tx_error: - ip_rt_put(rt); tx_error: dev->stats.tx_errors++; dev_kfree_skb(skb); @@ -312,6 +428,7 @@ static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = { [IFLA_GENEVE_TTL] = { .type = NLA_U8 }, [IFLA_GENEVE_TOS] = { .type = NLA_U8 }, [IFLA_GENEVE_PORT] = { .type = NLA_U16 }, + [IFLA_GENEVE_COLLECT_METADATA] = { .type = NLA_FLAG }, }; static int geneve_validate(struct nlattr *tb[], struct nlattr *data[]) @@ -337,71 +454,112 @@ static int geneve_validate(struct nlattr *tb[], struct nlattr *data[]) return 0; } -static int geneve_newlink(struct net *net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]) +static int geneve_configure(struct net *net, struct net_device *dev, + __be32 rem_addr, __u32 vni, __u8 ttl, __u8 tos, + __u16 dst_port, bool metadata) { struct geneve_net *gn = net_generic(net, geneve_net_id); struct geneve_dev *dummy, *geneve = netdev_priv(dev); struct hlist_head *vni_list_head; struct sockaddr_in remote; /* IPv4 address for link partner */ - __u32 vni, hash; - __be16 dst_port; + __u32 hash; int err; - if (!data[IFLA_GENEVE_ID] || !data[IFLA_GENEVE_REMOTE]) - return -EINVAL; + if (metadata) { + if (rtnl_dereference(gn->collect_md_tun)) + return -EEXIST; + if (!list_empty(&gn->geneve_list)) + return -EPERM; + } else { + if (rtnl_dereference(gn->collect_md_tun)) + return -EPERM; + } geneve->net = net; geneve->dev = dev; - vni = nla_get_u32(data[IFLA_GENEVE_ID]); geneve->vni[0] = (vni & 0x00ff0000) >> 16; geneve->vni[1] = (vni & 0x0000ff00) >> 8; geneve->vni[2] = vni & 0x000000ff; - geneve->remote.sin_addr.s_addr = - nla_get_in_addr(data[IFLA_GENEVE_REMOTE]); + geneve->remote.sin_addr.s_addr = rem_addr; if (IN_MULTICAST(ntohl(geneve->remote.sin_addr.s_addr))) return -EINVAL; - if (data[IFLA_GENEVE_PORT]) - dst_port = htons(nla_get_u16(data[IFLA_GENEVE_PORT])); - else - dst_port = htons(GENEVE_UDP_PORT); - remote = geneve->remote; + if (metadata) { + if (rem_addr || vni || tos || ttl) + return -EINVAL; + } + hash = geneve_net_vni_hash(geneve->vni); vni_list_head = &gn->vni_list[hash]; hlist_for_each_entry_rcu(dummy, vni_list_head, hlist) { if (!memcmp(geneve->vni, dummy->vni, sizeof(dummy->vni)) && !memcmp(&remote, &dummy->remote, sizeof(dummy->remote)) && - dst_port == dummy->dst_port) { + htons(dst_port) == dummy->dst_port) { return -EBUSY; } } + geneve->ttl = ttl; + geneve->tos = tos; + geneve->dst_port = htons(dst_port); + geneve->collect_md = metadata; + err = register_netdevice(dev); if (err) return err; + list_add(&geneve->next, &gn->geneve_list); + hlist_add_head_rcu(&geneve->hlist, &gn->vni_list[hash]); + + if (geneve->collect_md) + rcu_assign_pointer(gn->collect_md_tun, geneve); + return 0; +} + +static int geneve_newlink(struct net *net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + __u16 dst_port = GENEVE_UDP_PORT; + __u8 ttl = 0, tos = 0; + bool metadata = false; + __be32 rem_addr; + __u32 vni; + + if (!data[IFLA_GENEVE_ID] || !data[IFLA_GENEVE_REMOTE]) + return -EINVAL; + + vni = nla_get_u32(data[IFLA_GENEVE_ID]); + rem_addr = nla_get_in_addr(data[IFLA_GENEVE_REMOTE]); + if (data[IFLA_GENEVE_TTL]) - geneve->ttl = nla_get_u8(data[IFLA_GENEVE_TTL]); + ttl = nla_get_u8(data[IFLA_GENEVE_TTL]); if 
(data[IFLA_GENEVE_TOS]) - geneve->tos = nla_get_u8(data[IFLA_GENEVE_TOS]); + tos = nla_get_u8(data[IFLA_GENEVE_TOS]); - geneve->dst_port = dst_port; - list_add(&geneve->next, &gn->geneve_list); + if (data[IFLA_GENEVE_PORT]) + dst_port = nla_get_u16(data[IFLA_GENEVE_PORT]); - hlist_add_head_rcu(&geneve->hlist, &gn->vni_list[hash]); + if (data[IFLA_GENEVE_COLLECT_METADATA]) + metadata = true; - return 0; + return geneve_configure(net, dev, rem_addr, vni, + ttl, tos, dst_port, metadata); } static void geneve_dellink(struct net_device *dev, struct list_head *head) { struct geneve_dev *geneve = netdev_priv(dev); + if (geneve->collect_md) { + struct geneve_net *gn = net_generic(geneve->net, geneve_net_id); + + rcu_assign_pointer(gn->collect_md_tun, NULL); + } + if (!hlist_unhashed(&geneve->hlist)) hlist_del_rcu(&geneve->hlist); @@ -416,6 +574,7 @@ static size_t geneve_get_size(const struct net_device *dev) nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TTL */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TOS */ nla_total_size(sizeof(__u16)) + /* IFLA_GENEVE_PORT */ + nla_total_size(0) + /* IFLA_GENEVE_COLLECT_METADATA */ 0; } @@ -439,6 +598,11 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) if (nla_put_u16(skb, IFLA_GENEVE_PORT, ntohs(geneve->dst_port))) goto nla_put_failure; + if (geneve->collect_md) { + if (nla_put_flag(skb, IFLA_GENEVE_COLLECT_METADATA)) + goto nla_put_failure; + } + return 0; nla_put_failure: @@ -458,6 +622,28 @@ static struct rtnl_link_ops geneve_link_ops __read_mostly = { .fill_info = geneve_fill_info, }; +struct net_device *geneve_dev_create_fb(struct net *net, const char *name, + u8 name_assign_type, u16 dst_port) +{ + struct nlattr *tb[IFLA_MAX + 1]; + struct net_device *dev; + int err; + + memset(tb, 0, sizeof(tb)); + dev = rtnl_create_link(net, name, name_assign_type, + &geneve_link_ops, tb); + if (IS_ERR(dev)) + return dev; + + err = geneve_configure(net, dev, 0, 0, 0, 0, dst_port, true); + if (err) { + free_netdev(dev); + return ERR_PTR(err); + } + return dev; +} +EXPORT_SYMBOL_GPL(geneve_dev_create_fb); + static __net_init int geneve_init_net(struct net *net) { struct geneve_net *gn = net_generic(net, geneve_net_id); diff --git a/include/net/geneve.h b/include/net/geneve.h index 2a0543a1899d..4245e1d23b9b 100644 --- a/include/net/geneve.h +++ b/include/net/geneve.h @@ -96,6 +96,9 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, bool csum, bool xnet); + +struct net_device *geneve_dev_create_fb(struct net *net, const char *name, + u8 name_assign_type, u16 dst_port); #endif /*ifdef CONFIG_INET */ #endif /*ifdef__NET_GENEVE_H */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 9d73c31896d0..3a5f263cfc2f 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -411,6 +411,7 @@ enum { IFLA_GENEVE_TTL, IFLA_GENEVE_TOS, IFLA_GENEVE_PORT, /* destination port */ + IFLA_GENEVE_COLLECT_METADATA, __IFLA_GENEVE_MAX }; #define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) -- cgit v1.2.3 From 5153aacfb8e2744af68e7b84ccd3f02aeefe4f48 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Wed, 26 Aug 2015 21:12:15 +0800 Subject: NFS: Update NFS4_BITMAP_SIZE NFSv4.1/v4.2 define attributes in word 2, and the NFS client also supports security labels now. v3: same as v2.
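For illustration only, a three-word bitmap is what lets the client ask for word-2 attributes such as the security label; a minimal kernel-side sketch, assuming the FATTR4_WORD0/1/2 masks from the kernel's internal nfs4.h:

	/* Sketch: with NFS4_BITMAP_SIZE == 3 a request bitmap can carry a
	 * word-2 bit; a 2-word array could not hold the third entry. */
	u32 bitmap[NFS4_BITMAP_SIZE] = {
		FATTR4_WORD0_TYPE | FATTR4_WORD0_SIZE,   /* word 0 */
		FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER,  /* word 1 */
		FATTR4_WORD2_SECURITY_LABEL,             /* word 2 */
	};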
Signed-off-by: Kinglong Mee Signed-off-by: Trond Myklebust --- include/uapi/linux/nfs4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nfs4.h b/include/uapi/linux/nfs4.h index 2119c7c274d7..2b871e0858d9 100644 --- a/include/uapi/linux/nfs4.h +++ b/include/uapi/linux/nfs4.h @@ -15,7 +15,7 @@ #include -#define NFS4_BITMAP_SIZE 2 +#define NFS4_BITMAP_SIZE 3 #define NFS4_VERIFIER_SIZE 8 #define NFS4_STATEID_SEQID_SIZE 4 #define NFS4_STATEID_OTHER_SIZE 12 -- cgit v1.2.3 From 0a6a3a23ea6efde079a5b77688541a98bf202721 Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Fri, 28 Aug 2015 07:07:48 +0200 Subject: netlink: add NETLINK_CAP_ACK socket option Since commit c05cdb1b864f ("netlink: allow large data transfers from user-space"), the kernel may fail to allocate the necessary room for the acknowledgment message back to userspace. This patch introduces a new socket option that trims off the payload of the original netlink message. The netlink message header is still included, so the user can infer from the sequence number which message triggered the acknowledgment. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Christophe Ricard Signed-off-by: David S. Miller --- include/uapi/linux/netlink.h | 1 + net/netlink/af_netlink.c | 27 ++++++++++++++++++++++--- 2 files changed, 25 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index cf6a65cccbdf..6f3fe16cd22a 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -110,6 +110,7 @@ struct nlmsgerr { #define NETLINK_TX_RING 7 #define NETLINK_LISTEN_ALL_NSID 8 #define NETLINK_LIST_MEMBERSHIPS 9 +#define NETLINK_CAP_ACK 10 struct nl_pktinfo { __u32 group; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index a774985489e2..3eea0b2a3239 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -84,6 +84,7 @@ struct listeners { #define NETLINK_F_BROADCAST_SEND_ERROR 0x4 #define NETLINK_F_RECV_NO_ENOBUFS 0x8 #define NETLINK_F_LISTEN_ALL_NSID 0x10 +#define NETLINK_F_CAP_ACK 0x20 static inline int netlink_is_kernel(struct sock *sk) { @@ -2258,6 +2259,13 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, nlk->flags &= ~NETLINK_F_LISTEN_ALL_NSID; err = 0; break; + case NETLINK_CAP_ACK: + if (val) + nlk->flags |= NETLINK_F_CAP_ACK; + else + nlk->flags &= ~NETLINK_F_CAP_ACK; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -2332,6 +2340,16 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, netlink_table_ungrab(); break; } + case NETLINK_CAP_ACK: + if (len < sizeof(int)) + return -EINVAL; + len = sizeof(int); + val = nlk->flags & NETLINK_F_CAP_ACK ? 1 : 0; + if (put_user(len, optlen) || + put_user(val, optval)) + return -EFAULT; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -2873,9 +2891,12 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) struct nlmsghdr *rep; struct nlmsgerr *errmsg; size_t payload = sizeof(*errmsg); + struct netlink_sock *nlk = nlk_sk(NETLINK_CB(in_skb).sk); - /* error messages get the original request appened */ - if (err) + /* Error messages get the original request appened, unless the user + * requests to cap the error message.
+ */ + if (!(nlk->flags & NETLINK_F_CAP_ACK) && err) payload += nlmsg_len(nlh); skb = netlink_alloc_skb(in_skb->sk, nlmsg_total_size(payload), @@ -2898,7 +2919,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) NLMSG_ERROR, payload, 0); errmsg = nlmsg_data(rep); errmsg->error = err; - memcpy(&errmsg->msg, nlh, err ? nlh->nlmsg_len : sizeof(*nlh)); + memcpy(&errmsg->msg, nlh, payload > sizeof(*errmsg) ? nlh->nlmsg_len : sizeof(*nlh)); netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid, MSG_DONTWAIT); } EXPORT_SYMBOL(netlink_ack); -- cgit v1.2.3 From b8d3e4163a3562d7cba486687904383e78e7dd6a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 31 Aug 2015 15:58:46 +0200 Subject: fib, fib6: reject invalid feature bits Invalid feature bits should not be accepted by the kernel: only the lower 4 bits may be configured, not the remaining ones. Even of these 4, 2 are unused. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 11 +++++++---- net/ipv4/fib_semantics.c | 2 ++ net/ipv6/route.c | 2 ++ 3 files changed, 11 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 0d3d3cc43356..702024769c74 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -418,10 +418,13 @@ enum { #define RTAX_MAX (__RTAX_MAX - 1) -#define RTAX_FEATURE_ECN 0x00000001 -#define RTAX_FEATURE_SACK 0x00000002 -#define RTAX_FEATURE_TIMESTAMP 0x00000004 -#define RTAX_FEATURE_ALLFRAG 0x00000008 +#define RTAX_FEATURE_ECN (1 << 0) +#define RTAX_FEATURE_SACK (1 << 1) +#define RTAX_FEATURE_TIMESTAMP (1 << 2) +#define RTAX_FEATURE_ALLFRAG (1 << 3) + +#define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | RTAX_FEATURE_SACK | \ + RTAX_FEATURE_TIMESTAMP | RTAX_FEATURE_ALLFRAG) struct rta_session { __u8 proto; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 88afbae893f0..115a08e70d43 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -908,6 +908,8 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) val = 65535 - 40; if (type == RTAX_MTU && val > 65535 - 15) val = 65535 - 15; + if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) + return -EINVAL; fi->fib_metrics[type - 1] = val; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0261b721b34b..8771530df45e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1728,6 +1728,8 @@ static int ip6_convert_metrics(struct mx6_config *mxc, } else { val = nla_get_u32(nla); } + if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) + goto err; mp[type - 1] = val; __set_bit(type - 1, mxc->mx_valid); -- cgit v1.2.3 From 16f7249ddf831f5ec0e1358222ce5db300446b84 Mon Sep 17 00:00:00 2001 From: Artem Savkov Date: Wed, 2 Sep 2015 13:41:18 +0200 Subject: uapi/drm/i915_drm.h: fix userspace compilation. commit 346add7834557b5b9628b9bf2387106d42e631d4 Author: Daniel Vetter Date: Tue Jul 14 18:07:30 2015 +0200 drm/i915: Use expcitly fixed type in compat32 structs changed the type of the param field in drm_i915_getparam from int to s32. This header is exported to userspace and needs to use the userspace type __s32 instead.
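As a minimal sketch of the kind of userspace consumer this unblocks (error handling omitted; assumes the libdrm-installed header paths and an already-opened DRM node):

	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <drm/i915_drm.h>

	static int i915_getparam(int fd, int param)
	{
		int value = 0;
		struct drm_i915_getparam gp = {
			.param = param,	/* __s32 now, so this compiles in userspace */
			.value = &value,
		};
		return ioctl(fd, DRM_IOCTL_I915_GETPARAM, &gp) ? -1 : value;
	}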
This fixes userspace compilation errors like the following: include/drm/i915_drm.h:361:2: error: unknown type name 's32' s32 param; Signed-off-by: Artem Savkov Reviewed-by: Daniel Vetter Signed-off-by: Jani Nikula --- include/uapi/drm/i915_drm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index dbd16a2d37db..fd5aa47bd689 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -358,7 +358,7 @@ typedef struct drm_i915_irq_wait { #define I915_PARAM_HAS_RESOURCE_STREAMER 36 typedef struct drm_i915_getparam { - s32 param; + __s32 param; /* * WARNING: Using pointers instead of fixed-size u64 means we need to write * compat32 code. Don't repeat this mistake. -- cgit v1.2.3 From 58319057b7847667f0c9585b9de0e8932b0fdb08 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 4 Sep 2015 15:42:45 -0700 Subject: capabilities: ambient capabilities Credit where credit is due: this idea comes from Christoph Lameter with a lot of valuable input from Serge Hallyn. This patch is heavily based on Christoph's patch. ===== The status quo ===== On Linux, there are a number of capabilities defined by the kernel. To perform various privileged tasks, processes can wield capabilities that they hold. Each task has four capability masks: effective (pE), permitted (pP), inheritable (pI), and a bounding set (X). When the kernel checks for a capability, it checks pE. The other capability masks serve to modify what capabilities can be in pE. Any task can remove capabilities from pE, pP, or pI at any time. If a task has a capability in pP, it can add that capability to pE and/or pI. If a task has CAP_SETPCAP, then it can add any capability to pI, and it can remove capabilities from X. Tasks are not the only things that can have capabilities; files can also have capabilities. A file can have no capability information at all [1]. If a file has capability information, then it has a permitted mask (fP) and an inheritable mask (fI) as well as a single effective bit (fE) [2]. File capabilities modify the capabilities of tasks that execve(2) them. A task that successfully calls execve has its capabilities modified for the file ultimately being executed (i.e. the binary itself if that binary is ELF or for the interpreter if the binary is a script.) [3] In the capability evolution rules, for each mask Z, pZ represents the old value and pZ' represents the new value. The rules are: pP' = (X & fP) | (pI & fI) pI' = pI pE' = (fE ? pP' : 0) X is unchanged For setuid binaries, fP, fI, and fE are modified by a moderately complicated set of rules that emulate POSIX behavior. Similarly, if euid == 0 or ruid == 0, then fP, fI, and fE are modified differently (primarily, fP and fI usually end up being the full set). For nonroot users executing binaries with neither setuid nor file caps, fI and fP are empty and fE is false. As an extra complication, if you execute a process as nonroot and fE is set, then the "secure exec" rules are in effect: AT_SECURE gets set, LD_PRELOAD doesn't work, etc. This is rather messy. We've learned that making any changes is dangerous, though: if a new kernel version allows an unprivileged program to change its security state in a way that persists across execution of a setuid program or a program with file caps, this persistent state is surprisingly likely to allow setuid or file-capped programs to be exploited for privilege escalation.
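To keep the baseline concrete, the pre-ambient exec transform above, written out as a minimal C sketch (capability sets reduced to plain 64-bit masks purely for illustration):

	#include <stdint.h>
	#include <stdbool.h>

	struct caps { uint64_t pE, pP, pI, X; };

	/* pP' = (X & fP) | (pI & fI); pE' = (fE ? pP' : 0); pI' and X unchanged */
	static void classic_exec_transform(struct caps *t,
					   uint64_t fP, uint64_t fI, bool fE)
	{
		t->pP = (t->X & fP) | (t->pI & fI);
		t->pE = fE ? t->pP : 0;
	}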
===== The problem ===== Capability inheritance is basically useless. If you aren't root and you execute an ordinary binary, fI is zero, so your capabilities have no effect whatsoever on pP'. This means that you can't usefully execute a helper process or a shell command with elevated capabilities if you aren't root. On current kernels, you can sort of work around this by setting fI to the full set for most or all non-setuid executable files. This causes pP' = pI for nonroot, and inheritance works. No one does this because it's a PITA and it isn't even supported on most filesystems. If you try this, you'll discover that every nonroot program ends up with secure exec rules, breaking many things. This is a problem that has bitten many people who have tried to use capabilities for anything useful. ===== The proposed change ===== This patch adds a fifth capability mask called the ambient mask (pA). pA does what most people expect pI to do. pA obeys the invariant that no bit can ever be set in pA if it is not set in both pP and pI. Dropping a bit from pP or pI drops that bit from pA. This ensures that existing programs that try to drop capabilities still do so, with a complication. Because capability inheritance is so broken, setting KEEPCAPS, using setresuid to switch to nonroot uids, and then calling execve effectively drops capabilities. Therefore, setresuid from root to nonroot conditionally clears pA unless SECBIT_NO_SETUID_FIXUP is set. Processes that don't like this can re-add bits to pA afterwards. The capability evolution rules are changed: pA' = (file caps or setuid or setgid ? 0 : pA) pP' = (X & fP) | (pI & fI) | pA' pI' = pI pE' = (fE ? pP' : pA') X is unchanged If you are nonroot but you have a capability, you can add it to pA. If you do so, your children get that capability in pA, pP, and pE. For example, you can set pA = CAP_NET_BIND_SERVICE, and your children can automatically bind low-numbered ports. Hallelujah! Unprivileged users can create user namespaces, map themselves to a nonzero uid, and create both privileged (relative to their namespace) and unprivileged process trees. This is currently more or less impossible. Hallelujah! You cannot use pA to try to subvert a setuid, setgid, or file-capped program: if you execute any such program, pA gets cleared and the resulting evolution rules are unchanged by this patch. Users with nonzero pA are unlikely to unintentionally leak that capability. If they run programs that try to drop privileges, dropping privileges will still work. It's worth noting that the degree of paranoia in this patch could possibly be reduced without causing serious problems. Specifically, if we allowed pA to persist across executing non-pA-aware setuid binaries and across setresuid, then, naively, the only capabilities that could leak as a result would be the capabilities in pA, and any attacker *already* has those capabilities. This would make me nervous, though -- setuid binaries that tried to privilege-separate might fail to do so, and putting CAP_DAC_READ_SEARCH or CAP_DAC_OVERRIDE into pA could have unexpected side effects. (Whether these unexpected side effects would be exploitable is an open question.) I've therefore taken the more paranoid route. We can revisit this later. An alternative would be to require PR_SET_NO_NEW_PRIVS before setting ambient capabilities. 
I think that this would be annoying and would make granting otherwise unprivileged users minor ambient capabilities (CAP_NET_BIND_SERVICE or CAP_NET_RAW for example) much less useful than it is with this patch. ===== Footnotes ===== [1] Files that are missing the "security.capability" xattr or that have unrecognized values for that xattr end up with has_cap set to false. The code that does that appears to be complicated for no good reason. [2] The libcap capability mask parsers and formatters are dangerously misleading and the documentation is flat-out wrong. fE is *not* a mask; it's a single bit. This has probably confused every single person who has tried to use file capabilities. [3] Linux very confusingly processes both the script and the interpreter if applicable, for reasons that elude me. The results from thinking about a script's file capabilities and/or setuid bits are mostly discarded. Preliminary userspace code is here, but it needs updating: https://git.kernel.org/cgit/linux/kernel/git/luto/util-linux-playground.git/commit/?h=cap_ambient&id=7f5afbd175d2 Here is a test program that can be used to verify the functionality (from Christoph): /* * Test program for the ambient capabilities. This program spawns a shell * that allows running processes with a defined set of capabilities. * * (C) 2015 Christoph Lameter * Released under: GPL v3 or later. * * * Compile using: * * gcc -o ambient_test ambient_test.o -lcap-ng * * This program must have the following capabilities to run properly: * Permissions for CAP_NET_RAW, CAP_NET_ADMIN, CAP_SYS_NICE * * A command to equip the binary with the right caps is: * * setcap cap_net_raw,cap_net_admin,cap_sys_nice+p ambient_test * * * To get a shell with additional caps that can be inherited by other processes: * * ./ambient_test /bin/bash * * * Verifying that it works: * * From the bash spawned by ambient_test run * * cat /proc/$$/status * * and have a look at the capabilities. */ #include #include #include #include #include #include /* * Definitions from the kernel header files. These are going to be removed * when the /usr/include files have these defined. */ #define PR_CAP_AMBIENT 47 #define PR_CAP_AMBIENT_IS_SET 1 #define PR_CAP_AMBIENT_RAISE 2 #define PR_CAP_AMBIENT_LOWER 3 #define PR_CAP_AMBIENT_CLEAR_ALL 4 static void set_ambient_cap(int cap) { int rc; capng_get_caps_process(); rc = capng_update(CAPNG_ADD, CAPNG_INHERITABLE, cap); if (rc) { printf("Cannot add inheritable cap\n"); exit(2); } capng_apply(CAPNG_SELECT_CAPS); /* Note the two 0s at the end. Kernel checks for these */ if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap, 0, 0)) { perror("Cannot set cap"); exit(1); } } int main(int argc, char **argv) { int rc; set_ambient_cap(CAP_NET_RAW); set_ambient_cap(CAP_NET_ADMIN); set_ambient_cap(CAP_SYS_NICE); printf("Ambient_test forking shell\n"); if (execv(argv[1], argv + 1)) perror("Cannot exec"); return 0; } Signed-off-by: Christoph Lameter # Original author Signed-off-by: Andy Lutomirski Acked-by: Serge E. Hallyn Acked-by: Kees Cook Cc: Jonathan Corbet Cc: Aaron Jones Cc: Ted Ts'o Cc: Andrew G.
Morgan Cc: Mimi Zohar Cc: Austin S Hemmelgarn Cc: Markku Savela Cc: Jarkko Sakkinen Cc: Michael Kerrisk Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 5 ++- include/linux/cred.h | 8 ++++ include/uapi/linux/prctl.h | 7 +++ kernel/user_namespace.c | 1 + security/commoncap.c | 102 ++++++++++++++++++++++++++++++++++++++----- security/keys/process_keys.c | 1 + 6 files changed, 113 insertions(+), 11 deletions(-) (limited to 'include/uapi') diff --git a/fs/proc/array.c b/fs/proc/array.c index ce065cf3104f..f60f0121e331 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -308,7 +308,8 @@ static void render_cap_t(struct seq_file *m, const char *header, static inline void task_cap(struct seq_file *m, struct task_struct *p) { const struct cred *cred; - kernel_cap_t cap_inheritable, cap_permitted, cap_effective, cap_bset; + kernel_cap_t cap_inheritable, cap_permitted, cap_effective, + cap_bset, cap_ambient; rcu_read_lock(); cred = __task_cred(p); @@ -316,12 +317,14 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p) cap_permitted = cred->cap_permitted; cap_effective = cred->cap_effective; cap_bset = cred->cap_bset; + cap_ambient = cred->cap_ambient; rcu_read_unlock(); render_cap_t(m, "CapInh:\t", &cap_inheritable); render_cap_t(m, "CapPrm:\t", &cap_permitted); render_cap_t(m, "CapEff:\t", &cap_effective); render_cap_t(m, "CapBnd:\t", &cap_bset); + render_cap_t(m, "CapAmb:\t", &cap_ambient); } static inline void task_seccomp(struct seq_file *m, struct task_struct *p) diff --git a/include/linux/cred.h b/include/linux/cred.h index 8b6c083e68a7..8d70e1361ecd 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -137,6 +137,7 @@ struct cred { kernel_cap_t cap_permitted; /* caps we're permitted */ kernel_cap_t cap_effective; /* caps we can actually use */ kernel_cap_t cap_bset; /* capability bounding set */ + kernel_cap_t cap_ambient; /* Ambient capability set */ #ifdef CONFIG_KEYS unsigned char jit_keyring; /* default keyring to attach requested * keys to */ @@ -212,6 +213,13 @@ static inline void validate_process_creds(void) } #endif +static inline bool cap_ambient_invariant_ok(const struct cred *cred) +{ + return cap_issubset(cred->cap_ambient, + cap_intersect(cred->cap_permitted, + cred->cap_inheritable)); +} + /** * get_new_cred - Get a reference on a new set of credentials * @cred: The new credentials to reference diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 31891d9535e2..a8d0759a9e40 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -190,4 +190,11 @@ struct prctl_mm_map { # define PR_FP_MODE_FR (1 << 0) /* 64b FP registers */ # define PR_FP_MODE_FRE (1 << 1) /* 32b compatibility */ +/* Control the ambient capability set */ +#define PR_CAP_AMBIENT 47 +# define PR_CAP_AMBIENT_IS_SET 1 +# define PR_CAP_AMBIENT_RAISE 2 +# define PR_CAP_AMBIENT_LOWER 3 +# define PR_CAP_AMBIENT_CLEAR_ALL 4 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index f65a0a06a8c0..88fefa68c516 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -39,6 +39,7 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) cred->cap_inheritable = CAP_EMPTY_SET; cred->cap_permitted = CAP_FULL_SET; cred->cap_effective = CAP_FULL_SET; + cred->cap_ambient = CAP_EMPTY_SET; cred->cap_bset = CAP_FULL_SET; #ifdef CONFIG_KEYS key_put(cred->request_key_auth); diff --git a/security/commoncap.c b/security/commoncap.c index 
d103f5a4043d..1f74dde1063e 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -267,6 +267,16 @@ int cap_capset(struct cred *new, new->cap_effective = *effective; new->cap_inheritable = *inheritable; new->cap_permitted = *permitted; + + /* + * Mask off ambient bits that are no longer both permitted and + * inheritable. + */ + new->cap_ambient = cap_intersect(new->cap_ambient, + cap_intersect(*permitted, + *inheritable)); + if (WARN_ON(!cap_ambient_invariant_ok(new))) + return -EINVAL; return 0; } @@ -347,6 +357,7 @@ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps, /* * pP' = (X & fP) | (pI & fI) + * The addition of pA' is handled later. */ new->cap_permitted.cap[i] = (new->cap_bset.cap[i] & permitted) | @@ -474,10 +485,13 @@ int cap_bprm_set_creds(struct linux_binprm *bprm) { const struct cred *old = current_cred(); struct cred *new = bprm->cred; - bool effective, has_cap = false; + bool effective, has_cap = false, is_setid; int ret; kuid_t root_uid; + if (WARN_ON(!cap_ambient_invariant_ok(old))) + return -EPERM; + effective = false; ret = get_file_caps(bprm, &effective, &has_cap); if (ret < 0) @@ -522,8 +536,9 @@ skip: * * In addition, if NO_NEW_PRIVS, then ensure we get no new privs. */ - if ((!uid_eq(new->euid, old->uid) || - !gid_eq(new->egid, old->gid) || + is_setid = !uid_eq(new->euid, old->uid) || !gid_eq(new->egid, old->gid); + + if ((is_setid || !cap_issubset(new->cap_permitted, old->cap_permitted)) && bprm->unsafe & ~LSM_UNSAFE_PTRACE_CAP) { /* downgrade; they get no more than they had, and maybe less */ @@ -539,10 +554,28 @@ skip: new->suid = new->fsuid = new->euid; new->sgid = new->fsgid = new->egid; + /* File caps or setid cancels ambient. */ + if (has_cap || is_setid) + cap_clear(new->cap_ambient); + + /* + * Now that we've computed pA', update pP' to give: + * pP' = (X & fP) | (pI & fI) | pA' + */ + new->cap_permitted = cap_combine(new->cap_permitted, new->cap_ambient); + + /* + * Set pE' = (fE ? pP' : pA'). Because pA' is zero if fE is set, + * this is the same as pE' = (fE ? pP' : 0) | pA'. + */ if (effective) new->cap_effective = new->cap_permitted; else - cap_clear(new->cap_effective); + new->cap_effective = new->cap_ambient; + + if (WARN_ON(!cap_ambient_invariant_ok(new))) + return -EPERM; + bprm->cap_effective = effective; /* @@ -557,7 +590,7 @@ skip: * Number 1 above might fail if you don't have a full bset, but I think * that is interesting information to audit. 
*/ - if (!cap_isclear(new->cap_effective)) { + if (!cap_issubset(new->cap_effective, new->cap_ambient)) { if (!cap_issubset(CAP_FULL_SET, new->cap_effective) || !uid_eq(new->euid, root_uid) || !uid_eq(new->uid, root_uid) || issecure(SECURE_NOROOT)) { @@ -568,6 +601,10 @@ skip: } new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); + + if (WARN_ON(!cap_ambient_invariant_ok(new))) + return -EPERM; + return 0; } @@ -589,7 +626,7 @@ int cap_bprm_secureexec(struct linux_binprm *bprm) if (!uid_eq(cred->uid, root_uid)) { if (bprm->cap_effective) return 1; - if (!cap_isclear(cred->cap_permitted)) + if (!cap_issubset(cred->cap_permitted, cred->cap_ambient)) return 1; } @@ -691,10 +728,18 @@ static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old) uid_eq(old->suid, root_uid)) && (!uid_eq(new->uid, root_uid) && !uid_eq(new->euid, root_uid) && - !uid_eq(new->suid, root_uid)) && - !issecure(SECURE_KEEP_CAPS)) { - cap_clear(new->cap_permitted); - cap_clear(new->cap_effective); + !uid_eq(new->suid, root_uid))) { + if (!issecure(SECURE_KEEP_CAPS)) { + cap_clear(new->cap_permitted); + cap_clear(new->cap_effective); + } + + /* + * Pre-ambient programs expect setresuid to nonroot followed + * by exec to drop capabilities. We should make sure that + * this remains the case. + */ + cap_clear(new->cap_ambient); } if (uid_eq(old->euid, root_uid) && !uid_eq(new->euid, root_uid)) cap_clear(new->cap_effective); @@ -924,6 +969,43 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); return commit_creds(new); + case PR_CAP_AMBIENT: + if (arg2 == PR_CAP_AMBIENT_CLEAR_ALL) { + if (arg3 | arg4 | arg5) + return -EINVAL; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + cap_clear(new->cap_ambient); + return commit_creds(new); + } + + if (((!cap_valid(arg3)) | arg4 | arg5)) + return -EINVAL; + + if (arg2 == PR_CAP_AMBIENT_IS_SET) { + return !!cap_raised(current_cred()->cap_ambient, arg3); + } else if (arg2 != PR_CAP_AMBIENT_RAISE && + arg2 != PR_CAP_AMBIENT_LOWER) { + return -EINVAL; + } else { + if (arg2 == PR_CAP_AMBIENT_RAISE && + (!cap_raised(current_cred()->cap_permitted, arg3) || + !cap_raised(current_cred()->cap_inheritable, + arg3))) + return -EPERM; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + if (arg2 == PR_CAP_AMBIENT_RAISE) + cap_raise(new->cap_ambient, arg3); + else + cap_lower(new->cap_ambient, arg3); + return commit_creds(new); + } + default: /* No functionality available - continue with default */ return -ENOSYS; diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index bd536cb221e2..43b4cddbf2b3 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -848,6 +848,7 @@ void key_change_session_keyring(struct callback_head *twork) new->cap_inheritable = old->cap_inheritable; new->cap_permitted = old->cap_permitted; new->cap_effective = old->cap_effective; + new->cap_ambient = old->cap_ambient; new->cap_bset = old->cap_bset; new->jit_keyring = old->jit_keyring; -- cgit v1.2.3 From 746bf6d64275be0c65b0631d8a72b16f1454cfa1 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 4 Sep 2015 15:42:51 -0700 Subject: capabilities: add a securebit to disable PR_CAP_AMBIENT_RAISE Per Andrew Morgan's request, add a securebit to allow admins to disable PR_CAP_AMBIENT_RAISE. This securebit will prevent processes from adding capabilities to their ambient set. 
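For illustration, a supervisor that wants to forbid ambient raises in its children might do something like the following sketch (assuming the new bits are exported through <linux/securebits.h>; requires CAP_SETPCAP):

	#include <sys/prctl.h>
	#include <linux/securebits.h>

	/* OR onto the current securebits so no other bits are lost. */
	static int lock_out_ambient_raise(void)
	{
		int bits = prctl(PR_GET_SECUREBITS);

		if (bits < 0)
			return -1;
		return prctl(PR_SET_SECUREBITS,
			     bits | SECBIT_NO_CAP_AMBIENT_RAISE |
				    SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED);
	}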
For simplicity, this disables PR_CAP_AMBIENT_RAISE entirely rather than just disabling setting previously cleared bits. Signed-off-by: Andy Lutomirski Acked-by: Andrew G. Morgan Acked-by: Serge Hallyn Cc: Kees Cook Cc: Christoph Lameter Cc: Serge Hallyn Cc: Jonathan Corbet Cc: Aaron Jones Cc: Ted Ts'o Cc: Andrew G. Morgan Cc: Mimi Zohar Cc: Austin S Hemmelgarn Cc: Markku Savela Cc: Jarkko Sakkinen Cc: Michael Kerrisk Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/securebits.h | 11 ++++++++++- security/commoncap.c | 3 ++- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/securebits.h b/include/uapi/linux/securebits.h index 985aac9e6bf8..35ac35cef217 100644 --- a/include/uapi/linux/securebits.h +++ b/include/uapi/linux/securebits.h @@ -43,9 +43,18 @@ #define SECBIT_KEEP_CAPS (issecure_mask(SECURE_KEEP_CAPS)) #define SECBIT_KEEP_CAPS_LOCKED (issecure_mask(SECURE_KEEP_CAPS_LOCKED)) +/* When set, a process cannot add new capabilities to its ambient set. */ +#define SECURE_NO_CAP_AMBIENT_RAISE 6 +#define SECURE_NO_CAP_AMBIENT_RAISE_LOCKED 7 /* make bit-6 immutable */ + +#define SECBIT_NO_CAP_AMBIENT_RAISE (issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE)) +#define SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED \ + (issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE_LOCKED)) + #define SECURE_ALL_BITS (issecure_mask(SECURE_NOROOT) | \ issecure_mask(SECURE_NO_SETUID_FIXUP) | \ - issecure_mask(SECURE_KEEP_CAPS)) + issecure_mask(SECURE_KEEP_CAPS) | \ + issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE)) #define SECURE_ALL_LOCKS (SECURE_ALL_BITS << 1) #endif /* _UAPI_LINUX_SECUREBITS_H */ diff --git a/security/commoncap.c b/security/commoncap.c index 1f74dde1063e..1832cf701c3d 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -993,7 +993,8 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, if (arg2 == PR_CAP_AMBIENT_RAISE && (!cap_raised(current_cred()->cap_permitted, arg3) || !cap_raised(current_cred()->cap_inheritable, - arg3))) + arg3) || + issecure(SECURE_NO_CAP_AMBIENT_RAISE))) return -EPERM; new = prepare_creds(); -- cgit v1.2.3 From 1038628d80e96e3a086189172d9be8eb85ecfabf Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 4 Sep 2015 15:46:04 -0700 Subject: userfaultfd: uAPI Defines the uAPI of the userfaultfd, notably the ioctl numbers and protocol. Signed-off-by: Andrea Arcangeli Acked-by: Pavel Emelyanov Cc: Sanidhya Kashyap Cc: zhang.zhanghailiang@huawei.com Cc: "Kirill A. Shutemov" Cc: Andres Lagar-Cavilla Cc: Dave Hansen Cc: Paolo Bonzini Cc: Rik van Riel Cc: Mel Gorman Cc: Andy Lutomirski Cc: Hugh Dickins Cc: Peter Feiner Cc: "Dr. 
David Alan Gilbert" Cc: Johannes Weiner Cc: "Huangpeng (Peter)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ioctl/ioctl-number.txt | 1 + include/uapi/linux/Kbuild | 1 + include/uapi/linux/userfaultfd.h | 83 ++++++++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+) create mode 100644 include/uapi/linux/userfaultfd.h (limited to 'include/uapi') diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index 64df08db4657..39ac6546d4a4 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -303,6 +303,7 @@ Code Seq#(hex) Include File Comments 0xA3 80-8F Port ACL in development: 0xA3 90-9F linux/dtlk.h +0xAA 00-3F linux/uapi/linux/userfaultfd.h 0xAB 00-1F linux/nbd.h 0xAC 00-1F linux/raw.h 0xAD 00 Netfilter device in development: diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index aafb9937b162..70ff1d9abf0d 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -456,3 +456,4 @@ header-y += xfrm.h header-y += xilinx-v4l2-controls.h header-y += zorro.h header-y += zorro_ids.h +header-y += userfaultfd.h diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h new file mode 100644 index 000000000000..09c2e2a8c9d6 --- /dev/null +++ b/include/uapi/linux/userfaultfd.h @@ -0,0 +1,83 @@ +/* + * include/linux/userfaultfd.h + * + * Copyright (C) 2007 Davide Libenzi + * Copyright (C) 2015 Red Hat, Inc. + * + */ + +#ifndef _LINUX_USERFAULTFD_H +#define _LINUX_USERFAULTFD_H + +#include + +#define UFFD_API ((__u64)0xAA) +/* FIXME: add "|UFFD_BIT_WP" to UFFD_API_BITS after implementing it */ +#define UFFD_API_BITS (UFFD_BIT_WRITE) +#define UFFD_API_IOCTLS \ + ((__u64)1 << _UFFDIO_REGISTER | \ + (__u64)1 << _UFFDIO_UNREGISTER | \ + (__u64)1 << _UFFDIO_API) +#define UFFD_API_RANGE_IOCTLS \ + ((__u64)1 << _UFFDIO_WAKE) + +/* + * Valid ioctl command number range with this API is from 0x00 to + * 0x3F. UFFDIO_API is the fixed number, everything else can be + * changed by implementing a different UFFD_API. If sticking to the + * same UFFD_API more ioctl can be added and userland will be aware of + * which ioctl the running kernel implements through the ioctl command + * bitmask written by the UFFDIO_API. + */ +#define _UFFDIO_REGISTER (0x00) +#define _UFFDIO_UNREGISTER (0x01) +#define _UFFDIO_WAKE (0x02) +#define _UFFDIO_API (0x3F) + +/* userfaultfd ioctl ids */ +#define UFFDIO 0xAA +#define UFFDIO_API _IOWR(UFFDIO, _UFFDIO_API, \ + struct uffdio_api) +#define UFFDIO_REGISTER _IOWR(UFFDIO, _UFFDIO_REGISTER, \ + struct uffdio_register) +#define UFFDIO_UNREGISTER _IOR(UFFDIO, _UFFDIO_UNREGISTER, \ + struct uffdio_range) +#define UFFDIO_WAKE _IOR(UFFDIO, _UFFDIO_WAKE, \ + struct uffdio_range) + +/* + * Valid bits below PAGE_SHIFT in the userfault address read through + * the read() syscall. 
+ */ +#define UFFD_BIT_WRITE (1<<0) /* this was a write fault, MISSING or WP */ +#define UFFD_BIT_WP (1<<1) /* handle_userfault() reason VM_UFFD_WP */ +#define UFFD_BITS 2 /* two above bits used for UFFD_BIT_* mask */ + +struct uffdio_api { + /* userland asks for an API number */ + __u64 api; + + /* kernel answers below with the available features for the API */ + __u64 bits; + __u64 ioctls; +}; + +struct uffdio_range { + __u64 start; + __u64 len; +}; + +struct uffdio_register { + struct uffdio_range range; +#define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0) +#define UFFDIO_REGISTER_MODE_WP ((__u64)1<<1) + __u64 mode; + + /* + * kernel answers which ioctl commands are available for the + * range, keep at the end as the last 8 bytes aren't read. + */ + __u64 ioctls; +}; + +#endif /* _LINUX_USERFAULTFD_H */ -- cgit v1.2.3 From 3f602d2724b1f7d2d27ddcd7963a040a5890fd16 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 4 Sep 2015 15:46:34 -0700 Subject: userfaultfd: Rename uffd_api.bits into .features This is (seems to be) the minimal thing that is required to unblock standard uffd usage from the non-cooperative one. Now more bits can be added to the features field indicating e.g. UFFD_FEATURE_FORK and others needed for the latter use-case. Signed-off-by: Pavel Emelyanov Signed-off-by: Andrea Arcangeli Cc: Sanidhya Kashyap Cc: zhang.zhanghailiang@huawei.com Cc: "Kirill A. Shutemov" Cc: Andres Lagar-Cavilla Cc: Dave Hansen Cc: Paolo Bonzini Cc: Rik van Riel Cc: Mel Gorman Cc: Andy Lutomirski Cc: Hugh Dickins Cc: Peter Feiner Cc: "Dr. David Alan Gilbert" Cc: Johannes Weiner Cc: "Huangpeng (Peter)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 4 ++-- include/uapi/linux/userfaultfd.h | 12 +++++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 9bc256d1a143..0756d97b0666 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -884,7 +884,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, goto out; } /* careful not to leak info, we only read the first 8 bytes */ - uffdio_api.bits = UFFD_API_BITS; + uffdio_api.features = UFFD_API_FEATURES; uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) @@ -941,7 +941,7 @@ static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f) * protocols: aa:... bb:... 
*/ seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n", - pending, total, UFFD_API, UFFD_API_BITS, + pending, total, UFFD_API, UFFD_API_FEATURES, UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS); } #endif diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 09c2e2a8c9d6..330206016249 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -12,8 +12,8 @@ #include #define UFFD_API ((__u64)0xAA) -/* FIXME: add "|UFFD_BIT_WP" to UFFD_API_BITS after implementing it */ -#define UFFD_API_BITS (UFFD_BIT_WRITE) +/* FIXME: add "|UFFD_FEATURE_WP" to UFFD_API_FEATURES after implementing it */ +#define UFFD_API_FEATURES (UFFD_FEATURE_WRITE_BIT) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -53,12 +53,18 @@ #define UFFD_BIT_WP (1<<1) /* handle_userfault() reason VM_UFFD_WP */ #define UFFD_BITS 2 /* two above bits used for UFFD_BIT_* mask */ +/* + * Features reported in uffdio_api.features field + */ +#define UFFD_FEATURE_WRITE_BIT (1<<0) /* Corresponds to UFFD_BIT_WRITE */ +#define UFFD_FEATURE_WP_BIT (1<<1) /* Corresponds to UFFD_BIT_WP */ + struct uffdio_api { /* userland asks for an API number */ __u64 api; /* kernel answers below with the available features for the API */ - __u64 bits; + __u64 features; __u64 ioctls; }; -- cgit v1.2.3 From a9b85f9415fd9e529d03299e5335433f614ec1fb Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 4 Sep 2015 15:46:37 -0700 Subject: userfaultfd: change the read API to return a uffd_msg I had requests to return the full address (not the page-aligned one) to userland. It's not entirely clear how the page offset could be relevant, because userfaults aren't like SIGBUS, which can sigjump to a different place and actually skip resolving the fault depending on a page offset. There's currently no real way to skip the fault, especially because after a UFFDIO_COPY|ZEROPAGE the fault is optimized to be retried within the kernel without having to return to userland first (not even self-modifying code replacing the .text that touched the faulting address would prevent the fault from being repeated). Userland cannot skip repeating the fault, even more so if the fault was triggered by a KVM secondary page fault or any get_user_pages or any copy-user inside some syscall which will return to kernel code. The second time, FAULT_FLAG_RETRY_NOWAIT won't be set, leading to a SIGBUS being raised, because the userfault can't wait if it cannot release the mmap_sem first (and FAULT_FLAG_RETRY_NOWAIT is required for that). Still, returning a proper structure to userland during the read() on the uffd allows the current UFFD_API to be used for the future non-cooperative extensions too, and it looks cleaner as well. Once we get additional fields there's no point in returning the fault address page-aligned anymore to reuse the bits below PAGE_SHIFT. The only downside is that the read() syscall will read 32 bytes instead of 8 bytes, but that's not going to be measurable overhead. The total number of new events that can be extended or of new future bits for already shipped events is limited to 64 by the features field of the uffdio_api structure. If more are needed, a bump of UFFD_API will be required. [akpm@linux-foundation.org: use __packed] Signed-off-by: Andrea Arcangeli Acked-by: Pavel Emelyanov Cc: Sanidhya Kashyap Cc: zhang.zhanghailiang@huawei.com Cc: "Kirill A.
Shutemov" Cc: Andres Lagar-Cavilla Cc: Dave Hansen Cc: Paolo Bonzini Cc: Rik van Riel Cc: Mel Gorman Cc: Andy Lutomirski Cc: Hugh Dickins Cc: Peter Feiner Cc: "Dr. David Alan Gilbert" Cc: Johannes Weiner Cc: "Huangpeng (Peter)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/userfaultfd.txt | 12 +++--- fs/userfaultfd.c | 79 +++++++++++++++++++++++----------------- include/uapi/linux/userfaultfd.h | 70 +++++++++++++++++++++++++++-------- 3 files changed, 108 insertions(+), 53 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/vm/userfaultfd.txt b/Documentation/vm/userfaultfd.txt index 90912925425e..70a3c94d1941 100644 --- a/Documentation/vm/userfaultfd.txt +++ b/Documentation/vm/userfaultfd.txt @@ -46,11 +46,13 @@ is a corner case that would currently return -EBUSY). When first opened the userfaultfd must be enabled invoking the UFFDIO_API ioctl specifying a uffdio_api.api value set to UFFD_API (or a later API version) which will specify the read/POLLIN protocol -userland intends to speak on the UFFD. The UFFDIO_API ioctl if -successful (i.e. if the requested uffdio_api.api is spoken also by the -running kernel), will return into uffdio_api.features and -uffdio_api.ioctls two 64bit bitmasks of respectively the activated -feature of the read(2) protocol and the generic ioctl available. +userland intends to speak on the UFFD and the uffdio_api.features +userland requires. The UFFDIO_API ioctl if successful (i.e. if the +requested uffdio_api.api is spoken also by the running kernel and the +requested features are going to be enabled) will return into +uffdio_api.features and uffdio_api.ioctls two 64bit bitmasks of +respectively all the available features of the read(2) protocol and +the generic ioctl available. Once the userfaultfd has been enabled the UFFDIO_REGISTER ioctl should be invoked (if present in the returned uffdio_api.ioctls bitmask) to diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 0756d97b0666..1f2ddaaf3c03 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -50,7 +50,7 @@ struct userfaultfd_ctx { }; struct userfaultfd_wait_queue { - unsigned long address; + struct uffd_msg msg; wait_queue_t wq; bool pending; struct userfaultfd_ctx *ctx; @@ -77,7 +77,8 @@ static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode, /* len == 0 means wake all */ start = range->start; len = range->len; - if (len && (start > uwq->address || start + len <= uwq->address)) + if (len && (start > uwq->msg.arg.pagefault.address || + start + len <= uwq->msg.arg.pagefault.address)) goto out; ret = wake_up_state(wq->private, mode); if (ret) @@ -135,28 +136,43 @@ static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx) } } -static inline unsigned long userfault_address(unsigned long address, - unsigned int flags, - unsigned long reason) +static inline void msg_init(struct uffd_msg *msg) { - BUILD_BUG_ON(PAGE_SHIFT < UFFD_BITS); - address &= PAGE_MASK; + BUILD_BUG_ON(sizeof(struct uffd_msg) != 32); + /* + * Must use memset to zero out the paddings or kernel data is + * leaked to userland. 
+ */ + memset(msg, 0, sizeof(struct uffd_msg)); +} + +static inline struct uffd_msg userfault_msg(unsigned long address, + unsigned int flags, + unsigned long reason) +{ + struct uffd_msg msg; + msg_init(&msg); + msg.event = UFFD_EVENT_PAGEFAULT; + msg.arg.pagefault.address = address; if (flags & FAULT_FLAG_WRITE) /* - * Encode "write" fault information in the LSB of the - * address read by userland, without depending on - * FAULT_FLAG_WRITE kernel internal value. + * If UFFD_FEATURE_PAGEFAULT_FLAG_WRITE was set in the + * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE + * was not set in a UFFD_EVENT_PAGEFAULT, it means it + * was a read fault, otherwise if set it means it's + * a write fault. */ - address |= UFFD_BIT_WRITE; + msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE; if (reason & VM_UFFD_WP) /* - * Encode "reason" fault information as bit number 1 - * in the address read by userland. If bit number 1 is - * clear it means the reason is a VM_FAULT_MISSING - * fault. + * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the + * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP was + * not set in a UFFD_EVENT_PAGEFAULT, it means it was + * a missing fault, otherwise if set it means it's a + * write protect fault. */ - address |= UFFD_BIT_WP; - return address; + msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP; + return msg; } /* @@ -242,7 +258,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); uwq.wq.private = current; - uwq.address = userfault_address(address, flags, reason); + uwq.msg = userfault_msg(address, flags, reason); uwq.pending = true; uwq.ctx = ctx; @@ -398,7 +414,7 @@ static unsigned int userfaultfd_poll(struct file *file, poll_table *wait) } static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, - __u64 *addr) + struct uffd_msg *msg) { ssize_t ret; DECLARE_WAITQUEUE(wait, current); @@ -416,8 +432,8 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, * disappear from under us. */ uwq->pending = false; - /* careful to always initialize addr if ret == 0 */ - *addr = uwq->address; + /* careful to always initialize msg if ret == 0 */ + *msg = uwq->msg; spin_unlock(&ctx->fault_wqh.lock); ret = 0; break; @@ -447,8 +463,7 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf, { struct userfaultfd_ctx *ctx = file->private_data; ssize_t _ret, ret = 0; - /* careful to always initialize addr if ret == 0 */ - __u64 uninitialized_var(addr); + struct uffd_msg msg; int no_wait = file->f_flags & O_NONBLOCK; if (ctx->state == UFFD_STATE_WAIT_API) @@ -456,16 +471,16 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf, BUG_ON(ctx->state != UFFD_STATE_RUNNING); for (;;) { - if (count < sizeof(addr)) + if (count < sizeof(msg)) return ret ? ret : -EINVAL; - _ret = userfaultfd_ctx_read(ctx, no_wait, &addr); + _ret = userfaultfd_ctx_read(ctx, no_wait, &msg); if (_ret < 0) return ret ? ret : _ret; - if (put_user(addr, (__u64 __user *) buf)) + if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg))) return ret ? ret : -EFAULT; - ret += sizeof(addr); - buf += sizeof(addr); - count -= sizeof(addr); + ret += sizeof(msg); + buf += sizeof(msg); + count -= sizeof(msg); /* * Allow to read more than one fault at time but only * block if waiting for the very first one. 
@@ -873,17 +888,15 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, if (ctx->state != UFFD_STATE_WAIT_API) goto out; ret = -EFAULT; - if (copy_from_user(&uffdio_api, buf, sizeof(__u64))) + if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api))) goto out; - if (uffdio_api.api != UFFD_API) { - /* careful not to leak info, we only read the first 8 bytes */ + if (uffdio_api.api != UFFD_API || uffdio_api.features) { memset(&uffdio_api, 0, sizeof(uffdio_api)); if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) goto out; ret = -EINVAL; goto out; } - /* careful not to leak info, we only read the first 8 bytes */ uffdio_api.features = UFFD_API_FEATURES; uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 330206016249..a5f8825381ef 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -11,9 +11,15 @@ #include +#include + #define UFFD_API ((__u64)0xAA) -/* FIXME: add "|UFFD_FEATURE_WP" to UFFD_API_FEATURES after implementing it */ -#define UFFD_API_FEATURES (UFFD_FEATURE_WRITE_BIT) +/* + * After implementing the respective features it will become: + * #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \ + * UFFD_FEATURE_EVENT_FORK) + */ +#define UFFD_API_FEATURES (0) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -45,26 +51,60 @@ #define UFFDIO_WAKE _IOR(UFFDIO, _UFFDIO_WAKE, \ struct uffdio_range) -/* - * Valid bits below PAGE_SHIFT in the userfault address read through - * the read() syscall. - */ -#define UFFD_BIT_WRITE (1<<0) /* this was a write fault, MISSING or WP */ -#define UFFD_BIT_WP (1<<1) /* handle_userfault() reason VM_UFFD_WP */ -#define UFFD_BITS 2 /* two above bits used for UFFD_BIT_* mask */ +/* read() structure */ +struct uffd_msg { + __u8 event; + + __u8 reserved1; + __u16 reserved2; + __u32 reserved3; + + union { + struct { + __u64 flags; + __u64 address; + } pagefault; + + struct { + /* unused reserved fields */ + __u64 reserved1; + __u64 reserved2; + __u64 reserved3; + } reserved; + } arg; +} __packed; /* - * Features reported in uffdio_api.features field + * Start at 0x12 and not at 0 to be more strict against bugs. */ -#define UFFD_FEATURE_WRITE_BIT (1<<0) /* Corresponds to UFFD_BIT_WRITE */ -#define UFFD_FEATURE_WP_BIT (1<<1) /* Corresponds to UFFD_BIT_WP */ +#define UFFD_EVENT_PAGEFAULT 0x12 +#if 0 /* not available yet */ +#define UFFD_EVENT_FORK 0x13 +#endif + +/* flags for UFFD_EVENT_PAGEFAULT */ +#define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ +#define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */ struct uffdio_api { - /* userland asks for an API number */ + /* userland asks for an API number and the features to enable */ __u64 api; - - /* kernel answers below with the available features for the API */ + /* + * Kernel answers below with the all available features for + * the API, this notifies userland of which events and/or + * which flags for each event are enabled in the current + * kernel. + * + * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE + * are to be considered implicitly always enabled in all kernels as + * long as the uffdio_api.api requested matches UFFD_API. 
+ */ +#if 0 /* not available yet */ +#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) +#define UFFD_FEATURE_EVENT_FORK (1<<1) +#endif __u64 features; + __u64 ioctls; }; -- cgit v1.2.3 From 1f1c6f075904c241f9e44eb37efa8777141fc938 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 4 Sep 2015 15:47:01 -0700 Subject: userfaultfd: UFFDIO_COPY|UFFDIO_ZEROPAGE uAPI This implements the uABI of UFFDIO_COPY and UFFDIO_ZEROPAGE. Signed-off-by: Andrea Arcangeli Acked-by: Pavel Emelyanov Cc: Sanidhya Kashyap Cc: zhang.zhanghailiang@huawei.com Cc: "Kirill A. Shutemov" Cc: Andres Lagar-Cavilla Cc: Dave Hansen Cc: Paolo Bonzini Cc: Rik van Riel Cc: Mel Gorman Cc: Andy Lutomirski Cc: Hugh Dickins Cc: Peter Feiner Cc: "Dr. David Alan Gilbert" Cc: Johannes Weiner Cc: "Huangpeng (Peter)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/userfaultfd.h | 42 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index a5f8825381ef..df0e09bb7dd5 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -25,7 +25,9 @@ (__u64)1 << _UFFDIO_UNREGISTER | \ (__u64)1 << _UFFDIO_API) #define UFFD_API_RANGE_IOCTLS \ - ((__u64)1 << _UFFDIO_WAKE) + ((__u64)1 << _UFFDIO_WAKE | \ + (__u64)1 << _UFFDIO_COPY | \ + (__u64)1 << _UFFDIO_ZEROPAGE) /* * Valid ioctl command number range with this API is from 0x00 to * @@ -38,6 +40,8 @@ #define _UFFDIO_REGISTER (0x00) #define _UFFDIO_UNREGISTER (0x01) #define _UFFDIO_WAKE (0x02) +#define _UFFDIO_COPY (0x03) +#define _UFFDIO_ZEROPAGE (0x04) #define _UFFDIO_API (0x3F) /* userfaultfd ioctl ids */ @@ -50,6 +54,10 @@ struct uffdio_range) #define UFFDIO_WAKE _IOR(UFFDIO, _UFFDIO_WAKE, \ struct uffdio_range) +#define UFFDIO_COPY _IOWR(UFFDIO, _UFFDIO_COPY, \ + struct uffdio_copy) +#define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \ + struct uffdio_zeropage) /* read() structure */ struct uffd_msg { @@ -126,4 +134,36 @@ struct uffdio_register { __u64 ioctls; }; +struct uffdio_copy { + __u64 dst; + __u64 src; + __u64 len; + /* + * There will be a wrprotection flag later that allows to map + * pages wrprotected on the fly. And such a flag will be + * available if the wrprotection ioctl are implemented for the + * range according to the uffdio_register.ioctls. + */ +#define UFFDIO_COPY_MODE_DONTWAKE ((__u64)1<<0) + __u64 mode; + + /* + * "copy" is written by the ioctl and must be at the end: the + * copy_from_user will not read the last 8 bytes. + */ + __s64 copy; +}; + +struct uffdio_zeropage { + struct uffdio_range range; +#define UFFDIO_ZEROPAGE_MODE_DONTWAKE ((__u64)1<<0) + __u64 mode; + + /* + * "zeropage" is written by the ioctl and must be at the end: + * the copy_from_user will not read the last 8 bytes. + */ + __s64 zeropage; +}; + #endif /* _LINUX_USERFAULTFD_H */ -- cgit v1.2.3 From a13d7201d7deedcbb6ac6efa94a1a7d34d3d79ec Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Fri, 7 Aug 2015 17:34:41 +0100 Subject: xen/privcmd: Further s/MFN/GFN/ clean-up The privcmd code mixes the usage of GFN and MFN within the same functions, which makes the code difficult to understand when you only work with auto-translated guests. The privcmd driver only deals with GFNs, so replace all mentions of MFN with GFN. The ioctl structure used to map foreign frames has been left unchanged, given that userspace is already using it.
Nonetheless, add a comment to explain the expected value within the "mfn" field. Signed-off-by: Julien Grall Reviewed-by: David Vrabel Signed-off-by: David Vrabel --- arch/arm/xen/enlighten.c | 18 +++++++++--------- arch/x86/xen/mmu.c | 32 ++++++++++++++++---------------- drivers/xen/privcmd.c | 44 ++++++++++++++++++++++---------------------- drivers/xen/xlate_mmu.c | 18 +++++++++--------- include/uapi/xen/privcmd.h | 4 ++++ include/xen/xen-ops.h | 10 +++++----- 6 files changed, 65 insertions(+), 61 deletions(-) (limited to 'include/uapi') diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c index c50c8d33f874..eeeab074e154 100644 --- a/arch/arm/xen/enlighten.c +++ b/arch/arm/xen/enlighten.c @@ -49,35 +49,35 @@ static __read_mostly unsigned int xen_events_irq; static __initdata struct device_node *xen_node; -int xen_remap_domain_mfn_array(struct vm_area_struct *vma, +int xen_remap_domain_gfn_array(struct vm_area_struct *vma, unsigned long addr, - xen_pfn_t *mfn, int nr, + xen_pfn_t *gfn, int nr, int *err_ptr, pgprot_t prot, unsigned domid, struct page **pages) { - return xen_xlate_remap_gfn_array(vma, addr, mfn, nr, err_ptr, + return xen_xlate_remap_gfn_array(vma, addr, gfn, nr, err_ptr, prot, domid, pages); } -EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_array); +EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_array); /* Not used by XENFEAT_auto_translated guests. */ -int xen_remap_domain_mfn_range(struct vm_area_struct *vma, +int xen_remap_domain_gfn_range(struct vm_area_struct *vma, unsigned long addr, - xen_pfn_t mfn, int nr, + xen_pfn_t gfn, int nr, pgprot_t prot, unsigned domid, struct page **pages) { return -ENOSYS; } -EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); +EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_range); -int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, +int xen_unmap_domain_gfn_range(struct vm_area_struct *vma, int nr, struct page **pages) { return xen_xlate_unmap_gfn_range(vma, nr, pages); } -EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range); +EXPORT_SYMBOL_GPL(xen_unmap_domain_gfn_range); static void xen_percpu_init(void) { diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 2c50b445884e..9c479fe40459 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2812,9 +2812,9 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, return 0; } -static int do_remap_mfn(struct vm_area_struct *vma, +static int do_remap_gfn(struct vm_area_struct *vma, unsigned long addr, - xen_pfn_t *mfn, int nr, + xen_pfn_t *gfn, int nr, int *err_ptr, pgprot_t prot, unsigned domid, struct page **pages) @@ -2830,14 +2830,14 @@ static int do_remap_mfn(struct vm_area_struct *vma, if (xen_feature(XENFEAT_auto_translated_physmap)) { #ifdef CONFIG_XEN_PVH /* We need to update the local page tables and the xen HAP */ - return xen_xlate_remap_gfn_array(vma, addr, mfn, nr, err_ptr, + return xen_xlate_remap_gfn_array(vma, addr, gfn, nr, err_ptr, prot, domid, pages); #else return -EINVAL; #endif } - rmd.mfn = mfn; + rmd.mfn = gfn; rmd.prot = prot; /* We use the err_ptr to indicate if there we are doing a contigious * mapping or a discontigious mapping. */ @@ -2865,8 +2865,8 @@ static int do_remap_mfn(struct vm_area_struct *vma, batch_left, &done, domid); /* - * @err_ptr may be the same buffer as @mfn, so - * only clear it after each chunk of @mfn is + * @err_ptr may be the same buffer as @gfn, so + * only clear it after each chunk of @gfn is * used. */ if (err_ptr) { @@ -2896,19 +2896,19 @@ out: return err < 0 ? 
err : mapped; } -int xen_remap_domain_mfn_range(struct vm_area_struct *vma, +int xen_remap_domain_gfn_range(struct vm_area_struct *vma, unsigned long addr, - xen_pfn_t mfn, int nr, + xen_pfn_t gfn, int nr, pgprot_t prot, unsigned domid, struct page **pages) { - return do_remap_mfn(vma, addr, &mfn, nr, NULL, prot, domid, pages); + return do_remap_gfn(vma, addr, &gfn, nr, NULL, prot, domid, pages); } -EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); +EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_range); -int xen_remap_domain_mfn_array(struct vm_area_struct *vma, +int xen_remap_domain_gfn_array(struct vm_area_struct *vma, unsigned long addr, - xen_pfn_t *mfn, int nr, + xen_pfn_t *gfn, int nr, int *err_ptr, pgprot_t prot, unsigned domid, struct page **pages) { @@ -2917,13 +2917,13 @@ int xen_remap_domain_mfn_array(struct vm_area_struct *vma, * cause of "wrong memory was mapped in". */ BUG_ON(err_ptr == NULL); - return do_remap_mfn(vma, addr, mfn, nr, err_ptr, prot, domid, pages); + return do_remap_gfn(vma, addr, gfn, nr, err_ptr, prot, domid, pages); } -EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_array); +EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_array); /* Returns: 0 success */ -int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, +int xen_unmap_domain_gfn_range(struct vm_area_struct *vma, int numpgs, struct page **pages) { if (!pages || !xen_feature(XENFEAT_auto_translated_physmap)) @@ -2935,4 +2935,4 @@ int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, return -EINVAL; #endif } -EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range); +EXPORT_SYMBOL_GPL(xen_unmap_domain_gfn_range); diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 5a296161d843..c6deb87c5c69 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -193,16 +193,16 @@ static int traverse_pages_block(unsigned nelem, size_t size, return ret; } -struct mmap_mfn_state { +struct mmap_gfn_state { unsigned long va; struct vm_area_struct *vma; domid_t domain; }; -static int mmap_mfn_range(void *data, void *state) +static int mmap_gfn_range(void *data, void *state) { struct privcmd_mmap_entry *msg = data; - struct mmap_mfn_state *st = state; + struct mmap_gfn_state *st = state; struct vm_area_struct *vma = st->vma; int rc; @@ -216,7 +216,7 @@ static int mmap_mfn_range(void *data, void *state) ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end)) return -EINVAL; - rc = xen_remap_domain_mfn_range(vma, + rc = xen_remap_domain_gfn_range(vma, msg->va & PAGE_MASK, msg->mfn, msg->npages, vma->vm_page_prot, @@ -236,7 +236,7 @@ static long privcmd_ioctl_mmap(void __user *udata) struct vm_area_struct *vma; int rc; LIST_HEAD(pagelist); - struct mmap_mfn_state state; + struct mmap_gfn_state state; /* We only support privcmd_ioctl_mmap_batch for auto translated. */ if (xen_feature(XENFEAT_auto_translated_physmap)) @@ -273,7 +273,7 @@ static long privcmd_ioctl_mmap(void __user *udata) rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry), &pagelist, - mmap_mfn_range, &state); + mmap_gfn_range, &state); out_up: @@ -299,18 +299,18 @@ struct mmap_batch_state { int global_error; int version; - /* User-space mfn array to store errors in the second pass for V1. */ - xen_pfn_t __user *user_mfn; + /* User-space gfn array to store errors in the second pass for V1. */ + xen_pfn_t __user *user_gfn; /* User-space int array to store errors in the second pass for V2. */ int __user *user_err; }; -/* auto translated dom0 note: if domU being created is PV, then mfn is - * mfn(addr on bus). If it's auto xlated, then mfn is pfn (input to HAP).
+/* auto translated dom0 note: if domU being created is PV, then gfn is + * mfn(addr on bus). If it's auto xlated, then gfn is pfn (input to HAP). */ static int mmap_batch_fn(void *data, int nr, void *state) { - xen_pfn_t *mfnp = data; + xen_pfn_t *gfnp = data; struct mmap_batch_state *st = state; struct vm_area_struct *vma = st->vma; struct page **pages = vma->vm_private_data; @@ -321,8 +321,8 @@ static int mmap_batch_fn(void *data, int nr, void *state) cur_pages = &pages[st->index]; BUG_ON(nr < 0); - ret = xen_remap_domain_mfn_array(st->vma, st->va & PAGE_MASK, mfnp, nr, - (int *)mfnp, st->vma->vm_page_prot, + ret = xen_remap_domain_gfn_array(st->vma, st->va & PAGE_MASK, gfnp, nr, + (int *)gfnp, st->vma->vm_page_prot, st->domain, cur_pages); /* Adjust the global_error? */ @@ -347,22 +347,22 @@ static int mmap_return_error(int err, struct mmap_batch_state *st) if (st->version == 1) { if (err) { - xen_pfn_t mfn; + xen_pfn_t gfn; - ret = get_user(mfn, st->user_mfn); + ret = get_user(gfn, st->user_gfn); if (ret < 0) return ret; /* * V1 encodes the error codes in the 32bit top - * nibble of the mfn (with its known + * nibble of the gfn (with its known * limitations vis-a-vis 64 bit callers). */ - mfn |= (err == -ENOENT) ? + gfn |= (err == -ENOENT) ? PRIVCMD_MMAPBATCH_PAGED_ERROR : PRIVCMD_MMAPBATCH_MFN_ERROR; - return __put_user(mfn, st->user_mfn++); + return __put_user(gfn, st->user_gfn++); } else - st->user_mfn++; + st->user_gfn++; } else { /* st->version == 2 */ if (err) return __put_user(err, st->user_err++); @@ -388,7 +388,7 @@ static int mmap_return_errors(void *data, int nr, void *state) return 0; } -/* Allocate pfns that are then mapped with gmfns from foreign domid. Update +/* Allocate pfns that are then mapped with gfns from foreign domid. Update * the vma with the page info to use later. * Returns: 0 if success, otherwise -errno */ @@ -526,7 +526,7 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version) if (state.global_error) { /* Write back errors in second pass. 
*/ - state.user_mfn = (xen_pfn_t *)m.arr; + state.user_gfn = (xen_pfn_t *)m.arr; state.user_err = m.err; ret = traverse_pages_block(m.num, sizeof(xen_pfn_t), &pagelist, mmap_return_errors, &state); @@ -587,7 +587,7 @@ static void privcmd_close(struct vm_area_struct *vma) if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages) return; - rc = xen_unmap_domain_mfn_range(vma, numpgs, pages); + rc = xen_unmap_domain_gfn_range(vma, numpgs, pages); if (rc == 0) free_xenballooned_pages(numpgs, pages); else diff --git a/drivers/xen/xlate_mmu.c b/drivers/xen/xlate_mmu.c index 58a5389aec89..cff23872c5a9 100644 --- a/drivers/xen/xlate_mmu.c +++ b/drivers/xen/xlate_mmu.c @@ -38,8 +38,8 @@ #include #include -/* map fgmfn of domid to lpfn in the current domain */ -static int map_foreign_page(unsigned long lpfn, unsigned long fgmfn, +/* map fgfn of domid to lpfn in the current domain */ +static int map_foreign_page(unsigned long lpfn, unsigned long fgfn, unsigned int domid) { int rc; @@ -49,7 +49,7 @@ static int map_foreign_page(unsigned long lpfn, unsigned long fgmfn, .size = 1, .space = XENMAPSPACE_gmfn_foreign, }; - xen_ulong_t idx = fgmfn; + xen_ulong_t idx = fgfn; xen_pfn_t gpfn = lpfn; int err = 0; @@ -62,13 +62,13 @@ static int map_foreign_page(unsigned long lpfn, unsigned long fgmfn, } struct remap_data { - xen_pfn_t *fgmfn; /* foreign domain's gmfn */ + xen_pfn_t *fgfn; /* foreign domain's gfn */ pgprot_t prot; domid_t domid; struct vm_area_struct *vma; int index; struct page **pages; - struct xen_remap_mfn_info *info; + struct xen_remap_gfn_info *info; int *err_ptr; int mapped; }; @@ -82,20 +82,20 @@ static int remap_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr, pte_t pte = pte_mkspecial(pfn_pte(pfn, info->prot)); int rc; - rc = map_foreign_page(pfn, *info->fgmfn, info->domid); + rc = map_foreign_page(pfn, *info->fgfn, info->domid); *info->err_ptr++ = rc; if (!rc) { set_pte_at(info->vma->vm_mm, addr, ptep, pte); info->mapped++; } - info->fgmfn++; + info->fgfn++; return 0; } int xen_xlate_remap_gfn_array(struct vm_area_struct *vma, unsigned long addr, - xen_pfn_t *mfn, int nr, + xen_pfn_t *gfn, int nr, int *err_ptr, pgprot_t prot, unsigned domid, struct page **pages) @@ -108,7 +108,7 @@ int xen_xlate_remap_gfn_array(struct vm_area_struct *vma, x86 PVOPS */ BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO))); - data.fgmfn = mfn; + data.fgfn = gfn; data.prot = prot; data.domid = domid; data.vma = vma; diff --git a/include/uapi/xen/privcmd.h b/include/uapi/xen/privcmd.h index a85316811d79..7ddeeda93809 100644 --- a/include/uapi/xen/privcmd.h +++ b/include/uapi/xen/privcmd.h @@ -44,6 +44,10 @@ struct privcmd_hypercall { struct privcmd_mmap_entry { __u64 va; + /* + * This should be a GFN. It's not possible to change the name because + * it's exposed to the user-space. + */ __u64 mfn; __u64 npages; }; diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index 0ce4f32017ea..e4e214a5abd5 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -30,7 +30,7 @@ void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order); struct vm_area_struct; /* - * xen_remap_domain_mfn_array() - map an array of foreign frames + * xen_remap_domain_gfn_array() - map an array of foreign frames * @vma: VMA to map the pages into * @addr: Address at which to map the pages * @gfn: Array of GFNs to map @@ -46,14 +46,14 @@ struct vm_area_struct; * Returns the number of successfully mapped frames, or a -ve error * code. 
*/ -int xen_remap_domain_mfn_array(struct vm_area_struct *vma, +int xen_remap_domain_gfn_array(struct vm_area_struct *vma, unsigned long addr, xen_pfn_t *gfn, int nr, int *err_ptr, pgprot_t prot, unsigned domid, struct page **pages); -/* xen_remap_domain_mfn_range() - map a range of foreign frames +/* xen_remap_domain_gfn_range() - map a range of foreign frames * @vma: VMA to map the pages into * @addr: Address at which to map the pages * @gfn: First GFN to map. @@ -65,12 +65,12 @@ int xen_remap_domain_mfn_array(struct vm_area_struct *vma, * Returns the number of successfully mapped frames, or a -ve error * code. */ -int xen_remap_domain_mfn_range(struct vm_area_struct *vma, +int xen_remap_domain_gfn_range(struct vm_area_struct *vma, unsigned long addr, xen_pfn_t gfn, int nr, pgprot_t prot, unsigned domid, struct page **pages); -int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, +int xen_unmap_domain_gfn_range(struct vm_area_struct *vma, int numpgs, struct page **pages); int xen_xlate_remap_gfn_array(struct vm_area_struct *vma, unsigned long addr, -- cgit v1.2.3 From b14132797d8041a42e03f4ffa1e722da1425adfb Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Tue, 18 Aug 2015 03:28:01 -0400 Subject: elf-em.h: move EM_MICROBLAZE to the common header The linux/audit.h header uses EM_MICROBLAZE in order to define AUDIT_ARCH_MICROBLAZE, but it's only available in the microblaze asm headers. Move it to the common elf-em.h header so that the define can be used on non-microblaze systems. Otherwise we get build errors that EM_MICROBLAZE isn't defined when we try to use the AUDIT_ARCH_MICROBLAZE symbol. Signed-off-by: Mike Frysinger Signed-off-by: Michal Simek --- arch/microblaze/include/uapi/asm/elf.h | 3 ++- include/uapi/linux/elf-em.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/arch/microblaze/include/uapi/asm/elf.h b/arch/microblaze/include/uapi/asm/elf.h index be1731d5e2fa..e9bcdb6e0086 100644 --- a/arch/microblaze/include/uapi/asm/elf.h +++ b/arch/microblaze/include/uapi/asm/elf.h @@ -11,12 +11,13 @@ #ifndef _UAPI_ASM_MICROBLAZE_ELF_H #define _UAPI_ASM_MICROBLAZE_ELF_H +#include <linux/elf-em.h> + /* * Note there is no "official" ELF designation for Microblaze. * I've snaffled the value from the microblaze binutils source code * /binutils/microblaze/include/elf/microblaze.h */ -#define EM_MICROBLAZE 189 #define EM_MICROBLAZE_OLD 0xbaab #define ELF_ARCH EM_MICROBLAZE diff --git a/include/uapi/linux/elf-em.h b/include/uapi/linux/elf-em.h index 3429a3ba382b..b56dfcfe922a 100644 --- a/include/uapi/linux/elf-em.h +++ b/include/uapi/linux/elf-em.h @@ -39,6 +39,7 @@ #define EM_TI_C6000 140 /* TI C6X DSPs */ #define EM_AARCH64 183 /* ARM 64 bit */ #define EM_TILEPRO 188 /* Tilera TILEPro */ +#define EM_MICROBLAZE 189 /* Xilinx MicroBlaze */ #define EM_TILEGX 191 /* Tilera TILE-Gx */ #define EM_FRV 0x5441 /* Fujitsu FR-V */ #define EM_AVR32 0x18ad /* Atmel AVR32 */ -- cgit v1.2.3 From 1ab1e895492d8084dfc1c854efacde219e56b8c1 Mon Sep 17 00:00:00 2001 From: Henrik Austad Date: Wed, 9 Sep 2015 12:25:17 +0200 Subject: ether: add IEEE 1722 ethertype - TSN IEEE 1722 describes AVB (later renamed to TSN - Time Sensitive Networking), a protocol, encapsulation and synchronization to utilize standard networks for audio/video (and later other time-sensitive) streams. This standard uses ethertype 0x22F0. http://standards.ieee.org/develop/regauth/ethertype/eth.txt This is a respin of a previous patch ("ether: add AVB frame type ETH_P_AVB") CC: "David S.
Miller" CC: netdev@vger.kernel.org CC: linux-api@vger.kernel.org CC: linux-kernel@vger.kernel.org Signed-off-by: Henrik Austad Signed-off-by: David S. Miller --- include/uapi/linux/if_ether.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index aa63ed023c2b..ea9221b0331a 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -42,6 +42,7 @@ #define ETH_P_LOOP 0x0060 /* Ethernet Loopback packet */ #define ETH_P_PUP 0x0200 /* Xerox PUP packet */ #define ETH_P_PUPAT 0x0201 /* Xerox PUP Addr Trans packet */ +#define ETH_P_TSN 0x22F0 /* TSN (IEEE 1722) packet */ #define ETH_P_IP 0x0800 /* Internet Protocol packet */ #define ETH_P_X25 0x0805 /* CCITT X.25 */ #define ETH_P_ARP 0x0806 /* Address Resolution packet */ -- cgit v1.2.3 From f074a8f49eb87cde95ac9d040ad5e7ea4f029738 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 9 Sep 2015 15:35:48 -0700 Subject: proc: export idle flag via kpageflags As noted by Minchan, a benefit of reading idle flag from /proc/kpageflags is that one can easily filter dirty and/or unevictable pages while estimating the size of unused memory. Note that idle flag read from /proc/kpageflags may be stale in case the page was accessed via a PTE, because it would be too costly to iterate over all page mappings on each /proc/kpageflags read to provide an up-to-date value. To make sure the flag is up-to-date one has to read /sys/kernel/mm/page_idle/bitmap first. Signed-off-by: Vladimir Davydov Reviewed-by: Andres Lagar-Cavilla Cc: Minchan Kim Cc: Raghavendra K T Cc: Johannes Weiner Cc: Michal Hocko Cc: Greg Thelen Cc: Michel Lespinasse Cc: David Rientjes Cc: Pavel Emelyanov Cc: Cyrill Gorcunov Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/pagemap.txt | 7 +++++++ fs/proc/page.c | 3 +++ include/uapi/linux/kernel-page-flags.h | 1 + 3 files changed, 11 insertions(+) (limited to 'include/uapi') diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt index ce294b0aace4..0e1e55588b59 100644 --- a/Documentation/vm/pagemap.txt +++ b/Documentation/vm/pagemap.txt @@ -70,6 +70,7 @@ There are four components to pagemap: 22. THP 23. BALLOON 24. ZERO_PAGE + 25. IDLE * /proc/kpagecgroup. This file contains a 64-bit inode number of the memory cgroup each page is charged to, indexed by PFN. Only available when @@ -120,6 +121,12 @@ Short descriptions to the page flags: 24. ZERO_PAGE zero page for pfn_zero or huge_zero page +25. IDLE + page has not been accessed since it was marked idle (see + Documentation/vm/idle_page_tracking.txt). Note that this flag may be + stale in case the page was accessed via a PTE. To make sure the flag + is up-to-date one has to read /sys/kernel/mm/page_idle/bitmap first. + [IO related page flags] 1. ERROR IO error occurred 3. 
UPTODATE page has up-to-date data diff --git a/fs/proc/page.c b/fs/proc/page.c index c2d29edcaa6b..0b8286450a93 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -150,6 +150,9 @@ u64 stable_page_flags(struct page *page) if (PageBalloon(page)) u |= 1 << KPF_BALLOON; + if (page_is_idle(page)) + u |= 1 << KPF_IDLE; + u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); diff --git a/include/uapi/linux/kernel-page-flags.h b/include/uapi/linux/kernel-page-flags.h index a6c4962e5d46..5da5f8751ce7 100644 --- a/include/uapi/linux/kernel-page-flags.h +++ b/include/uapi/linux/kernel-page-flags.h @@ -33,6 +33,7 @@ #define KPF_THP 22 #define KPF_BALLOON 23 #define KPF_ZERO_PAGE 24 +#define KPF_IDLE 25 #endif /* _UAPILINUX_KERNEL_PAGE_FLAGS_H */ -- cgit v1.2.3 From ac64a2ce509104a746321a4f9646b6750cf281eb Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Fri, 4 Sep 2015 01:39:56 +0200 Subject: target: use stringify.h instead of own definition Signed-off-by: David Disseldorp Acked-by: Andy Grover Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_user.c | 3 ++- include/uapi/linux/target_core_user.h | 4 ---- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index d0bb652b65b5..937cebf76633 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -25,6 +25,7 @@ #include #include #include +#include <linux/stringify.h> #include #include #include @@ -898,7 +899,7 @@ static int tcmu_configure_device(struct se_device *dev) WARN_ON(!PAGE_ALIGNED(udev->data_off)); WARN_ON(udev->data_size % PAGE_SIZE); - info->version = xstr(TCMU_MAILBOX_VERSION); + info->version = __stringify(TCMU_MAILBOX_VERSION); info->mem[0].name = "tcm-user command & data buffer"; info->mem[0].addr = (phys_addr_t) udev->mb_addr; diff --git a/include/uapi/linux/target_core_user.h b/include/uapi/linux/target_core_user.h index b67f99d3c520..95c6521d8a95 100644 --- a/include/uapi/linux/target_core_user.h +++ b/include/uapi/linux/target_core_user.h @@ -42,10 +42,6 @@ #define TCMU_MAILBOX_VERSION 2 #define ALIGN_SIZE 64 /* Should be enough for most CPUs */ -/* See https://gcc.gnu.org/onlinedocs/cpp/Stringification.html */ -#define xstr(s) str(s) -#define str(s) #s - struct tcmu_mailbox { __u16 version; __u16 flags; -- cgit v1.2.3 From 5b25b13ab08f616efd566347d809b4ece54570d1 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 11 Sep 2015 13:07:39 -0700 Subject: sys_membarrier(): system-wide memory barrier (generic, x86) Here is an implementation of a new system call, sys_membarrier(), which executes a memory barrier on all threads running on the system. It is implemented by calling synchronize_sched(). It can be used to distribute the cost of user-space memory barriers asymmetrically by transforming pairs of memory barriers into pairs consisting of sys_membarrier() and a compiler barrier. For synchronization primitives that distinguish between read-side and write-side (e.g. userspace RCU [1], rwlocks), the read-side can be accelerated significantly by moving the bulk of the memory barrier overhead to the write-side.
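To make that asymmetric transformation concrete, a read-side/write-side pair might look like the following sketch (not part of this patch; it assumes kernel headers that provide __NR_membarrier and MEMBARRIER_CMD_SHARED, and uses the usual GCC compiler-barrier idiom):

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>

#define barrier() __asm__ __volatile__("" ::: "memory")

/* Hot path: what used to be a full memory barrier in, e.g., a
 * reader lock shrinks to a compiler barrier. */
static void reader_enter(volatile int *reader_active)
{
	*reader_active = 1;
	barrier();		/* was: smp_mb() */
}

/* Cold path: a single system call pairs with every concurrent
 * barrier() above, upgrading them to full memory barriers. */
static void writer_sync(void)
{
	syscall(__NR_membarrier, MEMBARRIER_CMD_SHARED, 0);
}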
The existing applications of which I am aware that would be improved by this system call are as follows: * Through Userspace RCU library (http://urcu.so) - DNS server (Knot DNS) https://www.knot-dns.cz/ - Network sniffer (http://netsniff-ng.org/) - Distributed object storage (https://sheepdog.github.io/sheepdog/) - User-space tracing (http://lttng.org) - Network storage system (https://www.gluster.org/) - Virtual routers (https://events.linuxfoundation.org/sites/events/files/slides/DPDK_RCU_0MQ.pdf) - Financial software (https://lkml.org/lkml/2015/3/23/189) Those projects use RCU in userspace to increase read-side speed and scalability compared to locking. Especially in the case of RCU used by libraries, sys_membarrier can speed up the read-side by moving the bulk of the memory barrier cost to synchronize_rcu(). * Direct users of sys_membarrier - core dotnet garbage collector (https://github.com/dotnet/coreclr/issues/198) Microsoft core dotnet GC developers are planning to use the mprotect() side-effect of issuing memory barriers through IPIs as a way to implement Windows FlushProcessWriteBuffers() on Linux. They are referring to sys_membarrier in their github thread, specifically stating that sys_membarrier() is what they are looking for. To explain the benefit of this scheme, let's introduce two example threads: Thread A (non-frequent, e.g. executing liburcu synchronize_rcu()) Thread B (frequent, e.g. executing liburcu rcu_read_lock()/rcu_read_unlock()) In a scheme where all smp_mb() in thread A are ordering memory accesses with respect to smp_mb() present in Thread B, we can change each smp_mb() within Thread A into calls to sys_membarrier() and each smp_mb() within Thread B into compiler barriers "barrier()". Before the change, we had, for each smp_mb() pair: Thread A Thread B previous mem accesses previous mem accesses smp_mb() smp_mb() following mem accesses following mem accesses After the change, these pairs become: Thread A Thread B prev mem accesses prev mem accesses sys_membarrier() barrier() follow mem accesses follow mem accesses As we can see, there are two possible scenarios: either Thread B memory accesses do not happen concurrently with Thread A accesses (1), or they do (2). 1) Non-concurrent Thread A vs Thread B accesses: Thread A Thread B prev mem accesses sys_membarrier() follow mem accesses prev mem accesses barrier() follow mem accesses In this case, thread B accesses will be weakly ordered. This is OK, because at that point, thread A is not particularly interested in ordering them with respect to its own accesses. 2) Concurrent Thread A vs Thread B accesses Thread A Thread B prev mem accesses prev mem accesses sys_membarrier() barrier() follow mem accesses follow mem accesses In this case, thread B accesses, which are ensured to be in program order thanks to the compiler barrier, will be "upgraded" to full smp_mb() by synchronize_sched(). * Benchmarks On Intel Xeon E5405 (8 cores) (one thread is calling sys_membarrier, the other 7 threads are busy looping) 1000 non-expedited sys_membarrier calls in 33s = 33 milliseconds/call. * User-space user of this system call: Userspace RCU library Both the signal-based and the sys_membarrier userspace RCU schemes permit us to remove the memory barrier from the userspace RCU rcu_read_lock() and rcu_read_unlock() primitives, thus significantly accelerating them.
These memory barriers are replaced by compiler barriers on the read-side, and all matching memory barriers on the write-side are turned into an invocation of a memory barrier on all active threads in the process. By letting the kernel perform this synchronization rather than dumbly sending a signal to every thread in the process (as we currently do), we diminish the number of unnecessary wake ups and only issue the memory barriers on active threads. Non-running threads do not need to execute such a barrier anyway, because these are implied by the scheduler context switches. Results in liburcu: Operations in 10s, 6 readers, 2 writers: memory barriers in reader: 1701557485 reads, 2202847 writes signal-based scheme: 9830061167 reads, 6700 writes sys_membarrier: 9952759104 reads, 425 writes sys_membarrier (dyn. check): 7970328887 reads, 425 writes The dynamic sys_membarrier availability check adds some overhead to the read-side compared to the signal-based scheme, but besides that, sys_membarrier slightly outperforms the signal-based scheme. However, this non-expedited sys_membarrier implementation has a much slower grace period than signal and memory barrier schemes. Besides diminishing the number of wake-ups, one major advantage of the membarrier system call over the signal-based scheme is that it does not need to reserve a signal. This plays much more nicely with libraries, and with processes injected into for tracing purposes, for which we cannot expect that signals will be unused by the application. An expedited version of this system call can be added later on to speed up the grace period. Its implementation will likely depend on reading the cpu_curr()->mm without holding each CPU's rq lock. This patch adds the system call to x86 and to asm-generic. [1] http://urcu.so membarrier(2) man page: MEMBARRIER(2) Linux Programmer's Manual MEMBARRIER(2) NAME membarrier - issue memory barriers on a set of threads SYNOPSIS #include <linux/membarrier.h> int membarrier(int cmd, int flags); DESCRIPTION The cmd argument is one of the following: MEMBARRIER_CMD_QUERY Query the set of supported commands. It returns a bitmask of supported commands. MEMBARRIER_CMD_SHARED Execute a memory barrier on all threads running on the system. Upon return from system call, the caller thread is ensured that all running threads have passed through a state where all memory accesses to user-space addresses match program order between entry to and return from the system call (non-running threads are de facto in such a state). This covers threads from all processes running on the system. This command returns 0. The flags argument needs to be 0. For future extensions. All memory accesses performed in program order from each targeted thread are guaranteed to be ordered with respect to sys_membarrier(). If we use the semantic "barrier()" to represent a compiler barrier forcing memory accesses to be performed in program order across the barrier, and smp_mb() to represent explicit memory barriers forcing full memory ordering across the barrier, we have the following ordering table for each pair of barrier(), sys_membarrier() and smp_mb(): The pair ordering is detailed as (O: ordered, X: not ordered): barrier() smp_mb() sys_membarrier() barrier() X X O smp_mb() X O O sys_membarrier() O O O RETURN VALUE On success, these system calls return zero. On error, -1 is returned, and errno is set appropriately. For a given command, with flags argument set to 0, this system call is guaranteed to always return the same value until reboot.
ERRORS ENOSYS System call is not implemented. EINVAL Invalid arguments. Linux 2015-04-15 MEMBARRIER(2) Signed-off-by: Mathieu Desnoyers Reviewed-by: Paul E. McKenney Reviewed-by: Josh Triplett Cc: KOSAKI Motohiro Cc: Steven Rostedt Cc: Nicholas Miell Cc: Ingo Molnar Cc: Alan Cox Cc: Lai Jiangshan Cc: Stephen Hemminger Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: David Howells Cc: Pranith Kumar Cc: Michael Kerrisk Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 8 +++++ arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + include/linux/syscalls.h | 2 ++ include/uapi/asm-generic/unistd.h | 4 ++- include/uapi/linux/Kbuild | 1 + include/uapi/linux/membarrier.h | 53 +++++++++++++++++++++++++++ init/Kconfig | 12 +++++++ kernel/Makefile | 1 + kernel/membarrier.c | 66 ++++++++++++++++++++++++++++++++++ kernel/sys_ni.c | 3 ++ 11 files changed, 151 insertions(+), 1 deletion(-) create mode 100644 include/uapi/linux/membarrier.h create mode 100644 kernel/membarrier.c (limited to 'include/uapi') diff --git a/MAINTAINERS b/MAINTAINERS index 310da4295c70..e77bc84dc580 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6789,6 +6789,14 @@ W: http://www.mellanox.com Q: http://patchwork.ozlabs.org/project/netdev/list/ F: drivers/net/ethernet/mellanox/mlxsw/ +MEMBARRIER SUPPORT +M: Mathieu Desnoyers +M: "Paul E. McKenney" +L: linux-kernel@vger.kernel.org +S: Supported +F: kernel/membarrier.c +F: include/uapi/linux/membarrier.h + MEMORY MANAGEMENT L: linux-mm@kvack.org W: http://www.linux-mm.org diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 477bfa6db370..7663c455b9f6 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -381,3 +381,4 @@ 372 i386 recvmsg sys_recvmsg compat_sys_recvmsg 373 i386 shutdown sys_shutdown 374 i386 userfaultfd sys_userfaultfd +375 i386 membarrier sys_membarrier diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 81c490634db9..278842fdf1f6 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -330,6 +330,7 @@ 321 common bpf sys_bpf 322 64 execveat stub_execveat 323 common userfaultfd sys_userfaultfd +324 common membarrier sys_membarrier # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 08001317aee7..a460e2ef2843 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -885,4 +885,6 @@ asmlinkage long sys_execveat(int dfd, const char __user *filename, const char __user *const __user *argv, const char __user *const __user *envp, int flags); +asmlinkage long sys_membarrier(int cmd, int flags); + #endif diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index e016bd9b1a04..8da542a2874d 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -709,9 +709,11 @@ __SYSCALL(__NR_memfd_create, sys_memfd_create) __SYSCALL(__NR_bpf, sys_bpf) #define __NR_execveat 281 __SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat) +#define __NR_membarrier 282 +__SYSCALL(__NR_membarrier, sys_membarrier) #undef __NR_syscalls -#define __NR_syscalls 282 +#define __NR_syscalls 283 /* * All syscalls below here should go away really, diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 70ff1d9abf0d..f7b2db44eb4b 100644 --- a/include/uapi/linux/Kbuild +++ 
b/include/uapi/linux/Kbuild @@ -252,6 +252,7 @@ header-y += mdio.h header-y += media.h header-y += media-bus-format.h header-y += mei.h +header-y += membarrier.h header-y += memfd.h header-y += mempolicy.h header-y += meye.h diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h new file mode 100644 index 000000000000..e0b108bd2624 --- /dev/null +++ b/include/uapi/linux/membarrier.h @@ -0,0 +1,53 @@ +#ifndef _UAPI_LINUX_MEMBARRIER_H +#define _UAPI_LINUX_MEMBARRIER_H + +/* + * linux/membarrier.h + * + * membarrier system call API + * + * Copyright (c) 2010, 2015 Mathieu Desnoyers + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/** + * enum membarrier_cmd - membarrier system call command + * @MEMBARRIER_CMD_QUERY: Query the set of supported commands. It returns + * a bitmask of valid commands. + * @MEMBARRIER_CMD_SHARED: Execute a memory barrier on all running threads. + * Upon return from system call, the caller thread + * is ensured that all running threads have passed + * through a state where all memory accesses to + * user-space addresses match program order between + * entry to and return from the system call + * (non-running threads are de facto in such a + * state). This covers threads from all processes + * running on the system. This command returns 0. + * + * Command to be passed to the membarrier system call. The commands need to + * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to + * the value 0. + */ +enum membarrier_cmd { + MEMBARRIER_CMD_QUERY = 0, + MEMBARRIER_CMD_SHARED = (1 << 0), +}; + +#endif /* _UAPI_LINUX_MEMBARRIER_H */ diff --git a/init/Kconfig b/init/Kconfig index 02da9f1fd9df..c24b6f767bf0 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1602,6 +1602,18 @@ config PCI_QUIRKS bugs/quirks. Disable this only if your target machine is unaffected by PCI quirks. +config MEMBARRIER + bool "Enable membarrier() system call" if EXPERT + default y + help + Enable the membarrier() system call that allows issuing memory + barriers across all running threads, which can be used to distribute + the cost of user-space memory barriers asymmetrically by transforming + pairs of memory barriers into pairs consisting of membarrier() and a + compiler barrier. + + If unsure, say Y. 
+ config EMBEDDED bool "Embedded system" option allnoconfig_y diff --git a/kernel/Makefile b/kernel/Makefile index d4988410b410..53abf008ecb3 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -100,6 +100,7 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o obj-$(CONFIG_TORTURE_TEST) += torture.o +obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_HAS_IOMEM) += memremap.o diff --git a/kernel/membarrier.c b/kernel/membarrier.c new file mode 100644 index 000000000000..536c727a56e9 --- /dev/null +++ b/kernel/membarrier.c @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2010, 2015 Mathieu Desnoyers + * + * membarrier system call + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/syscalls.h> +#include <linux/membarrier.h> + +/* + * Bitmask made from a "or" of all commands within enum membarrier_cmd, + * except MEMBARRIER_CMD_QUERY. + */ +#define MEMBARRIER_CMD_BITMASK (MEMBARRIER_CMD_SHARED) + +/** + * sys_membarrier - issue memory barriers on a set of threads + * @cmd: Takes command values defined in enum membarrier_cmd. + * @flags: Currently needs to be 0. For future extensions. + * + * If this system call is not implemented, -ENOSYS is returned. If the + * command specified does not exist, or if the command argument is invalid, + * this system call returns -EINVAL. For a given command, with flags argument + * set to 0, this system call is guaranteed to always return the same value + * until reboot. + * + * All memory accesses performed in program order from each targeted thread + * is guaranteed to be ordered with respect to sys_membarrier(). If we use + * the semantic "barrier()" to represent a compiler barrier forcing memory + * accesses to be performed in program order across the barrier, and + * smp_mb() to represent explicit memory barriers forcing full memory + * ordering across the barrier, we have the following ordering table for + * each pair of barrier(), sys_membarrier() and smp_mb(): + * + * The pair ordering is detailed as (O: ordered, X: not ordered): + * + * barrier() smp_mb() sys_membarrier() + * barrier() X X O + * smp_mb() X O O + * sys_membarrier() O O O + */ +SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) +{ + if (unlikely(flags)) + return -EINVAL; + switch (cmd) { + case MEMBARRIER_CMD_QUERY: + return MEMBARRIER_CMD_BITMASK; + case MEMBARRIER_CMD_SHARED: + if (num_online_cpus() > 1) + synchronize_sched(); + return 0; + default: + return -EINVAL; + } +} diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 03c3875d9958..a02decf15583 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -245,3 +245,6 @@ cond_syscall(sys_bpf); /* execveat */ cond_syscall(sys_execveat); + +/* membarrier */ +cond_syscall(sys_membarrier); -- cgit v1.2.3
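Putting the pieces above together, the intended user-space calling sequence is to probe with MEMBARRIER_CMD_QUERY before relying on MEMBARRIER_CMD_SHARED; a minimal sketch follows (the raw syscall(2) form is assumed, since no libc wrapper existed when this landed):

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	/* QUERY returns a bitmask of the supported commands; the call
	 * itself fails with ENOSYS on kernels without the syscall or
	 * with CONFIG_MEMBARRIER disabled. */
	long mask = syscall(__NR_membarrier, MEMBARRIER_CMD_QUERY, 0);

	if (mask < 0 || !(mask & MEMBARRIER_CMD_SHARED)) {
		fprintf(stderr, "MEMBARRIER_CMD_SHARED not supported\n");
		return 1;
	}
	/* Acts as a full memory barrier on every running thread. */
	syscall(__NR_membarrier, MEMBARRIER_CMD_SHARED, 0);
	return 0;
}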