From d15156138dad40205c9fdd9abe85c9e1479ae272 Mon Sep 17 00:00:00 2001 From: Douglas Gilbert Date: Tue, 1 Jul 2014 10:48:05 -0600 Subject: block SG_IO: add SG_FLAG_Q_AT_HEAD flag After the SG_IO ioctl was copied into the block layer and later into the bsg driver, subtle differences emerged. One difference is the way injected commands are queued through the block layer (i.e. this is not SCSI device queueing nor SATA NCQ). Summarizing: - SG_IO on block layer device: blk_exec*(at_head=false) - sg device SG_IO: at_head=true - bsg device SG_IO: at_head=true Some time ago Boaz Harrosh introduced a sg v4 flag called BSG_FLAG_Q_AT_TAIL to override the bsg driver default. A recent patch titled "sg: add SG_FLAG_Q_AT_TAIL flag" allowed the sg driver default to be overridden. This patch allows an SG_IO ioctl sent to a block layer device to have its default overridden. ChangeLog: - introduce SG_FLAG_Q_AT_HEAD flag in sg.h to cause commands that are injected via a block layer device SG_IO ioctl to set at_head=true - make comments clearer about queueing in sg.h since the header is used both by the sg device and block layer device implementations of the SG_IO ioctl. - introduce BSG_FLAG_Q_AT_HEAD in bsg.h for compatibility (it does nothing) and update comments. Signed-off-by: Douglas Gilbert Reviewed-by: Christoph Hellwig Reviewed-by: Mike Christie Signed-off-by: Jens Axboe --- include/uapi/linux/bsg.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bsg.h b/include/uapi/linux/bsg.h index 7a12e1c0f371..02986cf8b6f1 100644 --- a/include/uapi/linux/bsg.h +++ b/include/uapi/linux/bsg.h @@ -10,12 +10,13 @@ #define BSG_SUB_PROTOCOL_SCSI_TRANSPORT 2 /* - * For flags member below - * sg.h sg_io_hdr also has bits defined for it's flags member. However - * none of these bits are implemented/used by bsg. The bits below are - * allocated to not conflict with sg.h ones anyway. + * For flag constants below: + * sg.h sg_io_hdr also has bits defined for its flags member. These + * two flag values (0x10 and 0x20) have the same meaning in sg.h. For + * bsg the BSG_FLAG_Q_AT_HEAD flag is ignored since it is the default. */ -#define BSG_FLAG_Q_AT_TAIL 0x10 /* default, == 0 at this bit, is Q_AT_HEAD */ +#define BSG_FLAG_Q_AT_TAIL 0x10 /* default is Q_AT_HEAD */ +#define BSG_FLAG_Q_AT_HEAD 0x20 struct sg_io_v4 { __s32 guard; /* [i] 'Q' to differentiate from v3 */ -- cgit v1.2.3 From cb553215d5d277d4838d7d6b7722e964bcf5ca1f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 26 Jun 2014 17:41:47 +0800 Subject: include/uapi/linux/virtio_blk.h: introduce feature of VIRTIO_BLK_F_MQ The current virtio-blk spec only supports one virtual queue for transferring data between VM and host, and inside the VM all operations on the virtual queue need to hold one lock, which causes the following problems: - bad scalability - bad throughput This patch introduces the VIRTIO_BLK_F_MQ feature so that more than one virtual queue can be used by a virtio-blk device, solving or easing the problems above. Signed-off-by: Ming Lei Acked-by: Michael S. 
Tsirkin Signed-off-by: Jens Axboe --- include/uapi/linux/virtio_blk.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h index 6d8e61c48563..9ad67b267584 100644 --- a/include/uapi/linux/virtio_blk.h +++ b/include/uapi/linux/virtio_blk.h @@ -40,6 +40,7 @@ #define VIRTIO_BLK_F_WCE 9 /* Writeback mode enabled after reset */ #define VIRTIO_BLK_F_TOPOLOGY 10 /* Topology information is available */ #define VIRTIO_BLK_F_CONFIG_WCE 11 /* Writeback mode available in config */ +#define VIRTIO_BLK_F_MQ 12 /* support more than one vq */ #ifndef __KERNEL__ /* Old (deprecated) name for VIRTIO_BLK_F_WCE. */ @@ -77,6 +78,10 @@ struct virtio_blk_config { /* writeback mode (if VIRTIO_BLK_F_CONFIG_WCE) */ __u8 wce; + __u8 unused; + + /* number of vqs, only available when VIRTIO_BLK_F_MQ is set */ + __u16 num_queues; } __attribute__((packed)); /* -- cgit v1.2.3 From b4e05923f9c5bb65ac82988d7b53cfd7425e6f36 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 20 Jul 2014 13:35:44 -0700 Subject: Input: add support for Wacom protocol 4 serial tablets Recent versions of xf86-input-wacom no longer support directly accessing serial tablets. Instead, xf86-input-wacom now expects all Wacom tablets to be driven by the kernel and to show up as evdev devices. This has caused old serial Wacom tablets to stop working for people who still have such tablets. Julian Squires has written a serio input driver to fix this: https://github.com/tokenrove/wacom-serial-iv This is a cleaned-up version of that driver with improved Graphire support (I own an old Graphire myself). Signed-off-by: Julian Squires Signed-off-by: Hans de Goede Signed-off-by: Dmitry Torokhov --- MAINTAINERS | 7 + drivers/input/tablet/Kconfig | 10 + drivers/input/tablet/Makefile | 1 + drivers/input/tablet/wacom_serial4.c | 616 +++++++++++++++++++++++++++++++++++ include/uapi/linux/serio.h | 1 + 5 files changed, 635 insertions(+) create mode 100644 drivers/input/tablet/wacom_serial4.c (limited to 'include/uapi/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 51ebb779c5f3..c68bd8b6f8e9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9708,6 +9708,13 @@ M: Pierre Ossman S: Maintained F: drivers/mmc/host/wbsd.* +WACOM PROTOCOL 4 SERIAL TABLETS +M: Julian Squires +M: Hans de Goede +L: linux-input@vger.kernel.org +S: Maintained +F: drivers/input/tablet/wacom_serial4.c + WATCHDOG DEVICE DRIVERS M: Wim Van Sebroeck L: linux-watchdog@vger.kernel.org diff --git a/drivers/input/tablet/Kconfig b/drivers/input/tablet/Kconfig index bed7cbf84cfd..a5121b09c63f 100644 --- a/drivers/input/tablet/Kconfig +++ b/drivers/input/tablet/Kconfig @@ -89,4 +89,14 @@ config TABLET_USB_WACOM To compile this driver as a module, choose M here: the module will be called wacom. +config TABLET_SERIAL_WACOM4 + tristate "Wacom protocol 4 serial tablet support" + select SERIO + help + Say Y here if you want to use Wacom protocol 4 serial tablets. + E.g. serial versions of the Cintiq, Graphire or Penpartner. + + To compile this driver as a module, choose M here: the + module will be called wacom_serial4. 
+ endif diff --git a/drivers/input/tablet/Makefile b/drivers/input/tablet/Makefile index 3f6c25220638..4d9339fb3b63 100644 --- a/drivers/input/tablet/Makefile +++ b/drivers/input/tablet/Makefile @@ -11,3 +11,4 @@ obj-$(CONFIG_TABLET_USB_GTCO) += gtco.o obj-$(CONFIG_TABLET_USB_HANWANG) += hanwang.o obj-$(CONFIG_TABLET_USB_KBTAB) += kbtab.o obj-$(CONFIG_TABLET_USB_WACOM) += wacom.o +obj-$(CONFIG_TABLET_SERIAL_WACOM4) += wacom_serial4.o diff --git a/drivers/input/tablet/wacom_serial4.c b/drivers/input/tablet/wacom_serial4.c new file mode 100644 index 000000000000..d3d251e27307 --- /dev/null +++ b/drivers/input/tablet/wacom_serial4.c @@ -0,0 +1,616 @@ +/* + * Wacom protocol 4 serial tablet driver + * + * Copyright 2014 Hans de Goede + * Copyright 2011-2012 Julian Squires + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. See the file COPYING in the main directory of + * this archive for more details. + * + * Many thanks to Bill Seremetis, without whom PenPartner support + * would not have been possible. Thanks to Patrick Mahoney. + * + * This driver was developed with reference to much code written by others, + * particularly: + * - elo, gunze drivers by Vojtech Pavlik ; + * - wacom_w8001 driver by Jaya Kumar ; + * - the USB wacom input driver, credited to many people + * (see drivers/input/tablet/wacom.h); + * - new and old versions of linuxwacom / xf86-input-wacom credited to + * Frederic Lepied, France. and + * Ping Cheng, Wacom. ; + * - and xf86wacom.c (a presumably ancient version of the linuxwacom code), + * by Frederic Lepied and Raph Levien . + * + * To do: + * - support pad buttons; (requires access to a model with pad buttons) + * - support (protocol 4-style) tilt (requires access to a > 1.4 rom model) + */ + +/* + * Wacom serial protocol 4 documentation taken from linuxwacom-0.9.9 code, + * protocol 4 uses 7 or 9 bytes of data in the following format: + * + * Byte 1 + * bit 7 Sync bit always 1 + * bit 6 Pointing device detected + * bit 5 Cursor = 0 / Stylus = 1 + * bit 4 Reserved + * bit 3 1 if a button on the pointing device has been pressed + * bit 2 P0 (optional) + * bit 1 X15 + * bit 0 X14 + * + * Byte 2 + * bit 7 Always 0 + * bits 6-0 = X13 - X7 + * + * Byte 3 + * bit 7 Always 0 + * bits 6-0 = X6 - X0 + * + * Byte 4 + * bit 7 Always 0 + * bit 6 B3 + * bit 5 B2 + * bit 4 B1 + * bit 3 B0 + * bit 2 P1 (optional) + * bit 1 Y15 + * bit 0 Y14 + * + * Byte 5 + * bit 7 Always 0 + * bits 6-0 = Y13 - Y7 + * + * Byte 6 + * bit 7 Always 0 + * bits 6-0 = Y6 - Y0 + * + * Byte 7 + * bit 7 Always 0 + * bit 6 Sign of pressure data; or wheel-rel for cursor tool + * bit 5 P7; or REL1 for cursor tool + * bit 4 P6; or REL0 for cursor tool + * bit 3 P5 + * bit 2 P4 + * bit 1 P3 + * bit 0 P2 + * + * Bytes 8 and 9 are optional and present only in tilt mode. 
+ * + * Byte 8 + * bit 7 Always 0 + * bit 6 Sign of tilt X + * bit 5 Xt6 + * bit 4 Xt5 + * bit 3 Xt4 + * bit 2 Xt3 + * bit 1 Xt2 + * bit 0 Xt1 + * + * Byte 9 + * bit 7 Always 0 + * bit 6 Sign of tilt Y + * bit 5 Yt6 + * bit 4 Yt5 + * bit 3 Yt4 + * bit 2 Yt3 + * bit 1 Yt2 + * bit 0 Yt1 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "wacom_wac.h" + +MODULE_AUTHOR("Julian Squires , Hans de Goede "); +MODULE_DESCRIPTION("Wacom protocol 4 serial tablet driver"); +MODULE_LICENSE("GPL"); + +#define REQUEST_MODEL_AND_ROM_VERSION "~#" +#define REQUEST_MAX_COORDINATES "~C\r" +#define REQUEST_CONFIGURATION_STRING "~R\r" +#define REQUEST_RESET_TO_PROTOCOL_IV "\r#" +/* + * Note: sending "\r$\r" causes at least the Digitizer II to send + * packets in ASCII instead of binary. "\r#" seems to undo that. + */ + +#define COMMAND_START_SENDING_PACKETS "ST\r" +#define COMMAND_STOP_SENDING_PACKETS "SP\r" +#define COMMAND_MULTI_MODE_INPUT "MU1\r" +#define COMMAND_ORIGIN_IN_UPPER_LEFT "OC1\r" +#define COMMAND_ENABLE_ALL_MACRO_BUTTONS "~M0\r" +#define COMMAND_DISABLE_GROUP_1_MACRO_BUTTONS "~M1\r" +#define COMMAND_TRANSMIT_AT_MAX_RATE "IT0\r" +#define COMMAND_DISABLE_INCREMENTAL_MODE "IN0\r" +#define COMMAND_ENABLE_CONTINUOUS_MODE "SR\r" +#define COMMAND_ENABLE_PRESSURE_MODE "PH1\r" +#define COMMAND_Z_FILTER "ZF1\r" + +/* Note that this is a protocol 4 packet without tilt information. */ +#define PACKET_LENGTH 7 +#define DATA_SIZE 32 + +/* flags */ +#define F_COVERS_SCREEN 0x01 +#define F_HAS_STYLUS2 0x02 +#define F_HAS_SCROLLWHEEL 0x04 + +enum { STYLUS = 1, ERASER, CURSOR }; + +static const struct { + int device_id; + int input_id; +} tools[] = { + { 0, 0 }, + { STYLUS_DEVICE_ID, BTN_TOOL_PEN }, + { ERASER_DEVICE_ID, BTN_TOOL_RUBBER }, + { CURSOR_DEVICE_ID, BTN_TOOL_MOUSE }, +}; + +struct wacom { + struct input_dev *dev; + struct completion cmd_done; + int result; + u8 expect; + u8 eraser_mask; + unsigned int extra_z_bits; + unsigned int flags; + unsigned int res_x, res_y; + unsigned int max_x, max_y; + unsigned int tool; + unsigned int idx; + u8 data[DATA_SIZE]; + char phys[32]; +}; + +enum { + MODEL_CINTIQ = 0x504C, /* PL */ + MODEL_CINTIQ2 = 0x4454, /* DT */ + MODEL_DIGITIZER_II = 0x5544, /* UD */ + MODEL_GRAPHIRE = 0x4554, /* ET */ + MODEL_PENPARTNER = 0x4354, /* CT */ +}; + +static void wacom_handle_model_response(struct wacom *wacom) +{ + int major_v, minor_v, r = 0; + char *p; + + p = strrchr(wacom->data, 'V'); + if (p) + r = sscanf(p + 1, "%u.%u", &major_v, &minor_v); + if (r != 2) + major_v = minor_v = 0; + + switch (wacom->data[2] << 8 | wacom->data[3]) { + case MODEL_CINTIQ: /* UNTESTED */ + case MODEL_CINTIQ2: + if ((wacom->data[2] << 8 | wacom->data[3]) == MODEL_CINTIQ) { + wacom->dev->name = "Wacom Cintiq"; + wacom->dev->id.version = MODEL_CINTIQ; + } else { + wacom->dev->name = "Wacom Cintiq II"; + wacom->dev->id.version = MODEL_CINTIQ2; + } + wacom->res_x = 508; + wacom->res_y = 508; + + switch (wacom->data[5] << 8 | wacom->data[6]) { + case 0x3731: /* PL-710 */ + wacom->res_x = 2540; + wacom->res_y = 2540; + /* fall through */ + case 0x3535: /* PL-550 */ + case 0x3830: /* PL-800 */ + wacom->extra_z_bits = 2; + } + + wacom->flags = F_COVERS_SCREEN; + break; + + case MODEL_PENPARTNER: + wacom->dev->name = "Wacom Penpartner"; + wacom->dev->id.version = MODEL_PENPARTNER; + wacom->res_x = 1000; + wacom->res_y = 1000; + break; + + case MODEL_GRAPHIRE: + wacom->dev->name = "Wacom Graphire"; + wacom->dev->id.version = MODEL_GRAPHIRE; + 
wacom->res_x = 1016; + wacom->res_y = 1016; + wacom->max_x = 5103; + wacom->max_y = 3711; + wacom->extra_z_bits = 2; + wacom->eraser_mask = 0x08; + wacom->flags = F_HAS_STYLUS2 | F_HAS_SCROLLWHEEL; + break; + + case MODEL_DIGITIZER_II: + wacom->dev->name = "Wacom Digitizer II"; + wacom->dev->id.version = MODEL_DIGITIZER_II; + if (major_v == 1 && minor_v <= 2) + wacom->extra_z_bits = 0; /* UNTESTED */ + break; + + default: + dev_err(&wacom->dev->dev, "Unsupported Wacom model %s\n", + wacom->data); + wacom->result = -ENODEV; + return; + } + + dev_info(&wacom->dev->dev, "%s tablet, version %u.%u\n", + wacom->dev->name, major_v, minor_v); +} + +static void wacom_handle_configuration_response(struct wacom *wacom) +{ + int r, skip; + + dev_dbg(&wacom->dev->dev, "Configuration string: %s\n", wacom->data); + r = sscanf(wacom->data, "~R%x,%u,%u,%u,%u", &skip, &skip, &skip, + &wacom->res_x, &wacom->res_y); + if (r != 5) + dev_warn(&wacom->dev->dev, "could not get resolution\n"); +} + +static void wacom_handle_coordinates_response(struct wacom *wacom) +{ + int r; + + dev_dbg(&wacom->dev->dev, "Coordinates string: %s\n", wacom->data); + r = sscanf(wacom->data, "~C%u,%u", &wacom->max_x, &wacom->max_y); + if (r != 2) + dev_warn(&wacom->dev->dev, "could not get max coordinates\n"); +} + +static void wacom_handle_response(struct wacom *wacom) +{ + if (wacom->data[0] != '~' || wacom->data[1] != wacom->expect) { + dev_err(&wacom->dev->dev, + "Wacom got an unexpected response: %s\n", wacom->data); + wacom->result = -EIO; + } else { + wacom->result = 0; + + switch (wacom->data[1]) { + case '#': + wacom_handle_model_response(wacom); + break; + case 'R': + wacom_handle_configuration_response(wacom); + break; + case 'C': + wacom_handle_coordinates_response(wacom); + break; + } + } + + complete(&wacom->cmd_done); +} + +static void wacom_handle_packet(struct wacom *wacom) +{ + u8 in_proximity_p, stylus_p, button; + unsigned int tool; + int x, y, z; + + in_proximity_p = wacom->data[0] & 0x40; + stylus_p = wacom->data[0] & 0x20; + button = (wacom->data[3] & 0x78) >> 3; + x = (wacom->data[0] & 3) << 14 | wacom->data[1]<<7 | wacom->data[2]; + y = (wacom->data[3] & 3) << 14 | wacom->data[4]<<7 | wacom->data[5]; + + if (in_proximity_p && stylus_p) { + z = wacom->data[6] & 0x7f; + if (wacom->extra_z_bits >= 1) + z = z << 1 | (wacom->data[3] & 0x4) >> 2; + if (wacom->extra_z_bits > 1) + z = z << 1 | (wacom->data[0] & 0x4) >> 2; + z = z ^ (0x40 << wacom->extra_z_bits); + } else { + z = -1; + } + + if (stylus_p) + tool = (button & wacom->eraser_mask) ? ERASER : STYLUS; + else + tool = CURSOR; + + if (tool != wacom->tool && wacom->tool != 0) { + input_report_key(wacom->dev, tools[wacom->tool].input_id, 0); + input_sync(wacom->dev); + } + wacom->tool = tool; + + input_report_key(wacom->dev, tools[tool].input_id, in_proximity_p); + input_report_abs(wacom->dev, ABS_MISC, + in_proximity_p ? 
tools[tool].device_id : 0); + input_report_abs(wacom->dev, ABS_X, x); + input_report_abs(wacom->dev, ABS_Y, y); + input_report_abs(wacom->dev, ABS_PRESSURE, z); + if (stylus_p) { + input_report_key(wacom->dev, BTN_TOUCH, button & 1); + input_report_key(wacom->dev, BTN_STYLUS, button & 2); + input_report_key(wacom->dev, BTN_STYLUS2, button & 4); + } else { + input_report_key(wacom->dev, BTN_LEFT, button & 1); + input_report_key(wacom->dev, BTN_RIGHT, button & 2); + input_report_key(wacom->dev, BTN_MIDDLE, button & 4); + /* handle relative wheel for non-stylus device */ + z = (wacom->data[6] & 0x30) >> 4; + if (wacom->data[6] & 0x40) + z = -z; + input_report_rel(wacom->dev, REL_WHEEL, z); + } + input_sync(wacom->dev); +} + +static void wacom_clear_data_buf(struct wacom *wacom) +{ + memset(wacom->data, 0, DATA_SIZE); + wacom->idx = 0; +} + +static irqreturn_t wacom_interrupt(struct serio *serio, unsigned char data, + unsigned int flags) +{ + struct wacom *wacom = serio_get_drvdata(serio); + + if (data & 0x80) + wacom->idx = 0; + + /* + * We're either expecting a carriage return-terminated ASCII + * response string, or a seven-byte packet with the MSB set on + * the first byte. + * + * Note however that some tablets (the PenPartner, for + * example) don't send a carriage return at the end of a + * command. We handle these by waiting for timeout. + */ + if (data == '\r' && !(wacom->data[0] & 0x80)) { + wacom_handle_response(wacom); + wacom_clear_data_buf(wacom); + return IRQ_HANDLED; + } + + /* Leave place for 0 termination */ + if (wacom->idx > (DATA_SIZE - 2)) { + dev_dbg(&wacom->dev->dev, + "throwing away %d bytes of garbage\n", wacom->idx); + wacom_clear_data_buf(wacom); + } + wacom->data[wacom->idx++] = data; + + if (wacom->idx == PACKET_LENGTH && (wacom->data[0] & 0x80)) { + wacom_handle_packet(wacom); + wacom_clear_data_buf(wacom); + } + + return IRQ_HANDLED; +} + +static void wacom_disconnect(struct serio *serio) +{ + struct wacom *wacom = serio_get_drvdata(serio); + + serio_close(serio); + serio_set_drvdata(serio, NULL); + input_unregister_device(wacom->dev); + kfree(wacom); +} + +static int wacom_send(struct serio *serio, const u8 *command) +{ + int err = 0; + + for (; !err && *command; command++) + err = serio_write(serio, *command); + + return err; +} + +static int wacom_send_setup_string(struct wacom *wacom, struct serio *serio) +{ + const u8 *cmd; + + switch (wacom->dev->id.version) { + case MODEL_CINTIQ: /* UNTESTED */ + cmd = COMMAND_ORIGIN_IN_UPPER_LEFT + COMMAND_TRANSMIT_AT_MAX_RATE + COMMAND_ENABLE_CONTINUOUS_MODE + COMMAND_START_SENDING_PACKETS; + break; + + case MODEL_PENPARTNER: + cmd = COMMAND_ENABLE_PRESSURE_MODE + COMMAND_START_SENDING_PACKETS; + break; + + default: + cmd = COMMAND_MULTI_MODE_INPUT + COMMAND_ORIGIN_IN_UPPER_LEFT + COMMAND_ENABLE_ALL_MACRO_BUTTONS + COMMAND_DISABLE_GROUP_1_MACRO_BUTTONS + COMMAND_TRANSMIT_AT_MAX_RATE + COMMAND_DISABLE_INCREMENTAL_MODE + COMMAND_ENABLE_CONTINUOUS_MODE + COMMAND_Z_FILTER + COMMAND_START_SENDING_PACKETS; + break; + } + + return wacom_send(serio, cmd); +} + +static int wacom_send_and_wait(struct wacom *wacom, struct serio *serio, + const u8 *cmd, const char *desc) +{ + int err; + unsigned long u; + + wacom->expect = cmd[1]; + init_completion(&wacom->cmd_done); + + err = wacom_send(serio, cmd); + if (err) + return err; + + u = wait_for_completion_timeout(&wacom->cmd_done, HZ); + if (u == 0) { + /* Timeout, process what we've received. 
*/ + wacom_handle_response(wacom); + } + + wacom->expect = 0; + return wacom->result; +} + +static int wacom_setup(struct wacom *wacom, struct serio *serio) +{ + int err; + + /* Note that setting the link speed is the job of inputattach. + * We assume that reset negotiation has already happened, + * here. */ + err = wacom_send_and_wait(wacom, serio, REQUEST_MODEL_AND_ROM_VERSION, + "model and version"); + if (err) + return err; + + if (!(wacom->res_x && wacom->res_y)) { + err = wacom_send_and_wait(wacom, serio, + REQUEST_CONFIGURATION_STRING, + "configuration string"); + if (err) + return err; + } + + if (!(wacom->max_x && wacom->max_y)) { + err = wacom_send_and_wait(wacom, serio, + REQUEST_MAX_COORDINATES, + "coordinates string"); + if (err) + return err; + } + + return wacom_send_setup_string(wacom, serio); +} + +static int wacom_connect(struct serio *serio, struct serio_driver *drv) +{ + struct wacom *wacom; + struct input_dev *input_dev; + int err = -ENOMEM; + + wacom = kzalloc(sizeof(struct wacom), GFP_KERNEL); + input_dev = input_allocate_device(); + if (!wacom || !input_dev) + goto free_device; + + wacom->dev = input_dev; + wacom->extra_z_bits = 1; + wacom->eraser_mask = 0x04; + wacom->tool = wacom->idx = 0; + snprintf(wacom->phys, sizeof(wacom->phys), "%s/input0", serio->phys); + input_dev->phys = wacom->phys; + input_dev->id.bustype = BUS_RS232; + input_dev->id.vendor = SERIO_WACOM_IV; + input_dev->id.product = serio->id.extra; + input_dev->dev.parent = &serio->dev; + + input_dev->evbit[0] = + BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS) | BIT_MASK(EV_REL); + set_bit(ABS_MISC, input_dev->absbit); + set_bit(BTN_TOOL_PEN, input_dev->keybit); + set_bit(BTN_TOOL_RUBBER, input_dev->keybit); + set_bit(BTN_TOOL_MOUSE, input_dev->keybit); + set_bit(BTN_TOUCH, input_dev->keybit); + set_bit(BTN_STYLUS, input_dev->keybit); + set_bit(BTN_LEFT, input_dev->keybit); + set_bit(BTN_RIGHT, input_dev->keybit); + set_bit(BTN_MIDDLE, input_dev->keybit); + + serio_set_drvdata(serio, wacom); + + err = serio_open(serio, drv); + if (err) + goto free_device; + + err = wacom_setup(wacom, serio); + if (err) + goto close_serio; + + set_bit(INPUT_PROP_DIRECT, input_dev->propbit); + if (!(wacom->flags & F_COVERS_SCREEN)) + __set_bit(INPUT_PROP_POINTER, input_dev->propbit); + + if (wacom->flags & F_HAS_STYLUS2) + __set_bit(BTN_STYLUS2, input_dev->keybit); + + if (wacom->flags & F_HAS_SCROLLWHEEL) + __set_bit(REL_WHEEL, input_dev->relbit); + + input_abs_set_res(wacom->dev, ABS_X, wacom->res_x); + input_abs_set_res(wacom->dev, ABS_Y, wacom->res_y); + input_set_abs_params(wacom->dev, ABS_X, 0, wacom->max_x, 0, 0); + input_set_abs_params(wacom->dev, ABS_Y, 0, wacom->max_y, 0, 0); + input_set_abs_params(wacom->dev, ABS_PRESSURE, -1, + (1 << (7 + wacom->extra_z_bits)) - 1, 0, 0); + + err = input_register_device(wacom->dev); + if (err) + goto close_serio; + + return 0; + +close_serio: + serio_close(serio); +free_device: + serio_set_drvdata(serio, NULL); + input_free_device(input_dev); + kfree(wacom); + return err; +} + +static struct serio_device_id wacom_serio_ids[] = { + { + .type = SERIO_RS232, + .proto = SERIO_WACOM_IV, + .id = SERIO_ANY, + .extra = SERIO_ANY, + }, + { 0 } +}; + +MODULE_DEVICE_TABLE(serio, wacom_serio_ids); + +static struct serio_driver wacom_drv = { + .driver = { + .name = "wacom_serial4", + }, + .description = "Wacom protocol 4 serial tablet driver", + .id_table = wacom_serio_ids, + .interrupt = wacom_interrupt, + .connect = wacom_connect, + .disconnect = wacom_disconnect, +}; + 
+module_serio_driver(wacom_drv); diff --git a/include/uapi/linux/serio.h b/include/uapi/linux/serio.h index 9f53fa7fc132..becdd78295cc 100644 --- a/include/uapi/linux/serio.h +++ b/include/uapi/linux/serio.h @@ -76,5 +76,6 @@ #define SERIO_HAMPSHIRE 0x3b #define SERIO_PS2MULT 0x3c #define SERIO_TSC40 0x3d +#define SERIO_WACOM_IV 0x3e #endif /* _UAPI_SERIO_H */ -- cgit v1.2.3 From ba4e9a61ad54c438d4c7b655e94e31f23a6fe13f Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Sun, 20 Jul 2014 17:27:09 -0700 Subject: Input: uinput - add UI_GET_VERSION ioctl This ioctl is the counterpart to EVIOCGVERSION and returns the uinput version the kernel was compiled with. Reviewed-by: Peter Hutterer Signed-off-by: David Herrmann Signed-off-by: Dmitry Torokhov --- drivers/input/misc/uinput.c | 6 ++++++ include/uapi/linux/uinput.h | 9 +++++++++ 2 files changed, 15 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/input/misc/uinput.c b/drivers/input/misc/uinput.c index 883f045f37df..421e29e4cd81 100644 --- a/drivers/input/misc/uinput.c +++ b/drivers/input/misc/uinput.c @@ -723,6 +723,12 @@ static long uinput_ioctl_handler(struct file *file, unsigned int cmd, } switch (cmd) { + case UI_GET_VERSION: + if (put_user(UINPUT_VERSION, + (unsigned int __user *)p)) + retval = -EFAULT; + goto out; + case UI_DEV_CREATE: retval = uinput_create_device(udev); goto out; diff --git a/include/uapi/linux/uinput.h b/include/uapi/linux/uinput.h index 0389b489bbba..baeab83deb64 100644 --- a/include/uapi/linux/uinput.h +++ b/include/uapi/linux/uinput.h @@ -84,6 +84,15 @@ struct uinput_ff_erase { */ #define UI_GET_SYSNAME(len) _IOC(_IOC_READ, UINPUT_IOCTL_BASE, 300, len) +/** + * UI_GET_VERSION - Return version of uinput protocol + * + * This writes the uinput protocol version implemented by the kernel into + * the integer pointed to by the ioctl argument. The protocol version + * is hard-coded in the kernel and is independent of the uinput device. + */ +#define UI_GET_VERSION _IOR(UINPUT_IOCTL_BASE, 301, unsigned int) + /* + * To write a force-feedback-capable driver, the upload_effect + * and erase_effect callbacks in input_dev must be implemented. -- cgit v1.2.3 From 699a0ea0823d32030b0666b28ff8633960f7ffa7 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 2 Jun 2014 11:02:59 +1000 Subject: KVM: PPC: Book3S: Controls for in-kernel sPAPR hypercall handling This provides a way for userspace to control which sPAPR hcalls get handled in the kernel. Each hcall can be individually enabled or disabled for in-kernel handling, except for H_RTAS. The exception for H_RTAS is because userspace can already control whether individual RTAS functions are handled in-kernel or not via the KVM_PPC_RTAS_DEFINE_TOKEN ioctl, and because the numeric value for H_RTAS is out of the normal sequence of hcall numbers. Hcalls are enabled or disabled using the KVM_ENABLE_CAP ioctl for the KVM_CAP_PPC_ENABLE_HCALL capability on the file descriptor for the VM. The args field of the struct kvm_enable_cap specifies the hcall number in args[0] and the enable/disable flag in args[1]; 0 means disable in-kernel handling (so that the hcall will always cause an exit to userspace) and 1 means enable. Enabling or disabling in-kernel handling of an hcall is effective across the whole VM. The ability for KVM_ENABLE_CAP to be used on a VM file descriptor on PowerPC is new, added by this commit. The KVM_CAP_ENABLE_CAP_VM capability advertises that this ability exists. When a VM is created, an initial set of hcalls are enabled for in-kernel handling. 
The set that is enabled is the set that has an in-kernel implementation at this point. Any new hcall implementations from this point onwards should not be added to the default set without a good reason. No distinction is made between real-mode and virtual-mode hcall implementations; the one setting controls them both. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- Documentation/virtual/kvm/api.txt | 41 ++++++++++++++++++++++++-- arch/powerpc/include/asm/kvm_book3s.h | 1 + arch/powerpc/include/asm/kvm_host.h | 2 ++ arch/powerpc/kernel/asm-offsets.c | 1 + arch/powerpc/kvm/book3s_hv.c | 51 +++++++++++++++++++++++++++++++++ arch/powerpc/kvm/book3s_hv_rmhandlers.S | 11 +++++++ arch/powerpc/kvm/book3s_pr.c | 5 ++++ arch/powerpc/kvm/book3s_pr_papr.c | 37 ++++++++++++++++++++++++ arch/powerpc/kvm/powerpc.c | 45 +++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 1 + 10 files changed, 193 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 0fe36497642c..5c54d196f4c8 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2863,8 +2863,8 @@ The fields in each entry are defined as follows: this function/index combination -6. Capabilities that can be enabled ------------------------------------ +6. Capabilities that can be enabled on vCPUs +-------------------------------------------- There are certain capabilities that change the behavior of the virtual CPU when enabled. To enable them, please see section 4.37. Below you can find a list of @@ -3002,3 +3002,40 @@ Parameters: args[0] is the XICS device fd args[1] is the XICS CPU number (server ID) for this vcpu This capability connects the vcpu to an in-kernel XICS device. + + +7. Capabilities that can be enabled on VMs +------------------------------------------ + +There are certain capabilities that change the behavior of the virtual +machine when enabled. To enable them, please see section 4.37. Below +you can find a list of capabilities and what their effect on the VM +is when enabling them. + +The following information is provided along with the description: + + Architectures: which instruction set architectures provide this ioctl. + x86 includes both i386 and x86_64. + + Parameters: what parameters are accepted by the capability. + + Returns: the return value. General error numbers (EBADF, ENOMEM, EINVAL) + are not detailed, but errors with specific meanings are. + + +7.1 KVM_CAP_PPC_ENABLE_HCALL + +Architectures: ppc +Parameters: args[0] is the sPAPR hcall number + args[1] is 0 to disable, 1 to enable in-kernel handling + +This capability controls whether individual sPAPR hypercalls (hcalls) +get handled by the kernel or not. Enabling or disabling in-kernel +handling of an hcall is effective across the VM. On creation, an +initial set of hcalls are enabled for in-kernel handling, which +consists of those hcalls for which in-kernel handlers were implemented +before this capability was implemented. If disabled, the kernel will +not attempt to handle the hcall, but will always exit to userspace +to handle it. Note that it may not make sense to enable some and +disable others of a group of related hcalls, but KVM does not prevent +userspace from doing that. 
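As a rough illustration of the interface documented above, userspace could toggle in-kernel handling of a single hcall like this (a minimal sketch; vmfd is the fd returned by KVM_CREATE_VM, the helper name is hypothetical, and error handling is elided):

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

/* Hypothetical helper: enable (1) or disable (0) in-kernel handling of
 * one sPAPR hcall on a VM fd. The kernel rejects hcall numbers that are
 * not a multiple of 4 or exceed MAX_HCALL_OPCODE. */
static int set_hcall_handling(int vmfd, unsigned long hcall, int enable)
{
	struct kvm_enable_cap cap;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_PPC_ENABLE_HCALL;
	cap.args[0] = hcall;           /* which hcall to control */
	cap.args[1] = enable ? 1 : 0;  /* 0 = always exit to userspace */
	return ioctl(vmfd, KVM_ENABLE_CAP, &cap);
}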
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index a20cc0bbd048..052ab2ad49b5 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -187,6 +187,7 @@ extern void kvmppc_hv_entry_trampoline(void); extern u32 kvmppc_alignment_dsisr(struct kvm_vcpu *vcpu, unsigned int inst); extern ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst); extern int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd); +extern void kvmppc_pr_init_default_hcalls(struct kvm *kvm); extern void kvmppc_copy_to_svcpu(struct kvmppc_book3s_shadow_vcpu *svcpu, struct kvm_vcpu *vcpu); extern void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu, diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index f9ae69682ce1..62b2cee450a5 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -34,6 +34,7 @@ #include #include #include +#include #define KVM_MAX_VCPUS NR_CPUS #define KVM_MAX_VCORES NR_CPUS @@ -263,6 +264,7 @@ struct kvm_arch { #ifdef CONFIG_PPC_BOOK3S_64 struct list_head spapr_tce_tables; struct list_head rtas_tokens; + DECLARE_BITMAP(enabled_hcalls, MAX_HCALL_OPCODE/4 + 1); #endif #ifdef CONFIG_KVM_MPIC struct openpic *mpic; diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index f5995a912213..17ffcb4f27f9 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -493,6 +493,7 @@ int main(void) DEFINE(KVM_HOST_SDR1, offsetof(struct kvm, arch.host_sdr1)); DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock)); DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flush.bits)); + DEFINE(KVM_ENABLED_HCALLS, offsetof(struct kvm, arch.enabled_hcalls)); DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr)); DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor)); DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v)); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 1562acfa05bf..cf445d22570f 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -67,6 +67,8 @@ /* Used as a "null" value for timebase values */ #define TB_NIL (~(u64)0) +static DECLARE_BITMAP(default_enabled_hcalls, MAX_HCALL_OPCODE/4 + 1); + static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); @@ -562,6 +564,10 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) struct kvm_vcpu *tvcpu; int idx, rc; + if (req <= MAX_HCALL_OPCODE && + !test_bit(req/4, vcpu->kvm->arch.enabled_hcalls)) + return RESUME_HOST; + switch (req) { case H_ENTER: idx = srcu_read_lock(&vcpu->kvm->srcu); @@ -2269,6 +2275,10 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) */ cpumask_setall(&kvm->arch.need_tlb_flush); + /* Start out with the default set of hcalls enabled */ + memcpy(kvm->arch.enabled_hcalls, default_enabled_hcalls, + sizeof(kvm->arch.enabled_hcalls)); + kvm->arch.rma = NULL; kvm->arch.host_sdr1 = mfspr(SPRN_SDR1); @@ -2407,6 +2417,45 @@ static long kvm_arch_vm_ioctl_hv(struct file *filp, return r; } +/* + * List of hcall numbers to enable by default. + * For compatibility with old userspace, we enable by default + * all hcalls that were implemented before the hcall-enabling + * facility was added. Note this list should not include H_RTAS. 
+ */ +static unsigned int default_hcall_list[] = { + H_REMOVE, + H_ENTER, + H_READ, + H_PROTECT, + H_BULK_REMOVE, + H_GET_TCE, + H_PUT_TCE, + H_SET_DABR, + H_SET_XDABR, + H_CEDE, + H_PROD, + H_CONFER, + H_REGISTER_VPA, +#ifdef CONFIG_KVM_XICS + H_EOI, + H_CPPR, + H_IPI, + H_IPOLL, + H_XIRR, + H_XIRR_X, +#endif + 0 +}; + +static void init_default_hcalls(void) +{ + int i; + + for (i = 0; default_hcall_list[i]; ++i) + __set_bit(default_hcall_list[i] / 4, default_enabled_hcalls); +} + static struct kvmppc_ops kvm_ops_hv = { .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv, .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv, @@ -2454,6 +2503,8 @@ static int kvmppc_book3s_init_hv(void) kvm_ops_hv.owner = THIS_MODULE; kvmppc_hv_ops = &kvm_ops_hv; + init_default_hcalls(); + r = kvmppc_mmu_hv_init(); return r; } diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 64ac56f6c3fe..33aaadef7139 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1909,6 +1909,17 @@ hcall_try_real_mode: clrrdi r3,r3,2 cmpldi r3,hcall_real_table_end - hcall_real_table bge guest_exit_cont + /* See if this hcall is enabled for in-kernel handling */ + ld r4, VCPU_KVM(r9) + srdi r0, r3, 8 /* r0 = (r3 / 4) >> 6 */ + sldi r0, r0, 3 /* index into kvm->arch.enabled_hcalls[] */ + add r4, r4, r0 + ld r0, KVM_ENABLED_HCALLS(r4) + rlwinm r4, r3, 32-2, 0x3f /* r4 = (r3 / 4) & 0x3f */ + srd r0, r0, r4 + andi. r0, r0, 1 + beq guest_exit_cont + /* Get pointer to handler, if any, and call it */ LOAD_REG_ADDR(r4, hcall_real_table) lwax r3,r3,r4 cmpwi r3,0 diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 3b82e8616dfa..123ac7dc5e1f 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -1597,6 +1597,11 @@ static int kvmppc_core_init_vm_pr(struct kvm *kvm) { mutex_init(&kvm->arch.hpt_mutex); +#ifdef CONFIG_PPC_BOOK3S_64 + /* Start out with the default set of hcalls enabled */ + kvmppc_pr_init_default_hcalls(kvm); +#endif + if (firmware_has_feature(FW_FEATURE_SET_MODE)) { spin_lock(&kvm_global_user_count_lock); if (++kvm_global_user_count == 1) diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c index f7c25c625a5b..eacaa6e4876e 100644 --- a/arch/powerpc/kvm/book3s_pr_papr.c +++ b/arch/powerpc/kvm/book3s_pr_papr.c @@ -267,6 +267,10 @@ static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd) int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd) { + if (cmd <= MAX_HCALL_OPCODE && + !test_bit(cmd/4, vcpu->kvm->arch.enabled_hcalls)) + return EMULATE_FAIL; + switch (cmd) { case H_ENTER: return kvmppc_h_pr_enter(vcpu); @@ -304,3 +308,36 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd) return EMULATE_FAIL; } + + +/* + * List of hcall numbers to enable by default. + * For compatibility with old userspace, we enable by default + * all hcalls that were implemented before the hcall-enabling + * facility was added. Note this list should not include H_RTAS. 
+ */ +static unsigned int default_hcall_list[] = { + H_ENTER, + H_REMOVE, + H_PROTECT, + H_BULK_REMOVE, + H_PUT_TCE, + H_CEDE, +#ifdef CONFIG_KVM_XICS + H_XIRR, + H_CPPR, + H_EOI, + H_IPI, + H_IPOLL, + H_XIRR_X, +#endif + 0 +}; + +void kvmppc_pr_init_default_hcalls(struct kvm *kvm) +{ + int i; + + for (i = 0; default_hcall_list[i]; ++i) + __set_bit(default_hcall_list[i] / 4, kvm->arch.enabled_hcalls); +} diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 61c738ab1283..3222a4d08a6f 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -387,6 +387,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PPC_UNSET_IRQ: case KVM_CAP_PPC_IRQ_LEVEL: case KVM_CAP_ENABLE_CAP: + case KVM_CAP_ENABLE_CAP_VM: case KVM_CAP_ONE_REG: case KVM_CAP_IOEVENTFD: case KVM_CAP_DEVICE_CTRL: @@ -417,6 +418,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PPC_ALLOC_HTAB: case KVM_CAP_PPC_RTAS: case KVM_CAP_PPC_FIXUP_HCALL: + case KVM_CAP_PPC_ENABLE_HCALL: #ifdef CONFIG_KVM_XICS case KVM_CAP_IRQ_XICS: #endif @@ -1099,6 +1101,40 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, return 0; } + +static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, + struct kvm_enable_cap *cap) +{ + int r; + + if (cap->flags) + return -EINVAL; + + switch (cap->cap) { +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER + case KVM_CAP_PPC_ENABLE_HCALL: { + unsigned long hcall = cap->args[0]; + + r = -EINVAL; + if (hcall > MAX_HCALL_OPCODE || (hcall & 3) || + cap->args[1] > 1) + break; + if (cap->args[1]) + set_bit(hcall / 4, kvm->arch.enabled_hcalls); + else + clear_bit(hcall / 4, kvm->arch.enabled_hcalls); + r = 0; + break; + } +#endif + default: + r = -EINVAL; + break; + } + + return r; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -1118,6 +1154,15 @@ long kvm_arch_vm_ioctl(struct file *filp, break; } + case KVM_ENABLE_CAP: + { + struct kvm_enable_cap cap; + r = -EFAULT; + if (copy_from_user(&cap, argp, sizeof(cap))) + goto out; + r = kvm_vm_ioctl_enable_cap(kvm, &cap); + break; + } #ifdef CONFIG_PPC_BOOK3S_64 case KVM_CREATE_SPAPR_TCE: { struct kvm_create_spapr_tce create_tce; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index e11d8f170a62..0418b746cb68 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -758,6 +758,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_VM_ATTRIBUTES 101 #define KVM_CAP_ARM_PSCI_0_2 102 #define KVM_CAP_PPC_FIXUP_HCALL 103 +#define KVM_CAP_PPC_ENABLE_HCALL 104 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 92b591a4c46b103ebd3fc0d03a084e1efd331253 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 14 Jul 2014 18:33:08 +0200 Subject: KVM: Allow KVM_CHECK_EXTENSION on the vm fd The KVM_CHECK_EXTENSION ioctl is only available on the kvm fd today. Unfortunately, on PPC some of the capabilities change depending on the way a VM was created. So instead we need a way to expose capabilities as a VM ioctl, so that we can see which VM type we're using (HV or PR). To enable this, add the KVM_CHECK_EXTENSION ioctl to our vm ioctl portfolio. 
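A minimal sketch of how userspace might use this (the helper name is hypothetical; it falls back to the system fd on kernels that do not advertise KVM_CAP_CHECK_EXTENSION_VM):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Hypothetical helper: query a capability on the VM fd so the answer
 * reflects the VM type actually created (e.g. HV vs. PR on PPC). */
static int vm_check_extension(int sysfd, int vmfd, long cap)
{
	/* Older kernels lack the vm ioctl; use the system fd there. */
	if (ioctl(sysfd, KVM_CHECK_EXTENSION, KVM_CAP_CHECK_EXTENSION_VM) <= 0)
		return ioctl(sysfd, KVM_CHECK_EXTENSION, cap);
	return ioctl(vmfd, KVM_CHECK_EXTENSION, cap);
}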
Signed-off-by: Alexander Graf Acked-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 7 +++-- include/uapi/linux/kvm.h | 1 + virt/kvm/kvm_main.c | 58 +++++++++++++++++++++------------------ 3 files changed, 37 insertions(+), 29 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 884f819e1908..8898caf8c090 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -148,9 +148,9 @@ of banks, as set via the KVM_X86_SETUP_MCE ioctl. 4.4 KVM_CHECK_EXTENSION -Capability: basic +Capability: basic, KVM_CAP_CHECK_EXTENSION_VM for vm ioctl Architectures: all -Type: system ioctl +Type: system ioctl, vm ioctl Parameters: extension identifier (KVM_CAP_*) Returns: 0 if unsupported; 1 (or some other positive integer) if supported @@ -160,6 +160,9 @@ receives an integer that describes the extension availability. Generally 0 means no and 1 means yes, but some extensions may report additional information in the integer return value. +Based on their initialization different VMs may have different capabilities. +It is thus encouraged to use the vm ioctl to query for capabilities (available +with KVM_CAP_CHECK_EXTENSION_VM on the vm fd) 4.5 KVM_GET_VCPU_MMAP_SIZE diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 0418b746cb68..51776cac6a9b 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -759,6 +759,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_ARM_PSCI_0_2 102 #define KVM_CAP_PPC_FIXUP_HCALL 103 #define KVM_CAP_PPC_ENABLE_HCALL 104 +#define KVM_CAP_CHECK_EXTENSION_VM 105 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e28f3caa539d..1b95cc926cfc 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2324,6 +2324,34 @@ static int kvm_ioctl_create_device(struct kvm *kvm, return 0; } +static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) +{ + switch (arg) { + case KVM_CAP_USER_MEMORY: + case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: + case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: +#ifdef CONFIG_KVM_APIC_ARCHITECTURE + case KVM_CAP_SET_BOOT_CPU_ID: +#endif + case KVM_CAP_INTERNAL_ERROR_DATA: +#ifdef CONFIG_HAVE_KVM_MSI + case KVM_CAP_SIGNAL_MSI: +#endif +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING + case KVM_CAP_IRQFD_RESAMPLE: +#endif + case KVM_CAP_CHECK_EXTENSION_VM: + return 1; +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING + case KVM_CAP_IRQ_ROUTING: + return KVM_MAX_IRQ_ROUTES; +#endif + default: + break; + } + return kvm_vm_ioctl_check_extension(kvm, arg); +} + static long kvm_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2487,6 +2515,9 @@ static long kvm_vm_ioctl(struct file *filp, r = 0; break; } + case KVM_CHECK_EXTENSION: + r = kvm_vm_ioctl_check_extension_generic(kvm, arg); + break; default: r = kvm_arch_vm_ioctl(filp, ioctl, arg); if (r == -ENOTTY) @@ -2571,33 +2602,6 @@ static int kvm_dev_ioctl_create_vm(unsigned long type) return r; } -static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) -{ - switch (arg) { - case KVM_CAP_USER_MEMORY: - case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: - case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: -#ifdef CONFIG_KVM_APIC_ARCHITECTURE - case KVM_CAP_SET_BOOT_CPU_ID: -#endif - case KVM_CAP_INTERNAL_ERROR_DATA: -#ifdef CONFIG_HAVE_KVM_MSI - case KVM_CAP_SIGNAL_MSI: -#endif -#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING - case KVM_CAP_IRQFD_RESAMPLE: -#endif - return 1; -#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING - case KVM_CAP_IRQ_ROUTING: - 
return KVM_MAX_IRQ_ROUTES; -#endif - default: - break; - } - return kvm_vm_ioctl_check_extension(kvm, arg); -} - static long kvm_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { -- cgit v1.2.3 From ce91ddc471b77ec75e5b2a43c803efac605f37b3 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 28 Jul 2014 19:29:13 +0200 Subject: KVM: PPC: Remove DCR handling DCR handling was only needed for 440 KVM. Since we removed it, we can also remove handling of DCR accesses. Signed-off-by: Alexander Graf --- Documentation/virtual/kvm/api.txt | 6 +++--- arch/powerpc/include/asm/kvm_host.h | 4 ---- arch/powerpc/include/asm/kvm_ppc.h | 1 - arch/powerpc/kvm/booke.c | 5 ----- arch/powerpc/kvm/powerpc.c | 10 ---------- arch/powerpc/kvm/timing.c | 1 - arch/powerpc/kvm/timing.h | 3 --- include/uapi/linux/kvm.h | 4 ++-- 8 files changed, 5 insertions(+), 29 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 8898caf8c090..a21ff2265c21 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2613,8 +2613,8 @@ The 'data' member contains, in its first 'len' bytes, the value as it would appear if the VCPU performed a load or store of the appropriate width directly to the byte array. -NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_DCR, - KVM_EXIT_PAPR and KVM_EXIT_EPR the corresponding +NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI KVM_EXIT_PAPR and + KVM_EXIT_EPR the corresponding operations are complete (and guest state is consistent) only after userspace has re-entered the kernel with KVM_RUN. The kernel side will first finish incomplete operations and then check for pending signals. Userspace @@ -2685,7 +2685,7 @@ Principles of Operation Book in the Chapter for Dynamic Address Translation __u8 is_write; } dcr; -powerpc specific. +Deprecated - was used for 440 KVM. /* KVM_EXIT_OSI */ struct { diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 66f5b5914e6c..98d9dd50d063 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -94,7 +94,6 @@ struct kvm_vm_stat { struct kvm_vcpu_stat { u32 sum_exits; u32 mmio_exits; - u32 dcr_exits; u32 signal_exits; u32 light_exits; /* Account for special types of light exits: */ @@ -126,7 +125,6 @@ struct kvm_vcpu_stat { enum kvm_exit_types { MMIO_EXITS, - DCR_EXITS, SIGNAL_EXITS, ITLB_REAL_MISS_EXITS, ITLB_VIRT_MISS_EXITS, @@ -601,8 +599,6 @@ struct kvm_vcpu_arch { u8 io_gpr; /* GPR used as IO source/target */ u8 mmio_is_bigendian; u8 mmio_sign_extend; - u8 dcr_needed; - u8 dcr_is_write; u8 osi_needed; u8 osi_enabled; u8 papr_enabled; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index cbee4538307f..8e36c1e2c631 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -41,7 +41,6 @@ enum emulation_result { EMULATE_DONE, /* no further processing */ EMULATE_DO_MMIO, /* kvm_run filled with MMIO request */ - EMULATE_DO_DCR, /* kvm_run filled with DCR request */ EMULATE_FAIL, /* can't emulate this instruction */ EMULATE_AGAIN, /* something went wrong. 
go again */ EMULATE_EXIT_USER, /* emulation requires exit to user-space */ diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index f30948a17c03..b4c89fa6f109 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -51,7 +51,6 @@ unsigned long kvmppc_booke_handlers; struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmio", VCPU_STAT(mmio_exits) }, - { "dcr", VCPU_STAT(dcr_exits) }, { "sig", VCPU_STAT(signal_exits) }, { "itlb_r", VCPU_STAT(itlb_real_miss_exits) }, { "itlb_v", VCPU_STAT(itlb_virt_miss_exits) }, @@ -709,10 +708,6 @@ static int emulation_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) case EMULATE_AGAIN: return RESUME_GUEST; - case EMULATE_DO_DCR: - run->exit_reason = KVM_EXIT_DCR; - return RESUME_HOST; - case EMULATE_FAIL: printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n", __func__, vcpu->arch.pc, vcpu->arch.last_inst); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index c14ed15fd60b..288b4bb05cbd 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -743,12 +743,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) #endif } -static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu, - struct kvm_run *run) -{ - kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, run->dcr.data); -} - static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, struct kvm_run *run) { @@ -945,10 +939,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) if (!vcpu->mmio_is_write) kvmppc_complete_mmio_load(vcpu, run); vcpu->mmio_needed = 0; - } else if (vcpu->arch.dcr_needed) { - if (!vcpu->arch.dcr_is_write) - kvmppc_complete_dcr_load(vcpu, run); - vcpu->arch.dcr_needed = 0; } else if (vcpu->arch.osi_needed) { u64 *gprs = run->osi.gprs; int i; diff --git a/arch/powerpc/kvm/timing.c b/arch/powerpc/kvm/timing.c index 07b6110a4bb7..e44d2b2ea97e 100644 --- a/arch/powerpc/kvm/timing.c +++ b/arch/powerpc/kvm/timing.c @@ -110,7 +110,6 @@ void kvmppc_update_timing_stats(struct kvm_vcpu *vcpu) static const char *kvm_exit_names[__NUMBER_OF_KVM_EXIT_TYPES] = { [MMIO_EXITS] = "MMIO", - [DCR_EXITS] = "DCR", [SIGNAL_EXITS] = "SIGNAL", [ITLB_REAL_MISS_EXITS] = "ITLBREAL", [ITLB_VIRT_MISS_EXITS] = "ITLBVIRT", diff --git a/arch/powerpc/kvm/timing.h b/arch/powerpc/kvm/timing.h index bf191e72b2d8..3123690c82dc 100644 --- a/arch/powerpc/kvm/timing.h +++ b/arch/powerpc/kvm/timing.h @@ -63,9 +63,6 @@ static inline void kvmppc_account_exit_stat(struct kvm_vcpu *vcpu, int type) case EMULATED_INST_EXITS: vcpu->stat.emulated_inst_exits++; break; - case DCR_EXITS: - vcpu->stat.dcr_exits++; - break; case DSI_EXITS: vcpu->stat.dsi_exits++; break; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 51776cac6a9b..f6f24aeb9e1a 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -162,7 +162,7 @@ struct kvm_pit_config { #define KVM_EXIT_TPR_ACCESS 12 #define KVM_EXIT_S390_SIEIC 13 #define KVM_EXIT_S390_RESET 14 -#define KVM_EXIT_DCR 15 +#define KVM_EXIT_DCR 15 /* deprecated */ #define KVM_EXIT_NMI 16 #define KVM_EXIT_INTERNAL_ERROR 17 #define KVM_EXIT_OSI 18 @@ -268,7 +268,7 @@ struct kvm_run { __u64 trans_exc_code; __u32 pgm_code; } s390_ucontrol; - /* KVM_EXIT_DCR */ + /* KVM_EXIT_DCR (deprecated) */ struct { __u32 dcrn; __u32 data; -- cgit v1.2.3 From 1b69be5e8afc634f39ad695a6ab6aad0cf0975c7 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 10 Jun 2014 11:41:57 +1000 Subject: drivers/vfio: EEH support for VFIO PCI device The patch adds new IOCTL commands for sPAPR VFIO container device to 
support EEH functionality for PCI devices, which have been passed through from the host to a guest via VFIO. Signed-off-by: Gavin Shan Acked-by: Alexander Graf Acked-by: Alex Williamson Signed-off-by: Benjamin Herrenschmidt --- Documentation/vfio.txt | 87 +++++++++++++++++++++++++++++++++++-- drivers/vfio/Makefile | 1 + drivers/vfio/pci/vfio_pci.c | 18 ++++++-- drivers/vfio/vfio_iommu_spapr_tce.c | 17 +++++++- drivers/vfio/vfio_spapr_eeh.c | 87 +++++++++++++++++++++++++++++++++++++ include/linux/vfio.h | 23 ++++++++++ include/uapi/linux/vfio.h | 34 +++++++++++++++ 7 files changed, 259 insertions(+), 8 deletions(-) create mode 100644 drivers/vfio/vfio_spapr_eeh.c (limited to 'include/uapi/linux') diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt index b9ca02370d46..96978eced341 100644 --- a/Documentation/vfio.txt +++ b/Documentation/vfio.txt @@ -305,7 +305,15 @@ faster, the map/unmap handling has been implemented in real mode which provides an excellent performance which has limitations such as inability to do locked pages accounting in real time. -So 3 additional ioctls have been added: +4) According to the sPAPR specification, a Partitionable Endpoint (PE) is an I/O +subtree that can be treated as a unit for the purposes of partitioning and +error recovery. A PE may be a single or multi-function IOA (IO Adapter), a +function of a multi-function IOA, or multiple IOAs (possibly including switch +and bridge structures above the multiple IOAs). PPC64 guests detect PCI errors +and recover from them via EEH RTAS services, which works on the basis of +additional ioctl commands. + +So 4 additional ioctls have been added: VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start of the DMA window on the PCI bus. @@ -316,9 +324,12 @@ So 3 additional ioctls have been added: VFIO_IOMMU_DISABLE - disables the container. + VFIO_EEH_PE_OP - provides an API for EEH setup, error detection and recovery. The code flow from the example above should be slightly changed: + struct vfio_eeh_pe_op pe_op = { .argsz = sizeof(pe_op), .flags = 0 }; + ..... /* Add the group to the container */ ioctl(group, VFIO_GROUP_SET_CONTAINER, &container); @@ -342,9 +353,79 @@ The code flow from the example above should be slightly changed: dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; /* Check here is .iova/.size are within DMA window from spapr_iommu_info */ - ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map); - ..... + + /* Get a file descriptor for the device */ + device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0"); + + .... + + /* Gratuitous device reset and go... */ + ioctl(device, VFIO_DEVICE_RESET); + + /* Make sure EEH is supported */ + ioctl(container, VFIO_CHECK_EXTENSION, VFIO_EEH); + + /* Enable the EEH functionality on the device */ + pe_op.op = VFIO_EEH_PE_ENABLE; + ioctl(container, VFIO_EEH_PE_OP, &pe_op); + + /* It is suggested that you create an additional data struct to + * represent a PE, and put child devices belonging to the same IOMMU + * group into the PE instance for later reference. + */ + + /* Check the PE's state and make sure it's in functional state */ + pe_op.op = VFIO_EEH_PE_GET_STATE; + ioctl(container, VFIO_EEH_PE_OP, &pe_op); + + /* Save device state using pci_save_state(). + * EEH should be enabled on the specified device. + */ + + .... + + /* When 0xFF's are returned from reading the PCI config space or IO + * BARs of the PCI device, check the PE's state to see if it has been + * frozen. 
+ */ + ioctl(container, VFIO_EEH_PE_OP, &pe_op); + + /* Waiting for pending PCI transactions to be completed and don't + * produce any more PCI traffic from/to the affected PE until + * recovery is finished. + */ + + /* Enable IO for the affected PE and collect logs. Usually, the + * standard part of PCI config space, AER registers are dumped + * as logs for further analysis. + */ + pe_op.op = VFIO_EEH_PE_UNFREEZE_IO; + ioctl(container, VFIO_EEH_PE_OP, &pe_op); + + /* + * Issue PE reset: hot or fundamental reset. Usually, hot reset + * is enough. However, the firmware of some PCI adapters would + * require fundamental reset. + */ + pe_op.op = VFIO_EEH_PE_RESET_HOT; + ioctl(container, VFIO_EEH_PE_OP, &pe_op); + pe_op.op = VFIO_EEH_PE_RESET_DEACTIVATE; + ioctl(container, VFIO_EEH_PE_OP, &pe_op); + + /* Configure the PCI bridges for the affected PE */ + pe_op.op = VFIO_EEH_PE_CONFIGURE; + ioctl(container, VFIO_EEH_PE_OP, &pe_op); + + /* Restored state we saved at initialization time. pci_restore_state() + * is good enough as an example. + */ + + /* Hopefully, error is recovered successfully. Now, you can resume to + * start PCI traffic to/from the affected PE. + */ + + .... ------------------------------------------------------------------------------- diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index 72bfabc8629e..50e30bc75e85 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -1,4 +1,5 @@ obj-$(CONFIG_VFIO) += vfio.o obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o +obj-$(CONFIG_EEH) += vfio_spapr_eeh.o obj-$(CONFIG_VFIO_PCI) += pci/ diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 010e0f8b8e4f..e2ee80f36e3e 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -157,8 +157,10 @@ static void vfio_pci_release(void *device_data) { struct vfio_pci_device *vdev = device_data; - if (atomic_dec_and_test(&vdev->refcnt)) + if (atomic_dec_and_test(&vdev->refcnt)) { + vfio_spapr_pci_eeh_release(vdev->pdev); vfio_pci_disable(vdev); + } module_put(THIS_MODULE); } @@ -166,19 +168,27 @@ static void vfio_pci_release(void *device_data) static int vfio_pci_open(void *device_data) { struct vfio_pci_device *vdev = device_data; + int ret; if (!try_module_get(THIS_MODULE)) return -ENODEV; if (atomic_inc_return(&vdev->refcnt) == 1) { - int ret = vfio_pci_enable(vdev); + ret = vfio_pci_enable(vdev); + if (ret) + goto error; + + ret = vfio_spapr_pci_eeh_open(vdev->pdev); if (ret) { - module_put(THIS_MODULE); - return ret; + vfio_pci_disable(vdev); + goto error; } } return 0; +error: + module_put(THIS_MODULE); + return ret; } static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index a84788ba662c..730b4ef3e0cc 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -156,7 +156,16 @@ static long tce_iommu_ioctl(void *iommu_data, switch (cmd) { case VFIO_CHECK_EXTENSION: - return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0; + switch (arg) { + case VFIO_SPAPR_TCE_IOMMU: + ret = 1; + break; + default: + ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg); + break; + } + + return (ret < 0) ? 
0 : ret; case VFIO_IOMMU_SPAPR_TCE_GET_INFO: { struct vfio_iommu_spapr_tce_info info; @@ -283,6 +292,12 @@ static long tce_iommu_ioctl(void *iommu_data, tce_iommu_disable(container); mutex_unlock(&container->lock); return 0; + case VFIO_EEH_PE_OP: + if (!container->tbl || !container->tbl->it_group) + return -ENODEV; + + return vfio_spapr_iommu_eeh_ioctl(container->tbl->it_group, + cmd, arg); } return -ENOTTY; diff --git a/drivers/vfio/vfio_spapr_eeh.c b/drivers/vfio/vfio_spapr_eeh.c new file mode 100644 index 000000000000..f834b4ce1431 --- /dev/null +++ b/drivers/vfio/vfio_spapr_eeh.c @@ -0,0 +1,87 @@ +/* + * EEH functionality support for VFIO devices. The feature is only + * available on sPAPR compatible platforms. + * + * Copyright Gavin Shan, IBM Corporation 2014. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include + +/* We might build address mapping here for "fast" path later */ +int vfio_spapr_pci_eeh_open(struct pci_dev *pdev) +{ + return eeh_dev_open(pdev); +} + +void vfio_spapr_pci_eeh_release(struct pci_dev *pdev) +{ + eeh_dev_release(pdev); +} + +long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, + unsigned int cmd, unsigned long arg) +{ + struct eeh_pe *pe; + struct vfio_eeh_pe_op op; + unsigned long minsz; + long ret = -EINVAL; + + switch (cmd) { + case VFIO_CHECK_EXTENSION: + if (arg == VFIO_EEH) + ret = eeh_enabled() ? 1 : 0; + else + ret = 0; + break; + case VFIO_EEH_PE_OP: + pe = eeh_iommu_group_to_pe(group); + if (!pe) + return -ENODEV; + + minsz = offsetofend(struct vfio_eeh_pe_op, op); + if (copy_from_user(&op, (void __user *)arg, minsz)) + return -EFAULT; + if (op.argsz < minsz || op.flags) + return -EINVAL; + + switch (op.op) { + case VFIO_EEH_PE_DISABLE: + ret = eeh_pe_set_option(pe, EEH_OPT_DISABLE); + break; + case VFIO_EEH_PE_ENABLE: + ret = eeh_pe_set_option(pe, EEH_OPT_ENABLE); + break; + case VFIO_EEH_PE_UNFREEZE_IO: + ret = eeh_pe_set_option(pe, EEH_OPT_THAW_MMIO); + break; + case VFIO_EEH_PE_UNFREEZE_DMA: + ret = eeh_pe_set_option(pe, EEH_OPT_THAW_DMA); + break; + case VFIO_EEH_PE_GET_STATE: + ret = eeh_pe_get_state(pe); + break; + case VFIO_EEH_PE_RESET_DEACTIVATE: + ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE); + break; + case VFIO_EEH_PE_RESET_HOT: + ret = eeh_pe_reset(pe, EEH_RESET_HOT); + break; + case VFIO_EEH_PE_RESET_FUNDAMENTAL: + ret = eeh_pe_reset(pe, EEH_RESET_FUNDAMENTAL); + break; + case VFIO_EEH_PE_CONFIGURE: + ret = eeh_pe_configure(pe); + break; + default: + ret = -EINVAL; + } + } + + return ret; +} diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 8ec980b5e3af..25a0fbd4b998 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -98,4 +98,27 @@ extern int vfio_external_user_iommu_id(struct vfio_group *group); extern long vfio_external_check_extension(struct vfio_group *group, unsigned long arg); +#ifdef CONFIG_EEH +extern int vfio_spapr_pci_eeh_open(struct pci_dev *pdev); +extern void vfio_spapr_pci_eeh_release(struct pci_dev *pdev); +extern long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, + unsigned int cmd, + unsigned long arg); +#else +static inline int vfio_spapr_pci_eeh_open(struct pci_dev *pdev) +{ + return 0; +} + +static inline void vfio_spapr_pci_eeh_release(struct pci_dev *pdev) +{ +} + +static inline long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, + unsigned int cmd, + unsigned long arg) +{ + return 
-ENOTTY; +} +#endif /* CONFIG_EEH */ #endif /* VFIO_H */ diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index cb9023d4f063..6612974c64bf 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -30,6 +30,9 @@ */ #define VFIO_DMA_CC_IOMMU 4 +/* Check if EEH is supported */ +#define VFIO_EEH 5 + /* * The IOCTL interface is designed for extensibility by embedding the * structure length (argsz) and flags into structures passed between @@ -455,6 +458,37 @@ struct vfio_iommu_spapr_tce_info { #define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) +/* + * EEH PE operation struct provides ways to: + * - enable/disable EEH functionality; + * - unfreeze IO/DMA for frozen PE; + * - read PE state; + * - reset PE; + * - configure PE. + */ +struct vfio_eeh_pe_op { + __u32 argsz; + __u32 flags; + __u32 op; +}; + +#define VFIO_EEH_PE_DISABLE 0 /* Disable EEH functionality */ +#define VFIO_EEH_PE_ENABLE 1 /* Enable EEH functionality */ +#define VFIO_EEH_PE_UNFREEZE_IO 2 /* Enable IO for frozen PE */ +#define VFIO_EEH_PE_UNFREEZE_DMA 3 /* Enable DMA for frozen PE */ +#define VFIO_EEH_PE_GET_STATE 4 /* PE state retrieval */ +#define VFIO_EEH_PE_STATE_NORMAL 0 /* PE in functional state */ +#define VFIO_EEH_PE_STATE_RESET 1 /* PE reset in progress */ +#define VFIO_EEH_PE_STATE_STOPPED 2 /* Stopped DMA and IO */ +#define VFIO_EEH_PE_STATE_STOPPED_DMA 4 /* Stopped DMA only */ +#define VFIO_EEH_PE_STATE_UNAVAIL 5 /* State unavailable */ +#define VFIO_EEH_PE_RESET_DEACTIVATE 5 /* Deassert PE reset */ +#define VFIO_EEH_PE_RESET_HOT 6 /* Assert hot reset */ +#define VFIO_EEH_PE_RESET_FUNDAMENTAL 7 /* Assert fundamental reset */ +#define VFIO_EEH_PE_CONFIGURE 8 /* PE configuration */ + +#define VFIO_EEH_PE_OP _IO(VFIO_TYPE, VFIO_BASE + 21) + /* ***************************************************************** */ #endif /* _UAPIVFIO_H */ -- cgit v1.2.3 From 753a2ad54ef45e3417a9d49537c2b42b04a2e1be Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Thu, 7 Aug 2014 00:17:09 +0200 Subject: net: reallocate new socket option number for IPV6_AUTOFLOWLABEL cb1ce2e ("ipv6: Implement automatic flow label generation on transmit") accidentally uses socket option 64, which is already used by ip6tables: IP6T_SO_SET_REPLACE / IP6T_SO_GET_INFO 64 IP6T_SO_SET_ADD_COUNTERS / IP6T_SO_GET_ENTRIES 65 There is a comment in include/uapi/linux/in6.h warning about that. Allocate 70, which seems to be unused, for this instead. Cc: Tom Herbert Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/uapi/linux/in6.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index 22b7a69619d8..74a2a1773494 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -233,7 +233,6 @@ struct in6_flowlabel_req { #if 0 /* not yet */ #define IPV6_USE_MIN_MTU 63 #endif -#define IPV6_AUTOFLOWLABEL 64 /* * Netfilter (1) * @@ -262,6 +261,7 @@ struct in6_flowlabel_req { * IP6T_SO_ORIGINAL_DST 80 */ +#define IPV6_AUTOFLOWLABEL 70 /* RFC5014: Source address selection */ #define IPV6_ADDR_PREFERENCES 72 -- cgit v1.2.3 From 40e041a2c858b3caefc757e26cb85bfceae5062b Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Fri, 8 Aug 2014 14:25:27 -0700 Subject: shm: add sealing API If two processes share a common memory region, they usually want some guarantees to allow safe access.
This often includes: - one side cannot overwrite data while the other reads it - one side cannot shrink the buffer while the other accesses it - one side cannot grow the buffer beyond previously set boundaries If there is a trust-relationship between both parties, there is no need for policy enforcement. However, if there's no trust relationship (e.g., for general-purpose IPC), sharing memory-regions is highly fragile and often not possible without local copies. Look at the following two use-cases: 1) A graphics client wants to share its rendering-buffer with a graphics-server. The memory-region is allocated by the client for read/write access and a second FD is passed to the server. While scanning out from the memory region, the server has no guarantee that the client doesn't shrink the buffer at any time, requiring rather cumbersome SIGBUS handling. 2) A process wants to perform an RPC on another process. To avoid huge bandwidth consumption, zero-copy is preferred. After a message is assembled in-memory and a FD is passed to the remote side, both sides want to be sure that neither modifies this shared copy anymore. The source may have put sensitive data into the message without a separate copy and the target may want to parse the message inline, to avoid a local copy. While SIGBUS handling, POSIX mandatory locking and MAP_DENYWRITE provide ways to achieve most of this, the first one is disproportionately ugly to use in libraries and the latter two are broken/racy or even disabled due to denial of service attacks. This patch introduces the concept of SEALING. If you seal a file, a specific set of operations is blocked on that file forever. Unlike locks, seals can only be set, never removed. Hence, once you have verified that a specific set of seals is set, you're guaranteed that no-one can perform the blocked operations on this file anymore. An initial set of seals is introduced by this patch: - SHRINK: If SEAL_SHRINK is set, the file in question cannot be reduced in size. This affects ftruncate() and open(O_TRUNC). - GROW: If SEAL_GROW is set, the file in question cannot be increased in size. This affects ftruncate(), fallocate() and write(). - WRITE: If SEAL_WRITE is set, no write operations (besides resizing) are possible. This affects fallocate(PUNCH_HOLE), mmap() and write(). - SEAL: If SEAL_SEAL is set, no further seals can be added to a file. This basically prevents the F_ADD_SEALS operation on a file and can be set to prevent others from adding further seals that you don't want. The described use-cases can easily use these seals to provide safe use without any trust-relationship: 1) The graphics server can verify that a passed file-descriptor has SEAL_SHRINK set. This allows safe scanout, while the client is allowed to increase buffer size for window-resizing on-the-fly. Concurrent writes are explicitly allowed. 2) For general-purpose IPC, both processes can verify that SEAL_SHRINK, SEAL_GROW and SEAL_WRITE are set. This guarantees that neither process can modify the data while the other side parses it. Furthermore, it guarantees that even with writable FDs passed to the peer, it cannot increase the size to hit memory-limits of the source process (in case the file-storage is accounted to the source). The new API is an extension to fcntl(), adding two new commands: F_GET_SEALS: Return a bitset describing the seals on the file. This can be called on any FD if the underlying file supports sealing. F_ADD_SEALS: Change the seals of a given file. This requires WRITE access to the file and F_SEAL_SEAL may not already be set. Furthermore, the underlying file must support sealing and there may not be any existing shared mapping of that file. Otherwise, EBADF/EPERM is returned. The given seals are _added_ to the existing set of seals on the file. You cannot remove seals again. The fcntl() handler is currently specific to shmem and disabled on all files. A file needs to explicitly support sealing for this interface to work. A separate syscall is added in a follow-up, which creates files that support sealing. There is no intention to support this on other file-systems. Semantics are unclear for non-volatile files and we lack any use-case right now. Therefore, the implementation is specific to shmem.
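As an illustrative aside (not part of the patch): a receiving process that wants to parse a shared message in place could verify the seals roughly as in the sketch below. The fallback constants assume F_LINUX_SPECIFIC_BASE is 1024, matching the uapi header touched here; check_seals() is a made-up helper name.

    #include <fcntl.h>

    #ifndef F_GET_SEALS
    #define F_ADD_SEALS   (1024 + 9)
    #define F_GET_SEALS   (1024 + 10)
    #define F_SEAL_SEAL   0x0001
    #define F_SEAL_SHRINK 0x0002
    #define F_SEAL_GROW   0x0004
    #define F_SEAL_WRITE  0x0008
    #endif

    /* Refuse to parse in place unless the peer can neither resize nor
     * rewrite the message behind our back. */
    static int check_seals(int fd)
    {
            const int need = F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE;
            int seals = fcntl(fd, F_GET_SEALS);

            if (seals < 0)
                    return -1;      /* the file does not support sealing */
            return (seals & need) == need ? 0 : -1;
    }

The sender would correspondingly call fcntl(fd, F_ADD_SEALS, F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE) once the message is assembled and all of its own shared mappings are gone.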
Signed-off-by: David Herrmann Acked-by: Hugh Dickins Cc: Michael Kerrisk Cc: Ryan Lortie Cc: Lennart Poettering Cc: Daniel Mack Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fcntl.c | 5 ++ include/linux/shmem_fs.h | 17 ++++++ include/uapi/linux/fcntl.h | 15 +++++ mm/shmem.c | 143 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 180 insertions(+) (limited to 'include/uapi/linux') diff --git a/fs/fcntl.c b/fs/fcntl.c index 72c82f69b01b..22d1c3df61ac 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -336,6 +337,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, case F_GETPIPE_SZ: err = pipe_fcntl(filp, cmd, arg); break; + case F_ADD_SEALS: + case F_GET_SEALS: + err = shmem_fcntl(filp, cmd, arg); + break; default: break; } diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 4d1771c2d29f..50777b5b1e4c 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -1,6 +1,7 @@ #ifndef __SHMEM_FS_H #define __SHMEM_FS_H +#include #include #include #include @@ -11,6 +12,7 @@ struct shmem_inode_info { spinlock_t lock; + unsigned int seals; /* shmem seals */ unsigned long flags; unsigned long alloced; /* data pages alloced to file */ union { @@ -65,4 +67,19 @@ static inline struct page *shmem_read_mapping_page( mapping_gfp_mask(mapping)); } +#ifdef CONFIG_TMPFS + +extern int shmem_add_seals(struct file *file, unsigned int seals); +extern int shmem_get_seals(struct file *file); +extern long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg); + +#else + +static inline long shmem_fcntl(struct file *f, unsigned int c, unsigned long a) +{ + return -EINVAL; +} + +#endif + #endif diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 074b886c6be0..beed138bd359 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -27,6 +27,21 @@ #define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7) #define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8) +/* + * Set/Get seals + */ +#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) +#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) + +/* + * Types of seals + */ +#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ +#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ +#define F_SEAL_GROW 0x0004 /* prevent file from growing */ +#define F_SEAL_WRITE 0x0008 /* prevent writes */ +/* (1U << 31) is reserved for signed error codes */ + /* * Types of directory notifications that may be requested.
*/ diff --git a/mm/shmem.c b/mm/shmem.c index 6dc80d298f9d..8b43bb7a4efe 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -66,6 +66,7 @@ static struct vfsmount *shm_mnt; #include #include #include +#include #include #include @@ -547,6 +548,7 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); static int shmem_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; + struct shmem_inode_info *info = SHMEM_I(inode); int error; error = inode_change_ok(inode, attr); @@ -557,6 +559,11 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) loff_t oldsize = inode->i_size; loff_t newsize = attr->ia_size; + /* protected by i_mutex */ + if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || + (newsize > oldsize && (info->seals & F_SEAL_GROW))) + return -EPERM; + if (newsize != oldsize) { error = shmem_reacct_size(SHMEM_I(inode)->flags, oldsize, newsize); @@ -1412,6 +1419,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode info = SHMEM_I(inode); memset(info, 0, (char *)inode - (char *)info); spin_lock_init(&info->lock); + info->seals = F_SEAL_SEAL; info->flags = flags & VM_NORESERVE; INIT_LIST_HEAD(&info->swaplist); simple_xattrs_init(&info->xattrs); @@ -1470,7 +1478,17 @@ shmem_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { struct inode *inode = mapping->host; + struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t index = pos >> PAGE_CACHE_SHIFT; + + /* i_mutex is held by caller */ + if (unlikely(info->seals)) { + if (info->seals & F_SEAL_WRITE) + return -EPERM; + if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) + return -EPERM; + } + return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); } @@ -1808,11 +1826,125 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) return offset; } +static int shmem_wait_for_pins(struct address_space *mapping) +{ + return 0; +} + +#define F_ALL_SEALS (F_SEAL_SEAL | \ + F_SEAL_SHRINK | \ + F_SEAL_GROW | \ + F_SEAL_WRITE) + +int shmem_add_seals(struct file *file, unsigned int seals) +{ + struct inode *inode = file_inode(file); + struct shmem_inode_info *info = SHMEM_I(inode); + int error; + + /* + * SEALING + * Sealing allows multiple parties to share a shmem-file but restrict + * access to a specific subset of file operations. Seals can only be + * added, but never removed. This way, mutually untrusted parties can + * share common memory regions with a well-defined policy. A malicious + * peer can thus never perform unwanted operations on a shared object. + * + * Seals are only supported on special shmem-files and always affect + * the whole underlying inode. Once a seal is set, it may prevent some + * kinds of access to the file. Currently, the following seals are + * defined: + * SEAL_SEAL: Prevent further seals from being set on this file + * SEAL_SHRINK: Prevent the file from shrinking + * SEAL_GROW: Prevent the file from growing + * SEAL_WRITE: Prevent write access to the file + * + * As we don't require any trust relationship between two parties, we + * must prevent seals from being removed. Therefore, sealing a file + * only adds a given set of seals to the file, it never touches + * existing seals. Furthermore, the "setting seals"-operation can be + * sealed itself, which basically prevents any further seal from being + * added. + * + * Semantics of sealing are only defined on volatile files. Only + * anonymous shmem files support sealing. 
More importantly, seals are + * never written to disk. Therefore, there's no plan to support it on + * other file types. + */ + + if (file->f_op != &shmem_file_operations) + return -EINVAL; + if (!(file->f_mode & FMODE_WRITE)) + return -EPERM; + if (seals & ~(unsigned int)F_ALL_SEALS) + return -EINVAL; + + mutex_lock(&inode->i_mutex); + + if (info->seals & F_SEAL_SEAL) { + error = -EPERM; + goto unlock; + } + + if ((seals & F_SEAL_WRITE) && !(info->seals & F_SEAL_WRITE)) { + error = mapping_deny_writable(file->f_mapping); + if (error) + goto unlock; + + error = shmem_wait_for_pins(file->f_mapping); + if (error) { + mapping_allow_writable(file->f_mapping); + goto unlock; + } + } + + info->seals |= seals; + error = 0; + +unlock: + mutex_unlock(&inode->i_mutex); + return error; +} +EXPORT_SYMBOL_GPL(shmem_add_seals); + +int shmem_get_seals(struct file *file) +{ + if (file->f_op != &shmem_file_operations) + return -EINVAL; + + return SHMEM_I(file_inode(file))->seals; +} +EXPORT_SYMBOL_GPL(shmem_get_seals); + +long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg) +{ + long error; + + switch (cmd) { + case F_ADD_SEALS: + /* disallow upper 32bit */ + if (arg > UINT_MAX) + return -EINVAL; + + error = shmem_add_seals(file, arg); + break; + case F_GET_SEALS: + error = shmem_get_seals(file); + break; + default: + error = -EINVAL; + break; + } + + return error; +} + static long shmem_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_falloc shmem_falloc; pgoff_t start, index, end; int error; @@ -1828,6 +1960,12 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); + /* protected by i_mutex */ + if (info->seals & F_SEAL_WRITE) { + error = -EPERM; + goto out; + } + shmem_falloc.waitq = &shmem_falloc_waitq; shmem_falloc.start = unmap_start >> PAGE_SHIFT; shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; @@ -1854,6 +1992,11 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, if (error) goto out; + if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) { + error = -EPERM; + goto out; + } + start = offset >> PAGE_CACHE_SHIFT; end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; /* Try to avoid a swapstorm if len is impossible to satisfy */ -- cgit v1.2.3 From 9183df25fe7b194563db3fec6dc3202a5855839c Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Fri, 8 Aug 2014 14:25:29 -0700 Subject: shm: add memfd_create() syscall memfd_create() is similar to mmap(MAP_ANON), but returns a file-descriptor that you can pass to mmap(). It can support sealing and avoids any connection to user-visible mount-points. Thus, it's not subject to quotas on mounted file-systems, yet can be used like malloc()'ed memory that comes with a file-descriptor. memfd_create() returns the raw shmem file, so calls like ftruncate() can be used to modify the underlying inode. Also calls like fstat() will return proper information and mark the file as a regular file. If you want sealing, you can specify MFD_ALLOW_SEALING. Otherwise, sealing is not supported (like on all other regular files). Compared to O_TMPFILE, it does not require a tmpfs mount-point and is not subject to a filesystem size limit. It is still properly accounted to memcg limits, though, and to the same overcommit or no-overcommit accounting as all user memory.
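For illustration only (not part of the patch): a plausible call pattern combining memfd_create() with the sealing API from the previous commit. It assumes __NR_memfd_create and the MFD_*/F_SEAL_* constants are available, e.g. from the uapi headers added in this series.

    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/memfd.h>

    /* Build a sealed, immutable message that can safely be handed to an
     * untrusted peer. Returns the fd, or -1 on error. */
    static int create_sealed_message(const void *buf, size_t size)
    {
            int fd = syscall(__NR_memfd_create, "message",
                             MFD_CLOEXEC | MFD_ALLOW_SEALING);

            if (fd < 0)
                    return -1;
            if (ftruncate(fd, size) < 0 ||
                write(fd, buf, size) != (ssize_t)size ||
                fcntl(fd, F_ADD_SEALS, F_SEAL_SHRINK | F_SEAL_GROW |
                      F_SEAL_WRITE | F_SEAL_SEAL) < 0) {
                    close(fd);
                    return -1;
            }
            return fd;
    }

Note that F_SEAL_WRITE can only be added here because the creator holds no shared mappings of the file at that point.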
Signed-off-by: David Herrmann Acked-by: Hugh Dickins Cc: Michael Kerrisk Cc: Ryan Lortie Cc: Lennart Poettering Cc: Daniel Mack Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/syscalls/syscall_32.tbl | 1 + arch/x86/syscalls/syscall_64.tbl | 1 + include/linux/syscalls.h | 1 + include/uapi/linux/memfd.h | 8 +++++ kernel/sys_ni.c | 1 + mm/shmem.c | 73 ++++++++++++++++++++++++++++++++++++++++ 6 files changed, 85 insertions(+) create mode 100644 include/uapi/linux/memfd.h (limited to 'include/uapi/linux') diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index d1b4a119d4a5..028b78168d85 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -362,3 +362,4 @@ 353 i386 renameat2 sys_renameat2 354 i386 seccomp sys_seccomp 355 i386 getrandom sys_getrandom +356 i386 memfd_create sys_memfd_create diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 252c804bb1aa..ca2b9aa78c81 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -325,6 +325,7 @@ 316 common renameat2 sys_renameat2 317 common seccomp sys_seccomp 318 common getrandom sys_getrandom +319 common memfd_create sys_memfd_create # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 701daff5d899..15a069425cbf 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -802,6 +802,7 @@ asmlinkage long sys_timerfd_settime(int ufd, int flags, asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr); asmlinkage long sys_eventfd(unsigned int count); asmlinkage long sys_eventfd2(unsigned int count, int flags); +asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags); asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int); asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *, diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h new file mode 100644 index 000000000000..534e364bda92 --- /dev/null +++ b/include/uapi/linux/memfd.h @@ -0,0 +1,8 @@ +#ifndef _UAPI_LINUX_MEMFD_H +#define _UAPI_LINUX_MEMFD_H + +/* flags for memfd_create(2) (unsigned int) */ +#define MFD_CLOEXEC 0x0001U +#define MFD_ALLOW_SEALING 0x0002U + +#endif /* _UAPI_LINUX_MEMFD_H */ diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 2904a2105914..1f79e3714533 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -197,6 +197,7 @@ cond_syscall(compat_sys_timerfd_settime); cond_syscall(compat_sys_timerfd_gettime); cond_syscall(sys_eventfd); cond_syscall(sys_eventfd2); +cond_syscall(sys_memfd_create); /* performance counters: */ cond_syscall(sys_perf_event_open); diff --git a/mm/shmem.c b/mm/shmem.c index 8b43bb7a4efe..4a5498795a2b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -66,7 +66,9 @@ static struct vfsmount *shm_mnt; #include #include #include +#include #include +#include #include #include @@ -2732,6 +2734,77 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) shmem_show_mpol(seq, sbinfo->mpol); return 0; } + +#define MFD_NAME_PREFIX "memfd:" +#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) +#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) + +#define
MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING) + +SYSCALL_DEFINE2(memfd_create, + const char __user *, uname, + unsigned int, flags) +{ + struct shmem_inode_info *info; + struct file *file; + int fd, error; + char *name; + long len; + + if (flags & ~(unsigned int)MFD_ALL_FLAGS) + return -EINVAL; + + /* length includes terminating zero */ + len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); + if (len <= 0) + return -EFAULT; + if (len > MFD_NAME_MAX_LEN + 1) + return -EINVAL; + + name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_TEMPORARY); + if (!name) + return -ENOMEM; + + strcpy(name, MFD_NAME_PREFIX); + if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) { + error = -EFAULT; + goto err_name; + } + + /* terminating-zero may have changed after strnlen_user() returned */ + if (name[len + MFD_NAME_PREFIX_LEN - 1]) { + error = -EFAULT; + goto err_name; + } + + fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); + if (fd < 0) { + error = fd; + goto err_name; + } + + file = shmem_file_setup(name, 0, VM_NORESERVE); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto err_fd; + } + info = SHMEM_I(file_inode(file)); + file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; + file->f_flags |= O_RDWR | O_LARGEFILE; + if (flags & MFD_ALLOW_SEALING) + info->seals &= ~F_SEAL_SEAL; + + fd_install(fd, file); + kfree(name); + return fd; + +err_fd: + put_unused_fd(fd); +err_name: + kfree(name); + return error; +} + #endif /* CONFIG_TMPFS */ static void shmem_put_super(struct super_block *sb) -- cgit v1.2.3 From cb1052581e2bddd6096544f3f944f4e7fdad4c7f Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 14:25:57 -0700 Subject: kexec: implementation of new syscall kexec_file_load The previous patch provided the interface definition and this patch provides the implementation of the new syscall. Previously the segment list was prepared in user space. Now user space just passes a kernel fd, an initrd fd and a command line, and the kernel creates the segment list internally. This patch contains the generic part of the code. Actual segment preparation and loading is done by the arch- and image-specific loader, which comes in the next patch. [akpm@linux-foundation.org: coding-style fixes]
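An illustrative user-space sketch of the resulting interface (not part of the patch; it assumes __NR_kexec_file_load is defined for the target architecture and that the caller has CAP_SYS_BOOT):

    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    /* Load a kernel plus initrd for a later kexec reboot. */
    static long load_kexec_image(const char *kernel, const char *initrd,
                                 const char *cmdline)
    {
            int kernel_fd, initrd_fd;
            long ret;

            kernel_fd = open(kernel, O_RDONLY);
            if (kernel_fd < 0)
                    return -1;
            initrd_fd = open(initrd, O_RDONLY);
            if (initrd_fd < 0) {
                    close(kernel_fd);
                    return -1;
            }

            /* cmdline_len must cover the terminating NUL */
            ret = syscall(__NR_kexec_file_load, kernel_fd, initrd_fd,
                          strlen(cmdline) + 1, cmdline, 0UL);

            close(initrd_fd);
            close(kernel_fd);
            return ret;
    }

The kernel reads and validates both files itself; with KEXEC_FILE_NO_INITRAMFS set in flags, the initrd fd is ignored.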
Signed-off-by: Vivek Goyal Cc: Borislav Petkov Cc: Michael Kerrisk Cc: Yinghai Lu Cc: Eric Biederman Cc: H. Peter Anvin Cc: Matthew Garrett Cc: Greg Kroah-Hartman Cc: Dave Young Cc: WANG Chao Cc: Baoquan He Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/machine_kexec_64.c | 45 ++++ include/linux/kexec.h | 53 ++++ include/uapi/linux/kexec.h | 11 + kernel/kexec.c | 483 ++++++++++++++++++++++++++++++++++++- 4 files changed, 587 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 679cef0791cd..c8875b5545e1 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -22,6 +22,10 @@ #include #include +static struct kexec_file_ops *kexec_file_loaders[] = { + NULL, +}; + static void free_transition_pgtable(struct kimage *image) { free_page((unsigned long)image->arch.pud); @@ -283,3 +287,44 @@ void arch_crash_save_vmcoreinfo(void) (unsigned long)&_text - __START_KERNEL); } +/* arch-dependent functionality related to kexec file-based syscall */ + +int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, + unsigned long buf_len) +{ + int i, ret = -ENOEXEC; + struct kexec_file_ops *fops; + + for (i = 0; i < ARRAY_SIZE(kexec_file_loaders); i++) { + fops = kexec_file_loaders[i]; + if (!fops || !fops->probe) + continue; + + ret = fops->probe(buf, buf_len); + if (!ret) { + image->fops = fops; + return ret; + } + } + + return ret; +} + +void *arch_kexec_kernel_image_load(struct kimage *image) +{ + if (!image->fops || !image->fops->load) + return ERR_PTR(-ENOEXEC); + + return image->fops->load(image, image->kernel_buf, + image->kernel_buf_len, image->initrd_buf, + image->initrd_buf_len, image->cmdline_buf, + image->cmdline_buf_len); +} + +int arch_kimage_file_post_load_cleanup(struct kimage *image) +{ + if (!image->fops || !image->fops->cleanup) + return 0; + + return image->fops->cleanup(image); +} diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 66d56ac0f64c..8e80901e466f 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -121,13 +121,57 @@ struct kimage { #define KEXEC_TYPE_DEFAULT 0 #define KEXEC_TYPE_CRASH 1 unsigned int preserve_context : 1; + /* If set, we are using file mode kexec syscall */ + unsigned int file_mode:1; #ifdef ARCH_HAS_KIMAGE_ARCH struct kimage_arch arch; #endif + + /* Additional fields for file based kexec syscall */ + void *kernel_buf; + unsigned long kernel_buf_len; + + void *initrd_buf; + unsigned long initrd_buf_len; + + char *cmdline_buf; + unsigned long cmdline_buf_len; + + /* File operations provided by image loader */ + struct kexec_file_ops *fops; + + /* Image loader handling the kernel can store a pointer here */ + void *image_loader_data; }; +/* + * Keeps track of buffer parameters as provided by caller for requesting + memory placement of buffer.
+ */ +struct kexec_buf { + struct kimage *image; + char *buffer; + unsigned long bufsz; + unsigned long memsz; + unsigned long buf_align; + unsigned long buf_min; + unsigned long buf_max; + bool top_down; /* allocate from top of memory hole */ +}; +typedef int (kexec_probe_t)(const char *kernel_buf, unsigned long kernel_size); +typedef void *(kexec_load_t)(struct kimage *image, char *kernel_buf, + unsigned long kernel_len, char *initrd, + unsigned long initrd_len, char *cmdline, + unsigned long cmdline_len); +typedef int (kexec_cleanup_t)(struct kimage *image); + +struct kexec_file_ops { + kexec_probe_t *probe; + kexec_load_t *load; + kexec_cleanup_t *cleanup; +}; /* kexec interface functions */ extern void machine_kexec(struct kimage *image); @@ -138,6 +182,11 @@ extern asmlinkage long sys_kexec_load(unsigned long entry, struct kexec_segment __user *segments, unsigned long flags); extern int kernel_kexec(void); +extern int kexec_add_buffer(struct kimage *image, char *buffer, + unsigned long bufsz, unsigned long memsz, + unsigned long buf_align, unsigned long buf_min, + unsigned long buf_max, bool top_down, + unsigned long *load_addr); extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order); extern void crash_kexec(struct pt_regs *); @@ -188,6 +237,10 @@ extern int kexec_load_disabled; #define KEXEC_FLAGS (KEXEC_ON_CRASH | KEXEC_PRESERVE_CONTEXT) #endif +/* List of defined/legal kexec file flags */ +#define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \ + KEXEC_FILE_NO_INITRAMFS) + #define VMCOREINFO_BYTES (4096) #define VMCOREINFO_NOTE_NAME "VMCOREINFO" #define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4) diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h index d6629d49a243..6925f5b42f89 100644 --- a/include/uapi/linux/kexec.h +++ b/include/uapi/linux/kexec.h @@ -13,6 +13,17 @@ #define KEXEC_PRESERVE_CONTEXT 0x00000002 #define KEXEC_ARCH_MASK 0xffff0000 +/* + * Kexec file load interface flags. + * KEXEC_FILE_UNLOAD : Unload already loaded kexec/kdump image. + * KEXEC_FILE_ON_CRASH : Load/unload operation belongs to kdump image. + * KEXEC_FILE_NO_INITRAMFS : No initramfs is being loaded. Ignore the initrd + * fd field. + */ +#define KEXEC_FILE_UNLOAD 0x00000001 +#define KEXEC_FILE_ON_CRASH 0x00000002 +#define KEXEC_FILE_NO_INITRAMFS 0x00000004 + /* These values match the ELF architecture values. * Unless there is a good reason that should continue to be the case. */ diff --git a/kernel/kexec.c b/kernel/kexec.c index ec4386c1b94f..9b46219254dd 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -6,6 +6,8 @@ * Version 2. See the file COPYING for more details. */ +#define pr_fmt(fmt) "kexec: " fmt + #include #include #include @@ -327,6 +329,221 @@ out_free_image: return ret; } +static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len) +{ + struct fd f = fdget(fd); + int ret; + struct kstat stat; + loff_t pos; + ssize_t bytes = 0; + + if (!f.file) + return -EBADF; + + ret = vfs_getattr(&f.file->f_path, &stat); + if (ret) + goto out; + + if (stat.size > INT_MAX) { + ret = -EFBIG; + goto out; + } + + /* Don't hand 0 to vmalloc, it whines. 
*/ + if (stat.size == 0) { + ret = -EINVAL; + goto out; + } + + *buf = vmalloc(stat.size); + if (!*buf) { + ret = -ENOMEM; + goto out; + } + + pos = 0; + while (pos < stat.size) { + bytes = kernel_read(f.file, pos, (char *)(*buf) + pos, + stat.size - pos); + if (bytes < 0) { + vfree(*buf); + ret = bytes; + goto out; + } + + if (bytes == 0) + break; + pos += bytes; + } + + if (pos != stat.size) { + ret = -EBADF; + vfree(*buf); + goto out; + } + + *buf_len = pos; +out: + fdput(f); + return ret; +} + +/* Architectures can provide this probe function */ +int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, + unsigned long buf_len) +{ + return -ENOEXEC; +} + +void * __weak arch_kexec_kernel_image_load(struct kimage *image) +{ + return ERR_PTR(-ENOEXEC); +} + +void __weak arch_kimage_file_post_load_cleanup(struct kimage *image) +{ +} + +/* + * Free up memory used by kernel, initrd, and command line. This is temporary + * memory allocation which is not needed any more after these buffers have + * been loaded into separate segments and have been copied elsewhere. + */ +static void kimage_file_post_load_cleanup(struct kimage *image) +{ + vfree(image->kernel_buf); + image->kernel_buf = NULL; + + vfree(image->initrd_buf); + image->initrd_buf = NULL; + + kfree(image->cmdline_buf); + image->cmdline_buf = NULL; + + /* See if architecture has anything to cleanup post load */ + arch_kimage_file_post_load_cleanup(image); +} + +/* + * In file mode list of segments is prepared by kernel. Copy relevant + * data from user space, do error checking, prepare segment list + */ +static int +kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, + const char __user *cmdline_ptr, + unsigned long cmdline_len, unsigned flags) +{ + int ret = 0; + void *ldata; + + ret = copy_file_from_fd(kernel_fd, &image->kernel_buf, + &image->kernel_buf_len); + if (ret) + return ret; + + /* Call arch image probe handlers */ + ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, + image->kernel_buf_len); + + if (ret) + goto out; + + /* It is possible that no initramfs is being loaded */ + if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { + ret = copy_file_from_fd(initrd_fd, &image->initrd_buf, + &image->initrd_buf_len); + if (ret) + goto out; + } + + if (cmdline_len) { + image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL); + if (!image->cmdline_buf) { + ret = -ENOMEM; + goto out; + } + + ret = copy_from_user(image->cmdline_buf, cmdline_ptr, + cmdline_len); + if (ret) { + ret = -EFAULT; + goto out; + } + + image->cmdline_buf_len = cmdline_len; + + /* command line should be a string with last byte null */ + if (image->cmdline_buf[cmdline_len - 1] != '\0') { + ret = -EINVAL; + goto out; + } + } + + /* Call arch image load handlers */ + ldata = arch_kexec_kernel_image_load(image); + + if (IS_ERR(ldata)) { + ret = PTR_ERR(ldata); + goto out; + } + + image->image_loader_data = ldata; +out: + /* In case of error, free up all allocated memory in this function */ + if (ret) + kimage_file_post_load_cleanup(image); + return ret; +} + +static int +kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, + int initrd_fd, const char __user *cmdline_ptr, + unsigned long cmdline_len, unsigned long flags) +{ + int ret; + struct kimage *image; + + image = do_kimage_alloc_init(); + if (!image) + return -ENOMEM; + + image->file_mode = 1; + + ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd, + cmdline_ptr, cmdline_len, flags); + if (ret) + goto out_free_image; + + ret =
sanity_check_segment_list(image); + if (ret) + goto out_free_post_load_bufs; + + ret = -ENOMEM; + image->control_code_page = kimage_alloc_control_pages(image, + get_order(KEXEC_CONTROL_PAGE_SIZE)); + if (!image->control_code_page) { + pr_err("Could not allocate control_code_buffer\n"); + goto out_free_post_load_bufs; + } + + image->swap_page = kimage_alloc_control_pages(image, 0); + if (!image->swap_page) { + pr_err(KERN_ERR "Could not allocate swap buffer\n"); + goto out_free_control_pages; + } + + *rimage = image; + return 0; +out_free_control_pages: + kimage_free_page_list(&image->control_pages); +out_free_post_load_bufs: + kimage_file_post_load_cleanup(image); + kfree(image->image_loader_data); +out_free_image: + kfree(image); + return ret; +} + static int kimage_is_destination_range(struct kimage *image, unsigned long start, unsigned long end) @@ -644,6 +861,16 @@ static void kimage_free(struct kimage *image) /* Free the kexec control pages... */ kimage_free_page_list(&image->control_pages); + + kfree(image->image_loader_data); + + /* + * Free up any temporary buffers allocated. This might hit if + * error occurred much later after buffer allocation. + */ + if (image->file_mode) + kimage_file_post_load_cleanup(image); + kfree(image); } @@ -772,10 +999,14 @@ static int kimage_load_normal_segment(struct kimage *image, unsigned long maddr; size_t ubytes, mbytes; int result; - unsigned char __user *buf; + unsigned char __user *buf = NULL; + unsigned char *kbuf = NULL; result = 0; - buf = segment->buf; + if (image->file_mode) + kbuf = segment->kbuf; + else + buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; maddr = segment->mem; @@ -807,7 +1038,11 @@ static int kimage_load_normal_segment(struct kimage *image, PAGE_SIZE - (maddr & ~PAGE_MASK)); uchunk = min(ubytes, mchunk); - result = copy_from_user(ptr, buf, uchunk); + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); kunmap(page); if (result) { result = -EFAULT; @@ -815,7 +1050,10 @@ static int kimage_load_normal_segment(struct kimage *image, } ubytes -= uchunk; maddr += mchunk; - buf += mchunk; + if (image->file_mode) + kbuf += mchunk; + else + buf += mchunk; mbytes -= mchunk; } out: @@ -1062,7 +1300,72 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, unsigned long, cmdline_len, const char __user *, cmdline_ptr, unsigned long, flags) { - return -ENOSYS; + int ret = 0, i; + struct kimage **dest_image, *image; + + /* We only trust the superuser with rebooting the system. */ + if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) + return -EPERM; + + /* Make sure we have a legal set of flags */ + if (flags != (flags & KEXEC_FILE_FLAGS)) + return -EINVAL; + + image = NULL; + + if (!mutex_trylock(&kexec_mutex)) + return -EBUSY; + + dest_image = &kexec_image; + if (flags & KEXEC_FILE_ON_CRASH) + dest_image = &kexec_crash_image; + + if (flags & KEXEC_FILE_UNLOAD) + goto exchange; + + /* + * In case of crash, new kernel gets loaded in reserved region. It is + * same memory where old crash kernel might be loaded. Free any + * current crash dump kernel before we corrupt it. 
+ */ + if (flags & KEXEC_FILE_ON_CRASH) + kimage_free(xchg(&kexec_crash_image, NULL)); + + ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr, + cmdline_len, flags); + if (ret) + goto out; + + ret = machine_kexec_prepare(image); + if (ret) + goto out; + + for (i = 0; i < image->nr_segments; i++) { + struct kexec_segment *ksegment; + + ksegment = &image->segment[i]; + pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n", + i, ksegment->buf, ksegment->bufsz, ksegment->mem, + ksegment->memsz); + + ret = kimage_load_segment(image, &image->segment[i]); + if (ret) + goto out; + } + + kimage_terminate(image); + + /* + * Free up any temporary buffers allocated which are not needed + * after image has been loaded + */ + kimage_file_post_load_cleanup(image); +exchange: + image = xchg(dest_image, image); +out: + mutex_unlock(&kexec_mutex); + kimage_free(image); + return ret; } void crash_kexec(struct pt_regs *regs) @@ -1620,6 +1923,176 @@ static int __init crash_save_vmcoreinfo_init(void) subsys_initcall(crash_save_vmcoreinfo_init); +static int __kexec_add_segment(struct kimage *image, char *buf, + unsigned long bufsz, unsigned long mem, + unsigned long memsz) +{ + struct kexec_segment *ksegment; + + ksegment = &image->segment[image->nr_segments]; + ksegment->kbuf = buf; + ksegment->bufsz = bufsz; + ksegment->mem = mem; + ksegment->memsz = memsz; + image->nr_segments++; + + return 0; +} + +static int locate_mem_hole_top_down(unsigned long start, unsigned long end, + struct kexec_buf *kbuf) +{ + struct kimage *image = kbuf->image; + unsigned long temp_start, temp_end; + + temp_end = min(end, kbuf->buf_max); + temp_start = temp_end - kbuf->memsz; + + do { + /* align down start */ + temp_start = temp_start & (~(kbuf->buf_align - 1)); + + if (temp_start < start || temp_start < kbuf->buf_min) + return 0; + + temp_end = temp_start + kbuf->memsz - 1; + + /* + * Make sure this does not conflict with any of existing + * segments + */ + if (kimage_is_destination_range(image, temp_start, temp_end)) { + temp_start = temp_start - PAGE_SIZE; + continue; + } + + /* We found a suitable memory range */ + break; + } while (1); + + /* If we are here, we found a suitable memory range */ + __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start, + kbuf->memsz); + + /* Success, stop navigating through remaining System RAM ranges */ + return 1; +} + +static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, + struct kexec_buf *kbuf) +{ + struct kimage *image = kbuf->image; + unsigned long temp_start, temp_end; + + temp_start = max(start, kbuf->buf_min); + + do { + temp_start = ALIGN(temp_start, kbuf->buf_align); + temp_end = temp_start + kbuf->memsz - 1; + + if (temp_end > end || temp_end > kbuf->buf_max) + return 0; + /* + * Make sure this does not conflict with any of existing + * segments + */ + if (kimage_is_destination_range(image, temp_start, temp_end)) { + temp_start = temp_start + PAGE_SIZE; + continue; + } + + /* We found a suitable memory range */ + break; + } while (1); + + /* If we are here, we found a suitable memory range */ + __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start, + kbuf->memsz); + + /* Success, stop navigating through remaining System RAM ranges */ + return 1; +} + +static int locate_mem_hole_callback(u64 start, u64 end, void *arg) +{ + struct kexec_buf *kbuf = (struct kexec_buf *)arg; + unsigned long sz = end - start + 1; + + /* Returning 0 will take to next memory range */ + if (sz < kbuf->memsz) + return 0; 
+ + if (end < kbuf->buf_min || start > kbuf->buf_max) + return 0; + + /* + * Allocate memory top down with-in ram range. Otherwise bottom up + * allocation. + */ + if (kbuf->top_down) + return locate_mem_hole_top_down(start, end, kbuf); + return locate_mem_hole_bottom_up(start, end, kbuf); +} + +/* + * Helper function for placing a buffer in a kexec segment. This assumes + * that kexec_mutex is held. + */ +int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, + unsigned long memsz, unsigned long buf_align, + unsigned long buf_min, unsigned long buf_max, + bool top_down, unsigned long *load_addr) +{ + + struct kexec_segment *ksegment; + struct kexec_buf buf, *kbuf; + int ret; + + /* Currently adding segment this way is allowed only in file mode */ + if (!image->file_mode) + return -EINVAL; + + if (image->nr_segments >= KEXEC_SEGMENT_MAX) + return -EINVAL; + + /* + * Make sure we are not trying to add buffer after allocating + * control pages. All segments need to be placed first before + * any control pages are allocated. As control page allocation + * logic goes through list of segments to make sure there are + * no destination overlaps. + */ + if (!list_empty(&image->control_pages)) { + WARN_ON(1); + return -EINVAL; + } + + memset(&buf, 0, sizeof(struct kexec_buf)); + kbuf = &buf; + kbuf->image = image; + kbuf->buffer = buffer; + kbuf->bufsz = bufsz; + + kbuf->memsz = ALIGN(memsz, PAGE_SIZE); + kbuf->buf_align = max(buf_align, PAGE_SIZE); + kbuf->buf_min = buf_min; + kbuf->buf_max = buf_max; + kbuf->top_down = top_down; + + /* Walk the RAM ranges and allocate a suitable range for the buffer */ + ret = walk_system_ram_res(0, -1, kbuf, locate_mem_hole_callback); + if (ret != 1) { + /* A suitable memory range could not be found for buffer */ + return -EADDRNOTAVAIL; + } + + /* Found a suitable memory range */ + ksegment = &image->segment[image->nr_segments - 1]; + *load_addr = ksegment->mem; + return 0; +} + + /* * Move into place and start executing a preloaded standalone * executable. If nothing was preloaded return an error. -- cgit v1.2.3 From fa71f32b5de2be1644ee671ddbe211d79be7847f Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Tue, 29 Jul 2014 17:14:21 +0200 Subject: HID: uhid: add ABI compatible UHID_GET_REPORT replacing UHID_FEATURE The old hdev->hid_get_raw_report() was broken by design. It was never clear what kind of HW request it should trigger. Benjamin fixed that with the core HID cleanup, though we never really adjusted uhid. Unfortunately, our old UHID_FEATURE command was modelled around the broken hid_get_raw_report(). We converted it silently to the new GET_REPORT and nothing broke. Make this explicit by renaming UHID_FEATURE to UHID_GET_REPORT and UHID_FEATURE_ANSWER to UHID_GET_REPORT_REPLY. Note that this is 100% ABI compatible with UHID_FEATURE. This is just a rename. But we have to keep the old definitions around to not break the API. From now on, UHID_GET_REPORT must trigger a GET_REPORT request on the user-space hardware layer. All the ambiguity due to the weird "feature" name should be gone now.
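To make the renamed flow concrete, a hypothetical user-space transport could service the request as sketched below; fetch_report() is a made-up stand-in for the transport's real GET_REPORT request to the hardware, everything else uses the uapi structures as renamed here.

    #include <errno.h>
    #include <unistd.h>
    #include <linux/uhid.h>

    extern int fetch_report(__u8 rnum, __u8 rtype,
                            __u8 *buf, size_t len); /* hypothetical */

    static void handle_get_report(int uhid_fd, const struct uhid_event *ev)
    {
            struct uhid_event reply = { .type = UHID_GET_REPORT_REPLY };
            int len;

            /* answer with the same id so the kernel can match the request */
            reply.u.get_report_reply.id = ev->u.get_report.id;

            len = fetch_report(ev->u.get_report.rnum, ev->u.get_report.rtype,
                               reply.u.get_report_reply.data, UHID_DATA_MAX);
            if (len < 0) {
                    reply.u.get_report_reply.err = EIO;
            } else {
                    reply.u.get_report_reply.err = 0;
                    reply.u.get_report_reply.size = len;
            }

            write(uhid_fd, &reply, sizeof(reply));
    }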
Signed-off-by: David Herrmann Signed-off-by: Jiri Kosina --- drivers/hid/uhid.c | 28 ++++++++++++++-------------- include/uapi/linux/uhid.h | 23 +++++++++++++++++++++-- 2 files changed, 35 insertions(+), 16 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/hid/uhid.c b/drivers/hid/uhid.c index 2d2025a027fe..8f5e46b1bb46 100644 --- a/drivers/hid/uhid.c +++ b/drivers/hid/uhid.c @@ -124,8 +124,8 @@ static int uhid_hid_parse(struct hid_device *hid) return hid_parse_report(hid, uhid->rd_data, uhid->rd_size); } -static int uhid_hid_get_raw(struct hid_device *hid, unsigned char rnum, - __u8 *buf, size_t count, unsigned char rtype) +static int uhid_hid_get_report(struct hid_device *hid, unsigned char rnum, + __u8 *buf, size_t count, unsigned char rtype) { struct uhid_device *uhid = hid->driver_data; __u8 report_type; @@ -133,7 +133,7 @@ static int uhid_hid_get_raw(struct hid_device *hid, unsigned char rnum, unsigned long flags; int ret; size_t uninitialized_var(len); - struct uhid_feature_answer_req *req; + struct uhid_get_report_reply_req *req; if (!uhid->running) return -EIO; @@ -163,10 +163,10 @@ static int uhid_hid_get_raw(struct hid_device *hid, unsigned char rnum, } spin_lock_irqsave(&uhid->qlock, flags); - ev->type = UHID_FEATURE; - ev->u.feature.id = ++uhid->report_id; - ev->u.feature.rnum = rnum; - ev->u.feature.rtype = report_type; + ev->type = UHID_GET_REPORT; + ev->u.get_report.id = ++uhid->report_id; + ev->u.get_report.rnum = rnum; + ev->u.get_report.rtype = report_type; uhid->report_running = true; uhid_queue(uhid, ev); @@ -182,7 +182,7 @@ static int uhid_hid_get_raw(struct hid_device *hid, unsigned char rnum, ret = -ERESTARTSYS; } else { spin_lock_irqsave(&uhid->qlock, flags); - req = &uhid->report_buf.u.feature_answer; + req = &uhid->report_buf.u.get_report_reply; if (req->err) { ret = -EIO; @@ -253,7 +253,7 @@ static int uhid_raw_request(struct hid_device *hid, unsigned char reportnum, { switch (reqtype) { case HID_REQ_GET_REPORT: - return uhid_hid_get_raw(hid, reportnum, buf, len, rtype); + return uhid_hid_get_report(hid, reportnum, buf, len, rtype); case HID_REQ_SET_REPORT: /* TODO: implement proper SET_REPORT functionality */ return -ENOSYS; @@ -487,8 +487,8 @@ static int uhid_dev_input2(struct uhid_device *uhid, struct uhid_event *ev) return 0; } -static int uhid_dev_feature_answer(struct uhid_device *uhid, - struct uhid_event *ev) +static int uhid_dev_get_report_reply(struct uhid_device *uhid, + struct uhid_event *ev) { unsigned long flags; @@ -498,7 +498,7 @@ static int uhid_dev_feature_answer(struct uhid_device *uhid, spin_lock_irqsave(&uhid->qlock, flags); /* id for old report; drop it silently */ - if (uhid->report_id != ev->u.feature_answer.id) + if (uhid->report_id != ev->u.get_report_reply.id) goto unlock; if (!uhid->report_running) goto unlock; @@ -634,8 +634,8 @@ static ssize_t uhid_char_write(struct file *file, const char __user *buffer, case UHID_INPUT2: ret = uhid_dev_input2(uhid, &uhid->input_buf); break; - case UHID_FEATURE_ANSWER: - ret = uhid_dev_feature_answer(uhid, &uhid->input_buf); + case UHID_GET_REPORT_REPLY: + ret = uhid_dev_get_report_reply(uhid, &uhid->input_buf); break; default: ret = -EOPNOTSUPP; diff --git a/include/uapi/linux/uhid.h b/include/uapi/linux/uhid.h index 1e3b09c191cd..0a08f2bbe642 100644 --- a/include/uapi/linux/uhid.h +++ b/include/uapi/linux/uhid.h @@ -33,8 +33,10 @@ enum uhid_event_type { UHID_OUTPUT, UHID_OUTPUT_EV, /* obsolete! */ UHID_INPUT, - UHID_FEATURE, - UHID_FEATURE_ANSWER, + UHID_FEATURE, /* obsolete! 
use UHID_GET_REPORT */ + UHID_GET_REPORT = UHID_FEATURE, + UHID_FEATURE_ANSWER, /* obsolete! use UHID_GET_REPORT_REPLY */ + UHID_GET_REPORT_REPLY = UHID_FEATURE_ANSWER, UHID_CREATE2, UHID_INPUT2, }; @@ -98,12 +100,20 @@ struct uhid_output_ev_req { __s32 value; } __attribute__((__packed__)); +/* Obsolete! Kernel uses ABI compatible UHID_GET_REPORT. */ struct uhid_feature_req { __u32 id; __u8 rnum; __u8 rtype; } __attribute__((__packed__)); +struct uhid_get_report_req { + __u32 id; + __u8 rnum; + __u8 rtype; +} __attribute__((__packed__)); + +/* Obsolete! Use ABI compatible UHID_GET_REPORT_REPLY. */ struct uhid_feature_answer_req { __u32 id; __u16 err; @@ -111,6 +121,13 @@ struct uhid_feature_answer_req { __u8 data[UHID_DATA_MAX]; } __attribute__((__packed__)); +struct uhid_get_report_reply_req { + __u32 id; + __u16 err; + __u16 size; + __u8 data[UHID_DATA_MAX]; +} __attribute__((__packed__)); + struct uhid_event { __u32 type; @@ -120,7 +137,9 @@ struct uhid_event { struct uhid_output_req output; struct uhid_output_ev_req output_ev; struct uhid_feature_req feature; + struct uhid_get_report_req get_report; struct uhid_feature_answer_req feature_answer; + struct uhid_get_report_reply_req get_report_reply; struct uhid_create2_req create2; struct uhid_input2_req input2; } u; -- cgit v1.2.3 From 50598e7055d0d8610732e7eb2c84cbc3bc7db294 Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Tue, 29 Jul 2014 17:14:22 +0200 Subject: HID: uhid: keep legacy definitions at the bottom of uhid.h Instead of inlining the legacy definitions into the main part of uhid.h, keep them at the bottom now. This way, the API is much easier to read and legacy requests can be looked up at a separate place. Signed-off-by: David Herrmann Signed-off-by: Jiri Kosina --- include/uapi/linux/uhid.h | 101 ++++++++++++++++++++++++++++------------------ 1 file changed, 62 insertions(+), 39 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/uhid.h b/include/uapi/linux/uhid.h index 0a08f2bbe642..116536eeae62 100644 --- a/include/uapi/linux/uhid.h +++ b/include/uapi/linux/uhid.h @@ -24,37 +24,21 @@ #include enum uhid_event_type { - UHID_CREATE, + __UHID_LEGACY_CREATE, UHID_DESTROY, UHID_START, UHID_STOP, UHID_OPEN, UHID_CLOSE, UHID_OUTPUT, - UHID_OUTPUT_EV, /* obsolete! */ - UHID_INPUT, - UHID_FEATURE, /* obsolete! use UHID_GET_REPORT */ - UHID_GET_REPORT = UHID_FEATURE, - UHID_FEATURE_ANSWER, /* obsolete! 
use UHID_GET_REPORT_REPLY */ - UHID_GET_REPORT_REPLY = UHID_FEATURE_ANSWER, + __UHID_LEGACY_OUTPUT_EV, + __UHID_LEGACY_INPUT, + UHID_GET_REPORT, + UHID_GET_REPORT_REPLY, UHID_CREATE2, UHID_INPUT2, }; -struct uhid_create_req { - __u8 name[128]; - __u8 phys[64]; - __u8 uniq[64]; - __u8 __user *rd_data; - __u16 rd_size; - - __u16 bus; - __u32 vendor; - __u32 product; - __u32 version; - __u32 country; -} __attribute__((__packed__)); - struct uhid_create2_req { __u8 name[128]; __u8 phys[64]; @@ -76,24 +60,67 @@ enum uhid_report_type { UHID_INPUT_REPORT, }; -struct uhid_input_req { +struct uhid_input2_req { + __u16 size; + __u8 data[UHID_DATA_MAX]; +} __attribute__((__packed__)); + +struct uhid_output_req { __u8 data[UHID_DATA_MAX]; __u16 size; + __u8 rtype; } __attribute__((__packed__)); -struct uhid_input2_req { +struct uhid_get_report_req { + __u32 id; + __u8 rnum; + __u8 rtype; +} __attribute__((__packed__)); + +struct uhid_get_report_reply_req { + __u32 id; + __u16 err; __u16 size; __u8 data[UHID_DATA_MAX]; } __attribute__((__packed__)); -struct uhid_output_req { +/* + * Compat Layer + * All these commands and requests are obsolete. You should avoid using them in + * new code. We support them for backwards-compatibility, but you might not get + * access to new feature in case you use them. + */ + +enum uhid_legacy_event_type { + UHID_CREATE = __UHID_LEGACY_CREATE, + UHID_OUTPUT_EV = __UHID_LEGACY_OUTPUT_EV, + UHID_INPUT = __UHID_LEGACY_INPUT, + UHID_FEATURE = UHID_GET_REPORT, + UHID_FEATURE_ANSWER = UHID_GET_REPORT_REPLY, +}; + +/* Obsolete! Use UHID_CREATE2. */ +struct uhid_create_req { + __u8 name[128]; + __u8 phys[64]; + __u8 uniq[64]; + __u8 __user *rd_data; + __u16 rd_size; + + __u16 bus; + __u32 vendor; + __u32 product; + __u32 version; + __u32 country; +} __attribute__((__packed__)); + +/* Obsolete! Use UHID_INPUT2. */ +struct uhid_input_req { __u8 data[UHID_DATA_MAX]; __u16 size; - __u8 rtype; } __attribute__((__packed__)); -/* Obsolete! Newer kernels will no longer send these events but instead convert - * it into raw output reports via UHID_OUTPUT. */ +/* Obsolete! Kernel uses UHID_OUTPUT exclusively now. */ struct uhid_output_ev_req { __u16 type; __u16 code; @@ -107,12 +134,6 @@ struct uhid_feature_req { __u8 rtype; } __attribute__((__packed__)); -struct uhid_get_report_req { - __u32 id; - __u8 rnum; - __u8 rtype; -} __attribute__((__packed__)); - /* Obsolete! Use ABI compatible UHID_GET_REPORT_REPLY. */ struct uhid_feature_answer_req { __u32 id; @@ -121,12 +142,14 @@ struct uhid_feature_answer_req { __u8 data[UHID_DATA_MAX]; } __attribute__((__packed__)); -struct uhid_get_report_reply_req { - __u32 id; - __u16 err; - __u16 size; - __u8 data[UHID_DATA_MAX]; -} __attribute__((__packed__)); +/* + * UHID Events + * All UHID events from and to the kernel are encoded as "struct uhid_event". + * The "type" field contains a UHID_* type identifier. All payload depends on + * that type and can be accessed via ev->u.XYZ accordingly. + * If user-space writes short events, they're extended with 0s by the kernel. If + * the kernel writes short events, user-space shall extend them with 0s. + */ struct uhid_event { __u32 type; -- cgit v1.2.3 From 11c221553080408b203a00b91ad5f647dfb218d1 Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Tue, 29 Jul 2014 17:14:24 +0200 Subject: HID: uhid: implement SET_REPORT We so far lacked support for hid_hw_raw_request(..., HID_REQ_SET_REPORT); Add support for it and simply forward the request to user-space. 
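A hypothetical user-space sketch of servicing the new request (deliver_set_report() is a made-up stand-in for the transport's real SET_REPORT request to the hardware; the structures are the ones added below):

    #include <errno.h>
    #include <unistd.h>
    #include <linux/uhid.h>

    extern int deliver_set_report(__u8 rnum, __u8 rtype,
                                  const __u8 *buf, size_t len); /* hypothetical */

    static void handle_set_report(int uhid_fd, const struct uhid_event *ev)
    {
            struct uhid_event reply = { .type = UHID_SET_REPORT_REPLY };
            int ret;

            ret = deliver_set_report(ev->u.set_report.rnum,
                                     ev->u.set_report.rtype,
                                     ev->u.set_report.data,
                                     ev->u.set_report.size);

            /* ACK with the same id; there is no payload besides the error */
            reply.u.set_report_reply.id = ev->u.set_report.id;
            reply.u.set_report_reply.err = ret < 0 ? EIO : 0;
            write(uhid_fd, &reply, sizeof(reply));
    }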
Note that SET_REPORT is synchronous, just like GET_REPORT, even though it does not provide any data back besides an error code. If a transport layer does SET_REPORT asynchronously, it can just ACK the request immediately by writing an uhid_set_report_reply to uhid. This patch re-uses the synchronous uhid-report infrastructure to query user-space. Note that this means you cannot run SET_REPORT and GET_REPORT in parallel. However, that has always been a restriction of HID and due to its blocking nature, this is just fine. Maybe some future transport layer supports parallel requests (very unlikely); however, until then let's not over-complicate things and avoid request-lookup-tables. Signed-off-by: David Herrmann Signed-off-by: Jiri Kosina --- drivers/hid/uhid.c | 206 +++++++++++++++++++++++++++++++--------------- include/uapi/linux/uhid.h | 17 ++++ 2 files changed, 155 insertions(+), 68 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/hid/uhid.c b/drivers/hid/uhid.c index 8bf613e3783d..19511481a7d3 100644 --- a/drivers/hid/uhid.c +++ b/drivers/hid/uhid.c @@ -49,6 +49,7 @@ struct uhid_device { wait_queue_head_t report_wait; bool report_running; u32 report_id; + u32 report_type; struct uhid_event report_buf; }; @@ -124,95 +125,166 @@ static int uhid_hid_parse(struct hid_device *hid) return hid_parse_report(hid, uhid->rd_data, uhid->rd_size); } +/* must be called with report_lock held */ +static int __uhid_report_queue_and_wait(struct uhid_device *uhid, + struct uhid_event *ev, + __u32 *report_id) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&uhid->qlock, flags); + *report_id = ++uhid->report_id; + uhid->report_type = ev->type; + uhid->report_running = true; + uhid_queue(uhid, ev); + spin_unlock_irqrestore(&uhid->qlock, flags); + + ret = wait_event_interruptible_timeout(uhid->report_wait, + !uhid->report_running || !uhid->running, + 5 * HZ); + if (!ret || !uhid->running || uhid->report_running) + ret = -EIO; + else if (ret < 0) + ret = -ERESTARTSYS; + else + ret = 0; + + uhid->report_running = false; + + return ret; +} + +static void uhid_report_wake_up(struct uhid_device *uhid, u32 id, + const struct uhid_event *ev) +{ + unsigned long flags; + + spin_lock_irqsave(&uhid->qlock, flags); + + /* id for old report; drop it silently */ + if (uhid->report_type != ev->type || uhid->report_id != id) + goto unlock; + if (!uhid->report_running) + goto unlock; + + memcpy(&uhid->report_buf, ev, sizeof(*ev)); + uhid->report_running = false; + wake_up_interruptible(&uhid->report_wait); + +unlock: + spin_unlock_irqrestore(&uhid->qlock, flags); +} + static int uhid_hid_get_report(struct hid_device *hid, unsigned char rnum, - __u8 *buf, size_t count, unsigned char rtype) + u8 *buf, size_t count, u8 rtype) { struct uhid_device *uhid = hid->driver_data; - __u8 report_type; + struct uhid_get_report_reply_req *req; struct uhid_event *ev; - unsigned long flags; int ret; - size_t uninitialized_var(len); - struct uhid_get_report_reply_req *req; if (!uhid->running) return -EIO; - switch (rtype) { - case HID_FEATURE_REPORT: - report_type = UHID_FEATURE_REPORT; - break; - case HID_OUTPUT_REPORT: - report_type = UHID_OUTPUT_REPORT; - break; - case HID_INPUT_REPORT: - report_type = UHID_INPUT_REPORT; - break; - default: - return -EINVAL; - } + ev = kzalloc(sizeof(*ev), GFP_KERNEL); + if (!ev) + return -ENOMEM; + + ev->type = UHID_GET_REPORT; + ev->u.get_report.rnum = rnum; + ev->u.get_report.rtype = rtype; ret = mutex_lock_interruptible(&uhid->report_lock); - if (ret) + if (ret) { + kfree(ev);
return ret; + } - ev = kzalloc(sizeof(*ev), GFP_KERNEL); - if (!ev) { - ret = -ENOMEM; + /* this _always_ takes ownership of @ev */ + ret = __uhid_report_queue_and_wait(uhid, ev, &ev->u.get_report.id); + if (ret) goto unlock; + + req = &uhid->report_buf.u.get_report_reply; + if (req->err) { + ret = -EIO; + } else { + ret = min3(count, (size_t)req->size, (size_t)UHID_DATA_MAX); + memcpy(buf, req->data, ret); } - spin_lock_irqsave(&uhid->qlock, flags); - ev->type = UHID_GET_REPORT; - ev->u.get_report.id = ++uhid->report_id; - ev->u.get_report.rnum = rnum; - ev->u.get_report.rtype = report_type; +unlock: + mutex_unlock(&uhid->report_lock); + return ret; +} - uhid->report_running = true; - uhid_queue(uhid, ev); - spin_unlock_irqrestore(&uhid->qlock, flags); +static int uhid_hid_set_report(struct hid_device *hid, unsigned char rnum, + const u8 *buf, size_t count, u8 rtype) +{ + struct uhid_device *uhid = hid->driver_data; + struct uhid_event *ev; + int ret; - ret = wait_event_interruptible_timeout(uhid->report_wait, - !uhid->report_running || !uhid->running, - 5 * HZ); + if (!uhid->running || count > UHID_DATA_MAX) + return -EIO; - if (!ret || !uhid->running) { - ret = -EIO; - } else if (ret < 0) { - ret = -ERESTARTSYS; - } else { - spin_lock_irqsave(&uhid->qlock, flags); - req = &uhid->report_buf.u.get_report_reply; + ev = kzalloc(sizeof(*ev), GFP_KERNEL); + if (!ev) + return -ENOMEM; - if (req->err) { - ret = -EIO; - } else { - ret = 0; - len = min(count, - min_t(size_t, req->size, UHID_DATA_MAX)); - memcpy(buf, req->data, len); - } + ev->type = UHID_SET_REPORT; + ev->u.set_report.rnum = rnum; + ev->u.set_report.rtype = rtype; + ev->u.set_report.size = count; + memcpy(ev->u.set_report.data, buf, count); - spin_unlock_irqrestore(&uhid->qlock, flags); + ret = mutex_lock_interruptible(&uhid->report_lock); + if (ret) { + kfree(ev); + return ret; } - uhid->report_running = false; + /* this _always_ takes ownership of @ev */ + ret = __uhid_report_queue_and_wait(uhid, ev, &ev->u.set_report.id); + if (ret) + goto unlock; + + if (uhid->report_buf.u.set_report_reply.err) + ret = -EIO; + else + ret = count; unlock: mutex_unlock(&uhid->report_lock); - return ret ? 
-	return ret ? ret : len;
+	return ret;
 }
 
 static int uhid_hid_raw_request(struct hid_device *hid,
 				unsigned char reportnum, __u8 *buf,
 				size_t len, unsigned char rtype,
 				int reqtype)
 {
+	u8 u_rtype;
+
+	switch (rtype) {
+	case HID_FEATURE_REPORT:
+		u_rtype = UHID_FEATURE_REPORT;
+		break;
+	case HID_OUTPUT_REPORT:
+		u_rtype = UHID_OUTPUT_REPORT;
+		break;
+	case HID_INPUT_REPORT:
+		u_rtype = UHID_INPUT_REPORT;
+		break;
+	default:
+		return -EINVAL;
+	}
+
 	switch (reqtype) {
 	case HID_REQ_GET_REPORT:
-		return uhid_hid_get_report(hid, reportnum, buf, len, rtype);
+		return uhid_hid_get_report(hid, reportnum, buf, len, u_rtype);
 	case HID_REQ_SET_REPORT:
-		/* TODO: implement proper SET_REPORT functionality */
-		return -ENOSYS;
+		return uhid_hid_set_report(hid, reportnum, buf, len, u_rtype);
 	default:
 		return -EIO;
 	}
@@ -490,25 +562,20 @@ static int uhid_dev_input2(struct uhid_device *uhid, struct uhid_event *ev)
 static int uhid_dev_get_report_reply(struct uhid_device *uhid,
 				     struct uhid_event *ev)
 {
-	unsigned long flags;
-
 	if (!uhid->running)
 		return -EINVAL;
 
-	spin_lock_irqsave(&uhid->qlock, flags);
-
-	/* id for old report; drop it silently */
-	if (uhid->report_id != ev->u.get_report_reply.id)
-		goto unlock;
-	if (!uhid->report_running)
-		goto unlock;
+	uhid_report_wake_up(uhid, ev->u.get_report_reply.id, ev);
+	return 0;
+}
 
-	memcpy(&uhid->report_buf, ev, sizeof(*ev));
-	uhid->report_running = false;
-	wake_up_interruptible(&uhid->report_wait);
+static int uhid_dev_set_report_reply(struct uhid_device *uhid,
+				     struct uhid_event *ev)
+{
+	if (!uhid->running)
+		return -EINVAL;
 
-unlock:
-	spin_unlock_irqrestore(&uhid->qlock, flags);
+	uhid_report_wake_up(uhid, ev->u.set_report_reply.id, ev);
 	return 0;
 }
 
@@ -637,6 +704,9 @@ static ssize_t uhid_char_write(struct file *file, const char __user *buffer,
 	case UHID_GET_REPORT_REPLY:
 		ret = uhid_dev_get_report_reply(uhid, &uhid->input_buf);
 		break;
+	case UHID_SET_REPORT_REPLY:
+		ret = uhid_dev_set_report_reply(uhid, &uhid->input_buf);
+		break;
 	default:
 		ret = -EOPNOTSUPP;
 	}

diff --git a/include/uapi/linux/uhid.h b/include/uapi/linux/uhid.h
index 116536eeae62..62aac0e4edf3 100644
--- a/include/uapi/linux/uhid.h
+++ b/include/uapi/linux/uhid.h
@@ -37,6 +37,8 @@ enum uhid_event_type {
 	UHID_GET_REPORT_REPLY,
 	UHID_CREATE2,
 	UHID_INPUT2,
+	UHID_SET_REPORT,
+	UHID_SET_REPORT_REPLY,
 };
 
 struct uhid_create2_req {
@@ -84,6 +86,19 @@ struct uhid_get_report_reply_req {
 	__u8 data[UHID_DATA_MAX];
 } __attribute__((__packed__));
 
+struct uhid_set_report_req {
+	__u32 id;
+	__u8 rnum;
+	__u8 rtype;
+	__u16 size;
+	__u8 data[UHID_DATA_MAX];
+} __attribute__((__packed__));
+
+struct uhid_set_report_reply_req {
+	__u32 id;
+	__u16 err;
+} __attribute__((__packed__));
+
 /*
  * Compat Layer
  * All these commands and requests are obsolete. You should avoid using them in
@@ -165,6 +180,8 @@ struct uhid_event {
 		struct uhid_get_report_reply_req get_report_reply;
 		struct uhid_create2_req create2;
 		struct uhid_input2_req input2;
+		struct uhid_set_report_req set_report;
+		struct uhid_set_report_reply_req set_report_reply;
 	} u;
 } __attribute__((__packed__));
--
cgit v1.2.3

From c2b2f16c5c62583d4f8904e44c4b30c94a01eaf1 Mon Sep 17 00:00:00 2001
From: David Herrmann
Date: Tue, 29 Jul 2014 17:14:25 +0200
Subject: HID: uhid: report to user-space whether reports are numbered

This makes UHID_START include a "dev_flags" field that describes
details of the hid-device in the kernel. The first flags we introduce
describe whether a given report-type uses numbered reports.
This is useful for transport layers that force report numbers and
therefore might have to prefix kernel-provided HID messages with the
report number. Currently, only HoG (HID over GATT) needs this, and its
spec only talks about "global report numbers"; that is, a single global
boolean rather than a per-type boolean. However, given the quirks we
already have in kernel-space, a per-type value seems much more
appropriate.

Signed-off-by: David Herrmann
Signed-off-by: Jiri Kosina
---
 drivers/hid/uhid.c        | 21 ++++++++++++++++++++-
 include/uapi/linux/uhid.h | 11 +++++++++++
 2 files changed, 31 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/hid/uhid.c b/drivers/hid/uhid.c
index 19511481a7d3..f6ec5eaf6b89 100644
--- a/drivers/hid/uhid.c
+++ b/drivers/hid/uhid.c
@@ -92,8 +92,27 @@ static int uhid_queue_event(struct uhid_device *uhid, __u32 event)
 static int uhid_hid_start(struct hid_device *hid)
 {
 	struct uhid_device *uhid = hid->driver_data;
+	struct uhid_event *ev;
+	unsigned long flags;
+
+	ev = kzalloc(sizeof(*ev), GFP_KERNEL);
+	if (!ev)
+		return -ENOMEM;
+
+	ev->type = UHID_START;
 
-	return uhid_queue_event(uhid, UHID_START);
+	if (hid->report_enum[HID_FEATURE_REPORT].numbered)
+		ev->u.start.dev_flags |= UHID_DEV_NUMBERED_FEATURE_REPORTS;
+	if (hid->report_enum[HID_OUTPUT_REPORT].numbered)
+		ev->u.start.dev_flags |= UHID_DEV_NUMBERED_OUTPUT_REPORTS;
+	if (hid->report_enum[HID_INPUT_REPORT].numbered)
+		ev->u.start.dev_flags |= UHID_DEV_NUMBERED_INPUT_REPORTS;
+
+	spin_lock_irqsave(&uhid->qlock, flags);
+	uhid_queue(uhid, ev);
+	spin_unlock_irqrestore(&uhid->qlock, flags);
+
+	return 0;
 }
 
 static void uhid_hid_stop(struct hid_device *hid)

diff --git a/include/uapi/linux/uhid.h b/include/uapi/linux/uhid.h
index 62aac0e4edf3..aaa86d6bd1dd 100644
--- a/include/uapi/linux/uhid.h
+++ b/include/uapi/linux/uhid.h
@@ -54,6 +54,16 @@ struct uhid_create2_req {
 	__u8 rd_data[HID_MAX_DESCRIPTOR_SIZE];
 } __attribute__((__packed__));
 
+enum uhid_dev_flag {
+	UHID_DEV_NUMBERED_FEATURE_REPORTS	= (1ULL << 0),
+	UHID_DEV_NUMBERED_OUTPUT_REPORTS	= (1ULL << 1),
+	UHID_DEV_NUMBERED_INPUT_REPORTS		= (1ULL << 2),
+};
+
+struct uhid_start_req {
+	__u64 dev_flags;
+};
+
 #define UHID_DATA_MAX 4096
 
 enum uhid_report_type {
@@ -182,6 +192,7 @@ struct uhid_event {
 		struct uhid_input2_req input2;
 		struct uhid_set_report_req set_report;
 		struct uhid_set_report_reply_req set_report_reply;
+		struct uhid_start_req start;
 	} u;
 } __attribute__((__packed__));
--
cgit v1.2.3
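To see the request/reply protocol of the first uhid patch from the
other side, here is a minimal user-space sketch of a transport
servicing UHID_GET_REPORT and UHID_SET_REPORT. This is a sketch under
assumptions, not part of the patches: fd is assumed to be an open
/dev/uhid descriptor with a device already created via UHID_CREATE2,
and fetch_report()/apply_report() are hypothetical stand-ins for the
transport-specific work. Error handling is abbreviated.

	#include <string.h>
	#include <unistd.h>
	#include <linux/uhid.h>

	/* hypothetical: read a report from the real device, return size or -1 */
	static int fetch_report(__u8 rtype, __u8 rnum, __u8 *data)
	{
		(void)rtype;
		(void)rnum;
		data[0] = 0;		/* dummy one-byte report */
		return 1;
	}

	/* hypothetical: write a report to the real device, return 0 on success */
	static int apply_report(const struct uhid_set_report_req *req)
	{
		(void)req;
		return 0;
	}

	static int dispatch_one_event(int fd)
	{
		struct uhid_event ev, reply;
		int len;

		if (read(fd, &ev, sizeof(ev)) < 0)
			return -1;

		memset(&reply, 0, sizeof(reply));

		switch (ev.type) {
		case UHID_GET_REPORT:
			reply.type = UHID_GET_REPORT_REPLY;
			reply.u.get_report_reply.id = ev.u.get_report.id;
			len = fetch_report(ev.u.get_report.rtype,
					   ev.u.get_report.rnum,
					   reply.u.get_report_reply.data);
			if (len < 0)
				reply.u.get_report_reply.err = 1; /* kernel maps to -EIO */
			else
				reply.u.get_report_reply.size = len;
			break;
		case UHID_SET_REPORT:
			reply.type = UHID_SET_REPORT_REPLY;
			/* the reply must echo the request id */
			reply.u.set_report_reply.id = ev.u.set_report.id;
			reply.u.set_report_reply.err = apply_report(&ev.u.set_report);
			break;
		default:
			return 0;	/* other events not handled in this sketch */
		}

		return write(fd, &reply, sizeof(reply)) < 0 ? -1 : 0;
	}

Because SET_REPORT re-uses the same wait/wake infrastructure as
GET_REPORT, only one such request is ever in flight, so the sketch
never has to correlate more than the single id it just received.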
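Similarly, a backend built on the second patch might latch dev_flags
when UHID_START arrives and consult it when forwarding reports. Again a
sketch under the same caveats; the helper names are invented:

	#include <stdbool.h>
	#include <linux/uhid.h>

	static __u64 uhid_dev_flags;	/* latched from the most recent UHID_START */

	static void handle_start_event(const struct uhid_event *ev)
	{
		if (ev->type == UHID_START)
			uhid_dev_flags = ev->u.start.dev_flags;
	}

	/* e.g. HoG must prefix numbered output reports with the report number */
	static bool output_needs_report_number(void)
	{
		return uhid_dev_flags & UHID_DEV_NUMBERED_OUTPUT_REPORTS;
	}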