Diffstat (limited to 'drivers/vfio')
-rw-r--r--  drivers/vfio/Kconfig                          |    1
-rw-r--r--  drivers/vfio/Makefile                         |    1
-rw-r--r--  drivers/vfio/mdev/Kconfig                     |   17
-rw-r--r--  drivers/vfio/mdev/Makefile                    |    5
-rw-r--r--  drivers/vfio/mdev/mdev_core.c                 |  385
-rw-r--r--  drivers/vfio/mdev/mdev_driver.c               |  119
-rw-r--r--  drivers/vfio/mdev/mdev_private.h              |   41
-rw-r--r--  drivers/vfio/mdev/mdev_sysfs.c                |  286
-rw-r--r--  drivers/vfio/mdev/vfio_mdev.c                 |  148
-rw-r--r--  drivers/vfio/pci/vfio_pci.c                   |   83
-rw-r--r--  drivers/vfio/pci/vfio_pci_config.c            |   12
-rw-r--r--  drivers/vfio/platform/vfio_platform_common.c  |   31
-rw-r--r--  drivers/vfio/vfio.c                           |  461
-rw-r--r--  drivers/vfio/vfio_iommu_type1.c               |  885
14 files changed, 2209 insertions(+), 266 deletions(-)
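The diff below introduces the mediated-device (mdev) core, its bus driver and sysfs wiring, a vfio_mdev bus driver, and the vfio/type1 plumbing (page pinning, notifiers) that mdev vendor drivers rely on. As a reading aid, here is a minimal, illustrative sketch of how a vendor driver would sit on top of the new interface. It is not part of the patch; all sample_* identifiers are hypothetical, and only the callbacks that mdev_register_device() treats as mandatory (create, remove, supported_type_groups) are filled in:

    /*
     * Illustrative sketch only: a minimal vendor driver on the new mdev
     * interface. mdev_register_device() below rejects a parent_ops that
     * lacks create, remove or supported_type_groups.
     */
    #include <linux/device.h>
    #include <linux/mdev.h>

    static int sample_create(struct kobject *kobj, struct mdev_device *mdev)
    {
            /* allocate vendor-private state for this mediated device */
            return 0;
    }

    static int sample_remove(struct mdev_device *mdev)
    {
            /*
             * May return an error while the device is in use; the sysfs
             * 'remove' path turns that into -EBUSY, while the forced path
             * on parent unregistration ignores it.
             */
            return 0;
    }

    static struct attribute *sample_type_attrs[] = {
            NULL,   /* per-type attributes (e.g. available instances) go here */
    };

    static struct attribute_group sample_type_group = {
            .name  = "sample-type-1",       /* hypothetical type name */
            .attrs = sample_type_attrs,
    };

    static struct attribute_group *sample_type_groups[] = {
            &sample_type_group,
            NULL,
    };

    static const struct parent_ops sample_parent_ops = {
            .supported_type_groups = sample_type_groups,
            .create                = sample_create,
            .remove                = sample_remove,
            /* open/release/read/write/ioctl/mmap back vfio_mdev, see below */
    };

    /* in the parent device's probe path: */
    /* err = mdev_register_device(dev, &sample_parent_ops); */

Userspace then instantiates a device by writing a UUID to mdev_supported_types/<driver>-sample-type-1/create under the parent device, which lands in mdev_device_create() via the create_store() attribute in mdev_sysfs.c below.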
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index da6e2ce77495..23eced02aaf6 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -48,4 +48,5 @@ menuconfig VFIO_NOIOMMU source "drivers/vfio/pci/Kconfig" source "drivers/vfio/platform/Kconfig" +source "drivers/vfio/mdev/Kconfig" source "virt/lib/Kconfig" diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index 7b8a31f63fea..4a23c13b6be4 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -7,3 +7,4 @@ obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o obj-$(CONFIG_VFIO_SPAPR_EEH) += vfio_spapr_eeh.o obj-$(CONFIG_VFIO_PCI) += pci/ obj-$(CONFIG_VFIO_PLATFORM) += platform/ +obj-$(CONFIG_VFIO_MDEV) += mdev/ diff --git a/drivers/vfio/mdev/Kconfig b/drivers/vfio/mdev/Kconfig new file mode 100644 index 000000000000..14fdb106a827 --- /dev/null +++ b/drivers/vfio/mdev/Kconfig @@ -0,0 +1,17 @@ + +config VFIO_MDEV + tristate "Mediated device driver framework" + depends on VFIO + default n + help + Provides a framework to virtualize devices. + See Documentation/vfio-mediated-device.txt for more details. + + If you don't know what do here, say N. + +config VFIO_MDEV_DEVICE + tristate "VFIO driver for Mediated devices" + depends on VFIO && VFIO_MDEV + default n + help + VFIO based driver for Mediated devices. diff --git a/drivers/vfio/mdev/Makefile b/drivers/vfio/mdev/Makefile new file mode 100644 index 000000000000..fa2d5ea466ee --- /dev/null +++ b/drivers/vfio/mdev/Makefile @@ -0,0 +1,5 @@ + +mdev-y := mdev_core.o mdev_sysfs.o mdev_driver.o + +obj-$(CONFIG_VFIO_MDEV) += mdev.o +obj-$(CONFIG_VFIO_MDEV_DEVICE) += vfio_mdev.o diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c new file mode 100644 index 000000000000..be1ee89ee917 --- /dev/null +++ b/drivers/vfio/mdev/mdev_core.c @@ -0,0 +1,385 @@ +/* + * Mediated device Core Driver + * + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * Author: Neo Jia <cjia@nvidia.com> + * Kirti Wankhede <kwankhede@nvidia.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/module.h> +#include <linux/device.h> +#include <linux/slab.h> +#include <linux/uuid.h> +#include <linux/sysfs.h> +#include <linux/mdev.h> + +#include "mdev_private.h" + +#define DRIVER_VERSION "0.1" +#define DRIVER_AUTHOR "NVIDIA Corporation" +#define DRIVER_DESC "Mediated device Core Driver" + +static LIST_HEAD(parent_list); +static DEFINE_MUTEX(parent_list_lock); +static struct class_compat *mdev_bus_compat_class; + +static int _find_mdev_device(struct device *dev, void *data) +{ + struct mdev_device *mdev; + + if (!dev_is_mdev(dev)) + return 0; + + mdev = to_mdev_device(dev); + + if (uuid_le_cmp(mdev->uuid, *(uuid_le *)data) == 0) + return 1; + + return 0; +} + +static bool mdev_device_exist(struct parent_device *parent, uuid_le uuid) +{ + struct device *dev; + + dev = device_find_child(parent->dev, &uuid, _find_mdev_device); + if (dev) { + put_device(dev); + return true; + } + + return false; +} + +/* Should be called holding parent_list_lock */ +static struct parent_device *__find_parent_device(struct device *dev) +{ + struct parent_device *parent; + + list_for_each_entry(parent, &parent_list, next) { + if (parent->dev == dev) + return parent; + } + return NULL; +} + +static void mdev_release_parent(struct kref *kref) +{ + struct parent_device *parent = container_of(kref, struct parent_device, + ref); + struct device *dev = parent->dev; + + kfree(parent); + put_device(dev); +} + +static +inline struct parent_device *mdev_get_parent(struct parent_device *parent) +{ + if (parent) + kref_get(&parent->ref); + + return parent; +} + +static inline void mdev_put_parent(struct parent_device *parent) +{ + if (parent) + kref_put(&parent->ref, mdev_release_parent); +} + +static int mdev_device_create_ops(struct kobject *kobj, + struct mdev_device *mdev) +{ + struct parent_device *parent = mdev->parent; + int ret; + + ret = parent->ops->create(kobj, mdev); + if (ret) + return ret; + + ret = sysfs_create_groups(&mdev->dev.kobj, + parent->ops->mdev_attr_groups); + if (ret) + parent->ops->remove(mdev); + + return ret; +} + +/* + * mdev_device_remove_ops gets called from sysfs's 'remove' and when parent + * device is being unregistered from mdev device framework. + * - 'force_remove' is set to 'false' when called from sysfs's 'remove' which + * indicates that if the mdev device is active, used by VMM or userspace + * application, vendor driver could return error then don't remove the device. + * - 'force_remove' is set to 'true' when called from mdev_unregister_device() + * which indicate that parent device is being removed from mdev device + * framework so remove mdev device forcefully. + */ +static int mdev_device_remove_ops(struct mdev_device *mdev, bool force_remove) +{ + struct parent_device *parent = mdev->parent; + int ret; + + /* + * Vendor driver can return error if VMM or userspace application is + * using this mdev device. + */ + ret = parent->ops->remove(mdev); + if (ret && !force_remove) + return -EBUSY; + + sysfs_remove_groups(&mdev->dev.kobj, parent->ops->mdev_attr_groups); + return 0; +} + +static int mdev_device_remove_cb(struct device *dev, void *data) +{ + if (!dev_is_mdev(dev)) + return 0; + + return mdev_device_remove(dev, data ? *(bool *)data : true); +} + +/* + * mdev_register_device : Register a device + * @dev: device structure representing parent device. + * @ops: Parent device operation structure to be registered. + * + * Add device to list of registered parent devices. + * Returns a negative value on error, otherwise 0. 
+ */ +int mdev_register_device(struct device *dev, const struct parent_ops *ops) +{ + int ret; + struct parent_device *parent; + + /* check for mandatory ops */ + if (!ops || !ops->create || !ops->remove || !ops->supported_type_groups) + return -EINVAL; + + dev = get_device(dev); + if (!dev) + return -EINVAL; + + mutex_lock(&parent_list_lock); + + /* Check for duplicate */ + parent = __find_parent_device(dev); + if (parent) { + ret = -EEXIST; + goto add_dev_err; + } + + parent = kzalloc(sizeof(*parent), GFP_KERNEL); + if (!parent) { + ret = -ENOMEM; + goto add_dev_err; + } + + kref_init(&parent->ref); + mutex_init(&parent->lock); + + parent->dev = dev; + parent->ops = ops; + + if (!mdev_bus_compat_class) { + mdev_bus_compat_class = class_compat_register("mdev_bus"); + if (!mdev_bus_compat_class) { + ret = -ENOMEM; + goto add_dev_err; + } + } + + ret = parent_create_sysfs_files(parent); + if (ret) + goto add_dev_err; + + ret = class_compat_create_link(mdev_bus_compat_class, dev, NULL); + if (ret) + dev_warn(dev, "Failed to create compatibility class link\n"); + + list_add(&parent->next, &parent_list); + mutex_unlock(&parent_list_lock); + + dev_info(dev, "MDEV: Registered\n"); + return 0; + +add_dev_err: + mutex_unlock(&parent_list_lock); + if (parent) + mdev_put_parent(parent); + else + put_device(dev); + return ret; +} +EXPORT_SYMBOL(mdev_register_device); + +/* + * mdev_unregister_device : Unregister a parent device + * @dev: device structure representing parent device. + * + * Remove device from list of registered parent devices. Give a chance to free + * existing mediated devices for given device. + */ + +void mdev_unregister_device(struct device *dev) +{ + struct parent_device *parent; + bool force_remove = true; + + mutex_lock(&parent_list_lock); + parent = __find_parent_device(dev); + + if (!parent) { + mutex_unlock(&parent_list_lock); + return; + } + dev_info(dev, "MDEV: Unregistering\n"); + + list_del(&parent->next); + class_compat_remove_link(mdev_bus_compat_class, dev, NULL); + + device_for_each_child(dev, (void *)&force_remove, + mdev_device_remove_cb); + + parent_remove_sysfs_files(parent); + + mutex_unlock(&parent_list_lock); + mdev_put_parent(parent); +} +EXPORT_SYMBOL(mdev_unregister_device); + +static void mdev_device_release(struct device *dev) +{ + struct mdev_device *mdev = to_mdev_device(dev); + + dev_dbg(&mdev->dev, "MDEV: destroying\n"); + kfree(mdev); +} + +int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid) +{ + int ret; + struct mdev_device *mdev; + struct parent_device *parent; + struct mdev_type *type = to_mdev_type(kobj); + + parent = mdev_get_parent(type->parent); + if (!parent) + return -EINVAL; + + mutex_lock(&parent->lock); + + /* Check for duplicate */ + if (mdev_device_exist(parent, uuid)) { + ret = -EEXIST; + goto create_err; + } + + mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); + if (!mdev) { + ret = -ENOMEM; + goto create_err; + } + + memcpy(&mdev->uuid, &uuid, sizeof(uuid_le)); + mdev->parent = parent; + kref_init(&mdev->ref); + + mdev->dev.parent = dev; + mdev->dev.bus = &mdev_bus_type; + mdev->dev.release = mdev_device_release; + dev_set_name(&mdev->dev, "%pUl", uuid.b); + + ret = device_register(&mdev->dev); + if (ret) { + put_device(&mdev->dev); + goto create_err; + } + + ret = mdev_device_create_ops(kobj, mdev); + if (ret) + goto create_failed; + + ret = mdev_create_sysfs_files(&mdev->dev, type); + if (ret) { + mdev_device_remove_ops(mdev, true); + goto create_failed; + } + + mdev->type_kobj = kobj; + dev_dbg(&mdev->dev, 
"MDEV: created\n"); + + mutex_unlock(&parent->lock); + return ret; + +create_failed: + device_unregister(&mdev->dev); + +create_err: + mutex_unlock(&parent->lock); + mdev_put_parent(parent); + return ret; +} + +int mdev_device_remove(struct device *dev, bool force_remove) +{ + struct mdev_device *mdev; + struct parent_device *parent; + struct mdev_type *type; + int ret; + + mdev = to_mdev_device(dev); + type = to_mdev_type(mdev->type_kobj); + parent = mdev->parent; + mutex_lock(&parent->lock); + + ret = mdev_device_remove_ops(mdev, force_remove); + if (ret) { + mutex_unlock(&parent->lock); + return ret; + } + + mdev_remove_sysfs_files(dev, type); + device_unregister(dev); + mutex_unlock(&parent->lock); + mdev_put_parent(parent); + return ret; +} + +static int __init mdev_init(void) +{ + int ret; + + ret = mdev_bus_register(); + + /* + * Attempt to load known vfio_mdev. This gives us a working environment + * without the user needing to explicitly load vfio_mdev driver. + */ + if (!ret) + request_module_nowait("vfio_mdev"); + + return ret; +} + +static void __exit mdev_exit(void) +{ + if (mdev_bus_compat_class) + class_compat_unregister(mdev_bus_compat_class); + + mdev_bus_unregister(); +} + +module_init(mdev_init) +module_exit(mdev_exit) + +MODULE_VERSION(DRIVER_VERSION); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c new file mode 100644 index 000000000000..6f0391f6f9b6 --- /dev/null +++ b/drivers/vfio/mdev/mdev_driver.c @@ -0,0 +1,119 @@ +/* + * MDEV driver + * + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * Author: Neo Jia <cjia@nvidia.com> + * Kirti Wankhede <kwankhede@nvidia.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/device.h> +#include <linux/iommu.h> +#include <linux/mdev.h> + +#include "mdev_private.h" + +static int mdev_attach_iommu(struct mdev_device *mdev) +{ + int ret; + struct iommu_group *group; + + group = iommu_group_alloc(); + if (IS_ERR(group)) + return PTR_ERR(group); + + ret = iommu_group_add_device(group, &mdev->dev); + if (!ret) + dev_info(&mdev->dev, "MDEV: group_id = %d\n", + iommu_group_id(group)); + + iommu_group_put(group); + return ret; +} + +static void mdev_detach_iommu(struct mdev_device *mdev) +{ + iommu_group_remove_device(&mdev->dev); + dev_info(&mdev->dev, "MDEV: detaching iommu\n"); +} + +static int mdev_probe(struct device *dev) +{ + struct mdev_driver *drv = to_mdev_driver(dev->driver); + struct mdev_device *mdev = to_mdev_device(dev); + int ret; + + ret = mdev_attach_iommu(mdev); + if (ret) + return ret; + + if (drv && drv->probe) { + ret = drv->probe(dev); + if (ret) + mdev_detach_iommu(mdev); + } + + return ret; +} + +static int mdev_remove(struct device *dev) +{ + struct mdev_driver *drv = to_mdev_driver(dev->driver); + struct mdev_device *mdev = to_mdev_device(dev); + + if (drv && drv->remove) + drv->remove(dev); + + mdev_detach_iommu(mdev); + + return 0; +} + +struct bus_type mdev_bus_type = { + .name = "mdev", + .probe = mdev_probe, + .remove = mdev_remove, +}; +EXPORT_SYMBOL_GPL(mdev_bus_type); + +/** + * mdev_register_driver - register a new MDEV driver + * @drv: the driver to register + * @owner: module owner of driver to be registered + * + * Returns a negative value on error, otherwise 0. 
+ **/ +int mdev_register_driver(struct mdev_driver *drv, struct module *owner) +{ + /* initialize common driver fields */ + drv->driver.name = drv->name; + drv->driver.bus = &mdev_bus_type; + drv->driver.owner = owner; + + /* register with core */ + return driver_register(&drv->driver); +} +EXPORT_SYMBOL(mdev_register_driver); + +/* + * mdev_unregister_driver - unregister MDEV driver + * @drv: the driver to unregister + */ +void mdev_unregister_driver(struct mdev_driver *drv) +{ + driver_unregister(&drv->driver); +} +EXPORT_SYMBOL(mdev_unregister_driver); + +int mdev_bus_register(void) +{ + return bus_register(&mdev_bus_type); +} + +void mdev_bus_unregister(void) +{ + bus_unregister(&mdev_bus_type); +} diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h new file mode 100644 index 000000000000..d35097cbf3d7 --- /dev/null +++ b/drivers/vfio/mdev/mdev_private.h @@ -0,0 +1,41 @@ +/* + * Mediated device interal definitions + * + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * Author: Neo Jia <cjia@nvidia.com> + * Kirti Wankhede <kwankhede@nvidia.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef MDEV_PRIVATE_H +#define MDEV_PRIVATE_H + +int mdev_bus_register(void); +void mdev_bus_unregister(void); + +struct mdev_type { + struct kobject kobj; + struct kobject *devices_kobj; + struct parent_device *parent; + struct list_head next; + struct attribute_group *group; +}; + +#define to_mdev_type_attr(_attr) \ + container_of(_attr, struct mdev_type_attribute, attr) +#define to_mdev_type(_kobj) \ + container_of(_kobj, struct mdev_type, kobj) + +int parent_create_sysfs_files(struct parent_device *parent); +void parent_remove_sysfs_files(struct parent_device *parent); + +int mdev_create_sysfs_files(struct device *dev, struct mdev_type *type); +void mdev_remove_sysfs_files(struct device *dev, struct mdev_type *type); + +int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid); +int mdev_device_remove(struct device *dev, bool force_remove); + +#endif /* MDEV_PRIVATE_H */ diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c new file mode 100644 index 000000000000..1a53deb2ee10 --- /dev/null +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -0,0 +1,286 @@ +/* + * File attributes for Mediated devices + * + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * Author: Neo Jia <cjia@nvidia.com> + * Kirti Wankhede <kwankhede@nvidia.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/sysfs.h> +#include <linux/ctype.h> +#include <linux/device.h> +#include <linux/slab.h> +#include <linux/uuid.h> +#include <linux/mdev.h> + +#include "mdev_private.h" + +/* Static functions */ + +static ssize_t mdev_type_attr_show(struct kobject *kobj, + struct attribute *__attr, char *buf) +{ + struct mdev_type_attribute *attr = to_mdev_type_attr(__attr); + struct mdev_type *type = to_mdev_type(kobj); + ssize_t ret = -EIO; + + if (attr->show) + ret = attr->show(kobj, type->parent->dev, buf); + return ret; +} + +static ssize_t mdev_type_attr_store(struct kobject *kobj, + struct attribute *__attr, + const char *buf, size_t count) +{ + struct mdev_type_attribute *attr = to_mdev_type_attr(__attr); + struct mdev_type *type = to_mdev_type(kobj); + ssize_t ret = -EIO; + + if (attr->store) + ret = attr->store(&type->kobj, type->parent->dev, buf, count); + return ret; +} + +static const struct sysfs_ops mdev_type_sysfs_ops = { + .show = mdev_type_attr_show, + .store = mdev_type_attr_store, +}; + +static ssize_t create_store(struct kobject *kobj, struct device *dev, + const char *buf, size_t count) +{ + char *str; + uuid_le uuid; + int ret; + + if ((count < UUID_STRING_LEN) || (count > UUID_STRING_LEN + 1)) + return -EINVAL; + + str = kstrndup(buf, count, GFP_KERNEL); + if (!str) + return -ENOMEM; + + ret = uuid_le_to_bin(str, &uuid); + kfree(str); + if (ret) + return ret; + + ret = mdev_device_create(kobj, dev, uuid); + if (ret) + return ret; + + return count; +} + +MDEV_TYPE_ATTR_WO(create); + +static void mdev_type_release(struct kobject *kobj) +{ + struct mdev_type *type = to_mdev_type(kobj); + + pr_debug("Releasing group %s\n", kobj->name); + kfree(type); +} + +static struct kobj_type mdev_type_ktype = { + .sysfs_ops = &mdev_type_sysfs_ops, + .release = mdev_type_release, +}; + +struct mdev_type *add_mdev_supported_type(struct parent_device *parent, + struct attribute_group *group) +{ + struct mdev_type *type; + int ret; + + if (!group->name) { + pr_err("%s: Type name empty!\n", __func__); + return ERR_PTR(-EINVAL); + } + + type = kzalloc(sizeof(*type), GFP_KERNEL); + if (!type) + return ERR_PTR(-ENOMEM); + + type->kobj.kset = parent->mdev_types_kset; + + ret = kobject_init_and_add(&type->kobj, &mdev_type_ktype, NULL, + "%s-%s", dev_driver_string(parent->dev), + group->name); + if (ret) { + kfree(type); + return ERR_PTR(ret); + } + + ret = sysfs_create_file(&type->kobj, &mdev_type_attr_create.attr); + if (ret) + goto attr_create_failed; + + type->devices_kobj = kobject_create_and_add("devices", &type->kobj); + if (!type->devices_kobj) { + ret = -ENOMEM; + goto attr_devices_failed; + } + + ret = sysfs_create_files(&type->kobj, + (const struct attribute **)group->attrs); + if (ret) { + ret = -ENOMEM; + goto attrs_failed; + } + + type->group = group; + type->parent = parent; + return type; + +attrs_failed: + kobject_put(type->devices_kobj); +attr_devices_failed: + sysfs_remove_file(&type->kobj, &mdev_type_attr_create.attr); +attr_create_failed: + kobject_del(&type->kobj); + kobject_put(&type->kobj); + return ERR_PTR(ret); +} + +static void remove_mdev_supported_type(struct mdev_type *type) +{ + sysfs_remove_files(&type->kobj, + (const struct attribute **)type->group->attrs); + kobject_put(type->devices_kobj); + sysfs_remove_file(&type->kobj, &mdev_type_attr_create.attr); + kobject_del(&type->kobj); + kobject_put(&type->kobj); +} + +static int add_mdev_supported_type_groups(struct parent_device *parent) +{ + int i; + + for (i = 0; parent->ops->supported_type_groups[i]; i++) 
{ + struct mdev_type *type; + + type = add_mdev_supported_type(parent, + parent->ops->supported_type_groups[i]); + if (IS_ERR(type)) { + struct mdev_type *ltype, *tmp; + + list_for_each_entry_safe(ltype, tmp, &parent->type_list, + next) { + list_del(<ype->next); + remove_mdev_supported_type(ltype); + } + return PTR_ERR(type); + } + list_add(&type->next, &parent->type_list); + } + return 0; +} + +/* mdev sysfs functions */ +void parent_remove_sysfs_files(struct parent_device *parent) +{ + struct mdev_type *type, *tmp; + + list_for_each_entry_safe(type, tmp, &parent->type_list, next) { + list_del(&type->next); + remove_mdev_supported_type(type); + } + + sysfs_remove_groups(&parent->dev->kobj, parent->ops->dev_attr_groups); + kset_unregister(parent->mdev_types_kset); +} + +int parent_create_sysfs_files(struct parent_device *parent) +{ + int ret; + + parent->mdev_types_kset = kset_create_and_add("mdev_supported_types", + NULL, &parent->dev->kobj); + + if (!parent->mdev_types_kset) + return -ENOMEM; + + INIT_LIST_HEAD(&parent->type_list); + + ret = sysfs_create_groups(&parent->dev->kobj, + parent->ops->dev_attr_groups); + if (ret) + goto create_err; + + ret = add_mdev_supported_type_groups(parent); + if (ret) + sysfs_remove_groups(&parent->dev->kobj, + parent->ops->dev_attr_groups); + else + return ret; + +create_err: + kset_unregister(parent->mdev_types_kset); + return ret; +} + +static ssize_t remove_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned long val; + + if (kstrtoul(buf, 0, &val) < 0) + return -EINVAL; + + if (val && device_remove_file_self(dev, attr)) { + int ret; + + ret = mdev_device_remove(dev, false); + if (ret) { + device_create_file(dev, attr); + return ret; + } + } + + return count; +} + +static DEVICE_ATTR_WO(remove); + +static const struct attribute *mdev_device_attrs[] = { + &dev_attr_remove.attr, + NULL, +}; + +int mdev_create_sysfs_files(struct device *dev, struct mdev_type *type) +{ + int ret; + + ret = sysfs_create_files(&dev->kobj, mdev_device_attrs); + if (ret) + return ret; + + ret = sysfs_create_link(type->devices_kobj, &dev->kobj, dev_name(dev)); + if (ret) + goto device_link_failed; + + ret = sysfs_create_link(&dev->kobj, &type->kobj, "mdev_type"); + if (ret) + goto type_link_failed; + + return ret; + +type_link_failed: + sysfs_remove_link(type->devices_kobj, dev_name(dev)); +device_link_failed: + sysfs_remove_files(&dev->kobj, mdev_device_attrs); + return ret; +} + +void mdev_remove_sysfs_files(struct device *dev, struct mdev_type *type) +{ + sysfs_remove_link(&dev->kobj, "mdev_type"); + sysfs_remove_link(type->devices_kobj, dev_name(dev)); + sysfs_remove_files(&dev->kobj, mdev_device_attrs); +} diff --git a/drivers/vfio/mdev/vfio_mdev.c b/drivers/vfio/mdev/vfio_mdev.c new file mode 100644 index 000000000000..ffc36758cb84 --- /dev/null +++ b/drivers/vfio/mdev/vfio_mdev.c @@ -0,0 +1,148 @@ +/* + * VFIO based driver for Mediated device + * + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * Author: Neo Jia <cjia@nvidia.com> + * Kirti Wankhede <kwankhede@nvidia.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/vfio.h> +#include <linux/mdev.h> + +#include "mdev_private.h" + +#define DRIVER_VERSION "0.1" +#define DRIVER_AUTHOR "NVIDIA Corporation" +#define DRIVER_DESC "VFIO based driver for Mediated device" + +static int vfio_mdev_open(void *device_data) +{ + struct mdev_device *mdev = device_data; + struct parent_device *parent = mdev->parent; + int ret; + + if (unlikely(!parent->ops->open)) + return -EINVAL; + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + ret = parent->ops->open(mdev); + if (ret) + module_put(THIS_MODULE); + + return ret; +} + +static void vfio_mdev_release(void *device_data) +{ + struct mdev_device *mdev = device_data; + struct parent_device *parent = mdev->parent; + + if (likely(parent->ops->release)) + parent->ops->release(mdev); + + module_put(THIS_MODULE); +} + +static long vfio_mdev_unlocked_ioctl(void *device_data, + unsigned int cmd, unsigned long arg) +{ + struct mdev_device *mdev = device_data; + struct parent_device *parent = mdev->parent; + + if (unlikely(!parent->ops->ioctl)) + return -EINVAL; + + return parent->ops->ioctl(mdev, cmd, arg); +} + +static ssize_t vfio_mdev_read(void *device_data, char __user *buf, + size_t count, loff_t *ppos) +{ + struct mdev_device *mdev = device_data; + struct parent_device *parent = mdev->parent; + + if (unlikely(!parent->ops->read)) + return -EINVAL; + + return parent->ops->read(mdev, buf, count, ppos); +} + +static ssize_t vfio_mdev_write(void *device_data, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct mdev_device *mdev = device_data; + struct parent_device *parent = mdev->parent; + + if (unlikely(!parent->ops->write)) + return -EINVAL; + + return parent->ops->write(mdev, buf, count, ppos); +} + +static int vfio_mdev_mmap(void *device_data, struct vm_area_struct *vma) +{ + struct mdev_device *mdev = device_data; + struct parent_device *parent = mdev->parent; + + if (unlikely(!parent->ops->mmap)) + return -EINVAL; + + return parent->ops->mmap(mdev, vma); +} + +static const struct vfio_device_ops vfio_mdev_dev_ops = { + .name = "vfio-mdev", + .open = vfio_mdev_open, + .release = vfio_mdev_release, + .ioctl = vfio_mdev_unlocked_ioctl, + .read = vfio_mdev_read, + .write = vfio_mdev_write, + .mmap = vfio_mdev_mmap, +}; + +int vfio_mdev_probe(struct device *dev) +{ + struct mdev_device *mdev = to_mdev_device(dev); + + return vfio_add_group_dev(dev, &vfio_mdev_dev_ops, mdev); +} + +void vfio_mdev_remove(struct device *dev) +{ + vfio_del_group_dev(dev); +} + +struct mdev_driver vfio_mdev_driver = { + .name = "vfio_mdev", + .probe = vfio_mdev_probe, + .remove = vfio_mdev_remove, +}; + +static int __init vfio_mdev_init(void) +{ + return mdev_register_driver(&vfio_mdev_driver, THIS_MODULE); +} + +static void __exit vfio_mdev_exit(void) +{ + mdev_unregister_driver(&vfio_mdev_driver); +} + +module_init(vfio_mdev_init) +module_exit(vfio_mdev_exit) + +MODULE_VERSION(DRIVER_VERSION); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 031bc08d000d..dcd7c2a99618 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -558,10 +558,9 @@ static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev, static int msix_sparse_mmap_cap(struct vfio_pci_device *vdev, struct vfio_info_cap *caps) { - struct vfio_info_cap_header *header; 
struct vfio_region_info_cap_sparse_mmap *sparse; size_t end, size; - int nr_areas = 2, i = 0; + int nr_areas = 2, i = 0, ret; end = pci_resource_len(vdev->pdev, vdev->msix_bar); @@ -572,13 +571,10 @@ static int msix_sparse_mmap_cap(struct vfio_pci_device *vdev, size = sizeof(*sparse) + (nr_areas * sizeof(*sparse->areas)); - header = vfio_info_cap_add(caps, size, - VFIO_REGION_INFO_CAP_SPARSE_MMAP, 1); - if (IS_ERR(header)) - return PTR_ERR(header); + sparse = kzalloc(size, GFP_KERNEL); + if (!sparse) + return -ENOMEM; - sparse = container_of(header, - struct vfio_region_info_cap_sparse_mmap, header); sparse->nr_areas = nr_areas; if (vdev->msix_offset & PAGE_MASK) { @@ -594,26 +590,11 @@ static int msix_sparse_mmap_cap(struct vfio_pci_device *vdev, i++; } - return 0; -} - -static int region_type_cap(struct vfio_pci_device *vdev, - struct vfio_info_cap *caps, - unsigned int type, unsigned int subtype) -{ - struct vfio_info_cap_header *header; - struct vfio_region_info_cap_type *cap; - - header = vfio_info_cap_add(caps, sizeof(*cap), - VFIO_REGION_INFO_CAP_TYPE, 1); - if (IS_ERR(header)) - return PTR_ERR(header); - - cap = container_of(header, struct vfio_region_info_cap_type, header); - cap->type = type; - cap->subtype = subtype; + ret = vfio_info_add_capability(caps, VFIO_REGION_INFO_CAP_SPARSE_MMAP, + sparse); + kfree(sparse); - return 0; + return ret; } int vfio_pci_register_dev_region(struct vfio_pci_device *vdev, @@ -752,6 +733,9 @@ static long vfio_pci_ioctl(void *device_data, break; default: + { + struct vfio_region_info_cap_type cap_type; + if (info.index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) return -EINVAL; @@ -762,11 +746,16 @@ static long vfio_pci_ioctl(void *device_data, info.size = vdev->region[i].size; info.flags = vdev->region[i].flags; - ret = region_type_cap(vdev, &caps, - vdev->region[i].type, - vdev->region[i].subtype); + cap_type.type = vdev->region[i].type; + cap_type.subtype = vdev->region[i].subtype; + + ret = vfio_info_add_capability(&caps, + VFIO_REGION_INFO_CAP_TYPE, + &cap_type); if (ret) return ret; + + } } if (caps.size) { @@ -829,45 +818,25 @@ static long vfio_pci_ioctl(void *device_data, } else if (cmd == VFIO_DEVICE_SET_IRQS) { struct vfio_irq_set hdr; - size_t size; u8 *data = NULL; int max, ret = 0; + size_t data_size = 0; minsz = offsetofend(struct vfio_irq_set, count); if (copy_from_user(&hdr, (void __user *)arg, minsz)) return -EFAULT; - if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS || - hdr.count >= (U32_MAX - hdr.start) || - hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK | - VFIO_IRQ_SET_ACTION_TYPE_MASK)) - return -EINVAL; - max = vfio_pci_get_irq_count(vdev, hdr.index); - if (hdr.start >= max || hdr.start + hdr.count > max) - return -EINVAL; - switch (hdr.flags & VFIO_IRQ_SET_DATA_TYPE_MASK) { - case VFIO_IRQ_SET_DATA_NONE: - size = 0; - break; - case VFIO_IRQ_SET_DATA_BOOL: - size = sizeof(uint8_t); - break; - case VFIO_IRQ_SET_DATA_EVENTFD: - size = sizeof(int32_t); - break; - default: - return -EINVAL; - } - - if (size) { - if (hdr.argsz - minsz < hdr.count * size) - return -EINVAL; + ret = vfio_set_irqs_validate_and_prepare(&hdr, max, + VFIO_PCI_NUM_IRQS, &data_size); + if (ret) + return ret; + if (data_size) { data = memdup_user((void __user *)(arg + minsz), - hdr.count * size); + data_size); if (IS_ERR(data)) return PTR_ERR(data); } diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 65d4a3015542..330a57024cbc 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ 
b/drivers/vfio/pci/vfio_pci_config.c @@ -31,8 +31,6 @@ #include "vfio_pci_private.h" -#define PCI_CFG_SPACE_SIZE 256 - /* Fake capability ID for standard config space */ #define PCI_CAP_ID_BASIC 0 @@ -152,7 +150,7 @@ static int vfio_user_config_read(struct pci_dev *pdev, int offset, *val = cpu_to_le32(tmp_val); - return pcibios_err_to_errno(ret); + return ret; } static int vfio_user_config_write(struct pci_dev *pdev, int offset, @@ -173,7 +171,7 @@ static int vfio_user_config_write(struct pci_dev *pdev, int offset, break; } - return pcibios_err_to_errno(ret); + return ret; } static int vfio_default_config_read(struct vfio_pci_device *vdev, int pos, @@ -257,7 +255,7 @@ static int vfio_direct_config_read(struct vfio_pci_device *vdev, int pos, ret = vfio_user_config_read(vdev->pdev, pos, val, count); if (ret) - return pcibios_err_to_errno(ret); + return ret; if (pos >= PCI_CFG_SPACE_SIZE) { /* Extended cap header mangling */ if (offset < 4) @@ -295,7 +293,7 @@ static int vfio_raw_config_read(struct vfio_pci_device *vdev, int pos, ret = vfio_user_config_read(vdev->pdev, pos, val, count); if (ret) - return pcibios_err_to_errno(ret); + return ret; return count; } @@ -1089,7 +1087,7 @@ static int vfio_msi_config_write(struct vfio_pci_device *vdev, int pos, start + PCI_MSI_FLAGS, flags); if (ret) - return pcibios_err_to_errno(ret); + return ret; } return count; diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index d78142830754..4c27f4be3c3d 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -364,36 +364,21 @@ static long vfio_platform_ioctl(void *device_data, struct vfio_irq_set hdr; u8 *data = NULL; int ret = 0; + size_t data_size = 0; minsz = offsetofend(struct vfio_irq_set, count); if (copy_from_user(&hdr, (void __user *)arg, minsz)) return -EFAULT; - if (hdr.argsz < minsz) - return -EINVAL; - - if (hdr.index >= vdev->num_irqs) - return -EINVAL; - - if (hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK | - VFIO_IRQ_SET_ACTION_TYPE_MASK)) - return -EINVAL; - - if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) { - size_t size; - - if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL) - size = sizeof(uint8_t); - else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD) - size = sizeof(int32_t); - else - return -EINVAL; - - if (hdr.argsz - minsz < size) - return -EINVAL; + ret = vfio_set_irqs_validate_and_prepare(&hdr, vdev->num_irqs, + vdev->num_irqs, &data_size); + if (ret) + return ret; - data = memdup_user((void __user *)(arg + minsz), size); + if (data_size) { + data = memdup_user((void __user *)(arg + minsz), + data_size); if (IS_ERR(data)) return PTR_ERR(data); } diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index d1d70e0b011b..9901c4671e2f 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -86,6 +86,8 @@ struct vfio_group { struct mutex unbound_lock; atomic_t opened; bool noiommu; + struct kvm *kvm; + struct blocking_notifier_head notifier; }; struct vfio_device { @@ -339,6 +341,7 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group) #ifdef CONFIG_VFIO_NOIOMMU group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu); #endif + BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier); group->nb.notifier_call = vfio_iommu_group_notifier; @@ -480,6 +483,21 @@ static struct vfio_group *vfio_group_get_from_minor(int minor) return group; } +static struct vfio_group *vfio_group_get_from_dev(struct device *dev) +{ + struct iommu_group *iommu_group; + struct vfio_group 
*group; + + iommu_group = iommu_group_get(dev); + if (!iommu_group) + return NULL; + + group = vfio_group_get_from_iommu(iommu_group); + iommu_group_put(iommu_group); + + return group; +} + /** * Device objects - create, release, get, put, search */ @@ -811,16 +829,10 @@ EXPORT_SYMBOL_GPL(vfio_add_group_dev); */ struct vfio_device *vfio_device_get_from_dev(struct device *dev) { - struct iommu_group *iommu_group; struct vfio_group *group; struct vfio_device *device; - iommu_group = iommu_group_get(dev); - if (!iommu_group) - return NULL; - - group = vfio_group_get_from_iommu(iommu_group); - iommu_group_put(iommu_group); + group = vfio_group_get_from_dev(dev); if (!group) return NULL; @@ -1376,6 +1388,23 @@ static bool vfio_group_viable(struct vfio_group *group) group, vfio_dev_viable) == 0); } +static int vfio_group_add_container_user(struct vfio_group *group) +{ + if (!atomic_inc_not_zero(&group->container_users)) + return -EINVAL; + + if (group->noiommu) { + atomic_dec(&group->container_users); + return -EPERM; + } + if (!group->container->iommu_driver || !vfio_group_viable(group)) { + atomic_dec(&group->container_users); + return -EINVAL; + } + + return 0; +} + static const struct file_operations vfio_device_fops; static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) @@ -1555,6 +1584,9 @@ static int vfio_group_fops_release(struct inode *inode, struct file *filep) filep->private_data = NULL; + /* Any user didn't unregister? */ + WARN_ON(group->notifier.head); + vfio_group_try_dissolve_container(group); atomic_dec(&group->opened); @@ -1685,23 +1717,14 @@ static const struct file_operations vfio_device_fops = { struct vfio_group *vfio_group_get_external_user(struct file *filep) { struct vfio_group *group = filep->private_data; + int ret; if (filep->f_op != &vfio_group_fops) return ERR_PTR(-EINVAL); - if (!atomic_inc_not_zero(&group->container_users)) - return ERR_PTR(-EINVAL); - - if (group->noiommu) { - atomic_dec(&group->container_users); - return ERR_PTR(-EPERM); - } - - if (!group->container->iommu_driver || - !vfio_group_viable(group)) { - atomic_dec(&group->container_users); - return ERR_PTR(-EINVAL); - } + ret = vfio_group_add_container_user(group); + if (ret) + return ERR_PTR(ret); vfio_group_get(group); @@ -1763,7 +1786,7 @@ struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps, header->version = version; /* Add to the end of the capability chain */ - for (tmp = caps->buf; tmp->next; tmp = (void *)tmp + tmp->next) + for (tmp = buf; tmp->next; tmp = buf + tmp->next) ; /* nothing */ tmp->next = caps->size; @@ -1776,11 +1799,403 @@ EXPORT_SYMBOL_GPL(vfio_info_cap_add); void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset) { struct vfio_info_cap_header *tmp; + void *buf = (void *)caps->buf; - for (tmp = caps->buf; tmp->next; tmp = (void *)tmp + tmp->next - offset) + for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset) tmp->next += offset; } -EXPORT_SYMBOL_GPL(vfio_info_cap_shift); +EXPORT_SYMBOL(vfio_info_cap_shift); + +static int sparse_mmap_cap(struct vfio_info_cap *caps, void *cap_type) +{ + struct vfio_info_cap_header *header; + struct vfio_region_info_cap_sparse_mmap *sparse_cap, *sparse = cap_type; + size_t size; + + size = sizeof(*sparse) + sparse->nr_areas * sizeof(*sparse->areas); + header = vfio_info_cap_add(caps, size, + VFIO_REGION_INFO_CAP_SPARSE_MMAP, 1); + if (IS_ERR(header)) + return PTR_ERR(header); + + sparse_cap = container_of(header, + struct vfio_region_info_cap_sparse_mmap, header); + sparse_cap->nr_areas 
= sparse->nr_areas; + memcpy(sparse_cap->areas, sparse->areas, + sparse->nr_areas * sizeof(*sparse->areas)); + return 0; +} + +static int region_type_cap(struct vfio_info_cap *caps, void *cap_type) +{ + struct vfio_info_cap_header *header; + struct vfio_region_info_cap_type *type_cap, *cap = cap_type; + + header = vfio_info_cap_add(caps, sizeof(*cap), + VFIO_REGION_INFO_CAP_TYPE, 1); + if (IS_ERR(header)) + return PTR_ERR(header); + + type_cap = container_of(header, struct vfio_region_info_cap_type, + header); + type_cap->type = cap->type; + type_cap->subtype = cap->subtype; + return 0; +} + +int vfio_info_add_capability(struct vfio_info_cap *caps, int cap_type_id, + void *cap_type) +{ + int ret = -EINVAL; + + if (!cap_type) + return 0; + + switch (cap_type_id) { + case VFIO_REGION_INFO_CAP_SPARSE_MMAP: + ret = sparse_mmap_cap(caps, cap_type); + break; + + case VFIO_REGION_INFO_CAP_TYPE: + ret = region_type_cap(caps, cap_type); + break; + } + + return ret; +} +EXPORT_SYMBOL(vfio_info_add_capability); + +int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs, + int max_irq_type, size_t *data_size) +{ + unsigned long minsz; + size_t size; + + minsz = offsetofend(struct vfio_irq_set, count); + + if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) || + (hdr->count >= (U32_MAX - hdr->start)) || + (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK | + VFIO_IRQ_SET_ACTION_TYPE_MASK))) + return -EINVAL; + + if (data_size) + *data_size = 0; + + if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs) + return -EINVAL; + + switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) { + case VFIO_IRQ_SET_DATA_NONE: + size = 0; + break; + case VFIO_IRQ_SET_DATA_BOOL: + size = sizeof(uint8_t); + break; + case VFIO_IRQ_SET_DATA_EVENTFD: + size = sizeof(int32_t); + break; + default: + return -EINVAL; + } + + if (size) { + if (hdr->argsz - minsz < hdr->count * size) + return -EINVAL; + + if (!data_size) + return -EINVAL; + + *data_size = hdr->count * size; + } + + return 0; +} +EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare); + +/* + * Pin a set of guest PFNs and return their associated host PFNs for local + * domain only. + * @dev [in] : device + * @user_pfn [in]: array of user/guest PFNs to be unpinned. + * @npage [in] : count of elements in user_pfn array. This count should not + * be greater VFIO_PIN_PAGES_MAX_ENTRIES. + * @prot [in] : protection flags + * @phys_pfn[out]: array of host PFNs + * Return error or number of pages pinned. + */ +int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage, + int prot, unsigned long *phys_pfn) +{ + struct vfio_container *container; + struct vfio_group *group; + struct vfio_iommu_driver *driver; + int ret; + + if (!dev || !user_pfn || !phys_pfn || !npage) + return -EINVAL; + + if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) + return -E2BIG; + + group = vfio_group_get_from_dev(dev); + if (!group) + return -ENODEV; + + ret = vfio_group_add_container_user(group); + if (ret) + goto err_pin_pages; + + container = group->container; + down_read(&container->group_lock); + + driver = container->iommu_driver; + if (likely(driver && driver->ops->pin_pages)) + ret = driver->ops->pin_pages(container->iommu_data, user_pfn, + npage, prot, phys_pfn); + else + ret = -ENOTTY; + + up_read(&container->group_lock); + vfio_group_try_dissolve_container(group); + +err_pin_pages: + vfio_group_put(group); + return ret; +} +EXPORT_SYMBOL(vfio_pin_pages); + +/* + * Unpin set of host PFNs for local domain only. 
+ * @dev [in] : device + * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest + * PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES. + * @npage [in] : count of elements in user_pfn array. This count should not + * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. + * Return error or number of pages unpinned. + */ +int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage) +{ + struct vfio_container *container; + struct vfio_group *group; + struct vfio_iommu_driver *driver; + int ret; + + if (!dev || !user_pfn || !npage) + return -EINVAL; + + if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) + return -E2BIG; + + group = vfio_group_get_from_dev(dev); + if (!group) + return -ENODEV; + + ret = vfio_group_add_container_user(group); + if (ret) + goto err_unpin_pages; + + container = group->container; + down_read(&container->group_lock); + + driver = container->iommu_driver; + if (likely(driver && driver->ops->unpin_pages)) + ret = driver->ops->unpin_pages(container->iommu_data, user_pfn, + npage); + else + ret = -ENOTTY; + + up_read(&container->group_lock); + vfio_group_try_dissolve_container(group); + +err_unpin_pages: + vfio_group_put(group); + return ret; +} +EXPORT_SYMBOL(vfio_unpin_pages); + +static int vfio_register_iommu_notifier(struct vfio_group *group, + unsigned long *events, + struct notifier_block *nb) +{ + struct vfio_container *container; + struct vfio_iommu_driver *driver; + int ret; + + ret = vfio_group_add_container_user(group); + if (ret) + return -EINVAL; + + container = group->container; + down_read(&container->group_lock); + + driver = container->iommu_driver; + if (likely(driver && driver->ops->register_notifier)) + ret = driver->ops->register_notifier(container->iommu_data, + events, nb); + else + ret = -ENOTTY; + + up_read(&container->group_lock); + vfio_group_try_dissolve_container(group); + + return ret; +} + +static int vfio_unregister_iommu_notifier(struct vfio_group *group, + struct notifier_block *nb) +{ + struct vfio_container *container; + struct vfio_iommu_driver *driver; + int ret; + + ret = vfio_group_add_container_user(group); + if (ret) + return -EINVAL; + + container = group->container; + down_read(&container->group_lock); + + driver = container->iommu_driver; + if (likely(driver && driver->ops->unregister_notifier)) + ret = driver->ops->unregister_notifier(container->iommu_data, + nb); + else + ret = -ENOTTY; + + up_read(&container->group_lock); + vfio_group_try_dissolve_container(group); + + return ret; +} + +void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm) +{ + group->kvm = kvm; + blocking_notifier_call_chain(&group->notifier, + VFIO_GROUP_NOTIFY_SET_KVM, kvm); +} +EXPORT_SYMBOL_GPL(vfio_group_set_kvm); + +static int vfio_register_group_notifier(struct vfio_group *group, + unsigned long *events, + struct notifier_block *nb) +{ + struct vfio_container *container; + int ret; + bool set_kvm = false; + + if (*events & VFIO_GROUP_NOTIFY_SET_KVM) + set_kvm = true; + + /* clear known events */ + *events &= ~VFIO_GROUP_NOTIFY_SET_KVM; + + /* refuse to continue if still events remaining */ + if (*events) + return -EINVAL; + + ret = vfio_group_add_container_user(group); + if (ret) + return -EINVAL; + + container = group->container; + down_read(&container->group_lock); + + ret = blocking_notifier_chain_register(&group->notifier, nb); + + /* + * The attaching of kvm and vfio_group might already happen, so + * here we replay once upon registration. 
+ */ + if (!ret && set_kvm && group->kvm) + blocking_notifier_call_chain(&group->notifier, + VFIO_GROUP_NOTIFY_SET_KVM, group->kvm); + + up_read(&container->group_lock); + vfio_group_try_dissolve_container(group); + + return ret; +} + +static int vfio_unregister_group_notifier(struct vfio_group *group, + struct notifier_block *nb) +{ + struct vfio_container *container; + int ret; + + ret = vfio_group_add_container_user(group); + if (ret) + return -EINVAL; + + container = group->container; + down_read(&container->group_lock); + + ret = blocking_notifier_chain_unregister(&group->notifier, nb); + + up_read(&container->group_lock); + vfio_group_try_dissolve_container(group); + + return ret; +} + +int vfio_register_notifier(struct device *dev, enum vfio_notify_type type, + unsigned long *events, struct notifier_block *nb) +{ + struct vfio_group *group; + int ret; + + if (!dev || !nb || !events || (*events == 0)) + return -EINVAL; + + group = vfio_group_get_from_dev(dev); + if (!group) + return -ENODEV; + + switch (type) { + case VFIO_IOMMU_NOTIFY: + ret = vfio_register_iommu_notifier(group, events, nb); + break; + case VFIO_GROUP_NOTIFY: + ret = vfio_register_group_notifier(group, events, nb); + break; + default: + ret = -EINVAL; + } + + vfio_group_put(group); + return ret; +} +EXPORT_SYMBOL(vfio_register_notifier); + +int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type, + struct notifier_block *nb) +{ + struct vfio_group *group; + int ret; + + if (!dev || !nb) + return -EINVAL; + + group = vfio_group_get_from_dev(dev); + if (!group) + return -ENODEV; + + switch (type) { + case VFIO_IOMMU_NOTIFY: + ret = vfio_unregister_iommu_notifier(group, nb); + break; + case VFIO_GROUP_NOTIFY: + ret = vfio_unregister_group_notifier(group, nb); + break; + default: + ret = -EINVAL; + } + + vfio_group_put(group); + return ret; +} +EXPORT_SYMBOL(vfio_unregister_notifier); /** * Module/class support diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 2ba19424e4a1..f3726ba12aa6 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -36,6 +36,9 @@ #include <linux/uaccess.h> #include <linux/vfio.h> #include <linux/workqueue.h> +#include <linux/pid_namespace.h> +#include <linux/mdev.h> +#include <linux/notifier.h> #define DRIVER_VERSION "0.2" #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" @@ -55,8 +58,10 @@ MODULE_PARM_DESC(disable_hugepages, struct vfio_iommu { struct list_head domain_list; + struct vfio_domain *external_domain; /* domain for external user */ struct mutex lock; struct rb_root dma_list; + struct blocking_notifier_head notifier; bool v2; bool nesting; }; @@ -75,6 +80,9 @@ struct vfio_dma { unsigned long vaddr; /* Process virtual addr */ size_t size; /* Map size (bytes) */ int prot; /* IOMMU_READ/WRITE */ + bool iommu_mapped; + struct task_struct *task; + struct rb_root pfn_list; /* Ex-user pinned pfn list */ }; struct vfio_group { @@ -83,6 +91,21 @@ struct vfio_group { }; /* + * Guest RAM pinning working set or DMA target + */ +struct vfio_pfn { + struct rb_node node; + dma_addr_t iova; /* Device address */ + unsigned long pfn; /* Host pfn */ + atomic_t ref_count; +}; + +#define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \ + (!list_empty(&iommu->domain_list)) + +static int put_pfn(unsigned long pfn, int prot); + +/* * This code handles mapping and unmapping of user data buffers * into DMA'ble space using the IOMMU */ @@ -130,6 +153,97 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, 
struct vfio_dma *old) rb_erase(&old->node, &iommu->dma_list); } +/* + * Helper Functions for host iova-pfn list + */ +static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova) +{ + struct vfio_pfn *vpfn; + struct rb_node *node = dma->pfn_list.rb_node; + + while (node) { + vpfn = rb_entry(node, struct vfio_pfn, node); + + if (iova < vpfn->iova) + node = node->rb_left; + else if (iova > vpfn->iova) + node = node->rb_right; + else + return vpfn; + } + return NULL; +} + +static void vfio_link_pfn(struct vfio_dma *dma, + struct vfio_pfn *new) +{ + struct rb_node **link, *parent = NULL; + struct vfio_pfn *vpfn; + + link = &dma->pfn_list.rb_node; + while (*link) { + parent = *link; + vpfn = rb_entry(parent, struct vfio_pfn, node); + + if (new->iova < vpfn->iova) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + + rb_link_node(&new->node, parent, link); + rb_insert_color(&new->node, &dma->pfn_list); +} + +static void vfio_unlink_pfn(struct vfio_dma *dma, struct vfio_pfn *old) +{ + rb_erase(&old->node, &dma->pfn_list); +} + +static int vfio_add_to_pfn_list(struct vfio_dma *dma, dma_addr_t iova, + unsigned long pfn) +{ + struct vfio_pfn *vpfn; + + vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL); + if (!vpfn) + return -ENOMEM; + + vpfn->iova = iova; + vpfn->pfn = pfn; + atomic_set(&vpfn->ref_count, 1); + vfio_link_pfn(dma, vpfn); + return 0; +} + +static void vfio_remove_from_pfn_list(struct vfio_dma *dma, + struct vfio_pfn *vpfn) +{ + vfio_unlink_pfn(dma, vpfn); + kfree(vpfn); +} + +static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma, + unsigned long iova) +{ + struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova); + + if (vpfn) + atomic_inc(&vpfn->ref_count); + return vpfn; +} + +static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn) +{ + int ret = 0; + + if (atomic_dec_and_test(&vpfn->ref_count)) { + ret = put_pfn(vpfn->pfn, dma->prot); + vfio_remove_from_pfn_list(dma, vpfn); + } + return ret; +} + struct vwork { struct mm_struct *mm; long npage; @@ -150,17 +264,22 @@ static void vfio_lock_acct_bg(struct work_struct *work) kfree(vwork); } -static void vfio_lock_acct(long npage) +static void vfio_lock_acct(struct task_struct *task, long npage) { struct vwork *vwork; struct mm_struct *mm; - if (!current->mm || !npage) + if (!npage) + return; + + mm = get_task_mm(task); + if (!mm) return; /* process exited or nothing to do */ - if (down_write_trylock(¤t->mm->mmap_sem)) { - current->mm->locked_vm += npage; - up_write(¤t->mm->mmap_sem); + if (down_write_trylock(&mm->mmap_sem)) { + mm->locked_vm += npage; + up_write(&mm->mmap_sem); + mmput(mm); return; } @@ -170,11 +289,8 @@ static void vfio_lock_acct(long npage) * wouldn't need this silliness */ vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL); - if (!vwork) - return; - mm = get_task_mm(current); - if (!mm) { - kfree(vwork); + if (!vwork) { + mmput(mm); return; } INIT_WORK(&vwork->work, vfio_lock_acct_bg); @@ -228,20 +344,36 @@ static int put_pfn(unsigned long pfn, int prot) return 0; } -static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn) +static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, + int prot, unsigned long *pfn) { struct page *page[1]; struct vm_area_struct *vma; - int ret = -EFAULT; + int ret; + + if (mm == current->mm) { + ret = get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), + page); + } else { + unsigned int flags = 0; + + if (prot & IOMMU_WRITE) + flags |= FOLL_WRITE; + + down_read(&mm->mmap_sem); + ret = 
get_user_pages_remote(NULL, mm, vaddr, 1, flags, page, + NULL, NULL); + up_read(&mm->mmap_sem); + } - if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) { + if (ret == 1) { *pfn = page_to_pfn(page[0]); return 0; } - down_read(¤t->mm->mmap_sem); + down_read(&mm->mmap_sem); - vma = find_vma_intersection(current->mm, vaddr, vaddr + 1); + vma = find_vma_intersection(mm, vaddr, vaddr + 1); if (vma && vma->vm_flags & VM_PFNMAP) { *pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; @@ -249,8 +381,7 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn) ret = 0; } - up_read(¤t->mm->mmap_sem); - + up_read(&mm->mmap_sem); return ret; } @@ -259,88 +390,299 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn) * the iommu can only map chunks of consecutive pfns anyway, so get the * first page and all consecutive pages with the same locking. */ -static long vfio_pin_pages(unsigned long vaddr, long npage, - int prot, unsigned long *pfn_base) +static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, + long npage, unsigned long *pfn_base) { - unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - bool lock_cap = capable(CAP_IPC_LOCK); - long ret, i; + unsigned long limit; + bool lock_cap = ns_capable(task_active_pid_ns(dma->task)->user_ns, + CAP_IPC_LOCK); + struct mm_struct *mm; + long ret, i = 0, lock_acct = 0; bool rsvd; + dma_addr_t iova = vaddr - dma->vaddr + dma->iova; - if (!current->mm) + mm = get_task_mm(dma->task); + if (!mm) return -ENODEV; - ret = vaddr_get_pfn(vaddr, prot, pfn_base); + ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base); if (ret) - return ret; + goto pin_pg_remote_exit; rsvd = is_invalid_reserved_pfn(*pfn_base); + limit = task_rlimit(dma->task, RLIMIT_MEMLOCK) >> PAGE_SHIFT; - if (!rsvd && !lock_cap && current->mm->locked_vm + 1 > limit) { - put_pfn(*pfn_base, prot); - pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__, - limit << PAGE_SHIFT); - return -ENOMEM; + /* + * Reserved pages aren't counted against the user, externally pinned + * pages are already counted against the user. 
+ */ + if (!rsvd && !vfio_find_vpfn(dma, iova)) { + if (!lock_cap && mm->locked_vm + 1 > limit) { + put_pfn(*pfn_base, dma->prot); + pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__, + limit << PAGE_SHIFT); + ret = -ENOMEM; + goto pin_pg_remote_exit; + } + lock_acct++; } - if (unlikely(disable_hugepages)) { - if (!rsvd) - vfio_lock_acct(1); - return 1; - } + i++; + if (likely(!disable_hugepages)) { + /* Lock all the consecutive pages from pfn_base */ + for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; i < npage; + i++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) { + unsigned long pfn = 0; - /* Lock all the consecutive pages from pfn_base */ - for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) { - unsigned long pfn = 0; + ret = vaddr_get_pfn(mm, vaddr, dma->prot, &pfn); + if (ret) + break; - ret = vaddr_get_pfn(vaddr, prot, &pfn); - if (ret) - break; + if (pfn != *pfn_base + i || + rsvd != is_invalid_reserved_pfn(pfn)) { + put_pfn(pfn, dma->prot); + break; + } - if (pfn != *pfn_base + i || - rsvd != is_invalid_reserved_pfn(pfn)) { - put_pfn(pfn, prot); - break; + if (!rsvd && !vfio_find_vpfn(dma, iova)) { + if (!lock_cap && + mm->locked_vm + lock_acct + 1 > limit) { + put_pfn(pfn, dma->prot); + pr_warn("%s: RLIMIT_MEMLOCK (%ld) " + "exceeded\n", __func__, + limit << PAGE_SHIFT); + break; + } + lock_acct++; + } } + } - if (!rsvd && !lock_cap && - current->mm->locked_vm + i + 1 > limit) { - put_pfn(pfn, prot); - pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", - __func__, limit << PAGE_SHIFT); - break; + vfio_lock_acct(dma->task, lock_acct); + ret = i; + +pin_pg_remote_exit: + mmput(mm); + return ret; +} + +static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova, + unsigned long pfn, long npage, + bool do_accounting) +{ + long unlocked = 0, locked = 0; + long i; + + for (i = 0; i < npage; i++) { + if (put_pfn(pfn++, dma->prot)) { + unlocked++; + if (vfio_find_vpfn(dma, iova + (i << PAGE_SHIFT))) + locked++; } } - if (!rsvd) - vfio_lock_acct(i); + if (do_accounting) + vfio_lock_acct(dma->task, locked - unlocked); - return i; + return unlocked; } -static long vfio_unpin_pages(unsigned long pfn, long npage, - int prot, bool do_accounting) +static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, + unsigned long *pfn_base, bool do_accounting) { - unsigned long unlocked = 0; - long i; + unsigned long limit; + bool lock_cap = ns_capable(task_active_pid_ns(dma->task)->user_ns, + CAP_IPC_LOCK); + struct mm_struct *mm; + int ret; + bool rsvd; - for (i = 0; i < npage; i++) - unlocked += put_pfn(pfn++, prot); + mm = get_task_mm(dma->task); + if (!mm) + return -ENODEV; + + ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base); + if (ret) + goto pin_page_exit; + + rsvd = is_invalid_reserved_pfn(*pfn_base); + limit = task_rlimit(dma->task, RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) { + put_pfn(*pfn_base, dma->prot); + pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK (%ld) exceeded\n", + __func__, dma->task->comm, task_pid_nr(dma->task), + limit << PAGE_SHIFT); + ret = -ENOMEM; + goto pin_page_exit; + } + + if (!rsvd && do_accounting) + vfio_lock_acct(dma->task, 1); + ret = 1; + +pin_page_exit: + mmput(mm); + return ret; +} + +static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova, + bool do_accounting) +{ + int unlocked; + struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova); + + if (!vpfn) + return 0; + + unlocked = vfio_iova_put_vfio_pfn(dma, vpfn); if (do_accounting) - vfio_lock_acct(-unlocked); + 
+static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
+				    bool do_accounting)
+{
+	int unlocked;
+	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
+
+	if (!vpfn)
+		return 0;
+
+	unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
 
 	if (do_accounting)
-		vfio_lock_acct(-unlocked);
+		vfio_lock_acct(dma->task, -unlocked);
 
 	return unlocked;
 }
 
-static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
+static int vfio_iommu_type1_pin_pages(void *iommu_data,
+				      unsigned long *user_pfn,
+				      int npage, int prot,
+				      unsigned long *phys_pfn)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	int i, j, ret;
+	unsigned long remote_vaddr;
+	struct vfio_dma *dma;
+	bool do_accounting;
+
+	if (!iommu || !user_pfn || !phys_pfn)
+		return -EINVAL;
+
+	/* Supported for v2 version only */
+	if (!iommu->v2)
+		return -EACCES;
+
+	mutex_lock(&iommu->lock);
+
+	/* Fail if notifier list is empty */
+	if ((!iommu->external_domain) || (!iommu->notifier.head)) {
+		ret = -EINVAL;
+		goto pin_done;
+	}
+
+	/*
+	 * If an iommu capable domain exists in the container, then all pages
+	 * are already pinned and accounted.  Accounting should only be done
+	 * if there is no iommu capable domain in the container.
+	 */
+	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
+
+	for (i = 0; i < npage; i++) {
+		dma_addr_t iova;
+		struct vfio_pfn *vpfn;
+
+		iova = user_pfn[i] << PAGE_SHIFT;
+		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
+		if (!dma) {
+			ret = -EINVAL;
+			goto pin_unwind;
+		}
+
+		if ((dma->prot & prot) != prot) {
+			ret = -EPERM;
+			goto pin_unwind;
+		}
+
+		vpfn = vfio_iova_get_vfio_pfn(dma, iova);
+		if (vpfn) {
+			phys_pfn[i] = vpfn->pfn;
+			continue;
+		}
+
+		remote_vaddr = dma->vaddr + iova - dma->iova;
+		ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
+					     do_accounting);
+		if (ret <= 0) {
+			WARN_ON(!ret);
+			goto pin_unwind;
+		}
+
+		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
+		if (ret) {
+			vfio_unpin_page_external(dma, iova, do_accounting);
+			goto pin_unwind;
+		}
+	}
+
+	ret = i;
+	goto pin_done;
+
+pin_unwind:
+	phys_pfn[i] = 0;
+	for (j = 0; j < i; j++) {
+		dma_addr_t iova;
+
+		iova = user_pfn[j] << PAGE_SHIFT;
+		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
+		vfio_unpin_page_external(dma, iova, do_accounting);
+		phys_pfn[j] = 0;
+	}
+pin_done:
+	mutex_unlock(&iommu->lock);
+	return ret;
+}
+
+static int vfio_iommu_type1_unpin_pages(void *iommu_data,
+					unsigned long *user_pfn,
+					int npage)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	bool do_accounting;
+	int i;
+
+	if (!iommu || !user_pfn)
+		return -EINVAL;
+
+	/* Supported for v2 version only */
+	if (!iommu->v2)
+		return -EACCES;
+
+	mutex_lock(&iommu->lock);
+
+	if (!iommu->external_domain) {
+		mutex_unlock(&iommu->lock);
+		return -EINVAL;
+	}
+
+	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
+	for (i = 0; i < npage; i++) {
+		struct vfio_dma *dma;
+		dma_addr_t iova;
+
+		iova = user_pfn[i] << PAGE_SHIFT;
+		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
+		if (!dma)
+			goto unpin_exit;
+		vfio_unpin_page_external(dma, iova, do_accounting);
+	}
+
+unpin_exit:
+	mutex_unlock(&iommu->lock);
+	return i > npage ? npage : (i > 0 ? i : -EINVAL);
+}
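
These two callbacks are what back the vfio_pin_pages()/vfio_unpin_pages() wrappers this series exports from vfio.c. From a vendor driver the flow looks roughly like the sketch below; the wrapper signatures match the 4.10-era API, mdev_dev() is the accessor from linux/mdev.h, and the demo_* names are illustrative:

#include <linux/vfio.h>
#include <linux/mdev.h>
#include <linux/iommu.h>

static int demo_translate_for_dma(struct mdev_device *mdev,
				  unsigned long gfn, dma_addr_t *dma_addr)
{
	unsigned long phys_pfn;
	int ret;

	/* Pin one guest page and learn its host pfn. */
	ret = vfio_pin_pages(mdev_dev(mdev), &gfn, 1,
			     IOMMU_READ | IOMMU_WRITE, &phys_pfn);
	if (ret != 1)
		return ret < 0 ? ret : -EFAULT;

	*dma_addr = (dma_addr_t)phys_pfn << PAGE_SHIFT;
	return 0;
}

static void demo_dma_done(struct mdev_device *mdev, unsigned long gfn)
{
	/* Drop the pin once the device is done with the page. */
	vfio_unpin_pages(mdev_dev(mdev), &gfn, 1);
}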
+static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
+			     bool do_accounting)
 {
 	dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
 	struct vfio_domain *domain, *d;
 	long unlocked = 0;
 
 	if (!dma->size)
-		return;
+		return 0;
+
+	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
+		return 0;
+
 	/*
 	 * We use the IOMMU to track the physical addresses, otherwise we'd
 	 * need a much more complicated tracking system.  Unfortunately that
@@ -382,21 +724,28 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 		if (WARN_ON(!unmapped))
 			break;
 
-		unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
-					     unmapped >> PAGE_SHIFT,
-					     dma->prot, false);
+		unlocked += vfio_unpin_pages_remote(dma, iova,
+						    phys >> PAGE_SHIFT,
+						    unmapped >> PAGE_SHIFT,
+						    false);
 		iova += unmapped;
 
 		cond_resched();
 	}
 
-	vfio_lock_acct(-unlocked);
+	dma->iommu_mapped = false;
+	if (do_accounting) {
+		vfio_lock_acct(dma->task, -unlocked);
+		return 0;
+	}
+	return unlocked;
 }
 
 static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
 {
-	vfio_unmap_unpin(iommu, dma);
+	vfio_unmap_unpin(iommu, dma, true);
 	vfio_unlink_dma(iommu, dma);
+	put_task_struct(dma->task);
 	kfree(dma);
 }
 
@@ -430,9 +779,9 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 			     struct vfio_iommu_type1_dma_unmap *unmap)
 {
 	uint64_t mask;
-	struct vfio_dma *dma;
+	struct vfio_dma *dma, *dma_last = NULL;
 	size_t unmapped = 0;
-	int ret = 0;
+	int ret = 0, retries = 0;
 
 	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
 
@@ -442,7 +791,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 		return -EINVAL;
 
 	WARN_ON(mask & PAGE_MASK);
-
+again:
 	mutex_lock(&iommu->lock);
 
 	/*
@@ -477,7 +826,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 	 * mappings within the range.
 	 */
 	if (iommu->v2) {
-		dma = vfio_find_dma(iommu, unmap->iova, 0);
+		dma = vfio_find_dma(iommu, unmap->iova, 1);
 		if (dma && dma->iova != unmap->iova) {
 			ret = -EINVAL;
 			goto unlock;
@@ -492,6 +841,38 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 	while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
 		if (!iommu->v2 && unmap->iova > dma->iova)
 			break;
+		/*
+		 * Only a task with the same address space as the one that
+		 * mapped this iova range is allowed to unmap it.
+		 */
+		if (dma->task->mm != current->mm)
+			break;
+
+		if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
+			struct vfio_iommu_type1_dma_unmap nb_unmap;
+
+			if (dma_last == dma) {
+				BUG_ON(++retries > 10);
+			} else {
+				dma_last = dma;
+				retries = 0;
+			}
+
+			nb_unmap.iova = dma->iova;
+			nb_unmap.size = dma->size;
+
+			/*
+			 * Notify anyone (mdev vendor drivers) to invalidate
+			 * and unmap iovas within the range we're about to
+			 * unmap.  Vendor drivers MUST unpin pages in response
+			 * to an invalidation.
+			 */
+			mutex_unlock(&iommu->lock);
+			blocking_notifier_call_chain(&iommu->notifier,
+						     VFIO_IOMMU_NOTIFY_DMA_UNMAP,
+						     &nb_unmap);
+			goto again;
+		}
 		unmapped += dma->size;
 		vfio_remove_dma(iommu, dma);
 	}
@@ -558,17 +939,56 @@ unwind:
 	return ret;
 }
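
The retry loop above defines a hard contract: a vendor driver that pinned pages must drop them from its DMA_UNMAP notifier callback, otherwise the unmap ioctl keeps re-sending the notification and BUG()s after ten attempts. A consumer-side sketch with illustrative names and the bookkeeping trimmed:

#include <linux/notifier.h>
#include <linux/vfio.h>

/* Hypothetical vendor-driver helper that drops local pins in a range. */
static void demo_unpin_range(dma_addr_t iova, size_t size);

static int demo_iommu_notifier(struct notifier_block *nb,
			       unsigned long action, void *data)
{
	if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
		struct vfio_iommu_type1_dma_unmap *unmap = data;

		/* Unpin everything we hold in [iova, iova + size);
		 * leaving anything pinned stalls the unmap above. */
		demo_unpin_range(unmap->iova, unmap->size);
	}

	return NOTIFY_OK;
}

static struct notifier_block demo_nb = {
	.notifier_call = demo_iommu_notifier,
};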
+static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
+			    size_t map_size)
+{
+	dma_addr_t iova = dma->iova;
+	unsigned long vaddr = dma->vaddr;
+	size_t size = map_size;
+	long npage;
+	unsigned long pfn;
+	int ret = 0;
+
+	while (size) {
+		/* Pin a contiguous chunk of memory */
+		npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
+					      size >> PAGE_SHIFT, &pfn);
+		if (npage <= 0) {
+			WARN_ON(!npage);
+			ret = (int)npage;
+			break;
+		}
+
+		/* Map it! */
+		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
+				     dma->prot);
+		if (ret) {
+			vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
+						npage, true);
+			break;
+		}
+
+		size -= npage << PAGE_SHIFT;
+		dma->size += npage << PAGE_SHIFT;
+	}
+
+	dma->iommu_mapped = true;
+
+	if (ret)
+		vfio_remove_dma(iommu, dma);
+
+	return ret;
+}
+
 static int vfio_dma_do_map(struct vfio_iommu *iommu,
 			   struct vfio_iommu_type1_dma_map *map)
 {
 	dma_addr_t iova = map->iova;
 	unsigned long vaddr = map->vaddr;
 	size_t size = map->size;
-	long npage;
 	int ret = 0, prot = 0;
 	uint64_t mask;
 	struct vfio_dma *dma;
-	unsigned long pfn;
 
 	/* Verify that none of our __u64 fields overflow */
 	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
@@ -594,47 +1014,33 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	mutex_lock(&iommu->lock);
 
 	if (vfio_find_dma(iommu, iova, size)) {
-		mutex_unlock(&iommu->lock);
-		return -EEXIST;
+		ret = -EEXIST;
+		goto out_unlock;
 	}
 
 	dma = kzalloc(sizeof(*dma), GFP_KERNEL);
 	if (!dma) {
-		mutex_unlock(&iommu->lock);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out_unlock;
 	}
 
 	dma->iova = iova;
 	dma->vaddr = vaddr;
 	dma->prot = prot;
+	get_task_struct(current);
+	dma->task = current;
+	dma->pfn_list = RB_ROOT;
 
 	/* Insert zero-sized and grow as we map chunks of it */
 	vfio_link_dma(iommu, dma);
 
-	while (size) {
-		/* Pin a contiguous chunk of memory */
-		npage = vfio_pin_pages(vaddr + dma->size,
-				       size >> PAGE_SHIFT, prot, &pfn);
-		if (npage <= 0) {
-			WARN_ON(!npage);
-			ret = (int)npage;
-			break;
-		}
-
-		/* Map it! */
-		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
-		if (ret) {
-			vfio_unpin_pages(pfn, npage, prot, true);
-			break;
-		}
-
-		size -= npage << PAGE_SHIFT;
-		dma->size += npage << PAGE_SHIFT;
-	}
-
-	if (ret)
-		vfio_remove_dma(iommu, dma);
+	/* Don't pin and map if container doesn't contain IOMMU capable domain */
+	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
+		dma->size = size;
+	else
+		ret = vfio_pin_map_dma(iommu, dma, size);
 
+out_unlock:
 	mutex_unlock(&iommu->lock);
 	return ret;
 }
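
IS_IOMMU_CAP_DOMAIN_IN_CONTAINER() is defined earlier in the patch, outside this section. From its use here, skipping the pin-and-map when only an mdev "external" domain exists, it reduces to a test for at least one IOMMU-backed domain, roughly:

/* A minimal sketch of the macro's presumed shape, not a verbatim quote. */
#include <linux/list.h>

#define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
	(!list_empty(&(iommu)->domain_list))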
@@ -662,10 +1068,6 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
 	d = list_first_entry(&iommu->domain_list, struct vfio_domain, next);
 	n = rb_first(&iommu->dma_list);
 
-	/* If there's not a domain, there better not be any mappings */
-	if (WARN_ON(n && !d))
-		return -EINVAL;
-
 	for (; n; n = rb_next(n)) {
 		struct vfio_dma *dma;
 		dma_addr_t iova;
@@ -674,21 +1076,49 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
 		iova = dma->iova;
 
 		while (iova < dma->iova + dma->size) {
-			phys_addr_t phys = iommu_iova_to_phys(d->domain, iova);
+			phys_addr_t phys;
 			size_t size;
 
-			if (WARN_ON(!phys)) {
-				iova += PAGE_SIZE;
-				continue;
+			if (dma->iommu_mapped) {
+				phys_addr_t p;
+				dma_addr_t i;
+
+				phys = iommu_iova_to_phys(d->domain, iova);
+
+				if (WARN_ON(!phys)) {
+					iova += PAGE_SIZE;
+					continue;
+				}
+
+				size = PAGE_SIZE;
+				p = phys + size;
+				i = iova + size;
+				while (i < dma->iova + dma->size &&
+				       p == iommu_iova_to_phys(d->domain, i)) {
+					size += PAGE_SIZE;
+					p += PAGE_SIZE;
+					i += PAGE_SIZE;
+				}
+			} else {
+				unsigned long pfn;
+				unsigned long vaddr = dma->vaddr +
+						      (iova - dma->iova);
+				size_t n = dma->iova + dma->size - iova;
+				long npage;
+
+				npage = vfio_pin_pages_remote(dma, vaddr,
							      n >> PAGE_SHIFT,
+							      &pfn);
+				if (npage <= 0) {
+					WARN_ON(!npage);
+					ret = (int)npage;
+					return ret;
+				}
+
+				phys = pfn << PAGE_SHIFT;
+				size = npage << PAGE_SHIFT;
 			}
 
-			size = PAGE_SIZE;
-
-			while (iova + size < dma->iova + dma->size &&
-			       phys + size == iommu_iova_to_phys(d->domain,
-								 iova + size))
-				size += PAGE_SIZE;
-
 			ret = iommu_map(domain->domain, iova, phys,
 					size, dma->prot | domain->prot);
 			if (ret)
@@ -696,8 +1126,8 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
 
 			iova += size;
 		}
+		dma->iommu_mapped = true;
 	}
-
 	return 0;
 }
 
@@ -734,22 +1164,39 @@ static void vfio_test_domain_fgsp(struct vfio_domain *domain)
 	__free_pages(pages, order);
 }
 
+static struct vfio_group *find_iommu_group(struct vfio_domain *domain,
+					   struct iommu_group *iommu_group)
+{
+	struct vfio_group *g;
+
+	list_for_each_entry(g, &domain->group_list, next) {
+		if (g->iommu_group == iommu_group)
+			return g;
+	}
+
+	return NULL;
+}
+
 static int vfio_iommu_type1_attach_group(void *iommu_data,
 					 struct iommu_group *iommu_group)
 {
 	struct vfio_iommu *iommu = iommu_data;
-	struct vfio_group *group, *g;
+	struct vfio_group *group;
 	struct vfio_domain *domain, *d;
-	struct bus_type *bus = NULL;
+	struct bus_type *bus = NULL, *mdev_bus;
 	int ret;
 
 	mutex_lock(&iommu->lock);
 
 	list_for_each_entry(d, &iommu->domain_list, next) {
-		list_for_each_entry(g, &d->group_list, next) {
-			if (g->iommu_group != iommu_group)
-				continue;
+		if (find_iommu_group(d, iommu_group)) {
+			mutex_unlock(&iommu->lock);
+			return -EINVAL;
+		}
+	}
 
+	if (iommu->external_domain) {
+		if (find_iommu_group(iommu->external_domain, iommu_group)) {
 			mutex_unlock(&iommu->lock);
 			return -EINVAL;
 		}
@@ -769,6 +1216,25 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 	if (ret)
 		goto out_free;
 
+	mdev_bus = symbol_get(mdev_bus_type);
+
+	if (mdev_bus) {
+		if ((bus == mdev_bus) && !iommu_present(bus)) {
+			symbol_put(mdev_bus_type);
+			if (!iommu->external_domain) {
+				INIT_LIST_HEAD(&domain->group_list);
+				iommu->external_domain = domain;
+			} else
+				kfree(domain);
+
+			list_add(&group->next,
+				 &iommu->external_domain->group_list);
+			mutex_unlock(&iommu->lock);
+			return 0;
+		}
+		symbol_put(mdev_bus_type);
+	}
+
 	domain->domain = iommu_domain_alloc(bus);
 	if (!domain->domain) {
 		ret = -EIO;
@@ -859,6 +1325,46 @@ static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
 		vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
 }
 
+static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
+{
+	struct rb_node *n, *p;
+
+	n = rb_first(&iommu->dma_list);
+	for (; n; n = rb_next(n)) {
+		struct vfio_dma *dma;
+		long locked = 0, unlocked = 0;
+
+		dma = rb_entry(n, struct vfio_dma, node);
+		unlocked += vfio_unmap_unpin(iommu, dma, false);
+		p = rb_first(&dma->pfn_list);
+		for (; p; p = rb_next(p)) {
+			struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
+							 node);
+
+			if (!is_invalid_reserved_pfn(vpfn->pfn))
+				locked++;
+		}
+		vfio_lock_acct(dma->task, locked - unlocked);
+	}
+}
+
+static void vfio_sanity_check_pfn_list(struct vfio_iommu *iommu)
+{
+	struct rb_node *n;
+
+	n = rb_first(&iommu->dma_list);
+	for (; n; n = rb_next(n)) {
+		struct vfio_dma *dma;
+
+		dma = rb_entry(n, struct vfio_dma, node);
+
+		if (WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list)))
+			break;
+	}
+	/* mdev vendor driver must unregister notifier */
+	WARN_ON(iommu->notifier.head);
+}
+
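
The mdev check in the attach path above leans on symbol_get() so that vfio_iommu_type1 never takes a hard module dependency on mdev, and mdev groups are parked in a pseudo "external domain" instead of getting a real iommu_domain. Condensed into a predicate for illustration:

#include <linux/module.h>
#include <linux/iommu.h>
#include <linux/mdev.h>

static bool demo_is_mdev_group(struct bus_type *bus)
{
	/* Takes a module reference only if mdev is actually loaded. */
	struct bus_type *mdev_bus = symbol_get(mdev_bus_type);
	bool ret = false;

	if (mdev_bus) {
		/* An mdev bus with no IOMMU hardware behind it. */
		ret = (bus == mdev_bus) && !iommu_present(bus);
		symbol_put(mdev_bus_type);
	}
	return ret;
}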
@@ -868,31 +1374,55 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
 
 	mutex_lock(&iommu->lock);
 
-	list_for_each_entry(domain, &iommu->domain_list, next) {
-		list_for_each_entry(group, &domain->group_list, next) {
-			if (group->iommu_group != iommu_group)
-				continue;
-
-			iommu_detach_group(domain->domain, iommu_group);
+	if (iommu->external_domain) {
+		group = find_iommu_group(iommu->external_domain, iommu_group);
+		if (group) {
 			list_del(&group->next);
 			kfree(group);
-			/*
-			 * Group ownership provides privilege, if the group
-			 * list is empty, the domain goes away.  If it's the
-			 * last domain, then all the mappings go away too.
-			 */
-			if (list_empty(&domain->group_list)) {
-				if (list_is_singular(&iommu->domain_list))
+
+			if (list_empty(&iommu->external_domain->group_list)) {
+				vfio_sanity_check_pfn_list(iommu);
+
+				if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
 					vfio_iommu_unmap_unpin_all(iommu);
-				iommu_domain_free(domain->domain);
-				list_del(&domain->next);
-				kfree(domain);
+
+				kfree(iommu->external_domain);
+				iommu->external_domain = NULL;
+			}
+			goto detach_group_done;
+		}
+	}
+
+	list_for_each_entry(domain, &iommu->domain_list, next) {
+		group = find_iommu_group(domain, iommu_group);
+		if (!group)
+			continue;
+
+		iommu_detach_group(domain->domain, iommu_group);
+		list_del(&group->next);
+		kfree(group);
+		/*
+		 * Group ownership provides privilege; if the group list is
+		 * empty, the domain goes away.  If it's the last domain with
+		 * an iommu and no external domain exists, then all the
+		 * mappings go away too.  If it's the last domain with an
+		 * iommu and an external domain does exist, update accounting.
+		 */
+		if (list_empty(&domain->group_list)) {
+			if (list_is_singular(&iommu->domain_list)) {
+				if (!iommu->external_domain)
+					vfio_iommu_unmap_unpin_all(iommu);
+				else
+					vfio_iommu_unmap_unpin_reaccount(iommu);
 			}
-			goto done;
+			iommu_domain_free(domain->domain);
+			list_del(&domain->next);
+			kfree(domain);
 		}
+		break;
 	}
 
-done:
+detach_group_done:
 	mutex_unlock(&iommu->lock);
 }
 
@@ -920,31 +1450,46 @@ static void *vfio_iommu_type1_open(unsigned long arg)
 	INIT_LIST_HEAD(&iommu->domain_list);
 	iommu->dma_list = RB_ROOT;
 	mutex_init(&iommu->lock);
+	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
 
 	return iommu;
 }
 
+static void vfio_release_domain(struct vfio_domain *domain, bool external)
+{
+	struct vfio_group *group, *group_tmp;
+
+	list_for_each_entry_safe(group, group_tmp,
+				 &domain->group_list, next) {
+		if (!external)
+			iommu_detach_group(domain->domain, group->iommu_group);
+		list_del(&group->next);
+		kfree(group);
+	}
+
+	if (!external)
+		iommu_domain_free(domain->domain);
+}
+
 static void vfio_iommu_type1_release(void *iommu_data)
 {
 	struct vfio_iommu *iommu = iommu_data;
 	struct vfio_domain *domain, *domain_tmp;
-	struct vfio_group *group, *group_tmp;
+
+	if (iommu->external_domain) {
+		vfio_release_domain(iommu->external_domain, true);
+		vfio_sanity_check_pfn_list(iommu);
+		kfree(iommu->external_domain);
+	}
 
 	vfio_iommu_unmap_unpin_all(iommu);
 
 	list_for_each_entry_safe(domain, domain_tmp,
 				 &iommu->domain_list, next) {
-		list_for_each_entry_safe(group, group_tmp,
-					 &domain->group_list, next) {
-			iommu_detach_group(domain->domain, group->iommu_group);
-			list_del(&group->next);
-			kfree(group);
-		}
-		iommu_domain_free(domain->domain);
+		vfio_release_domain(domain, false);
 		list_del(&domain->next);
 		kfree(domain);
 	}
-
 	kfree(iommu);
 }
 
@@ -1040,14 +1585,42 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
 	return -ENOTTY;
 }
 
+static int vfio_iommu_type1_register_notifier(void *iommu_data,
+					      unsigned long *events,
+					      struct notifier_block *nb)
+{
+	struct vfio_iommu *iommu = iommu_data;
+
+	/* clear known events */
+	*events &= ~VFIO_IOMMU_NOTIFY_DMA_UNMAP;
+
+	/* refuse to register if still events remaining */
+	if (*events)
+		return -EINVAL;
+
+	return blocking_notifier_chain_register(&iommu->notifier, nb);
+}
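
The events mask works as a capability negotiation: the caller passes the set of events it needs, the backend strips the bits it supports, and any leftover bit fails the registration. A caller-side sketch against the op signature above; the helper name is illustrative:

#include <linux/notifier.h>
#include <linux/vfio.h>

static int demo_register_unmap_notifier(const struct vfio_iommu_driver_ops *ops,
					void *iommu_data,
					struct notifier_block *nb)
{
	unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;

	/* -EINVAL here means an unsupported event bit survived. */
	return ops->register_notifier(iommu_data, &events, nb);
}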
+static int vfio_iommu_type1_unregister_notifier(void *iommu_data,
+						struct notifier_block *nb)
+{
+	struct vfio_iommu *iommu = iommu_data;
+
+	return blocking_notifier_chain_unregister(&iommu->notifier, nb);
+}
+
 static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
-	.name		= "vfio-iommu-type1",
-	.owner		= THIS_MODULE,
-	.open		= vfio_iommu_type1_open,
-	.release	= vfio_iommu_type1_release,
-	.ioctl		= vfio_iommu_type1_ioctl,
-	.attach_group	= vfio_iommu_type1_attach_group,
-	.detach_group	= vfio_iommu_type1_detach_group,
+	.name			= "vfio-iommu-type1",
+	.owner			= THIS_MODULE,
+	.open			= vfio_iommu_type1_open,
+	.release		= vfio_iommu_type1_release,
+	.ioctl			= vfio_iommu_type1_ioctl,
+	.attach_group		= vfio_iommu_type1_attach_group,
+	.detach_group		= vfio_iommu_type1_detach_group,
+	.pin_pages		= vfio_iommu_type1_pin_pages,
+	.unpin_pages		= vfio_iommu_type1_unpin_pages,
+	.register_notifier	= vfio_iommu_type1_register_notifier,
+	.unregister_notifier	= vfio_iommu_type1_unregister_notifier,
 };
 
 static int __init vfio_iommu_type1_init(void)
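
Taken together, the four new ops give an mdev vendor driver a complete lifecycle: register for DMA_UNMAP before the first pin, pin and unpin around device DMA, and tear everything down before detach (the WARN_ONs in the detach and release paths above catch drivers that forget). A hedged end-to-end sketch, assuming the same-era vfio core wrappers (vfio_register_notifier() taking a VFIO_IOMMU_NOTIFY type and an events mask); demo_* names are illustrative:

#include <linux/vfio.h>
#include <linux/mdev.h>
#include <linux/notifier.h>

static int demo_open(struct mdev_device *mdev, struct notifier_block *nb)
{
	unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;

	/* Must succeed before the first vfio_pin_pages() call;
	 * pin_pages above rejects containers with no notifier. */
	return vfio_register_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY,
				      &events, nb);
}

static void demo_release(struct mdev_device *mdev, struct notifier_block *nb,
			 unsigned long *gfns, int count)
{
	/* Drop every outstanding pin, then unregister; leaving either
	 * behind trips the sanity checks in the detach/release paths. */
	vfio_unpin_pages(mdev_dev(mdev), gfns, count);
	vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, nb);
}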