summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorAlex Williamson <alex.williamson@redhat.com>2022-03-03 19:51:03 +0300
committerAlex Williamson <alex.williamson@redhat.com>2022-03-03 19:51:03 +0300
commitb042b27868c00142da1a7e31f4740a5fbd5628ed (patch)
tree1f4da31129aaf99885388a518eb2ae98e12ba94d /include
parentcfb92440ee71adcc2105b0890bb01ac3cddb8507 (diff)
parent88faa5e8ead62a2a559b1246bba7aa982fde7c67 (diff)
downloadlinux-b042b27868c00142da1a7e31f4740a5fbd5628ed.tar.xz
Merge tag 'mlx5-vfio-v10' of https://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux into v5.18/vfio/next/mlx5-migration-v10
Add mlx5 live migration driver and v2 migration protocol This series adds mlx5 live migration driver for VFs that are migration capable and includes the v2 migration protocol definition and mlx5 implementation. The mlx5 driver uses the vfio_pci_core split to create a specific VFIO PCI driver that matches the mlx5 virtual functions. The driver provides the same experience as normal vfio-pci with the addition of migration support. In HW the migration is controlled by the PF function, using its mlx5_core driver, and the VFIO PCI VF driver co-ordinates with the PF to execute the migration actions. The bulk of the v2 migration protocol is semantically the same v1, however it has been recast into a FSM for the device_state and the actual syscall interface uses normal ioctl(), read() and write() instead of building a syscall interface using the region. Several bits of infrastructure work are included here: - pci_iov_vf_id() to help drivers like mlx5 figure out the VF index from a BDF - pci_iov_get_pf_drvdata() to clarify the tricky locking protocol when a VF reaches into its PF's driver - mlx5_core uses the normal SRIOV lifecycle and disables SRIOV before driver remove, to be compatible with pci_iov_get_pf_drvdata() - Lifting VFIO_DEVICE_FEATURE into core VFIO code This series comes after alot of discussion. Some major points: - v1 ABI compatible migration defined using the same FSM approach: https://lore.kernel.org/all/0-v1-a4f7cab64938+3f-vfio_mig_states_jgg@nvidia.com/ - Attempts to clarify how the v1 API works: Alex's: https://lore.kernel.org/kvm/163909282574.728533.7460416142511440919.stgit@omen/ Jason's: https://lore.kernel.org/all/0-v3-184b374ad0a8+24c-vfio_mig_doc_jgg@nvidia.com/ - Etherpad exploring the scope and questions of general VFIO migration: https://lore.kernel.org/kvm/87mtm2loml.fsf@redhat.com/ NOTE: As this series touched mlx5_core parts we need to send this in a pull request format to VFIO to avoid conflicts. Matching qemu changes can be previewed here: https://github.com/jgunthorpe/qemu/commits/vfio_migration_v2 Link: https://lore.kernel.org/all/20220224142024.147653-1-yishaih@nvidia.com Signed-of-by: Leon Romanovsky <leonro@nvidia.com>
Diffstat (limited to 'include')
-rw-r--r--include/linux/mlx5/driver.h3
-rw-r--r--include/linux/mlx5/mlx5_ifc.h147
-rw-r--r--include/linux/pci.h15
-rw-r--r--include/linux/vfio.h53
-rw-r--r--include/linux/vfio_pci_core.h4
-rw-r--r--include/uapi/linux/vfio.h406
6 files changed, 415 insertions, 213 deletions
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 78655d8d13a7..319322a8ff94 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1143,6 +1143,9 @@ int mlx5_dm_sw_icm_alloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type,
int mlx5_dm_sw_icm_dealloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type,
u64 length, u16 uid, phys_addr_t addr, u32 obj_id);
+struct mlx5_core_dev *mlx5_vf_get_core_dev(struct pci_dev *pdev);
+void mlx5_vf_put_core_dev(struct mlx5_core_dev *mdev);
+
#ifdef CONFIG_MLX5_CORE_IPOIB
struct net_device *mlx5_rdma_netdev_alloc(struct mlx5_core_dev *mdev,
struct ib_device *ibdev,
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 598ac3bcc901..b1c27409c997 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -127,6 +127,11 @@ enum {
MLX5_CMD_OP_QUERY_SF_PARTITION = 0x111,
MLX5_CMD_OP_ALLOC_SF = 0x113,
MLX5_CMD_OP_DEALLOC_SF = 0x114,
+ MLX5_CMD_OP_SUSPEND_VHCA = 0x115,
+ MLX5_CMD_OP_RESUME_VHCA = 0x116,
+ MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE = 0x117,
+ MLX5_CMD_OP_SAVE_VHCA_STATE = 0x118,
+ MLX5_CMD_OP_LOAD_VHCA_STATE = 0x119,
MLX5_CMD_OP_CREATE_MKEY = 0x200,
MLX5_CMD_OP_QUERY_MKEY = 0x201,
MLX5_CMD_OP_DESTROY_MKEY = 0x202,
@@ -1757,7 +1762,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
u8 reserved_at_682[0x1];
u8 log_max_sf[0x5];
u8 apu[0x1];
- u8 reserved_at_689[0x7];
+ u8 reserved_at_689[0x4];
+ u8 migration[0x1];
+ u8 reserved_at_68e[0x2];
u8 log_min_sf_size[0x8];
u8 max_num_sf_partitions[0x8];
@@ -11519,4 +11526,142 @@ enum {
MLX5_MTT_PERM_RW = MLX5_MTT_PERM_READ | MLX5_MTT_PERM_WRITE,
};
+enum {
+ MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR = 0x0,
+ MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER = 0x1,
+};
+
+struct mlx5_ifc_suspend_vhca_in_bits {
+ u8 opcode[0x10];
+ u8 uid[0x10];
+
+ u8 reserved_at_20[0x10];
+ u8 op_mod[0x10];
+
+ u8 reserved_at_40[0x10];
+ u8 vhca_id[0x10];
+
+ u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_suspend_vhca_out_bits {
+ u8 status[0x8];
+ u8 reserved_at_8[0x18];
+
+ u8 syndrome[0x20];
+
+ u8 reserved_at_40[0x40];
+};
+
+enum {
+ MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER = 0x0,
+ MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR = 0x1,
+};
+
+struct mlx5_ifc_resume_vhca_in_bits {
+ u8 opcode[0x10];
+ u8 uid[0x10];
+
+ u8 reserved_at_20[0x10];
+ u8 op_mod[0x10];
+
+ u8 reserved_at_40[0x10];
+ u8 vhca_id[0x10];
+
+ u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_resume_vhca_out_bits {
+ u8 status[0x8];
+ u8 reserved_at_8[0x18];
+
+ u8 syndrome[0x20];
+
+ u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_query_vhca_migration_state_in_bits {
+ u8 opcode[0x10];
+ u8 uid[0x10];
+
+ u8 reserved_at_20[0x10];
+ u8 op_mod[0x10];
+
+ u8 reserved_at_40[0x10];
+ u8 vhca_id[0x10];
+
+ u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_query_vhca_migration_state_out_bits {
+ u8 status[0x8];
+ u8 reserved_at_8[0x18];
+
+ u8 syndrome[0x20];
+
+ u8 reserved_at_40[0x40];
+
+ u8 required_umem_size[0x20];
+
+ u8 reserved_at_a0[0x160];
+};
+
+struct mlx5_ifc_save_vhca_state_in_bits {
+ u8 opcode[0x10];
+ u8 uid[0x10];
+
+ u8 reserved_at_20[0x10];
+ u8 op_mod[0x10];
+
+ u8 reserved_at_40[0x10];
+ u8 vhca_id[0x10];
+
+ u8 reserved_at_60[0x20];
+
+ u8 va[0x40];
+
+ u8 mkey[0x20];
+
+ u8 size[0x20];
+};
+
+struct mlx5_ifc_save_vhca_state_out_bits {
+ u8 status[0x8];
+ u8 reserved_at_8[0x18];
+
+ u8 syndrome[0x20];
+
+ u8 actual_image_size[0x20];
+
+ u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_load_vhca_state_in_bits {
+ u8 opcode[0x10];
+ u8 uid[0x10];
+
+ u8 reserved_at_20[0x10];
+ u8 op_mod[0x10];
+
+ u8 reserved_at_40[0x10];
+ u8 vhca_id[0x10];
+
+ u8 reserved_at_60[0x20];
+
+ u8 va[0x40];
+
+ u8 mkey[0x20];
+
+ u8 size[0x20];
+};
+
+struct mlx5_ifc_load_vhca_state_out_bits {
+ u8 status[0x8];
+ u8 reserved_at_8[0x18];
+
+ u8 syndrome[0x20];
+
+ u8 reserved_at_40[0x40];
+};
+
#endif /* MLX5_IFC_H */
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 8253a5413d7c..60d423d8f0c4 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -2166,7 +2166,8 @@ void __iomem *pci_ioremap_wc_bar(struct pci_dev *pdev, int bar);
#ifdef CONFIG_PCI_IOV
int pci_iov_virtfn_bus(struct pci_dev *dev, int id);
int pci_iov_virtfn_devfn(struct pci_dev *dev, int id);
-
+int pci_iov_vf_id(struct pci_dev *dev);
+void *pci_iov_get_pf_drvdata(struct pci_dev *dev, struct pci_driver *pf_driver);
int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn);
void pci_disable_sriov(struct pci_dev *dev);
@@ -2194,6 +2195,18 @@ static inline int pci_iov_virtfn_devfn(struct pci_dev *dev, int id)
{
return -ENOSYS;
}
+
+static inline int pci_iov_vf_id(struct pci_dev *dev)
+{
+ return -ENOSYS;
+}
+
+static inline void *pci_iov_get_pf_drvdata(struct pci_dev *dev,
+ struct pci_driver *pf_driver)
+{
+ return ERR_PTR(-EINVAL);
+}
+
static inline int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn)
{ return -ENODEV; }
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 76191d7abed1..66dda06ec42d 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -33,6 +33,7 @@ struct vfio_device {
struct vfio_group *group;
struct vfio_device_set *dev_set;
struct list_head dev_set_list;
+ unsigned int migration_flags;
/* Members below here are private, not for driver use */
refcount_t refcount;
@@ -55,6 +56,17 @@ struct vfio_device {
* @match: Optional device name match callback (return: 0 for no-match, >0 for
* match, -errno for abort (ex. match with insufficient or incorrect
* additional args)
+ * @device_feature: Optional, fill in the VFIO_DEVICE_FEATURE ioctl
+ * @migration_set_state: Optional callback to change the migration state for
+ * devices that support migration. It's mandatory for
+ * VFIO_DEVICE_FEATURE_MIGRATION migration support.
+ * The returned FD is used for data transfer according to the FSM
+ * definition. The driver is responsible to ensure that FD reaches end
+ * of stream or error whenever the migration FSM leaves a data transfer
+ * state or before close_device() returns.
+ * @migration_get_state: Optional callback to get the migration state for
+ * devices that support migration. It's mandatory for
+ * VFIO_DEVICE_FEATURE_MIGRATION migration support.
*/
struct vfio_device_ops {
char *name;
@@ -69,8 +81,44 @@ struct vfio_device_ops {
int (*mmap)(struct vfio_device *vdev, struct vm_area_struct *vma);
void (*request)(struct vfio_device *vdev, unsigned int count);
int (*match)(struct vfio_device *vdev, char *buf);
+ int (*device_feature)(struct vfio_device *device, u32 flags,
+ void __user *arg, size_t argsz);
+ struct file *(*migration_set_state)(
+ struct vfio_device *device,
+ enum vfio_device_mig_state new_state);
+ int (*migration_get_state)(struct vfio_device *device,
+ enum vfio_device_mig_state *curr_state);
};
+/**
+ * vfio_check_feature - Validate user input for the VFIO_DEVICE_FEATURE ioctl
+ * @flags: Arg from the device_feature op
+ * @argsz: Arg from the device_feature op
+ * @supported_ops: Combination of VFIO_DEVICE_FEATURE_GET and SET the driver
+ * supports
+ * @minsz: Minimum data size the driver accepts
+ *
+ * For use in a driver's device_feature op. Checks that the inputs to the
+ * VFIO_DEVICE_FEATURE ioctl are correct for the driver's feature. Returns 1 if
+ * the driver should execute the get or set, otherwise the relevant
+ * value should be returned.
+ */
+static inline int vfio_check_feature(u32 flags, size_t argsz, u32 supported_ops,
+ size_t minsz)
+{
+ if ((flags & (VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_SET)) &
+ ~supported_ops)
+ return -EINVAL;
+ if (flags & VFIO_DEVICE_FEATURE_PROBE)
+ return 0;
+ /* Without PROBE one of GET or SET must be requested */
+ if (!(flags & (VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_SET)))
+ return -EINVAL;
+ if (argsz < minsz)
+ return -EINVAL;
+ return 1;
+}
+
void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
const struct vfio_device_ops *ops);
void vfio_uninit_group_dev(struct vfio_device *device);
@@ -82,6 +130,11 @@ extern void vfio_device_put(struct vfio_device *device);
int vfio_assign_device_set(struct vfio_device *device, void *set_id);
+int vfio_mig_get_next_state(struct vfio_device *device,
+ enum vfio_device_mig_state cur_fsm,
+ enum vfio_device_mig_state new_fsm,
+ enum vfio_device_mig_state *next_fsm);
+
/*
* External user API
*/
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index ef9a44b6cf5d..9f1bf8e49d43 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -220,6 +220,8 @@ int vfio_pci_core_sriov_configure(struct pci_dev *pdev, int nr_virtfn);
extern const struct pci_error_handlers vfio_pci_core_err_handlers;
long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
unsigned long arg);
+int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
+ void __user *arg, size_t argsz);
ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf,
size_t count, loff_t *ppos);
ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf,
@@ -230,6 +232,8 @@ int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf);
int vfio_pci_core_enable(struct vfio_pci_core_device *vdev);
void vfio_pci_core_disable(struct vfio_pci_core_device *vdev);
void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev);
+pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev,
+ pci_channel_state_t state);
static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
{
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index ef33ea002b0b..fea86061b44e 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -323,7 +323,7 @@ struct vfio_region_info_cap_type {
#define VFIO_REGION_TYPE_PCI_VENDOR_MASK (0xffff)
#define VFIO_REGION_TYPE_GFX (1)
#define VFIO_REGION_TYPE_CCW (2)
-#define VFIO_REGION_TYPE_MIGRATION (3)
+#define VFIO_REGION_TYPE_MIGRATION_DEPRECATED (3)
/* sub-types for VFIO_REGION_TYPE_PCI_* */
@@ -405,225 +405,29 @@ struct vfio_region_gfx_edid {
#define VFIO_REGION_SUBTYPE_CCW_CRW (3)
/* sub-types for VFIO_REGION_TYPE_MIGRATION */
-#define VFIO_REGION_SUBTYPE_MIGRATION (1)
-
-/*
- * The structure vfio_device_migration_info is placed at the 0th offset of
- * the VFIO_REGION_SUBTYPE_MIGRATION region to get and set VFIO device related
- * migration information. Field accesses from this structure are only supported
- * at their native width and alignment. Otherwise, the result is undefined and
- * vendor drivers should return an error.
- *
- * device_state: (read/write)
- * - The user application writes to this field to inform the vendor driver
- * about the device state to be transitioned to.
- * - The vendor driver should take the necessary actions to change the
- * device state. After successful transition to a given state, the
- * vendor driver should return success on write(device_state, state)
- * system call. If the device state transition fails, the vendor driver
- * should return an appropriate -errno for the fault condition.
- * - On the user application side, if the device state transition fails,
- * that is, if write(device_state, state) returns an error, read
- * device_state again to determine the current state of the device from
- * the vendor driver.
- * - The vendor driver should return previous state of the device unless
- * the vendor driver has encountered an internal error, in which case
- * the vendor driver may report the device_state VFIO_DEVICE_STATE_ERROR.
- * - The user application must use the device reset ioctl to recover the
- * device from VFIO_DEVICE_STATE_ERROR state. If the device is
- * indicated to be in a valid device state by reading device_state, the
- * user application may attempt to transition the device to any valid
- * state reachable from the current state or terminate itself.
- *
- * device_state consists of 3 bits:
- * - If bit 0 is set, it indicates the _RUNNING state. If bit 0 is clear,
- * it indicates the _STOP state. When the device state is changed to
- * _STOP, driver should stop the device before write() returns.
- * - If bit 1 is set, it indicates the _SAVING state, which means that the
- * driver should start gathering device state information that will be
- * provided to the VFIO user application to save the device's state.
- * - If bit 2 is set, it indicates the _RESUMING state, which means that
- * the driver should prepare to resume the device. Data provided through
- * the migration region should be used to resume the device.
- * Bits 3 - 31 are reserved for future use. To preserve them, the user
- * application should perform a read-modify-write operation on this
- * field when modifying the specified bits.
- *
- * +------- _RESUMING
- * |+------ _SAVING
- * ||+----- _RUNNING
- * |||
- * 000b => Device Stopped, not saving or resuming
- * 001b => Device running, which is the default state
- * 010b => Stop the device & save the device state, stop-and-copy state
- * 011b => Device running and save the device state, pre-copy state
- * 100b => Device stopped and the device state is resuming
- * 101b => Invalid state
- * 110b => Error state
- * 111b => Invalid state
- *
- * State transitions:
- *
- * _RESUMING _RUNNING Pre-copy Stop-and-copy _STOP
- * (100b) (001b) (011b) (010b) (000b)
- * 0. Running or default state
- * |
- *
- * 1. Normal Shutdown (optional)
- * |------------------------------------->|
- *
- * 2. Save the state or suspend
- * |------------------------->|---------->|
- *
- * 3. Save the state during live migration
- * |----------->|------------>|---------->|
- *
- * 4. Resuming
- * |<---------|
- *
- * 5. Resumed
- * |--------->|
- *
- * 0. Default state of VFIO device is _RUNNING when the user application starts.
- * 1. During normal shutdown of the user application, the user application may
- * optionally change the VFIO device state from _RUNNING to _STOP. This
- * transition is optional. The vendor driver must support this transition but
- * must not require it.
- * 2. When the user application saves state or suspends the application, the
- * device state transitions from _RUNNING to stop-and-copy and then to _STOP.
- * On state transition from _RUNNING to stop-and-copy, driver must stop the
- * device, save the device state and send it to the application through the
- * migration region. The sequence to be followed for such transition is given
- * below.
- * 3. In live migration of user application, the state transitions from _RUNNING
- * to pre-copy, to stop-and-copy, and to _STOP.
- * On state transition from _RUNNING to pre-copy, the driver should start
- * gathering the device state while the application is still running and send
- * the device state data to application through the migration region.
- * On state transition from pre-copy to stop-and-copy, the driver must stop
- * the device, save the device state and send it to the user application
- * through the migration region.
- * Vendor drivers must support the pre-copy state even for implementations
- * where no data is provided to the user before the stop-and-copy state. The
- * user must not be required to consume all migration data before the device
- * transitions to a new state, including the stop-and-copy state.
- * The sequence to be followed for above two transitions is given below.
- * 4. To start the resuming phase, the device state should be transitioned from
- * the _RUNNING to the _RESUMING state.
- * In the _RESUMING state, the driver should use the device state data
- * received through the migration region to resume the device.
- * 5. After providing saved device data to the driver, the application should
- * change the state from _RESUMING to _RUNNING.
- *
- * reserved:
- * Reads on this field return zero and writes are ignored.
- *
- * pending_bytes: (read only)
- * The number of pending bytes still to be migrated from the vendor driver.
- *
- * data_offset: (read only)
- * The user application should read data_offset field from the migration
- * region. The user application should read the device data from this
- * offset within the migration region during the _SAVING state or write
- * the device data during the _RESUMING state. See below for details of
- * sequence to be followed.
- *
- * data_size: (read/write)
- * The user application should read data_size to get the size in bytes of
- * the data copied in the migration region during the _SAVING state and
- * write the size in bytes of the data copied in the migration region
- * during the _RESUMING state.
- *
- * The format of the migration region is as follows:
- * ------------------------------------------------------------------
- * |vfio_device_migration_info| data section |
- * | | /////////////////////////////// |
- * ------------------------------------------------------------------
- * ^ ^
- * offset 0-trapped part data_offset
- *
- * The structure vfio_device_migration_info is always followed by the data
- * section in the region, so data_offset will always be nonzero. The offset
- * from where the data is copied is decided by the kernel driver. The data
- * section can be trapped, mmapped, or partitioned, depending on how the kernel
- * driver defines the data section. The data section partition can be defined
- * as mapped by the sparse mmap capability. If mmapped, data_offset must be
- * page aligned, whereas initial section which contains the
- * vfio_device_migration_info structure, might not end at the offset, which is
- * page aligned. The user is not required to access through mmap regardless
- * of the capabilities of the region mmap.
- * The vendor driver should determine whether and how to partition the data
- * section. The vendor driver should return data_offset accordingly.
- *
- * The sequence to be followed while in pre-copy state and stop-and-copy state
- * is as follows:
- * a. Read pending_bytes, indicating the start of a new iteration to get device
- * data. Repeated read on pending_bytes at this stage should have no side
- * effects.
- * If pending_bytes == 0, the user application should not iterate to get data
- * for that device.
- * If pending_bytes > 0, perform the following steps.
- * b. Read data_offset, indicating that the vendor driver should make data
- * available through the data section. The vendor driver should return this
- * read operation only after data is available from (region + data_offset)
- * to (region + data_offset + data_size).
- * c. Read data_size, which is the amount of data in bytes available through
- * the migration region.
- * Read on data_offset and data_size should return the offset and size of
- * the current buffer if the user application reads data_offset and
- * data_size more than once here.
- * d. Read data_size bytes of data from (region + data_offset) from the
- * migration region.
- * e. Process the data.
- * f. Read pending_bytes, which indicates that the data from the previous
- * iteration has been read. If pending_bytes > 0, go to step b.
- *
- * The user application can transition from the _SAVING|_RUNNING
- * (pre-copy state) to the _SAVING (stop-and-copy) state regardless of the
- * number of pending bytes. The user application should iterate in _SAVING
- * (stop-and-copy) until pending_bytes is 0.
- *
- * The sequence to be followed while _RESUMING device state is as follows:
- * While data for this device is available, repeat the following steps:
- * a. Read data_offset from where the user application should write data.
- * b. Write migration data starting at the migration region + data_offset for
- * the length determined by data_size from the migration source.
- * c. Write data_size, which indicates to the vendor driver that data is
- * written in the migration region. Vendor driver must return this write
- * operations on consuming data. Vendor driver should apply the
- * user-provided migration region data to the device resume state.
- *
- * If an error occurs during the above sequences, the vendor driver can return
- * an error code for next read() or write() operation, which will terminate the
- * loop. The user application should then take the next necessary action, for
- * example, failing migration or terminating the user application.
- *
- * For the user application, data is opaque. The user application should write
- * data in the same order as the data is received and the data should be of
- * same transaction size at the source.
- */
+#define VFIO_REGION_SUBTYPE_MIGRATION_DEPRECATED (1)
struct vfio_device_migration_info {
__u32 device_state; /* VFIO device state */
-#define VFIO_DEVICE_STATE_STOP (0)
-#define VFIO_DEVICE_STATE_RUNNING (1 << 0)
-#define VFIO_DEVICE_STATE_SAVING (1 << 1)
-#define VFIO_DEVICE_STATE_RESUMING (1 << 2)
-#define VFIO_DEVICE_STATE_MASK (VFIO_DEVICE_STATE_RUNNING | \
- VFIO_DEVICE_STATE_SAVING | \
- VFIO_DEVICE_STATE_RESUMING)
+#define VFIO_DEVICE_STATE_V1_STOP (0)
+#define VFIO_DEVICE_STATE_V1_RUNNING (1 << 0)
+#define VFIO_DEVICE_STATE_V1_SAVING (1 << 1)
+#define VFIO_DEVICE_STATE_V1_RESUMING (1 << 2)
+#define VFIO_DEVICE_STATE_MASK (VFIO_DEVICE_STATE_V1_RUNNING | \
+ VFIO_DEVICE_STATE_V1_SAVING | \
+ VFIO_DEVICE_STATE_V1_RESUMING)
#define VFIO_DEVICE_STATE_VALID(state) \
- (state & VFIO_DEVICE_STATE_RESUMING ? \
- (state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_RESUMING : 1)
+ (state & VFIO_DEVICE_STATE_V1_RESUMING ? \
+ (state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_V1_RESUMING : 1)
#define VFIO_DEVICE_STATE_IS_ERROR(state) \
- ((state & VFIO_DEVICE_STATE_MASK) == (VFIO_DEVICE_STATE_SAVING | \
- VFIO_DEVICE_STATE_RESUMING))
+ ((state & VFIO_DEVICE_STATE_MASK) == (VFIO_DEVICE_STATE_V1_SAVING | \
+ VFIO_DEVICE_STATE_V1_RESUMING))
#define VFIO_DEVICE_STATE_SET_ERROR(state) \
- ((state & ~VFIO_DEVICE_STATE_MASK) | VFIO_DEVICE_SATE_SAVING | \
- VFIO_DEVICE_STATE_RESUMING)
+ ((state & ~VFIO_DEVICE_STATE_MASK) | VFIO_DEVICE_STATE_V1_SAVING | \
+ VFIO_DEVICE_STATE_V1_RESUMING)
__u32 reserved;
__u64 pending_bytes;
@@ -1002,6 +806,186 @@ struct vfio_device_feature {
*/
#define VFIO_DEVICE_FEATURE_PCI_VF_TOKEN (0)
+/*
+ * Indicates the device can support the migration API through
+ * VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE. If this GET succeeds, the RUNNING and
+ * ERROR states are always supported. Support for additional states is
+ * indicated via the flags field; at least VFIO_MIGRATION_STOP_COPY must be
+ * set.
+ *
+ * VFIO_MIGRATION_STOP_COPY means that STOP, STOP_COPY and
+ * RESUMING are supported.
+ *
+ * VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P means that RUNNING_P2P
+ * is supported in addition to the STOP_COPY states.
+ *
+ * Other combinations of flags have behavior to be defined in the future.
+ */
+struct vfio_device_feature_migration {
+ __aligned_u64 flags;
+#define VFIO_MIGRATION_STOP_COPY (1 << 0)
+#define VFIO_MIGRATION_P2P (1 << 1)
+};
+#define VFIO_DEVICE_FEATURE_MIGRATION 1
+
+/*
+ * Upon VFIO_DEVICE_FEATURE_SET, execute a migration state change on the VFIO
+ * device. The new state is supplied in device_state, see enum
+ * vfio_device_mig_state for details
+ *
+ * The kernel migration driver must fully transition the device to the new state
+ * value before the operation returns to the user.
+ *
+ * The kernel migration driver must not generate asynchronous device state
+ * transitions outside of manipulation by the user or the VFIO_DEVICE_RESET
+ * ioctl as described above.
+ *
+ * If this function fails then current device_state may be the original
+ * operating state or some other state along the combination transition path.
+ * The user can then decide if it should execute a VFIO_DEVICE_RESET, attempt
+ * to return to the original state, or attempt to return to some other state
+ * such as RUNNING or STOP.
+ *
+ * If the new_state starts a new data transfer session then the FD associated
+ * with that session is returned in data_fd. The user is responsible to close
+ * this FD when it is finished. The user must consider the migration data stream
+ * carried over the FD to be opaque and must preserve the byte order of the
+ * stream. The user is not required to preserve buffer segmentation when writing
+ * the data stream during the RESUMING operation.
+ *
+ * Upon VFIO_DEVICE_FEATURE_GET, get the current migration state of the VFIO
+ * device, data_fd will be -1.
+ */
+struct vfio_device_feature_mig_state {
+ __u32 device_state; /* From enum vfio_device_mig_state */
+ __s32 data_fd;
+};
+#define VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE 2
+
+/*
+ * The device migration Finite State Machine is described by the enum
+ * vfio_device_mig_state. Some of the FSM arcs will create a migration data
+ * transfer session by returning a FD, in this case the migration data will
+ * flow over the FD using read() and write() as discussed below.
+ *
+ * There are 5 states to support VFIO_MIGRATION_STOP_COPY:
+ * RUNNING - The device is running normally
+ * STOP - The device does not change the internal or external state
+ * STOP_COPY - The device internal state can be read out
+ * RESUMING - The device is stopped and is loading a new internal state
+ * ERROR - The device has failed and must be reset
+ *
+ * And 1 optional state to support VFIO_MIGRATION_P2P:
+ * RUNNING_P2P - RUNNING, except the device cannot do peer to peer DMA
+ *
+ * The FSM takes actions on the arcs between FSM states. The driver implements
+ * the following behavior for the FSM arcs:
+ *
+ * RUNNING_P2P -> STOP
+ * STOP_COPY -> STOP
+ * While in STOP the device must stop the operation of the device. The device
+ * must not generate interrupts, DMA, or any other change to external state.
+ * It must not change its internal state. When stopped the device and kernel
+ * migration driver must accept and respond to interaction to support external
+ * subsystems in the STOP state, for example PCI MSI-X and PCI config space.
+ * Failure by the user to restrict device access while in STOP must not result
+ * in error conditions outside the user context (ex. host system faults).
+ *
+ * The STOP_COPY arc will terminate a data transfer session.
+ *
+ * RESUMING -> STOP
+ * Leaving RESUMING terminates a data transfer session and indicates the
+ * device should complete processing of the data delivered by write(). The
+ * kernel migration driver should complete the incorporation of data written
+ * to the data transfer FD into the device internal state and perform
+ * final validity and consistency checking of the new device state. If the
+ * user provided data is found to be incomplete, inconsistent, or otherwise
+ * invalid, the migration driver must fail the SET_STATE ioctl and
+ * optionally go to the ERROR state as described below.
+ *
+ * While in STOP the device has the same behavior as other STOP states
+ * described above.
+ *
+ * To abort a RESUMING session the device must be reset.
+ *
+ * RUNNING_P2P -> RUNNING
+ * While in RUNNING the device is fully operational, the device may generate
+ * interrupts, DMA, respond to MMIO, all vfio device regions are functional,
+ * and the device may advance its internal state.
+ *
+ * RUNNING -> RUNNING_P2P
+ * STOP -> RUNNING_P2P
+ * While in RUNNING_P2P the device is partially running in the P2P quiescent
+ * state defined below.
+ *
+ * STOP -> STOP_COPY
+ * This arc begin the process of saving the device state and will return a
+ * new data_fd.
+ *
+ * While in the STOP_COPY state the device has the same behavior as STOP
+ * with the addition that the data transfers session continues to stream the
+ * migration state. End of stream on the FD indicates the entire device
+ * state has been transferred.
+ *
+ * The user should take steps to restrict access to vfio device regions while
+ * the device is in STOP_COPY or risk corruption of the device migration data
+ * stream.
+ *
+ * STOP -> RESUMING
+ * Entering the RESUMING state starts a process of restoring the device state
+ * and will return a new data_fd. The data stream fed into the data_fd should
+ * be taken from the data transfer output of a single FD during saving from
+ * a compatible device. The migration driver may alter/reset the internal
+ * device state for this arc if required to prepare the device to receive the
+ * migration data.
+ *
+ * any -> ERROR
+ * ERROR cannot be specified as a device state, however any transition request
+ * can be failed with an errno return and may then move the device_state into
+ * ERROR. In this case the device was unable to execute the requested arc and
+ * was also unable to restore the device to any valid device_state.
+ * To recover from ERROR VFIO_DEVICE_RESET must be used to return the
+ * device_state back to RUNNING.
+ *
+ * The optional peer to peer (P2P) quiescent state is intended to be a quiescent
+ * state for the device for the purposes of managing multiple devices within a
+ * user context where peer-to-peer DMA between devices may be active. The
+ * RUNNING_P2P states must prevent the device from initiating
+ * any new P2P DMA transactions. If the device can identify P2P transactions
+ * then it can stop only P2P DMA, otherwise it must stop all DMA. The migration
+ * driver must complete any such outstanding operations prior to completing the
+ * FSM arc into a P2P state. For the purpose of specification the states
+ * behave as though the device was fully running if not supported. Like while in
+ * STOP or STOP_COPY the user must not touch the device, otherwise the state
+ * can be exited.
+ *
+ * The remaining possible transitions are interpreted as combinations of the
+ * above FSM arcs. As there are multiple paths through the FSM arcs the path
+ * should be selected based on the following rules:
+ * - Select the shortest path.
+ * Refer to vfio_mig_get_next_state() for the result of the algorithm.
+ *
+ * The automatic transit through the FSM arcs that make up the combination
+ * transition is invisible to the user. When working with combination arcs the
+ * user may see any step along the path in the device_state if SET_STATE
+ * fails. When handling these types of errors users should anticipate future
+ * revisions of this protocol using new states and those states becoming
+ * visible in this case.
+ *
+ * The optional states cannot be used with SET_STATE if the device does not
+ * support them. The user can discover if these states are supported by using
+ * VFIO_DEVICE_FEATURE_MIGRATION. By using combination transitions the user can
+ * avoid knowing about these optional states if the kernel driver supports them.
+ */
+enum vfio_device_mig_state {
+ VFIO_DEVICE_STATE_ERROR = 0,
+ VFIO_DEVICE_STATE_STOP = 1,
+ VFIO_DEVICE_STATE_RUNNING = 2,
+ VFIO_DEVICE_STATE_STOP_COPY = 3,
+ VFIO_DEVICE_STATE_RESUMING = 4,
+ VFIO_DEVICE_STATE_RUNNING_P2P = 5,
+};
+
/* -------- API for Type1 VFIO IOMMU -------- */
/**