summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
Diffstat (limited to 'include')
-rw-r--r--include/linux/mlx5/driver.h3
-rw-r--r--include/linux/mlx5/mlx5_ifc.h147
-rw-r--r--include/linux/pci.h15
-rw-r--r--include/linux/vfio.h53
-rw-r--r--include/linux/vfio_pci_core.h4
-rw-r--r--include/uapi/linux/vfio.h406
6 files changed, 415 insertions, 213 deletions
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 78655d8d13a7..319322a8ff94 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1143,6 +1143,9 @@ int mlx5_dm_sw_icm_alloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type,
int mlx5_dm_sw_icm_dealloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type,
u64 length, u16 uid, phys_addr_t addr, u32 obj_id);
+struct mlx5_core_dev *mlx5_vf_get_core_dev(struct pci_dev *pdev);
+void mlx5_vf_put_core_dev(struct mlx5_core_dev *mdev);
+
#ifdef CONFIG_MLX5_CORE_IPOIB
struct net_device *mlx5_rdma_netdev_alloc(struct mlx5_core_dev *mdev,
struct ib_device *ibdev,
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 598ac3bcc901..b1c27409c997 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -127,6 +127,11 @@ enum {
MLX5_CMD_OP_QUERY_SF_PARTITION = 0x111,
MLX5_CMD_OP_ALLOC_SF = 0x113,
MLX5_CMD_OP_DEALLOC_SF = 0x114,
+ MLX5_CMD_OP_SUSPEND_VHCA = 0x115,
+ MLX5_CMD_OP_RESUME_VHCA = 0x116,
+ MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE = 0x117,
+ MLX5_CMD_OP_SAVE_VHCA_STATE = 0x118,
+ MLX5_CMD_OP_LOAD_VHCA_STATE = 0x119,
MLX5_CMD_OP_CREATE_MKEY = 0x200,
MLX5_CMD_OP_QUERY_MKEY = 0x201,
MLX5_CMD_OP_DESTROY_MKEY = 0x202,
@@ -1757,7 +1762,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
u8 reserved_at_682[0x1];
u8 log_max_sf[0x5];
u8 apu[0x1];
- u8 reserved_at_689[0x7];
+ u8 reserved_at_689[0x4];
+ u8 migration[0x1];
+ u8 reserved_at_68e[0x2];
u8 log_min_sf_size[0x8];
u8 max_num_sf_partitions[0x8];
@@ -11519,4 +11526,142 @@ enum {
MLX5_MTT_PERM_RW = MLX5_MTT_PERM_READ | MLX5_MTT_PERM_WRITE,
};
+enum {
+ MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR = 0x0,
+ MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER = 0x1,
+};
+
+struct mlx5_ifc_suspend_vhca_in_bits {
+ u8 opcode[0x10];
+ u8 uid[0x10];
+
+ u8 reserved_at_20[0x10];
+ u8 op_mod[0x10];
+
+ u8 reserved_at_40[0x10];
+ u8 vhca_id[0x10];
+
+ u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_suspend_vhca_out_bits {
+ u8 status[0x8];
+ u8 reserved_at_8[0x18];
+
+ u8 syndrome[0x20];
+
+ u8 reserved_at_40[0x40];
+};
+
+enum {
+ MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER = 0x0,
+ MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR = 0x1,
+};
+
+struct mlx5_ifc_resume_vhca_in_bits {
+ u8 opcode[0x10];
+ u8 uid[0x10];
+
+ u8 reserved_at_20[0x10];
+ u8 op_mod[0x10];
+
+ u8 reserved_at_40[0x10];
+ u8 vhca_id[0x10];
+
+ u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_resume_vhca_out_bits {
+ u8 status[0x8];
+ u8 reserved_at_8[0x18];
+
+ u8 syndrome[0x20];
+
+ u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_query_vhca_migration_state_in_bits {
+ u8 opcode[0x10];
+ u8 uid[0x10];
+
+ u8 reserved_at_20[0x10];
+ u8 op_mod[0x10];
+
+ u8 reserved_at_40[0x10];
+ u8 vhca_id[0x10];
+
+ u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_query_vhca_migration_state_out_bits {
+ u8 status[0x8];
+ u8 reserved_at_8[0x18];
+
+ u8 syndrome[0x20];
+
+ u8 reserved_at_40[0x40];
+
+ u8 required_umem_size[0x20];
+
+ u8 reserved_at_a0[0x160];
+};
+
+struct mlx5_ifc_save_vhca_state_in_bits {
+ u8 opcode[0x10];
+ u8 uid[0x10];
+
+ u8 reserved_at_20[0x10];
+ u8 op_mod[0x10];
+
+ u8 reserved_at_40[0x10];
+ u8 vhca_id[0x10];
+
+ u8 reserved_at_60[0x20];
+
+ u8 va[0x40];
+
+ u8 mkey[0x20];
+
+ u8 size[0x20];
+};
+
+struct mlx5_ifc_save_vhca_state_out_bits {
+ u8 status[0x8];
+ u8 reserved_at_8[0x18];
+
+ u8 syndrome[0x20];
+
+ u8 actual_image_size[0x20];
+
+ u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_load_vhca_state_in_bits {
+ u8 opcode[0x10];
+ u8 uid[0x10];
+
+ u8 reserved_at_20[0x10];
+ u8 op_mod[0x10];
+
+ u8 reserved_at_40[0x10];
+ u8 vhca_id[0x10];
+
+ u8 reserved_at_60[0x20];
+
+ u8 va[0x40];
+
+ u8 mkey[0x20];
+
+ u8 size[0x20];
+};
+
+struct mlx5_ifc_load_vhca_state_out_bits {
+ u8 status[0x8];
+ u8 reserved_at_8[0x18];
+
+ u8 syndrome[0x20];
+
+ u8 reserved_at_40[0x40];
+};
+
#endif /* MLX5_IFC_H */
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 8253a5413d7c..60d423d8f0c4 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -2166,7 +2166,8 @@ void __iomem *pci_ioremap_wc_bar(struct pci_dev *pdev, int bar);
#ifdef CONFIG_PCI_IOV
int pci_iov_virtfn_bus(struct pci_dev *dev, int id);
int pci_iov_virtfn_devfn(struct pci_dev *dev, int id);
-
+int pci_iov_vf_id(struct pci_dev *dev);
+void *pci_iov_get_pf_drvdata(struct pci_dev *dev, struct pci_driver *pf_driver);
int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn);
void pci_disable_sriov(struct pci_dev *dev);
@@ -2194,6 +2195,18 @@ static inline int pci_iov_virtfn_devfn(struct pci_dev *dev, int id)
{
return -ENOSYS;
}
+
+static inline int pci_iov_vf_id(struct pci_dev *dev)
+{
+ return -ENOSYS;
+}
+
+static inline void *pci_iov_get_pf_drvdata(struct pci_dev *dev,
+ struct pci_driver *pf_driver)
+{
+ return ERR_PTR(-EINVAL);
+}
+
static inline int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn)
{ return -ENODEV; }
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 76191d7abed1..66dda06ec42d 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -33,6 +33,7 @@ struct vfio_device {
struct vfio_group *group;
struct vfio_device_set *dev_set;
struct list_head dev_set_list;
+ unsigned int migration_flags;
/* Members below here are private, not for driver use */
refcount_t refcount;
@@ -55,6 +56,17 @@ struct vfio_device {
* @match: Optional device name match callback (return: 0 for no-match, >0 for
* match, -errno for abort (ex. match with insufficient or incorrect
* additional args)
+ * @device_feature: Optional, fill in the VFIO_DEVICE_FEATURE ioctl
+ * @migration_set_state: Optional callback to change the migration state for
+ * devices that support migration. It's mandatory for
+ * VFIO_DEVICE_FEATURE_MIGRATION migration support.
+ * The returned FD is used for data transfer according to the FSM
+ * definition. The driver is responsible to ensure that FD reaches end
+ * of stream or error whenever the migration FSM leaves a data transfer
+ * state or before close_device() returns.
+ * @migration_get_state: Optional callback to get the migration state for
+ * devices that support migration. It's mandatory for
+ * VFIO_DEVICE_FEATURE_MIGRATION migration support.
*/
struct vfio_device_ops {
char *name;
@@ -69,8 +81,44 @@ struct vfio_device_ops {
int (*mmap)(struct vfio_device *vdev, struct vm_area_struct *vma);
void (*request)(struct vfio_device *vdev, unsigned int count);
int (*match)(struct vfio_device *vdev, char *buf);
+ int (*device_feature)(struct vfio_device *device, u32 flags,
+ void __user *arg, size_t argsz);
+ struct file *(*migration_set_state)(
+ struct vfio_device *device,
+ enum vfio_device_mig_state new_state);
+ int (*migration_get_state)(struct vfio_device *device,
+ enum vfio_device_mig_state *curr_state);
};
+/**
+ * vfio_check_feature - Validate user input for the VFIO_DEVICE_FEATURE ioctl
+ * @flags: Arg from the device_feature op
+ * @argsz: Arg from the device_feature op
+ * @supported_ops: Combination of VFIO_DEVICE_FEATURE_GET and SET the driver
+ * supports
+ * @minsz: Minimum data size the driver accepts
+ *
+ * For use in a driver's device_feature op. Checks that the inputs to the
+ * VFIO_DEVICE_FEATURE ioctl are correct for the driver's feature. Returns 1 if
+ * the driver should execute the get or set, otherwise the relevant
+ * value should be returned.
+ */
+static inline int vfio_check_feature(u32 flags, size_t argsz, u32 supported_ops,
+ size_t minsz)
+{
+ if ((flags & (VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_SET)) &
+ ~supported_ops)
+ return -EINVAL;
+ if (flags & VFIO_DEVICE_FEATURE_PROBE)
+ return 0;
+ /* Without PROBE one of GET or SET must be requested */
+ if (!(flags & (VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_SET)))
+ return -EINVAL;
+ if (argsz < minsz)
+ return -EINVAL;
+ return 1;
+}
+
void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
const struct vfio_device_ops *ops);
void vfio_uninit_group_dev(struct vfio_device *device);
@@ -82,6 +130,11 @@ extern void vfio_device_put(struct vfio_device *device);
int vfio_assign_device_set(struct vfio_device *device, void *set_id);
+int vfio_mig_get_next_state(struct vfio_device *device,
+ enum vfio_device_mig_state cur_fsm,
+ enum vfio_device_mig_state new_fsm,
+ enum vfio_device_mig_state *next_fsm);
+
/*
* External user API
*/
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index ef9a44b6cf5d..9f1bf8e49d43 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -220,6 +220,8 @@ int vfio_pci_core_sriov_configure(struct pci_dev *pdev, int nr_virtfn);
extern const struct pci_error_handlers vfio_pci_core_err_handlers;
long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
unsigned long arg);
+int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
+ void __user *arg, size_t argsz);
ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf,
size_t count, loff_t *ppos);
ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf,
@@ -230,6 +232,8 @@ int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf);
int vfio_pci_core_enable(struct vfio_pci_core_device *vdev);
void vfio_pci_core_disable(struct vfio_pci_core_device *vdev);
void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev);
+pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev,
+ pci_channel_state_t state);
static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
{
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index ef33ea002b0b..fea86061b44e 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -323,7 +323,7 @@ struct vfio_region_info_cap_type {
#define VFIO_REGION_TYPE_PCI_VENDOR_MASK (0xffff)
#define VFIO_REGION_TYPE_GFX (1)
#define VFIO_REGION_TYPE_CCW (2)
-#define VFIO_REGION_TYPE_MIGRATION (3)
+#define VFIO_REGION_TYPE_MIGRATION_DEPRECATED (3)
/* sub-types for VFIO_REGION_TYPE_PCI_* */
@@ -405,225 +405,29 @@ struct vfio_region_gfx_edid {
#define VFIO_REGION_SUBTYPE_CCW_CRW (3)
/* sub-types for VFIO_REGION_TYPE_MIGRATION */
-#define VFIO_REGION_SUBTYPE_MIGRATION (1)
-
-/*
- * The structure vfio_device_migration_info is placed at the 0th offset of
- * the VFIO_REGION_SUBTYPE_MIGRATION region to get and set VFIO device related
- * migration information. Field accesses from this structure are only supported
- * at their native width and alignment. Otherwise, the result is undefined and
- * vendor drivers should return an error.
- *
- * device_state: (read/write)
- * - The user application writes to this field to inform the vendor driver
- * about the device state to be transitioned to.
- * - The vendor driver should take the necessary actions to change the
- * device state. After successful transition to a given state, the
- * vendor driver should return success on write(device_state, state)
- * system call. If the device state transition fails, the vendor driver
- * should return an appropriate -errno for the fault condition.
- * - On the user application side, if the device state transition fails,
- * that is, if write(device_state, state) returns an error, read
- * device_state again to determine the current state of the device from
- * the vendor driver.
- * - The vendor driver should return previous state of the device unless
- * the vendor driver has encountered an internal error, in which case
- * the vendor driver may report the device_state VFIO_DEVICE_STATE_ERROR.
- * - The user application must use the device reset ioctl to recover the
- * device from VFIO_DEVICE_STATE_ERROR state. If the device is
- * indicated to be in a valid device state by reading device_state, the
- * user application may attempt to transition the device to any valid
- * state reachable from the current state or terminate itself.
- *
- * device_state consists of 3 bits:
- * - If bit 0 is set, it indicates the _RUNNING state. If bit 0 is clear,
- * it indicates the _STOP state. When the device state is changed to
- * _STOP, driver should stop the device before write() returns.
- * - If bit 1 is set, it indicates the _SAVING state, which means that the
- * driver should start gathering device state information that will be
- * provided to the VFIO user application to save the device's state.
- * - If bit 2 is set, it indicates the _RESUMING state, which means that
- * the driver should prepare to resume the device. Data provided through
- * the migration region should be used to resume the device.
- * Bits 3 - 31 are reserved for future use. To preserve them, the user
- * application should perform a read-modify-write operation on this
- * field when modifying the specified bits.
- *
- * +------- _RESUMING
- * |+------ _SAVING
- * ||+----- _RUNNING
- * |||
- * 000b => Device Stopped, not saving or resuming
- * 001b => Device running, which is the default state
- * 010b => Stop the device & save the device state, stop-and-copy state
- * 011b => Device running and save the device state, pre-copy state
- * 100b => Device stopped and the device state is resuming
- * 101b => Invalid state
- * 110b => Error state
- * 111b => Invalid state
- *
- * State transitions:
- *
- * _RESUMING _RUNNING Pre-copy Stop-and-copy _STOP
- * (100b) (001b) (011b) (010b) (000b)
- * 0. Running or default state
- * |
- *
- * 1. Normal Shutdown (optional)
- * |------------------------------------->|
- *
- * 2. Save the state or suspend
- * |------------------------->|---------->|
- *
- * 3. Save the state during live migration
- * |----------->|------------>|---------->|
- *
- * 4. Resuming
- * |<---------|
- *
- * 5. Resumed
- * |--------->|
- *
- * 0. Default state of VFIO device is _RUNNING when the user application starts.
- * 1. During normal shutdown of the user application, the user application may
- * optionally change the VFIO device state from _RUNNING to _STOP. This
- * transition is optional. The vendor driver must support this transition but
- * must not require it.
- * 2. When the user application saves state or suspends the application, the
- * device state transitions from _RUNNING to stop-and-copy and then to _STOP.
- * On state transition from _RUNNING to stop-and-copy, driver must stop the
- * device, save the device state and send it to the application through the
- * migration region. The sequence to be followed for such transition is given
- * below.
- * 3. In live migration of user application, the state transitions from _RUNNING
- * to pre-copy, to stop-and-copy, and to _STOP.
- * On state transition from _RUNNING to pre-copy, the driver should start
- * gathering the device state while the application is still running and send
- * the device state data to application through the migration region.
- * On state transition from pre-copy to stop-and-copy, the driver must stop
- * the device, save the device state and send it to the user application
- * through the migration region.
- * Vendor drivers must support the pre-copy state even for implementations
- * where no data is provided to the user before the stop-and-copy state. The
- * user must not be required to consume all migration data before the device
- * transitions to a new state, including the stop-and-copy state.
- * The sequence to be followed for above two transitions is given below.
- * 4. To start the resuming phase, the device state should be transitioned from
- * the _RUNNING to the _RESUMING state.
- * In the _RESUMING state, the driver should use the device state data
- * received through the migration region to resume the device.
- * 5. After providing saved device data to the driver, the application should
- * change the state from _RESUMING to _RUNNING.
- *
- * reserved:
- * Reads on this field return zero and writes are ignored.
- *
- * pending_bytes: (read only)
- * The number of pending bytes still to be migrated from the vendor driver.
- *
- * data_offset: (read only)
- * The user application should read data_offset field from the migration
- * region. The user application should read the device data from this
- * offset within the migration region during the _SAVING state or write
- * the device data during the _RESUMING state. See below for details of
- * sequence to be followed.
- *
- * data_size: (read/write)
- * The user application should read data_size to get the size in bytes of
- * the data copied in the migration region during the _SAVING state and
- * write the size in bytes of the data copied in the migration region
- * during the _RESUMING state.
- *
- * The format of the migration region is as follows:
- * ------------------------------------------------------------------
- * |vfio_device_migration_info| data section |
- * | | /////////////////////////////// |
- * ------------------------------------------------------------------
- * ^ ^
- * offset 0-trapped part data_offset
- *
- * The structure vfio_device_migration_info is always followed by the data
- * section in the region, so data_offset will always be nonzero. The offset
- * from where the data is copied is decided by the kernel driver. The data
- * section can be trapped, mmapped, or partitioned, depending on how the kernel
- * driver defines the data section. The data section partition can be defined
- * as mapped by the sparse mmap capability. If mmapped, data_offset must be
- * page aligned, whereas initial section which contains the
- * vfio_device_migration_info structure, might not end at the offset, which is
- * page aligned. The user is not required to access through mmap regardless
- * of the capabilities of the region mmap.
- * The vendor driver should determine whether and how to partition the data
- * section. The vendor driver should return data_offset accordingly.
- *
- * The sequence to be followed while in pre-copy state and stop-and-copy state
- * is as follows:
- * a. Read pending_bytes, indicating the start of a new iteration to get device
- * data. Repeated read on pending_bytes at this stage should have no side
- * effects.
- * If pending_bytes == 0, the user application should not iterate to get data
- * for that device.
- * If pending_bytes > 0, perform the following steps.
- * b. Read data_offset, indicating that the vendor driver should make data
- * available through the data section. The vendor driver should return this
- * read operation only after data is available from (region + data_offset)
- * to (region + data_offset + data_size).
- * c. Read data_size, which is the amount of data in bytes available through
- * the migration region.
- * Read on data_offset and data_size should return the offset and size of
- * the current buffer if the user application reads data_offset and
- * data_size more than once here.
- * d. Read data_size bytes of data from (region + data_offset) from the
- * migration region.
- * e. Process the data.
- * f. Read pending_bytes, which indicates that the data from the previous
- * iteration has been read. If pending_bytes > 0, go to step b.
- *
- * The user application can transition from the _SAVING|_RUNNING
- * (pre-copy state) to the _SAVING (stop-and-copy) state regardless of the
- * number of pending bytes. The user application should iterate in _SAVING
- * (stop-and-copy) until pending_bytes is 0.
- *
- * The sequence to be followed while _RESUMING device state is as follows:
- * While data for this device is available, repeat the following steps:
- * a. Read data_offset from where the user application should write data.
- * b. Write migration data starting at the migration region + data_offset for
- * the length determined by data_size from the migration source.
- * c. Write data_size, which indicates to the vendor driver that data is
- * written in the migration region. Vendor driver must return this write
- * operations on consuming data. Vendor driver should apply the
- * user-provided migration region data to the device resume state.
- *
- * If an error occurs during the above sequences, the vendor driver can return
- * an error code for next read() or write() operation, which will terminate the
- * loop. The user application should then take the next necessary action, for
- * example, failing migration or terminating the user application.
- *
- * For the user application, data is opaque. The user application should write
- * data in the same order as the data is received and the data should be of
- * same transaction size at the source.
- */
+#define VFIO_REGION_SUBTYPE_MIGRATION_DEPRECATED (1)
struct vfio_device_migration_info {
__u32 device_state; /* VFIO device state */
-#define VFIO_DEVICE_STATE_STOP (0)
-#define VFIO_DEVICE_STATE_RUNNING (1 << 0)
-#define VFIO_DEVICE_STATE_SAVING (1 << 1)
-#define VFIO_DEVICE_STATE_RESUMING (1 << 2)
-#define VFIO_DEVICE_STATE_MASK (VFIO_DEVICE_STATE_RUNNING | \
- VFIO_DEVICE_STATE_SAVING | \
- VFIO_DEVICE_STATE_RESUMING)
+#define VFIO_DEVICE_STATE_V1_STOP (0)
+#define VFIO_DEVICE_STATE_V1_RUNNING (1 << 0)
+#define VFIO_DEVICE_STATE_V1_SAVING (1 << 1)
+#define VFIO_DEVICE_STATE_V1_RESUMING (1 << 2)
+#define VFIO_DEVICE_STATE_MASK (VFIO_DEVICE_STATE_V1_RUNNING | \
+ VFIO_DEVICE_STATE_V1_SAVING | \
+ VFIO_DEVICE_STATE_V1_RESUMING)
#define VFIO_DEVICE_STATE_VALID(state) \
- (state & VFIO_DEVICE_STATE_RESUMING ? \
- (state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_RESUMING : 1)
+ (state & VFIO_DEVICE_STATE_V1_RESUMING ? \
+ (state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_V1_RESUMING : 1)
#define VFIO_DEVICE_STATE_IS_ERROR(state) \
- ((state & VFIO_DEVICE_STATE_MASK) == (VFIO_DEVICE_STATE_SAVING | \
- VFIO_DEVICE_STATE_RESUMING))
+ ((state & VFIO_DEVICE_STATE_MASK) == (VFIO_DEVICE_STATE_V1_SAVING | \
+ VFIO_DEVICE_STATE_V1_RESUMING))
#define VFIO_DEVICE_STATE_SET_ERROR(state) \
- ((state & ~VFIO_DEVICE_STATE_MASK) | VFIO_DEVICE_SATE_SAVING | \
- VFIO_DEVICE_STATE_RESUMING)
+ ((state & ~VFIO_DEVICE_STATE_MASK) | VFIO_DEVICE_STATE_V1_SAVING | \
+ VFIO_DEVICE_STATE_V1_RESUMING)
__u32 reserved;
__u64 pending_bytes;
@@ -1002,6 +806,186 @@ struct vfio_device_feature {
*/
#define VFIO_DEVICE_FEATURE_PCI_VF_TOKEN (0)
+/*
+ * Indicates the device can support the migration API through
+ * VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE. If this GET succeeds, the RUNNING and
+ * ERROR states are always supported. Support for additional states is
+ * indicated via the flags field; at least VFIO_MIGRATION_STOP_COPY must be
+ * set.
+ *
+ * VFIO_MIGRATION_STOP_COPY means that STOP, STOP_COPY and
+ * RESUMING are supported.
+ *
+ * VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P means that RUNNING_P2P
+ * is supported in addition to the STOP_COPY states.
+ *
+ * Other combinations of flags have behavior to be defined in the future.
+ */
+struct vfio_device_feature_migration {
+ __aligned_u64 flags;
+#define VFIO_MIGRATION_STOP_COPY (1 << 0)
+#define VFIO_MIGRATION_P2P (1 << 1)
+};
+#define VFIO_DEVICE_FEATURE_MIGRATION 1
+
+/*
+ * Upon VFIO_DEVICE_FEATURE_SET, execute a migration state change on the VFIO
+ * device. The new state is supplied in device_state, see enum
+ * vfio_device_mig_state for details
+ *
+ * The kernel migration driver must fully transition the device to the new state
+ * value before the operation returns to the user.
+ *
+ * The kernel migration driver must not generate asynchronous device state
+ * transitions outside of manipulation by the user or the VFIO_DEVICE_RESET
+ * ioctl as described above.
+ *
+ * If this function fails then current device_state may be the original
+ * operating state or some other state along the combination transition path.
+ * The user can then decide if it should execute a VFIO_DEVICE_RESET, attempt
+ * to return to the original state, or attempt to return to some other state
+ * such as RUNNING or STOP.
+ *
+ * If the new_state starts a new data transfer session then the FD associated
+ * with that session is returned in data_fd. The user is responsible to close
+ * this FD when it is finished. The user must consider the migration data stream
+ * carried over the FD to be opaque and must preserve the byte order of the
+ * stream. The user is not required to preserve buffer segmentation when writing
+ * the data stream during the RESUMING operation.
+ *
+ * Upon VFIO_DEVICE_FEATURE_GET, get the current migration state of the VFIO
+ * device, data_fd will be -1.
+ */
+struct vfio_device_feature_mig_state {
+ __u32 device_state; /* From enum vfio_device_mig_state */
+ __s32 data_fd;
+};
+#define VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE 2
+
+/*
+ * The device migration Finite State Machine is described by the enum
+ * vfio_device_mig_state. Some of the FSM arcs will create a migration data
+ * transfer session by returning a FD, in this case the migration data will
+ * flow over the FD using read() and write() as discussed below.
+ *
+ * There are 5 states to support VFIO_MIGRATION_STOP_COPY:
+ * RUNNING - The device is running normally
+ * STOP - The device does not change the internal or external state
+ * STOP_COPY - The device internal state can be read out
+ * RESUMING - The device is stopped and is loading a new internal state
+ * ERROR - The device has failed and must be reset
+ *
+ * And 1 optional state to support VFIO_MIGRATION_P2P:
+ * RUNNING_P2P - RUNNING, except the device cannot do peer to peer DMA
+ *
+ * The FSM takes actions on the arcs between FSM states. The driver implements
+ * the following behavior for the FSM arcs:
+ *
+ * RUNNING_P2P -> STOP
+ * STOP_COPY -> STOP
+ * While in STOP the device must stop the operation of the device. The device
+ * must not generate interrupts, DMA, or any other change to external state.
+ * It must not change its internal state. When stopped the device and kernel
+ * migration driver must accept and respond to interaction to support external
+ * subsystems in the STOP state, for example PCI MSI-X and PCI config space.
+ * Failure by the user to restrict device access while in STOP must not result
+ * in error conditions outside the user context (ex. host system faults).
+ *
+ * The STOP_COPY arc will terminate a data transfer session.
+ *
+ * RESUMING -> STOP
+ * Leaving RESUMING terminates a data transfer session and indicates the
+ * device should complete processing of the data delivered by write(). The
+ * kernel migration driver should complete the incorporation of data written
+ * to the data transfer FD into the device internal state and perform
+ * final validity and consistency checking of the new device state. If the
+ * user provided data is found to be incomplete, inconsistent, or otherwise
+ * invalid, the migration driver must fail the SET_STATE ioctl and
+ * optionally go to the ERROR state as described below.
+ *
+ * While in STOP the device has the same behavior as other STOP states
+ * described above.
+ *
+ * To abort a RESUMING session the device must be reset.
+ *
+ * RUNNING_P2P -> RUNNING
+ * While in RUNNING the device is fully operational, the device may generate
+ * interrupts, DMA, respond to MMIO, all vfio device regions are functional,
+ * and the device may advance its internal state.
+ *
+ * RUNNING -> RUNNING_P2P
+ * STOP -> RUNNING_P2P
+ * While in RUNNING_P2P the device is partially running in the P2P quiescent
+ * state defined below.
+ *
+ * STOP -> STOP_COPY
+ * This arc begin the process of saving the device state and will return a
+ * new data_fd.
+ *
+ * While in the STOP_COPY state the device has the same behavior as STOP
+ * with the addition that the data transfers session continues to stream the
+ * migration state. End of stream on the FD indicates the entire device
+ * state has been transferred.
+ *
+ * The user should take steps to restrict access to vfio device regions while
+ * the device is in STOP_COPY or risk corruption of the device migration data
+ * stream.
+ *
+ * STOP -> RESUMING
+ * Entering the RESUMING state starts a process of restoring the device state
+ * and will return a new data_fd. The data stream fed into the data_fd should
+ * be taken from the data transfer output of a single FD during saving from
+ * a compatible device. The migration driver may alter/reset the internal
+ * device state for this arc if required to prepare the device to receive the
+ * migration data.
+ *
+ * any -> ERROR
+ * ERROR cannot be specified as a device state, however any transition request
+ * can be failed with an errno return and may then move the device_state into
+ * ERROR. In this case the device was unable to execute the requested arc and
+ * was also unable to restore the device to any valid device_state.
+ * To recover from ERROR VFIO_DEVICE_RESET must be used to return the
+ * device_state back to RUNNING.
+ *
+ * The optional peer to peer (P2P) quiescent state is intended to be a quiescent
+ * state for the device for the purposes of managing multiple devices within a
+ * user context where peer-to-peer DMA between devices may be active. The
+ * RUNNING_P2P states must prevent the device from initiating
+ * any new P2P DMA transactions. If the device can identify P2P transactions
+ * then it can stop only P2P DMA, otherwise it must stop all DMA. The migration
+ * driver must complete any such outstanding operations prior to completing the
+ * FSM arc into a P2P state. For the purpose of specification the states
+ * behave as though the device was fully running if not supported. Like while in
+ * STOP or STOP_COPY the user must not touch the device, otherwise the state
+ * can be exited.
+ *
+ * The remaining possible transitions are interpreted as combinations of the
+ * above FSM arcs. As there are multiple paths through the FSM arcs the path
+ * should be selected based on the following rules:
+ * - Select the shortest path.
+ * Refer to vfio_mig_get_next_state() for the result of the algorithm.
+ *
+ * The automatic transit through the FSM arcs that make up the combination
+ * transition is invisible to the user. When working with combination arcs the
+ * user may see any step along the path in the device_state if SET_STATE
+ * fails. When handling these types of errors users should anticipate future
+ * revisions of this protocol using new states and those states becoming
+ * visible in this case.
+ *
+ * The optional states cannot be used with SET_STATE if the device does not
+ * support them. The user can discover if these states are supported by using
+ * VFIO_DEVICE_FEATURE_MIGRATION. By using combination transitions the user can
+ * avoid knowing about these optional states if the kernel driver supports them.
+ */
+enum vfio_device_mig_state {
+ VFIO_DEVICE_STATE_ERROR = 0,
+ VFIO_DEVICE_STATE_STOP = 1,
+ VFIO_DEVICE_STATE_RUNNING = 2,
+ VFIO_DEVICE_STATE_STOP_COPY = 3,
+ VFIO_DEVICE_STATE_RESUMING = 4,
+ VFIO_DEVICE_STATE_RUNNING_P2P = 5,
+};
+
/* -------- API for Type1 VFIO IOMMU -------- */
/**