From 2c2b0d880f1b4c01f30e14242977b82fa527342d Mon Sep 17 00:00:00 2001 From: Mukul Joshi Date: Thu, 23 Jul 2020 23:09:57 -0400 Subject: drm/amdkfd: Add thermal throttling SMI event Add support for reporting thermal throttling events through SMI. Also, add a counter to count the number of throttling interrupts observed and report the count in the SMI event message. Signed-off-by: Mukul Joshi Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- include/uapi/linux/kfd_ioctl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index f738c3b53f4e..df6c7a43aadc 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -450,7 +450,8 @@ struct kfd_ioctl_import_dmabuf_args { * KFD SMI(System Management Interface) events */ /* Event type (defined by bitmask) */ -#define KFD_SMI_EVENT_VMFAULT 0x0000000000000001 +#define KFD_SMI_EVENT_VMFAULT 0x0000000000000001 +#define KFD_SMI_EVENT_THERMAL_THROTTLE 0x0000000000000002 struct kfd_ioctl_smi_events_args { __u32 gpuid; /* to KFD */ -- cgit v1.2.3 From 522ec6e0eed0ab0678e7d5b5bf00487dfe83f7ce Mon Sep 17 00:00:00 2001 From: Mukul Joshi Date: Thu, 30 Jul 2020 18:04:33 -0400 Subject: drm/amdkfd: Replace bitmask with event idx in SMI event msg Event bitmask is a 64-bit mask with only 1 bit set. Sending this event bitmask in KFD SMI event message is both wasteful of memory and potentially limiting to only 64 events. Instead send event index in SMI event message. Please note this change does not break the ABI for the two event types defined so far. The new index is identical to the mask used before. Signed-off-by: Mukul Joshi Suggested-by: Felix Kuehling Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 24 +++++++++++++----------- include/uapi/linux/kfd_ioctl.h | 10 +++++++--- 2 files changed, 20 insertions(+), 14 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index 86c2c3e97944..4d4b6e3ab697 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -149,7 +149,7 @@ static int kfd_smi_ev_release(struct inode *inode, struct file *filep) return 0; } -static void add_event_to_kfifo(struct kfd_dev *dev, unsigned long long smi_event, +static void add_event_to_kfifo(struct kfd_dev *dev, unsigned int smi_event, char *event_msg, int len) { struct kfd_smi_client *client; @@ -157,14 +157,15 @@ static void add_event_to_kfifo(struct kfd_dev *dev, unsigned long long smi_event rcu_read_lock(); list_for_each_entry_rcu(client, &dev->smi_clients, list) { - if (!(READ_ONCE(client->events) & smi_event)) + if (!(READ_ONCE(client->events) & + KFD_SMI_EVENT_MASK_FROM_INDEX(smi_event))) continue; spin_lock(&client->lock); if (kfifo_avail(&client->fifo) >= len) { kfifo_in(&client->fifo, event_msg, len); wake_up_all(&client->wait_queue); } else { - pr_debug("smi_event(EventID: %llu): no space left\n", + pr_debug("smi_event(EventID: %u): no space left\n", smi_event); } spin_unlock(&client->lock); @@ -180,21 +181,21 @@ void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev, /* * ThermalThrottle msg = throttle_bitmask(8): * thermal_interrupt_count(16): - * 16 bytes event + 1 byte space + 8 byte throttle_bitmask + + * 1 byte event + 1 byte space + 8 byte throttle_bitmask + * 1 byte : + 16 byte thermal_interupt_counter + 1 byte 
\n + - * 1 byte \0 = 44 + * 1 byte \0 = 29 */ - char fifo_in[44]; + char fifo_in[29]; int len; if (list_empty(&dev->smi_clients)) return; - len = snprintf(fifo_in, 44, "%x %x:%llx\n", + len = snprintf(fifo_in, 29, "%x %x:%llx\n", KFD_SMI_EVENT_THERMAL_THROTTLE, throttle_bitmask, atomic64_read(&adev->smu.throttle_int_counter)); - add_event_to_kfifo(dev, KFD_SMI_EVENT_THERMAL_THROTTLE, fifo_in, len); + add_event_to_kfifo(dev, KFD_SMI_EVENT_THERMAL_THROTTLE, fifo_in, len); } void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid) @@ -202,9 +203,10 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid) struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd; struct amdgpu_task_info task_info; /* VmFault msg = (hex)uint32_pid(8) + :(1) + task name(16) = 25 */ - /* 16 bytes event + 1 byte space + 25 bytes msg + 1 byte \n = 43 + /* 1 byte event + 1 byte space + 25 bytes msg + 1 byte \n + + * 1 byte \0 = 29 */ - char fifo_in[43]; + char fifo_in[29]; int len; if (list_empty(&dev->smi_clients)) @@ -216,7 +218,7 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid) if (!task_info.pid) return; - len = snprintf(fifo_in, 43, "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT, + len = snprintf(fifo_in, 29, "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT, task_info.pid, task_info.task_name); add_event_to_kfifo(dev, KFD_SMI_EVENT_VMFAULT, fifo_in, len); diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index df6c7a43aadc..cb1f963a84e0 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -449,9 +449,13 @@ struct kfd_ioctl_import_dmabuf_args { /* * KFD SMI(System Management Interface) events */ -/* Event type (defined by bitmask) */ -#define KFD_SMI_EVENT_VMFAULT 0x0000000000000001 -#define KFD_SMI_EVENT_THERMAL_THROTTLE 0x0000000000000002 +enum kfd_smi_event { + KFD_SMI_EVENT_NONE = 0, /* not used */ + KFD_SMI_EVENT_VMFAULT = 1, /* event start counting at 1 */ + KFD_SMI_EVENT_THERMAL_THROTTLE = 2, +}; + +#define KFD_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1)) struct kfd_ioctl_smi_events_args { __u32 gpuid; /* to KFD */ -- cgit v1.2.3 From 592d9fba33c275b72cb4dae99c187444daafcd33 Mon Sep 17 00:00:00 2001 From: David Stevens Date: Tue, 18 Aug 2020 16:13:42 +0900 Subject: virtio-gpu: add VIRTIO_GPU_F_RESOURCE_UUID feature This feature allows the guest to request a UUID from the host for a particular virtio_gpu resource. The UUID can then be shared with other virtio devices, to allow the other host devices to access the virtio_gpu's corresponding host resource. Signed-off-by: David Stevens Acked-by: Michael S. 
Tsirkin Link: http://patchwork.freedesktop.org/patch/msgid/20200818071343.3461203-3-stevensd@chromium.org Signed-off-by: Gerd Hoffmann --- include/uapi/linux/virtio_gpu.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/virtio_gpu.h b/include/uapi/linux/virtio_gpu.h index 0c85914d9369..9721d58b4d58 100644 --- a/include/uapi/linux/virtio_gpu.h +++ b/include/uapi/linux/virtio_gpu.h @@ -50,6 +50,10 @@ * VIRTIO_GPU_CMD_GET_EDID */ #define VIRTIO_GPU_F_EDID 1 +/* + * VIRTIO_GPU_CMD_RESOURCE_ASSIGN_UUID + */ +#define VIRTIO_GPU_F_RESOURCE_UUID 2 enum virtio_gpu_ctrl_type { VIRTIO_GPU_UNDEFINED = 0, @@ -66,6 +70,7 @@ enum virtio_gpu_ctrl_type { VIRTIO_GPU_CMD_GET_CAPSET_INFO, VIRTIO_GPU_CMD_GET_CAPSET, VIRTIO_GPU_CMD_GET_EDID, + VIRTIO_GPU_CMD_RESOURCE_ASSIGN_UUID, /* 3d commands */ VIRTIO_GPU_CMD_CTX_CREATE = 0x0200, @@ -87,6 +92,7 @@ enum virtio_gpu_ctrl_type { VIRTIO_GPU_RESP_OK_CAPSET_INFO, VIRTIO_GPU_RESP_OK_CAPSET, VIRTIO_GPU_RESP_OK_EDID, + VIRTIO_GPU_RESP_OK_RESOURCE_UUID, /* error responses */ VIRTIO_GPU_RESP_ERR_UNSPEC = 0x1200, @@ -340,4 +346,17 @@ enum virtio_gpu_formats { VIRTIO_GPU_FORMAT_R8G8B8X8_UNORM = 134, }; +/* VIRTIO_GPU_CMD_RESOURCE_ASSIGN_UUID */ +struct virtio_gpu_resource_assign_uuid { + struct virtio_gpu_ctrl_hdr hdr; + __le32 resource_id; + __le32 padding; +}; + +/* VIRTIO_GPU_RESP_OK_RESOURCE_UUID */ +struct virtio_gpu_resp_resource_uuid { + struct virtio_gpu_ctrl_hdr hdr; + __u8 uuid[16]; +}; + #endif -- cgit v1.2.3 From aa207a05f95abc3530b7415232f0f73278336bd3 Mon Sep 17 00:00:00 2001 From: Alexander Usyskin Date: Tue, 18 Aug 2020 14:51:45 +0300 Subject: mei: add connect with vtag ioctl This IOCTL is used to associate the current file descriptor with a FW Client (given by UUID), and virtual tag (vtag). The IOCTL opens a communication channel between a host client and a FW client on a tagged channel. From this point on, every reader and write will communicate with the associated FW client on the tagged channel. Upon close() the communication is terminated. The IOCTL argument is a struct with a union that contains the input parameter and the output parameter for this IOCTL. The input parameter is UUID of the FW Client, a vtag [0,255] The output parameter is the properties of the FW client Clients that do not support tagged connection will respond with -EOPNOTSUPP Signed-off-by: Alexander Usyskin Signed-off-by: Tomas Winkler Link: https://lore.kernel.org/r/20200818115147.2567012-12-tomas.winkler@intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/main.c | 210 +++++++++++++++++++++++++++++++++++++++++++---- include/uapi/linux/mei.h | 49 +++++++++++ 2 files changed, 243 insertions(+), 16 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/misc/mei/main.c b/drivers/misc/mei/main.c index 401bf8743689..9f6682033ed7 100644 --- a/drivers/misc/mei/main.c +++ b/drivers/misc/mei/main.c @@ -395,17 +395,18 @@ out: * mei_ioctl_connect_client - the connect to fw client IOCTL function * * @file: private data of the file object - * @data: IOCTL connect data, input and output parameters + * @in_client_uuid: requested UUID for connection + * @client: IOCTL connect data, output parameters * * Locking: called under "dev->device_lock" lock * * Return: 0 on success, <0 on failure. 
*/ static int mei_ioctl_connect_client(struct file *file, - struct mei_connect_client_data *data) + const uuid_le *in_client_uuid, + struct mei_client *client) { struct mei_device *dev; - struct mei_client *client; struct mei_me_client *me_cl; struct mei_cl *cl; int rets; @@ -413,18 +414,15 @@ static int mei_ioctl_connect_client(struct file *file, cl = file->private_data; dev = cl->dev; - if (dev->dev_state != MEI_DEV_ENABLED) - return -ENODEV; - if (cl->state != MEI_FILE_INITIALIZING && cl->state != MEI_FILE_DISCONNECTED) return -EBUSY; /* find ME client we're trying to connect to */ - me_cl = mei_me_cl_by_uuid(dev, &data->in_client_uuid); + me_cl = mei_me_cl_by_uuid(dev, in_client_uuid); if (!me_cl) { dev_dbg(dev->dev, "Cannot connect to FW Client UUID = %pUl\n", - &data->in_client_uuid); + in_client_uuid); rets = -ENOTTY; goto end; } @@ -434,7 +432,7 @@ static int mei_ioctl_connect_client(struct file *file, !dev->allow_fixed_address : !dev->hbm_f_fa_supported; if (forbidden) { dev_dbg(dev->dev, "Connection forbidden to FW Client UUID = %pUl\n", - &data->in_client_uuid); + in_client_uuid); rets = -ENOTTY; goto end; } @@ -448,7 +446,6 @@ static int mei_ioctl_connect_client(struct file *file, me_cl->props.max_msg_length); /* prepare the output buffer */ - client = &data->out_client_properties; client->max_msg_length = me_cl->props.max_msg_length; client->protocol_version = me_cl->props.protocol_version; dev_dbg(dev->dev, "Can connect?\n"); @@ -460,6 +457,135 @@ end: return rets; } +/** + * mei_vt_support_check - check if client support vtags + * + * Locking: called under "dev->device_lock" lock + * + * @dev: mei_device + * @uuid: client UUID + * + * Return: + * 0 - supported + * -ENOTTY - no such client + * -EOPNOTSUPP - vtags are not supported by client + */ +static int mei_vt_support_check(struct mei_device *dev, const uuid_le *uuid) +{ + struct mei_me_client *me_cl; + int ret; + + if (!dev->hbm_f_vt_supported) + return -EOPNOTSUPP; + + me_cl = mei_me_cl_by_uuid(dev, uuid); + if (!me_cl) { + dev_dbg(dev->dev, "Cannot connect to FW Client UUID = %pUl\n", + uuid); + return -ENOTTY; + } + ret = me_cl->props.vt_supported ? 0 : -EOPNOTSUPP; + mei_me_cl_put(me_cl); + + return ret; +} + +/** + * mei_ioctl_connect_vtag - connect to fw client with vtag IOCTL function + * + * @file: private data of the file object + * @in_client_uuid: requested UUID for connection + * @client: IOCTL connect data, output parameters + * @vtag: vm tag + * + * Locking: called under "dev->device_lock" lock + * + * Return: 0 on success, <0 on failure. 
+ */ +static int mei_ioctl_connect_vtag(struct file *file, + const uuid_le *in_client_uuid, + struct mei_client *client, + u8 vtag) +{ + struct mei_device *dev; + struct mei_cl *cl; + struct mei_cl *pos; + struct mei_cl_vtag *cl_vtag; + + cl = file->private_data; + dev = cl->dev; + + dev_dbg(dev->dev, "FW Client %pUl vtag %d\n", in_client_uuid, vtag); + + switch (cl->state) { + case MEI_FILE_DISCONNECTED: + if (mei_cl_vtag_by_fp(cl, file) != vtag) { + dev_err(dev->dev, "reconnect with different vtag\n"); + return -EINVAL; + } + break; + case MEI_FILE_INITIALIZING: + /* malicious connect from another thread may push vtag */ + if (!IS_ERR(mei_cl_fp_by_vtag(cl, vtag))) { + dev_err(dev->dev, "vtag already filled\n"); + return -EINVAL; + } + + list_for_each_entry(pos, &dev->file_list, link) { + if (pos == cl) + continue; + if (!pos->me_cl) + continue; + + /* only search for same UUID */ + if (uuid_le_cmp(*mei_cl_uuid(pos), *in_client_uuid)) + continue; + + /* if tag already exist try another fp */ + if (!IS_ERR(mei_cl_fp_by_vtag(pos, vtag))) + continue; + + /* replace cl with acquired one */ + dev_dbg(dev->dev, "replacing with existing cl\n"); + mei_cl_unlink(cl); + kfree(cl); + file->private_data = pos; + cl = pos; + break; + } + + cl_vtag = mei_cl_vtag_alloc(file, vtag); + if (IS_ERR(cl_vtag)) + return -ENOMEM; + + list_add_tail(&cl_vtag->list, &cl->vtag_map); + break; + default: + return -EBUSY; + } + + while (cl->state != MEI_FILE_INITIALIZING && + cl->state != MEI_FILE_DISCONNECTED && + cl->state != MEI_FILE_CONNECTED) { + mutex_unlock(&dev->device_lock); + wait_event_timeout(cl->wait, + (cl->state == MEI_FILE_CONNECTED || + cl->state == MEI_FILE_DISCONNECTED || + cl->state == MEI_FILE_DISCONNECT_REQUIRED || + cl->state == MEI_FILE_DISCONNECT_REPLY), + mei_secs_to_jiffies(MEI_CL_CONNECT_TIMEOUT)); + mutex_lock(&dev->device_lock); + } + + if (!mei_cl_is_connected(cl)) + return mei_ioctl_connect_client(file, in_client_uuid, client); + + client->max_msg_length = cl->me_cl->props.max_msg_length; + client->protocol_version = cl->me_cl->props.protocol_version; + + return 0; +} + /** * mei_ioctl_client_notify_request - * propagate event notification request to client @@ -516,7 +642,11 @@ static long mei_ioctl(struct file *file, unsigned int cmd, unsigned long data) { struct mei_device *dev; struct mei_cl *cl = file->private_data; - struct mei_connect_client_data connect_data; + struct mei_connect_client_data conn; + struct mei_connect_client_data_vtag conn_vtag; + const uuid_le *cl_uuid; + struct mei_client *props; + u8 vtag; u32 notify_get, notify_req; int rets; @@ -537,20 +667,68 @@ static long mei_ioctl(struct file *file, unsigned int cmd, unsigned long data) switch (cmd) { case IOCTL_MEI_CONNECT_CLIENT: dev_dbg(dev->dev, ": IOCTL_MEI_CONNECT_CLIENT.\n"); - if (copy_from_user(&connect_data, (char __user *)data, - sizeof(connect_data))) { + if (copy_from_user(&conn, (char __user *)data, sizeof(conn))) { + dev_dbg(dev->dev, "failed to copy data from userland\n"); + rets = -EFAULT; + goto out; + } + cl_uuid = &conn.in_client_uuid; + props = &conn.out_client_properties; + vtag = 0; + + rets = mei_vt_support_check(dev, cl_uuid); + if (rets == -ENOTTY) + goto out; + if (!rets) + rets = mei_ioctl_connect_vtag(file, cl_uuid, props, + vtag); + else + rets = mei_ioctl_connect_client(file, cl_uuid, props); + if (rets) + goto out; + + /* if all is ok, copying the data back to user. 
*/ + if (copy_to_user((char __user *)data, &conn, sizeof(conn))) { + dev_dbg(dev->dev, "failed to copy data to userland\n"); + rets = -EFAULT; + goto out; + } + + break; + + case IOCTL_MEI_CONNECT_CLIENT_VTAG: + dev_dbg(dev->dev, "IOCTL_MEI_CONNECT_CLIENT_VTAG\n"); + if (copy_from_user(&conn_vtag, (char __user *)data, + sizeof(conn_vtag))) { dev_dbg(dev->dev, "failed to copy data from userland\n"); rets = -EFAULT; goto out; } - rets = mei_ioctl_connect_client(file, &connect_data); + cl_uuid = &conn_vtag.connect.in_client_uuid; + props = &conn_vtag.out_client_properties; + vtag = conn_vtag.connect.vtag; + + rets = mei_vt_support_check(dev, cl_uuid); + if (rets == -EOPNOTSUPP) + dev_dbg(dev->dev, "FW Client %pUl does not support vtags\n", + cl_uuid); + if (rets) + goto out; + + if (!vtag) { + dev_dbg(dev->dev, "vtag can't be zero\n"); + rets = -EINVAL; + goto out; + } + + rets = mei_ioctl_connect_vtag(file, cl_uuid, props, vtag); if (rets) goto out; /* if all is ok, copying the data back to user. */ - if (copy_to_user((char __user *)data, &connect_data, - sizeof(connect_data))) { + if (copy_to_user((char __user *)data, &conn_vtag, + sizeof(conn_vtag))) { dev_dbg(dev->dev, "failed to copy data to userland\n"); rets = -EFAULT; goto out; diff --git a/include/uapi/linux/mei.h b/include/uapi/linux/mei.h index c6aec86cc5de..4f3638489d01 100644 --- a/include/uapi/linux/mei.h +++ b/include/uapi/linux/mei.h @@ -66,4 +66,53 @@ struct mei_connect_client_data { */ #define IOCTL_MEI_NOTIFY_GET _IOR('H', 0x03, __u32) +/** + * struct mei_connect_client_vtag - mei client information struct with vtag + * + * @in_client_uuid: UUID of client to connect + * @vtag: virtual tag + * @reserved: reserved for future use + */ +struct mei_connect_client_vtag { + uuid_le in_client_uuid; + __u8 vtag; + __u8 reserved[3]; +}; + +/** + * struct mei_connect_client_data_vtag - IOCTL connect data union + * + * @connect: input connect data + * @out_client_properties: output client data + */ +struct mei_connect_client_data_vtag { + union { + struct mei_connect_client_vtag connect; + struct mei_client out_client_properties; + }; +}; + +/** + * DOC: + * This IOCTL is used to associate the current file descriptor with a + * FW Client (given by UUID), and virtual tag (vtag). + * The IOCTL opens a communication channel between a host client and + * a FW client on a tagged channel. From this point on, every read + * and write will communicate with the associated FW client with + * on the tagged channel. + * Upone close() the communication is terminated. + * + * The IOCTL argument is a struct with a union that contains + * the input parameter and the output parameter for this IOCTL. + * + * The input parameter is UUID of the FW Client, a vtag [0,255] + * The output parameter is the properties of the FW client + * (FW protocool version and max message size). + * + * Clients that do not support tagged connection + * will respond with -EOPNOTSUPP. + */ +#define IOCTL_MEI_CONNECT_CLIENT_VTAG \ + _IOWR('H', 0x04, struct mei_connect_client_data_vtag) + #endif /* _LINUX_MEI_H */ -- cgit v1.2.3 From ba171d3f0850003216fd1a85190d17b1feddb961 Mon Sep 17 00:00:00 2001 From: Cedric Neveux Date: Mon, 4 Mar 2019 08:54:23 +0100 Subject: driver: tee: Handle NULL pointer indication from client TEE Client introduce a new capability "TEE_GEN_CAP_MEMREF_NULL" to handle the support of the shared memory buffer with a NULL pointer. This capability depends on TEE Capabilities and driver support. Driver and TEE exchange capabilities at driver initialization. 
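To make the new userspace contract concrete, here is a minimal sketch (not part of the patch) of how a client might probe for the new capability and fill a NULL memory reference parameter. The /dev/tee0 device path and the elided TEE_IOC_INVOKE plumbing are assumptions for illustration; the ioctl, struct and flag names are the ones present in (or added to) include/uapi/linux/tee.h:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/tee.h>

int main(void)
{
        struct tee_ioctl_version_data vers;
        struct tee_ioctl_param param;
        int fd;

        fd = open("/dev/tee0", O_RDWR);         /* assumed device node */
        if (fd < 0)
                return 1;

        /* Exchange capabilities with the driver/TEE */
        memset(&vers, 0, sizeof(vers));
        if (ioctl(fd, TEE_IOC_VERSION, &vers) ||
            !(vers.gen_caps & TEE_GEN_CAP_MEMREF_NULL)) {
                fprintf(stderr, "NULL memref not supported\n");
                return 1;
        }

        /* A NULL memory reference: no shm object, shm id set to -1 */
        memset(&param, 0, sizeof(param));
        param.attr = TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT;
        param.c = TEE_MEMREF_NULL;
        /* ... pass param in the usual TEE_IOC_INVOKE argument ... */

        return 0;
}

On the driver side this corresponds to the params_from_user() change in the diff below: a shared-memory object is only resolved when ip.c != TEE_MEMREF_NULL, and a NULL reference is only accepted when the context has cap_memref_null set.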
Signed-off-by: Michael Whitfield Signed-off-by: Cedric Neveux Reviewed-by: Joakim Bech Tested-by: Joakim Bech (QEMU) Signed-off-by: Jens Wiklander --- drivers/tee/optee/core.c | 7 +++++++ drivers/tee/optee/optee_smc.h | 3 +++ drivers/tee/tee_core.c | 49 +++++++++++++++++++++++++++---------------- include/linux/tee_drv.h | 3 +++ include/uapi/linux/tee.h | 13 ++++++++++++ 5 files changed, 57 insertions(+), 18 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/tee/optee/core.c b/drivers/tee/optee/core.c index b373b1b08b6d..cf4718c6d35d 100644 --- a/drivers/tee/optee/core.c +++ b/drivers/tee/optee/core.c @@ -216,6 +216,8 @@ static void optee_get_version(struct tee_device *teedev, if (optee->sec_caps & OPTEE_SMC_SEC_CAP_DYNAMIC_SHM) v.gen_caps |= TEE_GEN_CAP_REG_MEM; + if (optee->sec_caps & OPTEE_SMC_SEC_CAP_MEMREF_NULL) + v.gen_caps |= TEE_GEN_CAP_MEMREF_NULL; *vers = v; } @@ -262,6 +264,11 @@ static int optee_open(struct tee_context *ctx) mutex_init(&ctxdata->mutex); INIT_LIST_HEAD(&ctxdata->sess_list); + if (optee->sec_caps & OPTEE_SMC_SEC_CAP_MEMREF_NULL) + ctx->cap_memref_null = true; + else + ctx->cap_memref_null = false; + ctx->data = ctxdata; return 0; } diff --git a/drivers/tee/optee/optee_smc.h b/drivers/tee/optee/optee_smc.h index c72122d9c997..777ad54d4c2c 100644 --- a/drivers/tee/optee/optee_smc.h +++ b/drivers/tee/optee/optee_smc.h @@ -215,6 +215,9 @@ struct optee_smc_get_shm_config_result { */ #define OPTEE_SMC_SEC_CAP_DYNAMIC_SHM BIT(2) +/* Secure world supports Shared Memory with a NULL buffer reference */ +#define OPTEE_SMC_SEC_CAP_MEMREF_NULL BIT(4) + #define OPTEE_SMC_FUNCID_EXCHANGE_CAPABILITIES 9 #define OPTEE_SMC_EXCHANGE_CAPABILITIES \ OPTEE_SMC_FAST_CALL_VAL(OPTEE_SMC_FUNCID_EXCHANGE_CAPABILITIES) diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c index 64637e09a095..ce0f0309b6ac 100644 --- a/drivers/tee/tee_core.c +++ b/drivers/tee/tee_core.c @@ -383,25 +383,38 @@ static int params_from_user(struct tee_context *ctx, struct tee_param *params, case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: /* - * If we fail to get a pointer to a shared memory - * object (and increase the ref count) from an - * identifier we return an error. All pointers that - * has been added in params have an increased ref - * count. It's the callers responibility to do - * tee_shm_put() on all resolved pointers. + * If a NULL pointer is passed to a TA in the TEE, + * the ip.c IOCTL parameters is set to TEE_MEMREF_NULL + * indicating a NULL memory reference. */ - shm = tee_shm_get_from_id(ctx, ip.c); - if (IS_ERR(shm)) - return PTR_ERR(shm); - - /* - * Ensure offset + size does not overflow offset - * and does not overflow the size of the referred - * shared memory object. - */ - if ((ip.a + ip.b) < ip.a || - (ip.a + ip.b) > shm->size) { - tee_shm_put(shm); + if (ip.c != TEE_MEMREF_NULL) { + /* + * If we fail to get a pointer to a shared + * memory object (and increase the ref count) + * from an identifier we return an error. All + * pointers that has been added in params have + * an increased ref count. It's the callers + * responibility to do tee_shm_put() on all + * resolved pointers. + */ + shm = tee_shm_get_from_id(ctx, ip.c); + if (IS_ERR(shm)) + return PTR_ERR(shm); + + /* + * Ensure offset + size does not overflow + * offset and does not overflow the size of + * the referred shared memory object. 
+ */ + if ((ip.a + ip.b) < ip.a || + (ip.a + ip.b) > shm->size) { + tee_shm_put(shm); + return -EINVAL; + } + } else if (ctx->cap_memref_null) { + /* Pass NULL pointer to OP-TEE */ + shm = NULL; + } else { return -EINVAL; } diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h index d074302989dd..cdd049a724b1 100644 --- a/include/linux/tee_drv.h +++ b/include/linux/tee_drv.h @@ -47,6 +47,8 @@ struct tee_shm_pool; * and just return with an error code. It is needed for requests * that arises from TEE based kernel drivers that should be * non-blocking in nature. + * @cap_memref_null: flag indicating if the TEE Client support shared + * memory buffer with a NULL pointer. */ struct tee_context { struct tee_device *teedev; @@ -54,6 +56,7 @@ struct tee_context { struct kref refcount; bool releasing; bool supp_nowait; + bool cap_memref_null; }; struct tee_param_memref { diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h index b619f37ee03e..d67cadf221fc 100644 --- a/include/uapi/linux/tee.h +++ b/include/uapi/linux/tee.h @@ -51,6 +51,9 @@ #define TEE_GEN_CAP_GP (1 << 0)/* GlobalPlatform compliant TEE */ #define TEE_GEN_CAP_PRIVILEGED (1 << 1)/* Privileged device (for supplicant) */ #define TEE_GEN_CAP_REG_MEM (1 << 2)/* Supports registering shared memory */ +#define TEE_GEN_CAP_MEMREF_NULL (1 << 3)/* NULL MemRef support */ + +#define TEE_MEMREF_NULL (__u64)(-1) /* NULL MemRef Buffer */ /* * TEE Implementation ID @@ -200,6 +203,16 @@ struct tee_ioctl_buf_data { * a part of a shared memory by specifying an offset (@a) and size (@b) of * the object. To supply the entire shared memory object set the offset * (@a) to 0 and size (@b) to the previously returned size of the object. + * + * A client may need to present a NULL pointer in the argument + * passed to a trusted application in the TEE. + * This is also a requirement in GlobalPlatform Client API v1.0c + * (section 3.2.5 memory references), which can be found at + * http://www.globalplatform.org/specificationsdevice.asp + * + * If a NULL pointer is passed to a TA in the TEE, the (@c) + * IOCTL parameters value must be set to TEE_MEMREF_NULL indicating a NULL + * memory reference. */ struct tee_ioctl_param { __u64 attr; -- cgit v1.2.3 From 6b0a249a301e2af9adda84adbced3a2988248b95 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 21 Aug 2020 11:44:18 -0700 Subject: bpf: Implement link_query for bpf iterators This patch implemented bpf_link callback functions show_fdinfo and fill_link_info to support link_query interface. The general interface for show_fdinfo and fill_link_info will print/fill the target_name. Each targets can register show_fdinfo and fill_link_info callbacks to print/fill more target specific information. For example, the below is a fdinfo result for a bpf task iterator. 
$ cat /proc/1749/fdinfo/7 pos: 0 flags: 02000000 mnt_id: 14 link_type: iter link_id: 11 prog_tag: 990e1f8152f7e54f prog_id: 59 target_name: task Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200821184418.574122-1-yhs@fb.com --- include/linux/bpf.h | 6 +++++ include/uapi/linux/bpf.h | 7 +++++ kernel/bpf/bpf_iter.c | 58 ++++++++++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 7 +++++ 4 files changed, 78 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a9b7185a6b37..529e9b183eeb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1218,12 +1218,18 @@ typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, union bpf_iter_link_info *linfo, struct bpf_iter_aux_info *aux); typedef void (*bpf_iter_detach_target_t)(struct bpf_iter_aux_info *aux); +typedef void (*bpf_iter_show_fdinfo_t) (const struct bpf_iter_aux_info *aux, + struct seq_file *seq); +typedef int (*bpf_iter_fill_link_info_t)(const struct bpf_iter_aux_info *aux, + struct bpf_link_info *info); #define BPF_ITER_CTX_ARG_MAX 2 struct bpf_iter_reg { const char *target; bpf_iter_attach_target_t attach_target; bpf_iter_detach_target_t detach_target; + bpf_iter_show_fdinfo_t show_fdinfo; + bpf_iter_fill_link_info_t fill_link_info; u32 ctx_arg_info_size; struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; const struct bpf_iter_seq_info *seq_info; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0480f893facd..a1bbaff7a0af 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4071,6 +4071,13 @@ struct bpf_link_info { __u64 cgroup_id; __u32 attach_type; } cgroup; + struct { + __aligned_u64 target_name; /* in/out: target_name buffer ptr */ + __u32 target_name_len; /* in/out: target_name buffer len */ + union { + __u32 map_id; + } map; + } iter; struct { __u32 netns_ino; __u32 attach_type; diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index b6715964b685..aeec7e174188 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -377,10 +377,68 @@ out_unlock: return ret; } +static void bpf_iter_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_iter_link *iter_link = + container_of(link, struct bpf_iter_link, link); + bpf_iter_show_fdinfo_t show_fdinfo; + + seq_printf(seq, + "target_name:\t%s\n", + iter_link->tinfo->reg_info->target); + + show_fdinfo = iter_link->tinfo->reg_info->show_fdinfo; + if (show_fdinfo) + show_fdinfo(&iter_link->aux, seq); +} + +static int bpf_iter_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_iter_link *iter_link = + container_of(link, struct bpf_iter_link, link); + char __user *ubuf = u64_to_user_ptr(info->iter.target_name); + bpf_iter_fill_link_info_t fill_link_info; + u32 ulen = info->iter.target_name_len; + const char *target_name; + u32 target_len; + + if (!ulen ^ !ubuf) + return -EINVAL; + + target_name = iter_link->tinfo->reg_info->target; + target_len = strlen(target_name); + info->iter.target_name_len = target_len + 1; + + if (ubuf) { + if (ulen >= target_len + 1) { + if (copy_to_user(ubuf, target_name, target_len + 1)) + return -EFAULT; + } else { + char zero = '\0'; + + if (copy_to_user(ubuf, target_name, ulen - 1)) + return -EFAULT; + if (put_user(zero, ubuf + ulen - 1)) + return -EFAULT; + return -ENOSPC; + } + } + + fill_link_info = iter_link->tinfo->reg_info->fill_link_info; + if (fill_link_info) + return 
fill_link_info(&iter_link->aux, info); + + return 0; +} + static const struct bpf_link_ops bpf_iter_link_lops = { .release = bpf_iter_link_release, .dealloc = bpf_iter_link_dealloc, .update_prog = bpf_iter_link_replace, + .show_fdinfo = bpf_iter_link_show_fdinfo, + .fill_link_info = bpf_iter_link_fill_link_info, }; bool bpf_link_is_iter(struct bpf_link *link) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0480f893facd..a1bbaff7a0af 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4071,6 +4071,13 @@ struct bpf_link_info { __u64 cgroup_id; __u32 attach_type; } cgroup; + struct { + __aligned_u64 target_name; /* in/out: target_name buffer ptr */ + __u32 target_name_len; /* in/out: target_name buffer len */ + union { + __u32 map_id; + } map; + } iter; struct { __u32 netns_ino; __u32 attach_type; -- cgit v1.2.3 From 4ffa22fd22a7cbde1a1394b2707ea73593dc0fda Mon Sep 17 00:00:00 2001 From: Matt Ranostay Date: Thu, 23 Jul 2020 09:29:43 +0300 Subject: iio: add IIO_MOD_O2 modifier Add modifier IIO_MOD_O2 for O2 concentration reporting Signed-off-by: Matt Ranostay Signed-off-by: Jonathan Cameron --- Documentation/ABI/testing/sysfs-bus-iio | 2 ++ drivers/iio/industrialio-core.c | 1 + include/uapi/linux/iio/types.h | 1 + tools/iio/iio_event_monitor.c | 2 ++ 4 files changed, 6 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio index 5c62bfb0f3f5..405181fde40a 100644 --- a/Documentation/ABI/testing/sysfs-bus-iio +++ b/Documentation/ABI/testing/sysfs-bus-iio @@ -1564,6 +1564,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_concentration_ethanol_raw What: /sys/bus/iio/devices/iio:deviceX/in_concentrationX_ethanol_raw What: /sys/bus/iio/devices/iio:deviceX/in_concentration_h2_raw What: /sys/bus/iio/devices/iio:deviceX/in_concentrationX_h2_raw +What: /sys/bus/iio/devices/iio:deviceX/in_concentration_o2_raw +What: /sys/bus/iio/devices/iio:deviceX/in_concentrationX_o2_raw What: /sys/bus/iio/devices/iio:deviceX/in_concentration_voc_raw What: /sys/bus/iio/devices/iio:deviceX/in_concentrationX_voc_raw KernelVersion: 4.3 diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 606d5e61c575..59003dc44e60 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -133,6 +133,7 @@ static const char * const iio_modifier_names[] = { [IIO_MOD_PM10] = "pm10", [IIO_MOD_ETHANOL] = "ethanol", [IIO_MOD_H2] = "h2", + [IIO_MOD_O2] = "o2", }; /* relies on pairs of these shared then separate */ diff --git a/include/uapi/linux/iio/types.h b/include/uapi/linux/iio/types.h index fdd81affca4b..48c13147c0a8 100644 --- a/include/uapi/linux/iio/types.h +++ b/include/uapi/linux/iio/types.h @@ -94,6 +94,7 @@ enum iio_modifier { IIO_MOD_PM10, IIO_MOD_ETHANOL, IIO_MOD_H2, + IIO_MOD_O2, }; enum iio_event_type { diff --git a/tools/iio/iio_event_monitor.c b/tools/iio/iio_event_monitor.c index f115d166c985..bb03859db89d 100644 --- a/tools/iio/iio_event_monitor.c +++ b/tools/iio/iio_event_monitor.c @@ -119,6 +119,7 @@ static const char * const iio_modifier_names[] = { [IIO_MOD_PM2P5] = "pm2p5", [IIO_MOD_PM4] = "pm4", [IIO_MOD_PM10] = "pm10", + [IIO_MOD_O2] = "o2", }; static bool event_is_known(struct iio_event_data *event) @@ -211,6 +212,7 @@ static bool event_is_known(struct iio_event_data *event) case IIO_MOD_PM2P5: case IIO_MOD_PM4: case IIO_MOD_PM10: + case IIO_MOD_O2: break; default: return false; -- cgit v1.2.3 From 
eee049c0ef5b5b433f36841801e34c21c9f82a23 Mon Sep 17 00:00:00 2001 From: Tom Parkin Date: Sat, 22 Aug 2020 15:59:08 +0100 Subject: l2tp: remove tunnel and session debug flags field The l2tp subsystem now uses standard kernel logging APIs for informational and warning messages, and tracepoints for debug information. Now that the tunnel and session debug flags are unused, remove the field from the core structures. Various system calls (in the case of l2tp_ppp) and netlink messages handle the getting and setting of debug flags. To avoid userspace breakage don't modify the API of these calls; simply ignore set requests, and send dummy data for get requests. Signed-off-by: Tom Parkin Signed-off-by: David S. Miller --- include/uapi/linux/if_pppol2tp.h | 2 +- include/uapi/linux/l2tp.h | 6 ++++-- net/l2tp/l2tp_core.c | 8 -------- net/l2tp/l2tp_core.h | 4 ---- net/l2tp/l2tp_debugfs.c | 4 ++-- net/l2tp/l2tp_netlink.c | 16 ++-------------- net/l2tp/l2tp_ppp.c | 15 ++++++++------- 7 files changed, 17 insertions(+), 38 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_pppol2tp.h b/include/uapi/linux/if_pppol2tp.h index 060b4d1f3129..a91044328bc9 100644 --- a/include/uapi/linux/if_pppol2tp.h +++ b/include/uapi/linux/if_pppol2tp.h @@ -75,7 +75,7 @@ struct pppol2tpv3in6_addr { }; /* Socket options: - * DEBUG - bitmask of debug message categories + * DEBUG - bitmask of debug message categories (not used) * SENDSEQ - 0 => don't send packets with sequence numbers * 1 => send packets with sequence numbers * RECVSEQ - 0 => receive packet sequence numbers are optional diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h index 61158f5a1a5b..88a0d32b8c07 100644 --- a/include/uapi/linux/l2tp.h +++ b/include/uapi/linux/l2tp.h @@ -108,7 +108,7 @@ enum { L2TP_ATTR_VLAN_ID, /* u16 (not used) */ L2TP_ATTR_COOKIE, /* 0, 4 or 8 bytes */ L2TP_ATTR_PEER_COOKIE, /* 0, 4 or 8 bytes */ - L2TP_ATTR_DEBUG, /* u32, enum l2tp_debug_flags */ + L2TP_ATTR_DEBUG, /* u32, enum l2tp_debug_flags (not used) */ L2TP_ATTR_RECV_SEQ, /* u8 */ L2TP_ATTR_SEND_SEQ, /* u8 */ L2TP_ATTR_LNS_MODE, /* u8 */ @@ -177,7 +177,9 @@ enum l2tp_seqmode { }; /** - * enum l2tp_debug_flags - debug message categories for L2TP tunnels/sessions + * enum l2tp_debug_flags - debug message categories for L2TP tunnels/sessions. + * + * Unused. 
* * @L2TP_MSG_DEBUG: verbose debug (if compiled in) * @L2TP_MSG_CONTROL: userspace - kernel interface diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index d8435b6f6fee..560c687f5457 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1401,16 +1401,12 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 tunnel->version = version; tunnel->tunnel_id = tunnel_id; tunnel->peer_tunnel_id = peer_tunnel_id; - tunnel->debug = L2TP_DEFAULT_DEBUG_FLAGS; tunnel->magic = L2TP_TUNNEL_MAGIC; sprintf(&tunnel->name[0], "tunl %u", tunnel_id); rwlock_init(&tunnel->hlist_lock); tunnel->acpt_newsess = true; - if (cfg) - tunnel->debug = cfg->debug; - tunnel->encap = encap; refcount_set(&tunnel->ref_count, 1); @@ -1608,12 +1604,8 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn INIT_HLIST_NODE(&session->hlist); INIT_HLIST_NODE(&session->global_hlist); - /* Inherit debug options from tunnel */ - session->debug = tunnel->debug; - if (cfg) { session->pwtype = cfg->pw_type; - session->debug = cfg->debug; session->send_seq = cfg->send_seq; session->recv_seq = cfg->recv_seq; session->lns_mode = cfg->lns_mode; diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 7a06ac135a9b..07249c5f22ef 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -51,7 +51,6 @@ struct l2tp_session_cfg { unsigned int lns_mode:1; /* behave as LNS? * LAC enables sequence numbers under LNS control. */ - int debug; /* bitmask of debug message categories */ u16 l2specific_type; /* Layer 2 specific type */ u8 cookie[8]; /* optional cookie */ int cookie_len; /* 0, 4 or 8 bytes */ @@ -98,7 +97,6 @@ struct l2tp_session { unsigned int lns_mode:1; /* behave as LNS? * LAC enables sequence numbers under LNS control. */ - int debug; /* bitmask of debug message categories */ int reorder_timeout; /* configured reorder timeout (in jiffies) */ int reorder_skip; /* set if skip to next nr */ enum l2tp_pwtype pwtype; @@ -132,7 +130,6 @@ struct l2tp_session { /* L2TP tunnel configuration */ struct l2tp_tunnel_cfg { - int debug; /* bitmask of debug message categories */ enum l2tp_encap_type encap; /* Used only for kernel-created sockets */ @@ -173,7 +170,6 @@ struct l2tp_tunnel { int version; /* 2=>L2TPv2, 3=>L2TPv3 */ char name[L2TP_TUNNEL_NAME_MAX]; /* for logging */ - int debug; /* bitmask of debug message categories */ enum l2tp_encap_type encap; struct l2tp_stats stats; diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c index 96cb9601c21b..bca75bef8282 100644 --- a/net/l2tp/l2tp_debugfs.c +++ b/net/l2tp/l2tp_debugfs.c @@ -167,7 +167,7 @@ static void l2tp_dfs_seq_tunnel_show(struct seq_file *m, void *v) tunnel->sock ? refcount_read(&tunnel->sock->sk_refcnt) : 0, refcount_read(&tunnel->ref_count)); seq_printf(m, " %08x rx %ld/%ld/%ld rx %ld/%ld/%ld\n", - tunnel->debug, + 0, atomic_long_read(&tunnel->stats.tx_packets), atomic_long_read(&tunnel->stats.tx_bytes), atomic_long_read(&tunnel->stats.tx_errors), @@ -192,7 +192,7 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v) session->recv_seq ? 'R' : '-', session->send_seq ? 'S' : '-', session->lns_mode ? 
"LNS" : "LAC", - session->debug, + 0, jiffies_to_msecs(session->reorder_timeout)); seq_printf(m, " offset 0 l2specific %hu/%hu\n", session->l2specific_type, l2tp_get_l2specific_len(session)); diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index def78eebca4c..31a1e27eab20 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -229,9 +229,6 @@ static int l2tp_nl_cmd_tunnel_create(struct sk_buff *skb, struct genl_info *info goto out; } - if (attrs[L2TP_ATTR_DEBUG]) - cfg.debug = nla_get_u32(attrs[L2TP_ATTR_DEBUG]); - ret = -EINVAL; switch (cfg.encap) { case L2TP_ENCAPTYPE_UDP: @@ -307,9 +304,6 @@ static int l2tp_nl_cmd_tunnel_modify(struct sk_buff *skb, struct genl_info *info goto out; } - if (info->attrs[L2TP_ATTR_DEBUG]) - tunnel->debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]); - ret = l2tp_tunnel_notify(&l2tp_nl_family, info, tunnel, L2TP_CMD_TUNNEL_MODIFY); @@ -400,7 +394,7 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla if (nla_put_u8(skb, L2TP_ATTR_PROTO_VERSION, tunnel->version) || nla_put_u32(skb, L2TP_ATTR_CONN_ID, tunnel->tunnel_id) || nla_put_u32(skb, L2TP_ATTR_PEER_CONN_ID, tunnel->peer_tunnel_id) || - nla_put_u32(skb, L2TP_ATTR_DEBUG, tunnel->debug) || + nla_put_u32(skb, L2TP_ATTR_DEBUG, 0) || nla_put_u16(skb, L2TP_ATTR_ENCAP_TYPE, tunnel->encap)) goto nla_put_failure; @@ -605,9 +599,6 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf cfg.ifname = nla_data(info->attrs[L2TP_ATTR_IFNAME]); } - if (info->attrs[L2TP_ATTR_DEBUG]) - cfg.debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]); - if (info->attrs[L2TP_ATTR_RECV_SEQ]) cfg.recv_seq = nla_get_u8(info->attrs[L2TP_ATTR_RECV_SEQ]); @@ -689,9 +680,6 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf goto out; } - if (info->attrs[L2TP_ATTR_DEBUG]) - session->debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]); - if (info->attrs[L2TP_ATTR_RECV_SEQ]) session->recv_seq = nla_get_u8(info->attrs[L2TP_ATTR_RECV_SEQ]); @@ -730,7 +718,7 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl nla_put_u32(skb, L2TP_ATTR_SESSION_ID, session->session_id) || nla_put_u32(skb, L2TP_ATTR_PEER_CONN_ID, tunnel->peer_tunnel_id) || nla_put_u32(skb, L2TP_ATTR_PEER_SESSION_ID, session->peer_session_id) || - nla_put_u32(skb, L2TP_ATTR_DEBUG, session->debug) || + nla_put_u32(skb, L2TP_ATTR_DEBUG, 0) || nla_put_u16(skb, L2TP_ATTR_PW_TYPE, session->pwtype)) goto nla_put_failure; diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index bd6bb17dfadb..450637ffa557 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -702,7 +702,6 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, if (!tunnel) { struct l2tp_tunnel_cfg tcfg = { .encap = L2TP_ENCAPTYPE_UDP, - .debug = 0, }; /* Prevent l2tp_tunnel_register() from trying to set up @@ -1147,7 +1146,7 @@ static int pppol2tp_tunnel_setsockopt(struct sock *sk, switch (optname) { case PPPOL2TP_SO_DEBUG: - tunnel->debug = val; + /* Tunnel debug flags option is deprecated */ break; default: @@ -1199,7 +1198,7 @@ static int pppol2tp_session_setsockopt(struct sock *sk, break; case PPPOL2TP_SO_DEBUG: - session->debug = val; + /* Session debug flags option is deprecated */ break; case PPPOL2TP_SO_REORDERTO: @@ -1271,7 +1270,8 @@ static int pppol2tp_tunnel_getsockopt(struct sock *sk, switch (optname) { case PPPOL2TP_SO_DEBUG: - *val = tunnel->debug; + /* Tunnel debug flags option is deprecated */ + *val = 0; break; default: @@ 
-1304,7 +1304,8 @@ static int pppol2tp_session_getsockopt(struct sock *sk, break; case PPPOL2TP_SO_DEBUG: - *val = session->debug; + /* Session debug flags option is deprecated */ + *val = 0; break; case PPPOL2TP_SO_REORDERTO: @@ -1496,7 +1497,7 @@ static void pppol2tp_seq_tunnel_show(struct seq_file *m, void *v) (tunnel == tunnel->sock->sk_user_data) ? 'Y' : 'N', refcount_read(&tunnel->ref_count) - 1); seq_printf(m, " %08x %ld/%ld/%ld %ld/%ld/%ld\n", - tunnel->debug, + 0, atomic_long_read(&tunnel->stats.tx_packets), atomic_long_read(&tunnel->stats.tx_bytes), atomic_long_read(&tunnel->stats.tx_errors), @@ -1542,7 +1543,7 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v) session->recv_seq ? 'R' : '-', session->send_seq ? 'S' : '-', session->lns_mode ? "LNS" : "LAC", - session->debug, + 0, jiffies_to_msecs(session->reorder_timeout)); seq_printf(m, " %hu/%hu %ld/%ld/%ld %ld/%ld/%ld\n", session->nr, session->ns, -- cgit v1.2.3 From 2b8ee4f05d4f6a6c427ad30dd6c1bb49eb2efd3b Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 20 Aug 2020 12:00:21 -0700 Subject: tcp: bpf: Add TCP_BPF_DELACK_MAX setsockopt This change is mostly from an internal patch and adapts it from sysctl config to the bpf_setsockopt setup. The bpf_prog can set the max delay ack by using bpf_setsockopt(TCP_BPF_DELACK_MAX). This max delay ack can be communicated to its peer through bpf header option. The receiving peer can then use this max delay ack and set a potentially lower rto by using bpf_setsockopt(TCP_BPF_RTO_MIN) which will be introduced in the next patch. Another later selftest patch will also use it like the above to show how to write and parse bpf tcp header option. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Reviewed-by: Eric Dumazet Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200820190021.2884000-1-kafai@fb.com --- include/net/inet_connection_sock.h | 1 + include/uapi/linux/bpf.h | 1 + net/core/filter.c | 8 ++++++++ net/ipv4/tcp.c | 2 ++ net/ipv4/tcp_output.c | 2 ++ tools/include/uapi/linux/bpf.h | 1 + 6 files changed, 15 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index aa8893c68c50..da7264a1ebfc 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -86,6 +86,7 @@ struct inet_connection_sock { struct timer_list icsk_retransmit_timer; struct timer_list icsk_delack_timer; __u32 icsk_rto; + __u32 icsk_delack_max; __u32 icsk_pmtu_cookie; const struct tcp_congestion_ops *icsk_ca_ops; const struct inet_connection_sock_af_ops *icsk_af_ops; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a1bbaff7a0af..7b905cb0213e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4257,6 +4257,7 @@ enum { enum { TCP_BPF_IW = 1001, /* Set TCP initial congestion window */ TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ + TCP_BPF_DELACK_MAX = 1003, /* Max delay ack in usecs */ }; struct bpf_perf_event_value { diff --git a/net/core/filter.c b/net/core/filter.c index c847b1285acd..80fe7420f609 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4459,6 +4459,7 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, } else { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + unsigned long timeout; if (optlen != sizeof(int)) return -EINVAL; @@ -4480,6 +4481,13 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, tp->snd_ssthresh = 
val; } break; + case TCP_BPF_DELACK_MAX: + timeout = usecs_to_jiffies(val); + if (timeout > TCP_DELACK_MAX || + timeout < TCP_TIMEOUT_MIN) + return -EINVAL; + inet_csk(sk)->icsk_delack_max = timeout; + break; case TCP_SAVE_SYN: if (val < 0 || val > 1) ret = -EINVAL; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 87d3036d8bd8..44c353a39ad4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -418,6 +418,7 @@ void tcp_init_sock(struct sock *sk) INIT_LIST_HEAD(&tp->tsorted_sent_queue); icsk->icsk_rto = TCP_TIMEOUT_INIT; + icsk->icsk_delack_max = TCP_DELACK_MAX; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U); @@ -2685,6 +2686,7 @@ int tcp_disconnect(struct sock *sk, int flags) icsk->icsk_backoff = 0; icsk->icsk_probes_out = 0; icsk->icsk_rto = TCP_TIMEOUT_INIT; + icsk->icsk_delack_max = TCP_DELACK_MAX; tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd = TCP_INIT_CWND; tp->snd_cwnd_cnt = 0; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 85ff417bda7f..44ffa4891beb 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3741,6 +3741,8 @@ void tcp_send_delayed_ack(struct sock *sk) ato = min(ato, max_ato); } + ato = min_t(u32, ato, inet_csk(sk)->icsk_delack_max); + /* Stay within the limit we were given */ timeout = jiffies + ato; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a1bbaff7a0af..7b905cb0213e 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4257,6 +4257,7 @@ enum { enum { TCP_BPF_IW = 1001, /* Set TCP initial congestion window */ TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ + TCP_BPF_DELACK_MAX = 1003, /* Max delay ack in usecs */ }; struct bpf_perf_event_value { -- cgit v1.2.3 From ca584ba070864c606f3a54faaafe774726d5b4a1 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 20 Aug 2020 12:00:27 -0700 Subject: tcp: bpf: Add TCP_BPF_RTO_MIN for bpf_setsockopt This patch adds bpf_setsockopt(TCP_BPF_RTO_MIN) to allow bpf prog to set the min rto of a connection. It could be used together with the earlier patch which has added bpf_setsockopt(TCP_BPF_DELACK_MAX). A later selftest patch will communicate the max delay ack in a bpf tcp header option and then the receiving side can use bpf_setsockopt(TCP_BPF_RTO_MIN) to set a shorter rto. 
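As a rough illustration of how this option pairs with TCP_BPF_DELACK_MAX from the previous patch, a minimal sock_ops sketch could look like the following; it is not taken from this series, the 25 ms values are arbitrary, and the local SOL_TCP fallback is an assumption for toolchains whose headers lack the define:

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#ifndef SOL_TCP
#define SOL_TCP 6       /* fallback if the toolchain headers lack it */
#endif

SEC("sockops")
int set_bpf_rto(struct bpf_sock_ops *skops)
{
        int delack_max_us = 25000;      /* 25 ms, illustrative */
        int rto_min_us = 25000;         /* 25 ms, illustrative */

        switch (skops->op) {
        case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
        case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
                /* Cap our own delayed acks and lower the rto floor.
                 * Both values must stay within the kernel's bounds
                 * (TCP_TIMEOUT_MIN up to TCP_DELACK_MAX / TCP_RTO_MIN)
                 * or bpf_setsockopt() returns -EINVAL.
                 */
                bpf_setsockopt(skops, SOL_TCP, TCP_BPF_DELACK_MAX,
                               &delack_max_us, sizeof(delack_max_us));
                bpf_setsockopt(skops, SOL_TCP, TCP_BPF_RTO_MIN,
                               &rto_min_us, sizeof(rto_min_us));
                break;
        }
        return 1;
}

char _license[] SEC("license") = "GPL";

In the intended deployment the receiver would advertise its max delayed ack in a bpf TCP header option (added later in this series) and the sender would derive its TCP_BPF_RTO_MIN from the advertised value rather than hard-coding it.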
Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Reviewed-by: Eric Dumazet Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200820190027.2884170-1-kafai@fb.com --- include/net/inet_connection_sock.h | 1 + include/net/tcp.h | 2 +- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 7 +++++++ net/ipv4/tcp.c | 2 ++ tools/include/uapi/linux/bpf.h | 1 + 6 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index da7264a1ebfc..c738abeb3265 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -86,6 +86,7 @@ struct inet_connection_sock { struct timer_list icsk_retransmit_timer; struct timer_list icsk_delack_timer; __u32 icsk_rto; + __u32 icsk_rto_min; __u32 icsk_delack_max; __u32 icsk_pmtu_cookie; const struct tcp_congestion_ops *icsk_ca_ops; diff --git a/include/net/tcp.h b/include/net/tcp.h index eab6c7510b5b..dda778c782fe 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -699,7 +699,7 @@ static inline void tcp_fast_path_check(struct sock *sk) static inline u32 tcp_rto_min(struct sock *sk) { const struct dst_entry *dst = __sk_dst_get(sk); - u32 rto_min = TCP_RTO_MIN; + u32 rto_min = inet_csk(sk)->icsk_rto_min; if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7b905cb0213e..1ae20058b574 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4258,6 +4258,7 @@ enum { TCP_BPF_IW = 1001, /* Set TCP initial congestion window */ TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ TCP_BPF_DELACK_MAX = 1003, /* Max delay ack in usecs */ + TCP_BPF_RTO_MIN = 1004, /* Min delay ack in usecs */ }; struct bpf_perf_event_value { diff --git a/net/core/filter.c b/net/core/filter.c index 80fe7420f609..075ab71b985c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4488,6 +4488,13 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, return -EINVAL; inet_csk(sk)->icsk_delack_max = timeout; break; + case TCP_BPF_RTO_MIN: + timeout = usecs_to_jiffies(val); + if (timeout > TCP_RTO_MIN || + timeout < TCP_TIMEOUT_MIN) + return -EINVAL; + inet_csk(sk)->icsk_rto_min = timeout; + break; case TCP_SAVE_SYN: if (val < 0 || val > 1) ret = -EINVAL; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 44c353a39ad4..6075cb091a20 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -418,6 +418,7 @@ void tcp_init_sock(struct sock *sk) INIT_LIST_HEAD(&tp->tsorted_sent_queue); icsk->icsk_rto = TCP_TIMEOUT_INIT; + icsk->icsk_rto_min = TCP_RTO_MIN; icsk->icsk_delack_max = TCP_DELACK_MAX; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U); @@ -2686,6 +2687,7 @@ int tcp_disconnect(struct sock *sk, int flags) icsk->icsk_backoff = 0; icsk->icsk_probes_out = 0; icsk->icsk_rto = TCP_TIMEOUT_INIT; + icsk->icsk_rto_min = TCP_RTO_MIN; icsk->icsk_delack_max = TCP_DELACK_MAX; tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd = TCP_INIT_CWND; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7b905cb0213e..1ae20058b574 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4258,6 +4258,7 @@ enum { TCP_BPF_IW = 1001, /* Set TCP initial congestion window */ TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ TCP_BPF_DELACK_MAX = 1003, /* Max delay ack in usecs */ + TCP_BPF_RTO_MIN = 1004, /* Min 
delay ack in usecs */ }; struct bpf_perf_event_value { -- cgit v1.2.3 From 00d211a4ea6f48e8e3b758813fe23ad28193d3bf Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 20 Aug 2020 12:00:46 -0700 Subject: bpf: tcp: Add bpf_skops_parse_hdr() The patch adds a function bpf_skops_parse_hdr(). It will call the bpf prog to parse the TCP header received at a tcp_sock that has at least reached the ESTABLISHED state. For the packets received during the 3WHS (SYN, SYNACK and ACK), the received skb will be available to the bpf prog during the callback in bpf_skops_established() introduced in the previous patch and in the bpf_skops_write_hdr_opt() that will be added in the next patch. Calling bpf prog to parse header is controlled by two new flags in tp->bpf_sock_ops_cb_flags: BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG and BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG. When BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG is set, the bpf prog will only be called when there is unknown option in the TCP header. When BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG is set, the bpf prog will be called on all received TCP header. This function is half implemented to highlight the changes in TCP stack. The actual codes preparing the bpf running context and invoking the bpf prog will be added in the later patch with other necessary bpf pieces. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/bpf/20200820190046.2885054-1-kafai@fb.com --- include/uapi/linux/bpf.h | 4 +++- net/ipv4/tcp_input.c | 36 ++++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 4 +++- 3 files changed, 42 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1ae20058b574..010ed2abcb66 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4173,8 +4173,10 @@ enum { BPF_SOCK_OPS_RETRANS_CB_FLAG = (1<<1), BPF_SOCK_OPS_STATE_CB_FLAG = (1<<2), BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), + BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4), + BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = (1<<5), /* Mask of all currently supported cb flags */ - BPF_SOCK_OPS_ALL_CB_FLAGS = 0xF, + BPF_SOCK_OPS_ALL_CB_FLAGS = 0x3F, }; /* List of known BPF sock_ops operators. diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7b0faa2bfe32..b520450170d1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -139,6 +139,36 @@ EXPORT_SYMBOL_GPL(clean_acked_data_flush); #endif #ifdef CONFIG_CGROUP_BPF +static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) +{ + bool unknown_opt = tcp_sk(sk)->rx_opt.saw_unknown && + BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), + BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG); + bool parse_all_opt = BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), + BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG); + + if (likely(!unknown_opt && !parse_all_opt)) + return; + + /* The skb will be handled in the + * bpf_skops_established() or + * bpf_skops_write_hdr_opt(). + */ + switch (sk->sk_state) { + case TCP_SYN_RECV: + case TCP_SYN_SENT: + case TCP_LISTEN: + return; + } + + /* BPF prog will have access to the sk and skb. + * + * The bpf running context preparation and the actual bpf prog + * calling will be implemented in a later PATCH together with + * other bpf pieces. 
+ */ +} + static void bpf_skops_established(struct sock *sk, int bpf_op, struct sk_buff *skb) { @@ -155,6 +185,10 @@ static void bpf_skops_established(struct sock *sk, int bpf_op, BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); } #else +static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) +{ +} + static void bpf_skops_established(struct sock *sk, int bpf_op, struct sk_buff *skb) { @@ -5623,6 +5657,8 @@ syn_challenge: goto discard; } + bpf_skops_parse_hdr(sk, skb); + return true; discard: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 1ae20058b574..010ed2abcb66 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4173,8 +4173,10 @@ enum { BPF_SOCK_OPS_RETRANS_CB_FLAG = (1<<1), BPF_SOCK_OPS_STATE_CB_FLAG = (1<<2), BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), + BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4), + BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = (1<<5), /* Mask of all currently supported cb flags */ - BPF_SOCK_OPS_ALL_CB_FLAGS = 0xF, + BPF_SOCK_OPS_ALL_CB_FLAGS = 0x3F, }; /* List of known BPF sock_ops operators. -- cgit v1.2.3 From 331fca4315efa3bbd258fbdf8209d59d253c0480 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 20 Aug 2020 12:00:52 -0700 Subject: bpf: tcp: Add bpf_skops_hdr_opt_len() and bpf_skops_write_hdr_opt() The bpf prog needs to parse the SYN header to learn what options have been sent by the peer's bpf-prog before writing its options into SYNACK. This patch adds a "syn_skb" arg to tcp_make_synack() and send_synack(). This syn_skb will eventually be made available (as read-only) to the bpf prog. This will be the only SYN packet available to the bpf prog during syncookie. For other regular cases, the bpf prog can also use the saved_syn. When writing options, the bpf prog will first be called to tell the kernel its required number of bytes. It is done by the new bpf_skops_hdr_opt_len(). The bpf prog will only be called when the new BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG is set in tp->bpf_sock_ops_cb_flags. When the bpf prog returns, the kernel will know how many bytes are needed and then update the "*remaining" arg accordingly. 4 byte alignment will be included in the "*remaining" before this function returns. The 4 byte aligned number of bytes will also be stored into the opts->bpf_opt_len. "bpf_opt_len" is a newly added member to the struct tcp_out_options. Then the new bpf_skops_write_hdr_opt() will call the bpf prog to write the header options. The bpf prog is only called if it has reserved spaces before (opts->bpf_opt_len > 0). The bpf prog is the last one getting a chance to reserve header space and writing the header option. These two functions are half implemented to highlight the changes in TCP stack. The actual codes preparing the bpf running context and invoking the bpf prog will be added in the later patch with other necessary bpf pieces. 
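For orientation, here is a sketch of the kind of program these hooks are designed to serve. It relies on callbacks (BPF_SOCK_OPS_HDR_OPT_LEN_CB, BPF_SOCK_OPS_WRITE_HDR_OPT_CB) and helpers (bpf_reserve_hdr_opt(), bpf_store_hdr_opt()) that only land in later patches of this series, so treat it as a forward-looking illustration rather than something this commit alone enables; the option kind and payload bytes are arbitrary:

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("sockops")
int hdr_opt(struct bpf_sock_ops *skops)
{
        /* Toy experimental option: kind 254, total length 4 */
        __u8 opt[4] = { 254, 4, 0xeb, 0x9f };

        switch (skops->op) {
        case BPF_SOCK_OPS_TCP_LISTEN_CB:
        case BPF_SOCK_OPS_TCP_CONNECT_CB:
                /* Opt in to the header-option callbacks */
                bpf_sock_ops_cb_flags_set(skops,
                        skops->bpf_sock_ops_cb_flags |
                        BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG |
                        BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG);
                break;
        case BPF_SOCK_OPS_HDR_OPT_LEN_CB:
                /* Reserve room; the kernel 4-byte aligns the total,
                 * matching opts->bpf_opt_len in tcp_out_options.
                 */
                bpf_reserve_hdr_opt(skops, sizeof(opt), 0);
                break;
        case BPF_SOCK_OPS_WRITE_HDR_OPT_CB:
                bpf_store_hdr_opt(skops, opt, sizeof(opt), 0);
                break;
        }
        return 1;
}

char _license[] SEC("license") = "GPL";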
Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/bpf/20200820190052.2885316-1-kafai@fb.com --- include/net/tcp.h | 6 ++- include/uapi/linux/bpf.h | 3 +- net/ipv4/tcp_input.c | 5 +- net/ipv4/tcp_ipv4.c | 5 +- net/ipv4/tcp_output.c | 105 ++++++++++++++++++++++++++++++++++++----- net/ipv6/tcp_ipv6.c | 5 +- tools/include/uapi/linux/bpf.h | 3 +- 7 files changed, 109 insertions(+), 23 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/tcp.h b/include/net/tcp.h index c186dbf731e1..3e768a6b8264 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -455,7 +455,8 @@ enum tcp_synack_type { struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, struct tcp_fastopen_cookie *foc, - enum tcp_synack_type synack_type); + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb); int tcp_disconnect(struct sock *sk, int flags); void tcp_finish_connect(struct sock *sk, struct sk_buff *skb); @@ -2035,7 +2036,8 @@ struct tcp_request_sock_ops { int (*send_synack)(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, - enum tcp_synack_type synack_type); + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb); }; extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 010ed2abcb66..18d0e128bc3c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4175,8 +4175,9 @@ enum { BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4), BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = (1<<5), + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG = (1<<6), /* Mask of all currently supported cb flags */ - BPF_SOCK_OPS_ALL_CB_FLAGS = 0x3F, + BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F, }; /* List of known BPF sock_ops operators. diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b520450170d1..8c9da4b65dae 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6824,7 +6824,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, } if (fastopen_sk) { af_ops->send_synack(fastopen_sk, dst, &fl, req, - &foc, TCP_SYNACK_FASTOPEN); + &foc, TCP_SYNACK_FASTOPEN, skb); /* Add the child socket directly into the accept queue */ if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) { reqsk_fastopen_remove(fastopen_sk, req, false); @@ -6842,7 +6842,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_timeout_init((struct sock *)req)); af_ops->send_synack(sk, dst, &fl, req, &foc, !want_cookie ? 
TCP_SYNACK_NORMAL : - TCP_SYNACK_COOKIE); + TCP_SYNACK_COOKIE, + skb); if (want_cookie) { reqsk_free(req); return 0; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5084333b5ab6..631a5ee0dd4e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -965,7 +965,8 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, - enum tcp_synack_type synack_type) + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb) { const struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4; @@ -976,7 +977,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) return -1; - skb = tcp_make_synack(sk, dst, req, foc, synack_type); + skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); if (skb) { __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 44ffa4891beb..673db6879e46 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -438,6 +438,7 @@ struct tcp_out_options { u8 ws; /* window scale, 0 to disable */ u8 num_sack_blocks; /* number of SACK blocks to include */ u8 hash_size; /* bytes in hash_location */ + u8 bpf_opt_len; /* length of BPF hdr option */ __u8 *hash_location; /* temporary pointer, overloaded */ __u32 tsval, tsecr; /* need to include OPTION_TS */ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ @@ -452,6 +453,59 @@ static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts) #endif } +#ifdef CONFIG_CGROUP_BPF +/* req, syn_skb and synack_type are used when writing synack */ +static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct sk_buff *syn_skb, + enum tcp_synack_type synack_type, + struct tcp_out_options *opts, + unsigned int *remaining) +{ + if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) || + !*remaining) + return; + + /* The bpf running context preparation and the actual bpf prog + * calling will be implemented in a later PATCH together with + * other bpf pieces. + */ +} + +static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct sk_buff *syn_skb, + enum tcp_synack_type synack_type, + struct tcp_out_options *opts) +{ + if (likely(!opts->bpf_opt_len)) + return; + + /* The bpf running context preparation and the actual bpf prog + * calling will be implemented in a later PATCH together with + * other bpf pieces. + */ +} +#else +static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct sk_buff *syn_skb, + enum tcp_synack_type synack_type, + struct tcp_out_options *opts, + unsigned int *remaining) +{ +} + +static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct sk_buff *syn_skb, + enum tcp_synack_type synack_type, + struct tcp_out_options *opts) +{ +} +#endif + /* Write previously computed TCP options to the packet. 
* * Beware: Something in the Internet is very sensitive to the ordering of @@ -691,6 +745,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } } + bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); + return MAX_TCP_OPTION_SPACE - remaining; } @@ -701,7 +757,8 @@ static unsigned int tcp_synack_options(const struct sock *sk, struct tcp_out_options *opts, const struct tcp_md5sig_key *md5, struct tcp_fastopen_cookie *foc, - enum tcp_synack_type synack_type) + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); unsigned int remaining = MAX_TCP_OPTION_SPACE; @@ -758,6 +815,9 @@ static unsigned int tcp_synack_options(const struct sock *sk, smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); + bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb, + synack_type, opts, &remaining); + return MAX_TCP_OPTION_SPACE - remaining; } @@ -826,6 +886,15 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; } + if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp, + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) { + unsigned int remaining = MAX_TCP_OPTION_SPACE - size; + + bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); + + size = MAX_TCP_OPTION_SPACE - remaining; + } + return size; } @@ -1213,6 +1282,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, } #endif + /* BPF prog is the last one writing header option */ + bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts); + INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check, tcp_v6_send_check, tcp_v4_send_check, sk, skb); @@ -3336,20 +3408,20 @@ int tcp_send_synack(struct sock *sk) } /** - * tcp_make_synack - Prepare a SYN-ACK. - * sk: listener socket - * dst: dst entry attached to the SYNACK - * req: request_sock pointer - * foc: cookie for tcp fast open - * synack_type: Type of synback to prepare - * - * Allocate one skb and build a SYNACK packet. - * @dst is consumed : Caller should not use it again. + * tcp_make_synack - Allocate one skb and build a SYNACK packet. + * @sk: listener socket + * @dst: dst entry attached to the SYNACK. It is consumed and caller + * should not use it again. + * @req: request_sock pointer + * @foc: cookie for tcp fast open + * @synack_type: Type of synack to prepare + * @syn_skb: SYN packet just received. It could be NULL for rtx case. 
*/ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, struct tcp_fastopen_cookie *foc, - enum tcp_synack_type synack_type) + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); const struct tcp_sock *tp = tcp_sk(sk); @@ -3408,8 +3480,11 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req)); #endif skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4); + /* bpf program will be interested in the tcp_flags */ + TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK; tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5, - foc, synack_type) + sizeof(*th); + foc, synack_type, + syn_skb) + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); @@ -3441,6 +3516,9 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, rcu_read_unlock(); #endif + bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb, + synack_type, &opts); + skb->skb_mstamp_ns = now; tcp_add_tx_delay(skb, tp); @@ -3936,7 +4014,8 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) int res; tcp_rsk(req)->txhash = net_tx_rndhash(); - res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL); + res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL, + NULL); if (!res) { __TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 305870a72352..87a633e1fbef 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -501,7 +501,8 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, - enum tcp_synack_type synack_type) + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); struct ipv6_pinfo *np = tcp_inet6_sk(sk); @@ -515,7 +516,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, IPPROTO_TCP)) == NULL) goto done; - skb = tcp_make_synack(sk, dst, req, foc, synack_type); + skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); if (skb) { __tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr, diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 010ed2abcb66..18d0e128bc3c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4175,8 +4175,9 @@ enum { BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4), BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = (1<<5), + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG = (1<<6), /* Mask of all currently supported cb flags */ - BPF_SOCK_OPS_ALL_CB_FLAGS = 0x3F, + BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F, }; /* List of known BPF sock_ops operators. -- cgit v1.2.3 From 0813a841566f0962a5551be7749b43c45f0022a0 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 20 Aug 2020 12:01:04 -0700 Subject: bpf: tcp: Allow bpf prog to write and parse TCP header option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Note: The TCP changes here is mainly to implement the bpf pieces into the bpf_skops_*() functions introduced in the earlier patches. ] The earlier effort in BPF-TCP-CC allows the TCP Congestion Control algorithm to be written in BPF. 
It opens up opportunities to allow a faster turnaround time in testing/releasing new congestion control ideas to a production environment. The same flexibility can be extended to writing TCP header options. It is not uncommon that people want to test a new TCP header option to improve TCP performance. Another use case is for data centers that have a more controlled environment and more flexibility in putting header options to internal-only use. For example, we want to test the idea of putting a maximum delay ACK in a TCP header option, which is similar to a draft RFC proposal [1]. This patch introduces the necessary BPF API and uses it in the TCP stack to allow a BPF_PROG_TYPE_SOCK_OPS program to parse and write TCP header options. It currently supports most TCP packets except RST. Supported TCP header option: ─────────────────────────── This patch allows the bpf-prog to write any option kind. Different bpf-progs can write their own options by calling the new helper bpf_store_hdr_opt(). The helper will ensure there is no duplicated option in the header. Allowing the bpf-prog to write any option kind gives it a lot of flexibility; it could also let the bpf-prog support a recently standardized option on an older kernel. Sockops Callback Flags: ────────────────────── The bpf program will only be called to parse/write tcp header options if the following newly added callback flags are enabled in tp->bpf_sock_ops_cb_flags: BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG A few words on the PARSE CB flags. When the above PARSE CB flags are turned on, the bpf-prog will be called on packets received at a sk that has at least reached the ESTABLISHED state. The parsing of the SYN-SYNACK-ACK is discussed in the "3 Way HandShake" section. The default is off for all of the above new CB flags, i.e. the bpf prog will not be called to parse or write bpf hdr options. There are detailed comments on these new cb flags in the UAPI bpf.h. sock_ops->skb_data and bpf_load_hdr_opt() ───────────────────────────────────────── sock_ops->skb_data and sock_ops->skb_data_end cover the whole TCP header and its options. They are read only. The new bpf_load_hdr_opt() helps to read a particular option "kind" from the skb_data. Please refer to the comment in UAPI bpf.h. It has details on what skb_data contains under different sock_ops->op. 3 Way HandShake ─────────────── The bpf-prog can learn whether it is sending a SYN or a SYNACK by reading sock_ops->skb_tcp_flags. * Passive side When writing a SYNACK (i.e. sock_ops->op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB), the received SYN skb will be available to the bpf prog. The bpf prog can use the SYN skb (which may carry the header option sent from the remote bpf prog) to decide what bpf header option should be written to the outgoing SYNACK skb. The SYN packet can be obtained by getsockopt(TCP_BPF_SYN*). More on this later. Also, the bpf prog can learn if it is in syncookie mode (by checking sock_ops->args[0] == BPF_WRITE_HDR_TCP_SYNACK_COOKIE). The bpf prog can store the received SYN pkt by using the existing bpf_setsockopt(TCP_SAVE_SYN). The example in a later patch does this. [ Note that the fullsock here is a listen sk; bpf_sk_storage is not very useful here since the listen sk will be shared by many concurrent connection requests.
Extending bpf_sk_storage support to request_sock will add weight to the minisock and it is not necessarily better than storing the whole ~100 byte SYN pkt. ] When the connection is established, the bpf prog will be called in the existing PASSIVE_ESTABLISHED_CB callback. At that time, the bpf prog can get the header option from the saved syn and then apply the needed operation to the newly established socket. As an example, a later patch will use the max delay ack specified in the SYN header to set the RTO of the newly established connection. The received ACK (that concludes the 3WHS) will also be available to the bpf prog during PASSIVE_ESTABLISHED_CB through the sock_ops->skb_data. It could be useful in the syncookie scenario. More on this later. There is an existing getsockopt "TCP_SAVED_SYN" to return the whole saved syn pkt which includes the IP[46] header and the TCP header. A few "TCP_BPF_SYN*" getsockopt options have been added to allow specifying where to start getting from, e.g. starting from the TCP header, or from the IP[46] header. The new getsockopt(TCP_BPF_SYN*) will also know where it can get the SYN packet from: - (a) the just-received syn (available when the bpf prog is writing the SYNACK) and it is the only way to get the SYN during syncookie mode. or - (b) the saved syn (available in PASSIVE_ESTABLISHED_CB and also other existing CBs). The bpf prog does not need to know where the SYN pkt is coming from. The getsockopt(TCP_BPF_SYN*) will hide these details. Similarly, a flag "BPF_LOAD_HDR_OPT_TCP_SYN" is also added to bpf_load_hdr_opt() to read a particular header option from the SYN packet. * Fastopen Fastopen should work the same as the regular non-fastopen case. This is tested in a later patch. * Syncookie For syncookie, the later example patch asks the active side's bpf prog to resend the header options in the ACK. The server can use bpf_load_hdr_opt() to look at the options in this received ACK during PASSIVE_ESTABLISHED_CB. * Active side The bpf prog will get a chance to write the bpf header option in the SYN packet during WRITE_HDR_OPT_CB. The received SYNACK pkt will also be available to the bpf prog during the existing ACTIVE_ESTABLISHED_CB callback through the sock_ops->skb_data and bpf_load_hdr_opt(). * Turn off header CB flags after 3WHS If the bpf prog does not need to write/parse header options beyond the 3WHS, it can clear the bpf_sock_ops_cb_flags to avoid being called for header options. Or the bpf-prog can choose to leave the UNKNOWN_HDR_OPT_CB_FLAG on so that the kernel will only call it when there is an option that the kernel cannot handle.
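To make the flow above concrete, here is a usage sketch (illustration only, not part of this patch) of a sockops prog that reserves, writes, and reads back one experimental option. It assumes the cb flags above were already enabled (e.g. as in the earlier sketch) and that the passive side saved the SYN with bpf_setsockopt(TCP_SAVE_SYN); the kind 254 / magic 0xeB9F / one-byte payload are made-up values:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

/* Made-up experimental option: kind 254, 2-byte magic, 1 byte of data */
struct tcp_exp_opt {
    __u8 kind;
    __u8 len;       /* covers kind + len + magic + data = 5 */
    __be16 magic;
    __u8 data;
} __attribute__((packed));

SEC("sockops")
int exp_hdr_opt(struct bpf_sock_ops *skops)
{
    struct tcp_exp_opt opt = {
        .kind = 254, .len = 5,
        .magic = bpf_htons(0xeB9F), .data = 42,
    };

    switch (skops->op) {
    case BPF_SOCK_OPS_HDR_OPT_LEN_CB:
        /* Tell the kernel how much option space this prog needs;
         * the kernel rounds the total up to 4 bytes.
         */
        bpf_reserve_hdr_opt(skops, sizeof(opt), 0);
        break;
    case BPF_SOCK_OPS_WRITE_HDR_OPT_CB:
        /* Write the whole option (kind, len, magic, data) into the
         * space reserved above. Errors ignored for brevity.
         */
        bpf_store_hdr_opt(skops, &opt, sizeof(opt), 0);
        break;
    case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
        /* Search key for an experimental kind: the 2nd byte is
         * kind(1) + len(1) + magic(2) = 4, per the helper doc.
         * On success the full 5-byte option is copied back.
         */
        opt.len = 4;
        opt.data = 0;
        bpf_load_hdr_opt(skops, &opt, sizeof(opt),
                         BPF_LOAD_HDR_OPT_TCP_SYN);
        break;
    }
    return 1;
}

char _license[] SEC("license") = "GPL";

On the active side, the same lookup could be done at ACTIVE_ESTABLISHED_CB with flags == 0, since sock_ops->skb_data covers the received SYNACK there.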
[1]: draft-wang-tcpm-low-latency-opt-00 https://tools.ietf.org/html/draft-wang-tcpm-low-latency-opt-00 Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200820190104.2885895-1-kafai@fb.com --- include/linux/bpf-cgroup.h | 25 +++ include/linux/filter.h | 4 + include/net/tcp.h | 49 ++++++ include/uapi/linux/bpf.h | 300 ++++++++++++++++++++++++++++++++- net/core/filter.c | 365 +++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_input.c | 20 ++- net/ipv4/tcp_minisocks.c | 1 + net/ipv4/tcp_output.c | 104 +++++++++++- tools/include/uapi/linux/bpf.h | 300 ++++++++++++++++++++++++++++++++- 9 files changed, 1150 insertions(+), 18 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 64f367044e25..2f98d2fce62e 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -279,6 +279,31 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_RECVMSG, NULL) +/* The SOCK_OPS"_SK" macro should be used when sock_ops->sk is not a + * fullsock and its parent fullsock cannot be traced by + * sk_to_full_sk(). + * + * e.g. sock_ops->sk is a request_sock and it is under syncookie mode. + * Its listener-sk is not attached to the rsk_listener. + * In this case, the caller holds the listener-sk (unlocked), + * set its sock_ops->sk to req_sk, and call this SOCK_OPS"_SK" with + * the listener-sk such that the cgroup-bpf-progs of the + * listener-sk will be run. + * + * Regardless of syncookie mode or not, + * calling bpf_setsockopt on listener-sk will not make sense anyway, + * so passing 'sock_ops->sk == req_sk' to the bpf prog is appropriate here. + */ +#define BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(sock_ops, sk) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_run_filter_sock_ops(sk, \ + sock_ops, \ + BPF_CGROUP_SOCK_OPS); \ + __ret; \ +}) + #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ int __ret = 0; \ diff --git a/include/linux/filter.h b/include/linux/filter.h index c427dfa5f908..995625950cc1 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1241,8 +1241,12 @@ struct bpf_sock_ops_kern { u32 reply; u32 replylong[4]; }; + struct sk_buff *syn_skb; + struct sk_buff *skb; + void *skb_data_end; u8 op; u8 is_fullsock; + u8 remaining_opt_len; u64 temp; /* temp and everything after is not * initialized to 0 before calling * the BPF program. New fields that diff --git a/include/net/tcp.h b/include/net/tcp.h index 3e768a6b8264..1f967b4e22f6 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2235,6 +2235,55 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags); #endif /* CONFIG_NET_SOCK_MSG */ +#ifdef CONFIG_CGROUP_BPF +/* Copy the listen sk's HDR_OPT_CB flags to its child. + * + * During 3-Way-HandShake, the synack is usually sent from + * the listen sk with the HDR_OPT_CB flags set so that + * bpf-prog will be called to write the BPF hdr option. + * + * In fastopen, the child sk is used to send synack instead + * of the listen sk. Thus, inheriting the HDR_OPT_CB flags + * from the listen sk gives the bpf-prog a chance to write + * BPF hdr option in the synack pkt during fastopen. 
+ * + * Both fastopen and non-fastopen child will inherit the + * HDR_OPT_CB flags to keep the bpf-prog having a consistent + * behavior when deciding to clear this cb flags (or not) + * during the PASSIVE_ESTABLISHED_CB. + * + * In the future, other cb flags could be inherited here also. + */ +static inline void bpf_skops_init_child(const struct sock *sk, + struct sock *child) +{ + tcp_sk(child)->bpf_sock_ops_cb_flags = + tcp_sk(sk)->bpf_sock_ops_cb_flags & + (BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG | + BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG | + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG); +} + +static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, + struct sk_buff *skb, + unsigned int end_offset) +{ + skops->skb = skb; + skops->skb_data_end = skb->data + end_offset; +} +#else +static inline void bpf_skops_init_child(const struct sock *sk, + struct sock *child) +{ +} + +static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, + struct sk_buff *skb, + unsigned int end_offset) +{ +} +#endif + /* Call BPF_SOCK_OPS program that returns an int. If the return value * is < 0, then the BPF op failed (for example if the loaded BPF * program does not support the chosen operation or there is no BPF diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 18d0e128bc3c..f67ec5d9e57d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3395,6 +3395,120 @@ union bpf_attr { * A non-negative value equal to or less than *size* on success, * or a negative error in case of failure. * + * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags) + * Description + * Load header option. Support reading a particular TCP header + * option for bpf program (BPF_PROG_TYPE_SOCK_OPS). + * + * If *flags* is 0, it will search the option from the + * sock_ops->skb_data. The comment in "struct bpf_sock_ops" + * has details on what skb_data contains under different + * sock_ops->op. + * + * The first byte of the *searchby_res* specifies the + * kind that it wants to search. + * + * If the searching kind is an experimental kind + * (i.e. 253 or 254 according to RFC6994). It also + * needs to specify the "magic" which is either + * 2 bytes or 4 bytes. It then also needs to + * specify the size of the magic by using + * the 2nd byte which is "kind-length" of a TCP + * header option and the "kind-length" also + * includes the first 2 bytes "kind" and "kind-length" + * itself as a normal TCP header option also does. + * + * For example, to search experimental kind 254 with + * 2 byte magic 0xeB9F, the searchby_res should be + * [ 254, 4, 0xeB, 0x9F, 0, 0, .... 0 ]. + * + * To search for the standard window scale option (3), + * the searchby_res should be [ 3, 0, 0, .... 0 ]. + * Note, kind-length must be 0 for regular option. + * + * Searching for No-Op (0) and End-of-Option-List (1) are + * not supported. + * + * *len* must be at least 2 bytes which is the minimal size + * of a header option. + * + * Supported flags: + * * **BPF_LOAD_HDR_OPT_TCP_SYN** to search from the + * saved_syn packet or the just-received syn packet. + * + * Return + * >0 when found, the header option is copied to *searchby_res*. + * The return value is the total length copied. + * + * **-EINVAL** If param is invalid + * + * **-ENOMSG** The option is not found + * + * **-ENOENT** No syn packet available when + * **BPF_LOAD_HDR_OPT_TCP_SYN** is used + * + * **-ENOSPC** Not enough space. Only *len* number of + * bytes are copied. 
+ * + * **-EFAULT** Cannot parse the header options in the packet + * + * **-EPERM** This helper cannot be used under the + * current sock_ops->op. + * + * long bpf_store_hdr_opt(struct bpf_sock_ops *skops, const void *from, u32 len, u64 flags) + * Description + * Store header option. The data will be copied + * from buffer *from* with length *len* to the TCP header. + * + * The buffer *from* should have the whole option that + * includes the kind, kind-length, and the actual + * option data. The *len* must be at least kind-length + * long. The kind-length does not have to be 4 byte + * aligned. The kernel will take care of the padding + * and setting the 4 bytes aligned value to th->doff. + * + * This helper will check for duplicated option + * by searching the same option in the outgoing skb. + * + * This helper can only be called during + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * Return + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** If param is invalid + * + * **-ENOSPC** Not enough space in the header. + * Nothing has been written + * + * **-EEXIST** The option has already existed + * + * **-EFAULT** Cannot parse the existing header options + * + * **-EPERM** This helper cannot be used under the + * current sock_ops->op. + * + * long bpf_reserve_hdr_opt(struct bpf_sock_ops *skops, u32 len, u64 flags) + * Description + * Reserve *len* bytes for the bpf header option. The + * space will be used by bpf_store_hdr_opt() later in + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * If bpf_reserve_hdr_opt() is called multiple times, + * the total number of bytes will be reserved. + * + * This helper can only be called during + * BPF_SOCK_OPS_HDR_OPT_LEN_CB. + * + * Return + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** if param is invalid + * + * **-ENOSPC** Not enough space in the header. + * + * **-EPERM** This helper cannot be used under the + * current sock_ops->op. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3539,6 +3653,9 @@ union bpf_attr { FN(skc_to_tcp_request_sock), \ FN(skc_to_udp6_sock), \ FN(get_task_stack), \ + FN(load_hdr_opt), \ + FN(store_hdr_opt), \ + FN(reserve_hdr_opt), /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -4165,6 +4282,36 @@ struct bpf_sock_ops { __u64 bytes_received; __u64 bytes_acked; __bpf_md_ptr(struct bpf_sock *, sk); + /* [skb_data, skb_data_end) covers the whole TCP header. + * + * BPF_SOCK_OPS_PARSE_HDR_OPT_CB: The packet received + * BPF_SOCK_OPS_HDR_OPT_LEN_CB: Not useful because the + * header has not been written. + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB: The header and options have + * been written so far. + * BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: The SYNACK that concludes + * the 3WHS. + * BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: The ACK that concludes + * the 3WHS. + * + * bpf_load_hdr_opt() can also be used to read a particular option. + */ + __bpf_md_ptr(void *, skb_data); + __bpf_md_ptr(void *, skb_data_end); + __u32 skb_len; /* The total length of a packet. + * It includes the header, options, + * and payload. + */ + __u32 skb_tcp_flags; /* tcp_flags of the header. It provides + * an easy way to check for tcp_flags + * without parsing skb_data. + * + * In particular, the skb_tcp_flags + * will still be available in + * BPF_SOCK_OPS_HDR_OPT_LEN even though + * the outgoing header has not + * been written yet. 
+ */ }; /* Definitions for bpf_sock_ops_cb_flags */ @@ -4173,8 +4320,48 @@ enum { BPF_SOCK_OPS_RETRANS_CB_FLAG = (1<<1), BPF_SOCK_OPS_STATE_CB_FLAG = (1<<2), BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), - BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4), + /* Call bpf for all received TCP headers. The bpf prog will be + * called under sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * + * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * for the header option related helpers that will be useful + * to the bpf programs. + * + * It could be used at the client/active side (i.e. connect() side) + * when the server told it that the server was in syncookie + * mode and required the active side to resend the bpf-written + * options. The active side can keep writing the bpf-options until + * it received a valid packet from the server side to confirm + * the earlier packet (and options) has been received. The later + * example patch is using it like this at the active side when the + * server is in syncookie mode. + * + * The bpf prog will usually turn this off in the common cases. + */ + BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4), + /* Call bpf when kernel has received a header option that + * the kernel cannot handle. The bpf prog will be called under + * sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB. + * + * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * for the header option related helpers that will be useful + * to the bpf programs. + */ BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = (1<<5), + /* Call bpf when the kernel is writing header options for the + * outgoing packet. The bpf prog will first be called + * to reserve space in a skb under + * sock_ops->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB. Then + * the bpf prog will be called to write the header option(s) + * under sock_ops->op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * Please refer to the comment in BPF_SOCK_OPS_HDR_OPT_LEN_CB + * and BPF_SOCK_OPS_WRITE_HDR_OPT_CB for the header option + * related helpers that will be useful to the bpf programs. + * + * The kernel gets its chance to reserve space and write + * options first before the BPF program does. + */ BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG = (1<<6), /* Mask of all currently supported cb flags */ BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F, @@ -4233,6 +4420,63 @@ enum { */ BPF_SOCK_OPS_RTT_CB, /* Called on every RTT. */ + BPF_SOCK_OPS_PARSE_HDR_OPT_CB, /* Parse the header option. + * It will be called to handle + * the packets received at + * an already established + * connection. + * + * sock_ops->skb_data: + * Referring to the received skb. + * It covers the TCP header only. + * + * bpf_load_hdr_opt() can also + * be used to search for a + * particular option. + */ + BPF_SOCK_OPS_HDR_OPT_LEN_CB, /* Reserve space for writing the + * header option later in + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * Arg1: bool want_cookie. (in + * writing SYNACK only) + * + * sock_ops->skb_data: + * Not available because no header has + * been written yet. + * + * sock_ops->skb_tcp_flags: + * The tcp_flags of the + * outgoing skb. (e.g. SYN, ACK, FIN). + * + * bpf_reserve_hdr_opt() should + * be used to reserve space. + */ + BPF_SOCK_OPS_WRITE_HDR_OPT_CB, /* Write the header options + * Arg1: bool want_cookie. (in + * writing SYNACK only) + * + * sock_ops->skb_data: + * Referring to the outgoing skb. + * It covers the TCP header + * that has already been written + * by the kernel and the + * earlier bpf-progs. + * + * sock_ops->skb_tcp_flags: + * The tcp_flags of the outgoing + * skb. (e.g. 
SYN, ACK, FIN). + * + * bpf_store_hdr_opt() should + * be used to write the + * option. + * + * bpf_load_hdr_opt() can also + * be used to search for a + * particular option that + * has already been written + * by the kernel or the + * earlier bpf-progs. + */ }; /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect @@ -4262,6 +4506,60 @@ enum { TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ TCP_BPF_DELACK_MAX = 1003, /* Max delay ack in usecs */ TCP_BPF_RTO_MIN = 1004, /* Min delay ack in usecs */ + /* Copy the SYN pkt to optval + * + * BPF_PROG_TYPE_SOCK_OPS only. It is similar to the + * bpf_getsockopt(TCP_SAVED_SYN) but it does not limit + * to only getting from the saved_syn. It can either get the + * syn packet from: + * + * 1. the just-received SYN packet (only available when writing the + * SYNACK). It will be useful when it is not necessary to + * save the SYN packet for latter use. It is also the only way + * to get the SYN during syncookie mode because the syn + * packet cannot be saved during syncookie. + * + * OR + * + * 2. the earlier saved syn which was done by + * bpf_setsockopt(TCP_SAVE_SYN). + * + * The bpf_getsockopt(TCP_BPF_SYN*) option will hide where the + * SYN packet is obtained. + * + * If the bpf-prog does not need the IP[46] header, the + * bpf-prog can avoid parsing the IP header by using + * TCP_BPF_SYN. Otherwise, the bpf-prog can get both + * IP[46] and TCP header by using TCP_BPF_SYN_IP. + * + * >0: Total number of bytes copied + * -ENOSPC: Not enough space in optval. Only optlen number of + * bytes is copied. + * -ENOENT: The SYN skb is not available now and the earlier SYN pkt + * is not saved by setsockopt(TCP_SAVE_SYN). + */ + TCP_BPF_SYN = 1005, /* Copy the TCP header */ + TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */ +}; + +enum { + BPF_LOAD_HDR_OPT_TCP_SYN = (1ULL << 0), +}; + +/* args[0] value during BPF_SOCK_OPS_HDR_OPT_LEN_CB and + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + */ +enum { + BPF_WRITE_HDR_TCP_CURRENT_MSS = 1, /* Kernel is finding the + * total option spaces + * required for an established + * sk in order to calculate the + * MSS. No skb is actually + * sent. + */ + BPF_WRITE_HDR_TCP_SYNACK_COOKIE = 2, /* Kernel is in syncookie mode + * when sending a SYN. + */ }; struct bpf_perf_event_value { diff --git a/net/core/filter.c b/net/core/filter.c index 1608f4b3987f..ab5603d5b62a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4669,9 +4669,82 @@ static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = { .arg5_type = ARG_CONST_SIZE, }; +static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock, + int optname, const u8 **start) +{ + struct sk_buff *syn_skb = bpf_sock->syn_skb; + const u8 *hdr_start; + int ret; + + if (syn_skb) { + /* sk is a request_sock here */ + + if (optname == TCP_BPF_SYN) { + hdr_start = syn_skb->data; + ret = tcp_hdrlen(syn_skb); + } else { + /* optname == TCP_BPF_SYN_IP */ + hdr_start = skb_network_header(syn_skb); + ret = skb_network_header_len(syn_skb) + + tcp_hdrlen(syn_skb); + } + } else { + struct sock *sk = bpf_sock->sk; + struct saved_syn *saved_syn; + + if (sk->sk_state == TCP_NEW_SYN_RECV) + /* synack retransmit. bpf_sock->syn_skb will + * not be available. It has to resort to + * saved_syn (if it is saved). 
+ */ + saved_syn = inet_reqsk(sk)->saved_syn; + else + saved_syn = tcp_sk(sk)->saved_syn; + + if (!saved_syn) + return -ENOENT; + + if (optname == TCP_BPF_SYN) { + hdr_start = saved_syn->data + + saved_syn->network_hdrlen; + ret = saved_syn->tcp_hdrlen; + } else { + /* optname == TCP_BPF_SYN_IP */ + hdr_start = saved_syn->data; + ret = saved_syn->network_hdrlen + + saved_syn->tcp_hdrlen; + } + } + + *start = hdr_start; + return ret; +} + BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { + if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP && + optname >= TCP_BPF_SYN && optname <= TCP_BPF_SYN_IP) { + int ret, copy_len = 0; + const u8 *start; + + ret = bpf_sock_ops_get_syn(bpf_sock, optname, &start); + if (ret > 0) { + copy_len = ret; + if (optlen < copy_len) { + copy_len = optlen; + ret = -ENOSPC; + } + + memcpy(optval, start, copy_len); + } + + /* Zero out unused buffer at the end */ + memset(optval + copy_len, 0, optlen - copy_len); + + return ret; + } + return _bpf_getsockopt(bpf_sock->sk, level, optname, optval, optlen); } @@ -6165,6 +6238,232 @@ static const struct bpf_func_proto bpf_sk_assign_proto = { .arg3_type = ARG_ANYTHING, }; +static const u8 *bpf_search_tcp_opt(const u8 *op, const u8 *opend, + u8 search_kind, const u8 *magic, + u8 magic_len, bool *eol) +{ + u8 kind, kind_len; + + *eol = false; + + while (op < opend) { + kind = op[0]; + + if (kind == TCPOPT_EOL) { + *eol = true; + return ERR_PTR(-ENOMSG); + } else if (kind == TCPOPT_NOP) { + op++; + continue; + } + + if (opend - op < 2 || opend - op < op[1] || op[1] < 2) + /* Something is wrong in the received header. + * Follow the TCP stack's tcp_parse_options() + * and just bail here. + */ + return ERR_PTR(-EFAULT); + + kind_len = op[1]; + if (search_kind == kind) { + if (!magic_len) + return op; + + if (magic_len > kind_len - 2) + return ERR_PTR(-ENOMSG); + + if (!memcmp(&op[2], magic, magic_len)) + return op; + } + + op += kind_len; + } + + return ERR_PTR(-ENOMSG); +} + +BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, + void *, search_res, u32, len, u64, flags) +{ + bool eol, load_syn = flags & BPF_LOAD_HDR_OPT_TCP_SYN; + const u8 *op, *opend, *magic, *search = search_res; + u8 search_kind, search_len, copy_len, magic_len; + int ret; + + /* 2 byte is the minimal option len except TCPOPT_NOP and + * TCPOPT_EOL which are useless for the bpf prog to learn + * and this helper disallow loading them also. + */ + if (len < 2 || flags & ~BPF_LOAD_HDR_OPT_TCP_SYN) + return -EINVAL; + + search_kind = search[0]; + search_len = search[1]; + + if (search_len > len || search_kind == TCPOPT_NOP || + search_kind == TCPOPT_EOL) + return -EINVAL; + + if (search_kind == TCPOPT_EXP || search_kind == 253) { + /* 16 or 32 bit magic. 
+2 for kind and kind length */ + if (search_len != 4 && search_len != 6) + return -EINVAL; + magic = &search[2]; + magic_len = search_len - 2; + } else { + if (search_len) + return -EINVAL; + magic = NULL; + magic_len = 0; + } + + if (load_syn) { + ret = bpf_sock_ops_get_syn(bpf_sock, TCP_BPF_SYN, &op); + if (ret < 0) + return ret; + + opend = op + ret; + op += sizeof(struct tcphdr); + } else { + if (!bpf_sock->skb || + bpf_sock->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB) + /* This bpf_sock->op cannot call this helper */ + return -EPERM; + + opend = bpf_sock->skb_data_end; + op = bpf_sock->skb->data + sizeof(struct tcphdr); + } + + op = bpf_search_tcp_opt(op, opend, search_kind, magic, magic_len, + &eol); + if (IS_ERR(op)) + return PTR_ERR(op); + + copy_len = op[1]; + ret = copy_len; + if (copy_len > len) { + ret = -ENOSPC; + copy_len = len; + } + + memcpy(search_res, op, copy_len); + return ret; +} + +static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = { + .func = bpf_sock_ops_load_hdr_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_sock_ops_store_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, + const void *, from, u32, len, u64, flags) +{ + u8 new_kind, new_kind_len, magic_len = 0, *opend; + const u8 *op, *new_op, *magic = NULL; + struct sk_buff *skb; + bool eol; + + if (bpf_sock->op != BPF_SOCK_OPS_WRITE_HDR_OPT_CB) + return -EPERM; + + if (len < 2 || flags) + return -EINVAL; + + new_op = from; + new_kind = new_op[0]; + new_kind_len = new_op[1]; + + if (new_kind_len > len || new_kind == TCPOPT_NOP || + new_kind == TCPOPT_EOL) + return -EINVAL; + + if (new_kind_len > bpf_sock->remaining_opt_len) + return -ENOSPC; + + /* 253 is another experimental kind */ + if (new_kind == TCPOPT_EXP || new_kind == 253) { + if (new_kind_len < 4) + return -EINVAL; + /* Match for the 2 byte magic also. + * RFC 6994: the magic could be 2 or 4 bytes. + * Hence, matching by 2 byte only is on the + * conservative side but it is the right + * thing to do for the 'search-for-duplication' + * purpose. + */ + magic = &new_op[2]; + magic_len = 2; + } + + /* Check for duplication */ + skb = bpf_sock->skb; + op = skb->data + sizeof(struct tcphdr); + opend = bpf_sock->skb_data_end; + + op = bpf_search_tcp_opt(op, opend, new_kind, magic, magic_len, + &eol); + if (!IS_ERR(op)) + return -EEXIST; + + if (PTR_ERR(op) != -ENOMSG) + return PTR_ERR(op); + + if (eol) + /* The option has been ended. Treat it as no more + * header option can be written. + */ + return -ENOSPC; + + /* No duplication found. Store the header option. 
*/ + memcpy(opend, from, new_kind_len); + + bpf_sock->remaining_opt_len -= new_kind_len; + bpf_sock->skb_data_end += new_kind_len; + + return 0; +} + +static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = { + .func = bpf_sock_ops_store_hdr_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_sock_ops_reserve_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, + u32, len, u64, flags) +{ + if (bpf_sock->op != BPF_SOCK_OPS_HDR_OPT_LEN_CB) + return -EPERM; + + if (flags || len < 2) + return -EINVAL; + + if (len > bpf_sock->remaining_opt_len) + return -ENOSPC; + + bpf_sock->remaining_opt_len -= len; + + return 0; +} + +static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = { + .func = bpf_sock_ops_reserve_hdr_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -6193,6 +6492,9 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_lwt_seg6_store_bytes || func == bpf_lwt_seg6_adjust_srh || func == bpf_lwt_seg6_action || +#endif +#ifdef CONFIG_INET + func == bpf_sock_ops_store_hdr_opt || #endif func == bpf_lwt_in_push_encap || func == bpf_lwt_xmit_push_encap) @@ -6565,6 +6867,12 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; #ifdef CONFIG_INET + case BPF_FUNC_load_hdr_opt: + return &bpf_sock_ops_load_hdr_opt_proto; + case BPF_FUNC_store_hdr_opt: + return &bpf_sock_ops_store_hdr_opt_proto; + case BPF_FUNC_reserve_hdr_opt: + return &bpf_sock_ops_reserve_hdr_opt_proto; case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; #endif /* CONFIG_INET */ @@ -7364,6 +7672,20 @@ static bool sock_ops_is_valid_access(int off, int size, return false; info->reg_type = PTR_TO_SOCKET_OR_NULL; break; + case offsetof(struct bpf_sock_ops, skb_data): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_PACKET; + break; + case offsetof(struct bpf_sock_ops, skb_data_end): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_PACKET_END; + break; + case offsetof(struct bpf_sock_ops, skb_tcp_flags): + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, + size_default); default: if (size != size_default) return false; @@ -8701,6 +9023,49 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, case offsetof(struct bpf_sock_ops, sk): SOCK_OPS_GET_SK(); break; + case offsetof(struct bpf_sock_ops, skb_data_end): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, + skb_data_end), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + skb_data_end)); + break; + case offsetof(struct bpf_sock_ops, skb_data): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, + skb), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + skb)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), + si->dst_reg, si->dst_reg, + offsetof(struct sk_buff, data)); + break; + case offsetof(struct bpf_sock_ops, skb_len): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, + skb), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + skb)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 
1); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len), + si->dst_reg, si->dst_reg, + offsetof(struct sk_buff, len)); + break; + case offsetof(struct bpf_sock_ops, skb_tcp_flags): + off = offsetof(struct sk_buff, cb); + off += offsetof(struct tcp_skb_cb, tcp_flags); + *target_size = sizeof_field(struct tcp_skb_cb, tcp_flags); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, + skb), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + skb)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_skb_cb, + tcp_flags), + si->dst_reg, si->dst_reg, off); + break; } return insn - insn_buf; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8c9da4b65dae..319cc7fd5117 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -146,6 +146,7 @@ static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG); bool parse_all_opt = BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG); + struct bpf_sock_ops_kern sock_ops; if (likely(!unknown_opt && !parse_all_opt)) return; @@ -161,12 +162,15 @@ static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) return; } - /* BPF prog will have access to the sk and skb. - * - * The bpf running context preparation and the actual bpf prog - * calling will be implemented in a later PATCH together with - * other bpf pieces. - */ + sock_owned_by_me(sk); + + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); + sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB; + sock_ops.is_fullsock = 1; + sock_ops.sk = sk; + bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb)); + + BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); } static void bpf_skops_established(struct sock *sk, int bpf_op, @@ -180,7 +184,9 @@ static void bpf_skops_established(struct sock *sk, int bpf_op, sock_ops.op = bpf_op; sock_ops.is_fullsock = 1; sock_ops.sk = sk; - /* skb will be passed to the bpf prog in a later patch. 
*/ + /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */ + if (skb) + bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb)); BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 495dda2449fe..56c306e3cd2f 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -548,6 +548,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->fastopen_req = NULL; RCU_INIT_POINTER(newtp->fastopen_rsk, NULL); + bpf_skops_init_child(sk, newsk); tcp_bpf_clone(sk, newsk); __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 673db6879e46..ab79d36ed07f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -454,6 +454,18 @@ static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts) } #ifdef CONFIG_CGROUP_BPF +static int bpf_skops_write_hdr_opt_arg0(struct sk_buff *skb, + enum tcp_synack_type synack_type) +{ + if (unlikely(!skb)) + return BPF_WRITE_HDR_TCP_CURRENT_MSS; + + if (unlikely(synack_type == TCP_SYNACK_COOKIE)) + return BPF_WRITE_HDR_TCP_SYNACK_COOKIE; + + return 0; +} + /* req, syn_skb and synack_type are used when writing synack */ static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, struct request_sock *req, @@ -462,15 +474,60 @@ static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, struct tcp_out_options *opts, unsigned int *remaining) { + struct bpf_sock_ops_kern sock_ops; + int err; + if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) || !*remaining) return; - /* The bpf running context preparation and the actual bpf prog - * calling will be implemented in a later PATCH together with - * other bpf pieces. - */ + /* *remaining has already been aligned to 4 bytes, so *remaining >= 4 */ + + /* init sock_ops */ + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); + + sock_ops.op = BPF_SOCK_OPS_HDR_OPT_LEN_CB; + + if (req) { + /* The listen "sk" cannot be passed here because + * it is not locked. It would not make too much + * sense to do bpf_setsockopt(listen_sk) based + * on individual connection request also. + * + * Thus, "req" is passed here and the cgroup-bpf-progs + * of the listen "sk" will be run. + * + * "req" is also used here for fastopen even the "sk" here is + * a fullsock "child" sk. It is to keep the behavior + * consistent between fastopen and non-fastopen on + * the bpf programming side. 
+ */ + sock_ops.sk = (struct sock *)req; + sock_ops.syn_skb = syn_skb; + } else { + sock_owned_by_me(sk); + + sock_ops.is_fullsock = 1; + sock_ops.sk = sk; + } + + sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type); + sock_ops.remaining_opt_len = *remaining; + /* tcp_current_mss() does not pass a skb */ + if (skb) + bpf_skops_init_skb(&sock_ops, skb, 0); + + err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk); + + if (err || sock_ops.remaining_opt_len == *remaining) + return; + + opts->bpf_opt_len = *remaining - sock_ops.remaining_opt_len; + /* round up to 4 bytes */ + opts->bpf_opt_len = (opts->bpf_opt_len + 3) & ~3; + + *remaining -= opts->bpf_opt_len; } static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, @@ -479,13 +536,42 @@ static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, enum tcp_synack_type synack_type, struct tcp_out_options *opts) { - if (likely(!opts->bpf_opt_len)) + u8 first_opt_off, nr_written, max_opt_len = opts->bpf_opt_len; + struct bpf_sock_ops_kern sock_ops; + int err; + + if (likely(!max_opt_len)) return; - /* The bpf running context preparation and the actual bpf prog - * calling will be implemented in a later PATCH together with - * other bpf pieces. - */ + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); + + sock_ops.op = BPF_SOCK_OPS_WRITE_HDR_OPT_CB; + + if (req) { + sock_ops.sk = (struct sock *)req; + sock_ops.syn_skb = syn_skb; + } else { + sock_owned_by_me(sk); + + sock_ops.is_fullsock = 1; + sock_ops.sk = sk; + } + + sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type); + sock_ops.remaining_opt_len = max_opt_len; + first_opt_off = tcp_hdrlen(skb) - max_opt_len; + bpf_skops_init_skb(&sock_ops, skb, first_opt_off); + + err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk); + + if (err) + nr_written = 0; + else + nr_written = max_opt_len - sock_ops.remaining_opt_len; + + if (nr_written < max_opt_len) + memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP, + max_opt_len - nr_written); } #else static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 18d0e128bc3c..f67ec5d9e57d 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3395,6 +3395,120 @@ union bpf_attr { * A non-negative value equal to or less than *size* on success, * or a negative error in case of failure. * + * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags) + * Description + * Load header option. Support reading a particular TCP header + * option for bpf program (BPF_PROG_TYPE_SOCK_OPS). + * + * If *flags* is 0, it will search the option from the + * sock_ops->skb_data. The comment in "struct bpf_sock_ops" + * has details on what skb_data contains under different + * sock_ops->op. + * + * The first byte of the *searchby_res* specifies the + * kind that it wants to search. + * + * If the searching kind is an experimental kind + * (i.e. 253 or 254 according to RFC6994). It also + * needs to specify the "magic" which is either + * 2 bytes or 4 bytes. It then also needs to + * specify the size of the magic by using + * the 2nd byte which is "kind-length" of a TCP + * header option and the "kind-length" also + * includes the first 2 bytes "kind" and "kind-length" + * itself as a normal TCP header option also does. 
+ * + * For example, to search experimental kind 254 with + * 2 byte magic 0xeB9F, the searchby_res should be + * [ 254, 4, 0xeB, 0x9F, 0, 0, .... 0 ]. + * + * To search for the standard window scale option (3), + * the searchby_res should be [ 3, 0, 0, .... 0 ]. + * Note, kind-length must be 0 for regular option. + * + * Searching for No-Op (0) and End-of-Option-List (1) are + * not supported. + * + * *len* must be at least 2 bytes which is the minimal size + * of a header option. + * + * Supported flags: + * * **BPF_LOAD_HDR_OPT_TCP_SYN** to search from the + * saved_syn packet or the just-received syn packet. + * + * Return + * >0 when found, the header option is copied to *searchby_res*. + * The return value is the total length copied. + * + * **-EINVAL** If param is invalid + * + * **-ENOMSG** The option is not found + * + * **-ENOENT** No syn packet available when + * **BPF_LOAD_HDR_OPT_TCP_SYN** is used + * + * **-ENOSPC** Not enough space. Only *len* number of + * bytes are copied. + * + * **-EFAULT** Cannot parse the header options in the packet + * + * **-EPERM** This helper cannot be used under the + * current sock_ops->op. + * + * long bpf_store_hdr_opt(struct bpf_sock_ops *skops, const void *from, u32 len, u64 flags) + * Description + * Store header option. The data will be copied + * from buffer *from* with length *len* to the TCP header. + * + * The buffer *from* should have the whole option that + * includes the kind, kind-length, and the actual + * option data. The *len* must be at least kind-length + * long. The kind-length does not have to be 4 byte + * aligned. The kernel will take care of the padding + * and setting the 4 bytes aligned value to th->doff. + * + * This helper will check for duplicated option + * by searching the same option in the outgoing skb. + * + * This helper can only be called during + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * Return + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** If param is invalid + * + * **-ENOSPC** Not enough space in the header. + * Nothing has been written + * + * **-EEXIST** The option has already existed + * + * **-EFAULT** Cannot parse the existing header options + * + * **-EPERM** This helper cannot be used under the + * current sock_ops->op. + * + * long bpf_reserve_hdr_opt(struct bpf_sock_ops *skops, u32 len, u64 flags) + * Description + * Reserve *len* bytes for the bpf header option. The + * space will be used by bpf_store_hdr_opt() later in + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * If bpf_reserve_hdr_opt() is called multiple times, + * the total number of bytes will be reserved. + * + * This helper can only be called during + * BPF_SOCK_OPS_HDR_OPT_LEN_CB. + * + * Return + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** if param is invalid + * + * **-ENOSPC** Not enough space in the header. + * + * **-EPERM** This helper cannot be used under the + * current sock_ops->op. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3539,6 +3653,9 @@ union bpf_attr { FN(skc_to_tcp_request_sock), \ FN(skc_to_udp6_sock), \ FN(get_task_stack), \ + FN(load_hdr_opt), \ + FN(store_hdr_opt), \ + FN(reserve_hdr_opt), /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -4165,6 +4282,36 @@ struct bpf_sock_ops { __u64 bytes_received; __u64 bytes_acked; __bpf_md_ptr(struct bpf_sock *, sk); + /* [skb_data, skb_data_end) covers the whole TCP header. 
+ * + * BPF_SOCK_OPS_PARSE_HDR_OPT_CB: The packet received + * BPF_SOCK_OPS_HDR_OPT_LEN_CB: Not useful because the + * header has not been written. + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB: The header and options have + * been written so far. + * BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: The SYNACK that concludes + * the 3WHS. + * BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: The ACK that concludes + * the 3WHS. + * + * bpf_load_hdr_opt() can also be used to read a particular option. + */ + __bpf_md_ptr(void *, skb_data); + __bpf_md_ptr(void *, skb_data_end); + __u32 skb_len; /* The total length of a packet. + * It includes the header, options, + * and payload. + */ + __u32 skb_tcp_flags; /* tcp_flags of the header. It provides + * an easy way to check for tcp_flags + * without parsing skb_data. + * + * In particular, the skb_tcp_flags + * will still be available in + * BPF_SOCK_OPS_HDR_OPT_LEN even though + * the outgoing header has not + * been written yet. + */ }; /* Definitions for bpf_sock_ops_cb_flags */ @@ -4173,8 +4320,48 @@ enum { BPF_SOCK_OPS_RETRANS_CB_FLAG = (1<<1), BPF_SOCK_OPS_STATE_CB_FLAG = (1<<2), BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), - BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4), + /* Call bpf for all received TCP headers. The bpf prog will be + * called under sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * + * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * for the header option related helpers that will be useful + * to the bpf programs. + * + * It could be used at the client/active side (i.e. connect() side) + * when the server told it that the server was in syncookie + * mode and required the active side to resend the bpf-written + * options. The active side can keep writing the bpf-options until + * it received a valid packet from the server side to confirm + * the earlier packet (and options) has been received. The later + * example patch is using it like this at the active side when the + * server is in syncookie mode. + * + * The bpf prog will usually turn this off in the common cases. + */ + BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4), + /* Call bpf when kernel has received a header option that + * the kernel cannot handle. The bpf prog will be called under + * sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB. + * + * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * for the header option related helpers that will be useful + * to the bpf programs. + */ BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = (1<<5), + /* Call bpf when the kernel is writing header options for the + * outgoing packet. The bpf prog will first be called + * to reserve space in a skb under + * sock_ops->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB. Then + * the bpf prog will be called to write the header option(s) + * under sock_ops->op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * Please refer to the comment in BPF_SOCK_OPS_HDR_OPT_LEN_CB + * and BPF_SOCK_OPS_WRITE_HDR_OPT_CB for the header option + * related helpers that will be useful to the bpf programs. + * + * The kernel gets its chance to reserve space and write + * options first before the BPF program does. + */ BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG = (1<<6), /* Mask of all currently supported cb flags */ BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F, @@ -4233,6 +4420,63 @@ enum { */ BPF_SOCK_OPS_RTT_CB, /* Called on every RTT. */ + BPF_SOCK_OPS_PARSE_HDR_OPT_CB, /* Parse the header option. + * It will be called to handle + * the packets received at + * an already established + * connection. + * + * sock_ops->skb_data: + * Referring to the received skb. 
+ * It covers the TCP header only. + * + * bpf_load_hdr_opt() can also + * be used to search for a + * particular option. + */ + BPF_SOCK_OPS_HDR_OPT_LEN_CB, /* Reserve space for writing the + * header option later in + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * Arg1: bool want_cookie. (in + * writing SYNACK only) + * + * sock_ops->skb_data: + * Not available because no header has + * been written yet. + * + * sock_ops->skb_tcp_flags: + * The tcp_flags of the + * outgoing skb. (e.g. SYN, ACK, FIN). + * + * bpf_reserve_hdr_opt() should + * be used to reserve space. + */ + BPF_SOCK_OPS_WRITE_HDR_OPT_CB, /* Write the header options. + * Arg1: bool want_cookie. (in + * writing SYNACK only) + * + * sock_ops->skb_data: + * Referring to the outgoing skb. + * It covers the TCP header + * that has already been written + * by the kernel and the + * earlier bpf-progs. + * + * sock_ops->skb_tcp_flags: + * The tcp_flags of the outgoing + * skb. (e.g. SYN, ACK, FIN). + * + * bpf_store_hdr_opt() should + * be used to write the + * option. + * + * bpf_load_hdr_opt() can also + * be used to search for a + * particular option that + * has already been written + * by the kernel or the + * earlier bpf-progs. + */ }; /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect @@ -4262,6 +4506,60 @@ enum { TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ TCP_BPF_DELACK_MAX = 1003, /* Max delay ack in usecs */ TCP_BPF_RTO_MIN = 1004, /* Min RTO in usecs */ + /* Copy the SYN pkt to optval + * + * BPF_PROG_TYPE_SOCK_OPS only. It is similar to the + * bpf_getsockopt(TCP_SAVED_SYN) but it is not limited + * to reading from the saved_syn. It can either get the + * syn packet from: + * + * 1. the just-received SYN packet (only available when writing the + * SYNACK). It will be useful when it is not necessary to + * save the SYN packet for later use. It is also the only way + * to get the SYN during syncookie mode because the syn + * packet cannot be saved in syncookie mode. + * + * OR + * + * 2. the earlier saved syn which was done by + * bpf_setsockopt(TCP_SAVE_SYN). + * + * The bpf_getsockopt(TCP_BPF_SYN*) option will hide where the + * SYN packet is obtained. + * + * If the bpf-prog does not need the IP[46] header, the + * bpf-prog can avoid parsing the IP header by using + * TCP_BPF_SYN. Otherwise, the bpf-prog can get both + * IP[46] and TCP header by using TCP_BPF_SYN_IP. + * + * >0: Total number of bytes copied + * -ENOSPC: Not enough space in optval. Only optlen bytes + * are copied. + * -ENOENT: The SYN skb is not available now and the earlier SYN pkt + * is not saved by setsockopt(TCP_SAVE_SYN). + */ + TCP_BPF_SYN = 1005, /* Copy the TCP header */ + TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */ +}; + +enum { + BPF_LOAD_HDR_OPT_TCP_SYN = (1ULL << 0), +}; + +/* args[0] value during BPF_SOCK_OPS_HDR_OPT_LEN_CB and + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + */ +enum { + BPF_WRITE_HDR_TCP_CURRENT_MSS = 1, /* Kernel is finding the + * total option spaces + * required for an established + * sk in order to calculate the + * MSS. No skb is actually + * sent. + */ + BPF_WRITE_HDR_TCP_SYNACK_COOKIE = 2, /* Kernel is in syncookie mode + * when sending a SYNACK. + */ }; struct bpf_perf_event_value { -- cgit v1.2.3 From 267cf9fa43d1c9d525d5d818a8651f2900e3aa9e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 20 Aug 2020 12:01:23 -0700 Subject: tcp: bpf: Optionally store mac header in TCP_SAVE_SYN This patch is adapted from Eric's patch in an earlier discussion [1].
The TCP_SAVE_SYN currently only stores the network header and tcp header. This patch allows it to optionally store the mac header as well, when the setsockopt's optval is 2. It requires one more bit for the "save_syn" bit field in tcp_sock. This patch achieves this by moving the syn_smc bit next to the is_mptcp. The syn_smc is currently used with the TCP experimental option. Since syn_smc is only used when CONFIG_SMC is enabled, this patch also puts the "IS_ENABLED(CONFIG_SMC)" around it, as is_mptcp does with "IS_ENABLED(CONFIG_MPTCP)". The mac_hdrlen is also stored in the "struct saved_syn" so the bpf prog can quickly compute an offset if it chooses to start reading from the network header or the tcp header. [1]: https://lore.kernel.org/netdev/CANn89iLJNWh6bkH7DNhy_kmcAexuUCccqERqe7z2QsvPhGrYPQ@mail.gmail.com/ Suggested-by: Eric Dumazet Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/bpf/20200820190123.2886935-1-kafai@fb.com --- include/linux/tcp.h | 13 ++++++++----- include/net/request_sock.h | 1 + include/uapi/linux/bpf.h | 1 + net/core/filter.c | 27 ++++++++++++++++++++++----- net/ipv4/tcp.c | 3 ++- net/ipv4/tcp_input.c | 14 +++++++++++++- tools/include/uapi/linux/bpf.h | 1 + 7 files changed, 48 insertions(+), 12 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 29d166263ae7..56ff2952edaf 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -239,14 +239,13 @@ struct tcp_sock { repair : 1, frto : 1;/* F-RTO (RFC5682) activated in CA_Loss */ u8 repair_queue; - u8 syn_data:1, /* SYN includes data */ + u8 save_syn:2, /* Save headers of SYN packet */ + syn_data:1, /* SYN includes data */ syn_fastopen:1, /* SYN includes Fast Open option */ syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */ syn_fastopen_ch:1, /* Active TFO re-enabling probe */ syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ - save_syn:1, /* Save headers of SYN packet */ - is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */ - syn_smc:1; /* SYN includes SMC */ + is_cwnd_limited:1;/* forward progress limited by snd_cwnd?
*/ u32 tlp_high_seq; /* snd_nxt at the time of TLP */ u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ @@ -393,6 +392,9 @@ struct tcp_sock { #if IS_ENABLED(CONFIG_MPTCP) bool is_mptcp; #endif +#if IS_ENABLED(CONFIG_SMC) + bool syn_smc; /* SYN includes SMC */ +#endif #ifdef CONFIG_TCP_MD5SIG /* TCP AF-Specific parts; only used by MD5 Signature support so far */ @@ -488,7 +490,8 @@ static inline void tcp_saved_syn_free(struct tcp_sock *tp) static inline u32 tcp_saved_syn_len(const struct saved_syn *saved_syn) { - return saved_syn->network_hdrlen + saved_syn->tcp_hdrlen; + return saved_syn->mac_hdrlen + saved_syn->network_hdrlen + + saved_syn->tcp_hdrlen; } struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 7d9ed99a77bd..29e41ff3ec93 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -42,6 +42,7 @@ struct request_sock_ops { int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req); struct saved_syn { + u32 mac_hdrlen; u32 network_hdrlen; u32 tcp_hdrlen; u8 data[]; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f67ec5d9e57d..544b89a64918 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4540,6 +4540,7 @@ enum { */ TCP_BPF_SYN = 1005, /* Copy the TCP header */ TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */ + TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */ }; enum { diff --git a/net/core/filter.c b/net/core/filter.c index ab5603d5b62a..47eef9a0be6a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4682,11 +4682,16 @@ static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock, if (optname == TCP_BPF_SYN) { hdr_start = syn_skb->data; ret = tcp_hdrlen(syn_skb); - } else { - /* optname == TCP_BPF_SYN_IP */ + } else if (optname == TCP_BPF_SYN_IP) { hdr_start = skb_network_header(syn_skb); ret = skb_network_header_len(syn_skb) + tcp_hdrlen(syn_skb); + } else { + /* optname == TCP_BPF_SYN_MAC */ + hdr_start = skb_mac_header(syn_skb); + ret = skb_mac_header_len(syn_skb) + + skb_network_header_len(syn_skb) + + tcp_hdrlen(syn_skb); } } else { struct sock *sk = bpf_sock->sk; @@ -4706,12 +4711,24 @@ static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock, if (optname == TCP_BPF_SYN) { hdr_start = saved_syn->data + + saved_syn->mac_hdrlen + saved_syn->network_hdrlen; ret = saved_syn->tcp_hdrlen; + } else if (optname == TCP_BPF_SYN_IP) { + hdr_start = saved_syn->data + + saved_syn->mac_hdrlen; + ret = saved_syn->network_hdrlen + + saved_syn->tcp_hdrlen; } else { - /* optname == TCP_BPF_SYN_IP */ + /* optname == TCP_BPF_SYN_MAC */ + + /* TCP_SAVE_SYN may not have saved the mac hdr */ + if (!saved_syn->mac_hdrlen) + return -ENOENT; + hdr_start = saved_syn->data; - ret = saved_syn->network_hdrlen + + ret = saved_syn->mac_hdrlen + + saved_syn->network_hdrlen + saved_syn->tcp_hdrlen; } } @@ -4724,7 +4741,7 @@ BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP && - optname >= TCP_BPF_SYN && optname <= TCP_BPF_SYN_IP) { + optname >= TCP_BPF_SYN && optname <= TCP_BPF_SYN_MAC) { int ret, copy_len = 0; const u8 *start; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 6075cb091a20..57a568875539 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3211,7 +3211,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, break; case 
TCP_SAVE_SYN: - if (val < 0 || val > 1) + /* 0: disable, 1: enable, 2: start from ether_header */ + if (val < 0 || val > 2) err = -EINVAL; else tp->save_syn = val; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 319cc7fd5117..4337841faeff 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6676,13 +6676,25 @@ static void tcp_reqsk_record_syn(const struct sock *sk, if (tcp_sk(sk)->save_syn) { u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb); struct saved_syn *saved_syn; + u32 mac_hdrlen; + void *base; + + if (tcp_sk(sk)->save_syn == 2) { /* Save full header. */ + base = skb_mac_header(skb); + mac_hdrlen = skb_mac_header_len(skb); + len += mac_hdrlen; + } else { + base = skb_network_header(skb); + mac_hdrlen = 0; + } saved_syn = kmalloc(struct_size(saved_syn, data, len), GFP_ATOMIC); if (saved_syn) { + saved_syn->mac_hdrlen = mac_hdrlen; saved_syn->network_hdrlen = skb_network_header_len(skb); saved_syn->tcp_hdrlen = tcp_hdrlen(skb); - memcpy(saved_syn->data, skb_network_header(skb), len); + memcpy(saved_syn->data, base, len); req->saved_syn = saved_syn; } } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f67ec5d9e57d..544b89a64918 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4540,6 +4540,7 @@ enum { */ TCP_BPF_SYN = 1005, /* Copy the TCP header */ TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */ + TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */ }; enum { -- cgit v1.2.3 From f836a56e84ffc9f1a1cd73f77e10404ca46a4616 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 25 Aug 2020 20:29:15 +0200 Subject: bpf: Generalize bpf_sk_storage Refactor the functionality in bpf_sk_storage.c so that the concept of storage linked to kernel objects can be extended to other objects like inode, task_struct, etc. Each new local storage will still be a separate map and provide its own set of helpers. This allows for future object-specific extensions while still sharing a lot of the underlying implementation.
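As a rough illustration of the resulting contract, a new object-local storage mainly has to supply the owner-specific callbacks and can reuse the shared infrastructure. The sketch below is hypothetical (a "task" storage is used purely as an example; the bpf_storage field and the ops table shown are illustrative, not part of this patch):

static struct bpf_local_storage __rcu **task_storage_ptr(void *owner)
{
	struct task_struct *task = owner;

	/* Hypothetical: assumes a bpf_storage field in task_struct. */
	return &task->bpf_storage;
}

const struct bpf_map_ops task_storage_map_ops = {
	/* The shared allocation and BTF checks can be reused as-is. */
	.map_alloc_check = bpf_local_storage_map_alloc_check,
	.map_check_btf = bpf_local_storage_map_check_btf,
	/* map_alloc/map_free and the fd-based element ops are elided.
	 * The charge/uncharge ops are optional: when they are NULL,
	 * mem_charge() simply returns 0, i.e. no memory accounting.
	 */
	.map_owner_storage_ptr = task_storage_ptr,
};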
This includes the changes suggested by Martin in: https://lore.kernel.org/bpf/20200725013047.4006241-1-kafai@fb.com/ adding new map operations to support bpf_local_storage maps: * storages for different kernel objects to optionally have different memory charging strategy (map_local_storage_charge, map_local_storage_uncharge) * Functionality to extract the storage pointer from a pointer to the owning object (map_owner_storage_ptr) Co-developed-by: Martin KaFai Lau Signed-off-by: Martin KaFai Lau Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200825182919.1118197-4-kpsingh@chromium.org --- include/linux/bpf.h | 8 ++ include/net/bpf_sk_storage.h | 52 +++++++++ include/uapi/linux/bpf.h | 8 +- net/core/bpf_sk_storage.c | 238 +++++++++++++++++++++++++++-------------- tools/include/uapi/linux/bpf.h | 8 +- 5 files changed, 228 insertions(+), 86 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 81f38e2fda78..8c443b93ac11 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -34,6 +34,8 @@ struct btf_type; struct exception_table_entry; struct seq_operations; struct bpf_iter_aux_info; +struct bpf_local_storage; +struct bpf_local_storage_map; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -104,6 +106,12 @@ struct bpf_map_ops { __poll_t (*map_poll)(struct bpf_map *map, struct file *filp, struct poll_table_struct *pts); + /* Functions called by bpf_local_storage maps */ + int (*map_local_storage_charge)(struct bpf_local_storage_map *smap, + void *owner, u32 size); + void (*map_local_storage_uncharge)(struct bpf_local_storage_map *smap, + void *owner, u32 size); + struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner); /* BTF name and id of struct allocated by map_alloc */ const char * const map_btf_name; int *map_btf_id; diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h index 950c5aaba15e..9e631b5466e3 100644 --- a/include/net/bpf_sk_storage.h +++ b/include/net/bpf_sk_storage.h @@ -3,8 +3,15 @@ #ifndef _BPF_SK_STORAGE_H #define _BPF_SK_STORAGE_H +#include +#include +#include #include #include +#include +#include +#include +#include struct sock; @@ -13,6 +20,7 @@ void bpf_sk_storage_free(struct sock *sk); extern const struct bpf_func_proto bpf_sk_storage_get_proto; extern const struct bpf_func_proto bpf_sk_storage_delete_proto; +struct bpf_local_storage_elem; struct bpf_sk_storage_diag; struct sk_buff; struct nlattr; @@ -34,6 +42,50 @@ u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache); void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, u16 idx); +/* Helper functions for bpf_local_storage */ +int bpf_local_storage_map_alloc_check(union bpf_attr *attr); + +struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr); + +struct bpf_local_storage_data * +bpf_local_storage_lookup(struct bpf_local_storage *local_storage, + struct bpf_local_storage_map *smap, + bool cacheit_lockit); + +void bpf_local_storage_map_free(struct bpf_local_storage_map *smap); + +int bpf_local_storage_map_check_btf(const struct bpf_map *map, + const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type); + +void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem); + +bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem, + bool uncharge_omem); + 
+void bpf_selem_unlink(struct bpf_local_storage_elem *selem); + +void bpf_selem_link_map(struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *selem); + +void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem); + +struct bpf_local_storage_elem * +bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, + bool charge_mem); + +int +bpf_local_storage_alloc(void *owner, + struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *first_selem); + +struct bpf_local_storage_data * +bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, + void *value, u64 map_flags); + #ifdef CONFIG_BPF_SYSCALL int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk); struct bpf_sk_storage_diag * diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 544b89a64918..2cbd137eed86 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3765,9 +3765,13 @@ enum { BPF_F_SYSCTL_BASE_NAME = (1ULL << 0), }; -/* BPF_FUNC_sk_storage_get flags */ +/* BPF_FUNC__storage_get flags */ enum { - BPF_SK_STORAGE_GET_F_CREATE = (1ULL << 0), + BPF_LOCAL_STORAGE_GET_F_CREATE = (1ULL << 0), + /* BPF_SK_STORAGE_GET_F_CREATE is only kept for backward compatibility + * and BPF_LOCAL_STORAGE_GET_F_CREATE must be used instead. + */ + BPF_SK_STORAGE_GET_F_CREATE = BPF_LOCAL_STORAGE_GET_F_CREATE, }; /* BPF_FUNC_read_branch_records flags. */ diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index ec61ee7c7ee4..cd8b7017913b 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -84,7 +84,7 @@ struct bpf_local_storage_elem { struct bpf_local_storage { struct bpf_local_storage_data __rcu *cache[BPF_LOCAL_STORAGE_CACHE_SIZE]; struct hlist_head list; /* List of bpf_local_storage_elem */ - struct sock *owner; /* The object that owns the above "list" of + void *owner; /* The object that owns the above "list" of * bpf_local_storage_elem. 
*/ struct rcu_head rcu; @@ -110,6 +110,33 @@ static int omem_charge(struct sock *sk, unsigned int size) return -ENOMEM; } +static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size) +{ + struct bpf_map *map = &smap->map; + + if (!map->ops->map_local_storage_charge) + return 0; + + return map->ops->map_local_storage_charge(smap, owner, size); +} + +static void mem_uncharge(struct bpf_local_storage_map *smap, void *owner, + u32 size) +{ + struct bpf_map *map = &smap->map; + + if (map->ops->map_local_storage_uncharge) + map->ops->map_local_storage_uncharge(smap, owner, size); +} + +static struct bpf_local_storage __rcu ** +owner_storage(struct bpf_local_storage_map *smap, void *owner) +{ + struct bpf_map *map = &smap->map; + + return map->ops->map_owner_storage_ptr(owner); +} + static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem) { return !hlist_unhashed(&selem->snode); @@ -120,13 +147,13 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem) return !hlist_unhashed(&selem->map_node); } -static struct bpf_local_storage_elem * -bpf_selem_alloc(struct bpf_local_storage_map *smap, struct sock *sk, - void *value, bool charge_omem) +struct bpf_local_storage_elem * +bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, + void *value, bool charge_mem) { struct bpf_local_storage_elem *selem; - if (charge_omem && omem_charge(sk, smap->elem_size)) + if (charge_mem && mem_charge(smap, owner, smap->elem_size)) return NULL; selem = kzalloc(smap->elem_size, GFP_ATOMIC | __GFP_NOWARN); @@ -136,8 +163,8 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, struct sock *sk, return selem; } - if (charge_omem) - atomic_sub(smap->elem_size, &sk->sk_omem_alloc); + if (charge_mem) + mem_uncharge(smap, owner, smap->elem_size); return NULL; } @@ -146,32 +173,32 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, struct sock *sk, * The caller must ensure selem->smap is still valid to be * dereferenced for its smap->elem_size and smap->cache_idx. */ -static bool -bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, - struct bpf_local_storage_elem *selem, - bool uncharge_omem) +bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem, + bool uncharge_mem) { struct bpf_local_storage_map *smap; bool free_local_storage; - struct sock *sk; + void *owner; smap = rcu_dereference(SDATA(selem)->smap); - sk = local_storage->owner; + owner = local_storage->owner; /* All uncharging on the owner must be done first. * The owner may be freed once the last selem is unlinked * from local_storage. */ - if (uncharge_omem) - atomic_sub(smap->elem_size, &sk->sk_omem_alloc); + if (uncharge_mem) + mem_uncharge(smap, owner, smap->elem_size); free_local_storage = hlist_is_singular_node(&selem->snode, &local_storage->list); if (free_local_storage) { - atomic_sub(sizeof(struct bpf_local_storage), &sk->sk_omem_alloc); + mem_uncharge(smap, owner, sizeof(struct bpf_local_storage)); local_storage->owner = NULL; - /* After this RCU_INIT, sk may be freed and cannot be used */ - RCU_INIT_POINTER(sk->sk_bpf_storage, NULL); + + /* After this RCU_INIT, owner may be freed and cannot be used */ + RCU_INIT_POINTER(*owner_storage(smap, owner), NULL); /* local_storage is not freed now. 
local_storage->lock is * still held and raw_spin_unlock_bh(&local_storage->lock) @@ -209,23 +236,22 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem) local_storage = rcu_dereference(selem->local_storage); raw_spin_lock_bh(&local_storage->lock); if (likely(selem_linked_to_storage(selem))) - free_local_storage = - bpf_selem_unlink_storage_nolock(local_storage, selem, true); + free_local_storage = bpf_selem_unlink_storage_nolock( + local_storage, selem, true); raw_spin_unlock_bh(&local_storage->lock); if (free_local_storage) kfree_rcu(local_storage, rcu); } -static void -bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, - struct bpf_local_storage_elem *selem) +void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem) { RCU_INIT_POINTER(selem->local_storage, local_storage); hlist_add_head(&selem->snode, &local_storage->list); } -static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) +void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map *smap; struct bpf_local_storage_map_bucket *b; @@ -242,8 +268,8 @@ static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) raw_spin_unlock_bh(&b->lock); } -static void bpf_selem_link_map(struct bpf_local_storage_map *smap, - struct bpf_local_storage_elem *selem) +void bpf_selem_link_map(struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem); @@ -253,7 +279,7 @@ static void bpf_selem_link_map(struct bpf_local_storage_map *smap, raw_spin_unlock_bh(&b->lock); } -static void bpf_selem_unlink(struct bpf_local_storage_elem *selem) +void bpf_selem_unlink(struct bpf_local_storage_elem *selem) { /* Always unlink from map before unlinking from local_storage * because selem will be freed after successfully unlinked from @@ -263,7 +289,7 @@ static void bpf_selem_unlink(struct bpf_local_storage_elem *selem) __bpf_selem_unlink_storage(selem); } -static struct bpf_local_storage_data * +struct bpf_local_storage_data * bpf_local_storage_lookup(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, bool cacheit_lockit) @@ -329,40 +355,45 @@ static int check_flags(const struct bpf_local_storage_data *old_sdata, return 0; } -static int sk_storage_alloc(struct sock *sk, +int bpf_local_storage_alloc(void *owner, struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *first_selem) { - struct bpf_local_storage *prev_sk_storage, *sk_storage; + struct bpf_local_storage *prev_storage, *storage; + struct bpf_local_storage **owner_storage_ptr; int err; - err = omem_charge(sk, sizeof(*sk_storage)); + err = mem_charge(smap, owner, sizeof(*storage)); if (err) return err; - sk_storage = kzalloc(sizeof(*sk_storage), GFP_ATOMIC | __GFP_NOWARN); - if (!sk_storage) { + storage = kzalloc(sizeof(*storage), GFP_ATOMIC | __GFP_NOWARN); + if (!storage) { err = -ENOMEM; goto uncharge; } - INIT_HLIST_HEAD(&sk_storage->list); - raw_spin_lock_init(&sk_storage->lock); - sk_storage->owner = sk; - bpf_selem_link_storage_nolock(sk_storage, first_selem); + INIT_HLIST_HEAD(&storage->list); + raw_spin_lock_init(&storage->lock); + storage->owner = owner; + + bpf_selem_link_storage_nolock(storage, first_selem); bpf_selem_link_map(smap, first_selem); - /* Publish sk_storage to sk. sk->sk_lock cannot be acquired. 
- * Hence, atomic ops is used to set sk->sk_bpf_storage - * from NULL to the newly allocated sk_storage ptr. + + owner_storage_ptr = + (struct bpf_local_storage **)owner_storage(smap, owner); + /* Publish storage to the owner. + * Instead of using any lock of the kernel object (i.e. owner), + * cmpxchg will work with any kernel object regardless what + * the running context is, bh, irq...etc. * - * From now on, the sk->sk_bpf_storage pointer is protected - * by the sk_storage->lock. Hence, when freeing - * the sk->sk_bpf_storage, the sk_storage->lock must - * be held before setting sk->sk_bpf_storage to NULL. + * From now on, the owner->storage pointer (e.g. sk->sk_bpf_storage) + * is protected by the storage->lock. Hence, when freeing + * the owner->storage, the storage->lock must be held before + * setting owner->storage ptr to NULL. */ - prev_sk_storage = cmpxchg((struct bpf_local_storage **)&sk->sk_bpf_storage, - NULL, sk_storage); - if (unlikely(prev_sk_storage)) { + prev_storage = cmpxchg(owner_storage_ptr, NULL, storage); + if (unlikely(prev_storage)) { bpf_selem_unlink_map(first_selem); err = -EAGAIN; goto uncharge; @@ -380,8 +411,8 @@ static int sk_storage_alloc(struct sock *sk, return 0; uncharge: - kfree(sk_storage); - atomic_sub(sizeof(*sk_storage), &sk->sk_omem_alloc); + kfree(storage); + mem_uncharge(smap, owner, sizeof(*storage)); return err; } @@ -390,38 +421,37 @@ uncharge: * Otherwise, it will become a leak (and other memory issues * during map destruction). */ -static struct bpf_local_storage_data * -bpf_local_storage_update(struct sock *sk, struct bpf_map *map, void *value, - u64 map_flags) +struct bpf_local_storage_data * +bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, + void *value, u64 map_flags) { struct bpf_local_storage_data *old_sdata = NULL; struct bpf_local_storage_elem *selem; struct bpf_local_storage *local_storage; - struct bpf_local_storage_map *smap; int err; /* BPF_EXIST and BPF_NOEXIST cannot be both set */ if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) || /* BPF_F_LOCK can only be used in a value with spin_lock */ - unlikely((map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map))) + unlikely((map_flags & BPF_F_LOCK) && + !map_value_has_spin_lock(&smap->map))) return ERR_PTR(-EINVAL); - smap = (struct bpf_local_storage_map *)map; - local_storage = rcu_dereference(sk->sk_bpf_storage); + local_storage = rcu_dereference(*owner_storage(smap, owner)); if (!local_storage || hlist_empty(&local_storage->list)) { /* Very first elem for the owner */ err = check_flags(NULL, map_flags); if (err) return ERR_PTR(err); - selem = bpf_selem_alloc(smap, sk, value, true); + selem = bpf_selem_alloc(smap, owner, value, true); if (!selem) return ERR_PTR(-ENOMEM); - err = sk_storage_alloc(sk, smap, selem); + err = bpf_local_storage_alloc(owner, smap, selem); if (err) { kfree(selem); - atomic_sub(smap->elem_size, &sk->sk_omem_alloc); + mem_uncharge(smap, owner, smap->elem_size); return ERR_PTR(err); } @@ -439,7 +469,7 @@ bpf_local_storage_update(struct sock *sk, struct bpf_map *map, void *value, if (err) return ERR_PTR(err); if (old_sdata && selem_linked_to_storage(SELEM(old_sdata))) { - copy_map_value_locked(map, old_sdata->data, + copy_map_value_locked(&smap->map, old_sdata->data, value, false); return old_sdata; } @@ -464,7 +494,8 @@ bpf_local_storage_update(struct sock *sk, struct bpf_map *map, void *value, goto unlock_err; if (old_sdata && (map_flags & BPF_F_LOCK)) { - copy_map_value_locked(map, old_sdata->data, value, false); + 
copy_map_value_locked(&smap->map, old_sdata->data, value, + false); selem = SELEM(old_sdata); goto unlock; } @@ -478,7 +509,7 @@ bpf_local_storage_update(struct sock *sk, struct bpf_map *map, void *value, * old_sdata will not be uncharged later during * bpf_selem_unlink_storage_nolock(). */ - selem = bpf_selem_alloc(smap, sk, value, !old_sdata); + selem = bpf_selem_alloc(smap, owner, value, !old_sdata); if (!selem) { err = -ENOMEM; goto unlock_err; @@ -591,17 +622,12 @@ void bpf_sk_storage_free(struct sock *sk) kfree_rcu(sk_storage, rcu); } -static void bpf_local_storage_map_free(struct bpf_map *map) +void bpf_local_storage_map_free(struct bpf_local_storage_map *smap) { struct bpf_local_storage_elem *selem; - struct bpf_local_storage_map *smap; struct bpf_local_storage_map_bucket *b; unsigned int i; - smap = (struct bpf_local_storage_map *)map; - - bpf_local_storage_cache_idx_free(&sk_cache, smap->cache_idx); - /* Note that this map might be concurrently cloned from * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone * RCU read section to finish before proceeding. New RCU @@ -646,7 +672,16 @@ static void bpf_local_storage_map_free(struct bpf_map *map) synchronize_rcu(); kvfree(smap->buckets); - kfree(map); + kfree(smap); +} + +static void sk_storage_map_free(struct bpf_map *map) +{ + struct bpf_local_storage_map *smap; + + smap = (struct bpf_local_storage_map *)map; + bpf_local_storage_cache_idx_free(&sk_cache, smap->cache_idx); + bpf_local_storage_map_free(smap); } /* U16_MAX is much more than enough for sk local storage @@ -658,7 +693,7 @@ static void bpf_local_storage_map_free(struct bpf_map *map) sizeof(struct bpf_local_storage_elem)), \ (U16_MAX - sizeof(struct bpf_local_storage_elem))) -static int bpf_local_storage_map_alloc_check(union bpf_attr *attr) +int bpf_local_storage_map_alloc_check(union bpf_attr *attr) { if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK || !(attr->map_flags & BPF_F_NO_PREALLOC) || @@ -677,7 +712,7 @@ static int bpf_local_storage_map_alloc_check(union bpf_attr *attr) return 0; } -static struct bpf_map *bpf_local_storage_map_alloc(union bpf_attr *attr) +struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr) { struct bpf_local_storage_map *smap; unsigned int i; @@ -717,8 +752,19 @@ static struct bpf_map *bpf_local_storage_map_alloc(union bpf_attr *attr) smap->elem_size = sizeof(struct bpf_local_storage_elem) + attr->value_size; - smap->cache_idx = bpf_local_storage_cache_idx_get(&sk_cache); + return smap; +} + +static struct bpf_map *sk_storage_map_alloc(union bpf_attr *attr) +{ + struct bpf_local_storage_map *smap; + + smap = bpf_local_storage_map_alloc(attr); + if (IS_ERR(smap)) + return ERR_CAST(smap); + + smap->cache_idx = bpf_local_storage_cache_idx_get(&sk_cache); return &smap->map; } @@ -728,10 +774,10 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, return -ENOTSUPP; } -static int bpf_local_storage_map_check_btf(const struct bpf_map *map, - const struct btf *btf, - const struct btf_type *key_type, - const struct btf_type *value_type) +int bpf_local_storage_map_check_btf(const struct bpf_map *map, + const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type) { u32 int_data; @@ -772,8 +818,9 @@ static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key, fd = *(int *)key; sock = sockfd_lookup(fd, &err); if (sock) { - sdata = bpf_local_storage_update(sock->sk, map, value, - map_flags); + sdata = bpf_local_storage_update( + sock->sk, (struct 
bpf_local_storage_map *)map, value, + map_flags); sockfd_put(sock); return PTR_ERR_OR_ZERO(sdata); } @@ -862,7 +909,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) bpf_selem_link_map(smap, copy_selem); bpf_selem_link_storage_nolock(new_sk_storage, copy_selem); } else { - ret = sk_storage_alloc(newsk, smap, copy_selem); + ret = bpf_local_storage_alloc(newsk, smap, copy_selem); if (ret) { kfree(copy_selem); atomic_sub(smap->elem_size, @@ -906,7 +953,9 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, * destruction). */ refcount_inc_not_zero(&sk->sk_refcnt)) { - sdata = bpf_local_storage_update(sk, map, value, BPF_NOEXIST); + sdata = bpf_local_storage_update( + sk, (struct bpf_local_storage_map *)map, value, + BPF_NOEXIST); /* sk must be a fullsock (guaranteed by verifier), * so sock_gen_put() is unnecessary. */ @@ -931,11 +980,33 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) return -ENOENT; } +static int sk_storage_charge(struct bpf_local_storage_map *smap, + void *owner, u32 size) +{ + return omem_charge(owner, size); +} + +static void sk_storage_uncharge(struct bpf_local_storage_map *smap, + void *owner, u32 size) +{ + struct sock *sk = owner; + + atomic_sub(size, &sk->sk_omem_alloc); +} + +static struct bpf_local_storage __rcu ** +sk_storage_ptr(void *owner) +{ + struct sock *sk = owner; + + return &sk->sk_bpf_storage; +} + static int sk_storage_map_btf_id; const struct bpf_map_ops sk_storage_map_ops = { .map_alloc_check = bpf_local_storage_map_alloc_check, - .map_alloc = bpf_local_storage_map_alloc, - .map_free = bpf_local_storage_map_free, + .map_alloc = sk_storage_map_alloc, + .map_free = sk_storage_map_free, .map_get_next_key = notsupp_get_next_key, .map_lookup_elem = bpf_fd_sk_storage_lookup_elem, .map_update_elem = bpf_fd_sk_storage_update_elem, @@ -943,6 +1014,9 @@ const struct bpf_map_ops sk_storage_map_ops = { .map_check_btf = bpf_local_storage_map_check_btf, .map_btf_name = "bpf_local_storage_map", .map_btf_id = &sk_storage_map_btf_id, + .map_local_storage_charge = sk_storage_charge, + .map_local_storage_uncharge = sk_storage_uncharge, + .map_owner_storage_ptr = sk_storage_ptr, }; const struct bpf_func_proto bpf_sk_storage_get_proto = { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 544b89a64918..2cbd137eed86 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3765,9 +3765,13 @@ enum { BPF_F_SYSCTL_BASE_NAME = (1ULL << 0), }; -/* BPF_FUNC_sk_storage_get flags */ +/* BPF_FUNC__storage_get flags */ enum { - BPF_SK_STORAGE_GET_F_CREATE = (1ULL << 0), + BPF_LOCAL_STORAGE_GET_F_CREATE = (1ULL << 0), + /* BPF_SK_STORAGE_GET_F_CREATE is only kept for backward compatibility + * and BPF_LOCAL_STORAGE_GET_F_CREATE must be used instead. + */ + BPF_SK_STORAGE_GET_F_CREATE = BPF_LOCAL_STORAGE_GET_F_CREATE, }; /* BPF_FUNC_read_branch_records flags. */ -- cgit v1.2.3 From 8ea636848aca35b9f97c5b5dee30225cf2dd0fe6 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 25 Aug 2020 20:29:17 +0200 Subject: bpf: Implement bpf_local_storage for inodes Similar to bpf_local_storage for sockets, add local storage for inodes. The life-cycle of the storage is managed with the life-cycle of the inode, i.e. the storage is destroyed along with the owning inode. The BPF LSM allocates an __rcu pointer to the bpf_local_storage in the security blob, which is now stackable and can co-exist with other LSMs.
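From BPF C, such a map could be declared with the usual libbpf conventions. This is an illustrative sketch, not part of the patch; the map name is hypothetical, and the int key plus BPF_F_NO_PREALLOC reflect the constraints enforced by bpf_local_storage_map_alloc_check():

struct {
	__uint(type, BPF_MAP_TYPE_INODE_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, __u64);
} inode_storage_map SEC(".maps");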
Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200825182919.1118197-6-kpsingh@chromium.org --- include/linux/bpf_lsm.h | 29 +++ include/linux/bpf_types.h | 3 + include/uapi/linux/bpf.h | 40 +++- kernel/bpf/Makefile | 1 + kernel/bpf/bpf_inode_storage.c | 273 ++++++++++++++++++++++++ kernel/bpf/syscall.c | 3 +- kernel/bpf/verifier.c | 10 + security/bpf/hooks.c | 6 + tools/bpf/bpftool/Documentation/bpftool-map.rst | 2 +- tools/bpf/bpftool/bash-completion/bpftool | 3 +- tools/bpf/bpftool/map.c | 3 +- tools/include/uapi/linux/bpf.h | 40 +++- tools/lib/bpf/libbpf_probes.c | 5 +- 13 files changed, 410 insertions(+), 8 deletions(-) create mode 100644 kernel/bpf/bpf_inode_storage.c (limited to 'include/uapi/linux') diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index af74712af585..aaacb6aafc87 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -17,9 +17,28 @@ #include #undef LSM_HOOK +struct bpf_storage_blob { + struct bpf_local_storage __rcu *storage; +}; + +extern struct lsm_blob_sizes bpf_lsm_blob_sizes; + int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog); +static inline struct bpf_storage_blob *bpf_inode( + const struct inode *inode) +{ + if (unlikely(!inode->i_security)) + return NULL; + + return inode->i_security + bpf_lsm_blob_sizes.lbs_inode; +} + +extern const struct bpf_func_proto bpf_inode_storage_get_proto; +extern const struct bpf_func_proto bpf_inode_storage_delete_proto; +void bpf_inode_storage_free(struct inode *inode); + #else /* !CONFIG_BPF_LSM */ static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, @@ -28,6 +47,16 @@ static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, return -EOPNOTSUPP; } +static inline struct bpf_storage_blob *bpf_inode( + const struct inode *inode) +{ + return NULL; +} + +static inline void bpf_inode_storage_free(struct inode *inode) +{ +} + #endif /* CONFIG_BPF_LSM */ #endif /* _LINUX_BPF_LSM_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index a52a5688418e..2e6f568377f1 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -107,6 +107,9 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) #endif +#ifdef CONFIG_BPF_LSM +BPF_MAP_TYPE(BPF_MAP_TYPE_INODE_STORAGE, inode_storage_map_ops) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #if defined(CONFIG_XDP_SOCKETS) BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2cbd137eed86..b6bfcd085a76 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -155,6 +155,7 @@ enum bpf_map_type { BPF_MAP_TYPE_DEVMAP_HASH, BPF_MAP_TYPE_STRUCT_OPS, BPF_MAP_TYPE_RINGBUF, + BPF_MAP_TYPE_INODE_STORAGE, }; /* Note that tracing related programs such as @@ -3509,6 +3510,41 @@ union bpf_attr { * * **-EPERM** This helper cannot be used under the * current sock_ops->op. + * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags) + * Description + * Get a bpf_local_storage from an *inode*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *inode* as the **key**. 
From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *inode*) except this + * helper enforces the key must be an inode and the map must also + * be a **BPF_MAP_TYPE_INODE_STORAGE**. + * + * Underneath, the value is stored locally at *inode* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *inode*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * int bpf_inode_storage_delete(struct bpf_map *map, void *inode) + * Description + * Delete a bpf_local_storage from an *inode*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3655,7 +3691,9 @@ union bpf_attr { FN(get_task_stack), \ FN(load_hdr_opt), \ FN(store_hdr_opt), \ - FN(reserve_hdr_opt), + FN(reserve_hdr_opt), \ + FN(inode_storage_get), \ + FN(inode_storage_delete), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 6961ff400cba..bdc8cd1b6767 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -5,6 +5,7 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o +obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c new file mode 100644 index 000000000000..f3a44e929447 --- /dev/null +++ b/kernel/bpf/bpf_inode_storage.c @@ -0,0 +1,273 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2019 Facebook + * Copyright 2020 Google LLC. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +DEFINE_BPF_STORAGE_CACHE(inode_cache); + +static struct bpf_local_storage __rcu ** +inode_storage_ptr(void *owner) +{ + struct inode *inode = owner; + struct bpf_storage_blob *bsb; + + bsb = bpf_inode(inode); + if (!bsb) + return NULL; + return &bsb->storage; +} + +static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode, + struct bpf_map *map, + bool cacheit_lockit) +{ + struct bpf_local_storage *inode_storage; + struct bpf_local_storage_map *smap; + struct bpf_storage_blob *bsb; + + bsb = bpf_inode(inode); + if (!bsb) + return NULL; + + inode_storage = rcu_dereference(bsb->storage); + if (!inode_storage) + return NULL; + + smap = (struct bpf_local_storage_map *)map; + return bpf_local_storage_lookup(inode_storage, smap, cacheit_lockit); +} + +void bpf_inode_storage_free(struct inode *inode) +{ + struct bpf_local_storage_elem *selem; + struct bpf_local_storage *local_storage; + bool free_inode_storage = false; + struct bpf_storage_blob *bsb; + struct hlist_node *n; + + bsb = bpf_inode(inode); + if (!bsb) + return; + + rcu_read_lock(); + + local_storage = rcu_dereference(bsb->storage); + if (!local_storage) { + rcu_read_unlock(); + return; + } + + /* Neither the bpf_prog nor the bpf-map's syscall + * could be modifying the local_storage->list now. + * Thus, no elem can be added-to or deleted-from the + * local_storage->list by the bpf_prog or by the bpf-map's syscall. + * + * It is racing with bpf_local_storage_map_free() alone + * when unlinking elem from the local_storage->list and + * the map's bucket->list. + */ + raw_spin_lock_bh(&local_storage->lock); + hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { + /* Always unlink from map before unlinking from + * local_storage. + */ + bpf_selem_unlink_map(selem); + free_inode_storage = bpf_selem_unlink_storage_nolock( + local_storage, selem, false); + } + raw_spin_unlock_bh(&local_storage->lock); + rcu_read_unlock(); + + /* free_inode_storage should always be true as long as + * local_storage->list was non-empty. + */ + if (free_inode_storage) + kfree_rcu(local_storage, rcu); +} + +static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_local_storage_data *sdata; + struct file *f; + int fd; + + fd = *(int *)key; + f = fget_raw(fd); + if (!f) + return NULL; + + sdata = inode_storage_lookup(f->f_inode, map, true); + fput(f); + return sdata ? sdata->data : NULL; +} + +static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct bpf_local_storage_data *sdata; + struct file *f; + int fd; + + fd = *(int *)key; + f = fget_raw(fd); + if (!f || !inode_storage_ptr(f->f_inode)) + return -EBADF; + + sdata = bpf_local_storage_update(f->f_inode, + (struct bpf_local_storage_map *)map, + value, map_flags); + fput(f); + return PTR_ERR_OR_ZERO(sdata); +} + +static int inode_storage_delete(struct inode *inode, struct bpf_map *map) +{ + struct bpf_local_storage_data *sdata; + + sdata = inode_storage_lookup(inode, map, false); + if (!sdata) + return -ENOENT; + + bpf_selem_unlink(SELEM(sdata)); + + return 0; +} + +static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key) +{ + struct file *f; + int fd, err; + + fd = *(int *)key; + f = fget_raw(fd); + if (!f) + return -EBADF; + + err = inode_storage_delete(f->f_inode, map); + fput(f); + return err; +} + +BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, + void *, value, u64, flags) +{ + struct bpf_local_storage_data *sdata; + + if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) + return (unsigned long)NULL; + + /* explicitly check that the inode_storage_ptr is not + * NULL as inode_storage_lookup returns NULL in this case and + * bpf_local_storage_update expects the owner to have a + * valid storage pointer. + */ + if (!inode_storage_ptr(inode)) + return (unsigned long)NULL; + + sdata = inode_storage_lookup(inode, map, true); + if (sdata) + return (unsigned long)sdata->data; + + /* This helper must only be called from where the inode is guaranteed + * to have a refcount and cannot be freed. + */ + if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { + sdata = bpf_local_storage_update( + inode, (struct bpf_local_storage_map *)map, value, + BPF_NOEXIST); + return IS_ERR(sdata) ? (unsigned long)NULL : + (unsigned long)sdata->data; + } + + return (unsigned long)NULL; +} + +BPF_CALL_2(bpf_inode_storage_delete, + struct bpf_map *, map, struct inode *, inode) +{ + /* This helper must only be called from where the inode is guaranteed + * to have a refcount and cannot be freed.
+ */ + return inode_storage_delete(inode, map); +} + +static int notsupp_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + return -ENOTSUPP; +} + +static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr) +{ + struct bpf_local_storage_map *smap; + + smap = bpf_local_storage_map_alloc(attr); + if (IS_ERR(smap)) + return ERR_CAST(smap); + + smap->cache_idx = bpf_local_storage_cache_idx_get(&inode_cache); + return &smap->map; +} + +static void inode_storage_map_free(struct bpf_map *map) +{ + struct bpf_local_storage_map *smap; + + smap = (struct bpf_local_storage_map *)map; + bpf_local_storage_cache_idx_free(&inode_cache, smap->cache_idx); + bpf_local_storage_map_free(smap); +} + +static int inode_storage_map_btf_id; +const struct bpf_map_ops inode_storage_map_ops = { + .map_alloc_check = bpf_local_storage_map_alloc_check, + .map_alloc = inode_storage_map_alloc, + .map_free = inode_storage_map_free, + .map_get_next_key = notsupp_get_next_key, + .map_lookup_elem = bpf_fd_inode_storage_lookup_elem, + .map_update_elem = bpf_fd_inode_storage_update_elem, + .map_delete_elem = bpf_fd_inode_storage_delete_elem, + .map_check_btf = bpf_local_storage_map_check_btf, + .map_btf_name = "bpf_local_storage_map", + .map_btf_id = &inode_storage_map_btf_id, + .map_owner_storage_ptr = inode_storage_ptr, +}; + +BTF_ID_LIST(bpf_inode_storage_btf_ids) +BTF_ID_UNUSED +BTF_ID(struct, inode) + +const struct bpf_func_proto bpf_inode_storage_get_proto = { + .func = bpf_inode_storage_get, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, + .btf_id = bpf_inode_storage_btf_ids, +}; + +const struct bpf_func_proto bpf_inode_storage_delete_proto = { + .func = bpf_inode_storage_delete, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .btf_id = bpf_inode_storage_btf_ids, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b46e973faee9..5443cea86cef 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -769,7 +769,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && - map->map_type != BPF_MAP_TYPE_SK_STORAGE) + map->map_type != BPF_MAP_TYPE_SK_STORAGE && + map->map_type != BPF_MAP_TYPE_INODE_STORAGE) return -ENOTSUPP; if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > map->value_size) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dd24503ab3d3..38748794518e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4311,6 +4311,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_sk_storage_delete) goto error; break; + case BPF_MAP_TYPE_INODE_STORAGE: + if (func_id != BPF_FUNC_inode_storage_get && + func_id != BPF_FUNC_inode_storage_delete) + goto error; + break; default: break; } @@ -4384,6 +4389,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) goto error; break; + case BPF_FUNC_inode_storage_get: + case BPF_FUNC_inode_storage_delete: + if (map->map_type != BPF_MAP_TYPE_INODE_STORAGE) + goto error; + break; default: break; } diff --git a/security/bpf/hooks.c b/security/bpf/hooks.c index 32d32d485451..788667d582ae 100644 --- a/security/bpf/hooks.c +++ 
b/security/bpf/hooks.c @@ -11,6 +11,7 @@ static struct security_hook_list bpf_lsm_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(NAME, bpf_lsm_##NAME), #include #undef LSM_HOOK + LSM_HOOK_INIT(inode_free_security, bpf_inode_storage_free), }; static int __init bpf_lsm_init(void) @@ -20,7 +21,12 @@ static int __init bpf_lsm_init(void) return 0; } +struct lsm_blob_sizes bpf_lsm_blob_sizes __lsm_ro_after_init = { + .lbs_inode = sizeof(struct bpf_storage_blob), +}; + DEFINE_LSM(bpf) = { .name = "bpf", .init = bpf_lsm_init, + .blobs = &bpf_lsm_blob_sizes }; diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 41e2a74252d0..083db6c2fc67 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -49,7 +49,7 @@ MAP COMMANDS | | **lru_percpu_hash** | **lpm_trie** | **array_of_maps** | **hash_of_maps** | | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash** | | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** -| | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** } +| | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage** } DESCRIPTION =========== diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index f53ed2f1a4aa..7b68e3c0a5fb 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -704,7 +704,8 @@ _bpftool() lru_percpu_hash lpm_trie array_of_maps \ hash_of_maps devmap devmap_hash sockmap cpumap \ xskmap sockhash cgroup_storage reuseport_sockarray \ - percpu_cgroup_storage queue stack' -- \ + percpu_cgroup_storage queue stack sk_storage \ + struct_ops inode_storage' -- \ "$cur" ) ) return 0 ;; diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 3a27d31a1856..bc0071228f88 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -50,6 +50,7 @@ const char * const map_type_name[] = { [BPF_MAP_TYPE_SK_STORAGE] = "sk_storage", [BPF_MAP_TYPE_STRUCT_OPS] = "struct_ops", [BPF_MAP_TYPE_RINGBUF] = "ringbuf", + [BPF_MAP_TYPE_INODE_STORAGE] = "inode_storage", }; const size_t map_type_name_size = ARRAY_SIZE(map_type_name); @@ -1442,7 +1443,7 @@ static int do_help(int argc, char **argv) " lru_percpu_hash | lpm_trie | array_of_maps | hash_of_maps |\n" " devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n" " cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n" - " queue | stack | sk_storage | struct_ops | ringbuf }\n" + " queue | stack | sk_storage | struct_ops | ringbuf | inode_storage }\n" " " HELP_SPEC_OPTIONS "\n" "", bin_name, argv[-2]); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2cbd137eed86..b6bfcd085a76 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -155,6 +155,7 @@ enum bpf_map_type { BPF_MAP_TYPE_DEVMAP_HASH, BPF_MAP_TYPE_STRUCT_OPS, BPF_MAP_TYPE_RINGBUF, + BPF_MAP_TYPE_INODE_STORAGE, }; /* Note that tracing related programs such as @@ -3509,6 +3510,41 @@ union bpf_attr { * * **-EPERM** This helper cannot be used under the * current sock_ops->op. + * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags) + * Description + * Get a bpf_local_storage from an *inode*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *inode* as the **key**. 
From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *inode*) except this + * helper enforces the key must be an inode and the map must also + * be a **BPF_MAP_TYPE_INODE_STORAGE**. + * + * Underneath, the value is stored locally at *inode* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *inode*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * int bpf_inode_storage_delete(struct bpf_map *map, void *inode) + * Description + * Delete a bpf_local_storage from an *inode*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3655,7 +3691,9 @@ union bpf_attr { FN(get_task_stack), \ FN(load_hdr_opt), \ FN(store_hdr_opt), \ - FN(reserve_hdr_opt), + FN(reserve_hdr_opt), \ + FN(inode_storage_get), \ + FN(inode_storage_delete), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index 010c9a76fd2b..5482a9b7ae2d 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -170,7 +170,7 @@ int libbpf__load_raw_btf(const char *raw_types, size_t types_len, return btf_fd; } -static int load_sk_storage_btf(void) +static int load_local_storage_btf(void) { const char strs[] = "\0bpf_spin_lock\0val\0cnt\0l"; /* struct bpf_spin_lock { @@ -229,12 +229,13 @@ bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex) key_size = 0; break; case BPF_MAP_TYPE_SK_STORAGE: + case BPF_MAP_TYPE_INODE_STORAGE: btf_key_type_id = 1; btf_value_type_id = 3; value_size = 8; max_entries = 0; map_flags = BPF_F_NO_PREALLOC; - btf_fd = load_sk_storage_btf(); + btf_fd = load_local_storage_btf(); if (btf_fd < 0) return false; break; -- cgit v1.2.3 From 30897832d8b97e93833fb52c0a02951db3692ed2 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 25 Aug 2020 20:29:18 +0200 Subject: bpf: Allow local storage to be used from LSM programs Adds support for both bpf_{sk, inode}_storage_{get, delete} to be used in LSM programs. These helpers are not used for tracing programs (currently) as their usage is tied to the life-cycle of the object and should only be used where the owning object won't be freed (when the owning object is passed as an argument to the LSM hook). Thus, they are safer to use in LSM hooks than in tracing. Usage of local storage in tracing programs will probably follow a per-function whitelist approach. Since the UAPI helper signature for bpf_sk_storage expects a bpf_sock, which leads to a compilation warning for LSM programs, it is also updated to accept a void * pointer instead.
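For illustration, an LSM program could use the inode storage helpers roughly as follows. This is a hedged sketch assuming libbpf's vmlinux.h and BPF_PROG conventions; the map, program names, and the choice of the file_open hook are only examples, not part of this patch:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

struct {
	__uint(type, BPF_MAP_TYPE_INODE_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, __u64);
} open_counts SEC(".maps");

SEC("lsm/file_open")
int BPF_PROG(count_open, struct file *file)
{
	__u64 *cnt;

	/* The inode is reachable from the hook's file argument, so it
	 * has a refcount and cannot be freed while the program runs.
	 */
	cnt = bpf_inode_storage_get(&open_counts, file->f_inode, 0,
				    BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (cnt)
		__sync_fetch_and_add(cnt, 1);
	return 0;
}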
Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200825182919.1118197-7-kpsingh@chromium.org --- include/net/bpf_sk_storage.h | 2 ++ include/uapi/linux/bpf.h | 7 +++++-- kernel/bpf/bpf_lsm.c | 21 ++++++++++++++++++++- net/core/bpf_sk_storage.c | 25 +++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 7 +++++-- 5 files changed, 57 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h index 3c516dd07caf..119f4c9c3a9c 100644 --- a/include/net/bpf_sk_storage.h +++ b/include/net/bpf_sk_storage.h @@ -20,6 +20,8 @@ void bpf_sk_storage_free(struct sock *sk); extern const struct bpf_func_proto bpf_sk_storage_get_proto; extern const struct bpf_func_proto bpf_sk_storage_delete_proto; +extern const struct bpf_func_proto sk_storage_get_btf_proto; +extern const struct bpf_func_proto sk_storage_delete_btf_proto; struct bpf_local_storage_elem; struct bpf_sk_storage_diag; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b6bfcd085a76..0e1cdf806fe1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2808,7 +2808,7 @@ union bpf_attr { * * **-ERANGE** if resulting value was out of range. * - * void *bpf_sk_storage_get(struct bpf_map *map, struct bpf_sock *sk, void *value, u64 flags) + * void *bpf_sk_storage_get(struct bpf_map *map, void *sk, void *value, u64 flags) * Description * Get a bpf-local-storage from a *sk*. * @@ -2824,6 +2824,9 @@ union bpf_attr { * "type". The bpf-local-storage "type" (i.e. the *map*) is * searched against all bpf-local-storages residing at *sk*. * + * *sk* is a kernel **struct sock** pointer for LSM program. + * *sk* is a **struct bpf_sock** pointer for other program types. + * * An optional *flags* (**BPF_SK_STORAGE_GET_F_CREATE**) can be * used such that a new bpf-local-storage will be * created if one does not exist. *value* can be used @@ -2836,7 +2839,7 @@ union bpf_attr { * **NULL** if not found or there was an error in adding * a new bpf-local-storage. * - * long bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk) + * long bpf_sk_storage_delete(struct bpf_map *map, void *sk) * Description * Delete a bpf-local-storage from a *sk*. * Return diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index fb278144e9fd..9cd1428c7199 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include /* For every LSM hook that allows attachment of BPF programs, declare a nop * function where a BPF program can be attached. 
@@ -45,10 +47,27 @@ int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, return 0; } +static const struct bpf_func_proto * +bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_inode_storage_get: + return &bpf_inode_storage_get_proto; + case BPF_FUNC_inode_storage_delete: + return &bpf_inode_storage_delete_proto; + case BPF_FUNC_sk_storage_get: + return &sk_storage_get_btf_proto; + case BPF_FUNC_sk_storage_delete: + return &sk_storage_delete_btf_proto; + default: + return tracing_prog_func_proto(func_id, prog); + } +} + const struct bpf_prog_ops lsm_prog_ops = { }; const struct bpf_verifier_ops lsm_verifier_ops = { - .get_func_proto = tracing_prog_func_proto, + .get_func_proto = bpf_lsm_func_proto, .is_valid_access = btf_ctx_access, }; diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index f29d9a9b4ea4..55fae03b4cc3 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -12,6 +12,7 @@ #include #include #include +#include DEFINE_BPF_STORAGE_CACHE(sk_cache); @@ -377,6 +378,30 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = { .arg2_type = ARG_PTR_TO_SOCKET, }; +BTF_ID_LIST(sk_storage_btf_ids) +BTF_ID_UNUSED +BTF_ID(struct, sock) + +const struct bpf_func_proto sk_storage_get_btf_proto = { + .func = bpf_sk_storage_get, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, + .btf_id = sk_storage_btf_ids, +}; + +const struct bpf_func_proto sk_storage_delete_btf_proto = { + .func = bpf_sk_storage_delete, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .btf_id = sk_storage_btf_ids, +}; + struct bpf_sk_storage_diag { u32 nr_maps; struct bpf_map *maps[]; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b6bfcd085a76..0e1cdf806fe1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2808,7 +2808,7 @@ union bpf_attr { * * **-ERANGE** if resulting value was out of range. * - * void *bpf_sk_storage_get(struct bpf_map *map, struct bpf_sock *sk, void *value, u64 flags) + * void *bpf_sk_storage_get(struct bpf_map *map, void *sk, void *value, u64 flags) * Description * Get a bpf-local-storage from a *sk*. * @@ -2824,6 +2824,9 @@ union bpf_attr { * "type". The bpf-local-storage "type" (i.e. the *map*) is * searched against all bpf-local-storages residing at *sk*. * + * *sk* is a kernel **struct sock** pointer for LSM program. + * *sk* is a **struct bpf_sock** pointer for other program types. + * * An optional *flags* (**BPF_SK_STORAGE_GET_F_CREATE**) can be * used such that a new bpf-local-storage will be * created if one does not exist. *value* can be used @@ -2836,7 +2839,7 @@ union bpf_attr { * **NULL** if not found or there was an error in adding * a new bpf-local-storage. * - * long bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk) + * long bpf_sk_storage_delete(struct bpf_map *map, void *sk) * Description * Delete a bpf-local-storage from a *sk*. * Return -- cgit v1.2.3 From 6e22ab9da79343532cd3cde39df25e5a5478c692 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 25 Aug 2020 21:21:20 +0200 Subject: bpf: Add d_path helper Adding d_path helper function that returns full path for given 'struct path' object, which needs to be the kernel BTF 'path' object. 
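(As a hypothetical usage sketch, not taken from the patch: a tracing program attached to one of the allowlisted functions could call the helper as follows, again assuming the selftest build conventions.)

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("fentry/filp_close")
int BPF_PROG(trace_close, struct file *file)
{
	char buf[64];
	long len;

	/* &file->f_path is a BTF 'struct path *', as the helper requires. */
	len = bpf_d_path(&file->f_path, buf, sizeof(buf));
	if (len > 0)
		bpf_printk("closed: %s", buf);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";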
The path is returned in the provided buffer 'buf' of size 'sz' and is zero-terminated. bpf_d_path(&file->f_path, buf, size); The helper calls the d_path() function directly, so there is only a limited set of functions it can be called from. Adding just a very modest set for the start. Also updating the bpf.h tools uapi header and adding 'path' to the bpf_helpers_doc.py script. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20200825192124.710397-11-jolsa@kernel.org --- include/uapi/linux/bpf.h | 14 ++++++++++++ kernel/trace/bpf_trace.c | 48 ++++++++++++++++++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 2 ++ tools/include/uapi/linux/bpf.h | 14 ++++++++++++ 4 files changed, 78 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0e1cdf806fe1..0388bc0200b0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3513,6 +3513,7 @@ union bpf_attr { * * **-EPERM** This helper cannot be used under the * current sock_ops->op. + * * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags) * Description * Get a bpf_local_storage from an *inode*. @@ -3548,6 +3549,18 @@ union bpf_attr { * 0 on success. * * **-ENOENT** if the bpf_local_storage cannot be found. + * + * long bpf_d_path(struct path *path, char *buf, u32 sz) + * Description + * Return full path for given 'struct path' object, which + * needs to be the kernel BTF 'path' object. The path is + * returned in the provided buffer 'buf' of size 'sz' and + * is zero terminated. + * + * Return + * On success, the strictly positive length of the string, + * including the trailing NUL character. On error, a negative + * value.
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3697,6 +3710,7 @@ union bpf_attr { FN(reserve_hdr_opt), \ FN(inode_storage_get), \ FN(inode_storage_delete), \ + FN(d_path), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a8d4f253ed77..d973d891f2e2 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1098,6 +1098,52 @@ static const struct bpf_func_proto bpf_send_signal_thread_proto = { .arg1_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_d_path, struct path *, path, char *, buf, u32, sz) +{ + long len; + char *p; + + if (!sz) + return 0; + + p = d_path(path, buf, sz); + if (IS_ERR(p)) { + len = PTR_ERR(p); + } else { + len = buf + sz - p; + memmove(buf, p, len); + } + + return len; +} + +BTF_SET_START(btf_allowlist_d_path) +BTF_ID(func, vfs_truncate) +BTF_ID(func, vfs_fallocate) +BTF_ID(func, dentry_open) +BTF_ID(func, vfs_getattr) +BTF_ID(func, filp_close) +BTF_SET_END(btf_allowlist_d_path) + +static bool bpf_d_path_allowed(const struct bpf_prog *prog) +{ + return btf_id_set_contains(&btf_allowlist_d_path, prog->aux->attach_btf_id); +} + +BTF_ID_LIST(bpf_d_path_btf_ids) +BTF_ID(struct, path) + +static const struct bpf_func_proto bpf_d_path_proto = { + .func = bpf_d_path, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .btf_id = bpf_d_path_btf_ids, + .allowed = bpf_d_path_allowed, +}; + const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1579,6 +1625,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return prog->expected_attach_type == BPF_TRACE_ITER ? &bpf_seq_write_proto : NULL; + case BPF_FUNC_d_path: + return &bpf_d_path_proto; default: return raw_tp_prog_func_proto(func_id, prog); } diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 5bfa448b4704..08388173973f 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -432,6 +432,7 @@ class PrinterHelpers(Printer): 'struct __sk_buff', 'struct sk_msg_md', 'struct xdp_md', + 'struct path', ] known_types = { '...', @@ -472,6 +473,7 @@ class PrinterHelpers(Printer): 'struct tcp_request_sock', 'struct udp6_sock', 'struct task_struct', + 'struct path', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0e1cdf806fe1..0388bc0200b0 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3513,6 +3513,7 @@ union bpf_attr { * * **-EPERM** This helper cannot be used under the * current sock_ops->op. + * * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags) * Description * Get a bpf_local_storage from an *inode*. @@ -3548,6 +3549,18 @@ union bpf_attr { * 0 on success. * * **-ENOENT** if the bpf_local_storage cannot be found. + * + * long bpf_d_path(struct path *path, char *buf, u32 sz) + * Description + * Return full path for given 'struct path' object, which + * needs to be the kernel BTF 'path' object. The path is + * returned in the provided buffer 'buf' of size 'sz' and + * is zero terminated. + * + * Return + * On success, the strictly positive length of the string, + * including the trailing NUL character. On error, a negative + * value. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3697,6 +3710,7 @@ union bpf_attr { FN(reserve_hdr_opt), \ FN(inode_storage_get), \ FN(inode_storage_delete), \ + FN(d_path), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From b305dfe2e93434b12d438434461b709641f62af4 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Thu, 20 Aug 2020 12:47:16 +0200 Subject: media: videodev2.h: RGB BT2020 and HSV are always full range The default RGB quantization range for BT.2020 is full range (just as for all the other RGB pixel encodings), not limited range. Update the V4L2_MAP_QUANTIZATION_DEFAULT macro and documentation accordingly. Also mention that HSV is always full range and cannot be limited range. When RGB BT2020 was introduced in V4L2 it was not clear whether it should be limited or full range, but full range is the right (and consistent) choice. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../userspace-api/media/v4l/colorspaces-defs.rst | 9 ++++----- .../userspace-api/media/v4l/colorspaces-details.rst | 5 ++--- include/uapi/linux/videodev2.h | 17 ++++++++--------- 3 files changed, 14 insertions(+), 17 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/media/v4l/colorspaces-defs.rst b/Documentation/userspace-api/media/v4l/colorspaces-defs.rst index 01404e1f609a..4089f426258d 100644 --- a/Documentation/userspace-api/media/v4l/colorspaces-defs.rst +++ b/Documentation/userspace-api/media/v4l/colorspaces-defs.rst @@ -36,8 +36,7 @@ whole range, 0-255, dividing the angular value by 1.41. The enum :c:type:`v4l2_hsv_encoding` specifies which encoding is used. .. note:: The default R'G'B' quantization is full range for all - colorspaces except for BT.2020 which uses limited range R'G'B' - quantization. + colorspaces. HSV formats are always full range. .. tabularcolumns:: |p{6.7cm}|p{10.8cm}| @@ -169,8 +168,8 @@ whole range, 0-255, dividing the angular value by 1.41. The enum - Details * - ``V4L2_QUANTIZATION_DEFAULT`` - Use the default quantization encoding as defined by the - colorspace. This is always full range for R'G'B' (except for the - BT.2020 colorspace) and HSV. It is usually limited range for Y'CbCr. + colorspace. This is always full range for R'G'B' and HSV. + It is usually limited range for Y'CbCr. * - ``V4L2_QUANTIZATION_FULL_RANGE`` - Use the full range quantization encoding. I.e. the range [0…1] is mapped to [0…255] (with possible clipping to [1…254] to avoid the @@ -180,4 +179,4 @@ whole range, 0-255, dividing the angular value by 1.41. The enum * - ``V4L2_QUANTIZATION_LIM_RANGE`` - Use the limited range quantization encoding. I.e. the range [0…1] is mapped to [16…235]. Cb and Cr are mapped from [-0.5…0.5] to - [16…240]. + [16…240]. Limited Range cannot be used with HSV. diff --git a/Documentation/userspace-api/media/v4l/colorspaces-details.rst b/Documentation/userspace-api/media/v4l/colorspaces-details.rst index 300c5d2e7d0f..cf1b825ec34a 100644 --- a/Documentation/userspace-api/media/v4l/colorspaces-details.rst +++ b/Documentation/userspace-api/media/v4l/colorspaces-details.rst @@ -377,9 +377,8 @@ Colorspace BT.2020 (V4L2_COLORSPACE_BT2020) The :ref:`itu2020` standard defines the colorspace used by Ultra-high definition television (UHDTV). The default transfer function is ``V4L2_XFER_FUNC_709``. The default Y'CbCr encoding is -``V4L2_YCBCR_ENC_BT2020``. The default R'G'B' quantization is limited -range (!), and so is the default Y'CbCr quantization. 
The chromaticities -of the primary colors and the white reference are: +``V4L2_YCBCR_ENC_BT2020``. The default Y'CbCr quantization is limited range. +The chromaticities of the primary colors and the white reference are: diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index c7b70ff53bc1..4769628790da 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -375,9 +375,9 @@ enum v4l2_hsv_encoding { enum v4l2_quantization { /* - * The default for R'G'B' quantization is always full range, except - * for the BT2020 colorspace. For Y'CbCr the quantization is always - * limited range, except for COLORSPACE_JPEG: this is full range. + * The default for R'G'B' quantization is always full range. + * For Y'CbCr the quantization is always limited range, except + * for COLORSPACE_JPEG: this is full range. */ V4L2_QUANTIZATION_DEFAULT = 0, V4L2_QUANTIZATION_FULL_RANGE = 1, @@ -386,14 +386,13 @@ enum v4l2_quantization { /* * Determine how QUANTIZATION_DEFAULT should map to a proper quantization. - * This depends on whether the image is RGB or not, the colorspace and the - * Y'CbCr encoding. + * This depends on whether the image is RGB or not, the colorspace. + * The Y'CbCr encoding is not used anymore, but is still there for backwards + * compatibility. */ #define V4L2_MAP_QUANTIZATION_DEFAULT(is_rgb_or_hsv, colsp, ycbcr_enc) \ - (((is_rgb_or_hsv) && (colsp) == V4L2_COLORSPACE_BT2020) ? \ - V4L2_QUANTIZATION_LIM_RANGE : \ - (((is_rgb_or_hsv) || (colsp) == V4L2_COLORSPACE_JPEG) ? \ - V4L2_QUANTIZATION_FULL_RANGE : V4L2_QUANTIZATION_LIM_RANGE)) + (((is_rgb_or_hsv) || (colsp) == V4L2_COLORSPACE_JPEG) ? \ + V4L2_QUANTIZATION_FULL_RANGE : V4L2_QUANTIZATION_LIM_RANGE) /* * Deprecated names for opRGB colorspace (IEC 61966-2-5) -- cgit v1.2.3 From 493a0ebd804c986e6bd207603c5e1ca748470d3d Mon Sep 17 00:00:00 2001 From: James Prestwood Date: Mon, 13 Apr 2020 09:20:53 -0700 Subject: nl80211: fix PORT_AUTHORIZED wording to reflect behavior The CMD_PORT_AUTHORIZED event was described as an event which indicated a successfully completed 4-way handshake. But the behavior was not as advertised. The only driver which uses this is brcmfmac, and this driver only sends the event after a successful 802.1X-FT roam. This prevents userspace applications from knowing if the 4-way completed on: 1. Normal 802.1X connects 2. Normal PSK connections 3. FT-PSK roams wpa_supplicant handles this incorrect behavior by just completing the connection after association, before the 4-way has completed. If the 4-way ends up failing, it disconnects at that point. Since this behavior appears to be expected (wpa_s handles it this way), I have changed the wording in the API description to reflect the actual behavior. Signed-off-by: James Prestwood Link: https://lore.kernel.org/r/20200413162053.3711-1-prestwoj@gmail.com [fix spelling of 802.1X throughout ...] Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 631f3a997b3c..8cc2b825e4e4 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -647,13 +647,9 @@ * authentication/association or not receiving a response from the AP. * Non-zero %NL80211_ATTR_STATUS_CODE value is indicated in that case as * well to remain backwards compatible.
- * When establishing a security association, drivers that support 4 way - * handshake offload should send %NL80211_CMD_PORT_AUTHORIZED event when - * the 4 way handshake is completed successfully. * @NL80211_CMD_ROAM: Notification indicating the card/driver roamed by itself. - * When a security association was established with the new AP (e.g. if - * the FT protocol was used for roaming or the driver completed the 4 way - * handshake), this event should be followed by an + * When a security association was established on an 802.1X network using + * fast transition, this event should be followed by an * %NL80211_CMD_PORT_AUTHORIZED event. * @NL80211_CMD_DISCONNECT: drop a given connection; also used to notify * userspace that a connection was dropped by the AP or due to other @@ -1067,13 +1063,11 @@ * @NL80211_CMD_DEL_PMK: For offloaded 4-Way handshake, delete the previously * configured PMK for the authenticator address identified by * %NL80211_ATTR_MAC. - * @NL80211_CMD_PORT_AUTHORIZED: An event that indicates that the 4 way - * handshake was completed successfully by the driver. The BSSID is - * specified with %NL80211_ATTR_MAC. Drivers that support 4 way handshake - * offload should send this event after indicating 802.11 association with - * %NL80211_CMD_CONNECT or %NL80211_CMD_ROAM. If the 4 way handshake failed - * %NL80211_CMD_DISCONNECT should be indicated instead. - * + * @NL80211_CMD_PORT_AUTHORIZED: An event that indicates an 802.1X FT roam was + * completed successfully. Drivers that support 4 way handshake offload + * should send this event after indicating 802.1X FT assocation with + * %NL80211_CMD_ROAM. If the 4 way handshake failed %NL80211_CMD_DISCONNECT + * should be indicated instead. * @NL80211_CMD_CONTROL_PORT_FRAME: Control Port (e.g. PAE) frame TX request * and RX notification. This command is used both as a request to transmit * a control port frame and as a notification that a control port frame -- cgit v1.2.3 From eb89a6a6b7a1af2d9c8d83ee44fa67700d6337e7 Mon Sep 17 00:00:00 2001 From: Miles Hu Date: Tue, 4 Aug 2020 10:16:29 +0200 Subject: nl80211: add support for setting fixed HE rate/gi/ltf This patch adds the nl80211 structs, definitions, policies and parsing code required to pass fixed HE rate, GI and LTF settings. 
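For illustration, a hedged userspace sketch of filling the new attributes with libnl follows; the function name is hypothetical, and the surrounding NL80211_ATTR_TX_RATES/per-band nesting, which this patch does not change, is elided:

#include <netlink/netlink.h>
#include <netlink/attr.h>
#include <linux/nl80211.h>

/* Request HE MCS 0-9 on two spatial streams, 1.6 us GI and 2xLTF. */
static int put_he_txrate(struct nl_msg *msg)
{
	struct nl80211_txrate_he he = {
		.mcs = { 0x03FF, 0x03FF },	/* NSS 1 and 2: MCS 0-9 */
	};

	if (nla_put(msg, NL80211_TXRATE_HE, sizeof(he), &he) ||
	    nla_put_u8(msg, NL80211_TXRATE_HE_GI, NL80211_RATE_INFO_HE_GI_1_6) ||
	    nla_put_u8(msg, NL80211_TXRATE_HE_LTF, NL80211_RATE_INFO_HE_2XLTF))
		return -1;
	return 0;
}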
Signed-off-by: Miles Hu Signed-off-by: John Crispin Link: https://lore.kernel.org/r/20200804081630.2013619-1-john@phrozen.org [fix comment] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 + include/uapi/linux/nl80211.h | 28 +++++++++ net/wireless/nl80211.c | 137 ++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 160 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index d9e6b9fbd95b..c9bce9bba511 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -678,7 +678,10 @@ struct cfg80211_bitrate_mask { u32 legacy; u8 ht_mcs[IEEE80211_HT_MCS_MASK_LEN]; u16 vht_mcs[NL80211_VHT_NSS_MAX]; + u16 he_mcs[NL80211_HE_NSS_MAX]; enum nl80211_txrate_gi gi; + enum nl80211_he_gi he_gi; + enum nl80211_he_ltf he_ltf; } control[NUM_NL80211_BANDS]; }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 8cc2b825e4e4..1a4b922f489f 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3180,6 +3180,18 @@ enum nl80211_he_gi { NL80211_RATE_INFO_HE_GI_3_2, }; +/** + * enum nl80211_he_ltf - HE long training field + * @NL80211_RATE_INFO_HE_1xLTF: 3.2 usec + * @NL80211_RATE_INFO_HE_2xLTF: 6.4 usec + * @NL80211_RATE_INFO_HE_4xLTF: 12.8 usec + */ +enum nl80211_he_ltf { + NL80211_RATE_INFO_HE_1XLTF, + NL80211_RATE_INFO_HE_2XLTF, + NL80211_RATE_INFO_HE_4XLTF, +}; + /** * enum nl80211_he_ru_alloc - HE RU allocation values * @NL80211_RATE_INFO_HE_RU_ALLOC_26: 26-tone RU allocation @@ -4735,6 +4747,10 @@ enum nl80211_key_attributes { * @NL80211_TXRATE_VHT: VHT rates allowed for TX rate selection, * see &struct nl80211_txrate_vht * @NL80211_TXRATE_GI: configure GI, see &enum nl80211_txrate_gi + * @NL80211_TXRATE_HE: HE rates allowed for TX rate selection, + * see &struct nl80211_txrate_he + * @NL80211_TXRATE_HE_GI: configure HE GI, 0.8us, 1.6us and 3.2us. + * @NL80211_TXRATE_HE_LTF: configure HE LTF, 1XLTF, 2XLTF and 4XLTF. * @__NL80211_TXRATE_AFTER_LAST: internal * @NL80211_TXRATE_MAX: highest TX rate attribute */ @@ -4744,6 +4760,9 @@ enum nl80211_tx_rate_attributes { NL80211_TXRATE_HT, NL80211_TXRATE_VHT, NL80211_TXRATE_GI, + NL80211_TXRATE_HE, + NL80211_TXRATE_HE_GI, + NL80211_TXRATE_HE_LTF, /* keep last */ __NL80211_TXRATE_AFTER_LAST, @@ -4761,6 +4780,15 @@ struct nl80211_txrate_vht { __u16 mcs[NL80211_VHT_NSS_MAX]; }; +#define NL80211_HE_NSS_MAX 8 +/** + * struct nl80211_txrate_he - HE MCS/NSS txrate bitmap + * @mcs: MCS bitmap table for each NSS (array index 0 for 1 stream, etc.) 
+ */ +struct nl80211_txrate_he { + __u16 mcs[NL80211_HE_NSS_MAX]; +}; + enum nl80211_txrate_gi { NL80211_TXRATE_DEFAULT_GI, NL80211_TXRATE_FORCE_SGI, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 6ee3bc48d776..da0f33c2d2d8 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -336,6 +336,13 @@ static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = { .len = NL80211_MAX_SUPP_HT_RATES }, [NL80211_TXRATE_VHT] = NLA_POLICY_EXACT_LEN_WARN(sizeof(struct nl80211_txrate_vht)), [NL80211_TXRATE_GI] = { .type = NLA_U8 }, + [NL80211_TXRATE_HE] = NLA_POLICY_EXACT_LEN(sizeof(struct nl80211_txrate_he)), + [NL80211_TXRATE_HE_GI] = NLA_POLICY_RANGE(NLA_U8, + NL80211_RATE_INFO_HE_GI_0_8, + NL80211_RATE_INFO_HE_GI_3_2), + [NL80211_TXRATE_HE_LTF] = NLA_POLICY_RANGE(NLA_U8, + NL80211_RATE_INFO_HE_1XLTF, + NL80211_RATE_INFO_HE_4XLTF), }; static const struct nla_policy @@ -4430,21 +4437,106 @@ static bool vht_set_mcs_mask(struct ieee80211_supported_band *sband, return true; } +static u16 he_mcs_map_to_mcs_mask(u8 he_mcs_map) +{ + switch (he_mcs_map) { + case IEEE80211_HE_MCS_NOT_SUPPORTED: + return 0; + case IEEE80211_HE_MCS_SUPPORT_0_7: + return 0x00FF; + case IEEE80211_HE_MCS_SUPPORT_0_9: + return 0x03FF; + case IEEE80211_HE_MCS_SUPPORT_0_11: + return 0xFFF; + default: + break; + } + return 0; +} + +static void he_build_mcs_mask(u16 he_mcs_map, + u16 he_mcs_mask[NL80211_HE_NSS_MAX]) +{ + u8 nss; + + for (nss = 0; nss < NL80211_HE_NSS_MAX; nss++) { + he_mcs_mask[nss] = he_mcs_map_to_mcs_mask(he_mcs_map & 0x03); + he_mcs_map >>= 2; + } +} + +static u16 he_get_txmcsmap(struct genl_info *info, + const struct ieee80211_sta_he_cap *he_cap) +{ + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + __le16 tx_mcs; + + switch (wdev->chandef.width) { + case NL80211_CHAN_WIDTH_80P80: + tx_mcs = he_cap->he_mcs_nss_supp.tx_mcs_80p80; + break; + case NL80211_CHAN_WIDTH_160: + tx_mcs = he_cap->he_mcs_nss_supp.tx_mcs_160; + break; + default: + tx_mcs = he_cap->he_mcs_nss_supp.tx_mcs_80; + break; + } + return le16_to_cpu(tx_mcs); +} + +static bool he_set_mcs_mask(struct genl_info *info, + struct wireless_dev *wdev, + struct ieee80211_supported_band *sband, + struct nl80211_txrate_he *txrate, + u16 mcs[NL80211_HE_NSS_MAX]) +{ + const struct ieee80211_sta_he_cap *he_cap; + u16 tx_mcs_mask[NL80211_HE_NSS_MAX] = {}; + u16 tx_mcs_map = 0; + u8 i; + + he_cap = ieee80211_get_he_iftype_cap(sband, wdev->iftype); + if (!he_cap) + return false; + + memset(mcs, 0, sizeof(u16) * NL80211_HE_NSS_MAX); + + tx_mcs_map = he_get_txmcsmap(info, he_cap); + + /* Build he_mcs_mask from HE capabilities */ + he_build_mcs_mask(tx_mcs_map, tx_mcs_mask); + + for (i = 0; i < NL80211_HE_NSS_MAX; i++) { + if ((tx_mcs_mask[i] & txrate->mcs[i]) == txrate->mcs[i]) + mcs[i] = txrate->mcs[i]; + else + return false; + } + + return true; +} + static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, struct nlattr *attrs[], enum nl80211_attrs attr, - struct cfg80211_bitrate_mask *mask) + struct cfg80211_bitrate_mask *mask, + struct net_device *dev) { struct nlattr *tb[NL80211_TXRATE_MAX + 1]; struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct wireless_dev *wdev = dev->ieee80211_ptr; int rem, i; struct nlattr *tx_rates; struct ieee80211_supported_band *sband; - u16 vht_tx_mcs_map; + u16 vht_tx_mcs_map, he_tx_mcs_map; memset(mask, 0, sizeof(*mask)); /* Default to all rates enabled */ for (i = 0; i < NUM_NL80211_BANDS; i++) { + const struct 
ieee80211_sta_he_cap *he_cap; + sband = rdev->wiphy.bands[i]; if (!sband) @@ -4460,6 +4552,16 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, vht_tx_mcs_map = le16_to_cpu(sband->vht_cap.vht_mcs.tx_mcs_map); vht_build_mcs_mask(vht_tx_mcs_map, mask->control[i].vht_mcs); + + he_cap = ieee80211_get_he_iftype_cap(sband, wdev->iftype); + if (!he_cap) + continue; + + he_tx_mcs_map = he_get_txmcsmap(info, he_cap); + he_build_mcs_mask(he_tx_mcs_map, mask->control[i].he_mcs); + + mask->control[i].he_gi = 0xFF; + mask->control[i].he_ltf = 0xFF; } /* if no rates are given set it back to the defaults */ @@ -4515,13 +4617,25 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, if (mask->control[band].gi > NL80211_TXRATE_FORCE_LGI) return -EINVAL; } + if (tb[NL80211_TXRATE_HE] && + !he_set_mcs_mask(info, wdev, sband, + nla_data(tb[NL80211_TXRATE_HE]), + mask->control[band].he_mcs)) + return -EINVAL; + if (tb[NL80211_TXRATE_HE_GI]) + mask->control[band].he_gi = + nla_get_u8(tb[NL80211_TXRATE_HE_GI]); + if (tb[NL80211_TXRATE_HE_LTF]) + mask->control[band].he_ltf = + nla_get_u8(tb[NL80211_TXRATE_HE_LTF]); if (mask->control[band].legacy == 0) { - /* don't allow empty legacy rates if HT or VHT + /* don't allow empty legacy rates if HT, VHT or HE * are not even supported. */ if (!(rdev->wiphy.bands[band]->ht_cap.ht_supported || - rdev->wiphy.bands[band]->vht_cap.vht_supported)) + rdev->wiphy.bands[band]->vht_cap.vht_supported || + ieee80211_get_he_iftype_cap(sband, wdev->iftype))) return -EINVAL; for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) @@ -4532,6 +4646,10 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, if (mask->control[band].vht_mcs[i]) goto out; + for (i = 0; i < NL80211_HE_NSS_MAX; i++) + if (mask->control[band].he_mcs[i]) + goto out; + /* legacy and mcs rates may not be both empty */ return -EINVAL; } @@ -4976,7 +5094,8 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_TX_RATES]) { err = nl80211_parse_tx_bitrate_mask(info, info->attrs, NL80211_ATTR_TX_RATES, - ¶ms.beacon_rate); + ¶ms.beacon_rate, + dev); if (err) return err; @@ -10780,7 +10899,8 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb, return -EOPNOTSUPP; err = nl80211_parse_tx_bitrate_mask(info, info->attrs, - NL80211_ATTR_TX_RATES, &mask); + NL80211_ATTR_TX_RATES, &mask, + dev); if (err) return err; @@ -11388,7 +11508,8 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_TX_RATES]) { err = nl80211_parse_tx_bitrate_mask(info, info->attrs, NL80211_ATTR_TX_RATES, - &setup.beacon_rate); + &setup.beacon_rate, + dev); if (err) return err; @@ -14168,7 +14289,7 @@ static int parse_tid_conf(struct cfg80211_registered_device *rdev, if (tid_conf->txrate_type != NL80211_TX_RATE_AUTOMATIC) { attr = NL80211_TID_CONFIG_ATTR_TX_RATE; err = nl80211_parse_tx_bitrate_mask(info, attrs, attr, - &tid_conf->txrate_mask); + &tid_conf->txrate_mask, dev); if (err) return err; -- cgit v1.2.3 From 00c207edfb2bff9cf03a8f21e57c9c752a1d9f16 Mon Sep 17 00:00:00 2001 From: John Crispin Date: Tue, 11 Aug 2020 10:01:03 +0200 Subject: nl80211: rename csa counter attributes countdown counters We want to reuse the attributes for other counters such as BSS color change. Rename them to more generic names. 
Signed-off-by: John Crispin Link: https://lore.kernel.org/r/20200811080107.3615705-1-john@phrozen.org Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 14 ++++++++------ net/wireless/nl80211.c | 16 ++++++++-------- 2 files changed, 16 insertions(+), 14 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 1a4b922f489f..ec96d5fe0e05 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2076,10 +2076,10 @@ enum nl80211_commands { * operation). * @NL80211_ATTR_CSA_IES: Nested set of attributes containing the IE information * for the time while performing a channel switch. - * @NL80211_ATTR_CSA_C_OFF_BEACON: An array of offsets (u16) to the channel - * switch counters in the beacons tail (%NL80211_ATTR_BEACON_TAIL). - * @NL80211_ATTR_CSA_C_OFF_PRESP: An array of offsets (u16) to the channel - * switch counters in the probe response (%NL80211_ATTR_PROBE_RESP). + * @NL80211_ATTR_CNTDWN_OFFS_BEACON: An array of offsets (u16) to the channel + * switch or color change counters in the beacons tail (%NL80211_ATTR_BEACON_TAIL). + * @NL80211_ATTR_CNTDWN_OFFS_PRESP: An array of offsets (u16) to the channel + * switch or color change counters in the probe response (%NL80211_ATTR_PROBE_RESP). * * @NL80211_ATTR_RXMGMT_FLAGS: flags for nl80211_send_mgmt(), u32. * As specified in the &enum nl80211_rxmgmt_flags. @@ -2815,8 +2815,8 @@ enum nl80211_attrs { NL80211_ATTR_CH_SWITCH_COUNT, NL80211_ATTR_CH_SWITCH_BLOCK_TX, NL80211_ATTR_CSA_IES, - NL80211_ATTR_CSA_C_OFF_BEACON, - NL80211_ATTR_CSA_C_OFF_PRESP, + NL80211_ATTR_CNTDWN_OFFS_BEACON, + NL80211_ATTR_CNTDWN_OFFS_PRESP, NL80211_ATTR_RXMGMT_FLAGS, @@ -3003,6 +3003,8 @@ enum nl80211_attrs { #define NL80211_ATTR_MESH_PARAMS NL80211_ATTR_MESH_CONFIG #define NL80211_ATTR_IFACE_SOCKET_OWNER NL80211_ATTR_SOCKET_OWNER #define NL80211_ATTR_SAE_DATA NL80211_ATTR_AUTH_DATA +#define NL80211_ATTR_CSA_C_OFF_BEACON NL80211_ATTR_CNTDWN_OFFS_BEACON +#define NL80211_ATTR_CSA_C_OFF_PRESP NL80211_ATTR_CNTDWN_OFFS_PRESP /* * Allow user space programs to use #ifdef on new attributes by defining them diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index da0f33c2d2d8..e640e65f3255 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -578,8 +578,8 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_CH_SWITCH_COUNT] = { .type = NLA_U32 }, [NL80211_ATTR_CH_SWITCH_BLOCK_TX] = { .type = NLA_FLAG }, [NL80211_ATTR_CSA_IES] = { .type = NLA_NESTED }, - [NL80211_ATTR_CSA_C_OFF_BEACON] = { .type = NLA_BINARY }, - [NL80211_ATTR_CSA_C_OFF_PRESP] = { .type = NLA_BINARY }, + [NL80211_ATTR_CNTDWN_OFFS_BEACON] = { .type = NLA_BINARY }, + [NL80211_ATTR_CNTDWN_OFFS_PRESP] = { .type = NLA_BINARY }, [NL80211_ATTR_STA_SUPPORTED_CHANNELS] = NLA_POLICY_MIN_LEN(2), /* * The value of the Length field of the Supported Operating @@ -8891,10 +8891,10 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) if (err) return err; - if (!csa_attrs[NL80211_ATTR_CSA_C_OFF_BEACON]) + if (!csa_attrs[NL80211_ATTR_CNTDWN_OFFS_BEACON]) return -EINVAL; - len = nla_len(csa_attrs[NL80211_ATTR_CSA_C_OFF_BEACON]); + len = nla_len(csa_attrs[NL80211_ATTR_CNTDWN_OFFS_BEACON]); if (!len || (len % sizeof(u16))) return -EINVAL; @@ -8905,7 +8905,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) return -EINVAL; params.counter_offsets_beacon = - nla_data(csa_attrs[NL80211_ATTR_CSA_C_OFF_BEACON]); + 
nla_data(csa_attrs[NL80211_ATTR_CNTDWN_OFFS_BEACON]); /* sanity checks - counters should fit and be the same */ for (i = 0; i < params.n_counter_offsets_beacon; i++) { @@ -8918,8 +8918,8 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } - if (csa_attrs[NL80211_ATTR_CSA_C_OFF_PRESP]) { - len = nla_len(csa_attrs[NL80211_ATTR_CSA_C_OFF_PRESP]); + if (csa_attrs[NL80211_ATTR_CNTDWN_OFFS_PRESP]) { + len = nla_len(csa_attrs[NL80211_ATTR_CNTDWN_OFFS_PRESP]); if (!len || (len % sizeof(u16))) return -EINVAL; @@ -8930,7 +8930,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) return -EINVAL; params.counter_offsets_presp = - nla_data(csa_attrs[NL80211_ATTR_CSA_C_OFF_PRESP]); + nla_data(csa_attrs[NL80211_ATTR_CNTDWN_OFFS_PRESP]); /* sanity checks - counters should fit and be the same */ for (i = 0; i < params.n_counter_offsets_presp; i++) { -- cgit v1.2.3 From 2831a631022eed6e3f800f08892132c6edde652c Mon Sep 17 00:00:00 2001 From: Chung-Hsien Hsu Date: Mon, 17 Aug 2020 02:33:15 -0500 Subject: nl80211: support SAE authentication offload in AP mode Let drivers advertise support for AP-mode SAE authentication offload with a new NL80211_EXT_FEATURE_SAE_OFFLOAD_AP flag. Signed-off-by: Chung-Hsien Hsu Signed-off-by: Chi-Hsien Lin Link: https://lore.kernel.org/r/20200817073316.33402-4-stanley.hsu@cypress.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 14 +++++++++++--- net/wireless/nl80211.c | 9 ++++++--- 2 files changed, 17 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index ec96d5fe0e05..0584e0d349f0 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -252,9 +252,13 @@ * DOC: SAE authentication offload * * By setting @NL80211_EXT_FEATURE_SAE_OFFLOAD flag drivers can indicate they - * support offloading SAE authentication for WPA3-Personal networks. In - * %NL80211_CMD_CONNECT the password for SAE should be specified using - * %NL80211_ATTR_SAE_PASSWORD. + * support offloading SAE authentication for WPA3-Personal networks in station + * mode. Similarly @NL80211_EXT_FEATURE_SAE_OFFLOAD_AP flag can be set by + * drivers indicating the offload support in AP mode. + * + * The password for SAE should be specified using %NL80211_ATTR_SAE_PASSWORD in + * %NL80211_CMD_CONNECT and %NL80211_CMD_START_AP for station and AP mode + * respectively. */ /** @@ -5845,6 +5849,9 @@ enum nl80211_feature_flags { * handshake with PSK in AP mode (PSK is passed as part of the start AP * command). * + * @NL80211_EXT_FEATURE_SAE_OFFLOAD_AP: Device wants to do SAE authentication + * in AP mode (SAE password is passed as part of the start AP command). + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. 
*/ @@ -5902,6 +5909,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211_TX_STATUS, NL80211_EXT_FEATURE_OPERATING_CHANNEL_VALIDATION, NL80211_EXT_FEATURE_4WAY_HANDSHAKE_AP_PSK, + NL80211_EXT_FEATURE_SAE_OFFLOAD_AP, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index e640e65f3255..201d029687cc 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4960,8 +4960,9 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev, return false; return true; case NL80211_CMD_START_AP: - /* SAE not supported yet */ - if (auth_type == NL80211_AUTHTYPE_SAE) + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_SAE_OFFLOAD_AP) && + auth_type == NL80211_AUTHTYPE_SAE) return false; /* FILS not supported yet */ if (auth_type == NL80211_AUTHTYPE_FILS_SK || @@ -9552,7 +9553,9 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, if (info->attrs[NL80211_ATTR_SAE_PASSWORD]) { if (!wiphy_ext_feature_isset(&rdev->wiphy, - NL80211_EXT_FEATURE_SAE_OFFLOAD)) + NL80211_EXT_FEATURE_SAE_OFFLOAD) && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_SAE_OFFLOAD_AP)) return -EINVAL; settings->sae_pwd = nla_data(info->attrs[NL80211_ATTR_SAE_PASSWORD]); -- cgit v1.2.3 From 50aba46c234ea6ab3134cebb5ab27885f33a3e5d Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 27 Aug 2020 14:19:23 +0200 Subject: gtp: add notification mechanism Like all other network functions, let's notify gtp context on creation and deletion. Signed-off-by: Nicolas Dichtel Tested-by: Gabriel Ganne Acked-by: Harald Welte Signed-off-by: David S. Miller --- drivers/net/gtp.c | 58 ++++++++++++++++++++++++++++++++++++++++-------- include/uapi/linux/gtp.h | 2 ++ 2 files changed, 51 insertions(+), 9 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 21640a035d7d..c84a10569388 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -928,8 +928,8 @@ static void ipv4_pdp_fill(struct pdp_ctx *pctx, struct genl_info *info) } } -static int gtp_pdp_add(struct gtp_dev *gtp, struct sock *sk, - struct genl_info *info) +static struct pdp_ctx *gtp_pdp_add(struct gtp_dev *gtp, struct sock *sk, + struct genl_info *info) { struct pdp_ctx *pctx, *pctx_tid = NULL; struct net_device *dev = gtp->dev; @@ -956,12 +956,12 @@ static int gtp_pdp_add(struct gtp_dev *gtp, struct sock *sk, if (found) { if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) - return -EEXIST; + return ERR_PTR(-EEXIST); if (info->nlhdr->nlmsg_flags & NLM_F_REPLACE) - return -EOPNOTSUPP; + return ERR_PTR(-EOPNOTSUPP); if (pctx && pctx_tid) - return -EEXIST; + return ERR_PTR(-EEXIST); if (!pctx) pctx = pctx_tid; @@ -974,13 +974,13 @@ static int gtp_pdp_add(struct gtp_dev *gtp, struct sock *sk, netdev_dbg(dev, "GTPv1-U: update tunnel id = %x/%x (pdp %p)\n", pctx->u.v1.i_tei, pctx->u.v1.o_tei, pctx); - return 0; + return pctx; } pctx = kmalloc(sizeof(*pctx), GFP_ATOMIC); if (pctx == NULL) - return -ENOMEM; + return ERR_PTR(-ENOMEM); sock_hold(sk); pctx->sk = sk; @@ -1018,7 +1018,7 @@ static int gtp_pdp_add(struct gtp_dev *gtp, struct sock *sk, break; } - return 0; + return pctx; } static void pdp_context_free(struct rcu_head *head) @@ -1036,9 +1036,12 @@ static void pdp_context_delete(struct pdp_ctx *pctx) call_rcu(&pctx->rcu_head, pdp_context_free); } +static int gtp_tunnel_notify(struct pdp_ctx *pctx, u8 cmd); + static int gtp_genl_new_pdp(struct sk_buff 
*skb, struct genl_info *info) { unsigned int version; + struct pdp_ctx *pctx; struct gtp_dev *gtp; struct sock *sk; int err; @@ -1088,7 +1091,13 @@ static int gtp_genl_new_pdp(struct sk_buff *skb, struct genl_info *info) goto out_unlock; } - err = gtp_pdp_add(gtp, sk, info); + pctx = gtp_pdp_add(gtp, sk, info); + if (IS_ERR(pctx)) { + err = PTR_ERR(pctx); + } else { + gtp_tunnel_notify(pctx, GTP_CMD_NEWPDP); + err = 0; + } out_unlock: rcu_read_unlock(); @@ -1159,6 +1168,7 @@ static int gtp_genl_del_pdp(struct sk_buff *skb, struct genl_info *info) netdev_dbg(pctx->dev, "GTPv1-U: deleting tunnel id = %x/%x (pdp %p)\n", pctx->u.v1.i_tei, pctx->u.v1.o_tei, pctx); + gtp_tunnel_notify(pctx, GTP_CMD_DELPDP); pdp_context_delete(pctx); out_unlock: @@ -1168,6 +1178,14 @@ out_unlock: static struct genl_family gtp_genl_family; +enum gtp_multicast_groups { + GTP_GENL_MCGRP, +}; + +static const struct genl_multicast_group gtp_genl_mcgrps[] = { + [GTP_GENL_MCGRP] = { .name = GTP_GENL_MCGRP_NAME }, +}; + static int gtp_genl_fill_info(struct sk_buff *skb, u32 snd_portid, u32 snd_seq, int flags, u32 type, struct pdp_ctx *pctx) { @@ -1204,6 +1222,26 @@ nla_put_failure: return -EMSGSIZE; } +static int gtp_tunnel_notify(struct pdp_ctx *pctx, u8 cmd) +{ + struct sk_buff *msg; + int ret; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!msg) + return -ENOMEM; + + ret = gtp_genl_fill_info(msg, 0, 0, 0, cmd, pctx); + if (ret < 0) { + nlmsg_free(msg); + return ret; + } + + ret = genlmsg_multicast_netns(>p_genl_family, dev_net(pctx->dev), msg, + 0, GTP_GENL_MCGRP, GFP_ATOMIC); + return ret; +} + static int gtp_genl_get_pdp(struct sk_buff *skb, struct genl_info *info) { struct pdp_ctx *pctx = NULL; @@ -1334,6 +1372,8 @@ static struct genl_family gtp_genl_family __ro_after_init = { .module = THIS_MODULE, .ops = gtp_genl_ops, .n_ops = ARRAY_SIZE(gtp_genl_ops), + .mcgrps = gtp_genl_mcgrps, + .n_mcgrps = ARRAY_SIZE(gtp_genl_mcgrps), }; static int __net_init gtp_net_init(struct net *net) diff --git a/include/uapi/linux/gtp.h b/include/uapi/linux/gtp.h index c7d66755d212..79f9191bbb24 100644 --- a/include/uapi/linux/gtp.h +++ b/include/uapi/linux/gtp.h @@ -2,6 +2,8 @@ #ifndef _UAPI_LINUX_GTP_H_ #define _UAPI_LINUX_GTP_H_ +#define GTP_GENL_MCGRP_NAME "gtp" + enum gtp_genl_cmds { GTP_CMD_NEWPDP, GTP_CMD_DELPDP, -- cgit v1.2.3 From dab741e0e02bd3c4f5e2e97be74b39df2523fc6e Mon Sep 17 00:00:00 2001 From: Mattias Nissler Date: Thu, 27 Aug 2020 11:09:46 -0600 Subject: Add a "nosymfollow" mount option. For mounts that have the new "nosymfollow" option, don't follow symlinks when resolving paths. The new option is similar in spirit to the existing "nodev", "noexec", and "nosuid" options, as well as to the LOOKUP_NO_SYMLINKS resolve flag in the openat2(2) syscall. Various BSD variants have been supporting the "nosymfollow" mount option for a long time with equivalent implementations. Note that symlinks may still be created on file systems mounted with the "nosymfollow" option present. readlink() remains functional, so user space code that is aware of symlinks can still choose to follow them explicitly. Setting the "nosymfollow" mount option helps prevent privileged writers from modifying files unintentionally in case there is an unexpected link along the accessed path. The "nosymfollow" option is thus useful as a defensive measure for systems that need to deal with untrusted file systems in privileged contexts. 
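To make the semantics concrete, here is a hedged sketch of a privileged mounter using the new flag; the device, mount point and filesystem type are hypothetical, and MS_NOSYMFOLLOW is defined locally in case the libc headers do not carry it yet:

#include <sys/mount.h>
#include <stdio.h>

#ifndef MS_NOSYMFOLLOW
#define MS_NOSYMFOLLOW 256	/* value from include/uapi/linux/mount.h */
#endif

int main(void)
{
	/* Symlinks on the mounted fs can still be created and read via
	 * readlink(); they are simply not followed during path walks. */
	if (mount("/dev/sda1", "/mnt/stateful", "ext4",
		  MS_NOSYMFOLLOW | MS_NOSUID | MS_NODEV, NULL)) {
		perror("mount");
		return 1;
	}
	return 0;
}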
More information on the history and motivation for this patch can be found here: https://sites.google.com/a/chromium.org/dev/chromium-os/chromiumos-design-docs/hardening-against-malicious-stateful-data#TOC-Restricting-symlink-traversal Signed-off-by: Mattias Nissler Signed-off-by: Ross Zwisler Reviewed-by: Aleksa Sarai Signed-off-by: Al Viro --- fs/namei.c | 3 ++- fs/namespace.c | 2 ++ fs/proc_namespace.c | 1 + fs/statfs.c | 2 ++ include/linux/mount.h | 3 ++- include/linux/statfs.h | 1 + include/uapi/linux/mount.h | 1 + 7 files changed, 11 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/namei.c b/fs/namei.c index e99e2a9da0f7..33e8c79bc761 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1626,7 +1626,8 @@ static const char *pick_link(struct nameidata *nd, struct path *link, return ERR_PTR(error); } - if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS)) + if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) || + unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW)) return ERR_PTR(-ELOOP); if (!(nd->flags & LOOKUP_RCU)) { diff --git a/fs/namespace.c b/fs/namespace.c index bae0e95b3713..6408788a649e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3160,6 +3160,8 @@ int path_mount(const char *dev_name, struct path *path, mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); if (flags & MS_RDONLY) mnt_flags |= MNT_READONLY; + if (flags & MS_NOSYMFOLLOW) + mnt_flags |= MNT_NOSYMFOLLOW; /* The default atime for remount is preservation */ if ((flags & MS_REMOUNT) && diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 3059a9394c2d..e59d4bb3a89e 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -70,6 +70,7 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) { MNT_NOATIME, ",noatime" }, { MNT_NODIRATIME, ",nodiratime" }, { MNT_RELATIME, ",relatime" }, + { MNT_NOSYMFOLLOW, ",nosymfollow" }, { 0, NULL } }; const struct proc_fs_opts *fs_infop; diff --git a/fs/statfs.c b/fs/statfs.c index 2616424012ea..59f33752c131 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -29,6 +29,8 @@ static int flags_by_mnt(int mnt_flags) flags |= ST_NODIRATIME; if (mnt_flags & MNT_RELATIME) flags |= ST_RELATIME; + if (mnt_flags & MNT_NOSYMFOLLOW) + flags |= ST_NOSYMFOLLOW; return flags; } diff --git a/include/linux/mount.h b/include/linux/mount.h index de657bd211fa..aaf343b38671 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -30,6 +30,7 @@ struct fs_context; #define MNT_NODIRATIME 0x10 #define MNT_RELATIME 0x20 #define MNT_READONLY 0x40 /* does the user want this to be r/o? 
*/ +#define MNT_NOSYMFOLLOW 0x80 #define MNT_SHRINKABLE 0x100 #define MNT_WRITE_HOLD 0x200 @@ -46,7 +47,7 @@ struct fs_context; #define MNT_SHARED_MASK (MNT_UNBINDABLE) #define MNT_USER_SETTABLE_MASK (MNT_NOSUID | MNT_NODEV | MNT_NOEXEC \ | MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME \ - | MNT_READONLY) + | MNT_READONLY | MNT_NOSYMFOLLOW) #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME ) #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \ diff --git a/include/linux/statfs.h b/include/linux/statfs.h index 9bc69edb8f18..fac4356ea1bf 100644 --- a/include/linux/statfs.h +++ b/include/linux/statfs.h @@ -40,6 +40,7 @@ struct kstatfs { #define ST_NOATIME 0x0400 /* do not update access times */ #define ST_NODIRATIME 0x0800 /* do not update directory access times */ #define ST_RELATIME 0x1000 /* update atime relative to mtime/ctime */ +#define ST_NOSYMFOLLOW 0x2000 /* do not follow symlinks */ struct dentry; extern int vfs_get_fsid(struct dentry *dentry, __kernel_fsid_t *fsid); diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index 96a0240f23fe..dd8306ea336c 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -16,6 +16,7 @@ #define MS_REMOUNT 32 /* Alter flags of a mounted FS */ #define MS_MANDLOCK 64 /* Allow mandatory locks on an FS */ #define MS_DIRSYNC 128 /* Directory modifications are synchronous */ +#define MS_NOSYMFOLLOW 256 /* Do not follow symlinks */ #define MS_NOATIME 1024 /* Do not update access times. */ #define MS_NODIRATIME 2048 /* Do not update directory access times */ #define MS_BIND 4096 -- cgit v1.2.3 From b0c9eb37817943840a1a82dbc998c491609a0afd Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 27 Aug 2020 22:19:22 -0700 Subject: bpf: Make bpf_link_info.iter similar to bpf_iter_link_info bpf_link_info.iter is used by link_query to return bpf_iter_link_info to user space. Fields may be different, e.g., map_fd vs. map_id, so we cannot reuse the exact structure. But make them similar, e.g., struct bpf_link_info { /* common fields */ union { struct { ... } raw_tracepoint; struct { ... } tracing; ... struct { /* common fields for iter */ union { struct { __u32 map_id; } map; /* other structs for other targets */ }; }; }; }; so the structure is extensible the same way as bpf_iter_link_info. 
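For illustration, userspace consuming the new layout might read the map id as in this hypothetical sketch (link_fd is assumed to refer to a bpf_iter link):

#include <stdio.h>
#include <string.h>
#include <bpf/bpf.h>

static void print_iter_map_id(int link_fd)
{
	struct bpf_link_info info;
	__u32 len = sizeof(info);

	memset(&info, 0, sizeof(info));
	if (bpf_obj_get_info_by_fd(link_fd, &info, &len))
		return;

	/* The anonymous union keeps this access source-compatible as
	 * other iterator targets add their own structs later. */
	if (info.type == BPF_LINK_TYPE_ITER)
		printf("iter map_id: %u\n", info.iter.map.map_id);
}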
Fixes: 6b0a249a301e ("bpf: Implement link_query for bpf iterators") Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200828051922.758950-1-yhs@fb.com --- include/uapi/linux/bpf.h | 6 ++++-- tools/include/uapi/linux/bpf.h | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0388bc0200b0..ef7af384f5ee 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4251,8 +4251,10 @@ struct bpf_link_info { __aligned_u64 target_name; /* in/out: target_name buffer ptr */ __u32 target_name_len; /* in/out: target_name buffer len */ union { - __u32 map_id; - } map; + struct { + __u32 map_id; + } map; + }; } iter; struct { __u32 netns_ino; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0388bc0200b0..ef7af384f5ee 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4251,8 +4251,10 @@ struct bpf_link_info { __aligned_u64 target_name; /* in/out: target_name buffer ptr */ __u32 target_name_len; /* in/out: target_name buffer len */ union { - __u32 map_id; - } map; + struct { + __u32 map_id; + } map; + }; } iter; struct { __u32 netns_ino; -- cgit v1.2.3 From 7a81575b806e5dab214025e6757362c62d946405 Mon Sep 17 00:00:00 2001 From: "Jose M. Guisado Gomez" Date: Thu, 20 Aug 2020 10:19:01 +0200 Subject: netfilter: nf_tables: add userdata attributes to nft_table Enables storing userdata for nft_table. Field udata points to user data and udlen stores its length. Adds a new attribute flag, NFTA_TABLE_USERDATA. Signed-off-by: Jose M. Guisado Gomez Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 ++ include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 22 +++++++++++++++++++++- 3 files changed, 25 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index bf9491b77d16..97a7e147a59a 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1080,6 +1080,8 @@ struct nft_table { flags:8, genmask:2; char *name; + u16 udlen; + u8 *udata; }; void nft_register_chain_type(const struct nft_chain_type *); diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 42f351c1f5c5..aeb88cbd303e 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -172,6 +172,7 @@ enum nft_table_flags { * @NFTA_TABLE_NAME: name of the table (NLA_STRING) * @NFTA_TABLE_FLAGS: bitmask of enum nft_table_flags (NLA_U32) * @NFTA_TABLE_USE: number of chains in this table (NLA_U32) + * @NFTA_TABLE_USERDATA: user data (NLA_BINARY) */ enum nft_table_attributes { NFTA_TABLE_UNSPEC, @@ -180,6 +181,7 @@ enum nft_table_attributes { NFTA_TABLE_USE, NFTA_TABLE_HANDLE, NFTA_TABLE_PAD, + NFTA_TABLE_USERDATA, __NFTA_TABLE_MAX }; #define NFTA_TABLE_MAX (__NFTA_TABLE_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index fd814e514f94..6ccce2a2e715 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -650,6 +650,8 @@ static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { .len = NFT_TABLE_MAXNAMELEN - 1 }, [NFTA_TABLE_FLAGS] = { .type = NLA_U32 }, [NFTA_TABLE_HANDLE] = { .type = NLA_U64 }, + [NFTA_TABLE_USERDATA] = { .type = NLA_BINARY, + .len = NFT_USERDATA_MAXLEN } };
static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, @@ -676,6 +678,11 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, NFTA_TABLE_PAD)) goto nla_put_failure; + if (table->udata) { + if (nla_put(skb, NFTA_TABLE_USERDATA, table->udlen, table->udata)) + goto nla_put_failure; + } + nlmsg_end(skb, nlh); return 0; @@ -977,8 +984,9 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, int family = nfmsg->nfgen_family; const struct nlattr *attr; struct nft_table *table; - u32 flags = 0; struct nft_ctx ctx; + u32 flags = 0; + u16 udlen = 0; int err; lockdep_assert_held(&net->nft.commit_mutex); @@ -1014,6 +1022,16 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, if (table->name == NULL) goto err_strdup; + if (nla[NFTA_TABLE_USERDATA]) { + udlen = nla_len(nla[NFTA_TABLE_USERDATA]); + table->udata = kzalloc(udlen, GFP_KERNEL); + if (table->udata == NULL) + goto err_table_udata; + + nla_memcpy(table->udata, nla[NFTA_TABLE_USERDATA], udlen); + table->udlen = udlen; + } + err = rhltable_init(&table->chains_ht, &nft_chain_ht_params); if (err) goto err_chain_ht; @@ -1036,6 +1054,8 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, err_trans: rhltable_destroy(&table->chains_ht); err_chain_ht: + kfree(table->udata); +err_table_udata: kfree(table->name); err_strdup: kfree(table); -- cgit v1.2.3 From 4afc41dfa5a716e9e7a90c22972583f337c0bcbf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 26 Aug 2020 00:52:43 +0200 Subject: netfilter: conntrack: remove ignore stats This counter increments when nf_conntrack_in sees a packet that already has a conntrack attached or when the packet is marked as UNTRACKED. Neither is an error. The former is normal for loopback traffic. The second happens for certain ICMPv6 packets or when nftables/ip(6)tables rules are in place. In case someone needs to count UNTRACKED packets, or packets that are marked as untracked before conntrack_in this can be done with both nftables and ip(6)tables rules. 
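For anyone who relied on the removed counter, equivalent counting can be expressed in the ruleset itself; hedged examples with hypothetical table/chain names:

nft add rule inet filter input ct state untracked counter
iptables -A INPUT -m conntrack --ctstate UNTRACKED

(The iptables rule deliberately has no -j target: a rule without a target only updates its packet/byte counters.)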
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_common.h | 1 - include/uapi/linux/netfilter/nfnetlink_conntrack.h | 2 +- net/netfilter/nf_conntrack_core.c | 4 +--- net/netfilter/nf_conntrack_netlink.c | 1 - net/netfilter/nf_conntrack_standalone.c | 2 +- 5 files changed, 3 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index 1db83c931d9c..96b90d7e361f 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -8,7 +8,6 @@ struct ip_conntrack_stat { unsigned int found; unsigned int invalid; - unsigned int ignore; unsigned int insert; unsigned int insert_failed; unsigned int drop; diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h index 262881792671..3e471558da82 100644 --- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h +++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h @@ -247,7 +247,7 @@ enum ctattr_stats_cpu { CTA_STATS_FOUND, CTA_STATS_NEW, /* no longer used */ CTA_STATS_INVALID, - CTA_STATS_IGNORE, + CTA_STATS_IGNORE, /* no longer used */ CTA_STATS_DELETE, /* no longer used */ CTA_STATS_DELETE_LIST, /* no longer used */ CTA_STATS_INSERT, diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 3cfbafdff941..a111bcf1b93c 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1800,10 +1800,8 @@ nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state) if (tmpl || ctinfo == IP_CT_UNTRACKED) { /* Previously seen (loopback or untracked)? Ignore. */ if ((tmpl && !nf_ct_is_template(tmpl)) || - ctinfo == IP_CT_UNTRACKED) { - NF_CT_STAT_INC_ATOMIC(state->net, ignore); + ctinfo == IP_CT_UNTRACKED) return NF_ACCEPT; - } skb->_nfct = 0; } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 832eabecfbdd..c64f23a8f373 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2509,7 +2509,6 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq, if (nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) || nla_put_be32(skb, CTA_STATS_INVALID, htonl(st->invalid)) || - nla_put_be32(skb, CTA_STATS_IGNORE, htonl(st->ignore)) || nla_put_be32(skb, CTA_STATS_INSERT, htonl(st->insert)) || nla_put_be32(skb, CTA_STATS_INSERT_FAILED, htonl(st->insert_failed)) || diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index a604f43e3e6b..b673a03624d2 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -439,7 +439,7 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v) st->found, 0, st->invalid, - st->ignore, + 0, 0, 0, st->insert, -- cgit v1.2.3 From bc92470413f3af152db0d8f90ef3eb13f8cc417a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 26 Aug 2020 00:52:44 +0200 Subject: netfilter: conntrack: add clash resolution stat counter There is a misconception about what "insert_failed" means. We increment this even when a clash got resolved, so it might not indicate a problem. Add a dedicated counter for clash resolution and only increment insert_failed if a clash cannot be resolved. For the old /proc interface, export this in place of an older stat that got removed a while back. For ctnetlink, export this with a new attribute. 
Also correct an outdated comment that implies we add a duplicate tuple -- we only add the (unique) reply direction. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_common.h | 1 + include/uapi/linux/netfilter/nfnetlink_conntrack.h | 1 + net/netfilter/nf_conntrack_core.c | 9 +++++---- net/netfilter/nf_conntrack_netlink.c | 4 +++- net/netfilter/nf_conntrack_standalone.c | 2 +- 5 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index 96b90d7e361f..0c7d8d1e945d 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -10,6 +10,7 @@ struct ip_conntrack_stat { unsigned int invalid; unsigned int insert; unsigned int insert_failed; + unsigned int clash_resolve; unsigned int drop; unsigned int early_drop; unsigned int error; diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h index 3e471558da82..d8484be72fdc 100644 --- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h +++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h @@ -256,6 +256,7 @@ enum ctattr_stats_cpu { CTA_STATS_EARLY_DROP, CTA_STATS_ERROR, CTA_STATS_SEARCH_RESTART, + CTA_STATS_CLASH_RESOLVE, __CTA_STATS_MAX, }; #define CTA_STATS_MAX (__CTA_STATS_MAX - 1) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index a111bcf1b93c..93e77ca0efad 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -859,7 +859,6 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) out: nf_conntrack_double_unlock(hash, reply_hash); - NF_CT_STAT_INC(net, insert_failed); local_bh_enable(); return -EEXIST; } @@ -934,7 +933,7 @@ static int __nf_ct_resolve_clash(struct sk_buff *skb, nf_conntrack_put(&loser_ct->ct_general); nf_ct_set(skb, ct, ctinfo); - NF_CT_STAT_INC(net, insert_failed); + NF_CT_STAT_INC(net, clash_resolve); return NF_ACCEPT; } @@ -998,6 +997,8 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx) hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode, &nf_conntrack_hash[repl_idx]); + + NF_CT_STAT_INC(net, clash_resolve); return NF_ACCEPT; } @@ -1027,10 +1028,10 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx) * * Failing that, the new, unconfirmed conntrack is still added to the table * provided that the collision only occurs in the ORIGINAL direction. - * The new entry will be added after the existing one in the hash list, + * The new entry will be added only in the non-clashing REPLY direction, * so packets in the ORIGINAL direction will continue to match the existing * entry. The new entry will also have a fixed timeout so it expires -- - * due to the collision, it will not see bidirectional traffic. + * due to the collision, it will only see reply traffic. * * Returns NF_DROP if the clash could not be resolved. 
*/ diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index c64f23a8f373..89d99f6dfd0a 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2516,7 +2516,9 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq, nla_put_be32(skb, CTA_STATS_EARLY_DROP, htonl(st->early_drop)) || nla_put_be32(skb, CTA_STATS_ERROR, htonl(st->error)) || nla_put_be32(skb, CTA_STATS_SEARCH_RESTART, - htonl(st->search_restart))) + htonl(st->search_restart)) || + nla_put_be32(skb, CTA_STATS_CLASH_RESOLVE, + htonl(st->clash_resolve))) goto nla_put_failure; nlmsg_end(skb, nlh); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index b673a03624d2..0ff39740797d 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -435,7 +435,7 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x " "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", nr_conntracks, - 0, + st->clash_resolve, /* was: searched */ st->found, 0, st->invalid, -- cgit v1.2.3 From 1e6c62a8821557720a9b2ea9617359b264f2f67c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 27 Aug 2020 15:01:11 -0700 Subject: bpf: Introduce sleepable BPF programs Introduce sleepable BPF programs that can request this property for themselves via the BPF_F_SLEEPABLE flag at program load time. In that case they will be able to use helpers like bpf_copy_from_user() that might sleep. At present only fentry/fexit/fmod_ret and lsm programs can request to be sleepable, and only when they are attached to kernel functions that are known to allow sleeping. Non-sleepable programs rely on implicit rcu_read_lock() and migrate_disable() to protect the lifetime of the programs, the maps that they use and the per-cpu kernel structures used to pass info between bpf programs and the kernel. Sleepable programs cannot be enclosed in rcu_read_lock(), and since migrate_disable() maps to preempt_disable() in non-RT kernels, the progs should not be enclosed in migrate_disable() either. Therefore rcu_read_lock_trace is used to protect the lifetime of sleepable progs. There are many networking and tracing program types. In many cases the 'struct bpf_prog *' pointer itself is rcu protected within some other kernel data structure, and the kernel code uses rcu_dereference() to load that program pointer and call BPF_PROG_RUN() on it. None of these cases are touched. Instead, sleepable bpf programs are allowed only with bpf trampoline. The program pointers are hard-coded into the generated assembly of the bpf trampoline and synchronize_rcu_tasks_trace() is used to protect the lifetime of the program. The same trampoline can hold both sleepable and non-sleepable progs. When rcu_read_lock_trace is held it means that some sleepable bpf program is running from a bpf trampoline. Those programs can use bpf arrays and preallocated hash/lru maps; these map types wait for programs to complete via synchronize_rcu_tasks_trace(). Updates to a trampoline now have to do both synchronize_rcu_tasks_trace() and synchronize_rcu_tasks() to wait for sleepable progs to finish and for the trampoline assembly to finish. This is the first step of introducing sleepable progs. Eventually dynamically allocated hash maps can be allowed and networking program types can become sleepable too.
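A minimal, hypothetical load sequence using the raw bpf(2) syscall may help illustrate the flag; insns, insn_cnt and attach_btf_id are placeholders the caller must supply, and only BPF_F_SLEEPABLE itself comes from this patch:

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int load_sleepable_fentry(const struct bpf_insn *insns, __u32 insn_cnt,
				 __u32 attach_btf_id)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type = BPF_PROG_TYPE_TRACING;
	attr.expected_attach_type = BPF_TRACE_FENTRY;
	attr.insns = (__u64)(unsigned long)insns;
	attr.insn_cnt = insn_cnt;
	attr.license = (__u64)(unsigned long)"GPL";
	/* target must be a kernel function known to allow sleeping */
	attr.attach_btf_id = attach_btf_id;
	/* request the sleepable property at load time */
	attr.prog_flags = BPF_F_SLEEPABLE;

	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}

A program loaded this way may then call helpers that can sleep, such as bpf_copy_from_user().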
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Reviewed-by: Josef Bacik Acked-by: Andrii Nakryiko Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20200827220114.69225-3-alexei.starovoitov@gmail.com --- arch/x86/net/bpf_jit_comp.c | 32 +++++++++++------ include/linux/bpf.h | 3 ++ include/uapi/linux/bpf.h | 8 +++++ init/Kconfig | 1 + kernel/bpf/arraymap.c | 1 + kernel/bpf/hashtab.c | 12 +++---- kernel/bpf/syscall.c | 13 +++++-- kernel/bpf/trampoline.c | 28 +++++++++++++-- kernel/bpf/verifier.c | 81 ++++++++++++++++++++++++++++++++++++++++-- tools/include/uapi/linux/bpf.h | 8 +++++ 10 files changed, 162 insertions(+), 25 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 42b6709e6dc7..7d9ea7b41c71 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1379,10 +1379,15 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, u8 *prog = *pprog; int cnt = 0; - if (emit_call(&prog, __bpf_prog_enter, prog)) - return -EINVAL; - /* remember prog start time returned by __bpf_prog_enter */ - emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); + if (p->aux->sleepable) { + if (emit_call(&prog, __bpf_prog_enter_sleepable, prog)) + return -EINVAL; + } else { + if (emit_call(&prog, __bpf_prog_enter, prog)) + return -EINVAL; + /* remember prog start time returned by __bpf_prog_enter */ + emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); + } /* arg1: lea rdi, [rbp - stack_size] */ EMIT4(0x48, 0x8D, 0x7D, -stack_size); @@ -1402,13 +1407,18 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, if (mod_ret) emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); - /* arg1: mov rdi, progs[i] */ - emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, - (u32) (long) p); - /* arg2: mov rsi, rbx <- start time in nsec */ - emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); - if (emit_call(&prog, __bpf_prog_exit, prog)) - return -EINVAL; + if (p->aux->sleepable) { + if (emit_call(&prog, __bpf_prog_exit_sleepable, prog)) + return -EINVAL; + } else { + /* arg1: mov rdi, progs[i] */ + emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, + (u32) (long) p); + /* arg2: mov rsi, rbx <- start time in nsec */ + emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); + if (emit_call(&prog, __bpf_prog_exit, prog)) + return -EINVAL; + } *pprog = prog; return 0; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index dbba82a80087..4dd7e927621d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -539,6 +539,8 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, /* these two functions are called from generated trampoline */ u64 notrace __bpf_prog_enter(void); void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start); +void notrace __bpf_prog_enter_sleepable(void); +void notrace __bpf_prog_exit_sleepable(void); struct bpf_ksym { unsigned long start; @@ -734,6 +736,7 @@ struct bpf_prog_aux { bool offload_requested; bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ bool func_proto_unreliable; + bool sleepable; enum bpf_tramp_prog_type trampoline_prog_type; struct bpf_trampoline *trampoline; struct hlist_node tramp_hlist; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ef7af384f5ee..6e8b706aeb05 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -346,6 +346,14 @@ enum bpf_link_type { /* The verifier internal test flag. 
Behavior is undefined */ #define BPF_F_TEST_STATE_FREQ (1U << 3) +/* If BPF_F_SLEEPABLE is used in BPF_PROG_LOAD command, the verifier will + * restrict map and helper usage for such programs. Sleepable BPF programs can + * only be attached to hooks where kernel execution context allows sleeping. + * Such programs are allowed to use helpers that may sleep like + * bpf_copy_from_user(). + */ +#define BPF_F_SLEEPABLE (1U << 4) + /* When BPF ldimm64's insn[0].src_reg != 0 then this can have * two extensions: * diff --git a/init/Kconfig b/init/Kconfig index fc10f7ede5f6..6ecc00e130ff 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1691,6 +1691,7 @@ config BPF_SYSCALL bool "Enable bpf() system call" select BPF select IRQ_WORK + select TASKS_TRACE_RCU default n help Enable the bpf() system call that allows to manipulate eBPF diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index d851ebbcf302..e046fb7d17cd 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "map_in_map.h" diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index ad80f45774e7..fe0e06284d33 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "percpu_freelist.h" #include "bpf_lru_list.h" #include "map_in_map.h" @@ -577,8 +578,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) struct htab_elem *l; u32 hash, key_size; - /* Must be called with rcu_read_lock. */ - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; @@ -941,7 +941,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; @@ -1032,7 +1032,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; @@ -1220,7 +1220,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) u32 hash, key_size; int ret = -ENOENT; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; @@ -1252,7 +1252,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) u32 hash, key_size; int ret = -ENOENT; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b86b1155b748..4108ef3b828b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -29,6 +29,7 @@ #include #include #include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -1731,10 +1732,14 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) btf_put(prog->aux->btf); bpf_prog_free_linfo(prog); - if (deferred) - call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); - else + if (deferred) { + if (prog->aux->sleepable) + call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu); + else + call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); + } else { __bpf_prog_put_rcu(&prog->aux->rcu); + } } static void __bpf_prog_put(struct bpf_prog *prog, bool 
do_idr_lock) @@ -2104,6 +2109,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT | BPF_F_TEST_STATE_FREQ | + BPF_F_SLEEPABLE | BPF_F_TEST_RND_HI32)) return -EINVAL; @@ -2159,6 +2165,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) } prog->aux->offload_requested = !!attr->prog_ifindex; + prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE; err = security_bpf_prog_alloc(prog->aux); if (err) diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 9be85aa4ec5f..c2b76545153c 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -7,6 +7,8 @@ #include #include #include +#include +#include /* dummy _ops. The verifier will operate on target program's ops. */ const struct bpf_verifier_ops bpf_extension_verifier_ops = { @@ -210,9 +212,12 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) * updates to trampoline would change the code from underneath the * preempted task. Hence wait for tasks to voluntarily schedule or go * to userspace. + * The same trampoline can hold both sleepable and non-sleepable progs. + * synchronize_rcu_tasks_trace() is needed to make sure all sleepable + * programs finish executing. + * Wait for these two grace periods together. */ - - synchronize_rcu_tasks(); + synchronize_rcu_mult(call_rcu_tasks, call_rcu_tasks_trace); err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2, &tr->func.model, flags, tprogs, @@ -344,7 +349,14 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT]))) goto out; bpf_image_ksym_del(&tr->ksym); - /* wait for tasks to get out of trampoline before freeing it */ + /* This code will be executed when all bpf progs (both sleepable and + * non-sleepable) went through + * bpf_prog_put()->call_rcu[_tasks_trace]()->bpf_prog_free_deferred(). + * Hence no need for another synchronize_rcu_tasks_trace() here, + * but synchronize_rcu_tasks() is still needed, since trampoline + * may not have had any sleepable programs and we need to wait + * for tasks to get out of trampoline code before freeing it. 
+ */ synchronize_rcu_tasks(); bpf_jit_free_exec(tr->image); hlist_del(&tr->hlist); @@ -394,6 +406,16 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) rcu_read_unlock(); } +void notrace __bpf_prog_enter_sleepable(void) +{ + rcu_read_lock_trace(); +} + +void notrace __bpf_prog_exit_sleepable(void) +{ + rcu_read_unlock_trace(); +} + int __weak arch_prepare_bpf_trampoline(void *image, void *image_end, const struct btf_func_model *m, u32 flags, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6f5a9f51cc03..3ebfdb7bd427 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "disasm.h" @@ -9367,6 +9368,23 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, return -EINVAL; } + if (prog->aux->sleepable) + switch (map->map_type) { + case BPF_MAP_TYPE_HASH: + case BPF_MAP_TYPE_LRU_HASH: + case BPF_MAP_TYPE_ARRAY: + if (!is_preallocated_map(map)) { + verbose(env, + "Sleepable programs can only use preallocated hash maps\n"); + return -EINVAL; + } + break; + default: + verbose(env, + "Sleepable programs can only use array and hash maps\n"); + return -EINVAL; + } + return 0; } @@ -10985,6 +11003,36 @@ static int check_attach_modify_return(struct bpf_prog *prog, unsigned long addr) return -EINVAL; } +/* non exhaustive list of sleepable bpf_lsm_*() functions */ +BTF_SET_START(btf_sleepable_lsm_hooks) +#ifdef CONFIG_BPF_LSM +BTF_ID(func, bpf_lsm_file_mprotect) +BTF_ID(func, bpf_lsm_bprm_committed_creds) +#endif +BTF_SET_END(btf_sleepable_lsm_hooks) + +static int check_sleepable_lsm_hook(u32 btf_id) +{ + return btf_id_set_contains(&btf_sleepable_lsm_hooks, btf_id); +} + +/* list of non-sleepable functions that are otherwise on + * ALLOW_ERROR_INJECTION list + */ +BTF_SET_START(btf_non_sleepable_error_inject) +/* Three functions below can be called from sleepable and non-sleepable context. + * Assume non-sleepable from bpf safety point of view. + */ +BTF_ID(func, __add_to_page_cache_locked) +BTF_ID(func, should_fail_alloc_page) +BTF_ID(func, should_failslab) +BTF_SET_END(btf_non_sleepable_error_inject) + +static int check_non_sleepable_error_inject(u32 btf_id) +{ + return btf_id_set_contains(&btf_non_sleepable_error_inject, btf_id); +} + static int check_attach_btf_id(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; @@ -11002,6 +11050,12 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) long addr; u64 key; + if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING && + prog->type != BPF_PROG_TYPE_LSM) { + verbose(env, "Only fentry/fexit/fmod_ret and lsm programs can be sleepable\n"); + return -EINVAL; + } + if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) return check_struct_ops_btf_id(env); @@ -11210,13 +11264,36 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) } } - if (prog->expected_attach_type == BPF_MODIFY_RETURN) { + if (prog->aux->sleepable) { + ret = -EINVAL; + switch (prog->type) { + case BPF_PROG_TYPE_TRACING: + /* fentry/fexit/fmod_ret progs can be sleepable only if they are + * attached to ALLOW_ERROR_INJECTION and are not in denylist. + */ + if (!check_non_sleepable_error_inject(btf_id) && + within_error_injection_list(addr)) + ret = 0; + break; + case BPF_PROG_TYPE_LSM: + /* LSM progs check that they are attached to bpf_lsm_*() funcs. + * Only some of them are sleepable. 
+ */ + if (check_sleepable_lsm_hook(btf_id)) + ret = 0; + break; + default: + break; + } + if (ret) + verbose(env, "%s is not sleepable\n", + prog->aux->attach_func_name); + } else if (prog->expected_attach_type == BPF_MODIFY_RETURN) { ret = check_attach_modify_return(prog, addr); if (ret) verbose(env, "%s() is not modifiable\n", prog->aux->attach_func_name); } - if (ret) goto out; tr->func.addr = (void *)addr; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index ef7af384f5ee..6e8b706aeb05 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -346,6 +346,14 @@ enum bpf_link_type { /* The verifier internal test flag. Behavior is undefined */ #define BPF_F_TEST_STATE_FREQ (1U << 3) +/* If BPF_F_SLEEPABLE is used in BPF_PROG_LOAD command, the verifier will + * restrict map and helper usage for such programs. Sleepable BPF programs can + * only be attached to hooks where kernel execution context allows sleeping. + * Such programs are allowed to use helpers that may sleep like + * bpf_copy_from_user(). + */ +#define BPF_F_SLEEPABLE (1U << 4) + /* When BPF ldimm64's insn[0].src_reg != 0 then this can have * two extensions: * -- cgit v1.2.3 From 07be4c4a3e7a0db148e44b16c5190e753d1c8569 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 27 Aug 2020 15:01:12 -0700 Subject: bpf: Add bpf_copy_from_user() helper. Sleepable BPF programs can now use copy_from_user() to access user memory. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20200827220114.69225-4-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 8 ++++++++ kernel/bpf/helpers.c | 22 ++++++++++++++++++++++ kernel/trace/bpf_trace.c | 2 ++ tools/include/uapi/linux/bpf.h | 8 ++++++++ 5 files changed, 41 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4dd7e927621d..c6d9f2c444f4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1784,6 +1784,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; +extern const struct bpf_func_proto bpf_copy_from_user_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6e8b706aeb05..a613750d5515 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3569,6 +3569,13 @@ union bpf_attr { * On success, the strictly positive length of the string, * including the trailing NUL character. On error, a negative * value. + * + * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr) + * Description + * Read *size* bytes from user space address *user_ptr* and store + * the data in *dst*. This is a wrapper of copy_from_user(). + * Return + * 0 on success, or a negative error in case of failure. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3719,6 +3726,7 @@ union bpf_attr { FN(inode_storage_get), \ FN(inode_storage_delete), \ FN(d_path), \ + FN(copy_from_user), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index be43ab3e619f..5cc7425ee476 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -601,6 +601,28 @@ const struct bpf_func_proto bpf_event_output_data_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +BPF_CALL_3(bpf_copy_from_user, void *, dst, u32, size, + const void __user *, user_ptr) +{ + int ret = copy_from_user(dst, user_ptr, size); + + if (unlikely(ret)) { + memset(dst, 0, size); + ret = -EFAULT; + } + + return ret; +} + +const struct bpf_func_proto bpf_copy_from_user_proto = { + .func = bpf_copy_from_user, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d973d891f2e2..b2a5380eb187 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1228,6 +1228,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_jiffies64_proto; case BPF_FUNC_get_task_stack: return &bpf_get_task_stack_proto; + case BPF_FUNC_copy_from_user: + return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL; default: return NULL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 6e8b706aeb05..a613750d5515 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3569,6 +3569,13 @@ union bpf_attr { * On success, the strictly positive length of the string, * including the trailing NUL character. On error, a negative * value. + * + * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr) + * Description + * Read *size* bytes from user space address *user_ptr* and store + * the data in *dst*. This is a wrapper of copy_from_user(). + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3719,6 +3726,7 @@ union bpf_attr { FN(inode_storage_get), \ FN(inode_storage_delete), \ FN(d_path), \ + FN(copy_from_user), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 67407a406db337acdaabecd3747d160d89a929e4 Mon Sep 17 00:00:00 2001 From: Balazs Scheidler Date: Sat, 29 Aug 2020 08:19:15 +0200 Subject: netfilter: nft_socket: add wildcard support Add NFT_SOCKET_WILDCARD to match wildcard socket listeners.
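To make the semantics concrete, the sketch below creates the kind of zero-bound listener the new key reports as wildcard; a socket bound to a specific local address would make the key evaluate to 0 instead:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

static int make_wildcard_listener(unsigned short port)
{
	struct sockaddr_in addr;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;
	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);	/* 0.0.0.0 => wildcard */
	addr.sin_port = htons(port);
	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) || listen(fd, 16))
		return -1;
	return fd;
}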
Signed-off-by: Balazs Scheidler Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nft_socket.c | 27 +++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index aeb88cbd303e..543dc697b796 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1010,10 +1010,12 @@ enum nft_socket_attributes { * * @NFT_SOCKET_TRANSPARENT: Value of the IP(V6)_TRANSPARENT socket option * @NFT_SOCKET_MARK: Value of the socket mark + * @NFT_SOCKET_WILDCARD: Whether the socket is zero-bound (e.g. 0.0.0.0 or ::0) */ enum nft_socket_keys { NFT_SOCKET_TRANSPARENT, NFT_SOCKET_MARK, + NFT_SOCKET_WILDCARD, __NFT_SOCKET_MAX }; #define NFT_SOCKET_MAX (__NFT_SOCKET_MAX - 1) diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index 637ce3e8c575..a28aca5124ce 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -14,6 +14,25 @@ struct nft_socket { }; }; +static void nft_socket_wildcard(const struct nft_pktinfo *pkt, + struct nft_regs *regs, struct sock *sk, + u32 *dest) +{ + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + nft_reg_store8(dest, inet_sk(sk)->inet_rcv_saddr == 0); + break; +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) + case NFPROTO_IPV6: + nft_reg_store8(dest, ipv6_addr_any(&sk->sk_v6_rcv_saddr)); + break; +#endif + default: + regs->verdict.code = NFT_BREAK; + return; + } +} + static void nft_socket_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -59,6 +78,13 @@ static void nft_socket_eval(const struct nft_expr *expr, return; } break; + case NFT_SOCKET_WILDCARD: + if (!sk_fullsock(sk)) { + regs->verdict.code = NFT_BREAK; + return; + } + nft_socket_wildcard(pkt, regs, sk, dest); + break; default: WARN_ON(1); regs->verdict.code = NFT_BREAK; @@ -97,6 +123,7 @@ static int nft_socket_init(const struct nft_ctx *ctx, priv->key = ntohl(nla_get_u32(tb[NFTA_SOCKET_KEY])); switch(priv->key) { case NFT_SOCKET_TRANSPARENT: + case NFT_SOCKET_WILDCARD: len = sizeof(u8); break; case NFT_SOCKET_MARK: -- cgit v1.2.3 From 55977744f9d862512a524fea93fc5226b09e76a9 Mon Sep 17 00:00:00 2001 From: Mukul Joshi Date: Fri, 28 Aug 2020 18:50:42 -0400 Subject: drm/amdkfd: Add GPU reset SMI event Add support for reporting GPU reset events through SMI. KFD would report both pre and post GPU reset events. 
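A rough consumer sketch follows, assuming fd is an SMI event handle already obtained via the AMDKFD_IOC_SMI_EVENTS ioctl with the reset events enabled; per the format strings in this patch, each event arrives as a hex "event payload" text line, with the reset sequence number as the payload:

#include <stdio.h>
#include <unistd.h>

#define KFD_SMI_EVENT_GPU_PRE_RESET	3
#define KFD_SMI_EVENT_GPU_POST_RESET	4

static void watch_gpu_resets(int fd)
{
	char buf[64];
	ssize_t n;

	while ((n = read(fd, buf, sizeof(buf) - 1)) > 0) {
		unsigned int event, seq;

		buf[n] = '\0';
		if (sscanf(buf, "%x %x", &event, &seq) != 2)
			continue;
		if (event == KFD_SMI_EVENT_GPU_PRE_RESET)
			printf("GPU reset %u: starting\n", seq);
		else if (event == KFD_SMI_EVENT_GPU_POST_RESET)
			printf("GPU reset %u: done\n", seq);
	}
}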
Signed-off-by: Mukul Joshi Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 4 ++++ drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 ++ drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 34 +++++++++++++++++++++++++++-- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 1 + include/uapi/linux/kfd_ioctl.h | 2 ++ 5 files changed, 41 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index e1cd6599529f..0e71a0543f98 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -812,6 +812,8 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd) if (!kfd->init_complete) return 0; + kfd_smi_event_update_gpu_reset(kfd, false); + kfd->dqm->ops.pre_reset(kfd->dqm); kgd2kfd_suspend(kfd, false); @@ -840,6 +842,8 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd) atomic_set(&kfd->sram_ecc_flag, 0); + kfd_smi_event_update_gpu_reset(kfd, true); + return 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index f14beb93acb4..023629f28495 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -312,6 +312,8 @@ struct kfd_dev { /* Clients watching SMI events */ struct list_head smi_clients; spinlock_t smi_lock; + + uint32_t reset_seq_num; }; enum kfd_mempool { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index 4d4b6e3ab697..17d1736367ea 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -174,6 +174,36 @@ static void add_event_to_kfifo(struct kfd_dev *dev, unsigned int smi_event, rcu_read_unlock(); } +void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset) +{ + /* + * GpuReset msg = Reset seq number (incremented for + * every reset message sent before GPU reset). 
+ * 1 byte event + 1 byte space + 8 bytes seq num + + * 1 byte \n + 1 byte \0 = 12 + */ + char fifo_in[12]; + int len; + unsigned int event; + + if (list_empty(&dev->smi_clients)) + return; + + memset(fifo_in, 0x0, sizeof(fifo_in)); + + if (post_reset) { + event = KFD_SMI_EVENT_GPU_POST_RESET; + } else { + event = KFD_SMI_EVENT_GPU_PRE_RESET; + ++(dev->reset_seq_num); + } + + len = snprintf(fifo_in, sizeof(fifo_in), "%x %x\n", event, + dev->reset_seq_num); + + add_event_to_kfifo(dev, event, fifo_in, len); +} + void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev, uint32_t throttle_bitmask) { @@ -191,7 +221,7 @@ void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev, if (list_empty(&dev->smi_clients)) return; - len = snprintf(fifo_in, 29, "%x %x:%llx\n", + len = snprintf(fifo_in, sizeof(fifo_in), "%x %x:%llx\n", KFD_SMI_EVENT_THERMAL_THROTTLE, throttle_bitmask, atomic64_read(&adev->smu.throttle_int_counter)); @@ -218,7 +248,7 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid) if (!task_info.pid) return; - len = snprintf(fifo_in, 29, "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT, + len = snprintf(fifo_in, sizeof(fifo_in), "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT, task_info.pid, task_info.task_name); add_event_to_kfifo(dev, KFD_SMI_EVENT_VMFAULT, fifo_in, len); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h index 15537b2cccb5..b9b0438202e2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h @@ -27,5 +27,6 @@ int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd); void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid); void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev, uint32_t throttle_bitmask); +void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset); #endif diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index cb1f963a84e0..8b7368bfbd84 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -453,6 +453,8 @@ enum kfd_smi_event { KFD_SMI_EVENT_NONE = 0, /* not used */ KFD_SMI_EVENT_VMFAULT = 1, /* event start counting at 1 */ KFD_SMI_EVENT_THERMAL_THROTTLE = 2, + KFD_SMI_EVENT_GPU_PRE_RESET = 3, + KFD_SMI_EVENT_GPU_POST_RESET = 4, }; #define KFD_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1)) -- cgit v1.2.3 From 5dc1a0bcb758c343b873e8330ee986417f5a1727 Mon Sep 17 00:00:00 2001 From: Mukul Joshi Date: Fri, 28 Aug 2020 19:53:08 -0400 Subject: include/uapi/linux: Fix indentation in kfd_smi_event enum Replace spaces with Tabs to fix indentation in kfd_smi_event enum. 
Signed-off-by: Mukul Joshi Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- include/uapi/linux/kfd_ioctl.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 8b7368bfbd84..695b606da4b1 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -450,9 +450,9 @@ struct kfd_ioctl_import_dmabuf_args { * KFD SMI(System Management Interface) events */ enum kfd_smi_event { - KFD_SMI_EVENT_NONE = 0, /* not used */ - KFD_SMI_EVENT_VMFAULT = 1, /* event start counting at 1 */ - KFD_SMI_EVENT_THERMAL_THROTTLE = 2, + KFD_SMI_EVENT_NONE = 0, /* not used */ + KFD_SMI_EVENT_VMFAULT = 1, /* event start counting at 1 */ + KFD_SMI_EVENT_THERMAL_THROTTLE = 2, KFD_SMI_EVENT_GPU_PRE_RESET = 3, KFD_SMI_EVENT_GPU_POST_RESET = 4, }; -- cgit v1.2.3 From 4ad1b0d410c88c7c8e8fd1298c9d2293b651e35c Mon Sep 17 00:00:00 2001 From: Maheshwar Ajja Date: Sat, 23 May 2020 03:05:26 +0200 Subject: media: v4l2-ctrls: Add encoder constant quality control When V4L2_CID_MPEG_VIDEO_BITRATE_MODE value is V4L2_MPEG_VIDEO_BITRATE_MODE_CQ, encoder will produce constant quality output indicated by V4L2_CID_MPEG_VIDEO_CONSTANT_QUALITY control value. Encoder will choose appropriate quantization parameter and bitrate to produce requested frame quality level. Signed-off-by: Maheshwar Ajja Reviewed-by: Hans Verkuil Signed-off-by: Stanimir Varbanov Signed-off-by: Mauro Carvalho Chehab --- Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst | 10 ++++++++++ drivers/media/v4l2-core/v4l2-ctrls.c | 2 ++ include/uapi/linux/v4l2-controls.h | 2 ++ 3 files changed, 14 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst index e2b94b1d0ab0..ebd367dcf8b9 100644 --- a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst +++ b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst @@ -581,6 +581,8 @@ enum v4l2_mpeg_video_bitrate_mode - - Variable bitrate * - ``V4L2_MPEG_VIDEO_BITRATE_MODE_CBR`` - Constant bitrate + * - ``V4L2_MPEG_VIDEO_BITRATE_MODE_CQ`` + - Constant quality @@ -592,6 +594,14 @@ enum v4l2_mpeg_video_bitrate_mode - the average video bitrate. It is ignored if the video bitrate mode is set to constant bitrate. +``V4L2_CID_MPEG_VIDEO_CONSTANT_QUALITY (integer)`` + Constant quality level control. This control is applicable when + ``V4L2_CID_MPEG_VIDEO_BITRATE_MODE`` value is + ``V4L2_MPEG_VIDEO_BITRATE_MODE_CQ``. Valid range is 1 to 100 + where 1 indicates lowest quality and 100 indicates highest quality. + Encoder will decide the appropriate quantization parameter and + bitrate to produce requested frame quality. + ``V4L2_CID_MPEG_VIDEO_TEMPORAL_DECIMATION (integer)`` For every captured frame, skip this many subsequent frames (default 0). 
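As a sketch of how an application might drive the two controls added here (assuming fd is an open V4L2 encoder node; error handling trimmed):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/videodev2.h>

static int set_constant_quality(int fd, int quality)	/* 1..100 */
{
	struct v4l2_control ctrl;

	memset(&ctrl, 0, sizeof(ctrl));
	/* switch the rate control to constant-quality mode first */
	ctrl.id = V4L2_CID_MPEG_VIDEO_BITRATE_MODE;
	ctrl.value = V4L2_MPEG_VIDEO_BITRATE_MODE_CQ;
	if (ioctl(fd, VIDIOC_S_CTRL, &ctrl))
		return -1;

	ctrl.id = V4L2_CID_MPEG_VIDEO_CONSTANT_QUALITY;
	ctrl.value = quality;	/* 1 = lowest, 100 = highest quality */
	return ioctl(fd, VIDIOC_S_CTRL, &ctrl);
}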
diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c index b846f5b089c9..83372789eec3 100644 --- a/drivers/media/v4l2-core/v4l2-ctrls.c +++ b/drivers/media/v4l2-core/v4l2-ctrls.c @@ -200,6 +200,7 @@ const char * const *v4l2_ctrl_get_menu(u32 id) static const char * const mpeg_video_bitrate_mode[] = { "Variable Bitrate", "Constant Bitrate", + "Constant Quality", NULL }; static const char * const mpeg_stream_type[] = { @@ -832,6 +833,7 @@ const char *v4l2_ctrl_get_name(u32 id) case V4L2_CID_MPEG_VIDEO_GOP_CLOSURE: return "Video GOP Closure"; case V4L2_CID_MPEG_VIDEO_PULLDOWN: return "Video Pulldown"; case V4L2_CID_MPEG_VIDEO_BITRATE_MODE: return "Video Bitrate Mode"; + case V4L2_CID_MPEG_VIDEO_CONSTANT_QUALITY: return "Constant Quality"; case V4L2_CID_MPEG_VIDEO_BITRATE: return "Video Bitrate"; case V4L2_CID_MPEG_VIDEO_BITRATE_PEAK: return "Video Peak Bitrate"; case V4L2_CID_MPEG_VIDEO_TEMPORAL_DECIMATION: return "Video Temporal Decimation"; diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index 62271418c1be..0f7e4388dcce 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -375,6 +375,7 @@ enum v4l2_mpeg_video_aspect { enum v4l2_mpeg_video_bitrate_mode { V4L2_MPEG_VIDEO_BITRATE_MODE_VBR = 0, V4L2_MPEG_VIDEO_BITRATE_MODE_CBR = 1, + V4L2_MPEG_VIDEO_BITRATE_MODE_CQ = 2, }; #define V4L2_CID_MPEG_VIDEO_BITRATE (V4L2_CID_MPEG_BASE+207) #define V4L2_CID_MPEG_VIDEO_BITRATE_PEAK (V4L2_CID_MPEG_BASE+208) @@ -742,6 +743,7 @@ enum v4l2_cid_mpeg_video_hevc_size_of_length_field { #define V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_L6_BR (V4L2_CID_MPEG_BASE + 642) #define V4L2_CID_MPEG_VIDEO_REF_NUMBER_FOR_PFRAMES (V4L2_CID_MPEG_BASE + 643) #define V4L2_CID_MPEG_VIDEO_PREPEND_SPSPPS_TO_IDR (V4L2_CID_MPEG_BASE + 644) +#define V4L2_CID_MPEG_VIDEO_CONSTANT_QUALITY (V4L2_CID_MPEG_BASE + 645) /* MPEG-class control IDs specific to the CX2341x driver as defined by V4L2 */ #define V4L2_CID_MPEG_CX2341X_BASE (V4L2_CTRL_CLASS_MPEG | 0x1000) -- cgit v1.2.3 From 44f5b2fffc3213c919f53adddadb1a05519bdc0e Mon Sep 17 00:00:00 2001 From: Stanimir Varbanov Date: Sun, 5 Jul 2020 01:41:00 +0200 Subject: media: v4l2-ctrl: Add frame-skip std encoder control Add a standard encoder v4l2 control for frame skip. The control is a copy of a custom encoder control so that other v4l2 encoder drivers can use it. Signed-off-by: Stanimir Varbanov Reviewed-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../userspace-api/media/v4l/ext-ctrls-codec.rst | 38 ++++++++++++++++++++++ drivers/media/v4l2-core/v4l2-ctrls.c | 10 ++++++ include/uapi/linux/v4l2-controls.h | 6 ++++ 3 files changed, 54 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst index ebd367dcf8b9..750a6d4fadaf 100644 --- a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst +++ b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst @@ -602,6 +602,40 @@ enum v4l2_mpeg_video_bitrate_mode - Encoder will decide the appropriate quantization parameter and bitrate to produce requested frame quality. + +``V4L2_CID_MPEG_VIDEO_FRAME_SKIP_MODE (enum)`` + +enum v4l2_mpeg_video_frame_skip_mode - + Indicates under which conditions the encoder should skip frames. If + encoding a frame would cause the encoded stream to be larger than a + chosen data limit then the frame will be skipped. Possible values + are: + + +..
tabularcolumns:: |p{9.2cm}|p{8.3cm}| + +.. raw:: latex + + \small + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + + * - ``V4L2_MPEG_FRAME_SKIP_MODE_DISABLED`` + - Frame skip mode is disabled. + * - ``V4L2_MPEG_FRAME_SKIP_MODE_LEVEL_LIMIT`` + - Frame skip mode enabled and buffer limit is set by the chosen + level and is defined by the standard. + * - ``V4L2_MPEG_FRAME_SKIP_MODE_BUF_LIMIT`` + - Frame skip mode enabled and buffer limit is set by the + :ref:`VBV (MPEG1/2/4) ` or + :ref:`CPB (H264) buffer size ` control. + +.. raw:: latex + + \normalsize + ``V4L2_CID_MPEG_VIDEO_TEMPORAL_DECIMATION (integer)`` For every captured frame, skip this many subsequent frames (default 0). @@ -1173,6 +1207,8 @@ enum v4l2_mpeg_video_h264_entropy_mode - Quantization parameter for an B frame for MPEG4. Valid range: from 1 to 31. +.. _v4l2-mpeg-video-vbv-size: + ``V4L2_CID_MPEG_VIDEO_VBV_SIZE (integer)`` The Video Buffer Verifier size in kilobytes, it is used as a limitation of frame skip. The VBV is defined in the standard as a @@ -1210,6 +1246,8 @@ enum v4l2_mpeg_video_h264_entropy_mode - Force a key frame for the next queued buffer. Applicable to encoders. This is a general, codec-agnostic keyframe control. +.. _v4l2-mpeg-video-h264-cpb-size: + ``V4L2_CID_MPEG_VIDEO_H264_CPB_SIZE (integer)`` The Coded Picture Buffer size in kilobytes, it is used as a limitation of frame skip. The CPB is defined in the H264 standard as diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c index 83372789eec3..c138914f507b 100644 --- a/drivers/media/v4l2-core/v4l2-ctrls.c +++ b/drivers/media/v4l2-core/v4l2-ctrls.c @@ -591,6 +591,12 @@ const char * const *v4l2_ctrl_get_menu(u32 id) "External", NULL, }; + static const char * const mpeg_video_frame_skip[] = { + "Disabled", + "Level Limit", + "VBV/CPB Limit", + NULL, + }; switch (id) { case V4L2_CID_MPEG_AUDIO_SAMPLING_FREQ: @@ -652,6 +658,8 @@ const char * const *v4l2_ctrl_get_menu(u32 id) return flash_strobe_source; case V4L2_CID_MPEG_VIDEO_HEADER_MODE: return header_mode; + case V4L2_CID_MPEG_VIDEO_FRAME_SKIP_MODE: + return mpeg_video_frame_skip; case V4L2_CID_MPEG_VIDEO_MULTI_SLICE_MODE: return multi_slice; case V4L2_CID_MPEG_VIDEO_H264_ENTROPY_MODE: @@ -846,6 +854,7 @@ const char *v4l2_ctrl_get_name(u32 id) case V4L2_CID_MPEG_VIDEO_MB_RC_ENABLE: return "H264 MB Level Rate Control"; case V4L2_CID_MPEG_VIDEO_HEADER_MODE: return "Sequence Header Mode"; case V4L2_CID_MPEG_VIDEO_MAX_REF_PIC: return "Max Number of Reference Pics"; + case V4L2_CID_MPEG_VIDEO_FRAME_SKIP_MODE: return "Frame Skip Mode"; case V4L2_CID_MPEG_VIDEO_H263_I_FRAME_QP: return "H263 I-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_H263_P_FRAME_QP: return "H263 P-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_H263_B_FRAME_QP: return "H263 B-Frame QP Value"; @@ -1268,6 +1277,7 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type, case V4L2_CID_FLASH_LED_MODE: case V4L2_CID_FLASH_STROBE_SOURCE: case V4L2_CID_MPEG_VIDEO_HEADER_MODE: + case V4L2_CID_MPEG_VIDEO_FRAME_SKIP_MODE: case V4L2_CID_MPEG_VIDEO_MULTI_SLICE_MODE: case V4L2_CID_MPEG_VIDEO_H264_ENTROPY_MODE: case V4L2_CID_MPEG_VIDEO_H264_LEVEL: diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index 0f7e4388dcce..053827cda8e6 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -744,6 +744,12 @@ enum v4l2_cid_mpeg_video_hevc_size_of_length_field { #define V4L2_CID_MPEG_VIDEO_REF_NUMBER_FOR_PFRAMES (V4L2_CID_MPEG_BASE + 643) 
#define V4L2_CID_MPEG_VIDEO_PREPEND_SPSPPS_TO_IDR (V4L2_CID_MPEG_BASE + 644) #define V4L2_CID_MPEG_VIDEO_CONSTANT_QUALITY (V4L2_CID_MPEG_BASE + 645) +#define V4L2_CID_MPEG_VIDEO_FRAME_SKIP_MODE (V4L2_CID_MPEG_BASE + 646) +enum v4l2_mpeg_video_frame_skip_mode { + V4L2_MPEG_VIDEO_FRAME_SKIP_MODE_DISABLED = 0, + V4L2_MPEG_VIDEO_FRAME_SKIP_MODE_LEVEL_LIMIT = 1, + V4L2_MPEG_VIDEO_FRAME_SKIP_MODE_BUF_LIMIT = 2, +}; /* MPEG-class control IDs specific to the CX2341x driver as defined by V4L2 */ #define V4L2_CID_MPEG_CX2341X_BASE (V4L2_CTRL_CLASS_MPEG | 0x1000) -- cgit v1.2.3 From 9d3a39a5f1e45827b008fff1ee9cf3cac3409665 Mon Sep 17 00:00:00 2001 From: Khazhismel Kumykov Date: Mon, 24 Aug 2020 15:10:34 -0700 Subject: block: grant IOPRIO_CLASS_RT to CAP_SYS_NICE CAP_SYS_ADMIN is too broad, and ionice fits into CAP_SYS_NICE's grouping. Retain CAP_SYS_ADMIN permission for backwards compatibility. Signed-off-by: Khazhismel Kumykov Reviewed-by: Bart Van Assche Acked-by: Serge Hallyn Signed-off-by: Jens Axboe --- block/ioprio.c | 2 +- include/uapi/linux/capability.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/block/ioprio.c b/block/ioprio.c index 04ebd37966f1..364d2294ba90 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -69,7 +69,7 @@ int ioprio_check_cap(int ioprio) switch (class) { case IOPRIO_CLASS_RT: - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_NICE) && !capable(CAP_SYS_ADMIN)) return -EPERM; fallthrough; /* rt has prio field too */ diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index 395dd0df8d08..c6ca33034147 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -288,6 +288,8 @@ struct vfs_ns_cap_data { processes and setting the scheduling algorithm used by another process. */ /* Allow setting cpu affinity on other processes */ +/* Allow setting realtime ioprio class */ +/* Allow setting ioprio class on other processes */ #define CAP_SYS_NICE 23 -- cgit v1.2.3 From c1077616142907bb6ee987ecd136d6857ffd8787 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 1 Sep 2020 15:10:08 -0700 Subject: ip: expose inet sockopts through inet_diag Expose all existing inet sockopt bits through inet_diag for debugging purposes. Corresponding changes in iproute2 ss will be submitted to output all these values. Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Signed-off-by: Mahesh Bandewar Signed-off-by: David S.
Miller --- include/linux/inet_diag.h | 2 ++ include/uapi/linux/inet_diag.h | 18 ++++++++++++++++++ net/ipv4/inet_diag.c | 17 +++++++++++++++++ 3 files changed, 37 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index 0ef2d800fda7..84abb30a3fbb 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -75,6 +75,8 @@ static inline size_t inet_diag_msg_attrs_size(void) #ifdef CONFIG_SOCK_CGROUP_DATA + nla_total_size_64bit(sizeof(u64)) /* INET_DIAG_CGROUP_ID */ #endif + + nla_total_size(sizeof(struct inet_diag_sockopt)) + /* INET_DIAG_SOCKOPT */ ; } int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 5ba122c1949a..20ee93f0f876 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -160,6 +160,7 @@ enum { INET_DIAG_ULP_INFO, INET_DIAG_SK_BPF_STORAGES, INET_DIAG_CGROUP_ID, + INET_DIAG_SOCKOPT, __INET_DIAG_MAX, }; @@ -183,6 +184,23 @@ struct inet_diag_meminfo { __u32 idiag_tmem; }; +/* INET_DIAG_SOCKOPT */ + +struct inet_diag_sockopt { + __u8 recverr:1, + is_icsk:1, + freebind:1, + hdrincl:1, + mc_loop:1, + transparent:1, + mc_all:1, + nodefrag:1; + __u8 bind_address_no_port:1, + recverr_rfc4884:1, + defer_connect:1, + unused:5; +}; + /* INET_DIAG_VEGASINFO */ struct tcpvegas_info { diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 4a98dd736270..93816d47e55a 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -125,6 +125,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, bool net_admin) { const struct inet_sock *inet = inet_sk(sk); + struct inet_diag_sockopt inet_sockopt; if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown)) goto errout; @@ -180,6 +181,22 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); r->idiag_inode = sock_i_ino(sk); + memset(&inet_sockopt, 0, sizeof(inet_sockopt)); + inet_sockopt.recverr = inet->recverr; + inet_sockopt.is_icsk = inet->is_icsk; + inet_sockopt.freebind = inet->freebind; + inet_sockopt.hdrincl = inet->hdrincl; + inet_sockopt.mc_loop = inet->mc_loop; + inet_sockopt.transparent = inet->transparent; + inet_sockopt.mc_all = inet->mc_all; + inet_sockopt.nodefrag = inet->nodefrag; + inet_sockopt.bind_address_no_port = inet->bind_address_no_port; + inet_sockopt.recverr_rfc4884 = inet->recverr_rfc4884; + inet_sockopt.defer_connect = inet->defer_connect; + if (nla_put(skb, INET_DIAG_SOCKOPT, sizeof(inet_sockopt), + &inet_sockopt)) + goto errout; + return 0; errout: return 1; -- cgit v1.2.3 From 6da73d15258a1e5e86d03d4ffba8776d17a8a287 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 2 Sep 2020 12:21:27 +0200 Subject: pidfd: support PIDFD_NONBLOCK in pidfd_open() Introduce PIDFD_NONBLOCK to support non-blocking pidfd file descriptors. Ever since the introduction of pidfds and more advanced async I/O, various programming languages such as Rust have grown support for async event libraries. These libraries are created to help build epoll-based event loops around file descriptors. A common pattern is to automatically set all file descriptors they manage to O_NONBLOCK. For such libraries the EAGAIN error code is treated specially. When a function is called that returns EAGAIN, the function isn't called again until the event loop indicates that the file descriptor is ready.
Supporting EAGAIN when waiting on pidfds makes such libraries just work with little effort. In the following patch we will extend waitid() internally to support non-blocking pidfds. This introduces a new flag PIDFD_NONBLOCK that is equivalent to O_NONBLOCK. This follows the same patterns we have for other (anon inode) file descriptors such as EFD_NONBLOCK, IN_NONBLOCK, SFD_NONBLOCK, TFD_NONBLOCK and the same for close-on-exec flags. Suggested-by: Josh Triplett Signed-off-by: Christian Brauner Reviewed-by: Josh Triplett Reviewed-by: Oleg Nesterov Cc: Kees Cook Cc: Sargun Dhillon Cc: Oleg Nesterov Link: https://lore.kernel.org/lkml/20200811181236.GA18763@localhost/ Link: https://github.com/joshtriplett/async-pidfd Link: https://lore.kernel.org/r/20200902102130.147672-2-christian.brauner@ubuntu.com --- include/uapi/linux/pidfd.h | 12 ++++++++++++ kernel/pid.c | 12 +++++++----- 2 files changed, 19 insertions(+), 5 deletions(-) create mode 100644 include/uapi/linux/pidfd.h (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h new file mode 100644 index 000000000000..5406fbc13074 --- /dev/null +++ b/include/uapi/linux/pidfd.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ + +#ifndef _UAPI_LINUX_PIDFD_H +#define _UAPI_LINUX_PIDFD_H + +#include +#include + +/* Flags for pidfd_open(). */ +#define PIDFD_NONBLOCK O_NONBLOCK + +#endif /* _UAPI_LINUX_PIDFD_H */ diff --git a/kernel/pid.c b/kernel/pid.c index b2562a7ce525..74ddbff1a6ba 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -43,6 +43,7 @@ #include #include #include +#include struct pid init_struct_pid = { .count = REFCOUNT_INIT(1), @@ -522,7 +523,8 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) /** * pidfd_create() - Create a new pid file descriptor. * - * @pid: struct pid that the pidfd will reference + * @pid: struct pid that the pidfd will reference + * @flags: flags to pass * * This creates a new pid file descriptor with the O_CLOEXEC flag set. * @@ -532,12 +534,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) * Return: On success, a cloexec pidfd is returned. * On error, a negative errno number will be returned. */ -static int pidfd_create(struct pid *pid) +static int pidfd_create(struct pid *pid, unsigned int flags) { int fd; fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), - O_RDWR | O_CLOEXEC); + flags | O_RDWR | O_CLOEXEC); if (fd < 0) put_pid(pid); @@ -565,7 +567,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) int fd; struct pid *p; - if (flags) + if (flags & ~PIDFD_NONBLOCK) return -EINVAL; if (pid <= 0) @@ -576,7 +578,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) return -ESRCH; if (pid_has_task(p, PIDTYPE_TGID)) - fd = pidfd_create(p); + fd = pidfd_create(p, flags); else fd = -EINVAL; -- cgit v1.2.3 From 1c101da8b971a36695319dce7a24711dc567a0dd Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 27 Nov 2019 10:30:15 +0000 Subject: arm64: mte: Allow user control of the tag check mode via prctl() By default, even if PROT_MTE is set on a memory range, there is no tag check fault reporting (SIGSEGV). 
Introduce a set of options to the existing prctl(PR_SET_TAGGED_ADDR_CTRL) to allow user control of the tag check fault mode: PR_MTE_TCF_NONE - no reporting (default) PR_MTE_TCF_SYNC - synchronous tag check fault reporting PR_MTE_TCF_ASYNC - asynchronous tag check fault reporting These options translate into the corresponding SCTLR_EL1.TCF0 bitfield, context-switched by the kernel. Note that kernel accesses to the user address space (e.g. the read() system call) are not checked if the user thread tag checking mode is PR_MTE_TCF_NONE or PR_MTE_TCF_ASYNC. If the tag checking mode is PR_MTE_TCF_SYNC, the kernel makes a best effort to check its user address accesses; however, it cannot always guarantee this. Signed-off-by: Catalin Marinas Cc: Will Deacon --- arch/arm64/include/asm/mte.h | 14 +++++++ arch/arm64/include/asm/processor.h | 3 ++ arch/arm64/kernel/mte.c | 77 ++++++++++++++++++++++++++++++++++++++ arch/arm64/kernel/process.c | 26 +++++++++++-- include/uapi/linux/prctl.h | 6 +++ 5 files changed, 123 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h index b2577eee62c2..df2efbc9f8f1 100644 --- a/arch/arm64/include/asm/mte.h +++ b/arch/arm64/include/asm/mte.h @@ -21,6 +21,9 @@ void mte_clear_page_tags(void *addr); void mte_sync_tags(pte_t *ptep, pte_t pte); void mte_copy_page_tags(void *kto, const void *kfrom); void flush_mte_state(void); +void mte_thread_switch(struct task_struct *next); +long set_mte_ctrl(unsigned long arg); +long get_mte_ctrl(void); #else @@ -36,6 +39,17 @@ static inline void mte_copy_page_tags(void *kto, const void *kfrom) static inline void flush_mte_state(void) { } +static inline void mte_thread_switch(struct task_struct *next) +{ +} +static inline long set_mte_ctrl(unsigned long arg) +{ + return 0; +} +static inline long get_mte_ctrl(void) +{ + return 0; +} #endif diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 240fe5e5b720..80e7f0573309 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -151,6 +151,9 @@ struct thread_struct { struct ptrauth_keys_user keys_user; struct ptrauth_keys_kernel keys_kernel; #endif +#ifdef CONFIG_ARM64_MTE + u64 sctlr_tcf0; +#endif }; static inline void arch_thread_struct_whitelist(unsigned long *offset, diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 5f54fd140610..375483a1f573 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -5,6 +5,8 @@ #include #include +#include +#include #include #include @@ -49,6 +51,26 @@ int memcmp_pages(struct page *page1, struct page *page2) return ret; } +static void update_sctlr_el1_tcf0(u64 tcf0) +{ + /* ISB required for the kernel uaccess routines */ + sysreg_clear_set(sctlr_el1, SCTLR_EL1_TCF0_MASK, tcf0); + isb(); +} + +static void set_sctlr_el1_tcf0(u64 tcf0) +{ + /* + * mte_thread_switch() checks current->thread.sctlr_tcf0 as an + * optimisation. Disable preemption so that it does not see + * the variable update before the SCTLR_EL1.TCF0 one.
+ */ + preempt_disable(); + current->thread.sctlr_tcf0 = tcf0; + update_sctlr_el1_tcf0(tcf0); + preempt_enable(); +} + void flush_mte_state(void) { if (!system_supports_mte()) @@ -58,4 +80,59 @@ void flush_mte_state(void) dsb(ish); write_sysreg_s(0, SYS_TFSRE0_EL1); clear_thread_flag(TIF_MTE_ASYNC_FAULT); + /* disable tag checking */ + set_sctlr_el1_tcf0(SCTLR_EL1_TCF0_NONE); +} + +void mte_thread_switch(struct task_struct *next) +{ + if (!system_supports_mte()) + return; + + /* avoid expensive SCTLR_EL1 accesses if no change */ + if (current->thread.sctlr_tcf0 != next->thread.sctlr_tcf0) + update_sctlr_el1_tcf0(next->thread.sctlr_tcf0); +} + +long set_mte_ctrl(unsigned long arg) +{ + u64 tcf0; + + if (!system_supports_mte()) + return 0; + + switch (arg & PR_MTE_TCF_MASK) { + case PR_MTE_TCF_NONE: + tcf0 = SCTLR_EL1_TCF0_NONE; + break; + case PR_MTE_TCF_SYNC: + tcf0 = SCTLR_EL1_TCF0_SYNC; + break; + case PR_MTE_TCF_ASYNC: + tcf0 = SCTLR_EL1_TCF0_ASYNC; + break; + default: + return -EINVAL; + } + + set_sctlr_el1_tcf0(tcf0); + + return 0; +} + +long get_mte_ctrl(void) +{ + if (!system_supports_mte()) + return 0; + + switch (current->thread.sctlr_tcf0) { + case SCTLR_EL1_TCF0_NONE: + return PR_MTE_TCF_NONE; + case SCTLR_EL1_TCF0_SYNC: + return PR_MTE_TCF_SYNC; + case SCTLR_EL1_TCF0_ASYNC: + return PR_MTE_TCF_ASYNC; + } + + return 0; } diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index a49028efab68..bb759b88d44a 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -577,6 +577,13 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev, */ dsb(ish); + /* + * MTE thread switching must happen after the DSB above to ensure that + * any asynchronous tag check faults have been logged in the TFSR*_EL1 + * registers. 
+ */ + mte_thread_switch(next); + /* the actual thread switch */ last = cpu_switch_to(prev, next); @@ -636,9 +643,15 @@ static unsigned int tagged_addr_disabled; long set_tagged_addr_ctrl(unsigned long arg) { + unsigned long valid_mask = PR_TAGGED_ADDR_ENABLE; + if (is_compat_task()) return -EINVAL; - if (arg & ~PR_TAGGED_ADDR_ENABLE) + + if (system_supports_mte()) + valid_mask |= PR_MTE_TCF_MASK; + + if (arg & ~valid_mask) return -EINVAL; /* @@ -648,6 +661,9 @@ long set_tagged_addr_ctrl(unsigned long arg) if (arg & PR_TAGGED_ADDR_ENABLE && tagged_addr_disabled) return -EINVAL; + if (set_mte_ctrl(arg) != 0) + return -EINVAL; + update_thread_flag(TIF_TAGGED_ADDR, arg & PR_TAGGED_ADDR_ENABLE); return 0; @@ -655,13 +671,17 @@ long set_tagged_addr_ctrl(unsigned long arg) long get_tagged_addr_ctrl(void) { + long ret = 0; + if (is_compat_task()) return -EINVAL; if (test_thread_flag(TIF_TAGGED_ADDR)) - return PR_TAGGED_ADDR_ENABLE; + ret = PR_TAGGED_ADDR_ENABLE; - return 0; + ret |= get_mte_ctrl(); + + return ret; } /* diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 07b4f8131e36..2390ab324afa 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -233,6 +233,12 @@ struct prctl_mm_map { #define PR_SET_TAGGED_ADDR_CTRL 55 #define PR_GET_TAGGED_ADDR_CTRL 56 # define PR_TAGGED_ADDR_ENABLE (1UL << 0) +/* MTE tag check fault modes */ +# define PR_MTE_TCF_SHIFT 1 +# define PR_MTE_TCF_NONE (0UL << PR_MTE_TCF_SHIFT) +# define PR_MTE_TCF_SYNC (1UL << PR_MTE_TCF_SHIFT) +# define PR_MTE_TCF_ASYNC (2UL << PR_MTE_TCF_SHIFT) +# define PR_MTE_TCF_MASK (3UL << PR_MTE_TCF_SHIFT) /* Control reclaim behavior when allocating memory */ #define PR_SET_IO_FLUSHER 57 -- cgit v1.2.3 From af5ce95282dc99d08a27a407a02c763dde1c5558 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 10 Dec 2019 11:19:15 +0000 Subject: arm64: mte: Allow user control of the generated random tags via prctl() The IRG, ADDG and SUBG instructions insert a random tag in the resulting address. Certain tags can be excluded via the GCR_EL1.Exclude bitmap when, for example, the user wants a certain colour for freed buffers. Since the GCR_EL1 register is not accessible at EL0, extend the prctl(PR_SET_TAGGED_ADDR_CTRL) interface to include a 16-bit field in the first argument for controlling which tags can be generated by the above instruction (an include rather than exclude mask). Note that by default all non-zero tags are excluded. This setting is per-thread. 
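Taken together with the previous patch, a thread could opt in to synchronous tag check faults and a non-zero tag set roughly as follows; this is a sketch, and the fallback defines simply mirror the uapi values added above for pre-MTE headers:

#include <sys/prctl.h>

#ifndef PR_SET_TAGGED_ADDR_CTRL
# define PR_SET_TAGGED_ADDR_CTRL	55
# define PR_TAGGED_ADDR_ENABLE		(1UL << 0)
#endif
#ifndef PR_MTE_TCF_SHIFT
# define PR_MTE_TCF_SHIFT	1
# define PR_MTE_TCF_SYNC	(1UL << PR_MTE_TCF_SHIFT)
# define PR_MTE_TAG_SHIFT	3
#endif

static int enable_mte_sync_faults(void)
{
	/* include tags 1..15 in IRG/ADDG/SUBG generation, keep tag 0 out */
	unsigned long ctrl = PR_TAGGED_ADDR_ENABLE | PR_MTE_TCF_SYNC |
			     (0xfffeUL << PR_MTE_TAG_SHIFT);

	return prctl(PR_SET_TAGGED_ADDR_CTRL, ctrl, 0, 0, 0);
}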
Signed-off-by: Catalin Marinas Cc: Will Deacon --- arch/arm64/include/asm/processor.h | 1 + arch/arm64/include/asm/sysreg.h | 7 +++++++ arch/arm64/kernel/mte.c | 35 ++++++++++++++++++++++++++++++++--- arch/arm64/kernel/process.c | 2 +- include/uapi/linux/prctl.h | 3 +++ 5 files changed, 44 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 80e7f0573309..e1b1c2a6086e 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -153,6 +153,7 @@ struct thread_struct { #endif #ifdef CONFIG_ARM64_MTE u64 sctlr_tcf0; + u64 gcr_user_incl; #endif }; diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index daf030a05de0..52eefe2f7d95 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -1078,6 +1078,13 @@ write_sysreg(__scs_new, sysreg); \ } while (0) +#define sysreg_clear_set_s(sysreg, clear, set) do { \ + u64 __scs_val = read_sysreg_s(sysreg); \ + u64 __scs_new = (__scs_val & ~(u64)(clear)) | (set); \ + if (__scs_new != __scs_val) \ + write_sysreg_s(__scs_new, sysreg); \ +} while (0) + #endif #endif /* __ASM_SYSREG_H */ diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 375483a1f573..07798b8d5039 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -71,6 +71,25 @@ static void set_sctlr_el1_tcf0(u64 tcf0) preempt_enable(); } +static void update_gcr_el1_excl(u64 incl) +{ + u64 excl = ~incl & SYS_GCR_EL1_EXCL_MASK; + + /* + * Note that 'incl' is an include mask (controlled by the user via + * prctl()) while GCR_EL1 accepts an exclude mask. + * No need for ISB since this only affects EL0 currently, implicit + * with ERET. + */ + sysreg_clear_set_s(SYS_GCR_EL1, SYS_GCR_EL1_EXCL_MASK, excl); +} + +static void set_gcr_el1_excl(u64 incl) +{ + current->thread.gcr_user_incl = incl; + update_gcr_el1_excl(incl); +} + void flush_mte_state(void) { if (!system_supports_mte()) @@ -82,6 +101,8 @@ void flush_mte_state(void) clear_thread_flag(TIF_MTE_ASYNC_FAULT); /* disable tag checking */ set_sctlr_el1_tcf0(SCTLR_EL1_TCF0_NONE); + /* reset tag generation mask */ + set_gcr_el1_excl(0); } void mte_thread_switch(struct task_struct *next) @@ -92,6 +113,7 @@ void mte_thread_switch(struct task_struct *next) /* avoid expensive SCTLR_EL1 accesses if no change */ if (current->thread.sctlr_tcf0 != next->thread.sctlr_tcf0) update_sctlr_el1_tcf0(next->thread.sctlr_tcf0); + update_gcr_el1_excl(next->thread.gcr_user_incl); } long set_mte_ctrl(unsigned long arg) @@ -116,23 +138,30 @@ long set_mte_ctrl(unsigned long arg) } set_sctlr_el1_tcf0(tcf0); + set_gcr_el1_excl((arg & PR_MTE_TAG_MASK) >> PR_MTE_TAG_SHIFT); return 0; } long get_mte_ctrl(void) { + unsigned long ret; + if (!system_supports_mte()) return 0; + ret = current->thread.gcr_user_incl << PR_MTE_TAG_SHIFT; + switch (current->thread.sctlr_tcf0) { case SCTLR_EL1_TCF0_NONE: return PR_MTE_TCF_NONE; case SCTLR_EL1_TCF0_SYNC: - return PR_MTE_TCF_SYNC; + ret |= PR_MTE_TCF_SYNC; + break; case SCTLR_EL1_TCF0_ASYNC: - return PR_MTE_TCF_ASYNC; + ret |= PR_MTE_TCF_ASYNC; + break; } - return 0; + return ret; } diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index bb759b88d44a..c80383f30d6a 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -649,7 +649,7 @@ long set_tagged_addr_ctrl(unsigned long arg) return -EINVAL; if (system_supports_mte()) - valid_mask |= PR_MTE_TCF_MASK; + valid_mask |= PR_MTE_TCF_MASK 
| PR_MTE_TAG_MASK; if (arg & ~valid_mask) return -EINVAL; diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 2390ab324afa..7f0827705c9a 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -239,6 +239,9 @@ struct prctl_mm_map { # define PR_MTE_TCF_SYNC (1UL << PR_MTE_TCF_SHIFT) # define PR_MTE_TCF_ASYNC (2UL << PR_MTE_TCF_SHIFT) # define PR_MTE_TCF_MASK (3UL << PR_MTE_TCF_SHIFT) +/* MTE tag inclusion mask */ +# define PR_MTE_TAG_SHIFT 3 +# define PR_MTE_TAG_MASK (0xffffUL << PR_MTE_TAG_SHIFT) /* Control reclaim behavior when allocating memory */ #define PR_SET_IO_FLUSHER 57 -- cgit v1.2.3 From 2200aa7154cb7ef76bac93e98326883ba64bfa2e Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 3 Jul 2020 15:12:57 +0100 Subject: arm64: mte: ptrace: Add NT_ARM_TAGGED_ADDR_CTRL regset This regset allows read/write access to a ptraced process prctl(PR_SET_TAGGED_ADDR_CTRL) setting. Signed-off-by: Catalin Marinas Cc: Will Deacon Cc: Alan Hayward Cc: Luis Machado Cc: Omair Javaid --- arch/arm64/kernel/ptrace.c | 42 ++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/elf.h | 1 + 2 files changed, 43 insertions(+) (limited to 'include/uapi/linux') diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 101040a37d40..f49b349e16a3 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -1033,6 +1033,35 @@ static int pac_generic_keys_set(struct task_struct *target, #endif /* CONFIG_CHECKPOINT_RESTORE */ #endif /* CONFIG_ARM64_PTR_AUTH */ +#ifdef CONFIG_ARM64_TAGGED_ADDR_ABI +static int tagged_addr_ctrl_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + long ctrl = get_tagged_addr_ctrl(target); + + if (IS_ERR_VALUE(ctrl)) + return ctrl; + + return membuf_write(&to, &ctrl, sizeof(ctrl)); +} + +static int tagged_addr_ctrl_set(struct task_struct *target, const struct + user_regset *regset, unsigned int pos, + unsigned int count, const void *kbuf, const + void __user *ubuf) +{ + int ret; + long ctrl; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ctrl, 0, -1); + if (ret) + return ret; + + return set_tagged_addr_ctrl(target, ctrl); +} +#endif + enum aarch64_regset { REGSET_GPR, REGSET_FPR, @@ -1052,6 +1081,9 @@ enum aarch64_regset { REGSET_PACG_KEYS, #endif #endif +#ifdef CONFIG_ARM64_TAGGED_ADDR_ABI + REGSET_TAGGED_ADDR_CTRL, +#endif }; static const struct user_regset aarch64_regsets[] = { @@ -1149,6 +1181,16 @@ static const struct user_regset aarch64_regsets[] = { }, #endif #endif +#ifdef CONFIG_ARM64_TAGGED_ADDR_ABI + [REGSET_TAGGED_ADDR_CTRL] = { + .core_note_type = NT_ARM_TAGGED_ADDR_CTRL, + .n = 1, + .size = sizeof(long), + .align = sizeof(long), + .regset_get = tagged_addr_ctrl_get, + .set = tagged_addr_ctrl_set, + }, +#endif }; static const struct user_regset_view user_aarch64_view = { diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index 22220945a5fd..30f68b42eeb5 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -425,6 +425,7 @@ typedef struct elf64_shdr { #define NT_ARM_PAC_MASK 0x406 /* ARM pointer authentication code masks */ #define NT_ARM_PACA_KEYS 0x407 /* ARM pointer authentication address keys */ #define NT_ARM_PACG_KEYS 0x408 /* ARM pointer authentication generic key */ +#define NT_ARM_TAGGED_ADDR_CTRL 0x409 /* arm64 tagged address control (prctl()) */ #define NT_ARC_V2 0x600 /* ARCv2 accumulator/extra registers */ #define NT_VMCOREDD 0x700 /* Vmcore Device Dump Note */ #define NT_MIPS_DSP 0x800 /* MIPS DSP 
ASE registers */ -- cgit v1.2.3 From 16270a92355722e387e9ca19627c5a4d7bae1354 Mon Sep 17 00:00:00 2001 From: Hou Zhiqiang Date: Tue, 18 Aug 2020 17:27:46 +0800 Subject: PCI: designware-ep: Fix the Header Type check The current check will result in the multiple function device failing to initialize. So fix the check by masking out the multiple function bit. Link: https://lore.kernel.org/r/20200818092746.24366-1-Zhiqiang.Hou@nxp.com Fixes: 0b24134f7888 ("PCI: dwc: Add validation that PCIe core is set to correct mode") Signed-off-by: Hou Zhiqiang Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring --- drivers/pci/controller/dwc/pcie-designware-ep.c | 3 ++- include/uapi/linux/pci_regs.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c index 305bfec2424d..29f5c616c3bc 100644 --- a/drivers/pci/controller/dwc/pcie-designware-ep.c +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c @@ -505,7 +505,8 @@ int dw_pcie_ep_init_complete(struct dw_pcie_ep *ep) u32 reg; int i; - hdr_type = dw_pcie_readb_dbi(pci, PCI_HEADER_TYPE); + hdr_type = dw_pcie_readb_dbi(pci, PCI_HEADER_TYPE) & + PCI_HEADER_TYPE_MASK; if (hdr_type != PCI_HEADER_TYPE_NORMAL) { dev_err(pci->dev, "PCIe controller is not set to EP mode (hdr_type:0x%x)!\n", diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index f9701410d3b5..57a222014cd2 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -76,6 +76,7 @@ #define PCI_CACHE_LINE_SIZE 0x0c /* 8 bits */ #define PCI_LATENCY_TIMER 0x0d /* 8 bits */ #define PCI_HEADER_TYPE 0x0e /* 8 bits */ +#define PCI_HEADER_TYPE_MASK 0x7f #define PCI_HEADER_TYPE_NORMAL 0 #define PCI_HEADER_TYPE_BRIDGE 1 #define PCI_HEADER_TYPE_CARDBUS 2 -- cgit v1.2.3 From 938c3efd9e650ca343d04e70d11a17c64119e17c Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 4 Sep 2020 17:14:53 +0100 Subject: bpf: Fix formatting in documentation for BPF helpers Fix a formatting error in the description of bpf_load_hdr_opt() (rst2man complains about a wrong indentation, but what is missing is actually a blank line before the bullet list). Fix and harmonise the formatting for other helpers. Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200904161454.31135-3-quentin@isovalent.com --- include/uapi/linux/bpf.h | 87 +++++++++++++++++++++++++----------------------- 1 file changed, 45 insertions(+), 42 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8dda13880957..90359cab501d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3349,38 +3349,38 @@ union bpf_attr { * Description * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * struct tcp_sock *bpf_skc_to_tcp_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *tcp_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * struct tcp_timewait_sock *bpf_skc_to_tcp_timewait_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise.
* * struct tcp_request_sock *bpf_skc_to_tcp_request_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * struct udp6_sock *bpf_skc_to_udp6_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags) * Description * Return a user or a kernel stack in bpf program provided buffer. * To achieve this, the helper needs *task*, which is a valid - * pointer to struct task_struct. To store the stacktrace, the - * bpf program provides *buf* with a nonnegative *size*. + * pointer to **struct task_struct**. To store the stacktrace, the + * bpf program provides *buf* with a nonnegative *size*. * * The last argument, *flags*, holds the number of stack frames to * skip (from 0 to 255), masked with @@ -3410,12 +3410,12 @@ union bpf_attr { * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags) * Description * Load header option. Support reading a particular TCP header - * option for bpf program (BPF_PROG_TYPE_SOCK_OPS). + * option for bpf program (**BPF_PROG_TYPE_SOCK_OPS**). * * If *flags* is 0, it will search the option from the - * sock_ops->skb_data. The comment in "struct bpf_sock_ops" + * *skops*\ **->skb_data**. The comment in **struct bpf_sock_ops** * has details on what skb_data contains under different - * sock_ops->op. + * *skops*\ **->op**. * * The first byte of the *searchby_res* specifies the * kind that it wants to search. @@ -3435,7 +3435,7 @@ union bpf_attr { * [ 254, 4, 0xeB, 0x9F, 0, 0, .... 0 ]. * * To search for the standard window scale option (3), - * the searchby_res should be [ 3, 0, 0, .... 0 ]. + * the *searchby_res* should be [ 3, 0, 0, .... 0 ]. * Note, kind-length must be 0 for regular option. * * Searching for No-Op (0) and End-of-Option-List (1) are @@ -3445,27 +3445,30 @@ union bpf_attr { * of a header option. * * Supported flags: + * * * **BPF_LOAD_HDR_OPT_TCP_SYN** to search from the * saved_syn packet or the just-received syn packet. * * Return - * >0 when found, the header option is copied to *searchby_res*. - * The return value is the total length copied. + * > 0 when found, the header option is copied to *searchby_res*. + * The return value is the total length copied. On failure, a + * negative error code is returned: * - * **-EINVAL** If param is invalid + * **-EINVAL** if a parameter is invalid. * - * **-ENOMSG** The option is not found + * **-ENOMSG** if the option is not found. * - * **-ENOENT** No syn packet available when - * **BPF_LOAD_HDR_OPT_TCP_SYN** is used + * **-ENOENT** if no syn packet is available when + * **BPF_LOAD_HDR_OPT_TCP_SYN** is used. * - * **-ENOSPC** Not enough space. Only *len* number of - * bytes are copied. + * **-ENOSPC** if there is not enough space. Only *len* number of + * bytes are copied. * - * **-EFAULT** Cannot parse the header options in the packet + * **-EFAULT** on failure to parse the header options in the + * packet. * - * **-EPERM** This helper cannot be used under the - * current sock_ops->op. + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. 
* * long bpf_store_hdr_opt(struct bpf_sock_ops *skops, const void *from, u32 len, u64 flags) * Description @@ -3483,44 +3486,44 @@ union bpf_attr { * by searching the same option in the outgoing skb. * * This helper can only be called during - BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**. * * Return * 0 on success, or negative error in case of failure: * - * **-EINVAL** If param is invalid + * **-EINVAL** if a parameter is invalid. * - * **-ENOSPC** Not enough space in the header. - * Nothing has been written + * **-ENOSPC** if there is not enough space in the header. + * Nothing has been written. * - * **-EEXIST** The option has already existed + * **-EEXIST** if the option already exists. * - * **-EFAULT** Cannot parse the existing header options + * **-EFAULT** on failure to parse the existing header options. * - * **-EPERM** This helper cannot be used under the - * current sock_ops->op. + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. * * long bpf_reserve_hdr_opt(struct bpf_sock_ops *skops, u32 len, u64 flags) * Description * Reserve *len* bytes for the bpf header option. The - * space will be used by bpf_store_hdr_opt() later in - * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * space will be used by **bpf_store_hdr_opt**\ () later in + * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**. * - * If bpf_reserve_hdr_opt() is called multiple times, + * If **bpf_reserve_hdr_opt**\ () is called multiple times, * the total number of bytes will be reserved. * * This helper can only be called during - * BPF_SOCK_OPS_HDR_OPT_LEN_CB. + * **BPF_SOCK_OPS_HDR_OPT_LEN_CB**. * * Return * 0 on success, or negative error in case of failure: * - * **-EINVAL** if param is invalid + * **-EINVAL** if a parameter is invalid. * - * **-ENOSPC** Not enough space in the header. + * **-ENOSPC** if there is not enough space in the header. * - * **-EPERM** This helper cannot be used under the - * current sock_ops->op. + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. * * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags) * Description @@ -3560,9 +3563,9 @@ union bpf_attr { * * long bpf_d_path(struct path *path, char *buf, u32 sz) * Description - * Return full path for given 'struct path' object, which - * needs to be the kernel BTF 'path' object. The path is - * returned in the provided buffer 'buf' of size 'sz' and + * Return full path for given **struct path** object, which - * needs to be the kernel BTF *path* object. The path is + * returned in the provided buffer *buf* of size *sz* and * is zero terminated. * * Return @@ -3573,7 +3576,7 @@ union bpf_attr { * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr) * Description * Read *size* bytes from user space address *user_ptr* and store - * the data in *dst*. This is a wrapper of copy_from_user(). + * the data in *dst*. This is a wrapper of **copy_from_user**\ (). * Return * 0 on success, or a negative error in case of failure. */ -- cgit v1.2.3 From 5205e919c9f0c5b48678f2c787871c96f665ca1b Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 7 Sep 2020 12:56:08 +0300 Subject: net: bridge: mcast: add support for src list and filter mode dumping Support per port group src list (address and timer) and filter mode dumping. Protected by either multicast_lock or rcu.
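As an illustration only (not part of this patch): a dump consumer could walk the new nested attributes with libmnl roughly as below. walk_src_list() is a hypothetical helper, and the RTM_GETMDB request/dump plumbing around it is assumed and omitted.

    #include <stdio.h>
    #include <arpa/inet.h>
    #include <libmnl/libmnl.h>
    #include <linux/if_bridge.h>

    /* Walk one port group's MDBA_MDB_EATTR_SRC_LIST attribute. */
    static void walk_src_list(const struct nlattr *src_list)
    {
            struct nlattr *entry, *pos;
            char buf[INET6_ADDRSTRLEN];

            mnl_attr_for_each_nested(entry, src_list) {
                    if (mnl_attr_get_type(entry) != MDBA_MDB_SRCLIST_ENTRY)
                            continue;
                    mnl_attr_for_each_nested(pos, entry) {
                            switch (mnl_attr_get_type(pos)) {
                            case MDBA_MDB_SRCATTR_ADDRESS:
                                    /* 4 payload bytes for IPv4, 16 for IPv6 */
                                    inet_ntop(mnl_attr_get_payload_len(pos) == 4 ?
                                              AF_INET : AF_INET6,
                                              mnl_attr_get_payload(pos),
                                              buf, sizeof(buf));
                                    printf("src %s\n", buf);
                                    break;
                            case MDBA_MDB_SRCATTR_TIMER:
                                    /* remaining time, in the clock_t units
                                     * reported by br_timer_value() */
                                    printf("timer %u\n", mnl_attr_get_u32(pos));
                                    break;
                            }
                    }
            }
    }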
v3: add IPv6 support v2: require RCU or multicast_lock to traverse src groups Signed-off-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_bridge.h | 21 +++++++++++ net/bridge/br_mdb.c | 85 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 104 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index c1227aecd38f..75a2ac479247 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -455,10 +455,31 @@ enum { enum { MDBA_MDB_EATTR_UNSPEC, MDBA_MDB_EATTR_TIMER, + MDBA_MDB_EATTR_SRC_LIST, + MDBA_MDB_EATTR_GROUP_MODE, __MDBA_MDB_EATTR_MAX }; #define MDBA_MDB_EATTR_MAX (__MDBA_MDB_EATTR_MAX - 1) +/* per mdb entry source */ +enum { + MDBA_MDB_SRCLIST_UNSPEC, + MDBA_MDB_SRCLIST_ENTRY, + __MDBA_MDB_SRCLIST_MAX +}; +#define MDBA_MDB_SRCLIST_MAX (__MDBA_MDB_SRCLIST_MAX - 1) + +/* per mdb entry per source attributes + * these are embedded in MDBA_MDB_SRCLIST_ENTRY + */ +enum { + MDBA_MDB_SRCATTR_UNSPEC, + MDBA_MDB_SRCATTR_ADDRESS, + MDBA_MDB_SRCATTR_TIMER, + __MDBA_MDB_SRCATTR_MAX +}; +#define MDBA_MDB_SRCATTR_MAX (__MDBA_MDB_SRCATTR_MAX - 1) + /* multicast router types */ enum { MDB_RTR_TYPE_DISABLED, diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 559bdc256a1e..9dc12ce61018 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -77,10 +77,67 @@ static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip) #endif } +static int __mdb_fill_srcs(struct sk_buff *skb, + struct net_bridge_port_group *p) +{ + struct net_bridge_group_src *ent; + struct nlattr *nest, *nest_ent; + + if (hlist_empty(&p->src_list)) + return 0; + + nest = nla_nest_start(skb, MDBA_MDB_EATTR_SRC_LIST); + if (!nest) + return -EMSGSIZE; + + hlist_for_each_entry_rcu(ent, &p->src_list, node, + lockdep_is_held(&p->port->br->multicast_lock)) { + nest_ent = nla_nest_start(skb, MDBA_MDB_SRCLIST_ENTRY); + if (!nest_ent) + goto out_cancel_err; + switch (ent->addr.proto) { + case htons(ETH_P_IP): + if (nla_put_in_addr(skb, MDBA_MDB_SRCATTR_ADDRESS, + ent->addr.u.ip4)) { + nla_nest_cancel(skb, nest_ent); + goto out_cancel_err; + } + break; +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + if (nla_put_in6_addr(skb, MDBA_MDB_SRCATTR_ADDRESS, + &ent->addr.u.ip6)) { + nla_nest_cancel(skb, nest_ent); + goto out_cancel_err; + } + break; +#endif + default: + nla_nest_cancel(skb, nest_ent); + continue; + } + if (nla_put_u32(skb, MDBA_MDB_SRCATTR_TIMER, + br_timer_value(&ent->timer))) { + nla_nest_cancel(skb, nest_ent); + goto out_cancel_err; + } + nla_nest_end(skb, nest_ent); + } + + nla_nest_end(skb, nest); + + return 0; + +out_cancel_err: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + static int __mdb_fill_info(struct sk_buff *skb, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *p) { + bool dump_srcs_mode = false; struct timer_list *mtimer; struct nlattr *nest_ent; struct br_mdb_entry e; @@ -119,6 +176,23 @@ static int __mdb_fill_info(struct sk_buff *skb, nla_nest_cancel(skb, nest_ent); return -EMSGSIZE; } + switch (mp->addr.proto) { + case htons(ETH_P_IP): + dump_srcs_mode = !!(p && mp->br->multicast_igmp_version == 3); + break; +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + dump_srcs_mode = !!(p && mp->br->multicast_mld_version == 2); + break; +#endif + } + if (dump_srcs_mode && + (__mdb_fill_srcs(skb, p) || + nla_put_u8(skb, MDBA_MDB_EATTR_GROUP_MODE, p->filter_mode))) { + nla_nest_cancel(skb, nest_ent); + return 
-EMSGSIZE; + } + nla_nest_end(skb, nest_ent); return 0; @@ -127,7 +201,7 @@ static int __mdb_fill_info(struct sk_buff *skb, static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev) { - int idx = 0, s_idx = cb->args[1], err = 0; + int idx = 0, s_idx = cb->args[1], err = 0, pidx = 0, s_pidx = cb->args[2]; struct net_bridge *br = netdev_priv(dev); struct net_bridge_mdb_entry *mp; struct nlattr *nest, *nest2; @@ -152,7 +226,7 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, break; } - if (mp->host_joined) { + if (!s_pidx && mp->host_joined) { err = __mdb_fill_info(skb, mp, NULL); if (err) { nla_nest_cancel(skb, nest2); @@ -164,13 +238,19 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, pp = &p->next) { if (!p->port) continue; + if (pidx < s_pidx) + goto skip_pg; err = __mdb_fill_info(skb, mp, p); if (err) { nla_nest_cancel(skb, nest2); goto out; } +skip_pg: + pidx++; } + pidx = 0; + s_pidx = 0; nla_nest_end(skb, nest2); skip: idx++; @@ -178,6 +258,7 @@ skip: out: cb->args[1] = idx; + cb->args[2] = pidx; nla_nest_end(skb, nest); return err; } -- cgit v1.2.3 From 0db0c34cfbc9838c1a14cb04dd880602abd699a7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 Sep 2020 16:14:31 -0700 Subject: net: tighten the definition of interface statistics This patch is born out of an investigation into which IEEE statistics correspond to which struct rtnl_link_stats64 members. Turns out that there seems to be reasonable consensus on the matter, among many drivers. To save others the time (and it took more time than I'm comfortable admitting) I'm adding comments referring to IEEE attributes to struct rtnl_link_stats64. Up until now we had two forms of documentation for stats - in Documentation/ABI/testing/sysfs-class-net-statistics and the comments on struct rtnl_link_stats64 itself. While the former is very cautious in defining the expected behavior, the latter feel quite dated and may not be easy to understand for modern day driver author (e.g. rx_over_errors). At the same time modern systems are far more complex and once obvious definitions lost their clarity. For example - does rx_packet count at the MAC layer (aFramesReceivedOK)? packets processed correctly by hardware? received by the driver? or maybe received by the stack? I tried to clarify the expectations, further clarifications from others are very welcome. The part hardest to untangle is rx_over_errors vs rx_fifo_errors vs rx_missed_errors. After much deliberation I concluded that for modern HW only two of the counters will make sense. The distinction between internal FIFO overflow and packets dropped due to back-pressure from the host is likely too implementation (driver and device) specific to expose in the standard stats. Now - which two of those counters we select to use is anyone's pick: sysfs documentation suggests rx_over_errors counts packets which did not fit into buffers due to MTU being too small, which I reused. There don't seem to be many modern drivers using it (well, CAN drivers seem to love this statistic). Of the remaining two I picked rx_missed_errors to report device drops. bnxt reports it and it's folded into "drop"s in procfs (while rx_fifo_errors is an error, and modern devices usually receive the frame OK, they just can't admit it into the pipeline). Of the drivers I looked at only AMD Lance-like and NS8390-like use all three of these counters. 
rx_missed_errors counts missed frames, rx_over_errors counts overflow events, and rx_fifo_errors counts frames which were truncated because they didn't fit into buffers. This suggests that rx_fifo_errors may be the correct stat for truncated packets, but I'd think a FIFO stat counting truncated packets would be very confusing to a modern reader. v2: - add driver developer notes about ethtool stat count and reset - replace Ethernet with IEEE 802.3 to better indicate source of attrs - mention byte counters don't count FCS - clarify RX counter is from device to host - drop "sightly" from sysfs paragraph - add examples of ethtool stats - s/incoming/received/ s/incoming/transmitted/ Signed-off-by: Jakub Kicinski --- Documentation/networking/index.rst | 1 + Documentation/networking/statistics.rst | 132 +++++++++++++++++++++ include/uapi/linux/if_link.h | 204 +++++++++++++++++++++++++++++--- 3 files changed, 320 insertions(+), 17 deletions(-) create mode 100644 Documentation/networking/statistics.rst (limited to 'include/uapi/linux') diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index c29496fff81c..4167acc5c076 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -93,6 +93,7 @@ Contents: sctp secid seg6-sysctl + statistics strparser switchdev tc-actions-env-rules diff --git a/Documentation/networking/statistics.rst b/Documentation/networking/statistics.rst new file mode 100644 index 000000000000..d490b535cd14 --- /dev/null +++ b/Documentation/networking/statistics.rst @@ -0,0 +1,132 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==================== +Interface statistics +==================== + +This document is a guide to Linux network interface statistics. + +There are two main sources of interface statistics in Linux: + + - standard interface statistics based on + :c:type:`struct rtnl_link_stats64 `; and + - driver-defined statistics available via ethtool. + +There are multiple interfaces to reach the former. Most commonly used +is the `ip` command from `iproute2`:: + + $ ip -s -s link show dev ens4u1u1 + 6: ens4u1u1: mtu 1500 qdisc fq_codel state UP mode DEFAULT group default qlen 1000 + link/ether 48:2a:e3:4c:b1:d1 brd ff:ff:ff:ff:ff:ff + RX: bytes packets errors dropped overrun mcast + 74327665117 69016965 0 0 0 0 + RX errors: length crc frame fifo missed + 0 0 0 0 0 + TX: bytes packets errors dropped carrier collsns + 21405556176 44608960 0 0 0 0 + TX errors: aborted fifo window heartbeat transns + 0 0 0 0 128 + altname enp58s0u1u1 + +Note that `-s` has been specified twice to see all members of +:c:type:`struct rtnl_link_stats64 `. +If `-s` is specified once the detailed errors won't be shown. + +`ip` supports JSON formatting via the `-j` option. + +Ethtool statistics can be dumped using `ethtool -S $ifc`, e.g.:: + + $ ethtool -S ens4u1u1 + NIC statistics: + tx_single_collisions: 0 + tx_multi_collisions: 0 + +uAPIs +===== + +procfs +------ + +The historical `/proc/net/dev` text interface gives access to the list +of interfaces as well as their statistics. + +Note that even though this interface is using +:c:type:`struct rtnl_link_stats64 ` +internally it combines some of the fields. + +sysfs +----- + +Each device directory in sysfs contains a `statistics` directory (e.g. +`/sys/class/net/lo/statistics/`) with files corresponding to +members of :c:type:`struct rtnl_link_stats64 `. + +This simple interface is convenient especially in constrained/embedded +environments without access to tools. 
However, it's inefficient when +reading multiple stats as it internally performs a full dump of +:c:type:`struct rtnl_link_stats64 ` +and reports only the stat corresponding to the accessed file. + +Sysfs files are documented in +`Documentation/ABI/testing/sysfs-class-net-statistics`. + + +netlink +------- + +`rtnetlink` (`NETLINK_ROUTE`) is the preferred method of accessing +:c:type:`struct rtnl_link_stats64 ` stats. + +Statistics are reported both in the responses to link information +requests (`RTM_GETLINK`) and statistic requests (`RTM_GETSTATS`, +when `IFLA_STATS_LINK_64` bit is set in the `.filter_mask` of the request). + +ethtool +------- + +Ethtool IOCTL interface allows drivers to report implementation +specific statistics. Historically it has also been used to report +statistics for which other APIs did not exist, like per-device-queue +statistics, or standard-based statistics (e.g. RFC 2863). + +Statistics and their string identifiers are retrieved separately. +Identifiers via `ETHTOOL_GSTRINGS` with `string_set` set to `ETH_SS_STATS`, +and values via `ETHTOOL_GSTATS`. User space should use `ETHTOOL_GDRVINFO` +to retrieve the number of statistics (`.n_stats`). + +debugfs +------- + +Some drivers expose extra statistics via `debugfs`. + +struct rtnl_link_stats64 +======================== + +.. kernel-doc:: include/uapi/linux/if_link.h + :identifiers: rtnl_link_stats64 + +Notes for driver authors +======================== + +Drivers should report all statistics which have a matching member in +:c:type:`struct rtnl_link_stats64 ` exclusively +via `.ndo_get_stats64`. Reporting such standard stats via ethtool +or debugfs will not be accepted. + +Drivers must ensure best possible compliance with +:c:type:`struct rtnl_link_stats64 `. +Please note for example that detailed error statistics must be +added into the general `rx_error` / `tx_error` counters. + +The `.ndo_get_stats64` callback can not sleep because of accesses +via `/proc/net/dev`. If driver may sleep when retrieving the statistics +from the device it should do so periodically asynchronously and only return +a recent copy from `.ndo_get_stats64`. Ethtool interrupt coalescing interface +allows setting the frequency of refreshing statistics, if needed. + +Retrieving ethtool statistics is a multi-syscall process, drivers are advised +to keep the number of statistics constant to avoid race conditions with +user space trying to read them. + +Statistics must persist across routine operations like bringing the interface +down and up. diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 7fba4de511de..bf4667403cab 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -40,26 +40,197 @@ struct rtnl_link_stats { __u32 rx_nohandler; /* dropped, no handler found */ }; -/* The main device statistics structure */ +/** + * struct rtnl_link_stats64 - The main device statistics structure. + * + * @rx_packets: Number of good packets received by the interface. + * For hardware interfaces counts all good packets received from the device + * by the host, including packets which host had to drop at various stages + * of processing (even in the driver). + * + * @tx_packets: Number of packets successfully transmitted. + * For hardware interfaces counts packets which host was able to successfully + * hand over to the device, which does not necessarily mean that packets + * had been successfully transmitted out of the device, only that device + * acknowledged it copied them out of host memory. 
+ * + * @rx_bytes: Number of good received bytes, corresponding to @rx_packets. + * + * For IEEE 802.3 devices should count the length of Ethernet Frames + * excluding the FCS. + * + * @tx_bytes: Number of good transmitted bytes, corresponding to @tx_packets. + * + * For IEEE 802.3 devices should count the length of Ethernet Frames + * excluding the FCS. + * + * @rx_errors: Total number of bad packets received on this network device. + * This counter must include events counted by @rx_length_errors, + * @rx_crc_errors, @rx_frame_errors and other errors not otherwise + * counted. + * + * @tx_errors: Total number of transmit problems. + * This counter must include events counter by @tx_aborted_errors, + * @tx_carrier_errors, @tx_fifo_errors, @tx_heartbeat_errors, + * @tx_window_errors and other errors not otherwise counted. + * + * @rx_dropped: Number of packets received but not processed, + * e.g. due to lack of resources or unsupported protocol. + * For hardware interfaces this counter should not include packets + * dropped by the device which are counted separately in + * @rx_missed_errors (since procfs folds those two counters together). + * + * @tx_dropped: Number of packets dropped on their way to transmission, + * e.g. due to lack of resources. + * + * @multicast: Multicast packets received. + * For hardware interfaces this statistic is commonly calculated + * at the device level (unlike @rx_packets) and therefore may include + * packets which did not reach the host. + * + * For IEEE 802.3 devices this counter may be equivalent to: + * + * - 30.3.1.1.21 aMulticastFramesReceivedOK + * + * @collisions: Number of collisions during packet transmissions. + * + * @rx_length_errors: Number of packets dropped due to invalid length. + * Part of aggregate "frame" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter should be equivalent to a sum + * of the following attributes: + * + * - 30.3.1.1.23 aInRangeLengthErrors + * - 30.3.1.1.24 aOutOfRangeLengthField + * - 30.3.1.1.25 aFrameTooLongErrors + * + * @rx_over_errors: Receiver FIFO overflow event counter. + * + * Historically the count of overflow events. Such events may be + * reported in the receive descriptors or via interrupts, and may + * not correspond one-to-one with dropped packets. + * + * The recommended interpretation for high speed interfaces is - + * number of packets dropped because they did not fit into buffers + * provided by the host, e.g. packets larger than MTU or next buffer + * in the ring was not available for a scatter transfer. + * + * Part of aggregate "frame" errors in `/proc/net/dev`. + * + * This statistics was historically used interchangeably with + * @rx_fifo_errors. + * + * This statistic corresponds to hardware events and is not commonly used + * on software devices. + * + * @rx_crc_errors: Number of packets received with a CRC error. + * Part of aggregate "frame" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter must be equivalent to: + * + * - 30.3.1.1.6 aFrameCheckSequenceErrors + * + * @rx_frame_errors: Receiver frame alignment errors. + * Part of aggregate "frame" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter should be equivalent to: + * + * - 30.3.1.1.7 aAlignmentErrors + * + * @rx_fifo_errors: Receiver FIFO error counter. + * + * Historically the count of overflow events. Those events may be + * reported in the receive descriptors or via interrupts, and may + * not correspond one-to-one with dropped packets. 
+ * + * This statistics was used interchangeably with @rx_over_errors. + * Not recommended for use in drivers for high speed interfaces. + * + * This statistic is used on software devices, e.g. to count software + * packet queue overflow (can) or sequencing errors (GRE). + * + * @rx_missed_errors: Count of packets missed by the host. + * Folded into the "drop" counter in `/proc/net/dev`. + * + * Counts number of packets dropped by the device due to lack + * of buffer space. This usually indicates that the host interface + * is slower than the network interface, or host is not keeping up + * with the receive packet rate. + * + * This statistic corresponds to hardware events and is not used + * on software devices. + * + * @tx_aborted_errors: + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * For IEEE 802.3 devices capable of half-duplex operation this counter + * must be equivalent to: + * + * - 30.3.1.1.11 aFramesAbortedDueToXSColls + * + * High speed interfaces may use this counter as a general device + * discard counter. + * + * @tx_carrier_errors: Number of frame transmission errors due to loss + * of carrier during transmission. + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter must be equivalent to: + * + * - 30.3.1.1.13 aCarrierSenseErrors + * + * @tx_fifo_errors: Number of frame transmission errors due to device + * FIFO underrun / underflow. This condition occurs when the device + * begins transmission of a frame but is unable to deliver the + * entire frame to the transmitter in time for transmission. + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * + * @tx_heartbeat_errors: Number of Heartbeat / SQE Test errors for + * old half-duplex Ethernet. + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices possibly equivalent to: + * + * - 30.3.2.1.4 aSQETestErrors + * + * @tx_window_errors: Number of frame transmission errors due + * to late collisions (for Ethernet - after the first 64B of transmission). + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter must be equivalent to: + * + * - 30.3.1.1.10 aLateCollisions + * + * @rx_compressed: Number of correctly received compressed packets. + * This counters is only meaningful for interfaces which support + * packet compression (e.g. CSLIP, PPP). + * + * @tx_compressed: Number of transmitted compressed packets. + * This counters is only meaningful for interfaces which support + * packet compression (e.g. CSLIP, PPP). + * + * @rx_nohandler: Number of packets received on the interface + * but dropped by the networking stack because the device is + * not designated to receive packets (e.g. backup link in a bond). 
+ */ struct rtnl_link_stats64 { - __u64 rx_packets; /* total packets received */ - __u64 tx_packets; /* total packets transmitted */ - __u64 rx_bytes; /* total bytes received */ - __u64 tx_bytes; /* total bytes transmitted */ - __u64 rx_errors; /* bad packets received */ - __u64 tx_errors; /* packet transmit problems */ - __u64 rx_dropped; /* no space in linux buffers */ - __u64 tx_dropped; /* no space available in linux */ - __u64 multicast; /* multicast packets received */ + __u64 rx_packets; + __u64 tx_packets; + __u64 rx_bytes; + __u64 tx_bytes; + __u64 rx_errors; + __u64 tx_errors; + __u64 rx_dropped; + __u64 tx_dropped; + __u64 multicast; __u64 collisions; /* detailed rx_errors: */ __u64 rx_length_errors; - __u64 rx_over_errors; /* receiver ring buff overflow */ - __u64 rx_crc_errors; /* recved pkt with crc error */ - __u64 rx_frame_errors; /* recv'd frame alignment error */ - __u64 rx_fifo_errors; /* recv'r fifo overrun */ - __u64 rx_missed_errors; /* receiver missed packet */ + __u64 rx_over_errors; + __u64 rx_crc_errors; + __u64 rx_frame_errors; + __u64 rx_fifo_errors; + __u64 rx_missed_errors; /* detailed tx_errors */ __u64 tx_aborted_errors; @@ -71,8 +242,7 @@ struct rtnl_link_stats64 { /* for cslip etc */ __u64 rx_compressed; __u64 tx_compressed; - - __u64 rx_nohandler; /* dropped, no handler found */ + __u64 rx_nohandler; }; /* The struct should be in sync with struct ifmap */ -- cgit v1.2.3 From b131c96496b369c7b14125e7c50e89ac7cec8051 Mon Sep 17 00:00:00 2001 From: "Jose M. Guisado Gomez" Date: Tue, 8 Sep 2020 13:01:41 +0200 Subject: netfilter: nf_tables: add userdata support for nft_object Enables storing userdata for nft_object. Initially this will store an optional comment but can be extended in the future as needed. Adds new attribute NFTA_OBJ_USERDATA to nft_object. Signed-off-by: Jose M. 
Guisado Gomez Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 ++ include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 35 ++++++++++++++++++++++++-------- 3 files changed, 31 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 97a7e147a59a..99c1b3188b1e 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1123,6 +1123,8 @@ struct nft_object { u32 genmask:2, use:30; u64 handle; + u16 udlen; + u8 *udata; /* runtime data below here */ const struct nft_object_ops *ops ____cacheline_aligned; unsigned char data[] diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 543dc697b796..2a6e09dea1a0 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1559,6 +1559,7 @@ enum nft_ct_expectation_attributes { * @NFTA_OBJ_DATA: stateful object data (NLA_NESTED) * @NFTA_OBJ_USE: number of references to this expression (NLA_U32) * @NFTA_OBJ_HANDLE: object handle (NLA_U64) + * @NFTA_OBJ_USERDATA: user data (NLA_BINARY) */ enum nft_object_attributes { NFTA_OBJ_UNSPEC, @@ -1569,6 +1570,7 @@ enum nft_object_attributes { NFTA_OBJ_USE, NFTA_OBJ_HANDLE, NFTA_OBJ_PAD, + NFTA_OBJ_USERDATA, __NFTA_OBJ_MAX }; #define NFTA_OBJ_MAX (__NFTA_OBJ_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 6ccce2a2e715..e9b4848e9dd0 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5755,6 +5755,8 @@ static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = { [NFTA_OBJ_TYPE] = { .type = NLA_U32 }, [NFTA_OBJ_DATA] = { .type = NLA_NESTED }, [NFTA_OBJ_HANDLE] = { .type = NLA_U64}, + [NFTA_OBJ_USERDATA] = { .type = NLA_BINARY, + .len = NFT_USERDATA_MAXLEN }, }; static struct nft_object *nft_obj_init(const struct nft_ctx *ctx, @@ -5902,6 +5904,7 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, struct nft_object *obj; struct nft_ctx ctx; u32 objtype; + u16 udlen; int err; if (!nla[NFTA_OBJ_TYPE] || @@ -5946,7 +5949,7 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, obj = nft_obj_init(&ctx, type, nla[NFTA_OBJ_DATA]); if (IS_ERR(obj)) { err = PTR_ERR(obj); - goto err1; + goto err_init; } obj->key.table = table; obj->handle = nf_tables_alloc_handle(table); @@ -5954,32 +5957,44 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, obj->key.name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL); if (!obj->key.name) { err = -ENOMEM; - goto err2; + goto err_strdup; + } + + if (nla[NFTA_OBJ_USERDATA]) { + udlen = nla_len(nla[NFTA_OBJ_USERDATA]); + obj->udata = kzalloc(udlen, GFP_KERNEL); + if (obj->udata == NULL) + goto err_userdata; + + nla_memcpy(obj->udata, nla[NFTA_OBJ_USERDATA], udlen); + obj->udlen = udlen; } err = nft_trans_obj_add(&ctx, NFT_MSG_NEWOBJ, obj); if (err < 0) - goto err3; + goto err_trans; err = rhltable_insert(&nft_objname_ht, &obj->rhlhead, nft_objname_ht_params); if (err < 0) - goto err4; + goto err_obj_ht; list_add_tail_rcu(&obj->list, &table->objects); table->use++; return 0; -err4: +err_obj_ht: /* queued in transaction log */ INIT_LIST_HEAD(&obj->list); return err; -err3: +err_trans: kfree(obj->key.name); -err2: +err_userdata: + kfree(obj->udata); +err_strdup: if (obj->ops->destroy) obj->ops->destroy(&ctx, obj); kfree(obj); -err1: +err_init: module_put(type->owner); return err; } @@ -6011,6 +6026,10 @@ static int 
nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net, NFTA_OBJ_PAD)) goto nla_put_failure; + if (obj->udata && + nla_put(skb, NFTA_OBJ_USERDATA, obj->udlen, obj->udata)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; -- cgit v1.2.3 From cd80ec795156346236e9b1cd9f5cbff5a9bbd212 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 8 Sep 2020 09:33:56 -0700 Subject: Input: allocate keycodes for notification-center, pickup-phone and hangup-phone New Lenovo Thinkpad models, e.g. the X1 Carbon 8th gen and the new T14 gen1 models have 3 new symbols / shortcuts on their F9-F11 keys (and the thinkpad_acpi driver receives 3 new "scancodes" for these): F9: Has a symbol resembling a rectangular speech balloon, the manual says the hotkey function shows or hides the notification center F10: Has a symbol of a telephone horn which has been picked up from the receiver, the manual says: "Answer incoming calls" F11: Has a symbol of a telephone horn which is resting on the receiver, the manual says: "Decline incoming calls" We have no existing keycodes which are a good match for these, so add 3 new keycodes for these. I noticed that we have a hole in our keycodes between 0x1ba and 0x1c0 which does not seem to be reserved for any specific purpose, so these new 3 codes use 0x1bc - 0x1be, instead of starting at 0x27b. Acked-by: Henrique de Moraes Holschuh Acked-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Dmitry Torokhov --- include/uapi/linux/input-event-codes.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index 0c2e27d28e0a..b74821d09145 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -515,6 +515,9 @@ #define KEY_10CHANNELSUP 0x1b8 /* 10 channels up (10+) */ #define KEY_10CHANNELSDOWN 0x1b9 /* 10 channels down (10-) */ #define KEY_IMAGES 0x1ba /* AL Image Browser */ +#define KEY_NOTIFICATION_CENTER 0x1bc /* Show/hide the notification center */ +#define KEY_PICKUP_PHONE 0x1bd /* Answer incoming call */ +#define KEY_HANGUP_PHONE 0x1be /* Decline incoming call */ #define KEY_DEL_EOL 0x1c0 #define KEY_DEL_EOS 0x1c1 -- cgit v1.2.3 From bba013e1ca5e7150b42a1a1a1e852010d772edad Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 8 Sep 2020 09:58:09 -0700 Subject: Input: allocate keycode for Fn + right shift The last 2 generations of Lenovo Thinkpads send an acpi_thinkpad event when Fn + right shift is pressed. This is intended for use with "Lenovo Quick Clean" software, which disables the touchpad + kbd for 2 minutes on this key-combo so that healthcare workers can disinfect it. But there is no silkscreen print on the keyboard to indicate this, so add a KEY_FN_RIGHT_SHIFT keycode define to use for this key-combo.
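As an illustration only (not part of this patch): userspace that wants to react to the combo can watch the thinkpad_acpi input device for the new code with a plain evdev reader; the event node path below is a made-up example, real code would locate the right device first.

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <linux/input.h>

    int main(void)
    {
            struct input_event ev;
            /* hypothetical node; enumerate /dev/input in real code */
            int fd = open("/dev/input/event5", O_RDONLY);

            if (fd < 0)
                    return 1;
            while (read(fd, &ev, sizeof(ev)) == sizeof(ev)) {
                    /* value 1 == key press */
                    if (ev.type == EV_KEY && ev.code == KEY_FN_RIGHT_SHIFT &&
                        ev.value == 1)
                            printf("Fn + right shift pressed\n");
            }
            return 0;
    }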
Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20200908135147.4044-3-hdegoede@redhat.com Signed-off-by: Dmitry Torokhov --- include/uapi/linux/input-event-codes.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index b74821d09145..ee93428ced9a 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -545,6 +545,7 @@ #define KEY_FN_F 0x1e2 #define KEY_FN_S 0x1e3 #define KEY_FN_B 0x1e4 +#define KEY_FN_RIGHT_SHIFT 0x1e5 #define KEY_BRL_DOT1 0x1f1 #define KEY_BRL_DOT2 0x1f2 -- cgit v1.2.3 From 05b595e9c44acaca94192c6db430a489c1b212a7 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 9 Sep 2020 07:50:36 +0300 Subject: devlink: Introduce external controller flag A devlink eswitch port may represent PCI PF/VF ports of a controller. A controller either located on same system or it can be an external controller located in host where such NIC is plugged in. Add the ability for driver to specify if a port is for external controller. Use such flag in the mlx5_core driver. An example of an external controller having VF1 of PF0 belong to controller 1. $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev ens2f0pf0vf1 flavour pcivf pfnum 0 vfnum 1 external true splittable false function: hw_addr 00:00:00:00:00:00 $ devlink port show pci/0000:06:00.0/2 -jp { "port": { "pci/0000:06:00.0/2": { "type": "eth", "netdev": "ens2f0pf0vf1", "flavour": "pcivf", "pfnum": 0, "vfnum": 1, "external": true, "splittable": false, "function": { "hw_addr": "00:00:00:00:00:00" } } } } Signed-off-by: Parav Pandit Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 6 ++++-- include/net/devlink.h | 8 ++++++-- include/uapi/linux/devlink.h | 1 + net/core/devlink.c | 12 ++++++++++-- 4 files changed, 21 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index e13e5d1b3eae..5b3599caa007 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -1215,11 +1215,13 @@ static int register_devlink_port(struct mlx5_core_dev *dev, struct devlink_port_attrs attrs = {}; struct netdev_phys_item_id ppid = {}; unsigned int dl_port_index = 0; + bool external; u16 pfnum; if (!is_devlink_port_supported(dev, rpriv)) return 0; + external = mlx5_core_is_ecpf_esw_manager(dev); mlx5e_rep_get_port_parent_id(rpriv->netdev, &ppid); dl_port_index = mlx5_esw_vport_to_devlink_port_index(dev, rep->vport); pfnum = PCI_FUNC(dev->pdev->devfn); @@ -1232,12 +1234,12 @@ static int register_devlink_port(struct mlx5_core_dev *dev, } else if (rep->vport == MLX5_VPORT_PF) { memcpy(rpriv->dl_port.attrs.switch_id.id, &ppid.id[0], ppid.id_len); rpriv->dl_port.attrs.switch_id.id_len = ppid.id_len; - devlink_port_attrs_pci_pf_set(&rpriv->dl_port, pfnum); + devlink_port_attrs_pci_pf_set(&rpriv->dl_port, pfnum, external); } else if (mlx5_eswitch_is_vf_vport(dev->priv.eswitch, rpriv->rep->vport)) { memcpy(rpriv->dl_port.attrs.switch_id.id, &ppid.id[0], ppid.id_len); rpriv->dl_port.attrs.switch_id.id_len = ppid.id_len; devlink_port_attrs_pci_vf_set(&rpriv->dl_port, - pfnum, rep->vport - 1); + pfnum, rep->vport - 1, external); } return devlink_port_register(devlink, &rpriv->dl_port, dl_port_index); } diff --git a/include/net/devlink.h 
b/include/net/devlink.h index efff9274d248..2dad8c9151f4 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -60,19 +60,23 @@ struct devlink_port_phys_attrs { /** * struct devlink_port_pci_pf_attrs - devlink port's PCI PF attributes * @pf: Associated PCI PF number for this port. + * @external: when set, indicates if a port is for an external controller */ struct devlink_port_pci_pf_attrs { u16 pf; + u8 external:1; }; /** * struct devlink_port_pci_vf_attrs - devlink port's PCI VF attributes * @pf: Associated PCI PF number for this port. * @vf: Associated PCI VF for of the PCI PF for this port. + * @external: when set, indicates if a port is for an external controller */ struct devlink_port_pci_vf_attrs { u16 pf; u16 vf; + u8 external:1; }; /** @@ -1215,9 +1219,9 @@ void devlink_port_type_ib_set(struct devlink_port *devlink_port, void devlink_port_type_clear(struct devlink_port *devlink_port); void devlink_port_attrs_set(struct devlink_port *devlink_port, struct devlink_port_attrs *devlink_port_attrs); -void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u16 pf); +void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u16 pf, bool external); void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, - u16 pf, u16 vf); + u16 pf, u16 vf, bool external); int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index cfef4245ea5a..40823ed7e05a 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -458,6 +458,7 @@ enum devlink_attr { DEVLINK_ATTR_PORT_LANES, /* u32 */ DEVLINK_ATTR_PORT_SPLITTABLE, /* u8 */ + DEVLINK_ATTR_PORT_EXTERNAL, /* u8 */ /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 49e911c19881..6f5f85372721 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -526,6 +526,8 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg, if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_pf.pf)) return -EMSGSIZE; + if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_pf.external)) + return -EMSGSIZE; break; case DEVLINK_PORT_FLAVOUR_PCI_VF: if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, @@ -533,6 +535,8 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg, nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER, attrs->pci_vf.vf)) return -EMSGSIZE; + if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_vf.external)) + return -EMSGSIZE; break; case DEVLINK_PORT_FLAVOUR_PHYSICAL: case DEVLINK_PORT_FLAVOUR_CPU: @@ -7716,8 +7720,9 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_set); * * @devlink_port: devlink port * @pf: associated PF for the devlink port instance + * @external: indicates if the port is for an external controller */ -void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u16 pf) +void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u16 pf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; @@ -7728,6 +7733,7 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u16 pf) return; attrs->pci_pf.pf = pf; + attrs->pci_pf.external = external; } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set); @@ -7737,9 +7743,10 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set); * @devlink_port: devlink port * @pf: associated PF for the 
devlink port instance * @vf: associated VF of a PF for the devlink port instance + * @external: indicates if the port is for an external controller */ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, - u16 pf, u16 vf) + u16 pf, u16 vf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; @@ -7750,6 +7757,7 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, return; attrs->pci_vf.pf = pf; attrs->pci_vf.vf = vf; + attrs->pci_vf.external = external; } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set); -- cgit v1.2.3 From 3a2d9588c4f79adae6a0e986b64ebdd5b38085c6 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 9 Sep 2020 07:50:37 +0300 Subject: devlink: Introduce controller number A devlink port may be for a controller consist of PCI device. A devlink instance holds ports of two types of controllers. (1) controller discovered on same system where eswitch resides This is the case where PCI PF/VF of a controller and devlink eswitch instance both are located on a single system. (2) controller located on external host system. This is the case where a controller is located in one system and its devlink eswitch ports are located in a different system. When a devlink eswitch instance serves the devlink ports of both controllers together, PCI PF/VF numbers may overlap. Due to this a unique phys_port_name cannot be constructed. For example in below such system controller-0 and controller-1, each has PCI PF pf0 whose eswitch ports can be present in controller-0. These results in phys_port_name as "pf0" for both. Similar problem exists for VFs and upcoming Sub functions. An example view of two controller systems: --------------------------------------------------------- | | | --------- --------- ------- ------- | ----------- | | vf(s) | | sf(s) | |vf(s)| |sf(s)| | | server | | ------- ----/---- ---/----- ------- ---/--- ---/--- | | pci rc |=== | pf0 |______/________/ | pf1 |___/_______/ | | connect | | ------- ------- | ----------- | | controller_num=1 (no eswitch) | ------|-------------------------------------------------- (internal wire) | --------------------------------------------------------- | devlink eswitch ports and reps | | ----------------------------------------------------- | | |ctrl-0 | ctrl-0 | ctrl-0 | ctrl-0 | ctrl-0 |ctrl-0 | | | |pf0 | pf0vfN | pf0sfN | pf1 | pf1vfN |pf1sfN | | | ----------------------------------------------------- | | |ctrl-1 | ctrl-1 | ctrl-1 | ctrl-1 | ctrl-1 |ctrl-1 | | | |pf1 | pf1vfN | pf1sfN | pf1 | pf1vfN |pf0sfN | | | ----------------------------------------------------- | | | | | | --------- --------- ------- ------- | | | vf(s) | | sf(s) | |vf(s)| |sf(s)| | | ------- ----/---- ---/----- ------- ---/--- ---/--- | | | pf0 |______/________/ | pf1 |___/_______/ | | ------- ------- | | | | local controller_num=0 (eswitch) | --------------------------------------------------------- An example devlink port for external controller with controller number = 1 for a VF 1 of PF 0: $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev ens2f0pf0vf1 flavour pcivf controller 1 pfnum 0 vfnum 1 external true splittable false function: hw_addr 00:00:00:00:00:00 $ devlink port show pci/0000:06:00.0/2 -jp { "port": { "pci/0000:06:00.0/2": { "type": "eth", "netdev": "ens2f0pf0vf1", "flavour": "pcivf", "controller": 1, "pfnum": 0, "vfnum": 1, "external": true, "splittable": false, "function": { "hw_addr": "00:00:00:00:00:00" } } } } Signed-off-by: Parav Pandit Reviewed-by: Jiri 
Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 9 +++++++-- include/net/devlink.h | 9 +++++++-- include/uapi/linux/devlink.h | 1 + net/core/devlink.c | 23 ++++++++++++++--------- 4 files changed, 29 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 5b3599caa007..135ee26881c9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -1210,11 +1210,13 @@ is_devlink_port_supported(const struct mlx5_core_dev *dev, static int register_devlink_port(struct mlx5_core_dev *dev, struct mlx5e_rep_priv *rpriv) { + struct mlx5_esw_offload *offloads = &dev->priv.eswitch->offloads; struct devlink *devlink = priv_to_devlink(dev); struct mlx5_eswitch_rep *rep = rpriv->rep; struct devlink_port_attrs attrs = {}; struct netdev_phys_item_id ppid = {}; unsigned int dl_port_index = 0; + u32 controller_num = 0; bool external; u16 pfnum; @@ -1222,6 +1224,8 @@ static int register_devlink_port(struct mlx5_core_dev *dev, return 0; external = mlx5_core_is_ecpf_esw_manager(dev); + if (external) + controller_num = offloads->host_number + 1; mlx5e_rep_get_port_parent_id(rpriv->netdev, &ppid); dl_port_index = mlx5_esw_vport_to_devlink_port_index(dev, rep->vport); pfnum = PCI_FUNC(dev->pdev->devfn); @@ -1234,11 +1238,12 @@ static int register_devlink_port(struct mlx5_core_dev *dev, } else if (rep->vport == MLX5_VPORT_PF) { memcpy(rpriv->dl_port.attrs.switch_id.id, &ppid.id[0], ppid.id_len); rpriv->dl_port.attrs.switch_id.id_len = ppid.id_len; - devlink_port_attrs_pci_pf_set(&rpriv->dl_port, pfnum, external); + devlink_port_attrs_pci_pf_set(&rpriv->dl_port, controller_num, + pfnum, external); } else if (mlx5_eswitch_is_vf_vport(dev->priv.eswitch, rpriv->rep->vport)) { memcpy(rpriv->dl_port.attrs.switch_id.id, &ppid.id[0], ppid.id_len); rpriv->dl_port.attrs.switch_id.id_len = ppid.id_len; - devlink_port_attrs_pci_vf_set(&rpriv->dl_port, + devlink_port_attrs_pci_vf_set(&rpriv->dl_port, controller_num, pfnum, rep->vport - 1, external); } return devlink_port_register(devlink, &rpriv->dl_port, dl_port_index); diff --git a/include/net/devlink.h b/include/net/devlink.h index 2dad8c9151f4..eaec0a8cc5ef 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -59,21 +59,25 @@ struct devlink_port_phys_attrs { /** * struct devlink_port_pci_pf_attrs - devlink port's PCI PF attributes + * @controller: Associated controller number * @pf: Associated PCI PF number for this port. * @external: when set, indicates if a port is for an external controller */ struct devlink_port_pci_pf_attrs { + u32 controller; u16 pf; u8 external:1; }; /** * struct devlink_port_pci_vf_attrs - devlink port's PCI VF attributes + * @controller: Associated controller number * @pf: Associated PCI PF number for this port. * @vf: Associated PCI VF for of the PCI PF for this port. 
* @external: when set, indicates if a port is for an external controller */ struct devlink_port_pci_vf_attrs { + u32 controller; u16 pf; u16 vf; u8 external:1; @@ -1219,8 +1223,9 @@ void devlink_port_type_ib_set(struct devlink_port *devlink_port, void devlink_port_type_clear(struct devlink_port *devlink_port); void devlink_port_attrs_set(struct devlink_port *devlink_port, struct devlink_port_attrs *devlink_port_attrs); -void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u16 pf, bool external); -void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, +void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 controller, + u16 pf, bool external); +void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, u16 vf, bool external); int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 40823ed7e05a..40d35145c879 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -459,6 +459,7 @@ enum devlink_attr { DEVLINK_ATTR_PORT_SPLITTABLE, /* u8 */ DEVLINK_ATTR_PORT_EXTERNAL, /* u8 */ + DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, /* u32 */ /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 6f5f85372721..9cf5b118253b 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -523,17 +523,18 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg, return -EMSGSIZE; switch (devlink_port->attrs.flavour) { case DEVLINK_PORT_FLAVOUR_PCI_PF: - if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, - attrs->pci_pf.pf)) + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, + attrs->pci_pf.controller) || + nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_pf.pf)) return -EMSGSIZE; if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_pf.external)) return -EMSGSIZE; break; case DEVLINK_PORT_FLAVOUR_PCI_VF: - if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, - attrs->pci_vf.pf) || - nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER, - attrs->pci_vf.vf)) + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, + attrs->pci_vf.controller) || + nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_vf.pf) || + nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER, attrs->pci_vf.vf)) return -EMSGSIZE; if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_vf.external)) return -EMSGSIZE; @@ -7719,10 +7720,12 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_set); * devlink_port_attrs_pci_pf_set - Set PCI PF port attributes * * @devlink_port: devlink port + * @controller: associated controller number for the devlink port instance * @pf: associated PF for the devlink port instance * @external: indicates if the port is for an external controller */ -void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u16 pf, bool external) +void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 controller, + u16 pf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; @@ -7731,7 +7734,7 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u16 pf, bo DEVLINK_PORT_FLAVOUR_PCI_PF); if (ret) return; - + attrs->pci_pf.controller = controller; attrs->pci_pf.pf = pf; attrs->pci_pf.external = external; } @@ -7741,11 +7744,12 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set); * devlink_port_attrs_pci_vf_set - 
Set PCI VF port attributes * * @devlink_port: devlink port + * @controller: associated controller number for the devlink port instance * @pf: associated PF for the devlink port instance * @vf: associated VF of a PF for the devlink port instance * @external: indicates if the port is for an external controller */ -void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, +void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, u16 vf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; @@ -7755,6 +7759,7 @@ static void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, DEVLINK_PORT_FLAVOUR_PCI_VF); if (ret) return; + attrs->pci_vf.controller = controller; attrs->pci_vf.pf = pf; attrs->pci_vf.vf = vf; attrs->pci_vf.external = external; -- cgit v1.2.3 From ad47ff330b26a9fefa882032be2122700e1625ab Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 9 Sep 2020 09:34:13 -0700 Subject: quota: widen timestamps for the fs_disk_quota structure Soon, XFS will support quota grace period expiration timestamps beyond the year 2038; widen the timestamp fields to handle the extra time bits. Internally, XFS now stores unsigned 34-bit quantities, so the extra 8 bits here should work fine. (Note that XFS is the only user of this structure.) Link: https://lore.kernel.org/r/20200909163413.GJ7955@magnolia Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/quota/quota.c | 42 ++++++++++++++++++++++++++++++++++++------ include/uapi/linux/dqblk_xfs.h | 11 ++++++++++- 2 files changed, 46 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 47f9e151988b..52362eeaea94 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -481,6 +481,14 @@ static inline u64 quota_btobb(u64 bytes) return (bytes + (1 << XFS_BB_SHIFT) - 1) >> XFS_BB_SHIFT; } +static inline s64 copy_from_xfs_dqblk_ts(const struct fs_disk_quota *d, + __s32 timer, __s8 timer_hi) +{ + if (d->d_fieldmask & FS_DQ_BIGTIME) + return (u32)timer | (s64)timer_hi << 32; + return timer; +} + static void copy_from_xfs_dqblk(struct qc_dqblk *dst, struct fs_disk_quota *src) { dst->d_spc_hardlimit = quota_bbtob(src->d_blk_hardlimit); @@ -489,14 +497,17 @@ static void copy_from_xfs_dqblk(struct qc_dqblk *dst, struct fs_disk_quota *src) dst->d_ino_softlimit = src->d_ino_softlimit; dst->d_space = quota_bbtob(src->d_bcount); dst->d_ino_count = src->d_icount; - dst->d_ino_timer = src->d_itimer; - dst->d_spc_timer = src->d_btimer; + dst->d_ino_timer = copy_from_xfs_dqblk_ts(src, src->d_itimer, + src->d_itimer_hi); + dst->d_spc_timer = copy_from_xfs_dqblk_ts(src, src->d_btimer, + src->d_btimer_hi); dst->d_ino_warns = src->d_iwarns; dst->d_spc_warns = src->d_bwarns; dst->d_rt_spc_hardlimit = quota_bbtob(src->d_rtb_hardlimit); dst->d_rt_spc_softlimit = quota_bbtob(src->d_rtb_softlimit); dst->d_rt_space = quota_bbtob(src->d_rtbcount); - dst->d_rt_spc_timer = src->d_rtbtimer; + dst->d_rt_spc_timer = copy_from_xfs_dqblk_ts(src, src->d_rtbtimer, + src->d_rtbtimer_hi); dst->d_rt_spc_warns = src->d_rtbwarns; dst->d_fieldmask = 0; if (src->d_fieldmask & FS_DQ_ISOFT) @@ -588,10 +599,26 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id, return sb->s_qcop->set_dqblk(sb, qid, &qdq); } +static inline void copy_to_xfs_dqblk_ts(const struct fs_disk_quota *d, + __s32 *timer_lo, __s8 *timer_hi, s64 timer) +{ + *timer_lo = timer; + if (d->d_fieldmask & FS_DQ_BIGTIME) + *timer_hi = 
timer >> 32; +} + +static inline bool want_bigtime(s64 timer) +{ + return timer > S32_MAX || timer < S32_MIN; +} + static void copy_to_xfs_dqblk(struct fs_disk_quota *dst, struct qc_dqblk *src, int type, qid_t id) { memset(dst, 0, sizeof(*dst)); + if (want_bigtime(src->d_ino_timer) || want_bigtime(src->d_spc_timer) || + want_bigtime(src->d_rt_spc_timer)) + dst->d_fieldmask |= FS_DQ_BIGTIME; dst->d_version = FS_DQUOT_VERSION; dst->d_id = id; if (type == USRQUOTA) @@ -606,14 +633,17 @@ static void copy_to_xfs_dqblk(struct fs_disk_quota *dst, struct qc_dqblk *src, dst->d_ino_softlimit = src->d_ino_softlimit; dst->d_bcount = quota_btobb(src->d_space); dst->d_icount = src->d_ino_count; - dst->d_itimer = src->d_ino_timer; - dst->d_btimer = src->d_spc_timer; + copy_to_xfs_dqblk_ts(dst, &dst->d_itimer, &dst->d_itimer_hi, + src->d_ino_timer); + copy_to_xfs_dqblk_ts(dst, &dst->d_btimer, &dst->d_btimer_hi, + src->d_spc_timer); dst->d_iwarns = src->d_ino_warns; dst->d_bwarns = src->d_spc_warns; dst->d_rtb_hardlimit = quota_btobb(src->d_rt_spc_hardlimit); dst->d_rtb_softlimit = quota_btobb(src->d_rt_spc_softlimit); dst->d_rtbcount = quota_btobb(src->d_rt_space); - dst->d_rtbtimer = src->d_rt_spc_timer; + copy_to_xfs_dqblk_ts(dst, &dst->d_rtbtimer, &dst->d_rtbtimer_hi, + src->d_rt_spc_timer); dst->d_rtbwarns = src->d_rt_spc_warns; } diff --git a/include/uapi/linux/dqblk_xfs.h b/include/uapi/linux/dqblk_xfs.h index 03d890b80ebc..16d73f54376d 100644 --- a/include/uapi/linux/dqblk_xfs.h +++ b/include/uapi/linux/dqblk_xfs.h @@ -66,7 +66,10 @@ typedef struct fs_disk_quota { __s32 d_btimer; /* similar to above; for disk blocks */ __u16 d_iwarns; /* # warnings issued wrt num inodes */ __u16 d_bwarns; /* # warnings issued wrt disk blocks */ - __s32 d_padding2; /* padding2 - for future use */ + __s8 d_itimer_hi; /* upper 8 bits of timer values */ + __s8 d_btimer_hi; + __s8 d_rtbtimer_hi; + __s8 d_padding2; /* padding2 - for future use */ __u64 d_rtb_hardlimit;/* absolute limit on realtime blks */ __u64 d_rtb_softlimit;/* preferred limit on RT disk blks */ __u64 d_rtbcount; /* # realtime blocks owned */ @@ -121,6 +124,12 @@ typedef struct fs_disk_quota { #define FS_DQ_RTBCOUNT (1<<14) #define FS_DQ_ACCT_MASK (FS_DQ_BCOUNT | FS_DQ_ICOUNT | FS_DQ_RTBCOUNT) +/* + * Quota expiration timestamps are 40-bit signed integers, with the upper 8 + * bits encoded in the _hi fields. + */ +#define FS_DQ_BIGTIME (1<<15) + /* * Various flags related to quotactl(2). */ -- cgit v1.2.3 From d1c10767837c4181f2e054865a58166fc117783b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 9 Sep 2020 15:54:46 +0200 Subject: quota: Expand comment describing d_itimer Expand comment describing d_itimer in struct fs_disk_quota. Reported-by: Matthew Wilcox Signed-off-by: Jan Kara --- include/uapi/linux/dqblk_xfs.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/dqblk_xfs.h b/include/uapi/linux/dqblk_xfs.h index 16d73f54376d..c71d909addda 100644 --- a/include/uapi/linux/dqblk_xfs.h +++ b/include/uapi/linux/dqblk_xfs.h @@ -61,8 +61,9 @@ typedef struct fs_disk_quota { __u64 d_ino_softlimit;/* preferred inode limit */ __u64 d_bcount; /* # disk blocks owned by the user */ __u64 d_icount; /* # inodes owned by the user */ - __s32 d_itimer; /* zero if within inode limits */ - /* if not, we refuse service */ + __s32 d_itimer; /* Zero if within inode limits. 
If + * not, we refuse service at this time + * (in seconds since Unix epoch) */ __s32 d_btimer; /* similar to above; for disk blocks */ __u16 d_iwarns; /* # warnings issued wrt num inodes */ __u16 d_bwarns; /* # warnings issued wrt disk blocks */ -- cgit v1.2.3 From 0dd4ff93f4c8dba016ad79384007da4938cd54a1 Mon Sep 17 00:00:00 2001 From: Sebastien Boeuf Date: Wed, 19 Aug 2020 18:19:42 -0400 Subject: virtio: Implement get_shm_region for PCI transport On PCI the shm regions are found using capability entries; find a region by searching for the capability. Signed-off-by: Sebastien Boeuf Signed-off-by: Dr. David Alan Gilbert Signed-off-by: kbuild test robot Acked-by: Michael S. Tsirkin Cc: kvm@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Cc: "Michael S. Tsirkin" Signed-off-by: Miklos Szeredi --- drivers/virtio/virtio_pci_modern.c | 95 ++++++++++++++++++++++++++++++++++++++ include/uapi/linux/virtio_pci.h | 11 ++++- 2 files changed, 105 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 3e14e700b231..3d6ae5a5e252 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -444,6 +444,99 @@ static void del_vq(struct virtio_pci_vq_info *info) vring_del_virtqueue(vq); } +static int virtio_pci_find_shm_cap(struct pci_dev *dev, u8 required_id, + u8 *bar, u64 *offset, u64 *len) +{ + int pos; + + for (pos = pci_find_capability(dev, PCI_CAP_ID_VNDR); pos > 0; + pos = pci_find_next_capability(dev, pos, PCI_CAP_ID_VNDR)) { + u8 type, cap_len, id; + u32 tmp32; + u64 res_offset, res_length; + + pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, + cfg_type), &type); + if (type != VIRTIO_PCI_CAP_SHARED_MEMORY_CFG) + continue; + + pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, + cap_len), &cap_len); + if (cap_len != sizeof(struct virtio_pci_cap64)) { + dev_err(&dev->dev, "%s: shm cap with bad size offset:" + " %d size: %d\n", __func__, pos, cap_len); + continue; + } + + pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, + id), &id); + if (id != required_id) + continue; + + /* Type, and ID match, looks good */ + pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, + bar), bar); + + /* Read the lower 32bit of length and offset */ + pci_read_config_dword(dev, pos + offsetof(struct virtio_pci_cap, + offset), &tmp32); + res_offset = tmp32; + pci_read_config_dword(dev, pos + offsetof(struct virtio_pci_cap, + length), &tmp32); + res_length = tmp32; + + /* and now the top half */ + pci_read_config_dword(dev, + pos + offsetof(struct virtio_pci_cap64, + offset_hi), &tmp32); + res_offset |= ((u64)tmp32) << 32; + pci_read_config_dword(dev, + pos + offsetof(struct virtio_pci_cap64, + length_hi), &tmp32); + res_length |= ((u64)tmp32) << 32; + + *offset = res_offset; + *len = res_length; + + return pos; + } + return 0; +} + +static bool vp_get_shm_region(struct virtio_device *vdev, + struct virtio_shm_region *region, u8 id) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct pci_dev *pci_dev = vp_dev->pci_dev; + u8 bar; + u64 offset, len; + phys_addr_t phys_addr; + size_t bar_len; + + if (!virtio_pci_find_shm_cap(pci_dev, id, &bar, &offset, &len)) + return false; + + phys_addr = pci_resource_start(pci_dev, bar); + bar_len = pci_resource_len(pci_dev, bar); + + if ((offset + len) < offset) { + dev_err(&pci_dev->dev, "%s: cap offset+len overflow detected\n", + __func__); + return false; + } + + if (offset + 
len > bar_len) { + dev_err(&pci_dev->dev, "%s: bar shorter than cap offset+len\n", + __func__); + return false; + } + + region->len = len; + region->addr = (u64) phys_addr + offset; + + return true; +} + static const struct virtio_config_ops virtio_pci_config_nodev_ops = { .get = NULL, .set = NULL, @@ -458,6 +551,7 @@ static const struct virtio_config_ops virtio_pci_config_nodev_ops = { .bus_name = vp_bus_name, .set_vq_affinity = vp_set_vq_affinity, .get_vq_affinity = vp_get_vq_affinity, + .get_shm_region = vp_get_shm_region, }; static const struct virtio_config_ops virtio_pci_config_ops = { @@ -474,6 +568,7 @@ static const struct virtio_config_ops virtio_pci_config_ops = { .bus_name = vp_bus_name, .set_vq_affinity = vp_set_vq_affinity, .get_vq_affinity = vp_get_vq_affinity, + .get_shm_region = vp_get_shm_region, }; /** diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h index 90007a1abcab..3a86f36d7e3d 100644 --- a/include/uapi/linux/virtio_pci.h +++ b/include/uapi/linux/virtio_pci.h @@ -113,6 +113,8 @@ #define VIRTIO_PCI_CAP_DEVICE_CFG 4 /* PCI configuration access */ #define VIRTIO_PCI_CAP_PCI_CFG 5 +/* Additional shared memory capability */ +#define VIRTIO_PCI_CAP_SHARED_MEMORY_CFG 8 /* This is the PCI capability header: */ struct virtio_pci_cap { @@ -121,11 +123,18 @@ struct virtio_pci_cap { __u8 cap_len; /* Generic PCI field: capability length */ __u8 cfg_type; /* Identifies the structure. */ __u8 bar; /* Where to find it. */ - __u8 padding[3]; /* Pad to full dword. */ + __u8 id; /* Multiple capabilities of the same type */ + __u8 padding[2]; /* Pad to full dword. */ __le32 offset; /* Offset within bar. */ __le32 length; /* Length of the structure, in bytes. */ }; +struct virtio_pci_cap64 { + struct virtio_pci_cap cap; + __le32 offset_hi; /* Most sig 32 bits of offset */ + __le32 length_hi; /* Most sig 32 bits of length */ +}; + struct virtio_pci_notify_cap { struct virtio_pci_cap cap; __le32 notify_off_multiplier; /* Multiplier for queue_notify_off. */ -- cgit v1.2.3 From 38e895487afc2ed42c11045853cbb3fa20b52b6e Mon Sep 17 00:00:00 2001 From: Sebastien Boeuf Date: Wed, 19 Aug 2020 18:19:43 -0400 Subject: virtio: Implement get_shm_region for MMIO transport On MMIO a new set of registers is defined for finding SHM regions. Add their definitions and use them to find the region. Signed-off-by: Sebastien Boeuf Cc: kvm@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Cc: "Michael S. Tsirkin" Signed-off-by: Miklos Szeredi --- drivers/virtio/virtio_mmio.c | 31 +++++++++++++++++++++++++++++++ include/uapi/linux/virtio_mmio.h | 11 +++++++++++ 2 files changed, 42 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c index 627ac0487494..238383ff1064 100644 --- a/drivers/virtio/virtio_mmio.c +++ b/drivers/virtio/virtio_mmio.c @@ -498,6 +498,36 @@ static const char *vm_bus_name(struct virtio_device *vdev) return vm_dev->pdev->name; } +static bool vm_get_shm_region(struct virtio_device *vdev, + struct virtio_shm_region *region, u8 id) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + u64 len, addr; + + /* Select the region we're interested in */ + writel(id, vm_dev->base + VIRTIO_MMIO_SHM_SEL); + + /* Read the region size */ + len = (u64) readl(vm_dev->base + VIRTIO_MMIO_SHM_LEN_LOW); + len |= (u64) readl(vm_dev->base + VIRTIO_MMIO_SHM_LEN_HIGH) << 32; + + region->len = len; + + /* Check if region length is -1. 
If that's the case, the shared memory + * region does not exist and there is no need to proceed further. + */ + if (len == ~(u64)0) + return false; + + /* Read the region base address */ + addr = (u64) readl(vm_dev->base + VIRTIO_MMIO_SHM_BASE_LOW); + addr |= (u64) readl(vm_dev->base + VIRTIO_MMIO_SHM_BASE_HIGH) << 32; + + region->addr = addr; + + return true; +} + static const struct virtio_config_ops virtio_mmio_config_ops = { .get = vm_get, .set = vm_set, @@ -510,6 +540,7 @@ static const struct virtio_config_ops virtio_mmio_config_ops = { .get_features = vm_get_features, .finalize_features = vm_finalize_features, .bus_name = vm_bus_name, + .get_shm_region = vm_get_shm_region, }; diff --git a/include/uapi/linux/virtio_mmio.h b/include/uapi/linux/virtio_mmio.h index c4b09689ab64..0650f91bea6c 100644 --- a/include/uapi/linux/virtio_mmio.h +++ b/include/uapi/linux/virtio_mmio.h @@ -122,6 +122,17 @@ #define VIRTIO_MMIO_QUEUE_USED_LOW 0x0a0 #define VIRTIO_MMIO_QUEUE_USED_HIGH 0x0a4 +/* Shared memory region id */ +#define VIRTIO_MMIO_SHM_SEL 0x0ac + +/* Shared memory region length, 64 bits in two halves */ +#define VIRTIO_MMIO_SHM_LEN_LOW 0x0b0 +#define VIRTIO_MMIO_SHM_LEN_HIGH 0x0b4 + +/* Shared memory region base address, 64 bits in two halves */ +#define VIRTIO_MMIO_SHM_BASE_LOW 0x0b8 +#define VIRTIO_MMIO_SHM_BASE_HIGH 0x0bc + /* Configuration atomicity value */ #define VIRTIO_MMIO_CONFIG_GENERATION 0x0fc -- cgit v1.2.3 From 22f3787e9d95e72d1f09795f294fb010e2998f43 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Wed, 19 Aug 2020 18:19:46 -0400 Subject: virtiofs: set up virtio_fs dax_device Setup a dax device. Use the shm capability to find the cache entry and map it. The DAX window is accessed by the fs/dax.c infrastructure and must have struct pages (at least on x86). Use devm_memremap_pages() to map the DAX window PCI BAR and allocate struct page. Signed-off-by: Stefan Hajnoczi Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Vivek Goyal Signed-off-by: Sebastien Boeuf Signed-off-by: Liu Bo Signed-off-by: Miklos Szeredi --- fs/fuse/virtio_fs.c | 138 +++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/virtio_fs.h | 3 + 2 files changed, 141 insertions(+) (limited to 'include/uapi/linux') diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 47ecdc15f25d..f31a59f74475 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -5,12 +5,16 @@ */ #include +#include +#include +#include #include #include #include #include #include #include +#include #include "fuse_i.h" /* List of virtio-fs device instances and a lock for the list. Also provides @@ -49,6 +53,12 @@ struct virtio_fs { struct virtio_fs_vq *vqs; unsigned int nvqs; /* number of virtqueues */ unsigned int num_request_queues; /* number of request queues */ + struct dax_device *dax_dev; + + /* DAX memory window where file contents are mapped */ + void *window_kaddr; + phys_addr_t window_phys_addr; + size_t window_len; }; struct virtio_fs_forget_req { @@ -686,6 +696,130 @@ static void virtio_fs_cleanup_vqs(struct virtio_device *vdev, vdev->config->del_vqs(vdev); } +/* Map a window offset to a page frame number. The window offset will have + * been produced by .iomap_begin(), which maps a file offset to a window + * offset. 
+ */ +static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, + long nr_pages, void **kaddr, pfn_t *pfn) +{ + struct virtio_fs *fs = dax_get_private(dax_dev); + phys_addr_t offset = PFN_PHYS(pgoff); + size_t max_nr_pages = fs->window_len/PAGE_SIZE - pgoff; + + if (kaddr) + *kaddr = fs->window_kaddr + offset; + if (pfn) + *pfn = phys_to_pfn_t(fs->window_phys_addr + offset, + PFN_DEV | PFN_MAP); + return nr_pages > max_nr_pages ? max_nr_pages : nr_pages; +} + +static size_t virtio_fs_copy_from_iter(struct dax_device *dax_dev, + pgoff_t pgoff, void *addr, + size_t bytes, struct iov_iter *i) +{ + return copy_from_iter(addr, bytes, i); +} + +static size_t virtio_fs_copy_to_iter(struct dax_device *dax_dev, + pgoff_t pgoff, void *addr, + size_t bytes, struct iov_iter *i) +{ + return copy_to_iter(addr, bytes, i); +} + +static int virtio_fs_zero_page_range(struct dax_device *dax_dev, + pgoff_t pgoff, size_t nr_pages) +{ + long rc; + void *kaddr; + + rc = dax_direct_access(dax_dev, pgoff, nr_pages, &kaddr, NULL); + if (rc < 0) + return rc; + memset(kaddr, 0, nr_pages << PAGE_SHIFT); + dax_flush(dax_dev, kaddr, nr_pages << PAGE_SHIFT); + return 0; +} + +static const struct dax_operations virtio_fs_dax_ops = { + .direct_access = virtio_fs_direct_access, + .copy_from_iter = virtio_fs_copy_from_iter, + .copy_to_iter = virtio_fs_copy_to_iter, + .zero_page_range = virtio_fs_zero_page_range, +}; + +static void virtio_fs_cleanup_dax(void *data) +{ + struct dax_device *dax_dev = data; + + kill_dax(dax_dev); + put_dax(dax_dev); +} + +static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs) +{ + struct virtio_shm_region cache_reg; + struct dev_pagemap *pgmap; + bool have_cache; + + if (!IS_ENABLED(CONFIG_FUSE_DAX)) + return 0; + + /* Get cache region */ + have_cache = virtio_get_shm_region(vdev, &cache_reg, + (u8)VIRTIO_FS_SHMCAP_ID_CACHE); + if (!have_cache) { + dev_notice(&vdev->dev, "%s: No cache capability\n", __func__); + return 0; + } + + if (!devm_request_mem_region(&vdev->dev, cache_reg.addr, cache_reg.len, + dev_name(&vdev->dev))) { + dev_warn(&vdev->dev, "could not reserve region addr=0x%llx len=0x%llx\n", + cache_reg.addr, cache_reg.len); + return -EBUSY; + } + + dev_notice(&vdev->dev, "Cache len: 0x%llx @ 0x%llx\n", cache_reg.len, + cache_reg.addr); + + pgmap = devm_kzalloc(&vdev->dev, sizeof(*pgmap), GFP_KERNEL); + if (!pgmap) + return -ENOMEM; + + pgmap->type = MEMORY_DEVICE_FS_DAX; + + /* Ideally we would directly use the PCI BAR resource but + * devm_memremap_pages() wants its own copy in pgmap. So + * initialize a struct resource from scratch (only the start + * and end fields will be used). 
+ */ + pgmap->res = (struct resource){ + .name = "virtio-fs dax window", + .start = (phys_addr_t) cache_reg.addr, + .end = (phys_addr_t) cache_reg.addr + cache_reg.len - 1, + }; + + fs->window_kaddr = devm_memremap_pages(&vdev->dev, pgmap); + if (IS_ERR(fs->window_kaddr)) + return PTR_ERR(fs->window_kaddr); + + fs->window_phys_addr = (phys_addr_t) cache_reg.addr; + fs->window_len = (phys_addr_t) cache_reg.len; + + dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len 0x%llx\n", + __func__, fs->window_kaddr, cache_reg.addr, cache_reg.len); + + fs->dax_dev = alloc_dax(fs, NULL, &virtio_fs_dax_ops, 0); + if (IS_ERR(fs->dax_dev)) + return PTR_ERR(fs->dax_dev); + + return devm_add_action_or_reset(&vdev->dev, virtio_fs_cleanup_dax, + fs->dax_dev); +} + static int virtio_fs_probe(struct virtio_device *vdev) { struct virtio_fs *fs; @@ -707,6 +841,10 @@ static int virtio_fs_probe(struct virtio_device *vdev) /* TODO vq affinity */ + ret = virtio_fs_setup_dax(vdev, fs); + if (ret < 0) + goto out_vqs; + /* Bring the device online in case the filesystem is mounted and * requests need to be sent before we return. */ diff --git a/include/uapi/linux/virtio_fs.h b/include/uapi/linux/virtio_fs.h index 3056b6e9f8ce..bea38291421b 100644 --- a/include/uapi/linux/virtio_fs.h +++ b/include/uapi/linux/virtio_fs.h @@ -16,4 +16,7 @@ struct virtio_fs_config { __le32 num_request_queues; } __attribute__((packed)); +/* For the id field in virtio_pci_shm_cap */ +#define VIRTIO_FS_SHMCAP_ID_CACHE 0 + #endif /* _UAPI_LINUX_VIRTIO_FS_H */ -- cgit v1.2.3 From fd1a1dc6f5aa7361e3562790336e116935f8fcfa Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Wed, 19 Aug 2020 18:19:49 -0400 Subject: virtiofs: implement FUSE_INIT map_alignment field The device communicates FUSE_SETUPMAPPING/FUSE_REMOVEMAPPING alignment constraints via the FUSE_INIT map_alignment field. Parse this field and ensure our DAX mappings meet the alignment constraints. We don't actually align anything differently since our mappings are already 2MB aligned. Just check the value when the connection is established. If it becomes necessary to honor arbitrary alignments in the future we'll have to adjust how mappings are sized. The upshot of this commit is that we can be confident that mappings will work even when emulating x86 on Power and similar combinations where the host page sizes are different. Signed-off-by: Stefan Hajnoczi Signed-off-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/fuse/dax.c | 15 ++++++++++++++- fs/fuse/fuse_i.h | 1 + fs/fuse/inode.c | 17 ++++++++++++++++- include/uapi/linux/fuse.h | 4 +++- 4 files changed, 34 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 031106020f75..fec8a2bd75b3 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -9,7 +9,10 @@ #include #include -/* Default memory range size, 2MB */ +/* + * Default memory range size. A power of 2 so it agrees with common FUSE_INIT + * map_alignment values 4KB and 64KB. 
+ */ #define FUSE_DAX_SHIFT 21 #define FUSE_DAX_SZ (1 << FUSE_DAX_SHIFT) #define FUSE_DAX_PAGES (FUSE_DAX_SZ / PAGE_SIZE) @@ -123,3 +126,13 @@ int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev) fc->dax = fcd; return 0; } + +bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment) +{ + if (fc->dax && (map_alignment > FUSE_DAX_SHIFT)) { + pr_warn("FUSE: map_alignment %u incompatible with dax mem range size %u\n", + map_alignment, FUSE_DAX_SZ); + return false; + } + return true; +} diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 97af7952373a..2f3f04aa64c7 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1106,5 +1106,6 @@ void fuse_free_conn(struct fuse_conn *fc); int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev); void fuse_dax_conn_free(struct fuse_conn *fc); +bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment); #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 1780dfe063ab..67e99cee5a4f 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -908,9 +908,10 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args, { struct fuse_init_args *ia = container_of(args, typeof(*ia), args); struct fuse_init_out *arg = &ia->out; + bool ok = true; if (error || arg->major != FUSE_KERNEL_VERSION) - fc->conn_error = 1; + ok = false; else { unsigned long ra_pages; @@ -973,6 +974,11 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args, min_t(unsigned int, FUSE_MAX_MAX_PAGES, max_t(unsigned int, arg->max_pages, 1)); } + if (IS_ENABLED(CONFIG_FUSE_DAX) && + arg->flags & FUSE_MAP_ALIGNMENT && + !fuse_dax_check_alignment(fc, arg->map_alignment)) { + ok = false; + } } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -988,6 +994,11 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args, } kfree(ia); + if (!ok) { + fc->conn_init = 0; + fc->conn_error = 1; + } + fuse_set_initialized(fc); wake_up_all(&fc->blocked_waitq); } @@ -1011,6 +1022,10 @@ void fuse_send_init(struct fuse_conn *fc) FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA; +#ifdef CONFIG_FUSE_DAX + if (fc->dax) + ia->in.flags |= FUSE_MAP_ALIGNMENT; +#endif ia->args.opcode = FUSE_INIT; ia->args.in_numargs = 1; ia->args.in_args[0].size = sizeof(ia->in); diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 373cada89815..5b85819e045f 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -313,7 +313,9 @@ struct fuse_file_lock { * FUSE_CACHE_SYMLINKS: cache READLINK responses * FUSE_NO_OPENDIR_SUPPORT: kernel supports zero-message opendir * FUSE_EXPLICIT_INVAL_DATA: only invalidate cached pages on explicit request - * FUSE_MAP_ALIGNMENT: map_alignment field is valid + * FUSE_MAP_ALIGNMENT: init_out.map_alignment contains log2(byte alignment) for + * foffset and moffset fields in struct + * fuse_setupmapping_out and fuse_removemapping_one. */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) -- cgit v1.2.3 From ceec02d4354a317cacce4b053a580ea3c7fc6cdc Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 19 Aug 2020 18:19:50 -0400 Subject: virtiofs: introduce setupmapping/removemapping commands Introduce two new fuse commands to set up/remove memory mappings. These will be used to set up/tear down file mappings in the DAX window; a daemon-side sketch follows. 
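As a reading aid only, here is a minimal sketch of how a daemon might service FUSE_SETUPMAPPING using the fuse_setupmapping_in layout introduced in the diff below. The lookup_fd() helper (resolving fh to an open file descriptor) and the window_base pointer (where the daemon has mapped the whole DAX window) are assumptions for illustration; this is not the actual virtiofsd implementation.

    #include <errno.h>
    #include <stdint.h>
    #include <sys/mman.h>
    #include <linux/fuse.h>            /* struct fuse_setupmapping_in */

    extern int lookup_fd(uint64_t fh); /* assumed helper: fh -> open fd */

    /*
     * Hedged sketch: map in->len bytes of the file behind in->fh, starting
     * at file offset in->foffset, into the DAX window slot at in->moffset.
     */
    static int handle_setupmapping(const struct fuse_setupmapping_in *in,
                                   void *window_base)
    {
            int prot = PROT_READ;
            int fd = lookup_fd(in->fh);

            if (fd < 0)
                    return -EBADF;
            if (in->flags & FUSE_SETUPMAPPING_FLAG_WRITE)
                    prot |= PROT_WRITE;

            /* MAP_FIXED overlays the chosen slot inside the window */
            if (mmap((char *)window_base + in->moffset, in->len, prot,
                     MAP_SHARED | MAP_FIXED, fd, in->foffset) == MAP_FAILED)
                    return -errno;
            return 0;
    }

A FUSE_REMOVEMAPPING request would undo this, for example by re-mmap()ing the slot with MAP_ANONYMOUS so the file pages are dropped.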
Signed-off-by: Vivek Goyal Signed-off-by: Peng Tao Signed-off-by: Miklos Szeredi --- include/uapi/linux/fuse.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 5b85819e045f..60a7bfc787ce 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -894,4 +894,33 @@ struct fuse_copy_file_range_in { uint64_t flags; }; +#define FUSE_SETUPMAPPING_FLAG_WRITE (1ull << 0) +struct fuse_setupmapping_in { + /* An already open handle */ + uint64_t fh; + /* Offset into the file to start the mapping */ + uint64_t foffset; + /* Length of mapping required */ + uint64_t len; + /* Flags, FUSE_SETUPMAPPING_FLAG_* */ + uint64_t flags; + /* Offset in Memory Window */ + uint64_t moffset; +}; + +struct fuse_removemapping_in { + /* number of fuse_removemapping_one follows */ + uint32_t count; +}; + +struct fuse_removemapping_one { + /* Offset into the dax window start the unmapping */ + uint64_t moffset; + /* Length of mapping required */ + uint64_t len; +}; + +#define FUSE_REMOVEMAPPING_MAX_ENTRY \ + (PAGE_SIZE / sizeof(struct fuse_removemapping_one)) + #endif /* _LINUX_FUSE_H */ -- cgit v1.2.3 From c2d0ad00d948de73c78f05d2b3e5bdfa605035cc Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 19 Aug 2020 18:19:51 -0400 Subject: virtiofs: implement dax read/write operations This patch implements basic DAX support. mmap() is not implemented yet and will come in later patches. This patch implements read/write. We make use of an interval tree to keep track of per-inode dax mappings. Do not use dax for file-extending writes; instead just send a WRITE message to the daemon (like we do for the direct I/O path). This keeps the write and the i_size change atomic w.r.t. crash. Signed-off-by: Stefan Hajnoczi Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Vivek Goyal Signed-off-by: Liu Bo Signed-off-by: Peng Tao Cc: Dave Chinner Signed-off-by: Miklos Szeredi --- fs/fuse/Kconfig | 1 + fs/fuse/dax.c | 565 ++++++++++++++++++++++++++++++++++++++++++++++ fs/fuse/file.c | 15 +- fs/fuse/fuse_i.h | 15 ++ fs/fuse/inode.c | 21 +- include/uapi/linux/fuse.h | 1 + 6 files changed, 612 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index fddd40630077..40ce9a1c12e5 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -42,6 +42,7 @@ config VIRTIO_FS config FUSE_DAX bool "Virtio Filesystem Direct Host Memory Access support" default y + select INTERVAL_TREE depends on VIRTIO_FS depends on FS_DAX depends on DAX_DRIVER diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index fec8a2bd75b3..a8d311b2db8e 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -7,7 +7,10 @@ #include "fuse_i.h" #include +#include #include +#include +#include /* * Default memory range size. 
A power of 2 so it agrees with common FUSE_INIT @@ -22,22 +25,556 @@ struct fuse_dax_mapping { /* Will connect in fcd->free_ranges to keep track of free memory */ struct list_head list; + /* For interval tree in file/inode */ + struct interval_tree_node itn; + /** Position in DAX window */ u64 window_offset; /** Length of mapping, in bytes */ loff_t length; + + /* Is this mapping read-only or read-write */ + bool writable; +}; + +/* Per-inode dax map */ +struct fuse_inode_dax { + /* Semaphore to protect modifications to the dmap tree */ + struct rw_semaphore sem; + + /* Sorted rb tree of struct fuse_dax_mapping elements */ + struct rb_root_cached tree; + unsigned long nr; }; struct fuse_conn_dax { /* DAX device */ struct dax_device *dev; + /* Lock protecting accessess to members of this structure */ + spinlock_t lock; + /* DAX Window Free Ranges */ long nr_free_ranges; struct list_head free_ranges; }; +static inline struct fuse_dax_mapping * +node_to_dmap(struct interval_tree_node *node) +{ + if (!node) + return NULL; + + return container_of(node, struct fuse_dax_mapping, itn); +} + +static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn_dax *fcd) +{ + struct fuse_dax_mapping *dmap; + + spin_lock(&fcd->lock); + dmap = list_first_entry_or_null(&fcd->free_ranges, + struct fuse_dax_mapping, list); + if (dmap) { + list_del_init(&dmap->list); + WARN_ON(fcd->nr_free_ranges <= 0); + fcd->nr_free_ranges--; + } + spin_unlock(&fcd->lock); + return dmap; +} + +/* This assumes fcd->lock is held */ +static void __dmap_add_to_free_pool(struct fuse_conn_dax *fcd, + struct fuse_dax_mapping *dmap) +{ + list_add_tail(&dmap->list, &fcd->free_ranges); + fcd->nr_free_ranges++; +} + +static void dmap_add_to_free_pool(struct fuse_conn_dax *fcd, + struct fuse_dax_mapping *dmap) +{ + /* Return fuse_dax_mapping to free list */ + spin_lock(&fcd->lock); + __dmap_add_to_free_pool(fcd, dmap); + spin_unlock(&fcd->lock); +} + +static int fuse_setup_one_mapping(struct inode *inode, unsigned long start_idx, + struct fuse_dax_mapping *dmap, bool writable, + bool upgrade) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_conn_dax *fcd = fc->dax; + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_setupmapping_in inarg; + loff_t offset = start_idx << FUSE_DAX_SHIFT; + FUSE_ARGS(args); + ssize_t err; + + WARN_ON(fcd->nr_free_ranges < 0); + + /* Ask fuse daemon to setup mapping */ + memset(&inarg, 0, sizeof(inarg)); + inarg.foffset = offset; + inarg.fh = -1; + inarg.moffset = dmap->window_offset; + inarg.len = FUSE_DAX_SZ; + inarg.flags |= FUSE_SETUPMAPPING_FLAG_READ; + if (writable) + inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE; + args.opcode = FUSE_SETUPMAPPING; + args.nodeid = fi->nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + err = fuse_simple_request(fc, &args); + if (err < 0) + return err; + dmap->writable = writable; + if (!upgrade) { + dmap->itn.start = dmap->itn.last = start_idx; + /* Protected by fi->dax->sem */ + interval_tree_insert(&dmap->itn, &fi->dax->tree); + fi->dax->nr++; + } + return 0; +} + +static int fuse_send_removemapping(struct inode *inode, + struct fuse_removemapping_in *inargp, + struct fuse_removemapping_one *remove_one) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_conn *fc = get_fuse_conn(inode); + FUSE_ARGS(args); + + args.opcode = FUSE_REMOVEMAPPING; + args.nodeid = fi->nodeid; + args.in_numargs = 2; + args.in_args[0].size = sizeof(*inargp); + args.in_args[0].value = inargp; + 
args.in_args[1].size = inargp->count * sizeof(*remove_one); + args.in_args[1].value = remove_one; + return fuse_simple_request(fc, &args); +} + +static int dmap_removemapping_list(struct inode *inode, unsigned int num, + struct list_head *to_remove) +{ + struct fuse_removemapping_one *remove_one, *ptr; + struct fuse_removemapping_in inarg; + struct fuse_dax_mapping *dmap; + int ret, i = 0, nr_alloc; + + nr_alloc = min_t(unsigned int, num, FUSE_REMOVEMAPPING_MAX_ENTRY); + remove_one = kmalloc_array(nr_alloc, sizeof(*remove_one), GFP_NOFS); + if (!remove_one) + return -ENOMEM; + + ptr = remove_one; + list_for_each_entry(dmap, to_remove, list) { + ptr->moffset = dmap->window_offset; + ptr->len = dmap->length; + ptr++; + i++; + num--; + if (i >= nr_alloc || num == 0) { + memset(&inarg, 0, sizeof(inarg)); + inarg.count = i; + ret = fuse_send_removemapping(inode, &inarg, + remove_one); + if (ret) + goto out; + ptr = remove_one; + i = 0; + } + } +out: + kfree(remove_one); + return ret; +} + +/* + * Cleanup dmap entry and add back to free list. This should be called with + * fcd->lock held. + */ +static void dmap_reinit_add_to_free_pool(struct fuse_conn_dax *fcd, + struct fuse_dax_mapping *dmap) +{ + pr_debug("fuse: freeing memory range start_idx=0x%lx end_idx=0x%lx window_offset=0x%llx length=0x%llx\n", + dmap->itn.start, dmap->itn.last, dmap->window_offset, + dmap->length); + dmap->itn.start = dmap->itn.last = 0; + __dmap_add_to_free_pool(fcd, dmap); +} + +/* + * Free inode dmap entries whose range falls inside [start, end]. + * Does not take any locks. At this point of time it should only be + * called from evict_inode() path where we know all dmap entries can be + * reclaimed. + */ +static void inode_reclaim_dmap_range(struct fuse_conn_dax *fcd, + struct inode *inode, + loff_t start, loff_t end) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_dax_mapping *dmap, *n; + int err, num = 0; + LIST_HEAD(to_remove); + unsigned long start_idx = start >> FUSE_DAX_SHIFT; + unsigned long end_idx = end >> FUSE_DAX_SHIFT; + struct interval_tree_node *node; + + while (1) { + node = interval_tree_iter_first(&fi->dax->tree, start_idx, + end_idx); + if (!node) + break; + dmap = node_to_dmap(node); + interval_tree_remove(&dmap->itn, &fi->dax->tree); + num++; + list_add(&dmap->list, &to_remove); + } + + /* Nothing to remove */ + if (list_empty(&to_remove)) + return; + + WARN_ON(fi->dax->nr < num); + fi->dax->nr -= num; + err = dmap_removemapping_list(inode, num, &to_remove); + if (err && err != -ENOTCONN) { + pr_warn("Failed to removemappings. start=0x%llx end=0x%llx\n", + start, end); + } + spin_lock(&fcd->lock); + list_for_each_entry_safe(dmap, n, &to_remove, list) { + list_del_init(&dmap->list); + dmap_reinit_add_to_free_pool(fcd, dmap); + } + spin_unlock(&fcd->lock); +} + +/* + * It is called from evict_inode() and by that time inode is going away. So + * this function does not take any locks like fi->dax->sem for traversing + * that fuse inode interval tree. If that lock is taken then lock validator + * complains of deadlock situation w.r.t fs_reclaim lock. + */ +void fuse_dax_inode_cleanup(struct inode *inode) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + /* + * fuse_evict_inode() has already called truncate_inode_pages_final() + * before we arrive here. So we should not have to worry about any + * pages/exception entries still associated with inode. 
+ */ + inode_reclaim_dmap_range(fc->dax, inode, 0, -1); + WARN_ON(fi->dax->nr); +} + +static void fuse_fill_iomap_hole(struct iomap *iomap, loff_t length) +{ + iomap->addr = IOMAP_NULL_ADDR; + iomap->length = length; + iomap->type = IOMAP_HOLE; +} + +static void fuse_fill_iomap(struct inode *inode, loff_t pos, loff_t length, + struct iomap *iomap, struct fuse_dax_mapping *dmap, + unsigned int flags) +{ + loff_t offset, len; + loff_t i_size = i_size_read(inode); + + offset = pos - (dmap->itn.start << FUSE_DAX_SHIFT); + len = min(length, dmap->length - offset); + + /* If length is beyond end of file, truncate further */ + if (pos + len > i_size) + len = i_size - pos; + + if (len > 0) { + iomap->addr = dmap->window_offset + offset; + iomap->length = len; + if (flags & IOMAP_FAULT) + iomap->length = ALIGN(len, PAGE_SIZE); + iomap->type = IOMAP_MAPPED; + } else { + /* Mapping beyond end of file is hole */ + fuse_fill_iomap_hole(iomap, length); + } +} + +static int fuse_setup_new_dax_mapping(struct inode *inode, loff_t pos, + loff_t length, unsigned int flags, + struct iomap *iomap) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_conn_dax *fcd = fc->dax; + struct fuse_dax_mapping *dmap, *alloc_dmap = NULL; + int ret; + bool writable = flags & IOMAP_WRITE; + unsigned long start_idx = pos >> FUSE_DAX_SHIFT; + struct interval_tree_node *node; + + alloc_dmap = alloc_dax_mapping(fcd); + if (!alloc_dmap) + return -EIO; + + /* + * Take write lock so that only one caller can try to setup mapping + * and other waits. + */ + down_write(&fi->dax->sem); + /* + * We dropped lock. Check again if somebody else setup + * mapping already. + */ + node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx); + if (node) { + dmap = node_to_dmap(node); + fuse_fill_iomap(inode, pos, length, iomap, dmap, flags); + dmap_add_to_free_pool(fcd, alloc_dmap); + up_write(&fi->dax->sem); + return 0; + } + + /* Setup one mapping */ + ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, alloc_dmap, + writable, false); + if (ret < 0) { + dmap_add_to_free_pool(fcd, alloc_dmap); + up_write(&fi->dax->sem); + return ret; + } + fuse_fill_iomap(inode, pos, length, iomap, alloc_dmap, flags); + up_write(&fi->dax->sem); + return 0; +} + +static int fuse_upgrade_dax_mapping(struct inode *inode, loff_t pos, + loff_t length, unsigned int flags, + struct iomap *iomap) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_dax_mapping *dmap; + int ret; + unsigned long idx = pos >> FUSE_DAX_SHIFT; + struct interval_tree_node *node; + + /* + * Take exclusive lock so that only one caller can try to setup + * mapping and others wait. + */ + down_write(&fi->dax->sem); + node = interval_tree_iter_first(&fi->dax->tree, idx, idx); + + /* We are holding either inode lock or i_mmap_sem, and that should + * ensure that dmap can't reclaimed or truncated and it should still + * be there in tree despite the fact we dropped and re-acquired the + * lock. + */ + ret = -EIO; + if (WARN_ON(!node)) + goto out_err; + + dmap = node_to_dmap(node); + + /* Maybe another thread already upgraded mapping while we were not + * holding lock. 
+ */ + if (dmap->writable) { + ret = 0; + goto out_fill_iomap; + } + + ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, dmap, true, + true); + if (ret < 0) + goto out_err; +out_fill_iomap: + fuse_fill_iomap(inode, pos, length, iomap, dmap, flags); +out_err: + up_write(&fi->dax->sem); + return ret; +} + +/* This is just for DAX and the mapping is ephemeral, do not use it for other + * purposes since there is no block device with a permanent mapping. + */ +static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length, + unsigned int flags, struct iomap *iomap, + struct iomap *srcmap) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_dax_mapping *dmap; + bool writable = flags & IOMAP_WRITE; + unsigned long start_idx = pos >> FUSE_DAX_SHIFT; + struct interval_tree_node *node; + + /* We don't support FIEMAP */ + if (WARN_ON(flags & IOMAP_REPORT)) + return -EIO; + + iomap->offset = pos; + iomap->flags = 0; + iomap->bdev = NULL; + iomap->dax_dev = fc->dax->dev; + + /* + * Both read/write and mmap path can race here. So we need something + * to make sure if we are setting up mapping, then other path waits + * + * For now, use a semaphore for this. It probably needs to be + * optimized later. + */ + down_read(&fi->dax->sem); + node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx); + if (node) { + dmap = node_to_dmap(node); + if (writable && !dmap->writable) { + /* Upgrade read-only mapping to read-write. This will + * require exclusive fi->dax->sem lock as we don't want + * two threads to be trying to this simultaneously + * for same dmap. So drop shared lock and acquire + * exclusive lock. + */ + up_read(&fi->dax->sem); + pr_debug("%s: Upgrading mapping at offset 0x%llx length 0x%llx\n", + __func__, pos, length); + return fuse_upgrade_dax_mapping(inode, pos, length, + flags, iomap); + } else { + fuse_fill_iomap(inode, pos, length, iomap, dmap, flags); + up_read(&fi->dax->sem); + return 0; + } + } else { + up_read(&fi->dax->sem); + pr_debug("%s: no mapping at offset 0x%llx length 0x%llx\n", + __func__, pos, length); + if (pos >= i_size_read(inode)) + goto iomap_hole; + + return fuse_setup_new_dax_mapping(inode, pos, length, flags, + iomap); + } + + /* + * If read beyond end of file happnes, fs code seems to return + * it as hole + */ +iomap_hole: + fuse_fill_iomap_hole(iomap, length); + pr_debug("%s returning hole mapping. pos=0x%llx length_asked=0x%llx length_returned=0x%llx\n", + __func__, pos, length, iomap->length); + return 0; +} + +static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t length, + ssize_t written, unsigned int flags, + struct iomap *iomap) +{ + /* DAX writes beyond end-of-file aren't handled using iomap, so the + * file size is unchanged and there is nothing to do here. 
+ */ + return 0; +} + +static const struct iomap_ops fuse_iomap_ops = { + .iomap_begin = fuse_iomap_begin, + .iomap_end = fuse_iomap_end, +}; + +ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock_shared(inode)) + return -EAGAIN; + } else { + inode_lock_shared(inode); + } + + ret = dax_iomap_rw(iocb, to, &fuse_iomap_ops); + inode_unlock_shared(inode); + + /* TODO file_accessed(iocb->f_filp) */ + return ret; +} + +static bool file_extending_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + return (iov_iter_rw(from) == WRITE && + ((iocb->ki_pos) >= i_size_read(inode) || + (iocb->ki_pos + iov_iter_count(from) > i_size_read(inode)))); +} + +static ssize_t fuse_dax_direct_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); + ssize_t ret; + + ret = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); + if (ret < 0) + return ret; + + fuse_invalidate_attr(inode); + fuse_write_update_size(inode, iocb->ki_pos); + return ret; +} + +ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock(inode)) + return -EAGAIN; + } else { + inode_lock(inode); + } + + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto out; + + ret = file_remove_privs(iocb->ki_filp); + if (ret) + goto out; + /* TODO file_update_time() but we don't want metadata I/O */ + + /* Do not use dax for file extending writes as write and on + * disk i_size increase are not atomic otherwise. 
+ */ + if (file_extending_write(iocb, from)) + ret = fuse_dax_direct_write(iocb, from); + else + ret = dax_iomap_rw(iocb, from, &fuse_iomap_ops); + +out: + inode_unlock(inode); + + if (ret > 0) + ret = generic_write_sync(iocb, ret); + return ret; +} + static void fuse_free_dax_mem_ranges(struct list_head *mem_list) { struct fuse_dax_mapping *range, *temp; @@ -116,6 +653,7 @@ int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev) if (!fcd) return -ENOMEM; + spin_lock_init(&fcd->lock); fcd->dev = dax_dev; err = fuse_dax_mem_range_init(fcd); if (err) { @@ -127,6 +665,33 @@ int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev) return 0; } +bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + + fi->dax = NULL; + if (fc->dax) { + fi->dax = kzalloc(sizeof(*fi->dax), GFP_KERNEL_ACCOUNT); + if (!fi->dax) + return false; + + init_rwsem(&fi->dax->sem); + fi->dax->tree = RB_ROOT_CACHED; + } + + return true; +} + +void fuse_dax_inode_init(struct inode *inode) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + + if (!fc->dax) + return; + + inode->i_flags |= S_DAX; +} + bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment) { if (fc->dax && (map_alignment > FUSE_DAX_SHIFT)) { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 6611ef3269a8..6c586bc97b64 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1539,10 +1539,14 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); - if (is_bad_inode(file_inode(file))) + if (is_bad_inode(inode)) return -EIO; + if (FUSE_IS_DAX(inode)) + return fuse_dax_read_iter(iocb, to); + if (!(ff->open_flags & FOPEN_DIRECT_IO)) return fuse_cache_read_iter(iocb, to); else @@ -1553,10 +1557,14 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); - if (is_bad_inode(file_inode(file))) + if (is_bad_inode(inode)) return -EIO; + if (FUSE_IS_DAX(inode)) + return fuse_dax_write_iter(iocb, from); + if (!(ff->open_flags & FOPEN_DIRECT_IO)) return fuse_cache_write_iter(iocb, from); else @@ -3440,4 +3448,7 @@ void fuse_init_file_inode(struct inode *inode) fi->writectr = 0; init_waitqueue_head(&fi->page_waitq); fi->writepages = RB_ROOT; + + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_inode_init(inode); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 2f3f04aa64c7..2d2bdd596194 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -148,6 +148,13 @@ struct fuse_inode { /** Lock to protect write related fields */ spinlock_t lock; + +#ifdef CONFIG_FUSE_DAX + /* + * Dax specific inode data + */ + struct fuse_inode_dax *dax; +#endif }; /** FUSE inode state bits */ @@ -1104,8 +1111,16 @@ void fuse_free_conn(struct fuse_conn *fc); /* dax.c */ +#define FUSE_IS_DAX(inode) (IS_ENABLED(CONFIG_FUSE_DAX) && IS_DAX(inode)) + +ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to); +ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from); +int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma); int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev); void fuse_dax_conn_free(struct fuse_conn *fc); +bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi); +void fuse_dax_inode_init(struct inode 
*inode); +void fuse_dax_inode_cleanup(struct inode *inode); bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment); #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 67e99cee5a4f..cab4239bd78a 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -87,12 +87,19 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) mutex_init(&fi->mutex); spin_lock_init(&fi->lock); fi->forget = fuse_alloc_forget(); - if (!fi->forget) { - kmem_cache_free(fuse_inode_cachep, fi); - return NULL; - } + if (!fi->forget) + goto out_free; + + if (IS_ENABLED(CONFIG_FUSE_DAX) && !fuse_dax_inode_alloc(sb, fi)) + goto out_free_forget; return &fi->inode; + +out_free_forget: + kfree(fi->forget); +out_free: + kmem_cache_free(fuse_inode_cachep, fi); + return NULL; } static void fuse_free_inode(struct inode *inode) @@ -101,6 +108,9 @@ static void fuse_free_inode(struct inode *inode) mutex_destroy(&fi->mutex); kfree(fi->forget); +#ifdef CONFIG_FUSE_DAX + kfree(fi->dax); +#endif kmem_cache_free(fuse_inode_cachep, fi); } @@ -112,6 +122,9 @@ static void fuse_evict_inode(struct inode *inode) clear_inode(inode); if (inode->i_sb->s_flags & SB_ACTIVE) { struct fuse_conn *fc = get_fuse_conn(inode); + + if (FUSE_IS_DAX(inode)) + fuse_dax_inode_cleanup(inode); fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup); fi->forget = NULL; } diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 60a7bfc787ce..8899e4862309 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -895,6 +895,7 @@ struct fuse_copy_file_range_in { }; #define FUSE_SETUPMAPPING_FLAG_WRITE (1ull << 0) +#define FUSE_SETUPMAPPING_FLAG_READ (1ull << 1) struct fuse_setupmapping_in { /* An already open handle */ uint64_t fh; -- cgit v1.2.3 From 501cb008906631a019f3ab2104a17ef8b2651ed0 Mon Sep 17 00:00:00 2001 From: Paul Davey Date: Tue, 8 Sep 2020 10:04:06 +1200 Subject: ipmr: Add route table ID to netlink cache reports Insert the multicast route table ID as a Netlink attribute in Netlink cache report notifications. When multiple route tables are in use, the receiver of a cache report needs a way to determine which route table the report belongs to. Signed-off-by: Paul Davey Signed-off-by: David S. 
Miller --- include/uapi/linux/mroute.h | 1 + net/ipv4/ipmr.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mroute.h b/include/uapi/linux/mroute.h index 11c8c1fc1124..918f1ef32ffe 100644 --- a/include/uapi/linux/mroute.h +++ b/include/uapi/linux/mroute.h @@ -169,6 +169,7 @@ enum { IPMRA_CREPORT_SRC_ADDR, IPMRA_CREPORT_DST_ADDR, IPMRA_CREPORT_PKT, + IPMRA_CREPORT_TABLE, __IPMRA_CREPORT_MAX }; #define IPMRA_CREPORT_MAX (__IPMRA_CREPORT_MAX - 1) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 876fd6ff1ff9..19b2f586319b 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2396,6 +2396,7 @@ static size_t igmpmsg_netlink_msgsize(size_t payloadlen) + nla_total_size(4) /* IPMRA_CREPORT_VIF_ID */ + nla_total_size(4) /* IPMRA_CREPORT_SRC_ADDR */ + nla_total_size(4) /* IPMRA_CREPORT_DST_ADDR */ + + nla_total_size(4) /* IPMRA_CREPORT_TABLE */ /* IPMRA_CREPORT_PKT */ + nla_total_size(payloadlen) ; @@ -2431,7 +2432,8 @@ static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt) nla_put_in_addr(skb, IPMRA_CREPORT_SRC_ADDR, msg->im_src.s_addr) || nla_put_in_addr(skb, IPMRA_CREPORT_DST_ADDR, - msg->im_dst.s_addr)) + msg->im_dst.s_addr) || + nla_put_u32(skb, IPMRA_CREPORT_TABLE, mrt->id)) goto nla_put_failure; nla = nla_reserve(skb, IPMRA_CREPORT_PKT, payloadlen); -- cgit v1.2.3 From c8715a8e9f38906e73d6d78764216742db13ba0e Mon Sep 17 00:00:00 2001 From: Paul Davey Date: Tue, 8 Sep 2020 10:04:07 +1200 Subject: ipmr: Add high byte of VIF ID to igmpmsg Use the unused3 byte in struct igmpmsg to hold the high 8 bits of the VIF ID. Using more than 255 IPv4 multicast interfaces requires a VIF ID in cache reports that is wider than 8 bits, but the VIF ID in the igmpmsg reports sent to mroute_sk was only 8 bits wide in the igmpmsg header. Adding the high 8 bits of the 16-bit VIF ID in the unused byte allows use of more than 255 IPv4 multicast interfaces. Signed-off-by: Paul Davey Signed-off-by: David S. Miller --- include/uapi/linux/mroute.h | 4 ++-- net/ipv4/ipmr.c | 8 ++++++-- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mroute.h b/include/uapi/linux/mroute.h index 918f1ef32ffe..1a42f5f9b31b 100644 --- a/include/uapi/linux/mroute.h +++ b/include/uapi/linux/mroute.h @@ -113,8 +113,8 @@ struct igmpmsg { __u32 unused1,unused2; unsigned char im_msgtype; /* What is this */ unsigned char im_mbz; /* Must be zero */ - unsigned char im_vif; /* Interface (this ought to be a vifi_t!) 
*/ - unsigned char unused3; + unsigned char im_vif; /* Low 8 bits of Interface */ + unsigned char im_vif_hi; /* High 8 bits of Interface */ struct in_addr im_src,im_dst; }; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 19b2f586319b..4809318f591b 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1038,10 +1038,13 @@ static int ipmr_cache_report(struct mr_table *mrt, memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr)); msg->im_msgtype = assert; msg->im_mbz = 0; - if (assert == IGMPMSG_WRVIFWHOLE) + if (assert == IGMPMSG_WRVIFWHOLE) { msg->im_vif = vifi; - else + msg->im_vif_hi = vifi >> 8; + } else { msg->im_vif = mrt->mroute_reg_vif_num; + msg->im_vif_hi = mrt->mroute_reg_vif_num >> 8; + } ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2; ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) + sizeof(struct iphdr)); @@ -1054,6 +1057,7 @@ static int ipmr_cache_report(struct mr_table *mrt, ip_hdr(skb)->protocol = 0; msg = (struct igmpmsg *)skb_network_header(skb); msg->im_vif = vifi; + msg->im_vif_hi = vifi >> 8; skb_dst_set(skb, dst_clone(skb_dst(pkt))); /* Add our header */ igmp = skb_put(skb, sizeof(struct igmphdr)); -- cgit v1.2.3 From 1aef5b4391f0c75c0a1523706a7b0311846ee12f Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 10 Sep 2020 13:33:14 -0700 Subject: bpf: Fix comment for helper bpf_current_task_under_cgroup() This should be "current" not "skb". Fixes: c6b5fb8690fa ("bpf: add documentation for eBPF helpers (42-50)") Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Cc: Link: https://lore.kernel.org/bpf/20200910203314.70018-1-songliubraving@fb.com --- include/uapi/linux/bpf.h | 4 ++-- tools/include/uapi/linux/bpf.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 90359cab501d..7dd314176df7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1447,8 +1447,8 @@ union bpf_attr { * Return * The return value depends on the result of the test, and can be: * - * * 0, if the *skb* task belongs to the cgroup2. - * * 1, if the *skb* task does not belong to the cgroup2. + * * 0, if current task belongs to the cgroup2. + * * 1, if current task does not belong to the cgroup2. * * A negative error code, if an error occurred. * * long bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 90359cab501d..7dd314176df7 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1447,8 +1447,8 @@ union bpf_attr { * Return * The return value depends on the result of the test, and can be: * - * * 0, if the *skb* task belongs to the cgroup2. - * * 1, if the *skb* task does not belong to the cgroup2. + * * 0, if current task belongs to the cgroup2. + * * 1, if current task does not belong to the cgroup2. * * A negative error code, if an error occurred. * * long bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) -- cgit v1.2.3 From 5823833c9adab5a9ce5500e7f1ce7deeff00ee73 Mon Sep 17 00:00:00 2001 From: Stanimir Varbanov Date: Sat, 11 Jul 2020 14:52:36 +0200 Subject: media: v4l2-ctrl: Add VP9 codec levels Add menu control for VP9 codec levels. A total of 14 levels are defined for Profile 0 (8bit) and Profile 2 (10bit). Each level is a set of constrained bitstreams coded with targeted resolutions, frame rates, and bitrates. The definitions have been taken from webm project [1]. 
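For illustration, here is a minimal sketch of how an application could select one of these levels on an already-open encoder file descriptor through the standard control interface; the chosen level and the omitted device setup and error handling are assumptions, not part of the patch:

    #include <sys/ioctl.h>
    #include <linux/videodev2.h>

    /* Hedged sketch: request VP9 level 4.1 on an encoder fd. */
    static int set_vp9_level(int fd)
    {
            struct v4l2_control ctrl = {
                    .id = V4L2_CID_MPEG_VIDEO_VP9_LEVEL,
                    .value = V4L2_MPEG_VIDEO_VP9_LEVEL_4_1,
            };

            return ioctl(fd, VIDIOC_S_CTRL, &ctrl);
    }

A decoder would instead enumerate the menu items with VIDIOC_QUERYMENU to discover which levels it supports.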
[1] https://www.webmproject.org/vp9/levels/ Signed-off-by: Stanimir Varbanov Reviewed-by: Nicolas Dufresne Reviewed-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../userspace-api/media/v4l/ext-ctrls-codec.rst | 43 ++++++++++++++++++++++ drivers/media/v4l2-core/v4l2-ctrls.c | 21 +++++++++++ include/uapi/linux/v4l2-controls.h | 17 +++++++++ 3 files changed, 81 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst index 289d380e2cf0..ce728c757eaf 100644 --- a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst +++ b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst @@ -3383,6 +3383,49 @@ enum v4l2_mpeg_video_vp9_profile - * - ``V4L2_MPEG_VIDEO_VP9_PROFILE_3`` - Profile 3 +.. _v4l2-mpeg-video-vp9-level: + +``V4L2_CID_MPEG_VIDEO_VP9_LEVEL (enum)`` + +enum v4l2_mpeg_video_vp9_level - + This control allows selecting the level for VP9 encoder. + This is also used to enumerate supported levels by VP9 encoder or decoder. + More information can be found at + `webmproject `__. Possible values are: + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + + * - ``V4L2_MPEG_VIDEO_VP9_LEVEL_1_0`` + - Level 1 + * - ``V4L2_MPEG_VIDEO_VP9_LEVEL_1_1`` + - Level 1.1 + * - ``V4L2_MPEG_VIDEO_VP9_LEVEL_2_0`` + - Level 2 + * - ``V4L2_MPEG_VIDEO_VP9_LEVEL_2_1`` + - Level 2.1 + * - ``V4L2_MPEG_VIDEO_VP9_LEVEL_3_0`` + - Level 3 + * - ``V4L2_MPEG_VIDEO_VP9_LEVEL_3_1`` + - Level 3.1 + * - ``V4L2_MPEG_VIDEO_VP9_LEVEL_4_0`` + - Level 4 + * - ``V4L2_MPEG_VIDEO_VP9_LEVEL_4_1`` + - Level 4.1 + * - ``V4L2_MPEG_VIDEO_VP9_LEVEL_5_0`` + - Level 5 + * - ``V4L2_MPEG_VIDEO_VP9_LEVEL_5_1`` + - Level 5.1 + * - ``V4L2_MPEG_VIDEO_VP9_LEVEL_5_2`` + - Level 5.2 + * - ``V4L2_MPEG_VIDEO_VP9_LEVEL_6_0`` + - Level 6 + * - ``V4L2_MPEG_VIDEO_VP9_LEVEL_6_1`` + - Level 6.1 + * - ``V4L2_MPEG_VIDEO_VP9_LEVEL_6_2`` + - Level 6.2 + High Efficiency Video Coding (HEVC/H.265) Control Reference =========================================================== diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c index 73f3d65957ff..bd7f330c941c 100644 --- a/drivers/media/v4l2-core/v4l2-ctrls.c +++ b/drivers/media/v4l2-core/v4l2-ctrls.c @@ -475,6 +475,23 @@ const char * const *v4l2_ctrl_get_menu(u32 id) "3", NULL, }; + static const char * const vp9_level[] = { + "1", + "1.1", + "2", + "2.1", + "3", + "3.1", + "4", + "4.1", + "5", + "5.1", + "5.2", + "6", + "6.1", + "6.2", + NULL, + }; static const char * const flash_led_mode[] = { "Off", @@ -694,6 +711,8 @@ const char * const *v4l2_ctrl_get_menu(u32 id) return vp8_profile; case V4L2_CID_MPEG_VIDEO_VP9_PROFILE: return vp9_profile; + case V4L2_CID_MPEG_VIDEO_VP9_LEVEL: + return vp9_level; case V4L2_CID_JPEG_CHROMA_SUBSAMPLING: return jpeg_chroma_subsampling; case V4L2_CID_DV_TX_MODE: @@ -950,6 +969,7 @@ const char *v4l2_ctrl_get_name(u32 id) case V4L2_CID_MPEG_VIDEO_VPX_P_FRAME_QP: return "VPX P-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_VP8_PROFILE: return "VP8 Profile"; case V4L2_CID_MPEG_VIDEO_VP9_PROFILE: return "VP9 Profile"; + case V4L2_CID_MPEG_VIDEO_VP9_LEVEL: return "VP9 Level"; case V4L2_CID_MPEG_VIDEO_VP8_FRAME_HEADER: return "VP8 Frame Header"; /* HEVC controls */ @@ -1307,6 +1327,7 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type, case V4L2_CID_MPEG_VIDEO_VPX_GOLDEN_FRAME_SEL: case V4L2_CID_MPEG_VIDEO_VP8_PROFILE: case V4L2_CID_MPEG_VIDEO_VP9_PROFILE: + case V4L2_CID_MPEG_VIDEO_VP9_LEVEL: 
case V4L2_CID_DETECT_MD_MODE: case V4L2_CID_MPEG_VIDEO_HEVC_PROFILE: case V4L2_CID_MPEG_VIDEO_HEVC_LEVEL: diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index 053827cda8e6..a184c4939438 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -651,6 +651,23 @@ enum v4l2_mpeg_video_vp9_profile { V4L2_MPEG_VIDEO_VP9_PROFILE_2 = 2, V4L2_MPEG_VIDEO_VP9_PROFILE_3 = 3, }; +#define V4L2_CID_MPEG_VIDEO_VP9_LEVEL (V4L2_CID_MPEG_BASE+513) +enum v4l2_mpeg_video_vp9_level { + V4L2_MPEG_VIDEO_VP9_LEVEL_1_0 = 0, + V4L2_MPEG_VIDEO_VP9_LEVEL_1_1 = 1, + V4L2_MPEG_VIDEO_VP9_LEVEL_2_0 = 2, + V4L2_MPEG_VIDEO_VP9_LEVEL_2_1 = 3, + V4L2_MPEG_VIDEO_VP9_LEVEL_3_0 = 4, + V4L2_MPEG_VIDEO_VP9_LEVEL_3_1 = 5, + V4L2_MPEG_VIDEO_VP9_LEVEL_4_0 = 6, + V4L2_MPEG_VIDEO_VP9_LEVEL_4_1 = 7, + V4L2_MPEG_VIDEO_VP9_LEVEL_5_0 = 8, + V4L2_MPEG_VIDEO_VP9_LEVEL_5_1 = 9, + V4L2_MPEG_VIDEO_VP9_LEVEL_5_2 = 10, + V4L2_MPEG_VIDEO_VP9_LEVEL_6_0 = 11, + V4L2_MPEG_VIDEO_VP9_LEVEL_6_1 = 12, + V4L2_MPEG_VIDEO_VP9_LEVEL_6_2 = 13, +}; /* CIDs for HEVC encoding. */ -- cgit v1.2.3 From e47168f3d1b14af5281cf50c59561d59d28201f9 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 31 Aug 2020 08:30:44 +0000 Subject: powerpc/8xx: Support 16k hugepages with 4k pages The 8xx has 4 page sizes: 4k, 16k, 512k and 8M 4k and 16k can be selected at build time as standard page sizes, and 512k and 8M are hugepages. When 4k standard pages are selected, 16k pages are not available. Allow 16k pages as hugepages when 4k pages are used. To allow that, implement arch_make_huge_pte() which receives the necessary arguments to allow setting the PTE in accordance with the page size: - 512 k pages must have _PAGE_HUGE and _PAGE_SPS. They are set by pte_mkhuge(). arch_make_huge_pte() does nothing. - 16 k pages must have only _PAGE_SPS. arch_make_huge_pte() clears _PAGE_HUGE. 
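With the MAP_HUGE_16KB flag this series adds to the uapi, userspace can
then ask for a 16k hugepage mapping explicitly. A minimal sketch
(illustration only, not part of this patch; assumes a 16k hugepage pool
has been configured on an 8xx system running with 4k base pages):

#include <stdio.h>
#include <sys/mman.h>

#ifndef MAP_HUGE_16KB
#define MAP_HUGE_16KB (14 << 26)	/* HUGETLB_FLAG_ENCODE_16KB, for older headers */
#endif

int main(void)
{
	size_t len = 4 * 16 * 1024;	/* four 16k hugepages */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_16KB,
		       -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	munmap(p, len);
	return 0;
}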
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/a518abc29266a708dfbccc8fce9ae6694fe4c2c6.1598862623.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h | 14 ++++++++++++++ arch/powerpc/include/asm/nohash/32/pgtable.h | 2 ++ arch/powerpc/mm/hugetlbpage.c | 2 +- arch/powerpc/mm/nohash/tlb.c | 4 ---- arch/powerpc/mm/ptdump/8xx.c | 5 +++++ include/uapi/asm-generic/hugetlb_encode.h | 1 + include/uapi/linux/mman.h | 1 + 7 files changed, 24 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h index e752a5807a59..39be9aea86db 100644 --- a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h @@ -65,4 +65,18 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, pte_update(mm, addr, ptep, clr, set, 1); } +#ifdef CONFIG_PPC_4K_PAGES +static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, + struct page *page, int writable) +{ + size_t size = huge_page_size(hstate_vma(vma)); + + if (size == SZ_16K) + return __pte(pte_val(entry) & ~_PAGE_HUGE); + else + return entry; +} +#define arch_make_huge_pte arch_make_huge_pte +#endif + #endif /* _ASM_POWERPC_NOHASH_32_HUGETLB_8XX_H */ diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index 80bbc21b87f0..ee2243ba96cf 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -235,6 +235,8 @@ static int number_of_cells_per_pte(pmd_t *pmd, pte_basic_t val, int huge) return PAGE_SIZE / SZ_4K; else if (hugepd_ok(*((hugepd_t *)pmd))) return 1; + else if (IS_ENABLED(CONFIG_PPC_4K_PAGES) && !(val & _PAGE_HUGE)) + return SZ_16K / SZ_4K; else return SZ_512K / SZ_4K; } diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index e7ae2a2c4545..36c3800769fb 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -180,7 +180,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz if (!hpdp) return NULL; - if (IS_ENABLED(CONFIG_PPC_8xx) && sz == SZ_512K) + if (IS_ENABLED(CONFIG_PPC_8xx) && pshift < PMD_SHIFT) return pte_alloc_map(mm, (pmd_t *)hpdp, addr); BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp)); diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c index 14514585db98..5872f69141d5 100644 --- a/arch/powerpc/mm/nohash/tlb.c +++ b/arch/powerpc/mm/nohash/tlb.c @@ -83,16 +83,12 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { }; #elif defined(CONFIG_PPC_8xx) struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { - /* we only manage 4k and 16k pages as normal pages */ -#ifdef CONFIG_PPC_4K_PAGES [MMU_PAGE_4K] = { .shift = 12, }, -#else [MMU_PAGE_16K] = { .shift = 14, }, -#endif [MMU_PAGE_512K] = { .shift = 19, }, diff --git a/arch/powerpc/mm/ptdump/8xx.c b/arch/powerpc/mm/ptdump/8xx.c index 8a797dcbf475..86da2a669680 100644 --- a/arch/powerpc/mm/ptdump/8xx.c +++ b/arch/powerpc/mm/ptdump/8xx.c @@ -11,8 +11,13 @@ static const struct flag_info flag_array[] = { { +#ifdef CONFIG_PPC_16K_PAGES .mask = _PAGE_HUGE, .val = _PAGE_HUGE, +#else + .mask = _PAGE_SPS, + .val = _PAGE_SPS, +#endif .set = "huge", .clear = " ", }, { diff --git a/include/uapi/asm-generic/hugetlb_encode.h b/include/uapi/asm-generic/hugetlb_encode.h index b0f8e87235bd..4f3d5aaa11f5 100644 --- 
a/include/uapi/asm-generic/hugetlb_encode.h
+++ b/include/uapi/asm-generic/hugetlb_encode.h
@@ -20,6 +20,7 @@
 #define HUGETLB_FLAG_ENCODE_SHIFT	26
 #define HUGETLB_FLAG_ENCODE_MASK	0x3f
 
+#define HUGETLB_FLAG_ENCODE_16KB	(14 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_64KB	(16 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_512KB	(19 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_1MB	(20 << HUGETLB_FLAG_ENCODE_SHIFT)
diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
index 923cc162609c..f55bc680b5b0 100644
--- a/include/uapi/linux/mman.h
+++ b/include/uapi/linux/mman.h
@@ -27,6 +27,7 @@
 #define MAP_HUGE_SHIFT	HUGETLB_FLAG_ENCODE_SHIFT
 #define MAP_HUGE_MASK	HUGETLB_FLAG_ENCODE_MASK
 
+#define MAP_HUGE_16KB	HUGETLB_FLAG_ENCODE_16KB
 #define MAP_HUGE_64KB	HUGETLB_FLAG_ENCODE_64KB
 #define MAP_HUGE_512KB	HUGETLB_FLAG_ENCODE_512KB
 #define MAP_HUGE_1MB	HUGETLB_FLAG_ENCODE_1MB
-- 
cgit v1.2.3


From f8910ffa81b085030dc54814c85d338c26a3157e Mon Sep 17 00:00:00 2001
From: Xianting Tian
Date: Tue, 15 Sep 2020 15:18:17 +0800
Subject: ipmi:msghandler: retry to get device id on an error

We fail to get the BMC's device id with low probability when loading
the ipmi driver, and it causes BMC device registration to fail. When
this issue occurs we get the kernel prints below:

[Wed Sep 9 19:52:03 2020] ipmi_si IPI0001:00: IPMI message handler: device id demangle failed: -22
[Wed Sep 9 19:52:03 2020] IPMI BT: using default values
[Wed Sep 9 19:52:03 2020] IPMI BT: req2rsp=5 secs retries=2
[Wed Sep 9 19:52:03 2020] ipmi_si IPI0001:00: Unable to get the device id: -5
[Wed Sep 9 19:52:04 2020] ipmi_si IPI0001:00: Unable to register device: error -5

When this issue happens, we want to manually unload the driver and try
to load it again, but it can't be unloaded by 'rmmod' as it is already
'in use'.

We added a print in handle_one_recv_msg(); when this issue happens, the
msg we received is "Recv: 1c 01 d5", which means data_len is 1 and
data[0] is 0xd5 (the completion code), meaning "bmc cannot execute
command. Command, or request parameter(s), not supported in present
state".

Debug code:

static int handle_one_recv_msg(struct ipmi_smi *intf,
			       struct ipmi_smi_msg *msg)
{
	printk("Recv: %*ph\n", msg->rsp_size, msg->rsp);
	...
	...
}

Then in ipmi_demangle_device_id(), it returned '-EINVAL' as
'data_len < 7' and 'data[0] != 0'.

We created this patch to retry the get device id when this error
happens. We reproduced this issue again and the retry succeeded on the
first attempt; we finally got the correct msg and then all is ok:

Recv: 1c 01 00 01 81 05 84 02 af db 07 00 01 00 b9 00 10 00

So use a retry mechanism in this patch to give the BMC more opportunity
to respond correctly to the kernel when we receive specific completion
codes.

Signed-off-by: Xianting Tian
Message-Id: <20200915071817.4484-1-tian.xianting@h3c.com>
[Cleaned up the verbage a bit in the header and prints.]
Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_msghandler.c | 29 +++++++++++++++++++++++++---- include/uapi/linux/ipmi_msgdefs.h | 2 ++ 2 files changed, 27 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index 9f61a1a30f2f..e56409659459 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -34,6 +34,7 @@ #include #include #include +#include #define IPMI_DRIVER_VERSION "39.2" @@ -60,6 +61,9 @@ enum ipmi_panic_event_op { #else #define IPMI_PANIC_DEFAULT IPMI_SEND_PANIC_EVENT_NONE #endif + +#define GET_DEVICE_ID_MAX_RETRY 5 + static enum ipmi_panic_event_op ipmi_send_panic_event = IPMI_PANIC_DEFAULT; static int panic_op_write_handler(const char *val, @@ -317,6 +321,7 @@ struct bmc_device { int dyn_guid_set; struct kref usecount; struct work_struct remove_work; + char cc; /* completion code */ }; #define to_bmc_device(x) container_of((x), struct bmc_device, pdev.dev) @@ -2381,6 +2386,8 @@ static void bmc_device_id_handler(struct ipmi_smi *intf, msg->msg.data, msg->msg.data_len, &intf->bmc->fetch_id); if (rv) { dev_warn(intf->si_dev, "device id demangle failed: %d\n", rv); + /* record completion code when error */ + intf->bmc->cc = msg->msg.data[0]; intf->bmc->dyn_id_set = 0; } else { /* @@ -2426,19 +2433,34 @@ send_get_device_id_cmd(struct ipmi_smi *intf) static int __get_device_id(struct ipmi_smi *intf, struct bmc_device *bmc) { int rv; - - bmc->dyn_id_set = 2; + unsigned int retry_count = 0; intf->null_user_handler = bmc_device_id_handler; +retry: + bmc->cc = 0; + bmc->dyn_id_set = 2; + rv = send_get_device_id_cmd(intf); if (rv) goto out_reset_handler; wait_event(intf->waitq, bmc->dyn_id_set != 2); - if (!bmc->dyn_id_set) + if (!bmc->dyn_id_set) { + if ((bmc->cc == IPMI_DEVICE_IN_FW_UPDATE_ERR + || bmc->cc == IPMI_DEVICE_IN_INIT_ERR + || bmc->cc == IPMI_NOT_IN_MY_STATE_ERR) + && ++retry_count <= GET_DEVICE_ID_MAX_RETRY) { + msleep(500); + dev_warn(intf->si_dev, + "BMC returned 0x%2.2x, retry get bmc device id\n", + bmc->cc); + goto retry; + } + rv = -EIO; /* Something went wrong in the fetch. */ + } /* dyn_id_set makes the id data available. */ smp_rmb(); @@ -3246,7 +3268,6 @@ channel_handler(struct ipmi_smi *intf, struct ipmi_recv_msg *msg) /* It's the one we want */ if (msg->msg.data[0] != 0) { /* Got an error from the channel, just go on. */ - if (msg->msg.data[0] == IPMI_INVALID_COMMAND_ERR) { /* * If the MC does not support this diff --git a/include/uapi/linux/ipmi_msgdefs.h b/include/uapi/linux/ipmi_msgdefs.h index c2b23a9fdf3d..0934af3b8037 100644 --- a/include/uapi/linux/ipmi_msgdefs.h +++ b/include/uapi/linux/ipmi_msgdefs.h @@ -69,6 +69,8 @@ #define IPMI_ERR_MSG_TRUNCATED 0xc6 #define IPMI_REQ_LEN_INVALID_ERR 0xc7 #define IPMI_REQ_LEN_EXCEEDED_ERR 0xc8 +#define IPMI_DEVICE_IN_FW_UPDATE_ERR 0xd1 +#define IPMI_DEVICE_IN_INIT_ERR 0xd2 #define IPMI_NOT_IN_MY_STATE_ERR 0xd5 /* IPMI 2.0 */ #define IPMI_LOST_ARBITRATION_ERR 0x81 #define IPMI_BUS_ERR 0x82 -- cgit v1.2.3 From 9a27a33027f22a716ce362be48d70ae0eb012ab7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 14 Sep 2020 17:11:52 -0700 Subject: ethtool: add standard pause stats Currently drivers have to report their pause frames statistics via ethtool -S, and there is a wide variety of names used for these statistics. Add the two statistics defined in IEEE 802.3x to the standard API. 
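On the driver side this boils down to implementing the new
get_pause_stats() callback. A rough sketch (the foo_* names are
hypothetical illustrations; only the ethtool_ops hook, the stats struct
and the ETHTOOL_STAT_NOT_SET convention come from this patch):

static void foo_get_pause_stats(struct net_device *dev,
				struct ethtool_pause_stats *stats)
{
	struct foo_priv *priv = netdev_priv(dev);	/* hypothetical private data */

	/* Fill in only the counters the hardware actually maintains;
	 * fields left at ETHTOOL_STAT_NOT_SET are omitted from the
	 * netlink reply.
	 */
	stats->tx_pause_frames = foo_read_stat(priv, FOO_TX_PAUSE);	/* hypothetical */
	stats->rx_pause_frames = foo_read_stat(priv, FOO_RX_PAUSE);	/* hypothetical */
}

static const struct ethtool_ops foo_ethtool_ops = {
	.get_pause_stats	= foo_get_pause_stats,
	/* ... */
};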
Create a new ethtool request header flag for including statistics in the response to GET commands. Always create the ETHTOOL_A_PAUSE_STATS nest in replies when flag is set. Testing if driver declares the op is not a reliable way of checking if any stats will actually be included and therefore we don't want to give the impression that presence of ETHTOOL_A_PAUSE_STATS indicates driver support. Note that this patch does not include PFC counters, which may fit better in dcbnl? But mostly I don't need them/have a setup to test them so I haven't looked deeply into exposing them :) v3: - add a helper for "uninitializing" stats, rather than a cryptic memset() (Andrew) Signed-off-by: Jakub Kicinski Reviewed-by: Saeed Mahameed Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 11 +++++ include/linux/ethtool.h | 26 ++++++++++++ include/uapi/linux/ethtool_netlink.h | 18 +++++++- net/ethtool/pause.c | 63 +++++++++++++++++++++++++++- 4 files changed, 116 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index d53bcb31645a..2c8e0ddf548e 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -68,6 +68,7 @@ the flags may not apply to requests. Recognized flags are: ================================= =================================== ``ETHTOOL_FLAG_COMPACT_BITSETS`` use compact format bitsets in reply ``ETHTOOL_FLAG_OMIT_REPLY`` omit optional reply (_SET and _ACT) + ``ETHTOOL_FLAG_STATS`` include optional device statistics ================================= =================================== New request flags should follow the general idea that if the flag is not set, @@ -989,8 +990,18 @@ Kernel response contents: ``ETHTOOL_A_PAUSE_AUTONEG`` bool pause autonegotiation ``ETHTOOL_A_PAUSE_RX`` bool receive pause frames ``ETHTOOL_A_PAUSE_TX`` bool transmit pause frames + ``ETHTOOL_A_PAUSE_STATS`` nested pause statistics ===================================== ====== ========================== +``ETHTOOL_A_PAUSE_STATS`` are reported if ``ETHTOOL_FLAG_STATS`` was set +in ``ETHTOOL_A_HEADER_FLAGS``. +It will be empty if driver did not report any statistics. Drivers fill in +the statistics in the following structure: + +.. kernel-doc:: include/linux/ethtool.h + :identifiers: ethtool_pause_stats + +Each member has a corresponding attribute defined. PAUSE_SET ============ diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 969a80211df6..060b20f0b20f 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -241,6 +241,27 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, ETHTOOL_COALESCE_PKT_RATE_LOW | ETHTOOL_COALESCE_PKT_RATE_HIGH | \ ETHTOOL_COALESCE_RATE_SAMPLE_INTERVAL) +#define ETHTOOL_STAT_NOT_SET (~0ULL) + +/** + * struct ethtool_pause_stats - statistics for IEEE 802.3x pause frames + * @tx_pause_frames: transmitted pause frame count. Reported to user space + * as %ETHTOOL_A_PAUSE_STAT_TX_FRAMES. + * + * Equivalent to `30.3.4.2 aPAUSEMACCtrlFramesTransmitted` + * from the standard. + * + * @rx_pause_frames: received pause frame count. Reported to user space + * as %ETHTOOL_A_PAUSE_STAT_RX_FRAMES. Equivalent to: + * + * Equivalent to `30.3.4.3 aPAUSEMACCtrlFramesReceived` + * from the standard. 
+ */ +struct ethtool_pause_stats { + u64 tx_pause_frames; + u64 rx_pause_frames; +}; + /** * struct ethtool_ops - optional netdev operations * @supported_coalesce_params: supported types of interrupt coalescing. @@ -282,6 +303,9 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, * Returns a negative error code or zero. * @get_ringparam: Report ring sizes * @set_ringparam: Set ring sizes. Returns a negative error code or zero. + * @get_pause_stats: Report pause frame statistics. Drivers must not zero + * statistics which they don't report. The stats structure is initialized + * to ETHTOOL_STAT_NOT_SET indicating driver does not report statistics. * @get_pauseparam: Report pause parameters * @set_pauseparam: Set pause parameters. Returns a negative error code * or zero. @@ -418,6 +442,8 @@ struct ethtool_ops { struct ethtool_ringparam *); int (*set_ringparam)(struct net_device *, struct ethtool_ringparam *); + void (*get_pause_stats)(struct net_device *dev, + struct ethtool_pause_stats *pause_stats); void (*get_pauseparam)(struct net_device *, struct ethtool_pauseparam*); int (*set_pauseparam)(struct net_device *, diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 5dcd24cb33ea..9cee6df01a10 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -91,9 +91,12 @@ enum { #define ETHTOOL_FLAG_COMPACT_BITSETS (1 << 0) /* provide optional reply for SET or ACT requests */ #define ETHTOOL_FLAG_OMIT_REPLY (1 << 1) +/* request statistics, if supported by the driver */ +#define ETHTOOL_FLAG_STATS (1 << 2) #define ETHTOOL_FLAG_ALL (ETHTOOL_FLAG_COMPACT_BITSETS | \ - ETHTOOL_FLAG_OMIT_REPLY) + ETHTOOL_FLAG_OMIT_REPLY | \ + ETHTOOL_FLAG_STATS) enum { ETHTOOL_A_HEADER_UNSPEC, @@ -376,12 +379,25 @@ enum { ETHTOOL_A_PAUSE_AUTONEG, /* u8 */ ETHTOOL_A_PAUSE_RX, /* u8 */ ETHTOOL_A_PAUSE_TX, /* u8 */ + ETHTOOL_A_PAUSE_STATS, /* nest - _PAUSE_STAT_* */ /* add new constants above here */ __ETHTOOL_A_PAUSE_CNT, ETHTOOL_A_PAUSE_MAX = (__ETHTOOL_A_PAUSE_CNT - 1) }; +enum { + ETHTOOL_A_PAUSE_STAT_UNSPEC, + ETHTOOL_A_PAUSE_STAT_PAD, + + ETHTOOL_A_PAUSE_STAT_TX_FRAMES, + ETHTOOL_A_PAUSE_STAT_RX_FRAMES, + + /* add new constants above here */ + __ETHTOOL_A_PAUSE_STAT_CNT, + ETHTOOL_A_PAUSE_STAT_MAX = (__ETHTOOL_A_PAUSE_STAT_CNT - 1) +}; + /* EEE */ enum { diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c index 7aea35d1e8a5..1980aa7eb2b6 100644 --- a/net/ethtool/pause.c +++ b/net/ethtool/pause.c @@ -10,6 +10,7 @@ struct pause_req_info { struct pause_reply_data { struct ethnl_reply_data base; struct ethtool_pauseparam pauseparam; + struct ethtool_pause_stats pausestat; }; #define PAUSE_REPDATA(__reply_base) \ @@ -22,8 +23,15 @@ pause_get_policy[ETHTOOL_A_PAUSE_MAX + 1] = { [ETHTOOL_A_PAUSE_AUTONEG] = { .type = NLA_REJECT }, [ETHTOOL_A_PAUSE_RX] = { .type = NLA_REJECT }, [ETHTOOL_A_PAUSE_TX] = { .type = NLA_REJECT }, + [ETHTOOL_A_PAUSE_STATS] = { .type = NLA_REJECT }, }; +static void ethtool_stats_init(u64 *stats, unsigned int n) +{ + while (n--) + stats[n] = ETHTOOL_STAT_NOT_SET; +} + static int pause_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, struct genl_info *info) @@ -34,10 +42,17 @@ static int pause_prepare_data(const struct ethnl_req_info *req_base, if (!dev->ethtool_ops->get_pauseparam) return -EOPNOTSUPP; + ret = ethnl_ops_begin(dev); if (ret < 0) return ret; dev->ethtool_ops->get_pauseparam(dev, &data->pauseparam); + if (req_base->flags & ETHTOOL_FLAG_STATS && + 
dev->ethtool_ops->get_pause_stats) { + ethtool_stats_init((u64 *)&data->pausestat, + sizeof(data->pausestat) / 8); + dev->ethtool_ops->get_pause_stats(dev, &data->pausestat); + } ethnl_ops_complete(dev); return 0; @@ -46,9 +61,50 @@ static int pause_prepare_data(const struct ethnl_req_info *req_base, static int pause_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { - return nla_total_size(sizeof(u8)) + /* _PAUSE_AUTONEG */ + int n = nla_total_size(sizeof(u8)) + /* _PAUSE_AUTONEG */ nla_total_size(sizeof(u8)) + /* _PAUSE_RX */ nla_total_size(sizeof(u8)); /* _PAUSE_TX */ + + if (req_base->flags & ETHTOOL_FLAG_STATS) + n += nla_total_size(0) + /* _PAUSE_STATS */ + nla_total_size_64bit(sizeof(u64)) * + (ETHTOOL_A_PAUSE_STAT_MAX - 2); + return n; +} + +static int ethtool_put_stat(struct sk_buff *skb, u64 val, u16 attrtype, + u16 padtype) +{ + if (val == ETHTOOL_STAT_NOT_SET) + return 0; + if (nla_put_u64_64bit(skb, attrtype, val, padtype)) + return -EMSGSIZE; + + return 0; +} + +static int pause_put_stats(struct sk_buff *skb, + const struct ethtool_pause_stats *pause_stats) +{ + const u16 pad = ETHTOOL_A_PAUSE_STAT_PAD; + struct nlattr *nest; + + nest = nla_nest_start(skb, ETHTOOL_A_PAUSE_STATS); + if (!nest) + return -EMSGSIZE; + + if (ethtool_put_stat(skb, pause_stats->tx_pause_frames, + ETHTOOL_A_PAUSE_STAT_TX_FRAMES, pad) || + ethtool_put_stat(skb, pause_stats->rx_pause_frames, + ETHTOOL_A_PAUSE_STAT_RX_FRAMES, pad)) + goto err_cancel; + + nla_nest_end(skb, nest); + return 0; + +err_cancel: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; } static int pause_fill_reply(struct sk_buff *skb, @@ -63,6 +119,10 @@ static int pause_fill_reply(struct sk_buff *skb, nla_put_u8(skb, ETHTOOL_A_PAUSE_TX, !!pauseparam->tx_pause)) return -EMSGSIZE; + if (req_base->flags & ETHTOOL_FLAG_STATS && + pause_put_stats(skb, &data->pausestat)) + return -EMSGSIZE; + return 0; } @@ -89,6 +149,7 @@ pause_set_policy[ETHTOOL_A_PAUSE_MAX + 1] = { [ETHTOOL_A_PAUSE_AUTONEG] = { .type = NLA_U8 }, [ETHTOOL_A_PAUSE_RX] = { .type = NLA_U8 }, [ETHTOOL_A_PAUSE_TX] = { .type = NLA_U8 }, + [ETHTOOL_A_PAUSE_STATS] = { .type = NLA_REJECT }, }; int ethnl_set_pause(struct sk_buff *skb, struct genl_info *info) -- cgit v1.2.3 From e2ce94dc1d89e0f76ddd202cea72e0f505083d0a Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 15 Sep 2020 11:40:57 +0300 Subject: devlink: introduce the health reporter test command Introduce a test command for health reporters. User might use this command to trigger test event on a reporter if the reporter supports it. Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. 
Miller --- include/net/devlink.h | 3 +++ include/uapi/linux/devlink.h | 2 ++ net/core/devlink.c | 30 ++++++++++++++++++++++++++++++ 3 files changed, 35 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/devlink.h b/include/net/devlink.h index eaec0a8cc5ef..48b1c1ef1ebd 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -566,6 +566,7 @@ enum devlink_health_reporter_state { * @dump: callback to dump an object * if priv_ctx is NULL, run a full dump * @diagnose: callback to diagnose the current status + * @test: callback to trigger a test event */ struct devlink_health_reporter_ops { @@ -578,6 +579,8 @@ struct devlink_health_reporter_ops { int (*diagnose)(struct devlink_health_reporter *reporter, struct devlink_fmsg *fmsg, struct netlink_ext_ack *extack); + int (*test)(struct devlink_health_reporter *reporter, + struct netlink_ext_ack *extack); }; /** diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 40d35145c879..631f5bdf1707 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -122,6 +122,8 @@ enum devlink_command { DEVLINK_CMD_TRAP_POLICER_NEW, DEVLINK_CMD_TRAP_POLICER_DEL, + DEVLINK_CMD_HEALTH_REPORTER_TEST, + /* add new commands above here */ __DEVLINK_CMD_MAX, DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1 diff --git a/net/core/devlink.c b/net/core/devlink.c index 19037f114307..e5b71f3c2d4d 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -6096,6 +6096,28 @@ devlink_nl_cmd_health_reporter_dump_clear_doit(struct sk_buff *skb, return 0; } +static int devlink_nl_cmd_health_reporter_test_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_health_reporter *reporter; + int err; + + reporter = devlink_health_reporter_get_from_info(devlink, info); + if (!reporter) + return -EINVAL; + + if (!reporter->ops->test) { + devlink_health_reporter_put(reporter); + return -EOPNOTSUPP; + } + + err = reporter->ops->test(reporter, info->extack); + + devlink_health_reporter_put(reporter); + return err; +} + struct devlink_stats { u64 rx_bytes; u64 rx_packets; @@ -7316,6 +7338,14 @@ static const struct genl_ops devlink_nl_ops[] = { .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT | DEVLINK_NL_FLAG_NO_LOCK, }, + { + .cmd = DEVLINK_CMD_HEALTH_REPORTER_TEST, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = devlink_nl_cmd_health_reporter_test_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT | + DEVLINK_NL_FLAG_NO_LOCK, + }, { .cmd = DEVLINK_CMD_FLASH_UPDATE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, -- cgit v1.2.3 From ef15314aa5de955c6afd87d512e8b00f5ac08d06 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Tue, 15 Sep 2020 16:45:40 -0700 Subject: bpf: Add BPF_PROG_BIND_MAP syscall This syscall binds a map to a program. Returns success if the map is already bound to the program. 
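For illustration (not part of this patch), a userspace caller would
invoke the new command along these lines, given already-created prog_fd
and map_fd descriptors and uapi headers that include this change:

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int bpf_prog_bind_map_fd(int prog_fd, int map_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.prog_bind_map.prog_fd = prog_fd;
	attr.prog_bind_map.map_fd = map_fd;
	attr.prog_bind_map.flags = 0;	/* no flags defined yet */

	/* raw bpf(2) syscall; returns 0 on success */
	return syscall(__NR_bpf, BPF_PROG_BIND_MAP, &attr, sizeof(attr));
}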
Signed-off-by: YiFei Zhu Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Cc: YiFei Zhu Link: https://lore.kernel.org/bpf/20200915234543.3220146-3-sdf@google.com --- include/uapi/linux/bpf.h | 7 +++++ kernel/bpf/syscall.c | 63 ++++++++++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 7 +++++ 3 files changed, 77 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7dd314176df7..a22812561064 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -124,6 +124,7 @@ enum bpf_cmd { BPF_ENABLE_STATS, BPF_ITER_CREATE, BPF_LINK_DETACH, + BPF_PROG_BIND_MAP, }; enum bpf_map_type { @@ -658,6 +659,12 @@ union bpf_attr { __u32 flags; } iter_create; + struct { /* struct used by BPF_PROG_BIND_MAP command */ + __u32 prog_fd; + __u32 map_fd; + __u32 flags; /* extra flags */ + } prog_bind_map; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a67b8c6746be..2ce32cad5c8e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4161,6 +4161,66 @@ static int bpf_iter_create(union bpf_attr *attr) return err; } +#define BPF_PROG_BIND_MAP_LAST_FIELD prog_bind_map.flags + +static int bpf_prog_bind_map(union bpf_attr *attr) +{ + struct bpf_prog *prog; + struct bpf_map *map; + struct bpf_map **used_maps_old, **used_maps_new; + int i, ret = 0; + + if (CHECK_ATTR(BPF_PROG_BIND_MAP)) + return -EINVAL; + + if (attr->prog_bind_map.flags) + return -EINVAL; + + prog = bpf_prog_get(attr->prog_bind_map.prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + map = bpf_map_get(attr->prog_bind_map.map_fd); + if (IS_ERR(map)) { + ret = PTR_ERR(map); + goto out_prog_put; + } + + mutex_lock(&prog->aux->used_maps_mutex); + + used_maps_old = prog->aux->used_maps; + + for (i = 0; i < prog->aux->used_map_cnt; i++) + if (used_maps_old[i] == map) + goto out_unlock; + + used_maps_new = kmalloc_array(prog->aux->used_map_cnt + 1, + sizeof(used_maps_new[0]), + GFP_KERNEL); + if (!used_maps_new) { + ret = -ENOMEM; + goto out_unlock; + } + + memcpy(used_maps_new, used_maps_old, + sizeof(used_maps_old[0]) * prog->aux->used_map_cnt); + used_maps_new[prog->aux->used_map_cnt] = map; + + prog->aux->used_map_cnt++; + prog->aux->used_maps = used_maps_new; + + kfree(used_maps_old); + +out_unlock: + mutex_unlock(&prog->aux->used_maps_mutex); + + if (ret) + bpf_map_put(map); +out_prog_put: + bpf_prog_put(prog); + return ret; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr; @@ -4294,6 +4354,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_LINK_DETACH: err = link_detach(&attr); break; + case BPF_PROG_BIND_MAP: + err = bpf_prog_bind_map(&attr); + break; default: err = -EINVAL; break; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7dd314176df7..a22812561064 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -124,6 +124,7 @@ enum bpf_cmd { BPF_ENABLE_STATS, BPF_ITER_CREATE, BPF_LINK_DETACH, + BPF_PROG_BIND_MAP, }; enum bpf_map_type { @@ -658,6 +659,12 @@ union bpf_attr { __u32 flags; } iter_create; + struct { /* struct used by BPF_PROG_BIND_MAP command */ + __u32 prog_fd; + __u32 map_fd; + __u32 flags; /* extra flags */ + } prog_bind_map; + } __attribute__((aligned(8))); /* The description below is an attempt at 
providing documentation to eBPF -- cgit v1.2.3 From 4af8b3d3eb5032fe6f4a8104c48c176bf68a6946 Mon Sep 17 00:00:00 2001 From: Tingwei Zhang Date: Wed, 16 Sep 2020 13:17:23 -0600 Subject: coresight: stm: Support marked packet STP_PACKET_MARKED is not supported by STM currently. Add STM_FLAG_MARKED to support marked packet in STM. Signed-off-by: Tingwei Zhang Signed-off-by: Mathieu Poirier Link: https://lore.kernel.org/r/20200916191737.4001561-3-mathieu.poirier@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/hwtracing/coresight/coresight-stm.c | 11 +++++++---- include/uapi/linux/coresight-stm.h | 1 + 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/hwtracing/coresight/coresight-stm.c b/drivers/hwtracing/coresight/coresight-stm.c index 673d2f56ed1e..2ba819a47cf6 100644 --- a/drivers/hwtracing/coresight/coresight-stm.c +++ b/drivers/hwtracing/coresight/coresight-stm.c @@ -412,6 +412,7 @@ static ssize_t notrace stm_generic_packet(struct stm_data *stm_data, void __iomem *ch_addr; struct stm_drvdata *drvdata = container_of(stm_data, struct stm_drvdata, stm); + unsigned int stm_flags; if (!(drvdata && local_read(&drvdata->mode))) return -EACCES; @@ -421,8 +422,9 @@ static ssize_t notrace stm_generic_packet(struct stm_data *stm_data, ch_addr = stm_channel_addr(drvdata, channel); - flags = (flags == STP_PACKET_TIMESTAMPED) ? STM_FLAG_TIMESTAMPED : 0; - flags |= test_bit(channel, drvdata->chs.guaranteed) ? + stm_flags = (flags & STP_PACKET_TIMESTAMPED) ? + STM_FLAG_TIMESTAMPED : 0; + stm_flags |= test_bit(channel, drvdata->chs.guaranteed) ? STM_FLAG_GUARANTEED : 0; if (size > drvdata->write_bytes) @@ -432,7 +434,7 @@ static ssize_t notrace stm_generic_packet(struct stm_data *stm_data, switch (packet) { case STP_PACKET_FLAG: - ch_addr += stm_channel_off(STM_PKT_TYPE_FLAG, flags); + ch_addr += stm_channel_off(STM_PKT_TYPE_FLAG, stm_flags); /* * The generic STM core sets a size of '0' on flag packets. @@ -444,7 +446,8 @@ static ssize_t notrace stm_generic_packet(struct stm_data *stm_data, break; case STP_PACKET_DATA: - ch_addr += stm_channel_off(STM_PKT_TYPE_DATA, flags); + stm_flags |= (flags & STP_PACKET_MARKED) ? STM_FLAG_MARKED : 0; + ch_addr += stm_channel_off(STM_PKT_TYPE_DATA, stm_flags); stm_send(ch_addr, payload, size, drvdata->write_bytes); break; diff --git a/include/uapi/linux/coresight-stm.h b/include/uapi/linux/coresight-stm.h index 8847dbf24151..7ff3709c01b8 100644 --- a/include/uapi/linux/coresight-stm.h +++ b/include/uapi/linux/coresight-stm.h @@ -5,6 +5,7 @@ #include #define STM_FLAG_TIMESTAMPED _BITUL(3) +#define STM_FLAG_MARKED _BITUL(4) #define STM_FLAG_GUARANTEED _BITUL(7) /* -- cgit v1.2.3 From 78a3ea5557137b0811f3c5a020afaafa7b61d6aa Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 17 Sep 2020 10:51:32 -0700 Subject: net: remove comments on struct rtnl_link_stats We removed the misleading comments from struct rtnl_link_stats64 when we added proper kdoc. struct rtnl_link_stats has the same inline comments, so remove them, too. Signed-off-by: Jakub Kicinski Reviewed-by: Saeed Mahameed Signed-off-by: David S. 
Miller
---
 include/uapi/linux/if_link.h | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index bf4667403cab..c4b23f06f69e 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -7,24 +7,23 @@
 
 /* This struct should be in sync with struct rtnl_link_stats64 */
 struct rtnl_link_stats {
-	__u32	rx_packets;	/* total packets received	*/
-	__u32	tx_packets;	/* total packets transmitted	*/
-	__u32	rx_bytes;	/* total bytes received		*/
-	__u32	tx_bytes;	/* total bytes transmitted	*/
-	__u32	rx_errors;	/* bad packets received		*/
-	__u32	tx_errors;	/* packet transmit problems	*/
-	__u32	rx_dropped;	/* no space in linux buffers	*/
-	__u32	tx_dropped;	/* no space available in linux	*/
-	__u32	multicast;	/* multicast packets received	*/
+	__u32	rx_packets;
+	__u32	tx_packets;
+	__u32	rx_bytes;
+	__u32	tx_bytes;
+	__u32	rx_errors;
+	__u32	tx_errors;
+	__u32	rx_dropped;
+	__u32	tx_dropped;
+	__u32	multicast;
 	__u32	collisions;
-	/* detailed rx_errors: */
 	__u32	rx_length_errors;
-	__u32	rx_over_errors;	/* receiver ring buff overflow	*/
-	__u32	rx_crc_errors;	/* recved pkt with crc error	*/
-	__u32	rx_frame_errors; /* recv'd frame alignment error */
-	__u32	rx_fifo_errors;	/* recv'r fifo overrun		*/
-	__u32	rx_missed_errors; /* receiver missed packet	*/
+	__u32	rx_over_errors;
+	__u32	rx_crc_errors;
+	__u32	rx_frame_errors;
+	__u32	rx_fifo_errors;
+	__u32	rx_missed_errors;
 
 	/* detailed tx_errors */
 	__u32	tx_aborted_errors;
@@ -37,7 +36,7 @@ struct rtnl_link_stats {
 	__u32	rx_compressed;
 	__u32	tx_compressed;
 
-	__u32	rx_nohandler;	/* dropped, no handler found	*/
+	__u32	rx_nohandler;
 };
-- 
cgit v1.2.3


From d65a977087f94f3bb97f351798d864556063109a Mon Sep 17 00:00:00 2001
From: Thomas Pedersen
Date: Tue, 8 Sep 2020 12:03:03 -0700
Subject: nl80211: advertise supported channel width in S1G

S1G supports 5 channel widths: 1, 2, 4, 8, and 16 MHz. One channel
width is allowed per frequency in each operating class, so it makes
more sense to advertise the specific channel width allowed.

Signed-off-by: Thomas Pedersen
Link: https://lore.kernel.org/r/20200908190323.15814-3-thomas@adapt-ip.com
Signed-off-by: Johannes Berg
---
 include/net/cfg80211.h | 15 +++++++++++++++
 include/uapi/linux/nl80211.h | 15 +++++++++++++++
 net/wireless/nl80211.c | 15 +++++++++++++++
 3 files changed, 45 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 7ad530912b21..2a7561743717 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -96,6 +96,16 @@ struct wiphy;
 * @IEEE80211_CHAN_NO_10MHZ: 10 MHz bandwidth is not permitted
 *	on this channel.
 * @IEEE80211_CHAN_NO_HE: HE operation is not permitted on this channel.
+ * @IEEE80211_CHAN_1MHZ: 1 MHz bandwidth is permitted
+ *	on this channel.
+ * @IEEE80211_CHAN_2MHZ: 2 MHz bandwidth is permitted
+ *	on this channel.
+ * @IEEE80211_CHAN_4MHZ: 4 MHz bandwidth is permitted
+ *	on this channel.
+ * @IEEE80211_CHAN_8MHZ: 8 MHz bandwidth is permitted
+ *	on this channel.
+ * @IEEE80211_CHAN_16MHZ: 16 MHz bandwidth is permitted
* */ enum ieee80211_channel_flags { @@ -113,6 +123,11 @@ enum ieee80211_channel_flags { IEEE80211_CHAN_NO_20MHZ = 1<<11, IEEE80211_CHAN_NO_10MHZ = 1<<12, IEEE80211_CHAN_NO_HE = 1<<13, + IEEE80211_CHAN_1MHZ = 1<<14, + IEEE80211_CHAN_2MHZ = 1<<15, + IEEE80211_CHAN_4MHZ = 1<<16, + IEEE80211_CHAN_8MHZ = 1<<17, + IEEE80211_CHAN_16MHZ = 1<<18, }; #define IEEE80211_CHAN_NO_HT40 \ diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 0584e0d349f0..4e119c6afa31 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3737,6 +3737,16 @@ enum nl80211_wmm_rule { * @NL80211_FREQUENCY_ATTR_NO_HE: HE operation is not allowed on this channel * in current regulatory domain. * @NL80211_FREQUENCY_ATTR_OFFSET: frequency offset in KHz + * @NL80211_FREQUENCY_ATTR_1MHZ: 1 MHz operation is allowed + * on this channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_2MHZ: 2 MHz operation is allowed + * on this channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_4MHZ: 4 MHz operation is allowed + * on this channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_8MHZ: 8 MHz operation is allowed + * on this channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_16MHZ: 16 MHz operation is allowed + * on this channel in current regulatory domain. * @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number * currently defined * @__NL80211_FREQUENCY_ATTR_AFTER_LAST: internal use @@ -3768,6 +3778,11 @@ enum nl80211_frequency_attr { NL80211_FREQUENCY_ATTR_WMM, NL80211_FREQUENCY_ATTR_NO_HE, NL80211_FREQUENCY_ATTR_OFFSET, + NL80211_FREQUENCY_ATTR_1MHZ, + NL80211_FREQUENCY_ATTR_2MHZ, + NL80211_FREQUENCY_ATTR_4MHZ, + NL80211_FREQUENCY_ATTR_8MHZ, + NL80211_FREQUENCY_ATTR_16MHZ, /* keep last */ __NL80211_FREQUENCY_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 52a35e788547..7da4d84bcc1a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1010,6 +1010,21 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, if ((chan->flags & IEEE80211_CHAN_NO_HE) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_HE)) goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_1MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_1MHZ)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_2MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_2MHZ)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_4MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_4MHZ)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_8MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_8MHZ)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_16MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_16MHZ)) + goto nla_put_failure; } if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER, -- cgit v1.2.3 From 291c49ded2fda1fd0d7bd6056de99fe47d2332e6 Mon Sep 17 00:00:00 2001 From: Aloka Dixit Date: Fri, 11 Sep 2020 00:05:29 +0000 Subject: nl80211: Add FILS discovery support FILS discovery attribute, NL80211_ATTR_FILS_DISCOVERY, is nested which supports following parameters as given in IEEE Std 802.11ai-2016, Annex C.3 MIB detail: (1) NL80211_FILS_DISCOVERY_ATTR_INT_MIN - Minimum packet interval (2) NL80211_FILS_DISCOVERY_ATTR_INT_MAX - Maximum packet interval (3) NL80211_FILS_DISCOVERY_ATTR_TMPL - Template data Signed-off-by: Aloka Dixit Link: https://lore.kernel.org/r/20200805011838.28166-2-alokad@codeaurora.org [fix attribute and other names, use NLA_RANGE(), use policy only 
once] Link: https://lore.kernel.org/r/010101747a7b38a8-306f06b2-9061-4baf-81c1-054a42a18e22-000000@us-west-2.amazonses.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 19 +++++++++++++++++ include/uapi/linux/nl80211.h | 44 +++++++++++++++++++++++++++++++++++++++ net/wireless/nl80211.c | 49 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 112 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 44db9f80e495..c90700727945 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1082,6 +1082,23 @@ struct cfg80211_acl_data { struct mac_address mac_addrs[]; }; +/** + * struct cfg80211_fils_discovery - FILS discovery parameters from + * IEEE Std 802.11ai-2016, Annex C.3 MIB detail. + * + * @min_interval: Minimum packet interval in TUs (0 - 10000) + * @max_interval: Maximum packet interval in TUs (0 - 10000) + * @tmpl_len: Template length + * @tmpl: Template data for FILS discovery frame including the action + * frame headers. + */ +struct cfg80211_fils_discovery { + u32 min_interval; + u32 max_interval; + size_t tmpl_len; + const u8 *tmpl; +}; + /** * enum cfg80211_ap_settings_flags - AP settings flags * @@ -1129,6 +1146,7 @@ enum cfg80211_ap_settings_flags { * @he_obss_pd: OBSS Packet Detection settings * @he_bss_color: BSS Color settings * @he_oper: HE operation IE (or %NULL if HE isn't enabled) + * @fils_discovery: FILS discovery transmission parameters */ struct cfg80211_ap_settings { struct cfg80211_chan_def chandef; @@ -1159,6 +1177,7 @@ struct cfg80211_ap_settings { u32 flags; struct ieee80211_he_obss_pd he_obss_pd; struct cfg80211_he_bss_color he_bss_color; + struct cfg80211_fils_discovery fils_discovery; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 4e119c6afa31..ad2bea3b07e3 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2513,6 +2513,10 @@ enum nl80211_commands { * @NL80211_ATTR_HE_6GHZ_CAPABILITY: HE 6 GHz Band Capability element (from * association request when used with NL80211_CMD_NEW_STATION). * + * @NL80211_ATTR_FILS_DISCOVERY: Optional parameter to configure FILS + * discovery. It is a nested attribute, see + * &enum nl80211_fils_discovery_attributes. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2995,6 +2999,8 @@ enum nl80211_attrs { NL80211_ATTR_HE_6GHZ_CAPABILITY, + NL80211_ATTR_FILS_DISCOVERY, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -5867,6 +5873,9 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_SAE_OFFLOAD_AP: Device wants to do SAE authentication * in AP mode (SAE password is passed as part of the start AP command). * + * @NL80211_EXT_FEATURE_FILS_DISCOVERY: Driver/device supports FILS discovery + * frames transmission + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. 
*/ @@ -5925,6 +5934,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_OPERATING_CHANNEL_VALIDATION, NL80211_EXT_FEATURE_4WAY_HANDSHAKE_AP_PSK, NL80211_EXT_FEATURE_SAE_OFFLOAD_AP, + NL80211_EXT_FEATURE_FILS_DISCOVERY, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, @@ -7019,4 +7029,38 @@ enum nl80211_iftype_akm_attributes { NL80211_IFTYPE_AKM_ATTR_MAX = __NL80211_IFTYPE_AKM_ATTR_LAST - 1, }; +/** + * enum nl80211_fils_discovery_attributes - FILS discovery configuration + * from IEEE Std 802.11ai-2016, Annex C.3 MIB detail. + * + * @__NL80211_FILS_DISCOVERY_ATTR_INVALID: Invalid + * + * @NL80211_FILS_DISCOVERY_ATTR_INT_MIN: Minimum packet interval (u32, TU). + * Allowed range: 0..10000 (TU = Time Unit) + * @NL80211_FILS_DISCOVERY_ATTR_INT_MAX: Maximum packet interval (u32, TU). + * Allowed range: 0..10000 (TU = Time Unit) + * @NL80211_FILS_DISCOVERY_ATTR_TMPL: Template data for FILS discovery action + * frame including the headers. + * + * @__NL80211_FILS_DISCOVERY_ATTR_LAST: Internal + * @NL80211_FILS_DISCOVERY_ATTR_MAX: highest attribute + */ +enum nl80211_fils_discovery_attributes { + __NL80211_FILS_DISCOVERY_ATTR_INVALID, + + NL80211_FILS_DISCOVERY_ATTR_INT_MIN, + NL80211_FILS_DISCOVERY_ATTR_INT_MAX, + NL80211_FILS_DISCOVERY_ATTR_TMPL, + + /* keep last */ + __NL80211_FILS_DISCOVERY_ATTR_LAST, + NL80211_FILS_DISCOVERY_ATTR_MAX = __NL80211_FILS_DISCOVERY_ATTR_LAST - 1 +}; + +/* + * FILS discovery template minimum length with action frame headers and + * mandatory fields. + */ +#define NL80211_FILS_DISCOVERY_TMPL_MIN_LEN 42 + #endif /* __LINUX_NL80211_H */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 5d9d51cfc653..afe782887ca9 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -376,6 +376,15 @@ nl80211_tid_config_attr_policy[NL80211_TID_CONFIG_ATTR_MAX + 1] = { NLA_POLICY_NESTED(nl80211_txattr_policy), }; +static const struct nla_policy +nl80211_fils_discovery_policy[NL80211_FILS_DISCOVERY_ATTR_MAX + 1] = { + [NL80211_FILS_DISCOVERY_ATTR_INT_MIN] = NLA_POLICY_MAX(NLA_U32, 10000), + [NL80211_FILS_DISCOVERY_ATTR_INT_MAX] = NLA_POLICY_MAX(NLA_U32, 10000), + NLA_POLICY_RANGE(NLA_BINARY, + NL80211_FILS_DISCOVERY_TMPL_MIN_LEN, + IEEE80211_MAX_DATA_LEN), +}; + static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [0] = { .strict_start_type = NL80211_ATTR_HE_OBSS_PD }, [NL80211_ATTR_WIPHY] = { .type = NLA_U32 }, @@ -684,6 +693,8 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_SCAN_FREQ_KHZ] = { .type = NLA_NESTED }, [NL80211_ATTR_HE_6GHZ_CAPABILITY] = NLA_POLICY_EXACT_LEN(sizeof(struct ieee80211_he_6ghz_capa)), + [NL80211_ATTR_FILS_DISCOVERY] = + NLA_POLICY_NESTED(nl80211_fils_discovery_policy), }; /* policy for the key attributes */ @@ -4874,6 +4885,36 @@ static int nl80211_parse_he_bss_color(struct nlattr *attrs, return 0; } +static int nl80211_parse_fils_discovery(struct cfg80211_registered_device *rdev, + struct nlattr *attrs, + struct cfg80211_ap_settings *params) +{ + struct nlattr *tb[NL80211_FILS_DISCOVERY_ATTR_MAX + 1]; + int ret; + struct cfg80211_fils_discovery *fd = ¶ms->fils_discovery; + + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_FILS_DISCOVERY)) + return -EINVAL; + + ret = nla_parse_nested(tb, NL80211_FILS_DISCOVERY_ATTR_MAX, attrs, + NULL, NULL); + if (ret) + return ret; + + if (!tb[NL80211_FILS_DISCOVERY_ATTR_INT_MIN] || + !tb[NL80211_FILS_DISCOVERY_ATTR_INT_MAX] || + !tb[NL80211_FILS_DISCOVERY_ATTR_TMPL]) + return -EINVAL; + + 
fd->tmpl_len = nla_len(tb[NL80211_FILS_DISCOVERY_ATTR_TMPL]); + fd->tmpl = nla_data(tb[NL80211_FILS_DISCOVERY_ATTR_TMPL]); + fd->min_interval = nla_get_u32(tb[NL80211_FILS_DISCOVERY_ATTR_INT_MIN]); + fd->max_interval = nla_get_u32(tb[NL80211_FILS_DISCOVERY_ATTR_INT_MAX]); + + return 0; +} + static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params, const u8 *rates) { @@ -5182,6 +5223,14 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) goto out; } + if (info->attrs[NL80211_ATTR_FILS_DISCOVERY]) { + err = nl80211_parse_fils_discovery(rdev, + info->attrs[NL80211_ATTR_FILS_DISCOVERY], + ¶ms); + if (err) + goto out; + } + nl80211_calculate_ap_params(¶ms); if (info->attrs[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT]) -- cgit v1.2.3 From 7443dcd1f1718a355e9c4ebeb7e95c3f9f27bb5f Mon Sep 17 00:00:00 2001 From: Aloka Dixit Date: Fri, 11 Sep 2020 00:33:00 +0000 Subject: nl80211: Unsolicited broadcast probe response support This patch adds new attributes to support unsolicited broadcast probe response transmission used for in-band discovery in 6GHz band (IEEE P802.11ax/D6.0 26.17.2.3.2, AP behavior for fast passive scanning). The new attribute, NL80211_ATTR_UNSOL_BCAST_PROBE_RESP, is nested which supports following parameters: (1) NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT - Packet interval (2) NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL - Template data Signed-off-by: Aloka Dixit Link: https://lore.kernel.org/r/010101747a946698-aac263ae-2ed3-4dab-9590-0bc7131214e1-000000@us-west-2.amazonses.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 18 +++++++++++++++++ include/uapi/linux/nl80211.h | 36 ++++++++++++++++++++++++++++++++++ net/wireless/nl80211.c | 46 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 100 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index c90700727945..93d666a571da 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1099,6 +1099,22 @@ struct cfg80211_fils_discovery { const u8 *tmpl; }; +/** + * struct cfg80211_unsol_bcast_probe_resp - Unsolicited broadcast probe + * response parameters in 6GHz. + * + * @interval: Packet interval in TUs. Maximum allowed is 20 TU, as mentioned + * in IEEE P802.11ax/D6.0 26.17.2.3.2 - AP behavior for fast passive + * scanning + * @tmpl_len: Template length + * @tmpl: Template data for probe response + */ +struct cfg80211_unsol_bcast_probe_resp { + u32 interval; + size_t tmpl_len; + const u8 *tmpl; +}; + /** * enum cfg80211_ap_settings_flags - AP settings flags * @@ -1147,6 +1163,7 @@ enum cfg80211_ap_settings_flags { * @he_bss_color: BSS Color settings * @he_oper: HE operation IE (or %NULL if HE isn't enabled) * @fils_discovery: FILS discovery transmission parameters + * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters */ struct cfg80211_ap_settings { struct cfg80211_chan_def chandef; @@ -1178,6 +1195,7 @@ struct cfg80211_ap_settings { struct ieee80211_he_obss_pd he_obss_pd; struct cfg80211_he_bss_color he_bss_color; struct cfg80211_fils_discovery fils_discovery; + struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index ad2bea3b07e3..bdc90b8dfd24 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2517,6 +2517,10 @@ enum nl80211_commands { * discovery. It is a nested attribute, see * &enum nl80211_fils_discovery_attributes. 
* + * @NL80211_ATTR_UNSOL_BCAST_PROBE_RESP: Optional parameter to configure + * unsolicited broadcast probe response. It is a nested attribute, see + * &enum nl80211_unsol_bcast_probe_resp_attributes. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3001,6 +3005,8 @@ enum nl80211_attrs { NL80211_ATTR_FILS_DISCOVERY, + NL80211_ATTR_UNSOL_BCAST_PROBE_RESP, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -5876,6 +5882,9 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_FILS_DISCOVERY: Driver/device supports FILS discovery * frames transmission * + * @NL80211_EXT_FEATURE_UNSOL_BCAST_PROBE_RESP: Driver/device supports + * unsolicited broadcast probe response transmission + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -5935,6 +5944,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_4WAY_HANDSHAKE_AP_PSK, NL80211_EXT_FEATURE_SAE_OFFLOAD_AP, NL80211_EXT_FEATURE_FILS_DISCOVERY, + NL80211_EXT_FEATURE_UNSOL_BCAST_PROBE_RESP, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, @@ -7063,4 +7073,30 @@ enum nl80211_fils_discovery_attributes { */ #define NL80211_FILS_DISCOVERY_TMPL_MIN_LEN 42 +/** + * enum nl80211_unsol_bcast_probe_resp_attributes - Unsolicited broadcast probe + * response configuration. Applicable only in 6GHz. + * + * @__NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INVALID: Invalid + * + * @NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT: Maximum packet interval (u32, TU). + * Allowed range: 0..20 (TU = Time Unit). IEEE P802.11ax/D6.0 + * 26.17.2.3.2 (AP behavior for fast passive scanning). + * @NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL: Unsolicited broadcast probe response + * frame template (binary). 
+ * + * @__NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_LAST: Internal + * @NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_MAX: highest attribute + */ +enum nl80211_unsol_bcast_probe_resp_attributes { + __NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INVALID, + + NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT, + NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL, + + /* keep last */ + __NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_LAST, + NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_MAX = + __NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_LAST - 1 +}; #endif /* __LINUX_NL80211_H */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index afe782887ca9..1a212db7a300 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -385,6 +385,13 @@ nl80211_fils_discovery_policy[NL80211_FILS_DISCOVERY_ATTR_MAX + 1] = { IEEE80211_MAX_DATA_LEN), }; +static const struct nla_policy +nl80211_unsol_bcast_probe_resp_policy[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_MAX + 1] = { + [NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT] = NLA_POLICY_MAX(NLA_U32, 20), + [NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL] = { .type = NLA_BINARY, + .len = IEEE80211_MAX_DATA_LEN } +}; + static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [0] = { .strict_start_type = NL80211_ATTR_HE_OBSS_PD }, [NL80211_ATTR_WIPHY] = { .type = NLA_U32 }, @@ -695,6 +702,8 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { NLA_POLICY_EXACT_LEN(sizeof(struct ieee80211_he_6ghz_capa)), [NL80211_ATTR_FILS_DISCOVERY] = NLA_POLICY_NESTED(nl80211_fils_discovery_policy), + [NL80211_ATTR_UNSOL_BCAST_PROBE_RESP] = + NLA_POLICY_NESTED(nl80211_unsol_bcast_probe_resp_policy), }; /* policy for the key attributes */ @@ -4915,6 +4924,35 @@ static int nl80211_parse_fils_discovery(struct cfg80211_registered_device *rdev, return 0; } +static int +nl80211_parse_unsol_bcast_probe_resp(struct cfg80211_registered_device *rdev, + struct nlattr *attrs, + struct cfg80211_ap_settings *params) +{ + struct nlattr *tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_MAX + 1]; + int ret; + struct cfg80211_unsol_bcast_probe_resp *presp = + ¶ms->unsol_bcast_probe_resp; + + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_UNSOL_BCAST_PROBE_RESP)) + return -EINVAL; + + ret = nla_parse_nested(tb, NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_MAX, + attrs, NULL, NULL); + if (ret) + return ret; + + if (!tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT] || + !tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL]) + return -EINVAL; + + presp->tmpl = nla_data(tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL]); + presp->tmpl_len = nla_len(tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL]); + presp->interval = nla_get_u32(tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT]); + return 0; +} + static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params, const u8 *rates) { @@ -5231,6 +5269,14 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) goto out; } + if (info->attrs[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP]) { + err = nl80211_parse_unsol_bcast_probe_resp( + rdev, info->attrs[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP], + ¶ms); + if (err) + return err; + } + nl80211_calculate_ap_params(¶ms); if (info->attrs[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT]) -- cgit v1.2.3 From c6ff213fe5b8696c9539a1b34ff03de9306dfff9 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Tue, 8 Sep 2020 18:01:48 +0200 Subject: fuse: add submount support to - Add fuse_attr.flags - Add FUSE_ATTR_SUBMOUNT This is a flag for fuse_attr.flags that indicates that the given entry resides on a different filesystem than the parent, and as such should have a different st_dev. 
- Add FUSE_SUBMOUNTS The client sets this flag if it supports automounting directories. Signed-off-by: Max Reitz Signed-off-by: Miklos Szeredi --- include/uapi/linux/fuse.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 8899e4862309..7233502ea991 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -172,6 +172,9 @@ * - add FUSE_WRITE_KILL_PRIV flag * - add FUSE_SETUPMAPPING and FUSE_REMOVEMAPPING * - add map_alignment to fuse_init_out, add FUSE_MAP_ALIGNMENT flag + * + * 7.32 + * - add flags to fuse_attr, add FUSE_ATTR_SUBMOUNT, add FUSE_SUBMOUNTS */ #ifndef _LINUX_FUSE_H @@ -207,7 +210,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 31 +#define FUSE_KERNEL_MINOR_VERSION 32 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -231,7 +234,7 @@ struct fuse_attr { uint32_t gid; uint32_t rdev; uint32_t blksize; - uint32_t padding; + uint32_t flags; }; struct fuse_kstatfs { @@ -316,6 +319,7 @@ struct fuse_file_lock { * FUSE_MAP_ALIGNMENT: init_out.map_alignment contains log2(byte alignment) for * foffset and moffset fields in struct * fuse_setupmapping_out and fuse_removemapping_one. + * FUSE_SUBMOUNTS: kernel supports auto-mounting directory submounts */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -344,6 +348,7 @@ struct fuse_file_lock { #define FUSE_NO_OPENDIR_SUPPORT (1 << 24) #define FUSE_EXPLICIT_INVAL_DATA (1 << 25) #define FUSE_MAP_ALIGNMENT (1 << 26) +#define FUSE_SUBMOUNTS (1 << 27) /** * CUSE INIT request/reply flags @@ -419,6 +424,13 @@ struct fuse_file_lock { */ #define FUSE_FSYNC_FDATASYNC (1 << 0) +/** + * fuse_attr flags + * + * FUSE_ATTR_SUBMOUNT: Object is a submount root + */ +#define FUSE_ATTR_SUBMOUNT (1 << 0) + enum fuse_opcode { FUSE_LOOKUP = 1, FUSE_FORGET = 2, /* no reply */ -- cgit v1.2.3 From f92970c694b36a4dbac2b650b173c78c0f0954cc Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Thu, 17 Sep 2020 18:13:23 -0700 Subject: devlink: add timeout information to status_notify Add a timeout element to the DEVLINK_CMD_FLASH_UPDATE_STATUS netlink message for use by a userland utility to show that a particular firmware flash activity may take a long but bounded time to finish. Also add a handy helper for drivers to make use of the new timeout value. UI usage hints: - if non-zero, add timeout display to the end of the status line [component] status_msg ( Xm Ys : Am Bs ) using the timeout value for Am Bs and updating the Xm Ys every second - if the timeout expires while awaiting the next update, display something like [component] status_msg ( timeout reached : Am Bs ) - if new status notify messages are received, remove the timeout and start over Signed-off-by: Shannon Nelson Reviewed-by: Jakub Kicinski Reviewed-by: Jacob Keller Signed-off-by: David S. 
Miller --- include/net/devlink.h | 4 ++++ include/uapi/linux/devlink.h | 3 +++ net/core/devlink.c | 29 +++++++++++++++++++++++------ 3 files changed, 30 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/devlink.h b/include/net/devlink.h index 48b1c1ef1ebd..be132c17fbcc 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1403,6 +1403,10 @@ void devlink_flash_update_status_notify(struct devlink *devlink, const char *component, unsigned long done, unsigned long total); +void devlink_flash_update_timeout_notify(struct devlink *devlink, + const char *status_msg, + const char *component, + unsigned long timeout); int devlink_traps_register(struct devlink *devlink, const struct devlink_trap *traps, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 631f5bdf1707..a2ecc8b00611 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -462,6 +462,9 @@ enum devlink_attr { DEVLINK_ATTR_PORT_EXTERNAL, /* u8 */ DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, /* u32 */ + + DEVLINK_ATTR_FLASH_UPDATE_STATUS_TIMEOUT, /* u64 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index e5b71f3c2d4d..a32e15851119 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -3024,7 +3024,9 @@ static int devlink_nl_flash_update_fill(struct sk_buff *msg, enum devlink_command cmd, const char *status_msg, const char *component, - unsigned long done, unsigned long total) + unsigned long done, + unsigned long total, + unsigned long timeout) { void *hdr; @@ -3052,6 +3054,9 @@ static int devlink_nl_flash_update_fill(struct sk_buff *msg, if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL, total, DEVLINK_ATTR_PAD)) goto nla_put_failure; + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TIMEOUT, + timeout, DEVLINK_ATTR_PAD)) + goto nla_put_failure; out: genlmsg_end(msg, hdr); @@ -3067,7 +3072,8 @@ static void __devlink_flash_update_notify(struct devlink *devlink, const char *status_msg, const char *component, unsigned long done, - unsigned long total) + unsigned long total, + unsigned long timeout) { struct sk_buff *msg; int err; @@ -3081,7 +3087,7 @@ static void __devlink_flash_update_notify(struct devlink *devlink, return; err = devlink_nl_flash_update_fill(msg, devlink, cmd, status_msg, - component, done, total); + component, done, total, timeout); if (err) goto out_free_msg; @@ -3097,7 +3103,7 @@ void devlink_flash_update_begin_notify(struct devlink *devlink) { __devlink_flash_update_notify(devlink, DEVLINK_CMD_FLASH_UPDATE, - NULL, NULL, 0, 0); + NULL, NULL, 0, 0, 0); } EXPORT_SYMBOL_GPL(devlink_flash_update_begin_notify); @@ -3105,7 +3111,7 @@ void devlink_flash_update_end_notify(struct devlink *devlink) { __devlink_flash_update_notify(devlink, DEVLINK_CMD_FLASH_UPDATE_END, - NULL, NULL, 0, 0); + NULL, NULL, 0, 0, 0); } EXPORT_SYMBOL_GPL(devlink_flash_update_end_notify); @@ -3117,10 +3123,21 @@ void devlink_flash_update_status_notify(struct devlink *devlink, { __devlink_flash_update_notify(devlink, DEVLINK_CMD_FLASH_UPDATE_STATUS, - status_msg, component, done, total); + status_msg, component, done, total, 0); } EXPORT_SYMBOL_GPL(devlink_flash_update_status_notify); +void devlink_flash_update_timeout_notify(struct devlink *devlink, + const char *status_msg, + const char *component, + unsigned long timeout) +{ + __devlink_flash_update_notify(devlink, + DEVLINK_CMD_FLASH_UPDATE_STATUS, + status_msg, component, 
0, 0, timeout); +} +EXPORT_SYMBOL_GPL(devlink_flash_update_timeout_notify); + static int devlink_nl_cmd_flash_update(struct sk_buff *skb, struct genl_info *info) { -- cgit v1.2.3 From daef1ee3798b25e8464b8eb618eaa74b8f423ac7 Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Fri, 18 Sep 2020 08:17:27 +0700 Subject: tipc: introduce encryption master key In addition to the supported cluster & per-node encryption keys for the en/decryption of TIPC messages, we now introduce one option for the user to set a cluster key as 'master key', which is simply a symmetric key like the former but has a longer life cycle. It has two purposes: - Authentication of new member nodes in the cluster. New nodes, having no knowledge of current session keys in the cluster, will still be able to join the cluster as long as they know the master key. This is because all neighbor discovery (LINK_CONFIG) messages must be encrypted with this key. - Encryption of session encryption keys during automatic exchange and update of those. This is a feature we will introduce in a later commit in this series. We insert the new key into the currently unused slot 0 in the key array and start using it immediately once the user has set it. After joining, a node knowing only the master key should be able to communicate fully with existing nodes in the cluster, although those nodes may have their own session keys activated (i.e. not the master one). To support this, we define a 'grace period', starting from the time a node itself reports having no RX keys, during which the existing nodes will use the master key for encryption instead. The grace period can be extended but will automatically stop after e.g. 5 seconds without a new report. This is also the basis for the later key exchange feature, since the new node would be unable to decrypt anything without the support of the master key. For the user to set a master key, we define a new netlink flag - 'TIPC_NLA_NODE_KEY_MASTER' - so it can be added to the current 'set key' netlink command to specify that the key being set is a master key. Above all, the traditional cluster/per-node key mechanism is guaranteed to keep working when the user chooses not to use this master key option, and it is also compatible with legacy nodes that do not support the feature. Even the master key can be updated without any interruption of cluster connectivity, but if that is needed, it has to be coordinated and set by the user. Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S.
Miller --- include/uapi/linux/tipc_netlink.h | 1 + net/tipc/crypto.c | 210 ++++++++++++++++++++++++++++---------- net/tipc/crypto.h | 15 ++- net/tipc/msg.h | 4 +- net/tipc/netlink.c | 1 + net/tipc/node.c | 6 +- 6 files changed, 175 insertions(+), 62 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index dc0d23a50e69..d484baa9d365 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -165,6 +165,7 @@ enum { TIPC_NLA_NODE_UP, /* flag */ TIPC_NLA_NODE_ID, /* data */ TIPC_NLA_NODE_KEY, /* data */ + TIPC_NLA_NODE_KEY_MASTER, /* flag */ __TIPC_NLA_NODE_MAX, TIPC_NLA_NODE_MAX = __TIPC_NLA_NODE_MAX - 1 diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index 45a8f4d9d9de..2510b82d3cc1 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -38,6 +38,7 @@ #include #include "crypto.h" +#define TIPC_TX_GRACE_PERIOD msecs_to_jiffies(5000) /* 5s */ #define TIPC_TX_LASTING_TIME msecs_to_jiffies(10000) /* 10s */ #define TIPC_RX_ACTIVE_LIM msecs_to_jiffies(3000) /* 3s */ #define TIPC_RX_PASSIVE_LIM msecs_to_jiffies(15000) /* 15s */ @@ -49,9 +50,9 @@ * TIPC Key ids */ enum { - KEY_UNUSED = 0, - KEY_MIN, - KEY_1 = KEY_MIN, + KEY_MASTER = 0, + KEY_MIN = KEY_MASTER, + KEY_1 = 1, KEY_2, KEY_3, KEY_MAX = KEY_3, @@ -166,27 +167,36 @@ struct tipc_crypto_stats { * @aead: array of pointers to AEAD keys for encryption/decryption * @peer_rx_active: replicated peer RX active key index * @key: the key states - * @working: the crypto is working or not * @stats: the crypto statistics * @name: the crypto name * @sndnxt: the per-peer sndnxt (TX) * @timer1: general timer 1 (jiffies) * @timer2: general timer 2 (jiffies) + * @working: the crypto is working or not + * @key_master: flag indicates if master key exists + * @legacy_user: flag indicates if a peer joins w/o master key (for bwd comp.) 
* @lock: tipc_key lock */ struct tipc_crypto { struct net *net; struct tipc_node *node; - struct tipc_aead __rcu *aead[KEY_MAX + 1]; /* key[0] is UNUSED */ + struct tipc_aead __rcu *aead[KEY_MAX + 1]; atomic_t peer_rx_active; struct tipc_key key; - u8 working:1; struct tipc_crypto_stats __percpu *stats; char name[48]; atomic64_t sndnxt ____cacheline_aligned; unsigned long timer1; unsigned long timer2; + union { + struct { + u8 working:1; + u8 key_master:1; + u8 legacy_user:1; + }; + u8 flags; + }; spinlock_t lock; /* crypto lock */ } ____cacheline_aligned; @@ -236,13 +246,19 @@ static inline void tipc_crypto_key_set_state(struct tipc_crypto *c, u8 new_active, u8 new_pending); static int tipc_crypto_key_attach(struct tipc_crypto *c, - struct tipc_aead *aead, u8 pos); + struct tipc_aead *aead, u8 pos, + bool master_key); static bool tipc_crypto_key_try_align(struct tipc_crypto *rx, u8 new_pending); static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx, struct tipc_crypto *rx, - struct sk_buff *skb); + struct sk_buff *skb, + u8 tx_key); static void tipc_crypto_key_synch(struct tipc_crypto *rx, struct sk_buff *skb); static int tipc_crypto_key_revoke(struct net *net, u8 tx_key); +static inline void tipc_crypto_clone_msg(struct net *net, struct sk_buff *_skb, + struct tipc_bearer *b, + struct tipc_media_addr *dst, + struct tipc_node *__dnode, u8 type); static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead, struct tipc_bearer *b, struct sk_buff **skb, int err); @@ -943,8 +959,6 @@ bool tipc_ehdr_validate(struct sk_buff *skb) return false; if (unlikely(skb->len <= ehsz + TIPC_AES_GCM_TAG_SIZE)) return false; - if (unlikely(!ehdr->tx_key)) - return false; return true; } @@ -997,6 +1011,8 @@ static int tipc_ehdr_build(struct net *net, struct tipc_aead *aead, ehdr->tx_key = tx_key; ehdr->destined = (__rx) ? 1 : 0; ehdr->rx_key_active = (__rx) ? __rx->key.active : 0; + ehdr->rx_nokey = (__rx) ? !__rx->key.keys : 0; + ehdr->master_key = aead->crypto->key_master; ehdr->reserved_1 = 0; ehdr->reserved_2 = 0; @@ -1039,6 +1055,7 @@ static inline void tipc_crypto_key_set_state(struct tipc_crypto *c, * @c: TIPC crypto to which new key is attached * @ukey: the user key * @mode: the key mode (CLUSTER_KEY or PER_NODE_KEY) + * @master_key: specify this is a cluster master key * * A new TIPC AEAD key will be allocated and initiated with the specified user * key, then attached to the TIPC crypto. @@ -1046,7 +1063,7 @@ static inline void tipc_crypto_key_set_state(struct tipc_crypto *c, * Return: new key id in case of success, otherwise: < 0 */ int tipc_crypto_key_init(struct tipc_crypto *c, struct tipc_aead_key *ukey, - u8 mode) + u8 mode, bool master_key) { struct tipc_aead *aead = NULL; int rc = 0; @@ -1056,7 +1073,7 @@ int tipc_crypto_key_init(struct tipc_crypto *c, struct tipc_aead_key *ukey, /* Attach it to the crypto */ if (likely(!rc)) { - rc = tipc_crypto_key_attach(c, aead, 0); + rc = tipc_crypto_key_attach(c, aead, 0, master_key); if (rc < 0) tipc_aead_free(&aead->rcu); } @@ -1069,11 +1086,13 @@ int tipc_crypto_key_init(struct tipc_crypto *c, struct tipc_aead_key *ukey, * @c: TIPC crypto to which the new AEAD key is attached * @aead: the new AEAD key pointer * @pos: desired slot in the crypto key array, = 0 if any! 
+ * @master_key: specify this is a cluster master key * * Return: new key id in case of success, otherwise: -EBUSY */ static int tipc_crypto_key_attach(struct tipc_crypto *c, - struct tipc_aead *aead, u8 pos) + struct tipc_aead *aead, u8 pos, + bool master_key) { struct tipc_key key; int rc = -EBUSY; @@ -1081,6 +1100,10 @@ static int tipc_crypto_key_attach(struct tipc_crypto *c, spin_lock_bh(&c->lock); key = c->key; + if (master_key) { + new_key = KEY_MASTER; + goto attach; + } if (key.active && key.passive) goto exit; if (key.pending) { @@ -1112,8 +1135,7 @@ attach: tipc_crypto_key_set_state(c, key.passive, key.active, key.pending); c->working = 1; - c->timer1 = jiffies; - c->timer2 = jiffies; + c->key_master |= master_key; rc = new_key; exit: @@ -1126,7 +1148,7 @@ void tipc_crypto_key_flush(struct tipc_crypto *c) int k; spin_lock_bh(&c->lock); - c->working = 0; + c->flags = 0; tipc_crypto_key_set_state(c, 0, 0, 0); for (k = KEY_MIN; k <= KEY_MAX; k++) tipc_crypto_key_detach(c->aead[k], &c->lock); @@ -1202,6 +1224,7 @@ exit: * @tx: TX crypto handle * @rx: RX crypto handle (can be NULL) * @skb: the message skb which will be decrypted later + * @tx_key: peer TX key id * * This function looks up the existing TX keys and pick one which is suitable * for the message decryption, that must be a cluster key and not used before @@ -1211,7 +1234,8 @@ exit: */ static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx, struct tipc_crypto *rx, - struct sk_buff *skb) + struct sk_buff *skb, + u8 tx_key) { struct tipc_skb_cb *skb_cb = TIPC_SKB_CB(skb); struct tipc_aead *aead = NULL; @@ -1230,6 +1254,10 @@ static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx, /* Pick one TX key */ spin_lock(&tx->lock); + if (tx_key == KEY_MASTER) { + aead = tipc_aead_rcu_ptr(tx->aead[KEY_MASTER], &tx->lock); + goto done; + } do { k = (i == 0) ? key.pending : ((i == 1) ? key.active : key.passive); @@ -1249,9 +1277,12 @@ static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx, skb->next = skb_clone(skb, GFP_ATOMIC); if (unlikely(!skb->next)) pr_warn("Failed to clone skb for next round if any\n"); - WARN_ON(!refcount_inc_not_zero(&aead->refcnt)); break; } while (++i < 3); + +done: + if (likely(aead)) + WARN_ON(!refcount_inc_not_zero(&aead->refcnt)); spin_unlock(&tx->lock); return aead; @@ -1266,6 +1297,9 @@ static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx, * has changed, so the number of TX keys' users on this node are increased and * decreased correspondingly. * + * It also considers if peer has no key, then we need to make own master key + * (if any) taking over i.e. starting grace period. + * * The "per-peer" sndnxt is also reset when the peer key has switched. */ static void tipc_crypto_key_synch(struct tipc_crypto *rx, struct sk_buff *skb) @@ -1276,11 +1310,23 @@ static void tipc_crypto_key_synch(struct tipc_crypto *rx, struct sk_buff *skb) u32 self = tipc_own_addr(rx->net); u8 cur, new; - /* Ensure this message is destined to us first */ + /* Update RX 'key_master' flag according to peer, also mark "legacy" if + * a peer has no master key. 
+ */ + rx->key_master = ehdr->master_key; + if (!rx->key_master) + tx->legacy_user = 1; + + /* For later cases, apply only if message is destined to this node */ if (!ehdr->destined || msg_short(hdr) || msg_destnode(hdr) != self) return; - /* Peer RX active key has changed, let's update own TX users */ + /* Case 1: Peer has no keys, let's make master key take over */ + if (ehdr->rx_nokey) + /* Set or extend grace period */ + tx->timer2 = jiffies; + + /* Case 2: Peer RX active key has changed, let's update own TX users */ cur = atomic_read(&rx->peer_rx_active); new = ehdr->rx_key_active; if (tx->key.keys && @@ -1338,7 +1384,7 @@ int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net, return -ENOMEM; } - c->working = 0; + c->flags = 0; c->net = net; c->node = node; tipc_crypto_key_set_state(c, 0, 0, 0); @@ -1473,6 +1519,12 @@ s4: s5: spin_unlock(&rx->lock); + /* Relax it here, the flag will be set again if it really is, but only + * when we are not in grace period for safety! + */ + if (time_after(jiffies, tx->timer2 + TIPC_TX_GRACE_PERIOD)) + tx->legacy_user = 0; + /* Limit max_tfms & do debug commands if needed */ if (likely(sysctl_tipc_max_tfms <= TIPC_MAX_TFMS_LIM)) return; @@ -1482,6 +1534,22 @@ s5: tipc_crypto_do_cmd(rx->net, cmd); } +static inline void tipc_crypto_clone_msg(struct net *net, struct sk_buff *_skb, + struct tipc_bearer *b, + struct tipc_media_addr *dst, + struct tipc_node *__dnode, u8 type) +{ + struct sk_buff *skb; + + skb = skb_clone(_skb, GFP_ATOMIC); + if (skb) { + TIPC_SKB_CB(skb)->xmit_type = type; + tipc_crypto_xmit(net, &skb, b, dst, __dnode); + if (skb) + b->media->send_msg(net, skb, b, dst); + } +} + /** * tipc_crypto_xmit - Build & encrypt TIPC message for xmit * @net: struct net @@ -1491,7 +1559,8 @@ s5: * @__dnode: destination node for reference if any * * First, build an encryption message header on the top of the message, then - * encrypt the original TIPC message by using the active or pending TX key. + * encrypt the original TIPC message by using the pending, master or active + * key with this preference order. * If the encryption is successful, the encrypted skb is returned directly or * via the callback. * Otherwise, the skb is freed! @@ -1514,46 +1583,63 @@ int tipc_crypto_xmit(struct net *net, struct sk_buff **skb, struct tipc_msg *hdr = buf_msg(*skb); struct tipc_key key = tx->key; struct tipc_aead *aead = NULL; - struct sk_buff *_skb; - int rc = -ENOKEY; u32 user = msg_user(hdr); - u8 tx_key; + u32 type = msg_type(hdr); + int rc = -ENOKEY; + u8 tx_key = 0; /* No encryption? */ if (!tx->working) return 0; - /* Try with the pending key if available and: - * 1) This is the only choice (i.e. 
no active key) or; - * 2) Peer has switched to this key (unicast only) or; - * 3) It is time to do a pending key probe; - */ + /* Pending key if peer has active on it or probing time */ if (unlikely(key.pending)) { tx_key = key.pending; - if (!key.active) + if (!tx->key_master && !key.active) goto encrypt; if (__rx && atomic_read(&__rx->peer_rx_active) == tx_key) goto encrypt; - if (TIPC_SKB_CB(*skb)->probe) { + if (TIPC_SKB_CB(*skb)->xmit_type == SKB_PROBING) { pr_debug("%s: probing for key[%d]\n", tx->name, key.pending); goto encrypt; } - if (user == LINK_CONFIG || user == LINK_PROTOCOL) { - _skb = skb_clone(*skb, GFP_ATOMIC); - if (_skb) { - TIPC_SKB_CB(_skb)->probe = 1; - tipc_crypto_xmit(net, &_skb, b, dst, __dnode); - if (_skb) - b->media->send_msg(net, _skb, b, dst); + if (user == LINK_CONFIG || user == LINK_PROTOCOL) + tipc_crypto_clone_msg(net, *skb, b, dst, __dnode, + SKB_PROBING); + } + + /* Master key if this is a *vital* message or in grace period */ + if (tx->key_master) { + tx_key = KEY_MASTER; + if (!key.active) + goto encrypt; + if (TIPC_SKB_CB(*skb)->xmit_type == SKB_GRACING) { + pr_debug("%s: gracing for msg (%d %d)\n", tx->name, + user, type); + goto encrypt; + } + if (user == LINK_CONFIG || + (user == LINK_PROTOCOL && type == RESET_MSG) || + time_before(jiffies, tx->timer2 + TIPC_TX_GRACE_PERIOD)) { + if (__rx && __rx->key_master && + !atomic_read(&__rx->peer_rx_active)) + goto encrypt; + if (!__rx) { + if (likely(!tx->legacy_user)) + goto encrypt; + tipc_crypto_clone_msg(net, *skb, b, dst, + __dnode, SKB_GRACING); } } } + /* Else, use the active key if any */ if (likely(key.active)) { tx_key = key.active; goto encrypt; } + goto exit; encrypt: @@ -1619,15 +1705,16 @@ int tipc_crypto_rcv(struct net *net, struct tipc_crypto *rx, struct tipc_aead *aead = NULL; struct tipc_key key; int rc = -ENOKEY; - u8 tx_key = 0; + u8 tx_key; + + tx_key = ((struct tipc_ehdr *)(*skb)->data)->tx_key; /* New peer? * Let's try with TX key (i.e. cluster mode) & verify the skb first! */ - if (unlikely(!rx)) + if (unlikely(!rx || tx_key == KEY_MASTER)) goto pick_tx; - tx_key = ((struct tipc_ehdr *)(*skb)->data)->tx_key; /* Pick RX key according to TX key if any */ key = rx->key; if (tx_key == key.active || tx_key == key.pending || @@ -1640,7 +1727,7 @@ int tipc_crypto_rcv(struct net *net, struct tipc_crypto *rx, pick_tx: /* No key suitable? Try to pick one from TX... */ - aead = tipc_crypto_key_pick_tx(tx, rx, *skb); + aead = tipc_crypto_key_pick_tx(tx, rx, *skb, tx_key); if (aead) goto decrypt; goto exit; @@ -1722,9 +1809,12 @@ static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead, goto free_skb; } + /* Ignore cloning if it was TX master key */ + if (ehdr->tx_key == KEY_MASTER) + goto rcv; if (tipc_aead_clone(&tmp, aead) < 0) goto rcv; - if (tipc_crypto_key_attach(rx, tmp, ehdr->tx_key) < 0) { + if (tipc_crypto_key_attach(rx, tmp, ehdr->tx_key, false) < 0) { tipc_aead_free(&tmp->rcu); goto rcv; } @@ -1740,10 +1830,10 @@ static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead, /* Set the RX key's user */ tipc_aead_users_set(aead, 1); -rcv: /* Mark this point, RX works */ rx->timer1 = jiffies; +rcv: /* Remove ehdr & auth. 
tag prior to tipc_rcv() */ ehdr = (struct tipc_ehdr *)(*skb)->data; @@ -1865,14 +1955,24 @@ static char *tipc_crypto_key_dump(struct tipc_crypto *c, char *buf) char *s; for (k = KEY_MIN; k <= KEY_MAX; k++) { - if (k == key.passive) - s = "PAS"; - else if (k == key.active) - s = "ACT"; - else if (k == key.pending) - s = "PEN"; - else - s = "-"; + if (k == KEY_MASTER) { + if (is_rx(c)) + continue; + if (time_before(jiffies, + c->timer2 + TIPC_TX_GRACE_PERIOD)) + s = "ACT"; + else + s = "PAS"; + } else { + if (k == key.passive) + s = "PAS"; + else if (k == key.active) + s = "ACT"; + else if (k == key.pending) + s = "PEN"; + else + s = "-"; + } i += scnprintf(buf + i, 200 - i, "\tKey%d: %s", k, s); rcu_read_lock(); @@ -1905,7 +2005,7 @@ static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new, /* Output format: "[%s %s %s] -> [%s %s %s]", max len = 32 */ again: i += scnprintf(buf + i, 32 - i, "["); - for (k = KEY_MIN; k <= KEY_MAX; k++) { + for (k = KEY_1; k <= KEY_3; k++) { if (k == key->passive) s = "pas"; else if (k == key->active) @@ -1915,7 +2015,7 @@ again: else s = "-"; i += scnprintf(buf + i, 32 - i, - (k != KEY_MAX) ? "%s " : "%s", s); + (k != KEY_3) ? "%s " : "%s", s); } if (key != &new) { i += scnprintf(buf + i, 32 - i, "] -> "); diff --git a/net/tipc/crypto.h b/net/tipc/crypto.h index c387240e03d0..643b55077112 100644 --- a/net/tipc/crypto.h +++ b/net/tipc/crypto.h @@ -74,7 +74,7 @@ extern int sysctl_tipc_max_tfms __read_mostly; * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 * 1 0 9 8 7 6 5 4|3 2 1 0 9 8 7 6|5 4 3 2 1 0 9 8|7 6 5 4 3 2 1 0 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * w0:|Ver=7| User |D|TX |RX |K| Rsvd | + * w0:|Ver=7| User |D|TX |RX |K|M|N| Rsvd | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * w1:| Seqno | * w2:| (8 octets) | @@ -101,6 +101,9 @@ extern int sysctl_tipc_max_tfms __read_mostly; * RX : Currently RX active key corresponding to the destination * node's TX key (when the "D" bit is set) * K : Keep-alive bit (for RPS, LINK_PROTOCOL/STATE_MSG only) + * M : Bit indicates if sender has master key + * N : Bit indicates if sender has no RX keys corresponding to the + * receiver's TX (when the "D" bit is set) * Rsvd : Reserved bit, field * Word1-2: * Seqno : The 64-bit sequence number of the encrypted message, also @@ -117,7 +120,9 @@ struct tipc_ehdr { __u8 destined:1, user:4, version:3; - __u8 reserved_1:3, + __u8 reserved_1:1, + rx_nokey:1, + master_key:1, keepalive:1, rx_key_active:2, tx_key:2; @@ -128,7 +133,9 @@ struct tipc_ehdr { __u8 tx_key:2, rx_key_active:2, keepalive:1, - reserved_1:3; + master_key:1, + rx_nokey:1, + reserved_1:1; #else #error "Please fix " #endif @@ -158,7 +165,7 @@ int tipc_crypto_xmit(struct net *net, struct sk_buff **skb, int tipc_crypto_rcv(struct net *net, struct tipc_crypto *rx, struct sk_buff **skb, struct tipc_bearer *b); int tipc_crypto_key_init(struct tipc_crypto *c, struct tipc_aead_key *ukey, - u8 mode); + u8 mode, bool master_key); void tipc_crypto_key_flush(struct tipc_crypto *c); int tipc_aead_key_validate(struct tipc_aead_key *ukey, struct genl_info *info); bool tipc_ehdr_validate(struct sk_buff *skb); diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 1016e96db5c4..25e5c5c8a6ff 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -127,7 +127,9 @@ struct tipc_skb_cb { #ifdef CONFIG_TIPC_CRYPTO u8 encrypted:1; u8 decrypted:1; - u8 probe:1; +#define SKB_PROBING 1 +#define SKB_GRACING 2 + u8 xmit_type:2; u8 tx_clone_deferred:1; #endif }; 
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index c4aee6247d55..1ec00fcc26ee 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -108,6 +108,7 @@ const struct nla_policy tipc_nl_node_policy[TIPC_NLA_NODE_MAX + 1] = { .len = TIPC_NODEID_LEN}, [TIPC_NLA_NODE_KEY] = { .type = NLA_BINARY, .len = TIPC_AEAD_KEY_SIZE_MAX}, + [TIPC_NLA_NODE_KEY_MASTER] = { .type = NLA_FLAG }, }; /* Properties valid for media, bearer and link */ diff --git a/net/tipc/node.c b/net/tipc/node.c index 70045630e6bb..5da94d1dda77 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -2875,6 +2875,7 @@ static int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) struct tipc_crypto *tx = tipc_net(net)->crypto_tx, *c = tx; struct tipc_node *n = NULL; struct tipc_aead_key *ukey; + bool master_key = false; u8 *id, *own_id, mode; int rc = 0; @@ -2905,6 +2906,7 @@ static int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) switch (rc) { case -ENODATA: mode = CLUSTER_KEY; + master_key = !!(attrs[TIPC_NLA_NODE_KEY_MASTER]); break; case 0: mode = PER_NODE_KEY; @@ -2921,11 +2923,11 @@ static int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) } /* Initiate the TX/RX key */ - rc = tipc_crypto_key_init(c, ukey, mode); + rc = tipc_crypto_key_init(c, ukey, mode, master_key); if (n) tipc_node_put(n); - if (rc < 0) { + if (unlikely(rc < 0)) { GENL_SET_ERR_MSG(info, "unable to initiate or attach new key"); return rc; } -- cgit v1.2.3 From 23700da29b83e859a8c3727fddd33ba74c4f3a39 Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Fri, 18 Sep 2020 08:17:29 +0700 Subject: tipc: add automatic rekeying for encryption key Rekeying is required for security since a key is less secure when used for a long time. Also, a key will be detached when its nonce value (or seqno ...) is exhausted. We now make the rekeying process automatic and configurable by the user. Basically, TIPC will at a specific interval generate a new key by using the kernel 'Random Number Generator' cipher, then attach it as the node TX key and securely distribute it to the others in the cluster as RX keys (- the key exchange). The automatic key switching will then take over, and make the new key active shortly. Afterwards, the traffic from this node will be encrypted with the new session key. The same can happen in peer nodes but not necessarily at the same time. For simplicity, the automatically generated key will be initiated as a per-node key. It is not too hard to also support cluster key rekeying (e.g. a given node would generate a unique cluster key and update it to the others in the cluster...), but that doesn't bring much benefit, while a per-node key is even more secure. We also enable the user to force a rekeying or change the rekeying interval via netlink; the new 'set key' command option 'TIPC_NLA_NODE_REKEYING' is added for these purposes as follows: - A value >= 1 will be set as the rekeying interval (in minutes); - A value of 0 will disable the rekeying; - A value of 'TIPC_REKEYING_NOW' (~0) will force an immediate rekeying; The default rekeying interval is (60 * 24) minutes, i.e. done every day. There isn't any restriction on the value, but the user shouldn't set it too small or too large, which would result in an "ineffective" rekeying (that's ok for testing though). Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S.
Miller --- include/uapi/linux/tipc.h | 2 + include/uapi/linux/tipc_netlink.h | 1 + net/tipc/crypto.c | 113 +++++++++++++++++++++++++++++++++++++- net/tipc/crypto.h | 2 + net/tipc/netlink.c | 1 + net/tipc/node.c | 25 ++++++++- 6 files changed, 141 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h index add01db1daef..80ea15e12113 100644 --- a/include/uapi/linux/tipc.h +++ b/include/uapi/linux/tipc.h @@ -254,6 +254,8 @@ static inline int tipc_aead_key_size(struct tipc_aead_key *key) return sizeof(*key) + key->keylen; } +#define TIPC_REKEYING_NOW (~0U) + /* The macros and functions below are deprecated: */ diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index d484baa9d365..d847dd671d79 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -166,6 +166,7 @@ enum { TIPC_NLA_NODE_ID, /* data */ TIPC_NLA_NODE_KEY, /* data */ TIPC_NLA_NODE_KEY_MASTER, /* flag */ + TIPC_NLA_NODE_REKEYING, /* u32 */ __TIPC_NLA_NODE_MAX, TIPC_NLA_NODE_MAX = __TIPC_NLA_NODE_MAX - 1 diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index 91d8b268cae0..40c44101fe8e 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -36,6 +36,7 @@ #include #include +#include #include "crypto.h" #include "msg.h" #include "bcast.h" @@ -48,6 +49,8 @@ #define TIPC_MAX_TFMS_DEF 10 #define TIPC_MAX_TFMS_LIM 1000 +#define TIPC_REKEYING_INTV_DEF (60 * 24) /* default: 1 day */ + /** * TIPC Key ids */ @@ -181,6 +184,7 @@ struct tipc_crypto_stats { * @wq: common workqueue on TX crypto * @work: delayed work sched for TX/RX * @key_distr: key distributing state + * @rekeying_intv: rekeying interval (in minutes) * @stats: the crypto statistics * @name: the crypto name * @sndnxt: the per-peer sndnxt (TX) @@ -206,6 +210,7 @@ struct tipc_crypto { #define KEY_DISTR_SCHED 1 #define KEY_DISTR_COMPL 2 atomic_t key_distr; + u32 rekeying_intv; struct tipc_crypto_stats __percpu *stats; char name[48]; @@ -294,7 +299,9 @@ static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new, static int tipc_crypto_key_xmit(struct net *net, struct tipc_aead_key *skey, u16 gen, u8 mode, u32 dnode); static bool tipc_crypto_key_rcv(struct tipc_crypto *rx, struct tipc_msg *hdr); +static void tipc_crypto_work_tx(struct work_struct *work); static void tipc_crypto_work_rx(struct work_struct *work); +static int tipc_aead_key_generate(struct tipc_aead_key *skey); #define is_tx(crypto) (!(crypto)->node) #define is_rx(crypto) (!is_tx(crypto)) @@ -346,6 +353,27 @@ int tipc_aead_key_validate(struct tipc_aead_key *ukey, struct genl_info *info) return 0; } +/** + * tipc_aead_key_generate - Generate new session key + * @skey: input/output key with new content + * + * Return: 0 in case of success, otherwise < 0 + */ +static int tipc_aead_key_generate(struct tipc_aead_key *skey) +{ + int rc = 0; + + /* Fill the key's content with a random value via RNG cipher */ + rc = crypto_get_default_rng(); + if (likely(!rc)) { + rc = crypto_rng_get_bytes(crypto_default_rng, skey->key, + skey->keylen); + crypto_put_default_rng(); + } + + return rc; +} + static struct tipc_aead *tipc_aead_get(struct tipc_aead __rcu *aead) { struct tipc_aead *tmp; @@ -1471,6 +1499,7 @@ int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net, atomic64_set(&c->sndnxt, 0); c->timer1 = jiffies; c->timer2 = jiffies; + c->rekeying_intv = TIPC_REKEYING_INTV_DEF; spin_lock_init(&c->lock); scnprintf(c->name, 48, "%s(%s)", (is_rx(c)) ? 
"RX" : "TX", (is_rx(c)) ? tipc_node_get_id_str(c->node) : @@ -1478,6 +1507,8 @@ int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net, if (is_rx(c)) INIT_DELAYED_WORK(&c->work, tipc_crypto_work_rx); + else + INIT_DELAYED_WORK(&c->work, tipc_crypto_work_tx); *crypto = c; return 0; @@ -1492,8 +1523,11 @@ void tipc_crypto_stop(struct tipc_crypto **crypto) return; /* Flush any queued works & destroy wq */ - if (is_tx(c)) + if (is_tx(c)) { + c->rekeying_intv = 0; + cancel_delayed_work_sync(&c->work); destroy_workqueue(c->wq); + } /* Release AEAD keys */ rcu_read_lock(); @@ -2351,3 +2385,80 @@ static void tipc_crypto_work_rx(struct work_struct *work) tipc_node_put(rx->node); } + +/** + * tipc_crypto_rekeying_sched - (Re)schedule rekeying w/o new interval + * @tx: TX crypto + * @changed: if the rekeying needs to be rescheduled with new interval + * @new_intv: new rekeying interval (when "changed" = true) + */ +void tipc_crypto_rekeying_sched(struct tipc_crypto *tx, bool changed, + u32 new_intv) +{ + unsigned long delay; + bool now = false; + + if (changed) { + if (new_intv == TIPC_REKEYING_NOW) + now = true; + else + tx->rekeying_intv = new_intv; + cancel_delayed_work_sync(&tx->work); + } + + if (tx->rekeying_intv || now) { + delay = (now) ? 0 : tx->rekeying_intv * 60 * 1000; + queue_delayed_work(tx->wq, &tx->work, msecs_to_jiffies(delay)); + } +} + +/** + * tipc_crypto_work_tx - Scheduled TX works handler + * @work: the struct TX work + * + * The function processes the previous scheduled work, i.e. key rekeying, by + * generating a new session key based on current one, then attaching it to the + * TX crypto and finally distributing it to peers. It also re-schedules the + * rekeying if needed. + */ +static void tipc_crypto_work_tx(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct tipc_crypto *tx = container_of(dwork, struct tipc_crypto, work); + struct tipc_aead_key *skey = NULL; + struct tipc_key key = tx->key; + struct tipc_aead *aead; + int rc = -ENOMEM; + + if (unlikely(key.pending)) + goto resched; + + /* Take current key as a template */ + rcu_read_lock(); + aead = rcu_dereference(tx->aead[key.active ?: KEY_MASTER]); + if (unlikely(!aead)) { + rcu_read_unlock(); + /* At least one key should exist for securing */ + return; + } + + /* Lets duplicate it first */ + skey = kmemdup(aead->key, tipc_aead_key_size(aead->key), GFP_ATOMIC); + rcu_read_unlock(); + + /* Now, generate new key, initiate & distribute it */ + if (likely(skey)) { + rc = tipc_aead_key_generate(skey) ?: + tipc_crypto_key_init(tx, skey, PER_NODE_KEY, false); + if (likely(rc > 0)) + rc = tipc_crypto_key_distr(tx, rc, NULL); + kzfree(skey); + } + + if (unlikely(rc)) + pr_warn_ratelimited("%s: rekeying returns %d\n", tx->name, rc); + +resched: + /* Re-schedule rekeying if any */ + tipc_crypto_rekeying_sched(tx, false, 0); +} diff --git a/net/tipc/crypto.h b/net/tipc/crypto.h index b2a9c9b90684..e71193bd5e36 100644 --- a/net/tipc/crypto.h +++ b/net/tipc/crypto.h @@ -171,6 +171,8 @@ void tipc_crypto_key_flush(struct tipc_crypto *c); int tipc_crypto_key_distr(struct tipc_crypto *tx, u8 key, struct tipc_node *dest); void tipc_crypto_msg_rcv(struct net *net, struct sk_buff *skb); +void tipc_crypto_rekeying_sched(struct tipc_crypto *tx, bool changed, + u32 new_intv); int tipc_aead_key_validate(struct tipc_aead_key *ukey, struct genl_info *info); bool tipc_ehdr_validate(struct sk_buff *skb); diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 1ec00fcc26ee..c447cb5f879e 100644 
--- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -109,6 +109,7 @@ const struct nla_policy tipc_nl_node_policy[TIPC_NLA_NODE_MAX + 1] = { [TIPC_NLA_NODE_KEY] = { .type = NLA_BINARY, .len = TIPC_AEAD_KEY_SIZE_MAX}, [TIPC_NLA_NODE_KEY_MASTER] = { .type = NLA_FLAG }, + [TIPC_NLA_NODE_REKEYING] = { .type = NLA_U32 }, }; /* Properties valid for media, bearer and link */ diff --git a/net/tipc/node.c b/net/tipc/node.c index c9b6042e32b5..cf4b239fc569 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -2879,6 +2879,17 @@ static int tipc_nl_retrieve_nodeid(struct nlattr **attrs, u8 **node_id) return 0; } +static int tipc_nl_retrieve_rekeying(struct nlattr **attrs, u32 *intv) +{ + struct nlattr *attr = attrs[TIPC_NLA_NODE_REKEYING]; + + if (!attr) + return -ENODATA; + + *intv = nla_get_u32(attr); + return 0; +} + static int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attrs[TIPC_NLA_NODE_MAX + 1]; @@ -2886,8 +2897,9 @@ static int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) struct tipc_crypto *tx = tipc_net(net)->crypto_tx, *c = tx; struct tipc_node *n = NULL; struct tipc_aead_key *ukey; - bool master_key = false; + bool rekeying = true, master_key = false; u8 *id, *own_id, mode; + u32 intv = 0; int rc = 0; if (!info->attrs[TIPC_NLA_NODE]) @@ -2905,8 +2917,14 @@ static int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) return -EPERM; } + rc = tipc_nl_retrieve_rekeying(attrs, &intv); + if (rc == -ENODATA) + rekeying = false; + rc = tipc_nl_retrieve_key(attrs, &ukey); - if (rc) + if (rc == -ENODATA && rekeying) + goto rekeying; + else if (rc) return rc; rc = tipc_aead_key_validate(ukey, info); @@ -2945,6 +2963,9 @@ static int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) /* Distribute TX key but not master one */ if (!master_key && tipc_crypto_key_distr(tx, rc, NULL)) GENL_SET_ERR_MSG(info, "failed to replicate new key"); +rekeying: + /* Schedule TX rekeying if needed */ + tipc_crypto_rekeying_sched(tx, rekeying, intv); } return 0; -- cgit v1.2.3 From 55f13311785cebd60b9bab9ca7fd64205436c462 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Fri, 18 Sep 2020 14:14:51 -0500 Subject: ethtool: Add 100base-FX link mode entries Add entries for the 100base-FX full and half duplex supported modes. $ ethtool eth0 Supported ports: [ FIBRE ] Supported link modes: 100baseFX/Half 100baseFX/Full Supported pause frame use: Symmetric Receive-only Supports auto-negotiation: No Supported FEC modes: Not reported Advertised link modes: 100baseFX/Half 100baseFX/Full Advertised pause frame use: No Advertised auto-negotiation: No Advertised FEC modes: Not reported Speed: 100Mb/s Duplex: Full Auto-negotiation: off Port: MII PHYAD: 1 Transceiver: external Supports Wake-on: gs Wake-on: d SecureOn password: 00:00:00:00:00:00 Current message level: 0x00000000 (0) Link detected: yes Signed-off-by: Dan Murphy Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/phy/phy-core.c | 4 +++- include/uapi/linux/ethtool.h | 2 ++ net/ethtool/common.c | 2 ++ net/ethtool/linkmodes.c | 2 ++ 4 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c index ff8e14b01eeb..de5b869139d7 100644 --- a/drivers/net/phy/phy-core.c +++ b/drivers/net/phy/phy-core.c @@ -8,7 +8,7 @@ const char *phy_speed_to_str(int speed) { - BUILD_BUG_ON_MSG(__ETHTOOL_LINK_MODE_MASK_NBITS != 90, + BUILD_BUG_ON_MSG(__ETHTOOL_LINK_MODE_MASK_NBITS != 92, "Enum ethtool_link_mode_bit_indices and phylib are out of sync. " "If a speed or mode has been added please update phy_speed_to_str " "and the PHY settings array.\n"); @@ -160,6 +160,8 @@ static const struct phy_setting settings[] = { PHY_SETTING( 100, FULL, 100baseT_Full ), PHY_SETTING( 100, FULL, 100baseT1_Full ), PHY_SETTING( 100, HALF, 100baseT_Half ), + PHY_SETTING( 100, HALF, 100baseFX_Half ), + PHY_SETTING( 100, FULL, 100baseFX_Full ), /* 10M */ PHY_SETTING( 10, FULL, 10baseT_Full ), PHY_SETTING( 10, HALF, 10baseT_Half ), diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index b4f2d134e713..9ca87bc73c44 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1617,6 +1617,8 @@ enum ethtool_link_mode_bit_indices { ETHTOOL_LINK_MODE_400000baseLR4_ER4_FR4_Full_BIT = 87, ETHTOOL_LINK_MODE_400000baseDR4_Full_BIT = 88, ETHTOOL_LINK_MODE_400000baseCR4_Full_BIT = 89, + ETHTOOL_LINK_MODE_100baseFX_Half_BIT = 90, + ETHTOOL_LINK_MODE_100baseFX_Full_BIT = 91, /* must be last entry */ __ETHTOOL_LINK_MODE_MASK_NBITS }; diff --git a/net/ethtool/common.c b/net/ethtool/common.c index ed19573fccd7..24036e3055a1 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -192,6 +192,8 @@ const char link_mode_names[][ETH_GSTRING_LEN] = { __DEFINE_LINK_MODE_NAME(400000, LR4_ER4_FR4, Full), __DEFINE_LINK_MODE_NAME(400000, DR4, Full), __DEFINE_LINK_MODE_NAME(400000, CR4, Full), + __DEFINE_LINK_MODE_NAME(100, FX, Half), + __DEFINE_LINK_MODE_NAME(100, FX, Full), }; static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c index 7044a2853886..29dcd675b65a 100644 --- a/net/ethtool/linkmodes.c +++ b/net/ethtool/linkmodes.c @@ -272,6 +272,8 @@ static const struct link_mode_info link_mode_params[] = { __DEFINE_LINK_MODE_PARAMS(400000, LR4_ER4_FR4, Full), __DEFINE_LINK_MODE_PARAMS(400000, DR4, Full), __DEFINE_LINK_MODE_PARAMS(400000, CR4, Full), + __DEFINE_LINK_MODE_PARAMS(100, FX, Half), + __DEFINE_LINK_MODE_PARAMS(100, FX, Full), }; static const struct nla_policy -- cgit v1.2.3 From c12fa88c6d16ed3865072d91154cff6fd1cd9cd4 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Thu, 10 Sep 2020 20:25:08 +0800 Subject: vfio: Fix typo of the device_state A typo fix ("_RUNNNG" => "_RUNNING") in comment block of the uapi header. Signed-off-by: Zenghui Yu Reviewed-by: Cornelia Huck Reviewed-by: Kirti Wankhede Signed-off-by: Alex Williamson --- include/uapi/linux/vfio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 920470502329..d4bd39e124bf 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -462,7 +462,7 @@ struct vfio_region_gfx_edid { * 5. Resumed * |--------->| * - * 0. Default state of VFIO device is _RUNNNG when the user application starts. + * 0. 
Default state of VFIO device is _RUNNING when the user application starts. * 1. During normal shutdown of the user application, the user application may * optionally change the VFIO device state from _RUNNING to _STOP. This * transition is optional. The vendor driver must support this transition but -- cgit v1.2.3 From 7d6e1329652ed971d1b6e0e7bea66fba5044e271 Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Tue, 15 Sep 2020 15:05:18 -0400 Subject: vfio iommu: Add dma available capability Commit 492855939bdb ("vfio/type1: Limit DMA mappings per container") added the ability to limit the number of memory backed DMA mappings. However on s390x, when lazy mapping is in use, we use a very large number of concurrent mappings. Let's provide the current allowable number of DMA mappings to userspace via the IOMMU info chain so that userspace can take appropriate mitigation. Signed-off-by: Matthew Rosato Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 17 +++++++++++++++++ include/uapi/linux/vfio.h | 15 +++++++++++++++ 2 files changed, 32 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 5fbf0c1f7433..15e21dbffb16 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -2609,6 +2609,20 @@ static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu, return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig)); } +static int vfio_iommu_dma_avail_build_caps(struct vfio_iommu *iommu, + struct vfio_info_cap *caps) +{ + struct vfio_iommu_type1_info_dma_avail cap_dma_avail; + + cap_dma_avail.header.id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL; + cap_dma_avail.header.version = 1; + + cap_dma_avail.avail = iommu->dma_avail; + + return vfio_info_add_capability(caps, &cap_dma_avail.header, + sizeof(cap_dma_avail)); +} + static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu, unsigned long arg) { @@ -2641,6 +2655,9 @@ static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu, ret = vfio_iommu_migration_build_caps(iommu, &caps); + if (!ret) + ret = vfio_iommu_dma_avail_build_caps(iommu, &caps); + if (!ret) ret = vfio_iommu_iova_build_caps(iommu, &caps); diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 920470502329..3891e03d3af0 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1039,6 +1039,21 @@ struct vfio_iommu_type1_info_cap_migration { __u64 max_dirty_bitmap_size; /* in bytes */ }; +/* + * The DMA available capability allows to report the current number of + * simultaneously outstanding DMA mappings that are allowed. + * + * The structure below defines version 1 of this capability. + * + * avail: specifies the current number of outstanding DMA mappings allowed. + */ +#define VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL 3 + +struct vfio_iommu_type1_info_dma_avail { + struct vfio_info_cap_header header; + __u32 avail; +}; + #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) /** -- cgit v1.2.3 From 15b760c37ad3c3f2b922506eaca4ca8b4292e621 Mon Sep 17 00:00:00 2001 From: Andra Paraschiv Date: Mon, 21 Sep 2020 15:17:15 +0300 Subject: nitro_enclaves: Add ioctl interface definition The Nitro Enclaves driver handles the enclave lifetime management. This includes enclave creation, termination and setting up its resources such as memory and CPU. An enclave runs alongside the VM that spawned it. It is abstracted as a process running in the VM that launched it. 
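Taken together, the ioctl interface introduced by this patch (documented in full in the header below) is meant to be driven from userspace roughly as in the following hedged sketch. The call order and all definitions come from this patch itself; the helper name is hypothetical, and error handling, the hugepage allocation (regions must be multiples of 2 MiB, from the same NUMA node as the vCPUs, with full CPU cores added) and the copy of the enclave image are elided:

/* Hedged sketch, not part of the patch: drive the new NE ioctls. */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/nitro_enclaves.h>

static int start_enclave_sketch(void *huge_mem, __u64 mem_size)
{
        int ne_fd = open("/dev/nitro_enclaves", O_RDWR | O_CLOEXEC);
        __u64 slot_uid = 0;
        /* Returns the enclave fd; the slot id comes back via the arg. */
        int enclave_fd = ioctl(ne_fd, NE_CREATE_VM, &slot_uid);

        struct ne_image_load_info load = { .flags = NE_EIF_IMAGE };
        ioctl(enclave_fd, NE_GET_IMAGE_LOAD_INFO, &load);
        /* ...copy the EIF image to huge_mem + load.memory_offset... */

        struct ne_user_memory_region region = {
                .flags = NE_DEFAULT_MEMORY_REGION,
                .memory_size = mem_size,        /* multiple of 2 MiB */
                .userspace_addr = (__u64)(unsigned long)huge_mem,
        };
        ioctl(enclave_fd, NE_SET_USER_MEMORY_REGION, &region);

        __u32 vcpu_id = 0;      /* 0: auto-chosen from the NE CPU pool */
        ioctl(enclave_fd, NE_ADD_VCPU, &vcpu_id);

        struct ne_enclave_start_info start = {
                .flags = NE_ENCLAVE_DEBUG_MODE,
                .enclave_cid = 0,       /* 0: CID picked by the hypervisor */
        };
        ioctl(enclave_fd, NE_START_ENCLAVE, &start);
        return enclave_fd;
}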
The process interacts with the NE driver, that exposes an ioctl interface for creating an enclave and setting up its resources. Changelog v9 -> v10 * Update commit message to include the changelog before the SoB tag(s). v8 -> v9 * No changes. v7 -> v8 * Add NE custom error codes for user space memory regions not backed by pages multiple of 2 MiB, invalid flags and enclave CID. * Add max flag value for enclave image load info. v6 -> v7 * Clarify in the ioctls documentation that the return value is -1 and errno is set on failure. * Update the error code value for NE_ERR_INVALID_MEM_REGION_SIZE as it gets in user space as value 25 (ENOTTY) instead of 515. Update the NE custom error codes values range to not be the same as the ones defined in include/linux/errno.h, although these are not propagated to user space. v5 -> v6 * Fix typo in the description about the NE CPU pool. * Update documentation to kernel-doc format. * Remove the ioctl to query API version. v4 -> v5 * Add more details about the ioctl calls usage e.g. error codes, file descriptors used. * Update the ioctl to set an enclave vCPU to not return a file descriptor. * Add specific NE error codes. v3 -> v4 * Decouple NE ioctl interface from KVM API. * Add NE API version and the corresponding ioctl call. * Add enclave / image load flags options. v2 -> v3 * Remove the GPL additional wording as SPDX-License-Identifier is already in place. v1 -> v2 * Add ioctl for getting enclave image load metadata. * Update NE_ENCLAVE_START ioctl name to NE_START_ENCLAVE. * Add entry in Documentation/userspace-api/ioctl/ioctl-number.rst for NE ioctls. * Update NE ioctls definition based on the updated ioctl range for major and minor. Reviewed-by: Alexander Graf Reviewed-by: Stefan Hajnoczi Signed-off-by: Alexandru Vasile Signed-off-by: Andra Paraschiv Link: https://lore.kernel.org/r/20200921121732.44291-2-andraprs@amazon.com Signed-off-by: Greg Kroah-Hartman --- Documentation/userspace-api/ioctl/ioctl-number.rst | 5 +- include/linux/nitro_enclaves.h | 11 + include/uapi/linux/nitro_enclaves.h | 359 +++++++++++++++++++++ 3 files changed, 374 insertions(+), 1 deletion(-) create mode 100644 include/linux/nitro_enclaves.h create mode 100644 include/uapi/linux/nitro_enclaves.h (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index 2a198838fca9..5f7ff00f394e 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -328,8 +328,11 @@ Code Seq# Include File Comments 0xAC 00-1F linux/raw.h 0xAD 00 Netfilter device in development: -0xAE all linux/kvm.h Kernel-based Virtual Machine +0xAE 00-1F linux/kvm.h Kernel-based Virtual Machine +0xAE 40-FF linux/kvm.h Kernel-based Virtual Machine + +0xAE 20-3F linux/nitro_enclaves.h Nitro Enclaves 0xAF 00-1F linux/fsl_hypervisor.h Freescale hypervisor 0xB0 all RATIO devices in development: diff --git a/include/linux/nitro_enclaves.h b/include/linux/nitro_enclaves.h new file mode 100644 index 000000000000..d91ef2bfdf47 --- /dev/null +++ b/include/linux/nitro_enclaves.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+ */ + +#ifndef _LINUX_NITRO_ENCLAVES_H_ +#define _LINUX_NITRO_ENCLAVES_H_ + +#include + +#endif /* _LINUX_NITRO_ENCLAVES_H_ */ diff --git a/include/uapi/linux/nitro_enclaves.h b/include/uapi/linux/nitro_enclaves.h new file mode 100644 index 000000000000..b945073fe544 --- /dev/null +++ b/include/uapi/linux/nitro_enclaves.h @@ -0,0 +1,359 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. + */ + +#ifndef _UAPI_LINUX_NITRO_ENCLAVES_H_ +#define _UAPI_LINUX_NITRO_ENCLAVES_H_ + +#include + +/** + * DOC: Nitro Enclaves (NE) Kernel Driver Interface + */ + +/** + * NE_CREATE_VM - The command is used to create a slot that is associated with + * an enclave VM. + * The generated unique slot id is an output parameter. + * The ioctl can be invoked on the /dev/nitro_enclaves fd, before + * setting any resources, such as memory and vCPUs, for an + * enclave. Memory and vCPUs are set for the slot mapped to an enclave. + * A NE CPU pool has to be set before calling this function. The + * pool can be set after the NE driver load, using + * /sys/module/nitro_enclaves/parameters/ne_cpus. + * Its format is the detailed in the cpu-lists section: + * https://www.kernel.org/doc/html/latest/admin-guide/kernel-parameters.html + * CPU 0 and its siblings have to remain available for the + * primary / parent VM, so they cannot be set for enclaves. Full + * CPU core(s), from the same NUMA node, need(s) to be included + * in the CPU pool. + * + * Context: Process context. + * Return: + * * Enclave file descriptor - Enclave file descriptor used with + * ioctl calls to set vCPUs and memory + * regions, then start the enclave. + * * -1 - There was a failure in the ioctl logic. + * On failure, errno is set to: + * * EFAULT - copy_to_user() failure. + * * ENOMEM - Memory allocation failure for internal + * bookkeeping variables. + * * NE_ERR_NO_CPUS_AVAIL_IN_POOL - No NE CPU pool set / no CPUs available + * in the pool. + * * Error codes from get_unused_fd_flags() and anon_inode_getfile(). + * * Error codes from the NE PCI device request. + */ +#define NE_CREATE_VM _IOR(0xAE, 0x20, __u64) + +/** + * NE_ADD_VCPU - The command is used to set a vCPU for an enclave. The vCPU can + * be auto-chosen from the NE CPU pool or it can be set by the + * caller, with the note that it needs to be available in the NE + * CPU pool. Full CPU core(s), from the same NUMA node, need(s) to + * be associated with an enclave. + * The vCPU id is an input / output parameter. If its value is 0, + * then a CPU is chosen from the enclave CPU pool and returned via + * this parameter. + * The ioctl can be invoked on the enclave fd, before an enclave + * is started. + * + * Context: Process context. + * Return: + * * 0 - Logic succesfully completed. + * * -1 - There was a failure in the ioctl logic. + * On failure, errno is set to: + * * EFAULT - copy_from_user() / copy_to_user() failure. + * * ENOMEM - Memory allocation failure for internal + * bookkeeping variables. + * * EIO - Current task mm is not the same as the one + * that created the enclave. + * * NE_ERR_NO_CPUS_AVAIL_IN_POOL - No CPUs available in the NE CPU pool. + * * NE_ERR_VCPU_ALREADY_USED - The provided vCPU is already used. + * * NE_ERR_VCPU_NOT_IN_CPU_POOL - The provided vCPU is not available in the + * NE CPU pool. + * * NE_ERR_VCPU_INVALID_CPU_CORE - The core id of the provided vCPU is invalid + * or out of range. 
+ * * NE_ERR_NOT_IN_INIT_STATE - The enclave is not in init state + * (init = before being started). + * * NE_ERR_INVALID_VCPU - The provided vCPU is not in the available + * CPUs range. + * * Error codes from the NE PCI device request. + */ +#define NE_ADD_VCPU _IOWR(0xAE, 0x21, __u32) + +/** + * NE_GET_IMAGE_LOAD_INFO - The command is used to get information needed for + * in-memory enclave image loading e.g. offset in + * enclave memory to start placing the enclave image. + * The image load info is an input / output parameter. + * It includes info provided by the caller - flags - + * and returns the offset in enclave memory where to + * start placing the enclave image. + * The ioctl can be invoked on the enclave fd, before + * an enclave is started. + * + * Context: Process context. + * Return: + * * 0 - Logic succesfully completed. + * * -1 - There was a failure in the ioctl logic. + * On failure, errno is set to: + * * EFAULT - copy_from_user() / copy_to_user() failure. + * * NE_ERR_NOT_IN_INIT_STATE - The enclave is not in init state (init = + * before being started). + * * NE_ERR_INVALID_FLAG_VALUE - The value of the provided flag is invalid. + */ +#define NE_GET_IMAGE_LOAD_INFO _IOWR(0xAE, 0x22, struct ne_image_load_info) + +/** + * NE_SET_USER_MEMORY_REGION - The command is used to set a memory region for an + * enclave, given the allocated memory from the + * userspace. Enclave memory needs to be from the + * same NUMA node as the enclave CPUs. + * The user memory region is an input parameter. It + * includes info provided by the caller - flags, + * memory size and userspace address. + * The ioctl can be invoked on the enclave fd, + * before an enclave is started. + * + * Context: Process context. + * Return: + * * 0 - Logic succesfully completed. + * * -1 - There was a failure in the ioctl logic. + * On failure, errno is set to: + * * EFAULT - copy_from_user() failure. + * * EINVAL - Invalid physical memory region(s) e.g. + * unaligned address. + * * EIO - Current task mm is not the same as + * the one that created the enclave. + * * ENOMEM - Memory allocation failure for internal + * bookkeeping variables. + * * NE_ERR_NOT_IN_INIT_STATE - The enclave is not in init state + * (init = before being started). + * * NE_ERR_INVALID_MEM_REGION_SIZE - The memory size of the region is not + * multiple of 2 MiB. + * * NE_ERR_INVALID_MEM_REGION_ADDR - Invalid user space address given. + * * NE_ERR_UNALIGNED_MEM_REGION_ADDR - Unaligned user space address given. + * * NE_ERR_MEM_REGION_ALREADY_USED - The memory region is already used. + * * NE_ERR_MEM_NOT_HUGE_PAGE - The memory region is not backed by + * huge pages. + * * NE_ERR_MEM_DIFFERENT_NUMA_NODE - The memory region is not from the same + * NUMA node as the CPUs. + * * NE_ERR_MEM_MAX_REGIONS - The number of memory regions set for + * the enclave reached maximum. + * * NE_ERR_INVALID_PAGE_SIZE - The memory region is not backed by + * pages multiple of 2 MiB. + * * NE_ERR_INVALID_FLAG_VALUE - The value of the provided flag is invalid. + * * Error codes from get_user_pages(). + * * Error codes from the NE PCI device request. + */ +#define NE_SET_USER_MEMORY_REGION _IOW(0xAE, 0x23, struct ne_user_memory_region) + +/** + * NE_START_ENCLAVE - The command is used to trigger enclave start after the + * enclave resources, such as memory and CPU, have been set. + * The enclave start info is an input / output parameter. It + * includes info provided by the caller - enclave cid and + * flags - and returns the cid (if input cid is 0). 
+ * The ioctl can be invoked on the enclave fd, after an + * enclave slot is created and resources, such as memory and + * vCPUs are set for an enclave. + * + * Context: Process context. + * Return: + * * 0 - Logic succesfully completed. + * * -1 - There was a failure in the ioctl logic. + * On failure, errno is set to: + * * EFAULT - copy_from_user() / copy_to_user() failure. + * * NE_ERR_NOT_IN_INIT_STATE - The enclave is not in init state + * (init = before being started). + * * NE_ERR_NO_MEM_REGIONS_ADDED - No memory regions are set. + * * NE_ERR_NO_VCPUS_ADDED - No vCPUs are set. + * * NE_ERR_FULL_CORES_NOT_USED - Full core(s) not set for the enclave. + * * NE_ERR_ENCLAVE_MEM_MIN_SIZE - Enclave memory is less than minimum + * memory size (64 MiB). + * * NE_ERR_INVALID_FLAG_VALUE - The value of the provided flag is invalid. + * * NE_ERR_INVALID_ENCLAVE_CID - The provided enclave CID is invalid. + * * Error codes from the NE PCI device request. + */ +#define NE_START_ENCLAVE _IOWR(0xAE, 0x24, struct ne_enclave_start_info) + +/** + * DOC: NE specific error codes + */ + +/** + * NE_ERR_VCPU_ALREADY_USED - The provided vCPU is already used. + */ +#define NE_ERR_VCPU_ALREADY_USED (256) +/** + * NE_ERR_VCPU_NOT_IN_CPU_POOL - The provided vCPU is not available in the + * NE CPU pool. + */ +#define NE_ERR_VCPU_NOT_IN_CPU_POOL (257) +/** + * NE_ERR_VCPU_INVALID_CPU_CORE - The core id of the provided vCPU is invalid + * or out of range of the NE CPU pool. + */ +#define NE_ERR_VCPU_INVALID_CPU_CORE (258) +/** + * NE_ERR_INVALID_MEM_REGION_SIZE - The user space memory region size is not + * multiple of 2 MiB. + */ +#define NE_ERR_INVALID_MEM_REGION_SIZE (259) +/** + * NE_ERR_INVALID_MEM_REGION_ADDR - The user space memory region address range + * is invalid. + */ +#define NE_ERR_INVALID_MEM_REGION_ADDR (260) +/** + * NE_ERR_UNALIGNED_MEM_REGION_ADDR - The user space memory region address is + * not aligned. + */ +#define NE_ERR_UNALIGNED_MEM_REGION_ADDR (261) +/** + * NE_ERR_MEM_REGION_ALREADY_USED - The user space memory region is already used. + */ +#define NE_ERR_MEM_REGION_ALREADY_USED (262) +/** + * NE_ERR_MEM_NOT_HUGE_PAGE - The user space memory region is not backed by + * contiguous physical huge page(s). + */ +#define NE_ERR_MEM_NOT_HUGE_PAGE (263) +/** + * NE_ERR_MEM_DIFFERENT_NUMA_NODE - The user space memory region is backed by + * pages from different NUMA nodes than the CPUs. + */ +#define NE_ERR_MEM_DIFFERENT_NUMA_NODE (264) +/** + * NE_ERR_MEM_MAX_REGIONS - The supported max memory regions per enclaves has + * been reached. + */ +#define NE_ERR_MEM_MAX_REGIONS (265) +/** + * NE_ERR_NO_MEM_REGIONS_ADDED - The command to start an enclave is triggered + * and no memory regions are added. + */ +#define NE_ERR_NO_MEM_REGIONS_ADDED (266) +/** + * NE_ERR_NO_VCPUS_ADDED - The command to start an enclave is triggered and no + * vCPUs are added. + */ +#define NE_ERR_NO_VCPUS_ADDED (267) +/** + * NE_ERR_ENCLAVE_MEM_MIN_SIZE - The enclave memory size is lower than the + * minimum supported. + */ +#define NE_ERR_ENCLAVE_MEM_MIN_SIZE (268) +/** + * NE_ERR_FULL_CORES_NOT_USED - The command to start an enclave is triggered and + * full CPU cores are not set. + */ +#define NE_ERR_FULL_CORES_NOT_USED (269) +/** + * NE_ERR_NOT_IN_INIT_STATE - The enclave is not in init state when setting + * resources or triggering start. + */ +#define NE_ERR_NOT_IN_INIT_STATE (270) +/** + * NE_ERR_INVALID_VCPU - The provided vCPU is out of range of the available CPUs. 
+ */ +#define NE_ERR_INVALID_VCPU (271) +/** + * NE_ERR_NO_CPUS_AVAIL_IN_POOL - The command to create an enclave is triggered + * and no CPUs are available in the pool. + */ +#define NE_ERR_NO_CPUS_AVAIL_IN_POOL (272) +/** + * NE_ERR_INVALID_PAGE_SIZE - The user space memory region is not backed by pages + * multiple of 2 MiB. + */ +#define NE_ERR_INVALID_PAGE_SIZE (273) +/** + * NE_ERR_INVALID_FLAG_VALUE - The provided flag value is invalid. + */ +#define NE_ERR_INVALID_FLAG_VALUE (274) +/** + * NE_ERR_INVALID_ENCLAVE_CID - The provided enclave CID is invalid, either + * being a well-known value or the CID of the + * parent / primary VM. + */ +#define NE_ERR_INVALID_ENCLAVE_CID (275) + +/** + * DOC: Image load info flags + */ + +/** + * NE_EIF_IMAGE - Enclave Image Format (EIF) + */ +#define NE_EIF_IMAGE (0x01) + +#define NE_IMAGE_LOAD_MAX_FLAG_VAL (0x02) + +/** + * struct ne_image_load_info - Info necessary for in-memory enclave image + * loading (in / out). + * @flags: Flags to determine the enclave image type + * (e.g. Enclave Image Format - EIF) (in). + * @memory_offset: Offset in enclave memory where to start placing the + * enclave image (out). + */ +struct ne_image_load_info { + __u64 flags; + __u64 memory_offset; +}; + +/** + * DOC: User memory region flags + */ + +/** + * NE_DEFAULT_MEMORY_REGION - Memory region for enclave general usage. + */ +#define NE_DEFAULT_MEMORY_REGION (0x00) + +#define NE_MEMORY_REGION_MAX_FLAG_VAL (0x01) + +/** + * struct ne_user_memory_region - Memory region to be set for an enclave (in). + * @flags: Flags to determine the usage for the memory region (in). + * @memory_size: The size, in bytes, of the memory region to be set for + * an enclave (in). + * @userspace_addr: The start address of the userspace allocated memory of + * the memory region to set for an enclave (in). + */ +struct ne_user_memory_region { + __u64 flags; + __u64 memory_size; + __u64 userspace_addr; +}; + +/** + * DOC: Enclave start info flags + */ + +/** + * NE_ENCLAVE_PRODUCTION_MODE - Start enclave in production mode. + */ +#define NE_ENCLAVE_PRODUCTION_MODE (0x00) +/** + * NE_ENCLAVE_DEBUG_MODE - Start enclave in debug mode. + */ +#define NE_ENCLAVE_DEBUG_MODE (0x01) + +#define NE_ENCLAVE_START_MAX_FLAG_VAL (0x02) + +/** + * struct ne_enclave_start_info - Setup info necessary for enclave start (in / out). + * @flags: Flags for the enclave to start with (e.g. debug mode) (in). + * @enclave_cid: Context ID (CID) for the enclave vsock device. If 0 as + * input, the CID is autogenerated by the hypervisor and + * returned back as output by the driver (in / out). + */ +struct ne_enclave_start_info { + __u64 flags; + __u64 enclave_cid; +}; + +#endif /* _UAPI_LINUX_NITRO_ENCLAVES_H_ */ -- cgit v1.2.3 From c7f0207b613033c56b1217032d2f6326d0c69217 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 16 Sep 2020 21:11:33 -0700 Subject: fscrypt: make "#define fscrypt_policy" user-only The fscrypt UAPI header defines fscrypt_policy to fscrypt_policy_v1, for source compatibility with old userspace programs. Internally, the kernel doesn't want that compatibility definition. Instead, fscrypt_private.h #undefs it and re-defines it to a union. That works for now. However, in order to add fscrypt_operations::get_dummy_policy(), we'll need to forward declare 'union fscrypt_policy' in include/linux/fscrypt.h. That would cause build errors because "fscrypt_policy" is used in ioctl numbers. 
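As a minimal, hypothetical illustration of the conflict (not code from the patch): with the compatibility define still visible to kernel code, the preprocessor would rewrite the wanted forward declaration out from under us:

#define fscrypt_policy fscrypt_policy_v1   /* old compat define */

union fscrypt_policy;   /* intended forward declaration in fscrypt.h */
/* ...which the preprocessor turns into "union fscrypt_policy_v1;",
 * colliding with the existing "struct fscrypt_policy_v1" instead of
 * declaring the union the kernel actually defines. */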
To avoid this, modify the UAPI header to make the fscrypt_policy compatibility definition conditional on !__KERNEL__, and make the ioctls use fscrypt_policy_v1 instead of fscrypt_policy. Note that this doesn't change the actual ioctl numbers. Acked-by: Jeff Layton Link: https://lore.kernel.org/r/20200917041136.178600-11-ebiggers@kernel.org Signed-off-by: Eric Biggers --- fs/crypto/fscrypt_private.h | 1 - include/uapi/linux/fscrypt.h | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 355f6d937751..ac3352086ee4 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -97,7 +97,6 @@ static inline const u8 *fscrypt_context_nonce(const union fscrypt_context *ctx) return NULL; } -#undef fscrypt_policy union fscrypt_policy { u8 version; struct fscrypt_policy_v1 v1; diff --git a/include/uapi/linux/fscrypt.h b/include/uapi/linux/fscrypt.h index 7875709ccfeb..e5de60336938 100644 --- a/include/uapi/linux/fscrypt.h +++ b/include/uapi/linux/fscrypt.h @@ -45,7 +45,6 @@ struct fscrypt_policy_v1 { __u8 flags; __u8 master_key_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE]; }; -#define fscrypt_policy fscrypt_policy_v1 /* * Process-subscribed "logon" key description prefix and payload format. @@ -156,9 +155,9 @@ struct fscrypt_get_key_status_arg { __u32 __out_reserved[13]; }; -#define FS_IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct fscrypt_policy) +#define FS_IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct fscrypt_policy_v1) #define FS_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16]) -#define FS_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct fscrypt_policy) +#define FS_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct fscrypt_policy_v1) #define FS_IOC_GET_ENCRYPTION_POLICY_EX _IOWR('f', 22, __u8[9]) /* size + version */ #define FS_IOC_ADD_ENCRYPTION_KEY _IOWR('f', 23, struct fscrypt_add_key_arg) #define FS_IOC_REMOVE_ENCRYPTION_KEY _IOWR('f', 24, struct fscrypt_remove_key_arg) @@ -170,6 +169,7 @@ struct fscrypt_get_key_status_arg { /* old names; don't add anything new here! */ #ifndef __KERNEL__ +#define fscrypt_policy fscrypt_policy_v1 #define FS_KEY_DESCRIPTOR_SIZE FSCRYPT_KEY_DESCRIPTOR_SIZE #define FS_POLICY_FLAGS_PAD_4 FSCRYPT_POLICY_FLAGS_PAD_4 #define FS_POLICY_FLAGS_PAD_8 FSCRYPT_POLICY_FLAGS_PAD_8 -- cgit v1.2.3 From 9c4258c78a2a7624c79b797f40ae2dbfd2555e26 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 22 Sep 2020 10:30:18 +0300 Subject: net: bridge: mdb: add support to extend add/del commands Since the MDB add/del code expects an exact struct br_mdb_entry we can't really add any extensions, thus add a new nested attribute at the level of MDBA_SET_ENTRY called MDBA_SET_ENTRY_ATTRS which will be used to pass all new options via netlink attributes. This patch doesn't change anything functionally since the new attribute is not used yet, only parsed. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- include/uapi/linux/if_bridge.h | 12 ++++++++++++ net/bridge/br_mdb.c | 22 +++++++++++++++++++--- 2 files changed, 31 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 75a2ac479247..dc52f8cffa0d 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -530,10 +530,22 @@ struct br_mdb_entry { enum { MDBA_SET_ENTRY_UNSPEC, MDBA_SET_ENTRY, + MDBA_SET_ENTRY_ATTRS, __MDBA_SET_ENTRY_MAX, }; #define MDBA_SET_ENTRY_MAX (__MDBA_SET_ENTRY_MAX - 1) +/* [MDBA_SET_ENTRY_ATTRS] = { + * [MDBE_ATTR_xxx] + * ... + * } + */ +enum { + MDBE_ATTR_UNSPEC, + __MDBE_ATTR_MAX, +}; +#define MDBE_ATTR_MAX (__MDBE_ATTR_MAX - 1) + /* Embedded inside LINK_XSTATS_TYPE_BRIDGE */ enum { BRIDGE_XSTATS_UNSPEC, diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index a1ff0a372185..907df6d695ec 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -670,9 +670,12 @@ static bool is_valid_mdb_entry(struct br_mdb_entry *entry, return true; } +static const struct nla_policy br_mdbe_attrs_pol[MDBE_ATTR_MAX + 1] = { +}; + static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_device **pdev, struct br_mdb_entry **pentry, - struct netlink_ext_ack *extack) + struct nlattr **mdb_attrs, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct br_mdb_entry *entry; @@ -719,6 +722,17 @@ static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh, return -EINVAL; *pentry = entry; + if (tb[MDBA_SET_ENTRY_ATTRS]) { + err = nla_parse_nested(mdb_attrs, MDBE_ATTR_MAX, + tb[MDBA_SET_ENTRY_ATTRS], + br_mdbe_attrs_pol, extack); + if (err) + return err; + } else { + memset(mdb_attrs, 0, + sizeof(struct nlattr *) * (MDBE_ATTR_MAX + 1)); + } + return 0; } @@ -803,6 +817,7 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br, static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct nlattr *mdb_attrs[MDBE_ATTR_MAX + 1]; struct net *net = sock_net(skb->sk); struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; @@ -812,7 +827,7 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_bridge *br; int err; - err = br_mdb_parse(skb, nlh, &dev, &entry, extack); + err = br_mdb_parse(skb, nlh, &dev, &entry, mdb_attrs, extack); if (err < 0) return err; @@ -921,6 +936,7 @@ unlock: static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct nlattr *mdb_attrs[MDBE_ATTR_MAX + 1]; struct net *net = sock_net(skb->sk); struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; @@ -930,7 +946,7 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_bridge *br; int err; - err = br_mdb_parse(skb, nlh, &dev, &entry, extack); + err = br_mdb_parse(skb, nlh, &dev, &entry, mdb_attrs, extack); if (err < 0) return err; -- cgit v1.2.3 From 88d4bd180419a7cde3947f191dc4e26fbb19f80b Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 22 Sep 2020 10:30:19 +0300 Subject: net: bridge: mdb: add support for add/del/dump of entries with source Add new mdb attributes (MDBE_ATTR_SOURCE for setting, MDBA_MDB_EATTR_SOURCE for dumping) to allow add/del and dump of mdb entries with a source address (S,G). New S,G entries are created with filter mode of MCAST_INCLUDE. The same attributes are used for IPv4 and IPv6, they're validated and parsed based on their protocol. 
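[Editor's illustration, not part of the patch: a minimal user-space sketch of how an add request carrying the new nested attribute might be composed, assuming only the uapi definitions shown in this series. The put_attr() helper and buffer handling are hypothetical, and error handling plus the actual send over a NETLINK_ROUTE socket are elided.]

```c
/*
 * Hypothetical sketch: build an RTM_NEWMDB request for an S,G entry,
 * with the classic fixed-size entry in MDBA_SET_ENTRY and the source
 * address nested under the new MDBA_SET_ENTRY_ATTRS attribute.
 */
#include <string.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/if_bridge.h>
#include <linux/if_ether.h>

/* Append one netlink attribute and bump nlmsg_len accordingly. */
static struct nlattr *put_attr(struct nlmsghdr *nlh, unsigned short type,
			       const void *data, unsigned short len)
{
	struct nlattr *nla = (struct nlattr *)((char *)nlh +
					       NLMSG_ALIGN(nlh->nlmsg_len));

	nla->nla_type = type;
	nla->nla_len = NLA_HDRLEN + len;
	if (data)
		memcpy((char *)nla + NLA_HDRLEN, data, len);
	nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + NLA_ALIGN(nla->nla_len);
	return nla;
}

static void build_sg_add(char *buf, size_t buflen, int br_ifindex,
			 int port_ifindex, const char *group, const char *source)
{
	struct nlmsghdr *nlh = (struct nlmsghdr *)buf;
	struct br_port_msg *bpm;
	struct br_mdb_entry entry;
	struct nlattr *nest;
	struct in_addr src;

	memset(buf, 0, buflen);
	nlh->nlmsg_len = NLMSG_LENGTH(sizeof(*bpm));
	nlh->nlmsg_type = RTM_NEWMDB;
	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;

	bpm = NLMSG_DATA(nlh);
	bpm->family = AF_BRIDGE;
	bpm->ifindex = br_ifindex;	/* the bridge device */

	/* Exact-size entry, unchanged ABI: group, port, state (vid 0 =
	 * all vlans when vlan filtering is enabled). */
	memset(&entry, 0, sizeof(entry));
	entry.ifindex = port_ifindex;	/* the bridge port */
	entry.state = MDB_PERMANENT;
	entry.addr.proto = htons(ETH_P_IP);
	inet_pton(AF_INET, group, &entry.addr.u.ip4);
	put_attr(nlh, MDBA_SET_ENTRY, &entry, sizeof(entry));

	/* New extensible options travel nested: here the S,G source. */
	nest = put_attr(nlh, MDBA_SET_ENTRY_ATTRS | NLA_F_NESTED, NULL, 0);
	inet_pton(AF_INET, source, &src);
	put_attr(nlh, MDBE_ATTR_SOURCE, &src, sizeof(src));
	nest->nla_len = (char *)nlh + nlh->nlmsg_len - (char *)nest;
}
```

A request built this way, e.g. build_sg_add(buf, sizeof(buf), br_ifindex, port_ifindex, "239.1.1.1", "10.0.0.1"), would go through the validation added below: the source attribute's length must match the entry's protocol and the source itself must not be a multicast address.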
S,G host joined entries which are added by user are not allowed yet. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 2 + net/bridge/br_mdb.c | 142 +++++++++++++++++++++++++++++++++-------- net/bridge/br_private.h | 14 ++++ 3 files changed, 130 insertions(+), 28 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index dc52f8cffa0d..3e6377c865eb 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -457,6 +457,7 @@ enum { MDBA_MDB_EATTR_TIMER, MDBA_MDB_EATTR_SRC_LIST, MDBA_MDB_EATTR_GROUP_MODE, + MDBA_MDB_EATTR_SOURCE, __MDBA_MDB_EATTR_MAX }; #define MDBA_MDB_EATTR_MAX (__MDBA_MDB_EATTR_MAX - 1) @@ -542,6 +543,7 @@ enum { */ enum { MDBE_ATTR_UNSPEC, + MDBE_ATTR_SOURCE, __MDBE_ATTR_MAX, }; #define MDBE_ATTR_MAX (__MDBE_ATTR_MAX - 1) diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 907df6d695ec..7f9ca5c20120 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -64,17 +64,27 @@ static void __mdb_entry_fill_flags(struct br_mdb_entry *e, unsigned char flags) e->flags |= MDB_FLAGS_FAST_LEAVE; } -static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip) +static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip, + struct nlattr **mdb_attrs) { memset(ip, 0, sizeof(struct br_ip)); ip->vid = entry->vid; ip->proto = entry->addr.proto; - if (ip->proto == htons(ETH_P_IP)) + switch (ip->proto) { + case htons(ETH_P_IP): ip->dst.ip4 = entry->addr.u.ip4; + if (mdb_attrs && mdb_attrs[MDBE_ATTR_SOURCE]) + ip->src.ip4 = nla_get_in_addr(mdb_attrs[MDBE_ATTR_SOURCE]); + break; #if IS_ENABLED(CONFIG_IPV6) - else + case htons(ETH_P_IPV6): ip->dst.ip6 = entry->addr.u.ip6; + if (mdb_attrs && mdb_attrs[MDBE_ATTR_SOURCE]) + ip->src.ip6 = nla_get_in6_addr(mdb_attrs[MDBE_ATTR_SOURCE]); + break; #endif + } + } static int __mdb_fill_srcs(struct sk_buff *skb, @@ -172,30 +182,41 @@ static int __mdb_fill_info(struct sk_buff *skb, if (nla_put_nohdr(skb, sizeof(e), &e) || nla_put_u32(skb, MDBA_MDB_EATTR_TIMER, - br_timer_value(mtimer))) { - nla_nest_cancel(skb, nest_ent); - return -EMSGSIZE; - } + br_timer_value(mtimer))) + goto nest_err; switch (mp->addr.proto) { case htons(ETH_P_IP): - dump_srcs_mode = !!(p && mp->br->multicast_igmp_version == 3); + dump_srcs_mode = !!(mp->br->multicast_igmp_version == 3); + if (mp->addr.src.ip4) { + if (nla_put_in_addr(skb, MDBA_MDB_EATTR_SOURCE, + mp->addr.src.ip4)) + goto nest_err; + break; + } break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): - dump_srcs_mode = !!(p && mp->br->multicast_mld_version == 2); + dump_srcs_mode = !!(mp->br->multicast_mld_version == 2); + if (!ipv6_addr_any(&mp->addr.src.ip6)) { + if (nla_put_in6_addr(skb, MDBA_MDB_EATTR_SOURCE, + &mp->addr.src.ip6)) + goto nest_err; + break; + } break; #endif } - if (dump_srcs_mode && + if (p && dump_srcs_mode && (__mdb_fill_srcs(skb, p) || - nla_put_u8(skb, MDBA_MDB_EATTR_GROUP_MODE, p->filter_mode))) { - nla_nest_cancel(skb, nest_ent); - return -EMSGSIZE; - } - + nla_put_u8(skb, MDBA_MDB_EATTR_GROUP_MODE, p->filter_mode))) + goto nest_err; nla_nest_end(skb, nest_ent); return 0; + +nest_err: + nla_nest_cancel(skb, nest_ent); + return -EMSGSIZE; } static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, @@ -395,12 +416,18 @@ static size_t rtnl_mdb_nlmsg_size(struct net_bridge_port_group *pg) switch (pg->addr.proto) { case htons(ETH_P_IP): + /* MDBA_MDB_EATTR_SOURCE */ + if 
(pg->addr.src.ip4) + nlmsg_size += nla_total_size(sizeof(__be32)); if (pg->port->br->multicast_igmp_version == 2) goto out; addr_size = sizeof(__be32); break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): + /* MDBA_MDB_EATTR_SOURCE */ + if (!ipv6_addr_any(&pg->addr.src.ip6)) + nlmsg_size += nla_total_size(sizeof(struct in6_addr)); if (pg->port->br->multicast_mld_version == 1) goto out; addr_size = sizeof(struct in6_addr); @@ -670,7 +697,48 @@ static bool is_valid_mdb_entry(struct br_mdb_entry *entry, return true; } +static bool is_valid_mdb_source(struct nlattr *attr, __be16 proto, + struct netlink_ext_ack *extack) +{ + switch (proto) { + case htons(ETH_P_IP): + if (nla_len(attr) != sizeof(struct in_addr)) { + NL_SET_ERR_MSG_MOD(extack, "IPv4 invalid source address length"); + return false; + } + if (ipv4_is_multicast(nla_get_in_addr(attr))) { + NL_SET_ERR_MSG_MOD(extack, "IPv4 multicast source address is not allowed"); + return false; + } + break; +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): { + struct in6_addr src; + + if (nla_len(attr) != sizeof(struct in6_addr)) { + NL_SET_ERR_MSG_MOD(extack, "IPv6 invalid source address length"); + return false; + } + src = nla_get_in6_addr(attr); + if (ipv6_addr_is_multicast(&src)) { + NL_SET_ERR_MSG_MOD(extack, "IPv6 multicast source address is not allowed"); + return false; + } + break; + } +#endif + default: + NL_SET_ERR_MSG_MOD(extack, "Invalid protocol used with source address"); + return false; + } + + return true; +} + static const struct nla_policy br_mdbe_attrs_pol[MDBE_ATTR_MAX + 1] = { + [MDBE_ATTR_SOURCE] = NLA_POLICY_RANGE(NLA_BINARY, + sizeof(struct in_addr), + sizeof(struct in6_addr)), }; static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -728,6 +796,10 @@ static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh, br_mdbe_attrs_pol, extack); if (err) return err; + if (mdb_attrs[MDBE_ATTR_SOURCE] && + !is_valid_mdb_source(mdb_attrs[MDBE_ATTR_SOURCE], + entry->addr.proto, extack)) + return -EINVAL; } else { memset(mdb_attrs, 0, sizeof(struct nlattr *) * (MDBE_ATTR_MAX + 1)); @@ -744,8 +816,22 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; unsigned long now = jiffies; + u8 filter_mode; int err; + /* host join errors which can happen before creating the group */ + if (!port) { + /* don't allow any flags for host-joined groups */ + if (entry->state) { + NL_SET_ERR_MSG_MOD(extack, "Flags are not allowed for host groups"); + return -EINVAL; + } + if (!br_multicast_is_star_g(group)) { + NL_SET_ERR_MSG_MOD(extack, "Groups with sources cannot be manually host joined"); + return -EINVAL; + } + } + mp = br_mdb_ip_get(br, group); if (!mp) { mp = br_multicast_new_group(br, group); @@ -756,11 +842,6 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, /* host join */ if (!port) { - /* don't allow any flags for host-joined groups */ - if (entry->state) { - NL_SET_ERR_MSG_MOD(extack, "Flags are not allowed for host groups"); - return -EINVAL; - } if (mp->host_joined) { NL_SET_ERR_MSG_MOD(extack, "Group is already joined by host"); return -EEXIST; @@ -783,8 +864,11 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, break; } + filter_mode = br_multicast_is_star_g(group) ? 
MCAST_EXCLUDE : + MCAST_INCLUDE; + p = br_multicast_new_port_group(port, group, *pp, entry->state, NULL, - MCAST_EXCLUDE); + filter_mode); if (unlikely(!p)) { NL_SET_ERR_MSG_MOD(extack, "Couldn't allocate new port group"); return -ENOMEM; @@ -800,12 +884,13 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, static int __br_mdb_add(struct net *net, struct net_bridge *br, struct net_bridge_port *p, struct br_mdb_entry *entry, + struct nlattr **mdb_attrs, struct netlink_ext_ack *extack) { struct br_ip ip; int ret; - __mdb_entry_to_br_ip(entry, &ip); + __mdb_entry_to_br_ip(entry, &ip, mdb_attrs); spin_lock_bh(&br->multicast_lock); ret = br_mdb_add_group(br, p, &ip, entry, extack); @@ -875,18 +960,19 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (br_vlan_enabled(br->dev) && vg && entry->vid == 0) { list_for_each_entry(v, &vg->vlan_list, vlist) { entry->vid = v->vid; - err = __br_mdb_add(net, br, p, entry, extack); + err = __br_mdb_add(net, br, p, entry, mdb_attrs, extack); if (err) break; } } else { - err = __br_mdb_add(net, br, p, entry, extack); + err = __br_mdb_add(net, br, p, entry, mdb_attrs, extack); } return err; } -static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) +static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry, + struct nlattr **mdb_attrs) { struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; @@ -897,7 +983,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) if (!netif_running(br->dev) || !br_opt_get(br, BROPT_MULTICAST_ENABLED)) return -EINVAL; - __mdb_entry_to_br_ip(entry, &ip); + __mdb_entry_to_br_ip(entry, &ip, mdb_attrs); spin_lock_bh(&br->multicast_lock); mp = br_mdb_ip_get(br, &ip); @@ -971,10 +1057,10 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, if (br_vlan_enabled(br->dev) && vg && entry->vid == 0) { list_for_each_entry(v, &vg->vlan_list, vlist) { entry->vid = v->vid; - err = __br_mdb_del(br, entry); + err = __br_mdb_del(br, entry, mdb_attrs); } } else { - err = __br_mdb_del(br, entry); + err = __br_mdb_del(br, entry, mdb_attrs); } return err; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index a23d2bae56e1..0f54a7a7c186 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -873,6 +873,20 @@ static inline bool br_multicast_querier_exists(struct net_bridge *br, } } +static inline bool br_multicast_is_star_g(const struct br_ip *ip) +{ + switch (ip->proto) { + case htons(ETH_P_IP): + return ipv4_is_zeronet(ip->src.ip4); +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + return ipv6_addr_any(&ip->src.ip6); +#endif + default: + return false; + } +} + static inline int br_multicast_igmp_type(const struct sk_buff *skb) { return BR_INPUT_SKB_CB(skb)->igmp; -- cgit v1.2.3 From 8f8cb77e0b22d9044d8d57ab3bb18ea8d0474752 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 22 Sep 2020 10:30:21 +0300 Subject: net: bridge: mcast: add rt_protocol field to the port group struct We need to be able to differentiate between pg entries created by user-space and the kernel when we start generating S,G entries for IGMPv3/MLDv2's fast path. User-space entries are created by default as RTPROT_STATIC and the kernel entries are RTPROT_KERNEL. Later we can allow user-space to provide the entry rt_protocol so we can differentiate between who added the entries specifically (e.g. clag, admin, frr etc). Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- include/uapi/linux/if_bridge.h | 1 + net/bridge/br_mdb.c | 42 ++++++++++++++++++++++++++---------------- net/bridge/br_multicast.c | 7 +++++-- net/bridge/br_private.h | 3 ++- 4 files changed, 34 insertions(+), 19 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 3e6377c865eb..1054f151078d 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -458,6 +458,7 @@ enum { MDBA_MDB_EATTR_SRC_LIST, MDBA_MDB_EATTR_GROUP_MODE, MDBA_MDB_EATTR_SOURCE, + MDBA_MDB_EATTR_RTPROT, __MDBA_MDB_EATTR_MAX }; #define MDBA_MDB_EATTR_MAX (__MDBA_MDB_EATTR_MAX - 1) diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 7f9ca5c20120..b386a5e07698 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -184,6 +184,7 @@ static int __mdb_fill_info(struct sk_buff *skb, MDBA_MDB_EATTR_TIMER, br_timer_value(mtimer))) goto nest_err; + switch (mp->addr.proto) { case htons(ETH_P_IP): dump_srcs_mode = !!(mp->br->multicast_igmp_version == 3); @@ -206,10 +207,15 @@ static int __mdb_fill_info(struct sk_buff *skb, break; #endif } - if (p && dump_srcs_mode && - (__mdb_fill_srcs(skb, p) || - nla_put_u8(skb, MDBA_MDB_EATTR_GROUP_MODE, p->filter_mode))) - goto nest_err; + if (p) { + if (nla_put_u8(skb, MDBA_MDB_EATTR_RTPROT, p->rt_protocol)) + goto nest_err; + if (dump_srcs_mode && + (__mdb_fill_srcs(skb, p) || + nla_put_u8(skb, MDBA_MDB_EATTR_GROUP_MODE, + p->filter_mode))) + goto nest_err; + } nla_nest_end(skb, nest_ent); return 0; @@ -414,6 +420,9 @@ static size_t rtnl_mdb_nlmsg_size(struct net_bridge_port_group *pg) if (!pg) goto out; + /* MDBA_MDB_EATTR_RTPROT */ + nlmsg_size += nla_total_size(sizeof(u8)); + switch (pg->addr.proto) { case htons(ETH_P_IP): /* MDBA_MDB_EATTR_SOURCE */ @@ -809,16 +818,20 @@ static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh, } static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, - struct br_ip *group, struct br_mdb_entry *entry, + struct br_mdb_entry *entry, + struct nlattr **mdb_attrs, struct netlink_ext_ack *extack) { struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; unsigned long now = jiffies; + struct br_ip group; u8 filter_mode; int err; + __mdb_entry_to_br_ip(entry, &group, mdb_attrs); + /* host join errors which can happen before creating the group */ if (!port) { /* don't allow any flags for host-joined groups */ @@ -826,15 +839,15 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, NL_SET_ERR_MSG_MOD(extack, "Flags are not allowed for host groups"); return -EINVAL; } - if (!br_multicast_is_star_g(group)) { + if (!br_multicast_is_star_g(&group)) { NL_SET_ERR_MSG_MOD(extack, "Groups with sources cannot be manually host joined"); return -EINVAL; } } - mp = br_mdb_ip_get(br, group); + mp = br_mdb_ip_get(br, &group); if (!mp) { - mp = br_multicast_new_group(br, group); + mp = br_multicast_new_group(br, &group); err = PTR_ERR_OR_ZERO(mp); if (err) return err; @@ -864,11 +877,11 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, break; } - filter_mode = br_multicast_is_star_g(group) ? MCAST_EXCLUDE : - MCAST_INCLUDE; + filter_mode = br_multicast_is_star_g(&group) ? 
MCAST_EXCLUDE : + MCAST_INCLUDE; - p = br_multicast_new_port_group(port, group, *pp, entry->state, NULL, - filter_mode); + p = br_multicast_new_port_group(port, &group, *pp, entry->state, NULL, + filter_mode, RTPROT_STATIC); if (unlikely(!p)) { NL_SET_ERR_MSG_MOD(extack, "Couldn't allocate new port group"); return -ENOMEM; @@ -887,13 +900,10 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br, struct nlattr **mdb_attrs, struct netlink_ext_ack *extack) { - struct br_ip ip; int ret; - __mdb_entry_to_br_ip(entry, &ip, mdb_attrs); - spin_lock_bh(&br->multicast_lock); - ret = br_mdb_add_group(br, p, &ip, entry, extack); + ret = br_mdb_add_group(br, p, entry, mdb_attrs, extack); spin_unlock_bh(&br->multicast_lock); return ret; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 4fd690bc848f..b6e7b0ece422 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -795,7 +795,8 @@ struct net_bridge_port_group *br_multicast_new_port_group( struct net_bridge_port_group __rcu *next, unsigned char flags, const unsigned char *src, - u8 filter_mode) + u8 filter_mode, + u8 rt_protocol) { struct net_bridge_port_group *p; @@ -807,6 +808,7 @@ struct net_bridge_port_group *br_multicast_new_port_group( p->port = port; p->flags = flags; p->filter_mode = filter_mode; + p->rt_protocol = rt_protocol; p->mcast_gc.destroy = br_multicast_destroy_port_group; INIT_HLIST_HEAD(&p->src_list); rcu_assign_pointer(p->next, next); @@ -892,7 +894,8 @@ static int br_multicast_add_group(struct net_bridge *br, break; } - p = br_multicast_new_port_group(port, group, *pp, 0, src, filter_mode); + p = br_multicast_new_port_group(port, group, *pp, 0, src, filter_mode, + RTPROT_KERNEL); if (unlikely(!p)) goto err; rcu_assign_pointer(*pp, p); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 0f54a7a7c186..dae7e3526fc7 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -246,6 +246,7 @@ struct net_bridge_port_group { unsigned char flags; unsigned char filter_mode; unsigned char grp_query_rexmit_cnt; + unsigned char rt_protocol; struct hlist_head src_list; unsigned int src_ents; @@ -804,7 +805,7 @@ struct net_bridge_port_group * br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group, struct net_bridge_port_group __rcu *next, unsigned char flags, const unsigned char *src, - u8 filter_mode); + u8 filter_mode, u8 rt_protocol); int br_mdb_hash_init(struct net_bridge *br); void br_mdb_hash_fini(struct net_bridge *br); void br_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, -- cgit v1.2.3 From 8266a0491e92d39dc9af739e8380a0daa9b8836b Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 22 Sep 2020 10:30:24 +0300 Subject: net: bridge: mcast: handle port group filter modes We need to handle group filter mode transitions and initial state. To change a port group's INCLUDE -> EXCLUDE mode (or when we have added a new port group in EXCLUDE mode) we need to add that port to all of *,G ports' S,G entries for proper replication. When the EXCLUDE state is changed from IGMPv3 report, br_multicast_fwd_filter_exclude() must be called after the source list processing because the assumption is that all of the group's S,G entries will be created before transitioning to EXCLUDE mode, i.e. most importantly its blocked entries will already be added so it will not get automatically added to them. 
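[Editor's illustration, not kernel code: a toy model of both directions of the transition described in this message. Blocked entries are omitted for brevity; the point is only the bookkeeping that auto-added memberships are flagged so an EXCLUDE -> INCLUDE transition removes exactly those and leaves explicit S,G joins alone.]

```c
#include <stdbool.h>
#include <stdio.h>

#define MAX_PORTS 4
#define MAX_SG 4

struct sg_entry {
	const char *src;		/* source S of the S,G */
	bool member[MAX_PORTS];		/* ports forwarding this S,G */
	bool star_excl[MAX_PORTS];	/* auto-added via *,G EXCLUDE */
};

static struct sg_entry sg[MAX_SG];
static int nsg;

/* A port's *,G entry goes EXCLUDE: join every S,G of the group. */
static void star_g_to_exclude(int port)
{
	for (int i = 0; i < nsg; i++)
		if (!sg[i].member[port]) {
			sg[i].member[port] = true;
			sg[i].star_excl[port] = true;
		}
}

/* Back to INCLUDE (e.g. timer expiry): drop only auto-added entries. */
static void star_g_to_include(int port)
{
	for (int i = 0; i < nsg; i++)
		if (sg[i].star_excl[port]) {
			sg[i].member[port] = false;
			sg[i].star_excl[port] = false;
		}
}

int main(void)
{
	nsg = 2;
	sg[0].src = "10.0.0.1";
	sg[1].src = "10.0.0.2";
	sg[1].member[1] = true;		/* port 1 explicitly joined S2,G */

	star_g_to_exclude(0);		/* port 0's *,G goes EXCLUDE */
	star_g_to_include(0);		/* and reverts on timer expiry */

	for (int i = 0; i < nsg; i++)	/* port 1's explicit join survives */
		printf("%s,G: port0=%d port1=%d\n", sg[i].src,
		       sg[i].member[0], sg[i].member[1]);
	return 0;
}
```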
The transition EXCLUDE -> INCLUDE happens only when a port group timer expires, it requires us to remove that port from all of *,G ports' S,G entries where it was automatically added previously. Finally when we are adding a new S,G entry we must add all of *,G's EXCLUDE ports to it. In order to distinguish automatically added *,G EXCLUDE ports we have a new port group flag - MDB_PG_FLAGS_STAR_EXCL. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + net/bridge/br_mdb.c | 25 +++++- net/bridge/br_multicast.c | 172 +++++++++++++++++++++++++++++++++++++++++ net/bridge/br_private.h | 20 +++++ 4 files changed, 216 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 1054f151078d..e4bd30a25f6b 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -518,6 +518,7 @@ struct br_mdb_entry { __u8 state; #define MDB_FLAGS_OFFLOAD (1 << 0) #define MDB_FLAGS_FAST_LEAVE (1 << 1) +#define MDB_FLAGS_STAR_EXCL (1 << 2) __u8 flags; __u16 vid; struct { diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 4e3a5cefc626..28cd35a9cf37 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -62,6 +62,8 @@ static void __mdb_entry_fill_flags(struct br_mdb_entry *e, unsigned char flags) e->flags |= MDB_FLAGS_OFFLOAD; if (flags & MDB_PG_FLAGS_FAST_LEAVE) e->flags |= MDB_FLAGS_FAST_LEAVE; + if (flags & MDB_PG_FLAGS_STAR_EXCL) + e->flags |= MDB_FLAGS_STAR_EXCL; } static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip, @@ -822,11 +824,11 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, struct nlattr **mdb_attrs, struct netlink_ext_ack *extack) { - struct net_bridge_mdb_entry *mp; + struct net_bridge_mdb_entry *mp, *star_mp; struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; + struct br_ip group, star_group; unsigned long now = jiffies; - struct br_ip group; u8 filter_mode; int err; @@ -890,6 +892,25 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, if (entry->state == MDB_TEMPORARY) mod_timer(&p->timer, now + br->multicast_membership_interval); br_mdb_notify(br->dev, mp, p, RTM_NEWMDB); + /* if we are adding a new EXCLUDE port group (*,G) it needs to be also + * added to all S,G entries for proper replication, if we are adding + * a new INCLUDE port (S,G) then all of *,G EXCLUDE ports need to be + * added to it for proper replication + */ + if (br_multicast_should_handle_mode(br, group.proto)) { + switch (filter_mode) { + case MCAST_EXCLUDE: + br_multicast_star_g_handle_mode(p, MCAST_EXCLUDE); + break; + case MCAST_INCLUDE: + star_group = p->key.addr; + memset(&star_group.src, 0, sizeof(star_group.src)); + star_mp = br_mdb_ip_get(br, &star_group); + if (star_mp) + br_multicast_sg_add_exclude_ports(star_mp, p); + break; + } + } return 0; } diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index ece8ac805e98..f39bbd733722 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -73,6 +73,8 @@ __br_multicast_add_group(struct net_bridge *br, const unsigned char *src, u8 filter_mode, bool igmpv2_mldv1); +static void br_multicast_find_del_pg(struct net_bridge *br, + struct net_bridge_port_group *pg); static struct net_bridge_port_group * br_sg_port_find(struct net_bridge *br, @@ -195,8 +197,163 @@ static bool br_port_group_equal(struct net_bridge_port_group *p, return ether_addr_equal(src, 
p->eth_addr); } +static void __fwd_add_star_excl(struct net_bridge_port_group *pg, + struct br_ip *sg_ip) +{ + struct net_bridge_port_group_sg_key sg_key; + struct net_bridge *br = pg->key.port->br; + struct net_bridge_port_group *src_pg; + + memset(&sg_key, 0, sizeof(sg_key)); + sg_key.port = pg->key.port; + sg_key.addr = *sg_ip; + if (br_sg_port_find(br, &sg_key)) + return; + + src_pg = __br_multicast_add_group(br, pg->key.port, sg_ip, pg->eth_addr, + MCAST_INCLUDE, false); + if (IS_ERR_OR_NULL(src_pg) || + src_pg->rt_protocol != RTPROT_KERNEL) + return; + + src_pg->flags |= MDB_PG_FLAGS_STAR_EXCL; +} + +static void __fwd_del_star_excl(struct net_bridge_port_group *pg, + struct br_ip *sg_ip) +{ + struct net_bridge_port_group_sg_key sg_key; + struct net_bridge *br = pg->key.port->br; + struct net_bridge_port_group *src_pg; + + memset(&sg_key, 0, sizeof(sg_key)); + sg_key.port = pg->key.port; + sg_key.addr = *sg_ip; + src_pg = br_sg_port_find(br, &sg_key); + if (!src_pg || !(src_pg->flags & MDB_PG_FLAGS_STAR_EXCL) || + src_pg->rt_protocol != RTPROT_KERNEL) + return; + + br_multicast_find_del_pg(br, src_pg); +} + +/* When a port group transitions to (or is added as) EXCLUDE we need to add it + * to all other ports' S,G entries which are not blocked by the current group + * for proper replication, the assumption is that any S,G blocked entries + * are already added so the S,G,port lookup should skip them. + * When a port group transitions from EXCLUDE -> INCLUDE mode or is being + * deleted we need to remove it from all ports' S,G entries where it was + * automatically installed before (i.e. where it's MDB_PG_FLAGS_STAR_EXCL). + */ +void br_multicast_star_g_handle_mode(struct net_bridge_port_group *pg, + u8 filter_mode) +{ + struct net_bridge *br = pg->key.port->br; + struct net_bridge_port_group *pg_lst; + struct net_bridge_mdb_entry *mp; + struct br_ip sg_ip; + + if (WARN_ON(!br_multicast_is_star_g(&pg->key.addr))) + return; + + mp = br_mdb_ip_get(br, &pg->key.addr); + if (!mp) + return; + + memset(&sg_ip, 0, sizeof(sg_ip)); + sg_ip = pg->key.addr; + for (pg_lst = mlock_dereference(mp->ports, br); + pg_lst; + pg_lst = mlock_dereference(pg_lst->next, br)) { + struct net_bridge_group_src *src_ent; + + if (pg_lst == pg) + continue; + hlist_for_each_entry(src_ent, &pg_lst->src_list, node) { + if (!(src_ent->flags & BR_SGRP_F_INSTALLED)) + continue; + sg_ip.src = src_ent->addr.src; + switch (filter_mode) { + case MCAST_INCLUDE: + __fwd_del_star_excl(pg, &sg_ip); + break; + case MCAST_EXCLUDE: + __fwd_add_star_excl(pg, &sg_ip); + break; + } + } + } +} + +static void br_multicast_sg_del_exclude_ports(struct net_bridge_mdb_entry *sgmp) +{ + struct net_bridge_port_group __rcu **pp; + struct net_bridge_port_group *p; + + /* *,G exclude ports are only added to S,G entries */ + if (WARN_ON(br_multicast_is_star_g(&sgmp->addr))) + return; + + /* we need the STAR_EXCLUDE ports if there are non-STAR_EXCLUDE ports + * we should ignore perm entries since they're managed by user-space + */ + for (pp = &sgmp->ports; + (p = mlock_dereference(*pp, sgmp->br)) != NULL; + pp = &p->next) + if (!(p->flags & (MDB_PG_FLAGS_STAR_EXCL | + MDB_PG_FLAGS_PERMANENT))) + return; + + for (pp = &sgmp->ports; + (p = mlock_dereference(*pp, sgmp->br)) != NULL;) { + if (!(p->flags & MDB_PG_FLAGS_PERMANENT)) + br_multicast_del_pg(sgmp, p, pp); + else + pp = &p->next; + } +} + +void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp, + struct net_bridge_port_group *sg) +{ + struct net_bridge_port_group_sg_key 
sg_key; + struct net_bridge *br = star_mp->br; + struct net_bridge_port_group *pg; + + if (WARN_ON(br_multicast_is_star_g(&sg->key.addr))) + return; + if (WARN_ON(!br_multicast_is_star_g(&star_mp->addr))) + return; + + memset(&sg_key, 0, sizeof(sg_key)); + sg_key.addr = sg->key.addr; + /* we need to add all exclude ports to the S,G */ + for (pg = mlock_dereference(star_mp->ports, br); + pg; + pg = mlock_dereference(pg->next, br)) { + struct net_bridge_port_group *src_pg; + + if (pg == sg || pg->filter_mode == MCAST_INCLUDE) + continue; + + sg_key.port = pg->key.port; + if (br_sg_port_find(br, &sg_key)) + continue; + + src_pg = __br_multicast_add_group(br, pg->key.port, + &sg->key.addr, + sg->eth_addr, + MCAST_INCLUDE, false); + if (IS_ERR_OR_NULL(src_pg) || + src_pg->rt_protocol != RTPROT_KERNEL) + continue; + src_pg->flags |= MDB_PG_FLAGS_STAR_EXCL; + } +} + static void br_multicast_fwd_src_add(struct net_bridge_group_src *src) { + struct net_bridge_mdb_entry *star_mp; struct net_bridge_port_group *sg; struct br_ip sg_ip; @@ -211,6 +368,7 @@ static void br_multicast_fwd_src_add(struct net_bridge_group_src *src) if (IS_ERR_OR_NULL(sg)) return; src->flags |= BR_SGRP_F_INSTALLED; + sg->flags &= ~MDB_PG_FLAGS_STAR_EXCL; /* if it was added by user-space as perm we can skip next steps */ if (sg->rt_protocol != RTPROT_KERNEL && @@ -219,6 +377,11 @@ static void br_multicast_fwd_src_add(struct net_bridge_group_src *src) /* the kernel is now responsible for removing this S,G */ del_timer(&sg->timer); + star_mp = br_mdb_ip_get(src->br, &src->pg->key.addr); + if (!star_mp) + return; + + br_multicast_sg_add_exclude_ports(star_mp, sg); } static void br_multicast_fwd_src_remove(struct net_bridge_group_src *src) @@ -349,6 +512,10 @@ void br_multicast_del_pg(struct net_bridge_mdb_entry *mp, hlist_for_each_entry_safe(ent, tmp, &pg->src_list, node) br_multicast_del_group_src(ent); br_mdb_notify(br->dev, mp, pg, RTM_DELMDB); + if (!br_multicast_is_star_g(&mp->addr)) + br_multicast_sg_del_exclude_ports(mp); + else + br_multicast_star_g_handle_mode(pg, MCAST_INCLUDE); hlist_add_head(&pg->mcast_gc.gc_node, &br->mcast_gc_list); queue_work(system_long_wq, &br->mcast_gc_work); @@ -407,6 +574,9 @@ static void br_multicast_port_group_expired(struct timer_list *t) } else if (changed) { struct net_bridge_mdb_entry *mp = br_mdb_ip_get(br, &pg->key.addr); + if (changed && br_multicast_is_star_g(&pg->key.addr)) + br_multicast_star_g_handle_mode(pg, MCAST_INCLUDE); + if (WARN_ON(!mp)) goto out; br_mdb_notify(br->dev, mp, pg, RTM_NEWMDB); @@ -1641,6 +1811,7 @@ static bool br_multicast_isexc(struct net_bridge_port_group *pg, switch (pg->filter_mode) { case MCAST_INCLUDE: __grp_src_isexc_incl(pg, srcs, nsrcs, src_size); + br_multicast_star_g_handle_mode(pg, MCAST_EXCLUDE); changed = true; break; case MCAST_EXCLUDE: @@ -1853,6 +2024,7 @@ static bool br_multicast_toex(struct net_bridge_port_group *pg, switch (pg->filter_mode) { case MCAST_INCLUDE: __grp_src_toex_incl(pg, srcs, nsrcs, src_size); + br_multicast_star_g_handle_mode(pg, MCAST_EXCLUDE); changed = true; break; case MCAST_EXCLUDE: diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 93d76b3dfc35..128d2d0417a0 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -213,6 +213,7 @@ struct net_bridge_fdb_entry { #define MDB_PG_FLAGS_PERMANENT BIT(0) #define MDB_PG_FLAGS_OFFLOAD BIT(1) #define MDB_PG_FLAGS_FAST_LEAVE BIT(2) +#define MDB_PG_FLAGS_STAR_EXCL BIT(3) #define PG_SRC_ENT_LIMIT 32 @@ -833,6 +834,10 @@ void br_mdb_init(void); void 
br_mdb_uninit(void); void br_multicast_host_join(struct net_bridge_mdb_entry *mp, bool notify); void br_multicast_host_leave(struct net_bridge_mdb_entry *mp, bool notify); +void br_multicast_star_g_handle_mode(struct net_bridge_port_group *pg, + u8 filter_mode); +void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp, + struct net_bridge_port_group *sg); #define mlock_dereference(X, br) \ rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock)) @@ -895,6 +900,21 @@ static inline bool br_multicast_is_star_g(const struct br_ip *ip) } } +static inline bool br_multicast_should_handle_mode(const struct net_bridge *br, + __be16 proto) +{ + switch (proto) { + case htons(ETH_P_IP): + return !!(br->multicast_igmp_version == 3); +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + return !!(br->multicast_mld_version == 2); +#endif + default: + return false; + } +} + static inline int br_multicast_igmp_type(const struct sk_buff *skb) { return BR_INPUT_SKB_CB(skb)->igmp; -- cgit v1.2.3 From 9116ffbf1dd71f953ffda4198d01f82d3ca16df8 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 22 Sep 2020 10:30:25 +0300 Subject: net: bridge: mcast: add support for blocked port groups When excluding S,G entries we need a way to block a particular S,G,port. The new port group flag is managed based on the source's timer as per RFCs 3376 and 3810. When a source expires and its port group is in EXCLUDE mode, it will be blocked. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + net/bridge/br_mdb.c | 2 ++ net/bridge/br_multicast.c | 49 ++++++++++++++++++++++++++++++++++++------ net/bridge/br_private.h | 1 + 4 files changed, 47 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index e4bd30a25f6b..4c687686aa8f 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -519,6 +519,7 @@ struct br_mdb_entry { #define MDB_FLAGS_OFFLOAD (1 << 0) #define MDB_FLAGS_FAST_LEAVE (1 << 1) #define MDB_FLAGS_STAR_EXCL (1 << 2) +#define MDB_FLAGS_BLOCKED (1 << 3) __u8 flags; __u16 vid; struct { diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 28cd35a9cf37..e15bab19a012 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -64,6 +64,8 @@ static void __mdb_entry_fill_flags(struct br_mdb_entry *e, unsigned char flags) e->flags |= MDB_FLAGS_FAST_LEAVE; if (flags & MDB_PG_FLAGS_STAR_EXCL) e->flags |= MDB_FLAGS_STAR_EXCL; + if (flags & MDB_PG_FLAGS_BLOCKED) + e->flags |= MDB_FLAGS_BLOCKED; } static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip, diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index f39bbd733722..11d224c01914 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -72,7 +72,8 @@ __br_multicast_add_group(struct net_bridge *br, struct br_ip *group, const unsigned char *src, u8 filter_mode, - bool igmpv2_mldv1); + bool igmpv2_mldv1, + bool blocked); static void br_multicast_find_del_pg(struct net_bridge *br, struct net_bridge_port_group *pg); @@ -211,7 +212,7 @@ static void __fwd_add_star_excl(struct net_bridge_port_group *pg, return; src_pg = __br_multicast_add_group(br, pg->key.port, sg_ip, pg->eth_addr, - MCAST_INCLUDE, false); + MCAST_INCLUDE, false, false); if (IS_ERR_OR_NULL(src_pg) || src_pg->rt_protocol != RTPROT_KERNEL) return; @@ -343,7 +344,7 @@ void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp, 
src_pg = __br_multicast_add_group(br, pg->key.port, &sg->key.addr, sg->eth_addr, - MCAST_INCLUDE, false); + MCAST_INCLUDE, false, false); if (IS_ERR_OR_NULL(src_pg) || src_pg->rt_protocol != RTPROT_KERNEL) continue; @@ -364,7 +365,8 @@ static void br_multicast_fwd_src_add(struct net_bridge_group_src *src) sg_ip = src->pg->key.addr; sg_ip.src = src->addr.src; sg = __br_multicast_add_group(src->br, src->pg->key.port, &sg_ip, - src->pg->eth_addr, MCAST_INCLUDE, false); + src->pg->eth_addr, MCAST_INCLUDE, false, + !timer_pending(&src->timer)); if (IS_ERR_OR_NULL(sg)) return; src->flags |= BR_SGRP_F_INSTALLED; @@ -415,9 +417,38 @@ static void br_multicast_fwd_src_remove(struct net_bridge_group_src *src) src->flags &= ~BR_SGRP_F_INSTALLED; } +/* install S,G and based on src's timer enable or disable forwarding */ static void br_multicast_fwd_src_handle(struct net_bridge_group_src *src) { + struct net_bridge_port_group_sg_key sg_key; + struct net_bridge_port_group *sg; + u8 old_flags; + br_multicast_fwd_src_add(src); + + memset(&sg_key, 0, sizeof(sg_key)); + sg_key.addr = src->pg->key.addr; + sg_key.addr.src = src->addr.src; + sg_key.port = src->pg->key.port; + + sg = br_sg_port_find(src->br, &sg_key); + if (!sg || (sg->flags & MDB_PG_FLAGS_PERMANENT)) + return; + + old_flags = sg->flags; + if (timer_pending(&src->timer)) + sg->flags &= ~MDB_PG_FLAGS_BLOCKED; + else + sg->flags |= MDB_PG_FLAGS_BLOCKED; + + if (old_flags != sg->flags) { + struct net_bridge_mdb_entry *sg_mp; + + sg_mp = br_mdb_ip_get(src->br, &sg_key.addr); + if (!sg_mp) + return; + br_mdb_notify(src->br->dev, sg_mp, sg, RTM_NEWMDB); + } } static void br_multicast_destroy_mdb_entry(struct net_bridge_mcast_gc *gc) @@ -995,7 +1026,10 @@ static void br_multicast_group_src_expired(struct timer_list *t) if (!hlist_empty(&pg->src_list)) goto out; br_multicast_find_del_pg(br, pg); + } else { + br_multicast_fwd_src_handle(src); } + out: spin_unlock(&br->multicast_lock); } @@ -1131,7 +1165,8 @@ __br_multicast_add_group(struct net_bridge *br, struct br_ip *group, const unsigned char *src, u8 filter_mode, - bool igmpv2_mldv1) + bool igmpv2_mldv1, + bool blocked) { struct net_bridge_port_group __rcu **pp; struct net_bridge_port_group *p = NULL; @@ -1167,6 +1202,8 @@ __br_multicast_add_group(struct net_bridge *br, goto out; } rcu_assign_pointer(*pp, p); + if (blocked) + p->flags |= MDB_PG_FLAGS_BLOCKED; br_mdb_notify(br->dev, mp, p, RTM_NEWMDB); found: @@ -1189,7 +1226,7 @@ static int br_multicast_add_group(struct net_bridge *br, spin_lock(&br->multicast_lock); pg = __br_multicast_add_group(br, port, group, src, filter_mode, - igmpv2_mldv1); + igmpv2_mldv1, false); /* NULL is considered valid for host joined groups */ err = IS_ERR(pg) ? PTR_ERR(pg) : 0; spin_unlock(&br->multicast_lock); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 128d2d0417a0..345118e35c42 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -214,6 +214,7 @@ struct net_bridge_fdb_entry { #define MDB_PG_FLAGS_OFFLOAD BIT(1) #define MDB_PG_FLAGS_FAST_LEAVE BIT(2) #define MDB_PG_FLAGS_STAR_EXCL BIT(3) +#define MDB_PG_FLAGS_BLOCKED BIT(4) #define PG_SRC_ENT_LIMIT 32 -- cgit v1.2.3 From 77ebdabe8de7c02f43c6de3357f79ff96f9f0579 Mon Sep 17 00:00:00 2001 From: Elena Petrova Date: Fri, 18 Sep 2020 16:42:16 +0100 Subject: crypto: af_alg - add extra parameters for DRBG interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend the user-space RNG interface: 1. 
Add entropy input via ALG_SET_DRBG_ENTROPY setsockopt option; 2. Add additional data input via sendmsg syscall. This allows DRBG to be tested with test vectors, for example for the purpose of CAVP testing, which otherwise isn't possible. To prevent erroneous use of entropy input, it is hidden under CRYPTO_USER_API_RNG_CAVP config option and requires CAP_SYS_ADMIN to succeed. Signed-off-by: Elena Petrova Acked-by: Stephan Müller Reviewed-by: Eric Biggers Signed-off-by: Herbert Xu --- Documentation/crypto/userspace-if.rst | 20 +++- crypto/Kconfig | 9 ++ crypto/af_alg.c | 14 ++- crypto/algif_rng.c | 175 +++++++++++++++++++++++++++++++--- include/crypto/if_alg.h | 1 + include/uapi/linux/if_alg.h | 1 + 6 files changed, 205 insertions(+), 15 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/crypto/userspace-if.rst b/Documentation/crypto/userspace-if.rst index 52019e905900..b45dabbf69d6 100644 --- a/Documentation/crypto/userspace-if.rst +++ b/Documentation/crypto/userspace-if.rst @@ -296,15 +296,16 @@ follows: struct sockaddr_alg sa = { .salg_family = AF_ALG, - .salg_type = "rng", /* this selects the symmetric cipher */ - .salg_name = "drbg_nopr_sha256" /* this is the cipher name */ + .salg_type = "rng", /* this selects the random number generator */ + .salg_name = "drbg_nopr_sha256" /* this is the RNG name */ }; Depending on the RNG type, the RNG must be seeded. The seed is provided using the setsockopt interface to set the key. For example, the ansi_cprng requires a seed. The DRBGs do not require a seed, but may be -seeded. +seeded. The seed is also known as a *Personalization String* in NIST SP 800-90A +standard. Using the read()/recvmsg() system calls, random numbers can be obtained. The kernel generates at most 128 bytes in one call. If user space @@ -314,6 +315,16 @@ WARNING: The user space caller may invoke the initially mentioned accept system call multiple times. In this case, the returned file descriptors have the same state. +Following CAVP testing interfaces are enabled when kernel is built with +CRYPTO_USER_API_RNG_CAVP option: + +- the concatenation of *Entropy* and *Nonce* can be provided to the RNG via + ALG_SET_DRBG_ENTROPY setsockopt interface. Setting the entropy requires + CAP_SYS_ADMIN permission. + +- *Additional Data* can be provided using the send()/sendmsg() system calls, + but only after the entropy has been set. + Zero-Copy Interface ------------------- @@ -377,6 +388,9 @@ mentioned optname: provided ciphertext is assumed to contain an authentication tag of the given size (see section about AEAD memory layout below). +- ALG_SET_DRBG_ENTROPY -- Setting the entropy of the random number generator. + This option is applicable to RNG cipher type only. + User space API example ---------------------- diff --git a/crypto/Kconfig b/crypto/Kconfig index fac10143d23f..88f98caaf30d 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1875,6 +1875,15 @@ config CRYPTO_USER_API_RNG This option enables the user-spaces interface for random number generator algorithms. +config CRYPTO_USER_API_RNG_CAVP + bool "Enable CAVP testing of DRBG" + depends on CRYPTO_USER_API_RNG && CRYPTO_DRBG + help + This option enables extra API for CAVP testing via the user-space + interface: resetting of DRBG entropy, and providing Additional Data. + This should only be enabled for CAVP testing. You should say + no unless you know what this is. 
+ config CRYPTO_USER_API_AEAD tristate "User-space interface for AEAD cipher algorithms" depends on NET diff --git a/crypto/af_alg.c b/crypto/af_alg.c index a6f581ab200c..8535cb03b484 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -253,6 +253,14 @@ static int alg_setsockopt(struct socket *sock, int level, int optname, if (!type->setauthsize) goto unlock; err = type->setauthsize(ask->private, optlen); + break; + case ALG_SET_DRBG_ENTROPY: + if (sock->state == SS_CONNECTED) + goto unlock; + if (!type->setentropy) + goto unlock; + + err = type->setentropy(ask->private, optval, optlen); } unlock: @@ -285,6 +293,11 @@ int af_alg_accept(struct sock *sk, struct socket *newsock, bool kern) security_sock_graft(sk2, newsock); security_sk_clone(sk, sk2); + /* + * newsock->ops assigned here to allow type->accept call to override + * them when required. + */ + newsock->ops = type->ops; err = type->accept(ask->private, sk2); nokey = err == -ENOKEY; @@ -303,7 +316,6 @@ int af_alg_accept(struct sock *sk, struct socket *newsock, bool kern) alg_sk(sk2)->parent = sk; alg_sk(sk2)->type = type; - newsock->ops = type->ops; newsock->state = SS_CONNECTED; if (nokey) diff --git a/crypto/algif_rng.c b/crypto/algif_rng.c index 6300e0566dc5..407408c43730 100644 --- a/crypto/algif_rng.c +++ b/crypto/algif_rng.c @@ -38,6 +38,7 @@ * DAMAGE. */ +#include #include #include #include @@ -53,15 +54,26 @@ struct rng_ctx { #define MAXSIZE 128 unsigned int len; struct crypto_rng *drng; + u8 *addtl; + size_t addtl_len; }; -static int rng_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, - int flags) +struct rng_parent_ctx { + struct crypto_rng *drng; + u8 *entropy; +}; + +static void rng_reset_addtl(struct rng_ctx *ctx) { - struct sock *sk = sock->sk; - struct alg_sock *ask = alg_sk(sk); - struct rng_ctx *ctx = ask->private; - int err; + kfree_sensitive(ctx->addtl); + ctx->addtl = NULL; + ctx->addtl_len = 0; +} + +static int _rng_recvmsg(struct crypto_rng *drng, struct msghdr *msg, size_t len, + u8 *addtl, size_t addtl_len) +{ + int err = 0; int genlen = 0; u8 result[MAXSIZE]; @@ -82,7 +94,7 @@ static int rng_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, * seeding as they automatically seed. The X9.31 DRNG will return * an error if it was not seeded properly. */ - genlen = crypto_rng_get_bytes(ctx->drng, result, len); + genlen = crypto_rng_generate(drng, addtl, addtl_len, result, len); if (genlen < 0) return genlen; @@ -92,6 +104,63 @@ static int rng_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, return err ? 
err : len; } +static int rng_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) +{ + struct sock *sk = sock->sk; + struct alg_sock *ask = alg_sk(sk); + struct rng_ctx *ctx = ask->private; + + return _rng_recvmsg(ctx->drng, msg, len, NULL, 0); +} + +static int rng_test_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) +{ + struct sock *sk = sock->sk; + struct alg_sock *ask = alg_sk(sk); + struct rng_ctx *ctx = ask->private; + int ret; + + lock_sock(sock->sk); + ret = _rng_recvmsg(ctx->drng, msg, len, ctx->addtl, ctx->addtl_len); + rng_reset_addtl(ctx); + release_sock(sock->sk); + + return ret; +} + +static int rng_test_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) +{ + int err; + struct alg_sock *ask = alg_sk(sock->sk); + struct rng_ctx *ctx = ask->private; + + lock_sock(sock->sk); + if (len > MAXSIZE) { + err = -EMSGSIZE; + goto unlock; + } + + rng_reset_addtl(ctx); + ctx->addtl = kmalloc(len, GFP_KERNEL); + if (!ctx->addtl) { + err = -ENOMEM; + goto unlock; + } + + err = memcpy_from_msg(ctx->addtl, msg, len); + if (err) { + rng_reset_addtl(ctx); + goto unlock; + } + ctx->addtl_len = len; + +unlock: + release_sock(sock->sk); + return err ? err : len; +} + static struct proto_ops algif_rng_ops = { .family = PF_ALG, @@ -111,14 +180,53 @@ static struct proto_ops algif_rng_ops = { .recvmsg = rng_recvmsg, }; +static struct proto_ops __maybe_unused algif_rng_test_ops = { + .family = PF_ALG, + + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .getname = sock_no_getname, + .ioctl = sock_no_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .mmap = sock_no_mmap, + .bind = sock_no_bind, + .accept = sock_no_accept, + .sendpage = sock_no_sendpage, + + .release = af_alg_release, + .recvmsg = rng_test_recvmsg, + .sendmsg = rng_test_sendmsg, +}; + static void *rng_bind(const char *name, u32 type, u32 mask) { - return crypto_alloc_rng(name, type, mask); + struct rng_parent_ctx *pctx; + struct crypto_rng *rng; + + pctx = kzalloc(sizeof(*pctx), GFP_KERNEL); + if (!pctx) + return ERR_PTR(-ENOMEM); + + rng = crypto_alloc_rng(name, type, mask); + if (IS_ERR(rng)) { + kfree(pctx); + return ERR_CAST(rng); + } + + pctx->drng = rng; + return pctx; } static void rng_release(void *private) { - crypto_free_rng(private); + struct rng_parent_ctx *pctx = private; + + if (unlikely(!pctx)) + return; + crypto_free_rng(pctx->drng); + kfree_sensitive(pctx->entropy); + kfree_sensitive(pctx); } static void rng_sock_destruct(struct sock *sk) @@ -126,6 +234,7 @@ static void rng_sock_destruct(struct sock *sk) struct alg_sock *ask = alg_sk(sk); struct rng_ctx *ctx = ask->private; + rng_reset_addtl(ctx); sock_kfree_s(sk, ctx, ctx->len); af_alg_release_parent(sk); } @@ -133,6 +242,7 @@ static void rng_sock_destruct(struct sock *sk) static int rng_accept_parent(void *private, struct sock *sk) { struct rng_ctx *ctx; + struct rng_parent_ctx *pctx = private; struct alg_sock *ask = alg_sk(sk); unsigned int len = sizeof(*ctx); @@ -141,6 +251,8 @@ static int rng_accept_parent(void *private, struct sock *sk) return -ENOMEM; ctx->len = len; + ctx->addtl = NULL; + ctx->addtl_len = 0; /* * No seeding done at that point -- if multiple accepts are @@ -148,20 +260,58 @@ static int rng_accept_parent(void *private, struct sock *sk) * state of the RNG. 
*/ - ctx->drng = private; + ctx->drng = pctx->drng; ask->private = ctx; sk->sk_destruct = rng_sock_destruct; + /* + * Non NULL pctx->entropy means that CAVP test has been initiated on + * this socket, replace proto_ops algif_rng_ops with algif_rng_test_ops. + */ + if (IS_ENABLED(CONFIG_CRYPTO_USER_API_RNG_CAVP) && pctx->entropy) + sk->sk_socket->ops = &algif_rng_test_ops; + return 0; } static int rng_setkey(void *private, const u8 *seed, unsigned int seedlen) { + struct rng_parent_ctx *pctx = private; /* * Check whether seedlen is of sufficient size is done in RNG * implementations. */ - return crypto_rng_reset(private, seed, seedlen); + return crypto_rng_reset(pctx->drng, seed, seedlen); +} + +static int __maybe_unused rng_setentropy(void *private, sockptr_t entropy, + unsigned int len) +{ + struct rng_parent_ctx *pctx = private; + u8 *kentropy = NULL; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (pctx->entropy) + return -EINVAL; + + if (len > MAXSIZE) + return -EMSGSIZE; + + if (len) { + kentropy = memdup_sockptr(entropy, len); + if (IS_ERR(kentropy)) + return PTR_ERR(kentropy); + } + + crypto_rng_alg(pctx->drng)->set_ent(pctx->drng, kentropy, len); + /* + * Since rng doesn't perform any memory management for the entropy + * buffer, save kentropy pointer to pctx now to free it after use. + */ + pctx->entropy = kentropy; + return 0; } static const struct af_alg_type algif_type_rng = { @@ -169,6 +319,9 @@ static const struct af_alg_type algif_type_rng = { .release = rng_release, .accept = rng_accept_parent, .setkey = rng_setkey, +#ifdef CONFIG_CRYPTO_USER_API_RNG_CAVP + .setentropy = rng_setentropy, +#endif .ops = &algif_rng_ops, .name = "rng", .owner = THIS_MODULE diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h index ee6412314f8f..a5db86670bdf 100644 --- a/include/crypto/if_alg.h +++ b/include/crypto/if_alg.h @@ -46,6 +46,7 @@ struct af_alg_type { void *(*bind)(const char *name, u32 type, u32 mask); void (*release)(void *private); int (*setkey)(void *private, const u8 *key, unsigned int keylen); + int (*setentropy)(void *private, sockptr_t entropy, unsigned int len); int (*accept)(void *private, struct sock *sk); int (*accept_nokey)(void *private, struct sock *sk); int (*setauthsize)(void *private, unsigned int authsize); diff --git a/include/uapi/linux/if_alg.h b/include/uapi/linux/if_alg.h index bc2bcdec377b..60b7c2efd921 100644 --- a/include/uapi/linux/if_alg.h +++ b/include/uapi/linux/if_alg.h @@ -35,6 +35,7 @@ struct af_alg_iv { #define ALG_SET_OP 3 #define ALG_SET_AEAD_ASSOCLEN 4 #define ALG_SET_AEAD_AUTHSIZE 5 +#define ALG_SET_DRBG_ENTROPY 6 /* Operations */ #define ALG_OP_DECRYPT 0 -- cgit v1.2.3 From 2a36ab717e8fe678d98f81c14a0b124712719840 Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Wed, 23 Sep 2020 16:36:16 -0700 Subject: rseq/membarrier: Add MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ This patchset is based on Google-internal RSEQ work done by Paul Turner and Andrew Hunter. When working with per-CPU RSEQ-based memory allocations, it is sometimes important to make sure that a global memory location is no longer accessed from RSEQ critical sections. For example, there can be two per-CPU lists, one is "active" and accessed per-CPU, while another one is inactive and worked on asynchronously "off CPU" (e.g. garbage collection is performed). Then at some point the two lists are swapped, and a fast RCU-like mechanism is required to make sure that the previously active list is no longer accessed. 
This patch introduces such a mechanism: in short, membarrier() syscall issues an IPI to a CPU, restarting a potentially active RSEQ critical section on the CPU. Signed-off-by: Peter Oskolkov Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mathieu Desnoyers Link: https://lkml.kernel.org/r/20200923233618.2572849-1-posk@google.com --- include/linux/sched/mm.h | 3 + include/linux/syscalls.h | 2 +- include/uapi/linux/membarrier.h | 26 ++++++++ kernel/sched/membarrier.c | 136 +++++++++++++++++++++++++++++++--------- 4 files changed, 136 insertions(+), 31 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index f889e332912f..15bfb06f2884 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -348,10 +348,13 @@ enum { MEMBARRIER_STATE_GLOBAL_EXPEDITED = (1U << 3), MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY = (1U << 4), MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE = (1U << 5), + MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY = (1U << 6), + MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ = (1U << 7), }; enum { MEMBARRIER_FLAG_SYNC_CORE = (1U << 0), + MEMBARRIER_FLAG_RSEQ = (1U << 1), }; #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 75ac7f8ae93c..06db09875aa4 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -974,7 +974,7 @@ asmlinkage long sys_execveat(int dfd, const char __user *filename, const char __user *const __user *argv, const char __user *const __user *envp, int flags); asmlinkage long sys_userfaultfd(int flags); -asmlinkage long sys_membarrier(int cmd, int flags); +asmlinkage long sys_membarrier(int cmd, unsigned int flags, int cpu_id); asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags); asmlinkage long sys_copy_file_range(int fd_in, loff_t __user *off_in, int fd_out, loff_t __user *off_out, diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h index 5891d7614c8c..737605897f36 100644 --- a/include/uapi/linux/membarrier.h +++ b/include/uapi/linux/membarrier.h @@ -114,6 +114,26 @@ * If this command is not implemented by an * architecture, -EINVAL is returned. * Returns 0 on success. + * @MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: + * Ensure the caller thread, upon return from + * system call, that all its running thread + * siblings have any currently running rseq + * critical sections restarted if @flags + * parameter is 0; if @flags parameter is + * MEMBARRIER_CMD_FLAG_CPU, + * then this operation is performed only + * on CPU indicated by @cpu_id. If this command is + * not implemented by an architecture, -EINVAL + * is returned. A process needs to register its + * intent to use the private expedited rseq + * command prior to using it, otherwise + * this command returns -EPERM. + * @MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ: + * Register the process intent to use + * MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ. + * If this command is not implemented by an + * architecture, -EINVAL is returned. + * Returns 0 on success. * @MEMBARRIER_CMD_SHARED: * Alias to MEMBARRIER_CMD_GLOBAL. Provided for * header backward compatibility. 
@@ -131,9 +151,15 @@ enum membarrier_cmd { MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = (1 << 4), MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE = (1 << 5), MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE = (1 << 6), + MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ = (1 << 7), + MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ = (1 << 8), /* Alias for header backward compatibility. */ MEMBARRIER_CMD_SHARED = MEMBARRIER_CMD_GLOBAL, }; +enum membarrier_cmd_flag { + MEMBARRIER_CMD_FLAG_CPU = (1 << 0), +}; + #endif /* _UAPI_LINUX_MEMBARRIER_H */ diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 168479a7d61b..e23e74d52db5 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -18,6 +18,14 @@ #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0 #endif +#ifdef CONFIG_RSEQ +#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK \ + (MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ \ + | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ) +#else +#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK 0 +#endif + #define MEMBARRIER_CMD_BITMASK \ (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \ | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ @@ -30,6 +38,11 @@ static void ipi_mb(void *info) smp_mb(); /* IPIs should be serializing but paranoid. */ } +static void ipi_rseq(void *info) +{ + rseq_preempt(current); +} + static void ipi_sync_rq_state(void *info) { struct mm_struct *mm = (struct mm_struct *) info; @@ -129,19 +142,27 @@ static int membarrier_global_expedited(void) return 0; } -static int membarrier_private_expedited(int flags) +static int membarrier_private_expedited(int flags, int cpu_id) { - int cpu; cpumask_var_t tmpmask; struct mm_struct *mm = current->mm; + smp_call_func_t ipi_func = ipi_mb; - if (flags & MEMBARRIER_FLAG_SYNC_CORE) { + if (flags == MEMBARRIER_FLAG_SYNC_CORE) { if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) return -EINVAL; if (!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY)) return -EPERM; + } else if (flags == MEMBARRIER_FLAG_RSEQ) { + if (!IS_ENABLED(CONFIG_RSEQ)) + return -EINVAL; + if (!(atomic_read(&mm->membarrier_state) & + MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY)) + return -EPERM; + ipi_func = ipi_rseq; } else { + WARN_ON_ONCE(flags); if (!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) return -EPERM; @@ -156,35 +177,59 @@ static int membarrier_private_expedited(int flags) */ smp_mb(); /* system call entry is not a mb. */ - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) return -ENOMEM; cpus_read_lock(); - rcu_read_lock(); - for_each_online_cpu(cpu) { + + if (cpu_id >= 0) { struct task_struct *p; - /* - * Skipping the current CPU is OK even though we can be - * migrated at any point. The current CPU, at the point - * where we read raw_smp_processor_id(), is ensured to - * be in program order with respect to the caller - * thread. Therefore, we can skip this CPU from the - * iteration.
- */ - if (cpu == raw_smp_processor_id()) - continue; - p = rcu_dereference(cpu_rq(cpu)->curr); - if (p && p->mm == mm) - __cpumask_set_cpu(cpu, tmpmask); + if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id)) + goto out; + if (cpu_id == raw_smp_processor_id()) + goto out; + rcu_read_lock(); + p = rcu_dereference(cpu_rq(cpu_id)->curr); + if (!p || p->mm != mm) { + rcu_read_unlock(); + goto out; + } + rcu_read_unlock(); + } else { + int cpu; + + rcu_read_lock(); + for_each_online_cpu(cpu) { + struct task_struct *p; + + /* + * Skipping the current CPU is OK even though we can be + * migrated at any point. The current CPU, at the point + * where we read raw_smp_processor_id(), is ensured to + * be in program order with respect to the caller + * thread. Therefore, we can skip this CPU from the + * iteration. + */ + if (cpu == raw_smp_processor_id()) + continue; + p = rcu_dereference(cpu_rq(cpu)->curr); + if (p && p->mm == mm) + __cpumask_set_cpu(cpu, tmpmask); + } + rcu_read_unlock(); } - rcu_read_unlock(); preempt_disable(); - smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + if (cpu_id >= 0) + smp_call_function_single(cpu_id, ipi_func, NULL, 1); + else + smp_call_function_many(tmpmask, ipi_func, NULL, 1); preempt_enable(); - free_cpumask_var(tmpmask); +out: + if (cpu_id < 0) + free_cpumask_var(tmpmask); cpus_read_unlock(); /* @@ -283,11 +328,18 @@ static int membarrier_register_private_expedited(int flags) set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED, ret; - if (flags & MEMBARRIER_FLAG_SYNC_CORE) { + if (flags == MEMBARRIER_FLAG_SYNC_CORE) { if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) return -EINVAL; ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY; + } else if (flags == MEMBARRIER_FLAG_RSEQ) { + if (!IS_ENABLED(CONFIG_RSEQ)) + return -EINVAL; + ready_state = + MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY; + } else { + WARN_ON_ONCE(flags); } /* @@ -299,6 +351,8 @@ static int membarrier_register_private_expedited(int flags) return 0; if (flags & MEMBARRIER_FLAG_SYNC_CORE) set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE; + if (flags & MEMBARRIER_FLAG_RSEQ) + set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ; atomic_or(set_state, &mm->membarrier_state); ret = sync_runqueues_membarrier_state(mm); if (ret) @@ -310,8 +364,15 @@ static int membarrier_register_private_expedited(int flags) /** * sys_membarrier - issue memory barriers on a set of threads - * @cmd: Takes command values defined in enum membarrier_cmd. - * @flags: Currently needs to be 0. For future extensions. + * @cmd: Takes command values defined in enum membarrier_cmd. + * @flags: Currently needs to be 0 for all commands other than + * MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: in the latter + * case it can be MEMBARRIER_CMD_FLAG_CPU, indicating that @cpu_id + * contains the CPU on which to interrupt (= restart) + * the RSEQ critical section. + * @cpu_id: if @flags == MEMBARRIER_CMD_FLAG_CPU, indicates the cpu on which + * RSEQ CS should be interrupted (@cmd must be + * MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ). * * If this system call is not implemented, -ENOSYS is returned.
If the * command specified does not exist, not available on the running @@ -337,10 +398,21 @@ static int membarrier_register_private_expedited(int flags) * smp_mb() X O O * sys_membarrier() O O O */ -SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) +SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id) { - if (unlikely(flags)) - return -EINVAL; + switch (cmd) { + case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: + if (unlikely(flags && flags != MEMBARRIER_CMD_FLAG_CPU)) + return -EINVAL; + break; + default: + if (unlikely(flags)) + return -EINVAL; + } + + if (!(flags & MEMBARRIER_CMD_FLAG_CPU)) + cpu_id = -1; + switch (cmd) { case MEMBARRIER_CMD_QUERY: { @@ -362,13 +434,17 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED: return membarrier_register_global_expedited(); case MEMBARRIER_CMD_PRIVATE_EXPEDITED: - return membarrier_private_expedited(0); + return membarrier_private_expedited(0, cpu_id); case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED: return membarrier_register_private_expedited(0); case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE: - return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE); + return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE, cpu_id); case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE: return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE); + case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: + return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id); + case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ: + return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ); default: return -EINVAL; } -- cgit v1.2.3 From a5fa25adf03d4b063aece74ba70ccbb3a71af122 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 24 Sep 2020 17:03:56 -0700 Subject: bpf: Change bpf_sk_release and bpf_sk_*cgroup_id to accept ARG_PTR_TO_BTF_ID_SOCK_COMMON The previous patch allows the networking bpf prog to use the bpf_skc_to_*() helpers to get a PTR_TO_BTF_ID socket pointer, e.g. "struct tcp_sock *". It allows the bpf prog to read all the fields of the tcp_sock. This patch changes the bpf_sk_release() and bpf_sk_*cgroup_id() to take ARG_PTR_TO_BTF_ID_SOCK_COMMON such that they will work with the pointer returned by the bpf_skc_to_*() helpers also. For example, the following will work: sk = bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0); if (!sk) return; tp = bpf_skc_to_tcp_sock(sk); if (!tp) { bpf_sk_release(sk); return; } lsndtime = tp->lsndtime; /* Passing tp to bpf_sk_release() will also work */ bpf_sk_release(tp); Since PTR_TO_BTF_ID could be NULL, a helper taking ARG_PTR_TO_BTF_ID_SOCK_COMMON has to check for NULL at runtime. A btf_id of "struct sock" may not always mean a fullsock. Regardless of whether a helper's running context can actually see a non-fullsock, the fullsock check/handling is pretty cheap, so it is better to keep the same verifier expectation that a helper taking ARG_PTR_TO_BTF_ID* is able to handle the minisock situation. In the bpf_sk_*cgroup_id() case, it will try to get a fullsock by using sk_to_full_sk(), as its skb variant bpf_sk"b"_*cgroup_id() has already been doing. bpf_sk_release() can already handle a minisock, so nothing special has to be done.
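Along the same lines, a hedged sketch of the cgroup-id helpers with a BTF pointer (declarations and the tuple setup are as in the example above; a program type where all of these helpers are available, e.g. cgroup_skb, is assumed):

	sk = bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
	if (!sk)
		return;

	tp = bpf_skc_to_tcp_sock(sk);
	if (tp)
		/* The helper performs sk_to_full_sk() and the fullsock
		 * check at runtime, returning 0 if no id can be retrieved.
		 */
		cgid = bpf_sk_cgroup_id(tp);

	bpf_sk_release(sk);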
Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200925000356.3856047-1-kafai@fb.com --- include/uapi/linux/bpf.h | 8 ++++---- net/core/filter.c | 30 ++++++++++++++---------------- tools/include/uapi/linux/bpf.h | 8 ++++---- 3 files changed, 22 insertions(+), 24 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a22812561064..c96a56d9c3be 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2512,7 +2512,7 @@ union bpf_attr { * result is from *reuse*\ **->socks**\ [] using the hash of the * tuple. * - * long bpf_sk_release(struct bpf_sock *sock) + * long bpf_sk_release(void *sock) * Description * Release the reference held by *sock*. *sock* must be a * non-**NULL** pointer that was returned from @@ -3234,11 +3234,11 @@ union bpf_attr { * * **-EOVERFLOW** if an overflow happened: The same object will be tried again. * - * u64 bpf_sk_cgroup_id(struct bpf_sock *sk) + * u64 bpf_sk_cgroup_id(void *sk) * Description * Return the cgroup v2 id of the socket *sk*. * - * *sk* must be a non-**NULL** pointer to a full socket, e.g. one + * *sk* must be a non-**NULL** pointer to a socket, e.g. one * returned from **bpf_sk_lookup_xxx**\ (), * **bpf_sk_fullsock**\ (), etc. The format of returned id is * same as in **bpf_skb_cgroup_id**\ (). @@ -3248,7 +3248,7 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * u64 bpf_sk_ancestor_cgroup_id(struct bpf_sock *sk, int ancestor_level) + * u64 bpf_sk_ancestor_cgroup_id(void *sk, int ancestor_level) * Description * Return id of cgroup v2 that is ancestor of cgroup associated * with the *sk* at the *ancestor_level*. The root cgroup is at diff --git a/net/core/filter.c b/net/core/filter.c index 6d1864f2bd51..06d397eeef2a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4088,18 +4088,17 @@ static inline u64 __bpf_sk_cgroup_id(struct sock *sk) { struct cgroup *cgrp; + sk = sk_to_full_sk(sk); + if (!sk || !sk_fullsock(sk)) + return 0; + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); return cgroup_id(cgrp); } BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) { - struct sock *sk = skb_to_full_sk(skb); - - if (!sk || !sk_fullsock(sk)) - return 0; - - return __bpf_sk_cgroup_id(sk); + return __bpf_sk_cgroup_id(skb->sk); } static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { @@ -4115,6 +4114,10 @@ static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk, struct cgroup *ancestor; struct cgroup *cgrp; + sk = sk_to_full_sk(sk); + if (!sk || !sk_fullsock(sk)) + return 0; + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); ancestor = cgroup_ancestor(cgrp, ancestor_level); if (!ancestor) @@ -4126,12 +4129,7 @@ static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk, BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, ancestor_level) { - struct sock *sk = skb_to_full_sk(skb); - - if (!sk || !sk_fullsock(sk)) - return 0; - - return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level); + return __bpf_sk_ancestor_cgroup_id(skb->sk, ancestor_level); } static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { @@ -4151,7 +4149,7 @@ static const struct bpf_func_proto bpf_sk_cgroup_id_proto = { .func = bpf_sk_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_SOCKET, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, }; BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level) @@ -4163,7 +4161,7 @@ 
static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = { .func = bpf_sk_ancestor_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_SOCKET, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_ANYTHING, }; #endif @@ -5697,7 +5695,7 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { BPF_CALL_1(bpf_sk_release, struct sock *, sk) { - if (sk_is_refcounted(sk)) + if (sk && sk_is_refcounted(sk)) sock_gen_put(sk); return 0; } @@ -5706,7 +5704,7 @@ static const struct bpf_func_proto bpf_sk_release_proto = { .func = bpf_sk_release, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_SOCK_COMMON, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, }; BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx, diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a22812561064..c96a56d9c3be 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2512,7 +2512,7 @@ union bpf_attr { * result is from *reuse*\ **->socks**\ [] using the hash of the * tuple. * - * long bpf_sk_release(struct bpf_sock *sock) + * long bpf_sk_release(void *sock) * Description * Release the reference held by *sock*. *sock* must be a * non-**NULL** pointer that was returned from @@ -3234,11 +3234,11 @@ union bpf_attr { * * **-EOVERFLOW** if an overflow happened: The same object will be tried again. * - * u64 bpf_sk_cgroup_id(struct bpf_sock *sk) + * u64 bpf_sk_cgroup_id(void *sk) * Description * Return the cgroup v2 id of the socket *sk*. * - * *sk* must be a non-**NULL** pointer to a full socket, e.g. one + * *sk* must be a non-**NULL** pointer to a socket, e.g. one * returned from **bpf_sk_lookup_xxx**\ (), * **bpf_sk_fullsock**\ (), etc. The format of returned id is * same as in **bpf_skb_cgroup_id**\ (). @@ -3248,7 +3248,7 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * u64 bpf_sk_ancestor_cgroup_id(struct bpf_sock *sk, int ancestor_level) + * u64 bpf_sk_ancestor_cgroup_id(void *sk, int ancestor_level) * Description * Return id of cgroup v2 that is ancestor of cgroup associated * with the *sk* at the *ancestor_level*. The root cgroup is at -- cgit v1.2.3 From 592a3498648af000e93dff2d36229ab11cd8c7f6 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 24 Sep 2020 17:04:02 -0700 Subject: bpf: Change bpf_sk_storage_*() to accept ARG_PTR_TO_BTF_ID_SOCK_COMMON This patch changes the bpf_sk_storage_*() helpers to take ARG_PTR_TO_BTF_ID_SOCK_COMMON such that they will work with the pointer returned by the bpf_skc_to_*() helpers also. A micro benchmark has been done on a "cgroup_skb/egress" bpf program which does a bpf_sk_storage_get(). It was driven by netperf doing a 4096-connection UDP_STREAM test with 64-byte packets. The stats from "kernel.bpf_stats_enabled" show no meaningful difference. The sk_storage_get_btf_proto, sk_storage_delete_btf_proto, btf_sk_storage_get_proto, and btf_sk_storage_delete_proto are no longer needed, so they are removed.
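For illustration, a hedged sketch of the call pattern this enables (the map sk_storage_map and the surrounding program context are hypothetical, mirroring the example in the previous commit):

	sk = bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
	if (!sk)
		return;

	tp = bpf_skc_to_tcp_sock(sk);
	if (tp)
		/* Accepted now that arg2 is ARG_PTR_TO_BTF_ID_SOCK_COMMON;
		 * returns NULL at runtime if tp is not a fullsock.
		 */
		val = bpf_sk_storage_get(&sk_storage_map, tp, NULL,
					 BPF_SK_STORAGE_GET_F_CREATE);

	bpf_sk_release(sk);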
Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20200925000402.3856307-1-kafai@fb.com --- include/net/bpf_sk_storage.h | 2 -- include/uapi/linux/bpf.h | 1 + kernel/bpf/bpf_lsm.c | 4 ++-- net/core/bpf_sk_storage.c | 29 ++++++----------------------- net/ipv4/bpf_tcp_ca.c | 23 ++--------------------- tools/include/uapi/linux/bpf.h | 1 + 6 files changed, 12 insertions(+), 48 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h index 119f4c9c3a9c..3c516dd07caf 100644 --- a/include/net/bpf_sk_storage.h +++ b/include/net/bpf_sk_storage.h @@ -20,8 +20,6 @@ void bpf_sk_storage_free(struct sock *sk); extern const struct bpf_func_proto bpf_sk_storage_get_proto; extern const struct bpf_func_proto bpf_sk_storage_delete_proto; -extern const struct bpf_func_proto sk_storage_get_btf_proto; -extern const struct bpf_func_proto sk_storage_delete_btf_proto; struct bpf_local_storage_elem; struct bpf_sk_storage_diag; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c96a56d9c3be..0ec6dbeb17a5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2861,6 +2861,7 @@ union bpf_attr { * 0 on success. * * **-ENOENT** if the bpf-local-storage cannot be found. + * **-EINVAL** if sk is not a fullsock (e.g. a request_sock). * * long bpf_send_signal(u32 sig) * Description diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 9cd1428c7199..78ea8a7bd27f 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -56,9 +56,9 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_inode_storage_delete: return &bpf_inode_storage_delete_proto; case BPF_FUNC_sk_storage_get: - return &sk_storage_get_btf_proto; + return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: - return &sk_storage_delete_btf_proto; + return &bpf_sk_storage_delete_proto; default: return tracing_prog_func_proto(func_id, prog); } diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 838efc682cff..c907f0dc7f87 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -269,7 +269,7 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, { struct bpf_local_storage_data *sdata; - if (flags > BPF_SK_STORAGE_GET_F_CREATE) + if (!sk || !sk_fullsock(sk) || flags > BPF_SK_STORAGE_GET_F_CREATE) return (unsigned long)NULL; sdata = sk_storage_lookup(sk, map, true); @@ -299,6 +299,9 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) { + if (!sk || !sk_fullsock(sk)) + return -EINVAL; + if (refcount_inc_not_zero(&sk->sk_refcnt)) { int err; @@ -355,7 +358,7 @@ const struct bpf_func_proto bpf_sk_storage_get_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_SOCKET, + .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, }; @@ -375,27 +378,7 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_SOCKET, -}; - -const struct bpf_func_proto sk_storage_get_btf_proto = { - .func = bpf_sk_storage_get, - .gpl_only = false, - .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, - .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_BTF_ID, - .arg2_btf_id = 
&btf_sock_ids[BTF_SOCK_TYPE_SOCK], - .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, - .arg4_type = ARG_ANYTHING, -}; - -const struct bpf_func_proto sk_storage_delete_btf_proto = { - .func = bpf_sk_storage_delete, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_BTF_ID, - .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], + .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, }; struct bpf_sk_storage_diag { diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 74a2ef598c31..618954f82764 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -28,22 +28,6 @@ static u32 unsupported_ops[] = { static const struct btf_type *tcp_sock_type; static u32 tcp_sock_id, sock_id; -static struct bpf_func_proto btf_sk_storage_get_proto __read_mostly; -static struct bpf_func_proto btf_sk_storage_delete_proto __read_mostly; - -static void convert_sk_func_proto(struct bpf_func_proto *to, const struct bpf_func_proto *from) -{ - int i; - - *to = *from; - for (i = 0; i < ARRAY_SIZE(to->arg_type); i++) { - if (to->arg_type[i] == ARG_PTR_TO_SOCKET) { - to->arg_type[i] = ARG_PTR_TO_BTF_ID; - to->arg_btf_id[i] = &tcp_sock_id; - } - } -} - static int bpf_tcp_ca_init(struct btf *btf) { s32 type_id; @@ -59,9 +43,6 @@ static int bpf_tcp_ca_init(struct btf *btf) tcp_sock_id = type_id; tcp_sock_type = btf_type_by_id(btf, tcp_sock_id); - convert_sk_func_proto(&btf_sk_storage_get_proto, &bpf_sk_storage_get_proto); - convert_sk_func_proto(&btf_sk_storage_delete_proto, &bpf_sk_storage_delete_proto); - return 0; } @@ -188,9 +169,9 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, case BPF_FUNC_tcp_send_ack: return &bpf_tcp_send_ack_proto; case BPF_FUNC_sk_storage_get: - return &btf_sk_storage_get_proto; + return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: - return &btf_sk_storage_delete_proto; + return &bpf_sk_storage_delete_proto; default: return bpf_base_func_proto(func_id); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index c96a56d9c3be..0ec6dbeb17a5 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2861,6 +2861,7 @@ union bpf_attr { * 0 on success. * * **-ENOENT** if the bpf-local-storage cannot be found. + * **-EINVAL** if sk is not a fullsock (e.g. a request_sock). * * long bpf_send_signal(u32 sig) * Description -- cgit v1.2.3 From c0df236e1394970f3503a8fb103de95d000014ca Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 24 Sep 2020 17:04:09 -0700 Subject: bpf: Change bpf_tcp_*_syncookie to accept ARG_PTR_TO_BTF_ID_SOCK_COMMON This patch changes the bpf_tcp_*_syncookie() to take ARG_PTR_TO_BTF_ID_SOCK_COMMON such that they will work with the pointer returned by the bpf_skc_to_*() helpers also. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20200925000409.3856725-1-kafai@fb.com --- include/uapi/linux/bpf.h | 4 ++-- net/core/filter.c | 8 ++++---- tools/include/uapi/linux/bpf.h | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0ec6dbeb17a5..69b9e30375bc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2692,7 +2692,7 @@ union bpf_attr { * result is from *reuse*\ **->socks**\ [] using the hash of the * tuple. 
* - * long bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * long bpf_tcp_check_syncookie(void *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) * Description * Check whether *iph* and *th* contain a valid SYN cookie ACK for * the listening socket in *sk*. @@ -2878,7 +2878,7 @@ union bpf_attr { * * **-EAGAIN** if bpf program can try again. * - * s64 bpf_tcp_gen_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * s64 bpf_tcp_gen_syncookie(void *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) * Description * Try to issue a SYN cookie for the packet with corresponding * IP/TCP headers, *iph* and *th*, on the listening socket in *sk*. diff --git a/net/core/filter.c b/net/core/filter.c index 06d397eeef2a..1d88e9b498eb 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6086,7 +6086,7 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len u32 cookie; int ret; - if (unlikely(th_len < sizeof(*th))) + if (unlikely(!sk || th_len < sizeof(*th))) return -EINVAL; /* sk_listener() allows TCP_NEW_SYN_RECV, which makes no sense here. */ @@ -6139,7 +6139,7 @@ static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = { .gpl_only = true, .pkt_access = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_SOCK_COMMON, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_PTR_TO_MEM, @@ -6153,7 +6153,7 @@ BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len, u32 cookie; u16 mss; - if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4)) + if (unlikely(!sk || th_len < sizeof(*th) || th_len != th->doff * 4)) return -EINVAL; if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN) @@ -6208,7 +6208,7 @@ static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = { .gpl_only = true, /* __cookie_v*_init_sequence() is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_SOCK_COMMON, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_PTR_TO_MEM, diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0ec6dbeb17a5..69b9e30375bc 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2692,7 +2692,7 @@ union bpf_attr { * result is from *reuse*\ **->socks**\ [] using the hash of the * tuple. * - * long bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * long bpf_tcp_check_syncookie(void *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) * Description * Check whether *iph* and *th* contain a valid SYN cookie ACK for * the listening socket in *sk*. @@ -2878,7 +2878,7 @@ union bpf_attr { * * **-EAGAIN** if bpf program can try again. * - * s64 bpf_tcp_gen_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * s64 bpf_tcp_gen_syncookie(void *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) * Description * Try to issue a SYN cookie for the packet with corresponding * IP/TCP headers, *iph* and *th*, on the listening socket in *sk*. 
-- cgit v1.2.3 From 27e5203bd9c5cc6d54dcac48c3027f3f04522b8b Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 24 Sep 2020 17:04:15 -0700 Subject: bpf: Change bpf_sk_assign to accept ARG_PTR_TO_BTF_ID_SOCK_COMMON This patch changes the bpf_sk_assign() to take ARG_PTR_TO_BTF_ID_SOCK_COMMON such that they will work with the pointer returned by the bpf_skc_to_*() helpers also. The bpf_sk_lookup_assign() is taking ARG_PTR_TO_SOCKET_"OR_NULL". Meaning it specifically takes a literal NULL. ARG_PTR_TO_BTF_ID_SOCK_COMMON does not allow a literal NULL, so another ARG type is required for this purpose and another follow-up patch can be used if there is such need. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200925000415.3857374-1-kafai@fb.com --- include/uapi/linux/bpf.h | 2 +- net/core/filter.c | 4 ++-- tools/include/uapi/linux/bpf.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 69b9e30375bc..2d6519a2ed77 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3107,7 +3107,7 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * long bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags) + * long bpf_sk_assign(struct sk_buff *skb, void *sk, u64 flags) * Description * Helper is overloaded depending on BPF program type. This * description applies to **BPF_PROG_TYPE_SCHED_CLS** and diff --git a/net/core/filter.c b/net/core/filter.c index 1d88e9b498eb..af88935e24b1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6217,7 +6217,7 @@ static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = { BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags) { - if (flags != 0) + if (!sk || flags != 0) return -EINVAL; if (!skb_at_tc_ingress(skb)) return -EOPNOTSUPP; @@ -6241,7 +6241,7 @@ static const struct bpf_func_proto bpf_sk_assign_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_SOCK_COMMON, + .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg3_type = ARG_ANYTHING, }; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 69b9e30375bc..2d6519a2ed77 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3107,7 +3107,7 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * long bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags) + * long bpf_sk_assign(struct sk_buff *skb, void *sk, u64 flags) * Description * Helper is overloaded depending on BPF program type. This * description applies to **BPF_PROG_TYPE_SCHED_CLS** and -- cgit v1.2.3 From 5d5b4128c4caae34ddcd9b2dc30ac4d6155617a3 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 25 Sep 2020 13:46:07 -0700 Subject: devlink: introduce flash update overwrite mask Sections of device flash may contain settings or device identifying information. When performing a flash update, it is generally expected that these settings and identifiers are not overwritten. However, it may sometimes be useful to allow overwriting these fields when performing a flash update. 
Some examples include: 1) customizing the initial device config on first programming, such as overwriting default device identifying information, or 2) reverting a device configuration to a known good state provided in the new firmware image, or 3) in case it is suspected that the current firmware logic for managing the preservation of fields during an update is broken. Although some devices are able to completely separate these types of settings and fields into separate components, this is not true for all hardware. To support controlling this behavior, a new DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK is defined. This is an nla_bitfield32 which will define what subset of fields in a component should be overwritten during an update. If no bits are specified, or if the overwrite mask is not provided, then an update should not overwrite anything, and should maintain the settings and identifiers as they are in the previous image. If the overwrite mask has the DEVLINK_FLASH_OVERWRITE_SETTINGS bit set, then the device should be configured to overwrite any of the settings in the requested component with settings found in the provided image. Similarly, if the DEVLINK_FLASH_OVERWRITE_IDENTIFIERS bit is set, the device should be configured to overwrite any device identifiers in the requested component with the identifiers from the image. Multiple overwrite modes may be combined to indicate that a combination of these sets of fields should be overwritten. Drivers which support the new overwrite mask must set the DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK in the supported_flash_update_params field of their devlink_ops. Signed-off-by: Jacob Keller Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-flash.rst | 28 ++++++++++++++++++++++ include/net/devlink.h | 4 +++- include/uapi/linux/devlink.h | 23 ++++++++++++++++++ net/core/devlink.c | 17 ++++++++++++- 4 files changed, 70 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/devlink/devlink-flash.rst b/Documentation/networking/devlink/devlink-flash.rst index 40a87c0222cb..603e732f00cc 100644 --- a/Documentation/networking/devlink/devlink-flash.rst +++ b/Documentation/networking/devlink/devlink-flash.rst @@ -16,6 +16,34 @@ Note that the file name is a path relative to the firmware loading path (usually ``/lib/firmware/``). Drivers may send status updates to inform user space about the progress of the update operation. +Overwrite Mask +============== + +The ``devlink-flash`` command allows optionally specifying a mask indicating +how the device should handle subsections of flash components when updating. +This mask indicates the set of sections which are allowed to be overwritten. + +.. list-table:: List of overwrite mask bits + :widths: 5 95 + + * - Name + - Description + * - ``DEVLINK_FLASH_OVERWRITE_SETTINGS`` + - Indicates that the device should overwrite settings in the components + being updated with the settings found in the provided image. + * - ``DEVLINK_FLASH_OVERWRITE_IDENTIFIERS`` + - Indicates that the device should overwrite identifiers in the + components being updated with the identifiers found in the provided + image. This includes MAC addresses, serial IDs, and similar device + identifiers. + +Multiple overwrite bits may be combined and requested together. If no bits +are provided, it is expected that the device only update firmware binaries
Settings and identifiers are expected to be +preserved across the update. A device may not support every combination and +the driver for such a device must reject any combination which cannot be +faithfully implemented. + Firmware Loading ================ diff --git a/include/net/devlink.h b/include/net/devlink.h index 7794e1601772..7339bf9ba6b4 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -562,9 +562,11 @@ enum devlink_param_generic_id { struct devlink_flash_update_params { const char *file_name; const char *component; + u32 overwrite_mask; }; -#define DEVLINK_SUPPORT_FLASH_UPDATE_COMPONENT BIT(0) +#define DEVLINK_SUPPORT_FLASH_UPDATE_COMPONENT BIT(0) +#define DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK BIT(1) struct devlink_region; struct devlink_info_req; diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index a2ecc8b00611..7b0face1bad5 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -230,6 +230,28 @@ enum { DEVLINK_ATTR_STATS_MAX = __DEVLINK_ATTR_STATS_MAX - 1 }; +/* Specify what sections of a flash component can be overwritten when + * performing an update. Overwriting of firmware binary sections is always + * implicitly assumed to be allowed. + * + * Each section must be documented in + * Documentation/networking/devlink/devlink-flash.rst + * + */ +enum { + DEVLINK_FLASH_OVERWRITE_SETTINGS_BIT, + DEVLINK_FLASH_OVERWRITE_IDENTIFIERS_BIT, + + __DEVLINK_FLASH_OVERWRITE_MAX_BIT, + DEVLINK_FLASH_OVERWRITE_MAX_BIT = __DEVLINK_FLASH_OVERWRITE_MAX_BIT - 1 +}; + +#define DEVLINK_FLASH_OVERWRITE_SETTINGS _BITUL(DEVLINK_FLASH_OVERWRITE_SETTINGS_BIT) +#define DEVLINK_FLASH_OVERWRITE_IDENTIFIERS _BITUL(DEVLINK_FLASH_OVERWRITE_IDENTIFIERS_BIT) + +#define DEVLINK_SUPPORTED_FLASH_OVERWRITE_SECTIONS \ + (_BITUL(__DEVLINK_FLASH_OVERWRITE_MAX_BIT) - 1) + /** * enum devlink_trap_action - Packet trap action. 
* @DEVLINK_TRAP_ACTION_DROP: Packet is dropped by the device and a copy is not @@ -464,6 +486,7 @@ enum devlink_attr { DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, /* u32 */ DEVLINK_ATTR_FLASH_UPDATE_STATUS_TIMEOUT, /* u64 */ + DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK, /* bitfield32 */ /* add new attributes above here, update the policy in devlink.c */ diff --git a/net/core/devlink.c b/net/core/devlink.c index 6766f9ef3152..7a38f9e25922 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -3147,9 +3147,9 @@ EXPORT_SYMBOL_GPL(devlink_flash_update_timeout_notify); static int devlink_nl_cmd_flash_update(struct sk_buff *skb, struct genl_info *info) { + struct nlattr *nla_component, *nla_overwrite_mask; struct devlink_flash_update_params params = {}; struct devlink *devlink = info->user_ptr[0]; - struct nlattr *nla_component; u32 supported_params; if (!devlink->ops->flash_update) @@ -3172,6 +3172,19 @@ static int devlink_nl_cmd_flash_update(struct sk_buff *skb, params.component = nla_data(nla_component); } + nla_overwrite_mask = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK]; + if (nla_overwrite_mask) { + struct nla_bitfield32 sections; + + if (!(supported_params & DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK)) { + NL_SET_ERR_MSG_ATTR(info->extack, nla_overwrite_mask, + "overwrite settings are not supported by this device"); + return -EOPNOTSUPP; + } + sections = nla_get_bitfield32(nla_overwrite_mask); + params.overwrite_mask = sections.value & sections.selector; + } + return devlink->ops->flash_update(devlink, &params, info->extack); } @@ -7093,6 +7106,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 }, [DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_FLASH_UPDATE_COMPONENT] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK] = + NLA_POLICY_BITFIELD32(DEVLINK_SUPPORTED_FLASH_OVERWRITE_SECTIONS), [DEVLINK_ATTR_TRAP_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_TRAP_ACTION] = { .type = NLA_U8 }, [DEVLINK_ATTR_TRAP_GROUP_NAME] = { .type = NLA_NUL_STRING }, -- cgit v1.2.3 From b38c73ca1c213bbf8a872b334a6bb835becfaba5 Mon Sep 17 00:00:00 2001 From: Dafna Hirschfeld Date: Thu, 27 Aug 2020 21:46:08 +0200 Subject: media: v4l2: add support for colorspace conversion API (CSC) for video capture For video capture it is the driver that reports the colorspace, transfer function, Y'CbCr/HSV encoding and quantization range used by the video, and there is no way to request something different, even though many HDTV receivers have some sort of colorspace conversion capabilities. For output video this feature already exists since the application specifies this information for the video format it will send out, and the transmitter will enable any available CSC if a format conversion has to be performed in order to match the capabilities of the sink. For video capture we propose adding a new v4l2_pix_format flag: V4L2_PIX_FMT_FLAG_SET_CSC. The flag is set by the application; the driver will interpret the colorspace, xfer_func, ycbcr_enc/hsv_enc and quantization fields as the requested colorspace information and will attempt whatever conversion it supports. Drivers set the flags V4L2_FMT_FLAG_CSC_COLORSPACE, V4L2_FMT_FLAG_CSC_XFER_FUNC, V4L2_FMT_FLAG_CSC_YCBCR_ENC/V4L2_FMT_FLAG_CSC_HSV_ENC, V4L2_FMT_FLAG_CSC_QUANTIZATION, in the flags field of the struct v4l2_fmtdesc during enumeration to indicate that they support colorspace conversion for the respective field.
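As a hedged illustration of the application side (fd is an open capture video node; the requested values are examples only, and error handling is elided):

	struct v4l2_format fmt = {
		.type = V4L2_BUF_TYPE_VIDEO_CAPTURE,
	};

	ioctl(fd, VIDIOC_G_FMT, &fmt);
	/* Request conversion to Rec. 709 with full-range quantization;
	 * fields set to *_DEFAULT keep the received colorimetry.
	 */
	fmt.fmt.pix.flags |= V4L2_PIX_FMT_FLAG_SET_CSC;
	fmt.fmt.pix.colorspace = V4L2_COLORSPACE_REC709;
	fmt.fmt.pix.xfer_func = V4L2_XFER_FUNC_DEFAULT;
	fmt.fmt.pix.ycbcr_enc = V4L2_YCBCR_ENC_DEFAULT;
	fmt.fmt.pix.quantization = V4L2_QUANTIZATION_FULL_RANGE;
	ioctl(fd, VIDIOC_S_FMT, &fmt);
	/* On return the driver reports the colorimetry it will actually
	 * produce; requests it cannot honor come back replaced with
	 * supported values.
	 */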
Drivers do not have to actually look at the flags. If the flags are not set, then the fields 'colorspace', 'xfer_func', 'ycbcr_enc/hsv_enc', and 'quantization' are set to the default values by the core, i.e. just pass on the received format without conversion. Signed-off-by: Hans Verkuil Signed-off-by: Philipp Zabel Signed-off-by: Dafna Hirschfeld Signed-off-by: Mauro Carvalho Chehab --- .../userspace-api/media/v4l/pixfmt-v4l2-mplane.rst | 16 ++---- .../userspace-api/media/v4l/pixfmt-v4l2.rst | 64 ++++++++++++++++++++-- .../userspace-api/media/v4l/vidioc-enum-fmt.rst | 35 ++++++++++++ .../userspace-api/media/videodev2.h.rst.exceptions | 5 ++ include/uapi/linux/videodev2.h | 6 ++ 5 files changed, 109 insertions(+), 17 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/media/v4l/pixfmt-v4l2-mplane.rst b/Documentation/userspace-api/media/v4l/pixfmt-v4l2-mplane.rst index ac82882135ae..977facc3a1f4 100644 --- a/Documentation/userspace-api/media/v4l/pixfmt-v4l2-mplane.rst +++ b/Documentation/userspace-api/media/v4l/pixfmt-v4l2-mplane.rst @@ -98,29 +98,21 @@ describing all planes of that format. * - __u8 - ``ycbcr_enc`` - Y'CbCr encoding, from enum :c:type:`v4l2_ycbcr_encoding`. - This information supplements the ``colorspace`` and must be set by - the driver for capture streams and by the application for output - streams, see :ref:`colorspaces`. + See struct :c:type:`v4l2_pix_format`. * - __u8 - ``hsv_enc`` - HSV encoding, from enum :c:type:`v4l2_hsv_encoding`. - This information supplements the ``colorspace`` and must be set by - the driver for capture streams and by the application for output - streams, see :ref:`colorspaces`. + See struct :c:type:`v4l2_pix_format`. * - } - * - __u8 - ``quantization`` - Quantization range, from enum :c:type:`v4l2_quantization`. - This information supplements the ``colorspace`` and must be set by - the driver for capture streams and by the application for output - streams, see :ref:`colorspaces`. + See struct :c:type:`v4l2_pix_format`. * - __u8 - ``xfer_func`` - Transfer function, from enum :c:type:`v4l2_xfer_func`. - This information supplements the ``colorspace`` and must be set by - the driver for capture streams and by the application for output - streams, see :ref:`colorspaces`. + See struct :c:type:`v4l2_pix_format`. * - __u8 - ``reserved[7]`` - Reserved for future extensions. Should be zeroed by drivers and diff --git a/Documentation/userspace-api/media/v4l/pixfmt-v4l2.rst b/Documentation/userspace-api/media/v4l/pixfmt-v4l2.rst index 8424d6f53b0c..71e828093310 100644 --- a/Documentation/userspace-api/media/v4l/pixfmt-v4l2.rst +++ b/Documentation/userspace-api/media/v4l/pixfmt-v4l2.rst @@ -109,7 +109,14 @@ Single-planar format structure - Image colorspace, from enum :c:type:`v4l2_colorspace`. This information supplements the ``pixelformat`` and must be set by the driver for capture streams and by the application for - output streams, see :ref:`colorspaces`. + output streams, see :ref:`colorspaces`. If the application sets the + flag ``V4L2_PIX_FMT_FLAG_SET_CSC`` then the application can set + this field for a capture stream to request a specific colorspace + for the captured image data. If the driver cannot handle requested + conversion, it will return another supported colorspace. + The driver indicates that colorspace conversion is supported by setting + the flag V4L2_FMT_FLAG_CSC_COLORSPACE in the corresponding struct + :c:type:`v4l2_fmtdesc` during enumeration. See :ref:`fmtdesc-flags`. 
* - __u32 - ``priv`` - This field indicates whether the remaining fields of the @@ -146,13 +153,29 @@ Single-planar format structure - Y'CbCr encoding, from enum :c:type:`v4l2_ycbcr_encoding`. This information supplements the ``colorspace`` and must be set by the driver for capture streams and by the application for output - streams, see :ref:`colorspaces`. + streams, see :ref:`colorspaces`. If the application sets the + flag ``V4L2_PIX_FMT_FLAG_SET_CSC`` then the application can set + this field for a capture stream to request a specific Y'CbCr encoding + for the captured image data. If the driver cannot handle requested + conversion, it will return another supported encoding. + This field is ignored for HSV pixelformats. The driver indicates that + ycbcr_enc conversion is supported by setting the flag + V4L2_FMT_FLAG_CSC_YCBCR_ENC in the corresponding struct + :c:type:`v4l2_fmtdesc` during enumeration. See :ref:`fmtdesc-flags`. * - __u32 - ``hsv_enc`` - HSV encoding, from enum :c:type:`v4l2_hsv_encoding`. This information supplements the ``colorspace`` and must be set by the driver for capture streams and by the application for output - streams, see :ref:`colorspaces`. + streams, see :ref:`colorspaces`. If the application sets the flag + ``V4L2_PIX_FMT_FLAG_SET_CSC`` then the application can set this + field for a capture stream to request a specific HSV encoding for the + captured image data. If the driver cannot handle requested + conversion, it will return another supported encoding. + This field is ignored for non-HSV pixelformats. The driver indicates + that hsv_enc conversion is supported by setting the flag + V4L2_FMT_FLAG_CSC_HSV_ENC in the corresponding struct + :c:type:`v4l2_fmtdesc` during enumeration. See :ref:`fmtdesc-flags`. * - } - * - __u32 @@ -160,13 +183,27 @@ Single-planar format structure - Quantization range, from enum :c:type:`v4l2_quantization`. This information supplements the ``colorspace`` and must be set by the driver for capture streams and by the application for output - streams, see :ref:`colorspaces`. + streams, see :ref:`colorspaces`. If the application sets the flag + ``V4L2_PIX_FMT_FLAG_SET_CSC`` then the application can set + this field for a capture stream to request a specific quantization + range for the captured image data. If the driver cannot handle requested + conversion, it will return another supported quantization. + The driver indicates that quantization conversion is supported by setting + the flag V4L2_FMT_FLAG_CSC_QUANTIZATION in the corresponding struct + :c:type:`v4l2_fmtdesc` during enumeration. See :ref:`fmtdesc-flags`. * - __u32 - ``xfer_func`` - Transfer function, from enum :c:type:`v4l2_xfer_func`. This information supplements the ``colorspace`` and must be set by the driver for capture streams and by the application for output - streams, see :ref:`colorspaces`. + streams, see :ref:`colorspaces`. If the application sets the flag + ``V4L2_PIX_FMT_FLAG_SET_CSC`` then the application can set + this field for a capture stream to request a specific transfer function + for the captured image data. If the driver cannot handle requested + conversion, it will return another supported transfer function. + The driver indicates that xfer_func conversion is supported by setting + the flag V4L2_FMT_FLAG_CSC_XFER_FUNC in the corresponding struct + :c:type:`v4l2_fmtdesc` during enumeration. See :ref:`fmtdesc-flags`. .. 
tabularcolumns:: |p{6.6cm}|p{2.2cm}|p{8.7cm}| @@ -184,3 +221,20 @@ Single-planar format structure by RGBA values (128, 192, 255, 128), the same pixel described with premultiplied colors would be described by RGBA values (64, 96, 128, 128) + * .. _`v4l2-pix-fmt-flag-set-csc`: + + - ``V4L2_PIX_FMT_FLAG_SET_CSC`` + - 0x00000002 + - Set by the application. It is only used for capture and is + ignored for output streams. If set, then request the device to do + colorspace conversion from the received colorspace to the requested + colorspace values. If the colorimetry field (``colorspace``, ``xfer_func``, + ``ycbcr_enc``, ``hsv_enc`` or ``quantization``) is set to ``*_DEFAULT``, + then that colorimetry setting will remain unchanged from what was received. + So in order to change the quantization, only the ``quantization`` field shall + be set to non default value (``V4L2_QUANTIZATION_FULL_RANGE`` or + ``V4L2_QUANTIZATION_LIM_RANGE``) and all other colorimetry fields shall + be set to ``*_DEFAULT``. + + To check which conversions are supported by the hardware for the current + pixel format, see :ref:`fmtdesc-flags`. diff --git a/Documentation/userspace-api/media/v4l/vidioc-enum-fmt.rst b/Documentation/userspace-api/media/v4l/vidioc-enum-fmt.rst index 296b7d437431..b8347a96a554 100644 --- a/Documentation/userspace-api/media/v4l/vidioc-enum-fmt.rst +++ b/Documentation/userspace-api/media/v4l/vidioc-enum-fmt.rst @@ -191,6 +191,41 @@ the ``mbus_code`` field is handled differently: This flag can only be used in combination with the ``V4L2_FMT_FLAG_COMPRESSED`` flag, since this applies to compressed formats only. This flag is valid for stateful encoders only. + * - ``V4L2_FMT_FLAG_CSC_COLORSPACE`` + - 0x0020 + - The driver allows the application to try to change the default + colorspace. This flag is relevant only for capture devices. + The application can ask to configure the colorspace of the capture device + when calling the :ref:`VIDIOC_S_FMT ` ioctl with + :ref:`V4L2_PIX_FMT_FLAG_SET_CSC ` set. + * - ``V4L2_FMT_FLAG_CSC_XFER_FUNC`` + - 0x0040 + - The driver allows the application to try to change the default + transfer function. This flag is relevant only for capture devices. + The application can ask to configure the transfer function of the capture + device when calling the :ref:`VIDIOC_S_FMT ` ioctl with + :ref:`V4L2_PIX_FMT_FLAG_SET_CSC ` set. + * - ``V4L2_FMT_FLAG_CSC_YCBCR_ENC`` + - 0x0080 + - The driver allows the application to try to change the default + Y'CbCr encoding. This flag is relevant only for capture devices. + The application can ask to configure the Y'CbCr encoding of the capture device + when calling the :ref:`VIDIOC_S_FMT ` ioctl with + :ref:`V4L2_PIX_FMT_FLAG_SET_CSC ` set. + * - ``V4L2_FMT_FLAG_CSC_HSV_ENC`` + - 0x0080 + - The driver allows the application to try to change the default + HSV encoding. This flag is relevant only for capture devices. + The application can ask to configure the HSV encoding of the capture device + when calling the :ref:`VIDIOC_S_FMT ` ioctl with + :ref:`V4L2_PIX_FMT_FLAG_SET_CSC ` set. + * - ``V4L2_FMT_FLAG_CSC_QUANTIZATION`` + - 0x0100 + - The driver allows the application to try to change the default + quantization. This flag is relevant only for capture devices. + The application can ask to configure the quantization of the capture + device when calling the :ref:`VIDIOC_S_FMT ` ioctl with + :ref:`V4L2_PIX_FMT_FLAG_SET_CSC ` set. 
Return Value diff --git a/Documentation/userspace-api/media/videodev2.h.rst.exceptions b/Documentation/userspace-api/media/videodev2.h.rst.exceptions index 659799cc1eca..121e396a2779 100644 --- a/Documentation/userspace-api/media/videodev2.h.rst.exceptions +++ b/Documentation/userspace-api/media/videodev2.h.rst.exceptions @@ -188,6 +188,11 @@ replace define V4L2_FMT_FLAG_EMULATED fmtdesc-flags replace define V4L2_FMT_FLAG_CONTINUOUS_BYTESTREAM fmtdesc-flags replace define V4L2_FMT_FLAG_DYN_RESOLUTION fmtdesc-flags replace define V4L2_FMT_FLAG_ENC_CAP_FRAME_INTERVAL fmtdesc-flags +replace define V4L2_FMT_FLAG_CSC_COLORSPACE fmtdesc-flags +replace define V4L2_FMT_FLAG_CSC_XFER_FUNC fmtdesc-flags +replace define V4L2_FMT_FLAG_CSC_YCBCR_ENC fmtdesc-flags +replace define V4L2_FMT_FLAG_CSC_HSV_ENC fmtdesc-flags +replace define V4L2_FMT_FLAG_CSC_QUANTIZATION fmtdesc-flags # V4L2 timecode types replace define V4L2_TC_TYPE_24FPS timecode-type diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 4769628790da..b2bc83f37024 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -777,6 +777,7 @@ struct v4l2_pix_format { /* Flags */ #define V4L2_PIX_FMT_FLAG_PREMUL_ALPHA 0x00000001 +#define V4L2_PIX_FMT_FLAG_SET_CSC 0x00000002 /* * F O R M A T E N U M E R A T I O N @@ -796,6 +797,11 @@ struct v4l2_fmtdesc { #define V4L2_FMT_FLAG_CONTINUOUS_BYTESTREAM 0x0004 #define V4L2_FMT_FLAG_DYN_RESOLUTION 0x0008 #define V4L2_FMT_FLAG_ENC_CAP_FRAME_INTERVAL 0x0010 +#define V4L2_FMT_FLAG_CSC_COLORSPACE 0x0020 +#define V4L2_FMT_FLAG_CSC_XFER_FUNC 0x0040 +#define V4L2_FMT_FLAG_CSC_YCBCR_ENC 0x0080 +#define V4L2_FMT_FLAG_CSC_HSV_ENC V4L2_FMT_FLAG_CSC_YCBCR_ENC +#define V4L2_FMT_FLAG_CSC_QUANTIZATION 0x0100 /* Frame Size and frame rate enumeration */ /* -- cgit v1.2.3 From 62aacfa9bf93f94f6949338e0c7a2ed4c4bd2c2a Mon Sep 17 00:00:00 2001 From: Dafna Hirschfeld Date: Thu, 27 Aug 2020 21:46:10 +0200 Subject: media: v4l2: extend the CSC API to subdevice. This patch extends the CSC API in video devices to be supported also on sub-devices. The flag V4L2_MBUS_FRAMEFMT_SET_CSC is set by the application when calling the VIDIOC_SUBDEV_S_FMT ioctl. The flags: V4L2_SUBDEV_MBUS_CODE_CSC_COLORSPACE, V4L2_SUBDEV_MBUS_CODE_CSC_XFER_FUNC, V4L2_SUBDEV_MBUS_CODE_CSC_YCBCR_ENC/V4L2_SUBDEV_MBUS_CODE_CSC_HSV_ENC V4L2_SUBDEV_MBUS_CODE_CSC_QUANTIZATION are set by the driver in the VIDIOC_SUBDEV_ENUM_MBUS_CODE ioctl. New 'flags' fields were added to the structs v4l2_subdev_mbus_code_enum and v4l2_mbus_framefmt; they are borrowed from the 'reserved' fields. The patch also replaces the 'ycbcr_enc' field in 'struct v4l2_mbus_framefmt' with a union that includes 'hsv_enc'. Signed-off-by: Dafna Hirschfeld Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../userspace-api/media/v4l/subdev-formats.rst | 95 +++++++++++++++++++--- .../media/v4l/vidioc-subdev-enum-mbus-code.rst | 51 +++++++++++- include/uapi/linux/v4l2-mediabus.h | 15 +++- include/uapi/linux/v4l2-subdev.h | 10 ++- 4 files changed, 157 insertions(+), 14 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/media/v4l/subdev-formats.rst b/Documentation/userspace-api/media/v4l/subdev-formats.rst index dd2f037b74dd..c9b7bb3ca089 100644 --- a/Documentation/userspace-api/media/v4l/subdev-formats.rst +++ b/Documentation/userspace-api/media/v4l/subdev-formats.rst @@ -34,32 +34,107 @@ Media Bus Formats :ref:`field-order` for details.
* - __u32 - ``colorspace`` - - Image colorspace, from enum - :c:type:`v4l2_colorspace`. See - :ref:`colorspaces` for details. + - Image colorspace, from enum :c:type:`v4l2_colorspace`. + Must be set by the driver for subdevices. If the application sets the + flag ``V4L2_MBUS_FRAMEFMT_SET_CSC`` then the application can set this + field on the source pad to request a specific colorspace for the media + bus data. If the driver cannot handle the requested conversion, it will + return another supported colorspace. The driver indicates that colorspace + conversion is supported by setting the flag + V4L2_SUBDEV_MBUS_CODE_CSC_COLORSPACE in the corresponding struct + :c:type:`v4l2_subdev_mbus_code_enum` during enumeration. + See :ref:`v4l2-subdev-mbus-code-flags`. + * - union { + - (anonymous) * - __u16 - ``ycbcr_enc`` - Y'CbCr encoding, from enum :c:type:`v4l2_ycbcr_encoding`. This information supplements the ``colorspace`` and must be set by - the driver for capture streams and by the application for output - streams, see :ref:`colorspaces`. + the driver for subdevices, see :ref:`colorspaces`. If the application + sets the flag ``V4L2_MBUS_FRAMEFMT_SET_CSC`` then the application can set + this field on a source pad to request a specific Y'CbCr encoding + for the media bus data. If the driver cannot handle the requested + conversion, it will return another supported encoding. + This field is ignored for HSV media bus formats. The driver indicates + that ycbcr_enc conversion is supported by setting the flag + V4L2_SUBDEV_MBUS_CODE_CSC_YCBCR_ENC in the corresponding struct + :c:type:`v4l2_subdev_mbus_code_enum` during enumeration. + See :ref:`v4l2-subdev-mbus-code-flags`. + * - __u16 + - ``hsv_enc`` + - HSV encoding, from enum :c:type:`v4l2_hsv_encoding`. + This information supplements the ``colorspace`` and must be set by + the driver for subdevices, see :ref:`colorspaces`. If the application + sets the flag ``V4L2_MBUS_FRAMEFMT_SET_CSC`` then the application can set + this field on a source pad to request a specific HSV encoding + for the media bus data. If the driver cannot handle the requested + conversion, it will return another supported encoding. + This field is ignored for Y'CbCr media bus formats. The driver indicates + that hsv_enc conversion is supported by setting the flag + V4L2_SUBDEV_MBUS_CODE_CSC_HSV_ENC in the corresponding struct + :c:type:`v4l2_subdev_mbus_code_enum` during enumeration. + See :ref:`v4l2-subdev-mbus-code-flags` + * - } + - * - __u16 - ``quantization`` - Quantization range, from enum :c:type:`v4l2_quantization`. This information supplements the ``colorspace`` and must be set by - the driver for capture streams and by the application for output - streams, see :ref:`colorspaces`. + the driver for subdevices, see :ref:`colorspaces`. If the application + sets the flag ``V4L2_MBUS_FRAMEFMT_SET_CSC`` then the application can set + this field on a source pad to request a specific quantization + for the media bus data. If the driver cannot handle the requested + conversion, it will return another supported quantization. + The driver indicates that quantization conversion is supported by + setting the flag V4L2_SUBDEV_MBUS_CODE_CSC_QUANTIZATION in the + corresponding struct :c:type:`v4l2_subdev_mbus_code_enum` + during enumeration. See :ref:`v4l2-subdev-mbus-code-flags`. + * - __u16 - ``xfer_func`` - Transfer function, from enum :c:type:`v4l2_xfer_func`. 
This information supplements the ``colorspace`` and must be set by - the driver for capture streams and by the application for output - streams, see :ref:`colorspaces`. + the driver for subdevices, see :ref:`colorspaces`. If the application + sets the flag ``V4L2_MBUS_FRAMEFMT_SET_CSC`` then the application can set + this field on a source pad to request a specific transfer + function for the media bus data. If the driver cannot handle the requested + conversion, it will return another supported transfer function. + The driver indicates that the transfer function conversion is supported by + setting the flag V4L2_SUBDEV_MBUS_CODE_CSC_XFER_FUNC in the + corresponding struct :c:type:`v4l2_subdev_mbus_code_enum` + during enumeration. See :ref:`v4l2-subdev-mbus-code-flags`. * - __u16 - - ``reserved``\ [11] + - ``flags`` + - flags See: :ref:v4l2-mbus-framefmt-flags + * - __u16 + - ``reserved``\ [10] - Reserved for future extensions. Applications and drivers must set the array to zero. +.. _v4l2-mbus-framefmt-flags: + +.. flat-table:: v4l2_mbus_framefmt Flags + :header-rows: 0 + :stub-columns: 0 + :widths: 3 1 4 + + * .. _`mbus-framefmt-set-csc`: + + - ``V4L2_MBUS_FRAMEFMT_SET_CSC`` + - 0x0001 + - Set by the application. It is only used for source pads and is + ignored for sink pads. If set, then request the subdevice to do + colorspace conversion from the received colorspace to the requested + colorspace values. If the colorimetry field (``colorspace``, ``xfer_func``, + ``ycbcr_enc``, ``hsv_enc`` or ``quantization``) is set to ``*_DEFAULT``, + then that colorimetry setting will remain unchanged from what was received. + So in order to change the quantization, only the ``quantization`` field shall + be set to non default value (``V4L2_QUANTIZATION_FULL_RANGE`` or + ``V4L2_QUANTIZATION_LIM_RANGE``) and all other colorimetry fields shall + be set to ``*_DEFAULT``. + + To check which conversions are supported by the hardware for the current + media bus frame format, see :ref:`v4l2-subdev-mbus-code-flags`. .. _v4l2-mbus-pixelcode: diff --git a/Documentation/userspace-api/media/v4l/vidioc-subdev-enum-mbus-code.rst b/Documentation/userspace-api/media/v4l/vidioc-subdev-enum-mbus-code.rst index 9db76f7d240f..3b6a8044c391 100644 --- a/Documentation/userspace-api/media/v4l/vidioc-subdev-enum-mbus-code.rst +++ b/Documentation/userspace-api/media/v4l/vidioc-subdev-enum-mbus-code.rst @@ -72,11 +72,60 @@ information about the try formats. - Media bus format codes to be enumerated, from enum :ref:`v4l2_subdev_format_whence `. * - __u32 - - ``reserved``\ [8] + - ``flags`` + - See :ref:`v4l2-subdev-mbus-code-flags` + * - __u32 + - ``reserved``\ [7] - Reserved for future extensions. Applications and drivers must set the array to zero. + +.. tabularcolumns:: |p{4.4cm}|p{4.4cm}|p{7.7cm}| + +.. _v4l2-subdev-mbus-code-flags: + +.. flat-table:: Subdev Media Bus Code Enumerate Flags + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - V4L2_SUBDEV_MBUS_CODE_CSC_COLORSPACE + - 0x00000001 + - The driver allows the application to try to change the default colorspace + encoding. The application can ask to configure the colorspace of the + subdevice when calling the :ref:`VIDIOC_SUBDEV_S_FMT ` + ioctl with :ref:`V4L2_MBUS_FRAMEFMT_SET_CSC ` set. + See :ref:`v4l2-mbus-format` on how to do this. + * - V4L2_SUBDEV_MBUS_CODE_CSC_XFER_FUNC + - 0x00000002 + - The driver allows the application to try to change the default transform function. 
+ The application can ask to configure the transform function of + the subdevice when calling the :ref:`VIDIOC_SUBDEV_S_FMT ` + ioctl with :ref:`V4L2_MBUS_FRAMEFMT_SET_CSC ` set. + See :ref:`v4l2-mbus-format` on how to do this. + * - V4L2_SUBDEV_MBUS_CODE_CSC_YCBCR_ENC + - 0x00000004 + - The driver allows the application to try to change the default Y'CbCr + encoding. The application can ask to configure the Y'CbCr encoding of the + subdevice when calling the :ref:`VIDIOC_SUBDEV_S_FMT ` + ioctl with :ref:`V4L2_MBUS_FRAMEFMT_SET_CSC ` set. + See :ref:`v4l2-mbus-format` on how to do this. + * - V4L2_SUBDEV_MBUS_CODE_CSC_HSV_ENC + - 0x00000004 + - The driver allows the application to try to change the default HSV + encoding. The application can ask to configure the HSV encoding of the + subdevice when calling the :ref:`VIDIOC_SUBDEV_S_FMT ` + ioctl with :ref:`V4L2_MBUS_FRAMEFMT_SET_CSC ` set. + See :ref:`v4l2-mbus-format` on how to do this. + * - V4L2_SUBDEV_MBUS_CODE_CSC_QUANTIZATION + - 0x00000008 + - The driver allows the application to try to change the default + quantization. The application can ask to configure the quantization of + the subdevice when calling the :ref:`VIDIOC_SUBDEV_S_FMT ` + ioctl with :ref:`V4L2_MBUS_FRAMEFMT_SET_CSC ` set. + See :ref:`v4l2-mbus-format` on how to do this. + Return Value ============ diff --git a/include/uapi/linux/v4l2-mediabus.h b/include/uapi/linux/v4l2-mediabus.h index 123a231001a8..903e67b16711 100644 --- a/include/uapi/linux/v4l2-mediabus.h +++ b/include/uapi/linux/v4l2-mediabus.h @@ -16,6 +16,8 @@ #include #include +#define V4L2_MBUS_FRAMEFMT_SET_CSC 0x0001 + /** * struct v4l2_mbus_framefmt - frame format on the media bus * @width: image width @@ -24,8 +26,11 @@ * @field: used interlacing type (from enum v4l2_field) * @colorspace: colorspace of the data (from enum v4l2_colorspace) * @ycbcr_enc: YCbCr encoding of the data (from enum v4l2_ycbcr_encoding) + * @hsv_enc: HSV encoding of the data (from enum v4l2_hsv_encoding) * @quantization: quantization of the data (from enum v4l2_quantization) * @xfer_func: transfer function of the data (from enum v4l2_xfer_func) + * @flags: flags (V4L2_MBUS_FRAMEFMT_*) + * @reserved: reserved bytes that can be later used */ struct v4l2_mbus_framefmt { __u32 width; @@ -33,10 +38,16 @@ struct v4l2_mbus_framefmt { __u32 code; __u32 field; __u32 colorspace; - __u16 ycbcr_enc; + union { + /* enum v4l2_ycbcr_encoding */ + __u16 ycbcr_enc; + /* enum v4l2_hsv_encoding */ + __u16 hsv_enc; + }; __u16 quantization; __u16 xfer_func; - __u16 reserved[11]; + __u16 flags; + __u16 reserved[10]; }; #ifndef __KERNEL__ diff --git a/include/uapi/linux/v4l2-subdev.h b/include/uapi/linux/v4l2-subdev.h index 5d2a1dab7911..00850b98078a 100644 --- a/include/uapi/linux/v4l2-subdev.h +++ b/include/uapi/linux/v4l2-subdev.h @@ -65,19 +65,27 @@ struct v4l2_subdev_crop { __u32 reserved[8]; }; +#define V4L2_SUBDEV_MBUS_CODE_CSC_COLORSPACE 0x00000001 +#define V4L2_SUBDEV_MBUS_CODE_CSC_XFER_FUNC 0x00000002 +#define V4L2_SUBDEV_MBUS_CODE_CSC_YCBCR_ENC 0x00000004 +#define V4L2_SUBDEV_MBUS_CODE_CSC_HSV_ENC V4L2_SUBDEV_MBUS_CODE_CSC_YCBCR_ENC +#define V4L2_SUBDEV_MBUS_CODE_CSC_QUANTIZATION 0x00000008 + /** * struct v4l2_subdev_mbus_code_enum - Media bus format enumeration * @pad: pad number, as reported by the media API * @index: format index during enumeration * @code: format code (MEDIA_BUS_FMT_ definitions) * @which: format type (from enum v4l2_subdev_format_whence) + * @flags: flags set by the driver, (V4L2_SUBDEV_MBUS_CODE_*) */ struct 
v4l2_subdev_mbus_code_enum { __u32 pad; __u32 index; __u32 code; __u32 which; - __u32 reserved[8]; + __u32 flags; + __u32 reserved[7]; }; /** -- cgit v1.2.3 From c8cb5b854b40f2ce52ccd032fa19750f4181d5fc Mon Sep 17 00:00:00 2001 From: Tova Mussai Date: Fri, 18 Sep 2020 11:33:13 +0200 Subject: nl80211/cfg80211: support 6 GHz scanning Support 6 GHz scanning, by * a new scan flag to scan for colocated BSSes advertised by (and found) APs on 2.4 & 5 GHz * doing the necessary reduced neighbor report parsing for this, to find them * adding the ability to split the scan request in case the device by itself cannot support this. Also add some necessary bits in mac80211 to not break with these changes. Signed-off-by: Tova Mussai Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20200918113313.232917c93af9.Ida22f0212f9122f47094d81659e879a50434a6a2@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 32 ++- include/uapi/linux/nl80211.h | 3 + net/mac80211/scan.c | 9 +- net/wireless/core.c | 8 +- net/wireless/core.h | 5 +- net/wireless/nl80211.c | 11 +- net/wireless/scan.c | 501 ++++++++++++++++++++++++++++++++++++++++++- 7 files changed, 552 insertions(+), 17 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 10c2cc8f0efc..11eb81676e95 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2095,6 +2095,27 @@ struct cfg80211_scan_info { bool aborted; }; +/** + * struct cfg80211_scan_6ghz_params - relevant for 6 GHz only + * + * @short_ssid: short SSID to scan for + * @bssid: bssid to scan for + * @channel_idx: idx of the channel in the channel array in the scan request + * to which the above info is relevant + * @unsolicited_probe: the AP transmits unsolicited probe response every 20 TUs + * @short_ssid_valid: short_ssid is valid and can be used + * @psc_no_listen: when set, and the channel is a PSC channel, no need to wait + * 20 TUs before starting to send probe requests. + */ +struct cfg80211_scan_6ghz_params { + u32 short_ssid; + u32 channel_idx; + u8 bssid[ETH_ALEN]; + bool unsolicited_probe; + bool short_ssid_valid; + bool psc_no_listen; +}; + /** * struct cfg80211_scan_request - scan request description * @@ -2122,6 +2143,10 @@ struct cfg80211_scan_info { * @mac_addr_mask: MAC address mask used with randomisation, bits that * are 0 in the mask should be randomised, bits that are 1 should * be taken from the @mac_addr + * @scan_6ghz: relevant for split scan request only, + * true if this is the second scan request + * @n_6ghz_params: number of 6 GHz params + * @scan_6ghz_params: 6 GHz params * @bssid: BSSID to scan for (most commonly, the wildcard BSSID) */ struct cfg80211_scan_request { @@ -2149,6 +2174,9 @@ struct cfg80211_scan_request { struct cfg80211_scan_info info; bool notified; bool no_cck; + bool scan_6ghz; + u32 n_6ghz_params; + struct cfg80211_scan_6ghz_params *scan_6ghz_params; /* keep last */ struct ieee80211_channel *channels[]; @@ -4217,6 +4245,8 @@ struct cfg80211_ops { /** * enum wiphy_flags - wiphy capability flags * + * @WIPHY_FLAG_SPLIT_SCAN_6GHZ: if set to true, the scan request will be split + * into two, first for legacy bands and second for UHB.
* @WIPHY_FLAG_NETNS_OK: if not set, do not allow changing the netns of this * wiphy at all * @WIPHY_FLAG_PS_ON_BY_DEFAULT: if set to true, powersave will be enabled @@ -4260,7 +4290,7 @@ struct cfg80211_ops { enum wiphy_flags { WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK = BIT(0), /* use hole at 1 */ - /* use hole at 2 */ + WIPHY_FLAG_SPLIT_SCAN_6GHZ = BIT(2), WIPHY_FLAG_NETNS_OK = BIT(3), WIPHY_FLAG_PS_ON_BY_DEFAULT = BIT(4), WIPHY_FLAG_4ADDR_AP = BIT(5), diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index bdc90b8dfd24..c74ceaddb909 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -6059,6 +6059,8 @@ enum nl80211_timeout_reason { * @NL80211_SCAN_FLAG_FREQ_KHZ: report scan results with * %NL80211_ATTR_SCAN_FREQ_KHZ. This also means * %NL80211_ATTR_SCAN_FREQUENCIES will not be included. + * @NL80211_SCAN_FLAG_COLOCATED_6GHZ: scan for colocated APs reported by + * 2.4/5 GHz APs */ enum nl80211_scan_flags { NL80211_SCAN_FLAG_LOW_PRIORITY = 1<<0, @@ -6075,6 +6077,7 @@ enum nl80211_scan_flags { NL80211_SCAN_FLAG_RANDOM_SN = 1<<11, NL80211_SCAN_FLAG_MIN_PREQ_CONTENT = 1<<12, NL80211_SCAN_FLAG_FREQ_KHZ = 1<<13, + NL80211_SCAN_FLAG_COLOCATED_6GHZ = 1<<14, }; /** diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 5ac2785cdc7b..7361e1239bf2 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu * Copyright 2013-2015 Intel Mobile Communications GmbH * Copyright 2016-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2019 Intel Corporation + * Copyright (C) 2018-2020 Intel Corporation */ #include @@ -712,6 +712,10 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, req->duration_mandatory; local->hw_scan_band = 0; + local->hw_scan_req->req.n_6ghz_params = req->n_6ghz_params; + local->hw_scan_req->req.scan_6ghz_params = + req->scan_6ghz_params; + local->hw_scan_req->req.scan_6ghz = req->scan_6ghz; /* * After allocating local->hw_scan_req, we must @@ -1124,7 +1128,8 @@ int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata, int max_n; for (band = 0; band < NUM_NL80211_BANDS; band++) { - if (!local->hw.wiphy->bands[band]) + if (!local->hw.wiphy->bands[band] || + band == NL80211_BAND_6GHZ) continue; max_n = local->hw.wiphy->bands[band]->n_channels; diff --git a/net/wireless/core.c b/net/wireless/core.c index 354b0ccbdc24..9f23923e8d29 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -236,7 +236,9 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, rdev->opencount--; if (rdev->scan_req && rdev->scan_req->wdev == wdev) { - if (WARN_ON(!rdev->scan_req->notified)) + if (WARN_ON(!rdev->scan_req->notified && + (!rdev->int_scan_req || + !rdev->int_scan_req->notified))) rdev->scan_req->info.aborted = true; ___cfg80211_scan_done(rdev, false); } @@ -1336,7 +1338,9 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, case NETDEV_DOWN: cfg80211_update_iface_num(rdev, wdev->iftype, -1); if (rdev->scan_req && rdev->scan_req->wdev == wdev) { - if (WARN_ON(!rdev->scan_req->notified)) + if (WARN_ON(!rdev->scan_req->notified && + (!rdev->int_scan_req || + !rdev->int_scan_req->notified))) rdev->scan_req->info.aborted = true; ___cfg80211_scan_done(rdev, false); } diff --git a/net/wireless/core.h b/net/wireless/core.h index 2ebc2a66680d..e1ec9ac8e608 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -3,7 +3,7 @@ * Wireless configuration interface internals. 
* * Copyright 2006-2010 Johannes Berg - * Copyright (C) 2018-2019 Intel Corporation + * Copyright (C) 2018-2020 Intel Corporation */ #ifndef __NET_WIRELESS_CORE_H #define __NET_WIRELESS_CORE_H @@ -72,6 +72,7 @@ struct cfg80211_registered_device { u32 bss_generation; u32 bss_entries; struct cfg80211_scan_request *scan_req; /* protected by RTNL */ + struct cfg80211_scan_request *int_scan_req; struct sk_buff *scan_msg; struct list_head sched_scan_req_list; time64_t suspend_at; @@ -457,6 +458,8 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev); bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range, u32 center_freq_khz, u32 bw_khz); +int cfg80211_scan(struct cfg80211_registered_device *rdev); + extern struct work_struct cfg80211_disconnect_work; /** diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 1a212db7a300..d98db166d5e6 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -8236,7 +8236,7 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) request->scan_start = jiffies; rdev->scan_req = request; - err = rdev_scan(rdev, request); + err = cfg80211_scan(rdev); if (err) goto out_free; @@ -15518,6 +15518,7 @@ static int nl80211_add_scan_req(struct sk_buff *msg, struct cfg80211_scan_request *req = rdev->scan_req; struct nlattr *nest; int i; + struct cfg80211_scan_info *info; if (WARN_ON(!req)) return 0; @@ -15561,11 +15562,13 @@ static int nl80211_add_scan_req(struct sk_buff *msg, nla_put_u32(msg, NL80211_ATTR_SCAN_FLAGS, req->flags)) goto nla_put_failure; - if (req->info.scan_start_tsf && + info = rdev->int_scan_req ? &rdev->int_scan_req->info : + &rdev->scan_req->info; + if (info->scan_start_tsf && (nla_put_u64_64bit(msg, NL80211_ATTR_SCAN_START_TIME_TSF, - req->info.scan_start_tsf, NL80211_BSS_PAD) || + info->scan_start_tsf, NL80211_BSS_PAD) || nla_put(msg, NL80211_ATTR_SCAN_START_TIME_TSF_BSSID, ETH_ALEN, - req->info.tsf_bssid))) + info->tsf_bssid))) goto nla_put_failure; return 0; diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 84fc8ab16dd2..4fbeb17580d9 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -5,7 +5,7 @@ * Copyright 2008 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2016 Intel Deutschland GmbH - * Copyright (C) 2018-2019 Intel Corporation + * Copyright (C) 2018-2020 Intel Corporation */ #include #include @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include #include #include @@ -74,6 +76,43 @@ MODULE_PARM_DESC(bss_entries_limit, #define IEEE80211_SCAN_RESULT_EXPIRE (30 * HZ) +/** + * struct cfg80211_colocated_ap - colocated AP information + * + * @list: linked list of all colocated APs + * @bssid: BSSID of the reported AP + * @ssid: SSID of the reported AP + * @ssid_len: length of the ssid + * @center_freq: frequency the reported AP is on + * @unsolicited_probe: the reported AP is part of an ESS, where all the APs + * that operate in the same channel as the reported AP and that might be + * detected by a STA receiving this frame, are transmitting unsolicited + * Probe Response frames every 20 TUs + * @oct_recommended: OCT is recommended to exchange MMPDUs with the reported AP + * @same_ssid: the reported AP has the same SSID as the reporting AP + * @multi_bss: the reported AP is part of a multiple BSSID set + * @transmitted_bssid: the reported AP is the transmitting BSSID + * @colocated_ess: all the APs that share the same ESS as the reported AP are + * colocated and can be discovered via legacy bands.
+ * @short_ssid_valid: short_ssid is valid and can be used + * @short_ssid: the short SSID for this SSID + */ +struct cfg80211_colocated_ap { + struct list_head list; + u8 bssid[ETH_ALEN]; + u8 ssid[IEEE80211_MAX_SSID_LEN]; + size_t ssid_len; + u32 short_ssid; + u32 center_freq; + u8 unsolicited_probe:1, + oct_recommended:1, + same_ssid:1, + multi_bss:1, + transmitted_bssid:1, + colocated_ess:1, + short_ssid_valid:1; +}; + static void bss_free(struct cfg80211_internal_bss *bss) { struct cfg80211_bss_ies *ies; @@ -448,10 +487,433 @@ static bool cfg80211_bss_expire_oldest(struct cfg80211_registered_device *rdev) return ret; } +static u8 cfg80211_parse_bss_param(u8 data, + struct cfg80211_colocated_ap *coloc_ap) +{ + coloc_ap->oct_recommended = + u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_OCT_RECOMMENDED); + coloc_ap->same_ssid = + u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_SAME_SSID); + coloc_ap->multi_bss = + u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_MULTI_BSSID); + coloc_ap->transmitted_bssid = + u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_TRANSMITTED_BSSID); + coloc_ap->unsolicited_probe = + u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_PROBE_ACTIVE); + coloc_ap->colocated_ess = + u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_COLOC_ESS); + + return u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_COLOC_AP); +} + +static int cfg80211_calc_short_ssid(const struct cfg80211_bss_ies *ies, + const struct element **elem, u32 *s_ssid) +{ + + *elem = cfg80211_find_elem(WLAN_EID_SSID, ies->data, ies->len); + if (!*elem || (*elem)->datalen > IEEE80211_MAX_SSID_LEN) + return -EINVAL; + + *s_ssid = ~crc32_le(~0, (*elem)->data, (*elem)->datalen); + return 0; +} + +static void cfg80211_free_coloc_ap_list(struct list_head *coloc_ap_list) +{ + struct cfg80211_colocated_ap *ap, *tmp_ap; + + list_for_each_entry_safe(ap, tmp_ap, coloc_ap_list, list) { + list_del(&ap->list); + kfree(ap); + } +} + +static int cfg80211_parse_ap_info(struct cfg80211_colocated_ap *entry, + const u8 *pos, u8 length, + const struct element *ssid_elem, + int s_ssid_tmp) +{ + /* skip the TBTT offset */ + pos++; + + memcpy(entry->bssid, pos, ETH_ALEN); + pos += ETH_ALEN; + + if (length == IEEE80211_TBTT_INFO_OFFSET_BSSID_SSSID_BSS_PARAM) { + memcpy(&entry->short_ssid, pos, + sizeof(entry->short_ssid)); + entry->short_ssid_valid = true; + pos += 4; + } + + /* skip non colocated APs */ + if (!cfg80211_parse_bss_param(*pos, entry)) + return -EINVAL; + pos++; + + if (length == IEEE80211_TBTT_INFO_OFFSET_BSSID_BSS_PARAM) { + /* + * no information about the short ssid. Consider the entry valid + * for now. It would later be dropped in case there are explicit + * SSIDs that need to be matched + */ + if (!entry->same_ssid) + return 0; + } + + if (entry->same_ssid) { + entry->short_ssid = s_ssid_tmp; + entry->short_ssid_valid = true; + + /* + * This is safe because we validate datalen in + * cfg80211_parse_colocated_ap(), before calling this + * function. 
+ */ + memcpy(&entry->ssid, &ssid_elem->data, + ssid_elem->datalen); + entry->ssid_len = ssid_elem->datalen; + } + return 0; +} + +static int cfg80211_parse_colocated_ap(const struct cfg80211_bss_ies *ies, + struct list_head *list) +{ + struct ieee80211_neighbor_ap_info *ap_info; + const struct element *elem, *ssid_elem; + const u8 *pos, *end; + u32 s_ssid_tmp; + int n_coloc = 0, ret; + LIST_HEAD(ap_list); + + elem = cfg80211_find_elem(WLAN_EID_REDUCED_NEIGHBOR_REPORT, ies->data, + ies->len); + if (!elem || elem->datalen > IEEE80211_MAX_SSID_LEN) + return 0; + + pos = elem->data; + end = pos + elem->datalen; + + ret = cfg80211_calc_short_ssid(ies, &ssid_elem, &s_ssid_tmp); + if (ret) + return ret; + + /* RNR IE may contain more than one NEIGHBOR_AP_INFO */ + while (pos + sizeof(*ap_info) <= end) { + enum nl80211_band band; + int freq; + u8 length, i, count; + + ap_info = (void *)pos; + count = u8_get_bits(ap_info->tbtt_info_hdr, + IEEE80211_AP_INFO_TBTT_HDR_COUNT) + 1; + length = ap_info->tbtt_info_len; + + pos += sizeof(*ap_info); + + if (!ieee80211_operating_class_to_band(ap_info->op_class, + &band)) + break; + + freq = ieee80211_channel_to_frequency(ap_info->channel, band); + + if (end - pos < count * ap_info->tbtt_info_len) + break; + + /* + * TBTT info must include bss param + BSSID + + * (short SSID or same_ssid bit to be set). + * ignore other options, and move to the + * next AP info + */ + if (band != NL80211_BAND_6GHZ || + (length != IEEE80211_TBTT_INFO_OFFSET_BSSID_BSS_PARAM && + length < IEEE80211_TBTT_INFO_OFFSET_BSSID_SSSID_BSS_PARAM)) { + pos += count * ap_info->tbtt_info_len; + continue; + } + + for (i = 0; i < count; i++) { + struct cfg80211_colocated_ap *entry; + + entry = kzalloc(sizeof(*entry) + IEEE80211_MAX_SSID_LEN, + GFP_ATOMIC); + + if (!entry) + break; + + entry->center_freq = freq; + + if (!cfg80211_parse_ap_info(entry, pos, length, + ssid_elem, s_ssid_tmp)) { + n_coloc++; + list_add_tail(&entry->list, &ap_list); + } else { + kfree(entry); + } + + pos += ap_info->tbtt_info_len; + } + } + + if (pos != end) { + cfg80211_free_coloc_ap_list(&ap_list); + return 0; + } + + list_splice_tail(&ap_list, list); + return n_coloc; +} + +static void cfg80211_scan_req_add_chan(struct cfg80211_scan_request *request, + struct ieee80211_channel *chan, + bool add_to_6ghz) +{ + int i; + u32 n_channels = request->n_channels; + struct cfg80211_scan_6ghz_params *params = + &request->scan_6ghz_params[request->n_6ghz_params]; + + for (i = 0; i < n_channels; i++) { + if (request->channels[i] == chan) { + if (add_to_6ghz) + params->channel_idx = i; + return; + } + } + + request->channels[n_channels] = chan; + if (add_to_6ghz) + request->scan_6ghz_params[request->n_6ghz_params].channel_idx = + n_channels; + + request->n_channels++; +} + +static bool cfg80211_find_ssid_match(struct cfg80211_colocated_ap *ap, + struct cfg80211_scan_request *request) +{ + u8 i; + u32 s_ssid; + + for (i = 0; i < request->n_ssids; i++) { + /* wildcard ssid in the scan request */ + if (!request->ssids[i].ssid_len) + return true; + + if (ap->ssid_len && + ap->ssid_len == request->ssids[i].ssid_len) { + if (!memcmp(request->ssids[i].ssid, ap->ssid, + ap->ssid_len)) + return true; + } else if (ap->short_ssid_valid) { + s_ssid = ~crc32_le(~0, request->ssids[i].ssid, + request->ssids[i].ssid_len); + + if (ap->short_ssid == s_ssid) + return true; + } + } + + return false; +} + +static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) +{ + u8 i; + struct cfg80211_colocated_ap *ap; + int n_channels, count = 
0, err; + struct cfg80211_scan_request *request, *rdev_req = rdev->scan_req; + LIST_HEAD(coloc_ap_list); + bool need_scan_psc; + const struct ieee80211_sband_iftype_data *iftd; + + rdev_req->scan_6ghz = true; + + if (!rdev->wiphy.bands[NL80211_BAND_6GHZ]) + return -EOPNOTSUPP; + + iftd = ieee80211_get_sband_iftype_data(rdev->wiphy.bands[NL80211_BAND_6GHZ], + rdev_req->wdev->iftype); + if (!iftd || !iftd->he_cap.has_he) + return -EOPNOTSUPP; + + n_channels = rdev->wiphy.bands[NL80211_BAND_6GHZ]->n_channels; + + if (rdev_req->flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ) { + struct cfg80211_internal_bss *intbss; + + spin_lock_bh(&rdev->bss_lock); + list_for_each_entry(intbss, &rdev->bss_list, list) { + struct cfg80211_bss *res = &intbss->pub; + const struct cfg80211_bss_ies *ies; + + ies = rcu_access_pointer(res->ies); + count += cfg80211_parse_colocated_ap(ies, + &coloc_ap_list); + } + spin_unlock_bh(&rdev->bss_lock); + } + + request = kzalloc(struct_size(request, channels, n_channels) + + sizeof(*request->scan_6ghz_params) * count, + GFP_KERNEL); + if (!request) { + cfg80211_free_coloc_ap_list(&coloc_ap_list); + return -ENOMEM; + } + + *request = *rdev_req; + request->n_channels = 0; + request->scan_6ghz_params = + (void *)&request->channels[n_channels]; + + /* + * PSC channels should not be scanned if all the reported co-located APs + * are indicating that all APs in the same ESS are co-located + */ + if (count) { + need_scan_psc = false; + + list_for_each_entry(ap, &coloc_ap_list, list) { + if (!ap->colocated_ess) { + need_scan_psc = true; + break; + } + } + } else { + need_scan_psc = true; + } + + /* + * add to the scan request the channels that need to be scanned + * regardless of the collocated APs (PSC channels or all channels + * in case that NL80211_SCAN_FLAG_COLOCATED_6GHZ is not set) + */ + for (i = 0; i < rdev_req->n_channels; i++) { + if (rdev_req->channels[i]->band == NL80211_BAND_6GHZ && + ((need_scan_psc && + cfg80211_channel_is_psc(rdev_req->channels[i])) || + !(rdev_req->flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ))) { + cfg80211_scan_req_add_chan(request, + rdev_req->channels[i], + false); + } + } + + if (!(rdev_req->flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ)) + goto skip; + + list_for_each_entry(ap, &coloc_ap_list, list) { + bool found = false; + struct cfg80211_scan_6ghz_params *scan_6ghz_params = + &request->scan_6ghz_params[request->n_6ghz_params]; + struct ieee80211_channel *chan = + ieee80211_get_channel(&rdev->wiphy, ap->center_freq); + + if (!chan || chan->flags & IEEE80211_CHAN_DISABLED) + continue; + + for (i = 0; i < rdev_req->n_channels; i++) { + if (rdev_req->channels[i] == chan) + found = true; + } + + if (!found) + continue; + + if (request->n_ssids > 0 && + !cfg80211_find_ssid_match(ap, request)) + continue; + + cfg80211_scan_req_add_chan(request, chan, true); + memcpy(scan_6ghz_params->bssid, ap->bssid, ETH_ALEN); + scan_6ghz_params->short_ssid = ap->short_ssid; + scan_6ghz_params->short_ssid_valid = ap->short_ssid_valid; + scan_6ghz_params->unsolicited_probe = ap->unsolicited_probe; + + /* + * If a PSC channel is added to the scan and 'need_scan_psc' is + * set to false, then all the APs that the scan logic is + * interested with on the channel are collocated and thus there + * is no need to perform the initial PSC channel listen. 
+ */ + if (cfg80211_channel_is_psc(chan) && !need_scan_psc) + scan_6ghz_params->psc_no_listen = true; + + request->n_6ghz_params++; + } + +skip: + cfg80211_free_coloc_ap_list(&coloc_ap_list); + + if (request->n_channels) { + struct cfg80211_scan_request *old = rdev->int_scan_req; + + rdev->int_scan_req = request; + + /* + * If this scan follows a previous scan, save the scan start + * info from the first part of the scan + */ + if (old) + rdev->int_scan_req->info = old->info; + + err = rdev_scan(rdev, request); + if (err) { + rdev->int_scan_req = old; + kfree(request); + } else { + kfree(old); + } + + return err; + } + + kfree(request); + return -EINVAL; +} + +int cfg80211_scan(struct cfg80211_registered_device *rdev) +{ + struct cfg80211_scan_request *request; + struct cfg80211_scan_request *rdev_req = rdev->scan_req; + u32 n_channels = 0, idx, i; + + if (!(rdev->wiphy.flags & WIPHY_FLAG_SPLIT_SCAN_6GHZ)) + return rdev_scan(rdev, rdev_req); + + for (i = 0; i < rdev_req->n_channels; i++) { + if (rdev_req->channels[i]->band != NL80211_BAND_6GHZ) + n_channels++; + } + + if (!n_channels) + return cfg80211_scan_6ghz(rdev); + + request = kzalloc(struct_size(request, channels, n_channels), + GFP_KERNEL); + if (!request) + return -ENOMEM; + + *request = *rdev_req; + request->n_channels = n_channels; + + for (i = idx = 0; i < rdev_req->n_channels; i++) { + if (rdev_req->channels[i]->band != NL80211_BAND_6GHZ) + request->channels[idx++] = rdev_req->channels[i]; + } + + rdev_req->scan_6ghz = false; + rdev->int_scan_req = request; + return rdev_scan(rdev, request); +} + void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, bool send_message) { - struct cfg80211_scan_request *request; + struct cfg80211_scan_request *request, *rdev_req; struct wireless_dev *wdev; struct sk_buff *msg; #ifdef CONFIG_CFG80211_WEXT @@ -466,11 +928,18 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, return; } - request = rdev->scan_req; - if (!request) + rdev_req = rdev->scan_req; + if (!rdev_req) return; - wdev = request->wdev; + wdev = rdev_req->wdev; + request = rdev->int_scan_req ? rdev->int_scan_req : rdev_req; + + if (wdev_running(wdev) && + (rdev->wiphy.flags & WIPHY_FLAG_SPLIT_SCAN_6GHZ) && + !rdev_req->scan_6ghz && !request->info.aborted && + !cfg80211_scan_6ghz(rdev)) + return; /* * This must be before sending the other events! @@ -501,8 +970,11 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, if (wdev->netdev) dev_put(wdev->netdev); + kfree(rdev->int_scan_req); + rdev->int_scan_req = NULL; + + kfree(rdev->scan_req); rdev->scan_req = NULL; - kfree(request); if (!send_message) rdev->scan_msg = msg; @@ -525,10 +997,25 @@ void __cfg80211_scan_done(struct work_struct *wk) void cfg80211_scan_done(struct cfg80211_scan_request *request, struct cfg80211_scan_info *info) { + struct cfg80211_scan_info old_info = request->info; + trace_cfg80211_scan_done(request, info); - WARN_ON(request != wiphy_to_rdev(request->wiphy)->scan_req); + WARN_ON(request != wiphy_to_rdev(request->wiphy)->scan_req && + request != wiphy_to_rdev(request->wiphy)->int_scan_req); request->info = *info; + + /* + * In case the scan is split, the scan_start_tsf and tsf_bssid should + * be of the first part. In such a case old_info.scan_start_tsf should + * be non zero. 
+ */ + if (request->scan_6ghz && old_info.scan_start_tsf) { + request->info.scan_start_tsf = old_info.scan_start_tsf; + memcpy(request->info.tsf_bssid, old_info.tsf_bssid, + sizeof(request->info.tsf_bssid)); + } + request->notified = true; queue_work(cfg80211_wq, &wiphy_to_rdev(request->wiphy)->scan_done_wk); } -- cgit v1.2.3 From d2b7588a47de8322891de38ec14d15105d66cb1e Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Mon, 21 Sep 2020 19:28:04 -0700 Subject: nl80211: support S1G capability overrides in assoc NL80211_ATTR_S1G_CAPABILITY can be passed along with NL80211_ATTR_S1G_CAPABILITY_MASK to NL80211_CMD_ASSOCIATE to indicate S1G capabilities which should override the hardware capabilities in eg. the association request. Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200922022818.15855-4-thomas@adapt-ip.com [johannes: always require both attributes together, commit message] Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 2 ++ include/net/cfg80211.h | 3 +++ include/uapi/linux/nl80211.h | 9 +++++++++ net/wireless/nl80211.c | 20 ++++++++++++++++++++ 4 files changed, 34 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 53fba39d4ba6..f71cffa18176 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2330,6 +2330,8 @@ ieee80211_he_spr_size(const u8 *he_spr_ie) } /* S1G Capabilities Information field */ +#define IEEE80211_S1G_CAPABILITY_LEN 15 + #define S1G_CAP0_S1G_LONG BIT(0) #define S1G_CAP0_SGI_1MHZ BIT(1) #define S1G_CAP0_SGI_2MHZ BIT(2) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 11eb81676e95..bead4b9afeca 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2556,6 +2556,8 @@ enum cfg80211_assoc_req_flags { * @fils_nonces: FILS nonces (part of AAD) for protecting (Re)Association * Request/Response frame or %NULL if FILS is not used. This field starts * with 16 octets of STA Nonce followed by 16 octets of AP Nonce. + * @s1g_capa: S1G capability override + * @s1g_capa_mask: S1G capability override mask */ struct cfg80211_assoc_request { struct cfg80211_bss *bss; @@ -2570,6 +2572,7 @@ struct cfg80211_assoc_request { const u8 *fils_kek; size_t fils_kek_len; const u8 *fils_nonces; + struct ieee80211_s1g_cap s1g_capa, s1g_capa_mask; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index c74ceaddb909..05db40b4c56f 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2521,6 +2521,12 @@ enum nl80211_commands { * unsolicited broadcast probe response. It is a nested attribute, see * &enum nl80211_unsol_bcast_probe_resp_attributes. * + * @NL80211_ATTR_S1G_CAPABILITY: S1G Capability information element (from + * association request when used with NL80211_CMD_NEW_STATION) + * @NL80211_ATTR_S1G_CAPABILITY_MASK: S1G Capability Information element + * override mask. Used with NL80211_ATTR_S1G_CAPABILITY in + * NL80211_CMD_ASSOCIATE or NL80211_CMD_CONNECT. 
+ * + * @NUM_NL80211_ATTR: total number of nl80211_attrs available + * @NL80211_ATTR_MAX: highest attribute number currently defined + * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3007,6 +3013,9 @@ enum nl80211_attrs { NL80211_ATTR_UNSOL_BCAST_PROBE_RESP, + NL80211_ATTR_S1G_CAPABILITY, + NL80211_ATTR_S1G_CAPABILITY_MASK, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d98db166d5e6..d31451db5407 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -704,6 +704,10 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { NLA_POLICY_NESTED(nl80211_fils_discovery_policy), [NL80211_ATTR_UNSOL_BCAST_PROBE_RESP] = NLA_POLICY_NESTED(nl80211_unsol_bcast_probe_resp_policy), + [NL80211_ATTR_S1G_CAPABILITY] = + NLA_POLICY_EXACT_LEN(IEEE80211_S1G_CAPABILITY_LEN), + [NL80211_ATTR_S1G_CAPABILITY_MASK] = + NLA_POLICY_EXACT_LEN(IEEE80211_S1G_CAPABILITY_LEN), }; /* policy for the key attributes */ @@ -9792,6 +9796,22 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) nla_data(info->attrs[NL80211_ATTR_FILS_NONCES]); } + if (info->attrs[NL80211_ATTR_S1G_CAPABILITY_MASK]) { + if (!info->attrs[NL80211_ATTR_S1G_CAPABILITY]) + return -EINVAL; + memcpy(&req.s1g_capa_mask, + nla_data(info->attrs[NL80211_ATTR_S1G_CAPABILITY_MASK]), + sizeof(req.s1g_capa_mask)); + } + + if (info->attrs[NL80211_ATTR_S1G_CAPABILITY]) { + if (!info->attrs[NL80211_ATTR_S1G_CAPABILITY_MASK]) + return -EINVAL; + memcpy(&req.s1g_capa, + nla_data(info->attrs[NL80211_ATTR_S1G_CAPABILITY]), + sizeof(req.s1g_capa)); + } + err = nl80211_crypto_settings(rdev, info, &req.crypto, 1); if (!err) { wdev_lock(dev->ieee80211_ptr); -- cgit v1.2.3 From 1ae099540e8c7f1ee066b3ad45cc91f582bb1ce8 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Fri, 25 Sep 2020 16:34:16 +0200 Subject: KVM: x86: Allow deflecting unknown MSR accesses to user space MSRs are weird. Some of them are normal control registers, such as EFER. Some, however, are registers that really are model-specific, not very interesting to virtualization workloads, and not performance critical. Others again are really just windows into package configuration. Out of these MSRs, only the first category is necessary to implement in kernel space. Rarely accessed MSRs, MSRs that should be fine-tuned against certain CPU models and MSRs that contain information on the package level are much better suited for user space to process. However, over time we have accumulated a lot of MSRs that are not the first category, but still handled by in-kernel KVM code. This patch adds a generic interface to handle WRMSR and RDMSR from user space. With this, any future MSR that is part of the latter categories can be handled in user space. Furthermore, it allows us to replace the existing "ignore_msrs" logic with something that applies per-VM rather than on the full system. That way you can run productive VMs in parallel to experimental ones where you don't care about proper MSR handling.
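A rough sketch of the user-space side of this interface (illustrative only: the exit reasons, the kvm_run "msr" fields and KVM_CAP_X86_USER_SPACE_MSR come from this patch; the helper name, the chosen reason mask and the emulated value are made up):

  #include <linux/kvm.h>
  #include <sys/ioctl.h>

  /* Enable deflection of unknown MSRs, then service the resulting exits.
   * `run` is the vCPU's mmap'ed struct kvm_run. */
  static int run_with_msr_deflection(int vm_fd, int vcpu_fd, struct kvm_run *run)
  {
          struct kvm_enable_cap cap = {
                  .cap = KVM_CAP_X86_USER_SPACE_MSR,
                  .args[0] = KVM_MSR_EXIT_REASON_UNKNOWN,
          };

          if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap))
                  return -1;

          while (ioctl(vcpu_fd, KVM_RUN, 0) == 0) {
                  switch (run->exit_reason) {
                  case KVM_EXIT_X86_RDMSR:
                          run->msr.data = 0;      /* value we choose to emulate */
                          run->msr.error = 0;     /* setting 1 injects a #GP instead */
                          break;
                  case KVM_EXIT_X86_WRMSR:
                          run->msr.error = 0;     /* silently swallow the write */
                          break;
                  default:
                          return 0;               /* other exits handled elsewhere */
                  }
          }
          return -1;
  }

Re-entering with KVM_RUN after filling in "data"/"error" is what completes the emulated instruction, matching the note in the api.rst hunk below.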
Signed-off-by: Alexander Graf Reviewed-by: Jim Mattson Message-Id: <20200925143422.21718-3-graf@amazon.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 85 +++++++++++++++++++++++++--- arch/x86/include/asm/kvm_host.h | 3 + arch/x86/kvm/emulate.c | 18 +++++- arch/x86/kvm/x86.c | 120 ++++++++++++++++++++++++++++++++++++++-- include/trace/events/kvm.h | 2 +- include/uapi/linux/kvm.h | 13 +++++ 6 files changed, 226 insertions(+), 15 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 9e2a545d8084..4fdba43d83e8 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -4872,14 +4872,13 @@ to the byte array. .. note:: - For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR and - KVM_EXIT_EPR the corresponding - -operations are complete (and guest state is consistent) only after userspace -has re-entered the kernel with KVM_RUN. The kernel side will first finish -incomplete operations and then check for pending signals. Userspace -can re-enter the guest with an unmasked signal pending to complete -pending operations. + For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, + KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding + operations are complete (and guest state is consistent) only after userspace + has re-entered the kernel with KVM_RUN. The kernel side will first finish + incomplete operations and then check for pending signals. Userspace + can re-enter the guest with an unmasked signal pending to complete + pending operations. :: @@ -5166,6 +5165,43 @@ Note that KVM does not skip the faulting instruction as it does for KVM_EXIT_MMIO, but userspace has to emulate any change to the processing state if it decides to decode and emulate the instruction. +:: + + /* KVM_EXIT_X86_RDMSR / KVM_EXIT_X86_WRMSR */ + struct { + __u8 error; /* user -> kernel */ + __u8 pad[7]; + __u32 reason; /* kernel -> user */ + __u32 index; /* kernel -> user */ + __u64 data; /* kernel <-> user */ + } msr; + +Used on x86 systems. When the VM capability KVM_CAP_X86_USER_SPACE_MSR is +enabled, MSR accesses to registers that would invoke a #GP by KVM kernel code +will instead trigger a KVM_EXIT_X86_RDMSR exit for reads and KVM_EXIT_X86_WRMSR +exit for writes. + +The "reason" field specifies why the MSR trap occurred. User space will only +receive MSR exit traps when a particular reason was requested through +ENABLE_CAP. Currently valid exit reasons are: + + KVM_MSR_EXIT_REASON_UNKNOWN - access to MSR that is unknown to KVM + KVM_MSR_EXIT_REASON_INVAL - access to invalid MSRs or reserved bits + +For KVM_EXIT_X86_RDMSR, the "index" field tells user space which MSR the guest +wants to read. To respond to this request with a successful read, user space +writes the respective data into the "data" field and must continue guest +execution to ensure the read data is transferred into guest register state. + +If the RDMSR request was unsuccessful, user space indicates that with a "1" in +the "error" field. This will inject a #GP into the guest when the VCPU is +executed again. + +For KVM_EXIT_X86_WRMSR, the "index" field tells user space which MSR the guest +wants to write. Once finished processing the event, user space must continue +vCPU execution. If the MSR write was unsuccessful, user space also sets the +"error" field to "1". + :: /* Fix the size of the union. */ @@ -5855,6 +5891,28 @@ controlled by the kvm module parameter halt_poll_ns.
This capability allows the maximum halt time to be specified on a per-VM basis, effectively overriding the module parameter for the target VM. +7.21 KVM_CAP_X86_USER_SPACE_MSR +------------------------------- + +:Architectures: x86 +:Target: VM +:Parameters: args[0] contains the mask of KVM_MSR_EXIT_REASON_* events to report +:Returns: 0 on success; -1 on error + +This capability enables trapping of #GP-invoking RDMSR and WRMSR instructions +into user space. + +When a guest requests to read or write an MSR, KVM may not implement all MSRs +that are relevant to a respective system. It also does not differentiate by +CPU type. + +To allow more fine-grained control over MSR handling, user space may enable +this capability. With it enabled, MSR accesses that match the mask specified in +args[0] and trigger a #GP event inside the guest by KVM will instead trigger +KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR exit notifications which user space +can then handle to implement model-specific MSR handling and/or user notifications +to inform a user that an MSR was not handled. + 8. Other capabilities. ====================== @@ -6196,3 +6254,14 @@ distribution...) If this capability is available, then the CPNC and CPVC can be synchronized between KVM and userspace via the sync regs mechanism (KVM_SYNC_DIAG318). + +8.26 KVM_CAP_X86_USER_SPACE_MSR +------------------------------- + +:Architectures: x86 + +This capability indicates that KVM supports deflection of MSR reads and +writes to user space. It can be enabled on a VM level. If enabled, MSR +accesses that would usually trigger a #GP by KVM into the guest will +instead get bounced to user space through the KVM_EXIT_X86_RDMSR and +KVM_EXIT_X86_WRMSR exit notifications. diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5d4c39c37390..dd2665504dc0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -961,6 +961,9 @@ struct kvm_arch { bool guest_can_read_msr_platform_info; bool exception_payload_enabled; + /* Deflect RDMSR and WRMSR to user space when they trigger a #GP */ + u32 user_space_msr_mask; + struct kvm_pmu_event_filter *pmu_event_filter; struct task_struct *nx_lpage_recovery_thread; }; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 85111cd0adcd..0cc0db500f71 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3701,11 +3701,18 @@ static int em_dr_write(struct x86_emulate_ctxt *ctxt) static int em_wrmsr(struct x86_emulate_ctxt *ctxt) { + u64 msr_index = reg_read(ctxt, VCPU_REGS_RCX); u64 msr_data; + int r; msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX) | ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32); - if (ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data)) + r = ctxt->ops->set_msr(ctxt, msr_index, msr_data); + + if (r == X86EMUL_IO_NEEDED) + return r; + + if (r) return emulate_gp(ctxt, 0); return X86EMUL_CONTINUE; @@ -3713,9 +3720,16 @@ static int em_wrmsr(struct x86_emulate_ctxt *ctxt) static int em_rdmsr(struct x86_emulate_ctxt *ctxt) { + u64 msr_index = reg_read(ctxt, VCPU_REGS_RCX); u64 msr_data; + int r; + + r = ctxt->ops->get_msr(ctxt, msr_index, &msr_data); + + if (r == X86EMUL_IO_NEEDED) + return r; - if (ctxt->ops->get_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &msr_data)) + if (r) return emulate_gp(ctxt, 0); *reg_write(ctxt, VCPU_REGS_RAX) = (u32)msr_data; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 43173382f02f..af6d008145cd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1590,12 +1590,89 @@ int
kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) } EXPORT_SYMBOL_GPL(kvm_set_msr); +static int complete_emulated_msr(struct kvm_vcpu *vcpu, bool is_read) +{ + if (vcpu->run->msr.error) { + kvm_inject_gp(vcpu, 0); + return 1; + } else if (is_read) { + kvm_rax_write(vcpu, (u32)vcpu->run->msr.data); + kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32); + } + + return kvm_skip_emulated_instruction(vcpu); +} + +static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu) +{ + return complete_emulated_msr(vcpu, true); +} + +static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu) +{ + return complete_emulated_msr(vcpu, false); +} + +static u64 kvm_msr_reason(int r) +{ + switch (r) { + case -ENOENT: + return KVM_MSR_EXIT_REASON_UNKNOWN; + default: + return KVM_MSR_EXIT_REASON_INVAL; + } +} + +static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index, + u32 exit_reason, u64 data, + int (*completion)(struct kvm_vcpu *vcpu), + int r) +{ + u64 msr_reason = kvm_msr_reason(r); + + /* Check if the user wanted to know about this MSR fault */ + if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason)) + return 0; + + vcpu->run->exit_reason = exit_reason; + vcpu->run->msr.error = 0; + memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad)); + vcpu->run->msr.reason = msr_reason; + vcpu->run->msr.index = index; + vcpu->run->msr.data = data; + vcpu->arch.complete_userspace_io = completion; + + return 1; +} + +static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r) +{ + return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0, + complete_emulated_rdmsr, r); +} + +static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 data, int r) +{ + return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data, + complete_emulated_wrmsr, r); +} + int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) { u32 ecx = kvm_rcx_read(vcpu); u64 data; + int r; + + r = kvm_get_msr(vcpu, ecx, &data); - if (kvm_get_msr(vcpu, ecx, &data)) { + /* MSR read failed? See if we should ask user space */ + if (r && kvm_get_msr_user_space(vcpu, ecx, r)) { + /* Bounce to user space */ + return 0; + } + + /* MSR read failed? Inject a #GP */ + if (r) { trace_kvm_msr_read_ex(ecx); kvm_inject_gp(vcpu, 0); return 1; @@ -1613,8 +1690,18 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) { u32 ecx = kvm_rcx_read(vcpu); u64 data = kvm_read_edx_eax(vcpu); + int r; - if (kvm_set_msr(vcpu, ecx, data)) { + r = kvm_set_msr(vcpu, ecx, data); + + /* MSR write failed? See if we should ask user space */ + if (r && kvm_set_msr_user_space(vcpu, ecx, data, r)) { + /* Bounce to user space */ + return 0; + } + + /* MSR write failed? 
Inject a #GP */ + if (r) { trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(vcpu, 0); return 1; @@ -3526,6 +3613,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_EXCEPTION_PAYLOAD: case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_LAST_CPU: + case KVM_CAP_X86_USER_SPACE_MSR: r = 1; break; case KVM_CAP_SYNC_REGS: @@ -5046,6 +5134,10 @@ split_irqchip_unlock: kvm->arch.exception_payload_enabled = cap->args[0]; r = 0; break; + case KVM_CAP_X86_USER_SPACE_MSR: + kvm->arch.user_space_msr_mask = cap->args[0]; + r = 0; + break; default: r = -EINVAL; break; @@ -6378,13 +6470,33 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector, static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata) { - return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + int r; + + r = kvm_get_msr(vcpu, msr_index, pdata); + + if (r && kvm_get_msr_user_space(vcpu, msr_index, r)) { + /* Bounce to user space */ + return X86EMUL_IO_NEEDED; + } + + return r; } static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data) { - return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + int r; + + r = kvm_set_msr(vcpu, msr_index, data); + + if (r && kvm_set_msr_user_space(vcpu, msr_index, data, r)) { + /* Bounce to user space */ + return X86EMUL_IO_NEEDED; + } + + return r; } static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt) diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 9417a34aad08..26cfb0fa8e7e 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -17,7 +17,7 @@ ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI), ERSN(PAPR_HCALL), \ ERSN(S390_UCONTROL), ERSN(WATCHDOG), ERSN(S390_TSCH), ERSN(EPR),\ ERSN(SYSTEM_EVENT), ERSN(S390_STSI), ERSN(IOAPIC_EOI), \ - ERSN(HYPERV), ERSN(ARM_NISV) + ERSN(HYPERV), ERSN(ARM_NISV), ERSN(X86_RDMSR), ERSN(X86_WRMSR) TRACE_EVENT(kvm_userspace_exit, TP_PROTO(__u32 reason, int errno), diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 7d8eced6f459..31292a3cdfc2 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -248,6 +248,8 @@ struct kvm_hyperv_exit { #define KVM_EXIT_IOAPIC_EOI 26 #define KVM_EXIT_HYPERV 27 #define KVM_EXIT_ARM_NISV 28 +#define KVM_EXIT_X86_RDMSR 29 +#define KVM_EXIT_X86_WRMSR 30 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -413,6 +415,16 @@ struct kvm_run { __u64 esr_iss; __u64 fault_ipa; } arm_nisv; + /* KVM_EXIT_X86_RDMSR / KVM_EXIT_X86_WRMSR */ + struct { + __u8 error; /* user -> kernel */ + __u8 pad[7]; +#define KVM_MSR_EXIT_REASON_INVAL (1 << 0) +#define KVM_MSR_EXIT_REASON_UNKNOWN (1 << 1) + __u32 reason; /* kernel -> user */ + __u32 index; /* kernel -> user */ + __u64 data; /* kernel <-> user */ + } msr; /* Fix the size of the union. */ char padding[256]; }; @@ -1037,6 +1049,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_SMALLER_MAXPHYADDR 185 #define KVM_CAP_S390_DIAG318 186 #define KVM_CAP_STEAL_TIME 187 +#define KVM_CAP_X86_USER_SPACE_MSR 188 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 1a155254ff937ac92cf9940d273ea597b2c667a2 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Fri, 25 Sep 2020 16:34:21 +0200 Subject: KVM: x86: Introduce MSR filtering It's not desirable to have all MSRs always handled by KVM kernel space.
Some MSRs would be useful to handle in user space to either emulate behavior (like uCode updates) or differentiate whether they are valid based on the CPU model. To allow user space to specify which MSRs it wants to see handled by KVM, this patch introduces a new ioctl to push filter rules with bitmaps into KVM. Based on these bitmaps, KVM can then decide whether to reject MSR access. With the addition of KVM_CAP_X86_USER_SPACE_MSR it can also deflect the denied MSR events to user space to operate on. If no filter is populated, MSR handling stays identical to before. Signed-off-by: Alexander Graf Message-Id: <20200925143422.21718-8-graf@amazon.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 108 ++++++++++++++++++++++++++++++ arch/x86/include/asm/kvm_host.h | 14 ++++ arch/x86/include/uapi/asm/kvm.h | 18 +++++ arch/x86/kvm/x86.c | 145 +++++++++++++++++++++++++++++++++++++++- include/uapi/linux/kvm.h | 5 ++ 5 files changed, 289 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 4fdba43d83e8..425325ff4434 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -4707,6 +4707,99 @@ KVM_PV_VM_VERIFY Verify the integrity of the unpacked image. Only if this succeeds, KVM is allowed to start protected VCPUs. +4.126 KVM_X86_SET_MSR_FILTER +---------------------------- + +:Capability: KVM_X86_SET_MSR_FILTER +:Architectures: x86 +:Type: vm ioctl +:Parameters: struct kvm_msr_filter +:Returns: 0 on success, < 0 on error + +:: + + struct kvm_msr_filter_range { + #define KVM_MSR_FILTER_READ (1 << 0) + #define KVM_MSR_FILTER_WRITE (1 << 1) + __u32 flags; + __u32 nmsrs; /* number of msrs in bitmap */ + __u32 base; /* MSR index the bitmap starts at */ + __u8 *bitmap; /* a 1 bit allows the operations in flags, 0 denies */ + }; + + #define KVM_MSR_FILTER_MAX_RANGES 16 + struct kvm_msr_filter { + #define KVM_MSR_FILTER_DEFAULT_ALLOW (0 << 0) + #define KVM_MSR_FILTER_DEFAULT_DENY (1 << 0) + __u32 flags; + struct kvm_msr_filter_range ranges[KVM_MSR_FILTER_MAX_RANGES]; + }; + +flags values for struct kvm_msr_filter_range: + +KVM_MSR_FILTER_READ + + Filter read accesses to MSRs using the given bitmap. A 0 in the bitmap + indicates that a read should immediately fail, while a 1 indicates that + a read for a particular MSR should be handled regardless of the default + filter action. + +KVM_MSR_FILTER_WRITE + + Filter write accesses to MSRs using the given bitmap. A 0 in the bitmap + indicates that a write should immediately fail, while a 1 indicates that + a write for a particular MSR should be handled regardless of the default + filter action. + +KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE + + Filter both read and write accesses to MSRs using the given bitmap. A 0 + in the bitmap indicates that both reads and writes should immediately fail, + while a 1 indicates that reads and writes for a particular MSR are not + filtered by this range. + +flags values for struct kvm_msr_filter: + +KVM_MSR_FILTER_DEFAULT_ALLOW + + If no filter range matches an MSR index that is getting accessed, KVM will + fall back to allowing access to the MSR. + +KVM_MSR_FILTER_DEFAULT_DENY + + If no filter range matches an MSR index that is getting accessed, KVM will + fall back to rejecting access to the MSR. In this mode, all MSRs that should + be processed by KVM need to explicitly be marked as allowed in the bitmaps. 
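The range flags plus the default policy can be mirrored in a few lines of C (an illustrative helper, not part of the patch; it uses struct kvm_msr_filter_range from the uapi definitions above and assumes <linux/kvm.h> is included):

  /* Returns 1 if `index` is covered by this range and allowed for `op`
   * (KVM_MSR_FILTER_READ or KVM_MSR_FILTER_WRITE), 0 if covered and
   * denied, and -1 if the range does not apply, in which case the
   * KVM_MSR_FILTER_DEFAULT_* policy decides. */
  static int range_check(const struct kvm_msr_filter_range *range,
                         __u32 index, __u32 op)
  {
          if (!(range->flags & op) ||
              index < range->base || index >= range->base + range->nmsrs)
                  return -1;
          /* a 1 bit allows the operations in flags, 0 denies */
          return !!(range->bitmap[(index - range->base) / 8] &
                    (1 << ((index - range->base) % 8)));
  }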
+ +This ioctl allows user space to define up to 16 bitmaps of MSR ranges to +specify whether a certain MSR access should be explicitly filtered for or not. + +If this ioctl has never been invoked, MSR accesses are not guarded and the +old KVM in-kernel emulation behavior is fully preserved. + +As soon as the filtering is in place, every MSR access is processed through +the filtering. If a bit is within one of the defined ranges, read and write +accesses are guarded by the bitmap's value for the MSR index. If it is not +defined in any range, whether MSR access is rejected is determined by the flags +field in the kvm_msr_filter struct: KVM_MSR_FILTER_DEFAULT_ALLOW and +KVM_MSR_FILTER_DEFAULT_DENY. + +Calling this ioctl with an empty set of ranges (all nmsrs == 0) disables MSR +filtering. In that mode, KVM_MSR_FILTER_DEFAULT_DENY no longer has any effect. + +Each bitmap range specifies a range of MSRs to potentially allow access to. +The range goes from MSR index [base .. base+nmsrs-1]. The flags field +indicates whether reads, writes or both reads and writes are filtered +by setting a 1 bit in the bitmap for the corresponding MSR index. + +If an MSR access is not permitted through the filtering, it generates a +#GP inside the guest. When combined with KVM_CAP_X86_USER_SPACE_MSR, that +allows user space to deflect and potentially handle various MSR accesses +into user space. + +If a vCPU is in running state while this ioctl is invoked, the vCPU may +experience inconsistent filtering behavior on MSR accesses. + 5. The kvm_run structure ======================== @@ -5187,6 +5280,7 @@ ENABLE_CAP. Currently valid exit reasons are: KVM_MSR_EXIT_REASON_UNKNOWN - access to MSR that is unknown to KVM KVM_MSR_EXIT_REASON_INVAL - access to invalid MSRs or reserved bits + KVM_MSR_EXIT_REASON_FILTER - access blocked by KVM_X86_SET_MSR_FILTER For KVM_EXIT_X86_RDMSR, the "index" field tells user space which MSR the guest wants to read. To respond to this request with a successful read, user space @@ -6265,3 +6359,17 @@ writes to user space. It can be enabled on a VM level. If enabled, MSR accesses that would usually trigger a #GP by KVM into the guest will instead get bounced to user space through the KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR exit notifications. + +8.25 KVM_X86_SET_MSR_FILTER +--------------------------- + +:Architectures: x86 + +This capability indicates that KVM supports that accesses to user-defined MSRs +may be rejected. With this capability exposed, KVM exports new VM ioctl +KVM_X86_SET_MSR_FILTER which user space can call to specify bitmaps of MSR +ranges that KVM should reject access to. + +In combination with KVM_CAP_X86_USER_SPACE_MSR, this allows user space to +trap and emulate MSRs that are outside of the scope of KVM as well as +limit the attack surface on KVM's MSR emulation code.
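Putting this together, a VMM could install a deny-by-default filter roughly as follows (a sketch against the uapi added here; the MSR chosen and the helper name are examples only, and the bitmap is sized to the 8 bytes the kernel copies for nmsrs <= 64):

  #include <linux/kvm.h>
  #include <string.h>
  #include <sys/ioctl.h>

  /* Deny all MSRs except one explicitly allowed index (MSR_EFER here,
   * purely as an example; a real guest needs a much larger allowlist). */
  static int install_filter(int vm_fd)
  {
          static __u8 bitmap[8] = { 0x01 };       /* bit 0: base MSR allowed */
          struct kvm_msr_filter filter;

          memset(&filter, 0, sizeof(filter));
          filter.flags = KVM_MSR_FILTER_DEFAULT_DENY;
          filter.ranges[0].flags = KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE;
          filter.ranges[0].base = 0xc0000080;     /* MSR_EFER */
          filter.ranges[0].nmsrs = 1;
          filter.ranges[0].bitmap = bitmap;

          return ioctl(vm_fd, KVM_X86_SET_MSR_FILTER, &filter);
  }

With KVM_CAP_X86_USER_SPACE_MSR also enabled for KVM_MSR_EXIT_REASON_FILTER, the denied accesses arrive in user space instead of raising a #GP, per the combination described above.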
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f4a2443219bc..dc7a58b39faf 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -87,6 +87,7 @@ #define KVM_REQ_HV_TLB_FLUSH \ KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_APF_READY KVM_ARCH_REQ(28) +#define KVM_REQ_MSR_FILTER_CHANGED KVM_ARCH_REQ(29) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ @@ -860,6 +861,13 @@ struct kvm_hv { struct kvm_hv_syndbg hv_syndbg; }; +struct msr_bitmap_range { + u32 flags; + u32 nmsrs; + u32 base; + unsigned long *bitmap; +}; + enum kvm_irqchip_mode { KVM_IRQCHIP_NONE, KVM_IRQCHIP_KERNEL, /* created with KVM_CREATE_IRQCHIP */ @@ -964,6 +972,12 @@ struct kvm_arch { /* Deflect RDMSR and WRMSR to user space when they trigger a #GP */ u32 user_space_msr_mask; + struct { + u8 count; + bool default_allow:1; + struct msr_bitmap_range ranges[16]; + } msr_filter; + struct kvm_pmu_event_filter *pmu_event_filter; struct task_struct *nx_lpage_recovery_thread; }; diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index c2fd0aa2f587..89e5f3d1bba8 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -192,8 +192,26 @@ struct kvm_msr_list { __u32 indices[0]; }; +/* Maximum size of any access bitmap in bytes */ +#define KVM_MSR_FILTER_MAX_BITMAP_SIZE 0x600 + +/* for KVM_X86_SET_MSR_FILTER */ +struct kvm_msr_filter_range { #define KVM_MSR_FILTER_READ (1 << 0) #define KVM_MSR_FILTER_WRITE (1 << 1) + __u32 flags; + __u32 nmsrs; /* number of msrs in bitmap */ + __u32 base; /* MSR index the bitmap starts at */ + __u8 *bitmap; /* a 1 bit allows the operations in flags, 0 denies */ +}; + +#define KVM_MSR_FILTER_MAX_RANGES 16 +struct kvm_msr_filter { +#define KVM_MSR_FILTER_DEFAULT_ALLOW (0 << 0) +#define KVM_MSR_FILTER_DEFAULT_DENY (1 << 0) + __u32 flags; + struct kvm_msr_filter_range ranges[KVM_MSR_FILTER_MAX_RANGES]; +}; struct kvm_cpuid_entry { __u32 function; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 60219882fee2..72f91f3640f3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1490,7 +1490,35 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type) { - return true; + struct kvm *kvm = vcpu->kvm; + struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges; + u32 count = kvm->arch.msr_filter.count; + u32 i; + bool r = kvm->arch.msr_filter.default_allow; + int idx; + + /* MSR filtering not set up, allow everything */ + if (!count) + return true; + + /* Prevent collision with set_msr_filter */ + idx = srcu_read_lock(&kvm->srcu); + + for (i = 0; i < count; i++) { + u32 start = ranges[i].base; + u32 end = start + ranges[i].nmsrs; + u32 flags = ranges[i].flags; + unsigned long *bitmap = ranges[i].bitmap; + + if ((index >= start) && (index < end) && (flags & type)) { + r = !!test_bit(index - start, bitmap); + break; + } + } + + srcu_read_unlock(&kvm->srcu, idx); + + return r; } EXPORT_SYMBOL_GPL(kvm_msr_allowed); @@ -1505,6 +1533,9 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, { struct msr_data msr; + if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) + return -EPERM; + switch (index) { case MSR_FS_BASE: case MSR_GS_BASE: @@ -1561,6 +1592,9 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, struct msr_data msr; int ret; + if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) 
+ return -EPERM; + msr.index = index; msr.host_initiated = host_initiated; @@ -1624,6 +1658,8 @@ static u64 kvm_msr_reason(int r) switch (r) { case -ENOENT: return KVM_MSR_EXIT_REASON_UNKNOWN; + case -EPERM: + return KVM_MSR_EXIT_REASON_FILTER; default: return KVM_MSR_EXIT_REASON_INVAL; } @@ -3620,6 +3656,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_LAST_CPU: case KVM_CAP_X86_USER_SPACE_MSR: + case KVM_CAP_X86_MSR_FILTER: r = 1; break; case KVM_CAP_SYNC_REGS: @@ -5151,6 +5188,103 @@ split_irqchip_unlock: return r; } +static void kvm_clear_msr_filter(struct kvm *kvm) +{ + u32 i; + u32 count = kvm->arch.msr_filter.count; + struct msr_bitmap_range ranges[16]; + + mutex_lock(&kvm->lock); + kvm->arch.msr_filter.count = 0; + memcpy(ranges, kvm->arch.msr_filter.ranges, count * sizeof(ranges[0])); + mutex_unlock(&kvm->lock); + synchronize_srcu(&kvm->srcu); + + for (i = 0; i < count; i++) + kfree(ranges[i].bitmap); +} + +static int kvm_add_msr_filter(struct kvm *kvm, struct kvm_msr_filter_range *user_range) +{ + struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges; + struct msr_bitmap_range range; + unsigned long *bitmap = NULL; + size_t bitmap_size; + int r; + + if (!user_range->nmsrs) + return 0; + + bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long); + if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE) + return -EINVAL; + + bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size); + if (IS_ERR(bitmap)) + return PTR_ERR(bitmap); + + range = (struct msr_bitmap_range) { + .flags = user_range->flags, + .base = user_range->base, + .nmsrs = user_range->nmsrs, + .bitmap = bitmap, + }; + + if (range.flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) { + r = -EINVAL; + goto err; + } + + if (!range.flags) { + r = -EINVAL; + goto err; + } + + /* Everything ok, add this range identifier to our global pool */ + ranges[kvm->arch.msr_filter.count] = range; + /* Make sure we filled the array before we tell anyone to walk it */ + smp_wmb(); + kvm->arch.msr_filter.count++; + + return 0; +err: + kfree(bitmap); + return r; +} + +static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) +{ + struct kvm_msr_filter __user *user_msr_filter = argp; + struct kvm_msr_filter filter; + bool default_allow; + int r = 0; + u32 i; + + if (copy_from_user(&filter, user_msr_filter, sizeof(filter))) + return -EFAULT; + + kvm_clear_msr_filter(kvm); + + default_allow = !(filter.flags & KVM_MSR_FILTER_DEFAULT_DENY); + kvm->arch.msr_filter.default_allow = default_allow; + + /* + * Protect from concurrent calls to this function that could trigger + * a TOCTOU violation on kvm->arch.msr_filter.count. 
+ */ + mutex_lock(&kvm->lock); + for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) { + r = kvm_add_msr_filter(kvm, &filter.ranges[i]); + if (r) + break; + } + + kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED); + mutex_unlock(&kvm->lock); + + return r; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -5457,6 +5591,9 @@ set_pit2_out: case KVM_SET_PMU_EVENT_FILTER: r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp); break; + case KVM_X86_SET_MSR_FILTER: + r = kvm_vm_ioctl_set_msr_filter(kvm, argp); + break; default: r = -ENOTTY; } @@ -8611,6 +8748,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_vcpu_update_apicv(vcpu); if (kvm_check_request(KVM_REQ_APF_READY, vcpu)) kvm_check_async_pf_completion(vcpu); + if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu)) + kvm_x86_ops.msr_filter_changed(vcpu); } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { @@ -10163,6 +10302,8 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { + u32 i; + if (current->mm == kvm->mm) { /* * Free memory regions allocated on behalf of userspace, @@ -10179,6 +10320,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm) } if (kvm_x86_ops.vm_destroy) kvm_x86_ops.vm_destroy(kvm); + for (i = 0; i < kvm->arch.msr_filter.count; i++) + kfree(kvm->arch.msr_filter.ranges[i].bitmap); kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); kvm_free_vcpus(kvm); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 31292a3cdfc2..58f43aa1fc21 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -421,6 +421,7 @@ struct kvm_run { __u8 pad[7]; #define KVM_MSR_EXIT_REASON_INVAL (1 << 0) #define KVM_MSR_EXIT_REASON_UNKNOWN (1 << 1) +#define KVM_MSR_EXIT_REASON_FILTER (1 << 2) __u32 reason; /* kernel -> user */ __u32 index; /* kernel -> user */ __u64 data; /* kernel <-> user */ @@ -1050,6 +1051,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_S390_DIAG318 186 #define KVM_CAP_STEAL_TIME 187 #define KVM_CAP_X86_USER_SPACE_MSR 188 +#define KVM_CAP_X86_MSR_FILTER 189 #ifdef KVM_CAP_IRQ_ROUTING @@ -1551,6 +1553,9 @@ struct kvm_pv_cmd { /* Available with KVM_CAP_S390_PROTECTED */ #define KVM_S390_PV_COMMAND _IOWR(KVMIO, 0xc5, struct kvm_pv_cmd) +/* Available with KVM_CAP_X86_MSR_FILTER */ +#define KVM_X86_SET_MSR_FILTER _IOW(KVMIO, 0xc6, struct kvm_msr_filter) + /* Secure Encrypted Virtualization command */ enum sev_cmd_id { /* Guest initialization commands */ -- cgit v1.2.3 From 58ef7c1b555e0e605da24b76cb2821dd3fcd6bc6 Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Mon, 21 Sep 2020 19:28:16 -0700 Subject: nl80211: include frequency offset in survey info Recently channels gained a potential frequency offset, so include this in the per-channel survey info. 
Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200922022818.15855-16-thomas@adapt-ip.com [add the offset only if non-zero] Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 2 ++ net/wireless/nl80211.c | 5 +++++ 2 files changed, 7 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 05db40b4c56f..1e51445f81cd 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4097,6 +4097,7 @@ enum nl80211_user_reg_hint_type { * receiving frames destined to the local BSS * @NL80211_SURVEY_INFO_MAX: highest survey info attribute number * currently defined + * @NL80211_SURVEY_INFO_FREQUENCY_OFFSET: center frequency offset in KHz * @__NL80211_SURVEY_INFO_AFTER_LAST: internal use */ enum nl80211_survey_info { @@ -4112,6 +4113,7 @@ enum nl80211_survey_info { NL80211_SURVEY_INFO_TIME_SCAN, NL80211_SURVEY_INFO_PAD, NL80211_SURVEY_INFO_TIME_BSS_RX, + NL80211_SURVEY_INFO_FREQUENCY_OFFSET, /* keep last */ __NL80211_SURVEY_INFO_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d31451db5407..aece2352a349 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -9319,6 +9319,11 @@ static int nl80211_send_survey(struct sk_buff *msg, u32 portid, u32 seq, survey->channel->center_freq)) goto nla_put_failure; + if (survey->channel && survey->channel->freq_offset && + nla_put_u32(msg, NL80211_SURVEY_INFO_FREQUENCY_OFFSET, + survey->channel->freq_offset)) + goto nla_put_failure; + if ((survey->filled & SURVEY_INFO_NOISE_DBM) && nla_put_u8(msg, NL80211_SURVEY_INFO_NOISE, survey->noise)) goto nla_put_failure; -- cgit v1.2.3 From f5bec330e3010450daeb5cb6a94a4a7c54afa306 Mon Sep 17 00:00:00 2001 From: Rajkumar Manoharan Date: Mon, 28 Sep 2020 00:28:11 -0700 Subject: nl80211: extend support to config spatial reuse parameter set Allow the user to configure below Spatial Reuse Parameter Set element. * Non-SRG OBSS PD Max Offset * SRG BSS Color Bitmap * SRG Partial BSSID Bitmap Signed-off-by: Rajkumar Manoharan Link: https://lore.kernel.org/r/1601278091-20313-2-git-send-email-rmanohar@codeaurora.org Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 7 +++++-- include/net/cfg80211.h | 10 ++++++++++ include/uapi/linux/nl80211.h | 11 +++++++++++ net/wireless/nl80211.c | 25 +++++++++++++++++++++++++ 4 files changed, 51 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index f2f56b287aed..770408b2fdaf 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2350,8 +2350,11 @@ ieee80211_he_6ghz_oper(const struct ieee80211_he_operation *he_oper) } /* HE Spatial Reuse defines */ -#define IEEE80211_HE_SPR_NON_SRG_OFFSET_PRESENT 0x4 -#define IEEE80211_HE_SPR_SRG_INFORMATION_PRESENT 0x8 +#define IEEE80211_HE_SPR_PSR_DISALLOWED BIT(0) +#define IEEE80211_HE_SPR_NON_SRG_OBSS_PD_SR_DISALLOWED BIT(1) +#define IEEE80211_HE_SPR_NON_SRG_OFFSET_PRESENT BIT(2) +#define IEEE80211_HE_SPR_SRG_INFORMATION_PRESENT BIT(3) +#define IEEE80211_HE_SPR_HESIGA_SR_VAL15_ALLOWED BIT(4) /* * ieee80211_he_spr_size - calculate 802.11ax HE Spatial Reuse IE size diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index bead4b9afeca..aee47f2b5709 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -269,13 +269,23 @@ struct ieee80211_rate { * struct ieee80211_he_obss_pd - AP settings for spatial reuse * * @enable: is the feature enabled. 
+ * @sr_ctrl: The SR Control field of SRP element. + * @non_srg_max_offset: non-SRG maximum tx power offset * @min_offset: minimal tx power offset an associated station shall use * @max_offset: maximum tx power offset an associated station shall use + * @bss_color_bitmap: bitmap that indicates the BSS color values used by + * members of the SRG + * @partial_bssid_bitmap: bitmap that indicates the partial BSSID values + * used by members of the SRG */ struct ieee80211_he_obss_pd { bool enable; + u8 sr_ctrl; + u8 non_srg_max_offset; u8 min_offset; u8 max_offset; + u8 bss_color_bitmap[8]; + u8 partial_bssid_bitmap[8]; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 1e51445f81cd..47700a2b9af9 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -6991,6 +6991,13 @@ enum nl80211_peer_measurement_ftm_resp { * * @NL80211_HE_OBSS_PD_ATTR_MIN_OFFSET: the OBSS PD minimum tx power offset. * @NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET: the OBSS PD maximum tx power offset. + * @NL80211_HE_OBSS_PD_ATTR_NON_SRG_MAX_OFFSET: the non-SRG OBSS PD maximum + * tx power offset. + * @NL80211_HE_OBSS_PD_ATTR_BSS_COLOR_BITMAP: bitmap that indicates the BSS color + * values used by members of the SRG. + * @NL80211_HE_OBSS_PD_ATTR_PARTIAL_BSSID_BITMAP: bitmap that indicates the partial + * BSSID values used by members of the SRG. + * @NL80211_HE_OBSS_PD_ATTR_SR_CTRL: The SR Control field of SRP element. * * @__NL80211_HE_OBSS_PD_ATTR_LAST: Internal * @NL80211_HE_OBSS_PD_ATTR_MAX: highest OBSS PD attribute. @@ -7000,6 +7007,10 @@ enum nl80211_obss_pd_attributes { NL80211_HE_OBSS_PD_ATTR_MIN_OFFSET, NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET, + NL80211_HE_OBSS_PD_ATTR_NON_SRG_MAX_OFFSET, + NL80211_HE_OBSS_PD_ATTR_BSS_COLOR_BITMAP, + NL80211_HE_OBSS_PD_ATTR_PARTIAL_BSSID_BITMAP, + NL80211_HE_OBSS_PD_ATTR_SR_CTRL, /* keep last */ __NL80211_HE_OBSS_PD_ATTR_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index e501bce86436..d76b8bd0e1d1 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -329,6 +329,13 @@ he_obss_pd_policy[NL80211_HE_OBSS_PD_ATTR_MAX + 1] = { NLA_POLICY_RANGE(NLA_U8, 1, 20), [NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET] = NLA_POLICY_RANGE(NLA_U8, 1, 20), + [NL80211_HE_OBSS_PD_ATTR_NON_SRG_MAX_OFFSET] = + NLA_POLICY_RANGE(NLA_U8, 1, 20), + [NL80211_HE_OBSS_PD_ATTR_BSS_COLOR_BITMAP] = + NLA_POLICY_EXACT_LEN(8), + [NL80211_HE_OBSS_PD_ATTR_PARTIAL_BSSID_BITMAP] = + NLA_POLICY_EXACT_LEN(8), + [NL80211_HE_OBSS_PD_ATTR_SR_CTRL] = { .type = NLA_U8 }, }; static const struct nla_policy @@ -4857,16 +4864,34 @@ static int nl80211_parse_he_obss_pd(struct nlattr *attrs, if (err) return err; + if (!tb[NL80211_HE_OBSS_PD_ATTR_SR_CTRL]) + return -EINVAL; + + he_obss_pd->sr_ctrl = nla_get_u8(tb[NL80211_HE_OBSS_PD_ATTR_SR_CTRL]); + if (tb[NL80211_HE_OBSS_PD_ATTR_MIN_OFFSET]) he_obss_pd->min_offset = nla_get_u8(tb[NL80211_HE_OBSS_PD_ATTR_MIN_OFFSET]); if (tb[NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET]) he_obss_pd->max_offset = nla_get_u8(tb[NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET]); + if (tb[NL80211_HE_OBSS_PD_ATTR_NON_SRG_MAX_OFFSET]) + he_obss_pd->non_srg_max_offset = + nla_get_u8(tb[NL80211_HE_OBSS_PD_ATTR_NON_SRG_MAX_OFFSET]); if (he_obss_pd->min_offset > he_obss_pd->max_offset) return -EINVAL; + if (tb[NL80211_HE_OBSS_PD_ATTR_BSS_COLOR_BITMAP]) + memcpy(he_obss_pd->bss_color_bitmap, + nla_data(tb[NL80211_HE_OBSS_PD_ATTR_BSS_COLOR_BITMAP]), + sizeof(he_obss_pd->bss_color_bitmap)); + + if (tb[NL80211_HE_OBSS_PD_ATTR_PARTIAL_BSSID_BITMAP]) + 
memcpy(he_obss_pd->partial_bssid_bitmap, + nla_data(tb[NL80211_HE_OBSS_PD_ATTR_PARTIAL_BSSID_BITMAP]), + sizeof(he_obss_pd->partial_bssid_bitmap)); + he_obss_pd->enable = true; return 0; -- cgit v1.2.3 From 1b4d60ec162f82ea29a2e7a907b5c6cc9f926321 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 25 Sep 2020 13:54:29 -0700 Subject: bpf: Enable BPF_PROG_TEST_RUN for raw_tracepoint Add .test_run for raw_tracepoint. Also, introduce a new feature that runs the target program on a specific CPU. This is achieved by a new flag in bpf_attr.test, BPF_F_TEST_RUN_ON_CPU. When this flag is set, the program is triggered on cpu with id bpf_attr.test.cpu. This feature is needed for BPF programs that handle perf_event and other percpu resources, as the program can access these resource locally. Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200925205432.1777-2-songliubraving@fb.com --- include/linux/bpf.h | 3 ++ include/uapi/linux/bpf.h | 7 ++++ kernel/bpf/syscall.c | 2 +- kernel/trace/bpf_trace.c | 1 + net/bpf/test_run.c | 91 ++++++++++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 7 ++++ 6 files changed, 110 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 79902325bef8..db6dcdee7933 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1396,6 +1396,9 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog, int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); +int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr); bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2d6519a2ed77..82522f05c021 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -424,6 +424,11 @@ enum { */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) +/* Flags for BPF_PROG_TEST_RUN */ + +/* If set, run the test on the cpu specified by bpf_attr.test.cpu */ +#define BPF_F_TEST_RUN_ON_CPU (1U << 0) + /* type for BPF_ENABLE_STATS */ enum bpf_stats_type { /* enabled run_time_ns and run_cnt */ @@ -566,6 +571,8 @@ union bpf_attr { */ __aligned_u64 ctx_in; __aligned_u64 ctx_out; + __u32 flags; + __u32 cpu; } test; struct { /* anonymous struct used by BPF_*_GET_*_ID */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2740df19f55e..3bc2ed2e171b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2979,7 +2979,7 @@ static int bpf_prog_query(const union bpf_attr *attr, } } -#define BPF_PROG_TEST_RUN_LAST_FIELD test.ctx_out +#define BPF_PROG_TEST_RUN_LAST_FIELD test.cpu static int bpf_prog_test_run(const union bpf_attr *attr, union bpf_attr __user *uattr) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 36508f46a8db..2834866d379a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1678,6 +1678,7 @@ const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { }; const struct bpf_prog_ops raw_tracepoint_prog_ops = { + .test_run = bpf_prog_test_run_raw_tp, }; const struct bpf_verifier_ops tracing_verifier_ops = { diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index a66f211726e7..fde5db93507c 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -11,6 +11,7 @@ #include #include #include +#include 
#define CREATE_TRACE_POINTS #include @@ -204,6 +205,9 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog, int b = 2, err = -EFAULT; u32 retval = 0; + if (kattr->test.flags || kattr->test.cpu) + return -EINVAL; + switch (prog->expected_attach_type) { case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: @@ -236,6 +240,87 @@ out: return err; } +struct bpf_raw_tp_test_run_info { + struct bpf_prog *prog; + void *ctx; + u32 retval; +}; + +static void +__bpf_prog_test_run_raw_tp(void *data) +{ + struct bpf_raw_tp_test_run_info *info = data; + + rcu_read_lock(); + migrate_disable(); + info->retval = BPF_PROG_RUN(info->prog, info->ctx); + migrate_enable(); + rcu_read_unlock(); +} + +int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + void __user *ctx_in = u64_to_user_ptr(kattr->test.ctx_in); + __u32 ctx_size_in = kattr->test.ctx_size_in; + struct bpf_raw_tp_test_run_info info; + int cpu = kattr->test.cpu, err = 0; + + /* doesn't support data_in/out, ctx_out, duration, or repeat */ + if (kattr->test.data_in || kattr->test.data_out || + kattr->test.ctx_out || kattr->test.duration || + kattr->test.repeat) + return -EINVAL; + + if (ctx_size_in < prog->aux->max_ctx_offset) + return -EINVAL; + + if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 && cpu != 0) + return -EINVAL; + + if (ctx_size_in) { + info.ctx = kzalloc(ctx_size_in, GFP_USER); + if (!info.ctx) + return -ENOMEM; + if (copy_from_user(info.ctx, ctx_in, ctx_size_in)) { + err = -EFAULT; + goto out; + } + } else { + info.ctx = NULL; + } + + info.prog = prog; + + if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 || + cpu == smp_processor_id()) { + __bpf_prog_test_run_raw_tp(&info); + } else { + /* smp_call_function_single() also checks cpu_online() + * after csd_lock(). However, since cpu is from user + * space, let's do an extra quick check to filter out + * invalid value before smp_call_function_single(). 
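 *
 * A hypothetical userspace sketch of requesting a specific CPU via
 * the raw syscall (prog_fd is assumed; a zeroed attr leaves the
 * unsupported data_in/out, ctx_out, duration and repeat fields at 0):
 *
 *	union bpf_attr attr = {};
 *	int err;
 *
 *	attr.test.prog_fd = prog_fd;
 *	attr.test.flags = BPF_F_TEST_RUN_ON_CPU;
 *	attr.test.cpu = 2;
 *	err = syscall(__NR_bpf, BPF_PROG_TEST_RUN, &attr, sizeof(attr));
 *
 * An out-of-range or offline CPU fails with ENXIO, as checked just
 * below.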
+ */ + if (cpu >= nr_cpu_ids || !cpu_online(cpu)) { + err = -ENXIO; + goto out; + } + + err = smp_call_function_single(cpu, __bpf_prog_test_run_raw_tp, + &info, 1); + if (err) + goto out; + } + + if (copy_to_user(&uattr->test.retval, &info.retval, sizeof(u32))) + err = -EFAULT; + +out: + kfree(info.ctx); + return err; +} + static void *bpf_ctx_init(const union bpf_attr *kattr, u32 max_size) { void __user *data_in = u64_to_user_ptr(kattr->test.ctx_in); @@ -410,6 +495,9 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, void *data; int ret; + if (kattr->test.flags || kattr->test.cpu) + return -EINVAL; + data = bpf_test_init(kattr, size, NET_SKB_PAD + NET_IP_ALIGN, SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); if (IS_ERR(data)) @@ -607,6 +695,9 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, if (prog->type != BPF_PROG_TYPE_FLOW_DISSECTOR) return -EINVAL; + if (kattr->test.flags || kattr->test.cpu) + return -EINVAL; + if (size < ETH_HLEN) return -EINVAL; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2d6519a2ed77..82522f05c021 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -424,6 +424,11 @@ enum { */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) +/* Flags for BPF_PROG_TEST_RUN */ + +/* If set, run the test on the cpu specified by bpf_attr.test.cpu */ +#define BPF_F_TEST_RUN_ON_CPU (1U << 0) + /* type for BPF_ENABLE_STATS */ enum bpf_stats_type { /* enabled run_time_ns and run_cnt */ @@ -566,6 +571,8 @@ union bpf_attr { */ __aligned_u64 ctx_in; __aligned_u64 ctx_out; + __u32 flags; + __u32 cpu; } test; struct { /* anonymous struct used by BPF_*_GET_*_ID */ -- cgit v1.2.3 From c4d0bfb45068d853a478b9067a95969b1886a30f Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Mon, 28 Sep 2020 12:31:05 +0100 Subject: bpf: Add bpf_snprintf_btf helper A helper is added to support tracing kernel type information in BPF using the BPF Type Format (BTF). Its signature is long bpf_snprintf_btf(char *str, u32 str_size, struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags); struct btf_ptr * specifies - a pointer to the data to be traced - the BTF id of the type of data pointed to - a flags field is provided for future use; these flags are not to be confused with the BTF_F_* flags below that control how the btf_ptr is displayed; the flags member of the struct btf_ptr may be used to disambiguate types in kernel versus module BTF, etc; the main distinction is the flags relate to the type and information needed in identifying it; not how it is displayed. For example a BPF program with a struct sk_buff *skb could do the following: static struct btf_ptr b = { }; b.ptr = skb; b.type_id = __builtin_btf_type_id(struct sk_buff, 1); bpf_snprintf_btf(str, sizeof(str), &b, sizeof(b), 0, 0); Default output looks like this: (struct sk_buff){ .transport_header = (__u16)65535, .mac_header = (__u16)65535, .end = (sk_buff_data_t)192, .head = (unsigned char *)0x000000007524fd8b, .data = (unsigned char *)0x000000007524fd8b, .truesize = (unsigned int)768, .users = (refcount_t){ .refs = (atomic_t){ .counter = (int)1, }, }, } Flags modifying display are as follows: - BTF_F_COMPACT: no formatting around type information - BTF_F_NONAME: no struct/union member names/types - BTF_F_PTR_RAW: show raw (unobfuscated) pointer values; equivalent to %px. 
- BTF_F_ZERO: show zero-valued struct/union members; they are not displayed by default Signed-off-by: Alan Maguire Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1601292670-1616-4-git-send-email-alan.maguire@oracle.com --- include/linux/bpf.h | 1 + include/linux/btf.h | 9 +++--- include/uapi/linux/bpf.h | 67 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 4 +++ kernel/trace/bpf_trace.c | 65 ++++++++++++++++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 2 ++ tools/include/uapi/linux/bpf.h | 67 ++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 212 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e620a4b1290f..768b533ba48e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1822,6 +1822,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; extern const struct bpf_func_proto bpf_copy_from_user_proto; +extern const struct bpf_func_proto bpf_snprintf_btf_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/linux/btf.h b/include/linux/btf.h index d0f5d3c9ec3d..3e5cdc2ba963 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -6,6 +6,7 @@ #include #include +#include #define BTF_TYPE_EMIT(type) ((void)(type *)0) @@ -59,10 +60,10 @@ const struct btf_type *btf_type_id_size(const struct btf *btf, * - BTF_SHOW_UNSAFE: skip use of bpf_probe_read() to safely read * data before displaying it. */ -#define BTF_SHOW_COMPACT (1ULL << 0) -#define BTF_SHOW_NONAME (1ULL << 1) -#define BTF_SHOW_PTR_RAW (1ULL << 2) -#define BTF_SHOW_ZERO (1ULL << 3) +#define BTF_SHOW_COMPACT BTF_F_COMPACT +#define BTF_SHOW_NONAME BTF_F_NONAME +#define BTF_SHOW_PTR_RAW BTF_F_PTR_RAW +#define BTF_SHOW_ZERO BTF_F_ZERO #define BTF_SHOW_UNSAFE (1ULL << 4) void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 82522f05c021..cca9eb1b13e5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3594,6 +3594,42 @@ union bpf_attr { * the data in *dst*. This is a wrapper of **copy_from_user**\ (). * Return * 0 on success, or a negative error in case of failure. + * + * long bpf_snprintf_btf(char *str, u32 str_size, struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags) + * Description + * Use BTF to store a string representation of *ptr*->ptr in *str*, + * using *ptr*->type_id. This value should specify the type + * that *ptr*->ptr points to. LLVM __builtin_btf_type_id(type, 1) + * can be used to look up vmlinux BTF type ids. Traversing the + * data structure using BTF, the type information and values are + * stored in the first *str_size* - 1 bytes of *str*. Safe copy of + * the pointer data is carried out to avoid kernel crashes during + * operation. Smaller types can use string space on the stack; + * larger programs can use map data to store the string + * representation. + * + * The string can be subsequently shared with userspace via + * bpf_perf_event_output() or ring buffer interfaces. + * bpf_trace_printk() is to be avoided as it places too small + * a limit on string size to be useful. 
+ * + * *flags* is a combination of + * + * **BTF_F_COMPACT** + * no formatting around type information + * **BTF_F_NONAME** + * no struct/union member names/types + * **BTF_F_PTR_RAW** + * show raw (unobfuscated) pointer values; + * equivalent to printk specifier %px. + * **BTF_F_ZERO** + * show zero-valued struct/union members; they + * are not displayed by default + * + * Return + * The number of bytes that were written (or would have been + * written if output had to be truncated due to string size), + * or a negative error in cases of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3745,6 +3781,7 @@ union bpf_attr { FN(inode_storage_delete), \ FN(d_path), \ FN(copy_from_user), \ + FN(snprintf_btf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -4853,4 +4890,34 @@ struct bpf_sk_lookup { __u32 local_port; /* Host byte order */ }; +/* + * struct btf_ptr is used for typed pointer representation; the + * type id is used to render the pointer data as the appropriate type + * via the bpf_snprintf_btf() helper described above. A flags field - + * potentially to specify additional details about the BTF pointer + * (rather than its mode of display) - is included for future use. + * Display flags - BTF_F_* - are passed to bpf_snprintf_btf separately. + */ +struct btf_ptr { + void *ptr; + __u32 type_id; + __u32 flags; /* BTF ptr flags; unused at present. */ +}; + +/* + * Flags to control bpf_snprintf_btf() behaviour. + * - BTF_F_COMPACT: no formatting around type information + * - BTF_F_NONAME: no struct/union member names/types + * - BTF_F_PTR_RAW: show raw (unobfuscated) pointer values; + * equivalent to %px. + * - BTF_F_ZERO: show zero-valued struct/union members; they + * are not displayed by default + */ +enum { + BTF_F_COMPACT = (1ULL << 0), + BTF_F_NONAME = (1ULL << 1), + BTF_F_PTR_RAW = (1ULL << 2), + BTF_F_ZERO = (1ULL << 3), +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c4811b139caa..403fb2341a86 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2216,6 +2216,7 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_local_storage_proto __weak; const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak; +const struct bpf_func_proto bpf_snprintf_btf_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 5cc7425ee476..e825441781ab 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -683,6 +683,10 @@ bpf_base_func_proto(enum bpf_func_id func_id) if (!perfmon_capable()) return NULL; return bpf_get_trace_printk_proto(); + case BPF_FUNC_snprintf_btf: + if (!perfmon_capable()) + return NULL; + return &bpf_snprintf_btf_proto; case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; default: diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 2834866d379a..140e1be9dab6 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -16,6 +17,9 @@ #include #include +#include +#include + #include #include "trace_probe.h" @@ -1147,6 +1151,65 @@ static const struct bpf_func_proto bpf_d_path_proto = { .allowed = bpf_d_path_allowed, }; +#define BTF_F_ALL (BTF_F_COMPACT | BTF_F_NONAME | \ + BTF_F_PTR_RAW | BTF_F_ZERO) + +static int 
bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size, + u64 flags, const struct btf **btf, + s32 *btf_id) +{ + const struct btf_type *t; + + if (unlikely(flags & ~(BTF_F_ALL))) + return -EINVAL; + + if (btf_ptr_size != sizeof(struct btf_ptr)) + return -EINVAL; + + *btf = bpf_get_btf_vmlinux(); + + if (IS_ERR_OR_NULL(*btf)) + return PTR_ERR(*btf); + + if (ptr->type_id > 0) + *btf_id = ptr->type_id; + else + return -EINVAL; + + if (*btf_id > 0) + t = btf_type_by_id(*btf, *btf_id); + if (*btf_id <= 0 || !t) + return -ENOENT; + + return 0; +} + +BPF_CALL_5(bpf_snprintf_btf, char *, str, u32, str_size, struct btf_ptr *, ptr, + u32, btf_ptr_size, u64, flags) +{ + const struct btf *btf; + s32 btf_id; + int ret; + + ret = bpf_btf_printf_prepare(ptr, btf_ptr_size, flags, &btf, &btf_id); + if (ret) + return ret; + + return btf_type_snprintf_show(btf, btf_id, ptr->ptr, str, str_size, + flags); +} + +const struct bpf_func_proto bpf_snprintf_btf_proto = { + .func = bpf_snprintf_btf, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_CONST_SIZE, + .arg5_type = ARG_ANYTHING, +}; + const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1233,6 +1296,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_task_stack_proto; case BPF_FUNC_copy_from_user: return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL; + case BPF_FUNC_snprintf_btf: + return &bpf_snprintf_btf_proto; default: return NULL; } diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 08388173973f..7d86fdd190be 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -433,6 +433,7 @@ class PrinterHelpers(Printer): 'struct sk_msg_md', 'struct xdp_md', 'struct path', + 'struct btf_ptr', ] known_types = { '...', @@ -474,6 +475,7 @@ class PrinterHelpers(Printer): 'struct udp6_sock', 'struct task_struct', 'struct path', + 'struct btf_ptr', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 82522f05c021..cca9eb1b13e5 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3594,6 +3594,42 @@ union bpf_attr { * the data in *dst*. This is a wrapper of **copy_from_user**\ (). * Return * 0 on success, or a negative error in case of failure. + * + * long bpf_snprintf_btf(char *str, u32 str_size, struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags) + * Description + * Use BTF to store a string representation of *ptr*->ptr in *str*, + * using *ptr*->type_id. This value should specify the type + * that *ptr*->ptr points to. LLVM __builtin_btf_type_id(type, 1) + * can be used to look up vmlinux BTF type ids. Traversing the + * data structure using BTF, the type information and values are + * stored in the first *str_size* - 1 bytes of *str*. Safe copy of + * the pointer data is carried out to avoid kernel crashes during + * operation. Smaller types can use string space on the stack; + * larger programs can use map data to store the string + * representation. + * + * The string can be subsequently shared with userspace via + * bpf_perf_event_output() or ring buffer interfaces. + * bpf_trace_printk() is to be avoided as it places too small + * a limit on string size to be useful. 
+ * + * *flags* is a combination of + * + * **BTF_F_COMPACT** + * no formatting around type information + * **BTF_F_NONAME** + * no struct/union member names/types + * **BTF_F_PTR_RAW** + * show raw (unobfuscated) pointer values; + * equivalent to printk specifier %px. + * **BTF_F_ZERO** + * show zero-valued struct/union members; they + * are not displayed by default + * + * Return + * The number of bytes that were written (or would have been + * written if output had to be truncated due to string size), + * or a negative error in cases of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3745,6 +3781,7 @@ union bpf_attr { FN(inode_storage_delete), \ FN(d_path), \ FN(copy_from_user), \ + FN(snprintf_btf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -4853,4 +4890,34 @@ struct bpf_sk_lookup { __u32 local_port; /* Host byte order */ }; +/* + * struct btf_ptr is used for typed pointer representation; the + * type id is used to render the pointer data as the appropriate type + * via the bpf_snprintf_btf() helper described above. A flags field - + * potentially to specify additional details about the BTF pointer + * (rather than its mode of display) - is included for future use. + * Display flags - BTF_F_* - are passed to bpf_snprintf_btf separately. + */ +struct btf_ptr { + void *ptr; + __u32 type_id; + __u32 flags; /* BTF ptr flags; unused at present. */ +}; + +/* + * Flags to control bpf_snprintf_btf() behaviour. + * - BTF_F_COMPACT: no formatting around type information + * - BTF_F_NONAME: no struct/union member names/types + * - BTF_F_PTR_RAW: show raw (unobfuscated) pointer values; + * equivalent to %px. + * - BTF_F_ZERO: show zero-valued struct/union members; they + * are not displayed by default + */ +enum { + BTF_F_COMPACT = (1ULL << 0), + BTF_F_NONAME = (1ULL << 1), + BTF_F_PTR_RAW = (1ULL << 2), + BTF_F_ZERO = (1ULL << 3), +}; + #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3 From eb411377aed9e27835e77ee0710ee8f4649958f3 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Mon, 28 Sep 2020 12:31:09 +0100 Subject: bpf: Add bpf_seq_printf_btf helper A helper is added to allow seq file writing of kernel data structures using vmlinux BTF. Its signature is long bpf_seq_printf_btf(struct seq_file *m, struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags); Flags and struct btf_ptr definitions/use are identical to the bpf_snprintf_btf helper, and the helper returns 0 on success or a negative error value. Suggested-by: Alexei Starovoitov Signed-off-by: Alan Maguire Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1601292670-1616-8-git-send-email-alan.maguire@oracle.com --- include/linux/btf.h | 2 ++ include/uapi/linux/bpf.h | 9 +++++++++ kernel/bpf/btf.c | 4 ++-- kernel/bpf/core.c | 1 + kernel/trace/bpf_trace.c | 33 +++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 9 +++++++++ 6 files changed, 56 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index 3e5cdc2ba963..024e16ff7dcc 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -68,6 +68,8 @@ const struct btf_type *btf_type_id_size(const struct btf *btf, void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m); +int btf_type_seq_show_flags(const struct btf *btf, u32 type_id, void *obj, + struct seq_file *m, u64 flags); /* * Copy len bytes of string representation of obj of BTF type_id into buf. 
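To make the intended use concrete, here is a minimal hypothetical
task-iterator program; the SEC()/include conventions and the
__builtin_btf_type_id() idiom follow the bpf_snprintf_btf example above
and are assumptions rather than part of this patch (the helper is
gpl_only, hence the GPL license string):

	SEC("iter/task")
	int dump_task(struct bpf_iter__task *ctx)
	{
		struct seq_file *seq = ctx->meta->seq;
		struct btf_ptr p = {};

		if (!ctx->task)
			return 0;

		p.ptr = ctx->task;
		p.type_id = __builtin_btf_type_id(struct task_struct, 1);
		bpf_seq_printf_btf(seq, &p, sizeof(p), BTF_F_COMPACT);
		return 0;
	}

	char LICENSE[] SEC("license") = "GPL";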
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index cca9eb1b13e5..96ddb00b91dc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3630,6 +3630,14 @@ union bpf_attr { * The number of bytes that were written (or would have been * written if output had to be truncated due to string size), * or a negative error in cases of failure. + * + * long bpf_seq_printf_btf(struct seq_file *m, struct btf_ptr *ptr, u32 ptr_size, u64 flags) + * Description + * Use BTF to write to seq_write a string representation of + * *ptr*->ptr, using *ptr*->type_id as per bpf_snprintf_btf(). + * *flags* are identical to those used for bpf_snprintf_btf. + * Return + * 0 on success or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3782,6 +3790,7 @@ union bpf_attr { FN(d_path), \ FN(copy_from_user), \ FN(snprintf_btf), \ + FN(seq_printf_btf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index dcdd7109aa29..498e5e553825 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5346,8 +5346,8 @@ static void btf_seq_show(struct btf_show *show, const char *fmt, seq_vprintf((struct seq_file *)show->target, fmt, args); } -static int btf_type_seq_show_flags(const struct btf *btf, u32 type_id, - void *obj, struct seq_file *m, u64 flags) +int btf_type_seq_show_flags(const struct btf *btf, u32 type_id, + void *obj, struct seq_file *m, u64 flags) { struct btf_show sseq; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 403fb2341a86..c4ba45fa4fe1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2217,6 +2217,7 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_local_storage_proto __weak; const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_snprintf_btf_proto __weak; +const struct bpf_func_proto bpf_seq_printf_btf_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 140e1be9dab6..e118a83439c3 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -71,6 +71,10 @@ static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name) u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); +static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size, + u64 flags, const struct btf **btf, + s32 *btf_id); + /** * trace_call_bpf - invoke BPF program * @call: tracepoint event @@ -776,6 +780,31 @@ static const struct bpf_func_proto bpf_seq_write_proto = { .arg3_type = ARG_CONST_SIZE_OR_ZERO, }; +BPF_CALL_4(bpf_seq_printf_btf, struct seq_file *, m, struct btf_ptr *, ptr, + u32, btf_ptr_size, u64, flags) +{ + const struct btf *btf; + s32 btf_id; + int ret; + + ret = bpf_btf_printf_prepare(ptr, btf_ptr_size, flags, &btf, &btf_id); + if (ret) + return ret; + + return btf_type_seq_show_flags(btf, btf_id, ptr->ptr, m, flags); +} + +static const struct bpf_func_proto bpf_seq_printf_btf_proto = { + .func = bpf_seq_printf_btf, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_seq_file_ids[0], + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + static __always_inline int get_map_perf_counter(struct bpf_map *map, u64 flags, u64 *value, u64 *enabled, 
u64 *running) @@ -1695,6 +1724,10 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return prog->expected_attach_type == BPF_TRACE_ITER ? &bpf_seq_write_proto : NULL; + case BPF_FUNC_seq_printf_btf: + return prog->expected_attach_type == BPF_TRACE_ITER ? + &bpf_seq_printf_btf_proto : + NULL; case BPF_FUNC_d_path: return &bpf_d_path_proto; default: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index cca9eb1b13e5..96ddb00b91dc 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3630,6 +3630,14 @@ union bpf_attr { * The number of bytes that were written (or would have been * written if output had to be truncated due to string size), * or a negative error in cases of failure. + * + * long bpf_seq_printf_btf(struct seq_file *m, struct btf_ptr *ptr, u32 ptr_size, u64 flags) + * Description + * Use BTF to write to seq_write a string representation of + * *ptr*->ptr, using *ptr*->type_id as per bpf_snprintf_btf(). + * *flags* are identical to those used for bpf_snprintf_btf. + * Return + * 0 on success or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3782,6 +3790,7 @@ union bpf_attr { FN(d_path), \ FN(copy_from_user), \ FN(snprintf_btf), \ + FN(seq_printf_btf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 3789af9a13e5561738c0f2114e3a5e22c843ca3e Mon Sep 17 00:00:00 2001 From: Krzysztof Wilczyński Date: Thu, 30 Jul 2020 21:08:48 +0000 Subject: PCI/PM: Rename pci_dev.d3_delay to d3hot_delay MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PCI devices support two variants of the D3 power state: D3hot (main power present) D3cold (main power removed). Previously struct pci_dev contained: unsigned int d3_delay; /* D3->D0 transition time in ms */ unsigned int d3cold_delay; /* D3cold->D0 transition time in ms */ "d3_delay" refers specifically to the D3hot state. Rename it to "d3hot_delay" to avoid ambiguity and align with the ACPI "_DSM for Specifying Device Readiness Durations" in the PCI Firmware spec r3.2, sec 4.6.9. There is no change to the functionality. Link: https://lore.kernel.org/r/20200730210848.1578826-1-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas --- Documentation/power/pci.rst | 2 +- arch/x86/pci/fixup.c | 2 +- arch/x86/pci/intel_mid_pci.c | 2 +- drivers/hid/intel-ish-hid/ipc/ipc.c | 2 +- drivers/net/ethernet/marvell/sky2.c | 2 +- drivers/pci/pci-acpi.c | 6 +-- drivers/pci/pci.c | 14 ++--- drivers/pci/pci.h | 4 +- drivers/pci/quirks.c | 68 ++++++++++++------------ drivers/staging/media/atomisp/pci/atomisp_v4l2.c | 2 +- include/linux/pci.h | 2 +- include/uapi/linux/pci_regs.h | 2 +- 12 files changed, 54 insertions(+), 54 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/power/pci.rst b/Documentation/power/pci.rst index 1831e431f725..b04fb18cc4e2 100644 --- a/Documentation/power/pci.rst +++ b/Documentation/power/pci.rst @@ -320,7 +320,7 @@ that these callbacks operate on:: unsigned int d2_support:1; /* Low power state D2 is supported */ unsigned int no_d1d2:1; /* D1 and D2 are forbidden */ unsigned int wakeup_prepared:1; /* Device prepared for wake up */ - unsigned int d3_delay; /* D3->D0 transition time in ms */ + unsigned int d3hot_delay; /* D3hot->D0 transition time in ms */ ... 
}; diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index b8c9a5b87f37..0a0e168be1cb 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -587,7 +587,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0xa26d, pci_invalid_bar); static void pci_fixup_amd_ehci_pme(struct pci_dev *dev) { dev_info(&dev->dev, "PME# does not work under D3, disabling it\n"); - dev->pme_support &= ~((PCI_PM_CAP_PME_D3 | PCI_PM_CAP_PME_D3cold) + dev->pme_support &= ~((PCI_PM_CAP_PME_D3hot | PCI_PM_CAP_PME_D3cold) >> PCI_PM_CAP_PME_SHIFT); } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x7808, pci_fixup_amd_ehci_pme); diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index 00c62115f39c..979f310b67d4 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -322,7 +322,7 @@ static void pci_d3delay_fixup(struct pci_dev *dev) */ if (type1_access_ok(dev->bus->number, dev->devfn, PCI_DEVICE_ID)) return; - dev->d3_delay = 0; + dev->d3hot_delay = 0; } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_d3delay_fixup); diff --git a/drivers/hid/intel-ish-hid/ipc/ipc.c b/drivers/hid/intel-ish-hid/ipc/ipc.c index 8f8dfdf64833..a45ac7fa417b 100644 --- a/drivers/hid/intel-ish-hid/ipc/ipc.c +++ b/drivers/hid/intel-ish-hid/ipc/ipc.c @@ -755,7 +755,7 @@ static int _ish_hw_reset(struct ishtp_device *dev) csr |= PCI_D3hot; pci_write_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, csr); - mdelay(pdev->d3_delay); + mdelay(pdev->d3hot_delay); csr &= ~PCI_PM_CTRL_STATE_MASK; csr |= PCI_D0; diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c index cec8124301c7..dd11c06ca7f9 100644 --- a/drivers/net/ethernet/marvell/sky2.c +++ b/drivers/net/ethernet/marvell/sky2.c @@ -5105,7 +5105,7 @@ static int sky2_probe(struct pci_dev *pdev, const struct pci_device_id *ent) INIT_WORK(&hw->restart_work, sky2_restart); pci_set_drvdata(pdev, hw); - pdev->d3_delay = 300; + pdev->d3hot_delay = 300; return 0; diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index d5869a03f748..154db9a47511 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -1167,7 +1167,7 @@ static struct acpi_device *acpi_pci_find_companion(struct device *dev) * @pdev: the PCI device whose delay is to be updated * @handle: ACPI handle of this device * - * Update the d3_delay and d3cold_delay of a PCI device from the ACPI _DSM + * Update the d3hot_delay and d3cold_delay of a PCI device from the ACPI _DSM * control method of either the device itself or the PCI host bridge. 
* * Function 8, "Reset Delay," applies to the entire hierarchy below a PCI @@ -1206,8 +1206,8 @@ static void pci_acpi_optimize_delay(struct pci_dev *pdev, } if (elements[3].type == ACPI_TYPE_INTEGER) { value = (int)elements[3].integer.value / 1000; - if (value < PCI_PM_D3_WAIT) - pdev->d3_delay = value; + if (value < PCI_PM_D3HOT_WAIT) + pdev->d3hot_delay = value; } } ACPI_FREE(obj); diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index a458c46d7e39..c4a26532a447 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -49,7 +49,7 @@ EXPORT_SYMBOL(isa_dma_bridge_buggy); int pci_pci_problems; EXPORT_SYMBOL(pci_pci_problems); -unsigned int pci_pm_d3_delay; +unsigned int pci_pm_d3hot_delay; static void pci_pme_list_scan(struct work_struct *work); @@ -66,10 +66,10 @@ struct pci_pme_device { static void pci_dev_d3_sleep(struct pci_dev *dev) { - unsigned int delay = dev->d3_delay; + unsigned int delay = dev->d3hot_delay; - if (delay < pci_pm_d3_delay) - delay = pci_pm_d3_delay; + if (delay < pci_pm_d3hot_delay) + delay = pci_pm_d3hot_delay; if (delay) msleep(delay); @@ -3013,7 +3013,7 @@ void pci_pm_init(struct pci_dev *dev) } dev->pm_cap = pm; - dev->d3_delay = PCI_PM_D3_WAIT; + dev->d3hot_delay = PCI_PM_D3HOT_WAIT; dev->d3cold_delay = PCI_PM_D3COLD_WAIT; dev->bridge_d3 = pci_bridge_d3_possible(dev); dev->d3cold_allowed = true; @@ -3038,7 +3038,7 @@ void pci_pm_init(struct pci_dev *dev) (pmc & PCI_PM_CAP_PME_D0) ? " D0" : "", (pmc & PCI_PM_CAP_PME_D1) ? " D1" : "", (pmc & PCI_PM_CAP_PME_D2) ? " D2" : "", - (pmc & PCI_PM_CAP_PME_D3) ? " D3hot" : "", + (pmc & PCI_PM_CAP_PME_D3hot) ? " D3hot" : "", (pmc & PCI_PM_CAP_PME_D3cold) ? " D3cold" : ""); dev->pme_support = pmc >> PCI_PM_CAP_PME_SHIFT; dev->pme_poll = true; @@ -4621,7 +4621,7 @@ static int pci_af_flr(struct pci_dev *dev, int probe) * * NOTE: This causes the caller to sleep for twice the device power transition * cooldown period, which for the D0->D3hot and D3hot->D0 transitions is 10 ms - * by default (i.e. unless the @dev's d3_delay field has a different value). + * by default (i.e. unless the @dev's d3hot_delay field has a different value). * Moreover, only devices in D0 can be reset by this function. 
*/ static int pci_pm_reset(struct pci_dev *dev, int probe) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index fa12f7cbc1a0..8d492669ecfd 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -44,7 +44,7 @@ int pci_bridge_secondary_bus_reset(struct pci_dev *dev); int pci_bus_error_reset(struct pci_dev *dev); #define PCI_PM_D2_DELAY 200 -#define PCI_PM_D3_WAIT 10 +#define PCI_PM_D3HOT_WAIT 10 #define PCI_PM_D3COLD_WAIT 100 #define PCI_PM_BUS_WAIT 50 @@ -178,7 +178,7 @@ extern struct mutex pci_slot_mutex; extern raw_spinlock_t pci_lock; -extern unsigned int pci_pm_d3_delay; +extern unsigned int pci_pm_d3hot_delay; #ifdef CONFIG_PCI_MSI void pci_no_msi(void); diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index bdf9b52567e0..72b22a35e516 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -1846,7 +1846,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_PXHV, quirk_pci */ static void quirk_intel_pcie_pm(struct pci_dev *dev) { - pci_pm_d3_delay = 120; + pci_pm_d3hot_delay = 120; dev->no_d1d2 = 1; } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x25e2, quirk_intel_pcie_pm); @@ -1873,12 +1873,12 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x260b, quirk_intel_pcie_pm); static void quirk_d3hot_delay(struct pci_dev *dev, unsigned int delay) { - if (dev->d3_delay >= delay) + if (dev->d3hot_delay >= delay) return; - dev->d3_delay = delay; + dev->d3hot_delay = delay; pci_info(dev, "extending delay after power-on from D3hot to %d msec\n", - dev->d3_delay); + dev->d3hot_delay); } static void quirk_radeon_pm(struct pci_dev *dev) @@ -3387,36 +3387,36 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0152, disable_igfx_irq); * PCI devices which are on Intel chips can skip the 10ms delay * before entering D3 mode. */ -static void quirk_remove_d3_delay(struct pci_dev *dev) -{ - dev->d3_delay = 0; -} -/* C600 Series devices do not need 10ms d3_delay */ -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0412, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0c00, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0c0c, quirk_remove_d3_delay); -/* Lynxpoint-H PCH devices do not need 10ms d3_delay */ -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c02, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c18, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c1c, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c20, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c22, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c26, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c2d, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c31, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c3a, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c3d, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c4e, quirk_remove_d3_delay); -/* Intel Cherrytrail devices do not need 10ms d3_delay */ -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x2280, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x2298, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x229c, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x22b0, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x22b5, quirk_remove_d3_delay); 
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x22b7, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x22b8, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x22d8, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x22dc, quirk_remove_d3_delay); +static void quirk_remove_d3hot_delay(struct pci_dev *dev) +{ + dev->d3hot_delay = 0; +} +/* C600 Series devices do not need 10ms d3hot_delay */ +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0412, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0c00, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0c0c, quirk_remove_d3hot_delay); +/* Lynxpoint-H PCH devices do not need 10ms d3hot_delay */ +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c02, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c18, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c1c, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c20, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c22, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c26, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c2d, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c31, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c3a, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c3d, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c4e, quirk_remove_d3hot_delay); +/* Intel Cherrytrail devices do not need 10ms d3hot_delay */ +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x2280, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x2298, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x229c, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x22b0, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x22b5, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x22b7, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x22b8, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x22d8, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x22dc, quirk_remove_d3hot_delay); /* * Some devices may pass our check in pci_intx_mask_supported() if diff --git a/drivers/staging/media/atomisp/pci/atomisp_v4l2.c b/drivers/staging/media/atomisp/pci/atomisp_v4l2.c index a000a1e316f7..beba430a197e 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_v4l2.c +++ b/drivers/staging/media/atomisp/pci/atomisp_v4l2.c @@ -1573,7 +1573,7 @@ static int atomisp_pci_probe(struct pci_dev *pdev, const struct pci_device_id *i spin_lock_init(&isp->lock); /* This is not a true PCI device on SoC, so the delay is not needed. 
*/ - pdev->d3_delay = 0; + pdev->d3hot_delay = 0; pci_set_drvdata(pdev, isp); diff --git a/include/linux/pci.h b/include/linux/pci.h index c9e169c4e216..bea1a03faab6 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -373,7 +373,7 @@ struct pci_dev { user sysfs */ unsigned int clear_retrain_link:1; /* Need to clear Retrain Link bit manually */ - unsigned int d3_delay; /* D3->D0 transition time in ms */ + unsigned int d3hot_delay; /* D3hot->D0 transition time in ms */ unsigned int d3cold_delay; /* D3cold->D0 transition time in ms */ #ifdef CONFIG_PCIEASPM diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index f9701410d3b5..49f15c37e771 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -246,7 +246,7 @@ #define PCI_PM_CAP_PME_D0 0x0800 /* PME# from D0 */ #define PCI_PM_CAP_PME_D1 0x1000 /* PME# from D1 */ #define PCI_PM_CAP_PME_D2 0x2000 /* PME# from D2 */ -#define PCI_PM_CAP_PME_D3 0x4000 /* PME# from D3 (hot) */ +#define PCI_PM_CAP_PME_D3hot 0x4000 /* PME# from D3 (hot) */ #define PCI_PM_CAP_PME_D3cold 0x8000 /* PME# from D3 (cold) */ #define PCI_PM_CAP_PME_SHIFT 11 /* Start of the PME Mask in PMC */ #define PCI_PM_CTRL 4 /* PM control and status register */ -- cgit v1.2.3 From 4a1e7c0c63e02daad751842b7880f9bbcdfb6e89 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Tue, 29 Sep 2020 14:45:51 +0200 Subject: bpf: Support attaching freplace programs to multiple attach points MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This enables support for attaching freplace programs to multiple attach points. It does this by amending the UAPI for bpf_link_Create with a target btf ID that can be used to supply the new attachment point along with the target program fd. The target must be compatible with the target that was supplied at program load time. The implementation reuses the checks that were factored out of check_attach_btf_id() to ensure compatibility between the BTF types of the old and new attachment. If these match, a new bpf_tracing_link will be created for the new attach target, allowing multiple attachments to co-exist simultaneously. The code could theoretically support multiple-attach of other types of tracing programs as well, but since I don't have a use case for any of those, there is no API support for doing so. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/160138355169.48470.17165680973640685368.stgit@toke.dk --- include/linux/bpf.h | 2 + include/uapi/linux/bpf.h | 9 ++- kernel/bpf/syscall.c | 132 ++++++++++++++++++++++++++++++++++++----- kernel/bpf/verifier.c | 10 ++++ tools/include/uapi/linux/bpf.h | 9 ++- 5 files changed, 142 insertions(+), 20 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 839dd8670a7a..50e5c4b52bd1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -768,6 +768,8 @@ struct bpf_prog_aux { struct mutex dst_mutex; /* protects dst_* pointers below, *after* prog becomes visible */ struct bpf_prog *dst_prog; struct bpf_trampoline *dst_trampoline; + enum bpf_prog_type saved_dst_prog_type; + enum bpf_attach_type saved_dst_attach_type; bool verifier_zext; /* Zero extensions has been inserted by verifier. 
*/ bool offload_requested; bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 96ddb00b91dc..2b1d3f16cbd1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -639,8 +639,13 @@ union bpf_attr { }; __u32 attach_type; /* attach type */ __u32 flags; /* extra flags */ - __aligned_u64 iter_info; /* extra bpf_iter_link_info */ - __u32 iter_info_len; /* iter_info length */ + union { + __u32 target_btf_id; /* btf_id of target to attach to */ + struct { + __aligned_u64 iter_info; /* extra bpf_iter_link_info */ + __u32 iter_info_len; /* iter_info length */ + }; + }; } link_create; struct { /* struct used by BPF_LINK_UPDATE command */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e6a0a948e30c..f1528c2a6927 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -2554,12 +2555,15 @@ static const struct bpf_link_ops bpf_tracing_link_lops = { .fill_link_info = bpf_tracing_link_fill_link_info, }; -static int bpf_tracing_prog_attach(struct bpf_prog *prog) +static int bpf_tracing_prog_attach(struct bpf_prog *prog, + int tgt_prog_fd, + u32 btf_id) { struct bpf_link_primer link_primer; struct bpf_prog *tgt_prog = NULL; + struct bpf_trampoline *tr = NULL; struct bpf_tracing_link *link; - struct bpf_trampoline *tr; + u64 key = 0; int err; switch (prog->type) { @@ -2588,6 +2592,28 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) goto out_put_prog; } + if (!!tgt_prog_fd != !!btf_id) { + err = -EINVAL; + goto out_put_prog; + } + + if (tgt_prog_fd) { + /* For now we only allow new targets for BPF_PROG_TYPE_EXT */ + if (prog->type != BPF_PROG_TYPE_EXT) { + err = -EINVAL; + goto out_put_prog; + } + + tgt_prog = bpf_prog_get(tgt_prog_fd); + if (IS_ERR(tgt_prog)) { + err = PTR_ERR(tgt_prog); + tgt_prog = NULL; + goto out_put_prog; + } + + key = bpf_trampoline_compute_key(tgt_prog, btf_id); + } + link = kzalloc(sizeof(*link), GFP_USER); if (!link) { err = -ENOMEM; @@ -2599,12 +2625,58 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) mutex_lock(&prog->aux->dst_mutex); - if (!prog->aux->dst_trampoline) { + /* There are a few possible cases here: + * + * - if prog->aux->dst_trampoline is set, the program was just loaded + * and not yet attached to anything, so we can use the values stored + * in prog->aux + * + * - if prog->aux->dst_trampoline is NULL, the program has already been + * attached to a target and its initial target was cleared (below) + * + * - if tgt_prog != NULL, the caller specified tgt_prog_fd + + * target_btf_id using the link_create API. + * + * - if tgt_prog == NULL when this function was called using the old + * raw_tracepoint_open API, and we need a target from prog->aux + * + * The combination of no saved target in prog->aux, and no target + * specified on load is illegal, and we reject that here. 
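 *
 * A hypothetical userspace sketch of the new case (fd and btf-id
 * variable names are assumed): after the freplace program has been
 * loaded once, a further attachment is requested with
 *
 *	union bpf_attr attr = {};
 *
 *	attr.link_create.prog_fd = freplace_fd;
 *	attr.link_create.target_fd = new_target_prog_fd;
 *	attr.link_create.target_btf_id = new_target_btf_id;
 *	link_fd = syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));
 *
 * and bpf_check_attach_target() below verifies that the new target is
 * compatible with the one supplied at load time.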
+ */ + if (!prog->aux->dst_trampoline && !tgt_prog) { err = -ENOENT; goto out_unlock; } - tr = prog->aux->dst_trampoline; - tgt_prog = prog->aux->dst_prog; + + if (!prog->aux->dst_trampoline || + (key && key != prog->aux->dst_trampoline->key)) { + /* If there is no saved target, or the specified target is + * different from the destination specified at load time, we + * need a new trampoline and a check for compatibility + */ + struct bpf_attach_target_info tgt_info = {}; + + err = bpf_check_attach_target(NULL, prog, tgt_prog, btf_id, + &tgt_info); + if (err) + goto out_unlock; + + tr = bpf_trampoline_get(key, &tgt_info); + if (!tr) { + err = -ENOMEM; + goto out_unlock; + } + } else { + /* The caller didn't specify a target, or the target was the + * same as the destination supplied during program load. This + * means we can reuse the trampoline and reference from program + * load time, and there is no need to allocate a new one. This + * can only happen once for any program, as the saved values in + * prog->aux are cleared below. + */ + tr = prog->aux->dst_trampoline; + tgt_prog = prog->aux->dst_prog; + } err = bpf_link_prime(&link->link, &link_primer); if (err) @@ -2620,15 +2692,31 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) link->tgt_prog = tgt_prog; link->trampoline = tr; + /* Always clear the trampoline and target prog from prog->aux to make + * sure the original attach destination is not kept alive after a + * program is (re-)attached to another target. + */ + if (prog->aux->dst_prog && + (tgt_prog_fd || tr != prog->aux->dst_trampoline)) + /* got extra prog ref from syscall, or attaching to different prog */ + bpf_prog_put(prog->aux->dst_prog); + if (prog->aux->dst_trampoline && tr != prog->aux->dst_trampoline) + /* we allocated a new trampoline, so free the old one */ + bpf_trampoline_put(prog->aux->dst_trampoline); + prog->aux->dst_prog = NULL; prog->aux->dst_trampoline = NULL; mutex_unlock(&prog->aux->dst_mutex); return bpf_link_settle(&link_primer); out_unlock: + if (tr && tr != prog->aux->dst_trampoline) + bpf_trampoline_put(tr); mutex_unlock(&prog->aux->dst_mutex); kfree(link); out_put_prog: + if (tgt_prog_fd && tgt_prog) + bpf_prog_put(tgt_prog); bpf_prog_put(prog); return err; } @@ -2742,7 +2830,7 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) tp_name = prog->aux->attach_func_name; break; } - return bpf_tracing_prog_attach(prog); + return bpf_tracing_prog_attach(prog, 0, 0); case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: if (strncpy_from_user(buf, @@ -3926,10 +4014,15 @@ err_put: static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { - if (attr->link_create.attach_type == BPF_TRACE_ITER && - prog->expected_attach_type == BPF_TRACE_ITER) - return bpf_iter_link_attach(attr, prog); + if (attr->link_create.attach_type != prog->expected_attach_type) + return -EINVAL; + if (prog->expected_attach_type == BPF_TRACE_ITER) + return bpf_iter_link_attach(attr, prog); + else if (prog->type == BPF_PROG_TYPE_EXT) + return bpf_tracing_prog_attach(prog, + attr->link_create.target_fd, + attr->link_create.target_btf_id); return -EINVAL; } @@ -3943,18 +4036,25 @@ static int link_create(union bpf_attr *attr) if (CHECK_ATTR(BPF_LINK_CREATE)) return -EINVAL; - ptype = attach_type_to_prog_type(attr->link_create.attach_type); - if (ptype == BPF_PROG_TYPE_UNSPEC) - return -EINVAL; - - prog = bpf_prog_get_type(attr->link_create.prog_fd, ptype); + prog = bpf_prog_get(attr->link_create.prog_fd); if 
(IS_ERR(prog)) return PTR_ERR(prog); ret = bpf_prog_attach_check_attach_type(prog, attr->link_create.attach_type); if (ret) - goto err_out; + goto out; + + if (prog->type == BPF_PROG_TYPE_EXT) { + ret = tracing_bpf_link_attach(attr, prog); + goto out; + } + + ptype = attach_type_to_prog_type(attr->link_create.attach_type); + if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) { + ret = -EINVAL; + goto out; + } switch (ptype) { case BPF_PROG_TYPE_CGROUP_SKB: @@ -3982,7 +4082,7 @@ static int link_create(union bpf_attr *attr) ret = -EINVAL; } -err_out: +out: if (ret < 0) bpf_prog_put(prog); return ret; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a97a2f2964e3..015a1c074b6b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11404,6 +11404,11 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, if (!btf_type_is_func_proto(t)) return -EINVAL; + if ((prog->aux->saved_dst_prog_type || prog->aux->saved_dst_attach_type) && + (!tgt_prog || prog->aux->saved_dst_prog_type != tgt_prog->type || + prog->aux->saved_dst_attach_type != tgt_prog->expected_attach_type)) + return -EINVAL; + if (tgt_prog && conservative) t = NULL; @@ -11512,6 +11517,11 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_func_proto = tgt_info.tgt_type; prog->aux->attach_func_name = tgt_info.tgt_name; + if (tgt_prog) { + prog->aux->saved_dst_prog_type = tgt_prog->type; + prog->aux->saved_dst_attach_type = tgt_prog->expected_attach_type; + } + if (prog->expected_attach_type == BPF_TRACE_RAW_TP) { prog->aux->attach_btf_trace = true; return 0; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 96ddb00b91dc..2b1d3f16cbd1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -639,8 +639,13 @@ union bpf_attr { }; __u32 attach_type; /* attach type */ __u32 flags; /* extra flags */ - __aligned_u64 iter_info; /* extra bpf_iter_link_info */ - __u32 iter_info_len; /* iter_info length */ + union { + __u32 target_btf_id; /* btf_id of target to attach to */ + struct { + __aligned_u64 iter_info; /* extra bpf_iter_link_info */ + __u32 iter_info_len; /* iter_info length */ + }; + }; } link_create; struct { /* struct used by BPF_LINK_UPDATE command */ -- cgit v1.2.3 From 3f47cb4c1cf3bceb2438ea962bfffc6665ee4a9f Mon Sep 17 00:00:00 2001 From: Tom Parkin Date: Tue, 29 Sep 2020 13:35:41 +0100 Subject: l2tp: report rx cookie discards in netlink get When an L2TPv3 session receives a data frame with an incorrect cookie l2tp_core logs a warning message and bumps a stats counter to reflect the fact that the packet has been dropped. However, the stats counter in question is missing from the l2tp_netlink get message for tunnel and session instances. Include the statistic in the netlink get response. Signed-off-by: Tom Parkin Signed-off-by: David S. 
Miller --- include/uapi/linux/l2tp.h | 1 + net/l2tp/l2tp_netlink.c | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h index 88a0d32b8c07..30c80d5ba4bf 100644 --- a/include/uapi/linux/l2tp.h +++ b/include/uapi/linux/l2tp.h @@ -144,6 +144,7 @@ enum { L2TP_ATTR_RX_OOS_PACKETS, /* u64 */ L2TP_ATTR_RX_ERRORS, /* u64 */ L2TP_ATTR_STATS_PAD, + L2TP_ATTR_RX_COOKIE_DISCARDS, /* u64 */ __L2TP_ATTR_STATS_MAX, }; diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index 83c015f7f20d..5ca5056e9636 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -420,6 +420,9 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla nla_put_u64_64bit(skb, L2TP_ATTR_RX_SEQ_DISCARDS, atomic_long_read(&tunnel->stats.rx_seq_discards), L2TP_ATTR_STATS_PAD) || + nla_put_u64_64bit(skb, L2TP_ATTR_RX_COOKIE_DISCARDS, + atomic_long_read(&tunnel->stats.rx_cookie_discards), + L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_OOS_PACKETS, atomic_long_read(&tunnel->stats.rx_oos_packets), L2TP_ATTR_STATS_PAD) || @@ -760,6 +763,9 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl nla_put_u64_64bit(skb, L2TP_ATTR_RX_SEQ_DISCARDS, atomic_long_read(&session->stats.rx_seq_discards), L2TP_ATTR_STATS_PAD) || + nla_put_u64_64bit(skb, L2TP_ATTR_RX_COOKIE_DISCARDS, + atomic_long_read(&session->stats.rx_cookie_discards), + L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_OOS_PACKETS, atomic_long_read(&session->stats.rx_oos_packets), L2TP_ATTR_STATS_PAD) || -- cgit v1.2.3 From 2ec13cbcfadbbeac499f3b63de0f7db490d45a7e Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Tue, 29 Sep 2020 11:08:59 -0700 Subject: devlink: include for _BITUL Commit 5d5b4128c4ca ("devlink: introduce flash update overwrite mask") added a usage of _BITUL to the UAPI header, but failed to include the header file where it was defined. It happens that this does not break any existing kernel include chains because it gets included through other sources. However, when including the UAPI headers in a userspace application (such as devlink in iproute2), _BITUL is not defined. Fixes: 5d5b4128c4ca ("devlink: introduce flash update overwrite mask") Signed-off-by: Jacob Keller Signed-off-by: David S. Miller --- include/uapi/linux/devlink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 7b0face1bad5..ba467dc07852 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -13,6 +13,8 @@ #ifndef _UAPI_LINUX_DEVLINK_H_ #define _UAPI_LINUX_DEVLINK_H_ +#include + #define DEVLINK_GENL_NAME "devlink" #define DEVLINK_GENL_VERSION 0x1 #define DEVLINK_GENL_MCGRP_CONFIG_NAME "config" -- cgit v1.2.3 From 539430fbbcc4a8d02451c77fff1ecd1f3b5f8abf Mon Sep 17 00:00:00 2001 From: Kent Gibson Date: Mon, 28 Sep 2020 08:27:50 +0800 Subject: gpio: uapi: define GPIO_MAX_NAME_SIZE for array sizes Replace constant array sizes with a macro constant to clarify the source of array sizes, provide a place to document any constraints on the size, and to simplify array sizing in userspace if constructing structs from their composite fields. 
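As a hypothetical illustration (not part of this patch), userspace that composes a request field by field can now size the intermediate buffer from the macro rather than a bare 32; the fill_label() helper and the "relay-%d" label are invented for the example:

    #include <stdio.h>
    #include <string.h>
    #include <linux/gpio.h>

    /* Compose a consumer label in its own buffer, then copy it into the
     * request; both arrays are sized from the same uAPI macro, so they
     * stay in sync automatically.
     */
    static void fill_label(struct gpiohandle_request *req, int relay)
    {
            char label[GPIO_MAX_NAME_SIZE] = "";

            snprintf(label, sizeof(label), "relay-%d", relay);
            memcpy(req->consumer_label, label, sizeof(req->consumer_label));
    }
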
Signed-off-by: Kent Gibson Reviewed-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski --- include/uapi/linux/gpio.h | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/gpio.h b/include/uapi/linux/gpio.h index 9c27cecf406f..285cc10355b2 100644 --- a/include/uapi/linux/gpio.h +++ b/include/uapi/linux/gpio.h @@ -14,6 +14,11 @@ #include #include +/* + * The maximum size of name and label arrays. + */ +#define GPIO_MAX_NAME_SIZE 32 + /** * struct gpiochip_info - Information about a certain GPIO chip * @name: the Linux kernel name of this GPIO chip @@ -22,8 +27,8 @@ * @lines: number of GPIO lines on this chip */ struct gpiochip_info { - char name[32]; - char label[32]; + char name[GPIO_MAX_NAME_SIZE]; + char label[GPIO_MAX_NAME_SIZE]; __u32 lines; }; @@ -52,8 +57,8 @@ struct gpiochip_info { struct gpioline_info { __u32 line_offset; __u32 flags; - char name[32]; - char consumer[32]; + char name[GPIO_MAX_NAME_SIZE]; + char consumer[GPIO_MAX_NAME_SIZE]; }; /* Maximum number of requested handles */ @@ -123,7 +128,7 @@ struct gpiohandle_request { __u32 lineoffsets[GPIOHANDLES_MAX]; __u32 flags; __u8 default_values[GPIOHANDLES_MAX]; - char consumer_label[32]; + char consumer_label[GPIO_MAX_NAME_SIZE]; __u32 lines; int fd; }; @@ -182,7 +187,7 @@ struct gpioevent_request { __u32 lineoffset; __u32 handleflags; __u32 eventflags; - char consumer_label[32]; + char consumer_label[GPIO_MAX_NAME_SIZE]; int fd; }; -- cgit v1.2.3 From b53911aa872db462be2e5f1dd611b25c4c2e663b Mon Sep 17 00:00:00 2001 From: Kent Gibson Date: Mon, 28 Sep 2020 08:27:51 +0800 Subject: gpio: uapi: define uAPI v2 Add a new version of the uAPI to address existing 32/64-bit alignment issues, add support for debounce and event sequence numbers, allow requested lines with different configurations, and provide some future proofing by adding padding reserved for future use. The alignment issue relates to the gpioevent_data, which packs to different sizes on 32-bit and 64-bit platforms. That creates problems for 32-bit apps running on 64-bit kernels. uAPI v2 addresses that particular issue, and the problem more generally, by adding pad fields that explicitly pad structs out to 64-bit boundaries, so they will pack to the same size now, and even if some of the reserved padding is used for __u64 fields in the future. The new structs have been analysed with pahole to ensure that they are sized as expected and contain no implicit padding. The lack of future proofing in v1 makes it impossible to, for example, add the debounce feature that is included in v2. The future proofing is addressed by providing configurable attributes in line config and reserved padding in all structs for future features. Specifically, the line request, config, info, info_changed and event structs receive updated versions and new ioctls. As the majority of the structs and ioctls were being replaced, it is opportune to rework some of the other aspects of the uAPI: v1 has three different flags fields, each with their own separate bit definitions. In v2 that is collapsed to one - gpio_v2_line_flag. The handle and event requests are merged into a single request, the line request, as the two requests were mostly the same other than the edge detection provided by event requests. As a byproduct, the v2 uAPI allows for multiple lines producing edge events on the same line handle. This is a new capability as v1 only supports a single line in an event request. 
As a consequence, there are now only two types of file handle to be concerned with, the chip and the line, and it is clearer which ioctls apply to which type of handle. There is also some minor renaming of fields for consistency compared to their v1 counterparts, e.g. offset rather than lineoffset or line_offset, and consumer rather than consumer_label. Additionally, v1 GPIOHANDLES_MAX becomes GPIO_V2_LINES_MAX in v2 for clarity, and the gpiohandle_data __u8 array becomes a bitmap in gpio_v2_line_values. The v2 uAPI is mostly a reorganisation and extension of v1, so userspace code, particularly libgpiod, should readily port to it. Signed-off-by: Kent Gibson Reviewed-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski --- include/uapi/linux/gpio.h | 291 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 284 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/gpio.h b/include/uapi/linux/gpio.h index 285cc10355b2..5904f49399de 100644 --- a/include/uapi/linux/gpio.h +++ b/include/uapi/linux/gpio.h @@ -11,11 +11,14 @@ #ifndef _UAPI_GPIO_H_ #define _UAPI_GPIO_H_ +#include #include #include /* * The maximum size of name and label arrays. + * + * Must be a multiple of 8 to ensure 32/64-bit alignment of structs. */ #define GPIO_MAX_NAME_SIZE 32 @@ -32,6 +35,265 @@ struct gpiochip_info { __u32 lines; }; +/* + * Maximum number of requested lines. + * + * Must be no greater than 64, as bitmaps are restricted here to 64-bits + * for simplicity, and a multiple of 2 to ensure 32/64-bit alignment of + * structs. + */ +#define GPIO_V2_LINES_MAX 64 + +/* + * The maximum number of configuration attributes associated with a line + * request. + */ +#define GPIO_V2_LINE_NUM_ATTRS_MAX 10 + +/** + * enum gpio_v2_line_flag - &struct gpio_v2_line_attribute.flags values + * @GPIO_V2_LINE_FLAG_USED: line is not available for request + * @GPIO_V2_LINE_FLAG_ACTIVE_LOW: line active state is physical low + * @GPIO_V2_LINE_FLAG_INPUT: line is an input + * @GPIO_V2_LINE_FLAG_OUTPUT: line is an output + * @GPIO_V2_LINE_FLAG_EDGE_RISING: line detects rising (inactive to active) + * edges + * @GPIO_V2_LINE_FLAG_EDGE_FALLING: line detects falling (active to + * inactive) edges + * @GPIO_V2_LINE_FLAG_OPEN_DRAIN: line is an open drain output + * @GPIO_V2_LINE_FLAG_OPEN_SOURCE: line is an open source output + * @GPIO_V2_LINE_FLAG_BIAS_PULL_UP: line has pull-up bias enabled + * @GPIO_V2_LINE_FLAG_BIAS_PULL_DOWN: line has pull-down bias enabled + * @GPIO_V2_LINE_FLAG_BIAS_DISABLED: line has bias disabled + */ +enum gpio_v2_line_flag { + GPIO_V2_LINE_FLAG_USED = _BITULL(0), + GPIO_V2_LINE_FLAG_ACTIVE_LOW = _BITULL(1), + GPIO_V2_LINE_FLAG_INPUT = _BITULL(2), + GPIO_V2_LINE_FLAG_OUTPUT = _BITULL(3), + GPIO_V2_LINE_FLAG_EDGE_RISING = _BITULL(4), + GPIO_V2_LINE_FLAG_EDGE_FALLING = _BITULL(5), + GPIO_V2_LINE_FLAG_OPEN_DRAIN = _BITULL(6), + GPIO_V2_LINE_FLAG_OPEN_SOURCE = _BITULL(7), + GPIO_V2_LINE_FLAG_BIAS_PULL_UP = _BITULL(8), + GPIO_V2_LINE_FLAG_BIAS_PULL_DOWN = _BITULL(9), + GPIO_V2_LINE_FLAG_BIAS_DISABLED = _BITULL(10), +}; + +/** + * struct gpio_v2_line_values - Values of GPIO lines + * @bits: a bitmap containing the value of the lines, set to 1 for active + * and 0 for inactive. + * @mask: a bitmap identifying the lines to get or set, with each bit + * number corresponding to the index into &struct + * gpio_v2_line_request.offsets. 
+ */ +struct gpio_v2_line_values { + __aligned_u64 bits; + __aligned_u64 mask; +}; + +/** + * enum gpio_v2_line_attr_id - &struct gpio_v2_line_attribute.id values + * identifying which field of the attribute union is in use. + * @GPIO_V2_LINE_ATTR_ID_FLAGS: flags field is in use + * @GPIO_V2_LINE_ATTR_ID_OUTPUT_VALUES: values field is in use + * @GPIO_V2_LINE_ATTR_ID_DEBOUNCE: debounce_period_us is in use + */ +enum gpio_v2_line_attr_id { + GPIO_V2_LINE_ATTR_ID_FLAGS = 1, + GPIO_V2_LINE_ATTR_ID_OUTPUT_VALUES = 2, + GPIO_V2_LINE_ATTR_ID_DEBOUNCE = 3, +}; + +/** + * struct gpio_v2_line_attribute - a configurable attribute of a line + * @id: attribute identifier with value from &enum gpio_v2_line_attr_id + * @padding: reserved for future use and must be zero filled + * @flags: if id is GPIO_V2_LINE_ATTR_ID_FLAGS, the flags for the GPIO + * line, with values from enum gpio_v2_line_flag, such as + * GPIO_V2_LINE_FLAG_ACTIVE_LOW, GPIO_V2_LINE_FLAG_OUTPUT etc, OR:ed + * together. This overrides the default flags contained in the &struct + * gpio_v2_line_config for the associated line. + * @values: if id is GPIO_V2_LINE_ATTR_ID_OUTPUT_VALUES, a bitmap + * containing the values to which the lines will be set, with each bit + * number corresponding to the index into &struct + * gpio_v2_line_request.offsets. + * @debounce_period_us: if id is GPIO_V2_LINE_ATTR_ID_DEBOUNCE, the desired + * debounce period, in microseconds + */ +struct gpio_v2_line_attribute { + __u32 id; + __u32 padding; + union { + __aligned_u64 flags; + __aligned_u64 values; + __u32 debounce_period_us; + }; +}; + +/** + * struct gpio_v2_line_config_attribute - a configuration attribute + * associated with one or more of the requested lines. + * @attr: the configurable attribute + * @mask: a bitmap identifying the lines to which the attribute applies, + * with each bit number corresponding to the index into &struct + * gpio_v2_line_request.offsets. + */ +struct gpio_v2_line_config_attribute { + struct gpio_v2_line_attribute attr; + __aligned_u64 mask; +}; + +/** + * struct gpio_v2_line_config - Configuration for GPIO lines + * @flags: flags for the GPIO lines, with values from enum + * gpio_v2_line_flag, such as GPIO_V2_LINE_FLAG_ACTIVE_LOW, + * GPIO_V2_LINE_FLAG_OUTPUT etc, OR:ed together. This is the default for + * all requested lines but may be overridden for particular lines using + * attrs. + * @num_attrs: the number of attributes in attrs + * @padding: reserved for future use and must be zero filled + * @attrs: the configuration attributes associated with the requested + * lines. Any attribute should only be associated with a particular line + * once. If an attribute is associated with a line multiple times then the + * first occurrence (i.e. lowest index) has precedence. + */ +struct gpio_v2_line_config { + __aligned_u64 flags; + __u32 num_attrs; + /* Pad to fill implicit padding and reserve space for future use. */ + __u32 padding[5]; + struct gpio_v2_line_config_attribute attrs[GPIO_V2_LINE_NUM_ATTRS_MAX]; +}; + +/** + * struct gpio_v2_line_request - Information about a request for GPIO lines + * @offsets: an array of desired lines, specified by offset index for the + * associated GPIO chip + * @consumer: a desired consumer label for the selected GPIO lines such as + * "my-bitbanged-relay" + * @config: requested configuration for the lines. + * @num_lines: number of lines requested in this request, i.e. 
the number + * of valid fields in the GPIO_V2_LINES_MAX sized arrays, set to 1 to + * request a single line + * @event_buffer_size: a suggested minimum number of line events that the + * kernel should buffer. This is only relevant if edge detection is + * enabled in the configuration. Note that this is only a suggested value + * and the kernel may allocate a larger buffer or cap the size of the + * buffer. If this field is zero then the buffer size defaults to a minimum + * of num_lines*16. + * @padding: reserved for future use and must be zero filled + * @fd: if successful this field will contain a valid anonymous file handle + * after a GPIO_GET_LINE_IOCTL operation, zero or negative value means + * error + */ +struct gpio_v2_line_request { + __u32 offsets[GPIO_V2_LINES_MAX]; + char consumer[GPIO_MAX_NAME_SIZE]; + struct gpio_v2_line_config config; + __u32 num_lines; + __u32 event_buffer_size; + /* Pad to fill implicit padding and reserve space for future use. */ + __u32 padding[5]; + __s32 fd; +}; + +/** + * struct gpio_v2_line_info - Information about a certain GPIO line + * @name: the name of this GPIO line, such as the output pin of the line on + * the chip, a rail or a pin header name on a board, as specified by the + * GPIO chip, may be empty + * @consumer: a functional name for the consumer of this GPIO line as set + * by whatever is using it, will be empty if there is no current user but + * may also be empty if the consumer doesn't set this up + * @flags: flags for the GPIO line, such as GPIO_V2_LINE_FLAG_ACTIVE_LOW, + * GPIO_V2_LINE_FLAG_OUTPUT etc, OR:ed together + * @offset: the local offset on this GPIO chip, fill this in when + * requesting the line information from the kernel + * @num_attrs: the number of attributes in attrs + * @attrs: the configuration attributes associated with the line + * @padding: reserved for future use + */ +struct gpio_v2_line_info { + char name[GPIO_MAX_NAME_SIZE]; + char consumer[GPIO_MAX_NAME_SIZE]; + __u32 offset; + __u32 num_attrs; + __aligned_u64 flags; + struct gpio_v2_line_attribute attrs[GPIO_V2_LINE_NUM_ATTRS_MAX]; + /* Space reserved for future use. */ + __u32 padding[4]; +}; + +/** + * enum gpio_v2_line_changed_type - &struct gpio_v2_line_changed.event_type + * values + * @GPIO_V2_LINE_CHANGED_REQUESTED: line has been requested + * @GPIO_V2_LINE_CHANGED_RELEASED: line has been released + * @GPIO_V2_LINE_CHANGED_CONFIG: line has been reconfigured + */ +enum gpio_v2_line_changed_type { + GPIO_V2_LINE_CHANGED_REQUESTED = 1, + GPIO_V2_LINE_CHANGED_RELEASED = 2, + GPIO_V2_LINE_CHANGED_CONFIG = 3, +}; + +/** + * struct gpio_v2_line_info_changed - Information about a change in status + * of a GPIO line + * @info: updated line information + * @timestamp_ns: estimate of time of status change occurrence, in nanoseconds + * @event_type: the type of change with a value from enum + * gpio_v2_line_changed_type + * @padding: reserved for future use + */ +struct gpio_v2_line_info_changed { + struct gpio_v2_line_info info; + __aligned_u64 timestamp_ns; + __u32 event_type; + /* Pad struct to 64-bit boundary and reserve space for future use. 
*/ + __u32 padding[5]; +}; + +/** + * enum gpio_v2_line_event_id - &struct gpio_v2_line_event.id values + * @GPIO_V2_LINE_EVENT_RISING_EDGE: event triggered by a rising edge + * @GPIO_V2_LINE_EVENT_FALLING_EDGE: event triggered by a falling edge + */ +enum gpio_v2_line_event_id { + GPIO_V2_LINE_EVENT_RISING_EDGE = 1, + GPIO_V2_LINE_EVENT_FALLING_EDGE = 2, +}; + +/** + * struct gpio_v2_line_event - The actual event being pushed to userspace + * @timestamp_ns: best estimate of time of event occurrence, in nanoseconds. + * The timestamp_ns is read from CLOCK_MONOTONIC and is intended to allow the + * accurate measurement of the time between events. It does not provide + * the wall-clock time. + * @id: event identifier with value from enum gpio_v2_line_event_id + * @offset: the offset of the line that triggered the event + * @seqno: the sequence number for this event in the sequence of events for + * all the lines in this line request + * @line_seqno: the sequence number for this event in the sequence of + * events on this particular line + * @padding: reserved for future use + */ +struct gpio_v2_line_event { + __aligned_u64 timestamp_ns; + __u32 id; + __u32 offset; + __u32 seqno; + __u32 line_seqno; + /* Space reserved for future use. */ + __u32 padding[6]; +}; + +/* + * ABI v1 + */ + /* Informational flags */ #define GPIOLINE_FLAG_KERNEL (1UL << 0) /* Line used by the kernel */ #define GPIOLINE_FLAG_IS_OUT (1UL << 1) @@ -149,8 +411,6 @@ struct gpiohandle_config { __u32 padding[4]; /* padding for future use */ }; -#define GPIOHANDLE_SET_CONFIG_IOCTL _IOWR(0xB4, 0x0a, struct gpiohandle_config) - /** * struct gpiohandle_data - Information of values on a GPIO handle * @values: when getting the state of lines this contains the current @@ -161,9 +421,6 @@ struct gpiohandle_data { __u8 values[GPIOHANDLES_MAX]; }; -#define GPIOHANDLE_GET_LINE_VALUES_IOCTL _IOWR(0xB4, 0x08, struct gpiohandle_data) -#define GPIOHANDLE_SET_LINE_VALUES_IOCTL _IOWR(0xB4, 0x09, struct gpiohandle_data) - /* Eventrequest flags */ #define GPIOEVENT_REQUEST_RISING_EDGE (1UL << 0) #define GPIOEVENT_REQUEST_FALLING_EDGE (1UL << 1) @@ -207,11 +464,31 @@ struct gpioevent_data { __u32 id; }; +/* + * v1 and v2 ioctl()s + */ #define GPIO_GET_CHIPINFO_IOCTL _IOR(0xB4, 0x01, struct gpiochip_info) +#define GPIO_GET_LINEINFO_UNWATCH_IOCTL _IOWR(0xB4, 0x0C, __u32) + +/* + * v2 ioctl()s + */ +#define GPIO_V2_GET_LINEINFO_IOCTL _IOWR(0xB4, 0x05, struct gpio_v2_line_info) +#define GPIO_V2_GET_LINEINFO_WATCH_IOCTL _IOWR(0xB4, 0x06, struct gpio_v2_line_info) +#define GPIO_V2_GET_LINE_IOCTL _IOWR(0xB4, 0x07, struct gpio_v2_line_request) +#define GPIO_V2_LINE_SET_CONFIG_IOCTL _IOWR(0xB4, 0x0D, struct gpio_v2_line_config) +#define GPIO_V2_LINE_GET_VALUES_IOCTL _IOWR(0xB4, 0x0E, struct gpio_v2_line_values) +#define GPIO_V2_LINE_SET_VALUES_IOCTL _IOWR(0xB4, 0x0F, struct gpio_v2_line_values) + +/* + * v1 ioctl()s + */ #define GPIO_GET_LINEINFO_IOCTL _IOWR(0xB4, 0x02, struct gpioline_info) -#define GPIO_GET_LINEINFO_WATCH_IOCTL _IOWR(0xB4, 0x0b, struct gpioline_info) -#define GPIO_GET_LINEINFO_UNWATCH_IOCTL _IOWR(0xB4, 0x0c, __u32) #define GPIO_GET_LINEHANDLE_IOCTL _IOWR(0xB4, 0x03, struct gpiohandle_request) #define GPIO_GET_LINEEVENT_IOCTL _IOWR(0xB4, 0x04, struct gpioevent_request) +#define GPIOHANDLE_GET_LINE_VALUES_IOCTL _IOWR(0xB4, 0x08, struct gpiohandle_data) +#define GPIOHANDLE_SET_LINE_VALUES_IOCTL _IOWR(0xB4, 0x09, struct gpiohandle_data) +#define GPIOHANDLE_SET_CONFIG_IOCTL _IOWR(0xB4, 0x0A, struct gpiohandle_config) +#define 
GPIO_GET_LINEINFO_WATCH_IOCTL _IOWR(0xB4, 0x0B, struct gpioline_info) #endif /* _UAPI_GPIO_H_ */ -- cgit v1.2.3 From b234d233fe30c63c4e461b03e2884a6765c8e5b0 Mon Sep 17 00:00:00 2001 From: Kent Gibson Date: Mon, 28 Sep 2020 08:28:00 +0800 Subject: gpio: uapi: document uAPI v1 as deprecated Update uAPI documentation to deprecate v1 structs and ioctls. Signed-off-by: Kent Gibson Signed-off-by: Bartosz Golaszewski --- include/uapi/linux/gpio.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/gpio.h b/include/uapi/linux/gpio.h index 5904f49399de..07865c601099 100644 --- a/include/uapi/linux/gpio.h +++ b/include/uapi/linux/gpio.h @@ -292,6 +292,9 @@ struct gpio_v2_line_event { /* * ABI v1 + * + * This version of the ABI is deprecated. + * Use the latest version of the ABI, defined above, instead. */ /* Informational flags */ @@ -315,6 +318,9 @@ struct gpio_v2_line_event { * @consumer: a functional name for the consumer of this GPIO line as set by * whatever is using it, will be empty if there is no current user but may * also be empty if the consumer doesn't set this up + * + * This struct is part of ABI v1 and is deprecated. + * Use struct gpio_v2_line_info instead. */ struct gpioline_info { __u32 line_offset; @@ -346,6 +352,9 @@ enum { * guarantee there are no implicit holes between it and subsequent members. * The 20-byte padding at the end makes sure we don't add any implicit padding * at the end of the structure on 64-bit architectures. + * + * This struct is part of ABI v1 and is deprecated. + * Use struct gpio_v2_line_info_changed instead. */ struct gpioline_info_changed { struct gpioline_info info; @@ -385,6 +394,9 @@ struct gpioline_info_changed { * @fd: if successful this field will contain a valid anonymous file handle * after a GPIO_GET_LINEHANDLE_IOCTL operation, zero or negative value * means error + * + * This struct is part of ABI v1 and is deprecated. + * Use struct gpio_v2_line_request instead. */ struct gpiohandle_request { __u32 lineoffsets[GPIOHANDLES_MAX]; @@ -404,6 +416,9 @@ struct gpiohandle_request { * this specifies the default output value, should be 0 (low) or * 1 (high), anything else than 0 or 1 will be interpreted as 1 (high) * @padding: reserved for future use and should be zero filled + * + * This struct is part of ABI v1 and is deprecated. + * Use struct gpio_v2_line_config instead. */ struct gpiohandle_config { __u32 flags; @@ -416,6 +431,9 @@ struct gpiohandle_config { * @values: when getting the state of lines this contains the current * state of a line, when setting the state of lines these should contain * the desired target state + * + * This struct is part of ABI v1 and is deprecated. + * Use struct gpio_v2_line_values instead. */ struct gpiohandle_data { __u8 values[GPIOHANDLES_MAX]; @@ -439,6 +457,9 @@ struct gpiohandle_data { * @fd: if successful this field will contain a valid anonymous file handle * after a GPIO_GET_LINEEVENT_IOCTL operation, zero or negative value * means error + * + * This struct is part of ABI v1 and is deprecated. + * Use struct gpio_v2_line_request instead. */ struct gpioevent_request { __u32 lineoffset; @@ -458,6 +479,9 @@ struct gpioevent_request { * struct gpioevent_data - The actual event being pushed to userspace * @timestamp: best estimate of time of event occurrence, in nanoseconds * @id: event identifier + * + * This struct is part of ABI v1 and is deprecated. + * Use struct gpio_v2_line_event instead. 
*/ struct gpioevent_data { __u64 timestamp; @@ -482,6 +506,8 @@ struct gpioevent_data { /* * v1 ioctl()s + * + * These ioctl()s are deprecated. Use the v2 equivalent instead. */ #define GPIO_GET_LINEINFO_IOCTL _IOWR(0xB4, 0x02, struct gpioline_info) #define GPIO_GET_LINEHANDLE_IOCTL _IOWR(0xB4, 0x03, struct gpiohandle_request) -- cgit v1.2.3 From 002f2176532093753cb6ced61e5ea7b8904c6cae Mon Sep 17 00:00:00 2001 From: "Jose M. Guisado Gomez" Date: Mon, 28 Sep 2020 14:27:10 +0200 Subject: netfilter: nf_tables: add userdata attributes to nft_chain Enables storing userdata for nft_chain. Field udata points to user data and udlen stores its length. Adds new attribute flag NFTA_CHAIN_USERDATA. Signed-off-by: Jose M. Guisado Gomez Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 ++ include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 33 ++++++++++++++++++++++++-------- 3 files changed, 29 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index c4c526507ddb..0bd2a081ae39 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -945,6 +945,8 @@ struct nft_chain { bound:1, genmask:2; char *name; + u16 udlen; + u8 *udata; /* Only used during control plane commit phase: */ struct nft_rule **rules_next; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 3c2469b43742..352ee51707a1 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -208,6 +208,7 @@ enum nft_chain_flags { * @NFTA_CHAIN_COUNTERS: counter specification of the chain (NLA_NESTED: nft_counter_attributes) * @NFTA_CHAIN_FLAGS: chain flags * @NFTA_CHAIN_ID: uniquely identifies a chain in a transaction (NLA_U32) + * @NFTA_CHAIN_USERDATA: user data (NLA_BINARY) */ enum nft_chain_attributes { NFTA_CHAIN_UNSPEC, @@ -222,6 +223,7 @@ enum nft_chain_attributes { NFTA_CHAIN_PAD, NFTA_CHAIN_FLAGS, NFTA_CHAIN_ID, + NFTA_CHAIN_USERDATA, __NFTA_CHAIN_MAX }; #define NFTA_CHAIN_MAX (__NFTA_CHAIN_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 0473316aa392..3cfff31e4818 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1304,6 +1304,8 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = { [NFTA_CHAIN_COUNTERS] = { .type = NLA_NESTED }, [NFTA_CHAIN_FLAGS] = { .type = NLA_U32 }, [NFTA_CHAIN_ID] = { .type = NLA_U32 }, + [NFTA_CHAIN_USERDATA] = { .type = NLA_BINARY, + .len = NFT_USERDATA_MAXLEN }, }; static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = { @@ -1445,6 +1447,10 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, if (nla_put_be32(skb, NFTA_CHAIN_USE, htonl(chain->use))) goto nla_put_failure; + if (chain->udata && + nla_put(skb, NFTA_CHAIN_USERDATA, chain->udlen, chain->udata)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; @@ -1682,9 +1688,11 @@ void nf_tables_chain_destroy(struct nft_ctx *ctx) free_percpu(rcu_dereference_raw(basechain->stats)); } kfree(chain->name); + kfree(chain->udata); kfree(basechain); } else { kfree(chain->name); + kfree(chain->udata); kfree(chain); } } @@ -2038,7 +2046,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, } else { if (!(flags & NFT_CHAIN_BINDING)) { err = -EINVAL; - goto err1; + goto err_destroy_chain; } snprintf(name, sizeof(name), "__chain%llu", ++chain_id); @@ 
-2047,13 +2055,22 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, if (!chain->name) { err = -ENOMEM; - goto err1; + goto err_destroy_chain; + } + + if (nla[NFTA_CHAIN_USERDATA]) { + chain->udata = nla_memdup(nla[NFTA_CHAIN_USERDATA], GFP_KERNEL); + if (chain->udata == NULL) { + err = -ENOMEM; + goto err_destroy_chain; + } + chain->udlen = nla_len(nla[NFTA_CHAIN_USERDATA]); } rules = nf_tables_chain_alloc_rules(chain, 0); if (!rules) { err = -ENOMEM; - goto err1; + goto err_destroy_chain; } *rules = NULL; @@ -2062,12 +2079,12 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, err = nf_tables_register_hook(net, table, chain); if (err < 0) - goto err1; + goto err_destroy_chain; trans = nft_trans_chain_add(ctx, NFT_MSG_NEWCHAIN); if (IS_ERR(trans)) { err = PTR_ERR(trans); - goto err2; + goto err_unregister_hook; } nft_trans_chain_policy(trans) = NFT_CHAIN_POLICY_UNSET; @@ -2077,15 +2094,15 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, err = nft_chain_add(table, chain); if (err < 0) { nft_trans_destroy(trans); - goto err2; + goto err_unregister_hook; } table->use++; return 0; -err2: +err_unregister_hook: nf_tables_unregister_hook(net, table, chain); -err1: +err_destroy_chain: nf_tables_chain_destroy(ctx); return err; -- cgit v1.2.3 From b426ce83baa7dff947fb354118d3133f2953aac8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 30 Sep 2020 17:18:15 +0200 Subject: bpf: Add classid helper only based on skb->sk Similarly to 5a52ae4e32a6 ("bpf: Allow to retrieve cgroup v1 classid from v2 hooks"), add a helper to retrieve cgroup v1 classid solely based on the skb->sk, so it can be used as key as part of BPF map lookups out of tc from host ns, in particular given the skb->sk is retained these days when crossing net ns thanks to 9c4c325252c5 ("skbuff: preserve sock reference when scrubbing the skb."). This is similar to bpf_skb_cgroup_id() which implements the same for v2. Kubernetes ecosystem is still operating on v1 however, hence net_cls needs to be used there until this can be dropped in with the v2 helper of bpf_skb_cgroup_id(). Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/ed633cf27a1c620e901c5aa99ebdefb028dce600.1601477936.git.daniel@iogearbox.net --- include/uapi/linux/bpf.h | 10 ++++++++++ net/core/filter.c | 21 +++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 10 ++++++++++ 3 files changed, 41 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2b1d3f16cbd1..6116a7f54c8f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3643,6 +3643,15 @@ union bpf_attr { * *flags* are identical to those used for bpf_snprintf_btf. * Return * 0 on success or a negative error in case of failure. + * + * u64 bpf_skb_cgroup_classid(struct sk_buff *skb) + * Description + * See **bpf_get_cgroup_classid**\ () for the main description. + * This helper differs from **bpf_get_cgroup_classid**\ () in that + * the cgroup v1 net_cls class is retrieved only from the *skb*'s + * associated socket instead of the current process. + * Return + * The id is returned or 0 in case the id could not be retrieved. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3796,6 +3805,7 @@ union bpf_attr { FN(copy_from_user), \ FN(snprintf_btf), \ FN(seq_printf_btf), \ + FN(skb_cgroup_classid), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/net/core/filter.c b/net/core/filter.c index af88935e24b1..fa01c697977d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2707,6 +2707,23 @@ static const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = { .gpl_only = false, .ret_type = RET_INTEGER, }; + +BPF_CALL_1(bpf_skb_cgroup_classid, const struct sk_buff *, skb) +{ + struct sock *sk = skb_to_full_sk(skb); + + if (!sk || !sk_fullsock(sk)) + return 0; + + return sock_cgroup_classid(&sk->sk_cgrp_data); +} + +static const struct bpf_func_proto bpf_skb_cgroup_classid_proto = { + .func = bpf_skb_cgroup_classid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; #endif BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) @@ -6772,6 +6789,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_get_xfrm_state: return &bpf_skb_get_xfrm_state_proto; #endif +#ifdef CONFIG_CGROUP_NET_CLASSID + case BPF_FUNC_skb_cgroup_classid: + return &bpf_skb_cgroup_classid_proto; +#endif #ifdef CONFIG_SOCK_CGROUP_DATA case BPF_FUNC_skb_cgroup_id: return &bpf_skb_cgroup_id_proto; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2b1d3f16cbd1..6116a7f54c8f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3643,6 +3643,15 @@ union bpf_attr { * *flags* are identical to those used for bpf_snprintf_btf. * Return * 0 on success or a negative error in case of failure. + * + * u64 bpf_skb_cgroup_classid(struct sk_buff *skb) + * Description + * See **bpf_get_cgroup_classid**\ () for the main description. + * This helper differs from **bpf_get_cgroup_classid**\ () in that + * the cgroup v1 net_cls class is retrieved only from the *skb*'s + * associated socket instead of the current process. + * Return + * The id is returned or 0 in case the id could not be retrieved. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3796,6 +3805,7 @@ union bpf_attr { FN(copy_from_user), \ FN(snprintf_btf), \ FN(seq_printf_btf), \ + FN(skb_cgroup_classid), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From b4ab31414970a7a03a5d55d75083f2c101a30592 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 30 Sep 2020 17:18:17 +0200 Subject: bpf: Add redirect_neigh helper as redirect drop-in Add a redirect_neigh() helper as redirect() drop-in replacement for the xmit side. Main idea for the helper is to be very similar in semantics to the latter just that the skb gets injected into the neighboring subsystem in order to let the stack do the work it knows best anyway to populate the L2 addresses of the packet and then hand over to dev_queue_xmit() as redirect() does. This solves two bigger items: i) skbs don't need to go up to the stack on the host facing veth ingress side for traffic egressing the container to achieve the same for populating L2 which also has the huge advantage that ii) the skb->sk won't get orphaned in ip_rcv_core() when entering the IP routing layer on the host stack. 
Given that skb->sk neither gets orphaned when crossing the netns as per 9c4c325252c5 ("skbuff: preserve sock reference when scrubbing the skb.") the helper can then push the skbs directly to the phys device where FQ scheduler can do its work and TCP stack gets proper backpressure given we hold on to skb->sk as long as skb is still residing in queues. With the helper used in BPF data path to then push the skb to the phys device, I observed a stable/consistent TCP_STREAM improvement on veth devices for traffic going container -> host -> host -> container from ~10Gbps to ~15Gbps for a single stream in my test environment. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Reviewed-by: David Ahern Acked-by: Martin KaFai Lau Cc: David Ahern Link: https://lore.kernel.org/bpf/f207de81629e1724899b73b8112e0013be782d35.1601477936.git.daniel@iogearbox.net --- include/linux/skbuff.h | 5 + include/uapi/linux/bpf.h | 14 +++ net/core/filter.c | 276 ++++++++++++++++++++++++++++++++++++++--- tools/include/uapi/linux/bpf.h | 14 +++ 4 files changed, 294 insertions(+), 15 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 04a18e01b362..3d0cf3722bb4 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2548,6 +2548,11 @@ static inline int skb_mac_header_was_set(const struct sk_buff *skb) return skb->mac_header != (typeof(skb->mac_header))~0U; } +static inline void skb_unset_mac_header(struct sk_buff *skb) +{ + skb->mac_header = (typeof(skb->mac_header))~0U; +} + static inline void skb_reset_mac_header(struct sk_buff *skb) { skb->mac_header = skb->data - skb->head; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6116a7f54c8f..1f17c6752deb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3652,6 +3652,19 @@ union bpf_attr { * associated socket instead of the current process. * Return * The id is returned or 0 in case the id could not be retrieved. + * + * long bpf_redirect_neigh(u32 ifindex, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex* + * and fill in L2 addresses from neighboring subsystem. This helper + * is somewhat similar to **bpf_redirect**\ (), except that it + * fills in e.g. MAC addresses based on the L3 information from + * the packet. This helper is supported for IPv4 and IPv6 protocols. + * The *flags* argument is reserved and must be 0. The helper is + * currently only supported for tc BPF program types. + * Return + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3806,6 +3819,7 @@ union bpf_attr { FN(snprintf_btf), \ FN(seq_printf_btf), \ FN(skb_cgroup_classid), \ + FN(redirect_neigh), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/net/core/filter.c b/net/core/filter.c index a0776e48dcc9..3fb6adad1957 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2163,13 +2163,233 @@ static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, return __bpf_redirect_no_mac(skb, dev, flags); } +#if IS_ENABLED(CONFIG_IPV6) +static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct net_device *dev = dst->dev; + u32 hh_len = LL_RESERVED_SPACE(dev); + const struct in6_addr *nexthop; + struct neighbour *neigh; + + if (dev_xmit_recursion()) { + net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); + goto out_drop; + } + + skb->dev = dev; + skb->tstamp = 0; + + if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { + struct sk_buff *skb2; + + skb2 = skb_realloc_headroom(skb, hh_len); + if (unlikely(!skb2)) { + kfree_skb(skb); + return -ENOMEM; + } + if (skb->sk) + skb_set_owner_w(skb2, skb->sk); + consume_skb(skb); + skb = skb2; + } + + rcu_read_lock_bh(); + nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst), + &ipv6_hdr(skb)->daddr); + neigh = ip_neigh_gw6(dev, nexthop); + if (likely(!IS_ERR(neigh))) { + int ret; + + sock_confirm_neigh(skb, neigh); + dev_xmit_recursion_inc(); + ret = neigh_output(neigh, skb, false); + dev_xmit_recursion_dec(); + rcu_read_unlock_bh(); + return ret; + } + rcu_read_unlock_bh(); + IP6_INC_STATS(dev_net(dst->dev), + ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); +out_drop: + kfree_skb(skb); + return -ENETDOWN; +} + +static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct net *net = dev_net(dev); + int err, ret = NET_XMIT_DROP; + struct dst_entry *dst; + struct flowi6 fl6 = { + .flowi6_flags = FLOWI_FLAG_ANYSRC, + .flowi6_mark = skb->mark, + .flowlabel = ip6_flowinfo(ip6h), + .flowi6_oif = dev->ifindex, + .flowi6_proto = ip6h->nexthdr, + .daddr = ip6h->daddr, + .saddr = ip6h->saddr, + }; + + dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); + if (IS_ERR(dst)) + goto out_drop; + + skb_dst_set(skb, dst); + + err = bpf_out_neigh_v6(net, skb); + if (unlikely(net_xmit_eval(err))) + dev->stats.tx_errors++; + else + ret = NET_XMIT_SUCCESS; + goto out_xmit; +out_drop: + dev->stats.tx_errors++; + kfree_skb(skb); +out_xmit: + return ret; +} +#else +static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) +{ + kfree_skb(skb); + return NET_XMIT_DROP; +} +#endif /* CONFIG_IPV6 */ + +#if IS_ENABLED(CONFIG_INET) +static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct rtable *rt = container_of(dst, struct rtable, dst); + struct net_device *dev = dst->dev; + u32 hh_len = LL_RESERVED_SPACE(dev); + struct neighbour *neigh; + bool is_v6gw = false; + + if (dev_xmit_recursion()) { + net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); + goto out_drop; + } + + skb->dev = dev; + skb->tstamp = 0; + + if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { + struct sk_buff *skb2; + + skb2 = skb_realloc_headroom(skb, hh_len); + if (unlikely(!skb2)) { + kfree_skb(skb); + return -ENOMEM; + } + if (skb->sk) + 
skb_set_owner_w(skb2, skb->sk); + consume_skb(skb); + skb = skb2; + } + + rcu_read_lock_bh(); + neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); + if (likely(!IS_ERR(neigh))) { + int ret; + + sock_confirm_neigh(skb, neigh); + dev_xmit_recursion_inc(); + ret = neigh_output(neigh, skb, is_v6gw); + dev_xmit_recursion_dec(); + rcu_read_unlock_bh(); + return ret; + } + rcu_read_unlock_bh(); +out_drop: + kfree_skb(skb); + return -ENETDOWN; +} + +static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev) +{ + const struct iphdr *ip4h = ip_hdr(skb); + struct net *net = dev_net(dev); + int err, ret = NET_XMIT_DROP; + struct rtable *rt; + struct flowi4 fl4 = { + .flowi4_flags = FLOWI_FLAG_ANYSRC, + .flowi4_mark = skb->mark, + .flowi4_tos = RT_TOS(ip4h->tos), + .flowi4_oif = dev->ifindex, + .flowi4_proto = ip4h->protocol, + .daddr = ip4h->daddr, + .saddr = ip4h->saddr, + }; + + rt = ip_route_output_flow(net, &fl4, NULL); + if (IS_ERR(rt)) + goto out_drop; + if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { + ip_rt_put(rt); + goto out_drop; + } + + skb_dst_set(skb, &rt->dst); + + err = bpf_out_neigh_v4(net, skb); + if (unlikely(net_xmit_eval(err))) + dev->stats.tx_errors++; + else + ret = NET_XMIT_SUCCESS; + goto out_xmit; +out_drop: + dev->stats.tx_errors++; + kfree_skb(skb); +out_xmit: + return ret; +} +#else +static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev) +{ + kfree_skb(skb); + return NET_XMIT_DROP; +} +#endif /* CONFIG_INET */ + +static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev) +{ + struct ethhdr *ethh = eth_hdr(skb); + + if (unlikely(skb->mac_header >= skb->network_header)) + goto out; + bpf_push_mac_rcsum(skb); + if (is_multicast_ether_addr(ethh->h_dest)) + goto out; + + skb_pull(skb, sizeof(*ethh)); + skb_unset_mac_header(skb); + skb_reset_network_header(skb); + + if (skb->protocol == htons(ETH_P_IP)) + return __bpf_redirect_neigh_v4(skb, dev); + else if (skb->protocol == htons(ETH_P_IPV6)) + return __bpf_redirect_neigh_v6(skb, dev); +out: + kfree_skb(skb); + return -ENOTSUPP; +} + +/* Internal, non-exposed redirect flags. */ +enum { + BPF_F_NEIGH = (1ULL << 1), +#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH) +}; + BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) { struct net_device *dev; struct sk_buff *clone; int ret; - if (unlikely(flags & ~(BPF_F_INGRESS))) + if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) return -EINVAL; dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex); @@ -2206,23 +2426,11 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = { DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info); -BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) -{ - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - - if (unlikely(flags & ~(BPF_F_INGRESS))) - return TC_ACT_SHOT; - - ri->flags = flags; - ri->tgt_index = ifindex; - - return TC_ACT_REDIRECT; -} - int skb_do_redirect(struct sk_buff *skb) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct net_device *dev; + u32 flags = ri->flags; dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index); ri->tgt_index = 0; @@ -2231,7 +2439,22 @@ int skb_do_redirect(struct sk_buff *skb) return -EINVAL; } - return __bpf_redirect(skb, dev, ri->flags); + return flags & BPF_F_NEIGH ? 
+ __bpf_redirect_neigh(skb, dev) : + __bpf_redirect(skb, dev, flags); +} + +BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) + return TC_ACT_SHOT; + + ri->flags = flags; + ri->tgt_index = ifindex; + + return TC_ACT_REDIRECT; } static const struct bpf_func_proto bpf_redirect_proto = { @@ -2242,6 +2465,27 @@ static const struct bpf_func_proto bpf_redirect_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + if (unlikely(flags)) + return TC_ACT_SHOT; + + ri->flags = BPF_F_NEIGH; + ri->tgt_index = ifindex; + + return TC_ACT_REDIRECT; +} + +static const struct bpf_func_proto bpf_redirect_neigh_proto = { + .func = bpf_redirect_neigh, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes) { msg->apply_bytes = bytes; @@ -6759,6 +7003,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_redirect: return &bpf_redirect_proto; + case BPF_FUNC_redirect_neigh: + return &bpf_redirect_neigh_proto; case BPF_FUNC_get_route_realm: return &bpf_get_route_realm_proto; case BPF_FUNC_get_hash_recalc: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 6116a7f54c8f..1f17c6752deb 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3652,6 +3652,19 @@ union bpf_attr { * associated socket instead of the current process. * Return * The id is returned or 0 in case the id could not be retrieved. + * + * long bpf_redirect_neigh(u32 ifindex, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex* + * and fill in L2 addresses from neighboring subsystem. This helper + * is somewhat similar to **bpf_redirect**\ (), except that it + * fills in e.g. MAC addresses based on the L3 information from + * the packet. This helper is supported for IPv4 and IPv6 protocols. + * The *flags* argument is reserved and must be 0. The helper is + * currently only supported for tc BPF program types. + * Return + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3806,6 +3819,7 @@ union bpf_attr { FN(snprintf_btf), \ FN(seq_printf_btf), \ FN(skb_cgroup_classid), \ + FN(redirect_neigh), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 9d4a75efa200a31deabe9ba1c941aef697e6bb30 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 27 Aug 2020 16:58:29 +0200 Subject: io_uring: use an enumeration for io_uring_register(2) opcodes The enumeration allows us to keep track of the last io_uring_register(2) opcode available. Behaviour and opcode names don't change.
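As a hypothetical illustration (not part of this patch), the sentinel lets callers reject register opcodes their kernel headers do not know about; the helper name below is invented for the example:

    #include <linux/io_uring.h>

    /* Bounds-check an io_uring_register(2) opcode against the sentinel. */
    static int register_opcode_is_known(unsigned int opcode)
    {
            return opcode < IORING_REGISTER_LAST;
    }
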
Signed-off-by: Stefano Garzarella Reviewed-by: Kees Cook Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index d65fde732518..5f12ae6a415c 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -255,17 +255,22 @@ struct io_uring_params { /* * io_uring_register(2) opcodes and arguments */ -#define IORING_REGISTER_BUFFERS 0 -#define IORING_UNREGISTER_BUFFERS 1 -#define IORING_REGISTER_FILES 2 -#define IORING_UNREGISTER_FILES 3 -#define IORING_REGISTER_EVENTFD 4 -#define IORING_UNREGISTER_EVENTFD 5 -#define IORING_REGISTER_FILES_UPDATE 6 -#define IORING_REGISTER_EVENTFD_ASYNC 7 -#define IORING_REGISTER_PROBE 8 -#define IORING_REGISTER_PERSONALITY 9 -#define IORING_UNREGISTER_PERSONALITY 10 +enum { + IORING_REGISTER_BUFFERS = 0, + IORING_UNREGISTER_BUFFERS = 1, + IORING_REGISTER_FILES = 2, + IORING_UNREGISTER_FILES = 3, + IORING_REGISTER_EVENTFD = 4, + IORING_UNREGISTER_EVENTFD = 5, + IORING_REGISTER_FILES_UPDATE = 6, + IORING_REGISTER_EVENTFD_ASYNC = 7, + IORING_REGISTER_PROBE = 8, + IORING_REGISTER_PERSONALITY = 9, + IORING_UNREGISTER_PERSONALITY = 10, + + /* this goes last */ + IORING_REGISTER_LAST +}; struct io_uring_files_update { __u32 offset; -- cgit v1.2.3 From 21b55dbc0653018b8cd4513c37cbca303b0f0d50 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 27 Aug 2020 16:58:30 +0200 Subject: io_uring: add IORING_REGISTER_RESTRICTIONS opcode The new io_uring_register(2) IORING_REGISTER_RESTRICTIONS opcode permanently installs a feature allowlist on an io_ring_ctx. The io_ring_ctx can then be passed to untrusted code with the knowledge that only operations present in the allowlist can be executed. The allowlist approach ensures that new features added to io_uring do not accidentally become available when an existing application is launched on a newer kernel version. Currently it is possible to restrict sqe opcodes, sqe flags, and register opcodes. IORING_REGISTER_RESTRICTIONS can only be used once. Afterwards it is not possible to change restrictions anymore. This prevents untrusted code from removing restrictions.
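As a hypothetical userspace sketch (not part of this patch), an allowlist permitting only readv/writev submissions could be installed as below; the restrict_ring() name is invented, and since io_uring_register(2) has no libc wrapper the raw syscall is used:

    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/io_uring.h>

    /* Allow only IORING_OP_READV/IORING_OP_WRITEV SQEs; once registered,
     * any other submission on this ring fails with -EACCES.
     */
    static int restrict_ring(int ring_fd)
    {
            struct io_uring_restriction res[2];

            memset(res, 0, sizeof(res));
            res[0].opcode = IORING_RESTRICTION_SQE_OP;
            res[0].sqe_op = IORING_OP_READV;
            res[1].opcode = IORING_RESTRICTION_SQE_OP;
            res[1].sqe_op = IORING_OP_WRITEV;

            return syscall(__NR_io_uring_register, ring_fd,
                           IORING_REGISTER_RESTRICTIONS, res, 2);
    }
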
Suggested-by: Stefan Hajnoczi Signed-off-by: Stefano Garzarella Reviewed-by: Kees Cook Signed-off-by: Jens Axboe --- fs/io_uring.c | 124 +++++++++++++++++++++++++++++++++++++++++- include/uapi/linux/io_uring.h | 31 +++++++++++ 2 files changed, 154 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index 05ec385a6094..c4855cecc8f3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -99,6 +99,8 @@ #define IORING_MAX_FILES_TABLE (1U << IORING_FILE_TABLE_SHIFT) #define IORING_FILE_TABLE_MASK (IORING_MAX_FILES_TABLE - 1) #define IORING_MAX_FIXED_FILES (64 * IORING_MAX_FILES_TABLE) +#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ + IORING_REGISTER_LAST + IORING_OP_LAST) struct io_uring { u32 head ____cacheline_aligned_in_smp; @@ -220,6 +222,13 @@ struct io_buffer { __u16 bid; }; +struct io_restriction { + DECLARE_BITMAP(register_op, IORING_REGISTER_LAST); + DECLARE_BITMAP(sqe_op, IORING_OP_LAST); + u8 sqe_flags_allowed; + u8 sqe_flags_required; +}; + struct io_ring_ctx { struct { struct percpu_ref refs; @@ -232,6 +241,7 @@ struct io_ring_ctx { unsigned int cq_overflow_flushed: 1; unsigned int drain_next: 1; unsigned int eventfd_async: 1; + unsigned int restricted: 1; /* * Ring buffer of indices into array of io_uring_sqe, which is @@ -346,6 +356,7 @@ struct io_ring_ctx { struct llist_head file_put_llist; struct work_struct exit_work; + struct io_restriction restrictions; }; /* @@ -6438,6 +6449,32 @@ static inline void io_consume_sqe(struct io_ring_ctx *ctx) ctx->cached_sq_head++; } +/* + * Check SQE restrictions (opcode and flags). + * + * Returns 'true' if SQE is allowed, 'false' otherwise. + */ +static inline bool io_check_restriction(struct io_ring_ctx *ctx, + struct io_kiocb *req, + unsigned int sqe_flags) +{ + if (!ctx->restricted) + return true; + + if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) + return false; + + if ((sqe_flags & ctx->restrictions.sqe_flags_required) != + ctx->restrictions.sqe_flags_required) + return false; + + if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed | + ctx->restrictions.sqe_flags_required)) + return false; + + return true; +} + #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ IOSQE_IO_HARDLINK | IOSQE_ASYNC | \ IOSQE_BUFFER_SELECT) @@ -6473,6 +6510,9 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) return -EINVAL; + if (unlikely(!io_check_restriction(ctx, req, sqe_flags))) + return -EACCES; + if ((sqe_flags & IOSQE_BUFFER_SELECT) && !io_op_defs[req->opcode].buffer_select) return -EOPNOTSUPP; @@ -9077,6 +9117,72 @@ static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) return -EINVAL; } +static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg, + unsigned int nr_args) +{ + struct io_uring_restriction *res; + size_t size; + int i, ret; + + /* We allow only a single restrictions registration */ + if (ctx->restricted) + return -EBUSY; + + if (!arg || nr_args > IORING_MAX_RESTRICTIONS) + return -EINVAL; + + size = array_size(nr_args, sizeof(*res)); + if (size == SIZE_MAX) + return -EOVERFLOW; + + res = memdup_user(arg, size); + if (IS_ERR(res)) + return PTR_ERR(res); + + ret = 0; + + for (i = 0; i < nr_args; i++) { + switch (res[i].opcode) { + case IORING_RESTRICTION_REGISTER_OP: + if (res[i].register_op >= IORING_REGISTER_LAST) { + ret = -EINVAL; + goto out; + } + + __set_bit(res[i].register_op, + ctx->restrictions.register_op); + break; + case 
IORING_RESTRICTION_SQE_OP: + if (res[i].sqe_op >= IORING_OP_LAST) { + ret = -EINVAL; + goto out; + } + + __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op); + break; + case IORING_RESTRICTION_SQE_FLAGS_ALLOWED: + ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags; + break; + case IORING_RESTRICTION_SQE_FLAGS_REQUIRED: + ctx->restrictions.sqe_flags_required = res[i].sqe_flags; + break; + default: + ret = -EINVAL; + goto out; + } + } + +out: + /* Reset all restrictions if an error happened */ + if (ret != 0) + memset(&ctx->restrictions, 0, sizeof(ctx->restrictions)); + else + ctx->restricted = 1; + + kfree(res); + return ret; +} + static bool io_register_op_must_quiesce(int op) { switch (op) { @@ -9123,6 +9229,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, if (ret) { percpu_ref_resurrect(&ctx->refs); ret = -EINTR; + goto out_quiesce; + } + } + + if (ctx->restricted) { + if (opcode >= IORING_REGISTER_LAST) { + ret = -EINVAL; + goto out; + } + + if (!test_bit(opcode, ctx->restrictions.register_op)) { + ret = -EACCES; goto out; } } @@ -9186,15 +9304,19 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_unregister_personality(ctx, nr_args); break; + case IORING_REGISTER_RESTRICTIONS: + ret = io_register_restrictions(ctx, arg, nr_args); + break; default: ret = -EINVAL; break; } +out: if (io_register_op_must_quiesce(opcode)) { /* bring the ctx back to life */ percpu_ref_reinit(&ctx->refs); -out: +out_quiesce: reinit_completion(&ctx->ref_comp); } return ret; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 5f12ae6a415c..6e7f2e5e917b 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -267,6 +267,7 @@ enum { IORING_REGISTER_PROBE = 8, IORING_REGISTER_PERSONALITY = 9, IORING_UNREGISTER_PERSONALITY = 10, + IORING_REGISTER_RESTRICTIONS = 11, /* this goes last */ IORING_REGISTER_LAST @@ -295,4 +296,34 @@ struct io_uring_probe { struct io_uring_probe_op ops[0]; }; +struct io_uring_restriction { + __u16 opcode; + union { + __u8 register_op; /* IORING_RESTRICTION_REGISTER_OP */ + __u8 sqe_op; /* IORING_RESTRICTION_SQE_OP */ + __u8 sqe_flags; /* IORING_RESTRICTION_SQE_FLAGS_* */ + }; + __u8 resv; + __u32 resv2[3]; +}; + +/* + * io_uring_restriction->opcode values + */ +enum { + /* Allow an io_uring_register(2) opcode */ + IORING_RESTRICTION_REGISTER_OP = 0, + + /* Allow an sqe opcode */ + IORING_RESTRICTION_SQE_OP = 1, + + /* Allow sqe flags */ + IORING_RESTRICTION_SQE_FLAGS_ALLOWED = 2, + + /* Require sqe flags (these flags must be set on each submission) */ + IORING_RESTRICTION_SQE_FLAGS_REQUIRED = 3, + + IORING_RESTRICTION_LAST +}; + #endif -- cgit v1.2.3 From 7e84e1c7566a1df470a9e1f49d3db2ce311261a4 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 27 Aug 2020 16:58:31 +0200 Subject: io_uring: allow disabling rings during creation This patch adds a new IORING_SETUP_R_DISABLED flag to start the rings disabled, allowing the user to register restrictions, buffers, and files before SQE processing starts. When IORING_SETUP_R_DISABLED is set, SQEs are not processed and the SQPOLL kthread is not started. Restriction registration is allowed only while the rings are disabled, to prevent concurrency issues while processing SQEs. The rings can be enabled using the IORING_REGISTER_ENABLE_RINGS opcode with io_uring_register(2).
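A sketch of the intended flow (illustrative only; raw syscalls are used, the helper name is hypothetical, and error handling is elided): create the ring disabled, lock in restrictions, then enable it before handing the fd to untrusted code.

#include <linux/io_uring.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int setup_restricted_ring(unsigned entries)
{
	struct io_uring_params p;
	struct io_uring_restriction res;
	int fd;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_R_DISABLED;	/* no SQE processing yet */
	fd = syscall(__NR_io_uring_setup, entries, &p);

	memset(&res, 0, sizeof(res));
	res.opcode = IORING_RESTRICTION_SQE_OP;
	res.sqe_op = IORING_OP_NOP;		/* allow only NOP SQEs */
	syscall(__NR_io_uring_register, fd, IORING_REGISTER_RESTRICTIONS,
		&res, 1);

	/* restrictions are now locked in; start SQE processing */
	syscall(__NR_io_uring_register, fd, IORING_REGISTER_ENABLE_RINGS,
		NULL, 0);
	return fd;
}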
Suggested-by: Jens Axboe Signed-off-by: Stefano Garzarella Reviewed-by: Kees Cook Signed-off-by: Jens Axboe --- fs/io_uring.c | 61 ++++++++++++++++++++++++++++++++++++++----- include/uapi/linux/io_uring.h | 2 ++ 2 files changed, 56 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index c4855cecc8f3..b8fdb10c23e3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -227,6 +227,7 @@ struct io_restriction { DECLARE_BITMAP(sqe_op, IORING_OP_LAST); u8 sqe_flags_allowed; u8 sqe_flags_required; + bool registered; }; struct io_ring_ctx { @@ -6910,6 +6911,14 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) static void io_sq_thread_stop(struct io_ring_ctx *ctx) { if (ctx->sqo_thread) { + /* + * We may arrive here from the error branch in + * io_sq_offload_create() where the kthread is created + * without being waked up, thus wake it up now to make + * sure the wait will complete. + */ + wake_up_process(ctx->sqo_thread); + wait_for_completion(&ctx->sq_thread_comp); /* * The park is a bit of a work-around, without it we get @@ -7581,8 +7590,8 @@ void __io_uring_free(struct task_struct *tsk) tsk->io_uring = NULL; } -static int io_sq_offload_start(struct io_ring_ctx *ctx, - struct io_uring_params *p) +static int io_sq_offload_create(struct io_ring_ctx *ctx, + struct io_uring_params *p) { int ret; @@ -7619,7 +7628,6 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, ret = io_uring_alloc_task_context(ctx->sqo_thread); if (ret) goto err; - wake_up_process(ctx->sqo_thread); } else if (p->flags & IORING_SETUP_SQ_AFF) { /* Can't have SQ_AFF without SQPOLL */ ret = -EINVAL; @@ -7636,6 +7644,12 @@ err: return ret; } +static void io_sq_offload_start(struct io_ring_ctx *ctx) +{ + if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sqo_thread) + wake_up_process(ctx->sqo_thread); +} + static inline void __io_unaccount_mem(struct user_struct *user, unsigned long nr_pages) { @@ -8633,6 +8647,10 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (!percpu_ref_tryget(&ctx->refs)) goto out_fput; + ret = -EBADFD; + if (ctx->flags & IORING_SETUP_R_DISABLED) + goto out; + /* * For SQ polling, the thread will do all submissions and completions. 
* Just return the requested submit count, and wake the thread if @@ -8975,10 +8993,13 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, if (ret) goto err; - ret = io_sq_offload_start(ctx, p); + ret = io_sq_offload_create(ctx, p); if (ret) goto err; + if (!(p->flags & IORING_SETUP_R_DISABLED)) + io_sq_offload_start(ctx); + memset(&p->sq_off, 0, sizeof(p->sq_off)); p->sq_off.head = offsetof(struct io_rings, sq.head); p->sq_off.tail = offsetof(struct io_rings, sq.tail); @@ -9041,7 +9062,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | - IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ)) + IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | + IORING_SETUP_R_DISABLED)) return -EINVAL; return io_uring_create(entries, &p, params); @@ -9124,8 +9146,12 @@ static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg, size_t size; int i, ret; + /* Restrictions allowed only if rings started disabled */ + if (!(ctx->flags & IORING_SETUP_R_DISABLED)) + return -EBADFD; + /* We allow only a single restrictions registration */ - if (ctx->restricted) + if (ctx->restrictions.registered) return -EBUSY; if (!arg || nr_args > IORING_MAX_RESTRICTIONS) @@ -9177,12 +9203,27 @@ out: if (ret != 0) memset(&ctx->restrictions, 0, sizeof(ctx->restrictions)); else - ctx->restricted = 1; + ctx->restrictions.registered = true; kfree(res); return ret; } +static int io_register_enable_rings(struct io_ring_ctx *ctx) +{ + if (!(ctx->flags & IORING_SETUP_R_DISABLED)) + return -EBADFD; + + if (ctx->restrictions.registered) + ctx->restricted = 1; + + ctx->flags &= ~IORING_SETUP_R_DISABLED; + + io_sq_offload_start(ctx); + + return 0; +} + static bool io_register_op_must_quiesce(int op) { switch (op) { @@ -9304,6 +9345,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_unregister_personality(ctx, nr_args); break; + case IORING_REGISTER_ENABLE_RINGS: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_register_enable_rings(ctx); + break; case IORING_REGISTER_RESTRICTIONS: ret = io_register_restrictions(ctx, arg, nr_args); break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 6e7f2e5e917b..a0c85e0e9016 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -95,6 +95,7 @@ enum { #define IORING_SETUP_CQSIZE (1U << 3) /* app defines CQ size */ #define IORING_SETUP_CLAMP (1U << 4) /* clamp SQ/CQ ring sizes */ #define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */ +#define IORING_SETUP_R_DISABLED (1U << 6) /* start with ring disabled */ enum { IORING_OP_NOP, @@ -268,6 +269,7 @@ enum { IORING_REGISTER_PERSONALITY = 9, IORING_UNREGISTER_PERSONALITY = 10, IORING_REGISTER_RESTRICTIONS = 11, + IORING_REGISTER_ENABLE_RINGS = 12, /* this goes last */ IORING_REGISTER_LAST -- cgit v1.2.3 From 90554200724d5b280439dc361fe7ee92fe459ea7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 3 Sep 2020 12:12:41 -0600 Subject: io_uring: provide IORING_ENTER_SQ_WAIT for SQPOLL SQ ring waits When using SQPOLL, applications can run into the issue of running out of SQ ring entries because the thread hasn't consumed them yet. The only option for dealing with that is checking later, or busy checking for the condition. Provide IORING_ENTER_SQ_WAIT if applications want to wait on this condition. 
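For example, a minimal sketch of blocking until the SQ ring has room again (assuming a SQPOLL ring fd and the raw __NR_io_uring_enter syscall, since liburing support for the flag may postdate this patch):

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>

static void wait_for_sq_space(int ring_fd)
{
	/* to_submit = 0, min_complete = 0: only wait for SQ ring space */
	syscall(__NR_io_uring_enter, ring_fd, 0, 0,
		IORING_ENTER_SQ_WAIT, NULL, 0);
}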
Signed-off-by: Jens Axboe --- fs/io_uring.c | 40 +++++++++++++++++++++++++++++++++++++--- include/uapi/linux/io_uring.h | 1 + 2 files changed, 38 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index a924ab1cf15b..3ee6ee1785d2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -301,6 +301,7 @@ struct io_ring_ctx { struct io_sq_data *sq_data; /* if using sq thread polling */ + struct wait_queue_head sqo_sq_wait; struct wait_queue_entry sqo_wait_entry; struct list_head sqd_list; @@ -1072,6 +1073,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) goto err; ctx->flags = p->flags; + init_waitqueue_head(&ctx->sqo_sq_wait); INIT_LIST_HEAD(&ctx->sqd_list); init_waitqueue_head(&ctx->cq_wait); INIT_LIST_HEAD(&ctx->cq_overflow_list); @@ -1324,6 +1326,13 @@ static void io_commit_cqring(struct io_ring_ctx *ctx) __io_queue_deferred(ctx); } +static inline bool io_sqring_full(struct io_ring_ctx *ctx) +{ + struct io_rings *r = ctx->rings; + + return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == r->sq_ring_entries; +} + static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; @@ -6736,6 +6745,10 @@ again: if (likely(!percpu_ref_is_dying(&ctx->refs))) ret = io_submit_sqes(ctx, to_submit); mutex_unlock(&ctx->uring_lock); + + if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait)) + wake_up(&ctx->sqo_sq_wait); + return SQT_DID_WORK; } @@ -8231,8 +8244,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) * io_commit_cqring */ smp_rmb(); - if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head != - ctx->rings->sq_ring_entries) + if (!io_sqring_full(ctx)) mask |= EPOLLOUT | EPOLLWRNORM; if (io_cqring_events(ctx, false)) mask |= EPOLLIN | EPOLLRDNORM; @@ -8801,6 +8813,25 @@ static unsigned long io_uring_nommu_get_unmapped_area(struct file *file, #endif /* !CONFIG_MMU */ +static void io_sqpoll_wait_sq(struct io_ring_ctx *ctx) +{ + DEFINE_WAIT(wait); + + do { + if (!io_sqring_full(ctx)) + break; + + prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE); + + if (!io_sqring_full(ctx)) + break; + + schedule(); + } while (!signal_pending(current)); + + finish_wait(&ctx->sqo_sq_wait, &wait); +} + SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, u32, min_complete, u32, flags, const sigset_t __user *, sig, size_t, sigsz) @@ -8812,7 +8843,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, io_run_task_work(); - if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP)) + if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | + IORING_ENTER_SQ_WAIT)) return -EINVAL; f = fdget(fd); @@ -8843,6 +8875,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, io_cqring_overflow_flush(ctx, false, NULL, NULL); if (flags & IORING_ENTER_SQ_WAKEUP) wake_up(&ctx->sq_data->wait); + if (flags & IORING_ENTER_SQ_WAIT) + io_sqpoll_wait_sq(ctx); submitted = to_submit; } else if (to_submit) { ret = io_uring_add_task_file(f.file); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index a0c85e0e9016..98d8e06dea22 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -225,6 +225,7 @@ struct io_cqring_offsets { */ #define IORING_ENTER_GETEVENTS (1U << 0) #define IORING_ENTER_SQ_WAKEUP (1U << 1) +#define IORING_ENTER_SQ_WAIT (1U << 2) /* * Passed in for io_uring_setup(2). 
Copied back with updated info on success -- cgit v1.2.3 From 792caccc4526bb489e054f9ab61d7c024b15dea2 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 30 Sep 2020 15:49:26 -0700 Subject: bpf: Introduce BPF_F_PRESERVE_ELEMS for perf event array Currently, a perf event in a perf event array is removed from the array when the map fd used to add the event is closed. This behavior makes it difficult to share perf events with a perf event array. Introduce a new flag, BPF_F_PRESERVE_ELEMS, that keeps the perf events in the array open. With this flag set, perf events in the array are not removed when the original map fd is closed. Instead, the perf event will stay in the map until 1) it is explicitly removed from the array; or 2) the array is freed. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200930224927.1936644-2-songliubraving@fb.com --- include/uapi/linux/bpf.h | 3 +++ kernel/bpf/arraymap.c | 19 +++++++++++++++++-- tools/include/uapi/linux/bpf.h | 3 +++ 3 files changed, 23 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1f17c6752deb..4f556cfcbfbe 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -414,6 +414,9 @@ enum { /* Enable memory-mapping BPF map */ BPF_F_MMAPABLE = (1U << 10), + +/* Share perf_event among processes */ + BPF_F_PRESERVE_ELEMS = (1U << 11), }; /* Flags for BPF_PROG_QUERY. */ diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index e5fd31268ae0..bd777dd6f967 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -15,7 +15,8 @@ #include "map_in_map.h" #define ARRAY_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK) + (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK | \ + BPF_F_PRESERVE_ELEMS) static void bpf_array_free_percpu(struct bpf_array *array) { @@ -64,6 +65,10 @@ int array_map_alloc_check(union bpf_attr *attr) attr->map_flags & BPF_F_MMAPABLE) return -EINVAL; + if (attr->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY && + attr->map_flags & BPF_F_PRESERVE_ELEMS) + return -EINVAL; + if (attr->value_size > KMALLOC_MAX_SIZE) /* if value_size is bigger, the user space won't be able to * access the elements.
@@ -1134,6 +1139,9 @@ static void perf_event_fd_array_release(struct bpf_map *map, struct bpf_event_entry *ee; int i; + if (map->map_flags & BPF_F_PRESERVE_ELEMS) + return; + rcu_read_lock(); for (i = 0; i < array->map.max_entries; i++) { ee = READ_ONCE(array->ptrs[i]); @@ -1143,12 +1151,19 @@ rcu_read_unlock(); } +static void perf_event_fd_array_map_free(struct bpf_map *map) +{ + if (map->map_flags & BPF_F_PRESERVE_ELEMS) + bpf_fd_array_map_clear(map); + fd_array_map_free(map); +} + static int perf_event_array_map_btf_id; const struct bpf_map_ops perf_event_array_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, - .map_free = fd_array_map_free, + .map_free = perf_event_fd_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 1f17c6752deb..4f556cfcbfbe 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -414,6 +414,9 @@ enum { /* Enable memory-mapping BPF map */ BPF_F_MMAPABLE = (1U << 10), + +/* Share perf_event among processes */ + BPF_F_PRESERVE_ELEMS = (1U << 11), }; /* Flags for BPF_PROG_QUERY. */ -- cgit v1.2.3 From 1e6aaae93e9ddb9dc664993eb949b1da94cab3a5 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 25 Sep 2020 09:32:43 -0700 Subject: iommu/uapi: Add argsz for user filled data As IOMMU UAPI gets extended, user data size may increase. To support backward compatibility, this patch introduces a size field to each UAPI data structure. It is *always* the responsibility of the user to fill in the correct size. Padding fields are adjusted to ensure 8 byte alignment. Specific scenarios for user data handling are documented in: Documentation/userspace-api/iommu.rst As there are no current users of the API, the struct version is not incremented. Signed-off-by: Liu Yi L Signed-off-by: Jacob Pan Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/1601051567-54787-3-git-send-email-jacob.jun.pan@linux.intel.com Signed-off-by: Joerg Roedel --- include/uapi/linux/iommu.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h index c2b2caf9ed41..b42acc8fe007 100644 --- a/include/uapi/linux/iommu.h +++ b/include/uapi/linux/iommu.h @@ -139,6 +139,7 @@ enum iommu_page_response_code { /** * struct iommu_page_response - Generic page response information + * @argsz: User filled size of this data * @version: API version of this structure * @flags: encodes whether the corresponding fields are valid * (IOMMU_FAULT_PAGE_RESPONSE_* values) @@ -147,6 +148,7 @@ enum iommu_page_response_code { * @code: response code from &enum iommu_page_response_code */ struct iommu_page_response { + __u32 argsz; #define IOMMU_PAGE_RESP_VERSION_1 1 __u32 version; #define IOMMU_PAGE_RESP_PASID_VALID (1 << 0) @@ -222,6 +224,7 @@ struct iommu_inv_pasid_info { /** * struct iommu_cache_invalidate_info - First level/stage invalidation * information + * @argsz: User filled size of this data * @version: API version of this structure * @cache: bitfield that allows to select which caches to invalidate * @granularity: defines the lowest granularity used for the invalidation: @@ -250,6 +253,7 @@ struct iommu_inv_pasid_info { * must support the used granularity.
*/ struct iommu_cache_invalidate_info { + __u32 argsz; #define IOMMU_CACHE_INVALIDATE_INFO_VERSION_1 1 __u32 version; /* IOMMU paging structure cache */ @@ -259,7 +263,7 @@ struct iommu_cache_invalidate_info { #define IOMMU_CACHE_INV_TYPE_NR (3) __u8 cache; __u8 granularity; - __u8 padding[2]; + __u8 padding[6]; union { struct iommu_inv_pasid_info pasid_info; struct iommu_inv_addr_info addr_info; @@ -296,6 +300,7 @@ struct iommu_gpasid_bind_data_vtd { /** * struct iommu_gpasid_bind_data - Information about device and guest PASID binding + * @argsz: User filled size of this data * @version: Version of this data structure * @format: PASID table entry format * @flags: Additional information on guest bind request @@ -313,17 +318,18 @@ struct iommu_gpasid_bind_data_vtd { * PASID to host PASID based on this bind data. */ struct iommu_gpasid_bind_data { + __u32 argsz; #define IOMMU_GPASID_BIND_VERSION_1 1 __u32 version; #define IOMMU_PASID_FORMAT_INTEL_VTD 1 __u32 format; + __u32 addr_width; #define IOMMU_SVA_GPASID_VAL (1 << 0) /* guest PASID valid */ __u64 flags; __u64 gpgd; __u64 hpasid; __u64 gpasid; - __u32 addr_width; - __u8 padding[12]; + __u8 padding[8]; /* Vendor specific data */ union { struct iommu_gpasid_bind_data_vtd vtd; -- cgit v1.2.3 From 8d3bb3b8cbf2ffb3ef73720a48b3445518dcdb55 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 25 Sep 2020 09:32:44 -0700 Subject: iommu/uapi: Use named union for user data The IOMMU UAPI data size is filled in by user space and must be validated by the kernel. To ensure backward compatibility, user data can only be extended by either re-purposing padding bytes or extending the variable-sized union at the end. No size change is allowed before the union. Therefore, the minimum size is the offset of the union. To use offsetof() on the union, we must make it named. Signed-off-by: Jacob Pan Reviewed-by: Lu Baolu Reviewed-by: Eric Auger Link: https://lore.kernel.org/linux-iommu/20200611145518.0c2817d6@x1.home/ Link: https://lore.kernel.org/r/1601051567-54787-4-git-send-email-jacob.jun.pan@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 22 +++++++++++----------- drivers/iommu/intel/svm.c | 2 +- include/uapi/linux/iommu.h | 4 ++-- 3 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index f8177c59d229..f1c66c94be55 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -5424,8 +5424,8 @@ intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, /* Size is only valid in address selective invalidation */ if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) - size = to_vtd_size(inv_info->addr_info.granule_size, - inv_info->addr_info.nb_granules); + size = to_vtd_size(inv_info->granu.addr_info.granule_size, + inv_info->granu.addr_info.nb_granules); for_each_set_bit(cache_type, (unsigned long *)&inv_info->cache, @@ -5446,20 +5446,20 @@ intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, * granularity.
*/ if (inv_info->granularity == IOMMU_INV_GRANU_PASID && - (inv_info->pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID)) - pasid = inv_info->pasid_info.pasid; + (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID)) + pasid = inv_info->granu.pasid_info.pasid; else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && - (inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID)) - pasid = inv_info->addr_info.pasid; + (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID)) + pasid = inv_info->granu.addr_info.pasid; switch (BIT(cache_type)) { case IOMMU_CACHE_INV_TYPE_IOTLB: /* HW will ignore LSB bits based on address mask */ if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && size && - (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) { + (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) { pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n", - inv_info->addr_info.addr, size); + inv_info->granu.addr_info.addr, size); } /* @@ -5467,9 +5467,9 @@ intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, * We use npages = -1 to indicate that. */ qi_flush_piotlb(iommu, did, pasid, - mm_to_dma_pfn(inv_info->addr_info.addr), + mm_to_dma_pfn(inv_info->granu.addr_info.addr), (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size, - inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF); + inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF); if (!info->ats_enabled) break; @@ -5492,7 +5492,7 @@ intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, size = 64 - VTD_PAGE_SHIFT; addr = 0; } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) { - addr = inv_info->addr_info.addr; + addr = inv_info->granu.addr_info.addr; } if (info->ats_enabled) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 95c3164a2302..99353d6468fa 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -370,7 +370,7 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, spin_lock(&iommu->lock); ret = intel_pasid_setup_nested(iommu, dev, (pgd_t *)(uintptr_t)data->gpgd, - data->hpasid, &data->vtd, dmar_domain, + data->hpasid, &data->vendor.vtd, dmar_domain, data->addr_width); spin_unlock(&iommu->lock); if (ret) { diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h index b42acc8fe007..5946779ac1f9 100644 --- a/include/uapi/linux/iommu.h +++ b/include/uapi/linux/iommu.h @@ -267,7 +267,7 @@ struct iommu_cache_invalidate_info { union { struct iommu_inv_pasid_info pasid_info; struct iommu_inv_addr_info addr_info; - }; + } granu; }; /** @@ -333,7 +333,7 @@ struct iommu_gpasid_bind_data { /* Vendor specific data */ union { struct iommu_gpasid_bind_data_vtd vtd; - }; + } vendor; }; #endif /* _UAPI_IOMMU_H */ -- cgit v1.2.3 From d90573812eea63c6bc8ab8a38f661b4c27c3cdc0 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 25 Sep 2020 09:32:46 -0700 Subject: iommu/uapi: Handle data and argsz filled by users IOMMU user APIs are responsible for processing user data. This patch changes the interface such that user pointers can be passed into IOMMU code directly. Separate kernel APIs without user pointers are introduced for in-kernel users of the UAPI functionality. IOMMU UAPI data has a user filled argsz field which indicates the data length of the structure. User data is not trusted, argsz must be validated based on the current kernel data size, mandatory data size, and feature flags. User data may also be extended, resulting in possible argsz increase. 
Backward compatibility is ensured based on size and flags (or the functional equivalent fields) checking. This patch adds sanity checks in the IOMMU layer. In addition to argsz, reserved/unused fields in padding, flags, and version are also checked. Details are documented in Documentation/userspace-api/iommu.rst Signed-off-by: Liu Yi L Signed-off-by: Jacob Pan Reviewed-by: Jean-Philippe Brucker Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/1601051567-54787-6-git-send-email-jacob.jun.pan@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 194 +++++++++++++++++++++++++++++++++++++++++++-- include/linux/iommu.h | 28 ++++--- include/uapi/linux/iommu.h | 1 + 3 files changed, 207 insertions(+), 16 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index e412a0abfefa..6d847027d35e 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1961,34 +1961,214 @@ out_unlock: } EXPORT_SYMBOL_GPL(iommu_attach_device); +/* + * Check flags and other user provided data for valid combinations. We also + * make sure no reserved fields or unused flags are set. This is to ensure + * not breaking userspace in the future when these fields or flags are used. + */ +static int iommu_check_cache_invl_data(struct iommu_cache_invalidate_info *info) +{ + u32 mask; + int i; + + if (info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1) + return -EINVAL; + + mask = (1 << IOMMU_CACHE_INV_TYPE_NR) - 1; + if (info->cache & ~mask) + return -EINVAL; + + if (info->granularity >= IOMMU_INV_GRANU_NR) + return -EINVAL; + + switch (info->granularity) { + case IOMMU_INV_GRANU_ADDR: + if (info->cache & IOMMU_CACHE_INV_TYPE_PASID) + return -EINVAL; + + mask = IOMMU_INV_ADDR_FLAGS_PASID | + IOMMU_INV_ADDR_FLAGS_ARCHID | + IOMMU_INV_ADDR_FLAGS_LEAF; + + if (info->granu.addr_info.flags & ~mask) + return -EINVAL; + break; + case IOMMU_INV_GRANU_PASID: + mask = IOMMU_INV_PASID_FLAGS_PASID | + IOMMU_INV_PASID_FLAGS_ARCHID; + if (info->granu.pasid_info.flags & ~mask) + return -EINVAL; + + break; + case IOMMU_INV_GRANU_DOMAIN: + if (info->cache & IOMMU_CACHE_INV_TYPE_DEV_IOTLB) + return -EINVAL; + break; + default: + return -EINVAL; + } + + /* Check reserved padding fields */ + for (i = 0; i < sizeof(info->padding); i++) { + if (info->padding[i]) + return -EINVAL; + } + + return 0; +} + int iommu_uapi_cache_invalidate(struct iommu_domain *domain, struct device *dev, - struct iommu_cache_invalidate_info *inv_info) + void __user *uinfo) { + struct iommu_cache_invalidate_info inv_info = { 0 }; + u32 minsz; + int ret; + if (unlikely(!domain->ops->cache_invalidate)) return -ENODEV; - return domain->ops->cache_invalidate(domain, dev, inv_info); + /* + * No new spaces can be added before the variable sized union, the + * minimum size is the offset to the union. 
+ */ + minsz = offsetof(struct iommu_cache_invalidate_info, granu); + + /* Copy minsz from user to get flags and argsz */ + if (copy_from_user(&inv_info, uinfo, minsz)) + return -EFAULT; + + /* Fields before the variable size union are mandatory */ + if (inv_info.argsz < minsz) + return -EINVAL; + + /* PASID and address granu require additional info beyond minsz */ + if (inv_info.granularity == IOMMU_INV_GRANU_PASID && + inv_info.argsz < offsetofend(struct iommu_cache_invalidate_info, granu.pasid_info)) + return -EINVAL; + + if (inv_info.granularity == IOMMU_INV_GRANU_ADDR && + inv_info.argsz < offsetofend(struct iommu_cache_invalidate_info, granu.addr_info)) + return -EINVAL; + + /* + * User might be using a newer UAPI header which has a larger data + * size, we shall support the existing flags within the current + * size. Copy the remaining user data _after_ minsz but not more + * than the current kernel supported size. + */ + if (copy_from_user((void *)&inv_info + minsz, uinfo + minsz, + min_t(u32, inv_info.argsz, sizeof(inv_info)) - minsz)) + return -EFAULT; + + /* Now the argsz is validated, check the content */ + ret = iommu_check_cache_invl_data(&inv_info); + if (ret) + return ret; + + return domain->ops->cache_invalidate(domain, dev, &inv_info); } EXPORT_SYMBOL_GPL(iommu_uapi_cache_invalidate); -int iommu_uapi_sva_bind_gpasid(struct iommu_domain *domain, - struct device *dev, struct iommu_gpasid_bind_data *data) +static int iommu_check_bind_data(struct iommu_gpasid_bind_data *data) +{ + u32 mask; + int i; + + if (data->version != IOMMU_GPASID_BIND_VERSION_1) + return -EINVAL; + + /* Check the range of supported formats */ + if (data->format >= IOMMU_PASID_FORMAT_LAST) + return -EINVAL; + + /* Check all flags */ + mask = IOMMU_SVA_GPASID_VAL; + if (data->flags & ~mask) + return -EINVAL; + + /* Check reserved padding fields */ + for (i = 0; i < sizeof(data->padding); i++) { + if (data->padding[i]) + return -EINVAL; + } + + return 0; +} + +static int iommu_sva_prepare_bind_data(void __user *udata, + struct iommu_gpasid_bind_data *data) +{ + u32 minsz; + + /* + * No new spaces can be added before the variable sized union, the + * minimum size is the offset to the union. + */ + minsz = offsetof(struct iommu_gpasid_bind_data, vendor); + + /* Copy minsz from user to get flags and argsz */ + if (copy_from_user(data, udata, minsz)) + return -EFAULT; + + /* Fields before the variable size union are mandatory */ + if (data->argsz < minsz) + return -EINVAL; + /* + * User might be using a newer UAPI header, we shall let IOMMU vendor + * driver decide on what size it needs. Since the guest PASID bind data + * can be vendor specific, larger argsz could be the result of extension + * for one vendor but it should not affect another vendor. 
+ * Copy the remaining user data _after_ minsz + */ + if (copy_from_user((void *)data + minsz, udata + minsz, + min_t(u32, data->argsz, sizeof(*data)) - minsz)) + return -EFAULT; + + return iommu_check_bind_data(data); +} + +int iommu_uapi_sva_bind_gpasid(struct iommu_domain *domain, struct device *dev, + void __user *udata) { + struct iommu_gpasid_bind_data data = { 0 }; + int ret; + if (unlikely(!domain->ops->sva_bind_gpasid)) return -ENODEV; - return domain->ops->sva_bind_gpasid(domain, dev, data); + ret = iommu_sva_prepare_bind_data(udata, &data); + if (ret) + return ret; + + return domain->ops->sva_bind_gpasid(domain, dev, &data); } EXPORT_SYMBOL_GPL(iommu_uapi_sva_bind_gpasid); -int iommu_uapi_sva_unbind_gpasid(struct iommu_domain *domain, struct device *dev, - ioasid_t pasid) +int iommu_sva_unbind_gpasid(struct iommu_domain *domain, struct device *dev, + ioasid_t pasid) { if (unlikely(!domain->ops->sva_unbind_gpasid)) return -ENODEV; return domain->ops->sva_unbind_gpasid(dev, pasid); } +EXPORT_SYMBOL_GPL(iommu_sva_unbind_gpasid); + +int iommu_uapi_sva_unbind_gpasid(struct iommu_domain *domain, struct device *dev, + void __user *udata) +{ + struct iommu_gpasid_bind_data data = { 0 }; + int ret; + + if (unlikely(!domain->ops->sva_bind_gpasid)) + return -ENODEV; + + ret = iommu_sva_prepare_bind_data(udata, &data); + if (ret) + return ret; + + return iommu_sva_unbind_gpasid(domain, dev, data.hpasid); +} EXPORT_SYMBOL_GPL(iommu_uapi_sva_unbind_gpasid); static void __iommu_detach_device(struct iommu_domain *domain, diff --git a/include/linux/iommu.h b/include/linux/iommu.h index d18de2afa6fb..82876f682367 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -426,11 +426,14 @@ extern void iommu_detach_device(struct iommu_domain *domain, struct device *dev); extern int iommu_uapi_cache_invalidate(struct iommu_domain *domain, struct device *dev, - struct iommu_cache_invalidate_info *inv_info); + void __user *uinfo); + extern int iommu_uapi_sva_bind_gpasid(struct iommu_domain *domain, - struct device *dev, struct iommu_gpasid_bind_data *data); + struct device *dev, void __user *udata); extern int iommu_uapi_sva_unbind_gpasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid); + struct device *dev, void __user *udata); +extern int iommu_sva_unbind_gpasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid); extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev); extern struct iommu_domain *iommu_get_dma_domain(struct device *dev); extern int iommu_map(struct iommu_domain *domain, unsigned long iova, @@ -1032,22 +1035,29 @@ static inline int iommu_sva_get_pasid(struct iommu_sva *handle) return IOMMU_PASID_INVALID; } -static inline int iommu_uapi_cache_invalidate(struct iommu_domain *domain, - struct device *dev, - struct iommu_cache_invalidate_info *inv_info) +static inline int +iommu_uapi_cache_invalidate(struct iommu_domain *domain, + struct device *dev, + struct iommu_cache_invalidate_info *inv_info) { return -ENODEV; } static inline int iommu_uapi_sva_bind_gpasid(struct iommu_domain *domain, - struct device *dev, - struct iommu_gpasid_bind_data *data) + struct device *dev, void __user *udata) { return -ENODEV; } static inline int iommu_uapi_sva_unbind_gpasid(struct iommu_domain *domain, - struct device *dev, int pasid) + struct device *dev, void __user *udata) +{ + return -ENODEV; +} + +static inline int iommu_sva_unbind_gpasid(struct iommu_domain *domain, + struct device *dev, + ioasid_t pasid) { return -ENODEV; } diff --git 
a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h index 5946779ac1f9..66d4ca40b40f 100644 --- a/include/uapi/linux/iommu.h +++ b/include/uapi/linux/iommu.h @@ -322,6 +322,7 @@ struct iommu_gpasid_bind_data { #define IOMMU_GPASID_BIND_VERSION_1 1 __u32 version; #define IOMMU_PASID_FORMAT_INTEL_VTD 1 +#define IOMMU_PASID_FORMAT_LAST 2 __u32 format; __u32 addr_width; #define IOMMU_SVA_GPASID_VAL (1 << 0) /* guest PASID valid */ -- cgit v1.2.3 From 6278eecba31f3983fe2743fc01b198433aa18247 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 25 Sep 2020 09:32:47 -0700 Subject: iommu/vt-d: Check UAPI data processed by IOMMU core IOMMU generic layer already does sanity checks on UAPI data for version match and argsz range based on generic information. This patch adjusts the following data checking responsibilities: - removes the redundant version check from VT-d driver - removes the check for vendor specific data size - adds check for the use of reserved/undefined flags Signed-off-by: Jacob Pan Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/1601051567-54787-7-git-send-email-jacob.jun.pan@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 3 +-- drivers/iommu/intel/svm.c | 11 +++++++++-- include/uapi/linux/iommu.h | 1 + 3 files changed, 11 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index f1c66c94be55..cae400b6807e 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -5398,8 +5398,7 @@ intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, int ret = 0; u64 size = 0; - if (!inv_info || !dmar_domain || - inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1) + if (!inv_info || !dmar_domain) return -EINVAL; if (!dev || !dev_is_pci(dev)) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 99353d6468fa..0cb9a15f1112 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -284,8 +284,15 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, if (WARN_ON(!iommu) || !data) return -EINVAL; - if (data->version != IOMMU_GPASID_BIND_VERSION_1 || - data->format != IOMMU_PASID_FORMAT_INTEL_VTD) + if (data->format != IOMMU_PASID_FORMAT_INTEL_VTD) + return -EINVAL; + + /* IOMMU core ensures argsz is more than the start of the union */ + if (data->argsz < offsetofend(struct iommu_gpasid_bind_data, vendor.vtd)) + return -EINVAL; + + /* Make sure no undefined flags are used in vendor data */ + if (data->vendor.vtd.flags & ~(IOMMU_SVA_VTD_GPASID_LAST - 1)) return -EINVAL; if (!dev_is_pci(dev)) diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h index 66d4ca40b40f..e1d9e75f2c94 100644 --- a/include/uapi/linux/iommu.h +++ b/include/uapi/linux/iommu.h @@ -288,6 +288,7 @@ struct iommu_gpasid_bind_data_vtd { #define IOMMU_SVA_VTD_GPASID_PWT (1 << 3) /* page-level write through */ #define IOMMU_SVA_VTD_GPASID_EMTE (1 << 4) /* extended mem type enable */ #define IOMMU_SVA_VTD_GPASID_CD (1 << 5) /* PASID-level cache disable */ +#define IOMMU_SVA_VTD_GPASID_LAST (1 << 6) __u64 flags; __u32 pat; __u32 emt; -- cgit v1.2.3 From 61931c0ee9cf5da575996b977a2358b598ef84bb Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 1 Oct 2020 15:00:56 -0400 Subject: dm: export dm_copy_name_and_uuid Allow DM targets to access the configured name and uuid. Also, bump DM ioctl version. 
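A hedged sketch of how a target might use the newly exported helper (the function below is illustrative and not part of this patch; it assumes the prototype int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid) returning 0 on success):

#include <linux/device-mapper.h>
#include <linux/dm-ioctl.h>	/* DM_NAME_LEN, DM_UUID_LEN */

static void example_log_identity(struct dm_target *ti)
{
	char name[DM_NAME_LEN], uuid[DM_UUID_LEN];

	/* look up the owning mapped_device and copy its name/uuid */
	if (!dm_copy_name_and_uuid(dm_table_get_md(ti->table), name, uuid))
		DMINFO("bound to device %s (uuid %s)", name, uuid);
}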
Signed-off-by: Mike Snitzer --- drivers/md/dm-ioctl.c | 2 +- include/uapi/linux/dm-ioctl.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 28122e850ea1..cd0478d44058 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -2044,7 +2044,7 @@ out: return r; } - +EXPORT_SYMBOL_GPL(dm_copy_name_and_uuid); /** * dm_early_create - create a mapped device in early boot. diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index 6622912c2342..4933b6b67b85 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -272,9 +272,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 42 +#define DM_VERSION_MINOR 43 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2020-02-27)" +#define DM_VERSION_EXTRA "-ioctl (2020-10-01)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ -- cgit v1.2.3 From ba1df797e5bbba68ddd1a29bd658b1c11f9a60b6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 1 Oct 2020 18:59:07 -0400 Subject: NFSACL: Replace PROC() macro with open code Clean up: Follow-up on ten-year-old commit b9081d90f5b9 ("NFS: kill off complicated macro 'PROC'") by performing the same conversion in the NFSACL code. To reduce the chance of error, I copied the original C preprocessor output and then made some minor edits. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs2acl.c | 72 ++++++++++++++++++++++++++++++--------------- fs/nfsd/nfs3acl.c | 49 +++++++++++++++++------------- include/uapi/linux/nfsacl.h | 2 ++ 3 files changed, 80 insertions(+), 43 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index cbab1d2d8a75..8d20e0d74417 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -347,36 +347,62 @@ static void nfsaclsvc_release_access(struct svc_rqst *rqstp) fh_put(&resp->fh); } -#define nfsaclsvc_decode_voidargs NULL -#define nfsaclsvc_release_void NULL -#define nfsd3_fhandleargs nfsd_fhandle -#define nfsd3_attrstatres nfsd_attrstat -#define nfsd3_voidres nfsd3_voidargs struct nfsd3_voidargs { int dummy; }; -#define PROC(name, argt, rest, relt, cache, respsize) \ -{ \ - .pc_func = nfsacld_proc_##name, \ - .pc_decode = nfsaclsvc_decode_##argt##args, \ - .pc_encode = nfsaclsvc_encode_##rest##res, \ - .pc_release = nfsaclsvc_release_##relt, \ - .pc_argsize = sizeof(struct nfsd3_##argt##args), \ - .pc_ressize = sizeof(struct nfsd3_##rest##res), \ - .pc_cachetype = cache, \ - .pc_xdrressize = respsize, \ -} - #define ST 1 /* status*/ #define AT 21 /* attributes */ #define pAT (1+AT) /* post attributes - conditional */ #define ACL (1+NFS_ACL_MAX_ENTRIES*3) /* Access Control List */ -static const struct svc_procedure nfsd_acl_procedures2[] = { - PROC(null, void, void, void, RC_NOCACHE, ST), - PROC(getacl, getacl, getacl, getacl, RC_NOCACHE, ST+1+2*(1+ACL)), - PROC(setacl, setacl, attrstat, attrstat, RC_NOCACHE, ST+AT), - PROC(getattr, fhandle, attrstat, attrstat, RC_NOCACHE, ST+AT), - PROC(access, access, access, access, RC_NOCACHE, ST+AT+1), +static const struct svc_procedure nfsd_acl_procedures2[5] = { + [ACLPROC2_NULL] = { + .pc_func = nfsacld_proc_null, + .pc_encode = nfsaclsvc_encode_voidres, + .pc_argsize = sizeof(struct nfsd3_voidargs), + .pc_ressize = sizeof(struct nfsd3_voidargs), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST, + }, 
+ [ACLPROC2_GETACL] = { + .pc_func = nfsacld_proc_getacl, + .pc_decode = nfsaclsvc_decode_getaclargs, + .pc_encode = nfsaclsvc_encode_getaclres, + .pc_release = nfsaclsvc_release_getacl, + .pc_argsize = sizeof(struct nfsd3_getaclargs), + .pc_ressize = sizeof(struct nfsd3_getaclres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+1+2*(1+ACL), + }, + [ACLPROC2_SETACL] = { + .pc_func = nfsacld_proc_setacl, + .pc_decode = nfsaclsvc_decode_setaclargs, + .pc_encode = nfsaclsvc_encode_attrstatres, + .pc_release = nfsaclsvc_release_attrstat, + .pc_argsize = sizeof(struct nfsd3_setaclargs), + .pc_ressize = sizeof(struct nfsd_attrstat), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+AT, + }, + [ACLPROC2_GETATTR] = { + .pc_func = nfsacld_proc_getattr, + .pc_decode = nfsaclsvc_decode_fhandleargs, + .pc_encode = nfsaclsvc_encode_attrstatres, + .pc_release = nfsaclsvc_release_attrstat, + .pc_argsize = sizeof(struct nfsd_fhandle), + .pc_ressize = sizeof(struct nfsd_attrstat), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+AT, + }, + [ACLPROC2_ACCESS] = { + .pc_func = nfsacld_proc_access, + .pc_decode = nfsaclsvc_decode_accessargs, + .pc_encode = nfsaclsvc_encode_accessres, + .pc_release = nfsaclsvc_release_access, + .pc_argsize = sizeof(struct nfsd3_accessargs), + .pc_ressize = sizeof(struct nfsd3_accessres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+AT+1, + }, }; static unsigned int nfsd_acl_count2[ARRAY_SIZE(nfsd_acl_procedures2)]; diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 13bca4a2f89d..292acb2e529c 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -235,33 +235,42 @@ static void nfs3svc_release_getacl(struct svc_rqst *rqstp) posix_acl_release(resp->acl_default); } -#define nfs3svc_decode_voidargs NULL -#define nfs3svc_release_void NULL -#define nfsd3_setaclres nfsd3_attrstat -#define nfsd3_voidres nfsd3_voidargs struct nfsd3_voidargs { int dummy; }; -#define PROC(name, argt, rest, relt, cache, respsize) \ -{ \ - .pc_func = nfsd3_proc_##name, \ - .pc_decode = nfs3svc_decode_##argt##args, \ - .pc_encode = nfs3svc_encode_##rest##res, \ - .pc_release = nfs3svc_release_##relt, \ - .pc_argsize = sizeof(struct nfsd3_##argt##args), \ - .pc_ressize = sizeof(struct nfsd3_##rest##res), \ - .pc_cachetype = cache, \ - .pc_xdrressize = respsize, \ -} - #define ST 1 /* status*/ #define AT 21 /* attributes */ #define pAT (1+AT) /* post attributes - conditional */ #define ACL (1+NFS_ACL_MAX_ENTRIES*3) /* Access Control List */ -static const struct svc_procedure nfsd_acl_procedures3[] = { - PROC(null, void, void, void, RC_NOCACHE, ST), - PROC(getacl, getacl, getacl, getacl, RC_NOCACHE, ST+1+2*(1+ACL)), - PROC(setacl, setacl, setacl, fhandle, RC_NOCACHE, ST+pAT), +static const struct svc_procedure nfsd_acl_procedures3[3] = { + [ACLPROC3_NULL] = { + .pc_func = nfsd3_proc_null, + .pc_encode = nfs3svc_encode_voidres, + .pc_argsize = sizeof(struct nfsd3_voidargs), + .pc_ressize = sizeof(struct nfsd3_voidargs), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST, + }, + [ACLPROC3_GETACL] = { + .pc_func = nfsd3_proc_getacl, + .pc_decode = nfs3svc_decode_getaclargs, + .pc_encode = nfs3svc_encode_getaclres, + .pc_release = nfs3svc_release_getacl, + .pc_argsize = sizeof(struct nfsd3_getaclargs), + .pc_ressize = sizeof(struct nfsd3_getaclres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+1+2*(1+ACL), + }, + [ACLPROC3_SETACL] = { + .pc_func = nfsd3_proc_setacl, + .pc_decode = nfs3svc_decode_setaclargs, + .pc_encode = nfs3svc_encode_setaclres, + .pc_release = 
nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_setaclargs), + .pc_ressize = sizeof(struct nfsd3_attrstat), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+pAT, + }, }; static unsigned int nfsd_acl_count3[ARRAY_SIZE(nfsd_acl_procedures3)]; diff --git a/include/uapi/linux/nfsacl.h b/include/uapi/linux/nfsacl.h index ca9a8501ff30..2c2ad204d3b0 100644 --- a/include/uapi/linux/nfsacl.h +++ b/include/uapi/linux/nfsacl.h @@ -9,11 +9,13 @@ #define NFS_ACL_PROGRAM 100227 +#define ACLPROC2_NULL 0 #define ACLPROC2_GETACL 1 #define ACLPROC2_SETACL 2 #define ACLPROC2_GETATTR 3 #define ACLPROC2_ACCESS 4 +#define ACLPROC3_NULL 0 #define ACLPROC3_GETACL 1 #define ACLPROC3_SETACL 2 -- cgit v1.2.3 From 4976b718c3551faba2c0616ef55ebeb74db1c5ca Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Tue, 29 Sep 2020 16:50:44 -0700 Subject: bpf: Introduce pseudo_btf_id Pseudo_btf_id is a type of ld_imm insn that associates a btf_id with a ksym so that further dereferences on the ksym can use the BTF info to validate accesses. Internally, when seeing a pseudo_btf_id ld insn, the verifier reads the btf_id stored in the insn[0]'s imm field and marks the dst_reg as PTR_TO_BTF_ID. The btf_id points to a BTF_KIND_VAR, which is encoded in btf_vmlinux by pahole. If the VAR is not of a struct type, the dst reg will be marked as PTR_TO_MEM instead of PTR_TO_BTF_ID and the mem_size is resolved to the size of the VAR's type. From the VAR btf_id, the verifier can also read the address of the ksym's corresponding kernel var from kallsyms and use that to fill dst_reg. Therefore, the proper functionality of pseudo_btf_id depends on (1) kallsyms and (2) the encoding of kernel global VARs in pahole, which should be available since pahole v1.18. Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200929235049.2533242-2-haoluo@google.com --- include/linux/bpf_verifier.h | 7 +++ include/linux/btf.h | 15 +++++ include/uapi/linux/bpf.h | 36 +++++++++--- kernel/bpf/btf.c | 15 ----- kernel/bpf/verifier.c | 125 ++++++++++++++++++++++++++++++++++++----- tools/include/uapi/linux/bpf.h | 36 +++++++++--- 6 files changed, 188 insertions(+), 46 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 363b4f1c562a..e83ef6f6bf43 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -308,6 +308,13 @@ struct bpf_insn_aux_data { u32 map_index; /* index into used_maps[] */ u32 map_off; /* offset from value base address */ }; + struct { + enum bpf_reg_type reg_type; /* type of pseudo_btf_id */ + union { + u32 btf_id; /* btf_id for struct typed var */ + u32 mem_size; /* mem_size for non-struct typed var */ + }; + } btf_var; }; u64 map_key_state; /* constant (32 bit) key tracking for maps */ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ diff --git a/include/linux/btf.h b/include/linux/btf.h index 024e16ff7dcc..af1244180588 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -145,6 +145,21 @@ static inline bool btf_type_is_func_proto(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO; } +static inline bool btf_type_is_var(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_VAR; +} + +/* union is only a special case of struct: + * all its offsetof(member) == 0 + */ +static inline bool btf_type_is_struct(const struct btf_type *t) +{ + u8 kind = BTF_INFO_KIND(t->info); + + return kind == BTF_KIND_STRUCT || kind ==
BTF_KIND_UNION; +} + static inline u16 btf_type_vlen(const struct btf_type *t) { return BTF_INFO_VLEN(t->info); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4f556cfcbfbe..2aa156af24d6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -356,18 +356,36 @@ enum bpf_link_type { #define BPF_F_SLEEPABLE (1U << 4) /* When BPF ldimm64's insn[0].src_reg != 0 then this can have - * two extensions: - * - * insn[0].src_reg: BPF_PSEUDO_MAP_FD BPF_PSEUDO_MAP_VALUE - * insn[0].imm: map fd map fd - * insn[1].imm: 0 offset into value - * insn[0].off: 0 0 - * insn[1].off: 0 0 - * ldimm64 rewrite: address of map address of map[0]+offset - * verifier type: CONST_PTR_TO_MAP PTR_TO_MAP_VALUE + * the following extensions: + * + * insn[0].src_reg: BPF_PSEUDO_MAP_FD + * insn[0].imm: map fd + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of map + * verifier type: CONST_PTR_TO_MAP */ #define BPF_PSEUDO_MAP_FD 1 +/* insn[0].src_reg: BPF_PSEUDO_MAP_VALUE + * insn[0].imm: map fd + * insn[1].imm: offset into value + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of map[0]+offset + * verifier type: PTR_TO_MAP_VALUE + */ #define BPF_PSEUDO_MAP_VALUE 2 +/* insn[0].src_reg: BPF_PSEUDO_BTF_ID + * insn[0].imm: kernel btd id of VAR + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of the kernel variable + * verifier type: PTR_TO_BTF_ID or PTR_TO_MEM, depending on whether the var + * is struct/union. + */ +#define BPF_PSEUDO_BTF_ID 3 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative * offset to another bpf function diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 4d0ee7839fdb..00569afe3d0d 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -440,16 +440,6 @@ static bool btf_type_nosize_or_null(const struct btf_type *t) return !t || btf_type_nosize(t); } -/* union is only a special case of struct: - * all its offsetof(member) == 0 - */ -static bool btf_type_is_struct(const struct btf_type *t) -{ - u8 kind = BTF_INFO_KIND(t->info); - - return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; -} - static bool __btf_type_is_struct(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT; @@ -460,11 +450,6 @@ static bool btf_type_is_array(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; } -static bool btf_type_is_var(const struct btf_type *t) -{ - return BTF_INFO_KIND(t->info) == BTF_KIND_VAR; -} - static bool btf_type_is_datasec(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 015a1c074b6b..fe4965079773 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7488,6 +7488,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) { struct bpf_insn_aux_data *aux = cur_aux(env); struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *dst_reg; struct bpf_map *map; int err; @@ -7504,25 +7505,44 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) if (err) return err; + dst_reg = ®s[insn->dst_reg]; if (insn->src_reg == 0) { u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; - regs[insn->dst_reg].type = SCALAR_VALUE; + dst_reg->type = SCALAR_VALUE; __mark_reg_known(®s[insn->dst_reg], imm); return 0; } + if (insn->src_reg == BPF_PSEUDO_BTF_ID) { + mark_reg_known_zero(env, regs, insn->dst_reg); + + dst_reg->type = aux->btf_var.reg_type; + switch 
(dst_reg->type) { + case PTR_TO_MEM: + dst_reg->mem_size = aux->btf_var.mem_size; + break; + case PTR_TO_BTF_ID: + dst_reg->btf_id = aux->btf_var.btf_id; + break; + default: + verbose(env, "bpf verifier is misconfigured\n"); + return -EFAULT; + } + return 0; + } + map = env->used_maps[aux->map_index]; mark_reg_known_zero(env, regs, insn->dst_reg); - regs[insn->dst_reg].map_ptr = map; + dst_reg->map_ptr = map; if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) { - regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; - regs[insn->dst_reg].off = aux->map_off; + dst_reg->type = PTR_TO_MAP_VALUE; + dst_reg->off = aux->map_off; if (map_value_has_spin_lock(map)) - regs[insn->dst_reg].id = ++env->id_gen; + dst_reg->id = ++env->id_gen; } else if (insn->src_reg == BPF_PSEUDO_MAP_FD) { - regs[insn->dst_reg].type = CONST_PTR_TO_MAP; + dst_reg->type = CONST_PTR_TO_MAP; } else { verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; @@ -9424,6 +9444,73 @@ process_bpf_exit: return 0; } +/* replace pseudo btf_id with kernel symbol address */ +static int check_pseudo_btf_id(struct bpf_verifier_env *env, + struct bpf_insn *insn, + struct bpf_insn_aux_data *aux) +{ + u32 type, id = insn->imm; + const struct btf_type *t; + const char *sym_name; + u64 addr; + + if (!btf_vmlinux) { + verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n"); + return -EINVAL; + } + + if (insn[1].imm != 0) { + verbose(env, "reserved field (insn[1].imm) is used in pseudo_btf_id ldimm64 insn.\n"); + return -EINVAL; + } + + t = btf_type_by_id(btf_vmlinux, id); + if (!t) { + verbose(env, "ldimm64 insn specifies invalid btf_id %d.\n", id); + return -ENOENT; + } + + if (!btf_type_is_var(t)) { + verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n", + id); + return -EINVAL; + } + + sym_name = btf_name_by_offset(btf_vmlinux, t->name_off); + addr = kallsyms_lookup_name(sym_name); + if (!addr) { + verbose(env, "ldimm64 failed to find the address for kernel symbol '%s'.\n", + sym_name); + return -ENOENT; + } + + insn[0].imm = (u32)addr; + insn[1].imm = addr >> 32; + + type = t->type; + t = btf_type_skip_modifiers(btf_vmlinux, type, NULL); + if (!btf_type_is_struct(t)) { + const struct btf_type *ret; + const char *tname; + u32 tsize; + + /* resolve the type size of ksym. */ + ret = btf_resolve_size(btf_vmlinux, t, &tsize); + if (IS_ERR(ret)) { + tname = btf_name_by_offset(btf_vmlinux, t->name_off); + verbose(env, "ldimm64 unable to resolve the size of type '%s': %ld\n", + tname, PTR_ERR(ret)); + return -EINVAL; + } + aux->btf_var.reg_type = PTR_TO_MEM; + aux->btf_var.mem_size = tsize; + } else { + aux->btf_var.reg_type = PTR_TO_BTF_ID; + aux->btf_var.btf_id = type; + } + return 0; +} + static int check_map_prealloc(struct bpf_map *map) { return (map->map_type != BPF_MAP_TYPE_HASH && @@ -9534,10 +9621,14 @@ static bool bpf_map_is_cgroup_storage(struct bpf_map *map) map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE); } -/* look for pseudo eBPF instructions that access map FDs and - * replace them with actual map pointers +/* find and rewrite pseudo imm in ld_imm64 instructions: + * + * 1. if it accesses map FD, replace it with actual map pointer. + * 2. if it accesses btf_id of a VAR, replace it with pointer to the var. + * + * NOTE: btf_vmlinux is required for converting pseudo btf_id. 
*/ -static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) +static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) { struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; @@ -9578,6 +9669,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) /* valid generic load 64-bit imm */ goto next_insn; + if (insn[0].src_reg == BPF_PSEUDO_BTF_ID) { + aux = &env->insn_aux_data[i]; + err = check_pseudo_btf_id(env, insn, aux); + if (err) + return err; + goto next_insn; + } + /* In final convert_pseudo_ld_imm64() step, this is * converted into regular 64-bit imm load insn. */ @@ -11633,10 +11732,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; - ret = replace_map_fd_with_map_ptr(env); - if (ret < 0) - goto skip_full_check; - if (bpf_prog_is_dev_bound(env->prog->aux)) { ret = bpf_prog_offload_verifier_prep(env->prog); if (ret) @@ -11662,6 +11757,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (ret) goto skip_full_check; + ret = resolve_pseudo_ldimm64(env); + if (ret < 0) + goto skip_full_check; + ret = check_cfg(env); if (ret < 0) goto skip_full_check; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 4f556cfcbfbe..2aa156af24d6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -356,18 +356,36 @@ enum bpf_link_type { #define BPF_F_SLEEPABLE (1U << 4) /* When BPF ldimm64's insn[0].src_reg != 0 then this can have - * two extensions: - * - * insn[0].src_reg: BPF_PSEUDO_MAP_FD BPF_PSEUDO_MAP_VALUE - * insn[0].imm: map fd map fd - * insn[1].imm: 0 offset into value - * insn[0].off: 0 0 - * insn[1].off: 0 0 - * ldimm64 rewrite: address of map address of map[0]+offset - * verifier type: CONST_PTR_TO_MAP PTR_TO_MAP_VALUE + * the following extensions: + * + * insn[0].src_reg: BPF_PSEUDO_MAP_FD + * insn[0].imm: map fd + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of map + * verifier type: CONST_PTR_TO_MAP */ #define BPF_PSEUDO_MAP_FD 1 +/* insn[0].src_reg: BPF_PSEUDO_MAP_VALUE + * insn[0].imm: map fd + * insn[1].imm: offset into value + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of map[0]+offset + * verifier type: PTR_TO_MAP_VALUE + */ #define BPF_PSEUDO_MAP_VALUE 2 +/* insn[0].src_reg: BPF_PSEUDO_BTF_ID + * insn[0].imm: kernel btd id of VAR + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of the kernel variable + * verifier type: PTR_TO_BTF_ID or PTR_TO_MEM, depending on whether the var + * is struct/union. + */ +#define BPF_PSEUDO_BTF_ID 3 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative * offset to another bpf function -- cgit v1.2.3 From eaa6bcb71ef6ed3dc18fc525ee7e293b06b4882b Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Tue, 29 Sep 2020 16:50:47 -0700 Subject: bpf: Introduce bpf_per_cpu_ptr() Add bpf_per_cpu_ptr() to help bpf programs access percpu vars. bpf_per_cpu_ptr() has the same semantic as per_cpu_ptr() in the kernel except that it may return NULL. This happens when the cpu parameter is out of range. So the caller must check the returned value. 
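A minimal BPF-side sketch of the helper (assumptions: CONFIG_DEBUG_INFO_BTF, a clang/libbpf toolchain with typed __ksym support from this same series, and an illustrative tracepoint):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

extern const struct rq runqueues __ksym;	/* percpu kernel variable */

SEC("tp_btf/sched_switch")
int handle_switch(u64 *ctx)
{
	const struct rq *rq;

	/* pointer to CPU 0's runqueue; may be NULL, so check it */
	rq = bpf_per_cpu_ptr(&runqueues, 0);
	if (rq)
		bpf_printk("cpu0 nr_running=%u", rq->nr_running);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";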
Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200929235049.2533242-5-haoluo@google.com --- include/linux/bpf.h | 4 +++ include/linux/btf.h | 11 ++++++++ include/uapi/linux/bpf.h | 18 ++++++++++++ kernel/bpf/btf.c | 10 ------- kernel/bpf/helpers.c | 18 ++++++++++++ kernel/bpf/verifier.c | 64 ++++++++++++++++++++++++++++++++++++++++-- kernel/trace/bpf_trace.c | 2 ++ tools/include/uapi/linux/bpf.h | 18 ++++++++++++ 8 files changed, 132 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 50e5c4b52bd1..9dde15b2479d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -293,6 +293,7 @@ enum bpf_arg_type { ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */ ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ + ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ __BPF_ARG_TYPE_MAX, }; @@ -307,6 +308,7 @@ enum bpf_return_type { RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */ RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */ RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */ + RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs @@ -405,6 +407,7 @@ enum bpf_reg_type { PTR_TO_RDONLY_BUF_OR_NULL, /* reg points to a readonly buffer or NULL */ PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */ PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */ + PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ }; /* The information passed from prog-specific *_is_valid_access @@ -1828,6 +1831,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; extern const struct bpf_func_proto bpf_copy_from_user_proto; extern const struct bpf_func_proto bpf_snprintf_btf_proto; +extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/linux/btf.h b/include/linux/btf.h index af1244180588..2bf641829664 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -110,6 +110,11 @@ btf_resolve_size(const struct btf *btf, const struct btf_type *type, i < btf_type_vlen(struct_type); \ i++, member++) +#define for_each_vsi(i, datasec_type, member) \ + for (i = 0, member = btf_type_var_secinfo(datasec_type); \ + i < btf_type_vlen(datasec_type); \ + i++, member++) + static inline bool btf_type_is_ptr(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_PTR; @@ -194,6 +199,12 @@ static inline const struct btf_member *btf_type_member(const struct btf_type *t) return (const struct btf_member *)(t + 1); } +static inline const struct btf_var_secinfo *btf_type_var_secinfo( + const struct btf_type *t) +{ + return (const struct btf_var_secinfo *)(t + 1); +} + #ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); const char *btf_name_by_offset(const struct btf *btf, u32 offset); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2aa156af24d6..f3c1b637ab39 100644 --- 
a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3686,6 +3686,23 @@ union bpf_attr { * Return * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. + * + * void *bpf_per_cpu_ptr(const void *percpu_ptr, u32 cpu) + * Description + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a + * pointer to the percpu kernel variable on *cpu*. A ksym is an + * extern variable decorated with '__ksym'. For ksym, there is a + * global var (either static or global) defined of the same name + * in the kernel. The ksym is percpu if the global var is percpu. + * The returned pointer points to the global percpu var on *cpu*. + * + * bpf_per_cpu_ptr() has the same semantic as per_cpu_ptr() in the + * kernel, except that bpf_per_cpu_ptr() may return NULL. This + * happens if *cpu* is larger than nr_cpu_ids. The caller of + * bpf_per_cpu_ptr() must check the returned value. + * Return + * A pointer pointing to the kernel percpu variable on *cpu*, or + * NULL, if *cpu* is invalid. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3841,6 +3858,7 @@ union bpf_attr { FN(seq_printf_btf), \ FN(skb_cgroup_classid), \ FN(redirect_neigh), \ + FN(bpf_per_cpu_ptr), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 00569afe3d0d..ed7d02e8bc93 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -188,11 +188,6 @@ i < btf_type_vlen(struct_type); \ i++, member++) -#define for_each_vsi(i, struct_type, member) \ - for (i = 0, member = btf_type_var_secinfo(struct_type); \ - i < btf_type_vlen(struct_type); \ - i++, member++) - #define for_each_vsi_from(i, from, struct_type, member) \ for (i = from, member = btf_type_var_secinfo(struct_type) + from; \ i < btf_type_vlen(struct_type); \ @@ -598,11 +593,6 @@ static const struct btf_var *btf_type_var(const struct btf_type *t) return (const struct btf_var *)(t + 1); } -static const struct btf_var_secinfo *btf_type_var_secinfo(const struct btf_type *t) -{ - return (const struct btf_var_secinfo *)(t + 1); -} - static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) { return kind_ops[BTF_INFO_KIND(t->info)]; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index e825441781ab..14fe3f64fd82 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -623,6 +623,22 @@ const struct bpf_func_proto bpf_copy_from_user_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu) +{ + if (cpu >= nr_cpu_ids) + return (unsigned long)NULL; + + return (unsigned long)per_cpu_ptr((const void __percpu *)ptr, cpu); +} + +const struct bpf_func_proto bpf_per_cpu_ptr_proto = { + .func = bpf_per_cpu_ptr, + .gpl_only = false, + .ret_type = RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, + .arg2_type = ARG_ANYTHING, +}; + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; @@ -689,6 +705,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_snprintf_btf_proto; case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; + case BPF_FUNC_bpf_per_cpu_ptr: + return &bpf_per_cpu_ptr_proto; default: break; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fe4965079773..216b8ece23ce 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -238,6 +238,8 @@ struct bpf_call_arg_meta { u64 msize_max_value; int 
ref_obj_id; int func_id; + u32 btf_id; + u32 ret_btf_id; }; struct btf *btf_vmlinux; @@ -517,6 +519,7 @@ static const char * const reg_type_str[] = { [PTR_TO_XDP_SOCK] = "xdp_sock", [PTR_TO_BTF_ID] = "ptr_", [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", + [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_", [PTR_TO_MEM] = "mem", [PTR_TO_MEM_OR_NULL] = "mem_or_null", [PTR_TO_RDONLY_BUF] = "rdonly_buf", @@ -583,7 +586,9 @@ static void print_verifier_state(struct bpf_verifier_env *env, /* reg->off should be 0 for SCALAR_VALUE */ verbose(env, "%lld", reg->var_off.value + reg->off); } else { - if (t == PTR_TO_BTF_ID || t == PTR_TO_BTF_ID_OR_NULL) + if (t == PTR_TO_BTF_ID || + t == PTR_TO_BTF_ID_OR_NULL || + t == PTR_TO_PERCPU_BTF_ID) verbose(env, "%s", kernel_type_name(reg->btf_id)); verbose(env, "(id=%d", reg->id); if (reg_type_may_be_refcounted_or_null(t)) @@ -2204,6 +2209,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_RDONLY_BUF_OR_NULL: case PTR_TO_RDWR_BUF: case PTR_TO_RDWR_BUF_OR_NULL: + case PTR_TO_PERCPU_BTF_ID: return true; default: return false; @@ -4017,6 +4023,7 @@ static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM } }; static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } }; static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; +static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } }; static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, @@ -4042,6 +4049,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_ALLOC_MEM_OR_NULL] = &alloc_mem_types, [ARG_PTR_TO_INT] = &int_ptr_types, [ARG_PTR_TO_LONG] = &int_ptr_types, + [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, @@ -4205,6 +4213,12 @@ skip_type_check: err = check_helper_mem_access(env, regno, meta->map_ptr->value_size, false, meta); + } else if (arg_type == ARG_PTR_TO_PERCPU_BTF_ID) { + if (!reg->btf_id) { + verbose(env, "Helper has invalid btf_id in R%d\n", regno); + return -EACCES; + } + meta->ret_btf_id = reg->btf_id; } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { if (meta->func_id == BPF_FUNC_spin_lock) { if (process_spin_lock(env, regno, true)) @@ -5114,6 +5128,30 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; regs[BPF_REG_0].id = ++env->id_gen; regs[BPF_REG_0].mem_size = meta.mem_size; + } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL) { + const struct btf_type *t; + + mark_reg_known_zero(env, regs, BPF_REG_0); + t = btf_type_skip_modifiers(btf_vmlinux, meta.ret_btf_id, NULL); + if (!btf_type_is_struct(t)) { + u32 tsize; + const struct btf_type *ret; + const char *tname; + + /* resolve the type size of ksym. 
*/ + ret = btf_resolve_size(btf_vmlinux, t, &tsize); + if (IS_ERR(ret)) { + tname = btf_name_by_offset(btf_vmlinux, t->name_off); + verbose(env, "unable to resolve the size of type '%s': %ld\n", + tname, PTR_ERR(ret)); + return -EINVAL; + } + regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; + regs[BPF_REG_0].mem_size = tsize; + } else { + regs[BPF_REG_0].type = PTR_TO_BTF_ID_OR_NULL; + regs[BPF_REG_0].btf_id = meta.ret_btf_id; + } } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) { int ret_btf_id; @@ -7523,6 +7561,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) dst_reg->mem_size = aux->btf_var.mem_size; break; case PTR_TO_BTF_ID: + case PTR_TO_PERCPU_BTF_ID: dst_reg->btf_id = aux->btf_var.btf_id; break; default: @@ -9449,10 +9488,14 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_insn_aux_data *aux) { - u32 type, id = insn->imm; + u32 datasec_id, type, id = insn->imm; + const struct btf_var_secinfo *vsi; + const struct btf_type *datasec; const struct btf_type *t; const char *sym_name; + bool percpu = false; u64 addr; + int i; if (!btf_vmlinux) { verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n"); @@ -9484,12 +9527,27 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, return -ENOENT; } + datasec_id = btf_find_by_name_kind(btf_vmlinux, ".data..percpu", + BTF_KIND_DATASEC); + if (datasec_id > 0) { + datasec = btf_type_by_id(btf_vmlinux, datasec_id); + for_each_vsi(i, datasec, vsi) { + if (vsi->type == id) { + percpu = true; + break; + } + } + } + insn[0].imm = (u32)addr; insn[1].imm = addr >> 32; type = t->type; t = btf_type_skip_modifiers(btf_vmlinux, type, NULL); - if (!btf_type_is_struct(t)) { + if (percpu) { + aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID; + aux->btf_var.btf_id = type; + } else if (!btf_type_is_struct(t)) { const struct btf_type *ret; const char *tname; u32 tsize; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e118a83439c3..364a322e2898 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1327,6 +1327,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; + case BPF_FUNC_bpf_per_cpu_ptr: + return &bpf_per_cpu_ptr_proto; default: return NULL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2aa156af24d6..f3c1b637ab39 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3686,6 +3686,23 @@ union bpf_attr { * Return * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. + * + * void *bpf_per_cpu_ptr(const void *percpu_ptr, u32 cpu) + * Description + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a + * pointer to the percpu kernel variable on *cpu*. A ksym is an + * extern variable decorated with '__ksym'. For ksym, there is a + * global var (either static or global) defined of the same name + * in the kernel. The ksym is percpu if the global var is percpu. + * The returned pointer points to the global percpu var on *cpu*. + * + * bpf_per_cpu_ptr() has the same semantic as per_cpu_ptr() in the + * kernel, except that bpf_per_cpu_ptr() may return NULL. This + * happens if *cpu* is larger than nr_cpu_ids. The caller of + * bpf_per_cpu_ptr() must check the returned value. 
+ * Return + * A pointer pointing to the kernel percpu variable on *cpu*, or + * NULL, if *cpu* is invalid. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3841,6 +3858,7 @@ union bpf_attr { FN(seq_printf_btf), \ FN(skb_cgroup_classid), \ FN(redirect_neigh), \ + FN(bpf_per_cpu_ptr), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 63d9b80dcf2c67bc5ade61cbbaa09d7af21f43f1 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Tue, 29 Sep 2020 16:50:48 -0700 Subject: bpf: Introduce bpf_this_cpu_ptr() Add bpf_this_cpu_ptr() to help access percpu vars on this cpu. This helper always returns a valid pointer, so there is no need to check the returned value for NULL. Also note that all programs run with preemption disabled, which means that the returned pointer is stable throughout the execution of the program. Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200929235049.2533242-6-haoluo@google.com --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 13 +++++++++++++ kernel/bpf/helpers.c | 14 ++++++++++++++ kernel/bpf/verifier.c | 11 ++++++++--- kernel/trace/bpf_trace.c | 2 ++ tools/include/uapi/linux/bpf.h | 13 +++++++++++++ 6 files changed, 52 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9dde15b2479d..dc63eeed4fd9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -309,6 +309,7 @@ enum bpf_return_type { RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */ RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */ RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */ + RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs @@ -1832,6 +1833,7 @@ extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; extern const struct bpf_func_proto bpf_copy_from_user_proto; extern const struct bpf_func_proto bpf_snprintf_btf_proto; extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; +extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f3c1b637ab39..c446394135be 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3703,6 +3703,18 @@ union bpf_attr { * Return * A pointer pointing to the kernel percpu variable on *cpu*, or * NULL, if *cpu* is invalid. + * + * void *bpf_this_cpu_ptr(const void *percpu_ptr) + * Description + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a + * pointer to the percpu kernel variable on this cpu. See the + * description of 'ksym' in **bpf_per_cpu_ptr**\ (). + * + * bpf_this_cpu_ptr() has the same semantic as this_cpu_ptr() in + * the kernel. Different from **bpf_per_cpu_ptr**\ (), it would + * never return NULL. + * Return + * A pointer pointing to the kernel percpu variable on this cpu.
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3859,6 +3871,7 @@ union bpf_attr { FN(skb_cgroup_classid), \ FN(redirect_neigh), \ FN(bpf_per_cpu_ptr), \ + FN(bpf_this_cpu_ptr), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 14fe3f64fd82..25520f5eeaf6 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -639,6 +639,18 @@ const struct bpf_func_proto bpf_per_cpu_ptr_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr) +{ + return (unsigned long)this_cpu_ptr((const void __percpu *)percpu_ptr); +} + +const struct bpf_func_proto bpf_this_cpu_ptr_proto = { + .func = bpf_this_cpu_ptr, + .gpl_only = false, + .ret_type = RET_PTR_TO_MEM_OR_BTF_ID, + .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, +}; + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; @@ -707,6 +719,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_jiffies64_proto; case BPF_FUNC_bpf_per_cpu_ptr: return &bpf_per_cpu_ptr_proto; + case BPF_FUNC_bpf_this_cpu_ptr: + return &bpf_this_cpu_ptr_proto; default: break; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 216b8ece23ce..d9dbf271ebab 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5128,7 +5128,8 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; regs[BPF_REG_0].id = ++env->id_gen; regs[BPF_REG_0].mem_size = meta.mem_size; - } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL) { + } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL || + fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID) { const struct btf_type *t; mark_reg_known_zero(env, regs, BPF_REG_0); @@ -5146,10 +5147,14 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn tname, PTR_ERR(ret)); return -EINVAL; } - regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; + regs[BPF_REG_0].type = + fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? + PTR_TO_MEM : PTR_TO_MEM_OR_NULL; regs[BPF_REG_0].mem_size = tsize; } else { - regs[BPF_REG_0].type = PTR_TO_BTF_ID_OR_NULL; + regs[BPF_REG_0].type = + fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? + PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL; regs[BPF_REG_0].btf_id = meta.ret_btf_id; } } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 364a322e2898..a136a6a63a71 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1329,6 +1329,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_snprintf_btf_proto; case BPF_FUNC_bpf_per_cpu_ptr: return &bpf_per_cpu_ptr_proto; + case BPF_FUNC_bpf_this_cpu_ptr: + return &bpf_this_cpu_ptr_proto; default: return NULL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f3c1b637ab39..c446394135be 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3703,6 +3703,18 @@ union bpf_attr { * Return * A pointer pointing to the kernel percpu variable on *cpu*, or * NULL, if *cpu* is invalid. + * + * void *bpf_this_cpu_ptr(const void *percpu_ptr) + * Description + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a + * pointer to the percpu kernel variable on this cpu. See the + * description of 'ksym' in **bpf_per_cpu_ptr**\ (). 
+ * + * bpf_this_cpu_ptr() has the same semantic as this_cpu_ptr() in + * the kernel. Different from **bpf_per_cpu_ptr**\ (), it would + * never return NULL. + * Return + * A pointer pointing to the kernel percpu variable on this cpu. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3859,6 +3871,7 @@ union bpf_attr { FN(skb_cgroup_classid), \ FN(redirect_neigh), \ FN(bpf_per_cpu_ptr), \ + FN(bpf_this_cpu_ptr), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From f5ace5ef37b1e1de49882248656f35c45e041585 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 2 Oct 2020 18:10:33 -0500 Subject: block: scsi_ioctl: Avoid the use of one-element arrays One-element arrays are being deprecated[1]. Replace the one-element array with a simple object of type compat_caddr_t: 'compat_caddr_t unused'[2], since it seems this field is actually never used. Also, update struct cdrom_generic_command in UAPI by adding an anonymous union to avoid using the one-element array _reserved_. [1] https://www.kernel.org/doc/html/v5.9-rc1/process/deprecated.html#zero-length-and-one-element-arrays [2] https://github.com/KSPP/linux/issues/86 Signed-off-by: Gustavo A. R. Silva Link: https://lore.kernel.org/lkml/5f76f5d0.qJ4t%2FHWuRzSW7bTa%25lkp@intel.com/ Build-tested-by: kernel test robot Signed-off-by: Jens Axboe --- block/scsi_ioctl.c | 6 +++--- include/uapi/linux/cdrom.h | 5 ++++- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 4421e61c1af1..227f489aeaa5 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -651,7 +651,7 @@ struct compat_cdrom_generic_command { unsigned char data_direction; compat_int_t quiet; compat_int_t timeout; - compat_caddr_t reserved[1]; + compat_caddr_t unused; }; #endif @@ -673,7 +673,7 @@ static int scsi_get_cdrom_generic_arg(struct cdrom_generic_command *cgc, .data_direction = cgc32.data_direction, .quiet = cgc32.quiet, .timeout = cgc32.timeout, - .reserved[0] = compat_ptr(cgc32.reserved[0]), + .unused = compat_ptr(cgc32.unused), }; memcpy(&cgc->cmd, &cgc32.cmd, CDROM_PACKET_SIZE); return 0; @@ -698,7 +698,7 @@ static int scsi_put_cdrom_generic_arg(const struct cdrom_generic_command *cgc, .data_direction = cgc->data_direction, .quiet = cgc->quiet, .timeout = cgc->timeout, - .reserved[0] = (uintptr_t)(cgc->reserved[0]), + .unused = (uintptr_t)(cgc->unused), }; memcpy(&cgc32.cmd, &cgc->cmd, CDROM_PACKET_SIZE); diff --git a/include/uapi/linux/cdrom.h b/include/uapi/linux/cdrom.h index 2817230148fd..6c34f6e2f1f7 100644 --- a/include/uapi/linux/cdrom.h +++ b/include/uapi/linux/cdrom.h @@ -289,7 +289,10 @@ struct cdrom_generic_command unsigned char data_direction; int quiet; int timeout; - void __user *reserved[1]; /* unused, actually */ + union { + void __user *reserved[1]; /* unused, actually */ + void __user *unused; + }; }; /* -- cgit v1.2.3 From 50a896cf2d6f34e884a00139d6e6012c9833ace3 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 3 Oct 2020 10:44:45 +0200 Subject: genetlink: properly support per-op policy dumping Add support for per-op policy dumping. The data is pretty much as before, except that now the assumption that the policy with index 0 is "the" policy no longer holds - you now need to look at the new CTRL_ATTR_OP_POLICY attribute, which is a nested attr (indexed by op) containing attributes for do and dump policies, roughly as sketched below.
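Concretely, a CTRL_CMD_GETPOLICY dump reply then nests roughly as follows (an illustrative sketch inferred from the code in this patch, not verbatim kernel output):

  CTRL_ATTR_FAMILY_ID       (u16)
  CTRL_ATTR_OP_POLICY       (nest)
    <op cmd>                (nest, attribute type is the command number)
      CTRL_ATTR_POLICY_DO   (u32, index of the do policy in the policy dump)
      CTRL_ATTR_POLICY_DUMP (u32, index of the dump policy in the policy dump)

followed by the usual CTRL_ATTR_POLICY messages that describe the referenced policies themselves.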
When a single op is requested, the CTRL_ATTR_OP_POLICY will be added in the same way, since do and dump policies may differ. v2: - conditionally advertise per-command policies only if there actually is a policy being used for the do/dump and it's present at all Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/uapi/linux/genetlink.h | 10 ++++ net/netlink/genetlink.c | 102 +++++++++++++++++++++++++++++++++++++---- 2 files changed, 102 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/genetlink.h b/include/uapi/linux/genetlink.h index 9c0636ec2286..bc9c98e84828 100644 --- a/include/uapi/linux/genetlink.h +++ b/include/uapi/linux/genetlink.h @@ -64,6 +64,7 @@ enum { CTRL_ATTR_OPS, CTRL_ATTR_MCAST_GROUPS, CTRL_ATTR_POLICY, + CTRL_ATTR_OP_POLICY, __CTRL_ATTR_MAX, }; @@ -85,6 +86,15 @@ enum { __CTRL_ATTR_MCAST_GRP_MAX, }; +enum { + CTRL_ATTR_POLICY_UNSPEC, + CTRL_ATTR_POLICY_DO, + CTRL_ATTR_POLICY_DUMP, + + __CTRL_ATTR_POLICY_DUMP_MAX, + CTRL_ATTR_POLICY_DUMP_MAX = __CTRL_ATTR_POLICY_DUMP_MAX - 1 +}; + #define CTRL_ATTR_MCAST_GRP_MAX (__CTRL_ATTR_MCAST_GRP_MAX - 1) diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 5e33c7938470..eb916c44884f 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1112,7 +1112,10 @@ static int genl_ctrl_event(int event, const struct genl_family *family, struct ctrl_dump_policy_ctx { struct netlink_policy_dump_state *state; + const struct genl_family *rt; + unsigned int opidx; u16 fam_id; + u8 policies:1; }; static const struct nla_policy ctrl_policy_policy[] = { @@ -1127,6 +1130,8 @@ static int ctrl_dumppolicy_start(struct netlink_callback *cb) struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx; struct nlattr **tb = info->attrs; const struct genl_family *rt; + struct genl_ops op; + int err, i; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); @@ -1147,11 +1152,23 @@ static int ctrl_dumppolicy_start(struct netlink_callback *cb) if (!rt) return -ENOENT; - if (!rt->policy) - return -ENODATA; + ctx->rt = rt; + + for (i = 0; i < genl_get_cmd_cnt(rt); i++) { + genl_get_cmd_by_index(i, rt, &op); + + if (op.policy) { + err = netlink_policy_dump_add_policy(&ctx->state, + op.policy, + op.maxattr); + if (err) + return err; + } + } - return netlink_policy_dump_add_policy(&ctx->state, rt->policy, - rt->maxattr); + if (!ctx->state) + return -ENODATA; + return 0; } static void *ctrl_dumppolicy_prep(struct sk_buff *skb, @@ -1172,12 +1189,78 @@ static void *ctrl_dumppolicy_prep(struct sk_buff *skb, return hdr; } +static int ctrl_dumppolicy_put_op(struct sk_buff *skb, + struct netlink_callback *cb, + struct genl_ops *op) +{ + struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx; + struct nlattr *nest_pol, *nest_op; + void *hdr; + int idx; + + /* skip if we have nothing to show */ + if (!op->policy) + return 0; + if (!op->doit && + (!op->dumpit || op->validate & GENL_DONT_VALIDATE_DUMP)) + return 0; + + hdr = ctrl_dumppolicy_prep(skb, cb); + if (!hdr) + return -ENOBUFS; + + nest_pol = nla_nest_start(skb, CTRL_ATTR_OP_POLICY); + if (!nest_pol) + goto err; + + nest_op = nla_nest_start(skb, op->cmd); + if (!nest_op) + goto err; + + /* for now both do/dump are always the same */ + idx = netlink_policy_dump_get_policy_idx(ctx->state, + op->policy, + op->maxattr); + + if (op->doit && nla_put_u32(skb, CTRL_ATTR_POLICY_DO, idx)) + goto err; + + if (op->dumpit && !(op->validate & GENL_DONT_VALIDATE_DUMP) && + nla_put_u32(skb, CTRL_ATTR_POLICY_DUMP, idx)) + goto err; + + nla_nest_end(skb, 
nest_op); + nla_nest_end(skb, nest_pol); + genlmsg_end(skb, hdr); + + return 0; +err: + genlmsg_cancel(skb, hdr); + return -ENOBUFS; +} + static int ctrl_dumppolicy(struct sk_buff *skb, struct netlink_callback *cb) { struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx; + void *hdr; + + if (!ctx->policies) { + while (ctx->opidx < genl_get_cmd_cnt(ctx->rt)) { + struct genl_ops op; + + genl_get_cmd_by_index(ctx->opidx, ctx->rt, &op); + + if (ctrl_dumppolicy_put_op(skb, cb, &op)) + return skb->len; + + ctx->opidx++; + } + + /* completed with the per-op policy index list */ + ctx->policies = true; + } while (netlink_policy_dump_loop(ctx->state)) { - void *hdr; struct nlattr *nest; hdr = ctrl_dumppolicy_prep(skb, cb); @@ -1194,14 +1277,13 @@ static int ctrl_dumppolicy(struct sk_buff *skb, struct netlink_callback *cb) nla_nest_end(skb, nest); genlmsg_end(skb, hdr); - continue; - -nla_put_failure: - genlmsg_cancel(skb, hdr); - break; } return skb->len; + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return skb->len; } static int ctrl_dumppolicy_done(struct netlink_callback *cb) -- cgit v1.2.3 From e992a6eda9a1eeeab73a8d2792464e4a2b1ebc3b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 3 Oct 2020 10:44:46 +0200 Subject: genetlink: allow dumping command-specific policy Right now CTRL_CMD_GETPOLICY can only dump the family-wide policy. Support dumping policy of a specific op. v3: - rebase after per-op policy export and handle that v2: - make cmd U32, just in case. v1: - don't echo op in the output in a naive way, this should make it cleaner to extend the output format for dumping policies for all the commands at once in the future. Signed-off-by: Jakub Kicinski Link: https://lore.kernel.org/r/20201001225933.1373426-11-kuba@kernel.org Signed-off-by: Johannes Berg Signed-off-by: David S. 
Miller --- include/uapi/linux/genetlink.h | 1 + net/netlink/genetlink.c | 41 ++++++++++++++++++++++++++++++++++++----- 2 files changed, 37 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/genetlink.h b/include/uapi/linux/genetlink.h index bc9c98e84828..d83f214b4134 100644 --- a/include/uapi/linux/genetlink.h +++ b/include/uapi/linux/genetlink.h @@ -65,6 +65,7 @@ enum { CTRL_ATTR_MCAST_GROUPS, CTRL_ATTR_POLICY, CTRL_ATTR_OP_POLICY, + CTRL_ATTR_OP, __CTRL_ATTR_MAX, }; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index eb916c44884f..c992424e4d63 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -123,7 +123,7 @@ static void genl_op_from_full(const struct genl_family *family, op->policy = family->policy; } -static int genl_get_cmd_full(u8 cmd, const struct genl_family *family, +static int genl_get_cmd_full(u32 cmd, const struct genl_family *family, struct genl_ops *op) { int i; @@ -152,7 +152,7 @@ static void genl_op_from_small(const struct genl_family *family, op->policy = family->policy; } -static int genl_get_cmd_small(u8 cmd, const struct genl_family *family, +static int genl_get_cmd_small(u32 cmd, const struct genl_family *family, struct genl_ops *op) { int i; @@ -166,7 +166,7 @@ static int genl_get_cmd_small(u8 cmd, const struct genl_family *family, return -ENOENT; } -static int genl_get_cmd(u8 cmd, const struct genl_family *family, +static int genl_get_cmd(u32 cmd, const struct genl_family *family, struct genl_ops *op) { if (!genl_get_cmd_full(cmd, family, op)) @@ -1114,14 +1114,17 @@ struct ctrl_dump_policy_ctx { struct netlink_policy_dump_state *state; const struct genl_family *rt; unsigned int opidx; + u32 op; u16 fam_id; - u8 policies:1; + u8 policies:1, + single_op:1; }; static const struct nla_policy ctrl_policy_policy[] = { [CTRL_ATTR_FAMILY_ID] = { .type = NLA_U16 }, [CTRL_ATTR_FAMILY_NAME] = { .type = NLA_NUL_STRING, .len = GENL_NAMSIZ - 1 }, + [CTRL_ATTR_OP] = { .type = NLA_U32 }, }; static int ctrl_dumppolicy_start(struct netlink_callback *cb) @@ -1154,6 +1157,23 @@ static int ctrl_dumppolicy_start(struct netlink_callback *cb) ctx->rt = rt; + if (tb[CTRL_ATTR_OP]) { + ctx->single_op = true; + ctx->op = nla_get_u32(tb[CTRL_ATTR_OP]); + + err = genl_get_cmd(ctx->op, rt, &op); + if (err) { + NL_SET_BAD_ATTR(cb->extack, tb[CTRL_ATTR_OP]); + return err; + } + + if (!op.policy) + return -ENODATA; + + return netlink_policy_dump_add_policy(&ctx->state, op.policy, + op.maxattr); + } + for (i = 0; i < genl_get_cmd_cnt(rt); i++) { genl_get_cmd_by_index(i, rt, &op); @@ -1248,7 +1268,18 @@ static int ctrl_dumppolicy(struct sk_buff *skb, struct netlink_callback *cb) while (ctx->opidx < genl_get_cmd_cnt(ctx->rt)) { struct genl_ops op; - genl_get_cmd_by_index(ctx->opidx, ctx->rt, &op); + if (ctx->single_op) { + int err; + + err = genl_get_cmd(ctx->op, ctx->rt, &op); + if (WARN_ON(err)) + return skb->len; + + /* break out of the loop after this one */ + ctx->opidx = genl_get_cmd_cnt(ctx->rt); + } else { + genl_get_cmd_by_index(ctx->opidx, ctx->rt, &op); + } if (ctrl_dumppolicy_put_op(skb, cb, &op)) return skb->len; -- cgit v1.2.3 From 19fbcb36a39eefbe8912a13ccc02e937b1c418d6 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Sat, 3 Oct 2020 00:44:28 +0200 Subject: net/sched: act_vlan: Add {POP,PUSH}_ETH actions Implement TCA_VLAN_ACT_POP_ETH and TCA_VLAN_ACT_PUSH_ETH, to respectively pop and push a base Ethernet header at the beginning of a frame. POP_ETH is just a matter of pulling ETH_HLEN bytes. 
VLAN tags, if any, must be stripped before calling POP_ETH. PUSH_ETH is restricted to skbs with no mac_header, and only the MAC addresses can be configured. The Ethertype is automatically set from skb->protocol. These restrictions ensure that all skb's fields remain consistent, so that this action can't confuse other part of the networking stack (like GSO). Since openvswitch already had these actions, consolidate the code in skbuff.c (like for vlan and mpls push/pop). Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 ++ include/net/tc_act/tc_vlan.h | 2 ++ include/uapi/linux/tc_act/tc_vlan.h | 4 +++ net/core/skbuff.c | 67 +++++++++++++++++++++++++++++++++++++ net/openvswitch/actions.c | 28 ++++++---------- net/sched/act_vlan.c | 40 ++++++++++++++++++++++ 6 files changed, 126 insertions(+), 18 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3d0cf3722bb4..42131e325e27 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3573,6 +3573,9 @@ int skb_ensure_writable(struct sk_buff *skb, int write_len); int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci); int skb_vlan_pop(struct sk_buff *skb); int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); +int skb_eth_pop(struct sk_buff *skb); +int skb_eth_push(struct sk_buff *skb, const unsigned char *dst, + const unsigned char *src); int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, int mac_len, bool ethernet); int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h index 4e2502408c31..f051046ba034 100644 --- a/include/net/tc_act/tc_vlan.h +++ b/include/net/tc_act/tc_vlan.h @@ -11,6 +11,8 @@ struct tcf_vlan_params { int tcfv_action; + unsigned char tcfv_push_dst[ETH_ALEN]; + unsigned char tcfv_push_src[ETH_ALEN]; u16 tcfv_push_vid; __be16 tcfv_push_proto; u8 tcfv_push_prio; diff --git a/include/uapi/linux/tc_act/tc_vlan.h b/include/uapi/linux/tc_act/tc_vlan.h index 168995b54a70..5b306fe815cc 100644 --- a/include/uapi/linux/tc_act/tc_vlan.h +++ b/include/uapi/linux/tc_act/tc_vlan.h @@ -16,6 +16,8 @@ #define TCA_VLAN_ACT_POP 1 #define TCA_VLAN_ACT_PUSH 2 #define TCA_VLAN_ACT_MODIFY 3 +#define TCA_VLAN_ACT_POP_ETH 4 +#define TCA_VLAN_ACT_PUSH_ETH 5 struct tc_vlan { tc_gen; @@ -30,6 +32,8 @@ enum { TCA_VLAN_PUSH_VLAN_PROTOCOL, TCA_VLAN_PAD, TCA_VLAN_PUSH_VLAN_PRIORITY, + TCA_VLAN_PUSH_ETH_DST, + TCA_VLAN_PUSH_ETH_SRC, __TCA_VLAN_MAX, }; #define TCA_VLAN_MAX (__TCA_VLAN_MAX - 1) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e0774471f56d..75b043accddb 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5558,6 +5558,73 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) } EXPORT_SYMBOL(skb_vlan_push); +/** + * skb_eth_pop() - Drop the Ethernet header at the head of a packet + * + * @skb: Socket buffer to modify + * + * Drop the Ethernet header of @skb. + * + * Expects that skb->data points to the mac header and that no VLAN tags are + * present. + * + * Returns 0 on success, -errno otherwise. 
+ */ +int skb_eth_pop(struct sk_buff *skb) +{ + if (!pskb_may_pull(skb, ETH_HLEN) || skb_vlan_tagged(skb) || + skb_network_offset(skb) < ETH_HLEN) + return -EPROTO; + + skb_pull_rcsum(skb, ETH_HLEN); + skb_reset_mac_header(skb); + skb_reset_mac_len(skb); + + return 0; +} +EXPORT_SYMBOL(skb_eth_pop); + +/** + * skb_eth_push() - Add a new Ethernet header at the head of a packet + * + * @skb: Socket buffer to modify + * @dst: Destination MAC address of the new header + * @src: Source MAC address of the new header + * + * Prepend @skb with a new Ethernet header. + * + * Expects that skb->data points to the mac header, which must be empty. + * + * Returns 0 on success, -errno otherwise. + */ +int skb_eth_push(struct sk_buff *skb, const unsigned char *dst, + const unsigned char *src) +{ + struct ethhdr *eth; + int err; + + if (skb_network_offset(skb) || skb_vlan_tag_present(skb)) + return -EPROTO; + + err = skb_cow_head(skb, sizeof(*eth)); + if (err < 0) + return err; + + skb_push(skb, sizeof(*eth)); + skb_reset_mac_header(skb); + skb_reset_mac_len(skb); + + eth = eth_hdr(skb); + ether_addr_copy(eth->h_dest, dst); + ether_addr_copy(eth->h_source, src); + eth->h_proto = skb->protocol; + + skb_postpush_rcsum(skb, eth, sizeof(*eth)); + + return 0; +} +EXPORT_SYMBOL(skb_eth_push); + /* Update the ethertype of hdr and the skb csum value if required. */ static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr, __be16 ethertype) diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 855f2c155956..b87bfc82f44f 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -277,9 +277,11 @@ static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key, */ static int pop_eth(struct sk_buff *skb, struct sw_flow_key *key) { - skb_pull_rcsum(skb, ETH_HLEN); - skb_reset_mac_header(skb); - skb_reset_mac_len(skb); + int err; + + err = skb_eth_pop(skb); + if (err) + return err; /* safe right before invalidate_flow_key */ key->mac_proto = MAC_PROTO_NONE; @@ -290,22 +292,12 @@ static int pop_eth(struct sk_buff *skb, struct sw_flow_key *key) static int push_eth(struct sk_buff *skb, struct sw_flow_key *key, const struct ovs_action_push_eth *ethh) { - struct ethhdr *hdr; - - /* Add the new Ethernet header */ - if (skb_cow_head(skb, ETH_HLEN) < 0) - return -ENOMEM; - - skb_push(skb, ETH_HLEN); - skb_reset_mac_header(skb); - skb_reset_mac_len(skb); - - hdr = eth_hdr(skb); - ether_addr_copy(hdr->h_source, ethh->addresses.eth_src); - ether_addr_copy(hdr->h_dest, ethh->addresses.eth_dst); - hdr->h_proto = skb->protocol; + int err; - skb_postpush_rcsum(skb, hdr, ETH_HLEN); + err = skb_eth_push(skb, ethh->addresses.eth_dst, + ethh->addresses.eth_src); + if (err) + return err; /* safe right before invalidate_flow_key */ key->mac_proto = MAC_PROTO_ETHERNET; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index a5ff9f68ab02..8758bd2a78fa 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -77,6 +77,16 @@ static int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a, /* put updated tci as hwaccel tag */ __vlan_hwaccel_put_tag(skb, p->tcfv_push_proto, tci); break; + case TCA_VLAN_ACT_POP_ETH: + err = skb_eth_pop(skb); + if (err) + goto drop; + break; + case TCA_VLAN_ACT_PUSH_ETH: + err = skb_eth_push(skb, p->tcfv_push_dst, p->tcfv_push_src); + if (err) + goto drop; + break; default: BUG(); } @@ -93,10 +103,13 @@ drop: } static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = { + [TCA_VLAN_UNSPEC] = { .strict_start_type = 
TCA_VLAN_PUSH_ETH_DST }, [TCA_VLAN_PARMS] = { .len = sizeof(struct tc_vlan) }, [TCA_VLAN_PUSH_VLAN_ID] = { .type = NLA_U16 }, [TCA_VLAN_PUSH_VLAN_PROTOCOL] = { .type = NLA_U16 }, [TCA_VLAN_PUSH_VLAN_PRIORITY] = { .type = NLA_U8 }, + [TCA_VLAN_PUSH_ETH_DST] = NLA_POLICY_ETH_ADDR, + [TCA_VLAN_PUSH_ETH_SRC] = NLA_POLICY_ETH_ADDR, }; static int tcf_vlan_init(struct net *net, struct nlattr *nla, @@ -179,6 +192,17 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, if (tb[TCA_VLAN_PUSH_VLAN_PRIORITY]) push_prio = nla_get_u8(tb[TCA_VLAN_PUSH_VLAN_PRIORITY]); break; + case TCA_VLAN_ACT_POP_ETH: + break; + case TCA_VLAN_ACT_PUSH_ETH: + if (!tb[TCA_VLAN_PUSH_ETH_DST] || !tb[TCA_VLAN_PUSH_ETH_SRC]) { + if (exists) + tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, index); + return -EINVAL; + } + break; default: if (exists) tcf_idr_release(*a, bind); @@ -219,6 +243,13 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, p->tcfv_push_prio = push_prio; p->tcfv_push_proto = push_proto; + if (action == TCA_VLAN_ACT_PUSH_ETH) { + nla_memcpy(&p->tcfv_push_dst, tb[TCA_VLAN_PUSH_ETH_DST], + ETH_ALEN); + nla_memcpy(&p->tcfv_push_src, tb[TCA_VLAN_PUSH_ETH_SRC], + ETH_ALEN); + } + spin_lock_bh(&v->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); p = rcu_replace_pointer(v->vlan_p, p, lockdep_is_held(&v->tcf_lock)); @@ -279,6 +310,15 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, p->tcfv_push_prio)))) goto nla_put_failure; + if (p->tcfv_action == TCA_VLAN_ACT_PUSH_ETH) { + if (nla_put(skb, TCA_VLAN_PUSH_ETH_DST, ETH_ALEN, + p->tcfv_push_dst)) + goto nla_put_failure; + if (nla_put(skb, TCA_VLAN_PUSH_ETH_SRC, ETH_ALEN, + p->tcfv_push_src)) + goto nla_put_failure; + } + tcf_tm_dump(&t, &v->tcf_tm); if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD)) goto nla_put_failure; -- cgit v1.2.3 From a45294af9e96a3e060b6272fa7cd2c4b196de335 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Sat, 3 Oct 2020 00:44:31 +0200 Subject: net/sched: act_mpls: Add action to push MPLS LSE before Ethernet header Define the MAC_PUSH action which pushes an MPLS LSE before the mac header (instead of between the mac and the network headers as the plain PUSH action does). The only special case is when the skb has an offloaded VLAN. In that case, it has to be inlined before pushing the MPLS header. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- include/uapi/linux/tc_act/tc_mpls.h | 1 + net/sched/act_mpls.c | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tc_act/tc_mpls.h b/include/uapi/linux/tc_act/tc_mpls.h index 9360e95273c7..9e4e8f52a779 100644 --- a/include/uapi/linux/tc_act/tc_mpls.h +++ b/include/uapi/linux/tc_act/tc_mpls.h @@ -10,6 +10,7 @@ #define TCA_MPLS_ACT_PUSH 2 #define TCA_MPLS_ACT_MODIFY 3 #define TCA_MPLS_ACT_DEC_TTL 4 +#define TCA_MPLS_ACT_MAC_PUSH 5 struct tc_mpls { tc_gen; /* generic TC action fields. 
*/ diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index 8118e2640979..bb6b715636db 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -87,6 +87,23 @@ static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a, skb->dev && skb->dev->type == ARPHRD_ETHER)) goto drop; break; + case TCA_MPLS_ACT_MAC_PUSH: + if (skb_vlan_tag_present(skb)) { + if (__vlan_insert_inner_tag(skb, skb->vlan_proto, + skb_vlan_tag_get(skb), + ETH_HLEN) < 0) + goto drop; + + skb->protocol = skb->vlan_proto; + __vlan_hwaccel_clear_tag(skb); + } + + new_lse = tcf_mpls_get_lse(NULL, p, mac_len || + !eth_p_mpls(skb->protocol)); + + if (skb_mpls_push(skb, new_lse, p->tcfm_proto, 0, false)) + goto drop; + break; case TCA_MPLS_ACT_MODIFY: new_lse = tcf_mpls_get_lse(mpls_hdr(skb), p, false); if (skb_mpls_update_lse(skb, new_lse)) @@ -188,6 +205,7 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla, } break; case TCA_MPLS_ACT_PUSH: + case TCA_MPLS_ACT_MAC_PUSH: if (!tb[TCA_MPLS_LABEL]) { NL_SET_ERR_MSG_MOD(extack, "Label is required for MPLS push"); return -EINVAL; -- cgit v1.2.3 From cf1166349c68816f4259d32559f54972b0d5c1a4 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 4 Oct 2020 18:12:51 +0200 Subject: net: devlink: Add unused port flavour Not all ports of a switch need to be used, particularly in embedded systems. Add a port flavour for ports which physically exist in the switch, but are not connected to the front panel etc, and so are unused. Having unused ports present in devlink gives a more accurate representation of the hardware. It also allows regions to be associated with such ports, making it possible, for example, to check that unused ports are correctly powered off, or to compare the probable reset defaults of unused ports against those of a used port that experiences issues. Actually registering unused ports and setting the flavour to unused is optional. The DSA core will register all such switch ports, but such ports are expected to be limited in number. Bigger ASICs may decide not to list unused ports. v2: Expand the description about why it is useful Reviewed-by: Vladimir Oltean Tested-by: Vladimir Oltean Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/uapi/linux/devlink.h | 3 +++ net/core/devlink.c | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index ba467dc07852..5f1d6c327670 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -197,6 +197,9 @@ enum devlink_port_flavour { * port that faces the PCI VF. */ DEVLINK_PORT_FLAVOUR_VIRTUAL, /* Any virtual port facing the user. */ + DEVLINK_PORT_FLAVOUR_UNUSED, /* Port which exists in the switch, but + * is not used in any way. + */ }; enum devlink_param_cmode { diff --git a/net/core/devlink.c b/net/core/devlink.c index 0f3c8b2ec056..20224fd1ebaf 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -7612,7 +7612,8 @@ static bool devlink_port_type_should_warn(struct devlink_port *devlink_port) { /* Ignore CPU and DSA flavours.
*/ return devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU && - devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA; + devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA && + devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_UNUSED; } #define DEVLINK_PORT_TYPE_WARN_TIMEOUT (HZ * 3600) @@ -7897,6 +7898,7 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, break; case DEVLINK_PORT_FLAVOUR_CPU: case DEVLINK_PORT_FLAVOUR_DSA: + case DEVLINK_PORT_FLAVOUR_UNUSED: /* As CPU and DSA ports do not have a netdevice associated * case should not ever happen. */ -- cgit v1.2.3 From bdbb4e29df8b790db50cb73ce25d23543329f05f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 5 Oct 2020 15:07:38 -0700 Subject: netlink: add mask validation We don't have good validation policy for existing unsigned int attrs which serve as flags (for new ones we could use NLA_BITFIELD32). With increased use of policy dumping having the validation be expressed as part of the policy is important. Add validation policy in form of a mask of supported/valid bits. Support u64 in the uAPI to be future-proof, but really for now the embedded mask member can only hold 32 bits, so anything with bit 32+ set will always fail validation. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/netlink.h | 10 ++++++++++ include/uapi/linux/netlink.h | 2 ++ lib/nlattr.c | 36 ++++++++++++++++++++++++++++++++++++ net/netlink/policy.c | 8 ++++++++ 4 files changed, 56 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/netlink.h b/include/net/netlink.h index c5aa46f379bc..2b9e41075f19 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -200,6 +200,7 @@ enum nla_policy_validation { NLA_VALIDATE_RANGE_WARN_TOO_LONG, NLA_VALIDATE_MIN, NLA_VALIDATE_MAX, + NLA_VALIDATE_MASK, NLA_VALIDATE_RANGE_PTR, NLA_VALIDATE_FUNCTION, }; @@ -317,6 +318,7 @@ struct nla_policy { u16 len; union { const u32 bitfield32_valid; + const u32 mask; const char *reject_message; const struct nla_policy *nested_policy; struct netlink_range_validation *range; @@ -368,6 +370,8 @@ struct nla_policy { (tp == NLA_S8 || tp == NLA_S16 || tp == NLA_S32 || tp == NLA_S64) #define __NLA_ENSURE(condition) BUILD_BUG_ON_ZERO(!(condition)) +#define NLA_ENSURE_UINT_TYPE(tp) \ + (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp)) + tp) #define NLA_ENSURE_UINT_OR_BINARY_TYPE(tp) \ (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp) || \ tp == NLA_MSECS || \ @@ -416,6 +420,12 @@ struct nla_policy { .max = _max, \ } +#define NLA_POLICY_MASK(tp, _mask) { \ + .type = NLA_ENSURE_UINT_TYPE(tp), \ + .validation_type = NLA_VALIDATE_MASK, \ + .mask = _mask, \ +} + #define NLA_POLICY_VALIDATE_FN(tp, fn, ...) 
{ \ .type = NLA_ENSURE_NO_VALIDATION_PTR(tp), \ .validation_type = NLA_VALIDATE_FUNCTION, \ diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index eac8a6a648ea..d02e472ba54c 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -331,6 +331,7 @@ enum netlink_attribute_type { * the index, if limited inside the nesting (U32) * @NL_POLICY_TYPE_ATTR_BITFIELD32_MASK: valid mask for the * bitfield32 type (U32) + * @NL_POLICY_TYPE_ATTR_MASK: mask of valid bits for unsigned integers (U64) * @NL_POLICY_TYPE_ATTR_PAD: pad attribute for 64-bit alignment */ enum netlink_policy_type_attr { @@ -346,6 +347,7 @@ enum netlink_policy_type_attr { NL_POLICY_TYPE_ATTR_POLICY_MAXTYPE, NL_POLICY_TYPE_ATTR_BITFIELD32_MASK, NL_POLICY_TYPE_ATTR_PAD, + NL_POLICY_TYPE_ATTR_MASK, /* keep last */ __NL_POLICY_TYPE_ATTR_MAX, diff --git a/lib/nlattr.c b/lib/nlattr.c index 80ff9fe83696..9c99f5daa4d2 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -323,6 +323,37 @@ static int nla_validate_int_range(const struct nla_policy *pt, } } +static int nla_validate_mask(const struct nla_policy *pt, + const struct nlattr *nla, + struct netlink_ext_ack *extack) +{ + u64 value; + + switch (pt->type) { + case NLA_U8: + value = nla_get_u8(nla); + break; + case NLA_U16: + value = nla_get_u16(nla); + break; + case NLA_U32: + value = nla_get_u32(nla); + break; + case NLA_U64: + value = nla_get_u64(nla); + break; + default: + return -EINVAL; + } + + if (value & ~(u64)pt->mask) { + NL_SET_ERR_MSG_ATTR(extack, nla, "reserved bit set"); + return -EINVAL; + } + + return 0; +} + static int validate_nla(const struct nlattr *nla, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack, unsigned int depth) @@ -503,6 +534,11 @@ static int validate_nla(const struct nlattr *nla, int maxtype, if (err) return err; break; + case NLA_VALIDATE_MASK: + err = nla_validate_mask(pt, nla, extack); + if (err) + return err; + break; case NLA_VALIDATE_FUNCTION: if (pt->validate) { err = pt->validate(nla, extack); diff --git a/net/netlink/policy.c b/net/netlink/policy.c index cf23c0151721..ee26d01328ee 100644 --- a/net/netlink/policy.c +++ b/net/netlink/policy.c @@ -263,6 +263,14 @@ send_attribute: else type = NL_ATTR_TYPE_U64; + if (pt->validation_type == NLA_VALIDATE_MASK) { + if (nla_put_u64_64bit(skb, NL_POLICY_TYPE_ATTR_MASK, + pt->mask, + NL_POLICY_TYPE_ATTR_PAD)) + goto nla_put_failure; + break; + } + nla_get_range_unsigned(pt, &range); if (nla_put_u64_64bit(skb, NL_POLICY_TYPE_ATTR_MIN_VALUE_U, -- cgit v1.2.3 From eb88531bdbfaafb827192d1fc6c5a3fcc4fadd96 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Sun, 27 Sep 2020 01:24:31 +0900 Subject: can: raw: add missing error queue support Error queues are not yet implemented in CAN-raw sockets. The problem: a userland call to recvmsg(soc, msg, MSG_ERRQUEUE) on a CAN-raw socket would unqueue messages from the normal queue without any kind of error or warning. As such, it prevented CAN drivers from using functionality that relies on the error queue, such as skb_tx_timestamp(). SCM_CAN_RAW_ERRQUEUE is defined as the type for the CAN raw error queue. SCM stands for "Socket control messages". The name is inspired by SCM_J1939_ERRQUEUE of include/uapi/linux/can/j1939.h.
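To illustrate the intended use, a user-space sketch (under assumptions: an interface named "can0", a driver that calls skb_tx_timestamp(), and software TX timestamping; a real program would also poll() for POLLERR before draining the error queue):

  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>
  #include <net/if.h>
  #include <sys/ioctl.h>
  #include <sys/socket.h>
  #include <linux/can.h>
  #include <linux/can/raw.h>
  #include <linux/errqueue.h>
  #include <linux/net_tstamp.h>

  int main(void)
  {
          int s = socket(PF_CAN, SOCK_RAW, CAN_RAW);
          struct sockaddr_can addr = { .can_family = AF_CAN };
          struct can_frame cf = { .can_id = 0x123, .can_dlc = 2,
                                  .data = { 0xde, 0xad } };
          int flags = SOF_TIMESTAMPING_TX_SOFTWARE |
                      SOF_TIMESTAMPING_SOFTWARE;
          char ctrl[256];
          struct msghdr msg = { .msg_control = ctrl,
                                .msg_controllen = sizeof(ctrl) };
          struct cmsghdr *cm;
          struct ifreq ifr = { 0 };

          strncpy(ifr.ifr_name, "can0", sizeof(ifr.ifr_name) - 1);
          ioctl(s, SIOCGIFINDEX, &ifr);
          addr.can_ifindex = ifr.ifr_ifindex;
          bind(s, (struct sockaddr *)&addr, sizeof(addr));
          setsockopt(s, SOL_SOCKET, SO_TIMESTAMPING, &flags, sizeof(flags));

          write(s, &cf, sizeof(cf));

          /* with this patch, MSG_ERRQUEUE reads the error queue instead of
           * silently stealing frames from the normal receive queue */
          if (recvmsg(s, &msg, MSG_ERRQUEUE) >= 0) {
                  for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
                          if (cm->cmsg_level == SOL_SOCKET &&
                              cm->cmsg_type == SCM_TIMESTAMPING) {
                                  struct scm_timestamping *ts =
                                          (struct scm_timestamping *)CMSG_DATA(cm);

                                  printf("tx stamp: %lld.%09ld\n",
                                         (long long)ts->ts[0].tv_sec,
                                         ts->ts[0].tv_nsec);
                          }
                  }
          }
          close(s);
          return 0;
  }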
Signed-off-by: Vincent Mailhol Link: https://lore.kernel.org/r/20200926162527.270030-1-mailhol.vincent@wanadoo.fr Acked-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can/raw.h | 3 +++ net/can/raw.c | 4 ++++ 2 files changed, 7 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/can/raw.h b/include/uapi/linux/can/raw.h index 6a11d308eb5c..3386aa81fdf2 100644 --- a/include/uapi/linux/can/raw.h +++ b/include/uapi/linux/can/raw.h @@ -49,6 +49,9 @@ #include #define SOL_CAN_RAW (SOL_CAN_BASE + CAN_RAW) +enum { + SCM_CAN_RAW_ERRQUEUE = 1, +}; /* for socket options affecting the socket (not the global system) */ diff --git a/net/can/raw.c b/net/can/raw.c index 24db4b4afdc7..ea70850f9152 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -804,6 +804,10 @@ static int raw_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, noblock = flags & MSG_DONTWAIT; flags &= ~MSG_DONTWAIT; + if (flags & MSG_ERRQUEUE) + return sock_recv_errqueue(sk, msg, size, + SOL_CAN_RAW, SCM_CAN_RAW_ERRQUEUE); + skb = skb_recv_datagram(sk, flags, noblock, &err); if (!skb) return err; -- cgit v1.2.3 From 1465af12e254a68706e110846f59cf0f09683184 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 22 Sep 2020 10:37:01 +0800 Subject: btrfs: tree-checker: fix false alert caused by legacy btrfs root item Commit 259ee7754b67 ("btrfs: tree-checker: Add ROOT_ITEM check") introduced btrfs root item size check, however btrfs root item has two versions, the legacy one which just ends before generation_v2 member, is smaller than current btrfs root item size. This caused btrfs kernel to reject valid but old tree root leaves. Fix this problem by also allowing legacy root item, since kernel can already handle them pretty well and upgrade to newer root item format when needed. Reported-by: Martin Steigerwald Fixes: 259ee7754b67 ("btrfs: tree-checker: Add ROOT_ITEM check") CC: stable@vger.kernel.org # 5.4+ Tested-By: Martin Steigerwald Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 17 ++++++++++++----- include/uapi/linux/btrfs_tree.h | 14 ++++++++++++++ 2 files changed, 26 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 7b1fee630f97..f0ffd5ee77bd 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1035,7 +1035,7 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { struct btrfs_fs_info *fs_info = leaf->fs_info; - struct btrfs_root_item ri; + struct btrfs_root_item ri = { 0 }; const u64 valid_root_flags = BTRFS_ROOT_SUBVOL_RDONLY | BTRFS_ROOT_SUBVOL_DEAD; int ret; @@ -1044,14 +1044,21 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, if (ret < 0) return ret; - if (btrfs_item_size_nr(leaf, slot) != sizeof(ri)) { + if (btrfs_item_size_nr(leaf, slot) != sizeof(ri) && + btrfs_item_size_nr(leaf, slot) != btrfs_legacy_root_item_size()) { generic_err(leaf, slot, - "invalid root item size, have %u expect %zu", - btrfs_item_size_nr(leaf, slot), sizeof(ri)); + "invalid root item size, have %u expect %zu or %u", + btrfs_item_size_nr(leaf, slot), sizeof(ri), + btrfs_legacy_root_item_size()); } + /* + * For legacy root item, the members starting at generation_v2 will be + * all filled with 0. + * And since we allow geneartion_v2 as 0, it will still pass the check. 
+ */ read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot), - sizeof(ri)); + btrfs_item_size_nr(leaf, slot)); /* Generation related */ if (btrfs_root_generation(&ri) > diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 9ba64ca6b4ac..6b885982ece6 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -4,6 +4,11 @@ #include <linux/btrfs.h> #include <linux/types.h> +#ifdef __KERNEL__ +#include <linux/stddef.h> +#else +#include <stddef.h> +#endif /* * This header contains the structure definitions and constants used @@ -644,6 +649,15 @@ struct btrfs_root_item { __le64 reserved[8]; /* for future */ } __attribute__ ((__packed__)); +/* + * Btrfs root item used to be smaller than current size. The old format ends + * at where member generation_v2 is. + */ +static inline __u32 btrfs_legacy_root_item_size(void) +{ + return offsetof(struct btrfs_root_item, generation_v2); +} + /* * this is used for both forward and backward root refs */ -- cgit v1.2.3 From 49f3d12b0f70ea867b891ad2a97f6e51bb564e18 Mon Sep 17 00:00:00 2001 From: Jakub Wilk Date: Wed, 7 Oct 2020 07:57:17 +0200 Subject: bpf: Fix typo in uapi/linux/bpf.h Reported-by: Samanta Navarro Signed-off-by: Jakub Wilk Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201007055717.7319-1-jwilk@jwilk.net --- include/uapi/linux/bpf.h | 2 +- tools/include/uapi/linux/bpf.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c446394135be..d83561e8cd2c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2253,7 +2253,7 @@ union bpf_attr { * Description * This helper is used in programs implementing policies at the * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. - * if the verdeict eBPF program returns **SK_PASS**), redirect it + * if the verdict eBPF program returns **SK_PASS**), redirect it * to the socket referenced by *map* (of type * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and * egress interfaces can be used for redirection. The diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index c446394135be..d83561e8cd2c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2253,7 +2253,7 @@ union bpf_attr { * Description * This helper is used in programs implementing policies at the * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. - * if the verdeict eBPF program returns **SK_PASS**), redirect it + * if the verdict eBPF program returns **SK_PASS**), redirect it * to the socket referenced by *map* (of type * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and * egress interfaces can be used for redirection. The -- cgit v1.2.3 From fb1ff4c1941573aea59e4cb575dc5a723303cd70 Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Mon, 5 Oct 2020 20:36:45 +0300 Subject: vfio/fsl-mc: Add VFIO framework skeleton for fsl-mc devices DPAA2 (Data Path Acceleration Architecture) consists of mechanisms for processing Ethernet packets, queue management, accelerators, etc. The Management Complex (mc) is a hardware entity that manages the DPAA2 hardware resources. It provides an object-based abstraction for software drivers to use the DPAA2 hardware. The MC mediates operations such as the creation, discovery and destruction of DPAA2 objects. The MC provides memory-mapped I/O command interfaces (MC portals) which DPAA2 software drivers use to operate on DPAA2 objects.
A DPRC is a container object that holds other types of DPAA2 objects. Each object in the DPRC is a Linux device and is bound to a driver. The MC-bus driver is a platform driver that registers a new bus type (distinct from the PCI or platform bus). The DPRC driver does runtime management of a bus instance. It performs the initial scan of the DPRC and handles changes in the DPRC configuration (adding/removing objects). All objects inside a container share the same hardware isolation context, meaning that only an entire DPRC can be assigned to a virtual machine. When a container is assigned to a virtual machine, all the objects within that container are assigned to that virtual machine. The guest is not allowed to change the contents (add/remove objects) of a DPRC container assigned to it. This restriction is set by the host and enforced by the MC hardware. The DPAA2 objects can be directly assigned to the guest. However, the MC portals (the memory-mapped command interface to the MC) need to be emulated because there are commands that configure the interrupts and the isolation IDs, which are virtual in the guest. Example: echo vfio-fsl-mc > /sys/bus/fsl-mc/devices/dprc.2/driver_override echo dprc.2 > /sys/bus/fsl-mc/drivers/vfio-fsl-mc/bind Here dprc.2 is bound to the VFIO driver, and all the objects within dprc.2 are then bound to the VFIO driver as well. This patch adds the infrastructure for VFIO support for fsl-mc devices. Subsequent patches will add support for binding and securely assigning these devices using VFIO. More details about the DPAA2 objects can be found here: Documentation/networking/device_drivers/freescale/dpaa2/overview.rst Signed-off-by: Bharat Bhushan Signed-off-by: Diana Craciun Reviewed-by: Eric Auger Signed-off-by: Alex Williamson --- MAINTAINERS | 6 ++ drivers/vfio/Kconfig | 1 + drivers/vfio/Makefile | 1 + drivers/vfio/fsl-mc/Kconfig | 9 ++ drivers/vfio/fsl-mc/Makefile | 4 + drivers/vfio/fsl-mc/vfio_fsl_mc.c | 157 ++++++++++++++++++++++++++++++ drivers/vfio/fsl-mc/vfio_fsl_mc_private.h | 14 +++ include/uapi/linux/vfio.h | 1 + 8 files changed, 193 insertions(+) create mode 100644 drivers/vfio/fsl-mc/Kconfig create mode 100644 drivers/vfio/fsl-mc/Makefile create mode 100644 drivers/vfio/fsl-mc/vfio_fsl_mc.c create mode 100644 drivers/vfio/fsl-mc/vfio_fsl_mc_private.h (limited to 'include/uapi/linux') diff --git a/MAINTAINERS b/MAINTAINERS index d746519253c3..e955a00af046 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18260,6 +18260,12 @@ F: drivers/vfio/ F: include/linux/vfio.h F: include/uapi/linux/vfio.h +VFIO FSL-MC DRIVER +M: Diana Craciun +L: kvm@vger.kernel.org +S: Maintained +F: drivers/vfio/fsl-mc/ + VFIO MEDIATED DEVICE DRIVERS M: Kirti Wankhede L: kvm@vger.kernel.org diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index fd17db9b432f..5533df91b257 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -47,4 +47,5 @@ menuconfig VFIO_NOIOMMU source "drivers/vfio/pci/Kconfig" source "drivers/vfio/platform/Kconfig" source "drivers/vfio/mdev/Kconfig" +source "drivers/vfio/fsl-mc/Kconfig" source "virt/lib/Kconfig" diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index de67c4725cce..fee73f3d9480 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -9,3 +9,4 @@ obj-$(CONFIG_VFIO_SPAPR_EEH) += vfio_spapr_eeh.o obj-$(CONFIG_VFIO_PCI) += pci/ obj-$(CONFIG_VFIO_PLATFORM) += platform/ obj-$(CONFIG_VFIO_MDEV) += mdev/ +obj-$(CONFIG_VFIO_FSL_MC) += fsl-mc/ diff --git a/drivers/vfio/fsl-mc/Kconfig b/drivers/vfio/fsl-mc/Kconfig new file mode 100644 index
000000000000..b1a527d6b6f2 --- /dev/null +++ b/drivers/vfio/fsl-mc/Kconfig @@ -0,0 +1,9 @@ +config VFIO_FSL_MC + tristate "VFIO support for QorIQ DPAA2 fsl-mc bus devices" + depends on VFIO && FSL_MC_BUS && EVENTFD + help + Driver to enable support for the VFIO QorIQ DPAA2 fsl-mc + (Management Complex) devices. This is required to passthrough + fsl-mc bus devices using the VFIO framework. + + If you don't know what to do here, say N. diff --git a/drivers/vfio/fsl-mc/Makefile b/drivers/vfio/fsl-mc/Makefile new file mode 100644 index 000000000000..0c6e5d2ddaae --- /dev/null +++ b/drivers/vfio/fsl-mc/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) + +vfio-fsl-mc-y := vfio_fsl_mc.o +obj-$(CONFIG_VFIO_FSL_MC) += vfio-fsl-mc.o diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c new file mode 100644 index 000000000000..a7a483a1e90b --- /dev/null +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) +/* + * Copyright 2013-2016 Freescale Semiconductor Inc. + * Copyright 2016-2017,2019-2020 NXP + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vfio_fsl_mc_private.h" + +static int vfio_fsl_mc_open(void *device_data) +{ + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + return 0; +} + +static void vfio_fsl_mc_release(void *device_data) +{ + module_put(THIS_MODULE); +} + +static long vfio_fsl_mc_ioctl(void *device_data, unsigned int cmd, + unsigned long arg) +{ + switch (cmd) { + case VFIO_DEVICE_GET_INFO: + { + return -ENOTTY; + } + case VFIO_DEVICE_GET_REGION_INFO: + { + return -ENOTTY; + } + case VFIO_DEVICE_GET_IRQ_INFO: + { + return -ENOTTY; + } + case VFIO_DEVICE_SET_IRQS: + { + return -ENOTTY; + } + case VFIO_DEVICE_RESET: + { + return -ENOTTY; + } + default: + return -ENOTTY; + } +} + +static ssize_t vfio_fsl_mc_read(void *device_data, char __user *buf, + size_t count, loff_t *ppos) +{ + return -EINVAL; +} + +static ssize_t vfio_fsl_mc_write(void *device_data, const char __user *buf, + size_t count, loff_t *ppos) +{ + return -EINVAL; +} + +static int vfio_fsl_mc_mmap(void *device_data, struct vm_area_struct *vma) +{ + return -EINVAL; +} + +static const struct vfio_device_ops vfio_fsl_mc_ops = { + .name = "vfio-fsl-mc", + .open = vfio_fsl_mc_open, + .release = vfio_fsl_mc_release, + .ioctl = vfio_fsl_mc_ioctl, + .read = vfio_fsl_mc_read, + .write = vfio_fsl_mc_write, + .mmap = vfio_fsl_mc_mmap, +}; + +static int vfio_fsl_mc_probe(struct fsl_mc_device *mc_dev) +{ + struct iommu_group *group; + struct vfio_fsl_mc_device *vdev; + struct device *dev = &mc_dev->dev; + int ret; + + group = vfio_iommu_group_get(dev); + if (!group) { + dev_err(dev, "VFIO_FSL_MC: No IOMMU group\n"); + return -EINVAL; + } + + vdev = devm_kzalloc(dev, sizeof(*vdev), GFP_KERNEL); + if (!vdev) { + ret = -ENOMEM; + goto out_group_put; + } + + vdev->mc_dev = mc_dev; + + ret = vfio_add_group_dev(dev, &vfio_fsl_mc_ops, vdev); + if (ret) { + dev_err(dev, "VFIO_FSL_MC: Failed to add to vfio group\n"); + goto out_group_put; + } + return 0; + +out_group_put: + vfio_iommu_group_put(group, dev); + return ret; +} + +static int vfio_fsl_mc_remove(struct fsl_mc_device *mc_dev) +{ + struct vfio_fsl_mc_device *vdev; + struct device *dev = &mc_dev->dev; + + vdev = vfio_del_group_dev(dev); + if (!vdev) + return -EINVAL; + + vfio_iommu_group_put(mc_dev->dev.iommu_group, dev); + + return 0; +} + +static struct fsl_mc_driver vfio_fsl_mc_driver = { + .probe = 
vfio_fsl_mc_probe, + .remove = vfio_fsl_mc_remove, + .driver = { + .name = "vfio-fsl-mc", + .owner = THIS_MODULE, + }, +}; + +static int __init vfio_fsl_mc_driver_init(void) +{ + return fsl_mc_driver_register(&vfio_fsl_mc_driver); +} + +static void __exit vfio_fsl_mc_driver_exit(void) +{ + fsl_mc_driver_unregister(&vfio_fsl_mc_driver); +} + +module_init(vfio_fsl_mc_driver_init); +module_exit(vfio_fsl_mc_driver_exit); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("VFIO for FSL-MC devices - User Level meta-driver"); diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc_private.h b/drivers/vfio/fsl-mc/vfio_fsl_mc_private.h new file mode 100644 index 000000000000..e79cc116f6b8 --- /dev/null +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc_private.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */ +/* + * Copyright 2013-2016 Freescale Semiconductor Inc. + * Copyright 2016,2019-2020 NXP + */ + +#ifndef VFIO_FSL_MC_PRIVATE_H +#define VFIO_FSL_MC_PRIVATE_H + +struct vfio_fsl_mc_device { + struct fsl_mc_device *mc_dev; +}; + +#endif /* VFIO_FSL_MC_PRIVATE_H */ diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 920470502329..95deac891378 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -201,6 +201,7 @@ struct vfio_device_info { #define VFIO_DEVICE_FLAGS_AMBA (1 << 3) /* vfio-amba device */ #define VFIO_DEVICE_FLAGS_CCW (1 << 4) /* vfio-ccw device */ #define VFIO_DEVICE_FLAGS_AP (1 << 5) /* vfio-ap device */ +#define VFIO_DEVICE_FLAGS_FSL_MC (1 << 6) /* vfio-fsl-mc device */ __u32 num_regions; /* Max region index + 1 */ __u32 num_irqs; /* Max IRQ index + 1 */ }; -- cgit v1.2.3 From 0c633f0be1dc70a6db46d90dba4cdae82073350a Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Wed, 7 Oct 2020 14:56:22 -0400 Subject: vfio: Introduce capability definitions for VFIO_DEVICE_GET_INFO Allow the VFIO_DEVICE_GET_INFO ioctl to include a capability chain. Add a flag indicating capability chain support, and introduce the definitions for the first set of capabilities, which are specific to s390 zPCI devices. Signed-off-by: Matthew Rosato Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson --- include/uapi/linux/vfio.h | 11 ++++++ include/uapi/linux/vfio_zdev.h | 78 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) create mode 100644 include/uapi/linux/vfio_zdev.h (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 920470502329..04fbe425ad0c 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -201,8 +201,10 @@ struct vfio_device_info { #define VFIO_DEVICE_FLAGS_AMBA (1 << 3) /* vfio-amba device */ #define VFIO_DEVICE_FLAGS_CCW (1 << 4) /* vfio-ccw device */ #define VFIO_DEVICE_FLAGS_AP (1 << 5) /* vfio-ap device */ +#define VFIO_DEVICE_FLAGS_CAPS (1 << 7) /* Info supports caps */ __u32 num_regions; /* Max region index + 1 */ __u32 num_irqs; /* Max IRQ index + 1 */ + __u32 cap_offset; /* Offset within info struct of first cap */ }; #define VFIO_DEVICE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 7) @@ -218,6 +220,15 @@ struct vfio_device_info { #define VFIO_DEVICE_API_CCW_STRING "vfio-ccw" #define VFIO_DEVICE_API_AP_STRING "vfio-ap" +/* + * The following capabilities are unique to s390 zPCI devices.
Their contents + * are further defined in vfio_zdev.h + */ +#define VFIO_DEVICE_INFO_CAP_ZPCI_BASE 1 +#define VFIO_DEVICE_INFO_CAP_ZPCI_GROUP 2 +#define VFIO_DEVICE_INFO_CAP_ZPCI_UTIL 3 +#define VFIO_DEVICE_INFO_CAP_ZPCI_PFIP 4 + /** * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8, * struct vfio_region_info) diff --git a/include/uapi/linux/vfio_zdev.h b/include/uapi/linux/vfio_zdev.h new file mode 100644 index 000000000000..b4309397b6b2 --- /dev/null +++ b/include/uapi/linux/vfio_zdev.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * VFIO Region definitions for ZPCI devices + * + * Copyright IBM Corp. 2020 + * + * Author(s): Pierre Morel + * Matthew Rosato + */ + +#ifndef _VFIO_ZDEV_H_ +#define _VFIO_ZDEV_H_ + +#include <linux/ioctl.h> +#include <linux/types.h> + +/** + * VFIO_DEVICE_INFO_CAP_ZPCI_BASE - Base PCI Function information + * + * This capability provides a set of descriptive information about the + * associated PCI function. + */ +struct vfio_device_info_cap_zpci_base { + struct vfio_info_cap_header header; + __u64 start_dma; /* Start of available DMA addresses */ + __u64 end_dma; /* End of available DMA addresses */ + __u16 pchid; /* Physical Channel ID */ + __u16 vfn; /* Virtual function number */ + __u16 fmb_length; /* Measurement Block Length (in bytes) */ + __u8 pft; /* PCI Function Type */ + __u8 gid; /* PCI function group ID */ +}; + +/** + * VFIO_DEVICE_INFO_CAP_ZPCI_GROUP - Base PCI Function Group information + * + * This capability provides a set of descriptive information about the group of + * PCI functions that the associated device belongs to. + */ +struct vfio_device_info_cap_zpci_group { + struct vfio_info_cap_header header; + __u64 dasm; /* DMA Address space mask */ + __u64 msi_addr; /* MSI address */ + __u64 flags; +#define VFIO_DEVICE_INFO_ZPCI_FLAG_REFRESH 1 /* Program-specified TLB refresh */ + __u16 mui; /* Measurement Block Update Interval */ + __u16 noi; /* Maximum number of MSIs */ + __u16 maxstbl; /* Maximum Store Block Length */ + __u8 version; /* Supported PCI Version */ +}; + +/** + * VFIO_DEVICE_INFO_CAP_ZPCI_UTIL - Utility String + * + * This capability provides the utility string for the associated device, which + * is a device identifier string made up of EBCDIC characters. 'size' specifies + * the length of 'util_str'. + */ +struct vfio_device_info_cap_zpci_util { + struct vfio_info_cap_header header; + __u32 size; + __u8 util_str[]; +}; + +/** + * VFIO_DEVICE_INFO_CAP_ZPCI_PFIP - PCI Function Path + * + * This capability provides the PCI function path string, which is an identifier + * that describes the internal hardware path of the device. 'size' specifies + * the length of 'pfip'. + */ +struct vfio_device_info_cap_zpci_pfip { + struct vfio_info_cap_header header; + __u32 size; + __u8 pfip[]; +}; + +#endif -- cgit v1.2.3 From e057dd3fc20ffb3d7f150af46542a51b59b90127 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 28 Sep 2020 22:04:04 +0200 Subject: can: add ISO 15765-2:2016 transport protocol CAN Transport Protocols offer support for segmented Point-to-Point communication between CAN nodes via two defined CAN Identifiers. As CAN frames can only transport a small number of data bytes (max. 8 bytes for 'classic' CAN and max. 64 bytes for CAN FD) this segmentation is needed to transport longer PDUs as needed e.g. for vehicle diagnosis (UDS, ISO 14229) or IP-over-CAN traffic. This protocol driver implements data transfers according to ISO 15765-2:2016 for 'classic' CAN and CAN FD frame types.
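For reference, a minimal userspace sketch of the resulting socket API (a hypothetical helper, error handling omitted; the interface name "can0" and the tx/rx CAN IDs are example values only):

#include <net/if.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>
#include <linux/can.h>
#include <linux/can/isotp.h>

/* send one request PDU and wait for one response PDU */
int isotp_request(const void *req, size_t req_len, void *resp, size_t resp_len)
{
	struct sockaddr_can addr;
	int s, n;

	s = socket(PF_CAN, SOCK_DGRAM, CAN_ISOTP);

	memset(&addr, 0, sizeof(addr));
	addr.can_family = AF_CAN;
	addr.can_ifindex = if_nametoindex("can0"); /* example interface */
	addr.can_addr.tp.tx_id = 0x712;            /* example: tester -> ECU */
	addr.can_addr.tp.rx_id = 0x77a;            /* example: ECU -> tester */
	bind(s, (struct sockaddr *)&addr, sizeof(addr));

	/* one write()/read() moves one complete PDU; the kernel performs
	 * the FF/CF/FC segmentation and flow control internally
	 */
	write(s, req, req_len);
	n = read(s, resp, resp_len);

	close(s);
	return n;
}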
Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/r/20200928200404.82229-1-socketcan@hartkopp.net [mkl: Removed "WITH Linux-syscall-note" from isotp.c. Fixed indention, a checkpatch warning and typos. Replaced __u{8,32} by u{8,32}. Removed always false (optlen < 0) check in isotp_setsockopt().] Signed-off-by: Marc Kleine-Budde --- MAINTAINERS | 1 + include/uapi/linux/can/isotp.h | 166 +++++ net/can/Kconfig | 13 + net/can/Makefile | 3 + net/can/isotp.c | 1426 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 1609 insertions(+) create mode 100644 include/uapi/linux/can/isotp.h create mode 100644 net/can/isotp.c (limited to 'include/uapi/linux') diff --git a/MAINTAINERS b/MAINTAINERS index d651a0934be7..7a8a53adba91 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3912,6 +3912,7 @@ F: include/net/netns/can.h F: include/uapi/linux/can.h F: include/uapi/linux/can/bcm.h F: include/uapi/linux/can/gw.h +F: include/uapi/linux/can/isotp.h F: include/uapi/linux/can/raw.h F: net/can/ diff --git a/include/uapi/linux/can/isotp.h b/include/uapi/linux/can/isotp.h new file mode 100644 index 000000000000..553006509f4e --- /dev/null +++ b/include/uapi/linux/can/isotp.h @@ -0,0 +1,166 @@ +/* SPDX-License-Identifier: ((GPL-2.0-only WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* + * linux/can/isotp.h + * + * Definitions for isotp CAN sockets (ISO 15765-2:2016) + * + * Copyright (c) 2020 Volkswagen Group Electronic Research + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of Volkswagen nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * Alternatively, provided that this notice is retained in full, this + * software may be distributed under the terms of the GNU General + * Public License ("GPL") version 2, in which case the provisions of the + * GPL apply INSTEAD OF those given above. + * + * The provided data structures and external interfaces from this code + * are not restricted to be used by modules with a GPL compatible license. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. 
+ */ + +#ifndef _UAPI_CAN_ISOTP_H +#define _UAPI_CAN_ISOTP_H + +#include +#include + +#define SOL_CAN_ISOTP (SOL_CAN_BASE + CAN_ISOTP) + +/* for socket options affecting the socket (not the global system) */ + +#define CAN_ISOTP_OPTS 1 /* pass struct can_isotp_options */ + +#define CAN_ISOTP_RECV_FC 2 /* pass struct can_isotp_fc_options */ + +/* sockopts to force stmin timer values for protocol regression tests */ + +#define CAN_ISOTP_TX_STMIN 3 /* pass __u32 value in nano secs */ + /* use this time instead of value */ + /* provided in FC from the receiver */ + +#define CAN_ISOTP_RX_STMIN 4 /* pass __u32 value in nano secs */ + /* ignore received CF frames which */ + /* timestamps differ less than val */ + +#define CAN_ISOTP_LL_OPTS 5 /* pass struct can_isotp_ll_options */ + +struct can_isotp_options { + + __u32 flags; /* set flags for isotp behaviour. */ + /* __u32 value : flags see below */ + + __u32 frame_txtime; /* frame transmission time (N_As/N_Ar) */ + /* __u32 value : time in nano secs */ + + __u8 ext_address; /* set address for extended addressing */ + /* __u8 value : extended address */ + + __u8 txpad_content; /* set content of padding byte (tx) */ + /* __u8 value : content on tx path */ + + __u8 rxpad_content; /* set content of padding byte (rx) */ + /* __u8 value : content on rx path */ + + __u8 rx_ext_address; /* set address for extended addressing */ + /* __u8 value : extended address (rx) */ +}; + +struct can_isotp_fc_options { + + __u8 bs; /* blocksize provided in FC frame */ + /* __u8 value : blocksize. 0 = off */ + + __u8 stmin; /* separation time provided in FC frame */ + /* __u8 value : */ + /* 0x00 - 0x7F : 0 - 127 ms */ + /* 0x80 - 0xF0 : reserved */ + /* 0xF1 - 0xF9 : 100 us - 900 us */ + /* 0xFA - 0xFF : reserved */ + + __u8 wftmax; /* max. number of wait frame transmiss. */ + /* __u8 value : 0 = omit FC N_PDU WT */ +}; + +struct can_isotp_ll_options { + + __u8 mtu; /* generated & accepted CAN frame type */ + /* __u8 value : */ + /* CAN_MTU (16) -> standard CAN 2.0 */ + /* CANFD_MTU (72) -> CAN FD frame */ + + __u8 tx_dl; /* tx link layer data length in bytes */ + /* (configured maximum payload length) */ + /* __u8 value : 8,12,16,20,24,32,48,64 */ + /* => rx path supports all LL_DL values */ + + __u8 tx_flags; /* set into struct canfd_frame.flags */ + /* at frame creation: e.g. 
CANFD_BRS */ + /* Obsolete when the BRS flag is fixed */ + /* by the CAN netdriver configuration */ +}; + +/* flags for isotp behaviour */ + +#define CAN_ISOTP_LISTEN_MODE 0x001 /* listen only (do not send FC) */ +#define CAN_ISOTP_EXTEND_ADDR 0x002 /* enable extended addressing */ +#define CAN_ISOTP_TX_PADDING 0x004 /* enable CAN frame padding tx path */ +#define CAN_ISOTP_RX_PADDING 0x008 /* enable CAN frame padding rx path */ +#define CAN_ISOTP_CHK_PAD_LEN 0x010 /* check received CAN frame padding */ +#define CAN_ISOTP_CHK_PAD_DATA 0x020 /* check received CAN frame padding */ +#define CAN_ISOTP_HALF_DUPLEX 0x040 /* half duplex error state handling */ +#define CAN_ISOTP_FORCE_TXSTMIN 0x080 /* ignore stmin from received FC */ +#define CAN_ISOTP_FORCE_RXSTMIN 0x100 /* ignore CFs depending on rx stmin */ +#define CAN_ISOTP_RX_EXT_ADDR 0x200 /* different rx extended addressing */ +#define CAN_ISOTP_WAIT_TX_DONE 0x400 /* wait for tx completion */ + + +/* default values */ + +#define CAN_ISOTP_DEFAULT_FLAGS 0 +#define CAN_ISOTP_DEFAULT_EXT_ADDRESS 0x00 +#define CAN_ISOTP_DEFAULT_PAD_CONTENT 0xCC /* prevent bit-stuffing */ +#define CAN_ISOTP_DEFAULT_FRAME_TXTIME 0 +#define CAN_ISOTP_DEFAULT_RECV_BS 0 +#define CAN_ISOTP_DEFAULT_RECV_STMIN 0x00 +#define CAN_ISOTP_DEFAULT_RECV_WFTMAX 0 + +#define CAN_ISOTP_DEFAULT_LL_MTU CAN_MTU +#define CAN_ISOTP_DEFAULT_LL_TX_DL CAN_MAX_DLEN +#define CAN_ISOTP_DEFAULT_LL_TX_FLAGS 0 + +/* + * Remark on CAN_ISOTP_DEFAULT_RECV_* values: + * + * We can strongly assume, that the Linux Kernel implementation of + * CAN_ISOTP is capable to run with BS=0, STmin=0 and WFTmax=0. + * But as we like to be able to behave as a commonly available ECU, + * these default settings can be changed via sockopts. + * For that reason the STmin value is intentionally _not_ checked for + * consistency and copied directly into the flow control (FC) frame. + * + */ + +#endif /* !_UAPI_CAN_ISOTP_H */ diff --git a/net/can/Kconfig b/net/can/Kconfig index 25436a715db3..021fe03a8ed6 100644 --- a/net/can/Kconfig +++ b/net/can/Kconfig @@ -55,6 +55,19 @@ config CAN_GW source "net/can/j1939/Kconfig" +config CAN_ISOTP + tristate "ISO 15765-2:2016 CAN transport protocol" + default y + help + CAN Transport Protocols offer support for segmented Point-to-Point + communication between CAN nodes via two defined CAN Identifiers. + As CAN frames can only transport a small amount of data bytes + (max. 8 bytes for 'classic' CAN and max. 64 bytes for CAN FD) this + segmentation is needed to transport longer PDUs as needed e.g. for + vehicle diagnosis (UDS, ISO 14229) or IP-over-CAN traffic. + This protocol driver implements data transfers according to + ISO 15765-2:2016 for 'classic' CAN and CAN FD frame types. + source "drivers/net/can/Kconfig" endif diff --git a/net/can/Makefile b/net/can/Makefile index 08bd217fc051..58f2c31c1ef3 100644 --- a/net/can/Makefile +++ b/net/can/Makefile @@ -17,3 +17,6 @@ obj-$(CONFIG_CAN_GW) += can-gw.o can-gw-y := gw.o obj-$(CONFIG_CAN_J1939) += j1939/ + +obj-$(CONFIG_CAN_ISOTP) += can-isotp.o +can-isotp-y := isotp.o diff --git a/net/can/isotp.c b/net/can/isotp.c new file mode 100644 index 000000000000..e6ff032b5426 --- /dev/null +++ b/net/can/isotp.c @@ -0,0 +1,1426 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* isotp.c - ISO 15765-2 CAN transport protocol for protocol family CAN + * + * This implementation does not provide ISO-TP specific return values to the + * userspace. 
+ * + * - RX path timeout of data reception leads to -ETIMEDOUT + * - RX path SN mismatch leads to -EILSEQ + * - RX path data reception with wrong padding leads to -EBADMSG + * - TX path flowcontrol reception timeout leads to -ECOMM + * - TX path flowcontrol reception overflow leads to -EMSGSIZE + * - TX path flowcontrol reception with wrong layout/padding leads to -EBADMSG + * - when a transfer (tx) is on the run the next write() blocks until it's done + * - use CAN_ISOTP_WAIT_TX_DONE flag to block the caller until the PDU is sent + * - as we have static buffers the check whether the PDU fits into the buffer + * is done at FF reception time (no support for sending 'wait frames') + * - take care of the tx-queue-len as traffic shaping is still on the TODO list + * + * Copyright (c) 2020 Volkswagen Group Electronic Research + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of Volkswagen nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * Alternatively, provided that this notice is retained in full, this + * software may be distributed under the terms of the GNU General + * Public License ("GPL") version 2, in which case the provisions of the + * GPL apply INSTEAD OF those given above. + * + * The provided data structures and external interfaces from this code + * are not restricted to be used by modules with a GPL compatible license. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CAN_ISOTP_VERSION "20200928" + +MODULE_DESCRIPTION("PF_CAN isotp 15765-2:2016 protocol"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Oliver Hartkopp "); +MODULE_ALIAS("can-proto-6"); + +#define SINGLE_MASK(id) (((id) & CAN_EFF_FLAG) ? \ + (CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \ + (CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG)) + +/* ISO 15765-2:2016 supports more than 4095 byte per ISO PDU as the FF_DL can + * take full 32 bit values (4 Gbyte). We would need some good concept to handle + * this between user space and kernel space. 
For now increase the static buffer + * to something about 8 kbyte to be able to test this new functionality. + */ +#define MAX_MSG_LENGTH 8200 + +/* N_PCI type values in bits 7-4 of N_PCI bytes */ +#define N_PCI_SF 0x00 /* single frame */ +#define N_PCI_FF 0x10 /* first frame */ +#define N_PCI_CF 0x20 /* consecutive frame */ +#define N_PCI_FC 0x30 /* flow control */ + +#define N_PCI_SZ 1 /* size of the PCI byte #1 */ +#define SF_PCI_SZ4 1 /* size of SingleFrame PCI including 4 bit SF_DL */ +#define SF_PCI_SZ8 2 /* size of SingleFrame PCI including 8 bit SF_DL */ +#define FF_PCI_SZ12 2 /* size of FirstFrame PCI including 12 bit FF_DL */ +#define FF_PCI_SZ32 6 /* size of FirstFrame PCI including 32 bit FF_DL */ +#define FC_CONTENT_SZ 3 /* flow control content size in byte (FS/BS/STmin) */ + +#define ISOTP_CHECK_PADDING (CAN_ISOTP_CHK_PAD_LEN | CAN_ISOTP_CHK_PAD_DATA) + +/* Flow Status given in FC frame */ +#define ISOTP_FC_CTS 0 /* clear to send */ +#define ISOTP_FC_WT 1 /* wait */ +#define ISOTP_FC_OVFLW 2 /* overflow */ + +enum { + ISOTP_IDLE = 0, + ISOTP_WAIT_FIRST_FC, + ISOTP_WAIT_FC, + ISOTP_WAIT_DATA, + ISOTP_SENDING +}; + +struct tpcon { + int idx; + int len; + u8 state; + u8 bs; + u8 sn; + u8 ll_dl; + u8 buf[MAX_MSG_LENGTH + 1]; +}; + +struct isotp_sock { + struct sock sk; + int bound; + int ifindex; + canid_t txid; + canid_t rxid; + ktime_t tx_gap; + ktime_t lastrxcf_tstamp; + struct hrtimer rxtimer, txtimer; + struct can_isotp_options opt; + struct can_isotp_fc_options rxfc, txfc; + struct can_isotp_ll_options ll; + u32 force_tx_stmin; + u32 force_rx_stmin; + struct tpcon rx, tx; + struct notifier_block notifier; + wait_queue_head_t wait; +}; + +static inline struct isotp_sock *isotp_sk(const struct sock *sk) +{ + return (struct isotp_sock *)sk; +} + +static enum hrtimer_restart isotp_rx_timer_handler(struct hrtimer *hrtimer) +{ + struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, + rxtimer); + struct sock *sk = &so->sk; + + if (so->rx.state == ISOTP_WAIT_DATA) { + /* we did not get new data frames in time */ + + /* report 'connection timed out' */ + sk->sk_err = ETIMEDOUT; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + + /* reset rx state */ + so->rx.state = ISOTP_IDLE; + } + + return HRTIMER_NORESTART; +} + +static int isotp_send_fc(struct sock *sk, int ae, u8 flowstatus) +{ + struct net_device *dev; + struct sk_buff *nskb; + struct canfd_frame *ncf; + struct isotp_sock *so = isotp_sk(sk); + int can_send_ret; + + nskb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), gfp_any()); + if (!nskb) + return 1; + + dev = dev_get_by_index(sock_net(sk), so->ifindex); + if (!dev) { + kfree_skb(nskb); + return 1; + } + + can_skb_reserve(nskb); + can_skb_prv(nskb)->ifindex = dev->ifindex; + can_skb_prv(nskb)->skbcnt = 0; + + nskb->dev = dev; + can_skb_set_owner(nskb, sk); + ncf = (struct canfd_frame *)nskb->data; + skb_put(nskb, so->ll.mtu); + + /* create & send flow control reply */ + ncf->can_id = so->txid; + + if (so->opt.flags & CAN_ISOTP_TX_PADDING) { + memset(ncf->data, so->opt.txpad_content, CAN_MAX_DLEN); + ncf->len = CAN_MAX_DLEN; + } else { + ncf->len = ae + FC_CONTENT_SZ; + } + + ncf->data[ae] = N_PCI_FC | flowstatus; + ncf->data[ae + 1] = so->rxfc.bs; + ncf->data[ae + 2] = so->rxfc.stmin; + + if (ae) + ncf->data[0] = so->opt.ext_address; + + if (so->ll.mtu == CANFD_MTU) + ncf->flags = so->ll.tx_flags; + + can_send_ret = can_send(nskb, 1); + if (can_send_ret) + printk_once(KERN_NOTICE "can-isotp: %s: can_send_ret %d\n", + __func__, can_send_ret); + 
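+	/* For reference, the FC frame assembled above has, for ae == 0:
+	 * data[0] = N_PCI_FC | flow status (CTS/WT/OVFLW),
+	 * data[1] = BS (blocksize), data[2] = STmin
+	 */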
+ dev_put(dev); + + /* reset blocksize counter */ + so->rx.bs = 0; + + /* reset last CF frame rx timestamp for rx stmin enforcement */ + so->lastrxcf_tstamp = ktime_set(0, 0); + + /* start rx timeout watchdog */ + hrtimer_start(&so->rxtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT); + return 0; +} + +static void isotp_rcv_skb(struct sk_buff *skb, struct sock *sk) +{ + struct sockaddr_can *addr = (struct sockaddr_can *)skb->cb; + + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct sockaddr_can)); + + memset(addr, 0, sizeof(*addr)); + addr->can_family = AF_CAN; + addr->can_ifindex = skb->dev->ifindex; + + if (sock_queue_rcv_skb(sk, skb) < 0) + kfree_skb(skb); +} + +static u8 padlen(u8 datalen) +{ + const u8 plen[] = {8, 8, 8, 8, 8, 8, 8, 8, 8, /* 0 - 8 */ + 12, 12, 12, 12, /* 9 - 12 */ + 16, 16, 16, 16, /* 13 - 16 */ + 20, 20, 20, 20, /* 17 - 20 */ + 24, 24, 24, 24, /* 21 - 24 */ + 32, 32, 32, 32, 32, 32, 32, 32, /* 25 - 32 */ + 48, 48, 48, 48, 48, 48, 48, 48, /* 33 - 40 */ + 48, 48, 48, 48, 48, 48, 48, 48}; /* 41 - 48 */ + + if (datalen > 48) + return 64; + + return plen[datalen]; +} + +/* check for length optimization and return 1/true when the check fails */ +static int check_optimized(struct canfd_frame *cf, int start_index) +{ + /* for CAN_DL <= 8 the start_index is equal to the CAN_DL as the + * padding would start at this point. E.g. if the padding would + * start at cf.data[7] cf->len has to be 7 to be optimal. + * Note: The data[] index starts with zero. + */ + if (cf->len <= CAN_MAX_DLEN) + return (cf->len != start_index); + + /* This relation is also valid in the non-linear DLC range, where + * we need to take care of the minimal next possible CAN_DL. + * The correct check would be (padlen(cf->len) != padlen(start_index)). + * But as cf->len can only take discrete values from 12, .., 64 at this + * point the padlen(cf->len) is always equal to cf->len. 
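+	 *
+	 * Example: for start_index = 9 the minimal next possible CAN_DL is
+	 * padlen(9) = 12, so only a frame with cf->len == 12 passes here.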
+ */ + return (cf->len != padlen(start_index)); +} + +/* check padding and return 1/true when the check fails */ +static int check_pad(struct isotp_sock *so, struct canfd_frame *cf, + int start_index, u8 content) +{ + int i; + + /* no RX_PADDING value => check length of optimized frame length */ + if (!(so->opt.flags & CAN_ISOTP_RX_PADDING)) { + if (so->opt.flags & CAN_ISOTP_CHK_PAD_LEN) + return check_optimized(cf, start_index); + + /* no valid test against empty value => ignore frame */ + return 1; + } + + /* check datalength of correctly padded CAN frame */ + if ((so->opt.flags & CAN_ISOTP_CHK_PAD_LEN) && + cf->len != padlen(cf->len)) + return 1; + + /* check padding content */ + if (so->opt.flags & CAN_ISOTP_CHK_PAD_DATA) { + for (i = start_index; i < cf->len; i++) + if (cf->data[i] != content) + return 1; + } + return 0; +} + +static int isotp_rcv_fc(struct isotp_sock *so, struct canfd_frame *cf, int ae) +{ + struct sock *sk = &so->sk; + + if (so->tx.state != ISOTP_WAIT_FC && + so->tx.state != ISOTP_WAIT_FIRST_FC) + return 0; + + hrtimer_cancel(&so->txtimer); + + if ((cf->len < ae + FC_CONTENT_SZ) || + ((so->opt.flags & ISOTP_CHECK_PADDING) && + check_pad(so, cf, ae + FC_CONTENT_SZ, so->opt.rxpad_content))) { + /* malformed PDU - report 'not a data message' */ + sk->sk_err = EBADMSG; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + + so->tx.state = ISOTP_IDLE; + wake_up_interruptible(&so->wait); + return 1; + } + + /* get communication parameters only from the first FC frame */ + if (so->tx.state == ISOTP_WAIT_FIRST_FC) { + so->txfc.bs = cf->data[ae + 1]; + so->txfc.stmin = cf->data[ae + 2]; + + /* fix wrong STmin values according spec */ + if (so->txfc.stmin > 0x7F && + (so->txfc.stmin < 0xF1 || so->txfc.stmin > 0xF9)) + so->txfc.stmin = 0x7F; + + so->tx_gap = ktime_set(0, 0); + /* add transmission time for CAN frame N_As */ + so->tx_gap = ktime_add_ns(so->tx_gap, so->opt.frame_txtime); + /* add waiting time for consecutive frames N_Cs */ + if (so->opt.flags & CAN_ISOTP_FORCE_TXSTMIN) + so->tx_gap = ktime_add_ns(so->tx_gap, + so->force_tx_stmin); + else if (so->txfc.stmin < 0x80) + so->tx_gap = ktime_add_ns(so->tx_gap, + so->txfc.stmin * 1000000); + else + so->tx_gap = ktime_add_ns(so->tx_gap, + (so->txfc.stmin - 0xF0) + * 100000); + so->tx.state = ISOTP_WAIT_FC; + } + + switch (cf->data[ae] & 0x0F) { + case ISOTP_FC_CTS: + so->tx.bs = 0; + so->tx.state = ISOTP_SENDING; + /* start cyclic timer for sending CF frame */ + hrtimer_start(&so->txtimer, so->tx_gap, + HRTIMER_MODE_REL_SOFT); + break; + + case ISOTP_FC_WT: + /* start timer to wait for next FC frame */ + hrtimer_start(&so->txtimer, ktime_set(1, 0), + HRTIMER_MODE_REL_SOFT); + break; + + case ISOTP_FC_OVFLW: + /* overflow on receiver side - report 'message too long' */ + sk->sk_err = EMSGSIZE; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + fallthrough; + + default: + /* stop this tx job */ + so->tx.state = ISOTP_IDLE; + wake_up_interruptible(&so->wait); + } + return 0; +} + +static int isotp_rcv_sf(struct sock *sk, struct canfd_frame *cf, int pcilen, + struct sk_buff *skb, int len) +{ + struct isotp_sock *so = isotp_sk(sk); + struct sk_buff *nskb; + + hrtimer_cancel(&so->rxtimer); + so->rx.state = ISOTP_IDLE; + + if (!len || len > cf->len - pcilen) + return 1; + + if ((so->opt.flags & ISOTP_CHECK_PADDING) && + check_pad(so, cf, pcilen + len, so->opt.rxpad_content)) { + /* malformed PDU - report 'not a data message' */ + sk->sk_err = EBADMSG; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); 
+ return 1; + } + + nskb = alloc_skb(len, gfp_any()); + if (!nskb) + return 1; + + memcpy(skb_put(nskb, len), &cf->data[pcilen], len); + + nskb->tstamp = skb->tstamp; + nskb->dev = skb->dev; + isotp_rcv_skb(nskb, sk); + return 0; +} + +static int isotp_rcv_ff(struct sock *sk, struct canfd_frame *cf, int ae) +{ + struct isotp_sock *so = isotp_sk(sk); + int i; + int off; + int ff_pci_sz; + + hrtimer_cancel(&so->rxtimer); + so->rx.state = ISOTP_IDLE; + + /* get the used sender LL_DL from the (first) CAN frame data length */ + so->rx.ll_dl = padlen(cf->len); + + /* the first frame has to use the entire frame up to LL_DL length */ + if (cf->len != so->rx.ll_dl) + return 1; + + /* get the FF_DL */ + so->rx.len = (cf->data[ae] & 0x0F) << 8; + so->rx.len += cf->data[ae + 1]; + + /* Check for FF_DL escape sequence supporting 32 bit PDU length */ + if (so->rx.len) { + ff_pci_sz = FF_PCI_SZ12; + } else { + /* FF_DL = 0 => get real length from next 4 bytes */ + so->rx.len = cf->data[ae + 2] << 24; + so->rx.len += cf->data[ae + 3] << 16; + so->rx.len += cf->data[ae + 4] << 8; + so->rx.len += cf->data[ae + 5]; + ff_pci_sz = FF_PCI_SZ32; + } + + /* take care of a potential SF_DL ESC offset for TX_DL > 8 */ + off = (so->rx.ll_dl > CAN_MAX_DLEN) ? 1 : 0; + + if (so->rx.len + ae + off + ff_pci_sz < so->rx.ll_dl) + return 1; + + if (so->rx.len > MAX_MSG_LENGTH) { + /* send FC frame with overflow status */ + isotp_send_fc(sk, ae, ISOTP_FC_OVFLW); + return 1; + } + + /* copy the first received data bytes */ + so->rx.idx = 0; + for (i = ae + ff_pci_sz; i < so->rx.ll_dl; i++) + so->rx.buf[so->rx.idx++] = cf->data[i]; + + /* initial setup for this pdu reception */ + so->rx.sn = 1; + so->rx.state = ISOTP_WAIT_DATA; + + /* no creation of flow control frames */ + if (so->opt.flags & CAN_ISOTP_LISTEN_MODE) + return 0; + + /* send our first FC frame */ + isotp_send_fc(sk, ae, ISOTP_FC_CTS); + return 0; +} + +static int isotp_rcv_cf(struct sock *sk, struct canfd_frame *cf, int ae, + struct sk_buff *skb) +{ + struct isotp_sock *so = isotp_sk(sk); + struct sk_buff *nskb; + int i; + + if (so->rx.state != ISOTP_WAIT_DATA) + return 0; + + /* drop if timestamp gap is less than force_rx_stmin nano secs */ + if (so->opt.flags & CAN_ISOTP_FORCE_RXSTMIN) { + if (ktime_to_ns(ktime_sub(skb->tstamp, so->lastrxcf_tstamp)) < + so->force_rx_stmin) + return 0; + + so->lastrxcf_tstamp = skb->tstamp; + } + + hrtimer_cancel(&so->rxtimer); + + /* CFs are never longer than the FF */ + if (cf->len > so->rx.ll_dl) + return 1; + + /* CFs have usually the LL_DL length */ + if (cf->len < so->rx.ll_dl) { + /* this is only allowed for the last CF */ + if (so->rx.len - so->rx.idx > so->rx.ll_dl - ae - N_PCI_SZ) + return 1; + } + + if ((cf->data[ae] & 0x0F) != so->rx.sn) { + /* wrong sn detected - report 'illegal byte sequence' */ + sk->sk_err = EILSEQ; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + + /* reset rx state */ + so->rx.state = ISOTP_IDLE; + return 1; + } + so->rx.sn++; + so->rx.sn %= 16; + + for (i = ae + N_PCI_SZ; i < cf->len; i++) { + so->rx.buf[so->rx.idx++] = cf->data[i]; + if (so->rx.idx >= so->rx.len) + break; + } + + if (so->rx.idx >= so->rx.len) { + /* we are done */ + so->rx.state = ISOTP_IDLE; + + if ((so->opt.flags & ISOTP_CHECK_PADDING) && + check_pad(so, cf, i + 1, so->opt.rxpad_content)) { + /* malformed PDU - report 'not a data message' */ + sk->sk_err = EBADMSG; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + return 1; + } + + nskb = alloc_skb(so->rx.len, gfp_any()); + if (!nskb) + return 1; + 
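+		/* the PDU is complete: pass the reassembled message up as one skb */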
+ memcpy(skb_put(nskb, so->rx.len), so->rx.buf, + so->rx.len); + + nskb->tstamp = skb->tstamp; + nskb->dev = skb->dev; + isotp_rcv_skb(nskb, sk); + return 0; + } + + /* no creation of flow control frames */ + if (so->opt.flags & CAN_ISOTP_LISTEN_MODE) + return 0; + + /* perform blocksize handling, if enabled */ + if (!so->rxfc.bs || ++so->rx.bs < so->rxfc.bs) { + /* start rx timeout watchdog */ + hrtimer_start(&so->rxtimer, ktime_set(1, 0), + HRTIMER_MODE_REL_SOFT); + return 0; + } + + /* we reached the specified blocksize so->rxfc.bs */ + isotp_send_fc(sk, ae, ISOTP_FC_CTS); + return 0; +} + +static void isotp_rcv(struct sk_buff *skb, void *data) +{ + struct sock *sk = (struct sock *)data; + struct isotp_sock *so = isotp_sk(sk); + struct canfd_frame *cf; + int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0; + u8 n_pci_type, sf_dl; + + /* Strictly receive only frames with the configured MTU size + * => clear separation of CAN2.0 / CAN FD transport channels + */ + if (skb->len != so->ll.mtu) + return; + + cf = (struct canfd_frame *)skb->data; + + /* if enabled: check reception of my configured extended address */ + if (ae && cf->data[0] != so->opt.rx_ext_address) + return; + + n_pci_type = cf->data[ae] & 0xF0; + + if (so->opt.flags & CAN_ISOTP_HALF_DUPLEX) { + /* check rx/tx path half duplex expectations */ + if ((so->tx.state != ISOTP_IDLE && n_pci_type != N_PCI_FC) || + (so->rx.state != ISOTP_IDLE && n_pci_type == N_PCI_FC)) + return; + } + + switch (n_pci_type) { + case N_PCI_FC: + /* tx path: flow control frame containing the FC parameters */ + isotp_rcv_fc(so, cf, ae); + break; + + case N_PCI_SF: + /* rx path: single frame + * + * As we do not have a rx.ll_dl configuration, we can only test + * if the CAN frames payload length matches the LL_DL == 8 + * requirements - no matter if it's CAN 2.0 or CAN FD + */ + + /* get the SF_DL from the N_PCI byte */ + sf_dl = cf->data[ae] & 0x0F; + + if (cf->len <= CAN_MAX_DLEN) { + isotp_rcv_sf(sk, cf, SF_PCI_SZ4 + ae, skb, sf_dl); + } else { + if (skb->len == CANFD_MTU) { + /* We have a CAN FD frame and CAN_DL is greater than 8: + * Only frames with the SF_DL == 0 ESC value are valid. + * + * If so take care of the increased SF PCI size + * (SF_PCI_SZ8) to point to the message content behind + * the extended SF PCI info and get the real SF_DL + * length value from the formerly first data byte. 
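+				 *
+				 * Example (ae == 0): a 10 byte SF on CAN FD is
+				 * coded as data[0] = 0x00 (SF with SF_DL ESC),
+				 * data[1] = 0x0a (the real SF_DL), with the
+				 * payload starting at cf->data[2].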
+ */ + if (sf_dl == 0) + isotp_rcv_sf(sk, cf, SF_PCI_SZ8 + ae, skb, + cf->data[SF_PCI_SZ4 + ae]); + } + } + break; + + case N_PCI_FF: + /* rx path: first frame */ + isotp_rcv_ff(sk, cf, ae); + break; + + case N_PCI_CF: + /* rx path: consecutive frame */ + isotp_rcv_cf(sk, cf, ae, skb); + break; + } +} + +static void isotp_fill_dataframe(struct canfd_frame *cf, struct isotp_sock *so, + int ae, int off) +{ + int pcilen = N_PCI_SZ + ae + off; + int space = so->tx.ll_dl - pcilen; + int num = min_t(int, so->tx.len - so->tx.idx, space); + int i; + + cf->can_id = so->txid; + cf->len = num + pcilen; + + if (num < space) { + if (so->opt.flags & CAN_ISOTP_TX_PADDING) { + /* user requested padding */ + cf->len = padlen(cf->len); + memset(cf->data, so->opt.txpad_content, cf->len); + } else if (cf->len > CAN_MAX_DLEN) { + /* mandatory padding for CAN FD frames */ + cf->len = padlen(cf->len); + memset(cf->data, CAN_ISOTP_DEFAULT_PAD_CONTENT, + cf->len); + } + } + + for (i = 0; i < num; i++) + cf->data[pcilen + i] = so->tx.buf[so->tx.idx++]; + + if (ae) + cf->data[0] = so->opt.ext_address; +} + +static void isotp_create_fframe(struct canfd_frame *cf, struct isotp_sock *so, + int ae) +{ + int i; + int ff_pci_sz; + + cf->can_id = so->txid; + cf->len = so->tx.ll_dl; + if (ae) + cf->data[0] = so->opt.ext_address; + + /* create N_PCI bytes with 12/32 bit FF_DL data length */ + if (so->tx.len > 4095) { + /* use 32 bit FF_DL notation */ + cf->data[ae] = N_PCI_FF; + cf->data[ae + 1] = 0; + cf->data[ae + 2] = (u8)(so->tx.len >> 24) & 0xFFU; + cf->data[ae + 3] = (u8)(so->tx.len >> 16) & 0xFFU; + cf->data[ae + 4] = (u8)(so->tx.len >> 8) & 0xFFU; + cf->data[ae + 5] = (u8)so->tx.len & 0xFFU; + ff_pci_sz = FF_PCI_SZ32; + } else { + /* use 12 bit FF_DL notation */ + cf->data[ae] = (u8)(so->tx.len >> 8) | N_PCI_FF; + cf->data[ae + 1] = (u8)so->tx.len & 0xFFU; + ff_pci_sz = FF_PCI_SZ12; + } + + /* add first data bytes depending on ae */ + for (i = ae + ff_pci_sz; i < so->tx.ll_dl; i++) + cf->data[i] = so->tx.buf[so->tx.idx++]; + + so->tx.sn = 1; + so->tx.state = ISOTP_WAIT_FIRST_FC; +} + +static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer) +{ + struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, + txtimer); + struct sock *sk = &so->sk; + struct sk_buff *skb; + struct net_device *dev; + struct canfd_frame *cf; + enum hrtimer_restart restart = HRTIMER_NORESTART; + int can_send_ret; + int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 
1 : 0; + + switch (so->tx.state) { + case ISOTP_WAIT_FC: + case ISOTP_WAIT_FIRST_FC: + + /* we did not get any flow control frame in time */ + + /* report 'communication error on send' */ + sk->sk_err = ECOMM; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + + /* reset tx state */ + so->tx.state = ISOTP_IDLE; + wake_up_interruptible(&so->wait); + break; + + case ISOTP_SENDING: + + /* push out the next segmented pdu */ + dev = dev_get_by_index(sock_net(sk), so->ifindex); + if (!dev) + break; + +isotp_tx_burst: + skb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), + gfp_any()); + if (!skb) { + dev_put(dev); + break; + } + + can_skb_reserve(skb); + can_skb_prv(skb)->ifindex = dev->ifindex; + can_skb_prv(skb)->skbcnt = 0; + + cf = (struct canfd_frame *)skb->data; + skb_put(skb, so->ll.mtu); + + /* create consecutive frame */ + isotp_fill_dataframe(cf, so, ae, 0); + + /* place consecutive frame N_PCI in appropriate index */ + cf->data[ae] = N_PCI_CF | so->tx.sn++; + so->tx.sn %= 16; + so->tx.bs++; + + if (so->ll.mtu == CANFD_MTU) + cf->flags = so->ll.tx_flags; + + skb->dev = dev; + can_skb_set_owner(skb, sk); + + can_send_ret = can_send(skb, 1); + if (can_send_ret) + printk_once(KERN_NOTICE "can-isotp: %s: can_send_ret %d\n", + __func__, can_send_ret); + + if (so->tx.idx >= so->tx.len) { + /* we are done */ + so->tx.state = ISOTP_IDLE; + dev_put(dev); + wake_up_interruptible(&so->wait); + break; + } + + if (so->txfc.bs && so->tx.bs >= so->txfc.bs) { + /* stop and wait for FC */ + so->tx.state = ISOTP_WAIT_FC; + dev_put(dev); + hrtimer_set_expires(&so->txtimer, + ktime_add(ktime_get(), + ktime_set(1, 0))); + restart = HRTIMER_RESTART; + break; + } + + /* no gap between data frames needed => use burst mode */ + if (!so->tx_gap) + goto isotp_tx_burst; + + /* start timer to send next data frame with correct delay */ + dev_put(dev); + hrtimer_set_expires(&so->txtimer, + ktime_add(ktime_get(), so->tx_gap)); + restart = HRTIMER_RESTART; + break; + + default: + WARN_ON_ONCE(1); + } + + return restart; +} + +static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) +{ + struct sock *sk = sock->sk; + struct isotp_sock *so = isotp_sk(sk); + struct sk_buff *skb; + struct net_device *dev; + struct canfd_frame *cf; + int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0; + int wait_tx_done = (so->opt.flags & CAN_ISOTP_WAIT_TX_DONE) ? 1 : 0; + int off; + int err; + + if (!so->bound) + return -EADDRNOTAVAIL; + + /* we do not support multiple buffers - for now */ + if (so->tx.state != ISOTP_IDLE || wq_has_sleeper(&so->wait)) { + if (msg->msg_flags & MSG_DONTWAIT) + return -EAGAIN; + + /* wait for complete transmission of current pdu */ + wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); + } + + if (!size || size > MAX_MSG_LENGTH) + return -EINVAL; + + err = memcpy_from_msg(so->tx.buf, msg, size); + if (err < 0) + return err; + + dev = dev_get_by_index(sock_net(sk), so->ifindex); + if (!dev) + return -ENXIO; + + skb = sock_alloc_send_skb(sk, so->ll.mtu + sizeof(struct can_skb_priv), + msg->msg_flags & MSG_DONTWAIT, &err); + if (!skb) { + dev_put(dev); + return err; + } + + can_skb_reserve(skb); + can_skb_prv(skb)->ifindex = dev->ifindex; + can_skb_prv(skb)->skbcnt = 0; + + so->tx.state = ISOTP_SENDING; + so->tx.len = size; + so->tx.idx = 0; + + cf = (struct canfd_frame *)skb->data; + skb_put(skb, so->ll.mtu); + + /* take care of a potential SF_DL ESC offset for TX_DL > 8 */ + off = (so->tx.ll_dl > CAN_MAX_DLEN) ? 
1 : 0; + + /* check for single frame transmission depending on TX_DL */ + if (size <= so->tx.ll_dl - SF_PCI_SZ4 - ae - off) { + /* The message size generally fits into a SingleFrame - good. + * + * SF_DL ESC offset optimization: + * + * When TX_DL is greater 8 but the message would still fit + * into a 8 byte CAN frame, we can omit the offset. + * This prevents a protocol caused length extension from + * CAN_DL = 8 to CAN_DL = 12 due to the SF_SL ESC handling. + */ + if (size <= CAN_MAX_DLEN - SF_PCI_SZ4 - ae) + off = 0; + + isotp_fill_dataframe(cf, so, ae, off); + + /* place single frame N_PCI w/o length in appropriate index */ + cf->data[ae] = N_PCI_SF; + + /* place SF_DL size value depending on the SF_DL ESC offset */ + if (off) + cf->data[SF_PCI_SZ4 + ae] = size; + else + cf->data[ae] |= size; + + so->tx.state = ISOTP_IDLE; + wake_up_interruptible(&so->wait); + + /* don't enable wait queue for a single frame transmission */ + wait_tx_done = 0; + } else { + /* send first frame and wait for FC */ + + isotp_create_fframe(cf, so, ae); + + /* start timeout for FC */ + hrtimer_start(&so->txtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT); + } + + /* send the first or only CAN frame */ + if (so->ll.mtu == CANFD_MTU) + cf->flags = so->ll.tx_flags; + + skb->dev = dev; + skb->sk = sk; + err = can_send(skb, 1); + dev_put(dev); + if (err) { + printk_once(KERN_NOTICE "can-isotp: %s: can_send_ret %d\n", + __func__, err); + return err; + } + + if (wait_tx_done) { + /* wait for complete transmission of current pdu */ + wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); + } + + return size; +} + +static int isotp_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, + int flags) +{ + struct sock *sk = sock->sk; + struct sk_buff *skb; + int err = 0; + int noblock; + + noblock = flags & MSG_DONTWAIT; + flags &= ~MSG_DONTWAIT; + + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (!skb) + return err; + + if (size < skb->len) + msg->msg_flags |= MSG_TRUNC; + else + size = skb->len; + + err = memcpy_to_msg(msg, skb->data, size); + if (err < 0) { + skb_free_datagram(sk, skb); + return err; + } + + sock_recv_timestamp(msg, sk, skb); + + if (msg->msg_name) { + msg->msg_namelen = sizeof(struct sockaddr_can); + memcpy(msg->msg_name, skb->cb, msg->msg_namelen); + } + + skb_free_datagram(sk, skb); + + return size; +} + +static int isotp_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct isotp_sock *so; + struct net *net; + + if (!sk) + return 0; + + so = isotp_sk(sk); + net = sock_net(sk); + + /* wait for complete transmission of current pdu */ + wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); + + unregister_netdevice_notifier(&so->notifier); + + lock_sock(sk); + + hrtimer_cancel(&so->txtimer); + hrtimer_cancel(&so->rxtimer); + + /* remove current filters & unregister */ + if (so->bound) { + if (so->ifindex) { + struct net_device *dev; + + dev = dev_get_by_index(net, so->ifindex); + if (dev) { + can_rx_unregister(net, dev, so->rxid, + SINGLE_MASK(so->rxid), + isotp_rcv, sk); + dev_put(dev); + } + } + } + + so->ifindex = 0; + so->bound = 0; + + sock_orphan(sk); + sock->sk = NULL; + + release_sock(sk); + sock_put(sk); + + return 0; +} + +static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len) +{ + struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; + struct sock *sk = sock->sk; + struct isotp_sock *so = isotp_sk(sk); + struct net *net = sock_net(sk); + int ifindex; + struct net_device *dev; + int err = 0; + int notify_enetdown = 
0; + + if (len < CAN_REQUIRED_SIZE(struct sockaddr_can, can_addr.tp)) + return -EINVAL; + + if (addr->can_addr.tp.rx_id == addr->can_addr.tp.tx_id) + return -EADDRNOTAVAIL; + + if ((addr->can_addr.tp.rx_id | addr->can_addr.tp.tx_id) & + (CAN_ERR_FLAG | CAN_RTR_FLAG)) + return -EADDRNOTAVAIL; + + if (!addr->can_ifindex) + return -ENODEV; + + lock_sock(sk); + + if (so->bound && addr->can_ifindex == so->ifindex && + addr->can_addr.tp.rx_id == so->rxid && + addr->can_addr.tp.tx_id == so->txid) + goto out; + + dev = dev_get_by_index(net, addr->can_ifindex); + if (!dev) { + err = -ENODEV; + goto out; + } + if (dev->type != ARPHRD_CAN) { + dev_put(dev); + err = -ENODEV; + goto out; + } + if (dev->mtu < so->ll.mtu) { + dev_put(dev); + err = -EINVAL; + goto out; + } + if (!(dev->flags & IFF_UP)) + notify_enetdown = 1; + + ifindex = dev->ifindex; + + can_rx_register(net, dev, addr->can_addr.tp.rx_id, + SINGLE_MASK(addr->can_addr.tp.rx_id), isotp_rcv, sk, + "isotp", sk); + + dev_put(dev); + + if (so->bound) { + /* unregister old filter */ + if (so->ifindex) { + dev = dev_get_by_index(net, so->ifindex); + if (dev) { + can_rx_unregister(net, dev, so->rxid, + SINGLE_MASK(so->rxid), + isotp_rcv, sk); + dev_put(dev); + } + } + } + + /* switch to new settings */ + so->ifindex = ifindex; + so->rxid = addr->can_addr.tp.rx_id; + so->txid = addr->can_addr.tp.tx_id; + so->bound = 1; + +out: + release_sock(sk); + + if (notify_enetdown) { + sk->sk_err = ENETDOWN; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + } + + return err; +} + +static int isotp_getname(struct socket *sock, struct sockaddr *uaddr, int peer) +{ + struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; + struct sock *sk = sock->sk; + struct isotp_sock *so = isotp_sk(sk); + + if (peer) + return -EOPNOTSUPP; + + addr->can_family = AF_CAN; + addr->can_ifindex = so->ifindex; + addr->can_addr.tp.rx_id = so->rxid; + addr->can_addr.tp.tx_id = so->txid; + + return sizeof(*addr); +} + +static int isotp_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct isotp_sock *so = isotp_sk(sk); + int ret = 0; + + if (level != SOL_CAN_ISOTP) + return -EINVAL; + + switch (optname) { + case CAN_ISOTP_OPTS: + if (optlen != sizeof(struct can_isotp_options)) + return -EINVAL; + + if (copy_from_sockptr(&so->opt, optval, optlen)) + return -EFAULT; + + /* no separate rx_ext_address is given => use ext_address */ + if (!(so->opt.flags & CAN_ISOTP_RX_EXT_ADDR)) + so->opt.rx_ext_address = so->opt.ext_address; + break; + + case CAN_ISOTP_RECV_FC: + if (optlen != sizeof(struct can_isotp_fc_options)) + return -EINVAL; + + if (copy_from_sockptr(&so->rxfc, optval, optlen)) + return -EFAULT; + break; + + case CAN_ISOTP_TX_STMIN: + if (optlen != sizeof(u32)) + return -EINVAL; + + if (copy_from_sockptr(&so->force_tx_stmin, optval, optlen)) + return -EFAULT; + break; + + case CAN_ISOTP_RX_STMIN: + if (optlen != sizeof(u32)) + return -EINVAL; + + if (copy_from_sockptr(&so->force_rx_stmin, optval, optlen)) + return -EFAULT; + break; + + case CAN_ISOTP_LL_OPTS: + if (optlen == sizeof(struct can_isotp_ll_options)) { + struct can_isotp_ll_options ll; + + if (copy_from_sockptr(&ll, optval, optlen)) + return -EFAULT; + + /* check for correct ISO 11898-1 DLC data length */ + if (ll.tx_dl != padlen(ll.tx_dl)) + return -EINVAL; + + if (ll.mtu != CAN_MTU && ll.mtu != CANFD_MTU) + return -EINVAL; + + if (ll.mtu == CAN_MTU && ll.tx_dl > CAN_MAX_DLEN) + return -EINVAL; + + memcpy(&so->ll, &ll, 
sizeof(ll)); + + /* set ll_dl for tx path to similar place as for rx */ + so->tx.ll_dl = ll.tx_dl; + } else { + return -EINVAL; + } + break; + + default: + ret = -ENOPROTOOPT; + } + + return ret; +} + +static int isotp_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct isotp_sock *so = isotp_sk(sk); + int len; + void *val; + + if (level != SOL_CAN_ISOTP) + return -EINVAL; + if (get_user(len, optlen)) + return -EFAULT; + if (len < 0) + return -EINVAL; + + switch (optname) { + case CAN_ISOTP_OPTS: + len = min_t(int, len, sizeof(struct can_isotp_options)); + val = &so->opt; + break; + + case CAN_ISOTP_RECV_FC: + len = min_t(int, len, sizeof(struct can_isotp_fc_options)); + val = &so->rxfc; + break; + + case CAN_ISOTP_TX_STMIN: + len = min_t(int, len, sizeof(u32)); + val = &so->force_tx_stmin; + break; + + case CAN_ISOTP_RX_STMIN: + len = min_t(int, len, sizeof(u32)); + val = &so->force_rx_stmin; + break; + + case CAN_ISOTP_LL_OPTS: + len = min_t(int, len, sizeof(struct can_isotp_ll_options)); + val = &so->ll; + break; + + default: + return -ENOPROTOOPT; + } + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, val, len)) + return -EFAULT; + return 0; +} + +static int isotp_notifier(struct notifier_block *nb, unsigned long msg, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct isotp_sock *so = container_of(nb, struct isotp_sock, notifier); + struct sock *sk = &so->sk; + + if (!net_eq(dev_net(dev), sock_net(sk))) + return NOTIFY_DONE; + + if (dev->type != ARPHRD_CAN) + return NOTIFY_DONE; + + if (so->ifindex != dev->ifindex) + return NOTIFY_DONE; + + switch (msg) { + case NETDEV_UNREGISTER: + lock_sock(sk); + /* remove current filters & unregister */ + if (so->bound) + can_rx_unregister(dev_net(dev), dev, so->rxid, + SINGLE_MASK(so->rxid), + isotp_rcv, sk); + + so->ifindex = 0; + so->bound = 0; + release_sock(sk); + + sk->sk_err = ENODEV; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + break; + + case NETDEV_DOWN: + sk->sk_err = ENETDOWN; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + break; + } + + return NOTIFY_DONE; +} + +static int isotp_init(struct sock *sk) +{ + struct isotp_sock *so = isotp_sk(sk); + + so->ifindex = 0; + so->bound = 0; + + so->opt.flags = CAN_ISOTP_DEFAULT_FLAGS; + so->opt.ext_address = CAN_ISOTP_DEFAULT_EXT_ADDRESS; + so->opt.rx_ext_address = CAN_ISOTP_DEFAULT_EXT_ADDRESS; + so->opt.rxpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT; + so->opt.txpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT; + so->opt.frame_txtime = CAN_ISOTP_DEFAULT_FRAME_TXTIME; + so->rxfc.bs = CAN_ISOTP_DEFAULT_RECV_BS; + so->rxfc.stmin = CAN_ISOTP_DEFAULT_RECV_STMIN; + so->rxfc.wftmax = CAN_ISOTP_DEFAULT_RECV_WFTMAX; + so->ll.mtu = CAN_ISOTP_DEFAULT_LL_MTU; + so->ll.tx_dl = CAN_ISOTP_DEFAULT_LL_TX_DL; + so->ll.tx_flags = CAN_ISOTP_DEFAULT_LL_TX_FLAGS; + + /* set ll_dl for tx path to similar place as for rx */ + so->tx.ll_dl = so->ll.tx_dl; + + so->rx.state = ISOTP_IDLE; + so->tx.state = ISOTP_IDLE; + + hrtimer_init(&so->rxtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); + so->rxtimer.function = isotp_rx_timer_handler; + hrtimer_init(&so->txtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); + so->txtimer.function = isotp_tx_timer_handler; + + init_waitqueue_head(&so->wait); + + so->notifier.notifier_call = isotp_notifier; + register_netdevice_notifier(&so->notifier); + + return 0; +} + +static int isotp_sock_no_ioctlcmd(struct socket 
*sock, unsigned int cmd, + unsigned long arg) +{ + /* no ioctls for socket layer -> hand it down to NIC layer */ + return -ENOIOCTLCMD; +} + +static const struct proto_ops isotp_ops = { + .family = PF_CAN, + .release = isotp_release, + .bind = isotp_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = isotp_getname, + .poll = datagram_poll, + .ioctl = isotp_sock_no_ioctlcmd, + .gettstamp = sock_gettstamp, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = isotp_setsockopt, + .getsockopt = isotp_getsockopt, + .sendmsg = isotp_sendmsg, + .recvmsg = isotp_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct proto isotp_proto __read_mostly = { + .name = "CAN_ISOTP", + .owner = THIS_MODULE, + .obj_size = sizeof(struct isotp_sock), + .init = isotp_init, +}; + +static const struct can_proto isotp_can_proto = { + .type = SOCK_DGRAM, + .protocol = CAN_ISOTP, + .ops = &isotp_ops, + .prot = &isotp_proto, +}; + +static __init int isotp_module_init(void) +{ + int err; + + pr_info("can: isotp protocol (rev " CAN_ISOTP_VERSION ")\n"); + + err = can_proto_register(&isotp_can_proto); + if (err < 0) + pr_err("can: registration of isotp protocol failed\n"); + + return err; +} + +static __exit void isotp_module_exit(void) +{ + can_proto_unregister(&isotp_can_proto); +} + +module_init(isotp_module_init); +module_exit(isotp_module_exit); -- cgit v1.2.3 From eca43ee6c46db92dd850ce659316b0680d70e137 Mon Sep 17 00:00:00 2001 From: "Nikita V. Shirokov" Date: Fri, 9 Oct 2020 07:03:25 +0000 Subject: bpf: Add tcp_notsent_lowat bpf setsockopt Add support for the TCP_NOTSENT_LOWAT socket option (https://lwn.net/Articles/560082/) in tcp bpf programs. Signed-off-by: Nikita V. Shirokov Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201009070325.226855-1-tehnerd@tehnerd.com --- include/uapi/linux/bpf.h | 2 +- net/core/filter.c | 4 ++++ tools/include/uapi/linux/bpf.h | 2 +- tools/testing/selftests/bpf/progs/connect4_prog.c | 19 +++++++++++++++++++ 4 files changed, 25 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d83561e8cd2c..42d2df799397 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1698,7 +1698,7 @@ union bpf_attr { * **TCP_CONGESTION**, **TCP_BPF_IW**, * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, - * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**. + * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**. * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
* Return diff --git a/net/core/filter.c b/net/core/filter.c index 05df73780dd3..5da44b11e1ec 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4827,6 +4827,10 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, else icsk->icsk_user_timeout = val; break; + case TCP_NOTSENT_LOWAT: + tp->notsent_lowat = val; + sk->sk_write_space(sk); + break; default: ret = -EINVAL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index d83561e8cd2c..42d2df799397 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1698,7 +1698,7 @@ union bpf_attr { * **TCP_CONGESTION**, **TCP_BPF_IW**, * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, - * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**. + * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**. * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. * Return diff --git a/tools/testing/selftests/bpf/progs/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c index b1b2773c0b9d..a943d394fd3a 100644 --- a/tools/testing/selftests/bpf/progs/connect4_prog.c +++ b/tools/testing/selftests/bpf/progs/connect4_prog.c @@ -23,6 +23,10 @@ #define TCP_CA_NAME_MAX 16 #endif +#ifndef TCP_NOTSENT_LOWAT +#define TCP_NOTSENT_LOWAT 25 +#endif + #ifndef IFNAMSIZ #define IFNAMSIZ 16 #endif @@ -128,6 +132,18 @@ static __inline int set_keepalive(struct bpf_sock_addr *ctx) return 0; } +static __inline int set_notsent_lowat(struct bpf_sock_addr *ctx) +{ + int lowat = 65535; + + if (ctx->type == SOCK_STREAM) { + if (bpf_setsockopt(ctx, SOL_TCP, TCP_NOTSENT_LOWAT, &lowat, sizeof(lowat))) + return 1; + } + + return 0; +} + SEC("cgroup/connect4") int connect_v4_prog(struct bpf_sock_addr *ctx) { @@ -148,6 +164,9 @@ int connect_v4_prog(struct bpf_sock_addr *ctx) if (set_keepalive(ctx)) return 0; + if (set_notsent_lowat(ctx)) + return 0; + if (ctx->type != SOCK_STREAM && ctx->type != SOCK_DGRAM) return 0; else if (ctx->type == SOCK_STREAM) -- cgit v1.2.3 From 8858e8d98d5457ba23bcd0d99ce23e272b8b09a1 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 9 Oct 2020 18:07:14 +0900 Subject: block: fix uapi blkzoned.h comments Update the kdoc comments for struct blk_zone (capacity field description missing) and for struct blk_zone_report (flags field description missing). Signed-off-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/uapi/linux/blkzoned.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h index 42c3366cc25f..656a326821a2 100644 --- a/include/uapi/linux/blkzoned.h +++ b/include/uapi/linux/blkzoned.h @@ -93,12 +93,15 @@ enum blk_zone_report_flags { * @non_seq: Flag indicating that the zone is using non-sequential resources * (for host-aware zoned block devices only). * @reset: Flag indicating that a zone reset is recommended. - * @reserved: Padding to 64 B to match the ZBC/ZAC defined zone descriptor size. + * @resv: Padding for 8B alignment. + * @capacity: Zone usable capacity in 512 B sector units + * @reserved: Padding to 64 B to match the ZBC, ZAC and ZNS defined zone + * descriptor size. * - * start, len and wp use the regular 512 B sector unit, regardless of the - * device logical block size. 
The overall structure size is 64 B to match the - * ZBC/ZAC defined zone descriptor and allow support for future additional - * zone information. + * start, len, capacity and wp use the regular 512 B sector unit, regardless + * of the device logical block size. The overall structure size is 64 B to + * match the ZBC, ZAC and ZNS defined zone descriptor and allow support for + * future additional zone information. */ struct blk_zone { __u64 start; /* Zone start sector */ @@ -118,7 +121,7 @@ struct blk_zone { * * @sector: starting sector of report * @nr_zones: IN maximum / OUT actual - * @reserved: padding to 16 byte alignment + * @flags: one or more flags as defined by enum blk_zone_report_flags. * @zones: Space to hold @nr_zones @zones entries on reply. * * The array of at most @nr_zones must follow this structure in memory. -- cgit v1.2.3 From ccdf07219da6bd1f43c6ddcde4c0e36993c7365a Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 7 Oct 2020 09:00:43 +0300 Subject: devlink: Add reload action option to devlink reload command Add devlink reload action to allow the user to request a specific reload action. The action parameter is optional, if not specified then devlink driver re-init action is used (backward compatible). Note that when required to do firmware activation some drivers may need to reload the driver. On the other hand some drivers may need to reset the firmware to reinitialize the driver entities. Therefore, the devlink reload command returns the actions which were actually performed. Reload actions supported are: driver_reinit: driver entities re-initialization, applying devlink-param and devlink-resource values. fw_activate: firmware activate. command examples: $devlink dev reload pci/0000:82:00.0 action driver_reinit reload_actions_performed: driver_reinit $devlink dev reload pci/0000:82:00.0 action fw_activate reload_actions_performed: driver_reinit fw_activate Signed-off-by: Moshe Shemesh Reviewed-by: Jakub Kicinski Reviewed-by: Jacob Keller Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx4/main.c | 7 +- drivers/net/ethernet/mellanox/mlx5/core/devlink.c | 7 +- drivers/net/ethernet/mellanox/mlxsw/core.c | 10 ++- drivers/net/netdevsim/dev.c | 8 +- include/net/devlink.h | 7 +- include/uapi/linux/devlink.h | 13 +++ net/core/devlink.c | 98 +++++++++++++++++++++-- 7 files changed, 131 insertions(+), 19 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 70cf24ba71e4..649c5323cf9f 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -3946,6 +3946,7 @@ static int mlx4_restart_one_up(struct pci_dev *pdev, bool reload, struct devlink *devlink); static int mlx4_devlink_reload_down(struct devlink *devlink, bool netns_change, + enum devlink_reload_action action, struct netlink_ext_ack *extack) { struct mlx4_priv *priv = devlink_priv(devlink); @@ -3962,14 +3963,15 @@ static int mlx4_devlink_reload_down(struct devlink *devlink, bool netns_change, return 0; } -static int mlx4_devlink_reload_up(struct devlink *devlink, - struct netlink_ext_ack *extack) +static int mlx4_devlink_reload_up(struct devlink *devlink, enum devlink_reload_action action, + u32 *actions_performed, struct netlink_ext_ack *extack) { struct mlx4_priv *priv = devlink_priv(devlink); struct mlx4_dev *dev = &priv->dev; struct mlx4_dev_persistent *persist = dev->persist; int err; + *actions_performed = 
BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT); err = mlx4_restart_one_up(persist->pdev, true, devlink); if (err) mlx4_err(persist->dev, "mlx4_restart_one_up failed, ret=%d\n", @@ -3980,6 +3982,7 @@ static int mlx4_devlink_reload_up(struct devlink *devlink, static const struct devlink_ops mlx4_devlink_ops = { .port_type_set = mlx4_devlink_port_type_set, + .reload_actions = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT), .reload_down = mlx4_devlink_reload_down, .reload_up = mlx4_devlink_reload_up, }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c index 9b14e3f805a2..1b248c01a209 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -85,6 +85,7 @@ mlx5_devlink_info_get(struct devlink *devlink, struct devlink_info_req *req, } static int mlx5_devlink_reload_down(struct devlink *devlink, bool netns_change, + enum devlink_reload_action action, struct netlink_ext_ack *extack) { struct mlx5_core_dev *dev = devlink_priv(devlink); @@ -93,11 +94,12 @@ static int mlx5_devlink_reload_down(struct devlink *devlink, bool netns_change, return 0; } -static int mlx5_devlink_reload_up(struct devlink *devlink, - struct netlink_ext_ack *extack) +static int mlx5_devlink_reload_up(struct devlink *devlink, enum devlink_reload_action action, + u32 *actions_performed, struct netlink_ext_ack *extack) { struct mlx5_core_dev *dev = devlink_priv(devlink); + *actions_performed = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT); return mlx5_load_one(dev, false); } @@ -114,6 +116,7 @@ static const struct devlink_ops mlx5_devlink_ops = { #endif .flash_update = mlx5_devlink_flash_update, .info_get = mlx5_devlink_info_get, + .reload_actions = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT), .reload_down = mlx5_devlink_reload_down, .reload_up = mlx5_devlink_reload_up, }; diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c index a21afa56e3f7..cd9f56c73827 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core.c @@ -1414,7 +1414,7 @@ mlxsw_devlink_info_get(struct devlink *devlink, struct devlink_info_req *req, static int mlxsw_devlink_core_bus_device_reload_down(struct devlink *devlink, - bool netns_change, + bool netns_change, enum devlink_reload_action action, struct netlink_ext_ack *extack) { struct mlxsw_core *mlxsw_core = devlink_priv(devlink); @@ -1427,11 +1427,13 @@ mlxsw_devlink_core_bus_device_reload_down(struct devlink *devlink, } static int -mlxsw_devlink_core_bus_device_reload_up(struct devlink *devlink, - struct netlink_ext_ack *extack) +mlxsw_devlink_core_bus_device_reload_up(struct devlink *devlink, enum devlink_reload_action action, + u32 *actions_performed, struct netlink_ext_ack *extack) { struct mlxsw_core *mlxsw_core = devlink_priv(devlink); + *actions_performed = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT) | + BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE); return mlxsw_core_bus_device_register(mlxsw_core->bus_info, mlxsw_core->bus, mlxsw_core->bus_priv, true, @@ -1564,6 +1566,8 @@ mlxsw_devlink_trap_policer_counter_get(struct devlink *devlink, } static const struct devlink_ops mlxsw_devlink_ops = { + .reload_actions = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT) | + BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE), .reload_down = mlxsw_devlink_core_bus_device_reload_down, .reload_up = mlxsw_devlink_core_bus_device_reload_up, .port_type_set = mlxsw_devlink_port_type_set, diff --git a/drivers/net/netdevsim/dev.c 
b/drivers/net/netdevsim/dev.c index 56213ba151f6..b57e35c4ef6f 100644 --- a/drivers/net/netdevsim/dev.c +++ b/drivers/net/netdevsim/dev.c @@ -701,7 +701,7 @@ static int nsim_dev_reload_create(struct nsim_dev *nsim_dev, static void nsim_dev_reload_destroy(struct nsim_dev *nsim_dev); static int nsim_dev_reload_down(struct devlink *devlink, bool netns_change, - struct netlink_ext_ack *extack) + enum devlink_reload_action action, struct netlink_ext_ack *extack) { struct nsim_dev *nsim_dev = devlink_priv(devlink); @@ -717,8 +717,8 @@ static int nsim_dev_reload_down(struct devlink *devlink, bool netns_change, return 0; } -static int nsim_dev_reload_up(struct devlink *devlink, - struct netlink_ext_ack *extack) +static int nsim_dev_reload_up(struct devlink *devlink, enum devlink_reload_action action, + u32 *actions_performed, struct netlink_ext_ack *extack) { struct nsim_dev *nsim_dev = devlink_priv(devlink); @@ -730,6 +730,7 @@ static int nsim_dev_reload_up(struct devlink *devlink, return -EINVAL; } + *actions_performed = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT); return nsim_dev_reload_create(nsim_dev, extack); } @@ -886,6 +887,7 @@ nsim_dev_devlink_trap_policer_counter_get(struct devlink *devlink, static const struct devlink_ops nsim_dev_devlink_ops = { .supported_flash_update_params = DEVLINK_SUPPORT_FLASH_UPDATE_COMPONENT | DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK, + .reload_actions = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT), .reload_down = nsim_dev_reload_down, .reload_up = nsim_dev_reload_up, .info_get = nsim_dev_info_get, diff --git a/include/net/devlink.h b/include/net/devlink.h index 237ba5e29a3b..93c535ae5a4b 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1150,10 +1150,11 @@ struct devlink_ops { * implemementation. */ u32 supported_flash_update_params; + unsigned long reload_actions; int (*reload_down)(struct devlink *devlink, bool netns_change, - struct netlink_ext_ack *extack); - int (*reload_up)(struct devlink *devlink, - struct netlink_ext_ack *extack); + enum devlink_reload_action action, struct netlink_ext_ack *extack); + int (*reload_up)(struct devlink *devlink, enum devlink_reload_action action, + u32 *actions_performed, struct netlink_ext_ack *extack); int (*port_type_set)(struct devlink_port *devlink_port, enum devlink_port_type port_type); int (*port_split)(struct devlink *devlink, unsigned int port_index, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 5f1d6c327670..74bdad252c36 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -301,6 +301,16 @@ enum { DEVLINK_ATTR_TRAP_METADATA_TYPE_FA_COOKIE, }; +enum devlink_reload_action { + DEVLINK_RELOAD_ACTION_UNSPEC, + DEVLINK_RELOAD_ACTION_DRIVER_REINIT, /* Driver entities re-instantiation */ + DEVLINK_RELOAD_ACTION_FW_ACTIVATE, /* FW activate */ + + /* Add new reload actions above */ + __DEVLINK_RELOAD_ACTION_MAX, + DEVLINK_RELOAD_ACTION_MAX = __DEVLINK_RELOAD_ACTION_MAX - 1 +}; + enum devlink_attr { /* don't change the order or add anything between, this is ABI! 
*/ DEVLINK_ATTR_UNSPEC, @@ -493,6 +503,9 @@ enum devlink_attr { DEVLINK_ATTR_FLASH_UPDATE_STATUS_TIMEOUT, /* u64 */ DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK, /* bitfield32 */ + DEVLINK_ATTR_RELOAD_ACTION, /* u8 */ + DEVLINK_ATTR_RELOAD_ACTIONS_PERFORMED, /* bitfield32 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 5c45b3964ec3..c026ed3519c9 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -479,6 +479,12 @@ static int devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink) return 0; } +static bool +devlink_reload_action_is_supported(struct devlink *devlink, enum devlink_reload_action action) +{ + return test_bit(action, &devlink->ops->reload_actions); +} + static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink, enum devlink_command cmd, u32 portid, u32 seq, int flags) @@ -2984,6 +2990,7 @@ bool devlink_is_reload_failed(const struct devlink *devlink) EXPORT_SYMBOL_GPL(devlink_is_reload_failed); static int devlink_reload(struct devlink *devlink, struct net *dest_net, + enum devlink_reload_action action, u32 *actions_performed, struct netlink_ext_ack *extack) { int err; @@ -2991,22 +2998,60 @@ static int devlink_reload(struct devlink *devlink, struct net *dest_net, if (!devlink->reload_enabled) return -EOPNOTSUPP; - err = devlink->ops->reload_down(devlink, !!dest_net, extack); + err = devlink->ops->reload_down(devlink, !!dest_net, action, extack); if (err) return err; if (dest_net && !net_eq(dest_net, devlink_net(devlink))) devlink_reload_netns_change(devlink, dest_net); - err = devlink->ops->reload_up(devlink, extack); + err = devlink->ops->reload_up(devlink, action, actions_performed, extack); devlink_reload_failed_set(devlink, !!err); - return err; + if (err) + return err; + + WARN_ON(!(*actions_performed & BIT(action))); + return 0; +} + +static int +devlink_nl_reload_actions_performed_snd(struct devlink *devlink, u32 actions_performed, + enum devlink_command cmd, struct genl_info *info) +{ + struct sk_buff *msg; + void *hdr; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &devlink_nl_family, 0, cmd); + if (!hdr) + goto free_msg; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + + if (nla_put_bitfield32(msg, DEVLINK_ATTR_RELOAD_ACTIONS_PERFORMED, actions_performed, + actions_performed)) + goto nla_put_failure; + genlmsg_end(msg, hdr); + + return genlmsg_reply(msg, info); + +nla_put_failure: + genlmsg_cancel(msg, hdr); +free_msg: + nlmsg_free(msg); + return -EMSGSIZE; } static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; + enum devlink_reload_action action; struct net *dest_net = NULL; + u32 actions_performed; int err; if (!devlink_reload_supported(devlink->ops)) @@ -3026,12 +3071,30 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) return PTR_ERR(dest_net); } - err = devlink_reload(devlink, dest_net, info->extack); + if (info->attrs[DEVLINK_ATTR_RELOAD_ACTION]) + action = nla_get_u8(info->attrs[DEVLINK_ATTR_RELOAD_ACTION]); + else + action = DEVLINK_RELOAD_ACTION_DRIVER_REINIT; + + if (!devlink_reload_action_is_supported(devlink, action)) { + NL_SET_ERR_MSG_MOD(info->extack, + "Requested reload action is not supported by the driver"); + return -EOPNOTSUPP; + } + + err = devlink_reload(devlink, dest_net, action, &actions_performed, 
info->extack); if (dest_net) put_net(dest_net); - return err; + if (err) + return err; + /* For backward compatibility generate reply only if attributes used by user */ + if (!info->attrs[DEVLINK_ATTR_RELOAD_ACTION]) + return 0; + + return devlink_nl_reload_actions_performed_snd(devlink, actions_performed, + DEVLINK_CMD_RELOAD, info); } static int devlink_nl_flash_update_fill(struct sk_buff *msg, @@ -7282,6 +7345,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_TRAP_POLICER_RATE] = { .type = NLA_U64 }, [DEVLINK_ATTR_TRAP_POLICER_BURST] = { .type = NLA_U64 }, [DEVLINK_ATTR_PORT_FUNCTION] = { .type = NLA_NESTED }, + [DEVLINK_ATTR_RELOAD_ACTION] = NLA_POLICY_RANGE(NLA_U8, DEVLINK_RELOAD_ACTION_DRIVER_REINIT, + DEVLINK_RELOAD_ACTION_MAX), }; static const struct genl_small_ops devlink_nl_ops[] = { @@ -7615,6 +7680,21 @@ static struct genl_family devlink_nl_family __ro_after_init = { .n_mcgrps = ARRAY_SIZE(devlink_nl_mcgrps), }; +static bool devlink_reload_actions_valid(const struct devlink_ops *ops) +{ + if (!devlink_reload_supported(ops)) { + if (WARN_ON(ops->reload_actions)) + return false; + return true; + } + + if (WARN_ON(!ops->reload_actions || + ops->reload_actions & BIT(DEVLINK_RELOAD_ACTION_UNSPEC) || + ops->reload_actions >= BIT(__DEVLINK_RELOAD_ACTION_MAX))) + return false; + return true; +} + /** * devlink_alloc - Allocate new devlink instance resources * @@ -7631,6 +7711,9 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) if (WARN_ON(!ops)) return NULL; + if (!devlink_reload_actions_valid(ops)) + return NULL; + devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL); if (!devlink) return NULL; @@ -9960,6 +10043,7 @@ int devlink_compat_switch_id_get(struct net_device *dev, static void __net_exit devlink_pernet_pre_exit(struct net *net) { struct devlink *devlink; + u32 actions_performed; int err; /* In case network namespace is getting destroyed, reload @@ -9970,7 +10054,9 @@ static void __net_exit devlink_pernet_pre_exit(struct net *net) if (net_eq(devlink_net(devlink), net)) { if (WARN_ON(!devlink_reload_supported(devlink->ops))) continue; - err = devlink_reload(devlink, &init_net, NULL); + err = devlink_reload(devlink, &init_net, + DEVLINK_RELOAD_ACTION_DRIVER_REINIT, + &actions_performed, NULL); if (err && err != -EOPNOTSUPP) pr_warn("Failed to reload devlink instance into init_net\n"); } -- cgit v1.2.3 From dc64cc7c63102ac78bac3cfbc00ef3abd7a3fdf3 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 7 Oct 2020 09:00:44 +0300 Subject: devlink: Add devlink reload limit option Add a reload limit option to let the user impose restrictions on reload actions. Reload limits supported: no_reset: No reset allowed, no down time allowed, no link flap and no configuration is lost. By default the reload limit is unspecified, so no constraints are placed on reload actions. Some combinations of action and limit are invalid. For example, a driver cannot reinitialize its entities without any down time. The no_reset reload limit is used later in this patchset to implement restricted fw_activate on mlx5. The uapi reload limit parameter is defined as a bitfield to keep it ready for future multiselection support.
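As a sketch of the driver-facing side (not part of this patch; the foo_* names are placeholders), a driver advertising the new limit alongside its reload actions would extend its devlink_ops like so:

static const struct devlink_ops foo_devlink_ops = {
	/* reload actions this driver can perform */
	.reload_actions = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT) |
			  BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE),
	/* limits the driver can honor; driver_reinit combined with
	 * no_reset stays rejected as invalid by devlink core
	 */
	.reload_limits = BIT(DEVLINK_RELOAD_LIMIT_NO_RESET),
	.reload_down = foo_devlink_reload_down,
	.reload_up = foo_devlink_reload_up,
};

A matching user request, assuming iproute2 grows a corresponding keyword, would then be: $ devlink dev reload pci/0000:82:00.0 action fw_activate limit no_reset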
Signed-off-by: Moshe Shemesh Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx4/main.c | 4 +- drivers/net/ethernet/mellanox/mlx5/core/devlink.c | 4 +- drivers/net/ethernet/mellanox/mlxsw/core.c | 4 +- drivers/net/netdevsim/dev.c | 6 +- include/net/devlink.h | 8 +- include/uapi/linux/devlink.h | 14 ++++ net/core/devlink.c | 92 +++++++++++++++++++++-- 7 files changed, 119 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 649c5323cf9f..c326b434734e 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -3947,6 +3947,7 @@ static int mlx4_restart_one_up(struct pci_dev *pdev, bool reload, static int mlx4_devlink_reload_down(struct devlink *devlink, bool netns_change, enum devlink_reload_action action, + enum devlink_reload_limit limit, struct netlink_ext_ack *extack) { struct mlx4_priv *priv = devlink_priv(devlink); @@ -3964,7 +3965,8 @@ static int mlx4_devlink_reload_down(struct devlink *devlink, bool netns_change, } static int mlx4_devlink_reload_up(struct devlink *devlink, enum devlink_reload_action action, - u32 *actions_performed, struct netlink_ext_ack *extack) + enum devlink_reload_limit limit, u32 *actions_performed, + struct netlink_ext_ack *extack) { struct mlx4_priv *priv = devlink_priv(devlink); struct mlx4_dev *dev = &priv->dev; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c index 1b248c01a209..0016041e8779 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -86,6 +86,7 @@ mlx5_devlink_info_get(struct devlink *devlink, struct devlink_info_req *req, static int mlx5_devlink_reload_down(struct devlink *devlink, bool netns_change, enum devlink_reload_action action, + enum devlink_reload_limit limit, struct netlink_ext_ack *extack) { struct mlx5_core_dev *dev = devlink_priv(devlink); @@ -95,7 +96,8 @@ static int mlx5_devlink_reload_down(struct devlink *devlink, bool netns_change, } static int mlx5_devlink_reload_up(struct devlink *devlink, enum devlink_reload_action action, - u32 *actions_performed, struct netlink_ext_ack *extack) + enum devlink_reload_limit limit, u32 *actions_performed, + struct netlink_ext_ack *extack) { struct mlx5_core_dev *dev = devlink_priv(devlink); diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c index cd9f56c73827..7f77c2a71d1c 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core.c @@ -1415,6 +1415,7 @@ mlxsw_devlink_info_get(struct devlink *devlink, struct devlink_info_req *req, static int mlxsw_devlink_core_bus_device_reload_down(struct devlink *devlink, bool netns_change, enum devlink_reload_action action, + enum devlink_reload_limit limit, struct netlink_ext_ack *extack) { struct mlxsw_core *mlxsw_core = devlink_priv(devlink); @@ -1428,7 +1429,8 @@ mlxsw_devlink_core_bus_device_reload_down(struct devlink *devlink, static int mlxsw_devlink_core_bus_device_reload_up(struct devlink *devlink, enum devlink_reload_action action, - u32 *actions_performed, struct netlink_ext_ack *extack) + enum devlink_reload_limit limit, u32 *actions_performed, + struct netlink_ext_ack *extack) { struct mlxsw_core *mlxsw_core = devlink_priv(devlink); diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c index 
b57e35c4ef6f..d07061417675 100644 --- a/drivers/net/netdevsim/dev.c +++ b/drivers/net/netdevsim/dev.c @@ -701,7 +701,8 @@ static int nsim_dev_reload_create(struct nsim_dev *nsim_dev, static void nsim_dev_reload_destroy(struct nsim_dev *nsim_dev); static int nsim_dev_reload_down(struct devlink *devlink, bool netns_change, - enum devlink_reload_action action, struct netlink_ext_ack *extack) + enum devlink_reload_action action, enum devlink_reload_limit limit, + struct netlink_ext_ack *extack) { struct nsim_dev *nsim_dev = devlink_priv(devlink); @@ -718,7 +719,8 @@ static int nsim_dev_reload_down(struct devlink *devlink, bool netns_change, } static int nsim_dev_reload_up(struct devlink *devlink, enum devlink_reload_action action, - u32 *actions_performed, struct netlink_ext_ack *extack) + enum devlink_reload_limit limit, u32 *actions_performed, + struct netlink_ext_ack *extack) { struct nsim_dev *nsim_dev = devlink_priv(devlink); diff --git a/include/net/devlink.h b/include/net/devlink.h index 93c535ae5a4b..9f5c37c391f8 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1151,10 +1151,14 @@ struct devlink_ops { */ u32 supported_flash_update_params; unsigned long reload_actions; + unsigned long reload_limits; int (*reload_down)(struct devlink *devlink, bool netns_change, - enum devlink_reload_action action, struct netlink_ext_ack *extack); + enum devlink_reload_action action, + enum devlink_reload_limit limit, + struct netlink_ext_ack *extack); int (*reload_up)(struct devlink *devlink, enum devlink_reload_action action, - u32 *actions_performed, struct netlink_ext_ack *extack); + enum devlink_reload_limit limit, u32 *actions_performed, + struct netlink_ext_ack *extack); int (*port_type_set)(struct devlink_port *devlink_port, enum devlink_port_type port_type); int (*port_split)(struct devlink *devlink, unsigned int port_index, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 74bdad252c36..82a5e66c1518 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -311,6 +311,19 @@ enum devlink_reload_action { DEVLINK_RELOAD_ACTION_MAX = __DEVLINK_RELOAD_ACTION_MAX - 1 }; +enum devlink_reload_limit { + DEVLINK_RELOAD_LIMIT_UNSPEC, /* unspecified, no constraints */ + DEVLINK_RELOAD_LIMIT_NO_RESET, /* No reset allowed, no down time allowed, + * no link flap and no configuration is lost. + */ + + /* Add new reload limit above */ + __DEVLINK_RELOAD_LIMIT_MAX, + DEVLINK_RELOAD_LIMIT_MAX = __DEVLINK_RELOAD_LIMIT_MAX - 1 +}; + +#define DEVLINK_RELOAD_LIMITS_VALID_MASK (BIT(__DEVLINK_RELOAD_LIMIT_MAX) - 1) + enum devlink_attr { /* don't change the order or add anything between, this is ABI! 
*/ DEVLINK_ATTR_UNSPEC, @@ -505,6 +518,7 @@ enum devlink_attr { DEVLINK_ATTR_RELOAD_ACTION, /* u8 */ DEVLINK_ATTR_RELOAD_ACTIONS_PERFORMED, /* bitfield32 */ + DEVLINK_ATTR_RELOAD_LIMITS, /* bitfield32 */ /* add new attributes above here, update the policy in devlink.c */ diff --git a/net/core/devlink.c b/net/core/devlink.c index c026ed3519c9..28b63faa3c6b 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -479,12 +479,44 @@ static int devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink) return 0; } +struct devlink_reload_combination { + enum devlink_reload_action action; + enum devlink_reload_limit limit; +}; + +static const struct devlink_reload_combination devlink_reload_invalid_combinations[] = { + { + /* can't reinitialize driver with no down time */ + .action = DEVLINK_RELOAD_ACTION_DRIVER_REINIT, + .limit = DEVLINK_RELOAD_LIMIT_NO_RESET, + }, +}; + +static bool +devlink_reload_combination_is_invalid(enum devlink_reload_action action, + enum devlink_reload_limit limit) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(devlink_reload_invalid_combinations); i++) + if (devlink_reload_invalid_combinations[i].action == action && + devlink_reload_invalid_combinations[i].limit == limit) + return true; + return false; +} + static bool devlink_reload_action_is_supported(struct devlink *devlink, enum devlink_reload_action action) { return test_bit(action, &devlink->ops->reload_actions); } +static bool +devlink_reload_limit_is_supported(struct devlink *devlink, enum devlink_reload_limit limit) +{ + return test_bit(limit, &devlink->ops->reload_limits); +} + static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink, enum devlink_command cmd, u32 portid, u32 seq, int flags) @@ -2990,22 +3022,22 @@ bool devlink_is_reload_failed(const struct devlink *devlink) EXPORT_SYMBOL_GPL(devlink_is_reload_failed); static int devlink_reload(struct devlink *devlink, struct net *dest_net, - enum devlink_reload_action action, u32 *actions_performed, - struct netlink_ext_ack *extack) + enum devlink_reload_action action, enum devlink_reload_limit limit, + u32 *actions_performed, struct netlink_ext_ack *extack) { int err; if (!devlink->reload_enabled) return -EOPNOTSUPP; - err = devlink->ops->reload_down(devlink, !!dest_net, action, extack); + err = devlink->ops->reload_down(devlink, !!dest_net, action, limit, extack); if (err) return err; if (dest_net && !net_eq(dest_net, devlink_net(devlink))) devlink_reload_netns_change(devlink, dest_net); - err = devlink->ops->reload_up(devlink, action, actions_performed, extack); + err = devlink->ops->reload_up(devlink, action, limit, actions_performed, extack); devlink_reload_failed_set(devlink, !!err); if (err) return err; @@ -3050,6 +3082,7 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; enum devlink_reload_action action; + enum devlink_reload_limit limit; struct net *dest_net = NULL; u32 actions_performed; int err; @@ -3082,7 +3115,38 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) return -EOPNOTSUPP; } - err = devlink_reload(devlink, dest_net, action, &actions_performed, info->extack); + limit = DEVLINK_RELOAD_LIMIT_UNSPEC; + if (info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]) { + struct nla_bitfield32 limits; + u32 limits_selected; + + limits = nla_get_bitfield32(info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]); + limits_selected = limits.value & limits.selector; + if (!limits_selected) { + NL_SET_ERR_MSG_MOD(info->extack, "Invalid limit 
selected"); + return -EINVAL; + } + for (limit = 0 ; limit <= DEVLINK_RELOAD_LIMIT_MAX ; limit++) + if (limits_selected & BIT(limit)) + break; + /* UAPI enables multiselection, but currently it is not used */ + if (limits_selected != BIT(limit)) { + NL_SET_ERR_MSG_MOD(info->extack, + "Multiselection of limit is not supported"); + return -EOPNOTSUPP; + } + if (!devlink_reload_limit_is_supported(devlink, limit)) { + NL_SET_ERR_MSG_MOD(info->extack, + "Requested limit is not supported by the driver"); + return -EOPNOTSUPP; + } + if (devlink_reload_combination_is_invalid(action, limit)) { + NL_SET_ERR_MSG_MOD(info->extack, + "Requested limit is invalid for this action"); + return -EINVAL; + } + } + err = devlink_reload(devlink, dest_net, action, limit, &actions_performed, info->extack); if (dest_net) put_net(dest_net); @@ -3090,7 +3154,7 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) if (err) return err; /* For backward compatibility generate reply only if attributes used by user */ - if (!info->attrs[DEVLINK_ATTR_RELOAD_ACTION]) + if (!info->attrs[DEVLINK_ATTR_RELOAD_ACTION] && !info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]) return 0; return devlink_nl_reload_actions_performed_snd(devlink, actions_performed, @@ -7347,6 +7411,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_PORT_FUNCTION] = { .type = NLA_NESTED }, [DEVLINK_ATTR_RELOAD_ACTION] = NLA_POLICY_RANGE(NLA_U8, DEVLINK_RELOAD_ACTION_DRIVER_REINIT, DEVLINK_RELOAD_ACTION_MAX), + [DEVLINK_ATTR_RELOAD_LIMITS] = NLA_POLICY_BITFIELD32(DEVLINK_RELOAD_LIMITS_VALID_MASK), }; static const struct genl_small_ops devlink_nl_ops[] = { @@ -7682,6 +7747,9 @@ static struct genl_family devlink_nl_family __ro_after_init = { static bool devlink_reload_actions_valid(const struct devlink_ops *ops) { + const struct devlink_reload_combination *comb; + int i; + if (!devlink_reload_supported(ops)) { if (WARN_ON(ops->reload_actions)) return false; @@ -7692,6 +7760,17 @@ static bool devlink_reload_actions_valid(const struct devlink_ops *ops) ops->reload_actions & BIT(DEVLINK_RELOAD_ACTION_UNSPEC) || ops->reload_actions >= BIT(__DEVLINK_RELOAD_ACTION_MAX))) return false; + + if (WARN_ON(ops->reload_limits & BIT(DEVLINK_RELOAD_LIMIT_UNSPEC) || + ops->reload_limits >= BIT(__DEVLINK_RELOAD_LIMIT_MAX))) + return false; + + for (i = 0; i < ARRAY_SIZE(devlink_reload_invalid_combinations); i++) { + comb = &devlink_reload_invalid_combinations[i]; + if (ops->reload_actions == BIT(comb->action) && + ops->reload_limits == BIT(comb->limit)) + return false; + } return true; } @@ -10056,6 +10135,7 @@ static void __net_exit devlink_pernet_pre_exit(struct net *net) continue; err = devlink_reload(devlink, &init_net, DEVLINK_RELOAD_ACTION_DRIVER_REINIT, + DEVLINK_RELOAD_LIMIT_UNSPEC, &actions_performed, NULL); if (err && err != -EOPNOTSUPP) pr_warn("Failed to reload devlink instance into init_net\n"); -- cgit v1.2.3 From a254c264267e8746fb257806c166e54375cf9c06 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 7 Oct 2020 09:00:45 +0300 Subject: devlink: Add reload stats Add reload stats to hold the history per reload action type and limit. For example, the number of times fw_activate has been performed on this device since the driver module was added or if the firmware activation was performed with or without reset. Add devlink notification on stats update. Expose devlink reload stats to the user through devlink dev get command. 
Examples: $ devlink dev show pci/0000:82:00.0: stats: reload: driver_reinit 2 fw_activate 1 fw_activate_no_reset 0 pci/0000:82:00.1: stats: reload: driver_reinit 1 fw_activate 0 fw_activate_no_reset 0 $ devlink dev show -jp { "dev": { "pci/0000:82:00.0": { "stats": { "reload": { "driver_reinit": 2, "fw_activate": 1, "fw_activate_no_reset": 0 } } }, "pci/0000:82:00.1": { "stats": { "reload": { "driver_reinit": 1, "fw_activate": 0, "fw_activate_no_reset": 0 } } } } } Signed-off-by: Moshe Shemesh Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 8 ++++ include/uapi/linux/devlink.h | 6 +++ net/core/devlink.c | 90 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 104 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/devlink.h b/include/net/devlink.h index 9f5c37c391f8..d091c6ba82ce 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -20,6 +20,13 @@ #include #include +#define DEVLINK_RELOAD_STATS_ARRAY_SIZE \ + (__DEVLINK_RELOAD_LIMIT_MAX * __DEVLINK_RELOAD_ACTION_MAX) + +struct devlink_dev_stats { + u32 reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; +}; + struct devlink_ops; struct devlink { @@ -38,6 +45,7 @@ struct devlink { struct list_head trap_policer_list; const struct devlink_ops *ops; struct xarray snapshot_ids; + struct devlink_dev_stats stats; struct device *dev; possible_net_t _net; struct mutex lock; /* Serializes access to devlink instance specific objects such as diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 82a5e66c1518..ab15fc597b74 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -520,6 +520,12 @@ enum devlink_attr { DEVLINK_ATTR_RELOAD_ACTIONS_PERFORMED, /* bitfield32 */ DEVLINK_ATTR_RELOAD_LIMITS, /* bitfield32 */ + DEVLINK_ATTR_DEV_STATS, /* nested */ + DEVLINK_ATTR_RELOAD_STATS, /* nested */ + DEVLINK_ATTR_RELOAD_STATS_ENTRY, /* nested */ + DEVLINK_ATTR_RELOAD_STATS_LIMIT, /* u8 */ + DEVLINK_ATTR_RELOAD_STATS_VALUE, /* u32 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 28b63faa3c6b..a167c3bb468c 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -517,10 +517,66 @@ devlink_reload_limit_is_supported(struct devlink *devlink, enum devlink_reload_l return test_bit(limit, &devlink->ops->reload_limits); } +static int devlink_reload_stat_put(struct sk_buff *msg, enum devlink_reload_action action, + enum devlink_reload_limit limit, u32 value) +{ + struct nlattr *reload_stats_entry; + + reload_stats_entry = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_STATS_ENTRY); + if (!reload_stats_entry) + return -EMSGSIZE; + + if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_ACTION, action) || + nla_put_u8(msg, DEVLINK_ATTR_RELOAD_STATS_LIMIT, limit) || + nla_put_u32(msg, DEVLINK_ATTR_RELOAD_STATS_VALUE, value)) + goto nla_put_failure; + nla_nest_end(msg, reload_stats_entry); + return 0; + +nla_put_failure: + nla_nest_cancel(msg, reload_stats_entry); + return -EMSGSIZE; +} + +static int devlink_reload_stats_put(struct sk_buff *msg, struct devlink *devlink) +{ + struct nlattr *reload_stats_attr; + int i, j, stat_idx; + u32 value; + + reload_stats_attr = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_STATS); + + if (!reload_stats_attr) + return -EMSGSIZE; + + for (j = 0; j <= DEVLINK_RELOAD_LIMIT_MAX; j++) { + if (j != DEVLINK_RELOAD_LIMIT_UNSPEC && + !devlink_reload_limit_is_supported(devlink, j)) + continue; + for (i = 0; i <= DEVLINK_RELOAD_ACTION_MAX; i++) { + 
if (!devlink_reload_action_is_supported(devlink, i) || + devlink_reload_combination_is_invalid(i, j)) + continue; + + stat_idx = j * __DEVLINK_RELOAD_ACTION_MAX + i; + value = devlink->stats.reload_stats[stat_idx]; + if (devlink_reload_stat_put(msg, i, j, value)) + goto nla_put_failure; + } + } + nla_nest_end(msg, reload_stats_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(msg, reload_stats_attr); + return -EMSGSIZE; +} + static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink, enum devlink_command cmd, u32 portid, u32 seq, int flags) { + struct nlattr *dev_stats; void *hdr; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); @@ -532,9 +588,19 @@ static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink, if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_FAILED, devlink->reload_failed)) goto nla_put_failure; + dev_stats = nla_nest_start(msg, DEVLINK_ATTR_DEV_STATS); + if (!dev_stats) + goto nla_put_failure; + + if (devlink_reload_stats_put(msg, devlink)) + goto dev_stats_nest_cancel; + + nla_nest_end(msg, dev_stats); genlmsg_end(msg, hdr); return 0; +dev_stats_nest_cancel: + nla_nest_cancel(msg, dev_stats); nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; @@ -3021,6 +3087,29 @@ bool devlink_is_reload_failed(const struct devlink *devlink) } EXPORT_SYMBOL_GPL(devlink_is_reload_failed); +static void +__devlink_reload_stats_update(struct devlink *devlink, u32 *reload_stats, + enum devlink_reload_limit limit, u32 actions_performed) +{ + unsigned long actions = actions_performed; + int stat_idx; + int action; + + for_each_set_bit(action, &actions, __DEVLINK_RELOAD_ACTION_MAX) { + stat_idx = limit * __DEVLINK_RELOAD_ACTION_MAX + action; + reload_stats[stat_idx]++; + } + devlink_notify(devlink, DEVLINK_CMD_NEW); +} + +static void +devlink_reload_stats_update(struct devlink *devlink, enum devlink_reload_limit limit, + u32 actions_performed) +{ + __devlink_reload_stats_update(devlink, devlink->stats.reload_stats, limit, + actions_performed); +} + static int devlink_reload(struct devlink *devlink, struct net *dest_net, enum devlink_reload_action action, enum devlink_reload_limit limit, u32 *actions_performed, struct netlink_ext_ack *extack) @@ -3043,6 +3132,7 @@ static int devlink_reload(struct devlink *devlink, struct net *dest_net, return err; WARN_ON(!(*actions_performed & BIT(action))); + devlink_reload_stats_update(devlink, limit, *actions_performed); return 0; } -- cgit v1.2.3 From 77069ba2e3adf48c472fbbd9cbd7a4f5370b17df Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 7 Oct 2020 09:00:46 +0300 Subject: devlink: Add remote reload stats Add remote reload stats to hold the history of actions performed due to devlink reload commands initiated by a remote host. One example is a firmware activation with reset that finished successfully but was initiated by a remote host. The function devlink_remote_reload_actions_performed() is exported so drivers can report remote reload actions that were performed but not initiated by their own devlink instance. Expose devlink remote reload stats to the user through the devlink dev get command.
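As a sketch of the driver side (the foo_ name and the trigger are hypothetical; only the exported helper comes from this patch), a driver noticing that the peer host activated firmware would record it as:

static void foo_peer_fw_activated(struct devlink *devlink)
{
	/* count a fw_activate that this host did not initiate */
	devlink_remote_reload_actions_performed(devlink,
						DEVLINK_RELOAD_LIMIT_UNSPEC,
						BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE));
}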
Examples: $ devlink dev show pci/0000:82:00.0: stats: reload: driver_reinit 2 fw_activate 1 fw_activate_no_reset 0 remote_reload: driver_reinit 0 fw_activate 0 fw_activate_no_reset 0 pci/0000:82:00.1: stats: reload: driver_reinit 1 fw_activate 0 fw_activate_no_reset 0 remote_reload: driver_reinit 1 fw_activate 1 fw_activate_no_reset 0 $ devlink dev show -jp { "dev": { "pci/0000:82:00.0": { "stats": { "reload": { "driver_reinit": 2, "fw_activate": 1, "fw_activate_no_reset": 0 }, "remote_reload": { "driver_reinit": 0, "fw_activate": 0, "fw_activate_no_reset": 0 } } }, "pci/0000:82:00.1": { "stats": { "reload": { "driver_reinit": 1, "fw_activate": 0, "fw_activate_no_reset": 0 }, "remote_reload": { "driver_reinit": 1, "fw_activate": 1, "fw_activate_no_reset": 0 } } } } } Signed-off-by: Moshe Shemesh Reviewed-by: Jakub Kicinski Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 4 +++ include/uapi/linux/devlink.h | 1 + net/core/devlink.c | 60 +++++++++++++++++++++++++++++++++++++++----- 3 files changed, 59 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/devlink.h b/include/net/devlink.h index d091c6ba82ce..d2771e57a278 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -25,6 +25,7 @@ struct devlink_dev_stats { u32 reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; + u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; }; struct devlink_ops; @@ -1567,6 +1568,9 @@ void devlink_health_reporter_recovery_done(struct devlink_health_reporter *reporter); bool devlink_is_reload_failed(const struct devlink *devlink); +void devlink_remote_reload_actions_performed(struct devlink *devlink, + enum devlink_reload_limit limit, + u32 actions_performed); void devlink_flash_update_begin_notify(struct devlink *devlink); void devlink_flash_update_end_notify(struct devlink *devlink); diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index ab15fc597b74..0113bc4db9f5 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -525,6 +525,7 @@ enum devlink_attr { DEVLINK_ATTR_RELOAD_STATS_ENTRY, /* nested */ DEVLINK_ATTR_RELOAD_STATS_LIMIT, /* u8 */ DEVLINK_ATTR_RELOAD_STATS_VALUE, /* u32 */ + DEVLINK_ATTR_REMOTE_RELOAD_STATS, /* nested */ /* add new attributes above here, update the policy in devlink.c */ diff --git a/net/core/devlink.c b/net/core/devlink.c index a167c3bb468c..dd889334fed9 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -538,28 +538,39 @@ nla_put_failure: return -EMSGSIZE; } -static int devlink_reload_stats_put(struct sk_buff *msg, struct devlink *devlink) +static int devlink_reload_stats_put(struct sk_buff *msg, struct devlink *devlink, bool is_remote) { struct nlattr *reload_stats_attr; int i, j, stat_idx; u32 value; - reload_stats_attr = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_STATS); + if (!is_remote) + reload_stats_attr = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_STATS); + else + reload_stats_attr = nla_nest_start(msg, DEVLINK_ATTR_REMOTE_RELOAD_STATS); if (!reload_stats_attr) return -EMSGSIZE; for (j = 0; j <= DEVLINK_RELOAD_LIMIT_MAX; j++) { - if (j != DEVLINK_RELOAD_LIMIT_UNSPEC && + /* Remote stats are shown even if not locally supported. Stats + * of actions with unspecified limit are shown though drivers + * don't need to register unspecified limit. 
+ */ + if (!is_remote && j != DEVLINK_RELOAD_LIMIT_UNSPEC && !devlink_reload_limit_is_supported(devlink, j)) continue; for (i = 0; i <= DEVLINK_RELOAD_ACTION_MAX; i++) { - if (!devlink_reload_action_is_supported(devlink, i) || + if ((!is_remote && !devlink_reload_action_is_supported(devlink, i)) || + i == DEVLINK_RELOAD_ACTION_UNSPEC || devlink_reload_combination_is_invalid(i, j)) continue; stat_idx = j * __DEVLINK_RELOAD_ACTION_MAX + i; - value = devlink->stats.reload_stats[stat_idx]; + if (!is_remote) + value = devlink->stats.reload_stats[stat_idx]; + else + value = devlink->stats.remote_reload_stats[stat_idx]; if (devlink_reload_stat_put(msg, i, j, value)) goto nla_put_failure; } @@ -592,7 +603,9 @@ static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink, if (!dev_stats) goto nla_put_failure; - if (devlink_reload_stats_put(msg, devlink)) + if (devlink_reload_stats_put(msg, devlink, false)) + goto dev_stats_nest_cancel; + if (devlink_reload_stats_put(msg, devlink, true)) goto dev_stats_nest_cancel; nla_nest_end(msg, dev_stats); @@ -3110,15 +3123,47 @@ devlink_reload_stats_update(struct devlink *devlink, enum devlink_reload_limit l actions_performed); } +/** + * devlink_remote_reload_actions_performed - Update devlink on reload actions + * performed which are not a direct result of devlink reload call. + * + * This should be called by a driver after performing reload actions in case it was not + * a result of devlink reload call. For example fw_activate was performed as a result + * of devlink reload triggered fw_activate on another host. + * The motivation for this function is to keep data on reload actions performed on this + * function whether it was done due to direct devlink reload call or not. + * + * @devlink: devlink + * @limit: reload limit + * @actions_performed: bitmask of actions performed + */ +void devlink_remote_reload_actions_performed(struct devlink *devlink, + enum devlink_reload_limit limit, + u32 actions_performed) +{ + if (WARN_ON(!actions_performed || + actions_performed & BIT(DEVLINK_RELOAD_ACTION_UNSPEC) || + actions_performed >= BIT(__DEVLINK_RELOAD_ACTION_MAX) || + limit > DEVLINK_RELOAD_LIMIT_MAX)) + return; + + __devlink_reload_stats_update(devlink, devlink->stats.remote_reload_stats, limit, + actions_performed); +} +EXPORT_SYMBOL_GPL(devlink_remote_reload_actions_performed); + static int devlink_reload(struct devlink *devlink, struct net *dest_net, enum devlink_reload_action action, enum devlink_reload_limit limit, u32 *actions_performed, struct netlink_ext_ack *extack) { + u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; int err; if (!devlink->reload_enabled) return -EOPNOTSUPP; + memcpy(remote_reload_stats, devlink->stats.remote_reload_stats, + sizeof(remote_reload_stats)); err = devlink->ops->reload_down(devlink, !!dest_net, action, limit, extack); if (err) return err; @@ -3132,6 +3177,9 @@ static int devlink_reload(struct devlink *devlink, struct net *dest_net, return err; WARN_ON(!(*actions_performed & BIT(action))); + /* Catch driver on updating the remote action within devlink reload */ + WARN_ON(memcmp(remote_reload_stats, devlink->stats.remote_reload_stats, + sizeof(remote_reload_stats))); devlink_reload_stats_update(devlink, limit, *actions_performed); return 0; } -- cgit v1.2.3 From 44f3625bc61653ea3bde9960298faf2f5518fda5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 8 Oct 2020 12:45:17 +0200 Subject: netlink: export policy in extended ACK Add a new attribute NLMSGERR_ATTR_POLICY to the extended ACK to advertise 
the policy, e.g. if an attribute was out of range, you'll know the range that's permissible. Add new NL_SET_ERR_MSG_ATTR_POL() and NL_SET_BAD_ATTR_POLICY() macros to set this, since realistically it's only useful to do this when the bad attribute (offset) is also returned. Use it in lib/nlattr.c which practically does all the policy validation. v2: - add and use netlink_policy_dump_attr_size_estimate() v3: - remove redundant break v4: - really remove redundant break ... sorry Reviewed-by: Jakub Kicinski Signed-off-by: Johannes Berg Signed-off-by: Jakub Kicinski --- include/linux/netlink.h | 30 ++++++++++++++-------- include/net/netlink.h | 4 +++ include/uapi/linux/netlink.h | 2 ++ lib/nlattr.c | 35 +++++++++++++------------ net/netlink/af_netlink.c | 5 ++++ net/netlink/policy.c | 61 ++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 110 insertions(+), 27 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index e3e49f0e5c13..666cd0390699 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -68,12 +68,14 @@ netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) * @_msg: message string to report - don't access directly, use * %NL_SET_ERR_MSG * @bad_attr: attribute with error + * @policy: policy for a bad attribute * @cookie: cookie data to return to userspace (for success) * @cookie_len: actual cookie data length */ struct netlink_ext_ack { const char *_msg; const struct nlattr *bad_attr; + const struct nla_policy *policy; u8 cookie[NETLINK_MAX_COOKIE_LEN]; u8 cookie_len; }; @@ -95,21 +97,29 @@ struct netlink_ext_ack { #define NL_SET_ERR_MSG_MOD(extack, msg) \ NL_SET_ERR_MSG((extack), KBUILD_MODNAME ": " msg) -#define NL_SET_BAD_ATTR(extack, attr) do { \ - if ((extack)) \ +#define NL_SET_BAD_ATTR_POLICY(extack, attr, pol) do { \ + if ((extack)) { \ (extack)->bad_attr = (attr); \ + (extack)->policy = (pol); \ + } \ } while (0) -#define NL_SET_ERR_MSG_ATTR(extack, attr, msg) do { \ - static const char __msg[] = msg; \ - struct netlink_ext_ack *__extack = (extack); \ - \ - if (__extack) { \ - __extack->_msg = __msg; \ - __extack->bad_attr = (attr); \ - } \ +#define NL_SET_BAD_ATTR(extack, attr) NL_SET_BAD_ATTR_POLICY(extack, attr, NULL) + +#define NL_SET_ERR_MSG_ATTR_POL(extack, attr, pol, msg) do { \ + static const char __msg[] = msg; \ + struct netlink_ext_ack *__extack = (extack); \ + \ + if (__extack) { \ + __extack->_msg = __msg; \ + __extack->bad_attr = (attr); \ + __extack->policy = (pol); \ + } \ } while (0) +#define NL_SET_ERR_MSG_ATTR(extack, attr, msg) \ + NL_SET_ERR_MSG_ATTR_POL(extack, attr, NULL, msg) + static inline void nl_set_extack_cookie_u64(struct netlink_ext_ack *extack, u64 cookie) { diff --git a/include/net/netlink.h b/include/net/netlink.h index 2b9e41075f19..7356f41d23ba 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -1957,6 +1957,10 @@ int netlink_policy_dump_get_policy_idx(struct netlink_policy_dump_state *state, bool netlink_policy_dump_loop(struct netlink_policy_dump_state *state); int netlink_policy_dump_write(struct sk_buff *skb, struct netlink_policy_dump_state *state); +int netlink_policy_dump_attr_size_estimate(const struct nla_policy *pt); +int netlink_policy_dump_write_attr(struct sk_buff *skb, + const struct nla_policy *pt, + int nestattr); void netlink_policy_dump_free(struct netlink_policy_dump_state *state); #endif diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index d02e472ba54c..c3816ff7bfc3 100644
--- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -129,6 +129,7 @@ struct nlmsgerr { * @NLMSGERR_ATTR_COOKIE: arbitrary subsystem specific cookie to * be used - in the success case - to identify a created * object or operation or similar (binary) + * @NLMSGERR_ATTR_POLICY: policy for a rejected attribute * @__NLMSGERR_ATTR_MAX: number of attributes * @NLMSGERR_ATTR_MAX: highest attribute number */ @@ -137,6 +138,7 @@ enum nlmsgerr_attrs { NLMSGERR_ATTR_MSG, NLMSGERR_ATTR_OFFS, NLMSGERR_ATTR_COOKIE, + NLMSGERR_ATTR_POLICY, __NLMSGERR_ATTR_MAX, NLMSGERR_ATTR_MAX = __NLMSGERR_ATTR_MAX - 1 diff --git a/lib/nlattr.c b/lib/nlattr.c index 9c99f5daa4d2..74019c8ebf6b 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -96,8 +96,8 @@ static int nla_validate_array(const struct nlattr *head, int len, int maxtype, continue; if (nla_len(entry) < NLA_HDRLEN) { - NL_SET_ERR_MSG_ATTR(extack, entry, - "Array element too short"); + NL_SET_ERR_MSG_ATTR_POL(extack, entry, policy, + "Array element too short"); return -ERANGE; } @@ -195,8 +195,8 @@ static int nla_validate_range_unsigned(const struct nla_policy *pt, pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n", current->comm, pt->type); if (validate & NL_VALIDATE_STRICT_ATTRS) { - NL_SET_ERR_MSG_ATTR(extack, nla, - "invalid attribute length"); + NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, + "invalid attribute length"); return -EINVAL; } @@ -208,11 +208,11 @@ static int nla_validate_range_unsigned(const struct nla_policy *pt, bool binary = pt->type == NLA_BINARY; if (binary) - NL_SET_ERR_MSG_ATTR(extack, nla, - "binary attribute size out of range"); + NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, + "binary attribute size out of range"); else - NL_SET_ERR_MSG_ATTR(extack, nla, - "integer out of range"); + NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, + "integer out of range"); return -ERANGE; } @@ -291,8 +291,8 @@ static int nla_validate_int_range_signed(const struct nla_policy *pt, nla_get_range_signed(pt, &range); if (value < range.min || value > range.max) { - NL_SET_ERR_MSG_ATTR(extack, nla, - "integer out of range"); + NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, + "integer out of range"); return -ERANGE; } @@ -377,8 +377,8 @@ static int validate_nla(const struct nlattr *nla, int maxtype, pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n", current->comm, type); if (validate & NL_VALIDATE_STRICT_ATTRS) { - NL_SET_ERR_MSG_ATTR(extack, nla, - "invalid attribute length"); + NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, + "invalid attribute length"); return -EINVAL; } } @@ -386,14 +386,14 @@ static int validate_nla(const struct nlattr *nla, int maxtype, if (validate & NL_VALIDATE_NESTED) { if ((pt->type == NLA_NESTED || pt->type == NLA_NESTED_ARRAY) && !(nla->nla_type & NLA_F_NESTED)) { - NL_SET_ERR_MSG_ATTR(extack, nla, - "NLA_F_NESTED is missing"); + NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, + "NLA_F_NESTED is missing"); return -EINVAL; } if (pt->type != NLA_NESTED && pt->type != NLA_NESTED_ARRAY && pt->type != NLA_UNSPEC && (nla->nla_type & NLA_F_NESTED)) { - NL_SET_ERR_MSG_ATTR(extack, nla, - "NLA_F_NESTED not expected"); + NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, + "NLA_F_NESTED not expected"); return -EINVAL; } } @@ -550,7 +550,8 @@ static int validate_nla(const struct nlattr *nla, int maxtype, return 0; out_err: - NL_SET_ERR_MSG_ATTR(extack, nla, "Attribute failed policy validation"); + NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, + "Attribute failed policy validation"); return err; } diff --git 
a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index df675a8e1918..daca50d6bb12 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2420,6 +2420,8 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, tlvlen += nla_total_size(sizeof(u32)); if (nlk_has_extack && extack && extack->cookie_len) tlvlen += nla_total_size(extack->cookie_len); + if (err && nlk_has_extack && extack && extack->policy) + tlvlen += netlink_policy_dump_attr_size_estimate(extack->policy); if (tlvlen) flags |= NLM_F_ACK_TLVS; @@ -2452,6 +2454,9 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, if (extack->cookie_len) WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE, extack->cookie_len, extack->cookie)); + if (extack->policy) + netlink_policy_dump_write_attr(skb, extack->policy, + NLMSGERR_ATTR_POLICY); } nlmsg_end(skb, rep); diff --git a/net/netlink/policy.c b/net/netlink/policy.c index 4383436759e2..8d7c900e27f4 100644 --- a/net/netlink/policy.c +++ b/net/netlink/policy.c @@ -196,12 +196,54 @@ bool netlink_policy_dump_loop(struct netlink_policy_dump_state *state) return !netlink_policy_dump_finished(state); } +int netlink_policy_dump_attr_size_estimate(const struct nla_policy *pt) +{ + /* nested + type */ + int common = 2 * nla_attr_size(sizeof(u32)); + + switch (pt->type) { + case NLA_UNSPEC: + case NLA_REJECT: + /* these actually don't need any space */ + return 0; + case NLA_NESTED: + case NLA_NESTED_ARRAY: + /* common, policy idx, policy maxattr */ + return common + 2 * nla_attr_size(sizeof(u32)); + case NLA_U8: + case NLA_U16: + case NLA_U32: + case NLA_U64: + case NLA_MSECS: + case NLA_S8: + case NLA_S16: + case NLA_S32: + case NLA_S64: + /* maximum is common, u64 min/max with padding */ + return common + + 2 * (nla_attr_size(0) + nla_attr_size(sizeof(u64))); + case NLA_BITFIELD32: + return common + nla_attr_size(sizeof(u32)); + case NLA_STRING: + case NLA_NUL_STRING: + case NLA_BINARY: + /* maximum is common, u32 min-length/max-length */ + return common + 2 * nla_attr_size(sizeof(u32)); + case NLA_FLAG: + return common; + } + + /* this should then cause a warning later */ + return 0; +} + static int __netlink_policy_dump_write_attr(struct netlink_policy_dump_state *state, struct sk_buff *skb, const struct nla_policy *pt, int nestattr) { + int estimate = netlink_policy_dump_attr_size_estimate(pt); enum netlink_attribute_type type; struct nlattr *attr; @@ -334,12 +376,31 @@ __netlink_policy_dump_write_attr(struct netlink_policy_dump_state *state, goto nla_put_failure; nla_nest_end(skb, attr); + WARN_ON(attr->nla_len > estimate); + return 0; nla_put_failure: nla_nest_cancel(skb, attr); return -ENOBUFS; } +/** + * netlink_policy_dump_write_attr - write a given attribute policy + * @skb: the message skb to write to + * @pt: the attribute's policy + * @nestattr: the nested attribute ID to use + * + * Returns: 0 on success, an error code otherwise; -%ENODATA is + * special, indicating that there's no policy data and + * the attribute is generally rejected. 
+ */ +int netlink_policy_dump_write_attr(struct sk_buff *skb, + const struct nla_policy *pt, + int nestattr) +{ + return __netlink_policy_dump_write_attr(NULL, skb, pt, nestattr); +} + /** * netlink_policy_dump_write - write current policy dump attributes * @skb: the message skb to write to -- cgit v1.2.3 From dd2ce6a5373c6f5c830be54be10775458a8bd312 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Oct 2020 01:40:01 +0200 Subject: bpf: Improve bpf_redirect_neigh helper description Follow-up to address David's feedback that we should better describe internals of the bpf_redirect_neigh() helper. Suggested-by: David Ahern Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Reviewed-by: David Ahern Link: https://lore.kernel.org/bpf/20201010234006.7075-2-daniel@iogearbox.net --- include/uapi/linux/bpf.h | 10 +++++++--- tools/include/uapi/linux/bpf.h | 10 +++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 42d2df799397..4272cc53d478 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3679,10 +3679,14 @@ union bpf_attr { * Redirect the packet to another net device of index *ifindex* * and fill in L2 addresses from neighboring subsystem. This helper * is somewhat similar to **bpf_redirect**\ (), except that it - * fills in e.g. MAC addresses based on the L3 information from - * the packet. This helper is supported for IPv4 and IPv6 protocols. + * populates L2 addresses as well, meaning, internally, the helper + * performs a FIB lookup based on the skb's networking header to + * get the address of the next hop and then relies on the neighbor + * lookup for the L2 address of the nexthop. + * * The *flags* argument is reserved and must be 0. The helper is - * currently only supported for tc BPF program types. + * currently only supported for tc BPF program types, and enabled + * for IPv4 and IPv6 protocols. * Return * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 42d2df799397..4272cc53d478 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3679,10 +3679,14 @@ union bpf_attr { * Redirect the packet to another net device of index *ifindex* * and fill in L2 addresses from neighboring subsystem. This helper * is somewhat similar to **bpf_redirect**\ (), except that it - * fills in e.g. MAC addresses based on the L3 information from - * the packet. This helper is supported for IPv4 and IPv6 protocols. + * populates L2 addresses as well, meaning, internally, the helper + * performs a FIB lookup based on the skb's networking header to + * get the address of the next hop and then relies on the neighbor + * lookup for the L2 address of the nexthop. + * * The *flags* argument is reserved and must be 0. The helper is - * currently only supported for tc BPF program types. + * currently only supported for tc BPF program types, and enabled + * for IPv4 and IPv6 protocols. * Return * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. 
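(Editorial aside, not part of the patch above: a minimal tc program exercising bpf_redirect_neigh() as described. The section name, target ifindex and license string are our assumptions, and libbpf's bpf_helpers.h is assumed for SEC() and the helper declaration.)

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	#define TARGET_IFINDEX 4	/* hypothetical egress device */

	SEC("classifier")
	int redirect_via_neigh(struct __sk_buff *skb)
	{
		/* FIB lookup + neighbor resolution happen inside the
		 * helper; flags must be 0 per the description above. */
		return bpf_redirect_neigh(TARGET_IFINDEX, 0);
	}

	char _license[] SEC("license") = "GPL";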
-- cgit v1.2.3 From 9aa1206e8f48222f35a0c809f33b2f4aaa1e2661 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Oct 2020 01:40:02 +0200 Subject: bpf: Add redirect_peer helper Add an efficient ingress to ingress netns switch that can be used out of tc BPF programs in order to redirect traffic from host ns ingress into a container veth device ingress without having to go via CPU backlog queue [0]. For local containers this can also be utilized and path via CPU backlog queue only needs to be taken once, not twice. On a high level this borrows from ipvlan which does similar switch in __netif_receive_skb_core() and then iterates via another_round. This helps to reduce latency for mentioned use cases. Pod to remote pod with redirect(), TCP_RR [1]: # percpu_netperf 10.217.1.33 RT_LATENCY: 122.450 (per CPU: 122.666 122.401 122.333 122.401 ) MEAN_LATENCY: 121.210 (per CPU: 121.100 121.260 121.320 121.160 ) STDDEV_LATENCY: 120.040 (per CPU: 119.420 119.910 125.460 115.370 ) MIN_LATENCY: 46.500 (per CPU: 47.000 47.000 47.000 45.000 ) P50_LATENCY: 118.500 (per CPU: 118.000 119.000 118.000 119.000 ) P90_LATENCY: 127.500 (per CPU: 127.000 128.000 127.000 128.000 ) P99_LATENCY: 130.750 (per CPU: 131.000 131.000 129.000 132.000 ) TRANSACTION_RATE: 32666.400 (per CPU: 8152.200 8169.842 8174.439 8169.897 ) Pod to remote pod with redirect_peer(), TCP_RR: # percpu_netperf 10.217.1.33 RT_LATENCY: 44.449 (per CPU: 43.767 43.127 45.279 45.622 ) MEAN_LATENCY: 45.065 (per CPU: 44.030 45.530 45.190 45.510 ) STDDEV_LATENCY: 84.823 (per CPU: 66.770 97.290 84.380 90.850 ) MIN_LATENCY: 33.500 (per CPU: 33.000 33.000 34.000 34.000 ) P50_LATENCY: 43.250 (per CPU: 43.000 43.000 43.000 44.000 ) P90_LATENCY: 46.750 (per CPU: 46.000 47.000 47.000 47.000 ) P99_LATENCY: 52.750 (per CPU: 51.000 54.000 53.000 53.000 ) TRANSACTION_RATE: 90039.500 (per CPU: 22848.186 23187.089 22085.077 21919.130 ) [0] https://linuxplumbersconf.org/event/7/contributions/674/attachments/568/1002/plumbers_2020_cilium_load_balancer.pdf [1] https://github.com/borkmann/netperf_scripts/blob/master/percpu_netperf Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201010234006.7075-3-daniel@iogearbox.net --- drivers/net/veth.c | 9 +++++++ include/linux/netdevice.h | 4 ++++ include/uapi/linux/bpf.h | 17 +++++++++++++ net/core/dev.c | 15 +++++++++--- net/core/filter.c | 54 ++++++++++++++++++++++++++++++++++++------ tools/include/uapi/linux/bpf.h | 17 +++++++++++++ 6 files changed, 106 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 091e5b4ba042..8c737668008a 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -420,6 +420,14 @@ static int veth_select_rxq(struct net_device *dev) return smp_processor_id() % dev->real_num_rx_queues; } +static struct net_device *veth_peer_dev(struct net_device *dev) +{ + struct veth_priv *priv = netdev_priv(dev); + + /* Callers must be under RCU read side. 
*/ + return rcu_dereference(priv->peer); +} + static int veth_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags, bool ndo_xmit) @@ -1224,6 +1232,7 @@ static const struct net_device_ops veth_netdev_ops = { .ndo_set_rx_headroom = veth_set_rx_headroom, .ndo_bpf = veth_xdp, .ndo_xdp_xmit = veth_ndo_xdp_xmit, + .ndo_get_peer_dev = veth_peer_dev, }; #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 28cfa53daf72..0533f86018dd 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1277,6 +1277,9 @@ struct netdev_net_notifier { * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, * int cmd); * Add, change, delete or get information on an IPv4 tunnel. + * struct net_device *(*ndo_get_peer_dev)(struct net_device *dev); + * If a device is paired with a peer device, return the peer instance. + * The caller must be under RCU read context. */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1484,6 +1487,7 @@ struct net_device_ops { struct devlink_port * (*ndo_get_devlink_port)(struct net_device *dev); int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); + struct net_device * (*ndo_get_peer_dev)(struct net_device *dev); }; /** diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4272cc53d478..b97bc5abb3b8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3719,6 +3719,22 @@ union bpf_attr { * never return NULL. * Return * A pointer pointing to the kernel percpu variable on this cpu. + * + * long bpf_redirect_peer(u32 ifindex, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex*. + * This helper is somewhat similar to **bpf_redirect**\ (), except + * that the redirection happens to the *ifindex*' peer device and + * the netns switch takes place from ingress to ingress without + * going through the CPU's backlog queue. + * + * The *flags* argument is reserved and must be 0. The helper is + * currently only supported for tc BPF program types at the ingress + * hook and for veth device types. The peer device must reside in a + * different network namespace. + * Return + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. 
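 *
 *		(Editorial sketch, not part of this patch: the helper as
 *		it might be invoked from a tc program attached at the
 *		ingress of the host-side veth; the ifindex 4 below is an
 *		assumption.)
 *
 *		SEC("classifier")
 *		int host_to_container(struct __sk_buff *skb)
 *		{
 *			return bpf_redirect_peer(4, 0);
 *		}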
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3876,6 +3892,7 @@ union bpf_attr { FN(redirect_neigh), \ FN(bpf_per_cpu_ptr), \ FN(bpf_this_cpu_ptr), \ + FN(redirect_peer), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/net/core/dev.c b/net/core/dev.c index 9d55bf5d1a65..7dd015823593 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4930,7 +4930,7 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); static inline struct sk_buff * sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, - struct net_device *orig_dev) + struct net_device *orig_dev, bool *another) { #ifdef CONFIG_NET_CLS_ACT struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress); @@ -4974,7 +4974,11 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, * redirecting to another netdev */ __skb_push(skb, skb->mac_len); - skb_do_redirect(skb); + if (skb_do_redirect(skb) == -EAGAIN) { + __skb_pull(skb, skb->mac_len); + *another = true; + break; + } return NULL; case TC_ACT_CONSUMED: return NULL; @@ -5163,7 +5167,12 @@ another_round: skip_taps: #ifdef CONFIG_NET_INGRESS if (static_branch_unlikely(&ingress_needed_key)) { - skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev); + bool another = false; + + skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev, + &another); + if (another) + goto another_round; if (!skb) goto out; diff --git a/net/core/filter.c b/net/core/filter.c index 5da44b11e1ec..fab951c6be57 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2380,8 +2380,9 @@ out: /* Internal, non-exposed redirect flags. */ enum { - BPF_F_NEIGH = (1ULL << 1), -#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH) + BPF_F_NEIGH = (1ULL << 1), + BPF_F_PEER = (1ULL << 2), +#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER) }; BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) @@ -2430,19 +2431,35 @@ EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info); int skb_do_redirect(struct sk_buff *skb) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct net *net = dev_net(skb->dev); struct net_device *dev; u32 flags = ri->flags; - dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index); + dev = dev_get_by_index_rcu(net, ri->tgt_index); ri->tgt_index = 0; - if (unlikely(!dev)) { - kfree_skb(skb); - return -EINVAL; + ri->flags = 0; + if (unlikely(!dev)) + goto out_drop; + if (flags & BPF_F_PEER) { + const struct net_device_ops *ops = dev->netdev_ops; + + if (unlikely(!ops->ndo_get_peer_dev || + !skb_at_tc_ingress(skb))) + goto out_drop; + dev = ops->ndo_get_peer_dev(dev); + if (unlikely(!dev || + !is_skb_forwardable(dev, skb) || + net_eq(net, dev_net(dev)))) + goto out_drop; + skb->dev = dev; + return -EAGAIN; } - return flags & BPF_F_NEIGH ? 
__bpf_redirect_neigh(skb, dev) : __bpf_redirect(skb, dev, flags); +out_drop: + kfree_skb(skb); + return -EINVAL; } BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) @@ -2466,6 +2483,27 @@ static const struct bpf_func_proto bpf_redirect_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + if (unlikely(flags)) + return TC_ACT_SHOT; + + ri->flags = BPF_F_PEER; + ri->tgt_index = ifindex; + + return TC_ACT_REDIRECT; +} + +static const struct bpf_func_proto bpf_redirect_peer_proto = { + .func = bpf_redirect_peer, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); @@ -7053,6 +7091,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_redirect_proto; case BPF_FUNC_redirect_neigh: return &bpf_redirect_neigh_proto; + case BPF_FUNC_redirect_peer: + return &bpf_redirect_peer_proto; case BPF_FUNC_get_route_realm: return &bpf_get_route_realm_proto; case BPF_FUNC_get_hash_recalc: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 4272cc53d478..b97bc5abb3b8 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3719,6 +3719,22 @@ union bpf_attr { * never return NULL. * Return * A pointer pointing to the kernel percpu variable on this cpu. + * + * long bpf_redirect_peer(u32 ifindex, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex*. + * This helper is somewhat similar to **bpf_redirect**\ (), except + * that the redirection happens to the *ifindex*' peer device and + * the netns switch takes place from ingress to ingress without + * going through the CPU's backlog queue. + * + * The *flags* argument is reserved and must be 0. The helper is + * currently only supported for tc BPF program types at the ingress + * hook and for veth device types. The peer device must reside in a + * different network namespace. + * Return + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3876,6 +3892,7 @@ union bpf_attr { FN(redirect_neigh), \ FN(bpf_per_cpu_ptr), \ FN(bpf_this_cpu_ptr), \ + FN(redirect_peer), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 4a8f87e60f6db40e640f1db555d063b2c4dea5f1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Oct 2020 01:40:03 +0200 Subject: bpf: Allow for map-in-map with dynamic inner array map entries Recent work in f4d05259213f ("bpf: Add map_meta_equal map ops") and 134fede4eecf ("bpf: Relax max_entries check for most of the inner map types") added support for dynamic inner max elements for most map-in-map types. Exceptions were maps like array or prog array where the map_gen_lookup() callback uses the maps' max_entries field as a constant when emitting instructions. We recently implemented Maglev consistent hashing into Cilium's load balancer which uses map-in-map with an outer map being hash and inner being array holding the Maglev backend table for each service. This has been designed this way in order to reduce overall memory consumption given the outer hash map allows to avoid preallocating a large, flat memory area for all services. 
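(Editorial sketch of the user-space side, not from this patch — the function name and key/value sizes are ours, error handling elided. With BPF_F_INNER_MAP set, max_entries can differ per service:)

	#include <linux/bpf.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int create_inner_array(unsigned int nr_backends)
	{
		union bpf_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.map_type    = BPF_MAP_TYPE_ARRAY;
		attr.key_size    = sizeof(__u32);
		attr.value_size  = sizeof(__u32);
		attr.max_entries = nr_backends;	/* per-service size */
		attr.map_flags   = BPF_F_INNER_MAP;

		return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
	}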
Also, the number of service mappings is not always known a-priori. The use case for dynamic inner array map entries is to further reduce memory overhead, for example, some services might just have a small number of back ends while others could have a large number. Right now the Maglev backend table for small and large number of backends would need to have the same inner array map entries which adds a lot of unneeded overhead. Dynamic inner array map entries can be realized by avoiding the inlined code generation for their lookup. The lookup will still be efficient since it will be calling into array_map_lookup_elem() directly and thus avoiding retpoline. The patch adds a BPF_F_INNER_MAP flag to map creation which therefore skips inline code generation and relaxes array_map_meta_equal() check to ignore both maps' max_entries. This also still allows to have faster lookups for map-in-map when BPF_F_INNER_MAP is not specified and hence dynamic max_entries not needed. Example code generation where inner map is dynamic sized array: # bpftool p d x i 125 int handle__sys_enter(void * ctx): ; int handle__sys_enter(void *ctx) 0: (b4) w1 = 0 ; int key = 0; 1: (63) *(u32 *)(r10 -4) = r1 2: (bf) r2 = r10 ; 3: (07) r2 += -4 ; inner_map = bpf_map_lookup_elem(&outer_arr_dyn, &key); 4: (18) r1 = map[id:468] 6: (07) r1 += 272 7: (61) r0 = *(u32 *)(r2 +0) 8: (35) if r0 >= 0x3 goto pc+5 9: (67) r0 <<= 3 10: (0f) r0 += r1 11: (79) r0 = *(u64 *)(r0 +0) 12: (15) if r0 == 0x0 goto pc+1 13: (05) goto pc+1 14: (b7) r0 = 0 15: (b4) w6 = -1 ; if (!inner_map) 16: (15) if r0 == 0x0 goto pc+6 17: (bf) r2 = r10 ; 18: (07) r2 += -4 ; val = bpf_map_lookup_elem(inner_map, &key); 19: (bf) r1 = r0 | No inlining but instead 20: (85) call array_map_lookup_elem#149280 | call to array_map_lookup_elem() ; return val ? *val : -1; | for inner array lookup. 21: (15) if r0 == 0x0 goto pc+1 ; return val ? *val : -1; 22: (61) r6 = *(u32 *)(r0 +0) ; } 23: (bc) w0 = w6 24: (95) exit Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201010234006.7075-4-daniel@iogearbox.net --- include/linux/bpf.h | 2 +- include/uapi/linux/bpf.h | 3 +++ kernel/bpf/arraymap.c | 17 +++++++++++------ kernel/bpf/hashtab.c | 6 +++--- kernel/bpf/verifier.c | 6 ++++-- net/xdp/xskmap.c | 2 +- tools/include/uapi/linux/bpf.h | 3 +++ 7 files changed, 26 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index dc63eeed4fd9..2b16bf48aab6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -82,7 +82,7 @@ struct bpf_map_ops { void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, int fd); void (*map_fd_put_ptr)(void *ptr); - u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); + int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, struct seq_file *m); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b97bc5abb3b8..bf5a99d803e4 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -435,6 +435,9 @@ enum { /* Share perf_event among processes */ BPF_F_PRESERVE_ELEMS = (1U << 11), + +/* Create a map that is suitable to be an inner map with dynamic max entries */ + BPF_F_INNER_MAP = (1U << 12), }; /* Flags for BPF_PROG_QUERY. 
*/ diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index bd777dd6f967..c6c81eceb68f 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -16,7 +16,7 @@ #define ARRAY_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK | \ - BPF_F_PRESERVE_ELEMS) + BPF_F_PRESERVE_ELEMS | BPF_F_INNER_MAP) static void bpf_array_free_percpu(struct bpf_array *array) { @@ -62,7 +62,7 @@ int array_map_alloc_check(union bpf_attr *attr) return -EINVAL; if (attr->map_type != BPF_MAP_TYPE_ARRAY && - attr->map_flags & BPF_F_MMAPABLE) + attr->map_flags & (BPF_F_MMAPABLE | BPF_F_INNER_MAP)) return -EINVAL; if (attr->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY && @@ -214,7 +214,7 @@ static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm, } /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ -static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +static int array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_insn *insn = insn_buf; @@ -223,6 +223,9 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) const int map_ptr = BPF_REG_1; const int index = BPF_REG_2; + if (map->map_flags & BPF_F_INNER_MAP) + return -EOPNOTSUPP; + *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); if (!map->bypass_spec_v1) { @@ -496,8 +499,10 @@ static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) static bool array_map_meta_equal(const struct bpf_map *meta0, const struct bpf_map *meta1) { - return meta0->max_entries == meta1->max_entries && - bpf_map_meta_equal(meta0, meta1); + if (!bpf_map_meta_equal(meta0, meta1)) + return false; + return meta0->map_flags & BPF_F_INNER_MAP ? 
true : + meta0->max_entries == meta1->max_entries; } struct bpf_iter_seq_array_map_info { @@ -1251,7 +1256,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) return READ_ONCE(*inner_map); } -static u32 array_of_map_gen_lookup(struct bpf_map *map, +static int array_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3395cf140d22..1815e97d4c9c 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -612,7 +612,7 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) * bpf_prog * __htab_map_lookup_elem */ -static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; @@ -651,7 +651,7 @@ static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key) return __htab_lru_map_lookup_elem(map, key, false); } -static u32 htab_lru_map_gen_lookup(struct bpf_map *map, +static int htab_lru_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; @@ -2070,7 +2070,7 @@ static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key) return READ_ONCE(*inner_map); } -static u32 htab_of_map_gen_lookup(struct bpf_map *map, +static int htab_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f3e36eade3d4..fa5badc9279a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11049,7 +11049,9 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) if (insn->imm == BPF_FUNC_map_lookup_elem && ops->map_gen_lookup) { cnt = ops->map_gen_lookup(map_ptr, insn_buf); - if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + if (cnt == -EOPNOTSUPP) + goto patch_map_ops_generic; + if (cnt <= 0 || cnt >= ARRAY_SIZE(insn_buf)) { verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } @@ -11079,7 +11081,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) (int (*)(struct bpf_map *map, void *value))NULL)); BUILD_BUG_ON(!__same_type(ops->map_peek_elem, (int (*)(struct bpf_map *map, void *value))NULL)); - +patch_map_ops_generic: switch (insn->imm) { case BPF_FUNC_map_lookup_elem: insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) - diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index 0c5df593bc56..49da2b8ace8b 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -132,7 +132,7 @@ static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2; struct bpf_insn *insn = insn_buf; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b97bc5abb3b8..bf5a99d803e4 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -435,6 +435,9 @@ enum { /* Share perf_event among processes */ BPF_F_PRESERVE_ELEMS = (1U << 11), + +/* Create a map that is suitable to be an inner map with dynamic max entries */ + BPF_F_INNER_MAP = (1U << 12), }; /* Flags for BPF_PROG_QUERY. 
*/ -- cgit v1.2.3 From 60a3815da702fd9e4759945f26cce5c47d3967ad Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 8 Oct 2020 01:14:47 +0200 Subject: netfilter: add inet ingress support This patch adds the NF_INET_INGRESS pseudohook for the NFPROTO_INET family. This is a mapping this new hook to the existing NFPROTO_NETDEV and NF_NETDEV_INGRESS hook. The hook does not guarantee that packets are inet only, users must filter out non-ip traffic explicitly. This infrastructure makes it easier to support this new hook in nf_tables. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter.h | 1 + net/netfilter/core.c | 103 ++++++++++++++++++++++++++++++++--------- 2 files changed, 83 insertions(+), 21 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter.h b/include/uapi/linux/netfilter.h index ca9e63d6e0e4..6a6179af0d7c 100644 --- a/include/uapi/linux/netfilter.h +++ b/include/uapi/linux/netfilter.h @@ -45,6 +45,7 @@ enum nf_inet_hooks { NF_INET_FORWARD, NF_INET_LOCAL_OUT, NF_INET_POST_ROUTING, + NF_INET_INGRESS, NF_INET_NUMHOOKS }; diff --git a/net/netfilter/core.c b/net/netfilter/core.c index c82f779a587e..63d032191e62 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -281,6 +281,16 @@ nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum, if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_bridge) <= hooknum)) return NULL; return net->nf.hooks_bridge + hooknum; +#endif +#ifdef CONFIG_NETFILTER_INGRESS + case NFPROTO_INET: + if (WARN_ON_ONCE(hooknum != NF_INET_INGRESS)) + return NULL; + if (!dev || dev_net(dev) != net) { + WARN_ON_ONCE(1); + return NULL; + } + return &dev->nf_hooks_ingress; #endif case NFPROTO_IPV4: if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_ipv4) <= hooknum)) @@ -311,22 +321,56 @@ nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum, return NULL; } +static int nf_ingress_check(struct net *net, const struct nf_hook_ops *reg, + int hooknum) +{ +#ifndef CONFIG_NETFILTER_INGRESS + if (reg->hooknum == hooknum) + return -EOPNOTSUPP; +#endif + if (reg->hooknum != hooknum || + !reg->dev || dev_net(reg->dev) != net) + return -EINVAL; + + return 0; +} + static inline bool nf_ingress_hook(const struct nf_hook_ops *reg, int pf) { - return pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS; + if ((pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) || + (pf == NFPROTO_INET && reg->hooknum == NF_INET_INGRESS)) + return true; + + return false; } static void nf_static_key_inc(const struct nf_hook_ops *reg, int pf) { #ifdef CONFIG_JUMP_LABEL - static_key_slow_inc(&nf_hooks_needed[pf][reg->hooknum]); + int hooknum; + + if (pf == NFPROTO_INET && reg->hooknum == NF_INET_INGRESS) { + pf = NFPROTO_NETDEV; + hooknum = NF_NETDEV_INGRESS; + } else { + hooknum = reg->hooknum; + } + static_key_slow_inc(&nf_hooks_needed[pf][hooknum]); #endif } static void nf_static_key_dec(const struct nf_hook_ops *reg, int pf) { #ifdef CONFIG_JUMP_LABEL - static_key_slow_dec(&nf_hooks_needed[pf][reg->hooknum]); + int hooknum; + + if (pf == NFPROTO_INET && reg->hooknum == NF_INET_INGRESS) { + pf = NFPROTO_NETDEV; + hooknum = NF_NETDEV_INGRESS; + } else { + hooknum = reg->hooknum; + } + static_key_slow_dec(&nf_hooks_needed[pf][hooknum]); #endif } @@ -335,15 +379,22 @@ static int __nf_register_net_hook(struct net *net, int pf, { struct nf_hook_entries *p, *new_hooks; struct nf_hook_entries __rcu **pp; + int err; - if (pf == NFPROTO_NETDEV) { -#ifndef CONFIG_NETFILTER_INGRESS - if (reg->hooknum == NF_NETDEV_INGRESS) - return 
-EOPNOTSUPP; -#endif - if (reg->hooknum != NF_NETDEV_INGRESS || - !reg->dev || dev_net(reg->dev) != net) - return -EINVAL; + switch (pf) { + case NFPROTO_NETDEV: + err = nf_ingress_check(net, reg, NF_NETDEV_INGRESS); + if (err < 0) + return err; + break; + case NFPROTO_INET: + if (reg->hooknum != NF_INET_INGRESS) + break; + + err = nf_ingress_check(net, reg, NF_INET_INGRESS); + if (err < 0) + return err; + break; } pp = nf_hook_entry_head(net, pf, reg->hooknum, reg->dev); @@ -441,8 +492,12 @@ static void __nf_unregister_net_hook(struct net *net, int pf, void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) { if (reg->pf == NFPROTO_INET) { - __nf_unregister_net_hook(net, NFPROTO_IPV4, reg); - __nf_unregister_net_hook(net, NFPROTO_IPV6, reg); + if (reg->hooknum == NF_INET_INGRESS) { + __nf_unregister_net_hook(net, NFPROTO_INET, reg); + } else { + __nf_unregister_net_hook(net, NFPROTO_IPV4, reg); + __nf_unregister_net_hook(net, NFPROTO_IPV6, reg); + } } else { __nf_unregister_net_hook(net, reg->pf, reg); } @@ -467,14 +522,20 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) int err; if (reg->pf == NFPROTO_INET) { - err = __nf_register_net_hook(net, NFPROTO_IPV4, reg); - if (err < 0) - return err; - - err = __nf_register_net_hook(net, NFPROTO_IPV6, reg); - if (err < 0) { - __nf_unregister_net_hook(net, NFPROTO_IPV4, reg); - return err; + if (reg->hooknum == NF_INET_INGRESS) { + err = __nf_register_net_hook(net, NFPROTO_INET, reg); + if (err < 0) + return err; + } else { + err = __nf_register_net_hook(net, NFPROTO_IPV4, reg); + if (err < 0) + return err; + + err = __nf_register_net_hook(net, NFPROTO_IPV6, reg); + if (err < 0) { + __nf_unregister_net_hook(net, NFPROTO_IPV4, reg); + return err; + } } } else { err = __nf_register_net_hook(net, reg->pf, reg); -- cgit v1.2.3 From ac911bfeb34b5d79fb4e23a08b8db0b89c529b53 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 12 Oct 2020 09:43:53 +0200 Subject: can: isotp: implement cleanups / improvements from review As pointed out by Jakub Kicinski here: http://lore.kernel.org/r/20201009175751.5c54097f@kicinski-fedora-pc1c0hjn.dhcp.thefacebook.com this patch addresses the remarked issues: - remove empty line in comment - remove default=y for CAN_ISOTP in Kconfig - make use of pr_notice_once() - use GFP_ATOMIC instead of gfp_any() in soft hrtimer context The version strings in the CAN subsystem are removed by a separate patch. Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/r/20201012074354.25839-1-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can/isotp.h | 1 - net/can/Kconfig | 3 ++- net/can/isotp.c | 14 +++++++------- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/can/isotp.h b/include/uapi/linux/can/isotp.h index 553006509f4e..7793b26aa154 100644 --- a/include/uapi/linux/can/isotp.h +++ b/include/uapi/linux/can/isotp.h @@ -160,7 +160,6 @@ struct can_isotp_ll_options { * these default settings can be changed via sockopts. * For that reason the STmin value is intentionally _not_ checked for * consistency and copied directly into the flow control (FC) frame. 
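 *
 * (Editorial sketch, not from this patch: minimal user-space usage of
 * a CAN_ISOTP socket. The interface name "can0" and the 0x712/0x77a
 * address pair are assumptions; the kernel segments the write per
 * ISO 15765-2.)
 *
 *	int s = socket(PF_CAN, SOCK_DGRAM, CAN_ISOTP);
 *	struct sockaddr_can addr = { .can_family = AF_CAN };
 *
 *	addr.can_ifindex = if_nametoindex("can0");
 *	addr.can_addr.tp.tx_id = 0x712;
 *	addr.can_addr.tp.rx_id = 0x77a;
 *	bind(s, (struct sockaddr *)&addr, sizeof(addr));
 *	write(s, payload, payload_len);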
- * */ #endif /* !_UAPI_CAN_ISOTP_H */ diff --git a/net/can/Kconfig b/net/can/Kconfig index 021fe03a8ed6..224e5e0283a9 100644 --- a/net/can/Kconfig +++ b/net/can/Kconfig @@ -57,7 +57,6 @@ source "net/can/j1939/Kconfig" config CAN_ISOTP tristate "ISO 15765-2:2016 CAN transport protocol" - default y help CAN Transport Protocols offer support for segmented Point-to-Point communication between CAN nodes via two defined CAN Identifiers. @@ -67,6 +66,8 @@ config CAN_ISOTP vehicle diagnosis (UDS, ISO 14229) or IP-over-CAN traffic. This protocol driver implements data transfers according to ISO 15765-2:2016 for 'classic' CAN and CAN FD frame types. + If you want to perform automotive vehicle diagnostic services (UDS), + say 'y'. source "drivers/net/can/Kconfig" diff --git a/net/can/isotp.c b/net/can/isotp.c index e6ff032b5426..ca63061bb932 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -222,8 +222,8 @@ static int isotp_send_fc(struct sock *sk, int ae, u8 flowstatus) can_send_ret = can_send(nskb, 1); if (can_send_ret) - printk_once(KERN_NOTICE "can-isotp: %s: can_send_ret %d\n", - __func__, can_send_ret); + pr_notice_once("can-isotp: %s: can_send_ret %d\n", + __func__, can_send_ret); dev_put(dev); @@ -769,7 +769,7 @@ static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer) isotp_tx_burst: skb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), - gfp_any()); + GFP_ATOMIC); if (!skb) { dev_put(dev); break; @@ -798,8 +798,8 @@ isotp_tx_burst: can_send_ret = can_send(skb, 1); if (can_send_ret) - printk_once(KERN_NOTICE "can-isotp: %s: can_send_ret %d\n", - __func__, can_send_ret); + pr_notice_once("can-isotp: %s: can_send_ret %d\n", + __func__, can_send_ret); if (so->tx.idx >= so->tx.len) { /* we are done */ @@ -942,8 +942,8 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) err = can_send(skb, 1); dev_put(dev); if (err) { - printk_once(KERN_NOTICE "can-isotp: %s: can_send_ret %d\n", - __func__, err); + pr_notice_once("can-isotp: %s: can_send_ret %d\n", + __func__, err); return err; } -- cgit v1.2.3 From d25e2e9388eda61b6e298585024ee3355f50c493 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 14 Oct 2020 21:34:32 +0200 Subject: netfilter: restore NF_INET_NUMHOOKS This definition is used by the iptables legacy UAPI, restore it. Fixes: d3519cb89f6d ("netfilter: nf_tables: add inet ingress support") Reported-by: Jason A. Donenfeld Tested-by: Jason A. 
Donenfeld Signed-off-by: Pablo Neira Ayuso Signed-off-by: Jakub Kicinski --- include/net/netfilter/nf_tables.h | 4 +++- include/uapi/linux/netfilter.h | 4 ++-- net/netfilter/nf_tables_api.c | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 3965ce18226f..3f7e56b1171e 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -14,6 +14,8 @@ #include #include +#define NFT_MAX_HOOKS (NF_INET_INGRESS + 1) + struct module; #define NFT_JUMP_STACK_SIZE 16 @@ -979,7 +981,7 @@ struct nft_chain_type { int family; struct module *owner; unsigned int hook_mask; - nf_hookfn *hooks[NF_MAX_HOOKS]; + nf_hookfn *hooks[NFT_MAX_HOOKS]; int (*ops_register)(struct net *net, const struct nf_hook_ops *ops); void (*ops_unregister)(struct net *net, const struct nf_hook_ops *ops); }; diff --git a/include/uapi/linux/netfilter.h b/include/uapi/linux/netfilter.h index 6a6179af0d7c..ef9a44286e23 100644 --- a/include/uapi/linux/netfilter.h +++ b/include/uapi/linux/netfilter.h @@ -45,8 +45,8 @@ enum nf_inet_hooks { NF_INET_FORWARD, NF_INET_LOCAL_OUT, NF_INET_POST_ROUTING, - NF_INET_INGRESS, - NF_INET_NUMHOOKS + NF_INET_NUMHOOKS, + NF_INET_INGRESS = NF_INET_NUMHOOKS, }; enum nf_dev_hooks { diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index f22ad21d0230..7f1c184c00d2 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1864,7 +1864,7 @@ static int nft_chain_parse_hook(struct net *net, if (IS_ERR(type)) return PTR_ERR(type); } - if (hook->num > NF_MAX_HOOKS || !(type->hook_mask & (1 << hook->num))) + if (hook->num >= NFT_MAX_HOOKS || !(type->hook_mask & (1 << hook->num))) return -EOPNOTSUPP; if (type->type == NFT_CHAIN_T_NAT && -- cgit v1.2.3 From 346e320cb2103edef709c4466a29140c4a8e527a Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Thu, 15 Oct 2020 18:39:27 +0200 Subject: netfilter: nftables: allow re-computing sctp CRC-32C in 'payload' statements nftables payload statements are used to mangle SCTP headers, but they can only replace the Internet Checksum. As a consequence, nftables rules that mangle sport/dport/vtag in SCTP headers potentially generate packets that are discarded by the receiver, unless the CRC-32C is "offloaded" (e.g the rule mangles a skb having 'ip_summed' equal to 'CHECKSUM_PARTIAL'. Fix this extending uAPI definitions and L4 checksum update function, in a way that userspace programs (e.g. nft) can instruct the kernel to compute CRC-32C in SCTP headers. Also ensure that LIBCRC32C is built if NF_TABLES is 'y' or 'm' in the kernel build configuration. 
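(Editorial aside, condensed from the nft_payload_set_eval() hunk below: the CRC is recomputed only when the rule asked for NFT_PAYLOAD_CSUM_SCTP, the transport protocol really is SCTP, and the checksum is not left to offload via CHECKSUM_PARTIAL, in which case it is finalized later anyway:)

	if (priv->csum_type == NFT_PAYLOAD_CSUM_SCTP &&
	    pkt->tprot == IPPROTO_SCTP &&
	    skb->ip_summed != CHECKSUM_PARTIAL) {
		if (nft_payload_csum_sctp(skb, pkt->xt.thoff))
			goto err;
	}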
Signed-off-by: Davide Caratti Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Jakub Kicinski --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/Kconfig | 1 + net/netfilter/nft_payload.c | 28 ++++++++++++++++++++++++++++ 3 files changed, 31 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 352ee51707a1..98272cb5f617 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -749,10 +749,12 @@ enum nft_payload_bases { * * @NFT_PAYLOAD_CSUM_NONE: no checksumming * @NFT_PAYLOAD_CSUM_INET: internet checksum (RFC 791) + * @NFT_PAYLOAD_CSUM_SCTP: CRC-32c, for use in SCTP header (RFC 3309) */ enum nft_payload_csum_types { NFT_PAYLOAD_CSUM_NONE, NFT_PAYLOAD_CSUM_INET, + NFT_PAYLOAD_CSUM_SCTP, }; enum nft_payload_csum_flags { diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 25313c29d799..52370211e46b 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -441,6 +441,7 @@ endif # NF_CONNTRACK config NF_TABLES select NETFILTER_NETLINK + select LIBCRC32C tristate "Netfilter nf_tables support" help nftables is the new packet classification framework that intends to diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 7a2e59638499..dcd3c7b8a367 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -22,6 +22,7 @@ #include #include #include +#include static bool nft_payload_rebuild_vlan_hdr(const struct sk_buff *skb, int mac_off, struct vlan_ethhdr *veth) @@ -484,6 +485,19 @@ static int nft_payload_l4csum_offset(const struct nft_pktinfo *pkt, return 0; } +static int nft_payload_csum_sctp(struct sk_buff *skb, int offset) +{ + struct sctphdr *sh; + + if (skb_ensure_writable(skb, offset + sizeof(*sh))) + return -1; + + sh = (struct sctphdr *)(skb->data + offset); + sh->checksum = sctp_compute_cksum(skb, offset); + skb->ip_summed = CHECKSUM_UNNECESSARY; + return 0; +} + static int nft_payload_l4csum_update(const struct nft_pktinfo *pkt, struct sk_buff *skb, __wsum fsum, __wsum tsum) @@ -587,6 +601,13 @@ static void nft_payload_set_eval(const struct nft_expr *expr, skb_store_bits(skb, offset, src, priv->len) < 0) goto err; + if (priv->csum_type == NFT_PAYLOAD_CSUM_SCTP && + pkt->tprot == IPPROTO_SCTP && + skb->ip_summed != CHECKSUM_PARTIAL) { + if (nft_payload_csum_sctp(skb, pkt->xt.thoff)) + goto err; + } + return; err: regs->verdict.code = NFT_BREAK; @@ -623,6 +644,13 @@ static int nft_payload_set_init(const struct nft_ctx *ctx, case NFT_PAYLOAD_CSUM_NONE: case NFT_PAYLOAD_CSUM_INET: break; + case NFT_PAYLOAD_CSUM_SCTP: + if (priv->base != NFT_PAYLOAD_TRANSPORT_HEADER) + return -EINVAL; + + if (priv->csum_offset != offsetof(struct sctphdr, checksum)) + return -EINVAL; + break; default: return -EOPNOTSUPP; } -- cgit v1.2.3 From 8c39076c276be0b31982e44654e2c2357473258a Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Fri, 16 Oct 2020 09:25:45 -0400 Subject: NFSv4.2: support EXCHGID4_FLAG_SUPP_FENCE_OPS 4.2 EXCHANGE_ID flag RFC 7862 introduced a new flag that either client or server is allowed to set: EXCHGID4_FLAG_SUPP_FENCE_OPS. Client needs to update its bitmask to allow for this flag value. 
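(Editorial aside: a quick consistency check — the new 4.2 response mask is exactly the old response mask plus the fence-ops bit:)

	/* EXCHGID4_FLAG_MASK_R         = 0x80070103
	 * EXCHGID4_FLAG_SUPP_FENCE_OPS = 0x00000004
	 * EXCHGID4_2_FLAG_MASK_R       = 0x80070107 */
	_Static_assert((0x80070103 | 0x00000004) == 0x80070107,
		       "4.2 mask adds only SUPP_FENCE_OPS");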
v2: changed minor version argument to unsigned int Signed-off-by: Olga Kornievskaia CC: Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 9 ++++++--- include/uapi/linux/nfs4.h | 3 +++ 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 2e33995691f5..9e0ca9b2b210 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -8133,9 +8133,11 @@ int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, * both PNFS and NON_PNFS flags set, and not having one of NON_PNFS, PNFS, or * DS flags set. */ -static int nfs4_check_cl_exchange_flags(u32 flags) +static int nfs4_check_cl_exchange_flags(u32 flags, u32 version) { - if (flags & ~EXCHGID4_FLAG_MASK_R) + if (version >= 2 && (flags & ~EXCHGID4_2_FLAG_MASK_R)) + goto out_inval; + else if (version < 2 && (flags & ~EXCHGID4_FLAG_MASK_R)) goto out_inval; if ((flags & EXCHGID4_FLAG_USE_PNFS_MDS) && (flags & EXCHGID4_FLAG_USE_NON_PNFS)) @@ -8548,7 +8550,8 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cre if (status != 0) goto out; - status = nfs4_check_cl_exchange_flags(resp->flags); + status = nfs4_check_cl_exchange_flags(resp->flags, + clp->cl_mvops->minor_version); if (status != 0) goto out; diff --git a/include/uapi/linux/nfs4.h b/include/uapi/linux/nfs4.h index bf197e99b98f..ed5415e0f1c1 100644 --- a/include/uapi/linux/nfs4.h +++ b/include/uapi/linux/nfs4.h @@ -139,6 +139,8 @@ #define EXCHGID4_FLAG_UPD_CONFIRMED_REC_A 0x40000000 #define EXCHGID4_FLAG_CONFIRMED_R 0x80000000 + +#define EXCHGID4_FLAG_SUPP_FENCE_OPS 0x00000004 /* * Since the validity of these bits depends on whether * they're set in the argument or response, have separate @@ -146,6 +148,7 @@ */ #define EXCHGID4_FLAG_MASK_A 0x40070103 #define EXCHGID4_FLAG_MASK_R 0x80070103 +#define EXCHGID4_2_FLAG_MASK_R 0x80070107 #define SEQ4_STATUS_CB_PATH_DOWN 0x00000001 #define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRING 0x00000002 -- cgit v1.2.3 From c6e5f02b5281a3166a9b7b4d66830cc234421ba5 Mon Sep 17 00:00:00 2001 From: "Saheed O. Bolarinwa" Date: Thu, 15 Oct 2020 14:30:31 -0500 Subject: PCI/ASPM: Remove struct aspm_register_info.support Previously we stored the "ASPM Support" field from the Link Capabilities register in the struct aspm_register_info. Read the Link Capabilities directly when needed and remove it from the struct aspm_register_info. No functional change intended. [bhelgaas: remove pci_dev cached copy since LNKCAP isn't truly read-only, add PCI_EXP_LNKCAP_ASPM_L0S & PCI_EXP_LNKCAP_ASPM_L1, check them directly instead of adding aspm_support()] Link: https://lore.kernel.org/r/20201015193039.12585-5-helgaas@kernel.org Signed-off-by: Saheed O. 
Bolarinwa Signed-off-by: Bjorn Helgaas --- drivers/pci/pcie/aspm.c | 25 ++++++++++++++----------- include/uapi/linux/pci_regs.h | 2 ++ 2 files changed, 16 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index 0725511cbeb5..82ce34e2ef53 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -381,7 +381,6 @@ static void encode_l12_threshold(u32 threshold_us, u32 *scale, u32 *value) } struct aspm_register_info { - u32 support:2; u32 enabled:2; u32 latency_encoding_l0s; u32 latency_encoding_l1; @@ -400,7 +399,6 @@ static void pcie_get_aspm_reg(struct pci_dev *pdev, u32 reg32; pcie_capability_read_dword(pdev, PCI_EXP_LNKCAP, ®32); - info->support = (reg32 & PCI_EXP_LNKCAP_ASPMS) >> 10; info->latency_encoding_l0s = (reg32 & PCI_EXP_LNKCAP_L0SEL) >> 12; info->latency_encoding_l1 = (reg32 & PCI_EXP_LNKCAP_L1EL) >> 15; pcie_capability_read_word(pdev, PCI_EXP_LNKCTL, ®16); @@ -550,6 +548,7 @@ static void aspm_calc_l1ss_info(struct pcie_link_state *link, static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist) { struct pci_dev *child = link->downstream, *parent = link->pdev; + u32 parent_lnkcap, child_lnkcap; struct pci_bus *linkbus = parent->subordinate; struct aspm_register_info upreg, dwreg; @@ -560,24 +559,26 @@ static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist) return; } - /* Get upstream/downstream components' register state */ - pcie_get_aspm_reg(parent, &upreg); - pcie_get_aspm_reg(child, &dwreg); - /* * If ASPM not supported, don't mess with the clocks and link, * bail out now. */ - if (!(upreg.support & dwreg.support)) + pcie_capability_read_dword(parent, PCI_EXP_LNKCAP, &parent_lnkcap); + pcie_capability_read_dword(child, PCI_EXP_LNKCAP, &child_lnkcap); + if (!(parent_lnkcap & child_lnkcap & PCI_EXP_LNKCAP_ASPMS)) return; /* Configure common clock before checking latencies */ pcie_aspm_configure_common_clock(link); /* - * Re-read upstream/downstream components' register state - * after clock configuration + * Re-read upstream/downstream components' register state after + * clock configuration. L0s & L1 exit latencies in the otherwise + * read-only Link Capabilities may change depending on common clock + * configuration (PCIe r5.0, sec 7.5.3.6). */ + pcie_capability_read_dword(parent, PCI_EXP_LNKCAP, &parent_lnkcap); + pcie_capability_read_dword(child, PCI_EXP_LNKCAP, &child_lnkcap); pcie_get_aspm_reg(parent, &upreg); pcie_get_aspm_reg(child, &dwreg); @@ -588,8 +589,9 @@ static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist) * given link unless components on both sides of the link each * support L0s. 
*/ - if (dwreg.support & upreg.support & PCIE_LINK_STATE_L0S) + if (parent_lnkcap & child_lnkcap & PCI_EXP_LNKCAP_ASPM_L0S) link->aspm_support |= ASPM_STATE_L0S; + if (dwreg.enabled & PCIE_LINK_STATE_L0S) link->aspm_enabled |= ASPM_STATE_L0S_UP; if (upreg.enabled & PCIE_LINK_STATE_L0S) @@ -598,8 +600,9 @@ static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist) link->latency_dw.l0s = calc_l0s_latency(dwreg.latency_encoding_l0s); /* Setup L1 state */ - if (upreg.support & dwreg.support & PCIE_LINK_STATE_L1) + if (parent_lnkcap & child_lnkcap & PCI_EXP_LNKCAP_ASPM_L1) link->aspm_support |= ASPM_STATE_L1; + if (upreg.enabled & dwreg.enabled & PCIE_LINK_STATE_L1) link->aspm_enabled |= ASPM_STATE_L1; link->latency_up.l1 = calc_l1_latency(upreg.latency_encoding_l1); diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index f9701410d3b5..06846ec2e071 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -532,6 +532,8 @@ #define PCI_EXP_LNKCAP_SLS_32_0GB 0x00000005 /* LNKCAP2 SLS Vector bit 4 */ #define PCI_EXP_LNKCAP_MLW 0x000003f0 /* Maximum Link Width */ #define PCI_EXP_LNKCAP_ASPMS 0x00000c00 /* ASPM Support */ +#define PCI_EXP_LNKCAP_ASPM_L0S 0x00000400 /* ASPM L0s Support */ +#define PCI_EXP_LNKCAP_ASPM_L1 0x00000800 /* ASPM L1 Support */ #define PCI_EXP_LNKCAP_L0SEL 0x00007000 /* L0s Exit Latency */ #define PCI_EXP_LNKCAP_L1EL 0x00038000 /* L1 Exit Latency */ #define PCI_EXP_LNKCAP_CLKPM 0x00040000 /* Clock Power Management */ -- cgit v1.2.3 From df8f10587d3d11b055d54138994a1a9a681da0c4 Mon Sep 17 00:00:00 2001 From: "Saheed O. Bolarinwa" Date: Thu, 15 Oct 2020 14:30:39 -0500 Subject: PCI/ASPM: Remove struct pcie_link_state.l1ss Previously we computed L1.2 parameters in the enumeration path, saved them in struct pcie_link_state.l1ss, and programmed them into the devices whenever we enabled or disabled L1.2 on the link. But these parameters are constant and don't need to be updated when enabling/disabling L1.2. Compute and program the L1.2 parameters once during enumeration and remove the struct pcie_link_state.l1ss member. No functional change intended. [bhelgaas: rework to program L1.2 parameters during enumeration] Link: https://lore.kernel.org/r/20201015193039.12585-13-helgaas@kernel.org Signed-off-by: Saheed O. Bolarinwa Signed-off-by: Bjorn Helgaas --- drivers/pci/pcie/aspm.c | 84 +++++++++++++++++++++++++------------------ include/uapi/linux/pci_regs.h | 1 + 2 files changed, 50 insertions(+), 35 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index d76f23908d67..ac0557a305af 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -74,12 +74,6 @@ struct pcie_link_state { * has one slot under it, so at most there are 8 functions. 
*/ struct aspm_latency acceptable[8]; - - /* L1 PM Substate info */ - struct { - u32 ctl1; /* value to be programmed in ctl1 */ - u32 ctl2; /* value to be programmed in ctl2 */ - } l1ss; }; static int aspm_disabled, aspm_force; @@ -461,8 +455,9 @@ static void aspm_calc_l1ss_info(struct pcie_link_state *link, struct pci_dev *child = link->downstream, *parent = link->pdev; u32 val1, val2, scale1, scale2; u32 t_common_mode, t_power_on, l1_2_threshold, scale, value; - - link->l1ss.ctl1 = link->l1ss.ctl2 = 0; + u32 ctl1 = 0, ctl2 = 0; + u32 pctl1, pctl2, cctl1, cctl2; + u32 pl1_2_enables, cl1_2_enables; if (!(link->aspm_support & ASPM_STATE_L1_2_MASK)) return; @@ -480,10 +475,10 @@ static void aspm_calc_l1ss_info(struct pcie_link_state *link, if (calc_l1ss_pwron(parent, scale1, val1) > calc_l1ss_pwron(child, scale2, val2)) { - link->l1ss.ctl2 |= scale1 | (val1 << 3); + ctl2 |= scale1 | (val1 << 3); t_power_on = calc_l1ss_pwron(parent, scale1, val1); } else { - link->l1ss.ctl2 |= scale2 | (val2 << 3); + ctl2 |= scale2 | (val2 << 3); t_power_on = calc_l1ss_pwron(child, scale2, val2); } @@ -499,7 +494,50 @@ static void aspm_calc_l1ss_info(struct pcie_link_state *link, */ l1_2_threshold = 2 + 4 + t_common_mode + t_power_on; encode_l12_threshold(l1_2_threshold, &scale, &value); - link->l1ss.ctl1 |= t_common_mode << 8 | scale << 29 | value << 16; + ctl1 |= t_common_mode << 8 | scale << 29 | value << 16; + + pci_read_config_dword(parent, parent->l1ss + PCI_L1SS_CTL1, &pctl1); + pci_read_config_dword(parent, parent->l1ss + PCI_L1SS_CTL2, &pctl2); + pci_read_config_dword(child, child->l1ss + PCI_L1SS_CTL1, &cctl1); + pci_read_config_dword(child, child->l1ss + PCI_L1SS_CTL2, &cctl2); + + if (ctl1 == pctl1 && ctl1 == cctl1 && + ctl2 == pctl2 && ctl2 == cctl2) + return; + + /* Disable L1.2 while updating. 
See PCIe r5.0, sec 5.5.4, 7.8.3.3 */ + pl1_2_enables = pctl1 & PCI_L1SS_CTL1_L1_2_MASK; + cl1_2_enables = cctl1 & PCI_L1SS_CTL1_L1_2_MASK; + + if (pl1_2_enables || cl1_2_enables) { + pci_clear_and_set_dword(child, child->l1ss + PCI_L1SS_CTL1, + PCI_L1SS_CTL1_L1_2_MASK, 0); + pci_clear_and_set_dword(parent, parent->l1ss + PCI_L1SS_CTL1, + PCI_L1SS_CTL1_L1_2_MASK, 0); + } + + /* Program T_POWER_ON times in both ports */ + pci_write_config_dword(parent, parent->l1ss + PCI_L1SS_CTL2, ctl2); + pci_write_config_dword(child, child->l1ss + PCI_L1SS_CTL2, ctl2); + + /* Program Common_Mode_Restore_Time in upstream device */ + pci_clear_and_set_dword(parent, parent->l1ss + PCI_L1SS_CTL1, + PCI_L1SS_CTL1_CM_RESTORE_TIME, ctl1); + + /* Program LTR_L1.2_THRESHOLD time in both ports */ + pci_clear_and_set_dword(parent, parent->l1ss + PCI_L1SS_CTL1, + PCI_L1SS_CTL1_LTR_L12_TH_VALUE | + PCI_L1SS_CTL1_LTR_L12_TH_SCALE, ctl1); + pci_clear_and_set_dword(child, child->l1ss + PCI_L1SS_CTL1, + PCI_L1SS_CTL1_LTR_L12_TH_VALUE | + PCI_L1SS_CTL1_LTR_L12_TH_SCALE, ctl1); + + if (pl1_2_enables || cl1_2_enables) { + pci_clear_and_set_dword(parent, parent->l1ss + PCI_L1SS_CTL1, 0, + pl1_2_enables); + pci_clear_and_set_dword(child, child->l1ss + PCI_L1SS_CTL1, 0, + cl1_2_enables); + } } static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist) @@ -679,30 +717,6 @@ static void pcie_config_aspm_l1ss(struct pcie_link_state *link, u32 state) PCI_EXP_LNKCTL_ASPM_L1, 0); } - if (enable_req & ASPM_STATE_L1_2_MASK) { - - /* Program T_POWER_ON times in both ports */ - pci_write_config_dword(parent, parent->l1ss + PCI_L1SS_CTL2, - link->l1ss.ctl2); - pci_write_config_dword(child, child->l1ss + PCI_L1SS_CTL2, - link->l1ss.ctl2); - - /* Program Common_Mode_Restore_Time in upstream device */ - pci_clear_and_set_dword(parent, parent->l1ss + PCI_L1SS_CTL1, - PCI_L1SS_CTL1_CM_RESTORE_TIME, - link->l1ss.ctl1); - - /* Program LTR_L1.2_THRESHOLD time in both ports */ - pci_clear_and_set_dword(parent, parent->l1ss + PCI_L1SS_CTL1, - PCI_L1SS_CTL1_LTR_L12_TH_VALUE | - PCI_L1SS_CTL1_LTR_L12_TH_SCALE, - link->l1ss.ctl1); - pci_clear_and_set_dword(child, child->l1ss + PCI_L1SS_CTL1, - PCI_L1SS_CTL1_LTR_L12_TH_VALUE | - PCI_L1SS_CTL1_LTR_L12_TH_SCALE, - link->l1ss.ctl1); - } - val = 0; if (state & ASPM_STATE_L1_1) val |= PCI_L1SS_CTL1_ASPM_L1_1; diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 06846ec2e071..c7e0acba0e20 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -1058,6 +1058,7 @@ #define PCI_L1SS_CTL1_PCIPM_L1_1 0x00000002 /* PCI-PM L1.1 Enable */ #define PCI_L1SS_CTL1_ASPM_L1_2 0x00000004 /* ASPM L1.2 Enable */ #define PCI_L1SS_CTL1_ASPM_L1_1 0x00000008 /* ASPM L1.1 Enable */ +#define PCI_L1SS_CTL1_L1_2_MASK 0x00000005 #define PCI_L1SS_CTL1_L1SS_MASK 0x0000000f #define PCI_L1SS_CTL1_CM_RESTORE_TIME 0x0000ff00 /* Common_Mode_Restore_Time */ #define PCI_L1SS_CTL1_LTR_L12_TH_VALUE 0x03ff0000 /* LTR_L1.2_THRESHOLD_Value */ -- cgit v1.2.3 From f3d301c1f2f5676465cdf3259737ea19cc82731f Mon Sep 17 00:00:00 2001 From: Al Grant Date: Mon, 21 Sep 2020 21:46:37 +0100 Subject: perf: correct SNOOPX field offset perf_event.h has macros that define the field offsets in the data_src bitmask in perf records. The SNOOPX and REMOTE offsets were both 37. These are distinct fields, and the bitfield layout in perf_mem_data_src confirms that SNOOPX should be at offset 38. 
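(Editorial aside: the offsets follow directly from the bitfield widths in union perf_mem_data_src, summed from bit 0:)

	/* mem_op      :5   -> PERF_MEM_LVL_SHIFT    =  5
	 * mem_lvl     :14  -> PERF_MEM_SNOOP_SHIFT  = 19
	 * mem_snoop   :5   -> PERF_MEM_LOCK_SHIFT   = 24
	 * mem_lock    :2   -> PERF_MEM_TLB_SHIFT    = 26
	 * mem_dtlb    :7   -> PERF_MEM_LVLNUM_SHIFT = 33
	 * mem_lvl_num :4   -> PERF_MEM_REMOTE_SHIFT = 37
	 * mem_remote  :1   -> PERF_MEM_SNOOPX_SHIFT = 38, not 37
	 */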
Fixes: 52839e653b5629bd ("perf tools: Add support for printing new mem_info encodings") Signed-off-by: Al Grant Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lkml.kernel.org/r/4ac9f5cc-4388-b34a-9999-418a4099415d@foss.arm.com --- include/uapi/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 077e7ee69e3d..b95d3c485d27 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -1196,7 +1196,7 @@ union perf_mem_data_src { #define PERF_MEM_SNOOPX_FWD 0x01 /* forward */ /* 1 free */ -#define PERF_MEM_SNOOPX_SHIFT 37 +#define PERF_MEM_SNOOPX_SHIFT 38 /* locked instruction */ #define PERF_MEM_LOCK_NA 0x01 /* not available */ -- cgit v1.2.3 From 66570e966dd9cb4fd57811d0056c6472a14a2c41 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 18 Aug 2020 15:24:28 +0000 Subject: kvm: x86: only provide PV features if enabled in guest's CPUID KVM unconditionally provides PV features to the guest, regardless of the configured CPUID. An unwitting guest that doesn't check KVM_CPUID_FEATURES before use could access paravirt features that userspace did not intend to provide. Fix this by checking the guest's CPUID before performing any paravirtual operations. Introduce a capability, KVM_CAP_ENFORCE_PV_FEATURE_CPUID, to gate the aforementioned enforcement. Migrating a VM from a host w/o this patch to a host with this patch could silently change the ABI exposed to the guest, warranting that we default to the old behavior and opt-in for the new one. Reviewed-by: Jim Mattson Reviewed-by: Peter Shier Signed-off-by: Oliver Upton Change-Id: I202a0926f65035b872bfe8ad15307c026de59a98 Message-Id: <20200818152429.1923996-4-oupton@google.com> Reviewed-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 11 +++++++ arch/x86/include/asm/kvm_host.h | 15 +++++++++ arch/x86/kvm/cpuid.c | 7 +++++ arch/x86/kvm/cpuid.h | 10 ++++++ arch/x86/kvm/x86.c | 67 ++++++++++++++++++++++++++++++++++++++--- include/uapi/linux/kvm.h | 1 + 6 files changed, 106 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 9ece9a827a58..76317221d29f 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6380,3 +6380,14 @@ ranges that KVM should reject access to. In combination with KVM_CAP_X86_USER_SPACE_MSR, this allows user space to trap and emulate MSRs that are outside of the scope of KVM as well as limit the attack surface on KVM's MSR emulation code. + + +8.26 KVM_CAP_ENFORCE_PV_CPUID +----------------------------- + +Architectures: x86 + +When enabled, KVM will disable paravirtual features provided to the +guest according to the bits in the KVM_CPUID_FEATURES CPUID leaf +(0x40000001). Otherwise, a guest may use the paravirtual features +regardless of what has actually been exposed through the CPUID leaf. diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d0f77235da92..15e51343957e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -789,6 +789,21 @@ struct kvm_vcpu_arch { /* AMD MSRC001_0015 Hardware Configuration */ u64 msr_hwcr; + + /* pv related cpuid info */ + struct { + /* + * value of the eax register in the KVM_CPUID_FEATURES CPUID + * leaf. 
+ */ + u32 features; + + /* + * indicates whether pv emulation should be disabled if features + * are not present in the guest's cpuid + */ + bool enforce; + } pv_cpuid; }; struct kvm_lpage_info { diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 37c3668a774f..d253c023ee76 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -107,6 +107,13 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) (best->eax & (1 << KVM_FEATURE_PV_UNHALT))) best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); + /* + * save the feature bitmap to avoid cpuid lookup for every PV + * operation + */ + if (best) + vcpu->arch.pv_cpuid.features = best->eax; + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) { best = kvm_find_cpuid_entry(vcpu, 0x1, 0); if (best) diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 1d2c4f2e4bb6..bf8577947ed2 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -5,6 +5,7 @@ #include "x86.h" #include <asm/cpu.h> #include <asm/processor.h> +#include <uapi/asm/kvm_para.h> extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly; void kvm_set_cpu_caps(void); @@ -313,4 +314,13 @@ static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu)); } +static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu, + unsigned int kvm_feature) +{ + if (!vcpu->arch.pv_cpuid.enforce) + return true; + + return vcpu->arch.pv_cpuid.features & (1u << kvm_feature); +} + #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b928e092da03..ca940de53e18 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2877,6 +2877,14 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) if (data & 0x30) return 1; + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) && + (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT)) + return 1; + + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) && + (data & KVM_ASYNC_PF_DELIVERY_AS_INT)) + return 1; + if (!lapic_in_kernel(vcpu)) return data ? 1 : 0; @@ -2954,10 +2962,12 @@ static void record_steal_time(struct kvm_vcpu *vcpu) * Doing a TLB flush here, on the guest's behalf, can avoid * expensive IPIs.
*/ - trace_kvm_pv_tlb_flush(vcpu->vcpu_id, - st->preempted & KVM_VCPU_FLUSH_TLB); - if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) - kvm_vcpu_flush_tlb_guest(vcpu); + if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) { + trace_kvm_pv_tlb_flush(vcpu->vcpu_id, + st->preempted & KVM_VCPU_FLUSH_TLB); + if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) + kvm_vcpu_flush_tlb_guest(vcpu); + } vcpu->arch.st.preempted = 0; @@ -3118,30 +3128,54 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.smi_count = data; break; case MSR_KVM_WALL_CLOCK_NEW: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) + return 1; + + kvm_write_wall_clock(vcpu->kvm, data); + break; case MSR_KVM_WALL_CLOCK: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) + return 1; + kvm_write_wall_clock(vcpu->kvm, data); break; case MSR_KVM_SYSTEM_TIME_NEW: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) + return 1; + kvm_write_system_time(vcpu, data, false, msr_info->host_initiated); break; case MSR_KVM_SYSTEM_TIME: - kvm_write_system_time(vcpu, data, true, msr_info->host_initiated); + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) + return 1; + + kvm_write_system_time(vcpu, data, true, msr_info->host_initiated); break; case MSR_KVM_ASYNC_PF_EN: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) + return 1; + if (kvm_pv_enable_async_pf(vcpu, data)) return 1; break; case MSR_KVM_ASYNC_PF_INT: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) + return 1; + if (kvm_pv_enable_async_pf_int(vcpu, data)) return 1; break; case MSR_KVM_ASYNC_PF_ACK: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) + return 1; if (data & 0x1) { vcpu->arch.apf.pageready_pending = false; kvm_check_async_pf_completion(vcpu); } break; case MSR_KVM_STEAL_TIME: + if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME)) + return 1; if (unlikely(!sched_info_on())) return 1; @@ -3158,11 +3192,17 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_KVM_PV_EOI_EN: + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) + return 1; + if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8))) return 1; break; case MSR_KVM_POLL_CONTROL: + if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL)) + return 1; + /* only enable bit supported */ if (data & (-1ULL << 1)) return 1; @@ -3658,6 +3698,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_LAST_CPU: case KVM_CAP_X86_USER_SPACE_MSR: case KVM_CAP_X86_MSR_FILTER: + case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: r = 1; break; case KVM_CAP_SYNC_REGS: @@ -4528,6 +4569,11 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return kvm_x86_ops.enable_direct_tlbflush(vcpu); + case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: + vcpu->arch.pv_cpuid.enforce = cap->args[0]; + + return 0; + default: return -EINVAL; } @@ -8000,11 +8046,16 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) goto out; } + ret = -KVM_ENOSYS; + switch (nr) { case KVM_HC_VAPIC_POLL_IRQ: ret = 0; break; case KVM_HC_KICK_CPU: + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT)) + break; + kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); kvm_sched_yield(vcpu->kvm, a1); ret = 0; @@ -8015,9 +8066,15 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) break; #endif case KVM_HC_SEND_IPI: + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI)) + break; + ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit); break; case KVM_HC_SCHED_YIELD: + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD)) + break; + kvm_sched_yield(vcpu->kvm, a0); ret = 0; break; diff --git a/include/uapi/linux/kvm.h 
b/include/uapi/linux/kvm.h index 58f43aa1fc21..ca41220b40b8 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1052,6 +1052,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_STEAL_TIME 187 #define KVM_CAP_X86_USER_SPACE_MSR 188 #define KVM_CAP_X86_MSR_FILTER 189 +#define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From ba452c9e996d8a4c347b32805f91abb70de5de7e Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Tue, 20 Oct 2020 23:25:56 +0200 Subject: bpf: Fix bpf_redirect_neigh helper api to support supplying nexthop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Based on the discussion in [0], update the bpf_redirect_neigh() helper to accept an optional parameter specifying the nexthop information. This makes it possible to combine bpf_fib_lookup() and bpf_redirect_neigh() without incurring a duplicate FIB lookup - since the FIB lookup helper will return the nexthop information even if no neighbour is present, this can simply be passed on to bpf_redirect_neigh() if bpf_fib_lookup() returns BPF_FIB_LKUP_RET_NO_NEIGH. Thus fix & extend it before helper API is frozen. [0] https://lore.kernel.org/bpf/393e17fc-d187-3a8d-2f0d-a627c7c63fca@iogearbox.net/ Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Reviewed-by: David Ahern Link: https://lore.kernel.org/bpf/160322915615.32199.1187570224032024535.stgit@toke.dk --- include/linux/filter.h | 9 +++ include/uapi/linux/bpf.h | 22 ++++-- net/core/filter.c | 158 ++++++++++++++++++++++++++--------------- scripts/bpf_helpers_doc.py | 1 + tools/include/uapi/linux/bpf.h | 22 ++++-- 5 files changed, 145 insertions(+), 67 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 20fc24c9779a..72d62cbc1578 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -607,12 +607,21 @@ struct bpf_skb_data_end { void *data_end; }; +struct bpf_nh_params { + u32 nh_family; + union { + u32 ipv4_nh; + struct in6_addr ipv6_nh; + }; +}; + struct bpf_redirect_info { u32 flags; u32 tgt_index; void *tgt_value; struct bpf_map *map; u32 kern_flags; + struct bpf_nh_params nh; }; DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bf5a99d803e4..e6ceac3f7d62 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3677,15 +3677,19 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * long bpf_redirect_neigh(u32 ifindex, u64 flags) + * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags) * Description * Redirect the packet to another net device of index *ifindex* * and fill in L2 addresses from neighboring subsystem. This helper * is somewhat similar to **bpf_redirect**\ (), except that it * populates L2 addresses as well, meaning, internally, the helper - * performs a FIB lookup based on the skb's networking header to - * get the address of the next hop and then relies on the neighbor - * lookup for the L2 address of the nexthop. + * relies on the neighbor lookup for the L2 address of the nexthop. + * + * The helper will perform a FIB lookup based on the skb's + * networking header to get the address of the next hop, unless + * this is supplied by the caller in the *params* argument. The + * *plen* argument indicates the len of *params* and should be set + * to 0 if *params* is NULL. 
* * The *flags* argument is reserved and must be 0. The helper is * currently only supported for tc BPF program types, and enabled @@ -4906,6 +4910,16 @@ struct bpf_fib_lookup { __u8 dmac[6]; /* ETH_ALEN */ }; +struct bpf_redir_neigh { + /* network family for lookup (AF_INET, AF_INET6) */ + __u32 nh_family; + /* network address of nexthop; skips fib lookup to find gateway */ + union { + __be32 ipv4_nh; + __u32 ipv6_nh[4]; /* in6_addr; network order */ + }; +}; + enum bpf_task_fd_type { BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ BPF_FD_TYPE_TRACEPOINT, /* tp name */ diff --git a/net/core/filter.c b/net/core/filter.c index c5e2a1c5fd8d..6d0fa65a4a46 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2165,12 +2165,12 @@ static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, } #if IS_ENABLED(CONFIG_IPV6) -static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) +static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, + struct net_device *dev, struct bpf_nh_params *nh) { - struct dst_entry *dst = skb_dst(skb); - struct net_device *dev = dst->dev; u32 hh_len = LL_RESERVED_SPACE(dev); const struct in6_addr *nexthop; + struct dst_entry *dst = NULL; struct neighbour *neigh; if (dev_xmit_recursion()) { @@ -2196,8 +2196,13 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) } rcu_read_lock_bh(); - nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst), - &ipv6_hdr(skb)->daddr); + if (!nh) { + dst = skb_dst(skb); + nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst), + &ipv6_hdr(skb)->daddr); + } else { + nexthop = &nh->ipv6_nh; + } neigh = ip_neigh_gw6(dev, nexthop); if (likely(!IS_ERR(neigh))) { int ret; @@ -2210,36 +2215,43 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) return ret; } rcu_read_unlock_bh(); - IP6_INC_STATS(dev_net(dst->dev), - ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); + if (dst) + IP6_INC_STATS(dev_net(dst->dev), + ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); out_drop: kfree_skb(skb); return -ENETDOWN; } -static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) +static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); struct net *net = dev_net(dev); int err, ret = NET_XMIT_DROP; - struct dst_entry *dst; - struct flowi6 fl6 = { - .flowi6_flags = FLOWI_FLAG_ANYSRC, - .flowi6_mark = skb->mark, - .flowlabel = ip6_flowinfo(ip6h), - .flowi6_oif = dev->ifindex, - .flowi6_proto = ip6h->nexthdr, - .daddr = ip6h->daddr, - .saddr = ip6h->saddr, - }; - dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); - if (IS_ERR(dst)) - goto out_drop; + if (!nh) { + struct dst_entry *dst; + struct flowi6 fl6 = { + .flowi6_flags = FLOWI_FLAG_ANYSRC, + .flowi6_mark = skb->mark, + .flowlabel = ip6_flowinfo(ip6h), + .flowi6_oif = dev->ifindex, + .flowi6_proto = ip6h->nexthdr, + .daddr = ip6h->daddr, + .saddr = ip6h->saddr, + }; + + dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); + if (IS_ERR(dst)) + goto out_drop; - skb_dst_set(skb, dst); + skb_dst_set(skb, dst); + } else if (nh->nh_family != AF_INET6) { + goto out_drop; + } - err = bpf_out_neigh_v6(net, skb); + err = bpf_out_neigh_v6(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) dev->stats.tx_errors++; else @@ -2252,7 +2264,8 @@ out_xmit: return ret; } #else -static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) +static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device 
*dev, + struct bpf_nh_params *nh) { kfree_skb(skb); return NET_XMIT_DROP; @@ -2260,11 +2273,9 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) #endif /* CONFIG_IPV6 */ #if IS_ENABLED(CONFIG_INET) -static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb) +static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb, + struct net_device *dev, struct bpf_nh_params *nh) { - struct dst_entry *dst = skb_dst(skb); - struct rtable *rt = container_of(dst, struct rtable, dst); - struct net_device *dev = dst->dev; u32 hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; bool is_v6gw = false; @@ -2292,7 +2303,21 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb) } rcu_read_lock_bh(); - neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); + if (!nh) { + struct dst_entry *dst = skb_dst(skb); + struct rtable *rt = container_of(dst, struct rtable, dst); + + neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); + } else if (nh->nh_family == AF_INET6) { + neigh = ip_neigh_gw6(dev, &nh->ipv6_nh); + is_v6gw = true; + } else if (nh->nh_family == AF_INET) { + neigh = ip_neigh_gw4(dev, nh->ipv4_nh); + } else { + rcu_read_unlock_bh(); + goto out_drop; + } + if (likely(!IS_ERR(neigh))) { int ret; @@ -2309,33 +2334,37 @@ out_drop: return -ENETDOWN; } -static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev) +static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) { const struct iphdr *ip4h = ip_hdr(skb); struct net *net = dev_net(dev); int err, ret = NET_XMIT_DROP; - struct rtable *rt; - struct flowi4 fl4 = { - .flowi4_flags = FLOWI_FLAG_ANYSRC, - .flowi4_mark = skb->mark, - .flowi4_tos = RT_TOS(ip4h->tos), - .flowi4_oif = dev->ifindex, - .flowi4_proto = ip4h->protocol, - .daddr = ip4h->daddr, - .saddr = ip4h->saddr, - }; - rt = ip_route_output_flow(net, &fl4, NULL); - if (IS_ERR(rt)) - goto out_drop; - if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { - ip_rt_put(rt); - goto out_drop; - } + if (!nh) { + struct flowi4 fl4 = { + .flowi4_flags = FLOWI_FLAG_ANYSRC, + .flowi4_mark = skb->mark, + .flowi4_tos = RT_TOS(ip4h->tos), + .flowi4_oif = dev->ifindex, + .flowi4_proto = ip4h->protocol, + .daddr = ip4h->daddr, + .saddr = ip4h->saddr, + }; + struct rtable *rt; + + rt = ip_route_output_flow(net, &fl4, NULL); + if (IS_ERR(rt)) + goto out_drop; + if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { + ip_rt_put(rt); + goto out_drop; + } - skb_dst_set(skb, &rt->dst); + skb_dst_set(skb, &rt->dst); + } - err = bpf_out_neigh_v4(net, skb); + err = bpf_out_neigh_v4(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) dev->stats.tx_errors++; else @@ -2348,14 +2377,16 @@ out_xmit: return ret; } #else -static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev) +static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) { kfree_skb(skb); return NET_XMIT_DROP; } #endif /* CONFIG_INET */ -static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev) +static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) { struct ethhdr *ethh = eth_hdr(skb); @@ -2370,9 +2401,9 @@ static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev) skb_reset_network_header(skb); if (skb->protocol == htons(ETH_P_IP)) - return __bpf_redirect_neigh_v4(skb, dev); + return __bpf_redirect_neigh_v4(skb, dev, nh); else if (skb->protocol == htons(ETH_P_IPV6)) - return 
__bpf_redirect_neigh_v6(skb, dev); + return __bpf_redirect_neigh_v6(skb, dev, nh); out: kfree_skb(skb); return -ENOTSUPP; @@ -2382,7 +2413,8 @@ out: enum { BPF_F_NEIGH = (1ULL << 1), BPF_F_PEER = (1ULL << 2), -#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER) + BPF_F_NEXTHOP = (1ULL << 3), +#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP) }; BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) @@ -2455,7 +2487,8 @@ int skb_do_redirect(struct sk_buff *skb) return -EAGAIN; } return flags & BPF_F_NEIGH ? - __bpf_redirect_neigh(skb, dev) : + __bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ? + &ri->nh : NULL) : __bpf_redirect(skb, dev, flags); out_drop: kfree_skb(skb); @@ -2504,16 +2537,21 @@ static const struct bpf_func_proto bpf_redirect_peer_proto = { .arg2_type = ARG_ANYTHING, }; -BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags) +BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params, + int, plen, u64, flags) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - if (unlikely(flags)) + if (unlikely((plen && plen < sizeof(*params)) || flags)) return TC_ACT_SHOT; - ri->flags = BPF_F_NEIGH; + ri->flags = BPF_F_NEIGH | (plen ? BPF_F_NEXTHOP : 0); ri->tgt_index = ifindex; + BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params)); + if (plen) + memcpy(&ri->nh, params, sizeof(ri->nh)); + return TC_ACT_REDIRECT; } @@ -2522,7 +2560,9 @@ static const struct bpf_func_proto bpf_redirect_neigh_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, - .arg2_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_MEM_OR_NULL, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes) diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 7d86fdd190be..6769caae142f 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -453,6 +453,7 @@ class PrinterHelpers(Printer): 'struct bpf_perf_event_data', 'struct bpf_perf_event_value', 'struct bpf_pidns_info', + 'struct bpf_redir_neigh', 'struct bpf_sk_lookup', 'struct bpf_sock', 'struct bpf_sock_addr', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index bf5a99d803e4..e6ceac3f7d62 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3677,15 +3677,19 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * long bpf_redirect_neigh(u32 ifindex, u64 flags) + * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags) * Description * Redirect the packet to another net device of index *ifindex* * and fill in L2 addresses from neighboring subsystem. This helper * is somewhat similar to **bpf_redirect**\ (), except that it * populates L2 addresses as well, meaning, internally, the helper - * performs a FIB lookup based on the skb's networking header to - * get the address of the next hop and then relies on the neighbor - * lookup for the L2 address of the nexthop. + * relies on the neighbor lookup for the L2 address of the nexthop. + * + * The helper will perform a FIB lookup based on the skb's + * networking header to get the address of the next hop, unless + * this is supplied by the caller in the *params* argument. The + * *plen* argument indicates the len of *params* and should be set + * to 0 if *params* is NULL. * * The *flags* argument is reserved and must be 0. 
The helper is * currently only supported for tc BPF program types, and enabled @@ -4906,6 +4910,16 @@ struct bpf_fib_lookup { __u8 dmac[6]; /* ETH_ALEN */ }; +struct bpf_redir_neigh { + /* network family for lookup (AF_INET, AF_INET6) */ + __u32 nh_family; + /* network address of nexthop; skips fib lookup to find gateway */ + union { + __be32 ipv4_nh; + __u32 ipv6_nh[4]; /* in6_addr; network order */ + }; +}; + enum bpf_task_fd_type { BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ BPF_FD_TYPE_TRACEPOINT, /* tp name */ -- cgit v1.2.3
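As a usage illustration of the extended helper (a sketch under stated assumptions, not code from this series): a tc classifier can hand the gateway returned by bpf_fib_lookup() straight to bpf_redirect_neigh() when the lookup reports BPF_FIB_LKUP_RET_NO_NEIGH, avoiding exactly the duplicate FIB lookup the commit message describes. The program below is IPv4-only, loosely modeled on the kernel's tc neighbour/FIB selftests, and assumes a libbpf recent enough to carry the four-argument bpf_redirect_neigh() definition; the section and function names are arbitrary.

#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#ifndef AF_INET
#define AF_INET 2
#endif

SEC("classifier")
int redirect_via_fib(struct __sk_buff *skb)
{
	void *data = (void *)(long)skb->data;
	void *data_end = (void *)(long)skb->data_end;
	struct ethhdr *eth = data;
	struct iphdr *iph = data + sizeof(*eth);
	struct bpf_fib_lookup fib = {};
	struct bpf_redir_neigh nh = {};
	int ret;

	/* One bounds check covers both the Ethernet and IPv4 headers */
	if ((void *)(iph + 1) > data_end || eth->h_proto != bpf_htons(ETH_P_IP))
		return TC_ACT_OK;

	fib.family	= AF_INET;
	fib.tos		= iph->tos;
	fib.l4_protocol	= iph->protocol;
	fib.tot_len	= bpf_ntohs(iph->tot_len);
	fib.ipv4_src	= iph->saddr;
	fib.ipv4_dst	= iph->daddr;
	fib.ifindex	= skb->ingress_ifindex;

	ret = bpf_fib_lookup(skb, &fib, sizeof(fib), 0);
	if (ret != BPF_FIB_LKUP_RET_NO_NEIGH)
		return TC_ACT_OK; /* resolved or unroutable: leave to the stack */

	/* The FIB result still carries the gateway address; pass it on so
	 * bpf_redirect_neigh() only performs the neighbour lookup. */
	nh.nh_family = AF_INET;
	nh.ipv4_nh = fib.ipv4_dst;

	return bpf_redirect_neigh(fib.ifindex, &nh, sizeof(nh), 0);
}

char _license[] SEC("license") = "GPL";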