Diffstat (limited to 'drivers/vhost')
-rw-r--r--	drivers/vhost/Kconfig	18
-rw-r--r--	drivers/vhost/net.c	30
-rw-r--r--	drivers/vhost/scsi.c	196
-rw-r--r--	drivers/vhost/vhost.c	275
-rw-r--r--	drivers/vhost/vhost.h	22
-rw-r--r--	drivers/vhost/vringh.c	19
-rw-r--r--	drivers/vhost/vsock.c	6
7 files changed, 503 insertions, 63 deletions
diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 020d4fbb947c..bc0f38574497 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -95,4 +95,22 @@ config VHOST_CROSS_ENDIAN_LEGACY
 
 	  If unsure, say "N".
 
+config VHOST_ENABLE_FORK_OWNER_CONTROL
+	bool "Enable VHOST_ENABLE_FORK_OWNER_CONTROL"
+	default y
+	help
+	  This option enables two IOCTLs: VHOST_SET_FORK_FROM_OWNER and
+	  VHOST_GET_FORK_FROM_OWNER. These allow userspace applications
+	  to modify the vhost worker mode for vhost devices.
+
+	  It also exposes the module parameter 'fork_from_owner_default',
+	  which lets users configure the default mode for vhost workers.
+
+	  By default, VHOST_ENABLE_FORK_OWNER_CONTROL is set to "y", so
+	  users can change the worker thread mode as needed. If this
+	  config is disabled ("n"), the related IOCTLs and parameters
+	  are unavailable.
+
+	  If unsure, say "Y".
+
 endif
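As an aside on usage: the two IOCTLs are meant to be issued on the vhost character device before VHOST_SET_OWNER. A minimal userspace sketch, assuming the UAPI macros (VHOST_SET_FORK_FROM_OWNER, VHOST_FORK_OWNER_KTHREAD) that this series adds to <linux/vhost.h>; the device path and function name are illustrative:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/vhost.h>

	/* Request legacy kthread workers for a freshly opened vhost device.
	 * Must run before VHOST_SET_OWNER; afterwards the ioctl fails with
	 * EBUSY. On kernels built without CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL
	 * the ioctl is unavailable. */
	int open_vhost_kthread_mode(void)
	{
		__u8 mode = VHOST_FORK_OWNER_KTHREAD;
		int fd = open("/dev/vhost-net", O_RDWR);	/* illustrative path */

		if (fd < 0)
			return -1;
		if (ioctl(fd, VHOST_SET_FORK_FROM_OWNER, &mode) < 0) {
			perror("VHOST_SET_FORK_FROM_OWNER");
			close(fd);
			return -1;
		}
		return fd;	/* continue with VHOST_SET_OWNER etc. */
	}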
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index b9b9e9d40951..7cbfc7d718b3 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -755,10 +755,10 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
 	int err;
 	int sent_pkts = 0;
 	bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX);
+	bool busyloop_intr;
 
 	do {
-		bool busyloop_intr = false;
-
+		busyloop_intr = false;
 		if (nvq->done_idx == VHOST_NET_BATCH)
 			vhost_tx_batch(net, nvq, sock, &msg);
 
@@ -769,13 +769,10 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
 			break;
 		/* Nothing new? Wait for eventfd to tell us they refilled. */
 		if (head == vq->num) {
-			if (unlikely(busyloop_intr)) {
-				vhost_poll_queue(&vq->poll);
-			} else if (unlikely(vhost_enable_notify(&net->dev,
-								vq))) {
-				vhost_disable_notify(&net->dev, vq);
-				continue;
-			}
+			/* Kicks are disabled at this point, break the loop
+			 * and process any remaining batched packets. The
+			 * queue will be re-enabled afterwards.
+			 */
 			break;
 		}
 
@@ -825,7 +822,22 @@ done:
 		++nvq->done_idx;
 	} while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len)));
 
+	/* Kicks are still disabled, dispatch any remaining batched msgs. */
 	vhost_tx_batch(net, nvq, sock, &msg);
+
+	if (unlikely(busyloop_intr))
+		/* If interrupted while doing busy polling, requeue the
+		 * handler to be fair to handle_rx as well as other tasks
+		 * waiting on the cpu.
+		 */
+		vhost_poll_queue(&vq->poll);
+	else
+		/* All of our work has been completed; however, before
+		 * leaving the TX handler, do one last check for work,
+		 * and requeue the handler if necessary. If there is no
+		 * work, the queue will be re-enabled.
+		 */
+		vhost_net_busy_poll_try_queue(net, vq);
 }
 
 static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 26bcf3a7f70c..63b0829391eb 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -71,7 +71,7 @@ static int vhost_scsi_set_inline_sg_cnt(const char *buf,
 	if (ret)
 		return ret;
 
-	if (ret > VHOST_SCSI_PREALLOC_SGLS) {
+	if (cnt > VHOST_SCSI_PREALLOC_SGLS) {
 		pr_err("Max inline_sg_cnt is %u\n", VHOST_SCSI_PREALLOC_SGLS);
 		return -EINVAL;
 	}
@@ -133,6 +133,11 @@ struct vhost_scsi_cmd {
 	struct se_cmd tvc_se_cmd;
 	/* Sense buffer that will be mapped into outgoing status */
 	unsigned char tvc_sense_buf[TRANSPORT_SENSE_BUFFER];
+	/*
+	 * Dirty write descriptors of this command.
+	 */
+	struct vhost_log *tvc_log;
+	unsigned int tvc_log_num;
 	/* Completed commands list, serviced from vhost worker thread */
 	struct llist_node tvc_completion_list;
 	/* Used to track inflight cmd */
@@ -258,6 +263,12 @@ struct vhost_scsi_tmf {
 	struct iovec resp_iov;
 	int in_iovs;
 	int vq_desc;
+
+	/*
+	 * Dirty write descriptors of this command.
+	 */
+	struct vhost_log *tmf_log;
+	unsigned int tmf_log_num;
 };
 
 /*
@@ -362,6 +373,45 @@ static int vhost_scsi_check_prot_fabric_only(struct se_portal_group *se_tpg)
 	return tpg->tv_fabric_prot_type;
 }
 
+static int vhost_scsi_copy_cmd_log(struct vhost_virtqueue *vq,
+				   struct vhost_scsi_cmd *cmd,
+				   struct vhost_log *log,
+				   unsigned int log_num)
+{
+	if (!cmd->tvc_log)
+		cmd->tvc_log = kmalloc_array(vq->dev->iov_limit,
+					     sizeof(*cmd->tvc_log),
+					     GFP_KERNEL);
+
+	if (unlikely(!cmd->tvc_log)) {
+		vq_err(vq, "Failed to alloc tvc_log\n");
+		return -ENOMEM;
+	}
+
+	memcpy(cmd->tvc_log, log, sizeof(*cmd->tvc_log) * log_num);
+	cmd->tvc_log_num = log_num;
+
+	return 0;
+}
+
+static void vhost_scsi_log_write(struct vhost_virtqueue *vq,
+				 struct vhost_log *log,
+				 unsigned int log_num)
+{
+	if (likely(!vhost_has_feature(vq, VHOST_F_LOG_ALL)))
+		return;
+
+	if (likely(!log_num || !log))
+		return;
+
+	/*
+	 * vhost-scsi doesn't support VIRTIO_F_ACCESS_PLATFORM, so the
+	 * vq->iotlb case does not need to be handled here.
+	 */
+	WARN_ON_ONCE(unlikely(vq->iotlb));
+	vhost_log_write(vq, log, log_num, U64_MAX, NULL, 0);
+}
+
 static void vhost_scsi_release_cmd_res(struct se_cmd *se_cmd)
 {
 	struct vhost_scsi_cmd *tv_cmd = container_of(se_cmd,
@@ -408,6 +458,10 @@ static void vhost_scsi_release_tmf_res(struct vhost_scsi_tmf *tmf)
 {
 	struct vhost_scsi_inflight *inflight = tmf->inflight;
 
+	/*
+	 * tmf->tmf_log is NULL unless VHOST_F_LOG_ALL is set.
+	 */
+	kfree(tmf->tmf_log);
 	kfree(tmf);
 	vhost_scsi_put_inflight(inflight);
 }
@@ -517,6 +571,8 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)
 	struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq;
 	struct virtio_scsi_event *event = &evt->event;
 	struct virtio_scsi_event __user *eventp;
+	struct vhost_log *vq_log;
+	unsigned int log_num;
 	unsigned out, in;
 	int head, ret;
 
@@ -527,9 +583,19 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)
 
 again:
 	vhost_disable_notify(&vs->dev, vq);
+
+	vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ?
+		 vq->log : NULL;
+
+	/*
+	 * Reset 'log_num' here, since vhost_get_vq_desc() may only
+	 * reset it after certain condition checks.
+	 */
+	log_num = 0;
+
 	head = vhost_get_vq_desc(vq, vq->iov,
 				 ARRAY_SIZE(vq->iov), &out, &in,
-				 NULL, NULL);
+				 vq_log, &log_num);
 	if (head < 0) {
 		vs->vs_events_missed = true;
 		return;
@@ -559,6 +625,8 @@ again:
 		vhost_add_used_and_signal(&vs->dev, vq, head, 0);
 	else
 		vq_err(vq, "Faulted on vhost_scsi_send_event\n");
+
+	vhost_scsi_log_write(vq, vq_log, log_num);
 }
 
 static void vhost_scsi_complete_events(struct vhost_scsi *vs, bool drop)
@@ -660,6 +728,9 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work)
 		} else
 			pr_err("Faulted on virtio_scsi_cmd_resp\n");
 
+		vhost_scsi_log_write(cmd->tvc_vq, cmd->tvc_log,
+				     cmd->tvc_log_num);
+
 		vhost_scsi_release_cmd_res(se_cmd);
 	}
 
@@ -676,6 +747,7 @@ vhost_scsi_get_cmd(struct vhost_virtqueue *vq, u64 scsi_tag)
 					struct vhost_scsi_virtqueue, vq);
 	struct vhost_scsi_cmd *cmd;
 	struct scatterlist *sgl, *prot_sgl;
+	struct vhost_log *log;
 	int tag;
 
 	tag = sbitmap_get(&svq->scsi_tags);
@@ -687,9 +759,11 @@ vhost_scsi_get_cmd(struct vhost_virtqueue *vq, u64 scsi_tag)
 	cmd = &svq->scsi_cmds[tag];
 	sgl = cmd->sgl;
 	prot_sgl = cmd->prot_sgl;
+	log = cmd->tvc_log;
 	memset(cmd, 0, sizeof(*cmd));
 	cmd->sgl = sgl;
 	cmd->prot_sgl = prot_sgl;
+	cmd->tvc_log = log;
 	cmd->tvc_se_cmd.map_tag = tag;
 	cmd->inflight = vhost_scsi_get_inflight(vq);
 
@@ -1063,13 +1137,17 @@ vhost_scsi_send_bad_target(struct vhost_scsi *vs,
 
 static int
 vhost_scsi_get_desc(struct vhost_scsi *vs, struct vhost_virtqueue *vq,
-		    struct vhost_scsi_ctx *vc)
+		    struct vhost_scsi_ctx *vc,
+		    struct vhost_log *log, unsigned int *log_num)
 {
 	int ret = -ENXIO;
 
+	if (likely(log_num))
+		*log_num = 0;
+
 	vc->head = vhost_get_vq_desc(vq, vq->iov,
 				     ARRAY_SIZE(vq->iov), &vc->out, &vc->in,
-				     NULL, NULL);
+				     log, log_num);
 
 	pr_debug("vhost_get_vq_desc: head: %d, out: %u in: %u\n",
 		 vc->head, vc->out, vc->in);
@@ -1148,10 +1226,8 @@ vhost_scsi_get_req(struct vhost_virtqueue *vq, struct vhost_scsi_ctx *vc,
 		/* validated at handler entry */
 		vs_tpg = vhost_vq_get_backend(vq);
 		tpg = READ_ONCE(vs_tpg[*vc->target]);
-		if (unlikely(!tpg)) {
-			vq_err(vq, "Target 0x%x does not exist\n", *vc->target);
+		if (unlikely(!tpg))
 			goto out;
-		}
 	}
 
 	if (tpgp)
@@ -1221,6 +1297,8 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 	u8 task_attr;
 	bool t10_pi = vhost_has_feature(vq, VIRTIO_SCSI_F_T10_PI);
 	u8 *cdb;
+	struct vhost_log *vq_log;
+	unsigned int log_num;
 
 	mutex_lock(&vq->mutex);
 	/*
@@ -1236,8 +1314,11 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 
 	vhost_disable_notify(&vs->dev, vq);
 
+	vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ?
+		 vq->log : NULL;
+
 	do {
-		ret = vhost_scsi_get_desc(vs, vq, &vc);
+		ret = vhost_scsi_get_desc(vs, vq, &vc, vq_log, &log_num);
 		if (ret)
 			goto err;
 
@@ -1386,6 +1467,14 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 			goto err;
 		}
 
+		if (unlikely(vq_log && log_num)) {
+			ret = vhost_scsi_copy_cmd_log(vq, cmd, vq_log, log_num);
+			if (unlikely(ret)) {
+				vhost_scsi_release_cmd_res(&cmd->tvc_se_cmd);
+				goto err;
+			}
+		}
+
 		pr_debug("vhost_scsi got command opcode: %#02x, lun: %d\n",
 			 cdb[0], lun);
 		pr_debug("cmd: %p exp_data_len: %d, prot_bytes: %d data_direction:"
@@ -1421,11 +1510,14 @@ err:
 		 */
 		if (ret == -ENXIO)
 			break;
-		else if (ret == -EIO)
+		else if (ret == -EIO) {
 			vhost_scsi_send_bad_target(vs, vq, &vc, TYPE_IO_CMD);
-		else if (ret == -ENOMEM)
+			vhost_scsi_log_write(vq, vq_log, log_num);
+		} else if (ret == -ENOMEM) {
 			vhost_scsi_send_status(vs, vq, &vc,
 					       SAM_STAT_TASK_SET_FULL);
+			vhost_scsi_log_write(vq, vq_log, log_num);
+		}
 	} while (likely(!vhost_exceeds_weight(vq, ++c, 0)));
 out:
 	mutex_unlock(&vq->mutex);
@@ -1467,6 +1559,8 @@ static void vhost_scsi_tmf_resp_work(struct vhost_work *work)
 	mutex_lock(&tmf->svq->vq.mutex);
 	vhost_scsi_send_tmf_resp(tmf->vhost, &tmf->svq->vq, tmf->in_iovs,
 				 tmf->vq_desc, &tmf->resp_iov, resp_code);
+	vhost_scsi_log_write(&tmf->svq->vq, tmf->tmf_log,
+			     tmf->tmf_log_num);
 	mutex_unlock(&tmf->svq->vq.mutex);
 
 	vhost_scsi_release_tmf_res(tmf);
@@ -1490,7 +1584,8 @@ static void
 vhost_scsi_handle_tmf(struct vhost_scsi *vs, struct vhost_scsi_tpg *tpg,
 		      struct vhost_virtqueue *vq,
 		      struct virtio_scsi_ctrl_tmf_req *vtmf,
-		      struct vhost_scsi_ctx *vc)
+		      struct vhost_scsi_ctx *vc,
+		      struct vhost_log *log, unsigned int log_num)
 {
 	struct vhost_scsi_virtqueue *svq = container_of(vq,
 					struct vhost_scsi_virtqueue, vq);
@@ -1518,6 +1613,19 @@ vhost_scsi_handle_tmf(struct vhost_scsi *vs, struct vhost_scsi_tpg *tpg,
 	tmf->in_iovs = vc->in;
 	tmf->inflight = vhost_scsi_get_inflight(vq);
 
+	if (unlikely(log && log_num)) {
+		tmf->tmf_log = kmalloc_array(log_num, sizeof(*tmf->tmf_log),
+					     GFP_KERNEL);
+		if (tmf->tmf_log) {
+			memcpy(tmf->tmf_log, log, sizeof(*tmf->tmf_log) * log_num);
+			tmf->tmf_log_num = log_num;
+		} else {
+			pr_err("vhost_scsi tmf log allocation error\n");
+			vhost_scsi_release_tmf_res(tmf);
+			goto send_reject;
+		}
+	}
+
 	if (target_submit_tmr(&tmf->se_cmd, tpg->tpg_nexus->tvn_se_sess, NULL,
 			      vhost_buf_to_lun(vtmf->lun), NULL,
 			      TMR_LUN_RESET, GFP_KERNEL, 0,
@@ -1531,6 +1639,7 @@ vhost_scsi_handle_tmf(struct vhost_scsi *vs, struct vhost_scsi_tpg *tpg,
 send_reject:
 	vhost_scsi_send_tmf_resp(vs, vq, vc->in, vc->head, &vq->iov[vc->out],
 				 VIRTIO_SCSI_S_FUNCTION_REJECTED);
+	vhost_scsi_log_write(vq, log, log_num);
 }
 
 static void
@@ -1567,6 +1676,8 @@ vhost_scsi_ctl_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 	struct vhost_scsi_ctx vc;
 	size_t typ_size;
 	int ret, c = 0;
+	struct vhost_log *vq_log;
+	unsigned int log_num;
 
 	mutex_lock(&vq->mutex);
 	/*
@@ -1580,8 +1691,11 @@ vhost_scsi_ctl_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 
 	vhost_disable_notify(&vs->dev, vq);
 
+	vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ?
+		 vq->log : NULL;
+
 	do {
-		ret = vhost_scsi_get_desc(vs, vq, &vc);
+		ret = vhost_scsi_get_desc(vs, vq, &vc, vq_log, &log_num);
 		if (ret)
 			goto err;
 
@@ -1645,9 +1759,12 @@ vhost_scsi_ctl_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 			goto err;
 
 		if (v_req.type == VIRTIO_SCSI_T_TMF)
-			vhost_scsi_handle_tmf(vs, tpg, vq, &v_req.tmf, &vc);
-		else
+			vhost_scsi_handle_tmf(vs, tpg, vq, &v_req.tmf, &vc,
+					      vq_log, log_num);
+		else {
 			vhost_scsi_send_an_resp(vs, vq, &vc);
+			vhost_scsi_log_write(vq, vq_log, log_num);
+		}
 err:
 		/*
 		 * ENXIO: No more requests, or read error, wait for next kick
@@ -1657,11 +1774,13 @@ err:
 		 */
 		if (ret == -ENXIO)
 			break;
-		else if (ret == -EIO)
+		else if (ret == -EIO) {
 			vhost_scsi_send_bad_target(vs, vq, &vc,
 						   v_req.type ==
 						   VIRTIO_SCSI_T_TMF ?
 						   TYPE_CTRL_TMF :
 						   TYPE_CTRL_AN);
+			vhost_scsi_log_write(vq, vq_log, log_num);
+		}
 	} while (likely(!vhost_exceeds_weight(vq, ++c, 0)));
 out:
 	mutex_unlock(&vq->mutex);
@@ -1756,6 +1875,24 @@ static void vhost_scsi_flush(struct vhost_scsi *vs)
 		wait_for_completion(&vs->old_inflight[i]->comp);
 }
 
+static void vhost_scsi_destroy_vq_log(struct vhost_virtqueue *vq)
+{
+	struct vhost_scsi_virtqueue *svq = container_of(vq,
+					struct vhost_scsi_virtqueue, vq);
+	struct vhost_scsi_cmd *tv_cmd;
+	unsigned int i;
+
+	if (!svq->scsi_cmds)
+		return;
+
+	for (i = 0; i < svq->max_cmds; i++) {
+		tv_cmd = &svq->scsi_cmds[i];
+		kfree(tv_cmd->tvc_log);
+		tv_cmd->tvc_log = NULL;
+		tv_cmd->tvc_log_num = 0;
+	}
+}
+
 static void vhost_scsi_destroy_vq_cmds(struct vhost_virtqueue *vq)
 {
 	struct vhost_scsi_virtqueue *svq = container_of(vq,
@@ -1775,6 +1912,7 @@ static void vhost_scsi_destroy_vq_cmds(struct vhost_virtqueue *vq)
 
 	sbitmap_free(&svq->scsi_tags);
 	kfree(svq->upages);
+	vhost_scsi_destroy_vq_log(vq);
 	kfree(svq->scsi_cmds);
 	svq->scsi_cmds = NULL;
 }
@@ -2084,6 +2222,7 @@ err_dev:
 static int vhost_scsi_set_features(struct vhost_scsi *vs, u64 features)
 {
 	struct vhost_virtqueue *vq;
+	bool is_log, was_log;
 	int i;
 
 	if (features & ~VHOST_SCSI_FEATURES)
@@ -2096,12 +2235,39 @@ static int vhost_scsi_set_features(struct vhost_scsi *vs, u64 features)
 		return -EFAULT;
 	}
 
+	if (!vs->dev.nvqs)
+		goto out;
+
+	is_log = features & (1 << VHOST_F_LOG_ALL);
+	/*
+	 * All VQs should have the same feature set.
+	 */
+	was_log = vhost_has_feature(&vs->vqs[0].vq, VHOST_F_LOG_ALL);
+
 	for (i = 0; i < vs->dev.nvqs; i++) {
 		vq = &vs->vqs[i].vq;
 		mutex_lock(&vq->mutex);
 		vq->acked_features = features;
 		mutex_unlock(&vq->mutex);
 	}
+
+	/*
+	 * If VHOST_F_LOG_ALL is removed, free tvc_log after
+	 * vq->acked_features is committed.
+	 */
+	if (!is_log && was_log) {
+		for (i = VHOST_SCSI_VQ_IO; i < vs->dev.nvqs; i++) {
+			if (!vs->vqs[i].scsi_cmds)
+				continue;
+
+			vq = &vs->vqs[i].vq;
+			mutex_lock(&vq->mutex);
+			vhost_scsi_destroy_vq_log(vq);
+			mutex_unlock(&vq->mutex);
+		}
+	}
+
+out:
 	mutex_unlock(&vs->dev.mutex);
 	return 0;
 }
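The tvc_log/tmf_log plumbing above only fills in the dirty bitmap that userspace registered with VHOST_SET_LOG_BASE; consuming it is the VMM's job. A minimal sketch of that consumer side, assuming vhost's byte-addressed layout of one bit per 4 KiB guest-physical page (the helper name is illustrative, not part of any vhost API; __atomic_fetch_and is the GCC/Clang builtin):

	#include <stdbool.h>
	#include <stdint.h>

	#define VHOST_LOG_PAGE 0x1000ULL	/* one bit per 4 KiB GPA page */

	/* Atomically test-and-clear the dirty bit that the kernel's
	 * log_write() set for a guest physical address in the bitmap
	 * previously handed over via VHOST_SET_LOG_BASE. */
	static bool test_and_clear_dirty(uint8_t *log, uint64_t gpa)
	{
		uint64_t page = gpa / VHOST_LOG_PAGE;
		uint8_t mask = 1u << (page % 8);

		return __atomic_fetch_and(&log[page / 8], (uint8_t)~mask,
					  __ATOMIC_SEQ_CST) & mask;
	}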
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 63612faeab72..478eca3cf113 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -22,6 +22,7 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/kthread.h>
+#include <linux/cgroup.h>
 #include <linux/module.h>
 #include <linux/sort.h>
 #include <linux/sched/mm.h>
@@ -41,6 +42,13 @@ static int max_iotlb_entries = 2048;
 module_param(max_iotlb_entries, int, 0444);
 MODULE_PARM_DESC(max_iotlb_entries,
 	"Maximum number of iotlb entries. (default: 2048)");
+static bool fork_from_owner_default = VHOST_FORK_OWNER_TASK;
+
+#ifdef CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL
+module_param(fork_from_owner_default, bool, 0444);
+MODULE_PARM_DESC(fork_from_owner_default,
+		 "Set task mode as the default (default: Y)");
+#endif
 
 enum {
 	VHOST_MEMORY_F_LOG = 0x1,
@@ -242,7 +250,7 @@ static void vhost_worker_queue(struct vhost_worker *worker,
 		 * test_and_set_bit() implies a memory barrier.
 		 */
 		llist_add(&work->node, &worker->work_list);
-		vhost_task_wake(worker->vtsk);
+		worker->ops->wakeup(worker);
 	}
 }
 
@@ -388,6 +396,44 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	__vhost_vq_meta_reset(vq);
 }
 
+static int vhost_run_work_kthread_list(void *data)
+{
+	struct vhost_worker *worker = data;
+	struct vhost_work *work, *work_next;
+	struct vhost_dev *dev = worker->dev;
+	struct llist_node *node;
+
+	kthread_use_mm(dev->mm);
+
+	for (;;) {
+		/* mb paired w/ kthread_stop */
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		if (kthread_should_stop()) {
+			__set_current_state(TASK_RUNNING);
+			break;
+		}
+		node = llist_del_all(&worker->work_list);
+		if (!node)
+			schedule();
+
+		node = llist_reverse_order(node);
+		/* make sure flag is seen after deletion */
+		smp_wmb();
+		llist_for_each_entry_safe(work, work_next, node, node) {
+			clear_bit(VHOST_WORK_QUEUED, &work->flags);
+			__set_current_state(TASK_RUNNING);
+			kcov_remote_start_common(worker->kcov_handle);
+			work->fn(work);
+			kcov_remote_stop();
+			cond_resched();
+		}
+	}
+	kthread_unuse_mm(dev->mm);
+
+	return 0;
+}
+
 static bool vhost_run_work_list(void *data)
 {
 	struct vhost_worker *worker = data;
@@ -552,6 +598,7 @@ void vhost_dev_init(struct vhost_dev *dev,
 	dev->byte_weight = byte_weight;
 	dev->use_worker = use_worker;
 	dev->msg_handler = msg_handler;
+	dev->fork_owner = fork_from_owner_default;
 	init_waitqueue_head(&dev->wait);
 	INIT_LIST_HEAD(&dev->read_list);
 	INIT_LIST_HEAD(&dev->pending_list);
@@ -581,6 +628,46 @@ long vhost_dev_check_owner(struct vhost_dev *dev)
 }
 EXPORT_SYMBOL_GPL(vhost_dev_check_owner);
 
+struct vhost_attach_cgroups_struct {
+	struct vhost_work work;
+	struct task_struct *owner;
+	int ret;
+};
+
+static void vhost_attach_cgroups_work(struct vhost_work *work)
+{
+	struct vhost_attach_cgroups_struct *s;
+
+	s = container_of(work, struct vhost_attach_cgroups_struct, work);
+	s->ret = cgroup_attach_task_all(s->owner, current);
+}
+
+static int vhost_attach_task_to_cgroups(struct vhost_worker *worker)
+{
+	struct vhost_attach_cgroups_struct attach;
+	int saved_cnt;
+
+	attach.owner = current;
+
+	vhost_work_init(&attach.work, vhost_attach_cgroups_work);
+	vhost_worker_queue(worker, &attach.work);
+
+	mutex_lock(&worker->mutex);
+
+	/*
+	 * Bypass the attachment_cnt check in __vhost_worker_flush():
+	 * temporarily set it to INT_MAX so the flush is not skipped.
+	 */
+	saved_cnt = worker->attachment_cnt;
+	worker->attachment_cnt = INT_MAX;
+	__vhost_worker_flush(worker);
+	worker->attachment_cnt = saved_cnt;
+
+	mutex_unlock(&worker->mutex);
+
+	return attach.ret;
+}
+
 /* Caller should have device mutex */
 bool vhost_dev_has_owner(struct vhost_dev *dev)
 {
@@ -626,7 +713,7 @@ static void vhost_worker_destroy(struct vhost_dev *dev,
 
 	WARN_ON(!llist_empty(&worker->work_list));
 	xa_erase(&dev->worker_xa, worker->id);
-	vhost_task_stop(worker->vtsk);
+	worker->ops->stop(worker);
 	kfree(worker);
 }
 
@@ -649,42 +736,115 @@ static void vhost_workers_free(struct vhost_dev *dev)
 	xa_destroy(&dev->worker_xa);
 }
 
+static void vhost_task_wakeup(struct vhost_worker *worker)
+{
+	return vhost_task_wake(worker->vtsk);
+}
+
+static void vhost_kthread_wakeup(struct vhost_worker *worker)
+{
+	wake_up_process(worker->kthread_task);
+}
+
+static void vhost_task_do_stop(struct vhost_worker *worker)
+{
+	return vhost_task_stop(worker->vtsk);
+}
+
+static void vhost_kthread_do_stop(struct vhost_worker *worker)
+{
+	kthread_stop(worker->kthread_task);
+}
+
+static int vhost_task_worker_create(struct vhost_worker *worker,
+				    struct vhost_dev *dev, const char *name)
+{
+	struct vhost_task *vtsk;
+	u32 id;
+	int ret;
+
+	vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed,
+				 worker, name);
+	if (IS_ERR(vtsk))
+		return PTR_ERR(vtsk);
+
+	worker->vtsk = vtsk;
+	vhost_task_start(vtsk);
+	ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL);
+	if (ret < 0) {
+		vhost_task_do_stop(worker);
+		return ret;
+	}
+	worker->id = id;
+	return 0;
+}
+
+static int vhost_kthread_worker_create(struct vhost_worker *worker,
+				       struct vhost_dev *dev,
+				       const char *name)
+{
+	struct task_struct *task;
+	u32 id;
+	int ret;
+
+	task = kthread_create(vhost_run_work_kthread_list, worker, "%s", name);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+
+	worker->kthread_task = task;
+	wake_up_process(task);
+	ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL);
+	if (ret < 0)
+		goto stop_worker;
+
+	ret = vhost_attach_task_to_cgroups(worker);
+	if (ret)
+		goto stop_worker;
+
+	worker->id = id;
+	return 0;
+
+stop_worker:
+	vhost_kthread_do_stop(worker);
+	return ret;
+}
+
+static const struct vhost_worker_ops kthread_ops = {
+	.create = vhost_kthread_worker_create,
+	.stop = vhost_kthread_do_stop,
+	.wakeup = vhost_kthread_wakeup,
+};
+
+static const struct vhost_worker_ops vhost_task_ops = {
+	.create = vhost_task_worker_create,
+	.stop = vhost_task_do_stop,
+	.wakeup = vhost_task_wakeup,
+};
+
 static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
 {
 	struct vhost_worker *worker;
-	struct vhost_task *vtsk;
 	char name[TASK_COMM_LEN];
 	int ret;
-	u32 id;
+	const struct vhost_worker_ops *ops = dev->fork_owner ? &vhost_task_ops :
+							       &kthread_ops;
 
 	worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT);
 	if (!worker)
 		return NULL;
 
 	worker->dev = dev;
+	worker->ops = ops;
 	snprintf(name, sizeof(name), "vhost-%d", current->pid);
 
-	vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed,
-				 worker, name);
-	if (IS_ERR(vtsk))
-		goto free_worker;
-
 	mutex_init(&worker->mutex);
 	init_llist_head(&worker->work_list);
 	worker->kcov_handle = kcov_common_handle();
-	worker->vtsk = vtsk;
-
-	vhost_task_start(vtsk);
-
-	ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL);
+	ret = ops->create(worker, dev, name);
 	if (ret < 0)
-		goto stop_worker;
-	worker->id = id;
+		goto free_worker;
 
 	return worker;
 
-stop_worker:
-	vhost_task_stop(vtsk);
 free_worker:
 	kfree(worker);
 	return NULL;
@@ -865,6 +1025,14 @@ long vhost_worker_ioctl(struct vhost_dev *dev, unsigned int ioctl,
 	switch (ioctl) {
 	/* dev worker ioctls */
 	case VHOST_NEW_WORKER:
+		/*
+		 * vhost_tasks will account for worker threads under the
+		 * parent's NPROC value but kthreads do not. To avoid
+		 * userspace overflowing the system with worker threads,
+		 * fork_owner must be true.
+		 */
+		if (!dev->fork_owner)
+			return -EFAULT;
+
 		ret = vhost_new_worker(dev, &state);
 		if (!ret && copy_to_user(argp, &state, sizeof(state)))
 			ret = -EFAULT;
@@ -982,6 +1150,7 @@ void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_iotlb *umem)
 
 	vhost_dev_cleanup(dev);
 
+	dev->fork_owner = fork_from_owner_default;
 	dev->umem = umem;
 	/* We don't need VQ locks below since vhost_dev_cleanup makes sure
 	 * VQs aren't running.
@@ -2135,6 +2304,45 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
 		goto done;
 	}
 
+#ifdef CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL
+	if (ioctl == VHOST_SET_FORK_FROM_OWNER) {
+		u8 fork_owner_val;
+
+		/* Only allow modification before owner is set */
+		if (vhost_dev_has_owner(d)) {
+			r = -EBUSY;
+			goto done;
+		}
+		if (get_user(fork_owner_val, (u8 __user *)argp)) {
+			r = -EFAULT;
+			goto done;
+		}
+		if (fork_owner_val != VHOST_FORK_OWNER_TASK &&
+		    fork_owner_val != VHOST_FORK_OWNER_KTHREAD) {
+			r = -EINVAL;
+			goto done;
+		}
+		d->fork_owner = !!fork_owner_val;
+		r = 0;
+		goto done;
+	}
+	if (ioctl == VHOST_GET_FORK_FROM_OWNER) {
+		u8 fork_owner_val = d->fork_owner;
+
+		if (fork_owner_val != VHOST_FORK_OWNER_TASK &&
+		    fork_owner_val != VHOST_FORK_OWNER_KTHREAD) {
+			r = -EINVAL;
+			goto done;
+		}
+		if (put_user(fork_owner_val, (u8 __user *)argp)) {
+			r = -EFAULT;
+			goto done;
+		}
+		r = 0;
+		goto done;
+	}
+#endif
+
 	/* You must be the owner to do anything else */
 	r = vhost_dev_check_owner(d);
 	if (r)
@@ -2304,6 +2512,19 @@ static int log_used(struct vhost_virtqueue *vq, u64 used_offset, u64 len)
 	return 0;
 }
 
+/*
+ * vhost_log_write() - Log dirty pages in the dirty page bitmap
+ * @vq: vhost virtqueue.
+ * @log: Array of dirty memory in GPA.
+ * @log_num: Size of the vhost_log array.
+ * @len: The total length of memory buffer to log in the dirty bitmap.
+ *	 Some drivers may only partially use pages shared via the last
+ *	 vring descriptor (i.e. vhost-net RX buffer). Use (len == U64_MAX)
+ *	 to indicate that the driver would log all pages of the vring descriptors.
+ * @iov: Array of dirty memory in HVA.
+ * @count: Size of the iovec array.
+ */
 int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
 		    unsigned int log_num, u64 len, struct iovec *iov, int count)
 {
@@ -2327,15 +2548,14 @@ int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
 		r = log_write(vq->log_base, log[i].addr, l);
 		if (r < 0)
 			return r;
-		len -= l;
-		if (!len) {
-			if (vq->log_ctx)
-				eventfd_signal(vq->log_ctx);
-			return 0;
-		}
+
+		if (len != U64_MAX)
+			len -= l;
 	}
-	/* Length written exceeds what we have stored. This is a bug. */
-	BUG();
+
+	if (vq->log_ctx)
+		eventfd_signal(vq->log_ctx);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(vhost_log_write);
@@ -2763,6 +2983,9 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
 	}
 
 	r = __vhost_add_used_n(vq, heads, count);
+	if (r < 0)
+		return r;
+
 	/* Make sure buffer is written before we update index. */
 	smp_wmb();
 	if (vhost_put_used_idx(vq)) {
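Since module_param(..., 0444) publishes the knob read-only under sysfs, userspace can also discover the system-wide default without opening a device. A small sketch, assuming the standard /sys/module/<mod>/parameters/<name> convention (bool parameters read back as "Y"/"N"; the helper name is illustrative):

	#include <stdio.h>

	/* Returns 1 for vhost_task mode, 0 for kthread mode, -1 if the
	 * parameter is absent (module not loaded, or kernel built without
	 * CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL). */
	int vhost_default_is_task_mode(void)
	{
		FILE *f = fopen("/sys/module/vhost/parameters/fork_from_owner_default", "r");
		int c;

		if (!f)
			return -1;
		c = fgetc(f);
		fclose(f);
		return c == 'Y';
	}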
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index bb75a292d50c..ab704d84fb34 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -26,7 +26,18 @@ struct vhost_work {
 	unsigned long		flags;
 };
 
+struct vhost_worker;
+struct vhost_dev;
+
+struct vhost_worker_ops {
+	int (*create)(struct vhost_worker *worker, struct vhost_dev *dev,
+		      const char *name);
+	void (*stop)(struct vhost_worker *worker);
+	void (*wakeup)(struct vhost_worker *worker);
+};
+
 struct vhost_worker {
+	struct task_struct *kthread_task;
 	struct vhost_task	*vtsk;
 	struct vhost_dev	*dev;
 	/* Used to serialize device wide flushing with worker swapping. */
@@ -36,6 +47,7 @@ struct vhost_worker {
 	u32			id;
 	int			attachment_cnt;
 	bool			killed;
+	const struct vhost_worker_ops *ops;
 };
 
 /* Poll a file (eventfd or socket) */
@@ -176,6 +188,16 @@ struct vhost_dev {
 	int byte_weight;
 	struct xarray worker_xa;
 	bool use_worker;
+	/*
+	 * If fork_owner is true we use vhost_tasks to create
+	 * the worker so all settings/limits like cgroups, NPROC,
+	 * scheduler, etc. are inherited from the owner. If false,
+	 * we use kthreads and only attach to the same cgroups
+	 * as the owner for compat with older kernels.
+	 * The default value is set by the fork_from_owner_default
+	 * module parameter.
+	 */
+	bool fork_owner;
 	int (*msg_handler)(struct vhost_dev *dev, u32 asid,
 			   struct vhost_iotlb_msg *msg);
 };
diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c
index 73e153f9b449..bbce65452701 100644
--- a/drivers/vhost/vringh.c
+++ b/drivers/vhost/vringh.c
@@ -225,10 +225,9 @@ static int resize_iovec(struct vringh_kiov *iov, gfp_t gfp)
 
 	flag = (iov->max_num & VRINGH_IOV_ALLOCATED);
 	if (flag)
-		new = krealloc_array(iov->iov, new_num,
-				     sizeof(struct iovec), gfp);
+		new = krealloc_array(iov->iov, new_num, sizeof(*new), gfp);
 	else {
-		new = kmalloc_array(new_num, sizeof(struct iovec), gfp);
+		new = kmalloc_array(new_num, sizeof(*new), gfp);
 		if (new) {
 			memcpy(new, iov->iov,
 			       iov->max_num * sizeof(struct iovec));
@@ -1291,11 +1290,10 @@ static inline int getu16_iotlb(const struct vringh *vrh,
 		if (ret)
 			return ret;
 	} else {
-		void *kaddr = kmap_local_page(ivec.iov.bvec[0].bv_page);
-		void *from = kaddr + ivec.iov.bvec[0].bv_offset;
+		__virtio16 *from = bvec_kmap_local(&ivec.iov.bvec[0]);
 
-		tmp = READ_ONCE(*(__virtio16 *)from);
-		kunmap_local(kaddr);
+		tmp = READ_ONCE(*from);
+		kunmap_local(from);
 	}
 
 	*val = vringh16_to_cpu(vrh, tmp);
@@ -1330,11 +1328,10 @@ static inline int putu16_iotlb(const struct vringh *vrh,
 		if (ret)
 			return ret;
 	} else {
-		void *kaddr = kmap_local_page(ivec.iov.bvec[0].bv_page);
-		void *to = kaddr + ivec.iov.bvec[0].bv_offset;
+		__virtio16 *to = bvec_kmap_local(&ivec.iov.bvec[0]);
 
-		WRITE_ONCE(*(__virtio16 *)to, tmp);
-		kunmap_local(kaddr);
+		WRITE_ONCE(*to, tmp);
+		kunmap_local(to);
 	}
 
 	return 0;
diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 802153e23073..66a0f060770e 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -344,6 +344,9 @@ vhost_vsock_alloc_skb(struct vhost_virtqueue *vq,
 
 	len = iov_length(vq->iov, out);
 
+	if (len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE + VIRTIO_VSOCK_SKB_HEADROOM)
+		return NULL;
+
 	/* len contains both payload and hdr */
 	skb = virtio_vsock_alloc_skb(len, GFP_KERNEL);
 	if (!skb)
@@ -367,8 +370,7 @@ vhost_vsock_alloc_skb(struct vhost_virtqueue *vq,
 		return skb;
 
 	/* The pkt is too big or the length in the header is invalid */
-	if (payload_len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE ||
-	    payload_len + sizeof(*hdr) > len) {
+	if (payload_len + sizeof(*hdr) > len) {
 		kfree_skb(skb);
 		return NULL;
 	}