diff options
Diffstat (limited to 'drivers/vhost')
-rw-r--r-- | drivers/vhost/Kconfig | 12 | ||||
-rw-r--r-- | drivers/vhost/Kconfig.tcm | 7 | ||||
-rw-r--r-- | drivers/vhost/Makefile | 3 | ||||
-rw-r--r-- | drivers/vhost/net.c | 322 | ||||
-rw-r--r-- | drivers/vhost/scsi.c (renamed from drivers/vhost/tcm_vhost.c) | 707 | ||||
-rw-r--r-- | drivers/vhost/tcm_vhost.h | 115 | ||||
-rw-r--r-- | drivers/vhost/test.c | 9 | ||||
-rw-r--r-- | drivers/vhost/vhost.c | 156 | ||||
-rw-r--r-- | drivers/vhost/vhost.h | 32 |
9 files changed, 891 insertions, 472 deletions
diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig index 85b773a93a5d..8b9226da3f54 100644 --- a/drivers/vhost/Kconfig +++ b/drivers/vhost/Kconfig @@ -10,13 +10,17 @@ config VHOST_NET To compile this driver as a module, choose M here: the module will be called vhost_net. -if STAGING -source "drivers/vhost/Kconfig.tcm" -endif +config VHOST_SCSI + tristate "VHOST_SCSI TCM fabric driver" + depends on TARGET_CORE && EVENTFD && m + select VHOST_RING + default n + ---help--- + Say M here to enable the vhost_scsi TCM fabric module + for use with virtio-scsi guests config VHOST_RING tristate ---help--- This option is selected by any driver which needs to access the host side of a virtio ring. - diff --git a/drivers/vhost/Kconfig.tcm b/drivers/vhost/Kconfig.tcm deleted file mode 100644 index c3a8cfa1de72..000000000000 --- a/drivers/vhost/Kconfig.tcm +++ /dev/null @@ -1,7 +0,0 @@ -config TCM_VHOST - tristate "TCM_VHOST fabric module" - depends on TARGET_CORE && EVENTFD && m - select VHOST_RING - default n - ---help--- - Say M here to enable the TCM_VHOST fabric module for use with virtio-scsi guests diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile index 1d37f5e12be6..654e9afb11f5 100644 --- a/drivers/vhost/Makefile +++ b/drivers/vhost/Makefile @@ -1,6 +1,7 @@ obj-$(CONFIG_VHOST_NET) += vhost_net.o vhost_net-y := vhost.o net.o -obj-$(CONFIG_TCM_VHOST) += tcm_vhost.o +obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o +vhost_scsi-y := scsi.o obj-$(CONFIG_VHOST_RING) += vringh.o diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 959b1cd89e6a..a3645bd163d8 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -64,20 +64,36 @@ enum { VHOST_NET_VQ_MAX = 2, }; -enum vhost_net_poll_state { - VHOST_NET_POLL_DISABLED = 0, - VHOST_NET_POLL_STARTED = 1, - VHOST_NET_POLL_STOPPED = 2, +struct vhost_ubuf_ref { + struct kref kref; + wait_queue_head_t wait; + struct vhost_virtqueue *vq; +}; + +struct vhost_net_virtqueue { + struct vhost_virtqueue vq; + /* hdr is used to store the virtio header. + * Since each iovec has >= 1 byte length, we never need more than + * header length entries to store the header. */ + struct iovec hdr[sizeof(struct virtio_net_hdr_mrg_rxbuf)]; + size_t vhost_hlen; + size_t sock_hlen; + /* vhost zerocopy support fields below: */ + /* last used idx for outstanding DMA zerocopy buffers */ + int upend_idx; + /* first used idx for DMA done zerocopy buffers */ + int done_idx; + /* an array of userspace buffers info */ + struct ubuf_info *ubuf_info; + /* Reference counting for outstanding ubufs. + * Protected by vq mutex. Writers must also take device mutex. */ + struct vhost_ubuf_ref *ubufs; }; struct vhost_net { struct vhost_dev dev; - struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX]; + struct vhost_net_virtqueue vqs[VHOST_NET_VQ_MAX]; struct vhost_poll poll[VHOST_NET_VQ_MAX]; - /* Tells us whether we are polling a socket for TX. - * We only do this when socket buffer fills up. - * Protected by tx vq lock. */ - enum vhost_net_poll_state tx_poll_state; /* Number of TX recently submitted. * Protected by tx vq lock. */ unsigned tx_packets; @@ -88,6 +104,90 @@ struct vhost_net { bool tx_flush; }; +static unsigned vhost_zcopy_mask __read_mostly; + +void vhost_enable_zcopy(int vq) +{ + vhost_zcopy_mask |= 0x1 << vq; +} + +static void vhost_zerocopy_done_signal(struct kref *kref) +{ + struct vhost_ubuf_ref *ubufs = container_of(kref, struct vhost_ubuf_ref, + kref); + wake_up(&ubufs->wait); +} + +struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *vq, + bool zcopy) +{ + struct vhost_ubuf_ref *ubufs; + /* No zero copy backend? Nothing to count. */ + if (!zcopy) + return NULL; + ubufs = kmalloc(sizeof(*ubufs), GFP_KERNEL); + if (!ubufs) + return ERR_PTR(-ENOMEM); + kref_init(&ubufs->kref); + init_waitqueue_head(&ubufs->wait); + ubufs->vq = vq; + return ubufs; +} + +void vhost_ubuf_put(struct vhost_ubuf_ref *ubufs) +{ + kref_put(&ubufs->kref, vhost_zerocopy_done_signal); +} + +void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *ubufs) +{ + kref_put(&ubufs->kref, vhost_zerocopy_done_signal); + wait_event(ubufs->wait, !atomic_read(&ubufs->kref.refcount)); + kfree(ubufs); +} + +int vhost_net_set_ubuf_info(struct vhost_net *n) +{ + bool zcopy; + int i; + + for (i = 0; i < n->dev.nvqs; ++i) { + zcopy = vhost_zcopy_mask & (0x1 << i); + if (!zcopy) + continue; + n->vqs[i].ubuf_info = kmalloc(sizeof(*n->vqs[i].ubuf_info) * + UIO_MAXIOV, GFP_KERNEL); + if (!n->vqs[i].ubuf_info) + goto err; + } + return 0; + +err: + while (i--) { + zcopy = vhost_zcopy_mask & (0x1 << i); + if (!zcopy) + continue; + kfree(n->vqs[i].ubuf_info); + } + return -ENOMEM; +} + +void vhost_net_vq_reset(struct vhost_net *n) +{ + int i; + + for (i = 0; i < VHOST_NET_VQ_MAX; i++) { + n->vqs[i].done_idx = 0; + n->vqs[i].upend_idx = 0; + n->vqs[i].ubufs = NULL; + kfree(n->vqs[i].ubuf_info); + n->vqs[i].ubuf_info = NULL; + n->vqs[i].vhost_hlen = 0; + n->vqs[i].sock_hlen = 0; + } + +} + static void vhost_net_tx_packet(struct vhost_net *net) { ++net->tx_packets; @@ -155,28 +255,6 @@ static void copy_iovec_hdr(const struct iovec *from, struct iovec *to, } } -/* Caller must have TX VQ lock */ -static void tx_poll_stop(struct vhost_net *net) -{ - if (likely(net->tx_poll_state != VHOST_NET_POLL_STARTED)) - return; - vhost_poll_stop(net->poll + VHOST_NET_VQ_TX); - net->tx_poll_state = VHOST_NET_POLL_STOPPED; -} - -/* Caller must have TX VQ lock */ -static int tx_poll_start(struct vhost_net *net, struct socket *sock) -{ - int ret; - - if (unlikely(net->tx_poll_state != VHOST_NET_POLL_STOPPED)) - return 0; - ret = vhost_poll_start(net->poll + VHOST_NET_VQ_TX, sock->file); - if (!ret) - net->tx_poll_state = VHOST_NET_POLL_STARTED; - return ret; -} - /* In case of DMA done not in order in lower device driver for some reason. * upend_idx is used to track end of used idx, done_idx is used to track head * of used idx. Once lower device DMA done contiguously, we will signal KVM @@ -185,10 +263,12 @@ static int tx_poll_start(struct vhost_net *net, struct socket *sock) static int vhost_zerocopy_signal_used(struct vhost_net *net, struct vhost_virtqueue *vq) { + struct vhost_net_virtqueue *nvq = + container_of(vq, struct vhost_net_virtqueue, vq); int i; int j = 0; - for (i = vq->done_idx; i != vq->upend_idx; i = (i + 1) % UIO_MAXIOV) { + for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) { if (vq->heads[i].len == VHOST_DMA_FAILED_LEN) vhost_net_tx_err(net); if (VHOST_DMA_IS_DONE(vq->heads[i].len)) { @@ -200,7 +280,7 @@ static int vhost_zerocopy_signal_used(struct vhost_net *net, break; } if (j) - vq->done_idx = i; + nvq->done_idx = i; return j; } @@ -230,7 +310,8 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success) * read-size critical section for our kind of RCU. */ static void handle_tx(struct vhost_net *net) { - struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX]; + struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; + struct vhost_virtqueue *vq = &nvq->vq; unsigned out, in, s; int head; struct msghdr msg = { @@ -242,7 +323,7 @@ static void handle_tx(struct vhost_net *net) .msg_flags = MSG_DONTWAIT, }; size_t len, total_len = 0; - int err, wmem; + int err; size_t hdr_size; struct socket *sock; struct vhost_ubuf_ref *uninitialized_var(ubufs); @@ -253,21 +334,11 @@ static void handle_tx(struct vhost_net *net) if (!sock) return; - wmem = atomic_read(&sock->sk->sk_wmem_alloc); - if (wmem >= sock->sk->sk_sndbuf) { - mutex_lock(&vq->mutex); - tx_poll_start(net, sock); - mutex_unlock(&vq->mutex); - return; - } - mutex_lock(&vq->mutex); vhost_disable_notify(&net->dev, vq); - if (wmem < sock->sk->sk_sndbuf / 2) - tx_poll_stop(net); - hdr_size = vq->vhost_hlen; - zcopy = vq->ubufs; + hdr_size = nvq->vhost_hlen; + zcopy = nvq->ubufs; for (;;) { /* Release DMAs done buffers first */ @@ -285,23 +356,15 @@ static void handle_tx(struct vhost_net *net) if (head == vq->num) { int num_pends; - wmem = atomic_read(&sock->sk->sk_wmem_alloc); - if (wmem >= sock->sk->sk_sndbuf * 3 / 4) { - tx_poll_start(net, sock); - set_bit(SOCK_ASYNC_NOSPACE, &sock->flags); - break; - } /* If more outstanding DMAs, queue the work. * Handle upend_idx wrap around */ - num_pends = likely(vq->upend_idx >= vq->done_idx) ? - (vq->upend_idx - vq->done_idx) : - (vq->upend_idx + UIO_MAXIOV - vq->done_idx); - if (unlikely(num_pends > VHOST_MAX_PEND)) { - tx_poll_start(net, sock); - set_bit(SOCK_ASYNC_NOSPACE, &sock->flags); + num_pends = likely(nvq->upend_idx >= nvq->done_idx) ? + (nvq->upend_idx - nvq->done_idx) : + (nvq->upend_idx + UIO_MAXIOV - + nvq->done_idx); + if (unlikely(num_pends > VHOST_MAX_PEND)) break; - } if (unlikely(vhost_enable_notify(&net->dev, vq))) { vhost_disable_notify(&net->dev, vq); continue; @@ -314,44 +377,45 @@ static void handle_tx(struct vhost_net *net) break; } /* Skip header. TODO: support TSO. */ - s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out); + s = move_iovec_hdr(vq->iov, nvq->hdr, hdr_size, out); msg.msg_iovlen = out; len = iov_length(vq->iov, out); /* Sanity check */ if (!len) { vq_err(vq, "Unexpected header len for TX: " "%zd expected %zd\n", - iov_length(vq->hdr, s), hdr_size); + iov_length(nvq->hdr, s), hdr_size); break; } zcopy_used = zcopy && (len >= VHOST_GOODCOPY_LEN || - vq->upend_idx != vq->done_idx); + nvq->upend_idx != nvq->done_idx); /* use msg_control to pass vhost zerocopy ubuf info to skb */ if (zcopy_used) { - vq->heads[vq->upend_idx].id = head; + vq->heads[nvq->upend_idx].id = head; if (!vhost_net_tx_select_zcopy(net) || len < VHOST_GOODCOPY_LEN) { /* copy don't need to wait for DMA done */ - vq->heads[vq->upend_idx].len = + vq->heads[nvq->upend_idx].len = VHOST_DMA_DONE_LEN; msg.msg_control = NULL; msg.msg_controllen = 0; ubufs = NULL; } else { - struct ubuf_info *ubuf = &vq->ubuf_info[head]; + struct ubuf_info *ubuf; + ubuf = nvq->ubuf_info + nvq->upend_idx; - vq->heads[vq->upend_idx].len = + vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS; ubuf->callback = vhost_zerocopy_callback; - ubuf->ctx = vq->ubufs; - ubuf->desc = vq->upend_idx; + ubuf->ctx = nvq->ubufs; + ubuf->desc = nvq->upend_idx; msg.msg_control = ubuf; msg.msg_controllen = sizeof(ubuf); - ubufs = vq->ubufs; + ubufs = nvq->ubufs; kref_get(&ubufs->kref); } - vq->upend_idx = (vq->upend_idx + 1) % UIO_MAXIOV; + nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV; } /* TODO: Check specific error and bomb out unless ENOBUFS? */ err = sock->ops->sendmsg(NULL, sock, &msg, len); @@ -359,12 +423,10 @@ static void handle_tx(struct vhost_net *net) if (zcopy_used) { if (ubufs) vhost_ubuf_put(ubufs); - vq->upend_idx = ((unsigned)vq->upend_idx - 1) % - UIO_MAXIOV; + nvq->upend_idx = ((unsigned)nvq->upend_idx - 1) + % UIO_MAXIOV; } vhost_discard_vq_desc(vq, 1); - if (err == -EAGAIN || err == -ENOBUFS) - tx_poll_start(net, sock); break; } if (err != len) @@ -469,7 +531,8 @@ err: * read-size critical section for our kind of RCU. */ static void handle_rx(struct vhost_net *net) { - struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX]; + struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX]; + struct vhost_virtqueue *vq = &nvq->vq; unsigned uninitialized_var(in), log; struct vhost_log *vq_log; struct msghdr msg = { @@ -497,8 +560,8 @@ static void handle_rx(struct vhost_net *net) mutex_lock(&vq->mutex); vhost_disable_notify(&net->dev, vq); - vhost_hlen = vq->vhost_hlen; - sock_hlen = vq->sock_hlen; + vhost_hlen = nvq->vhost_hlen; + sock_hlen = nvq->sock_hlen; vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ? vq->log : NULL; @@ -528,11 +591,11 @@ static void handle_rx(struct vhost_net *net) /* We don't need to be notified again. */ if (unlikely((vhost_hlen))) /* Skip header. TODO: support TSO. */ - move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in); + move_iovec_hdr(vq->iov, nvq->hdr, vhost_hlen, in); else /* Copy the header for use in VIRTIO_NET_F_MRG_RXBUF: * needed because recvmsg can modify msg_iov. */ - copy_iovec_hdr(vq->iov, vq->hdr, sock_hlen, in); + copy_iovec_hdr(vq->iov, nvq->hdr, sock_hlen, in); msg.msg_iovlen = in; err = sock->ops->recvmsg(NULL, sock, &msg, sock_len, MSG_DONTWAIT | MSG_TRUNC); @@ -546,7 +609,7 @@ static void handle_rx(struct vhost_net *net) continue; } if (unlikely(vhost_hlen) && - memcpy_toiovecend(vq->hdr, (unsigned char *)&hdr, 0, + memcpy_toiovecend(nvq->hdr, (unsigned char *)&hdr, 0, vhost_hlen)) { vq_err(vq, "Unable to write vnet_hdr at addr %p\n", vq->iov->iov_base); @@ -554,7 +617,7 @@ static void handle_rx(struct vhost_net *net) } /* TODO: Should check and handle checksum. */ if (likely(mergeable) && - memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount, + memcpy_toiovecend(nvq->hdr, (unsigned char *)&headcount, offsetof(typeof(hdr), num_buffers), sizeof hdr.num_buffers)) { vq_err(vq, "Failed num_buffers write"); @@ -611,23 +674,39 @@ static int vhost_net_open(struct inode *inode, struct file *f) { struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL); struct vhost_dev *dev; - int r; + struct vhost_virtqueue **vqs; + int r, i; if (!n) return -ENOMEM; + vqs = kmalloc(VHOST_NET_VQ_MAX * sizeof(*vqs), GFP_KERNEL); + if (!vqs) { + kfree(n); + return -ENOMEM; + } dev = &n->dev; - n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick; - n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick; - r = vhost_dev_init(dev, n->vqs, VHOST_NET_VQ_MAX); + vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq; + vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq; + n->vqs[VHOST_NET_VQ_TX].vq.handle_kick = handle_tx_kick; + n->vqs[VHOST_NET_VQ_RX].vq.handle_kick = handle_rx_kick; + for (i = 0; i < VHOST_NET_VQ_MAX; i++) { + n->vqs[i].ubufs = NULL; + n->vqs[i].ubuf_info = NULL; + n->vqs[i].upend_idx = 0; + n->vqs[i].done_idx = 0; + n->vqs[i].vhost_hlen = 0; + n->vqs[i].sock_hlen = 0; + } + r = vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX); if (r < 0) { kfree(n); + kfree(vqs); return r; } vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev); vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev); - n->tx_poll_state = VHOST_NET_POLL_DISABLED; f->private_data = n; @@ -637,32 +716,28 @@ static int vhost_net_open(struct inode *inode, struct file *f) static void vhost_net_disable_vq(struct vhost_net *n, struct vhost_virtqueue *vq) { + struct vhost_net_virtqueue *nvq = + container_of(vq, struct vhost_net_virtqueue, vq); + struct vhost_poll *poll = n->poll + (nvq - n->vqs); if (!vq->private_data) return; - if (vq == n->vqs + VHOST_NET_VQ_TX) { - tx_poll_stop(n); - n->tx_poll_state = VHOST_NET_POLL_DISABLED; - } else - vhost_poll_stop(n->poll + VHOST_NET_VQ_RX); + vhost_poll_stop(poll); } static int vhost_net_enable_vq(struct vhost_net *n, struct vhost_virtqueue *vq) { + struct vhost_net_virtqueue *nvq = + container_of(vq, struct vhost_net_virtqueue, vq); + struct vhost_poll *poll = n->poll + (nvq - n->vqs); struct socket *sock; - int ret; sock = rcu_dereference_protected(vq->private_data, lockdep_is_held(&vq->mutex)); if (!sock) return 0; - if (vq == n->vqs + VHOST_NET_VQ_TX) { - n->tx_poll_state = VHOST_NET_POLL_STOPPED; - ret = tx_poll_start(n, sock); - } else - ret = vhost_poll_start(n->poll + VHOST_NET_VQ_RX, sock->file); - return ret; + return vhost_poll_start(poll, sock->file); } static struct socket *vhost_net_stop_vq(struct vhost_net *n, @@ -682,30 +757,30 @@ static struct socket *vhost_net_stop_vq(struct vhost_net *n, static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock, struct socket **rx_sock) { - *tx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_TX); - *rx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_RX); + *tx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_TX].vq); + *rx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_RX].vq); } static void vhost_net_flush_vq(struct vhost_net *n, int index) { vhost_poll_flush(n->poll + index); - vhost_poll_flush(&n->dev.vqs[index].poll); + vhost_poll_flush(&n->vqs[index].vq.poll); } static void vhost_net_flush(struct vhost_net *n) { vhost_net_flush_vq(n, VHOST_NET_VQ_TX); vhost_net_flush_vq(n, VHOST_NET_VQ_RX); - if (n->dev.vqs[VHOST_NET_VQ_TX].ubufs) { - mutex_lock(&n->dev.vqs[VHOST_NET_VQ_TX].mutex); + if (n->vqs[VHOST_NET_VQ_TX].ubufs) { + mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); n->tx_flush = true; - mutex_unlock(&n->dev.vqs[VHOST_NET_VQ_TX].mutex); + mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); /* Wait for all lower device DMAs done. */ - vhost_ubuf_put_and_wait(n->dev.vqs[VHOST_NET_VQ_TX].ubufs); - mutex_lock(&n->dev.vqs[VHOST_NET_VQ_TX].mutex); + vhost_ubuf_put_and_wait(n->vqs[VHOST_NET_VQ_TX].ubufs); + mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); n->tx_flush = false; - kref_init(&n->dev.vqs[VHOST_NET_VQ_TX].ubufs->kref); - mutex_unlock(&n->dev.vqs[VHOST_NET_VQ_TX].mutex); + kref_init(&n->vqs[VHOST_NET_VQ_TX].ubufs->kref); + mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); } } @@ -719,6 +794,7 @@ static int vhost_net_release(struct inode *inode, struct file *f) vhost_net_flush(n); vhost_dev_stop(&n->dev); vhost_dev_cleanup(&n->dev, false); + vhost_net_vq_reset(n); if (tx_sock) fput(tx_sock->file); if (rx_sock) @@ -726,6 +802,7 @@ static int vhost_net_release(struct inode *inode, struct file *f) /* We do an extra flush before freeing memory, * since jobs can re-queue themselves. */ vhost_net_flush(n); + kfree(n->dev.vqs); kfree(n); return 0; } @@ -799,6 +876,7 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) { struct socket *sock, *oldsock; struct vhost_virtqueue *vq; + struct vhost_net_virtqueue *nvq; struct vhost_ubuf_ref *ubufs, *oldubufs = NULL; int r; @@ -811,7 +889,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) r = -ENOBUFS; goto err; } - vq = n->vqs + index; + vq = &n->vqs[index].vq; + nvq = &n->vqs[index]; mutex_lock(&vq->mutex); /* Verify that ring has been setup correctly. */ @@ -844,8 +923,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) if (r) goto err_used; - oldubufs = vq->ubufs; - vq->ubufs = ubufs; + oldubufs = nvq->ubufs; + nvq->ubufs = ubufs; n->tx_packets = 0; n->tx_zcopy_err = 0; @@ -888,14 +967,21 @@ static long vhost_net_reset_owner(struct vhost_net *n) struct socket *tx_sock = NULL; struct socket *rx_sock = NULL; long err; + struct vhost_memory *memory; mutex_lock(&n->dev.mutex); err = vhost_dev_check_owner(&n->dev); if (err) goto done; + memory = vhost_dev_reset_owner_prepare(); + if (!memory) { + err = -ENOMEM; + goto done; + } vhost_net_stop(n, &tx_sock, &rx_sock); vhost_net_flush(n); - err = vhost_dev_reset_owner(&n->dev); + vhost_dev_reset_owner(&n->dev, memory); + vhost_net_vq_reset(n); done: mutex_unlock(&n->dev.mutex); if (tx_sock) @@ -931,10 +1017,10 @@ static int vhost_net_set_features(struct vhost_net *n, u64 features) n->dev.acked_features = features; smp_wmb(); for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { - mutex_lock(&n->vqs[i].mutex); + mutex_lock(&n->vqs[i].vq.mutex); n->vqs[i].vhost_hlen = vhost_hlen; n->vqs[i].sock_hlen = sock_hlen; - mutex_unlock(&n->vqs[i].mutex); + mutex_unlock(&n->vqs[i].vq.mutex); } vhost_net_flush(n); mutex_unlock(&n->dev.mutex); @@ -971,11 +1057,17 @@ static long vhost_net_ioctl(struct file *f, unsigned int ioctl, return vhost_net_reset_owner(n); default: mutex_lock(&n->dev.mutex); + if (ioctl == VHOST_SET_OWNER) { + r = vhost_net_set_ubuf_info(n); + if (r) + goto out; + } r = vhost_dev_ioctl(&n->dev, ioctl, argp); if (r == -ENOIOCTLCMD) r = vhost_vring_ioctl(&n->dev, ioctl, argp); else vhost_net_flush(n); +out: mutex_unlock(&n->dev.mutex); return r; } diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/scsi.c index 9951297b2427..5179f7aa1b0b 100644 --- a/drivers/vhost/tcm_vhost.c +++ b/drivers/vhost/scsi.c @@ -45,14 +45,116 @@ #include <target/target_core_configfs.h> #include <target/configfs_macros.h> #include <linux/vhost.h> -#include <linux/virtio_net.h> /* TODO vhost.h currently depends on this */ #include <linux/virtio_scsi.h> #include <linux/llist.h> #include <linux/bitmap.h> #include "vhost.c" #include "vhost.h" -#include "tcm_vhost.h" + +#define TCM_VHOST_VERSION "v0.1" +#define TCM_VHOST_NAMELEN 256 +#define TCM_VHOST_MAX_CDB_SIZE 32 + +struct vhost_scsi_inflight { + /* Wait for the flush operation to finish */ + struct completion comp; + /* Refcount for the inflight reqs */ + struct kref kref; +}; + +struct tcm_vhost_cmd { + /* Descriptor from vhost_get_vq_desc() for virt_queue segment */ + int tvc_vq_desc; + /* virtio-scsi initiator task attribute */ + int tvc_task_attr; + /* virtio-scsi initiator data direction */ + enum dma_data_direction tvc_data_direction; + /* Expected data transfer length from virtio-scsi header */ + u32 tvc_exp_data_len; + /* The Tag from include/linux/virtio_scsi.h:struct virtio_scsi_cmd_req */ + u64 tvc_tag; + /* The number of scatterlists associated with this cmd */ + u32 tvc_sgl_count; + /* Saved unpacked SCSI LUN for tcm_vhost_submission_work() */ + u32 tvc_lun; + /* Pointer to the SGL formatted memory from virtio-scsi */ + struct scatterlist *tvc_sgl; + /* Pointer to response */ + struct virtio_scsi_cmd_resp __user *tvc_resp; + /* Pointer to vhost_scsi for our device */ + struct vhost_scsi *tvc_vhost; + /* Pointer to vhost_virtqueue for the cmd */ + struct vhost_virtqueue *tvc_vq; + /* Pointer to vhost nexus memory */ + struct tcm_vhost_nexus *tvc_nexus; + /* The TCM I/O descriptor that is accessed via container_of() */ + struct se_cmd tvc_se_cmd; + /* work item used for cmwq dispatch to tcm_vhost_submission_work() */ + struct work_struct work; + /* Copy of the incoming SCSI command descriptor block (CDB) */ + unsigned char tvc_cdb[TCM_VHOST_MAX_CDB_SIZE]; + /* Sense buffer that will be mapped into outgoing status */ + unsigned char tvc_sense_buf[TRANSPORT_SENSE_BUFFER]; + /* Completed commands list, serviced from vhost worker thread */ + struct llist_node tvc_completion_list; + /* Used to track inflight cmd */ + struct vhost_scsi_inflight *inflight; +}; + +struct tcm_vhost_nexus { + /* Pointer to TCM session for I_T Nexus */ + struct se_session *tvn_se_sess; +}; + +struct tcm_vhost_nacl { + /* Binary World Wide unique Port Name for Vhost Initiator port */ + u64 iport_wwpn; + /* ASCII formatted WWPN for Sas Initiator port */ + char iport_name[TCM_VHOST_NAMELEN]; + /* Returned by tcm_vhost_make_nodeacl() */ + struct se_node_acl se_node_acl; +}; + +struct vhost_scsi; +struct tcm_vhost_tpg { + /* Vhost port target portal group tag for TCM */ + u16 tport_tpgt; + /* Used to track number of TPG Port/Lun Links wrt to explict I_T Nexus shutdown */ + int tv_tpg_port_count; + /* Used for vhost_scsi device reference to tpg_nexus, protected by tv_tpg_mutex */ + int tv_tpg_vhost_count; + /* list for tcm_vhost_list */ + struct list_head tv_tpg_list; + /* Used to protect access for tpg_nexus */ + struct mutex tv_tpg_mutex; + /* Pointer to the TCM VHost I_T Nexus for this TPG endpoint */ + struct tcm_vhost_nexus *tpg_nexus; + /* Pointer back to tcm_vhost_tport */ + struct tcm_vhost_tport *tport; + /* Returned by tcm_vhost_make_tpg() */ + struct se_portal_group se_tpg; + /* Pointer back to vhost_scsi, protected by tv_tpg_mutex */ + struct vhost_scsi *vhost_scsi; +}; + +struct tcm_vhost_tport { + /* SCSI protocol the tport is providing */ + u8 tport_proto_id; + /* Binary World Wide unique Port Name for Vhost Target port */ + u64 tport_wwpn; + /* ASCII formatted WWPN for Vhost Target port */ + char tport_name[TCM_VHOST_NAMELEN]; + /* Returned by tcm_vhost_make_tport() */ + struct se_wwn tport_wwn; +}; + +struct tcm_vhost_evt { + /* event to be sent to guest */ + struct virtio_scsi_event event; + /* event list, serviced from vhost worker thread */ + struct llist_node list; +}; enum { VHOST_SCSI_VQ_CTL = 0, @@ -60,20 +162,51 @@ enum { VHOST_SCSI_VQ_IO = 2, }; +/* + * VIRTIO_RING_F_EVENT_IDX seems broken. Not sure the bug is in + * kernel but disabling it helps. + * TODO: debug and remove the workaround. + */ +enum { + VHOST_SCSI_FEATURES = (VHOST_FEATURES & (~VIRTIO_RING_F_EVENT_IDX)) | + (1ULL << VIRTIO_SCSI_F_HOTPLUG) +}; + #define VHOST_SCSI_MAX_TARGET 256 #define VHOST_SCSI_MAX_VQ 128 +#define VHOST_SCSI_MAX_EVENT 128 + +struct vhost_scsi_virtqueue { + struct vhost_virtqueue vq; + /* + * Reference counting for inflight reqs, used for flush operation. At + * each time, one reference tracks new commands submitted, while we + * wait for another one to reach 0. + */ + struct vhost_scsi_inflight inflights[2]; + /* + * Indicate current inflight in use, protected by vq->mutex. + * Writers must also take dev mutex and flush under it. + */ + int inflight_idx; +}; struct vhost_scsi { /* Protected by vhost_scsi->dev.mutex */ - struct tcm_vhost_tpg *vs_tpg[VHOST_SCSI_MAX_TARGET]; + struct tcm_vhost_tpg **vs_tpg; char vs_vhost_wwpn[TRANSPORT_IQN_LEN]; - bool vs_endpoint; struct vhost_dev dev; - struct vhost_virtqueue vqs[VHOST_SCSI_MAX_VQ]; + struct vhost_scsi_virtqueue vqs[VHOST_SCSI_MAX_VQ]; struct vhost_work vs_completion_work; /* cmd completion work item */ struct llist_head vs_completion_list; /* cmd completion queue */ + + struct vhost_work vs_event_work; /* evt injection work item */ + struct llist_head vs_event_list; /* evt injection queue */ + + bool vs_events_missed; /* any missed events, protected by vq->mutex */ + int vs_events_nr; /* num of pending events, protected by vq->mutex */ }; /* Local pointer to allocated TCM configfs fabric module */ @@ -91,6 +224,59 @@ static int iov_num_pages(struct iovec *iov) ((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT; } +void tcm_vhost_done_inflight(struct kref *kref) +{ + struct vhost_scsi_inflight *inflight; + + inflight = container_of(kref, struct vhost_scsi_inflight, kref); + complete(&inflight->comp); +} + +static void tcm_vhost_init_inflight(struct vhost_scsi *vs, + struct vhost_scsi_inflight *old_inflight[]) +{ + struct vhost_scsi_inflight *new_inflight; + struct vhost_virtqueue *vq; + int idx, i; + + for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) { + vq = &vs->vqs[i].vq; + + mutex_lock(&vq->mutex); + + /* store old infight */ + idx = vs->vqs[i].inflight_idx; + if (old_inflight) + old_inflight[i] = &vs->vqs[i].inflights[idx]; + + /* setup new infight */ + vs->vqs[i].inflight_idx = idx ^ 1; + new_inflight = &vs->vqs[i].inflights[idx ^ 1]; + kref_init(&new_inflight->kref); + init_completion(&new_inflight->comp); + + mutex_unlock(&vq->mutex); + } +} + +static struct vhost_scsi_inflight * +tcm_vhost_get_inflight(struct vhost_virtqueue *vq) +{ + struct vhost_scsi_inflight *inflight; + struct vhost_scsi_virtqueue *svq; + + svq = container_of(vq, struct vhost_scsi_virtqueue, vq); + inflight = &svq->inflights[svq->inflight_idx]; + kref_get(&inflight->kref); + + return inflight; +} + +static void tcm_vhost_put_inflight(struct vhost_scsi_inflight *inflight) +{ + kref_put(&inflight->kref, tcm_vhost_done_inflight); +} + static int tcm_vhost_check_true(struct se_portal_group *se_tpg) { return 1; @@ -341,6 +527,37 @@ static int tcm_vhost_queue_tm_rsp(struct se_cmd *se_cmd) return 0; } +static void tcm_vhost_free_evt(struct vhost_scsi *vs, struct tcm_vhost_evt *evt) +{ + vs->vs_events_nr--; + kfree(evt); +} + +static struct tcm_vhost_evt *tcm_vhost_allocate_evt(struct vhost_scsi *vs, + u32 event, u32 reason) +{ + struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq; + struct tcm_vhost_evt *evt; + + if (vs->vs_events_nr > VHOST_SCSI_MAX_EVENT) { + vs->vs_events_missed = true; + return NULL; + } + + evt = kzalloc(sizeof(*evt), GFP_KERNEL); + if (!evt) { + vq_err(vq, "Failed to allocate tcm_vhost_evt\n"); + vs->vs_events_missed = true; + return NULL; + } + + evt->event.event = event; + evt->event.reason = reason; + vs->vs_events_nr++; + + return evt; +} + static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *tv_cmd) { struct se_cmd *se_cmd = &tv_cmd->tvc_se_cmd; @@ -356,9 +573,80 @@ static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *tv_cmd) kfree(tv_cmd->tvc_sgl); } + tcm_vhost_put_inflight(tv_cmd->inflight); + kfree(tv_cmd); } +static void tcm_vhost_do_evt_work(struct vhost_scsi *vs, + struct tcm_vhost_evt *evt) +{ + struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq; + struct virtio_scsi_event *event = &evt->event; + struct virtio_scsi_event __user *eventp; + unsigned out, in; + int head, ret; + + if (!vq->private_data) { + vs->vs_events_missed = true; + return; + } + +again: + vhost_disable_notify(&vs->dev, vq); + head = vhost_get_vq_desc(&vs->dev, vq, vq->iov, + ARRAY_SIZE(vq->iov), &out, &in, + NULL, NULL); + if (head < 0) { + vs->vs_events_missed = true; + return; + } + if (head == vq->num) { + if (vhost_enable_notify(&vs->dev, vq)) + goto again; + vs->vs_events_missed = true; + return; + } + + if ((vq->iov[out].iov_len != sizeof(struct virtio_scsi_event))) { + vq_err(vq, "Expecting virtio_scsi_event, got %zu bytes\n", + vq->iov[out].iov_len); + vs->vs_events_missed = true; + return; + } + + if (vs->vs_events_missed) { + event->event |= VIRTIO_SCSI_T_EVENTS_MISSED; + vs->vs_events_missed = false; + } + + eventp = vq->iov[out].iov_base; + ret = __copy_to_user(eventp, event, sizeof(*event)); + if (!ret) + vhost_add_used_and_signal(&vs->dev, vq, head, 0); + else + vq_err(vq, "Faulted on tcm_vhost_send_event\n"); +} + +static void tcm_vhost_evt_work(struct vhost_work *work) +{ + struct vhost_scsi *vs = container_of(work, struct vhost_scsi, + vs_event_work); + struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq; + struct tcm_vhost_evt *evt; + struct llist_node *llnode; + + mutex_lock(&vq->mutex); + llnode = llist_del_all(&vs->vs_event_list); + while (llnode) { + evt = llist_entry(llnode, struct tcm_vhost_evt, list); + llnode = llist_next(llnode); + tcm_vhost_do_evt_work(vs, evt); + tcm_vhost_free_evt(vs, evt); + } + mutex_unlock(&vq->mutex); +} + /* Fill in status and signal that we are done processing this command * * This is scheduled in the vhost work queue so we are called with the owner @@ -395,8 +683,10 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work) v_rsp.sense_len); ret = copy_to_user(tv_cmd->tvc_resp, &v_rsp, sizeof(v_rsp)); if (likely(ret == 0)) { + struct vhost_scsi_virtqueue *q; vhost_add_used(tv_cmd->tvc_vq, tv_cmd->tvc_vq_desc, 0); - vq = tv_cmd->tvc_vq - vs->vqs; + q = container_of(tv_cmd->tvc_vq, struct vhost_scsi_virtqueue, vq); + vq = q - vs->vqs; __set_bit(vq, signal); } else pr_err("Faulted on virtio_scsi_cmd_resp\n"); @@ -407,10 +697,11 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work) vq = -1; while ((vq = find_next_bit(signal, VHOST_SCSI_MAX_VQ, vq + 1)) < VHOST_SCSI_MAX_VQ) - vhost_signal(&vs->dev, &vs->vqs[vq]); + vhost_signal(&vs->dev, &vs->vqs[vq].vq); } static struct tcm_vhost_cmd *vhost_scsi_allocate_cmd( + struct vhost_virtqueue *vq, struct tcm_vhost_tpg *tv_tpg, struct virtio_scsi_cmd_req *v_req, u32 exp_data_len, @@ -435,6 +726,7 @@ static struct tcm_vhost_cmd *vhost_scsi_allocate_cmd( tv_cmd->tvc_exp_data_len = exp_data_len; tv_cmd->tvc_data_direction = data_direction; tv_cmd->tvc_nexus = tv_nexus; + tv_cmd->inflight = tcm_vhost_get_inflight(vq); return tv_cmd; } @@ -570,9 +862,27 @@ static void tcm_vhost_submission_work(struct work_struct *work) } } +static void vhost_scsi_send_bad_target(struct vhost_scsi *vs, + struct vhost_virtqueue *vq, int head, unsigned out) +{ + struct virtio_scsi_cmd_resp __user *resp; + struct virtio_scsi_cmd_resp rsp; + int ret; + + memset(&rsp, 0, sizeof(rsp)); + rsp.response = VIRTIO_SCSI_S_BAD_TARGET; + resp = vq->iov[out].iov_base; + ret = __copy_to_user(resp, &rsp, sizeof(rsp)); + if (!ret) + vhost_add_used_and_signal(&vs->dev, vq, head, 0); + else + pr_err("Faulted on virtio_scsi_cmd_resp\n"); +} + static void vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) { + struct tcm_vhost_tpg **vs_tpg; struct virtio_scsi_cmd_req v_req; struct tcm_vhost_tpg *tv_tpg; struct tcm_vhost_cmd *tv_cmd; @@ -581,8 +891,16 @@ static void vhost_scsi_handle_vq(struct vhost_scsi *vs, int head, ret; u8 target; - /* Must use ioctl VHOST_SCSI_SET_ENDPOINT */ - if (unlikely(!vs->vs_endpoint)) + /* + * We can handle the vq only after the endpoint is setup by calling the + * VHOST_SCSI_SET_ENDPOINT ioctl. + * + * TODO: Check that we are running from vhost_worker which acts + * as read-side critical section for vhost kind of RCU. + * See the comments in struct vhost_virtqueue in drivers/vhost/vhost.h + */ + vs_tpg = rcu_dereference_check(vq->private_data, 1); + if (!vs_tpg) return; mutex_lock(&vq->mutex); @@ -652,23 +970,11 @@ static void vhost_scsi_handle_vq(struct vhost_scsi *vs, /* Extract the tpgt */ target = v_req.lun[1]; - tv_tpg = vs->vs_tpg[target]; + tv_tpg = ACCESS_ONCE(vs_tpg[target]); /* Target does not exist, fail the request */ if (unlikely(!tv_tpg)) { - struct virtio_scsi_cmd_resp __user *resp; - struct virtio_scsi_cmd_resp rsp; - - memset(&rsp, 0, sizeof(rsp)); - rsp.response = VIRTIO_SCSI_S_BAD_TARGET; - resp = vq->iov[out].iov_base; - ret = __copy_to_user(resp, &rsp, sizeof(rsp)); - if (!ret) - vhost_add_used_and_signal(&vs->dev, - vq, head, 0); - else - pr_err("Faulted on virtio_scsi_cmd_resp\n"); - + vhost_scsi_send_bad_target(vs, vq, head, out); continue; } @@ -676,27 +982,18 @@ static void vhost_scsi_handle_vq(struct vhost_scsi *vs, for (i = 0; i < data_num; i++) exp_data_len += vq->iov[data_first + i].iov_len; - tv_cmd = vhost_scsi_allocate_cmd(tv_tpg, &v_req, + tv_cmd = vhost_scsi_allocate_cmd(vq, tv_tpg, &v_req, exp_data_len, data_direction); if (IS_ERR(tv_cmd)) { vq_err(vq, "vhost_scsi_allocate_cmd failed %ld\n", PTR_ERR(tv_cmd)); - break; + goto err_cmd; } pr_debug("Allocated tv_cmd: %p exp_data_len: %d, data_direction" ": %d\n", tv_cmd, exp_data_len, data_direction); tv_cmd->tvc_vhost = vs; tv_cmd->tvc_vq = vq; - - if (unlikely(vq->iov[out].iov_len != - sizeof(struct virtio_scsi_cmd_resp))) { - vq_err(vq, "Expecting virtio_scsi_cmd_resp, got %zu" - " bytes, out: %d, in: %d\n", - vq->iov[out].iov_len, out, in); - break; - } - tv_cmd->tvc_resp = vq->iov[out].iov_base; /* @@ -716,7 +1013,7 @@ static void vhost_scsi_handle_vq(struct vhost_scsi *vs, " exceeds SCSI_MAX_VARLEN_CDB_SIZE: %d\n", scsi_command_size(tv_cmd->tvc_cdb), TCM_VHOST_MAX_CDB_SIZE); - break; /* TODO */ + goto err_free; } tv_cmd->tvc_lun = ((v_req.lun[2] << 8) | v_req.lun[3]) & 0x3FFF; @@ -729,7 +1026,7 @@ static void vhost_scsi_handle_vq(struct vhost_scsi *vs, data_direction == DMA_TO_DEVICE); if (unlikely(ret)) { vq_err(vq, "Failed to map iov to sgl\n"); - break; /* TODO */ + goto err_free; } } @@ -750,6 +1047,13 @@ static void vhost_scsi_handle_vq(struct vhost_scsi *vs, } mutex_unlock(&vq->mutex); + return; + +err_free: + vhost_scsi_free_cmd(tv_cmd); +err_cmd: + vhost_scsi_send_bad_target(vs, vq, head, out); + mutex_unlock(&vq->mutex); } static void vhost_scsi_ctl_handle_kick(struct vhost_work *work) @@ -757,9 +1061,46 @@ static void vhost_scsi_ctl_handle_kick(struct vhost_work *work) pr_debug("%s: The handling func for control queue.\n", __func__); } +static void tcm_vhost_send_evt(struct vhost_scsi *vs, struct tcm_vhost_tpg *tpg, + struct se_lun *lun, u32 event, u32 reason) +{ + struct tcm_vhost_evt *evt; + + evt = tcm_vhost_allocate_evt(vs, event, reason); + if (!evt) + return; + + if (tpg && lun) { + /* TODO: share lun setup code with virtio-scsi.ko */ + /* + * Note: evt->event is zeroed when we allocate it and + * lun[4-7] need to be zero according to virtio-scsi spec. + */ + evt->event.lun[0] = 0x01; + evt->event.lun[1] = tpg->tport_tpgt & 0xFF; + if (lun->unpacked_lun >= 256) + evt->event.lun[2] = lun->unpacked_lun >> 8 | 0x40 ; + evt->event.lun[3] = lun->unpacked_lun & 0xFF; + } + + llist_add(&evt->list, &vs->vs_event_list); + vhost_work_queue(&vs->dev, &vs->vs_event_work); +} + static void vhost_scsi_evt_handle_kick(struct vhost_work *work) { - pr_debug("%s: The handling func for event queue.\n", __func__); + struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, + poll.work); + struct vhost_scsi *vs = container_of(vq->dev, struct vhost_scsi, dev); + + mutex_lock(&vq->mutex); + if (!vq->private_data) + goto out; + + if (vs->vs_events_missed) + tcm_vhost_send_evt(vs, NULL, NULL, VIRTIO_SCSI_T_NO_EVENT, 0); +out: + mutex_unlock(&vq->mutex); } static void vhost_scsi_handle_kick(struct vhost_work *work) @@ -771,9 +1112,45 @@ static void vhost_scsi_handle_kick(struct vhost_work *work) vhost_scsi_handle_vq(vs, vq); } +static void vhost_scsi_flush_vq(struct vhost_scsi *vs, int index) +{ + vhost_poll_flush(&vs->vqs[index].vq.poll); +} + +/* Callers must hold dev mutex */ +static void vhost_scsi_flush(struct vhost_scsi *vs) +{ + struct vhost_scsi_inflight *old_inflight[VHOST_SCSI_MAX_VQ]; + int i; + + /* Init new inflight and remember the old inflight */ + tcm_vhost_init_inflight(vs, old_inflight); + + /* + * The inflight->kref was initialized to 1. We decrement it here to + * indicate the start of the flush operation so that it will reach 0 + * when all the reqs are finished. + */ + for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) + kref_put(&old_inflight[i]->kref, tcm_vhost_done_inflight); + + /* Flush both the vhost poll and vhost work */ + for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) + vhost_scsi_flush_vq(vs, i); + vhost_work_flush(&vs->dev, &vs->vs_completion_work); + vhost_work_flush(&vs->dev, &vs->vs_event_work); + + /* Wait for all reqs issued before the flush to be finished */ + for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) + wait_for_completion(&old_inflight[i]->comp); +} + /* * Called from vhost_scsi_ioctl() context to walk the list of available * tcm_vhost_tpg with an active struct tcm_vhost_nexus + * + * The lock nesting rule is: + * tcm_vhost_mutex -> vs->dev.mutex -> tpg->tv_tpg_mutex -> vq->mutex */ static int vhost_scsi_set_endpoint( struct vhost_scsi *vs, @@ -781,20 +1158,32 @@ static int vhost_scsi_set_endpoint( { struct tcm_vhost_tport *tv_tport; struct tcm_vhost_tpg *tv_tpg; + struct tcm_vhost_tpg **vs_tpg; + struct vhost_virtqueue *vq; + int index, ret, i, len; bool match = false; - int index, ret; + mutex_lock(&tcm_vhost_mutex); mutex_lock(&vs->dev.mutex); + /* Verify that ring has been setup correctly. */ for (index = 0; index < vs->dev.nvqs; ++index) { /* Verify that ring has been setup correctly. */ - if (!vhost_vq_access_ok(&vs->vqs[index])) { - mutex_unlock(&vs->dev.mutex); - return -EFAULT; + if (!vhost_vq_access_ok(&vs->vqs[index].vq)) { + ret = -EFAULT; + goto out; } } - mutex_lock(&tcm_vhost_mutex); + len = sizeof(vs_tpg[0]) * VHOST_SCSI_MAX_TARGET; + vs_tpg = kzalloc(len, GFP_KERNEL); + if (!vs_tpg) { + ret = -ENOMEM; + goto out; + } + if (vs->vs_tpg) + memcpy(vs_tpg, vs->vs_tpg, len); + list_for_each_entry(tv_tpg, &tcm_vhost_list, tv_tpg_list) { mutex_lock(&tv_tpg->tv_tpg_mutex); if (!tv_tpg->tpg_nexus) { @@ -808,31 +1197,48 @@ static int vhost_scsi_set_endpoint( tv_tport = tv_tpg->tport; if (!strcmp(tv_tport->tport_name, t->vhost_wwpn)) { - if (vs->vs_tpg[tv_tpg->tport_tpgt]) { + if (vs->vs_tpg && vs->vs_tpg[tv_tpg->tport_tpgt]) { + kfree(vs_tpg); mutex_unlock(&tv_tpg->tv_tpg_mutex); - mutex_unlock(&tcm_vhost_mutex); - mutex_unlock(&vs->dev.mutex); - return -EEXIST; + ret = -EEXIST; + goto out; } tv_tpg->tv_tpg_vhost_count++; - vs->vs_tpg[tv_tpg->tport_tpgt] = tv_tpg; + tv_tpg->vhost_scsi = vs; + vs_tpg[tv_tpg->tport_tpgt] = tv_tpg; smp_mb__after_atomic_inc(); match = true; } mutex_unlock(&tv_tpg->tv_tpg_mutex); } - mutex_unlock(&tcm_vhost_mutex); if (match) { memcpy(vs->vs_vhost_wwpn, t->vhost_wwpn, sizeof(vs->vs_vhost_wwpn)); - vs->vs_endpoint = true; + for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) { + vq = &vs->vqs[i].vq; + /* Flushing the vhost_work acts as synchronize_rcu */ + mutex_lock(&vq->mutex); + rcu_assign_pointer(vq->private_data, vs_tpg); + vhost_init_used(vq); + mutex_unlock(&vq->mutex); + } ret = 0; } else { ret = -EEXIST; } + /* + * Act as synchronize_rcu to make sure access to + * old vs->vs_tpg is finished. + */ + vhost_scsi_flush(vs); + kfree(vs->vs_tpg); + vs->vs_tpg = vs_tpg; + +out: mutex_unlock(&vs->dev.mutex); + mutex_unlock(&tcm_vhost_mutex); return ret; } @@ -842,28 +1248,37 @@ static int vhost_scsi_clear_endpoint( { struct tcm_vhost_tport *tv_tport; struct tcm_vhost_tpg *tv_tpg; + struct vhost_virtqueue *vq; + bool match = false; int index, ret, i; u8 target; + mutex_lock(&tcm_vhost_mutex); mutex_lock(&vs->dev.mutex); /* Verify that ring has been setup correctly. */ for (index = 0; index < vs->dev.nvqs; ++index) { - if (!vhost_vq_access_ok(&vs->vqs[index])) { + if (!vhost_vq_access_ok(&vs->vqs[index].vq)) { ret = -EFAULT; - goto err; + goto err_dev; } } + + if (!vs->vs_tpg) { + ret = 0; + goto err_dev; + } + for (i = 0; i < VHOST_SCSI_MAX_TARGET; i++) { target = i; - tv_tpg = vs->vs_tpg[target]; if (!tv_tpg) continue; + mutex_lock(&tv_tpg->tv_tpg_mutex); tv_tport = tv_tpg->tport; if (!tv_tport) { ret = -ENODEV; - goto err; + goto err_tpg; } if (strcmp(tv_tport->tport_name, t->vhost_wwpn)) { @@ -872,37 +1287,97 @@ static int vhost_scsi_clear_endpoint( tv_tport->tport_name, tv_tpg->tport_tpgt, t->vhost_wwpn, t->vhost_tpgt); ret = -EINVAL; - goto err; + goto err_tpg; } tv_tpg->tv_tpg_vhost_count--; + tv_tpg->vhost_scsi = NULL; vs->vs_tpg[target] = NULL; - vs->vs_endpoint = false; + match = true; + mutex_unlock(&tv_tpg->tv_tpg_mutex); } + if (match) { + for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) { + vq = &vs->vqs[i].vq; + /* Flushing the vhost_work acts as synchronize_rcu */ + mutex_lock(&vq->mutex); + rcu_assign_pointer(vq->private_data, NULL); + mutex_unlock(&vq->mutex); + } + } + /* + * Act as synchronize_rcu to make sure access to + * old vs->vs_tpg is finished. + */ + vhost_scsi_flush(vs); + kfree(vs->vs_tpg); + vs->vs_tpg = NULL; + WARN_ON(vs->vs_events_nr); mutex_unlock(&vs->dev.mutex); + mutex_unlock(&tcm_vhost_mutex); return 0; -err: +err_tpg: + mutex_unlock(&tv_tpg->tv_tpg_mutex); +err_dev: mutex_unlock(&vs->dev.mutex); + mutex_unlock(&tcm_vhost_mutex); return ret; } +static int vhost_scsi_set_features(struct vhost_scsi *vs, u64 features) +{ + if (features & ~VHOST_SCSI_FEATURES) + return -EOPNOTSUPP; + + mutex_lock(&vs->dev.mutex); + if ((features & (1 << VHOST_F_LOG_ALL)) && + !vhost_log_access_ok(&vs->dev)) { + mutex_unlock(&vs->dev.mutex); + return -EFAULT; + } + vs->dev.acked_features = features; + smp_wmb(); + vhost_scsi_flush(vs); + mutex_unlock(&vs->dev.mutex); + return 0; +} + static int vhost_scsi_open(struct inode *inode, struct file *f) { struct vhost_scsi *s; + struct vhost_virtqueue **vqs; int r, i; s = kzalloc(sizeof(*s), GFP_KERNEL); if (!s) return -ENOMEM; + vqs = kmalloc(VHOST_SCSI_MAX_VQ * sizeof(*vqs), GFP_KERNEL); + if (!vqs) { + kfree(s); + return -ENOMEM; + } + vhost_work_init(&s->vs_completion_work, vhost_scsi_complete_cmd_work); + vhost_work_init(&s->vs_event_work, tcm_vhost_evt_work); + + s->vs_events_nr = 0; + s->vs_events_missed = false; + + vqs[VHOST_SCSI_VQ_CTL] = &s->vqs[VHOST_SCSI_VQ_CTL].vq; + vqs[VHOST_SCSI_VQ_EVT] = &s->vqs[VHOST_SCSI_VQ_EVT].vq; + s->vqs[VHOST_SCSI_VQ_CTL].vq.handle_kick = vhost_scsi_ctl_handle_kick; + s->vqs[VHOST_SCSI_VQ_EVT].vq.handle_kick = vhost_scsi_evt_handle_kick; + for (i = VHOST_SCSI_VQ_IO; i < VHOST_SCSI_MAX_VQ; i++) { + vqs[i] = &s->vqs[i].vq; + s->vqs[i].vq.handle_kick = vhost_scsi_handle_kick; + } + r = vhost_dev_init(&s->dev, vqs, VHOST_SCSI_MAX_VQ); + + tcm_vhost_init_inflight(s, NULL); - s->vqs[VHOST_SCSI_VQ_CTL].handle_kick = vhost_scsi_ctl_handle_kick; - s->vqs[VHOST_SCSI_VQ_EVT].handle_kick = vhost_scsi_evt_handle_kick; - for (i = VHOST_SCSI_VQ_IO; i < VHOST_SCSI_MAX_VQ; i++) - s->vqs[i].handle_kick = vhost_scsi_handle_kick; - r = vhost_dev_init(&s->dev, s->vqs, VHOST_SCSI_MAX_VQ); if (r < 0) { + kfree(vqs); kfree(s); return r; } @@ -922,41 +1397,13 @@ static int vhost_scsi_release(struct inode *inode, struct file *f) vhost_scsi_clear_endpoint(s, &t); vhost_dev_stop(&s->dev); vhost_dev_cleanup(&s->dev, false); + /* Jobs can re-queue themselves in evt kick handler. Do extra flush. */ + vhost_scsi_flush(s); + kfree(s->dev.vqs); kfree(s); return 0; } -static void vhost_scsi_flush_vq(struct vhost_scsi *vs, int index) -{ - vhost_poll_flush(&vs->dev.vqs[index].poll); -} - -static void vhost_scsi_flush(struct vhost_scsi *vs) -{ - int i; - - for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) - vhost_scsi_flush_vq(vs, i); -} - -static int vhost_scsi_set_features(struct vhost_scsi *vs, u64 features) -{ - if (features & ~VHOST_FEATURES) - return -EOPNOTSUPP; - - mutex_lock(&vs->dev.mutex); - if ((features & (1 << VHOST_F_LOG_ALL)) && - !vhost_log_access_ok(&vs->dev)) { - mutex_unlock(&vs->dev.mutex); - return -EFAULT; - } - vs->dev.acked_features = features; - smp_wmb(); - vhost_scsi_flush(vs); - mutex_unlock(&vs->dev.mutex); - return 0; -} - static long vhost_scsi_ioctl(struct file *f, unsigned int ioctl, unsigned long arg) { @@ -964,8 +1411,11 @@ static long vhost_scsi_ioctl(struct file *f, unsigned int ioctl, struct vhost_scsi_target backend; void __user *argp = (void __user *)arg; u64 __user *featurep = argp; + u32 __user *eventsp = argp; + u32 events_missed; u64 features; int r, abi_version = VHOST_SCSI_ABI_VERSION; + struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq; switch (ioctl) { case VHOST_SCSI_SET_ENDPOINT: @@ -986,8 +1436,22 @@ static long vhost_scsi_ioctl(struct file *f, unsigned int ioctl, if (copy_to_user(argp, &abi_version, sizeof abi_version)) return -EFAULT; return 0; + case VHOST_SCSI_SET_EVENTS_MISSED: + if (get_user(events_missed, eventsp)) + return -EFAULT; + mutex_lock(&vq->mutex); + vs->vs_events_missed = events_missed; + mutex_unlock(&vq->mutex); + return 0; + case VHOST_SCSI_GET_EVENTS_MISSED: + mutex_lock(&vq->mutex); + events_missed = vs->vs_events_missed; + mutex_unlock(&vq->mutex); + if (put_user(events_missed, eventsp)) + return -EFAULT; + return 0; case VHOST_GET_FEATURES: - features = VHOST_FEATURES; + features = VHOST_SCSI_FEATURES; if (copy_to_user(featurep, &features, sizeof features)) return -EFAULT; return 0; @@ -1057,28 +1521,80 @@ static char *tcm_vhost_dump_proto_id(struct tcm_vhost_tport *tport) return "Unknown"; } +static void tcm_vhost_do_plug(struct tcm_vhost_tpg *tpg, + struct se_lun *lun, bool plug) +{ + + struct vhost_scsi *vs = tpg->vhost_scsi; + struct vhost_virtqueue *vq; + u32 reason; + + if (!vs) + return; + + mutex_lock(&vs->dev.mutex); + if (!vhost_has_feature(&vs->dev, VIRTIO_SCSI_F_HOTPLUG)) { + mutex_unlock(&vs->dev.mutex); + return; + } + + if (plug) + reason = VIRTIO_SCSI_EVT_RESET_RESCAN; + else + reason = VIRTIO_SCSI_EVT_RESET_REMOVED; + + vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq; + mutex_lock(&vq->mutex); + tcm_vhost_send_evt(vs, tpg, lun, + VIRTIO_SCSI_T_TRANSPORT_RESET, reason); + mutex_unlock(&vq->mutex); + mutex_unlock(&vs->dev.mutex); +} + +static void tcm_vhost_hotplug(struct tcm_vhost_tpg *tpg, struct se_lun *lun) +{ + tcm_vhost_do_plug(tpg, lun, true); +} + +static void tcm_vhost_hotunplug(struct tcm_vhost_tpg *tpg, struct se_lun *lun) +{ + tcm_vhost_do_plug(tpg, lun, false); +} + static int tcm_vhost_port_link(struct se_portal_group *se_tpg, struct se_lun *lun) { struct tcm_vhost_tpg *tv_tpg = container_of(se_tpg, struct tcm_vhost_tpg, se_tpg); + mutex_lock(&tcm_vhost_mutex); + mutex_lock(&tv_tpg->tv_tpg_mutex); tv_tpg->tv_tpg_port_count++; mutex_unlock(&tv_tpg->tv_tpg_mutex); + tcm_vhost_hotplug(tv_tpg, lun); + + mutex_unlock(&tcm_vhost_mutex); + return 0; } static void tcm_vhost_port_unlink(struct se_portal_group *se_tpg, - struct se_lun *se_lun) + struct se_lun *lun) { struct tcm_vhost_tpg *tv_tpg = container_of(se_tpg, struct tcm_vhost_tpg, se_tpg); + mutex_lock(&tcm_vhost_mutex); + mutex_lock(&tv_tpg->tv_tpg_mutex); tv_tpg->tv_tpg_port_count--; mutex_unlock(&tv_tpg->tv_tpg_mutex); + + tcm_vhost_hotunplug(tv_tpg, lun); + + mutex_unlock(&tcm_vhost_mutex); } static struct se_node_acl *tcm_vhost_make_nodeacl( @@ -1620,7 +2136,8 @@ static void tcm_vhost_exit(void) destroy_workqueue(tcm_vhost_workqueue); }; -MODULE_DESCRIPTION("TCM_VHOST series fabric driver"); +MODULE_DESCRIPTION("VHOST_SCSI series fabric driver"); +MODULE_ALIAS("tcm_vhost"); MODULE_LICENSE("GPL"); module_init(tcm_vhost_init); module_exit(tcm_vhost_exit); diff --git a/drivers/vhost/tcm_vhost.h b/drivers/vhost/tcm_vhost.h deleted file mode 100644 index 1d2ae7a60e11..000000000000 --- a/drivers/vhost/tcm_vhost.h +++ /dev/null @@ -1,115 +0,0 @@ -#define TCM_VHOST_VERSION "v0.1" -#define TCM_VHOST_NAMELEN 256 -#define TCM_VHOST_MAX_CDB_SIZE 32 - -struct tcm_vhost_cmd { - /* Descriptor from vhost_get_vq_desc() for virt_queue segment */ - int tvc_vq_desc; - /* virtio-scsi initiator task attribute */ - int tvc_task_attr; - /* virtio-scsi initiator data direction */ - enum dma_data_direction tvc_data_direction; - /* Expected data transfer length from virtio-scsi header */ - u32 tvc_exp_data_len; - /* The Tag from include/linux/virtio_scsi.h:struct virtio_scsi_cmd_req */ - u64 tvc_tag; - /* The number of scatterlists associated with this cmd */ - u32 tvc_sgl_count; - /* Saved unpacked SCSI LUN for tcm_vhost_submission_work() */ - u32 tvc_lun; - /* Pointer to the SGL formatted memory from virtio-scsi */ - struct scatterlist *tvc_sgl; - /* Pointer to response */ - struct virtio_scsi_cmd_resp __user *tvc_resp; - /* Pointer to vhost_scsi for our device */ - struct vhost_scsi *tvc_vhost; - /* Pointer to vhost_virtqueue for the cmd */ - struct vhost_virtqueue *tvc_vq; - /* Pointer to vhost nexus memory */ - struct tcm_vhost_nexus *tvc_nexus; - /* The TCM I/O descriptor that is accessed via container_of() */ - struct se_cmd tvc_se_cmd; - /* work item used for cmwq dispatch to tcm_vhost_submission_work() */ - struct work_struct work; - /* Copy of the incoming SCSI command descriptor block (CDB) */ - unsigned char tvc_cdb[TCM_VHOST_MAX_CDB_SIZE]; - /* Sense buffer that will be mapped into outgoing status */ - unsigned char tvc_sense_buf[TRANSPORT_SENSE_BUFFER]; - /* Completed commands list, serviced from vhost worker thread */ - struct llist_node tvc_completion_list; -}; - -struct tcm_vhost_nexus { - /* Pointer to TCM session for I_T Nexus */ - struct se_session *tvn_se_sess; -}; - -struct tcm_vhost_nacl { - /* Binary World Wide unique Port Name for Vhost Initiator port */ - u64 iport_wwpn; - /* ASCII formatted WWPN for Sas Initiator port */ - char iport_name[TCM_VHOST_NAMELEN]; - /* Returned by tcm_vhost_make_nodeacl() */ - struct se_node_acl se_node_acl; -}; - -struct tcm_vhost_tpg { - /* Vhost port target portal group tag for TCM */ - u16 tport_tpgt; - /* Used to track number of TPG Port/Lun Links wrt to explict I_T Nexus shutdown */ - int tv_tpg_port_count; - /* Used for vhost_scsi device reference to tpg_nexus, protected by tv_tpg_mutex */ - int tv_tpg_vhost_count; - /* list for tcm_vhost_list */ - struct list_head tv_tpg_list; - /* Used to protect access for tpg_nexus */ - struct mutex tv_tpg_mutex; - /* Pointer to the TCM VHost I_T Nexus for this TPG endpoint */ - struct tcm_vhost_nexus *tpg_nexus; - /* Pointer back to tcm_vhost_tport */ - struct tcm_vhost_tport *tport; - /* Returned by tcm_vhost_make_tpg() */ - struct se_portal_group se_tpg; -}; - -struct tcm_vhost_tport { - /* SCSI protocol the tport is providing */ - u8 tport_proto_id; - /* Binary World Wide unique Port Name for Vhost Target port */ - u64 tport_wwpn; - /* ASCII formatted WWPN for Vhost Target port */ - char tport_name[TCM_VHOST_NAMELEN]; - /* Returned by tcm_vhost_make_tport() */ - struct se_wwn tport_wwn; -}; - -/* - * As per request from MST, keep TCM_VHOST related ioctl defines out of - * linux/vhost.h (user-space) for now.. - */ - -#include <linux/vhost.h> - -/* - * Used by QEMU userspace to ensure a consistent vhost-scsi ABI. - * - * ABI Rev 0: July 2012 version starting point for v3.6-rc merge candidate + - * RFC-v2 vhost-scsi userspace. Add GET_ABI_VERSION ioctl usage - * ABI Rev 1: January 2013. Ignore vhost_tpgt filed in struct vhost_scsi_target. - * All the targets under vhost_wwpn can be seen and used by guset. - */ - -#define VHOST_SCSI_ABI_VERSION 1 - -struct vhost_scsi_target { - int abi_version; - char vhost_wwpn[TRANSPORT_IQN_LEN]; - unsigned short vhost_tpgt; - unsigned short reserved; -}; - -/* VHOST_SCSI specific defines */ -#define VHOST_SCSI_SET_ENDPOINT _IOW(VHOST_VIRTIO, 0x40, struct vhost_scsi_target) -#define VHOST_SCSI_CLEAR_ENDPOINT _IOW(VHOST_VIRTIO, 0x41, struct vhost_scsi_target) -/* Changing this breaks userspace. */ -#define VHOST_SCSI_GET_ABI_VERSION _IOW(VHOST_VIRTIO, 0x42, int) diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c index 329d3021d059..1ee45bc85f67 100644 --- a/drivers/vhost/test.c +++ b/drivers/vhost/test.c @@ -219,13 +219,20 @@ static long vhost_test_reset_owner(struct vhost_test *n) { void *priv = NULL; long err; + struct vhost_memory *memory; + mutex_lock(&n->dev.mutex); err = vhost_dev_check_owner(&n->dev); if (err) goto done; + memory = vhost_dev_reset_owner_prepare(); + if (!memory) { + err = -ENOMEM; + goto done; + } vhost_test_stop(n, &priv); vhost_test_flush(n); - err = vhost_dev_reset_owner(&n->dev); + vhost_dev_reset_owner(&n->dev, memory); done: mutex_unlock(&n->dev.mutex); return err; diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 9759249e6d90..749b5ab5bfbb 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -33,8 +33,6 @@ enum { VHOST_MEMORY_F_LOG = 0x1, }; -static unsigned vhost_zcopy_mask __read_mostly; - #define vhost_used_event(vq) ((u16 __user *)&vq->avail->ring[vq->num]) #define vhost_avail_event(vq) ((u16 __user *)&vq->used->ring[vq->num]) @@ -89,6 +87,9 @@ int vhost_poll_start(struct vhost_poll *poll, struct file *file) unsigned long mask; int ret = 0; + if (poll->wqh) + return 0; + mask = file->f_op->poll(file, &poll->table); if (mask) vhost_poll_wakeup(&poll->wait, 0, 0, (void *)mask); @@ -178,8 +179,6 @@ static void vhost_vq_reset(struct vhost_dev *dev, vq->used_flags = 0; vq->log_used = false; vq->log_addr = -1ull; - vq->vhost_hlen = 0; - vq->sock_hlen = 0; vq->private_data = NULL; vq->log_base = NULL; vq->error_ctx = NULL; @@ -188,9 +187,6 @@ static void vhost_vq_reset(struct vhost_dev *dev, vq->call_ctx = NULL; vq->call = NULL; vq->log_ctx = NULL; - vq->upend_idx = 0; - vq->done_idx = 0; - vq->ubufs = NULL; } static int vhost_worker(void *data) @@ -250,43 +246,29 @@ static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq) vq->log = NULL; kfree(vq->heads); vq->heads = NULL; - kfree(vq->ubuf_info); - vq->ubuf_info = NULL; -} - -void vhost_enable_zcopy(int vq) -{ - vhost_zcopy_mask |= 0x1 << vq; } /* Helper to allocate iovec buffers for all vqs. */ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev) { int i; - bool zcopy; for (i = 0; i < dev->nvqs; ++i) { - dev->vqs[i].indirect = kmalloc(sizeof *dev->vqs[i].indirect * + dev->vqs[i]->indirect = kmalloc(sizeof *dev->vqs[i]->indirect * UIO_MAXIOV, GFP_KERNEL); - dev->vqs[i].log = kmalloc(sizeof *dev->vqs[i].log * UIO_MAXIOV, + dev->vqs[i]->log = kmalloc(sizeof *dev->vqs[i]->log * UIO_MAXIOV, GFP_KERNEL); - dev->vqs[i].heads = kmalloc(sizeof *dev->vqs[i].heads * + dev->vqs[i]->heads = kmalloc(sizeof *dev->vqs[i]->heads * UIO_MAXIOV, GFP_KERNEL); - zcopy = vhost_zcopy_mask & (0x1 << i); - if (zcopy) - dev->vqs[i].ubuf_info = - kmalloc(sizeof *dev->vqs[i].ubuf_info * - UIO_MAXIOV, GFP_KERNEL); - if (!dev->vqs[i].indirect || !dev->vqs[i].log || - !dev->vqs[i].heads || - (zcopy && !dev->vqs[i].ubuf_info)) + if (!dev->vqs[i]->indirect || !dev->vqs[i]->log || + !dev->vqs[i]->heads) goto err_nomem; } return 0; err_nomem: for (; i >= 0; --i) - vhost_vq_free_iovecs(&dev->vqs[i]); + vhost_vq_free_iovecs(dev->vqs[i]); return -ENOMEM; } @@ -295,11 +277,11 @@ static void vhost_dev_free_iovecs(struct vhost_dev *dev) int i; for (i = 0; i < dev->nvqs; ++i) - vhost_vq_free_iovecs(&dev->vqs[i]); + vhost_vq_free_iovecs(dev->vqs[i]); } long vhost_dev_init(struct vhost_dev *dev, - struct vhost_virtqueue *vqs, int nvqs) + struct vhost_virtqueue **vqs, int nvqs) { int i; @@ -315,16 +297,15 @@ long vhost_dev_init(struct vhost_dev *dev, dev->worker = NULL; for (i = 0; i < dev->nvqs; ++i) { - dev->vqs[i].log = NULL; - dev->vqs[i].indirect = NULL; - dev->vqs[i].heads = NULL; - dev->vqs[i].ubuf_info = NULL; - dev->vqs[i].dev = dev; - mutex_init(&dev->vqs[i].mutex); - vhost_vq_reset(dev, dev->vqs + i); - if (dev->vqs[i].handle_kick) - vhost_poll_init(&dev->vqs[i].poll, - dev->vqs[i].handle_kick, POLLIN, dev); + dev->vqs[i]->log = NULL; + dev->vqs[i]->indirect = NULL; + dev->vqs[i]->heads = NULL; + dev->vqs[i]->dev = dev; + mutex_init(&dev->vqs[i]->mutex); + vhost_vq_reset(dev, dev->vqs[i]); + if (dev->vqs[i]->handle_kick) + vhost_poll_init(&dev->vqs[i]->poll, + dev->vqs[i]->handle_kick, POLLIN, dev); } return 0; @@ -405,21 +386,19 @@ err_mm: return err; } -/* Caller should have device mutex */ -long vhost_dev_reset_owner(struct vhost_dev *dev) +struct vhost_memory *vhost_dev_reset_owner_prepare(void) { - struct vhost_memory *memory; - - /* Restore memory to default empty mapping. */ - memory = kmalloc(offsetof(struct vhost_memory, regions), GFP_KERNEL); - if (!memory) - return -ENOMEM; + return kmalloc(offsetof(struct vhost_memory, regions), GFP_KERNEL); +} +/* Caller should have device mutex */ +void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_memory *memory) +{ vhost_dev_cleanup(dev, true); + /* Restore memory to default empty mapping. */ memory->nregions = 0; RCU_INIT_POINTER(dev->memory, memory); - return 0; } void vhost_dev_stop(struct vhost_dev *dev) @@ -427,9 +406,9 @@ void vhost_dev_stop(struct vhost_dev *dev) int i; for (i = 0; i < dev->nvqs; ++i) { - if (dev->vqs[i].kick && dev->vqs[i].handle_kick) { - vhost_poll_stop(&dev->vqs[i].poll); - vhost_poll_flush(&dev->vqs[i].poll); + if (dev->vqs[i]->kick && dev->vqs[i]->handle_kick) { + vhost_poll_stop(&dev->vqs[i]->poll); + vhost_poll_flush(&dev->vqs[i]->poll); } } } @@ -440,17 +419,17 @@ void vhost_dev_cleanup(struct vhost_dev *dev, bool locked) int i; for (i = 0; i < dev->nvqs; ++i) { - if (dev->vqs[i].error_ctx) - eventfd_ctx_put(dev->vqs[i].error_ctx); - if (dev->vqs[i].error) - fput(dev->vqs[i].error); - if (dev->vqs[i].kick) - fput(dev->vqs[i].kick); - if (dev->vqs[i].call_ctx) - eventfd_ctx_put(dev->vqs[i].call_ctx); - if (dev->vqs[i].call) - fput(dev->vqs[i].call); - vhost_vq_reset(dev, dev->vqs + i); + if (dev->vqs[i]->error_ctx) + eventfd_ctx_put(dev->vqs[i]->error_ctx); + if (dev->vqs[i]->error) + fput(dev->vqs[i]->error); + if (dev->vqs[i]->kick) + fput(dev->vqs[i]->kick); + if (dev->vqs[i]->call_ctx) + eventfd_ctx_put(dev->vqs[i]->call_ctx); + if (dev->vqs[i]->call) + fput(dev->vqs[i]->call); + vhost_vq_reset(dev, dev->vqs[i]); } vhost_dev_free_iovecs(dev); if (dev->log_ctx) @@ -521,14 +500,14 @@ static int memory_access_ok(struct vhost_dev *d, struct vhost_memory *mem, for (i = 0; i < d->nvqs; ++i) { int ok; - mutex_lock(&d->vqs[i].mutex); + mutex_lock(&d->vqs[i]->mutex); /* If ring is inactive, will check when it's enabled. */ - if (d->vqs[i].private_data) - ok = vq_memory_access_ok(d->vqs[i].log_base, mem, + if (d->vqs[i]->private_data) + ok = vq_memory_access_ok(d->vqs[i]->log_base, mem, log_all); else ok = 1; - mutex_unlock(&d->vqs[i].mutex); + mutex_unlock(&d->vqs[i]->mutex); if (!ok) return 0; } @@ -638,7 +617,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp) if (idx >= d->nvqs) return -ENOBUFS; - vq = d->vqs + idx; + vq = d->vqs[idx]; mutex_lock(&vq->mutex); @@ -849,7 +828,7 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp) for (i = 0; i < d->nvqs; ++i) { struct vhost_virtqueue *vq; void __user *base = (void __user *)(unsigned long)p; - vq = d->vqs + i; + vq = d->vqs[i]; mutex_lock(&vq->mutex); /* If ring is inactive, will check when it's enabled. */ if (vq->private_data && !vq_log_access_ok(d, vq, base)) @@ -876,9 +855,9 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp) } else filep = eventfp; for (i = 0; i < d->nvqs; ++i) { - mutex_lock(&d->vqs[i].mutex); - d->vqs[i].log_ctx = d->log_ctx; - mutex_unlock(&d->vqs[i].mutex); + mutex_lock(&d->vqs[i]->mutex); + d->vqs[i]->log_ctx = d->log_ctx; + mutex_unlock(&d->vqs[i]->mutex); } if (ctx) eventfd_ctx_put(ctx); @@ -1548,38 +1527,3 @@ void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) &vq->used->flags, r); } } - -static void vhost_zerocopy_done_signal(struct kref *kref) -{ - struct vhost_ubuf_ref *ubufs = container_of(kref, struct vhost_ubuf_ref, - kref); - wake_up(&ubufs->wait); -} - -struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *vq, - bool zcopy) -{ - struct vhost_ubuf_ref *ubufs; - /* No zero copy backend? Nothing to count. */ - if (!zcopy) - return NULL; - ubufs = kmalloc(sizeof *ubufs, GFP_KERNEL); - if (!ubufs) - return ERR_PTR(-ENOMEM); - kref_init(&ubufs->kref); - init_waitqueue_head(&ubufs->wait); - ubufs->vq = vq; - return ubufs; -} - -void vhost_ubuf_put(struct vhost_ubuf_ref *ubufs) -{ - kref_put(&ubufs->kref, vhost_zerocopy_done_signal); -} - -void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *ubufs) -{ - kref_put(&ubufs->kref, vhost_zerocopy_done_signal); - wait_event(ubufs->wait, !atomic_read(&ubufs->kref.refcount)); - kfree(ubufs); -} diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 17261e277c02..b58f4ae82cb8 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -54,18 +54,6 @@ struct vhost_log { struct vhost_virtqueue; -struct vhost_ubuf_ref { - struct kref kref; - wait_queue_head_t wait; - struct vhost_virtqueue *vq; -}; - -struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *, bool zcopy); -void vhost_ubuf_put(struct vhost_ubuf_ref *); -void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *); - -struct ubuf_info; - /* The virtqueue structure describes a queue attached to a device. */ struct vhost_virtqueue { struct vhost_dev *dev; @@ -114,10 +102,7 @@ struct vhost_virtqueue { /* hdr is used to store the virtio header. * Since each iovec has >= 1 byte length, we never need more than * header length entries to store the header. */ - struct iovec hdr[sizeof(struct virtio_net_hdr_mrg_rxbuf)]; struct iovec *indirect; - size_t vhost_hlen; - size_t sock_hlen; struct vring_used_elem *heads; /* We use a kind of RCU to access private pointer. * All readers access it from worker, which makes it possible to @@ -130,16 +115,6 @@ struct vhost_virtqueue { /* Log write descriptors */ void __user *log_base; struct vhost_log *log; - /* vhost zerocopy support fields below: */ - /* last used idx for outstanding DMA zerocopy buffers */ - int upend_idx; - /* first used idx for DMA done zerocopy buffers */ - int done_idx; - /* an array of userspace buffers info */ - struct ubuf_info *ubuf_info; - /* Reference counting for outstanding ubufs. - * Protected by vq mutex. Writers must also take device mutex. */ - struct vhost_ubuf_ref *ubufs; }; struct vhost_dev { @@ -150,7 +125,7 @@ struct vhost_dev { struct mm_struct *mm; struct mutex mutex; unsigned acked_features; - struct vhost_virtqueue *vqs; + struct vhost_virtqueue **vqs; int nvqs; struct file *log_file; struct eventfd_ctx *log_ctx; @@ -159,9 +134,10 @@ struct vhost_dev { struct task_struct *worker; }; -long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs); +long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, int nvqs); long vhost_dev_check_owner(struct vhost_dev *); -long vhost_dev_reset_owner(struct vhost_dev *); +struct vhost_memory *vhost_dev_reset_owner_prepare(void); +void vhost_dev_reset_owner(struct vhost_dev *, struct vhost_memory *); void vhost_dev_cleanup(struct vhost_dev *, bool locked); void vhost_dev_stop(struct vhost_dev *); long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, void __user *argp); |