From 597957000ab5b1b38085c20868f3f7b9c305bae5 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Sat, 14 Nov 2015 11:12:15 +0800 Subject: xen/blkback: separate ring information out of struct xen_blkif Split per ring information to a new structure "xen_blkif_ring", so that one vbd device can be associated with one or more rings/hardware queues. Introduce 'pers_gnts_lock' to protect the pool of persistent grants since we may have multi backend threads. This patch is a preparation for supporting multi hardware queues/rings. Signed-off-by: Arianna Avanzini Signed-off-by: Bob Liu Signed-off-by: Konrad Rzeszutek Wilk --- v2: Align the variables in the structure. --- drivers/block/xen-blkback/blkback.c | 235 ++++++++++++++++++++---------------- 1 file changed, 133 insertions(+), 102 deletions(-) (limited to 'drivers/block/xen-blkback/blkback.c') diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index f9099940c272..4fd8640d146c 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -173,11 +173,11 @@ static inline void shrink_free_pagepool(struct xen_blkif *blkif, int num) #define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page))) -static int do_block_io_op(struct xen_blkif *blkif); -static int dispatch_rw_block_io(struct xen_blkif *blkif, +static int do_block_io_op(struct xen_blkif_ring *ring); +static int dispatch_rw_block_io(struct xen_blkif_ring *ring, struct blkif_request *req, struct pending_req *pending_req); -static void make_response(struct xen_blkif *blkif, u64 id, +static void make_response(struct xen_blkif_ring *ring, u64 id, unsigned short op, int st); #define foreach_grant_safe(pos, n, rbtree, node) \ @@ -189,14 +189,8 @@ static void make_response(struct xen_blkif *blkif, u64 id, /* - * We don't need locking around the persistent grant helpers - * because blkback uses a single-thread for each backed, so we - * can be sure that this functions will never be called recursively. 
- * - * The only exception to that is put_persistent_grant, that can be called - * from interrupt context (by xen_blkbk_unmap), so we have to use atomic - * bit operations to modify the flags of a persistent grant and to count - * the number of used grants. + * pers_gnts_lock must be used around all the persistent grant helpers + * because blkback may use multi-thread/queue for each backend. */ static int add_persistent_gnt(struct xen_blkif *blkif, struct persistent_gnt *persistent_gnt) @@ -204,6 +198,7 @@ static int add_persistent_gnt(struct xen_blkif *blkif, struct rb_node **new = NULL, *parent = NULL; struct persistent_gnt *this; + BUG_ON(!spin_is_locked(&blkif->pers_gnts_lock)); if (blkif->persistent_gnt_c >= xen_blkif_max_pgrants) { if (!blkif->vbd.overflow_max_grants) blkif->vbd.overflow_max_grants = 1; @@ -241,6 +236,7 @@ static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif, struct persistent_gnt *data; struct rb_node *node = NULL; + BUG_ON(!spin_is_locked(&blkif->pers_gnts_lock)); node = blkif->persistent_gnts.rb_node; while (node) { data = container_of(node, struct persistent_gnt, node); @@ -265,6 +261,7 @@ static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif, static void put_persistent_gnt(struct xen_blkif *blkif, struct persistent_gnt *persistent_gnt) { + BUG_ON(!spin_is_locked(&blkif->pers_gnts_lock)); if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags)) pr_alert_ratelimited("freeing a grant already unused\n"); set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags); @@ -286,6 +283,7 @@ static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root, unmap_data.unmap_ops = unmap; unmap_data.kunmap_ops = NULL; + BUG_ON(!spin_is_locked(&blkif->pers_gnts_lock)); foreach_grant_safe(persistent_gnt, n, root, node) { BUG_ON(persistent_gnt->handle == BLKBACK_INVALID_HANDLE); @@ -322,11 +320,13 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work) int segs_to_unmap = 0; struct xen_blkif 
*blkif = container_of(work, typeof(*blkif), persistent_purge_work); struct gntab_unmap_queue_data unmap_data; + unsigned long flags; unmap_data.pages = pages; unmap_data.unmap_ops = unmap; unmap_data.kunmap_ops = NULL; + spin_lock_irqsave(&blkif->pers_gnts_lock, flags); while(!list_empty(&blkif->persistent_purge_list)) { persistent_gnt = list_first_entry(&blkif->persistent_purge_list, struct persistent_gnt, @@ -348,6 +348,7 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work) } kfree(persistent_gnt); } + spin_unlock_irqrestore(&blkif->pers_gnts_lock, flags); if (segs_to_unmap > 0) { unmap_data.count = segs_to_unmap; BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); @@ -362,16 +363,18 @@ static void purge_persistent_gnt(struct xen_blkif *blkif) unsigned int num_clean, total; bool scan_used = false, clean_used = false; struct rb_root *root; + unsigned long flags; + spin_lock_irqsave(&blkif->pers_gnts_lock, flags); if (blkif->persistent_gnt_c < xen_blkif_max_pgrants || (blkif->persistent_gnt_c == xen_blkif_max_pgrants && !blkif->vbd.overflow_max_grants)) { - return; + goto out; } if (work_busy(&blkif->persistent_purge_work)) { pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n"); - return; + goto out; } num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN; @@ -379,7 +382,7 @@ static void purge_persistent_gnt(struct xen_blkif *blkif) num_clean = min(blkif->persistent_gnt_c, num_clean); if ((num_clean == 0) || (num_clean > (blkif->persistent_gnt_c - atomic_read(&blkif->persistent_gnt_in_use)))) - return; + goto out; /* * At this point, we can assure that there will be no calls @@ -436,29 +439,35 @@ finished: } blkif->persistent_gnt_c -= (total - num_clean); + spin_unlock_irqrestore(&blkif->pers_gnts_lock, flags); blkif->vbd.overflow_max_grants = 0; /* We can defer this work */ schedule_work(&blkif->persistent_purge_work); pr_debug("Purged %u/%u\n", (total - num_clean), total); return; + +out: + 
spin_unlock_irqrestore(&blkif->pers_gnts_lock, flags); + + return; } /* * Retrieve from the 'pending_reqs' a free pending_req structure to be used. */ -static struct pending_req *alloc_req(struct xen_blkif *blkif) +static struct pending_req *alloc_req(struct xen_blkif_ring *ring) { struct pending_req *req = NULL; unsigned long flags; - spin_lock_irqsave(&blkif->pending_free_lock, flags); - if (!list_empty(&blkif->pending_free)) { - req = list_entry(blkif->pending_free.next, struct pending_req, + spin_lock_irqsave(&ring->pending_free_lock, flags); + if (!list_empty(&ring->pending_free)) { + req = list_entry(ring->pending_free.next, struct pending_req, free_list); list_del(&req->free_list); } - spin_unlock_irqrestore(&blkif->pending_free_lock, flags); + spin_unlock_irqrestore(&ring->pending_free_lock, flags); return req; } @@ -466,17 +475,17 @@ static struct pending_req *alloc_req(struct xen_blkif *blkif) * Return the 'pending_req' structure back to the freepool. We also * wake up the thread if it was waiting for a free page. */ -static void free_req(struct xen_blkif *blkif, struct pending_req *req) +static void free_req(struct xen_blkif_ring *ring, struct pending_req *req) { unsigned long flags; int was_empty; - spin_lock_irqsave(&blkif->pending_free_lock, flags); - was_empty = list_empty(&blkif->pending_free); - list_add(&req->free_list, &blkif->pending_free); - spin_unlock_irqrestore(&blkif->pending_free_lock, flags); + spin_lock_irqsave(&ring->pending_free_lock, flags); + was_empty = list_empty(&ring->pending_free); + list_add(&req->free_list, &ring->pending_free); + spin_unlock_irqrestore(&ring->pending_free_lock, flags); if (was_empty) - wake_up(&blkif->pending_free_wq); + wake_up(&ring->pending_free_wq); } /* @@ -556,10 +565,10 @@ abort: /* * Notification from the guest OS. 
*/ -static void blkif_notify_work(struct xen_blkif *blkif) +static void blkif_notify_work(struct xen_blkif_ring *ring) { - blkif->waiting_reqs = 1; - wake_up(&blkif->wq); + ring->waiting_reqs = 1; + wake_up(&ring->wq); } irqreturn_t xen_blkif_be_int(int irq, void *dev_id) @@ -590,7 +599,8 @@ static void print_stats(struct xen_blkif *blkif) int xen_blkif_schedule(void *arg) { - struct xen_blkif *blkif = arg; + struct xen_blkif_ring *ring = arg; + struct xen_blkif *blkif = ring->blkif; struct xen_vbd *vbd = &blkif->vbd; unsigned long timeout; int ret; @@ -606,27 +616,27 @@ int xen_blkif_schedule(void *arg) timeout = msecs_to_jiffies(LRU_INTERVAL); timeout = wait_event_interruptible_timeout( - blkif->wq, - blkif->waiting_reqs || kthread_should_stop(), + ring->wq, + ring->waiting_reqs || kthread_should_stop(), timeout); if (timeout == 0) goto purge_gnt_list; timeout = wait_event_interruptible_timeout( - blkif->pending_free_wq, - !list_empty(&blkif->pending_free) || + ring->pending_free_wq, + !list_empty(&ring->pending_free) || kthread_should_stop(), timeout); if (timeout == 0) goto purge_gnt_list; - blkif->waiting_reqs = 0; + ring->waiting_reqs = 0; smp_mb(); /* clear flag *before* checking for work */ - ret = do_block_io_op(blkif); + ret = do_block_io_op(ring); if (ret > 0) - blkif->waiting_reqs = 1; + ring->waiting_reqs = 1; if (ret == -EACCES) - wait_event_interruptible(blkif->shutdown_wq, + wait_event_interruptible(ring->shutdown_wq, kthread_should_stop()); purge_gnt_list: @@ -649,7 +659,7 @@ purge_gnt_list: if (log_stats) print_stats(blkif); - blkif->xenblkd = NULL; + ring->xenblkd = NULL; xen_blkif_put(blkif); return 0; @@ -658,32 +668,40 @@ purge_gnt_list: /* * Remove persistent grants and empty the pool of free pages */ -void xen_blkbk_free_caches(struct xen_blkif *blkif) +void xen_blkbk_free_caches(struct xen_blkif_ring *ring) { + struct xen_blkif *blkif = ring->blkif; + unsigned long flags; + /* Free all persistent grant pages */ + 
spin_lock_irqsave(&blkif->pers_gnts_lock, flags); if (!RB_EMPTY_ROOT(&blkif->persistent_gnts)) free_persistent_gnts(blkif, &blkif->persistent_gnts, blkif->persistent_gnt_c); BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); blkif->persistent_gnt_c = 0; + spin_unlock_irqrestore(&blkif->pers_gnts_lock, flags); /* Since we are shutting down remove all pages from the buffer */ shrink_free_pagepool(blkif, 0 /* All */); } static unsigned int xen_blkbk_unmap_prepare( - struct xen_blkif *blkif, + struct xen_blkif_ring *ring, struct grant_page **pages, unsigned int num, struct gnttab_unmap_grant_ref *unmap_ops, struct page **unmap_pages) { unsigned int i, invcount = 0; + unsigned long flags; for (i = 0; i < num; i++) { if (pages[i]->persistent_gnt != NULL) { - put_persistent_gnt(blkif, pages[i]->persistent_gnt); + spin_lock_irqsave(&ring->blkif->pers_gnts_lock, flags); + put_persistent_gnt(ring->blkif, pages[i]->persistent_gnt); + spin_unlock_irqrestore(&ring->blkif->pers_gnts_lock, flags); continue; } if (pages[i]->handle == BLKBACK_INVALID_HANDLE) @@ -700,17 +718,18 @@ static unsigned int xen_blkbk_unmap_prepare( static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_queue_data *data) { - struct pending_req* pending_req = (struct pending_req*) (data->data); - struct xen_blkif *blkif = pending_req->blkif; + struct pending_req *pending_req = (struct pending_req *)(data->data); + struct xen_blkif_ring *ring = pending_req->ring; + struct xen_blkif *blkif = ring->blkif; /* BUG_ON used to reproduce existing behaviour, but is this the best way to deal with this? 
*/ BUG_ON(result); put_free_pages(blkif, data->pages, data->count); - make_response(blkif, pending_req->id, + make_response(ring, pending_req->id, pending_req->operation, pending_req->status); - free_req(blkif, pending_req); + free_req(ring, pending_req); /* * Make sure the request is freed before releasing blkif, * or there could be a race between free_req and the @@ -723,7 +742,7 @@ static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_ * pending_free_wq if there's a drain going on, but it has * to be taken into account if the current model is changed. */ - if (atomic_dec_and_test(&blkif->inflight) && atomic_read(&blkif->drain)) { + if (atomic_dec_and_test(&ring->inflight) && atomic_read(&blkif->drain)) { complete(&blkif->drain_complete); } xen_blkif_put(blkif); @@ -732,11 +751,11 @@ static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_ static void xen_blkbk_unmap_and_respond(struct pending_req *req) { struct gntab_unmap_queue_data* work = &req->gnttab_unmap_data; - struct xen_blkif *blkif = req->blkif; + struct xen_blkif_ring *ring = req->ring; struct grant_page **pages = req->segments; unsigned int invcount; - invcount = xen_blkbk_unmap_prepare(blkif, pages, req->nr_segs, + invcount = xen_blkbk_unmap_prepare(ring, pages, req->nr_segs, req->unmap, req->unmap_pages); work->data = req; @@ -757,7 +776,7 @@ static void xen_blkbk_unmap_and_respond(struct pending_req *req) * of hypercalls, but since this is only used in error paths there's * no real need. 
*/ -static void xen_blkbk_unmap(struct xen_blkif *blkif, +static void xen_blkbk_unmap(struct xen_blkif_ring *ring, struct grant_page *pages[], int num) { @@ -768,20 +787,20 @@ static void xen_blkbk_unmap(struct xen_blkif *blkif, while (num) { unsigned int batch = min(num, BLKIF_MAX_SEGMENTS_PER_REQUEST); - - invcount = xen_blkbk_unmap_prepare(blkif, pages, batch, + + invcount = xen_blkbk_unmap_prepare(ring, pages, batch, unmap, unmap_pages); if (invcount) { ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount); BUG_ON(ret); - put_free_pages(blkif, unmap_pages, invcount); + put_free_pages(ring->blkif, unmap_pages, invcount); } pages += batch; num -= batch; } } -static int xen_blkbk_map(struct xen_blkif *blkif, +static int xen_blkbk_map(struct xen_blkif_ring *ring, struct grant_page *pages[], int num, bool ro) { @@ -794,6 +813,8 @@ static int xen_blkbk_map(struct xen_blkif *blkif, int ret = 0; int last_map = 0, map_until = 0; int use_persistent_gnts; + struct xen_blkif *blkif = ring->blkif; + unsigned long irq_flags; use_persistent_gnts = (blkif->vbd.feature_gnt_persistent); @@ -806,10 +827,13 @@ again: for (i = map_until; i < num; i++) { uint32_t flags; - if (use_persistent_gnts) + if (use_persistent_gnts) { + spin_lock_irqsave(&blkif->pers_gnts_lock, irq_flags); persistent_gnt = get_persistent_gnt( blkif, pages[i]->gref); + spin_unlock_irqrestore(&blkif->pers_gnts_lock, irq_flags); + } if (persistent_gnt) { /* @@ -880,8 +904,10 @@ again: persistent_gnt->gnt = map[new_map_idx].ref; persistent_gnt->handle = map[new_map_idx].handle; persistent_gnt->page = pages[seg_idx]->page; + spin_lock_irqsave(&blkif->pers_gnts_lock, irq_flags); if (add_persistent_gnt(blkif, persistent_gnt)) { + spin_unlock_irqrestore(&blkif->pers_gnts_lock, irq_flags); kfree(persistent_gnt); persistent_gnt = NULL; goto next; @@ -890,6 +916,7 @@ again: pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n", persistent_gnt->gnt, blkif->persistent_gnt_c, 
xen_blkif_max_pgrants); + spin_unlock_irqrestore(&blkif->pers_gnts_lock, irq_flags); goto next; } if (use_persistent_gnts && !blkif->vbd.overflow_max_grants) { @@ -921,7 +948,7 @@ static int xen_blkbk_map_seg(struct pending_req *pending_req) { int rc; - rc = xen_blkbk_map(pending_req->blkif, pending_req->segments, + rc = xen_blkbk_map(pending_req->ring, pending_req->segments, pending_req->nr_segs, (pending_req->operation != BLKIF_OP_READ)); @@ -934,7 +961,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req, struct phys_req *preq) { struct grant_page **pages = pending_req->indirect_pages; - struct xen_blkif *blkif = pending_req->blkif; + struct xen_blkif_ring *ring = pending_req->ring; int indirect_grefs, rc, n, nseg, i; struct blkif_request_segment *segments = NULL; @@ -945,7 +972,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req, for (i = 0; i < indirect_grefs; i++) pages[i]->gref = req->u.indirect.indirect_grefs[i]; - rc = xen_blkbk_map(blkif, pages, indirect_grefs, true); + rc = xen_blkbk_map(ring, pages, indirect_grefs, true); if (rc) goto unmap; @@ -972,15 +999,16 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req, unmap: if (segments) kunmap_atomic(segments); - xen_blkbk_unmap(blkif, pages, indirect_grefs); + xen_blkbk_unmap(ring, pages, indirect_grefs); return rc; } -static int dispatch_discard_io(struct xen_blkif *blkif, +static int dispatch_discard_io(struct xen_blkif_ring *ring, struct blkif_request *req) { int err = 0; int status = BLKIF_RSP_OKAY; + struct xen_blkif *blkif = ring->blkif; struct block_device *bdev = blkif->vbd.bdev; unsigned long secure; struct phys_req preq; @@ -1013,26 +1041,28 @@ fail_response: } else if (err) status = BLKIF_RSP_ERROR; - make_response(blkif, req->u.discard.id, req->operation, status); + make_response(ring, req->u.discard.id, req->operation, status); xen_blkif_put(blkif); return err; } -static int dispatch_other_io(struct xen_blkif *blkif, +static int 
dispatch_other_io(struct xen_blkif_ring *ring, struct blkif_request *req, struct pending_req *pending_req) { - free_req(blkif, pending_req); - make_response(blkif, req->u.other.id, req->operation, + free_req(ring, pending_req); + make_response(ring, req->u.other.id, req->operation, BLKIF_RSP_EOPNOTSUPP); return -EIO; } -static void xen_blk_drain_io(struct xen_blkif *blkif) +static void xen_blk_drain_io(struct xen_blkif_ring *ring) { + struct xen_blkif *blkif = ring->blkif; + atomic_set(&blkif->drain, 1); do { - if (atomic_read(&blkif->inflight) == 0) + if (atomic_read(&ring->inflight) == 0) break; wait_for_completion_interruptible_timeout( &blkif->drain_complete, HZ); @@ -1053,12 +1083,12 @@ static void __end_block_io_op(struct pending_req *pending_req, int error) if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) && (error == -EOPNOTSUPP)) { pr_debug("flush diskcache op failed, not supported\n"); - xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0); + xen_blkbk_flush_diskcache(XBT_NIL, pending_req->ring->blkif->be, 0); pending_req->status = BLKIF_RSP_EOPNOTSUPP; } else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) && (error == -EOPNOTSUPP)) { pr_debug("write barrier op failed, not supported\n"); - xen_blkbk_barrier(XBT_NIL, pending_req->blkif->be, 0); + xen_blkbk_barrier(XBT_NIL, pending_req->ring->blkif->be, 0); pending_req->status = BLKIF_RSP_EOPNOTSUPP; } else if (error) { pr_debug("Buffer not up-to-date at end of operation," @@ -1092,9 +1122,9 @@ static void end_block_io_op(struct bio *bio) * and transmute it to the block API to hand it over to the proper block disk. 
*/ static int -__do_block_io_op(struct xen_blkif *blkif) +__do_block_io_op(struct xen_blkif_ring *ring) { - union blkif_back_rings *blk_rings = &blkif->blk_rings; + union blkif_back_rings *blk_rings = &ring->blk_rings; struct blkif_request req; struct pending_req *pending_req; RING_IDX rc, rp; @@ -1107,7 +1137,7 @@ __do_block_io_op(struct xen_blkif *blkif) if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) { rc = blk_rings->common.rsp_prod_pvt; pr_warn("Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n", - rp, rc, rp - rc, blkif->vbd.pdevice); + rp, rc, rp - rc, ring->blkif->vbd.pdevice); return -EACCES; } while (rc != rp) { @@ -1120,14 +1150,14 @@ __do_block_io_op(struct xen_blkif *blkif) break; } - pending_req = alloc_req(blkif); + pending_req = alloc_req(ring); if (NULL == pending_req) { - blkif->st_oo_req++; + ring->blkif->st_oo_req++; more_to_do = 1; break; } - switch (blkif->blk_protocol) { + switch (ring->blkif->blk_protocol) { case BLKIF_PROTOCOL_NATIVE: memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req)); break; @@ -1151,16 +1181,16 @@ __do_block_io_op(struct xen_blkif *blkif) case BLKIF_OP_WRITE_BARRIER: case BLKIF_OP_FLUSH_DISKCACHE: case BLKIF_OP_INDIRECT: - if (dispatch_rw_block_io(blkif, &req, pending_req)) + if (dispatch_rw_block_io(ring, &req, pending_req)) goto done; break; case BLKIF_OP_DISCARD: - free_req(blkif, pending_req); - if (dispatch_discard_io(blkif, &req)) + free_req(ring, pending_req); + if (dispatch_discard_io(ring, &req)) goto done; break; default: - if (dispatch_other_io(blkif, &req, pending_req)) + if (dispatch_other_io(ring, &req, pending_req)) goto done; break; } @@ -1173,13 +1203,13 @@ done: } static int -do_block_io_op(struct xen_blkif *blkif) +do_block_io_op(struct xen_blkif_ring *ring) { - union blkif_back_rings *blk_rings = &blkif->blk_rings; + union blkif_back_rings *blk_rings = &ring->blk_rings; int more_to_do; do { - more_to_do = __do_block_io_op(blkif); + 
more_to_do = __do_block_io_op(ring); if (more_to_do) break; @@ -1192,7 +1222,7 @@ do_block_io_op(struct xen_blkif *blkif) * Transmutation of the 'struct blkif_request' to a proper 'struct bio' * and call the 'submit_bio' to pass it to the underlying storage. */ -static int dispatch_rw_block_io(struct xen_blkif *blkif, +static int dispatch_rw_block_io(struct xen_blkif_ring *ring, struct blkif_request *req, struct pending_req *pending_req) { @@ -1220,17 +1250,17 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, switch (req_operation) { case BLKIF_OP_READ: - blkif->st_rd_req++; + ring->blkif->st_rd_req++; operation = READ; break; case BLKIF_OP_WRITE: - blkif->st_wr_req++; + ring->blkif->st_wr_req++; operation = WRITE_ODIRECT; break; case BLKIF_OP_WRITE_BARRIER: drain = true; case BLKIF_OP_FLUSH_DISKCACHE: - blkif->st_f_req++; + ring->blkif->st_f_req++; operation = WRITE_FLUSH; break; default: @@ -1255,7 +1285,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, preq.nr_sects = 0; - pending_req->blkif = blkif; + pending_req->ring = ring; pending_req->id = req->u.rw.id; pending_req->operation = req_operation; pending_req->status = BLKIF_RSP_OKAY; @@ -1282,12 +1312,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, goto fail_response; } - if (xen_vbd_translate(&preq, blkif, operation) != 0) { + if (xen_vbd_translate(&preq, ring->blkif, operation) != 0) { pr_debug("access denied: %s of [%llu,%llu] on dev=%04x\n", operation == READ ? 
"read" : "write", preq.sector_number, preq.sector_number + preq.nr_sects, - blkif->vbd.pdevice); + ring->blkif->vbd.pdevice); goto fail_response; } @@ -1299,7 +1329,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, if (((int)preq.sector_number|(int)seg[i].nsec) & ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) { pr_debug("Misaligned I/O request from domain %d\n", - blkif->domid); + ring->blkif->domid); goto fail_response; } } @@ -1308,7 +1338,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, * issue the WRITE_FLUSH. */ if (drain) - xen_blk_drain_io(pending_req->blkif); + xen_blk_drain_io(pending_req->ring); /* * If we have failed at this point, we need to undo the M2P override, @@ -1323,8 +1353,8 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, * This corresponding xen_blkif_put is done in __end_block_io_op, or * below (in "!bio") if we are handling a BLKIF_OP_DISCARD. */ - xen_blkif_get(blkif); - atomic_inc(&blkif->inflight); + xen_blkif_get(ring->blkif); + atomic_inc(&ring->inflight); for (i = 0; i < nseg; i++) { while ((bio == NULL) || @@ -1372,19 +1402,19 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, blk_finish_plug(&plug); if (operation == READ) - blkif->st_rd_sect += preq.nr_sects; + ring->blkif->st_rd_sect += preq.nr_sects; else if (operation & WRITE) - blkif->st_wr_sect += preq.nr_sects; + ring->blkif->st_wr_sect += preq.nr_sects; return 0; fail_flush: - xen_blkbk_unmap(blkif, pending_req->segments, + xen_blkbk_unmap(ring, pending_req->segments, pending_req->nr_segs); fail_response: /* Haven't submitted any bio's yet. */ - make_response(blkif, req->u.rw.id, req_operation, BLKIF_RSP_ERROR); - free_req(blkif, pending_req); + make_response(ring, req->u.rw.id, req_operation, BLKIF_RSP_ERROR); + free_req(ring, pending_req); msleep(1); /* back off a bit */ return -EIO; @@ -1402,21 +1432,22 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, /* * Put a response on the ring on how the operation fared. 
*/ -static void make_response(struct xen_blkif *blkif, u64 id, +static void make_response(struct xen_blkif_ring *ring, u64 id, unsigned short op, int st) { struct blkif_response resp; unsigned long flags; - union blkif_back_rings *blk_rings = &blkif->blk_rings; + union blkif_back_rings *blk_rings; int notify; resp.id = id; resp.operation = op; resp.status = st; - spin_lock_irqsave(&blkif->blk_ring_lock, flags); + spin_lock_irqsave(&ring->blk_ring_lock, flags); + blk_rings = &ring->blk_rings; /* Place on the response ring for the relevant domain. */ - switch (blkif->blk_protocol) { + switch (ring->blkif->blk_protocol) { case BLKIF_PROTOCOL_NATIVE: memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt), &resp, sizeof(resp)); @@ -1434,9 +1465,9 @@ static void make_response(struct xen_blkif *blkif, u64 id, } blk_rings->common.rsp_prod_pvt++; RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify); - spin_unlock_irqrestore(&blkif->blk_ring_lock, flags); + spin_unlock_irqrestore(&ring->blk_ring_lock, flags); if (notify) - notify_remote_via_irq(blkif->irq); + notify_remote_via_irq(ring->irq); } static int __init xen_blkif_init(void) -- cgit v1.2.3 From d62d86000316d7ef38e1c2e9602c3ce6d1cb57bd Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Sat, 14 Nov 2015 11:12:17 +0800 Subject: xen/blkback: get the number of hardware queues/rings from blkfront Backend advertises "multi-queue-max-queues" to front, also get the negotiated number from "multi-queue-num-queues" written by blkfront. 
Signed-off-by: Bob Liu Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 13 +++++++++++++ drivers/block/xen-blkback/common.h | 1 + drivers/block/xen-blkback/xenbus.c | 34 ++++++++++++++++++++++++++++------ 3 files changed, 42 insertions(+), 6 deletions(-) (limited to 'drivers/block/xen-blkback/blkback.c') diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 4fd8640d146c..18b27770d80b 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -83,6 +83,16 @@ module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644); MODULE_PARM_DESC(max_persistent_grants, "Maximum number of grants to map persistently"); +/* + * Maximum number of rings/queues blkback supports, allow as many queues as there + * are CPUs if user has not specified a value. + */ +unsigned int xenblk_max_queues; +module_param_named(max_queues, xenblk_max_queues, uint, 0644); +MODULE_PARM_DESC(max_queues, + "Maximum number of hardware queues per virtual disk." \ + "By default it is the number of online CPUs."); + /* * Maximum order of pages to be used for the shared ring between front and * backend, 4KB page granularity is used. @@ -1483,6 +1493,9 @@ static int __init xen_blkif_init(void) xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER; } + if (xenblk_max_queues == 0) + xenblk_max_queues = num_online_cpus(); + rc = xen_blkif_interface_init(); if (rc) goto failed_init; diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index 310eff3cf43f..847444dc1df4 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -46,6 +46,7 @@ #include extern unsigned int xen_blkif_max_ring_order; +extern unsigned int xenblk_max_queues; /* * This is the maximum number of segments that would be allowed in indirect * requests. This value will also be passed to the frontend. 
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index f5bfedd0e948..0d6bb9383a68 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -181,12 +181,6 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid) blkif->st_print = jiffies; INIT_WORK(&blkif->persistent_purge_work, xen_blkbk_unmap_purged_grants); - blkif->nr_rings = 1; - if (xen_blkif_alloc_rings(blkif)) { - kmem_cache_free(xen_blkif_cachep, blkif); - return ERR_PTR(-ENOMEM); - } - return blkif; } @@ -595,6 +589,12 @@ static int xen_blkbk_probe(struct xenbus_device *dev, goto fail; } + /* Multi-queue: advertise how many queues are supported by us.*/ + err = xenbus_printf(XBT_NIL, dev->nodename, + "multi-queue-max-queues", "%u", xenblk_max_queues); + if (err) + pr_warn("Error writing multi-queue-max-queues\n"); + /* setup back pointer */ be->blkif->be = be; @@ -980,6 +980,7 @@ static int connect_ring(struct backend_info *be) char *xspath; size_t xspathsize; const size_t xenstore_path_ext_size = 11; /* sufficient for "/queue-NNN" */ + unsigned int requested_num_queues = 0; pr_debug("%s %s\n", __func__, dev->otherend); @@ -1007,6 +1008,27 @@ static int connect_ring(struct backend_info *be) be->blkif->vbd.feature_gnt_persistent = pers_grants; be->blkif->vbd.overflow_max_grants = 0; + /* + * Read the number of hardware queues from frontend. + */ + err = xenbus_scanf(XBT_NIL, dev->otherend, "multi-queue-num-queues", + "%u", &requested_num_queues); + if (err < 0) { + requested_num_queues = 1; + } else { + if (requested_num_queues > xenblk_max_queues + || requested_num_queues == 0) { + /* Buggy or malicious guest. 
*/ + xenbus_dev_fatal(dev, err, + "guest requested %u queues, exceeding the maximum of %u.", + requested_num_queues, xenblk_max_queues); + return -ENOSYS; + } + } + be->blkif->nr_rings = requested_num_queues; + if (xen_blkif_alloc_rings(be->blkif)) + return -ENOMEM; + pr_info("%s: using %d queues, protocol %d (%s) %s\n", dev->nodename, be->blkif->nr_rings, be->blkif->blk_protocol, protocol, pers_grants ? "persistent grants" : ""); -- cgit v1.2.3 From d4bf0065b7251afb723a29b2fd58f7c38f8ce297 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Sat, 14 Nov 2015 11:12:19 +0800 Subject: xen/blkback: make pool of persistent grants and free pages per-queue Make pool of persistent grants and free pages per-queue/ring instead of per-device to get better scalability. Test was done based on null_blk driver: dom0: v4.2-rc8 16vcpus 10GB "modprobe null_blk" domu: v4.2-rc8 16vcpus 10GB [test] rw=read direct=1 ioengine=libaio bs=4k time_based runtime=30 filename=/dev/xvdb numjobs=16 iodepth=64 iodepth_batch=64 iodepth_batch_complete=64 group_reporting Results: iops1: After patch "xen/blkfront: make persistent grants per-queue". iops2: After this patch. Queues: 1 4 8 16 Iops orig(k): 810 1064 780 700 Iops1(k): 810 1230(~20%) 1024(~20%) 850(~20%) Iops2(k): 810 1410(~35%) 1354(~75%) 1440(~100%) With 4 queues after this commit we can get ~75% increase in IOPS, and performance won't drop if increasing queue numbers. 
Please find the respective chart in this link: https://www.dropbox.com/s/agrcy2pbzbsvmwv/iops.png?dl=0 Signed-off-by: Bob Liu Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 202 ++++++++++++++++-------------------- drivers/block/xen-blkback/common.h | 32 +++--- drivers/block/xen-blkback/xenbus.c | 21 ++-- 3 files changed, 118 insertions(+), 137 deletions(-) (limited to 'drivers/block/xen-blkback/blkback.c') diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 18b27770d80b..a00d6c6c2880 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -123,60 +123,60 @@ module_param(log_stats, int, 0644); /* Number of free pages to remove on each call to gnttab_free_pages */ #define NUM_BATCH_FREE_PAGES 10 -static inline int get_free_page(struct xen_blkif *blkif, struct page **page) +static inline int get_free_page(struct xen_blkif_ring *ring, struct page **page) { unsigned long flags; - spin_lock_irqsave(&blkif->free_pages_lock, flags); - if (list_empty(&blkif->free_pages)) { - BUG_ON(blkif->free_pages_num != 0); - spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + spin_lock_irqsave(&ring->free_pages_lock, flags); + if (list_empty(&ring->free_pages)) { + BUG_ON(ring->free_pages_num != 0); + spin_unlock_irqrestore(&ring->free_pages_lock, flags); return gnttab_alloc_pages(1, page); } - BUG_ON(blkif->free_pages_num == 0); - page[0] = list_first_entry(&blkif->free_pages, struct page, lru); + BUG_ON(ring->free_pages_num == 0); + page[0] = list_first_entry(&ring->free_pages, struct page, lru); list_del(&page[0]->lru); - blkif->free_pages_num--; - spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + ring->free_pages_num--; + spin_unlock_irqrestore(&ring->free_pages_lock, flags); return 0; } -static inline void put_free_pages(struct xen_blkif *blkif, struct page **page, +static inline void put_free_pages(struct xen_blkif_ring *ring, struct page **page, int num) { 
unsigned long flags; int i; - spin_lock_irqsave(&blkif->free_pages_lock, flags); + spin_lock_irqsave(&ring->free_pages_lock, flags); for (i = 0; i < num; i++) - list_add(&page[i]->lru, &blkif->free_pages); - blkif->free_pages_num += num; - spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + list_add(&page[i]->lru, &ring->free_pages); + ring->free_pages_num += num; + spin_unlock_irqrestore(&ring->free_pages_lock, flags); } -static inline void shrink_free_pagepool(struct xen_blkif *blkif, int num) +static inline void shrink_free_pagepool(struct xen_blkif_ring *ring, int num) { /* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */ struct page *page[NUM_BATCH_FREE_PAGES]; unsigned int num_pages = 0; unsigned long flags; - spin_lock_irqsave(&blkif->free_pages_lock, flags); - while (blkif->free_pages_num > num) { - BUG_ON(list_empty(&blkif->free_pages)); - page[num_pages] = list_first_entry(&blkif->free_pages, + spin_lock_irqsave(&ring->free_pages_lock, flags); + while (ring->free_pages_num > num) { + BUG_ON(list_empty(&ring->free_pages)); + page[num_pages] = list_first_entry(&ring->free_pages, struct page, lru); list_del(&page[num_pages]->lru); - blkif->free_pages_num--; + ring->free_pages_num--; if (++num_pages == NUM_BATCH_FREE_PAGES) { - spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + spin_unlock_irqrestore(&ring->free_pages_lock, flags); gnttab_free_pages(num_pages, page); - spin_lock_irqsave(&blkif->free_pages_lock, flags); + spin_lock_irqsave(&ring->free_pages_lock, flags); num_pages = 0; } } - spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + spin_unlock_irqrestore(&ring->free_pages_lock, flags); if (num_pages != 0) gnttab_free_pages(num_pages, page); } @@ -199,23 +199,29 @@ static void make_response(struct xen_blkif_ring *ring, u64 id, /* - * pers_gnts_lock must be used around all the persistent grant helpers - * because blkback may use multi-thread/queue for each backend. 
+ * We don't need locking around the persistent grant helpers + * because blkback uses a single-thread for each backend, so we + * can be sure that this functions will never be called recursively. + * + * The only exception to that is put_persistent_grant, that can be called + * from interrupt context (by xen_blkbk_unmap), so we have to use atomic + * bit operations to modify the flags of a persistent grant and to count + * the number of used grants. */ -static int add_persistent_gnt(struct xen_blkif *blkif, +static int add_persistent_gnt(struct xen_blkif_ring *ring, struct persistent_gnt *persistent_gnt) { struct rb_node **new = NULL, *parent = NULL; struct persistent_gnt *this; + struct xen_blkif *blkif = ring->blkif; - BUG_ON(!spin_is_locked(&blkif->pers_gnts_lock)); - if (blkif->persistent_gnt_c >= xen_blkif_max_pgrants) { + if (ring->persistent_gnt_c >= xen_blkif_max_pgrants) { if (!blkif->vbd.overflow_max_grants) blkif->vbd.overflow_max_grants = 1; return -EBUSY; } /* Figure out where to put new node */ - new = &blkif->persistent_gnts.rb_node; + new = &ring->persistent_gnts.rb_node; while (*new) { this = container_of(*new, struct persistent_gnt, node); @@ -234,20 +240,19 @@ static int add_persistent_gnt(struct xen_blkif *blkif, set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags); /* Add new node and rebalance tree. 
*/ rb_link_node(&(persistent_gnt->node), parent, new); - rb_insert_color(&(persistent_gnt->node), &blkif->persistent_gnts); - blkif->persistent_gnt_c++; - atomic_inc(&blkif->persistent_gnt_in_use); + rb_insert_color(&(persistent_gnt->node), &ring->persistent_gnts); + ring->persistent_gnt_c++; + atomic_inc(&ring->persistent_gnt_in_use); return 0; } -static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif, +static struct persistent_gnt *get_persistent_gnt(struct xen_blkif_ring *ring, grant_ref_t gref) { struct persistent_gnt *data; struct rb_node *node = NULL; - BUG_ON(!spin_is_locked(&blkif->pers_gnts_lock)); - node = blkif->persistent_gnts.rb_node; + node = ring->persistent_gnts.rb_node; while (node) { data = container_of(node, struct persistent_gnt, node); @@ -261,25 +266,24 @@ static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif, return NULL; } set_bit(PERSISTENT_GNT_ACTIVE, data->flags); - atomic_inc(&blkif->persistent_gnt_in_use); + atomic_inc(&ring->persistent_gnt_in_use); return data; } } return NULL; } -static void put_persistent_gnt(struct xen_blkif *blkif, +static void put_persistent_gnt(struct xen_blkif_ring *ring, struct persistent_gnt *persistent_gnt) { - BUG_ON(!spin_is_locked(&blkif->pers_gnts_lock)); if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags)) pr_alert_ratelimited("freeing a grant already unused\n"); set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags); clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags); - atomic_dec(&blkif->persistent_gnt_in_use); + atomic_dec(&ring->persistent_gnt_in_use); } -static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root, +static void free_persistent_gnts(struct xen_blkif_ring *ring, struct rb_root *root, unsigned int num) { struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; @@ -293,7 +297,6 @@ static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root, unmap_data.unmap_ops = unmap; 
unmap_data.kunmap_ops = NULL; - BUG_ON(!spin_is_locked(&blkif->pers_gnts_lock)); foreach_grant_safe(persistent_gnt, n, root, node) { BUG_ON(persistent_gnt->handle == BLKBACK_INVALID_HANDLE); @@ -311,7 +314,7 @@ static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root, unmap_data.count = segs_to_unmap; BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); - put_free_pages(blkif, pages, segs_to_unmap); + put_free_pages(ring, pages, segs_to_unmap); segs_to_unmap = 0; } @@ -328,17 +331,15 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work) struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; struct persistent_gnt *persistent_gnt; int segs_to_unmap = 0; - struct xen_blkif *blkif = container_of(work, typeof(*blkif), persistent_purge_work); + struct xen_blkif_ring *ring = container_of(work, typeof(*ring), persistent_purge_work); struct gntab_unmap_queue_data unmap_data; - unsigned long flags; unmap_data.pages = pages; unmap_data.unmap_ops = unmap; unmap_data.kunmap_ops = NULL; - spin_lock_irqsave(&blkif->pers_gnts_lock, flags); - while(!list_empty(&blkif->persistent_purge_list)) { - persistent_gnt = list_first_entry(&blkif->persistent_purge_list, + while(!list_empty(&ring->persistent_purge_list)) { + persistent_gnt = list_first_entry(&ring->persistent_purge_list, struct persistent_gnt, remove_node); list_del(&persistent_gnt->remove_node); @@ -353,45 +354,42 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work) if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) { unmap_data.count = segs_to_unmap; BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); - put_free_pages(blkif, pages, segs_to_unmap); + put_free_pages(ring, pages, segs_to_unmap); segs_to_unmap = 0; } kfree(persistent_gnt); } - spin_unlock_irqrestore(&blkif->pers_gnts_lock, flags); if (segs_to_unmap > 0) { unmap_data.count = segs_to_unmap; BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); - put_free_pages(blkif, pages, segs_to_unmap); + put_free_pages(ring, pages, segs_to_unmap); } } 
-static void purge_persistent_gnt(struct xen_blkif *blkif) +static void purge_persistent_gnt(struct xen_blkif_ring *ring) { struct persistent_gnt *persistent_gnt; struct rb_node *n; unsigned int num_clean, total; bool scan_used = false, clean_used = false; struct rb_root *root; - unsigned long flags; - spin_lock_irqsave(&blkif->pers_gnts_lock, flags); - if (blkif->persistent_gnt_c < xen_blkif_max_pgrants || - (blkif->persistent_gnt_c == xen_blkif_max_pgrants && - !blkif->vbd.overflow_max_grants)) { + if (ring->persistent_gnt_c < xen_blkif_max_pgrants || + (ring->persistent_gnt_c == xen_blkif_max_pgrants && + !ring->blkif->vbd.overflow_max_grants)) { goto out; } - if (work_busy(&blkif->persistent_purge_work)) { + if (work_busy(&ring->persistent_purge_work)) { pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n"); goto out; } num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN; - num_clean = blkif->persistent_gnt_c - xen_blkif_max_pgrants + num_clean; - num_clean = min(blkif->persistent_gnt_c, num_clean); + num_clean = ring->persistent_gnt_c - xen_blkif_max_pgrants + num_clean; + num_clean = min(ring->persistent_gnt_c, num_clean); if ((num_clean == 0) || - (num_clean > (blkif->persistent_gnt_c - atomic_read(&blkif->persistent_gnt_in_use)))) + (num_clean > (ring->persistent_gnt_c - atomic_read(&ring->persistent_gnt_in_use)))) goto out; /* @@ -407,8 +405,8 @@ static void purge_persistent_gnt(struct xen_blkif *blkif) pr_debug("Going to purge %u persistent grants\n", num_clean); - BUG_ON(!list_empty(&blkif->persistent_purge_list)); - root = &blkif->persistent_gnts; + BUG_ON(!list_empty(&ring->persistent_purge_list)); + root = &ring->persistent_gnts; purge_list: foreach_grant_safe(persistent_gnt, n, root, node) { BUG_ON(persistent_gnt->handle == @@ -427,7 +425,7 @@ purge_list: rb_erase(&persistent_gnt->node, root); list_add(&persistent_gnt->remove_node, - &blkif->persistent_purge_list); + &ring->persistent_purge_list); if 
(--num_clean == 0) goto finished; } @@ -448,18 +446,14 @@ finished: goto purge_list; } - blkif->persistent_gnt_c -= (total - num_clean); - spin_unlock_irqrestore(&blkif->pers_gnts_lock, flags); - blkif->vbd.overflow_max_grants = 0; + ring->persistent_gnt_c -= (total - num_clean); + ring->blkif->vbd.overflow_max_grants = 0; /* We can defer this work */ - schedule_work(&blkif->persistent_purge_work); + schedule_work(&ring->persistent_purge_work); pr_debug("Purged %u/%u\n", (total - num_clean), total); - return; out: - spin_unlock_irqrestore(&blkif->pers_gnts_lock, flags); - return; } @@ -591,14 +585,16 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id) * SCHEDULER FUNCTIONS */ -static void print_stats(struct xen_blkif *blkif) +static void print_stats(struct xen_blkif_ring *ring) { + struct xen_blkif *blkif = ring->blkif; + pr_info("(%s): oo %3llu | rd %4llu | wr %4llu | f %4llu" " | ds %4llu | pg: %4u/%4d\n", current->comm, blkif->st_oo_req, blkif->st_rd_req, blkif->st_wr_req, blkif->st_f_req, blkif->st_ds_req, - blkif->persistent_gnt_c, + ring->persistent_gnt_c, xen_blkif_max_pgrants); blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); blkif->st_rd_req = 0; @@ -651,23 +647,23 @@ int xen_blkif_schedule(void *arg) purge_gnt_list: if (blkif->vbd.feature_gnt_persistent && - time_after(jiffies, blkif->next_lru)) { - purge_persistent_gnt(blkif); - blkif->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL); + time_after(jiffies, ring->next_lru)) { + purge_persistent_gnt(ring); + ring->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL); } /* Shrink if we have more than xen_blkif_max_buffer_pages */ - shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages); + shrink_free_pagepool(ring, xen_blkif_max_buffer_pages); - if (log_stats && time_after(jiffies, blkif->st_print)) - print_stats(blkif); + if (log_stats && time_after(jiffies, ring->blkif->st_print)) + print_stats(ring); } /* Drain pending purge work */ - flush_work(&blkif->persistent_purge_work); + 
flush_work(&ring->persistent_purge_work); if (log_stats) - print_stats(blkif); + print_stats(ring); ring->xenblkd = NULL; xen_blkif_put(blkif); @@ -680,21 +676,16 @@ purge_gnt_list: */ void xen_blkbk_free_caches(struct xen_blkif_ring *ring) { - struct xen_blkif *blkif = ring->blkif; - unsigned long flags; - /* Free all persistent grant pages */ - spin_lock_irqsave(&blkif->pers_gnts_lock, flags); - if (!RB_EMPTY_ROOT(&blkif->persistent_gnts)) - free_persistent_gnts(blkif, &blkif->persistent_gnts, - blkif->persistent_gnt_c); + if (!RB_EMPTY_ROOT(&ring->persistent_gnts)) + free_persistent_gnts(ring, &ring->persistent_gnts, + ring->persistent_gnt_c); - BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); - blkif->persistent_gnt_c = 0; - spin_unlock_irqrestore(&blkif->pers_gnts_lock, flags); + BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts)); + ring->persistent_gnt_c = 0; /* Since we are shutting down remove all pages from the buffer */ - shrink_free_pagepool(blkif, 0 /* All */); + shrink_free_pagepool(ring, 0 /* All */); } static unsigned int xen_blkbk_unmap_prepare( @@ -705,13 +696,10 @@ static unsigned int xen_blkbk_unmap_prepare( struct page **unmap_pages) { unsigned int i, invcount = 0; - unsigned long flags; for (i = 0; i < num; i++) { if (pages[i]->persistent_gnt != NULL) { - spin_lock_irqsave(&ring->blkif->pers_gnts_lock, flags); - put_persistent_gnt(ring->blkif, pages[i]->persistent_gnt); - spin_unlock_irqrestore(&ring->blkif->pers_gnts_lock, flags); + put_persistent_gnt(ring, pages[i]->persistent_gnt); continue; } if (pages[i]->handle == BLKBACK_INVALID_HANDLE) @@ -736,7 +724,7 @@ static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_ but is this the best way to deal with this? 
*/ BUG_ON(result); - put_free_pages(blkif, data->pages, data->count); + put_free_pages(ring, data->pages, data->count); make_response(ring, pending_req->id, pending_req->operation, pending_req->status); free_req(ring, pending_req); @@ -803,7 +791,7 @@ static void xen_blkbk_unmap(struct xen_blkif_ring *ring, if (invcount) { ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount); BUG_ON(ret); - put_free_pages(ring->blkif, unmap_pages, invcount); + put_free_pages(ring, unmap_pages, invcount); } pages += batch; num -= batch; @@ -824,7 +812,6 @@ static int xen_blkbk_map(struct xen_blkif_ring *ring, int last_map = 0, map_until = 0; int use_persistent_gnts; struct xen_blkif *blkif = ring->blkif; - unsigned long irq_flags; use_persistent_gnts = (blkif->vbd.feature_gnt_persistent); @@ -838,11 +825,9 @@ again: uint32_t flags; if (use_persistent_gnts) { - spin_lock_irqsave(&blkif->pers_gnts_lock, irq_flags); persistent_gnt = get_persistent_gnt( - blkif, + ring, pages[i]->gref); - spin_unlock_irqrestore(&blkif->pers_gnts_lock, irq_flags); } if (persistent_gnt) { @@ -853,7 +838,7 @@ again: pages[i]->page = persistent_gnt->page; pages[i]->persistent_gnt = persistent_gnt; } else { - if (get_free_page(blkif, &pages[i]->page)) + if (get_free_page(ring, &pages[i]->page)) goto out_of_memory; addr = vaddr(pages[i]->page); pages_to_gnt[segs_to_map] = pages[i]->page; @@ -886,7 +871,7 @@ again: BUG_ON(new_map_idx >= segs_to_map); if (unlikely(map[new_map_idx].status != 0)) { pr_debug("invalid buffer -- could not remap it\n"); - put_free_pages(blkif, &pages[seg_idx]->page, 1); + put_free_pages(ring, &pages[seg_idx]->page, 1); pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE; ret |= 1; goto next; @@ -896,7 +881,7 @@ again: continue; } if (use_persistent_gnts && - blkif->persistent_gnt_c < xen_blkif_max_pgrants) { + ring->persistent_gnt_c < xen_blkif_max_pgrants) { /* * We are using persistent grants, the grant is * not mapped but we might have room for it. 
@@ -914,19 +899,16 @@ again: persistent_gnt->gnt = map[new_map_idx].ref; persistent_gnt->handle = map[new_map_idx].handle; persistent_gnt->page = pages[seg_idx]->page; - spin_lock_irqsave(&blkif->pers_gnts_lock, irq_flags); - if (add_persistent_gnt(blkif, + if (add_persistent_gnt(ring, persistent_gnt)) { - spin_unlock_irqrestore(&blkif->pers_gnts_lock, irq_flags); kfree(persistent_gnt); persistent_gnt = NULL; goto next; } pages[seg_idx]->persistent_gnt = persistent_gnt; pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n", - persistent_gnt->gnt, blkif->persistent_gnt_c, + persistent_gnt->gnt, ring->persistent_gnt_c, xen_blkif_max_pgrants); - spin_unlock_irqrestore(&blkif->pers_gnts_lock, irq_flags); goto next; } if (use_persistent_gnts && !blkif->vbd.overflow_max_grants) { @@ -950,7 +932,7 @@ next: out_of_memory: pr_alert("%s: out of memory\n", __func__); - put_free_pages(blkif, pages_to_gnt, segs_to_map); + put_free_pages(ring, pages_to_gnt, segs_to_map); return -ENOMEM; } diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index 847444dc1df4..3c244ecf22a4 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -291,6 +291,22 @@ struct xen_blkif_ring { spinlock_t pending_free_lock; wait_queue_head_t pending_free_wq; + /* Tree to store persistent grants. */ + spinlock_t pers_gnts_lock; + struct rb_root persistent_gnts; + unsigned int persistent_gnt_c; + atomic_t persistent_gnt_in_use; + unsigned long next_lru; + + /* Used by the kworker that offload work from the persistent purge. */ + struct list_head persistent_purge_list; + struct work_struct persistent_purge_work; + + /* Buffer of free pages to map grant refs. */ + spinlock_t free_pages_lock; + int free_pages_num; + struct list_head free_pages; + struct work_struct free_work; /* Thread shutdown wait queue. 
*/ wait_queue_head_t shutdown_wq; @@ -312,22 +328,6 @@ struct xen_blkif { struct completion drain_complete; atomic_t drain; - /* tree to store persistent grants */ - spinlock_t pers_gnts_lock; - struct rb_root persistent_gnts; - unsigned int persistent_gnt_c; - atomic_t persistent_gnt_in_use; - unsigned long next_lru; - - /* used by the kworker that offload work from the persistent purge */ - struct list_head persistent_purge_list; - struct work_struct persistent_purge_work; - - /* buffer of free pages to map grant refs */ - spinlock_t free_pages_lock; - int free_pages_num; - struct list_head free_pages; - /* statistics */ unsigned long st_print; unsigned long long st_rd_req; diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 0d6bb9383a68..2b8650a9a6a9 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -150,6 +150,10 @@ static int xen_blkif_alloc_rings(struct xen_blkif *blkif) spin_lock_init(&ring->blk_ring_lock); init_waitqueue_head(&ring->wq); INIT_LIST_HEAD(&ring->pending_free); + INIT_LIST_HEAD(&ring->persistent_purge_list); + INIT_WORK(&ring->persistent_purge_work, xen_blkbk_unmap_purged_grants); + spin_lock_init(&ring->free_pages_lock); + INIT_LIST_HEAD(&ring->free_pages); spin_lock_init(&ring->pending_free_lock); init_waitqueue_head(&ring->pending_free_wq); @@ -175,11 +179,7 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid) atomic_set(&blkif->refcnt, 1); init_completion(&blkif->drain_complete); INIT_WORK(&blkif->free_work, xen_blkif_deferred_free); - spin_lock_init(&blkif->free_pages_lock); - INIT_LIST_HEAD(&blkif->free_pages); - INIT_LIST_HEAD(&blkif->persistent_purge_list); blkif->st_print = jiffies; - INIT_WORK(&blkif->persistent_purge_work, xen_blkbk_unmap_purged_grants); return blkif; } @@ -290,6 +290,12 @@ static int xen_blkif_disconnect(struct xen_blkif *blkif) i++; } + BUG_ON(atomic_read(&ring->persistent_gnt_in_use) != 0); + 
BUG_ON(!list_empty(&ring->persistent_purge_list)); + BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts)); + BUG_ON(!list_empty(&ring->free_pages)); + BUG_ON(ring->free_pages_num != 0); + BUG_ON(ring->persistent_gnt_c != 0); WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages)); } blkif->nr_ring_pages = 0; @@ -304,13 +310,6 @@ static void xen_blkif_free(struct xen_blkif *blkif) xen_vbd_free(&blkif->vbd); /* Make sure everything is drained before shutting down */ - BUG_ON(blkif->persistent_gnt_c != 0); - BUG_ON(atomic_read(&blkif->persistent_gnt_in_use) != 0); - BUG_ON(blkif->free_pages_num != 0); - BUG_ON(!list_empty(&blkif->persistent_purge_list)); - BUG_ON(!list_empty(&blkif->free_pages)); - BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); - kfree(blkif->rings); kmem_cache_free(xen_blkif_cachep, blkif); } -- cgit v1.2.3 From a6e7af1288eeb7fca8361356998d31a92a291531 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Mon, 26 Oct 2015 14:47:21 +0900 Subject: xen-blkback: clear PF_NOFREEZE for xen_blkif_schedule() xen_blkif_schedule() kthread calls try_to_freeze() at the beginning of every attempt to purge the LRU. This operation can't ever succeed though, as the kthread hasn't marked itself as freezable. Before (hopefully eventually) kthread freezing gets converted to filesystem freezing, we'd rather mark xen_blkif_schedule() freezable (as it can generate I/O during suspend). 
Signed-off-by: Jiri Kosina Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/block/xen-blkback/blkback.c') diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index a00d6c6c2880..99b479f330af 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -613,6 +613,7 @@ int xen_blkif_schedule(void *arg) xen_blkif_get(blkif); + set_freezable(); while (!kthread_should_stop()) { if (try_to_freeze()) continue; -- cgit v1.2.3 From db6fbc106786f26d95889c50c18b1f28aa543a17 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Wed, 9 Dec 2015 07:44:02 +0800 Subject: xen/blkback: make st_ statistics per ring Make st_* statistics per ring and the VBD sysfs would iterate over all the rings. Note: xenvbd_sysfs_delif() is called in xen_blkbk_remove() before all rings are torn down, so it's safe. Signed-off-by: Bob Liu Signed-off-by: Konrad Rzeszutek Wilk --- v2: Aligned the variables on the same column. 
--- drivers/block/xen-blkback/blkback.c | 34 +++++++++++++--------------- drivers/block/xen-blkback/common.h | 20 ++++++++--------- drivers/block/xen-blkback/xenbus.c | 45 ++++++++++++++++++++++++++++--------- 3 files changed, 61 insertions(+), 38 deletions(-) (limited to 'drivers/block/xen-blkback/blkback.c') diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 99b479f330af..148930c8c121 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -587,20 +587,18 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id) static void print_stats(struct xen_blkif_ring *ring) { - struct xen_blkif *blkif = ring->blkif; - pr_info("(%s): oo %3llu | rd %4llu | wr %4llu | f %4llu" " | ds %4llu | pg: %4u/%4d\n", - current->comm, blkif->st_oo_req, - blkif->st_rd_req, blkif->st_wr_req, - blkif->st_f_req, blkif->st_ds_req, + current->comm, ring->st_oo_req, + ring->st_rd_req, ring->st_wr_req, + ring->st_f_req, ring->st_ds_req, ring->persistent_gnt_c, xen_blkif_max_pgrants); - blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); - blkif->st_rd_req = 0; - blkif->st_wr_req = 0; - blkif->st_oo_req = 0; - blkif->st_ds_req = 0; + ring->st_print = jiffies + msecs_to_jiffies(10 * 1000); + ring->st_rd_req = 0; + ring->st_wr_req = 0; + ring->st_oo_req = 0; + ring->st_ds_req = 0; } int xen_blkif_schedule(void *arg) @@ -656,7 +654,7 @@ purge_gnt_list: /* Shrink if we have more than xen_blkif_max_buffer_pages */ shrink_free_pagepool(ring, xen_blkif_max_buffer_pages); - if (log_stats && time_after(jiffies, ring->blkif->st_print)) + if (log_stats && time_after(jiffies, ring->st_print)) print_stats(ring); } @@ -1018,7 +1016,7 @@ static int dispatch_discard_io(struct xen_blkif_ring *ring, preq.sector_number + preq.nr_sects, blkif->vbd.pdevice); goto fail_response; } - blkif->st_ds_req++; + ring->st_ds_req++; secure = (blkif->vbd.discard_secure && (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ? 
@@ -1145,7 +1143,7 @@ __do_block_io_op(struct xen_blkif_ring *ring) pending_req = alloc_req(ring); if (NULL == pending_req) { - ring->blkif->st_oo_req++; + ring->st_oo_req++; more_to_do = 1; break; } @@ -1243,17 +1241,17 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, switch (req_operation) { case BLKIF_OP_READ: - ring->blkif->st_rd_req++; + ring->st_rd_req++; operation = READ; break; case BLKIF_OP_WRITE: - ring->blkif->st_wr_req++; + ring->st_wr_req++; operation = WRITE_ODIRECT; break; case BLKIF_OP_WRITE_BARRIER: drain = true; case BLKIF_OP_FLUSH_DISKCACHE: - ring->blkif->st_f_req++; + ring->st_f_req++; operation = WRITE_FLUSH; break; default: @@ -1395,9 +1393,9 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, blk_finish_plug(&plug); if (operation == READ) - ring->blkif->st_rd_sect += preq.nr_sects; + ring->st_rd_sect += preq.nr_sects; else if (operation & WRITE) - ring->blkif->st_wr_sect += preq.nr_sects; + ring->st_wr_sect += preq.nr_sects; return 0; diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index 3c244ecf22a4..b27c5ba15600 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -298,6 +298,16 @@ struct xen_blkif_ring { atomic_t persistent_gnt_in_use; unsigned long next_lru; + /* Statistics. */ + unsigned long st_print; + unsigned long long st_rd_req; + unsigned long long st_wr_req; + unsigned long long st_oo_req; + unsigned long long st_f_req; + unsigned long long st_ds_req; + unsigned long long st_rd_sect; + unsigned long long st_wr_sect; + /* Used by the kworker that offload work from the persistent purge. 
*/ struct list_head persistent_purge_list; struct work_struct persistent_purge_work; @@ -328,16 +338,6 @@ struct xen_blkif { struct completion drain_complete; atomic_t drain; - /* statistics */ - unsigned long st_print; - unsigned long long st_rd_req; - unsigned long long st_wr_req; - unsigned long long st_oo_req; - unsigned long long st_f_req; - unsigned long long st_ds_req; - unsigned long long st_rd_sect; - unsigned long long st_wr_sect; - struct work_struct free_work; unsigned int nr_ring_pages; /* All rings for this device. */ diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index c92b35882720..44396b8a0cb2 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -159,6 +159,7 @@ static int xen_blkif_alloc_rings(struct xen_blkif *blkif) init_waitqueue_head(&ring->pending_free_wq); init_waitqueue_head(&ring->shutdown_wq); ring->blkif = blkif; + ring->st_print = jiffies; xen_blkif_get(blkif); } @@ -179,7 +180,6 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid) atomic_set(&blkif->refcnt, 1); init_completion(&blkif->drain_complete); INIT_WORK(&blkif->free_work, xen_blkif_deferred_free); - blkif->st_print = jiffies; return blkif; } @@ -329,25 +329,38 @@ int __init xen_blkif_interface_init(void) * sysfs interface for VBD I/O requests */ -#define VBD_SHOW(name, format, args...) 
\ +#define VBD_SHOW_ALLRING(name, format) \ static ssize_t show_##name(struct device *_dev, \ struct device_attribute *attr, \ char *buf) \ { \ struct xenbus_device *dev = to_xenbus_device(_dev); \ struct backend_info *be = dev_get_drvdata(&dev->dev); \ + struct xen_blkif *blkif = be->blkif; \ + unsigned int i; \ + unsigned long long result = 0; \ \ - return sprintf(buf, format, ##args); \ + if (!blkif->rings) \ + goto out; \ + \ + for (i = 0; i < blkif->nr_rings; i++) { \ + struct xen_blkif_ring *ring = &blkif->rings[i]; \ + \ + result += ring->st_##name; \ + } \ + \ +out: \ + return sprintf(buf, format, result); \ } \ static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) -VBD_SHOW(oo_req, "%llu\n", be->blkif->st_oo_req); -VBD_SHOW(rd_req, "%llu\n", be->blkif->st_rd_req); -VBD_SHOW(wr_req, "%llu\n", be->blkif->st_wr_req); -VBD_SHOW(f_req, "%llu\n", be->blkif->st_f_req); -VBD_SHOW(ds_req, "%llu\n", be->blkif->st_ds_req); -VBD_SHOW(rd_sect, "%llu\n", be->blkif->st_rd_sect); -VBD_SHOW(wr_sect, "%llu\n", be->blkif->st_wr_sect); +VBD_SHOW_ALLRING(oo_req, "%llu\n"); +VBD_SHOW_ALLRING(rd_req, "%llu\n"); +VBD_SHOW_ALLRING(wr_req, "%llu\n"); +VBD_SHOW_ALLRING(f_req, "%llu\n"); +VBD_SHOW_ALLRING(ds_req, "%llu\n"); +VBD_SHOW_ALLRING(rd_sect, "%llu\n"); +VBD_SHOW_ALLRING(wr_sect, "%llu\n"); static struct attribute *xen_vbdstat_attrs[] = { &dev_attr_oo_req.attr, @@ -365,6 +378,18 @@ static struct attribute_group xen_vbdstat_group = { .attrs = xen_vbdstat_attrs, }; +#define VBD_SHOW(name, format, args...) \ + static ssize_t show_##name(struct device *_dev, \ + struct device_attribute *attr, \ + char *buf) \ + { \ + struct xenbus_device *dev = to_xenbus_device(_dev); \ + struct backend_info *be = dev_get_drvdata(&dev->dev); \ + \ + return sprintf(buf, format, ##args); \ + } \ + static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) + VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor); VBD_SHOW(mode, "%s\n", be->mode); -- cgit v1.2.3