summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSagi Grimberg <sagi@grimberg.me>2022-10-23 11:04:43 +0300
committerChristoph Hellwig <hch@lst.de>2022-10-25 18:07:50 +0300
commit83e1226b0ee2d7e3fb6e002fbbfc6ab36aabdc35 (patch)
tree5e2deba4bcb1f8ac63cdb6986e6e2253e558c515
parent5fa9add66b00ad0c796185ff7438eaa3e67c1187 (diff)
downloadlinux-83e1226b0ee2d7e3fb6e002fbbfc6ab36aabdc35.tar.xz
nvme-tcp: fix possible circular locking when deleting a controller under memory pressure
When destroying a queue, when calling sock_release, the network stack might need to allocate an skb to send a FIN/RST. When that happens during memory pressure, there is a need to reclaim memory, which in turn may ask the nvme-tcp device to write out dirty pages, however this is not possible due to a ctrl teardown that is going on. Set PF_MEMALLOC to the task that releases the socket to grant access to PF_MEMALLOC reserves. In addition, do the same for the nvme-tcp thread as this may also originate from the swap itself and should be more resilient to memory pressure situations. This fixes the following lockdep complaint: -- ====================================================== WARNING: possible circular locking dependency detected 6.0.0-rc2+ #25 Tainted: G W ------------------------------------------------------ kswapd0/92 is trying to acquire lock: ffff888114003240 (sk_lock-AF_INET-NVME){+.+.}-{0:0}, at: tcp_sendpage+0x23/0xa0 but task is already holding lock: ffffffff97e95ca0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x987/0x10d0 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (fs_reclaim){+.+.}-{0:0}: fs_reclaim_acquire+0x11e/0x160 kmem_cache_alloc_node+0x44/0x530 __alloc_skb+0x158/0x230 tcp_send_active_reset+0x7e/0x730 tcp_disconnect+0x1272/0x1ae0 __tcp_close+0x707/0xd90 tcp_close+0x26/0x80 inet_release+0xfa/0x220 sock_release+0x85/0x1a0 nvme_tcp_free_queue+0x1fd/0x470 [nvme_tcp] nvme_do_delete_ctrl+0x130/0x13d [nvme_core] nvme_sysfs_delete.cold+0x8/0xd [nvme_core] kernfs_fop_write_iter+0x356/0x530 vfs_write+0x4e8/0xce0 ksys_write+0xfd/0x1d0 do_syscall_64+0x58/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd -> #0 (sk_lock-AF_INET-NVME){+.+.}-{0:0}: __lock_acquire+0x2a0c/0x5690 lock_acquire+0x18e/0x4f0 lock_sock_nested+0x37/0xc0 tcp_sendpage+0x23/0xa0 inet_sendpage+0xad/0x120 kernel_sendpage+0x156/0x440 nvme_tcp_try_send+0x48a/0x2630 [nvme_tcp] nvme_tcp_queue_rq+0xefb/0x17e0 [nvme_tcp] __blk_mq_try_issue_directly+0x452/0x660 blk_mq_plug_issue_direct.constprop.0+0x207/0x700 blk_mq_flush_plug_list+0x6f5/0xc70 __blk_flush_plug+0x264/0x410 blk_finish_plug+0x4b/0xa0 shrink_lruvec+0x1263/0x1ea0 shrink_node+0x736/0x1a80 balance_pgdat+0x740/0x10d0 kswapd+0x5f2/0xaf0 kthread+0x256/0x2f0 ret_from_fork+0x1f/0x30 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(fs_reclaim); lock(sk_lock-AF_INET-NVME); lock(fs_reclaim); lock(sk_lock-AF_INET-NVME); *** DEADLOCK *** 3 locks held by kswapd0/92: #0: ffffffff97e95ca0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x987/0x10d0 #1: ffff88811f21b0b0 (q->srcu){....}-{0:0}, at: blk_mq_flush_plug_list+0x6b3/0xc70 #2: ffff888170b11470 (&queue->send_mutex){+.+.}-{3:3}, at: nvme_tcp_queue_rq+0xeb9/0x17e0 [nvme_tcp] Fixes: 3f2304f8c6d6 ("nvme-tcp: add NVMe over TCP host driver") Reported-by: Daniel Wagner <dwagner@suse.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Tested-by: Daniel Wagner <dwagner@suse.de> Signed-off-by: Christoph Hellwig <hch@lst.de>
-rw-r--r--drivers/nvme/host/tcp.c11
1 files changed, 10 insertions, 1 deletions
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index dc2def86076d..9b47dcb2a7d9 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1141,6 +1141,7 @@ static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
{
struct nvme_tcp_request *req;
+ unsigned int noreclaim_flag;
int ret = 1;
if (!queue->request) {
@@ -1150,12 +1151,13 @@ static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
}
req = queue->request;
+ noreclaim_flag = memalloc_noreclaim_save();
if (req->state == NVME_TCP_SEND_CMD_PDU) {
ret = nvme_tcp_try_send_cmd_pdu(req);
if (ret <= 0)
goto done;
if (!nvme_tcp_has_inline_data(req))
- return ret;
+ goto out;
}
if (req->state == NVME_TCP_SEND_H2C_PDU) {
@@ -1181,6 +1183,8 @@ done:
nvme_tcp_fail_request(queue->request);
nvme_tcp_done_send_req(queue);
}
+out:
+ memalloc_noreclaim_restore(noreclaim_flag);
return ret;
}
@@ -1296,6 +1300,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
struct page *page;
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
struct nvme_tcp_queue *queue = &ctrl->queues[qid];
+ unsigned int noreclaim_flag;
if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
return;
@@ -1308,7 +1313,11 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
__page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
queue->pf_cache.va = NULL;
}
+
+ noreclaim_flag = memalloc_noreclaim_save();
sock_release(queue->sock);
+ memalloc_noreclaim_restore(noreclaim_flag);
+
kfree(queue->pdu);
mutex_destroy(&queue->send_mutex);
mutex_destroy(&queue->queue_lock);