
Commit 1ea0d85

nvme-tcp: Fix I/O queue cpu spreading for multiple controllers
JIRA: https://issues.redhat.com/browse/RHEL-76529

Since day-1 we have been assigning the queue io_cpu very naively. We always
base it on the queue id (controller scope) and assign the matching cpu from
the online mask. This works fine when the number of queues matches the number
of cpu cores.

The problem starts when we have fewer queues than cpu cores. First, we should
take into account the mq_map and select a cpu within the cpus that are
assigned to this queue by the mq_map, in order to minimize cross-numa cpu
bouncing. Second, and even worse, we don't take into account that multiple
controllers may have assigned queues to a given cpu. As a result we may simply
compound more and more queues on the same set of cpus, which is suboptimal.

We fix this by introducing global per-cpu counters that track the number of
queues assigned to each cpu. We select the least used cpu based on the mq_map
and the per-cpu counters, and assign it as the queue io_cpu.

The behavior for a single controller is slightly optimized by selecting better
cpu candidates by consulting the mq_map, and multiple controllers spread their
queues among cpu cores much better, resulting in lower average cpu load and a
lower likelihood of hitting hotspots.

Note that the accounting is not 100% perfect, but it doesn't need to be; we're
simply making a best effort to select the best candidate cpu core that we find
at any given point.

Another byproduct is that every controller reset/reconnect may change the
queues' io_cpu mapping, based on the current LRU accounting scheme.

Here is the baseline queue io_cpu assignment for 4 controllers, 2 queues per
controller, and 4 cpus on the host:

nvme1: queue 0: using cpu 0
nvme1: queue 1: using cpu 1
nvme2: queue 0: using cpu 0
nvme2: queue 1: using cpu 1
nvme3: queue 0: using cpu 0
nvme3: queue 1: using cpu 1
nvme4: queue 0: using cpu 0
nvme4: queue 1: using cpu 1

And this is the fixed io_cpu assignment:

nvme1: queue 0: using cpu 0
nvme1: queue 1: using cpu 2
nvme2: queue 0: using cpu 1
nvme2: queue 1: using cpu 3
nvme3: queue 0: using cpu 0
nvme3: queue 1: using cpu 2
nvme4: queue 0: using cpu 1
nvme4: queue 1: using cpu 3

Fixes: 3f2304f ("nvme-tcp: add NVMe over TCP host driver")
Suggested-by: Hannes Reinecke <hare@kernel.org>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
[fixed kbuild reported errors]
Signed-off-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
(cherry picked from commit 3219378)
Signed-off-by: John Meneghini <jmeneghi@redhat.com>
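
For illustration, below is a minimal user-space sketch of the least-used-cpu
selection described above. It is not the driver code: the mq_map[] and
cpu_queues[] arrays are hypothetical stand-ins for the blk-mq cpu-to-queue map
and the driver's global per-cpu queue counters.

/*
 * Sketch of the least-used-cpu selection, user-space model only.
 * mq_map[] maps each cpu to a queue (hctx) id; cpu_queues[] counts how many
 * queues are already pinned to each cpu across all controllers.
 */
#include <limits.h>
#include <stdio.h>

#define NR_CPUS 4

static int mq_map[NR_CPUS]     = { 0, 1, 0, 1 };  /* cpu -> queue id */
static int cpu_queues[NR_CPUS] = { 0, 0, 0, 0 };  /* queues pinned per cpu */

/* Pick the least loaded cpu among those mapped to @qid, or -1 if none. */
static int pick_io_cpu(int qid)
{
	int cpu, io_cpu = -1, min_queues = INT_MAX;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (mq_map[cpu] != qid)
			continue;
		if (cpu_queues[cpu] < min_queues) {
			io_cpu = cpu;
			min_queues = cpu_queues[cpu];
		}
	}
	if (io_cpu >= 0)
		cpu_queues[io_cpu]++;	/* account the new assignment */
	return io_cpu;
}

int main(void)
{
	/* Two controllers with two I/O queues each (queue ids 0 and 1). */
	for (int ctrl = 1; ctrl <= 2; ctrl++)
		for (int qid = 0; qid < 2; qid++)
			printf("nvme%d: queue %d: using cpu %d\n",
			       ctrl, qid, pick_io_cpu(qid));
	return 0;
}

With this toy mapping, the second controller's queues land on cpus 2 and 3
instead of piling onto cpus 0 and 1, which is the spreading effect the patch
is after.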
1 parent f07b929 commit 1ea0d85

File tree: 1 file changed (+57, -13 lines)


drivers/nvme/host/tcp.c

Lines changed: 57 additions & 13 deletions
@@ -53,6 +53,8 @@ MODULE_PARM_DESC(tls_handshake_timeout,
 		 "nvme TLS handshake timeout in seconds (default 10)");
 #endif
 
+static atomic_t nvme_tcp_cpu_queues[NR_CPUS];
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 /* lockdep can detect a circular dependency of the form
  * sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock
@@ -126,6 +128,7 @@ enum nvme_tcp_queue_flags {
 	NVME_TCP_Q_ALLOCATED	= 0,
 	NVME_TCP_Q_LIVE		= 1,
 	NVME_TCP_Q_POLLING	= 2,
+	NVME_TCP_Q_IO_CPU_SET	= 3,
 };
 
 enum nvme_tcp_recv_state {
@@ -1648,23 +1651,56 @@ static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
 			  ctrl->io_queues[HCTX_TYPE_POLL];
 }
 
+/**
+ * Track the number of queues assigned to each cpu using a global per-cpu
+ * counter and select the least used cpu from the mq_map. Our goal is to spread
+ * different controllers I/O threads across different cpu cores.
+ *
+ * Note that the accounting is not 100% perfect, but we don't need to be, we're
+ * simply putting our best effort to select the best candidate cpu core that we
+ * find at any given point.
+ */
 static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
 {
 	struct nvme_tcp_ctrl *ctrl = queue->ctrl;
-	int qid = nvme_tcp_queue_id(queue);
-	int n = 0;
+	struct blk_mq_tag_set *set = &ctrl->tag_set;
+	int qid = nvme_tcp_queue_id(queue) - 1;
+	unsigned int *mq_map = NULL;
+	int cpu, min_queues = INT_MAX, io_cpu;
+
+	if (wq_unbound)
+		goto out;
 
 	if (nvme_tcp_default_queue(queue))
-		n = qid - 1;
+		mq_map = set->map[HCTX_TYPE_DEFAULT].mq_map;
 	else if (nvme_tcp_read_queue(queue))
-		n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
+		mq_map = set->map[HCTX_TYPE_READ].mq_map;
 	else if (nvme_tcp_poll_queue(queue))
-		n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
-				ctrl->io_queues[HCTX_TYPE_READ] - 1;
-	if (wq_unbound)
-		queue->io_cpu = WORK_CPU_UNBOUND;
-	else
-		queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
+		mq_map = set->map[HCTX_TYPE_POLL].mq_map;
+
+	if (WARN_ON(!mq_map))
+		goto out;
+
+	/* Search for the least used cpu from the mq_map */
+	io_cpu = WORK_CPU_UNBOUND;
+	for_each_online_cpu(cpu) {
+		int num_queues = atomic_read(&nvme_tcp_cpu_queues[cpu]);
+
+		if (mq_map[cpu] != qid)
+			continue;
+		if (num_queues < min_queues) {
+			io_cpu = cpu;
+			min_queues = num_queues;
+		}
+	}
+	if (io_cpu != WORK_CPU_UNBOUND) {
+		queue->io_cpu = io_cpu;
+		atomic_inc(&nvme_tcp_cpu_queues[io_cpu]);
+		set_bit(NVME_TCP_Q_IO_CPU_SET, &queue->flags);
+	}
+out:
+	dev_dbg(ctrl->ctrl.device, "queue %d: using cpu %d\n",
+		qid, queue->io_cpu);
 }
 
 static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid)
@@ -1808,7 +1844,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
 
 	queue->sock->sk->sk_allocation = GFP_ATOMIC;
 	queue->sock->sk->sk_use_task_frag = false;
-	nvme_tcp_set_queue_io_cpu(queue);
+	queue->io_cpu = WORK_CPU_UNBOUND;
 	queue->request = NULL;
 	queue->data_remaining = 0;
 	queue->ddgst_remaining = 0;
@@ -1930,6 +1966,9 @@ static void nvme_tcp_stop_queue_nowait(struct nvme_ctrl *nctrl, int qid)
 	if (!test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
 		return;
 
+	if (test_and_clear_bit(NVME_TCP_Q_IO_CPU_SET, &queue->flags))
+		atomic_dec(&nvme_tcp_cpu_queues[queue->io_cpu]);
+
 	mutex_lock(&queue->queue_lock);
 	if (test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
 		__nvme_tcp_stop_queue(queue);
@@ -1989,9 +2028,10 @@ static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
 	nvme_tcp_init_recv_ctx(queue);
 	nvme_tcp_setup_sock_ops(queue);
 
-	if (idx)
+	if (idx) {
+		nvme_tcp_set_queue_io_cpu(queue);
 		ret = nvmf_connect_io_queue(nctrl, idx);
-	else
+	} else
 		ret = nvmf_connect_admin_queue(nctrl);
 
 	if (!ret) {
@@ -3012,6 +3052,7 @@ static struct nvmf_transport_ops nvme_tcp_transport = {
 static int __init nvme_tcp_init_module(void)
 {
 	unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS;
+	int cpu;
 
 	BUILD_BUG_ON(sizeof(struct nvme_tcp_hdr) != 8);
 	BUILD_BUG_ON(sizeof(struct nvme_tcp_cmd_pdu) != 72);
@@ -3029,6 +3070,9 @@ static int __init nvme_tcp_init_module(void)
 	if (!nvme_tcp_wq)
 		return -ENOMEM;
 
+	for_each_possible_cpu(cpu)
+		atomic_set(&nvme_tcp_cpu_queues[cpu], 0);
+
 	nvmf_register_transport(&nvme_tcp_transport);
 	return 0;
 }
