@@ -53,6 +53,8 @@ MODULE_PARM_DESC(tls_handshake_timeout,
5353 "nvme TLS handshake timeout in seconds (default 10)" );
5454#endif
5555
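+/* Number of queues whose io_cpu is currently assigned to each cpu */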
+static atomic_t nvme_tcp_cpu_queues[NR_CPUS];
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 /* lockdep can detect a circular dependency of the form
  *   sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock
@@ -126,6 +128,7 @@ enum nvme_tcp_queue_flags {
 	NVME_TCP_Q_ALLOCATED	= 0,
 	NVME_TCP_Q_LIVE		= 1,
 	NVME_TCP_Q_POLLING	= 2,
+	NVME_TCP_Q_IO_CPU_SET	= 3,
 };
 
 enum nvme_tcp_recv_state {
@@ -1648,23 +1651,56 @@ static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
 			ctrl->io_queues[HCTX_TYPE_POLL];
 }
 
+/*
+ * Track the number of queues assigned to each cpu using a global per-cpu
+ * counter and select the least used cpu from the mq_map. Our goal is to spread
+ * different controllers' I/O threads across different cpu cores.
+ *
+ * Note that the accounting is not 100% perfect, but it doesn't need to be; we
+ * simply make a best effort to select the best candidate cpu core that we
+ * find at any given point.
+ */
 static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
 {
 	struct nvme_tcp_ctrl *ctrl = queue->ctrl;
-	int qid = nvme_tcp_queue_id(queue);
-	int n = 0;
+	struct blk_mq_tag_set *set = &ctrl->tag_set;
+	int qid = nvme_tcp_queue_id(queue) - 1;
+	unsigned int *mq_map = NULL;
+	int cpu, min_queues = INT_MAX, io_cpu;
+
+	if (wq_unbound)
+		goto out;
 
 	if (nvme_tcp_default_queue(queue))
-		n = qid - 1;
+		mq_map = set->map[HCTX_TYPE_DEFAULT].mq_map;
 	else if (nvme_tcp_read_queue(queue))
-		n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
+		mq_map = set->map[HCTX_TYPE_READ].mq_map;
 	else if (nvme_tcp_poll_queue(queue))
-		n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
-				ctrl->io_queues[HCTX_TYPE_READ] - 1;
-	if (wq_unbound)
-		queue->io_cpu = WORK_CPU_UNBOUND;
-	else
-		queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
+		mq_map = set->map[HCTX_TYPE_POLL].mq_map;
+
+	if (WARN_ON(!mq_map))
+		goto out;
+
+	/* Search for the least used cpu from the mq_map */
+	io_cpu = WORK_CPU_UNBOUND;
+	for_each_online_cpu(cpu) {
+		int num_queues = atomic_read(&nvme_tcp_cpu_queues[cpu]);
+
+		if (mq_map[cpu] != qid)
+			continue;
+		if (num_queues < min_queues) {
+			io_cpu = cpu;
+			min_queues = num_queues;
+		}
+	}
+	if (io_cpu != WORK_CPU_UNBOUND) {
+		queue->io_cpu = io_cpu;
+		atomic_inc(&nvme_tcp_cpu_queues[io_cpu]);
+		set_bit(NVME_TCP_Q_IO_CPU_SET, &queue->flags);
+	}
+out:
+	dev_dbg(ctrl->ctrl.device, "queue %d: using cpu %d\n",
+		qid, queue->io_cpu);
 }
 
 static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid)
@@ -1808,7 +1844,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
 
 	queue->sock->sk->sk_allocation = GFP_ATOMIC;
 	queue->sock->sk->sk_use_task_frag = false;
-	nvme_tcp_set_queue_io_cpu(queue);
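+	/* io_cpu is chosen later, when the I/O queue is started */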
+	queue->io_cpu = WORK_CPU_UNBOUND;
 	queue->request = NULL;
 	queue->data_remaining = 0;
 	queue->ddgst_remaining = 0;
@@ -1930,6 +1966,9 @@ static void nvme_tcp_stop_queue_nowait(struct nvme_ctrl *nctrl, int qid)
 	if (!test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
 		return;
 
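+	/* drop the per-cpu accounting taken in nvme_tcp_set_queue_io_cpu() */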
+	if (test_and_clear_bit(NVME_TCP_Q_IO_CPU_SET, &queue->flags))
+		atomic_dec(&nvme_tcp_cpu_queues[queue->io_cpu]);
+
 	mutex_lock(&queue->queue_lock);
 	if (test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
 		__nvme_tcp_stop_queue(queue);
@@ -1989,9 +2028,10 @@ static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
 	nvme_tcp_init_recv_ctx(queue);
 	nvme_tcp_setup_sock_ops(queue);
 
-	if (idx)
+	if (idx) {
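+		/* pin the queue's io work to the least loaded online cpu */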
+		nvme_tcp_set_queue_io_cpu(queue);
 		ret = nvmf_connect_io_queue(nctrl, idx);
-	else
+	} else
 		ret = nvmf_connect_admin_queue(nctrl);
 
 	if (!ret) {
@@ -3012,6 +3052,7 @@ static struct nvmf_transport_ops nvme_tcp_transport = {
 static int __init nvme_tcp_init_module(void)
 {
 	unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS;
+	int cpu;
 
 	BUILD_BUG_ON(sizeof(struct nvme_tcp_hdr) != 8);
 	BUILD_BUG_ON(sizeof(struct nvme_tcp_cmd_pdu) != 72);
@@ -3029,6 +3070,9 @@ static int __init nvme_tcp_init_module(void)
 	if (!nvme_tcp_wq)
 		return -ENOMEM;
 
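+	/* no queues are accounted to any cpu yet */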
+	for_each_possible_cpu(cpu)
+		atomic_set(&nvme_tcp_cpu_queues[cpu], 0);
+
 	nvmf_register_transport(&nvme_tcp_transport);
 	return 0;
 }