// SPDX-License-Identifier: GPL-2.0
/*
 * NVMe over Fabrics RDMA host code.
 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <rdma/mr_pool.h>
#include <linux/err.h>
#include <linux/string.h>
#include <linux/atomic.h>
#include <linux/blk-mq.h>
#include <linux/blk-mq-rdma.h>
#include <linux/types.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/scatterlist.h>
#include <linux/nvme.h>
#include <asm/unaligned.h>

#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include <linux/nvme-rdma.h>

#include "nvme.h"
#include "fabrics.h"


#define NVME_RDMA_CONNECT_TIMEOUT_MS	3000		/* 3 seconds */

#define NVME_RDMA_MAX_SEGMENTS		256

#define NVME_RDMA_MAX_INLINE_SEGMENTS	4

#define NVME_RDMA_DATA_SGL_SIZE \
	(sizeof(struct scatterlist) * NVME_INLINE_SG_CNT)
#define NVME_RDMA_METADATA_SGL_SIZE \
	(sizeof(struct scatterlist) * NVME_INLINE_METADATA_SG_CNT)

struct nvme_rdma_device {
	struct ib_device *dev;
	struct ib_pd *pd;
	struct kref ref;
	struct list_head entry;
	unsigned int num_inline_segments;
};

struct nvme_rdma_qe {
	struct ib_cqe cqe;
	void *data;
	u64 dma;
};

struct nvme_rdma_sgl {
	int nents;
	struct sg_table sg_table;
};

struct nvme_rdma_queue;
struct nvme_rdma_request {
	struct nvme_request req;
	struct ib_mr *mr;
	struct nvme_rdma_qe sqe;
	union nvme_result result;
	__le16 status;
	refcount_t ref;
	struct ib_sge sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS];
	u32 num_sge;
	struct ib_reg_wr reg_wr;
	struct ib_cqe reg_cqe;
	struct nvme_rdma_queue *queue;
	struct nvme_rdma_sgl data_sgl;
	struct nvme_rdma_sgl *metadata_sgl;
	bool use_sig_mr;
};

enum nvme_rdma_queue_flags {
	NVME_RDMA_Q_ALLOCATED		= 0,
	NVME_RDMA_Q_LIVE		= 1,
	NVME_RDMA_Q_TR_READY		= 2,
};

struct nvme_rdma_queue {
	struct nvme_rdma_qe *rsp_ring;
	int queue_size;
	size_t cmnd_capsule_len;
	struct nvme_rdma_ctrl *ctrl;
	struct nvme_rdma_device *device;
	struct ib_cq *ib_cq;
	struct ib_qp *qp;

	unsigned long flags;
	struct rdma_cm_id *cm_id;
	int cm_error;
	struct completion cm_done;
	bool pi_support;
	int cq_size;
	struct mutex queue_lock;
};

struct nvme_rdma_ctrl {
	/* read only in the hot path */
	struct nvme_rdma_queue *queues;

	/* other member variables */
	struct blk_mq_tag_set tag_set;
	struct work_struct err_work;

	struct nvme_rdma_qe async_event_sqe;

	struct delayed_work reconnect_work;

	struct list_head list;

	struct blk_mq_tag_set admin_tag_set;
	struct nvme_rdma_device *device;

	u32 max_fr_pages;

	struct sockaddr_storage addr;
	struct sockaddr_storage src_addr;

	struct nvme_ctrl ctrl;
	bool use_inline_data;
	u32 io_queues[HCTX_MAX_TYPES];
};

static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl)
{
	return container_of(ctrl, struct nvme_rdma_ctrl, ctrl);
}

static LIST_HEAD(device_list);
static DEFINE_MUTEX(device_list_mutex);

static LIST_HEAD(nvme_rdma_ctrl_list);
static DEFINE_MUTEX(nvme_rdma_ctrl_mutex);

/*
 * Disabling this option makes small I/O go faster, but is fundamentally
 * unsafe.  With it turned off we will have to register a global rkey that
 * allows read and write access to all physical memory.
 */
static bool register_always = true;
module_param(register_always, bool, 0444);
MODULE_PARM_DESC(register_always,
	 "Use memory registration even for contiguous memory regions");

static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
		struct rdma_cm_event *event);
static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvme_rdma_complete_rq(struct request *rq);

static const struct blk_mq_ops nvme_rdma_mq_ops;
static const struct blk_mq_ops nvme_rdma_admin_mq_ops;

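/*
 * Queue index within the controller's queue array: 0 is the admin queue,
 * 1..n are I/O queues.  Poll queues are laid out after the default and
 * read queue ranges, which is what nvme_rdma_poll_queue() checks.
 */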
static inline int nvme_rdma_queue_idx(struct nvme_rdma_queue *queue)
{
	return queue - queue->ctrl->queues;
}

static bool nvme_rdma_poll_queue(struct nvme_rdma_queue *queue)
{
	return nvme_rdma_queue_idx(queue) >
		queue->ctrl->io_queues[HCTX_TYPE_DEFAULT] +
		queue->ctrl->io_queues[HCTX_TYPE_READ];
}

static inline size_t nvme_rdma_inline_data_size(struct nvme_rdma_queue *queue)
{
	return queue->cmnd_capsule_len - sizeof(struct nvme_command);
}

static void nvme_rdma_free_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe,
		size_t capsule_size, enum dma_data_direction dir)
{
	ib_dma_unmap_single(ibdev, qe->dma, capsule_size, dir);
	kfree(qe->data);
}

static int nvme_rdma_alloc_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe,
		size_t capsule_size, enum dma_data_direction dir)
{
	qe->data = kzalloc(capsule_size, GFP_KERNEL);
	if (!qe->data)
		return -ENOMEM;

	qe->dma = ib_dma_map_single(ibdev, qe->data, capsule_size, dir);
	if (ib_dma_mapping_error(ibdev, qe->dma)) {
		kfree(qe->data);
		qe->data = NULL;
		return -ENOMEM;
	}

	return 0;
}

static void nvme_rdma_free_ring(struct ib_device *ibdev,
		struct nvme_rdma_qe *ring, size_t ib_queue_size,
		size_t capsule_size, enum dma_data_direction dir)
{
	int i;

	for (i = 0; i < ib_queue_size; i++)
		nvme_rdma_free_qe(ibdev, &ring[i], capsule_size, dir);
	kfree(ring);
}

static struct nvme_rdma_qe *nvme_rdma_alloc_ring(struct ib_device *ibdev,
		size_t ib_queue_size, size_t capsule_size,
		enum dma_data_direction dir)
{
	struct nvme_rdma_qe *ring;
	int i;

	ring = kcalloc(ib_queue_size, sizeof(struct nvme_rdma_qe), GFP_KERNEL);
	if (!ring)
		return NULL;

	/*
	 * Bind the CQEs (post recv buffers) DMA mapping to the RDMA queue
	 * lifetime. It's safe, since any change in the underlying RDMA device
	 * will issue error recovery and queue re-creation.
	 */
	for (i = 0; i < ib_queue_size; i++) {
		if (nvme_rdma_alloc_qe(ibdev, &ring[i], capsule_size, dir))
			goto out_free_ring;
	}

	return ring;

out_free_ring:
	nvme_rdma_free_ring(ibdev, ring, i, capsule_size, dir);
	return NULL;
}

static void nvme_rdma_qp_event(struct ib_event *event, void *context)
{
	pr_debug("QP event %s (%d)\n",
		 ib_event_msg(event->event), event->event);

}

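/*
 * Wait for the RDMA CM to signal completion of address/route resolution or
 * connection establishment, bounded by NVME_RDMA_CONNECT_TIMEOUT_MS, and
 * return the error (if any) recorded by the CM event handler.
 */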
static int nvme_rdma_wait_for_cm(struct nvme_rdma_queue *queue)
{
	int ret;

	ret = wait_for_completion_interruptible_timeout(&queue->cm_done,
			msecs_to_jiffies(NVME_RDMA_CONNECT_TIMEOUT_MS) + 1);
	if (ret < 0)
		return ret;
	if (ret == 0)
		return -ETIMEDOUT;
	WARN_ON_ONCE(queue->cm_error > 0);
	return queue->cm_error;
}

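/*
 * Create the RC queue pair for this queue.  The send queue is sized
 * factor * queue_size (plus one entry for draining); a single completion
 * queue is shared for sends and receives.
 */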
static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
{
	struct nvme_rdma_device *dev = queue->device;
	struct ib_qp_init_attr init_attr;
	int ret;

	memset(&init_attr, 0, sizeof(init_attr));
	init_attr.event_handler = nvme_rdma_qp_event;
	/* +1 for drain */
	init_attr.cap.max_send_wr = factor * queue->queue_size + 1;
	/* +1 for drain */
	init_attr.cap.max_recv_wr = queue->queue_size + 1;
	init_attr.cap.max_recv_sge = 1;
	init_attr.cap.max_send_sge = 1 + dev->num_inline_segments;
	init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	init_attr.qp_type = IB_QPT_RC;
	init_attr.send_cq = queue->ib_cq;
	init_attr.recv_cq = queue->ib_cq;
	if (queue->pi_support)
		init_attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN;
	init_attr.qp_context = queue;

	ret = rdma_create_qp(queue->cm_id, dev->pd, &init_attr);

	queue->qp = queue->cm_id->qp;
	return ret;
}

static void nvme_rdma_exit_request(struct blk_mq_tag_set *set,
		struct request *rq, unsigned int hctx_idx)
{
	struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);

	kfree(req->sqe.data);
}

static int nvme_rdma_init_request(struct blk_mq_tag_set *set,
		struct request *rq, unsigned int hctx_idx,
		unsigned int numa_node)
{
	struct nvme_rdma_ctrl *ctrl = set->driver_data;
	struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
	int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
	struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];

	nvme_req(rq)->ctrl = &ctrl->ctrl;
	req->sqe.data = kzalloc(sizeof(struct nvme_command), GFP_KERNEL);
	if (!req->sqe.data)
		return -ENOMEM;

	/* metadata nvme_rdma_sgl struct is located after command's data SGL */
	if (queue->pi_support)
		req->metadata_sgl = (void *)nvme_req(rq) +
			sizeof(struct nvme_rdma_request) +
			NVME_RDMA_DATA_SGL_SIZE;

	req->queue = queue;

	return 0;
}

static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
		unsigned int hctx_idx)
{
	struct nvme_rdma_ctrl *ctrl = data;
	struct nvme_rdma_queue *queue = &ctrl->queues[hctx_idx + 1];

	BUG_ON(hctx_idx >= ctrl->ctrl.queue_count);

	hctx->driver_data = queue;
	return 0;
}

static int nvme_rdma_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
		unsigned int hctx_idx)
{
	struct nvme_rdma_ctrl *ctrl = data;
	struct nvme_rdma_queue *queue = &ctrl->queues[0];

	BUG_ON(hctx_idx != 0);

	hctx->driver_data = queue;
	return 0;
}

static void nvme_rdma_free_dev(struct kref *ref)
{
	struct nvme_rdma_device *ndev =
		container_of(ref, struct nvme_rdma_device, ref);

	mutex_lock(&device_list_mutex);
	list_del(&ndev->entry);
	mutex_unlock(&device_list_mutex);

	ib_dealloc_pd(ndev->pd);
	kfree(ndev);
}

static void nvme_rdma_dev_put(struct nvme_rdma_device *dev)
{
	kref_put(&dev->ref, nvme_rdma_free_dev);
}

static int nvme_rdma_dev_get(struct nvme_rdma_device *dev)
{
	return kref_get_unless_zero(&dev->ref);
}

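/*
 * Look up (or create) the nvme_rdma_device for the IB device behind this
 * cm_id and take a reference on it.  Devices are kept on a global list so
 * the protection domain can be shared by queues on the same HCA.
 */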
static struct nvme_rdma_device *
nvme_rdma_find_get_device(struct rdma_cm_id *cm_id)
{
	struct nvme_rdma_device *ndev;

	mutex_lock(&device_list_mutex);
	list_for_each_entry(ndev, &device_list, entry) {
		if (ndev->dev->node_guid == cm_id->device->node_guid &&
		    nvme_rdma_dev_get(ndev))
			goto out_unlock;
	}

	ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
	if (!ndev)
		goto out_err;

	ndev->dev = cm_id->device;
	kref_init(&ndev->ref);

	ndev->pd = ib_alloc_pd(ndev->dev,
		register_always ? 0 : IB_PD_UNSAFE_GLOBAL_RKEY);
	if (IS_ERR(ndev->pd))
		goto out_free_dev;

	if (!(ndev->dev->attrs.device_cap_flags &
	      IB_DEVICE_MEM_MGT_EXTENSIONS)) {
		dev_err(&ndev->dev->dev,
			"Memory registrations not supported.\n");
		goto out_free_pd;
	}

	ndev->num_inline_segments = min(NVME_RDMA_MAX_INLINE_SEGMENTS,
					ndev->dev->attrs.max_send_sge - 1);
	list_add(&ndev->entry, &device_list);
out_unlock:
	mutex_unlock(&device_list_mutex);
	return ndev;

out_free_pd:
	ib_dealloc_pd(ndev->pd);
out_free_dev:
	kfree(ndev);
out_err:
	mutex_unlock(&device_list_mutex);
	return NULL;
}

static void nvme_rdma_free_cq(struct nvme_rdma_queue *queue)
{
	if (nvme_rdma_poll_queue(queue))
		ib_free_cq(queue->ib_cq);
	else
		ib_cq_pool_put(queue->ib_cq, queue->cq_size);
}

static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
{
	struct nvme_rdma_device *dev;
	struct ib_device *ibdev;

	if (!test_and_clear_bit(NVME_RDMA_Q_TR_READY, &queue->flags))
		return;

	dev = queue->device;
	ibdev = dev->dev;

	if (queue->pi_support)
		ib_mr_pool_destroy(queue->qp, &queue->qp->sig_mrs);
	ib_mr_pool_destroy(queue->qp, &queue->qp->rdma_mrs);

	/*
	 * The cm_id object might have been destroyed during RDMA connection
	 * establishment error flow to avoid getting other cma events, thus
	 * the destruction of the QP shouldn't use rdma_cm API.
	 */
	ib_destroy_qp(queue->qp);
	nvme_rdma_free_cq(queue);

	nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
			    sizeof(struct nvme_completion), DMA_FROM_DEVICE);

	nvme_rdma_dev_put(dev);
}

static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev, bool pi_support)
{
	u32 max_page_list_len;

	if (pi_support)
		max_page_list_len = ibdev->attrs.max_pi_fast_reg_page_list_len;
	else
		max_page_list_len = ibdev->attrs.max_fast_reg_page_list_len;

	return min_t(u32, NVME_RDMA_MAX_SEGMENTS, max_page_list_len - 1);
}

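/*
 * Allocate the completion queue for a queue.  Polling queues use a
 * dedicated CQ in IB_POLL_DIRECT mode; all other queues take a shared CQ
 * from the device-wide CQ pool polled in softirq context.
 */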
static int nvme_rdma_create_cq(struct ib_device *ibdev,
		struct nvme_rdma_queue *queue)
{
	int ret, comp_vector, idx = nvme_rdma_queue_idx(queue);
	enum ib_poll_context poll_ctx;

	/*
	 * Spread I/O queues completion vectors according to their queue index.
	 * Admin queues can always go on completion vector 0.
	 */
	comp_vector = (idx == 0 ? idx : idx - 1) % ibdev->num_comp_vectors;

	/* Polling queues need direct cq polling context */
	if (nvme_rdma_poll_queue(queue)) {
		poll_ctx = IB_POLL_DIRECT;
		queue->ib_cq = ib_alloc_cq(ibdev, queue, queue->cq_size,
					   comp_vector, poll_ctx);
	} else {
		poll_ctx = IB_POLL_SOFTIRQ;
		queue->ib_cq = ib_cq_pool_get(ibdev, queue->cq_size,
					      comp_vector, poll_ctx);
	}

	if (IS_ERR(queue->ib_cq)) {
		ret = PTR_ERR(queue->ib_cq);
		return ret;
	}

	return 0;
}

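/*
 * Set up all IB resources for a queue: take a device reference, allocate
 * the CQ and QP, the post-recv response ring, and the MR pools used for
 * data (and, with PI support, integrity) registration.
 */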
static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
{
	struct ib_device *ibdev;
	const int send_wr_factor = 3;	/* MR, SEND, INV */
	const int cq_factor = send_wr_factor + 1;	/* + RECV */
	int ret, pages_per_mr;

	queue->device = nvme_rdma_find_get_device(queue->cm_id);
	if (!queue->device) {
		dev_err(queue->cm_id->device->dev.parent,
			"no client data found!\n");
		return -ECONNREFUSED;
	}
	ibdev = queue->device->dev;

	/* +1 for ib_stop_cq */
	queue->cq_size = cq_factor * queue->queue_size + 1;

	ret = nvme_rdma_create_cq(ibdev, queue);
	if (ret)
		goto out_put_dev;

	ret = nvme_rdma_create_qp(queue, send_wr_factor);
	if (ret)
		goto out_destroy_ib_cq;

	queue->rsp_ring = nvme_rdma_alloc_ring(ibdev, queue->queue_size,
			sizeof(struct nvme_completion), DMA_FROM_DEVICE);
	if (!queue->rsp_ring) {
		ret = -ENOMEM;
		goto out_destroy_qp;
	}

	/*
	 * Currently we don't use SG_GAPS MR's so if the first entry is
	 * misaligned we'll end up using two entries for a single data page,
	 * so one additional entry is required.
	 */
	pages_per_mr = nvme_rdma_get_max_fr_pages(ibdev, queue->pi_support) + 1;
	ret = ib_mr_pool_init(queue->qp, &queue->qp->rdma_mrs,
			      queue->queue_size,
			      IB_MR_TYPE_MEM_REG,
			      pages_per_mr, 0);
	if (ret) {
		dev_err(queue->ctrl->ctrl.device,
			"failed to initialize MR pool sized %d for QID %d\n",
			queue->queue_size, nvme_rdma_queue_idx(queue));
		goto out_destroy_ring;
	}

	if (queue->pi_support) {
		ret = ib_mr_pool_init(queue->qp, &queue->qp->sig_mrs,
				      queue->queue_size, IB_MR_TYPE_INTEGRITY,
				      pages_per_mr, pages_per_mr);
		if (ret) {
			dev_err(queue->ctrl->ctrl.device,
				"failed to initialize PI MR pool sized %d for QID %d\n",
				queue->queue_size, nvme_rdma_queue_idx(queue));
			goto out_destroy_mr_pool;
		}
	}

	set_bit(NVME_RDMA_Q_TR_READY, &queue->flags);

	return 0;

out_destroy_mr_pool:
	ib_mr_pool_destroy(queue->qp, &queue->qp->rdma_mrs);
out_destroy_ring:
	nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
			    sizeof(struct nvme_completion), DMA_FROM_DEVICE);
out_destroy_qp:
	rdma_destroy_qp(queue->cm_id);
out_destroy_ib_cq:
	nvme_rdma_free_cq(queue);
out_put_dev:
	nvme_rdma_dev_put(queue->device);
	return ret;
}

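/*
 * Allocate and connect a single queue at the RDMA CM level: create the
 * cm_id, resolve the (optional) source and target addresses, and wait for
 * the CM state machine to finish; the IB resources are created/destroyed
 * via the CM event handler as the handshake progresses.
 */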
static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl,
		int idx, size_t queue_size)
{
	struct nvme_rdma_queue *queue;
	struct sockaddr *src_addr = NULL;
	int ret;

	queue = &ctrl->queues[idx];
	mutex_init(&queue->queue_lock);
	queue->ctrl = ctrl;
	if (idx && ctrl->ctrl.max_integrity_segments)
		queue->pi_support = true;
	else
		queue->pi_support = false;
	init_completion(&queue->cm_done);

	if (idx > 0)
		queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16;
	else
		queue->cmnd_capsule_len = sizeof(struct nvme_command);

	queue->queue_size = queue_size;

	queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue,
			RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(queue->cm_id)) {
		dev_info(ctrl->ctrl.device,
			"failed to create CM ID: %ld\n", PTR_ERR(queue->cm_id));
		ret = PTR_ERR(queue->cm_id);
		goto out_destroy_mutex;
	}

	if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR)
		src_addr = (struct sockaddr *)&ctrl->src_addr;

	queue->cm_error = -ETIMEDOUT;
	ret = rdma_resolve_addr(queue->cm_id, src_addr,
			(struct sockaddr *)&ctrl->addr,
			NVME_RDMA_CONNECT_TIMEOUT_MS);
	if (ret) {
		dev_info(ctrl->ctrl.device,
			"rdma_resolve_addr failed (%d).\n", ret);
		goto out_destroy_cm_id;
	}

	ret = nvme_rdma_wait_for_cm(queue);
	if (ret) {
		dev_info(ctrl->ctrl.device,
			"rdma connection establishment failed (%d)\n", ret);
		goto out_destroy_cm_id;
	}

	set_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags);

	return 0;

out_destroy_cm_id:
	rdma_destroy_id(queue->cm_id);
	nvme_rdma_destroy_queue_ib(queue);
out_destroy_mutex:
	mutex_destroy(&queue->queue_lock);
	return ret;
}

static void __nvme_rdma_stop_queue(struct nvme_rdma_queue *queue)
{
	rdma_disconnect(queue->cm_id);
	ib_drain_qp(queue->qp);
}

static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue)
{
	mutex_lock(&queue->queue_lock);
	if (test_and_clear_bit(NVME_RDMA_Q_LIVE, &queue->flags))
		__nvme_rdma_stop_queue(queue);
	mutex_unlock(&queue->queue_lock);
}

static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue)
{
	if (!test_and_clear_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags))
		return;

	rdma_destroy_id(queue->cm_id);
	nvme_rdma_destroy_queue_ib(queue);
	mutex_destroy(&queue->queue_lock);
}

static void nvme_rdma_free_io_queues(struct nvme_rdma_ctrl *ctrl)
{
	int i;

	for (i = 1; i < ctrl->ctrl.queue_count; i++)
		nvme_rdma_free_queue(&ctrl->queues[i]);
}

static void nvme_rdma_stop_io_queues(struct nvme_rdma_ctrl *ctrl)
{
	int i;

	for (i = 1; i < ctrl->ctrl.queue_count; i++)
		nvme_rdma_stop_queue(&ctrl->queues[i]);
}

static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx)
{
	struct nvme_rdma_queue *queue = &ctrl->queues[idx];
	bool poll = nvme_rdma_poll_queue(queue);
	int ret;

	if (idx)
		ret = nvmf_connect_io_queue(&ctrl->ctrl, idx, poll);
	else
		ret = nvmf_connect_admin_queue(&ctrl->ctrl);

	if (!ret) {
		set_bit(NVME_RDMA_Q_LIVE, &queue->flags);
	} else {
		if (test_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags))
			__nvme_rdma_stop_queue(queue);
		dev_info(ctrl->ctrl.device,
			"failed to connect queue: %d ret=%d\n", idx, ret);
	}
	return ret;
}

static int nvme_rdma_start_io_queues(struct nvme_rdma_ctrl *ctrl)
{
	int i, ret = 0;

	for (i = 1; i < ctrl->ctrl.queue_count; i++) {
		ret = nvme_rdma_start_queue(ctrl, i);
		if (ret)
			goto out_stop_queues;
	}

	return 0;

out_stop_queues:
	for (i--; i >= 1; i--)
		nvme_rdma_stop_queue(&ctrl->queues[i]);
	return ret;
}

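/*
 * Size and allocate the I/O queues.  The requested read, default (write)
 * and poll queue counts are capped by the number of online CPUs (and, for
 * the interrupt-driven types, the device's completion vectors), then split
 * into per-HCTX-type budgets before the queues themselves are allocated.
 */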
static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl)
{
	struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
	struct ib_device *ibdev = ctrl->device->dev;
	unsigned int nr_io_queues, nr_default_queues;
	unsigned int nr_read_queues, nr_poll_queues;
	int i, ret;

	nr_read_queues = min_t(unsigned int, ibdev->num_comp_vectors,
				min(opts->nr_io_queues, num_online_cpus()));
	nr_default_queues = min_t(unsigned int, ibdev->num_comp_vectors,
				min(opts->nr_write_queues, num_online_cpus()));
	nr_poll_queues = min(opts->nr_poll_queues, num_online_cpus());
	nr_io_queues = nr_read_queues + nr_default_queues + nr_poll_queues;

	ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues);
	if (ret)
		return ret;

	if (nr_io_queues == 0) {
		dev_err(ctrl->ctrl.device,
			"unable to set any I/O queues\n");
		return -ENOMEM;
	}

	ctrl->ctrl.queue_count = nr_io_queues + 1;
	dev_info(ctrl->ctrl.device,
		"creating %d I/O queues.\n", nr_io_queues);

	if (opts->nr_write_queues && nr_read_queues < nr_io_queues) {
		/*
		 * separate read/write queues
		 * hand out dedicated default queues only after we have
		 * sufficient read queues.
		 */
		ctrl->io_queues[HCTX_TYPE_READ] = nr_read_queues;
		nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ];
		ctrl->io_queues[HCTX_TYPE_DEFAULT] =
			min(nr_default_queues, nr_io_queues);
		nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
	} else {
		/*
		 * shared read/write queues
		 * either no write queues were requested, or we don't have
		 * sufficient queue count to have dedicated default queues.
		 */
		ctrl->io_queues[HCTX_TYPE_DEFAULT] =
			min(nr_read_queues, nr_io_queues);
		nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
	}

	if (opts->nr_poll_queues && nr_io_queues) {
		/* map dedicated poll queues only if we have queues left */
		ctrl->io_queues[HCTX_TYPE_POLL] =
			min(nr_poll_queues, nr_io_queues);
	}

	for (i = 1; i < ctrl->ctrl.queue_count; i++) {
		ret = nvme_rdma_alloc_queue(ctrl, i,
				ctrl->ctrl.sqsize + 1);
		if (ret)
			goto out_free_queues;
	}

	return 0;

out_free_queues:
	for (i--; i >= 1; i--)
		nvme_rdma_free_queue(&ctrl->queues[i]);

	return ret;
}

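/*
 * Build the blk-mq tag set for either the admin queue or the I/O queues.
 * The per-request PDU holds the nvme_rdma_request plus an inline data SGL,
 * and additionally a metadata SGL when integrity (T10-PI) is supported.
 */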
static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
		bool admin)
{
	struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
	struct blk_mq_tag_set *set;
	int ret;

	if (admin) {
		set = &ctrl->admin_tag_set;
		memset(set, 0, sizeof(*set));
		set->ops = &nvme_rdma_admin_mq_ops;
		set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
		set->reserved_tags = 2; /* connect + keep-alive */
		set->numa_node = nctrl->numa_node;
		set->cmd_size = sizeof(struct nvme_rdma_request) +
			NVME_RDMA_DATA_SGL_SIZE;
		set->driver_data = ctrl;
		set->nr_hw_queues = 1;
		set->timeout = ADMIN_TIMEOUT;
		set->flags = BLK_MQ_F_NO_SCHED;
	} else {
		set = &ctrl->tag_set;
		memset(set, 0, sizeof(*set));
		set->ops = &nvme_rdma_mq_ops;
		set->queue_depth = nctrl->sqsize + 1;
		set->reserved_tags = 1; /* fabric connect */
		set->numa_node = nctrl->numa_node;
		set->flags = BLK_MQ_F_SHOULD_MERGE;
		set->cmd_size = sizeof(struct nvme_rdma_request) +
			NVME_RDMA_DATA_SGL_SIZE;
		if (nctrl->max_integrity_segments)
			set->cmd_size += sizeof(struct nvme_rdma_sgl) +
					 NVME_RDMA_METADATA_SGL_SIZE;
		set->driver_data = ctrl;
		set->nr_hw_queues = nctrl->queue_count - 1;
		set->timeout = NVME_IO_TIMEOUT;
		set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
	}

	ret = blk_mq_alloc_tag_set(set);
	if (ret)
		return ERR_PTR(ret);

	return set;
}

static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl,
		bool remove)
{
	if (remove) {
		blk_cleanup_queue(ctrl->ctrl.admin_q);
		blk_cleanup_queue(ctrl->ctrl.fabrics_q);
		blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
	}
	if (ctrl->async_event_sqe.data) {
		cancel_work_sync(&ctrl->ctrl.async_event_work);
		nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
				sizeof(struct nvme_command), DMA_TO_DEVICE);
		ctrl->async_event_sqe.data = NULL;
	}
	nvme_rdma_free_queue(&ctrl->queues[0]);
}

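/*
 * Bring up the admin queue: allocate and connect queue 0, probe the
 * device's fast-registration and T10-PI limits, allocate the async event
 * SQE, and (for a new controller) create the admin tag set and request
 * queues before enabling and identifying the controller.
 */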
static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
		bool new)
{
	bool pi_capable = false;
	int error;

	error = nvme_rdma_alloc_queue(ctrl, 0, NVME_AQ_DEPTH);
	if (error)
		return error;

	ctrl->device = ctrl->queues[0].device;
	ctrl->ctrl.numa_node = ibdev_to_node(ctrl->device->dev);

	/* T10-PI support */
	if (ctrl->device->dev->attrs.device_cap_flags &
	    IB_DEVICE_INTEGRITY_HANDOVER)
		pi_capable = true;

	ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev,
							pi_capable);

	/*
	 * Bind the async event SQE DMA mapping to the admin queue lifetime.
	 * It's safe, since any change in the underlying RDMA device will issue
	 * error recovery and queue re-creation.
	 */
	error = nvme_rdma_alloc_qe(ctrl->device->dev, &ctrl->async_event_sqe,
			sizeof(struct nvme_command), DMA_TO_DEVICE);
	if (error)
		goto out_free_queue;

	if (new) {
		ctrl->ctrl.admin_tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, true);
		if (IS_ERR(ctrl->ctrl.admin_tagset)) {
			error = PTR_ERR(ctrl->ctrl.admin_tagset);
			goto out_free_async_qe;
		}

		ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set);
		if (IS_ERR(ctrl->ctrl.fabrics_q)) {
			error = PTR_ERR(ctrl->ctrl.fabrics_q);
			goto out_free_tagset;
		}

		ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
		if (IS_ERR(ctrl->ctrl.admin_q)) {
			error = PTR_ERR(ctrl->ctrl.admin_q);
			goto out_cleanup_fabrics_q;
		}
	}

	error = nvme_rdma_start_queue(ctrl, 0);
	if (error)
		goto out_cleanup_queue;

	error = nvme_enable_ctrl(&ctrl->ctrl);
	if (error)
		goto out_stop_queue;

	ctrl->ctrl.max_segments = ctrl->max_fr_pages;
	ctrl->ctrl.max_hw_sectors = ctrl->max_fr_pages << (ilog2(SZ_4K) - 9);
	if (pi_capable)
		ctrl->ctrl.max_integrity_segments = ctrl->max_fr_pages;
	else
		ctrl->ctrl.max_integrity_segments = 0;

	blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);

	error = nvme_init_identify(&ctrl->ctrl);
	if (error)
		goto out_quiesce_queue;

	return 0;

out_quiesce_queue:
	blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
	blk_sync_queue(ctrl->ctrl.admin_q);
out_stop_queue:
	nvme_rdma_stop_queue(&ctrl->queues[0]);
	nvme_cancel_admin_tagset(&ctrl->ctrl);
out_cleanup_queue:
	if (new)
		blk_cleanup_queue(ctrl->ctrl.admin_q);
out_cleanup_fabrics_q:
	if (new)
		blk_cleanup_queue(ctrl->ctrl.fabrics_q);
out_free_tagset:
	if (new)
		blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
out_free_async_qe:
	if (ctrl->async_event_sqe.data) {
		nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
			sizeof(struct nvme_command), DMA_TO_DEVICE);
		ctrl->async_event_sqe.data = NULL;
	}
out_free_queue:
	nvme_rdma_free_queue(&ctrl->queues[0]);
	return error;
}

static void nvme_rdma_destroy_io_queues(struct nvme_rdma_ctrl *ctrl,
		bool remove)
{
	if (remove) {
		blk_cleanup_queue(ctrl->ctrl.connect_q);
		blk_mq_free_tag_set(ctrl->ctrl.tagset);
	}
	nvme_rdma_free_io_queues(ctrl);
}

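/*
 * Bring up the I/O queues.  For a new controller this also creates the I/O
 * tag set and connect_q; on reconnect it instead unfreezes the existing
 * queues and updates the number of hardware queues to match what was
 * re-negotiated with the target.
 */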
static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
{
	int ret;

	ret = nvme_rdma_alloc_io_queues(ctrl);
	if (ret)
		return ret;

	if (new) {
		ctrl->ctrl.tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, false);
		if (IS_ERR(ctrl->ctrl.tagset)) {
			ret = PTR_ERR(ctrl->ctrl.tagset);
			goto out_free_io_queues;
		}

		ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set);
		if (IS_ERR(ctrl->ctrl.connect_q)) {
			ret = PTR_ERR(ctrl->ctrl.connect_q);
			goto out_free_tag_set;
		}
	}

	ret = nvme_rdma_start_io_queues(ctrl);
	if (ret)
		goto out_cleanup_connect_q;

	if (!new) {
		nvme_start_queues(&ctrl->ctrl);
		if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) {
			/*
			 * If we timed out waiting for freeze we are likely to
			 * be stuck.  Fail the controller initialization just
			 * to be safe.
			 */
			ret = -ENODEV;
			goto out_wait_freeze_timed_out;
		}
		blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset,
			ctrl->ctrl.queue_count - 1);
		nvme_unfreeze(&ctrl->ctrl);
	}

	return 0;

out_wait_freeze_timed_out:
	nvme_stop_queues(&ctrl->ctrl);
	nvme_sync_io_queues(&ctrl->ctrl);
	nvme_rdma_stop_io_queues(ctrl);
out_cleanup_connect_q:
	nvme_cancel_tagset(&ctrl->ctrl);
	if (new)
		blk_cleanup_queue(ctrl->ctrl.connect_q);
out_free_tag_set:
	if (new)
		blk_mq_free_tag_set(ctrl->ctrl.tagset);
out_free_io_queues:
	nvme_rdma_free_io_queues(ctrl);
	return ret;
}

static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
		bool remove)
{
	blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
	blk_sync_queue(ctrl->ctrl.admin_q);
	nvme_rdma_stop_queue(&ctrl->queues[0]);
	if (ctrl->ctrl.admin_tagset) {
		blk_mq_tagset_busy_iter(ctrl->ctrl.admin_tagset,
			nvme_cancel_request, &ctrl->ctrl);
		blk_mq_tagset_wait_completed_request(ctrl->ctrl.admin_tagset);
	}
	if (remove)
		blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
	nvme_rdma_destroy_admin_queue(ctrl, remove);
}

static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
		bool remove)
{
	if (ctrl->ctrl.queue_count > 1) {
		nvme_start_freeze(&ctrl->ctrl);
		nvme_stop_queues(&ctrl->ctrl);
		nvme_sync_io_queues(&ctrl->ctrl);
		nvme_rdma_stop_io_queues(ctrl);
		if (ctrl->ctrl.tagset) {
			blk_mq_tagset_busy_iter(ctrl->ctrl.tagset,
				nvme_cancel_request, &ctrl->ctrl);
			blk_mq_tagset_wait_completed_request(ctrl->ctrl.tagset);
		}
		if (remove)
			nvme_start_queues(&ctrl->ctrl);
		nvme_rdma_destroy_io_queues(ctrl, remove);
	}
}

static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl)
{
	struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);

	cancel_work_sync(&ctrl->err_work);
	cancel_delayed_work_sync(&ctrl->reconnect_work);
}

static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
{
	struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);

	if (list_empty(&ctrl->list))
		goto free_ctrl;

	mutex_lock(&nvme_rdma_ctrl_mutex);
	list_del(&ctrl->list);
	mutex_unlock(&nvme_rdma_ctrl_mutex);

	nvmf_free_options(nctrl->opts);
free_ctrl:
	kfree(ctrl->queues);
	kfree(ctrl);
}

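/*
 * Decide what to do after a connection loss: schedule another reconnect
 * attempt if the controller is still in CONNECTING state and the reconnect
 * policy allows it, otherwise delete the controller.
 */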
static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl)
{
	/* If we are resetting/deleting then do nothing */
	if (ctrl->ctrl.state != NVME_CTRL_CONNECTING) {
		WARN_ON_ONCE(ctrl->ctrl.state == NVME_CTRL_NEW ||
			ctrl->ctrl.state == NVME_CTRL_LIVE);
		return;
	}

	if (nvmf_should_reconnect(&ctrl->ctrl)) {
		dev_info(ctrl->ctrl.device, "Reconnecting in %d seconds...\n",
			ctrl->ctrl.opts->reconnect_delay);
		queue_delayed_work(nvme_wq, &ctrl->reconnect_work,
				ctrl->ctrl.opts->reconnect_delay * HZ);
	} else {
		nvme_delete_ctrl(&ctrl->ctrl);
	}
}

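/*
 * Common controller bring-up used for both initial creation and
 * reconnects: configure the admin queue, validate the fabric parameters
 * reported by the target (ICDOFF, keyed SGLs, queue sizes), then bring up
 * the I/O queues and move the controller to LIVE.
 */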
nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl * ctrl,bool new)1104*4882a593Smuzhiyun static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
1105*4882a593Smuzhiyun {
1106*4882a593Smuzhiyun int ret = -EINVAL;
1107*4882a593Smuzhiyun bool changed;
1108*4882a593Smuzhiyun
1109*4882a593Smuzhiyun ret = nvme_rdma_configure_admin_queue(ctrl, new);
1110*4882a593Smuzhiyun if (ret)
1111*4882a593Smuzhiyun return ret;
1112*4882a593Smuzhiyun
1113*4882a593Smuzhiyun if (ctrl->ctrl.icdoff) {
1114*4882a593Smuzhiyun ret = -EOPNOTSUPP;
1115*4882a593Smuzhiyun dev_err(ctrl->ctrl.device, "icdoff is not supported!\n");
1116*4882a593Smuzhiyun goto destroy_admin;
1117*4882a593Smuzhiyun }
1118*4882a593Smuzhiyun
1119*4882a593Smuzhiyun if (!(ctrl->ctrl.sgls & (1 << 2))) {
1120*4882a593Smuzhiyun ret = -EOPNOTSUPP;
1121*4882a593Smuzhiyun dev_err(ctrl->ctrl.device,
1122*4882a593Smuzhiyun "Mandatory keyed sgls are not supported!\n");
1123*4882a593Smuzhiyun goto destroy_admin;
1124*4882a593Smuzhiyun }
1125*4882a593Smuzhiyun
1126*4882a593Smuzhiyun if (ctrl->ctrl.opts->queue_size > ctrl->ctrl.sqsize + 1) {
1127*4882a593Smuzhiyun dev_warn(ctrl->ctrl.device,
1128*4882a593Smuzhiyun "queue_size %zu > ctrl sqsize %u, clamping down\n",
1129*4882a593Smuzhiyun ctrl->ctrl.opts->queue_size, ctrl->ctrl.sqsize + 1);
1130*4882a593Smuzhiyun }
1131*4882a593Smuzhiyun
1132*4882a593Smuzhiyun if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) {
1133*4882a593Smuzhiyun dev_warn(ctrl->ctrl.device,
1134*4882a593Smuzhiyun "sqsize %u > ctrl maxcmd %u, clamping down\n",
1135*4882a593Smuzhiyun ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd);
1136*4882a593Smuzhiyun ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1;
1137*4882a593Smuzhiyun }
1138*4882a593Smuzhiyun
1139*4882a593Smuzhiyun if (ctrl->ctrl.sgls & (1 << 20))
1140*4882a593Smuzhiyun ctrl->use_inline_data = true;
1141*4882a593Smuzhiyun
1142*4882a593Smuzhiyun if (ctrl->ctrl.queue_count > 1) {
1143*4882a593Smuzhiyun ret = nvme_rdma_configure_io_queues(ctrl, new);
1144*4882a593Smuzhiyun if (ret)
1145*4882a593Smuzhiyun goto destroy_admin;
1146*4882a593Smuzhiyun }
1147*4882a593Smuzhiyun
1148*4882a593Smuzhiyun changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
1149*4882a593Smuzhiyun if (!changed) {
1150*4882a593Smuzhiyun /*
1151*4882a593Smuzhiyun * state change failure is ok if we started ctrl delete,
1152*4882a593Smuzhiyun * unless we're during creation of a new controller to
1153*4882a593Smuzhiyun * avoid races with teardown flow.
1154*4882a593Smuzhiyun */
1155*4882a593Smuzhiyun WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING &&
1156*4882a593Smuzhiyun ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO);
1157*4882a593Smuzhiyun WARN_ON_ONCE(new);
1158*4882a593Smuzhiyun ret = -EINVAL;
1159*4882a593Smuzhiyun goto destroy_io;
1160*4882a593Smuzhiyun }
1161*4882a593Smuzhiyun
1162*4882a593Smuzhiyun nvme_start_ctrl(&ctrl->ctrl);
1163*4882a593Smuzhiyun return 0;
1164*4882a593Smuzhiyun
1165*4882a593Smuzhiyun destroy_io:
1166*4882a593Smuzhiyun if (ctrl->ctrl.queue_count > 1) {
1167*4882a593Smuzhiyun nvme_stop_queues(&ctrl->ctrl);
1168*4882a593Smuzhiyun nvme_sync_io_queues(&ctrl->ctrl);
1169*4882a593Smuzhiyun nvme_rdma_stop_io_queues(ctrl);
1170*4882a593Smuzhiyun nvme_cancel_tagset(&ctrl->ctrl);
1171*4882a593Smuzhiyun nvme_rdma_destroy_io_queues(ctrl, new);
1172*4882a593Smuzhiyun }
1173*4882a593Smuzhiyun destroy_admin:
1174*4882a593Smuzhiyun blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
1175*4882a593Smuzhiyun blk_sync_queue(ctrl->ctrl.admin_q);
1176*4882a593Smuzhiyun nvme_rdma_stop_queue(&ctrl->queues[0]);
1177*4882a593Smuzhiyun nvme_cancel_admin_tagset(&ctrl->ctrl);
1178*4882a593Smuzhiyun nvme_rdma_destroy_admin_queue(ctrl, new);
1179*4882a593Smuzhiyun return ret;
1180*4882a593Smuzhiyun }
1181*4882a593Smuzhiyun
1182*4882a593Smuzhiyun static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
1183*4882a593Smuzhiyun {
1184*4882a593Smuzhiyun struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work),
1185*4882a593Smuzhiyun struct nvme_rdma_ctrl, reconnect_work);
1186*4882a593Smuzhiyun
1187*4882a593Smuzhiyun ++ctrl->ctrl.nr_reconnects;
1188*4882a593Smuzhiyun
1189*4882a593Smuzhiyun if (nvme_rdma_setup_ctrl(ctrl, false))
1190*4882a593Smuzhiyun goto requeue;
1191*4882a593Smuzhiyun
1192*4882a593Smuzhiyun dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n",
1193*4882a593Smuzhiyun ctrl->ctrl.nr_reconnects);
1194*4882a593Smuzhiyun
1195*4882a593Smuzhiyun ctrl->ctrl.nr_reconnects = 0;
1196*4882a593Smuzhiyun
1197*4882a593Smuzhiyun return;
1198*4882a593Smuzhiyun
1199*4882a593Smuzhiyun requeue:
1200*4882a593Smuzhiyun dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
1201*4882a593Smuzhiyun ctrl->ctrl.nr_reconnects);
1202*4882a593Smuzhiyun nvme_rdma_reconnect_or_remove(ctrl);
1203*4882a593Smuzhiyun }
1204*4882a593Smuzhiyun
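/*
 * Error recovery: quiesce keep-alive and async event handling, tear down
 * the I/O and admin queues (without freeing the tag sets), then either
 * schedule a reconnect or, if the controller is being deleted, bail out.
 */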
1205*4882a593Smuzhiyun static void nvme_rdma_error_recovery_work(struct work_struct *work)
1206*4882a593Smuzhiyun {
1207*4882a593Smuzhiyun struct nvme_rdma_ctrl *ctrl = container_of(work,
1208*4882a593Smuzhiyun struct nvme_rdma_ctrl, err_work);
1209*4882a593Smuzhiyun
1210*4882a593Smuzhiyun nvme_stop_keep_alive(&ctrl->ctrl);
1211*4882a593Smuzhiyun flush_work(&ctrl->ctrl.async_event_work);
1212*4882a593Smuzhiyun nvme_rdma_teardown_io_queues(ctrl, false);
1213*4882a593Smuzhiyun nvme_start_queues(&ctrl->ctrl);
1214*4882a593Smuzhiyun nvme_rdma_teardown_admin_queue(ctrl, false);
1215*4882a593Smuzhiyun blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
1216*4882a593Smuzhiyun
1217*4882a593Smuzhiyun if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
1218*4882a593Smuzhiyun /* state change failure is ok if we started ctrl delete */
1219*4882a593Smuzhiyun WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING &&
1220*4882a593Smuzhiyun ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO);
1221*4882a593Smuzhiyun return;
1222*4882a593Smuzhiyun }
1223*4882a593Smuzhiyun
1224*4882a593Smuzhiyun nvme_rdma_reconnect_or_remove(ctrl);
1225*4882a593Smuzhiyun }
1226*4882a593Smuzhiyun
1227*4882a593Smuzhiyun static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
1228*4882a593Smuzhiyun {
1229*4882a593Smuzhiyun if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
1230*4882a593Smuzhiyun return;
1231*4882a593Smuzhiyun
1232*4882a593Smuzhiyun dev_warn(ctrl->ctrl.device, "starting error recovery\n");
1233*4882a593Smuzhiyun queue_work(nvme_reset_wq, &ctrl->err_work);
1234*4882a593Smuzhiyun }
1235*4882a593Smuzhiyun
1236*4882a593Smuzhiyun static void nvme_rdma_end_request(struct nvme_rdma_request *req)
1237*4882a593Smuzhiyun {
1238*4882a593Smuzhiyun struct request *rq = blk_mq_rq_from_pdu(req);
1239*4882a593Smuzhiyun
1240*4882a593Smuzhiyun if (!refcount_dec_and_test(&req->ref))
1241*4882a593Smuzhiyun return;
1242*4882a593Smuzhiyun if (!nvme_try_complete_req(rq, req->status, req->result))
1243*4882a593Smuzhiyun nvme_rdma_complete_rq(rq);
1244*4882a593Smuzhiyun }
1245*4882a593Smuzhiyun
1246*4882a593Smuzhiyun static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc,
1247*4882a593Smuzhiyun const char *op)
1248*4882a593Smuzhiyun {
1249*4882a593Smuzhiyun struct nvme_rdma_queue *queue = wc->qp->qp_context;
1250*4882a593Smuzhiyun struct nvme_rdma_ctrl *ctrl = queue->ctrl;
1251*4882a593Smuzhiyun
1252*4882a593Smuzhiyun if (ctrl->ctrl.state == NVME_CTRL_LIVE)
1253*4882a593Smuzhiyun dev_info(ctrl->ctrl.device,
1254*4882a593Smuzhiyun "%s for CQE 0x%p failed with status %s (%d)\n",
1255*4882a593Smuzhiyun op, wc->wr_cqe,
1256*4882a593Smuzhiyun ib_wc_status_msg(wc->status), wc->status);
1257*4882a593Smuzhiyun nvme_rdma_error_recovery(ctrl);
1258*4882a593Smuzhiyun }
1259*4882a593Smuzhiyun
1260*4882a593Smuzhiyun static void nvme_rdma_memreg_done(struct ib_cq *cq, struct ib_wc *wc)
1261*4882a593Smuzhiyun {
1262*4882a593Smuzhiyun if (unlikely(wc->status != IB_WC_SUCCESS))
1263*4882a593Smuzhiyun nvme_rdma_wr_error(cq, wc, "MEMREG");
1264*4882a593Smuzhiyun }
1265*4882a593Smuzhiyun
1266*4882a593Smuzhiyun static void nvme_rdma_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc)
1267*4882a593Smuzhiyun {
1268*4882a593Smuzhiyun struct nvme_rdma_request *req =
1269*4882a593Smuzhiyun container_of(wc->wr_cqe, struct nvme_rdma_request, reg_cqe);
1270*4882a593Smuzhiyun
1271*4882a593Smuzhiyun if (unlikely(wc->status != IB_WC_SUCCESS))
1272*4882a593Smuzhiyun nvme_rdma_wr_error(cq, wc, "LOCAL_INV");
1273*4882a593Smuzhiyun else
1274*4882a593Smuzhiyun nvme_rdma_end_request(req);
1275*4882a593Smuzhiyun }
1276*4882a593Smuzhiyun
1277*4882a593Smuzhiyun static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue,
1278*4882a593Smuzhiyun struct nvme_rdma_request *req)
1279*4882a593Smuzhiyun {
1280*4882a593Smuzhiyun struct ib_send_wr wr = {
1281*4882a593Smuzhiyun .opcode = IB_WR_LOCAL_INV,
1282*4882a593Smuzhiyun .next = NULL,
1283*4882a593Smuzhiyun .num_sge = 0,
1284*4882a593Smuzhiyun .send_flags = IB_SEND_SIGNALED,
1285*4882a593Smuzhiyun .ex.invalidate_rkey = req->mr->rkey,
1286*4882a593Smuzhiyun };
1287*4882a593Smuzhiyun
1288*4882a593Smuzhiyun req->reg_cqe.done = nvme_rdma_inv_rkey_done;
1289*4882a593Smuzhiyun wr.wr_cqe = &req->reg_cqe;
1290*4882a593Smuzhiyun
1291*4882a593Smuzhiyun return ib_post_send(queue->qp, &wr, NULL);
1292*4882a593Smuzhiyun }
1293*4882a593Smuzhiyun
1294*4882a593Smuzhiyun static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue,
1295*4882a593Smuzhiyun struct request *rq)
1296*4882a593Smuzhiyun {
1297*4882a593Smuzhiyun struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
1298*4882a593Smuzhiyun struct nvme_rdma_device *dev = queue->device;
1299*4882a593Smuzhiyun struct ib_device *ibdev = dev->dev;
1300*4882a593Smuzhiyun struct list_head *pool = &queue->qp->rdma_mrs;
1301*4882a593Smuzhiyun
1302*4882a593Smuzhiyun if (!blk_rq_nr_phys_segments(rq))
1303*4882a593Smuzhiyun return;
1304*4882a593Smuzhiyun
1305*4882a593Smuzhiyun if (blk_integrity_rq(rq)) {
1306*4882a593Smuzhiyun ib_dma_unmap_sg(ibdev, req->metadata_sgl->sg_table.sgl,
1307*4882a593Smuzhiyun req->metadata_sgl->nents, rq_dma_dir(rq));
1308*4882a593Smuzhiyun sg_free_table_chained(&req->metadata_sgl->sg_table,
1309*4882a593Smuzhiyun NVME_INLINE_METADATA_SG_CNT);
1310*4882a593Smuzhiyun }
1311*4882a593Smuzhiyun
1312*4882a593Smuzhiyun if (req->use_sig_mr)
1313*4882a593Smuzhiyun pool = &queue->qp->sig_mrs;
1314*4882a593Smuzhiyun
1315*4882a593Smuzhiyun if (req->mr) {
1316*4882a593Smuzhiyun ib_mr_pool_put(queue->qp, pool, req->mr);
1317*4882a593Smuzhiyun req->mr = NULL;
1318*4882a593Smuzhiyun }
1319*4882a593Smuzhiyun
1320*4882a593Smuzhiyun ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents,
1321*4882a593Smuzhiyun rq_dma_dir(rq));
1322*4882a593Smuzhiyun sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT);
1323*4882a593Smuzhiyun }
1324*4882a593Smuzhiyun
1325*4882a593Smuzhiyun static int nvme_rdma_set_sg_null(struct nvme_command *c)
1326*4882a593Smuzhiyun {
1327*4882a593Smuzhiyun struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
1328*4882a593Smuzhiyun
1329*4882a593Smuzhiyun sg->addr = 0;
1330*4882a593Smuzhiyun put_unaligned_le24(0, sg->length);
1331*4882a593Smuzhiyun put_unaligned_le32(0, sg->key);
1332*4882a593Smuzhiyun sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
1333*4882a593Smuzhiyun return 0;
1334*4882a593Smuzhiyun }
1335*4882a593Smuzhiyun
1336*4882a593Smuzhiyun static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue,
1337*4882a593Smuzhiyun struct nvme_rdma_request *req, struct nvme_command *c,
1338*4882a593Smuzhiyun int count)
1339*4882a593Smuzhiyun {
1340*4882a593Smuzhiyun struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
1341*4882a593Smuzhiyun struct ib_sge *sge = &req->sge[1];
1342*4882a593Smuzhiyun struct scatterlist *sgl;
1343*4882a593Smuzhiyun u32 len = 0;
1344*4882a593Smuzhiyun int i;
1345*4882a593Smuzhiyun
1346*4882a593Smuzhiyun for_each_sg(req->data_sgl.sg_table.sgl, sgl, count, i) {
1347*4882a593Smuzhiyun sge->addr = sg_dma_address(sgl);
1348*4882a593Smuzhiyun sge->length = sg_dma_len(sgl);
1349*4882a593Smuzhiyun sge->lkey = queue->device->pd->local_dma_lkey;
1350*4882a593Smuzhiyun len += sge->length;
1351*4882a593Smuzhiyun sge++;
1352*4882a593Smuzhiyun }
1353*4882a593Smuzhiyun
1354*4882a593Smuzhiyun sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
1355*4882a593Smuzhiyun sg->length = cpu_to_le32(len);
1356*4882a593Smuzhiyun sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
1357*4882a593Smuzhiyun
1358*4882a593Smuzhiyun req->num_sge += count;
1359*4882a593Smuzhiyun return 0;
1360*4882a593Smuzhiyun }
1361*4882a593Smuzhiyun
1362*4882a593Smuzhiyun static int nvme_rdma_map_sg_single(struct nvme_rdma_queue *queue,
1363*4882a593Smuzhiyun struct nvme_rdma_request *req, struct nvme_command *c)
1364*4882a593Smuzhiyun {
1365*4882a593Smuzhiyun struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
1366*4882a593Smuzhiyun
1367*4882a593Smuzhiyun sg->addr = cpu_to_le64(sg_dma_address(req->data_sgl.sg_table.sgl));
1368*4882a593Smuzhiyun put_unaligned_le24(sg_dma_len(req->data_sgl.sg_table.sgl), sg->length);
1369*4882a593Smuzhiyun put_unaligned_le32(queue->device->pd->unsafe_global_rkey, sg->key);
1370*4882a593Smuzhiyun sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
1371*4882a593Smuzhiyun return 0;
1372*4882a593Smuzhiyun }
1373*4882a593Smuzhiyun
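/*
 * Register the data scatterlist through a fast-registration MR from the
 * queue's MR pool and describe it to the target as a single keyed SGL
 * that the target may remotely invalidate.
 */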
1374*4882a593Smuzhiyun static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue,
1375*4882a593Smuzhiyun struct nvme_rdma_request *req, struct nvme_command *c,
1376*4882a593Smuzhiyun int count)
1377*4882a593Smuzhiyun {
1378*4882a593Smuzhiyun struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
1379*4882a593Smuzhiyun int nr;
1380*4882a593Smuzhiyun
1381*4882a593Smuzhiyun req->mr = ib_mr_pool_get(queue->qp, &queue->qp->rdma_mrs);
1382*4882a593Smuzhiyun if (WARN_ON_ONCE(!req->mr))
1383*4882a593Smuzhiyun return -EAGAIN;
1384*4882a593Smuzhiyun
1385*4882a593Smuzhiyun /*
1386*4882a593Smuzhiyun * Align the MR to a 4K page size to match the ctrl page size and
1387*4882a593Smuzhiyun * the block virtual boundary.
1388*4882a593Smuzhiyun */
1389*4882a593Smuzhiyun nr = ib_map_mr_sg(req->mr, req->data_sgl.sg_table.sgl, count, NULL,
1390*4882a593Smuzhiyun SZ_4K);
1391*4882a593Smuzhiyun if (unlikely(nr < count)) {
1392*4882a593Smuzhiyun ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr);
1393*4882a593Smuzhiyun req->mr = NULL;
1394*4882a593Smuzhiyun if (nr < 0)
1395*4882a593Smuzhiyun return nr;
1396*4882a593Smuzhiyun return -EINVAL;
1397*4882a593Smuzhiyun }
1398*4882a593Smuzhiyun
1399*4882a593Smuzhiyun ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey));
1400*4882a593Smuzhiyun
1401*4882a593Smuzhiyun req->reg_cqe.done = nvme_rdma_memreg_done;
1402*4882a593Smuzhiyun memset(&req->reg_wr, 0, sizeof(req->reg_wr));
1403*4882a593Smuzhiyun req->reg_wr.wr.opcode = IB_WR_REG_MR;
1404*4882a593Smuzhiyun req->reg_wr.wr.wr_cqe = &req->reg_cqe;
1405*4882a593Smuzhiyun req->reg_wr.wr.num_sge = 0;
1406*4882a593Smuzhiyun req->reg_wr.mr = req->mr;
1407*4882a593Smuzhiyun req->reg_wr.key = req->mr->rkey;
1408*4882a593Smuzhiyun req->reg_wr.access = IB_ACCESS_LOCAL_WRITE |
1409*4882a593Smuzhiyun IB_ACCESS_REMOTE_READ |
1410*4882a593Smuzhiyun IB_ACCESS_REMOTE_WRITE;
1411*4882a593Smuzhiyun
1412*4882a593Smuzhiyun sg->addr = cpu_to_le64(req->mr->iova);
1413*4882a593Smuzhiyun put_unaligned_le24(req->mr->length, sg->length);
1414*4882a593Smuzhiyun put_unaligned_le32(req->mr->rkey, sg->key);
1415*4882a593Smuzhiyun sg->type = (NVME_KEY_SGL_FMT_DATA_DESC << 4) |
1416*4882a593Smuzhiyun NVME_SGL_FMT_INVALIDATE;
1417*4882a593Smuzhiyun
1418*4882a593Smuzhiyun return 0;
1419*4882a593Smuzhiyun }
1420*4882a593Smuzhiyun
1421*4882a593Smuzhiyun static void nvme_rdma_set_sig_domain(struct blk_integrity *bi,
1422*4882a593Smuzhiyun struct nvme_command *cmd, struct ib_sig_domain *domain,
1423*4882a593Smuzhiyun u16 control, u8 pi_type)
1424*4882a593Smuzhiyun {
1425*4882a593Smuzhiyun domain->sig_type = IB_SIG_TYPE_T10_DIF;
1426*4882a593Smuzhiyun domain->sig.dif.bg_type = IB_T10DIF_CRC;
1427*4882a593Smuzhiyun domain->sig.dif.pi_interval = 1 << bi->interval_exp;
1428*4882a593Smuzhiyun domain->sig.dif.ref_tag = le32_to_cpu(cmd->rw.reftag);
1429*4882a593Smuzhiyun if (control & NVME_RW_PRINFO_PRCHK_REF)
1430*4882a593Smuzhiyun domain->sig.dif.ref_remap = true;
1431*4882a593Smuzhiyun
1432*4882a593Smuzhiyun domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.apptag);
1433*4882a593Smuzhiyun domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.appmask);
1434*4882a593Smuzhiyun domain->sig.dif.app_escape = true;
1435*4882a593Smuzhiyun if (pi_type == NVME_NS_DPS_PI_TYPE3)
1436*4882a593Smuzhiyun domain->sig.dif.ref_escape = true;
1437*4882a593Smuzhiyun }
1438*4882a593Smuzhiyun
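/*
 * Build the T10-DIF signature attributes for an integrity-enabled I/O.
 * With PRACT set only the wire domain carries protection information (the
 * HCA inserts/strips it), so PRACT is cleared before the command goes out;
 * otherwise both the wire and memory domains are populated.
 */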
1439*4882a593Smuzhiyun static void nvme_rdma_set_sig_attrs(struct blk_integrity *bi,
1440*4882a593Smuzhiyun struct nvme_command *cmd, struct ib_sig_attrs *sig_attrs,
1441*4882a593Smuzhiyun u8 pi_type)
1442*4882a593Smuzhiyun {
1443*4882a593Smuzhiyun u16 control = le16_to_cpu(cmd->rw.control);
1444*4882a593Smuzhiyun
1445*4882a593Smuzhiyun memset(sig_attrs, 0, sizeof(*sig_attrs));
1446*4882a593Smuzhiyun if (control & NVME_RW_PRINFO_PRACT) {
1447*4882a593Smuzhiyun /* for WRITE_INSERT/READ_STRIP no memory domain */
1448*4882a593Smuzhiyun sig_attrs->mem.sig_type = IB_SIG_TYPE_NONE;
1449*4882a593Smuzhiyun nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->wire, control,
1450*4882a593Smuzhiyun pi_type);
1451*4882a593Smuzhiyun /* Clear the PRACT bit since HCA will generate/verify the PI */
1452*4882a593Smuzhiyun control &= ~NVME_RW_PRINFO_PRACT;
1453*4882a593Smuzhiyun cmd->rw.control = cpu_to_le16(control);
1454*4882a593Smuzhiyun } else {
1455*4882a593Smuzhiyun /* for WRITE_PASS/READ_PASS both wire/memory domains exist */
1456*4882a593Smuzhiyun nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->wire, control,
1457*4882a593Smuzhiyun pi_type);
1458*4882a593Smuzhiyun nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->mem, control,
1459*4882a593Smuzhiyun pi_type);
1460*4882a593Smuzhiyun }
1461*4882a593Smuzhiyun }
1462*4882a593Smuzhiyun
1463*4882a593Smuzhiyun static void nvme_rdma_set_prot_checks(struct nvme_command *cmd, u8 *mask)
1464*4882a593Smuzhiyun {
1465*4882a593Smuzhiyun *mask = 0;
1466*4882a593Smuzhiyun if (le16_to_cpu(cmd->rw.control) & NVME_RW_PRINFO_PRCHK_REF)
1467*4882a593Smuzhiyun *mask |= IB_SIG_CHECK_REFTAG;
1468*4882a593Smuzhiyun if (le16_to_cpu(cmd->rw.control) & NVME_RW_PRINFO_PRCHK_GUARD)
1469*4882a593Smuzhiyun *mask |= IB_SIG_CHECK_GUARD;
1470*4882a593Smuzhiyun }
1471*4882a593Smuzhiyun
1472*4882a593Smuzhiyun static void nvme_rdma_sig_done(struct ib_cq *cq, struct ib_wc *wc)
1473*4882a593Smuzhiyun {
1474*4882a593Smuzhiyun if (unlikely(wc->status != IB_WC_SUCCESS))
1475*4882a593Smuzhiyun nvme_rdma_wr_error(cq, wc, "SIG");
1476*4882a593Smuzhiyun }
1477*4882a593Smuzhiyun
1478*4882a593Smuzhiyun static int nvme_rdma_map_sg_pi(struct nvme_rdma_queue *queue,
1479*4882a593Smuzhiyun struct nvme_rdma_request *req, struct nvme_command *c,
1480*4882a593Smuzhiyun int count, int pi_count)
1481*4882a593Smuzhiyun {
1482*4882a593Smuzhiyun struct nvme_rdma_sgl *sgl = &req->data_sgl;
1483*4882a593Smuzhiyun struct ib_reg_wr *wr = &req->reg_wr;
1484*4882a593Smuzhiyun struct request *rq = blk_mq_rq_from_pdu(req);
1485*4882a593Smuzhiyun struct nvme_ns *ns = rq->q->queuedata;
1486*4882a593Smuzhiyun struct bio *bio = rq->bio;
1487*4882a593Smuzhiyun struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
1488*4882a593Smuzhiyun int nr;
1489*4882a593Smuzhiyun
1490*4882a593Smuzhiyun req->mr = ib_mr_pool_get(queue->qp, &queue->qp->sig_mrs);
1491*4882a593Smuzhiyun if (WARN_ON_ONCE(!req->mr))
1492*4882a593Smuzhiyun return -EAGAIN;
1493*4882a593Smuzhiyun
1494*4882a593Smuzhiyun nr = ib_map_mr_sg_pi(req->mr, sgl->sg_table.sgl, count, NULL,
1495*4882a593Smuzhiyun req->metadata_sgl->sg_table.sgl, pi_count, NULL,
1496*4882a593Smuzhiyun SZ_4K);
1497*4882a593Smuzhiyun if (unlikely(nr))
1498*4882a593Smuzhiyun goto mr_put;
1499*4882a593Smuzhiyun
1500*4882a593Smuzhiyun nvme_rdma_set_sig_attrs(blk_get_integrity(bio->bi_disk), c,
1501*4882a593Smuzhiyun req->mr->sig_attrs, ns->pi_type);
1502*4882a593Smuzhiyun nvme_rdma_set_prot_checks(c, &req->mr->sig_attrs->check_mask);
1503*4882a593Smuzhiyun
1504*4882a593Smuzhiyun ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey));
1505*4882a593Smuzhiyun
1506*4882a593Smuzhiyun req->reg_cqe.done = nvme_rdma_sig_done;
1507*4882a593Smuzhiyun memset(wr, 0, sizeof(*wr));
1508*4882a593Smuzhiyun wr->wr.opcode = IB_WR_REG_MR_INTEGRITY;
1509*4882a593Smuzhiyun wr->wr.wr_cqe = &req->reg_cqe;
1510*4882a593Smuzhiyun wr->wr.num_sge = 0;
1511*4882a593Smuzhiyun wr->wr.send_flags = 0;
1512*4882a593Smuzhiyun wr->mr = req->mr;
1513*4882a593Smuzhiyun wr->key = req->mr->rkey;
1514*4882a593Smuzhiyun wr->access = IB_ACCESS_LOCAL_WRITE |
1515*4882a593Smuzhiyun IB_ACCESS_REMOTE_READ |
1516*4882a593Smuzhiyun IB_ACCESS_REMOTE_WRITE;
1517*4882a593Smuzhiyun
1518*4882a593Smuzhiyun sg->addr = cpu_to_le64(req->mr->iova);
1519*4882a593Smuzhiyun put_unaligned_le24(req->mr->length, sg->length);
1520*4882a593Smuzhiyun put_unaligned_le32(req->mr->rkey, sg->key);
1521*4882a593Smuzhiyun sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
1522*4882a593Smuzhiyun
1523*4882a593Smuzhiyun return 0;
1524*4882a593Smuzhiyun
1525*4882a593Smuzhiyun mr_put:
1526*4882a593Smuzhiyun ib_mr_pool_put(queue->qp, &queue->qp->sig_mrs, req->mr);
1527*4882a593Smuzhiyun req->mr = NULL;
1528*4882a593Smuzhiyun if (nr < 0)
1529*4882a593Smuzhiyun return nr;
1530*4882a593Smuzhiyun return -EINVAL;
1531*4882a593Smuzhiyun }
1532*4882a593Smuzhiyun
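/*
 * Map the request data for RDMA and fill in the command's data pointer.
 * Strategy, roughly in order of preference: no data -> NULL SGL; small
 * writes on I/O queues -> inline data SGEs; a single segment with an
 * unsafe global rkey -> plain keyed SGL; otherwise (and always for
 * PI-enabled I/O) -> memory registration.
 */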
1533*4882a593Smuzhiyun static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
1534*4882a593Smuzhiyun struct request *rq, struct nvme_command *c)
1535*4882a593Smuzhiyun {
1536*4882a593Smuzhiyun struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
1537*4882a593Smuzhiyun struct nvme_rdma_device *dev = queue->device;
1538*4882a593Smuzhiyun struct ib_device *ibdev = dev->dev;
1539*4882a593Smuzhiyun int pi_count = 0;
1540*4882a593Smuzhiyun int count, ret;
1541*4882a593Smuzhiyun
1542*4882a593Smuzhiyun req->num_sge = 1;
1543*4882a593Smuzhiyun refcount_set(&req->ref, 2); /* send and recv completions */
1544*4882a593Smuzhiyun
1545*4882a593Smuzhiyun c->common.flags |= NVME_CMD_SGL_METABUF;
1546*4882a593Smuzhiyun
1547*4882a593Smuzhiyun if (!blk_rq_nr_phys_segments(rq))
1548*4882a593Smuzhiyun return nvme_rdma_set_sg_null(c);
1549*4882a593Smuzhiyun
1550*4882a593Smuzhiyun req->data_sgl.sg_table.sgl = (struct scatterlist *)(req + 1);
1551*4882a593Smuzhiyun ret = sg_alloc_table_chained(&req->data_sgl.sg_table,
1552*4882a593Smuzhiyun blk_rq_nr_phys_segments(rq), req->data_sgl.sg_table.sgl,
1553*4882a593Smuzhiyun NVME_INLINE_SG_CNT);
1554*4882a593Smuzhiyun if (ret)
1555*4882a593Smuzhiyun return -ENOMEM;
1556*4882a593Smuzhiyun
1557*4882a593Smuzhiyun req->data_sgl.nents = blk_rq_map_sg(rq->q, rq,
1558*4882a593Smuzhiyun req->data_sgl.sg_table.sgl);
1559*4882a593Smuzhiyun
1560*4882a593Smuzhiyun count = ib_dma_map_sg(ibdev, req->data_sgl.sg_table.sgl,
1561*4882a593Smuzhiyun req->data_sgl.nents, rq_dma_dir(rq));
1562*4882a593Smuzhiyun if (unlikely(count <= 0)) {
1563*4882a593Smuzhiyun ret = -EIO;
1564*4882a593Smuzhiyun goto out_free_table;
1565*4882a593Smuzhiyun }
1566*4882a593Smuzhiyun
1567*4882a593Smuzhiyun if (blk_integrity_rq(rq)) {
1568*4882a593Smuzhiyun req->metadata_sgl->sg_table.sgl =
1569*4882a593Smuzhiyun (struct scatterlist *)(req->metadata_sgl + 1);
1570*4882a593Smuzhiyun ret = sg_alloc_table_chained(&req->metadata_sgl->sg_table,
1571*4882a593Smuzhiyun blk_rq_count_integrity_sg(rq->q, rq->bio),
1572*4882a593Smuzhiyun req->metadata_sgl->sg_table.sgl,
1573*4882a593Smuzhiyun NVME_INLINE_METADATA_SG_CNT);
1574*4882a593Smuzhiyun if (unlikely(ret)) {
1575*4882a593Smuzhiyun ret = -ENOMEM;
1576*4882a593Smuzhiyun goto out_unmap_sg;
1577*4882a593Smuzhiyun }
1578*4882a593Smuzhiyun
1579*4882a593Smuzhiyun req->metadata_sgl->nents = blk_rq_map_integrity_sg(rq->q,
1580*4882a593Smuzhiyun rq->bio, req->metadata_sgl->sg_table.sgl);
1581*4882a593Smuzhiyun pi_count = ib_dma_map_sg(ibdev,
1582*4882a593Smuzhiyun req->metadata_sgl->sg_table.sgl,
1583*4882a593Smuzhiyun req->metadata_sgl->nents,
1584*4882a593Smuzhiyun rq_dma_dir(rq));
1585*4882a593Smuzhiyun if (unlikely(pi_count <= 0)) {
1586*4882a593Smuzhiyun ret = -EIO;
1587*4882a593Smuzhiyun goto out_free_pi_table;
1588*4882a593Smuzhiyun }
1589*4882a593Smuzhiyun }
1590*4882a593Smuzhiyun
1591*4882a593Smuzhiyun if (req->use_sig_mr) {
1592*4882a593Smuzhiyun ret = nvme_rdma_map_sg_pi(queue, req, c, count, pi_count);
1593*4882a593Smuzhiyun goto out;
1594*4882a593Smuzhiyun }
1595*4882a593Smuzhiyun
1596*4882a593Smuzhiyun if (count <= dev->num_inline_segments) {
1597*4882a593Smuzhiyun if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) &&
1598*4882a593Smuzhiyun queue->ctrl->use_inline_data &&
1599*4882a593Smuzhiyun blk_rq_payload_bytes(rq) <=
1600*4882a593Smuzhiyun nvme_rdma_inline_data_size(queue)) {
1601*4882a593Smuzhiyun ret = nvme_rdma_map_sg_inline(queue, req, c, count);
1602*4882a593Smuzhiyun goto out;
1603*4882a593Smuzhiyun }
1604*4882a593Smuzhiyun
1605*4882a593Smuzhiyun if (count == 1 && dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) {
1606*4882a593Smuzhiyun ret = nvme_rdma_map_sg_single(queue, req, c);
1607*4882a593Smuzhiyun goto out;
1608*4882a593Smuzhiyun }
1609*4882a593Smuzhiyun }
1610*4882a593Smuzhiyun
1611*4882a593Smuzhiyun ret = nvme_rdma_map_sg_fr(queue, req, c, count);
1612*4882a593Smuzhiyun out:
1613*4882a593Smuzhiyun if (unlikely(ret))
1614*4882a593Smuzhiyun goto out_unmap_pi_sg;
1615*4882a593Smuzhiyun
1616*4882a593Smuzhiyun return 0;
1617*4882a593Smuzhiyun
1618*4882a593Smuzhiyun out_unmap_pi_sg:
1619*4882a593Smuzhiyun if (blk_integrity_rq(rq))
1620*4882a593Smuzhiyun ib_dma_unmap_sg(ibdev, req->metadata_sgl->sg_table.sgl,
1621*4882a593Smuzhiyun req->metadata_sgl->nents, rq_dma_dir(rq));
1622*4882a593Smuzhiyun out_free_pi_table:
1623*4882a593Smuzhiyun if (blk_integrity_rq(rq))
1624*4882a593Smuzhiyun sg_free_table_chained(&req->metadata_sgl->sg_table,
1625*4882a593Smuzhiyun NVME_INLINE_METADATA_SG_CNT);
1626*4882a593Smuzhiyun out_unmap_sg:
1627*4882a593Smuzhiyun ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents,
1628*4882a593Smuzhiyun rq_dma_dir(rq));
1629*4882a593Smuzhiyun out_free_table:
1630*4882a593Smuzhiyun sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT);
1631*4882a593Smuzhiyun return ret;
1632*4882a593Smuzhiyun }
1633*4882a593Smuzhiyun
1634*4882a593Smuzhiyun static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
1635*4882a593Smuzhiyun {
1636*4882a593Smuzhiyun struct nvme_rdma_qe *qe =
1637*4882a593Smuzhiyun container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
1638*4882a593Smuzhiyun struct nvme_rdma_request *req =
1639*4882a593Smuzhiyun container_of(qe, struct nvme_rdma_request, sqe);
1640*4882a593Smuzhiyun
1641*4882a593Smuzhiyun if (unlikely(wc->status != IB_WC_SUCCESS))
1642*4882a593Smuzhiyun nvme_rdma_wr_error(cq, wc, "SEND");
1643*4882a593Smuzhiyun else
1644*4882a593Smuzhiyun nvme_rdma_end_request(req);
1645*4882a593Smuzhiyun }
1646*4882a593Smuzhiyun
1647*4882a593Smuzhiyun static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
1648*4882a593Smuzhiyun struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge,
1649*4882a593Smuzhiyun struct ib_send_wr *first)
1650*4882a593Smuzhiyun {
1651*4882a593Smuzhiyun struct ib_send_wr wr;
1652*4882a593Smuzhiyun int ret;
1653*4882a593Smuzhiyun
1654*4882a593Smuzhiyun sge->addr = qe->dma;
1655*4882a593Smuzhiyun sge->length = sizeof(struct nvme_command);
1656*4882a593Smuzhiyun sge->lkey = queue->device->pd->local_dma_lkey;
1657*4882a593Smuzhiyun
1658*4882a593Smuzhiyun wr.next = NULL;
1659*4882a593Smuzhiyun wr.wr_cqe = &qe->cqe;
1660*4882a593Smuzhiyun wr.sg_list = sge;
1661*4882a593Smuzhiyun wr.num_sge = num_sge;
1662*4882a593Smuzhiyun wr.opcode = IB_WR_SEND;
1663*4882a593Smuzhiyun wr.send_flags = IB_SEND_SIGNALED;
1664*4882a593Smuzhiyun
1665*4882a593Smuzhiyun if (first)
1666*4882a593Smuzhiyun first->next = ≀
1667*4882a593Smuzhiyun else
1668*4882a593Smuzhiyun first = ≀
1669*4882a593Smuzhiyun
1670*4882a593Smuzhiyun ret = ib_post_send(queue->qp, first, NULL);
1671*4882a593Smuzhiyun if (unlikely(ret)) {
1672*4882a593Smuzhiyun dev_err(queue->ctrl->ctrl.device,
1673*4882a593Smuzhiyun "%s failed with error code %d\n", __func__, ret);
1674*4882a593Smuzhiyun }
1675*4882a593Smuzhiyun return ret;
1676*4882a593Smuzhiyun }
1677*4882a593Smuzhiyun
1678*4882a593Smuzhiyun static int nvme_rdma_post_recv(struct nvme_rdma_queue *queue,
1679*4882a593Smuzhiyun struct nvme_rdma_qe *qe)
1680*4882a593Smuzhiyun {
1681*4882a593Smuzhiyun struct ib_recv_wr wr;
1682*4882a593Smuzhiyun struct ib_sge list;
1683*4882a593Smuzhiyun int ret;
1684*4882a593Smuzhiyun
1685*4882a593Smuzhiyun list.addr = qe->dma;
1686*4882a593Smuzhiyun list.length = sizeof(struct nvme_completion);
1687*4882a593Smuzhiyun list.lkey = queue->device->pd->local_dma_lkey;
1688*4882a593Smuzhiyun
1689*4882a593Smuzhiyun qe->cqe.done = nvme_rdma_recv_done;
1690*4882a593Smuzhiyun
1691*4882a593Smuzhiyun wr.next = NULL;
1692*4882a593Smuzhiyun wr.wr_cqe = &qe->cqe;
1693*4882a593Smuzhiyun wr.sg_list = &list;
1694*4882a593Smuzhiyun wr.num_sge = 1;
1695*4882a593Smuzhiyun
1696*4882a593Smuzhiyun ret = ib_post_recv(queue->qp, &wr, NULL);
1697*4882a593Smuzhiyun if (unlikely(ret)) {
1698*4882a593Smuzhiyun dev_err(queue->ctrl->ctrl.device,
1699*4882a593Smuzhiyun "%s failed with error code %d\n", __func__, ret);
1700*4882a593Smuzhiyun }
1701*4882a593Smuzhiyun return ret;
1702*4882a593Smuzhiyun }
1703*4882a593Smuzhiyun
1704*4882a593Smuzhiyun static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue)
1705*4882a593Smuzhiyun {
1706*4882a593Smuzhiyun u32 queue_idx = nvme_rdma_queue_idx(queue);
1707*4882a593Smuzhiyun
1708*4882a593Smuzhiyun if (queue_idx == 0)
1709*4882a593Smuzhiyun return queue->ctrl->admin_tag_set.tags[queue_idx];
1710*4882a593Smuzhiyun return queue->ctrl->tag_set.tags[queue_idx - 1];
1711*4882a593Smuzhiyun }
1712*4882a593Smuzhiyun
1713*4882a593Smuzhiyun static void nvme_rdma_async_done(struct ib_cq *cq, struct ib_wc *wc)
1714*4882a593Smuzhiyun {
1715*4882a593Smuzhiyun if (unlikely(wc->status != IB_WC_SUCCESS))
1716*4882a593Smuzhiyun nvme_rdma_wr_error(cq, wc, "ASYNC");
1717*4882a593Smuzhiyun }
1718*4882a593Smuzhiyun
1719*4882a593Smuzhiyun static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg)
1720*4882a593Smuzhiyun {
1721*4882a593Smuzhiyun struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg);
1722*4882a593Smuzhiyun struct nvme_rdma_queue *queue = &ctrl->queues[0];
1723*4882a593Smuzhiyun struct ib_device *dev = queue->device->dev;
1724*4882a593Smuzhiyun struct nvme_rdma_qe *sqe = &ctrl->async_event_sqe;
1725*4882a593Smuzhiyun struct nvme_command *cmd = sqe->data;
1726*4882a593Smuzhiyun struct ib_sge sge;
1727*4882a593Smuzhiyun int ret;
1728*4882a593Smuzhiyun
1729*4882a593Smuzhiyun ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(*cmd), DMA_TO_DEVICE);
1730*4882a593Smuzhiyun
1731*4882a593Smuzhiyun memset(cmd, 0, sizeof(*cmd));
1732*4882a593Smuzhiyun cmd->common.opcode = nvme_admin_async_event;
1733*4882a593Smuzhiyun cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
1734*4882a593Smuzhiyun cmd->common.flags |= NVME_CMD_SGL_METABUF;
1735*4882a593Smuzhiyun nvme_rdma_set_sg_null(cmd);
1736*4882a593Smuzhiyun
1737*4882a593Smuzhiyun sqe->cqe.done = nvme_rdma_async_done;
1738*4882a593Smuzhiyun
1739*4882a593Smuzhiyun ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd),
1740*4882a593Smuzhiyun DMA_TO_DEVICE);
1741*4882a593Smuzhiyun
1742*4882a593Smuzhiyun ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL);
1743*4882a593Smuzhiyun WARN_ON_ONCE(ret);
1744*4882a593Smuzhiyun }
1745*4882a593Smuzhiyun
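/*
 * Match an NVMe completion to its request. If the target performed remote
 * invalidation, verify it hit the rkey we registered; otherwise post a
 * LOCAL_INV work request and let its completion finish the request.
 */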
1746*4882a593Smuzhiyun static void nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
1747*4882a593Smuzhiyun struct nvme_completion *cqe, struct ib_wc *wc)
1748*4882a593Smuzhiyun {
1749*4882a593Smuzhiyun struct request *rq;
1750*4882a593Smuzhiyun struct nvme_rdma_request *req;
1751*4882a593Smuzhiyun
1752*4882a593Smuzhiyun rq = nvme_find_rq(nvme_rdma_tagset(queue), cqe->command_id);
1753*4882a593Smuzhiyun if (!rq) {
1754*4882a593Smuzhiyun dev_err(queue->ctrl->ctrl.device,
1755*4882a593Smuzhiyun "got bad command_id %#x on QP %#x\n",
1756*4882a593Smuzhiyun cqe->command_id, queue->qp->qp_num);
1757*4882a593Smuzhiyun nvme_rdma_error_recovery(queue->ctrl);
1758*4882a593Smuzhiyun return;
1759*4882a593Smuzhiyun }
1760*4882a593Smuzhiyun req = blk_mq_rq_to_pdu(rq);
1761*4882a593Smuzhiyun
1762*4882a593Smuzhiyun req->status = cqe->status;
1763*4882a593Smuzhiyun req->result = cqe->result;
1764*4882a593Smuzhiyun
1765*4882a593Smuzhiyun if (wc->wc_flags & IB_WC_WITH_INVALIDATE) {
1766*4882a593Smuzhiyun if (unlikely(!req->mr ||
1767*4882a593Smuzhiyun wc->ex.invalidate_rkey != req->mr->rkey)) {
1768*4882a593Smuzhiyun dev_err(queue->ctrl->ctrl.device,
1769*4882a593Smuzhiyun "Bogus remote invalidation for rkey %#x\n",
1770*4882a593Smuzhiyun req->mr ? req->mr->rkey : 0);
1771*4882a593Smuzhiyun nvme_rdma_error_recovery(queue->ctrl);
1772*4882a593Smuzhiyun }
1773*4882a593Smuzhiyun } else if (req->mr) {
1774*4882a593Smuzhiyun int ret;
1775*4882a593Smuzhiyun
1776*4882a593Smuzhiyun ret = nvme_rdma_inv_rkey(queue, req);
1777*4882a593Smuzhiyun if (unlikely(ret < 0)) {
1778*4882a593Smuzhiyun dev_err(queue->ctrl->ctrl.device,
1779*4882a593Smuzhiyun "Queueing INV WR for rkey %#x failed (%d)\n",
1780*4882a593Smuzhiyun req->mr->rkey, ret);
1781*4882a593Smuzhiyun nvme_rdma_error_recovery(queue->ctrl);
1782*4882a593Smuzhiyun }
1783*4882a593Smuzhiyun /* the local invalidation completion will end the request */
1784*4882a593Smuzhiyun return;
1785*4882a593Smuzhiyun }
1786*4882a593Smuzhiyun
1787*4882a593Smuzhiyun nvme_rdma_end_request(req);
1788*4882a593Smuzhiyun }
1789*4882a593Smuzhiyun
1790*4882a593Smuzhiyun static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
1791*4882a593Smuzhiyun {
1792*4882a593Smuzhiyun struct nvme_rdma_qe *qe =
1793*4882a593Smuzhiyun container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
1794*4882a593Smuzhiyun struct nvme_rdma_queue *queue = wc->qp->qp_context;
1795*4882a593Smuzhiyun struct ib_device *ibdev = queue->device->dev;
1796*4882a593Smuzhiyun struct nvme_completion *cqe = qe->data;
1797*4882a593Smuzhiyun const size_t len = sizeof(struct nvme_completion);
1798*4882a593Smuzhiyun
1799*4882a593Smuzhiyun if (unlikely(wc->status != IB_WC_SUCCESS)) {
1800*4882a593Smuzhiyun nvme_rdma_wr_error(cq, wc, "RECV");
1801*4882a593Smuzhiyun return;
1802*4882a593Smuzhiyun }
1803*4882a593Smuzhiyun
1804*4882a593Smuzhiyun /* sanity checking for received data length */
1805*4882a593Smuzhiyun if (unlikely(wc->byte_len < len)) {
1806*4882a593Smuzhiyun dev_err(queue->ctrl->ctrl.device,
1807*4882a593Smuzhiyun 			"Unexpected nvme completion length (%d)\n", wc->byte_len);
1808*4882a593Smuzhiyun nvme_rdma_error_recovery(queue->ctrl);
1809*4882a593Smuzhiyun return;
1810*4882a593Smuzhiyun }
1811*4882a593Smuzhiyun
1812*4882a593Smuzhiyun ib_dma_sync_single_for_cpu(ibdev, qe->dma, len, DMA_FROM_DEVICE);
1813*4882a593Smuzhiyun /*
1814*4882a593Smuzhiyun * AEN requests are special as they don't time out and can
1815*4882a593Smuzhiyun * survive any kind of queue freeze and often don't respond to
1816*4882a593Smuzhiyun * aborts. We don't even bother to allocate a struct request
1817*4882a593Smuzhiyun * for them but rather special case them here.
1818*4882a593Smuzhiyun */
1819*4882a593Smuzhiyun if (unlikely(nvme_is_aen_req(nvme_rdma_queue_idx(queue),
1820*4882a593Smuzhiyun cqe->command_id)))
1821*4882a593Smuzhiyun nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
1822*4882a593Smuzhiyun &cqe->result);
1823*4882a593Smuzhiyun else
1824*4882a593Smuzhiyun nvme_rdma_process_nvme_rsp(queue, cqe, wc);
1825*4882a593Smuzhiyun ib_dma_sync_single_for_device(ibdev, qe->dma, len, DMA_FROM_DEVICE);
1826*4882a593Smuzhiyun
1827*4882a593Smuzhiyun nvme_rdma_post_recv(queue, qe);
1828*4882a593Smuzhiyun }
1829*4882a593Smuzhiyun
1830*4882a593Smuzhiyun static int nvme_rdma_conn_established(struct nvme_rdma_queue *queue)
1831*4882a593Smuzhiyun {
1832*4882a593Smuzhiyun int ret, i;
1833*4882a593Smuzhiyun
1834*4882a593Smuzhiyun for (i = 0; i < queue->queue_size; i++) {
1835*4882a593Smuzhiyun ret = nvme_rdma_post_recv(queue, &queue->rsp_ring[i]);
1836*4882a593Smuzhiyun if (ret)
1837*4882a593Smuzhiyun return ret;
1838*4882a593Smuzhiyun }
1839*4882a593Smuzhiyun
1840*4882a593Smuzhiyun return 0;
1841*4882a593Smuzhiyun }
1842*4882a593Smuzhiyun
1843*4882a593Smuzhiyun static int nvme_rdma_conn_rejected(struct nvme_rdma_queue *queue,
1844*4882a593Smuzhiyun struct rdma_cm_event *ev)
1845*4882a593Smuzhiyun {
1846*4882a593Smuzhiyun struct rdma_cm_id *cm_id = queue->cm_id;
1847*4882a593Smuzhiyun int status = ev->status;
1848*4882a593Smuzhiyun const char *rej_msg;
1849*4882a593Smuzhiyun const struct nvme_rdma_cm_rej *rej_data;
1850*4882a593Smuzhiyun u8 rej_data_len;
1851*4882a593Smuzhiyun
1852*4882a593Smuzhiyun rej_msg = rdma_reject_msg(cm_id, status);
1853*4882a593Smuzhiyun rej_data = rdma_consumer_reject_data(cm_id, ev, &rej_data_len);
1854*4882a593Smuzhiyun
1855*4882a593Smuzhiyun if (rej_data && rej_data_len >= sizeof(u16)) {
1856*4882a593Smuzhiyun u16 sts = le16_to_cpu(rej_data->sts);
1857*4882a593Smuzhiyun
1858*4882a593Smuzhiyun dev_err(queue->ctrl->ctrl.device,
1859*4882a593Smuzhiyun "Connect rejected: status %d (%s) nvme status %d (%s).\n",
1860*4882a593Smuzhiyun status, rej_msg, sts, nvme_rdma_cm_msg(sts));
1861*4882a593Smuzhiyun } else {
1862*4882a593Smuzhiyun dev_err(queue->ctrl->ctrl.device,
1863*4882a593Smuzhiyun "Connect rejected: status %d (%s).\n", status, rej_msg);
1864*4882a593Smuzhiyun }
1865*4882a593Smuzhiyun
1866*4882a593Smuzhiyun return -ECONNRESET;
1867*4882a593Smuzhiyun }
1868*4882a593Smuzhiyun
1869*4882a593Smuzhiyun static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue)
1870*4882a593Smuzhiyun {
1871*4882a593Smuzhiyun struct nvme_ctrl *ctrl = &queue->ctrl->ctrl;
1872*4882a593Smuzhiyun int ret;
1873*4882a593Smuzhiyun
1874*4882a593Smuzhiyun ret = nvme_rdma_create_queue_ib(queue);
1875*4882a593Smuzhiyun if (ret)
1876*4882a593Smuzhiyun return ret;
1877*4882a593Smuzhiyun
1878*4882a593Smuzhiyun if (ctrl->opts->tos >= 0)
1879*4882a593Smuzhiyun rdma_set_service_type(queue->cm_id, ctrl->opts->tos);
1880*4882a593Smuzhiyun ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS);
1881*4882a593Smuzhiyun if (ret) {
1882*4882a593Smuzhiyun dev_err(ctrl->device, "rdma_resolve_route failed (%d).\n",
1883*4882a593Smuzhiyun 			ret);
1884*4882a593Smuzhiyun goto out_destroy_queue;
1885*4882a593Smuzhiyun }
1886*4882a593Smuzhiyun
1887*4882a593Smuzhiyun return 0;
1888*4882a593Smuzhiyun
1889*4882a593Smuzhiyun out_destroy_queue:
1890*4882a593Smuzhiyun nvme_rdma_destroy_queue_ib(queue);
1891*4882a593Smuzhiyun return ret;
1892*4882a593Smuzhiyun }
1893*4882a593Smuzhiyun
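/*
 * Route is resolved: issue the RDMA CM connect. The private data carries
 * the NVMe/RDMA CM request (queue id and host queue sizes); the admin
 * queue advertises the fixed NVME_AQ_DEPTH while I/O queues advertise
 * their negotiated queue size.
 */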
1894*4882a593Smuzhiyun static int nvme_rdma_route_resolved(struct nvme_rdma_queue *queue)
1895*4882a593Smuzhiyun {
1896*4882a593Smuzhiyun struct nvme_rdma_ctrl *ctrl = queue->ctrl;
1897*4882a593Smuzhiyun struct rdma_conn_param param = { };
1898*4882a593Smuzhiyun struct nvme_rdma_cm_req priv = { };
1899*4882a593Smuzhiyun int ret;
1900*4882a593Smuzhiyun
1901*4882a593Smuzhiyun param.qp_num = queue->qp->qp_num;
1902*4882a593Smuzhiyun param.flow_control = 1;
1903*4882a593Smuzhiyun
1904*4882a593Smuzhiyun param.responder_resources = queue->device->dev->attrs.max_qp_rd_atom;
1905*4882a593Smuzhiyun /* maximum retry count */
1906*4882a593Smuzhiyun param.retry_count = 7;
1907*4882a593Smuzhiyun param.rnr_retry_count = 7;
1908*4882a593Smuzhiyun param.private_data = &priv;
1909*4882a593Smuzhiyun param.private_data_len = sizeof(priv);
1910*4882a593Smuzhiyun
1911*4882a593Smuzhiyun priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
1912*4882a593Smuzhiyun priv.qid = cpu_to_le16(nvme_rdma_queue_idx(queue));
1913*4882a593Smuzhiyun /*
1914*4882a593Smuzhiyun * set the admin queue depth to the minimum size
1915*4882a593Smuzhiyun * specified by the Fabrics standard.
1916*4882a593Smuzhiyun */
1917*4882a593Smuzhiyun if (priv.qid == 0) {
1918*4882a593Smuzhiyun priv.hrqsize = cpu_to_le16(NVME_AQ_DEPTH);
1919*4882a593Smuzhiyun priv.hsqsize = cpu_to_le16(NVME_AQ_DEPTH - 1);
1920*4882a593Smuzhiyun } else {
1921*4882a593Smuzhiyun 		/*
1922*4882a593Smuzhiyun 		 * The current interpretation of the fabrics spec is that
1923*4882a593Smuzhiyun 		 * hrqsize must be at least sqsize + 1, i.e. the 1's-based
1924*4882a593Smuzhiyun 		 * representation of sqsize.
1925*4882a593Smuzhiyun 		 */
1926*4882a593Smuzhiyun priv.hrqsize = cpu_to_le16(queue->queue_size);
1927*4882a593Smuzhiyun priv.hsqsize = cpu_to_le16(queue->ctrl->ctrl.sqsize);
1928*4882a593Smuzhiyun }
1929*4882a593Smuzhiyun
1930*4882a593Smuzhiyun ret = rdma_connect_locked(queue->cm_id, ¶m);
1931*4882a593Smuzhiyun if (ret) {
1932*4882a593Smuzhiyun dev_err(ctrl->ctrl.device,
1933*4882a593Smuzhiyun "rdma_connect_locked failed (%d).\n", ret);
1934*4882a593Smuzhiyun return ret;
1935*4882a593Smuzhiyun }
1936*4882a593Smuzhiyun
1937*4882a593Smuzhiyun return 0;
1938*4882a593Smuzhiyun }
1939*4882a593Smuzhiyun
1940*4882a593Smuzhiyun static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
1941*4882a593Smuzhiyun struct rdma_cm_event *ev)
1942*4882a593Smuzhiyun {
1943*4882a593Smuzhiyun struct nvme_rdma_queue *queue = cm_id->context;
1944*4882a593Smuzhiyun int cm_error = 0;
1945*4882a593Smuzhiyun
1946*4882a593Smuzhiyun dev_dbg(queue->ctrl->ctrl.device, "%s (%d): status %d id %p\n",
1947*4882a593Smuzhiyun rdma_event_msg(ev->event), ev->event,
1948*4882a593Smuzhiyun ev->status, cm_id);
1949*4882a593Smuzhiyun
1950*4882a593Smuzhiyun switch (ev->event) {
1951*4882a593Smuzhiyun case RDMA_CM_EVENT_ADDR_RESOLVED:
1952*4882a593Smuzhiyun cm_error = nvme_rdma_addr_resolved(queue);
1953*4882a593Smuzhiyun break;
1954*4882a593Smuzhiyun case RDMA_CM_EVENT_ROUTE_RESOLVED:
1955*4882a593Smuzhiyun cm_error = nvme_rdma_route_resolved(queue);
1956*4882a593Smuzhiyun break;
1957*4882a593Smuzhiyun case RDMA_CM_EVENT_ESTABLISHED:
1958*4882a593Smuzhiyun queue->cm_error = nvme_rdma_conn_established(queue);
1959*4882a593Smuzhiyun /* complete cm_done regardless of success/failure */
1960*4882a593Smuzhiyun complete(&queue->cm_done);
1961*4882a593Smuzhiyun return 0;
1962*4882a593Smuzhiyun case RDMA_CM_EVENT_REJECTED:
1963*4882a593Smuzhiyun cm_error = nvme_rdma_conn_rejected(queue, ev);
1964*4882a593Smuzhiyun break;
1965*4882a593Smuzhiyun case RDMA_CM_EVENT_ROUTE_ERROR:
1966*4882a593Smuzhiyun case RDMA_CM_EVENT_CONNECT_ERROR:
1967*4882a593Smuzhiyun case RDMA_CM_EVENT_UNREACHABLE:
1968*4882a593Smuzhiyun case RDMA_CM_EVENT_ADDR_ERROR:
1969*4882a593Smuzhiyun dev_dbg(queue->ctrl->ctrl.device,
1970*4882a593Smuzhiyun "CM error event %d\n", ev->event);
1971*4882a593Smuzhiyun cm_error = -ECONNRESET;
1972*4882a593Smuzhiyun break;
1973*4882a593Smuzhiyun case RDMA_CM_EVENT_DISCONNECTED:
1974*4882a593Smuzhiyun case RDMA_CM_EVENT_ADDR_CHANGE:
1975*4882a593Smuzhiyun case RDMA_CM_EVENT_TIMEWAIT_EXIT:
1976*4882a593Smuzhiyun dev_dbg(queue->ctrl->ctrl.device,
1977*4882a593Smuzhiyun "disconnect received - connection closed\n");
1978*4882a593Smuzhiyun nvme_rdma_error_recovery(queue->ctrl);
1979*4882a593Smuzhiyun break;
1980*4882a593Smuzhiyun case RDMA_CM_EVENT_DEVICE_REMOVAL:
1981*4882a593Smuzhiyun /* device removal is handled via the ib_client API */
1982*4882a593Smuzhiyun break;
1983*4882a593Smuzhiyun default:
1984*4882a593Smuzhiyun dev_err(queue->ctrl->ctrl.device,
1985*4882a593Smuzhiyun "Unexpected RDMA CM event (%d)\n", ev->event);
1986*4882a593Smuzhiyun nvme_rdma_error_recovery(queue->ctrl);
1987*4882a593Smuzhiyun break;
1988*4882a593Smuzhiyun }
1989*4882a593Smuzhiyun
1990*4882a593Smuzhiyun if (cm_error) {
1991*4882a593Smuzhiyun queue->cm_error = cm_error;
1992*4882a593Smuzhiyun complete(&queue->cm_done);
1993*4882a593Smuzhiyun }
1994*4882a593Smuzhiyun
1995*4882a593Smuzhiyun return 0;
1996*4882a593Smuzhiyun }
1997*4882a593Smuzhiyun
1998*4882a593Smuzhiyun static void nvme_rdma_complete_timed_out(struct request *rq)
1999*4882a593Smuzhiyun {
2000*4882a593Smuzhiyun struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
2001*4882a593Smuzhiyun struct nvme_rdma_queue *queue = req->queue;
2002*4882a593Smuzhiyun
2003*4882a593Smuzhiyun nvme_rdma_stop_queue(queue);
2004*4882a593Smuzhiyun if (blk_mq_request_started(rq) && !blk_mq_request_completed(rq)) {
2005*4882a593Smuzhiyun nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD;
2006*4882a593Smuzhiyun blk_mq_complete_request(rq);
2007*4882a593Smuzhiyun }
2008*4882a593Smuzhiyun }
2009*4882a593Smuzhiyun
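/*
 * blk-mq timeout handler. Outside the LIVE state the request is completed
 * immediately with a host-aborted status so it cannot block controller
 * setup or teardown; in the LIVE state the timeout kicks off error
 * recovery and the timer is reset, letting recovery cancel the request.
 */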
2010*4882a593Smuzhiyun static enum blk_eh_timer_return
2011*4882a593Smuzhiyun nvme_rdma_timeout(struct request *rq, bool reserved)
2012*4882a593Smuzhiyun {
2013*4882a593Smuzhiyun struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
2014*4882a593Smuzhiyun struct nvme_rdma_queue *queue = req->queue;
2015*4882a593Smuzhiyun struct nvme_rdma_ctrl *ctrl = queue->ctrl;
2016*4882a593Smuzhiyun
2017*4882a593Smuzhiyun dev_warn(ctrl->ctrl.device, "I/O %d QID %d timeout\n",
2018*4882a593Smuzhiyun rq->tag, nvme_rdma_queue_idx(queue));
2019*4882a593Smuzhiyun
2020*4882a593Smuzhiyun if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
2021*4882a593Smuzhiyun /*
2022*4882a593Smuzhiyun * If we are resetting, connecting or deleting we should
2023*4882a593Smuzhiyun * complete immediately because we may block controller
2024*4882a593Smuzhiyun * teardown or setup sequence
2025*4882a593Smuzhiyun * - ctrl disable/shutdown fabrics requests
2026*4882a593Smuzhiyun * - connect requests
2027*4882a593Smuzhiyun * - initialization admin requests
2028*4882a593Smuzhiyun * - I/O requests that entered after unquiescing and
2029*4882a593Smuzhiyun * the controller stopped responding
2030*4882a593Smuzhiyun *
2031*4882a593Smuzhiyun * All other requests should be cancelled by the error
2032*4882a593Smuzhiyun * recovery work, so it's fine that we fail it here.
2033*4882a593Smuzhiyun */
2034*4882a593Smuzhiyun nvme_rdma_complete_timed_out(rq);
2035*4882a593Smuzhiyun return BLK_EH_DONE;
2036*4882a593Smuzhiyun }
2037*4882a593Smuzhiyun
2038*4882a593Smuzhiyun /*
2039*4882a593Smuzhiyun * LIVE state should trigger the normal error recovery which will
2040*4882a593Smuzhiyun * handle completing this request.
2041*4882a593Smuzhiyun */
2042*4882a593Smuzhiyun nvme_rdma_error_recovery(ctrl);
2043*4882a593Smuzhiyun return BLK_EH_RESET_TIMER;
2044*4882a593Smuzhiyun }
2045*4882a593Smuzhiyun
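/*
 * .queue_rq: map the SQE for DMA, build the NVMe command, decide whether
 * this I/O needs a signature (PI) MR, map the data, and post the SEND
 * (chained after the MR registration WR when one is required).
 */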
2046*4882a593Smuzhiyun static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
2047*4882a593Smuzhiyun const struct blk_mq_queue_data *bd)
2048*4882a593Smuzhiyun {
2049*4882a593Smuzhiyun struct nvme_ns *ns = hctx->queue->queuedata;
2050*4882a593Smuzhiyun struct nvme_rdma_queue *queue = hctx->driver_data;
2051*4882a593Smuzhiyun struct request *rq = bd->rq;
2052*4882a593Smuzhiyun struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
2053*4882a593Smuzhiyun struct nvme_rdma_qe *sqe = &req->sqe;
2054*4882a593Smuzhiyun struct nvme_command *c = sqe->data;
2055*4882a593Smuzhiyun struct ib_device *dev;
2056*4882a593Smuzhiyun bool queue_ready = test_bit(NVME_RDMA_Q_LIVE, &queue->flags);
2057*4882a593Smuzhiyun blk_status_t ret;
2058*4882a593Smuzhiyun int err;
2059*4882a593Smuzhiyun
2060*4882a593Smuzhiyun WARN_ON_ONCE(rq->tag < 0);
2061*4882a593Smuzhiyun
2062*4882a593Smuzhiyun if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
2063*4882a593Smuzhiyun return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
2064*4882a593Smuzhiyun
2065*4882a593Smuzhiyun dev = queue->device->dev;
2066*4882a593Smuzhiyun
2067*4882a593Smuzhiyun req->sqe.dma = ib_dma_map_single(dev, req->sqe.data,
2068*4882a593Smuzhiyun sizeof(struct nvme_command),
2069*4882a593Smuzhiyun DMA_TO_DEVICE);
2070*4882a593Smuzhiyun err = ib_dma_mapping_error(dev, req->sqe.dma);
2071*4882a593Smuzhiyun if (unlikely(err))
2072*4882a593Smuzhiyun return BLK_STS_RESOURCE;
2073*4882a593Smuzhiyun
2074*4882a593Smuzhiyun ib_dma_sync_single_for_cpu(dev, sqe->dma,
2075*4882a593Smuzhiyun sizeof(struct nvme_command), DMA_TO_DEVICE);
2076*4882a593Smuzhiyun
2077*4882a593Smuzhiyun ret = nvme_setup_cmd(ns, rq, c);
2078*4882a593Smuzhiyun if (ret)
2079*4882a593Smuzhiyun goto unmap_qe;
2080*4882a593Smuzhiyun
2081*4882a593Smuzhiyun blk_mq_start_request(rq);
2082*4882a593Smuzhiyun
2083*4882a593Smuzhiyun if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
2084*4882a593Smuzhiyun queue->pi_support &&
2085*4882a593Smuzhiyun (c->common.opcode == nvme_cmd_write ||
2086*4882a593Smuzhiyun c->common.opcode == nvme_cmd_read) &&
2087*4882a593Smuzhiyun nvme_ns_has_pi(ns))
2088*4882a593Smuzhiyun req->use_sig_mr = true;
2089*4882a593Smuzhiyun else
2090*4882a593Smuzhiyun req->use_sig_mr = false;
2091*4882a593Smuzhiyun
2092*4882a593Smuzhiyun err = nvme_rdma_map_data(queue, rq, c);
2093*4882a593Smuzhiyun if (unlikely(err < 0)) {
2094*4882a593Smuzhiyun dev_err(queue->ctrl->ctrl.device,
2095*4882a593Smuzhiyun "Failed to map data (%d)\n", err);
2096*4882a593Smuzhiyun goto err;
2097*4882a593Smuzhiyun }
2098*4882a593Smuzhiyun
2099*4882a593Smuzhiyun sqe->cqe.done = nvme_rdma_send_done;
2100*4882a593Smuzhiyun
2101*4882a593Smuzhiyun ib_dma_sync_single_for_device(dev, sqe->dma,
2102*4882a593Smuzhiyun sizeof(struct nvme_command), DMA_TO_DEVICE);
2103*4882a593Smuzhiyun
2104*4882a593Smuzhiyun err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge,
2105*4882a593Smuzhiyun req->mr ? &req->reg_wr.wr : NULL);
2106*4882a593Smuzhiyun if (unlikely(err))
2107*4882a593Smuzhiyun goto err_unmap;
2108*4882a593Smuzhiyun
2109*4882a593Smuzhiyun return BLK_STS_OK;
2110*4882a593Smuzhiyun
2111*4882a593Smuzhiyun err_unmap:
2112*4882a593Smuzhiyun nvme_rdma_unmap_data(queue, rq);
2113*4882a593Smuzhiyun err:
2114*4882a593Smuzhiyun if (err == -ENOMEM || err == -EAGAIN)
2115*4882a593Smuzhiyun ret = BLK_STS_RESOURCE;
2116*4882a593Smuzhiyun else
2117*4882a593Smuzhiyun ret = BLK_STS_IOERR;
2118*4882a593Smuzhiyun nvme_cleanup_cmd(rq);
2119*4882a593Smuzhiyun unmap_qe:
2120*4882a593Smuzhiyun ib_dma_unmap_single(dev, req->sqe.dma, sizeof(struct nvme_command),
2121*4882a593Smuzhiyun DMA_TO_DEVICE);
2122*4882a593Smuzhiyun return ret;
2123*4882a593Smuzhiyun }
2124*4882a593Smuzhiyun
2125*4882a593Smuzhiyun static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx)
2126*4882a593Smuzhiyun {
2127*4882a593Smuzhiyun struct nvme_rdma_queue *queue = hctx->driver_data;
2128*4882a593Smuzhiyun
2129*4882a593Smuzhiyun return ib_process_cq_direct(queue->ib_cq, -1);
2130*4882a593Smuzhiyun }
2131*4882a593Smuzhiyun
2132*4882a593Smuzhiyun static void nvme_rdma_check_pi_status(struct nvme_rdma_request *req)
2133*4882a593Smuzhiyun {
2134*4882a593Smuzhiyun struct request *rq = blk_mq_rq_from_pdu(req);
2135*4882a593Smuzhiyun struct ib_mr_status mr_status;
2136*4882a593Smuzhiyun int ret;
2137*4882a593Smuzhiyun
2138*4882a593Smuzhiyun ret = ib_check_mr_status(req->mr, IB_MR_CHECK_SIG_STATUS, &mr_status);
2139*4882a593Smuzhiyun if (ret) {
2140*4882a593Smuzhiyun pr_err("ib_check_mr_status failed, ret %d\n", ret);
2141*4882a593Smuzhiyun nvme_req(rq)->status = NVME_SC_INVALID_PI;
2142*4882a593Smuzhiyun return;
2143*4882a593Smuzhiyun }
2144*4882a593Smuzhiyun
2145*4882a593Smuzhiyun if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) {
2146*4882a593Smuzhiyun switch (mr_status.sig_err.err_type) {
2147*4882a593Smuzhiyun case IB_SIG_BAD_GUARD:
2148*4882a593Smuzhiyun nvme_req(rq)->status = NVME_SC_GUARD_CHECK;
2149*4882a593Smuzhiyun break;
2150*4882a593Smuzhiyun case IB_SIG_BAD_REFTAG:
2151*4882a593Smuzhiyun nvme_req(rq)->status = NVME_SC_REFTAG_CHECK;
2152*4882a593Smuzhiyun break;
2153*4882a593Smuzhiyun case IB_SIG_BAD_APPTAG:
2154*4882a593Smuzhiyun nvme_req(rq)->status = NVME_SC_APPTAG_CHECK;
2155*4882a593Smuzhiyun break;
2156*4882a593Smuzhiyun }
2157*4882a593Smuzhiyun pr_err("PI error found type %d expected 0x%x vs actual 0x%x\n",
2158*4882a593Smuzhiyun mr_status.sig_err.err_type, mr_status.sig_err.expected,
2159*4882a593Smuzhiyun mr_status.sig_err.actual);
2160*4882a593Smuzhiyun }
2161*4882a593Smuzhiyun }
2162*4882a593Smuzhiyun
2163*4882a593Smuzhiyun static void nvme_rdma_complete_rq(struct request *rq)
2164*4882a593Smuzhiyun {
2165*4882a593Smuzhiyun struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
2166*4882a593Smuzhiyun struct nvme_rdma_queue *queue = req->queue;
2167*4882a593Smuzhiyun struct ib_device *ibdev = queue->device->dev;
2168*4882a593Smuzhiyun
2169*4882a593Smuzhiyun if (req->use_sig_mr)
2170*4882a593Smuzhiyun nvme_rdma_check_pi_status(req);
2171*4882a593Smuzhiyun
2172*4882a593Smuzhiyun nvme_rdma_unmap_data(queue, rq);
2173*4882a593Smuzhiyun ib_dma_unmap_single(ibdev, req->sqe.dma, sizeof(struct nvme_command),
2174*4882a593Smuzhiyun DMA_TO_DEVICE);
2175*4882a593Smuzhiyun nvme_complete_rq(rq);
2176*4882a593Smuzhiyun }
2177*4882a593Smuzhiyun
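/*
 * Spread blk-mq hardware contexts over the RDMA completion vectors:
 * default and read queue maps are laid out back to back (shared when no
 * dedicated write queues were requested), and poll queues, if any, are
 * appended after them.
 */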
2178*4882a593Smuzhiyun static int nvme_rdma_map_queues(struct blk_mq_tag_set *set)
2179*4882a593Smuzhiyun {
2180*4882a593Smuzhiyun struct nvme_rdma_ctrl *ctrl = set->driver_data;
2181*4882a593Smuzhiyun struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
2182*4882a593Smuzhiyun
2183*4882a593Smuzhiyun if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
2184*4882a593Smuzhiyun /* separate read/write queues */
2185*4882a593Smuzhiyun set->map[HCTX_TYPE_DEFAULT].nr_queues =
2186*4882a593Smuzhiyun ctrl->io_queues[HCTX_TYPE_DEFAULT];
2187*4882a593Smuzhiyun set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
2188*4882a593Smuzhiyun set->map[HCTX_TYPE_READ].nr_queues =
2189*4882a593Smuzhiyun ctrl->io_queues[HCTX_TYPE_READ];
2190*4882a593Smuzhiyun set->map[HCTX_TYPE_READ].queue_offset =
2191*4882a593Smuzhiyun ctrl->io_queues[HCTX_TYPE_DEFAULT];
2192*4882a593Smuzhiyun } else {
2193*4882a593Smuzhiyun /* shared read/write queues */
2194*4882a593Smuzhiyun set->map[HCTX_TYPE_DEFAULT].nr_queues =
2195*4882a593Smuzhiyun ctrl->io_queues[HCTX_TYPE_DEFAULT];
2196*4882a593Smuzhiyun set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
2197*4882a593Smuzhiyun set->map[HCTX_TYPE_READ].nr_queues =
2198*4882a593Smuzhiyun ctrl->io_queues[HCTX_TYPE_DEFAULT];
2199*4882a593Smuzhiyun set->map[HCTX_TYPE_READ].queue_offset = 0;
2200*4882a593Smuzhiyun }
2201*4882a593Smuzhiyun blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_DEFAULT],
2202*4882a593Smuzhiyun ctrl->device->dev, 0);
2203*4882a593Smuzhiyun blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_READ],
2204*4882a593Smuzhiyun ctrl->device->dev, 0);
2205*4882a593Smuzhiyun
2206*4882a593Smuzhiyun if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
2207*4882a593Smuzhiyun /* map dedicated poll queues only if we have queues left */
2208*4882a593Smuzhiyun set->map[HCTX_TYPE_POLL].nr_queues =
2209*4882a593Smuzhiyun ctrl->io_queues[HCTX_TYPE_POLL];
2210*4882a593Smuzhiyun set->map[HCTX_TYPE_POLL].queue_offset =
2211*4882a593Smuzhiyun ctrl->io_queues[HCTX_TYPE_DEFAULT] +
2212*4882a593Smuzhiyun ctrl->io_queues[HCTX_TYPE_READ];
2213*4882a593Smuzhiyun blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
2214*4882a593Smuzhiyun }
2215*4882a593Smuzhiyun
2216*4882a593Smuzhiyun dev_info(ctrl->ctrl.device,
2217*4882a593Smuzhiyun "mapped %d/%d/%d default/read/poll queues.\n",
2218*4882a593Smuzhiyun ctrl->io_queues[HCTX_TYPE_DEFAULT],
2219*4882a593Smuzhiyun ctrl->io_queues[HCTX_TYPE_READ],
2220*4882a593Smuzhiyun ctrl->io_queues[HCTX_TYPE_POLL]);
2221*4882a593Smuzhiyun
2222*4882a593Smuzhiyun return 0;
2223*4882a593Smuzhiyun }
2224*4882a593Smuzhiyun
2225*4882a593Smuzhiyun static const struct blk_mq_ops nvme_rdma_mq_ops = {
2226*4882a593Smuzhiyun .queue_rq = nvme_rdma_queue_rq,
2227*4882a593Smuzhiyun .complete = nvme_rdma_complete_rq,
2228*4882a593Smuzhiyun .init_request = nvme_rdma_init_request,
2229*4882a593Smuzhiyun .exit_request = nvme_rdma_exit_request,
2230*4882a593Smuzhiyun .init_hctx = nvme_rdma_init_hctx,
2231*4882a593Smuzhiyun .timeout = nvme_rdma_timeout,
2232*4882a593Smuzhiyun .map_queues = nvme_rdma_map_queues,
2233*4882a593Smuzhiyun .poll = nvme_rdma_poll,
2234*4882a593Smuzhiyun };
2235*4882a593Smuzhiyun
2236*4882a593Smuzhiyun static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
2237*4882a593Smuzhiyun .queue_rq = nvme_rdma_queue_rq,
2238*4882a593Smuzhiyun .complete = nvme_rdma_complete_rq,
2239*4882a593Smuzhiyun .init_request = nvme_rdma_init_request,
2240*4882a593Smuzhiyun .exit_request = nvme_rdma_exit_request,
2241*4882a593Smuzhiyun .init_hctx = nvme_rdma_init_admin_hctx,
2242*4882a593Smuzhiyun .timeout = nvme_rdma_timeout,
2243*4882a593Smuzhiyun };
2244*4882a593Smuzhiyun
2245*4882a593Smuzhiyun static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
2246*4882a593Smuzhiyun {
2247*4882a593Smuzhiyun nvme_rdma_teardown_io_queues(ctrl, shutdown);
2248*4882a593Smuzhiyun blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
2249*4882a593Smuzhiyun if (shutdown)
2250*4882a593Smuzhiyun nvme_shutdown_ctrl(&ctrl->ctrl);
2251*4882a593Smuzhiyun else
2252*4882a593Smuzhiyun nvme_disable_ctrl(&ctrl->ctrl);
2253*4882a593Smuzhiyun nvme_rdma_teardown_admin_queue(ctrl, shutdown);
2254*4882a593Smuzhiyun }
2255*4882a593Smuzhiyun
static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl)
{
	nvme_rdma_shutdown_ctrl(to_rdma_ctrl(ctrl), true);
}

static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
{
	struct nvme_rdma_ctrl *ctrl =
		container_of(work, struct nvme_rdma_ctrl, ctrl.reset_work);

	nvme_stop_ctrl(&ctrl->ctrl);
	nvme_rdma_shutdown_ctrl(ctrl, false);

	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
		/* state change failure should never happen */
		WARN_ON_ONCE(1);
		return;
	}

	if (nvme_rdma_setup_ctrl(ctrl, false))
		goto out_fail;

	return;

out_fail:
	++ctrl->ctrl.nr_reconnects;
	nvme_rdma_reconnect_or_remove(ctrl);
}

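/*
 * Illustrative note (an assumption about surrounding code, not taken
 * from this section): a reset normally walks the controller through
 * RESETTING -> CONNECTING -> LIVE. If nvme_rdma_setup_ctrl() fails,
 * the work above falls back to the reconnect path, which either
 * schedules another attempt after the reconnect delay or removes the
 * controller once nr_reconnects exceeds the configured limit.
 */
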
static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
	.name			= "rdma",
	.module			= THIS_MODULE,
	.flags			= NVME_F_FABRICS | NVME_F_METADATA_SUPPORTED,
	.reg_read32		= nvmf_reg_read32,
	.reg_read64		= nvmf_reg_read64,
	.reg_write32		= nvmf_reg_write32,
	.free_ctrl		= nvme_rdma_free_ctrl,
	.submit_async_event	= nvme_rdma_submit_async_event,
	.delete_ctrl		= nvme_rdma_delete_ctrl,
	.get_address		= nvmf_get_address,
	.stop_ctrl		= nvme_rdma_stop_ctrl,
};

/*
 * Fails a connection request if it matches an existing controller
 * (association) with the same tuple:
 * <Host NQN, Host ID, local address, remote address, remote port, SUBSYS NQN>
 *
 * If a local address is not specified in the request, it will match an
 * existing controller that has all the other parameters the same and
 * likewise has no local address specified.
 *
 * The ports don't need to be compared as they are intrinsically
 * already matched by the port pointers supplied.
 */
static bool
nvme_rdma_existing_controller(struct nvmf_ctrl_options *opts)
{
	struct nvme_rdma_ctrl *ctrl;
	bool found = false;

	mutex_lock(&nvme_rdma_ctrl_mutex);
	list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) {
		found = nvmf_ip_options_match(&ctrl->ctrl, opts);
		if (found)
			break;
	}
	mutex_unlock(&nvme_rdma_ctrl_mutex);

	return found;
}

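/*
 * Illustrative example (assumption, not part of the original code):
 * issuing the same fabrics connect twice from userspace, e.g. with
 * nvme-cli
 *
 *   nvme connect -t rdma -a 192.168.1.20 -s 4420 \
 *                -n nqn.2016-06.io.example:sub1
 *
 * without requesting a duplicate connection makes the second attempt
 * fail in nvme_rdma_create_ctrl() with -EALREADY, because the tuple
 * above matches the controller created by the first connect.
 */
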
static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
		struct nvmf_ctrl_options *opts)
{
	struct nvme_rdma_ctrl *ctrl;
	int ret;
	bool changed;

	ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
	if (!ctrl)
		return ERR_PTR(-ENOMEM);
	ctrl->ctrl.opts = opts;
	INIT_LIST_HEAD(&ctrl->list);

	if (!(opts->mask & NVMF_OPT_TRSVCID)) {
		/* default to the well-known NVMe/RDMA port */
		opts->trsvcid =
			kstrdup(__stringify(NVME_RDMA_IP_PORT), GFP_KERNEL);
		if (!opts->trsvcid) {
			ret = -ENOMEM;
			goto out_free_ctrl;
		}
		opts->mask |= NVMF_OPT_TRSVCID;
	}

	ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
			opts->traddr, opts->trsvcid, &ctrl->addr);
	if (ret) {
		pr_err("malformed address passed: %s:%s\n",
			opts->traddr, opts->trsvcid);
		goto out_free_ctrl;
	}

	if (opts->mask & NVMF_OPT_HOST_TRADDR) {
		ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
				opts->host_traddr, NULL, &ctrl->src_addr);
		if (ret) {
			pr_err("malformed src address passed: %s\n",
				opts->host_traddr);
			goto out_free_ctrl;
		}
	}

	if (!opts->duplicate_connect && nvme_rdma_existing_controller(opts)) {
		ret = -EALREADY;
		goto out_free_ctrl;
	}

	INIT_DELAYED_WORK(&ctrl->reconnect_work,
			nvme_rdma_reconnect_ctrl_work);
	INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work);
	INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work);

	/* one extra slot for the admin queue */
	ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
				opts->nr_poll_queues + 1;
	ctrl->ctrl.sqsize = opts->queue_size - 1;
	ctrl->ctrl.kato = opts->kato;

	ret = -ENOMEM;
	ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
				GFP_KERNEL);
	if (!ctrl->queues)
		goto out_free_ctrl;

	ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops,
				0 /* no quirks, we're perfect! */);
	if (ret)
		goto out_kfree_queues;

	changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING);
	WARN_ON_ONCE(!changed);

	ret = nvme_rdma_setup_ctrl(ctrl, true);
	if (ret)
		goto out_uninit_ctrl;

	dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n",
		ctrl->ctrl.opts->subsysnqn, &ctrl->addr);

	mutex_lock(&nvme_rdma_ctrl_mutex);
	list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list);
	mutex_unlock(&nvme_rdma_ctrl_mutex);

	return &ctrl->ctrl;

out_uninit_ctrl:
	nvme_uninit_ctrl(&ctrl->ctrl);
	nvme_put_ctrl(&ctrl->ctrl);
	if (ret > 0)
		ret = -EIO;
	return ERR_PTR(ret);
out_kfree_queues:
	kfree(ctrl->queues);
out_free_ctrl:
	kfree(ctrl);
	return ERR_PTR(ret);
}

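/*
 * Illustrative example (assumption, values are hypothetical): with
 * connect options nr_io_queues=4, nr_write_queues=2, nr_poll_queues=2
 * and queue_size=128, nvme_rdma_create_ctrl() sizes the controller as
 *
 *   queue_count = 4 + 2 + 2 + 1 = 9   (the extra slot is the admin queue)
 *   sqsize      = 128 - 1     = 127   (sqsize is a zero-based value)
 *
 * and allocates ctrl->queues[] with queue_count entries accordingly.
 */
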
static struct nvmf_transport_ops nvme_rdma_transport = {
	.name		= "rdma",
	.module		= THIS_MODULE,
	.required_opts	= NVMF_OPT_TRADDR,
	.allowed_opts	= NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
			  NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
			  NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
			  NVMF_OPT_TOS,
	.create_ctrl	= nvme_rdma_create_ctrl,
};

static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data)
{
	struct nvme_rdma_ctrl *ctrl;
	struct nvme_rdma_device *ndev;
	bool found = false;

	mutex_lock(&device_list_mutex);
	list_for_each_entry(ndev, &device_list, entry) {
		if (ndev->dev == ib_device) {
			found = true;
			break;
		}
	}
	mutex_unlock(&device_list_mutex);

	if (!found)
		return;

	/* Delete all controllers using this device */
	mutex_lock(&nvme_rdma_ctrl_mutex);
	list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) {
		if (ctrl->device->dev != ib_device)
			continue;
		nvme_delete_ctrl(&ctrl->ctrl);
	}
	mutex_unlock(&nvme_rdma_ctrl_mutex);

	flush_workqueue(nvme_delete_wq);
}

static struct ib_client nvme_rdma_ib_client = {
	.name   = "nvme_rdma",
	.remove = nvme_rdma_remove_one
};

static int __init nvme_rdma_init_module(void)
{
	int ret;

	ret = ib_register_client(&nvme_rdma_ib_client);
	if (ret)
		return ret;

	ret = nvmf_register_transport(&nvme_rdma_transport);
	if (ret)
		goto err_unreg_client;

	return 0;

err_unreg_client:
	ib_unregister_client(&nvme_rdma_ib_client);
	return ret;
}

static void __exit nvme_rdma_cleanup_module(void)
{
	struct nvme_rdma_ctrl *ctrl;

	nvmf_unregister_transport(&nvme_rdma_transport);
	ib_unregister_client(&nvme_rdma_ib_client);

	mutex_lock(&nvme_rdma_ctrl_mutex);
	list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list)
		nvme_delete_ctrl(&ctrl->ctrl);
	mutex_unlock(&nvme_rdma_ctrl_mutex);
	flush_workqueue(nvme_delete_wq);
}

module_init(nvme_rdma_init_module);
module_exit(nvme_rdma_cleanup_module);

MODULE_LICENSE("GPL v2");