1*4882a593Smuzhiyun // SPDX-License-Identifier: GPL-2.0
2*4882a593Smuzhiyun /*
3*4882a593Smuzhiyun * Shared Memory Communications over RDMA (SMC-R) and RoCE
4*4882a593Smuzhiyun *
5*4882a593Smuzhiyun * IB infrastructure:
6*4882a593Smuzhiyun * Establish SMC-R as an Infiniband Client to be notified about added and
7*4882a593Smuzhiyun * removed IB devices of type RDMA.
8*4882a593Smuzhiyun * Determine device and port characteristics for these IB devices.
9*4882a593Smuzhiyun *
10*4882a593Smuzhiyun * Copyright IBM Corp. 2016
11*4882a593Smuzhiyun *
12*4882a593Smuzhiyun * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
13*4882a593Smuzhiyun */
14*4882a593Smuzhiyun
15*4882a593Smuzhiyun #include <linux/random.h>
16*4882a593Smuzhiyun #include <linux/workqueue.h>
17*4882a593Smuzhiyun #include <linux/scatterlist.h>
18*4882a593Smuzhiyun #include <linux/wait.h>
19*4882a593Smuzhiyun #include <linux/mutex.h>
20*4882a593Smuzhiyun #include <rdma/ib_verbs.h>
21*4882a593Smuzhiyun #include <rdma/ib_cache.h>
22*4882a593Smuzhiyun
23*4882a593Smuzhiyun #include "smc_pnet.h"
24*4882a593Smuzhiyun #include "smc_ib.h"
25*4882a593Smuzhiyun #include "smc_core.h"
26*4882a593Smuzhiyun #include "smc_wr.h"
27*4882a593Smuzhiyun #include "smc.h"
28*4882a593Smuzhiyun
29*4882a593Smuzhiyun #define SMC_MAX_CQE 32766 /* max. # of completion queue elements */
30*4882a593Smuzhiyun
31*4882a593Smuzhiyun #define SMC_QP_MIN_RNR_TIMER 5
32*4882a593Smuzhiyun #define SMC_QP_TIMEOUT 15 /* 4096 * 2 ** timeout usec */
33*4882a593Smuzhiyun #define SMC_QP_RETRY_CNT 7 /* 7: infinite */
34*4882a593Smuzhiyun #define SMC_QP_RNR_RETRY 7 /* 7: infinite */
35*4882a593Smuzhiyun
struct smc_ib_devices smc_ib_devices = {	/* smc-registered ib devices */
	.mutex = __MUTEX_INITIALIZER(smc_ib_devices.mutex),
	.list = LIST_HEAD_INIT(smc_ib_devices.list),
};

/* bytes 0-1 hold a random number (smc_ib_init_local_systemid()); the
 * remaining bytes hold the MAC address of the first active registered
 * port (smc_ib_define_local_systemid())
 */
u8 local_systemid[SMC_SYSTEMID_LEN];		/* unique system identifier */
42*4882a593Smuzhiyun
smc_ib_modify_qp_init(struct smc_link * lnk)43*4882a593Smuzhiyun static int smc_ib_modify_qp_init(struct smc_link *lnk)
44*4882a593Smuzhiyun {
45*4882a593Smuzhiyun struct ib_qp_attr qp_attr;
46*4882a593Smuzhiyun
47*4882a593Smuzhiyun memset(&qp_attr, 0, sizeof(qp_attr));
48*4882a593Smuzhiyun qp_attr.qp_state = IB_QPS_INIT;
49*4882a593Smuzhiyun qp_attr.pkey_index = 0;
50*4882a593Smuzhiyun qp_attr.port_num = lnk->ibport;
51*4882a593Smuzhiyun qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE
52*4882a593Smuzhiyun | IB_ACCESS_REMOTE_WRITE;
53*4882a593Smuzhiyun return ib_modify_qp(lnk->roce_qp, &qp_attr,
54*4882a593Smuzhiyun IB_QP_STATE | IB_QP_PKEY_INDEX |
55*4882a593Smuzhiyun IB_QP_ACCESS_FLAGS | IB_QP_PORT);
56*4882a593Smuzhiyun }
57*4882a593Smuzhiyun
smc_ib_modify_qp_rtr(struct smc_link * lnk)58*4882a593Smuzhiyun static int smc_ib_modify_qp_rtr(struct smc_link *lnk)
59*4882a593Smuzhiyun {
60*4882a593Smuzhiyun enum ib_qp_attr_mask qp_attr_mask =
61*4882a593Smuzhiyun IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN |
62*4882a593Smuzhiyun IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER;
63*4882a593Smuzhiyun struct ib_qp_attr qp_attr;
64*4882a593Smuzhiyun
65*4882a593Smuzhiyun memset(&qp_attr, 0, sizeof(qp_attr));
66*4882a593Smuzhiyun qp_attr.qp_state = IB_QPS_RTR;
67*4882a593Smuzhiyun qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu);
68*4882a593Smuzhiyun qp_attr.ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE;
69*4882a593Smuzhiyun rdma_ah_set_port_num(&qp_attr.ah_attr, lnk->ibport);
70*4882a593Smuzhiyun rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, lnk->sgid_index, 1, 0);
71*4882a593Smuzhiyun rdma_ah_set_dgid_raw(&qp_attr.ah_attr, lnk->peer_gid);
72*4882a593Smuzhiyun memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac,
73*4882a593Smuzhiyun sizeof(lnk->peer_mac));
74*4882a593Smuzhiyun qp_attr.dest_qp_num = lnk->peer_qpn;
75*4882a593Smuzhiyun qp_attr.rq_psn = lnk->peer_psn; /* starting receive packet seq # */
76*4882a593Smuzhiyun qp_attr.max_dest_rd_atomic = 1; /* max # of resources for incoming
77*4882a593Smuzhiyun * requests
78*4882a593Smuzhiyun */
79*4882a593Smuzhiyun qp_attr.min_rnr_timer = SMC_QP_MIN_RNR_TIMER;
80*4882a593Smuzhiyun
81*4882a593Smuzhiyun return ib_modify_qp(lnk->roce_qp, &qp_attr, qp_attr_mask);
82*4882a593Smuzhiyun }
83*4882a593Smuzhiyun
smc_ib_modify_qp_rts(struct smc_link * lnk)84*4882a593Smuzhiyun int smc_ib_modify_qp_rts(struct smc_link *lnk)
85*4882a593Smuzhiyun {
86*4882a593Smuzhiyun struct ib_qp_attr qp_attr;
87*4882a593Smuzhiyun
88*4882a593Smuzhiyun memset(&qp_attr, 0, sizeof(qp_attr));
89*4882a593Smuzhiyun qp_attr.qp_state = IB_QPS_RTS;
90*4882a593Smuzhiyun qp_attr.timeout = SMC_QP_TIMEOUT; /* local ack timeout */
91*4882a593Smuzhiyun qp_attr.retry_cnt = SMC_QP_RETRY_CNT; /* retry count */
92*4882a593Smuzhiyun qp_attr.rnr_retry = SMC_QP_RNR_RETRY; /* RNR retries, 7=infinite */
93*4882a593Smuzhiyun qp_attr.sq_psn = lnk->psn_initial; /* starting send packet seq # */
94*4882a593Smuzhiyun qp_attr.max_rd_atomic = 1; /* # of outstanding RDMA reads and
95*4882a593Smuzhiyun * atomic ops allowed
96*4882a593Smuzhiyun */
97*4882a593Smuzhiyun return ib_modify_qp(lnk->roce_qp, &qp_attr,
98*4882a593Smuzhiyun IB_QP_STATE | IB_QP_TIMEOUT | IB_QP_RETRY_CNT |
99*4882a593Smuzhiyun IB_QP_SQ_PSN | IB_QP_RNR_RETRY |
100*4882a593Smuzhiyun IB_QP_MAX_QP_RD_ATOMIC);
101*4882a593Smuzhiyun }
102*4882a593Smuzhiyun
smc_ib_modify_qp_error(struct smc_link * lnk)103*4882a593Smuzhiyun int smc_ib_modify_qp_error(struct smc_link *lnk)
104*4882a593Smuzhiyun {
105*4882a593Smuzhiyun struct ib_qp_attr qp_attr;
106*4882a593Smuzhiyun
107*4882a593Smuzhiyun memset(&qp_attr, 0, sizeof(qp_attr));
108*4882a593Smuzhiyun qp_attr.qp_state = IB_QPS_ERR;
109*4882a593Smuzhiyun return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE);
110*4882a593Smuzhiyun }
111*4882a593Smuzhiyun
/* Drive the link's QP from RESET towards a usable state: INIT, then RTR,
 * and - for the server role - RTS. Receive work requests are posted while
 * still in RTR so no peer message can be lost.
 * Returns 0 on success or a negative errno from the verbs layer.
 */
int smc_ib_ready_link(struct smc_link *lnk)
{
	struct smc_link_group *lgr = smc_get_lgr(lnk);
	int rc = 0;

	rc = smc_ib_modify_qp_init(lnk);
	if (rc)
		goto out;

	rc = smc_ib_modify_qp_rtr(lnk);
	if (rc)
		goto out;
	/* refresh the cached QP attributes after each state transition */
	smc_wr_remember_qp_attr(lnk);
	/* arm the receive CQ so solicited completions raise events */
	rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv,
			      IB_CQ_SOLICITED_MASK);
	if (rc)
		goto out;
	/* post the initial batch of receive work requests */
	rc = smc_wr_rx_post_init(lnk);
	if (rc)
		goto out;
	smc_wr_remember_qp_attr(lnk);

	if (lgr->role == SMC_SERV) {
		/* only the server side goes to RTS here; presumably the
		 * client transitions later in link setup — see callers
		 */
		rc = smc_ib_modify_qp_rts(lnk);
		if (rc)
			goto out;
		smc_wr_remember_qp_attr(lnk);
	}
out:
	return rc;
}
143*4882a593Smuzhiyun
/* cache the MAC address belonging to GID index 0 of @ibport in
 * smcibdev->mac[]; the SMC protocol requires the RoCE MAC
 */
static int smc_ib_fill_mac(struct smc_ib_device *smcibdev, u8 ibport)
{
	const struct ib_gid_attr *gid_attr;
	int rc;

	gid_attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, 0);
	if (IS_ERR(gid_attr))
		return -ENODEV;

	rc = rdma_read_gid_l2_fields(gid_attr, NULL,
				     smcibdev->mac[ibport - 1]);
	rdma_put_gid_attr(gid_attr);	/* drop the table reference */
	return rc;
}
157*4882a593Smuzhiyun
158*4882a593Smuzhiyun /* Create an identifier unique for this instance of SMC-R.
159*4882a593Smuzhiyun * The MAC-address of the first active registered IB device
160*4882a593Smuzhiyun * plus a random 2-byte number is used to create this identifier.
161*4882a593Smuzhiyun * This name is delivered to the peer during connection initialization.
162*4882a593Smuzhiyun */
static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev,
						u8 ibport)
{
	/* bytes 0-1 already hold the random part; append the port MAC */
	memcpy(local_systemid + 2, smcibdev->mac[ibport - 1],
	       sizeof(smcibdev->mac[ibport - 1]));
}
169*4882a593Smuzhiyun
smc_ib_is_valid_local_systemid(void)170*4882a593Smuzhiyun bool smc_ib_is_valid_local_systemid(void)
171*4882a593Smuzhiyun {
172*4882a593Smuzhiyun return !is_zero_ether_addr(&local_systemid[2]);
173*4882a593Smuzhiyun }
174*4882a593Smuzhiyun
smc_ib_init_local_systemid(void)175*4882a593Smuzhiyun static void smc_ib_init_local_systemid(void)
176*4882a593Smuzhiyun {
177*4882a593Smuzhiyun get_random_bytes(&local_systemid[0], 2);
178*4882a593Smuzhiyun }
179*4882a593Smuzhiyun
smc_ib_port_active(struct smc_ib_device * smcibdev,u8 ibport)180*4882a593Smuzhiyun bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport)
181*4882a593Smuzhiyun {
182*4882a593Smuzhiyun return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE;
183*4882a593Smuzhiyun }
184*4882a593Smuzhiyun
185*4882a593Smuzhiyun /* determine the gid for an ib-device port and vlan id */
/* Scan the port's GID table for a RoCE (v1) entry whose underlying
 * net_device matches the requested vlan (or the absence of a vlan).
 * On success, optionally copy the gid and/or its table index to the
 * caller and return 0; return -ENODEV when no entry matches.
 */
int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
			 unsigned short vlan_id, u8 gid[], u8 *sgid_index)
{
	const struct ib_gid_attr *attr;
	const struct net_device *ndev;
	int i;

	for (i = 0; i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) {
		attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, i);
		if (IS_ERR(attr))
			continue;	/* skip invalid/empty table slots */

		/* the gid attr's net_device is only valid under RCU */
		rcu_read_lock();
		ndev = rdma_read_gid_attr_ndev_rcu(attr);
		if (!IS_ERR(ndev) &&
		    ((!vlan_id && !is_vlan_dev(ndev)) ||
		     (vlan_id && is_vlan_dev(ndev) &&
		      vlan_dev_vlan_id(ndev) == vlan_id)) &&
		    attr->gid_type == IB_GID_TYPE_ROCE) {
			rcu_read_unlock();
			if (gid)
				memcpy(gid, &attr->gid, SMC_GID_SIZE);
			if (sgid_index)
				*sgid_index = attr->index;
			rdma_put_gid_attr(attr);	/* drop table ref */
			return 0;
		}
		rcu_read_unlock();
		rdma_put_gid_attr(attr);
	}
	return -ENODEV;
}
218*4882a593Smuzhiyun
/* (re-)query and cache the attributes and MAC of @ibport; may also
 * establish the local system identifier on first active port
 */
static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
{
	struct ib_port_attr *pattr = &smcibdev->pattr[ibport - 1];
	int rc;

	memset(pattr, 0, sizeof(*pattr));
	rc = ib_query_port(smcibdev->ibdev, ibport, pattr);
	if (rc)
		return rc;
	/* the SMC protocol requires specification of the RoCE MAC address */
	rc = smc_ib_fill_mac(smcibdev, ibport);
	if (rc)
		return rc;
	if (!smc_ib_is_valid_local_systemid() &&
	    smc_ib_port_active(smcibdev, ibport))
		/* create unique system identifier */
		smc_ib_define_local_systemid(smcibdev, ibport);
	return 0;
}
240*4882a593Smuzhiyun
241*4882a593Smuzhiyun /* process context wrapper for might_sleep smc_ib_remember_port_attr */
static void smc_ib_port_event_work(struct work_struct *work)
{
	struct smc_ib_device *smcibdev = container_of(
		work, struct smc_ib_device, port_event_work);
	u8 port_idx;

	/* handle every port flagged by the (possibly IRQ-context)
	 * global event handler
	 */
	for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) {
		smc_ib_remember_port_attr(smcibdev, port_idx + 1);
		/* clear the flag before acting on the refreshed state */
		clear_bit(port_idx, &smcibdev->port_event_mask);
		if (!smc_ib_port_active(smcibdev, port_idx + 1)) {
			set_bit(port_idx, smcibdev->ports_going_away);
			smcr_port_err(smcibdev, port_idx + 1);
		} else {
			clear_bit(port_idx, smcibdev->ports_going_away);
			smcr_port_add(smcibdev, port_idx + 1);
		}
	}
}
260*4882a593Smuzhiyun
261*4882a593Smuzhiyun /* can be called in IRQ context */
/* Flag affected ports and defer the real work to smc_ib_port_event_work();
 * only atomic bit operations and schedule_work() are used here since this
 * handler may run in IRQ context.
 */
static void smc_ib_global_event_handler(struct ib_event_handler *handler,
					struct ib_event *ibevent)
{
	struct smc_ib_device *smcibdev;
	bool schedule = false;
	u8 port_idx;

	smcibdev = container_of(handler, struct smc_ib_device, event_handler);

	switch (ibevent->event) {
	case IB_EVENT_DEVICE_FATAL:
		/* terminate all ports on device */
		for (port_idx = 0; port_idx < SMC_MAX_PORTS; port_idx++) {
			set_bit(port_idx, &smcibdev->port_event_mask);
			/* schedule at most once per port going away */
			if (!test_and_set_bit(port_idx,
					      smcibdev->ports_going_away))
				schedule = true;
		}
		if (schedule)
			schedule_work(&smcibdev->port_event_work);
		break;
	case IB_EVENT_PORT_ACTIVE:
		port_idx = ibevent->element.port_num - 1;
		if (port_idx >= SMC_MAX_PORTS)
			break;
		set_bit(port_idx, &smcibdev->port_event_mask);
		/* only react if the port was previously marked going away */
		if (test_and_clear_bit(port_idx, smcibdev->ports_going_away))
			schedule_work(&smcibdev->port_event_work);
		break;
	case IB_EVENT_PORT_ERR:
		port_idx = ibevent->element.port_num - 1;
		if (port_idx >= SMC_MAX_PORTS)
			break;
		set_bit(port_idx, &smcibdev->port_event_mask);
		/* first error for this port triggers the work */
		if (!test_and_set_bit(port_idx, smcibdev->ports_going_away))
			schedule_work(&smcibdev->port_event_work);
		break;
	case IB_EVENT_GID_CHANGE:
		port_idx = ibevent->element.port_num - 1;
		if (port_idx >= SMC_MAX_PORTS)
			break;
		set_bit(port_idx, &smcibdev->port_event_mask);
		/* unconditionally refresh the cached port attributes */
		schedule_work(&smcibdev->port_event_work);
		break;
	default:
		break;
	}
}
310*4882a593Smuzhiyun
smc_ib_dealloc_protection_domain(struct smc_link * lnk)311*4882a593Smuzhiyun void smc_ib_dealloc_protection_domain(struct smc_link *lnk)
312*4882a593Smuzhiyun {
313*4882a593Smuzhiyun if (lnk->roce_pd)
314*4882a593Smuzhiyun ib_dealloc_pd(lnk->roce_pd);
315*4882a593Smuzhiyun lnk->roce_pd = NULL;
316*4882a593Smuzhiyun }
317*4882a593Smuzhiyun
smc_ib_create_protection_domain(struct smc_link * lnk)318*4882a593Smuzhiyun int smc_ib_create_protection_domain(struct smc_link *lnk)
319*4882a593Smuzhiyun {
320*4882a593Smuzhiyun int rc;
321*4882a593Smuzhiyun
322*4882a593Smuzhiyun lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, 0);
323*4882a593Smuzhiyun rc = PTR_ERR_OR_ZERO(lnk->roce_pd);
324*4882a593Smuzhiyun if (IS_ERR(lnk->roce_pd))
325*4882a593Smuzhiyun lnk->roce_pd = NULL;
326*4882a593Smuzhiyun return rc;
327*4882a593Smuzhiyun }
328*4882a593Smuzhiyun
smc_ib_qp_event_handler(struct ib_event * ibevent,void * priv)329*4882a593Smuzhiyun static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
330*4882a593Smuzhiyun {
331*4882a593Smuzhiyun struct smc_link *lnk = (struct smc_link *)priv;
332*4882a593Smuzhiyun struct smc_ib_device *smcibdev = lnk->smcibdev;
333*4882a593Smuzhiyun u8 port_idx;
334*4882a593Smuzhiyun
335*4882a593Smuzhiyun switch (ibevent->event) {
336*4882a593Smuzhiyun case IB_EVENT_QP_FATAL:
337*4882a593Smuzhiyun case IB_EVENT_QP_ACCESS_ERR:
338*4882a593Smuzhiyun port_idx = ibevent->element.qp->port - 1;
339*4882a593Smuzhiyun if (port_idx >= SMC_MAX_PORTS)
340*4882a593Smuzhiyun break;
341*4882a593Smuzhiyun set_bit(port_idx, &smcibdev->port_event_mask);
342*4882a593Smuzhiyun if (!test_and_set_bit(port_idx, smcibdev->ports_going_away))
343*4882a593Smuzhiyun schedule_work(&smcibdev->port_event_work);
344*4882a593Smuzhiyun break;
345*4882a593Smuzhiyun default:
346*4882a593Smuzhiyun break;
347*4882a593Smuzhiyun }
348*4882a593Smuzhiyun }
349*4882a593Smuzhiyun
smc_ib_destroy_queue_pair(struct smc_link * lnk)350*4882a593Smuzhiyun void smc_ib_destroy_queue_pair(struct smc_link *lnk)
351*4882a593Smuzhiyun {
352*4882a593Smuzhiyun if (lnk->roce_qp)
353*4882a593Smuzhiyun ib_destroy_qp(lnk->roce_qp);
354*4882a593Smuzhiyun lnk->roce_qp = NULL;
355*4882a593Smuzhiyun }
356*4882a593Smuzhiyun
357*4882a593Smuzhiyun /* create a queue pair within the protection domain for a link */
int smc_ib_create_queue_pair(struct smc_link *lnk)
{
	/* reliable-connected QP sharing the device-wide send/recv CQs;
	 * completions are signalled per work request (IB_SIGNAL_REQ_WR)
	 */
	struct ib_qp_init_attr qp_attr = {
		.event_handler = smc_ib_qp_event_handler,
		.qp_context = lnk,
		.send_cq = lnk->smcibdev->roce_cq_send,
		.recv_cq = lnk->smcibdev->roce_cq_recv,
		.srq = NULL,
		.cap = {
				/* include unsolicited rdma_writes as well,
				 * there are max. 2 RDMA_WRITE per 1 WR_SEND
				 */
			.max_send_wr = SMC_WR_BUF_CNT * 3,
			.max_recv_wr = SMC_WR_BUF_CNT * 3,
			.max_send_sge = SMC_IB_MAX_SEND_SGE,
			.max_recv_sge = 1,
		},
		.sq_sig_type = IB_SIGNAL_REQ_WR,
		.qp_type = IB_QPT_RC,
	};
	int rc;

	lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr);
	rc = PTR_ERR_OR_ZERO(lnk->roce_qp);
	if (IS_ERR(lnk->roce_qp))
		lnk->roce_qp = NULL;	/* never leave an ERR_PTR behind */
	else
		smc_wr_remember_qp_attr(lnk);
	return rc;
}
388*4882a593Smuzhiyun
/* deregister a memory region obtained via smc_ib_get_memory_region() */
void smc_ib_put_memory_region(struct ib_mr *mr)
{
	ib_dereg_mr(mr);
}
393*4882a593Smuzhiyun
/* map the buffer's DMA-mapped SG list onto its memory region;
 * returns the number of mapped SG entries
 */
static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot, u8 link_idx)
{
	unsigned int offset = 0;

	/* map the largest prefix of a dma mapped SG list */
	return ib_map_mr_sg(buf_slot->mr_rx[link_idx],
			    buf_slot->sgt[link_idx].sgl,
			    buf_slot->sgt[link_idx].orig_nents,
			    &offset, PAGE_SIZE);
}
407*4882a593Smuzhiyun
408*4882a593Smuzhiyun /* Allocate a memory region and map the dma mapped SG list of buf_slot */
/* Allocate a memory region for buf_slot on the given link (no-op when
 * one already exists) and map its DMA SG list onto it. The mapping must
 * collapse to exactly one entry, otherwise -EINVAL is returned.
 */
int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
			     struct smc_buf_desc *buf_slot, u8 link_idx)
{
	struct ib_mr *mr;

	if (buf_slot->mr_rx[link_idx])
		return 0; /* already done */

	mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 1 << buf_slot->order);
	if (IS_ERR(mr))
		return PTR_ERR(mr);
	buf_slot->mr_rx[link_idx] = mr;

	if (smc_ib_map_mr_sg(buf_slot, link_idx) != 1)
		return -EINVAL;

	return 0;
}
430*4882a593Smuzhiyun
431*4882a593Smuzhiyun /* synchronize buffer usage for cpu access */
smc_ib_sync_sg_for_cpu(struct smc_link * lnk,struct smc_buf_desc * buf_slot,enum dma_data_direction data_direction)432*4882a593Smuzhiyun void smc_ib_sync_sg_for_cpu(struct smc_link *lnk,
433*4882a593Smuzhiyun struct smc_buf_desc *buf_slot,
434*4882a593Smuzhiyun enum dma_data_direction data_direction)
435*4882a593Smuzhiyun {
436*4882a593Smuzhiyun struct scatterlist *sg;
437*4882a593Smuzhiyun unsigned int i;
438*4882a593Smuzhiyun
439*4882a593Smuzhiyun /* for now there is just one DMA address */
440*4882a593Smuzhiyun for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg,
441*4882a593Smuzhiyun buf_slot->sgt[lnk->link_idx].nents, i) {
442*4882a593Smuzhiyun if (!sg_dma_len(sg))
443*4882a593Smuzhiyun break;
444*4882a593Smuzhiyun ib_dma_sync_single_for_cpu(lnk->smcibdev->ibdev,
445*4882a593Smuzhiyun sg_dma_address(sg),
446*4882a593Smuzhiyun sg_dma_len(sg),
447*4882a593Smuzhiyun data_direction);
448*4882a593Smuzhiyun }
449*4882a593Smuzhiyun }
450*4882a593Smuzhiyun
451*4882a593Smuzhiyun /* synchronize buffer usage for device access */
smc_ib_sync_sg_for_device(struct smc_link * lnk,struct smc_buf_desc * buf_slot,enum dma_data_direction data_direction)452*4882a593Smuzhiyun void smc_ib_sync_sg_for_device(struct smc_link *lnk,
453*4882a593Smuzhiyun struct smc_buf_desc *buf_slot,
454*4882a593Smuzhiyun enum dma_data_direction data_direction)
455*4882a593Smuzhiyun {
456*4882a593Smuzhiyun struct scatterlist *sg;
457*4882a593Smuzhiyun unsigned int i;
458*4882a593Smuzhiyun
459*4882a593Smuzhiyun /* for now there is just one DMA address */
460*4882a593Smuzhiyun for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg,
461*4882a593Smuzhiyun buf_slot->sgt[lnk->link_idx].nents, i) {
462*4882a593Smuzhiyun if (!sg_dma_len(sg))
463*4882a593Smuzhiyun break;
464*4882a593Smuzhiyun ib_dma_sync_single_for_device(lnk->smcibdev->ibdev,
465*4882a593Smuzhiyun sg_dma_address(sg),
466*4882a593Smuzhiyun sg_dma_len(sg),
467*4882a593Smuzhiyun data_direction);
468*4882a593Smuzhiyun }
469*4882a593Smuzhiyun }
470*4882a593Smuzhiyun
471*4882a593Smuzhiyun /* Map a new TX or RX buffer SG-table to DMA */
smc_ib_buf_map_sg(struct smc_link * lnk,struct smc_buf_desc * buf_slot,enum dma_data_direction data_direction)472*4882a593Smuzhiyun int smc_ib_buf_map_sg(struct smc_link *lnk,
473*4882a593Smuzhiyun struct smc_buf_desc *buf_slot,
474*4882a593Smuzhiyun enum dma_data_direction data_direction)
475*4882a593Smuzhiyun {
476*4882a593Smuzhiyun int mapped_nents;
477*4882a593Smuzhiyun
478*4882a593Smuzhiyun mapped_nents = ib_dma_map_sg(lnk->smcibdev->ibdev,
479*4882a593Smuzhiyun buf_slot->sgt[lnk->link_idx].sgl,
480*4882a593Smuzhiyun buf_slot->sgt[lnk->link_idx].orig_nents,
481*4882a593Smuzhiyun data_direction);
482*4882a593Smuzhiyun if (!mapped_nents)
483*4882a593Smuzhiyun return -ENOMEM;
484*4882a593Smuzhiyun
485*4882a593Smuzhiyun return mapped_nents;
486*4882a593Smuzhiyun }
487*4882a593Smuzhiyun
smc_ib_buf_unmap_sg(struct smc_link * lnk,struct smc_buf_desc * buf_slot,enum dma_data_direction data_direction)488*4882a593Smuzhiyun void smc_ib_buf_unmap_sg(struct smc_link *lnk,
489*4882a593Smuzhiyun struct smc_buf_desc *buf_slot,
490*4882a593Smuzhiyun enum dma_data_direction data_direction)
491*4882a593Smuzhiyun {
492*4882a593Smuzhiyun if (!buf_slot->sgt[lnk->link_idx].sgl->dma_address)
493*4882a593Smuzhiyun return; /* already unmapped */
494*4882a593Smuzhiyun
495*4882a593Smuzhiyun ib_dma_unmap_sg(lnk->smcibdev->ibdev,
496*4882a593Smuzhiyun buf_slot->sgt[lnk->link_idx].sgl,
497*4882a593Smuzhiyun buf_slot->sgt[lnk->link_idx].orig_nents,
498*4882a593Smuzhiyun data_direction);
499*4882a593Smuzhiyun buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0;
500*4882a593Smuzhiyun }
501*4882a593Smuzhiyun
/* One-time per-device setup: create the shared send and receive CQs and
 * register the device with the work-request layer. Serialized and made
 * idempotent via smcibdev->mutex and the 'initialized' flag.
 */
long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
{
	struct ib_cq_init_attr cqattr = {
		.cqe = SMC_MAX_CQE, .comp_vector = 0 };
	int cqe_size_order, smc_order;
	long rc;

	mutex_lock(&smcibdev->mutex);
	rc = 0;
	if (smcibdev->initialized)
		goto out;	/* CQs already exist for this device */
	/* the calculated number of cq entries fits to mlx5 cq allocation */
	cqe_size_order = cache_line_size() == 128 ? 7 : 6;
	smc_order = MAX_ORDER - cqe_size_order - 1;
	/* shrink the requested CQE count if it would exceed the largest
	 * allocatable order
	 */
	if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE)
		cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2;
	smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev,
					      smc_wr_tx_cq_handler, NULL,
					      smcibdev, &cqattr);
	rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send);
	if (IS_ERR(smcibdev->roce_cq_send)) {
		smcibdev->roce_cq_send = NULL;
		goto out;
	}
	smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev,
					      smc_wr_rx_cq_handler, NULL,
					      smcibdev, &cqattr);
	rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv);
	if (IS_ERR(smcibdev->roce_cq_recv)) {
		smcibdev->roce_cq_recv = NULL;
		goto err;	/* undo the already created send CQ */
	}
	smc_wr_add_dev(smcibdev);
	smcibdev->initialized = 1;
	goto out;

err:
	ib_destroy_cq(smcibdev->roce_cq_send);
out:
	mutex_unlock(&smcibdev->mutex);
	return rc;
}
544*4882a593Smuzhiyun
smc_ib_cleanup_per_ibdev(struct smc_ib_device * smcibdev)545*4882a593Smuzhiyun static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev)
546*4882a593Smuzhiyun {
547*4882a593Smuzhiyun mutex_lock(&smcibdev->mutex);
548*4882a593Smuzhiyun if (!smcibdev->initialized)
549*4882a593Smuzhiyun goto out;
550*4882a593Smuzhiyun smcibdev->initialized = 0;
551*4882a593Smuzhiyun ib_destroy_cq(smcibdev->roce_cq_recv);
552*4882a593Smuzhiyun ib_destroy_cq(smcibdev->roce_cq_send);
553*4882a593Smuzhiyun smc_wr_remove_dev(smcibdev);
554*4882a593Smuzhiyun out:
555*4882a593Smuzhiyun mutex_unlock(&smcibdev->mutex);
556*4882a593Smuzhiyun }
557*4882a593Smuzhiyun
558*4882a593Smuzhiyun static struct ib_client smc_ib_client;
559*4882a593Smuzhiyun
560*4882a593Smuzhiyun /* callback function for ib_register_client() */
/* Allocate and register a per-device SMC state object for a newly added
 * RDMA device, hook up the global event handler, determine per-port
 * pnetids, and kick off asynchronous reading of the port attributes.
 */
static int smc_ib_add_dev(struct ib_device *ibdev)
{
	struct smc_ib_device *smcibdev;
	u8 port_cnt;
	int i;

	/* SMC-R only deals with channel adapters */
	if (ibdev->node_type != RDMA_NODE_IB_CA)
		return -EOPNOTSUPP;

	smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL);
	if (!smcibdev)
		return -ENOMEM;

	smcibdev->ibdev = ibdev;
	INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work);
	atomic_set(&smcibdev->lnk_cnt, 0);
	init_waitqueue_head(&smcibdev->lnks_deleted);
	mutex_init(&smcibdev->mutex);
	mutex_lock(&smc_ib_devices.mutex);
	list_add_tail(&smcibdev->list, &smc_ib_devices.list);
	mutex_unlock(&smc_ib_devices.mutex);
	ib_set_client_data(ibdev, &smc_ib_client, smcibdev);
	INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev,
			      smc_ib_global_event_handler);
	ib_register_event_handler(&smcibdev->event_handler);

	/* trigger reading of the port attributes */
	port_cnt = smcibdev->ibdev->phys_port_cnt;
	pr_warn_ratelimited("smc: adding ib device %s with port count %d\n",
			    smcibdev->ibdev->name, port_cnt);
	for (i = 0;
	     i < min_t(size_t, port_cnt, SMC_MAX_PORTS);
	     i++) {
		set_bit(i, &smcibdev->port_event_mask);
		/* determine pnetids of the port */
		if (smc_pnetid_by_dev_port(ibdev->dev.parent, i,
					   smcibdev->pnetid[i]))
			smc_pnetid_by_table_ib(smcibdev, i + 1);
		pr_warn_ratelimited("smc: ib device %s port %d has pnetid "
				    "%.16s%s\n",
				    smcibdev->ibdev->name, i + 1,
				    smcibdev->pnetid[i],
				    smcibdev->pnetid_by_user[i] ?
				    " (user defined)" :
				    "");
	}
	/* port attributes are read in process context (may sleep) */
	schedule_work(&smcibdev->port_event_work);
	return 0;
}
610*4882a593Smuzhiyun
611*4882a593Smuzhiyun /* callback function for ib_unregister_client() */
/* Tear down all SMC state attached to a departing RDMA device:
 * unlist it, terminate its link groups, destroy its CQs, unhook the
 * event handler, drain pending port work, then free the state object.
 */
static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
{
	struct smc_ib_device *smcibdev = client_data;

	mutex_lock(&smc_ib_devices.mutex);
	list_del_init(&smcibdev->list); /* remove from smc_ib_devices */
	mutex_unlock(&smc_ib_devices.mutex);
	pr_warn_ratelimited("smc: removing ib device %s\n",
			    smcibdev->ibdev->name);
	smc_smcr_terminate_all(smcibdev);
	smc_ib_cleanup_per_ibdev(smcibdev);
	ib_unregister_event_handler(&smcibdev->event_handler);
	/* no more port events can arrive; flush any queued work before
	 * freeing the object it references
	 */
	cancel_work_sync(&smcibdev->port_event_work);
	kfree(smcibdev);
}
627*4882a593Smuzhiyun
/* IB-core client registration: add/remove are invoked for every RDMA
 * device appearing in or vanishing from the system
 */
static struct ib_client smc_ib_client = {
	.name = "smc_ib",
	.add = smc_ib_add_dev,
	.remove = smc_ib_remove_dev,
};
633*4882a593Smuzhiyun
/* module init: seed the random part of the local systemid, then register
 * with the IB core so existing and future devices are reported
 */
int __init smc_ib_register_client(void)
{
	smc_ib_init_local_systemid();
	return ib_register_client(&smc_ib_client);
}
639*4882a593Smuzhiyun
/* module exit: unregister from the IB core; this invokes
 * smc_ib_remove_dev() for every known device
 */
void smc_ib_unregister_client(void)
{
	ib_unregister_client(&smc_ib_client);
}
644